Classes and functions for calculating and plotting metrics.

First, define some test data:

y = torch.tensor((0,1,0,1,0)).unsqueeze(1)
yhat = torch.tensor((.1,.8,.49,.53,.6)).unsqueeze(1)
y.dtype, yhat.dtype
(torch.int64, torch.float32)

And some multi-label (multi-task) test data:

yhat_mt = torch.rand(10,4)
yhat_mt
tensor([[0.3137, 0.7057, 0.8091, 0.6279],
        [0.0104, 0.9387, 0.2502, 0.6880],
        [0.2544, 0.0550, 0.6559, 0.1570],
        [0.5806, 0.3732, 0.9301, 0.9665],
        [0.2598, 0.8693, 0.7225, 0.9324],
        [0.0742, 0.7114, 0.7245, 0.1057],
        [0.1661, 0.9369, 0.6056, 0.0246],
        [0.4413, 0.2379, 0.3213, 0.8767],
        [0.0652, 0.1064, 0.9167, 0.0079],
        [0.4280, 0.9548, 0.9420, 0.8374]])
y_mt = torch.randint(2, yhat_mt.shape)
y_mt
tensor([[0, 0, 0, 1],
        [0, 0, 0, 0],
        [0, 1, 1, 0],
        [1, 1, 1, 0],
        [1, 0, 1, 0],
        [1, 0, 0, 1],
        [1, 0, 1, 0],
        [0, 1, 1, 0],
        [0, 1, 1, 0],
        [1, 1, 1, 1]])

Simple Accuracies

accuracy[source]

accuracy(y:y_true, yhat:yhat_prob, threshold:float=0.5)

Accuracy (percentage of correct predictions) for binary classification
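
The source isn't shown here, but a minimal sketch consistent with the signature above and the test outputs below (the choice of > rather than >= at the threshold is an assumption):

import torch

def accuracy(y, yhat, threshold=0.5):
    # Binarize probabilities at the threshold, then score elementwise
    preds = (yhat > threshold).long()
    return (preds == y).float().mean()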

null_accuracy[source]

null_accuracy(y:y_true)

Null accuracy for binary classification: accuracy that could be achieved by always predicting the most frequent class
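
Likewise, a plausible sketch of null_accuracy (an assumption, not the verified source): the null model predicts the majority class, so the score is whichever of the positive and negative rates is larger.

def null_accuracy(y):
    # Positive rate across all entries; the null model predicts the majority class
    p = y.float().mean()
    return torch.max(p, 1 - p)

Written this way, both helpers operate elementwise across every label at once, which is why the same calls work unchanged on the multi-label tensors below.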

accuracy(y, yhat), null_accuracy(y)
(tensor(0.8000), tensor(0.6000))
accuracy(y_mt, yhat_mt), null_accuracy(y_mt)
(tensor(0.4750), tensor(0.5000))

ROC Curve & Optimal Threshold

y.shape, yhat.shape, y_mt.shape, yhat_mt.shape
(torch.Size([5, 1]),
 torch.Size([5, 1]),
 torch.Size([10, 4]),
 torch.Size([10, 4]))
  • Used at inference time as the cutoff for yhat
  • From these explanations
  • Youden's J statistic isn't implemented here, but that's another option too - also covered in the above link (see the sketch under ROC.optimal_thresh below)

class ROC[source]

ROC(y, yhat)

Class to hold Receiver Operating Characteristic (ROC) and AUROC (area under ROC curve) score for a single class
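
The fpr, tpr, and thresholds values in the tests below match scikit-learn's roc_curve output (note the leading threshold of max score + 1, a roc_curve convention in the sklearn version used here), so the class is presumably a thin wrapper; a minimal sketch under that assumption:

import numpy as np
import sklearn.metrics as skl_metrics

class ROC:
    # Sketch: compute and store the ROC curve and its area for one class
    def __init__(self, y, yhat):
        y, yhat = np.asarray(y).ravel(), np.asarray(yhat).ravel()
        self.fpr, self.tpr, self.thresholds = skl_metrics.roc_curve(y, yhat)
        self.auroc = skl_metrics.auc(self.fpr, self.tpr)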

ROC.optimal_thresh[source]

ROC.optimal_thresh()

Calculate optimal threshold (on ROC curve) for a single class
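
The thresholds returned in the tests below (0.53 here, and 0.25979996 and 0.954796 for the multi-label data) all correspond to the ROC point closest to the top-left corner (0, 1), so a sketch consistent with those outputs is:

def optimal_thresh(self):
    # Euclidean distance of each ROC point from the perfect classifier at (0, 1)
    dists = np.sqrt(self.fpr**2 + (1 - self.tpr)**2)
    return self.thresholds[np.argmin(dists)]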

ROC.plot[source]

ROC.plot(label, title)

Plot ROC curve for a single class

Test

test_roc = ROC(y, yhat)
test_roc.fpr, test_roc.tpr, test_roc.thresholds, test_roc.optimal_thresh(), test_roc.auroc
(array([0.        , 0.        , 0.33333333, 0.33333333, 1.        ]),
 array([0. , 0.5, 0.5, 1. , 1. ]),
 array([1.8 , 0.8 , 0.6 , 0.53, 0.1 ], dtype=float32),
 0.53,
 0.8333333333333334)

class MultiLabelROC[source]

MultiLabelROC(y, yhat, labels)

Class that holds ROC objects for multiple classes
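
The tests below access mt_rocs.ROCs['diabetes'], so the class evidently holds a dict of per-label ROC objects; a minimal sketch:

class MultiLabelROC:
    # Sketch: one ROC per column of y/yhat, keyed by label name
    def __init__(self, y, yhat, labels):
        self.labels = labels
        self.ROCs = {label: ROC(y[:, i], yhat[:, i])
                     for i, label in enumerate(labels)}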

MultiLabelROC.plot[source]

MultiLabelROC.plot(title)

Plot multiple ROC curves in a single plot

Test

LABELS
['diabetes', 'stroke', 'alzheimers', 'coronaryheart']
y_mt.shape, yhat_mt.shape
(torch.Size([10, 4]), torch.Size([10, 4]))
mt_rocs = MultiLabelROC(y_mt, yhat_mt, LABELS)
mt_rocs.ROCs['diabetes'].fpr, mt_rocs.ROCs['diabetes'].tpr, mt_rocs.ROCs['diabetes'].thresholds, \
mt_rocs.ROCs['diabetes'].optimal_thresh(), mt_rocs.ROCs['diabetes'].auroc
(array([0. , 0. , 0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 1. ]),
 array([0. , 0.2, 0.2, 0.4, 0.4, 0.6, 0.6, 1. , 1. ]),
 array([1.580641  , 0.5806411 , 0.44134134, 0.42800868, 0.31371653,
        0.25979996, 0.2543732 , 0.07419437, 0.01037359], dtype=float32),
 0.25979996,
 0.64)
mt_rocs.ROCs['stroke'].fpr, mt_rocs.ROCs['stroke'].tpr, mt_rocs.ROCs['stroke'].thresholds, \
mt_rocs.ROCs['stroke'].optimal_thresh(), mt_rocs.ROCs['stroke'].auroc
(array([0., 0., 1., 1.]),
 array([0. , 0.2, 0.2, 1. ]),
 array([1.9547961 , 0.954796  , 0.7056961 , 0.05504644], dtype=float32),
 0.954796,
 0.2)

Plot

plot_rocs[source]

plot_rocs(ROCs, labels, title='ROC curve', multilabel=False, axis=None)

Plot one (single-label) or multiple (multi-label) ROC curves
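
The plotting internals aren't shown, but the core of a single ROC curve is standard matplotlib; a hypothetical helper (plot_roc_curve is illustrative only, not part of the library) showing what each curve drawn by plot_rocs likely amounts to:

import matplotlib.pyplot as plt

def plot_roc_curve(roc, label, title='ROC curve', axis=None):
    # Illustrative only: draw one curve plus the chance diagonal
    ax = axis if axis is not None else plt.gca()
    ax.plot(roc.fpr, roc.tpr, label=f'{label} (AUROC = {roc.auroc:.2f})')
    ax.plot([0, 1], [0, 1], linestyle='--')  # chance line
    ax.set_xlabel('False positive rate')
    ax.set_ylabel('True positive rate')
    ax.set_title(title)
    ax.legend()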

plot_train_valid_rocs[source]

plot_train_valid_rocs(train_ROC, valid_ROC, labels, multilabel=False)

Convenience function to plot train and valid ROC curves side by side
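
Given the axis parameter in the plot_rocs signature above, the convenience function could plausibly be composed like this (a sketch, not the verified source; the test below notes that labels is ignored in the single-label case):

def plot_train_valid_rocs(train_ROC, valid_ROC, labels, multilabel=False):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
    if multilabel:
        plot_rocs(train_ROC, labels, title='Train', multilabel=True, axis=ax1)
        plot_rocs(valid_ROC, labels, title='Valid', multilabel=True, axis=ax2)
    else:
        # Single-label: wrap the lone ROC in a list; the labels argument is unused
        plot_rocs([train_ROC], ['train'], title='Train', axis=ax1)
        plot_rocs([valid_ROC], ['valid'], title='Valid', axis=ax2)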

Test

yhat_train=torch.rand(64,1)
yhat_valid=torch.rand(64,1)
y_train = torch.randint(2, yhat_train.shape)
y_valid = torch.randint(2, yhat_valid.shape)
yhat_train_mt = torch.rand(64,4)
yhat_valid_mt = torch.rand(64,4)
y_train_mt = torch.randint(2, yhat_train_mt.shape)
y_valid_mt = torch.randint(2, yhat_valid_mt.shape)
train_mt_rocs = MultiLabelROC(y_train_mt, yhat_train_mt, LABELS)
valid_mt_rocs = MultiLabelROC(y_valid_mt, yhat_valid_mt, LABELS)
train_roc = ROC(y_train, yhat_train)
valid_roc = ROC(y_valid, yhat_valid)
plot_train_valid_rocs(train_mt_rocs.ROCs, valid_mt_rocs.ROCs, LABELS, multilabel=True)
plot_train_valid_rocs(train_roc, valid_roc, LABELS, multilabel=False) #LABELS ignored
plot_rocs(train_mt_rocs.ROCs, LABELS, title='Train Multi-task', multilabel=True)
plot_rocs([train_roc, valid_roc], ['train','valid'], title='Train, Valid - Single Task')
plot_rocs([train_roc], ['train'], title='Train - Single Task')
train_roc.plot('Train - single Task', 'testing call from class')
train_mt_rocs.plot('Train - MultiLabel from class')

AUROC Score & Confidence Interval

AUROC Score

auroc_score[source]

auroc_score(y, yhat, average=None)

Return the scikit-learn AUROC score
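
Per the docstring this is a thin wrapper around scikit-learn; a sketch under that assumption:

def auroc_score(y, yhat, average=None):
    # average=None gives one score per label; 'macro'/'micro'/'weighted' aggregate
    return skl_metrics.roc_auc_score(y, yhat, average=average)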

yhat_mt = torch.rand(64,4)
y_mt = torch.randint(2,size=(64,4))
# y_mt, yhat_mt
print("average=weighted: ",skl_metrics.roc_auc_score(y_mt, yhat_mt, average='weighted'))
print("average=macro: ",skl_metrics.roc_auc_score(y_mt, yhat_mt))
print("average=micro: ",skl_metrics.roc_auc_score(y_mt, yhat_mt, average='micro'))
print("average=none: ",skl_metrics.roc_auc_score(y_mt, yhat_mt, average=None))
print("acc: ", accuracy(y_mt, yhat_mt))
print("null acc: ", null_accuracy(y_mt))
average=weighted:  0.4879348513598988
average=macro:  0.4870098039215687
average=micro:  0.4809995112414467
average=none:  [0.55882353 0.51568627 0.53137255 0.34215686]
acc:  tensor(0.4727)
null acc:  tensor(0.5156)
aurocs = []
for label in range(y_mt.shape[1]):
    aurocs.append(skl_metrics.roc_auc_score(y_mt[:,label], yhat_mt[:,label]))
print("aurocs: ", aurocs)
print("np.mean of aurocs: ", np.mean(aurocs))
aurocs:  [0.5588235294117647, 0.5156862745098039, 0.5313725490196078, 0.3421568627450981]
np.mean of aurocs:  0.4870098039215687

So ...

  • average=None ==> the same as scoring each label's column individually
  • average='macro' (the default) ==> the mean of those individual scores
skl_metrics.roc_auc_score(y_mt, yhat_mt, average=None) == aurocs # None
skl_metrics.roc_auc_score(y_mt, yhat_mt) == np.mean(aurocs) #macro
True

Confidence Interval

  • Based on this explanation
  • This is the bootstrapping method, not the DeLong method

auroc_ci[source]

auroc_ci(y, yhat)

Returns the 95% confidence interval for the AUROC score
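
A minimal bootstrap sketch consistent with the description above; the resample count, seed, and percentile indexing are assumptions taken from the common recipe, so the exact interval may differ from the source (rounding to 3 decimals matches the test output below):

def auroc_ci(y, yhat, n_bootstraps=1000, seed=42):
    rng = np.random.RandomState(seed)
    scores = []
    for _ in range(n_bootstraps):
        # Resample prediction/label pairs with replacement
        idx = rng.randint(0, len(yhat), len(yhat))
        if len(np.unique(y[idx])) < 2:
            continue  # AUROC is undefined when only one class is present
        scores.append(skl_metrics.roc_auc_score(y[idx], yhat[idx]))
    scores = np.sort(scores)
    # 95% CI from the 2.5th and 97.5th percentiles of the bootstrap distribution
    lower = scores[int(0.025 * len(scores))]
    upper = scores[int(0.975 * len(scores))]
    return round(float(lower), 3), round(float(upper), 3)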

y_pred = np.array([0.21, 0.32, 0.63, 0.35, 0.92, 0.79, 0.82, 0.99, 0.04])
y_true = np.array([0,    1,    0,    0,    1,    1,    0,    1,    0   ])
auroc_score(y_true, y_pred), auroc_ci(y_true, y_pred)
(0.8, (0.389, 1.0))
auroc_score(y_mt, yhat_mt)
array([0.55882353, 0.51568627, 0.53137255, 0.34215686])