Classes and functions for calculating and plotting metrics.
First, define some single-task (binary) test data
# Binary targets and matching predicted probabilities, each shape (5, 1).
y = torch.tensor([[0], [1], [0], [1], [0]])
yhat = torch.tensor([[0.1], [0.8], [0.49], [0.53], [0.6]])
# Display the dtypes (integer targets, floating-point predictions).
y.dtype, yhat.dtype
And some multi-label (multi-task) test data
# Multi-label (multi-task) test data: 10 samples x 4 tasks.
yhat_mt = torch.rand(10,4)  # predicted probabilities in [0, 1)
yhat_mt
# Random binary targets with the same shape as the predictions.
y_mt = torch.randint(2, yhat_mt.shape)  # values in {0, 1}
y_mt
# Compare model accuracy against the null (baseline) accuracy for both
# the single-task and multi-task fixtures.
# NOTE(review): accuracy/null_accuracy are defined elsewhere in this module.
accuracy(y, yhat), null_accuracy(y)
accuracy(y_mt, yhat_mt), null_accuracy(y_mt)
# Sanity-check the fixture shapes.
y.shape, yhat.shape, y_mt.shape, yhat_mt.shape
- To use at inference time as the probability cutoff for yhat
- From these explanations
- Not implementing Youden's J-score, but that's another option too - also in the above link
Test
# Exercise the ROC class on the single-task fixture: FPR/TPR arrays,
# thresholds, the optimal cutoff, and the area under the curve.
test_roc = ROC(y, yhat)
test_roc.fpr, test_roc.tpr, test_roc.thresholds, test_roc.optimal_thresh(), test_roc.auroc
Test
# LABELS is the list of task names defined elsewhere in this module.
LABELS
y_mt.shape, yhat_mt.shape
# Per-label ROC objects for the multi-task fixture, keyed by label name.
mt_rocs = MultiLabelROC(y_mt, yhat_mt, LABELS)
# Inspect two of the per-label ROC curves.
mt_rocs.ROCs['diabetes'].fpr, mt_rocs.ROCs['diabetes'].tpr, mt_rocs.ROCs['diabetes'].thresholds, \
mt_rocs.ROCs['diabetes'].optimal_thresh(), mt_rocs.ROCs['diabetes'].auroc
mt_rocs.ROCs['stroke'].fpr, mt_rocs.ROCs['stroke'].tpr, mt_rocs.ROCs['stroke'].thresholds, \
mt_rocs.ROCs['stroke'].optimal_thresh(), mt_rocs.ROCs['stroke'].auroc
Test
# Synthetic train/valid fixtures for the plotting helpers: 64 samples,
# single-task (1 column) and multi-task (4 columns).
yhat_train=torch.rand(64,1)
yhat_valid=torch.rand(64,1)
y_train = torch.randint(2, yhat_train.shape)
y_valid = torch.randint(2, yhat_valid.shape)
yhat_train_mt = torch.rand(64,4)
yhat_valid_mt = torch.rand(64,4)
y_train_mt = torch.randint(2, yhat_train_mt.shape)
y_valid_mt = torch.randint(2, yhat_valid_mt.shape)
# Build ROC objects for each split.
train_mt_rocs = MultiLabelROC(y_train_mt, yhat_train_mt, LABELS)
valid_mt_rocs = MultiLabelROC(y_valid_mt, yhat_valid_mt, LABELS)
train_roc = ROC(y_train, yhat_train)
valid_roc = ROC(y_valid, yhat_valid)
# Exercise the plotting helpers in multi-label and single-task modes.
plot_train_valid_rocs(train_mt_rocs.ROCs, valid_mt_rocs.ROCs, LABELS, multilabel=True)
plot_train_valid_rocs(train_roc, valid_roc, LABELS, multilabel=False) #LABELS ignored
plot_rocs(train_mt_rocs.ROCs, LABELS, title='Train Multi-task', multilabel=True)
plot_rocs([train_roc, valid_roc], ['train','valid'], title='Train, Valid - Single Task')
plot_rocs([train_roc], ['train'], title='Train - Single Task')
# Same plots driven through the class plot() methods.
train_roc.plot('Train - single Task', 'testing call from class')
train_mt_rocs.plot('Train - MultiLabel from class')
# Fresh random multi-task fixtures for comparing our metrics against
# sklearn's roc_auc_score averaging modes.
yhat_mt = torch.rand(64,4)
y_mt = torch.randint(2,size=(64,4))
# y_mt, yhat_mt
print("average=weighted: ",skl_metrics.roc_auc_score(y_mt, yhat_mt, average='weighted'))
print("average=macro: ",skl_metrics.roc_auc_score(y_mt, yhat_mt))  # 'macro' is the default
print("average=micro: ",skl_metrics.roc_auc_score(y_mt, yhat_mt, average='micro'))
print("average=none: ",skl_metrics.roc_auc_score(y_mt, yhat_mt, average=None))
print("acc: ", accuracy(y_mt, yhat_mt))
print("null acc: ", null_accuracy(y_mt))
# Per-label AUROCs computed one column at a time; their mean should match
# average='macro' above. (Comprehension replaces the manual append loop,
# whose body had lost its indentation.)
aurocs = [
    skl_metrics.roc_auc_score(y_mt[:, label], yhat_mt[:, label])
    for label in range(y_mt.shape[1])
]
print("aurocs: ", aurocs)
print("np.mean of aurocs: ", np.mean(aurocs))
So ...
average=None ==> same as calling roc_auc_score on each label individually
average=macro ==> mean of the individual per-label calls
# Sanity checks: average=None returns the per-label scores, and the
# default (macro) equals their mean.
# NOTE(review): the macro check uses exact float equality against
# np.mean — it can report False purely from rounding; math.isclose
# would be more robust.
skl_metrics.roc_auc_score(y_mt, yhat_mt, average=None) == aurocs # None
skl_metrics.roc_auc_score(y_mt, yhat_mt) == np.mean(aurocs) #macro
Confidence Interval
- Based on this explanation
- This is the bootstrapping method, not the DeLong method
# Small hand-made example for the bootstrap confidence interval.
y_pred = np.array([0.21, 0.32, 0.63, 0.35, 0.92, 0.79, 0.82, 0.99, 0.04])
y_true = np.array([0, 1, 0, 0, 1, 1, 0, 1, 0 ])
# Point estimate plus bootstrap CI.
# NOTE(review): auroc_score/auroc_ci are defined elsewhere in this module.
auroc_score(y_true, y_pred), auroc_ci(y_true, y_pred)
# Multi-task variant on the random fixtures above.
auroc_score(y_mt, yhat_mt)