Classes and helper functions for all Deep Learning models used in this library.
DEVICE
This is the `RNNDropout` from fast.ai, renamed here as `InputDropout`.
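For reference, a minimal sketch of the two helpers, following fast.ai's implementation (the version in this library may differ in details): `dropout_mask` draws a Bernoulli mask scaled by $1/(1-p)$ so the expected activation is unchanged, and the module applies one mask per sample, broadcast across all time steps.

import torch
from torch import nn

def dropout_mask(x, sz, p):
    # Bernoulli mask of shape `sz`, scaled by 1/(1-p) so E[x * mask] == E[x]
    return x.new_empty(*sz).bernoulli_(1 - p).div_(1 - p)

class InputDropout(nn.Module):
    # fast.ai's RNNDropout: one mask per sample, shared across the seq_len axis
    def __init__(self, p=0.5):
        super().__init__()
        self.p = p
    def forward(self, x):
        if not self.training or self.p == 0.: return x
        return x * dropout_mask(x.data, (x.size(0), 1, x.size(2)), self.p)

Sharing one mask across the sequence axis is why the demo below builds a mask of shape (2,1,5) for an input of shape (2,3,5).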
x = torch.randn(2,3,5)  # bs=2, seq_len(bptt)=3, emb_width=5
mask = dropout_mask(x, (2,1,5), 0.75)
x, mask, x*mask
mask.std(), (x*mask).std(), x.std()
mask.mean(), (x*mask).mean(), x.mean()
dp = InputDropout(0.3)
tst_input = torch.randn(2,3,5)
tst_input, dp(tst_input)
tst_input.std(), dp(tst_input).std()
out, m = create_linear_layers(100, 4, bn=True)
m, out
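`create_linear_layers` is a library helper; what follows is only a plausible sketch, not its actual implementation (assumptions: it stacks `Linear` (+ `BatchNorm1d` when `bn=True`) + `ReLU` blocks that halve the width each layer, and returns the final width together with the layers so callers can size the next stage):

def create_linear_layers(in_ftrs, num_layers, bn=False):
    # hypothetical sketch - the real helper's semantics may differ
    layers, n = [], in_ftrs
    for _ in range(num_layers):
        layers.append(nn.Linear(n, n // 2))
        if bn: layers.append(nn.BatchNorm1d(n // 2))
        layers.append(nn.ReLU())
        n //= 2
    return n, nn.Sequential(*layers)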
Load Data
SYNTHEA_DATAGEN_DATES['1K']
preprocess_ehr_dataset(PATH_1K, SYNTHEA_DATAGEN_DATES['1K'], conditions_dict=CONDITIONS, age_start=240, age_stop=360, age_in_months=True)
CONDITIONS.keys()
labels = ['diabetes', 'stroke', 'alzheimers', 'coronary_heart', 'breast_cancer', 'epilepsy']
ehr_1K_data = EHRData(PATH_1K, labels, age_start=240, age_stop=360, age_in_months=True)
demograph_dims, rec_dims, demograph_dims_wd, rec_dims_wd = get_all_emb_dims(EhrVocabList.load(PATH_1K))
train_dl, valid_dl, train_pos_wts, valid_pos_wts = ehr_1K_data.get_data()
ehr_1K_data.splits.train
Inspect a single patient
pt = ehr_1K_data.splits.train[3]
pt.obs_nums, pt.obs_offsts
len(pt.img_nums), len(pt.img_offsts)
len(pt.obs_nums), len(pt.obs_offsts)
pt
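The `*_nums` / `*_offsts` pairs look like the flat-indices-plus-offsets format consumed by `nn.EmbeddingBag` (an assumption based on the names): all codes are flattened into one index tensor, and the offsets mark where each bag starts. A minimal illustration:

# hypothetical toy example: 3 bags with 2, 1 and 3 codes respectively
nums   = torch.tensor([4, 9, 2, 7, 7, 5])
offsts = torch.tensor([0, 2, 3])
emb_bag = nn.EmbeddingBag(num_embeddings=10, embedding_dim=5, mode='mean')
emb_bag(nums, offsts).shape  # torch.Size([3, 5]) - one pooled embedding per bag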
len(train_dl), len(valid_dl)
train_pos_wts, valid_pos_wts
demograph_dims, demograph_dims_wd
rec_dims, rec_dims_wd
Inspect Model
model = EHR_LSTM(demograph_dims, rec_dims, demograph_dims_wd, rec_dims_wd, num_labels=len(labels))
model
for name, param in model.named_parameters():
    print(name)
    # print(f'{name}::\n{param}')
train_loss_fn, valid_loss_fn = get_loss_fn(train_pos_wts), get_loss_fn(valid_pos_wts)
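For context, `get_loss_fn` presumably wraps `nn.BCEWithLogitsLoss` with per-label `pos_weight` (an assumption inferred from the positive-class weights being passed in; the library's actual implementation may differ):

def get_loss_fn(pos_wts):
    # assumed sketch: multi-label BCE where pos_weight up-weights each label's positives
    return nn.BCEWithLogitsLoss(pos_weight=torch.as_tensor(pos_wts))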
model = EHR_LSTM(demograph_dims, rec_dims, demograph_dims_wd, rec_dims_wd, len(labels)).to(DEVICE) #put on GPU before instantiating optim
optimizer = torch.optim.Adagrad(model.parameters())
h = RunHistory(labels)
%time h = fit(2, h, model, train_loss_fn, valid_loss_fn, optimizer, auroc_score, \
train_dl, valid_dl, to_chkpt_path=MODEL_STORE, from_chkpt_path=None, verbosity=1)
%time h = fit(3, h, model, train_loss_fn, valid_loss_fn, optimizer, auroc_score, \
train_dl, valid_dl, to_chkpt_path=MODEL_STORE, from_chkpt_path=MODEL_STORE, verbosity=1)
test_dl, test_pos_wts = ehr_1K_data.get_test_data()
len(test_dl), test_pos_wts
test_loss_fn = get_loss_fn(test_pos_wts)
h = predict(h, model, test_loss_fn, auroc_score, test_dl, chkpt_path=MODEL_STORE)
plot_fit_results(h, labels)
h = summarize_prediction(h, labels)
h.prediction_summary
Important: Make sure to only use labels that have at least one `True` value in each split, i.e. `y_true` must contain both classes.
Otherwise the AUROC score cannot be computed, and fit fails with: `ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.`
ehr_1K_data.load_splits()
ehr_1K_data.splits.get_label_counts(list(CONDITIONS.keys()))
In this small 1K dataset, 'lung_cancer' and 'rheumatoid_arthritis' have a single class in some splits, as seen in the prevalence counts above, and would trigger this failure when `fit` is run.
In large datasets the chance of this is very low, but it is something to watch out for; a guard sketch follows.
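If needed, a guard along these lines can be run before computing the metric (a hedged sketch, not library code; assumes `y_true` and `y_score` are `(n_samples, n_labels)` arrays):

import numpy as np
from sklearn.metrics import roc_auc_score

def safe_auroc(y_true, y_score, labels):
    # compute per-label AUROC, skipping labels with only one class in y_true
    scores = {}
    for i, label in enumerate(labels):
        if len(np.unique(y_true[:, i])) < 2:
            print(f"skipping '{label}': only one class present in y_true")
            continue
        scores[label] = roc_auc_score(y_true[:, i], y_score[:, i])
    return scores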
Based on the Deepr paper:
Nguyen et al., *Deepr: A Convolutional Net for Medical Records*
Sizes (+ Conv Arithmetic)
bs = 64  # batch size (number of patients)
wd = 85  # rec_emb_width (concatenated record embeddings for one time step)
ht = 240 # months of pt data, i.e. 20 years (seq_len or bptt in the LSTM)
Each pt is a 240 by 85 matrix
- 240 months (20 years) on axis 0 (height)
- 85 codes on axis 1 (width)
tst_pts = torch.randn(bs,ht,wd)
tst_pts.shape
But `nn.Conv2d` expects 4-D tensors:
- Input :: $(N, C_{in}, H_{in}, W_{in})$
- Output :: $(N, C_{out}, H_{out}, W_{out})$
So we need to reshape to insert $C_{in}$ (which is 1 here) after the batch dimension.
tst_pts = tst_pts.reshape(bs,1,ht,wd)
tst_pts.shape
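For reference, the standard `Conv2d` output-size formula (dilation 1), so the strided layers below can be checked by hand:

$$H_{out} = \left\lfloor \frac{H_{in} + 2 \cdot \text{padding} - \text{kernel\_size}}{\text{stride}} \right\rfloor + 1$$

e.g. for the first stride-2 layer below: $\lfloor (240 + 2 \cdot 1 - 3)/2 \rfloor + 1 = 120$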
m = nn.Sequential(
nn.Conv2d(in_channels=1, out_channels=2, kernel_size=(5,5), padding=2), nn.ReLU(),
nn.Conv2d(2,4,kernel_size=(3,3), padding=1), nn.ReLU(),
nn.Conv2d(4,8,kernel_size=(3,3), stride=2, padding=1), nn.ReLU(),
nn.Conv2d(8,16,kernel_size=(3,3), stride=2, padding=1), nn.ReLU(),
nn.Conv2d(16,32,kernel_size=(3,3), stride=2, padding=1), nn.ReLU(),
nn.AdaptiveMaxPool2d((4,4)),
nn.Flatten()
)
out = m(tst_pts)
out.shape
- AdaptiveMaxPool2d ensures the output before Flatten is bs x 32 x 4 x 4
- And thus Flatten will always produce bs x 512
- So 512 can be used safely, no matter the size of the input (which will change based on the vocab dims); the shape trace below confirms this
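To verify the arithmetic, the intermediate shapes can be traced layer by layer (a quick sanity check, not library code):

x = tst_pts
for layer in m:
    x = layer(x)
    print(f'{layer.__class__.__name__:18} -> {tuple(x.shape)}')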
for name, param in m.named_parameters():
print(name)
Based on the performance tuning recommendation, this time the entire dataset is loaded onto the GPU up front (`lazy_load_gpu=False`).
Load Data + Inspect
ehr_1K_data = EHRData(PATH_1K, labels, age_start=240, age_stop=360, age_in_months=True, lazy_load_gpu=False) #entire dataset on GPU
demograph_dims, rec_dims, demograph_dims_wd, rec_dims_wd = get_all_emb_dims(EhrVocabList.load(PATH_1K))
train_dl, valid_dl, train_pos_wts, valid_pos_wts = ehr_1K_data.get_data()
Inspect Model
model = EHR_CNN(demograph_dims, rec_dims, demograph_dims_wd, rec_dims_wd, len(labels))
model
for name, param in model.named_parameters():
    print(name)
    # print(f'{name}::\n{param}')
train_loss_fn, valid_loss_fn = get_loss_fn(train_pos_wts), get_loss_fn(valid_pos_wts)
model = EHR_CNN(demograph_dims, rec_dims, demograph_dims_wd, rec_dims_wd, len(labels)).to(DEVICE)
optimizer = torch.optim.Adagrad(model.parameters())
h = RunHistory(labels)
%time h = fit(3, h, model, train_loss_fn, valid_loss_fn, optimizer, auroc_score, \
train_dl, valid_dl, to_chkpt_path=MODEL_STORE, from_chkpt_path=None, verbosity=1)
%time h = fit(2, h, model, train_loss_fn, valid_loss_fn, optimizer, auroc_score, \
train_dl, valid_dl, to_chkpt_path=MODEL_STORE, from_chkpt_path=MODEL_STORE, verbosity=1)
test_dl, test_pos_wts = ehr_1K_data.get_test_data()
len(test_dl), test_pos_wts
test_loss_fn = get_loss_fn(test_pos_wts)
h = predict(h, model, test_loss_fn, auroc_score, test_dl, chkpt_path=MODEL_STORE)
plot_fit_results(h, labels)
h = summarize_prediction(h, labels)
h.prediction_summary