Load Cleaned Data
PATH_1K, PATH_10K
CONDITIONS
train_dfs, valid_dfs, test_dfs = load_cleaned_ehrdata(PATH_1K)
all_dfs = train_dfs
patients_df, patient_demographics_df, all_rec_dfs = all_dfs[0], all_dfs[1], all_dfs[2:]
patients_df.head()
vals = all_dfs[0].iloc[1].values
vals
vals[0], vals[1], vals[2], vals[4], vals[6], vals[8]
Load Vocabs
vocab_list_1K = EhrVocabList.load(PATH_1K)
obs_vocab, alg_vocab, crpl_vocab, med_vocab, img_vocab, proc_vocab, cnd_vocab, imm_vocab = vocab_list_1K.records_vocabs
bday, bmonth, byear, marital, race, ethnicity, gender, birthplace, city, state, zipcode = vocab_list_1K.demographics_vocabs
age_mean, age_std = vocab_list_1K.age_mean, vocab_list_1K.age_std
demographics_dims, recs_dims, demographics_dims_width, recs_dims_width = get_all_emb_dims(EhrVocabList.load(PATH_1K))
demographics_dims
recs_dims
demographics_dims_width, recs_dims_width
tst_ptid='18794a7c-c7fa-47d6-8387-9fc133c4e1e3'
tst_pt = patients_df[patients_df.patient == tst_ptid]
tst_pt
demograph_vector = patient_demographics_df.loc[tst_ptid]
demograph_vector
tst_pt_birthdate = tst_pt.birthdate.values[0]
tst_pt_birthdate
# tst_pt.diabetes.values[0], tst_pt.stroke.values[0], tst_pt.alzheimers.values[0], tst_pt.coronary_heart.values[0], tst_pt.lung_cancer.values[0]
patients_df.columns[2:]
cnds=[]
for col in (patients_df.columns[2:]):
    if '_age' not in col:
        cnds.append(col)
cnds
tst_pt_conditions = {}
for cnd in cnds:
    tst_pt_conditions[cnd] = tst_pt[cnd].values[0]
tst_pt_conditions
- Filter out `rec_dfs[[ptid]]` + `demographics[[ptid]]` for this patient
- The dfs are already filtered by cutoff age - this is done in `PatientList`, so this function is just for testing, i.e. it will never be called in the pipeline
- Use double `[[ptid]]` so that, even when single values are returned, we get a DataFrame
def get_rec_dfs(all_rec_dfs, ptid):
    '''Get all dfs for this patient, this is being done in `PatientList` so this is for testing only'''
    rec_dfs = []
    for rec_df in all_rec_dfs:
        try:
            rec_dfs.append(rec_df.loc[[ptid]])
        except KeyError:
            rec_dfs.append(pd.DataFrame())
    return rec_dfs
%time rec_dfs = get_rec_dfs(all_rec_dfs, tst_ptid)
`rec_dfs` contains, in order: observations, allergies, careplans, medications, imaging_studies, procedures, conditions, immunizations
The following are empty for this `ptid` (to check and confirm, uncomment the following code snippet and run it):
- allergies (index 1)
- imaging_studies (index 4)
# for name, rec_df in zip(['observations','allergies','careplans','medications','imaging_studies','procedures','conditions','immunizations'], rec_dfs):
#     print(f'{name}: {rec_df.shape}')
#     display(rec_df.tail())
p_obs, p_alg, p_crpl, p_med, p_img, p_proc, p_cnd, p_immn = rec_dfs
- For empty rec dfs like `alg` seen above, we have to indicate that nothing was recorded for this particular record type (say allergies)
    - So the codes list will be `xxnone`s of length `age_span`
    - For example, for `age_start`=10 and `age_stop`=35, we will get 25 `xxnone`s
- For non-empty rec dfs
    - Collate codes by year or month (depending on the `age_in_months` value)
    - Compute the respective offsets
For using age in days or hours (for example, in the case of hospitalization or ICU datasets)
- This function will need to be modified (in addition to `insert_age()` in `preprocessing.clean`)
- For example, for age in days, `insert_age()` will insert a column in each data frame with the age in days, which can then be used by this function to filter for the right `age_span`
def collate_codes_offsts(rec_df, age_start, age_stop, age_in_months=False):
    """Return a single patient's EmbeddingBag lookup codes and offsets for the given age span and age units"""
    codes = []
    offsts = [0]
    age_span = age_stop - age_start
    if rec_df.empty:
        codes = ['xxnone'] * age_span
        offsts = list(range(age_span))
    else:
        for i in range(age_start, age_stop, 1):
            if age_in_months: res = (rec_df.code[rec_df.age_months == i]).values
            else:             res = (rec_df.code[rec_df.age == i]).values
            if len(res) > 0:
                codes.extend(res)
                if i < age_stop - 1: offsts.append(offsts[-1] + len(res))
            else:
                codes.append('xxnone')
                if i < age_stop - 1: offsts.append(offsts[-1] + 1)
    assert len(offsts) == age_span
    return codes, offsts
Tests
Collate EHR codes for our test patient, from age 410 months to 420 months
%time all_codes_offsts = [collate_codes_offsts(df, age_start=410, age_stop=420, age_in_months=True) for df in rec_dfs]
Observation codes for the above `age_span` are obtained as below:
obs_codes = all_codes_offsts[0][0]
obs_codes
and the corresponding offsets (to be used for EmbeddingBag lookups) are obtained as below:
obs_offsts = all_codes_offsts[0][1]
obs_offsts
len(obs_codes), len(obs_offsts)
Note that no matter how many observations are recorded, the number of offsets will always be equal to the `age_span`.
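To make the codes/offsets pairing concrete, here is a minimal, self-contained sketch (not part of the library) of how numericalized codes and their offsets drive a `torch.nn.EmbeddingBag` lookup, producing one pooled embedding per age step; the tiny vocab size and index values below are made up purely for illustration.
import torch
import torch.nn as nn

emb_bag = nn.EmbeddingBag(num_embeddings=5, embedding_dim=3, mode='mean')  # hypothetical tiny vocab, index 0 = 'xxnone'

# numericalized codes for an age_span of 4 steps:
# step 0 has 2 codes, step 1 has 1, step 2 has nothing recorded ('xxnone' -> 0), step 3 has 3 codes
codes  = torch.tensor([2, 4, 1, 0, 3, 3, 2])
offsts = torch.tensor([0, 2, 3, 4])  # one offset per age step == age_span

out = emb_bag(codes, offsts)
out.shape  # torch.Size([4, 3]) -- one pooled embedding per age step, regardless of how many codes each step has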
Collate EHR codes for our test patient, from 10 to 30 years
%time all_codes_offsts = [collate_codes_offsts(df, age_start=10, age_stop=30) for df in rec_dfs]
obs_codes = all_codes_offsts[0][0]
obs_codes_num = obs_vocab.numericalize(all_codes_offsts[0][0]) #codes numericalized
obs_offsts = all_codes_offsts[0][1] #offsets
obs_codes[10:20]
obs_codes_num[10:20]
len(obs_codes), len(obs_codes_num), len(obs_offsts)
assert len(obs_codes) == len(obs_codes_num)
obs_vocab.textify(obs_codes_num[10:20])
proc_codes = all_codes_offsts[5][0]
proc_codes_num,proc_offsts = proc_vocab.numericalize(all_codes_offsts[5][0]), all_codes_offsts[5][1]
assert len(proc_codes) == len(proc_codes_num)
len(proc_codes), len(proc_codes_num), len(proc_offsts)
Putting all this into a function
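The exported `get_codenums_offsts` is defined in the library source; as a rough sketch of what the tests below assume it does (pair each record df with its vocab, collate, then numericalize), it could look like the following - the `_sketch` suffix marks this as illustrative rather than the actual implementation.
def get_codenums_offsts_sketch(rec_dfs, records_vocabs, age_start, age_stop, age_in_months=False):
    '''Illustrative sketch only: collate each record df and numericalize its codes with the matching vocab'''
    codenums, offsts = [], []
    for rec_df, vocab in zip(rec_dfs, records_vocabs):
        codes, offst = collate_codes_offsts(rec_df, age_start, age_stop, age_in_months)
        codenums.append(vocab.numericalize(codes))  # EmbeddingBag lookup codes, numericalized
        offsts.append(offst)                        # offsets, one per age step
    return codenums, offsts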
Test - Codes & Offsets
%time codenums, offsts = get_codenums_offsts(rec_dfs, vocab_list_1K.records_vocabs, age_start=10, age_stop=30, age_in_months=False)
med_num, med_o = codenums[3],offsts[3]
med_codes, med_offsts = collate_codes_offsts(p_med, age_start=10, age_stop=30)
assert len(med_num) == len(med_codes)
assert med_o == med_offsts
med_vocab.textify(med_num)[2]
med_codes
med_offsts, med_o
len(med_codes),len(med_offsts)
alg_codes, alg_offsts = collate_codes_offsts(p_alg, age_start=10, age_stop=30)
alg_codes
alg_offsts
obs_num, obs_o = codenums[0],offsts[0]
obs_codes, obs_offsts = collate_codes_offsts(p_obs, age_start=10, age_stop=30)
len(obs_codes), len(obs_offsts), len(obs_num), len(obs_o)
assert len(obs_codes) == len(obs_num)
assert obs_o == obs_offsts
%time codenums1, offsts1 = get_codenums_offsts(rec_dfs, vocab_list_1K.records_vocabs, age_start=220, age_stop=420, age_in_months=True)
for codenum, offst in zip(codenums1, offsts1):
    print(len(codenum))
    assert len(offst) == 200
Test - Demographics
demograph_vector
get_demographics(demograph_vector, vocab_list_1K.demographics_vocabs, vocab_list_1K.age_mean, vocab_list_1K.age_std)
dem_vector = ['1988-05-16','S',None,None,'F','North Adams','Marlborough','Massachusetts',1901,11141]
dem_vector = pd.Series(dem_vector)
dem_vector
get_demographics(dem_vector, vocab_list_1K.demographics_vocabs, vocab_list_1K.age_mean, vocab_list_1K.age_std)
get_demographics(patient_demographics_df.loc['f5dcd418-09fe-4a2f-baa0-3da800bd8c3a'], vocab_list_1K.demographics_vocabs, \
vocab_list_1K.age_mean, vocab_list_1K.age_std) #has null
- Based on the concept of `ItemBase` as used in fastai v1.x; in our case, a single patient
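As a rough sketch of the shape of such an item (not the library's actual `Patient` class), and assuming `get_demographics` returns both the numericalized demographics vector and the normalized current age:
class PatientSketch:
    '''Illustrative sketch only: holds one patient's numericalized codes, offsets, demographics and condition labels'''
    def __init__(self, codenums, offsts, demographics, age_now, conditions, ptid, birthdate):
        self.obs_nums, self.obs_offsts = codenums[0], offsts[0]  # observations; the other record types are analogous
        self.demographics, self.age_now = demographics, age_now
        self.conditions, self.ptid, self.birthdate = conditions, ptid, birthdate

    @classmethod
    def create(cls, rec_dfs, demograph_vector, vocab_list, ptid, birthdate, conditions,
               age_start, age_stop, age_in_months=False):
        codenums, offsts = get_codenums_offsts(rec_dfs, vocab_list.records_vocabs,
                                               age_start, age_stop, age_in_months)
        # assumption: get_demographics returns (demographics tensor, normalized current age)
        demographics, age_now = get_demographics(demograph_vector, vocab_list.demographics_vocabs,
                                                 vocab_list.age_mean, vocab_list.age_std)
        return cls(codenums, offsts, demographics, age_now, conditions, ptid, birthdate)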
Tests
%time p1 = Patient.create(rec_dfs, demograph_vector, vocab_list_1K, tst_ptid, tst_pt_birthdate, tst_pt_conditions, age_start=10, age_stop=30, age_in_months=False)
len(p1.obs_nums)
assert len(p1.obs_nums) == len(obs_codes)
p1.obs_offsts
p1.conditions['diabetes'], p1.ptid, p1.birthdate
p1
p1.demographics
p1.age_now
p1.demographics.shape, p1.age_now.shape
torch.cat((p1.demographics, p1.age_now.type(torch.LongTensor)), dim=0)
p1.to_gpu()
p1.conditions
p1.demographics, p1.age_now
- Based on the concept of `ItemList` as used in fastai v1.x, which is a list of `ItemBase` objects
- In our case, `PatientList` is a list of `Patient` objects
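A minimal sketch of this list-like container idea (not the library's `PatientList`; the pickle file name is hypothetical):
import pickle

class PatientListSketch:
    '''Illustrative sketch only: a thin list-like container of Patient objects, persisted with pickle'''
    def __init__(self, items): self.items = items
    def __len__(self):         return len(self.items)
    def __getitem__(self, i):  return self.items[i]

    def save(self, pckl_dir):
        pckl_dir.mkdir(parents=True, exist_ok=True)
        with open(pckl_dir/'patients.pkl', 'wb') as f:  # hypothetical file name; pckl_dir is a pathlib.Path
            pickle.dump(self.items, f)

    @classmethod
    def load_from(cls, pckl_dir):
        with open(pckl_dir/'patients.pkl', 'rb') as f:  # hypothetical file name
            return cls(pickle.load(f))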
Multiprocessing Implementation
- Chunk the total number of patients based on the number of cores available on the machine
- Send each chunk of patients to a core
- Let the parallelized subprocess on each core load all the data and do the heavy lifting
- The main process just sends a list of indices (patients) to work on - see the sketch below
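The following is an illustrative sketch of this chunking strategy using `multiprocessing.Pool` (not the library's actual implementation; `_build_chunk` is a hypothetical placeholder worker that would call `get_rec_dfs()` and `Patient.create()` for each patient id in its chunk).
from multiprocessing import Pool, cpu_count

def _build_chunk(ptid_chunk):
    '''Hypothetical worker: each subprocess loads the data it needs and builds Patients for its chunk of ids'''
    # e.g. for each ptid: rec_dfs = get_rec_dfs(all_rec_dfs, ptid); Patient.create(rec_dfs, ...)
    return [f'processed {ptid}' for ptid in ptid_chunk]  # placeholder result

def build_patients_parallel(ptids, n_workers=None):
    '''Chunk the patient ids across the available cores and collect the results'''
    n_workers = n_workers or cpu_count()
    chunk_size = max(1, len(ptids) // n_workers)
    chunks = [ptids[i:i + chunk_size] for i in range(0, len(ptids), chunk_size)]
    with Pool(n_workers) as pool:
        results = pool.map(_build_chunk, chunks)
    return [p for chunk in results for p in chunk]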
def get_pckl_dir(path, split, age_start, age_stop, age_in_months):
    '''Util function to construct pickle dir name - for persisting transformed `PatientList`s'''
    dir_name = ''
    dir_name += 'months' if age_in_months else 'years'
    dir_name += f'_{age_start}_to_{age_stop}'
    pckl_dir = Path(f'{path}/processed/{dir_name}/{split}')
    return pckl_dir
Tests
tst_pckl_dir = get_pckl_dir(PATH_1K, split='train', age_start=10, age_stop=30, age_in_months=False)
tst_pckl_dir
%time PatientList.create_save(all_dfs, vocab_list_1K, tst_pckl_dir, age_start=10, age_stop=30, age_in_months=False)
Tests
%time create_all_ptlists(PATH_1K, age_start=240, age_stop=360, age_in_months=True) #20 to 30 yrs in mos (seq_len = 120)
Other examples
%time create_all_ptlists(PATH_1K, age_start=0, age_stop=20, age_in_months=False, verbose=False)
664 total patients completed, saved patient list to datasets/synthea/1K/processed/years_0_to_20/train
222 total patients completed, saved patient list to datasets/synthea/1K/processed/years_0_to_20/valid
222 total patients completed, saved patient list to datasets/synthea/1K/processed/years_0_to_20/test
CPU times: user 3.66 s, sys: 1.98 s, total: 5.64 s
Wall time: 32.4 s
%time create_all_ptlists(PATH_1K, age_start=0, age_stop=35, age_in_months=False, verbose=False)
664 total patients completed, saved patient list to datasets/synthea/1K/processed/years_0_to_35/train
222 total patients completed, saved patient list to datasets/synthea/1K/processed/years_0_to_35/valid
222 total patients completed, saved patient list to datasets/synthea/1K/processed/years_0_to_35/test
CPU times: user 3.66 s, sys: 1.93 s, total: 5.59 s
Wall time: 40.2 s
%time create_all_ptlists(PATH_1K, age_start=120, age_stop=360, age_in_months=True) #10 to 30 yrs in mos (seq_len = 240)
664 total patients completed, saved patient list to datasets/synthea/1K/processed/months_120_to_360/train
222 total patients completed, saved patient list to datasets/synthea/1K/processed/months_120_to_360/valid
222 total patients completed, saved patient list to datasets/synthea/1K/processed/months_120_to_360/test
CPU times: user 3.84 s, sys: 1.98 s, total: 5.82 s
Wall time: 46.8 s
ptlist_train = PatientList.load(PATH_1K, 'train', age_start=240, age_stop=360, age_in_months=True)
ptlist_valid = PatientList.load(PATH_1K, 'valid', age_start=240, age_stop=360, age_in_months=True)
ptlist_test = PatientList.load(PATH_1K, 'test', age_start=240, age_stop=360, age_in_months=True)
len(ptlist_train), len(ptlist_valid), len(ptlist_test)
ptlist_train
ptlist_test
len(ptlist_train.items[300].obs_nums), len(ptlist_valid.items[200].obs_nums)
The offsets length must be the same as the `age_span` (in this case 120).
len(ptlist_train.items[300].obs_offsts), len(ptlist_valid.items[200].obs_offsts)
ptlist_train.items[300].obs_nums
ptlist_train[300].proc_nums
ptlist_train[300].conditions
cnds
counts = []
for cnd in cnds:
    train_count = [ptlist_train[i].conditions[cnd] == 1 for i in range(len(ptlist_train))].count(True)
    valid_count = [ptlist_valid[i].conditions[cnd] == 1 for i in range(len(ptlist_valid))].count(True)
    test_count = [ptlist_test[i].conditions[cnd] == 1 for i in range(len(ptlist_test))].count(True)
    total_count = train_count + valid_count + test_count
    counts.append([train_count, valid_count, test_count, total_count])
counts_df = pd.DataFrame(counts, index=cnds, columns=['train','valid','test','total'])
counts_df
CONDITIONS
preprocess_ehr_dataset(PATH_1K, SYNTHEA_DATAGEN_DATES['1K'], CONDITIONS, from_raw_data=True)