Load Cleaned Data
PATH_1K, PATH_10K
CONDITIONS
train_dfs, valid_dfs, test_dfs = load_cleaned_ehrdata(PATH_1K)
all_dfs = train_dfs
patients_df, patient_demographics_df, all_rec_dfs = all_dfs[0], all_dfs[1], all_dfs[2:]
patients_df.head()
vals = all_dfs[0].iloc[1].values
vals
vals[0], vals[1], vals[2], vals[4], vals[6], vals[8]
Load Vocabs
vocab_list_1K = EhrVocabList.load(PATH_1K)
obs_vocab, alg_vocab, crpl_vocab, med_vocab, img_vocab, proc_vocab, cnd_vocab, imm_vocab = vocab_list_1K.records_vocabs
bday, bmonth, byear, marital, race, ethnicity, gender, birthplace, city, state, zipcode = vocab_list_1K.demographics_vocabs
age_mean, age_std = vocab_list_1K.age_mean, vocab_list_1K.age_std
demographics_dims, recs_dims, demographics_dims_width, recs_dims_width = get_all_emb_dims(EhrVocabList.load(PATH_1K))
demographics_dims
recs_dims
demographics_dims_width, recs_dims_width
tst_ptid='18794a7c-c7fa-47d6-8387-9fc133c4e1e3'
tst_pt = patients_df[patients_df.patient == tst_ptid]
tst_pt
demograph_vector = patient_demographics_df.loc[tst_ptid]
demograph_vector
tst_pt_birthdate = tst_pt.birthdate.values[0]
tst_pt_birthdate
# tst_pt.diabetes.values[0], tst_pt.stroke.values[0], tst_pt.alzheimers.values[0], tst_pt.coronary_heart.values[0], tst_pt.lung_cancer.values[0]
patients_df.columns[2:]
cnds=[]
for col in (patients_df.columns[2:]):
    if '_age' not in col:
        cnds.append(col)
cnds
tst_pt_conditions = {}
for cnd in cnds:
    tst_pt_conditions[cnd] = tst_pt[cnd].values[0]
tst_pt_conditions
- Filter out `rec_dfs[[ptid]]` + `demographics[[ptid]]` for this patient
- The dfs are already filtered by cutoff age - this is done in `PatientList`, so this function is just for testing, i.e. it will never be called in the pipeline
- Use double `[[ptid]]` so that, even when single values are returned, we get a DataFrame
def get_rec_dfs(all_rec_dfs, ptid):
    '''Get all dfs for this patient, this is being done in `PatientList` so this is for testing only'''
    rec_dfs = []
    for rec_df in all_rec_dfs:
        try:
            rec_dfs.append(rec_df.loc[[ptid]])
        except KeyError:
            rec_dfs.append(pd.DataFrame())
    return rec_dfs
%time rec_dfs = get_rec_dfs(all_rec_dfs, tst_ptid)
`rec_dfs` contains, in order: observations, allergies, careplans, medications, imaging_studies, procedures, conditions, immunizations
The following are empty for this `ptid` (to check and confirm, uncomment the following code snippet and run it):
- allergies (index 1)
- imaging_studies (index 4)
# for name, rec_df in zip(['observations','allergies','careplans','medications','imaging_studies','procedures','conditions','immunizations'], rec_dfs):
#     print(f'{name}: {rec_df.shape}')
#     display(rec_df.tail())
p_obs, p_alg, p_crpl, p_med, p_img, p_proc, p_cnd, p_immn = rec_dfs
- For empty rec dfs like `alg` seen above, we have to indicate that nothing was recorded for this particular record type (say allergies)
    - So the codes list will be `xxnone`s of length `age_span`
    - For example, for `age_start`=10 and `age_stop`=35, we will get 25 `xxnone`s
- For non-empty rec dfs
    - Collate codes by year or month (depending on the `age_in_months` value)
    - Compute the respective offsets
For using age in days or hours (for example, in the case of hospitalization or ICU datasets)
- This function will need to be modified (in addition to `insert_age()` in `preprocessing.clean`)
- For example, for age in days, `insert_age()` will insert a column in each data frame with the age in days, which can then be used by this function to filter for the right `age_span`
def collate_codes_offsts(rec_df, age_start, age_stop, age_in_months=False):
    """Return a single patient's EmbeddingBag lookup codes and offsets for the given age span and age units"""
    codes = []
    offsts = [0]
    age_span = age_stop - age_start
    if rec_df.empty:
        codes = ['xxnone'] * age_span
        offsts = list(range(age_span))
    else:
        for i in range(age_start, age_stop, 1):
            if age_in_months: res = (rec_df.code[rec_df.age_months == i]).values
            else:             res = (rec_df.code[rec_df.age == i]).values
            if len(res) > 0:
                codes.extend(res)
                if i < age_stop - 1: offsts.append(offsts[-1] + len(res))
            else:
                codes.append('xxnone')
                if i < age_stop - 1: offsts.append(offsts[-1] + 1)
    assert len(offsts) == age_span
    return codes, offsts
Tests
Collate EHR codes for our test patient, from age 410 months to 420 months
%time all_codes_offsts = [collate_codes_offsts(df, age_start=410, age_stop=420, age_in_months=True) for df in rec_dfs]
Observation codes for the above `age_span` are obtained as below:
obs_codes = all_codes_offsts[0][0]
obs_codes
and the corresponding offsets (to be used for EmbeddingBag lookups) are obtained as below:
obs_offsts = all_codes_offsts[0][1]
obs_offsts
len(obs_codes), len(obs_offsts)
Note that no matter how many observations are recorded, the number of offsets will always be equal to the `age_span`.
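To make the codes/offsets pairing concrete, here is a minimal, self-contained sketch (not part of the library) of how numericalized codes and their offsets drive a `torch.nn.EmbeddingBag` lookup, producing one pooled embedding per age step; the tiny vocab size and index values below are made up purely for illustration.
import torch
import torch.nn as nn

emb_bag = nn.EmbeddingBag(num_embeddings=5, embedding_dim=3, mode='mean')  # hypothetical tiny vocab, index 0 = 'xxnone'

# numericalized codes for an age_span of 4 steps:
# step 0 has 2 codes, step 1 has 1, step 2 has nothing recorded ('xxnone' -> 0), step 3 has 3 codes
codes  = torch.tensor([2, 4, 1, 0, 3, 3, 2])
offsts = torch.tensor([0, 2, 3, 4])  # one offset per age step == age_span

out = emb_bag(codes, offsts)
out.shape  # torch.Size([4, 3]) -- one pooled embedding per age step, regardless of how many codes each step has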
Collate EHR codes for our test patient, from 10 to 30 years
%time all_codes_offsts = [collate_codes_offsts(df, age_start=10, age_stop=30) for df in rec_dfs]
obs_codes = all_codes_offsts[0][0]
obs_codes_num = obs_vocab.numericalize(all_codes_offsts[0][0]) #codes numericalized
obs_offsts = all_codes_offsts[0][1] #offsets
obs_codes[10:20]
obs_codes_num[10:20]
len(obs_codes), len(obs_codes_num), len(obs_offsts)
assert len(obs_codes) == len(obs_codes_num)
obs_vocab.textify(obs_codes_num[10:20])
proc_codes = all_codes_offsts[5][0]
proc_codes_num,proc_offsts = proc_vocab.numericalize(all_codes_offsts[5][0]), all_codes_offsts[5][1]
assert len(proc_codes) == len(proc_codes_num)
len(proc_codes), len(proc_codes_num), len(proc_offsts)
Putting all this into a function
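The exported `get_codenums_offsts` is defined in the library source; as a rough sketch of what the tests below assume it does (pair each record df with its vocab, collate, then numericalize), it could look like the following - the `_sketch` suffix marks this as illustrative rather than the actual implementation.
def get_codenums_offsts_sketch(rec_dfs, records_vocabs, age_start, age_stop, age_in_months=False):
    '''Illustrative sketch only: collate each record df and numericalize its codes with the matching vocab'''
    codenums, offsts = [], []
    for rec_df, vocab in zip(rec_dfs, records_vocabs):
        codes, offst = collate_codes_offsts(rec_df, age_start, age_stop, age_in_months)
        codenums.append(vocab.numericalize(codes))  # EmbeddingBag lookup codes, numericalized
        offsts.append(offst)                        # offsets, one per age step
    return codenums, offsts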
Test - Codes & Offsets
%time codenums, offsts = get_codenums_offsts(rec_dfs, vocab_list_1K.records_vocabs, age_start=10, age_stop=30, age_in_months=False)
med_num, med_o = codenums[3],offsts[3]
med_codes, med_offsts = collate_codes_offsts(p_med, age_start=10, age_stop=30)
assert len(med_num) == len(med_codes)
assert med_o == med_offsts
med_vocab.textify(med_num)[2]
med_codes
med_offsts, med_o
len(med_codes),len(med_offsts)
alg_codes, alg_offsts = collate_codes_offsts(p_alg, age_start=10, age_stop=30)
alg_codes
alg_offsts
obs_num, obs_o = codenums[0],offsts[0]
obs_codes, obs_offsts = collate_codes_offsts(p_obs, age_start=10, age_stop=30)
len(obs_codes), len(obs_offsts), len(obs_num), len(obs_o)
assert len(obs_codes) == len(obs_num)
assert obs_o == obs_offsts
%time codenums1, offsts1 = get_codenums_offsts(rec_dfs, vocab_list_1K.records_vocabs, age_start=220, age_stop=420, age_in_months=True)
for codenum, offst in zip(codenums1, offsts1):
    print(len(codenum))
    assert len(offst) == 200
Test - Demographics
demograph_vector
get_demographics(demograph_vector, vocab_list_1K.demographics_vocabs, vocab_list_1K.age_mean, vocab_list_1K.age_std)
dem_vector = ['1988-05-16','S',None,None,'F','North Adams','Marlborough','Massachusetts',1901,11141]
dem_vector = pd.Series(dem_vector)
dem_vector
get_demographics(dem_vector, vocab_list_1K.demographics_vocabs, vocab_list_1K.age_mean, vocab_list_1K.age_std)
get_demographics(patient_demographics_df.loc['f5dcd418-09fe-4a2f-baa0-3da800bd8c3a'], vocab_list_1K.demographics_vocabs, \
vocab_list_1K.age_mean, vocab_list_1K.age_std) #has null
- Based on the concept of `ItemBase` as used in fastai v1.x; in our case, a single patient
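As a rough sketch of the shape of such an item (not the library's actual `Patient` class), and assuming `get_demographics` returns both the numericalized demographics vector and the normalized current age:
class PatientSketch:
    '''Illustrative sketch only: holds one patient's numericalized codes, offsets, demographics and condition labels'''
    def __init__(self, codenums, offsts, demographics, age_now, conditions, ptid, birthdate):
        self.obs_nums, self.obs_offsts = codenums[0], offsts[0]  # observations; the other record types are analogous
        self.demographics, self.age_now = demographics, age_now
        self.conditions, self.ptid, self.birthdate = conditions, ptid, birthdate

    @classmethod
    def create(cls, rec_dfs, demograph_vector, vocab_list, ptid, birthdate, conditions,
               age_start, age_stop, age_in_months=False):
        codenums, offsts = get_codenums_offsts(rec_dfs, vocab_list.records_vocabs,
                                               age_start, age_stop, age_in_months)
        # assumption: get_demographics returns (demographics tensor, normalized current age)
        demographics, age_now = get_demographics(demograph_vector, vocab_list.demographics_vocabs,
                                                 vocab_list.age_mean, vocab_list.age_std)
        return cls(codenums, offsts, demographics, age_now, conditions, ptid, birthdate)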
Tests
%time p1 = Patient.create(rec_dfs, demograph_vector, vocab_list_1K, tst_ptid, tst_pt_birthdate, tst_pt_conditions, age_start=10, age_stop=30, age_in_months=False)
len(p1.obs_nums)
assert len(p1.obs_nums) == len(obs_codes)
p1.obs_offsts
p1.conditions['diabetes'], p1.ptid, p1.birthdate
p1
p1.demographics
p1.age_now
p1.demographics.shape, p1.age_now.shape
torch.cat((p1.demographics, p1.age_now.type(torch.LongTensor)), dim=0)
p1.to_gpu()
p1.conditions
p1.demographics, p1.age_now
- Based on the concept of `ItemList` as used in fastai v1.x, which is a list of `ItemBase` objects
- In our case, `PatientList` is a list of `Patient` objects
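A minimal sketch of this list-like container idea (not the library's `PatientList`; the pickle file name is hypothetical):
import pickle

class PatientListSketch:
    '''Illustrative sketch only: a thin list-like container of Patient objects, persisted with pickle'''
    def __init__(self, items): self.items = items
    def __len__(self):         return len(self.items)
    def __getitem__(self, i):  return self.items[i]

    def save(self, pckl_dir):
        pckl_dir.mkdir(parents=True, exist_ok=True)
        with open(pckl_dir/'patients.pkl', 'wb') as f:  # hypothetical file name; pckl_dir is a pathlib.Path
            pickle.dump(self.items, f)

    @classmethod
    def load_from(cls, pckl_dir):
        with open(pckl_dir/'patients.pkl', 'rb') as f:  # hypothetical file name
            return cls(pickle.load(f))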
Multiprocessing Implementation
- Chunk the total number of patients based on the number of cores available on the machine
- Send each chunk of patients to a core
- Let the parallelized subprocess on each core load all the data and do the heavy lifting
- The main process just sends a list of indices (patients) to work on - see the sketch below
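The following is an illustrative sketch of this chunking strategy using `multiprocessing.Pool` (not the library's actual implementation; `_build_chunk` is a hypothetical placeholder worker that would call `get_rec_dfs()` and `Patient.create()` for each patient id in its chunk).
from multiprocessing import Pool, cpu_count

def _build_chunk(ptid_chunk):
    '''Hypothetical worker: each subprocess loads the data it needs and builds Patients for its chunk of ids'''
    # e.g. for each ptid: rec_dfs = get_rec_dfs(all_rec_dfs, ptid); Patient.create(rec_dfs, ...)
    return [f'processed {ptid}' for ptid in ptid_chunk]  # placeholder result

def build_patients_parallel(ptids, n_workers=None):
    '''Chunk the patient ids across the available cores and collect the results'''
    n_workers = n_workers or cpu_count()
    chunk_size = max(1, len(ptids) // n_workers)
    chunks = [ptids[i:i + chunk_size] for i in range(0, len(ptids), chunk_size)]
    with Pool(n_workers) as pool:
        results = pool.map(_build_chunk, chunks)
    return [p for chunk in results for p in chunk]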
def get_pckl_dir(path, split, age_start, age_stop, age_in_months):
    '''Util function to construct pickle dir name - for persisting transformed `PatientList`s'''
    dir_name = ''
    dir_name += 'months' if age_in_months else 'years'
    dir_name += f'_{age_start}_to_{age_stop}'
    pckl_dir = Path(f'{path}/processed/{dir_name}/{split}')
    return pckl_dir
Tests
tst_pckl_dir = get_pckl_dir(PATH_1K, split='train', age_start=10, age_stop=30, age_in_months=False)
tst_pckl_dir
%time PatientList.create_save(all_dfs, vocab_list_1K, tst_pckl_dir, age_start=10, age_stop=30, age_in_months=False)
Tests
%time create_all_ptlists(PATH_1K, age_start=240, age_stop=360, age_in_months=True) #20 to 30 yrs in mos (seq_len = 120)
Other examples
%time create_all_ptlists(PATH_1K, age_start=0, age_stop=20, age_in_months=False, verbose=False)
664 total patients completed, saved patient list to datasets/synthea/1K/processed/years_0_to_20/train
222 total patients completed, saved patient list to datasets/synthea/1K/processed/years_0_to_20/valid
222 total patients completed, saved patient list to datasets/synthea/1K/processed/years_0_to_20/test
CPU times: user 3.66 s, sys: 1.98 s, total: 5.64 s
Wall time: 32.4 s
%time create_all_ptlists(PATH_1K, age_start=0, age_stop=35, age_in_months=False, verbose=False)
664 total patients completed, saved patient list to datasets/synthea/1K/processed/years_0_to_35/train
222 total patients completed, saved patient list to datasets/synthea/1K/processed/years_0_to_35/valid
222 total patients completed, saved patient list to datasets/synthea/1K/processed/years_0_to_35/test
CPU times: user 3.66 s, sys: 1.93 s, total: 5.59 s
Wall time: 40.2 s
%time create_all_ptlists(PATH_1K, age_start=120, age_stop=360, age_in_months=True) #10 to 30 yrs in mos (seq_len = 240)
664 total patients completed, saved patient list to datasets/synthea/1K/processed/months_120_to_360/train
222 total patients completed, saved patient list to datasets/synthea/1K/processed/months_120_to_360/valid
222 total patients completed, saved patient list to datasets/synthea/1K/processed/months_120_to_360/test
CPU times: user 3.84 s, sys: 1.98 s, total: 5.82 s
Wall time: 46.8 s
ptlist_train = PatientList.load(PATH_1K, 'train', age_start=240, age_stop=360, age_in_months=True)
ptlist_valid = PatientList.load(PATH_1K, 'valid', age_start=240, age_stop=360, age_in_months=True)
ptlist_test = PatientList.load(PATH_1K, 'test', age_start=240, age_stop=360, age_in_months=True)
len(ptlist_train), len(ptlist_valid), len(ptlist_test)
ptlist_train
ptlist_test
len(ptlist_train.items[300].obs_nums), len(ptlist_valid.items[200].obs_nums)
The offsets length must be the same as the `age_span` (in this case 120).
len(ptlist_train.items[300].obs_offsts), len(ptlist_valid.items[200].obs_offsts)
ptlist_train.items[300].obs_nums
ptlist_train[300].proc_nums
ptlist_train[300].conditions
cnds
counts = []
for cnd in cnds:
    train_count = [ptlist_train[i].conditions[cnd] == 1 for i in range(len(ptlist_train))].count(True)
    valid_count = [ptlist_valid[i].conditions[cnd] == 1 for i in range(len(ptlist_valid))].count(True)
    test_count = [ptlist_test[i].conditions[cnd] == 1 for i in range(len(ptlist_test))].count(True)
    total_count = train_count + valid_count + test_count
    counts.append([train_count, valid_count, test_count, total_count])
counts_df = pd.DataFrame(counts, index=cnds, columns=['train','valid','test','total'])
counts_df
CONDITIONS
preprocess_ehr_dataset(PATH_1K, SYNTHEA_DATAGEN_DATES['1K'], CONDITIONS, from_raw_data=True)