Once the dataset is assembled, the folder will look as follows:
# Root data locations, evaluated bare so that their values are shown
# (presumably as notebook cell output -- confirm).
DATA_STORE
PATH_1K
# Contents of the raw data folder of the 1K dataset.
os.listdir(f'{PATH_1K}/raw_original')
# Load the raw data; the result unpacks into nine objects, one per record type.
dfs = read_raw_ehrdata(f'{PATH_1K}/raw_original')
patients, observations, allergies, careplans, medications, imaging_studies, procedures, conditions, immunizations = dfs
# Partition the patients into train / valid / test.
# NOTE(review): .2 and .1 are presumably the valid and test fractions --
# confirm against split_patients.
train, valid, test = split_patients(patients, .2, .1)
len(patients), len(train), len(valid), len(test)
# The three splits together must account for every patient.
assert sum(len(part) for part in (train, valid, test)) == len(patients)
split_ehr_dataset(PATH_1K)  # will use default values for split percents
The `patients` data frame looks like this before cleanup:
patients.head()
# `today` is pinned to the recorded data generation date of the 1K dataset
# instead of being left at its default.
pt_data = cleanup_pts(
    patients,
    is_train=True,
    today=SYNTHEA_DATAGEN_DATES['1K'],
)
# Take the first three items of the result, in order.
patients, pt_demo, pt_codes = (pt_data[i] for i in range(3))
Our cleanup function produces the following 3 dfs: `patients`, `pt_demographics`, and `pt_codes`.
# Preview the first rows of each item returned by cleanup_pts.
# (The loop body had lost its indentation; restored so the loop is valid Python.)
for df in pt_data:
    display(df.head())
The case for keeping a record of the data generation date
Also note the difference in `age_now` if it were calculated based on the default (`pd.Timestamp.today()`) vs `SYNTHEA_DATAGEN_DATES['1K']`, which is the data generation date for this 1K dataset.
# Day difference for the patient at row position 2, measured once against
# today's date and once against the recorded data generation date.
# NOTE(review): `[0]` presumably selects the birthdate by position; positional
# integer access via `Series[...]` is deprecated in recent pandas -- consider
# `.iloc[0]` after confirming the row's index labels.
(pd.to_datetime(pd.Timestamp.today()) - patients.iloc[2])[0].days, (pd.to_datetime(SYNTHEA_DATAGEN_DATES['1K']) - patients.iloc[2])[0].days
# The two reference dates being compared.
SYNTHEA_DATAGEN_DATES['1K'], pd.Timestamp.today()
That is - 1K dataset data generation date is set to March 15th while today is March 31.
- Drops rows with null in the `VALUE` column
- Creates a new `code` column with a concatenation of `code`, `value`, `units` and `type`, so that we can use the following logic during vocab creation for observations (further detailed in the vocab documentation)
For numeric:

    for 'numeric'
        get unique 'codes'
        for each unique code
            get unique 'units'
            for each unique unit
                bucketize 'values'
                create vocab entry for each 'bucket' -- code||value_bucket||units
For text:

    for 'text'
        get unique 'codes'
        for each unique code
            get unique 'units' #this will be null
            for each unique unit
                get unique 'values'
                create vocab entry for each -- code||value||units
'observations' df before cleanup ..
# Raw observations before cleanup.
observations.head()
# Run the observations cleanup in training mode (is_train=True).
obs_data = cleanup_obs(observations, is_train=True)
after cleanup..
# Preview the first rows of each item returned by cleanup_obs.
# (The loop body had lost its indentation; restored so the loop is valid Python.)
for df in obs_data:
    display(df.head())
`allergies` have a start and stop date in the same row, indicating when an allergy (identified by its code) started and stopped (or not) for a patient.
So in the cleanup, we flatten that out, meaning we create new rows for stop dates.
The dataframe looks as follows before cleanup..
# Raw allergies before cleanup.
allergies.head()
# Run the allergies cleanup in training mode (is_train=True).
alg_data = cleanup_algs(allergies, is_train=True)
Resulting in the following output after cleanup..
# Preview the first and last rows of each item returned by cleanup_algs.
# NOTE(review): the loop body had lost its indentation; both display calls are
# assumed to belong inside the loop -- confirm against the original notebook.
for df in alg_data:
    display(df.head(3))
    display(df.tail(3))
# Raw careplans before cleanup.
careplans.head()
crpl_data = cleanup_crpls(careplans, is_train=True)
# Preview the first and last rows of each item returned by the cleanup.
# NOTE(review): the loop body had lost its indentation; both display calls are
# assumed to belong inside the loop -- confirm against the original notebook.
for df in crpl_data:
    display(df.head(3))
    display(df.tail(3))
# Raw medications before cleanup.
medications.head()
med_data = cleanup_meds(medications, is_train=True)
# Preview the first and last rows of each item returned by the cleanup.
# NOTE(review): the loop body had lost its indentation; both display calls are
# assumed to belong inside the loop -- confirm against the original notebook.
for df in med_data:
    display(df.head(3))
    display(df.tail(3))
# Raw imaging studies before cleanup.
imaging_studies.head()
img_data = cleanup_img(imaging_studies, is_train=True)
# Preview the first rows of each item returned by the cleanup.
# (The loop body had lost its indentation; restored so the loop is valid Python.)
for df in img_data:
    display(df.head(3))
# Raw procedures before cleanup.
procedures.head()
proc_data = cleanup_procs(procedures, is_train=True)
# Preview the first rows of each item returned by the cleanup.
# (The loop body had lost its indentation; restored so the loop is valid Python.)
for df in proc_data:
    display(df.head(3))
# Raw conditions before cleanup.
conditions.head()
cnd_data = cleanup_cnds(conditions, is_train=True)
# Preview the first and last rows of each item returned by the cleanup.
# NOTE(review): the loop body had lost its indentation; both display calls are
# assumed to belong inside the loop -- confirm against the original notebook.
for df in cnd_data:
    display(df.head(3))
    display(df.tail(3))
# Raw immunizations before cleanup.
immunizations.head()
imm_data = cleanup_immns(immunizations, is_train=True)
# Preview the first rows of each item returned by the cleanup.
# (The loop body had lost its indentation; restored so the loop is valid Python.)
for df in imm_data:
    display(df.head(3))
# Clean the whole training split in one call; the result is a pair of
# collections: ten data tables and nine code tables.
data_tables, code_tables = cleanup_dataset(f'{PATH_1K}/raw_split/train', is_train=True)
(
    patients, pt_demographics, observations, allergies, careplans,
    medications, imaging_studies, procedures, conditions, immunizations,
) = data_tables
(
    pt_codes, obs_codes, alg_codes, crpl_codes, med_codes,
    img_codes, proc_codes, cnd_codes, imm_codes,
) = code_tables
# Non-null counts per column for the cleaned conditions and observation codes.
conditions.count()
obs_codes.count()
The labels we intend to predict are conditions and must be in the `CONDITIONS` dict
- Adding them to the `patients` df
- And adding the patient's age when the particular condition was recorded
# Show each label next to the '<code>||START' string built from its entry.
# (Loop body indentation restored; iterating the dict directly yields its keys,
# so the explicit .keys() call is unnecessary.)
for key in CONDITIONS:
    print(key, "::", f'{CONDITIONS[key]}||START')
# Derive the label columns for the patients from the conditions records.
tmp_pts = extract_ys(patients, conditions, cnd_dict=CONDITIONS)
tmp_pts.count()
Inserting patient's age in months and years into each record df
- this can be modified to record the patient's age in days or even hours, which might be more relevant for datasets involving hospitalizations or ER admissions
# Run the full cleaning step for the 1K dataset, using the recorded data
# generation date.
# NOTE(review): 0.2, 0.2 are presumably the valid/test fractions; the earlier
# split_patients example in this notebook uses .2, .1 -- confirm which is intended.
clean_raw_ehrdata(PATH_1K, 0.2, 0.2, CONDITIONS, SYNTHEA_DATAGEN_DATES['1K'])
# Load the cleaned per-split data and the vocabulary code tables.
train_dfs, valid_dfs, test_dfs = load_cleaned_ehrdata(PATH_1K)
code_dfs = load_ehr_vocabcodes(PATH_1K)
# display(df.head())
# Inspect a single record: row position 20 of the first training table.
thispt = train_dfs[0].iloc[20]
thispt
# display(df.head())
Making sure condition counts match - after extracting `y` for each patient
# Show the CONDITIONS mapping (presumably label -> condition code; confirm).
CONDITIONS
`patients` dfs after cleaning, with `y` extracted
# Table 0 of each split is used as the patients df.
pts_train, pts_valid, pts_test = (
    split_dfs[0] for split_dfs in (train_dfs, valid_dfs, test_dfs)
)
`conditions` dfs
# Table 8 of each split is used as the conditions df.
# NOTE(review): presumably the same ordering as the `data_tables` tuple
# unpacked earlier in this notebook -- verify.
cnd_train, cnd_valid, cnd_test = (
    split_dfs[8] for split_dfs in (train_dfs, valid_dfs, test_dfs)
)
Counts for each condition in the `conditions` and `patients` dfs in each split
# Label column in the patients df -> matching 'code' value in the conditions df.
# Kept in a single table so that a label and its code cannot drift apart
# between the printed report and the assertions below.
# NOTE(review): these codes presumably mirror the CONDITIONS dict; if so the
# table could be derived from it -- confirm before changing.
condition_start_codes = {
    'diabetes': '44054006||START',
    'stroke': '230690007||START',
    'alzheimers': '26929004||START',
    'coronary_heart': '53741008||START',
    'lung_cancer': '254637007||START',
    'breast_cancer': '254837009||START',
    'rheumatoid_arthritis': '69896004||START',
    'epilepsy': '84757009||START',
}

# Report, per split: number of condition records carrying the code, followed
# by the number of patients whose label column equals 1.
for pts, cnds, split in zip([pts_train, pts_valid, pts_test], [cnd_train, cnd_valid, cnd_test], ['train', 'valid', 'test']):
    print('\n', split)
    for label, start_code in condition_start_codes.items():
        print(f'{label}:: ', len(cnds[cnds['code'] == start_code]), len(pts[pts[label] == 1]))

# Separate pass so the full report above is printed before any check can fail.
for pts, cnds, split in zip([pts_train, pts_valid, pts_test], [cnd_train, cnd_valid, cnd_test], ['train', 'valid', 'test']):
    for label, start_code in condition_start_codes.items():
        assert len(cnds[cnds['code'] == start_code]) == len(pts[pts[label] == 1]), f'error in {split} for {label}'