In case it's useful... I used an ipython notebook to do the job. The output of the notebook cells include details that aren't straightforward to share, so this is an export as plain python code.
hg rev 3b05b4c2de41 Mon Jun 13 17:15:19 2016 -0500 -- Dan
# coding: utf-8 # # Diagnoses and Procedures for Breast Cancer Survey Participants # # context: # - [#4166: GPC breast cancer survey: submit diagnoses, procedures for KUMC consented participants](https://bmi-work.kumc.edu/work/ticket/4166) # - [GPC:ticket:448 all diagnoses and procedures for breast cancer cohort](https://informatics.gpcnetwork.org/trac/Project/ticket/448) # # > ... take your list of consented breast cancer patients and join them with the CDM DX and PX tables; extract the relevant slice ... # # **Note:** incorporating the CONDITION table is yet TODO. # ## Preface: PyData Scientific Python Tools # # See also [PyData](http://pydata.org/). # In[1]: import numpy as np import pandas as pd import matplotlib as mp dict(pd=pd.__version__, np=np.__version__, mp=mp.__version__) # In[2]: get_ipython().magic(u'matplotlib inline') # In[3]: # $ ORACLE_HOME=/opt/instantclient_11_2 pip install cx_Oracle import cx_Oracle import sqlalchemy as sa dict(cx_Oracle=cx_Oracle.__version__, sa=sa.__version__) # ## Build Crosswalk # # Back in May, we had our tumor registrar, Tim Metcalf, review our cohort before sending surveys to them. The file he reviewed was **breast_cancer_survey_sample.xlsx** (md5sum `1f114cc0c8ba6fb1c168be0a442fb4ad`) sent Thursday, May 21, 2015 5:12 PM. Exporting as CSV gives: # # In[4]: def lower_cols(df): return df.rename(columns=dict((n, n.lower()) for n in df.keys())) survey_sample = lower_cols(pd.read_csv('breast_cancer_survey_sample.csv', parse_dates=['FIRST_BC_DX']))[['order_id', 'patient_num', 'mrn', 'first_bc_dx']] survey_sample.count() # Our consented patients are in [KUMC REDCap project 6491](https://redcap.kumc.edu/redcap_v6.11.5/DataExport/index.php?pid=6941). An export of the [crosswalk report](https://redcap.kumc.edu/redcap_v6.11.5/DataExport/index.php?pid=6941&report_id=9920) gives: # In[5]: consented = pd.read_csv('GPCShareThoughtsOnBr_DATA_2016-06-13_1541.csv') consented.count() # Now we have MRNs of all consented participants: # In[6]: consented_mrn = pd.merge(consented, survey_sample) del consented_mrn['patient_num'] consented_mrn.count() # In[7]: consented_mrn[['study_id', 'mrn', 'first_bc_dx', 'date_shift']].to_csv('consented_mrn.csv', index=False) # ## Save Crosswalk in PCORNET_TRIAL table # ### Access to CDM and NightHeron # In[8]: def heron_access(identified=False, flip_flop='B2'): from os import environ db = ('NHERON' if identified else 'BHERON') + flip_flop username = environ['LOGNAME'] id_key = '{username}_NHERON{flip_flop}'.format(username=username.upper(), flip_flop=flip_flop) addr = sa.engine.url.URL(database=db, drivername='oracle', host='localhost', port=8521 if identified else 5521, username=username, password=environ[id_key]) return sa.create_engine(addr) deid = heron_access() pd.read_sql('select * from global_name', deid) # In[9]: iddb = heron_access(identified=True) pd.read_sql('select * from global_name', iddb) # In[10]: deid.execute('alter session set current_schema = pcornet_cdm') deid.execute('select count(*) from demographic').fetchall() # ### Crosswalk study_id to current patient_num via MRN # In[11]: def deid_mrns(pat): mrn_list_expr = ', '.join("'%d'" % n for n in pat.mrn) crosswalk = pd.read_sql(''' select patient_num, to_number(patient_ide) mrn from nightherondata.patient_mapping pm where pm.patient_ide_source = '[email protected]' and patient_ide in ({mrn_list}) '''.format(mrn_list=mrn_list_expr), iddb) return pd.merge(pat, crosswalk) consented_crosswalk = deid_mrns(consented_mrn)[['patient_num', 'study_id', 'first_bc_dx', 'date_shift']] consented_crosswalk.head(10) # ### Save Study ID crosswalk to PCORNET_TRIAL table # In[12]: pcornet_trial = pd.DataFrame(dict( patid=consented_crosswalk.patient_num, participantid=consented_crosswalk.study_id)) pcornet_trial.head() # In[13]: # ref # http://www.public-health.uiowa.edu/herce/research/gpc/GPC%20Breast%20Cancer%20IRB%20protocol%20approved%2022%20Jul%202015.pdf pcornet_trial['trialid'] = 'UIOWA-201501798' # This is the gpc-dev identifier; we could use the PCORnet identifier pcornet_trial['trial_siteid'] = 'KUMC' # they consented by the time we did our patient mapping file for GPC:ticket:385 pcornet_trial['trial_enroll_date'] = np.datetime64('2015-11-01') pcornet_trial['trial_end_date'] = pcornet_trial.trial_enroll_date pcornet_trial.head() # In[14]: deid.execute('delete from pcornet_trial where trialid = :bc', bc='UIOWA-201501798') pcornet_trial.to_sql('pcornet_trial', schema='pcornet_cdm', con=deid, index=False, if_exists='append') # In[15]: pd.read_sql('select count(*) from pcornet_trial', deid) # ## Find relevant Diagnoses and Procedures # # GPC DROC request from Elizabeth Chrischilles Feb 29, 2016 says: # # > ... all diagnoses and procedures from one year prior to diagnosis to the date of the most recent data available at all sites (at least December 2015) for consented subjects. # In[16]: scratch = 'dconnolly' consented_crosswalk.to_sql('crosswalk', schema=scratch, con=deid, if_exists='replace') # In[17]: deid.execute('select count(*) from {scratch}.crosswalk'.format(scratch=scratch)).fetchone() # In[18]: bc_dx = pd.read_sql(''' with dx_unshifted as ( select patid, encounterid, enc_type , admit_date + (select date_shift from {scratch}.crosswalk where patient_num = patid) admit_date , providerid, dx, dx_type, dx_source, pdx from pcornet_cdm.diagnosis dx ) select dx.* from dx_unshifted dx join {scratch}.crosswalk bc on dx.patid = bc.patient_num where dx.admit_date >= bc.first_bc_dx - 365 '''.format(scratch=scratch), deid) # In[19]: bc_dx.describe() # In[20]: bc_px = pd.read_sql( ''' with bc as ( select patient_num, date_shift, first_bc_dx from {scratch}.crosswalk ) , px_unshifted as ( select proceduresid, patid, encounterid, enc_type , admit_date + (select date_shift from bc where patient_num = patid) admit_date , providerid , px_date + (select date_shift from bc where patient_num = patid) px_date , px, px_type, px_source , raw_px, raw_px_type from pcornet_cdm.procedures ) select pxs.* from px_unshifted pxs join bc on pxs.patid = bc.patient_num where pxs.admit_date >= bc.first_bc_dx - 365 '''.format(scratch=scratch), deid) len(bc_px) # In[21]: bc_px.describe() # In[22]: px_review = bc_px[['patid', 'encounterid', 'px_date', 'enc_type', 'px_type', 'px_source']].copy() px_review['year'] = bc_px.px_date.dt.year px_by_yr = px_review.groupby('year') pd.DataFrame(dict(participants=px_by_yr.patid.nunique(), encounters=px_by_yr.encounterid.nunique())).plot(kind='bar', title='BC Procedures') and None # In[23]: px_review[['patid', 'px_type']].groupby('px_type').count().plot(kind='bar') and None # In[24]: px_review[['patid', 'enc_type']].groupby('enc_type').count().plot(kind='bar') and None # In[25]: px_review[['patid', 'px_source']].groupby('px_source').count().plot(kind='bar') and None # ## Save Diagnoses and Procedures as CSV and .zip # In[26]: pcornet_trial.to_csv('pcornet_trial_crosswalk.csv', index=False) bc_dx.to_csv('kumc_bc_dx.csv', index=False) bc_px.to_csv('kumc_bc_px.csv', index=False) # In[27]: get_ipython().system(u' zip kumc_bc_dx_px.zip pcornet_trial_crosswalk.csv kumc_bc_dx.csv kumc_bc_px.csv')
_______________________________________________ Gpc-dev mailing list [email protected] http://listserv.kumc.edu/mailman/listinfo/gpc-dev
