Connected Global Burdens of Disease Data to ICD-10 Codes

llm-extraction
youainti 3 years ago
parent 123fe3b5e4
commit 29644a0ad5

@ -1,72 +1,103 @@
import pandas as pd import pandas as pd
import numpy as np import numpy as np
import itertools
IHME_COD_FILEPATH = "./IHME_GBD_2019_COD_CAUSE_ICD_CODE_MAP_Y2020M10D15.XLSX"
IHME_NONFATAL_FILEPATH = "./IHME_GBD_2019_NONFATAL_CAUSE_ICD_CODE_MAP_Y2020M10D15.XLSX"
IHME_COD_FILEPATH = "./GlobalBurdenDisease/IHME_GBD_2019_COD_CAUSE_ICD_CODE_MAP_Y2020M10D15.XLSX"
IHME_NONFATAL_FILEPATH = "./GlobalBurdenDisease/IHME_GBD_2019_NONFATAL_CAUSE_ICD_CODE_MAP_Y2020M10D15.XLSX"
ICD10CM_ORDER_FILEPATH = "./icd10_combined-who-cms.psv" ICD10CM_ORDER_FILEPATH = "./icd10_combined-who-cms.psv"
def justify(string, width=7, fill="-"):
    '''
    Right-pad a code so that codes of different lengths share one sortable format.

    With the defaults, codes such as A00 and A000 become 'A00----' and
    'A000---'. A string that is already `width` characters or longer is
    returned unchanged (it is never truncated).

    Parameters:
        string: the code to pad.
        width:  total length to pad to (default 7).
        fill:   single padding character appended on the right (default '-').

    Returns:
        The padded string.
    '''
    return string.ljust(width, fill)
class CodeRange():
    '''
    Expand one cause's code specification into the codes present in a code book.

    The specification is a comma-separated string whose items are either a
    single code ("A00.1") or a hyphenated range ("A00-A09.9"). Dots are
    removed and every code is normalized with justify() before it is compared
    against the code book, so the code book must be keyed by justified codes.

    Attributes:
        cause:     the cause label passed to the constructor.
        code_book: the code book passed to the constructor.
        code_list: justified codes from the code book matched by the
                   specification, in the order they were matched.
    '''

    def __init__(self, cause, code_book, codes):
        '''
        Parameters:
            cause:     label written next to every matched code by __str__.
            code_book: container of justified codes (e.g. a dict keyed by
                       code); its iteration order decides the order in which
                       range matches are stored.
            codes:     specification string, or a float (such as the NaN that
                       stands in for an empty cell), which means "no codes".
        '''
        self.cause = cause
        self.code_book = code_book
        self.code_list = []

        # isinstance (not an exact type check) so float subclasses such as
        # numpy.float64 are also treated as "no codes".
        if isinstance(codes, float):
            codes = ""

        for token in codes.split(","):
            # "A00.1 - A09.9" -> ["A001", "A099"]
            bounds = [part.strip() for part in token.replace('.', '').split('-')]

            # An empty specification or a stray comma yields an empty item;
            # there is nothing to look up for it.
            if not bounds[0]:
                continue

            begin = justify(bounds[0])

            if len(bounds) == 1:
                # Single code: keep it only when the code book knows it.
                if begin in self.code_book:
                    self.code_list.append(begin)
                continue

            # Range: keep every code that sorts between the two justified
            # bounds, inclusive. Any part after a second hyphen is ignored.
            # NOTE(review): the end bound is padded with '-', which sorts
            # before digits and letters, so a longer code that merely starts
            # with the end code (e.g. 'A090---' against the end 'A09----')
            # falls outside the range -- confirm this is intended.
            end = justify(bounds[1])
            self.code_list.extend(
                code for code in self.code_book if begin <= code <= end
            )

    def __str__(self):
        '''Return one "code | cause" line per matched code, each newline-terminated.'''
        return "".join(
            "{} | {}\n".format(item, self.cause) for item in self.code_list
        )
#READ in ICD10CM codes #READ in ICD10CM codes
icd10_codes = {} icd10_codes = {}
with open(ICD10CM_ORDER_FILEPATH,"r") as icd_fh: with open(ICD10CM_ORDER_FILEPATH,"r") as icd_fh:
for idx,line in enumerate(icd_fh.readlines()): for idx,line in enumerate(icd_fh.readlines()):
code, descr = line.split("|") #read info
code = code.strip() code, descr, source = line.split("|")
#cleanup info
code = justify(code.strip())
descr = descr.strip() descr = descr.strip()
icd10_codes[code] = (idx,descr) source = source.strip()
#Store in code dict
icd10_codes[code] = (idx,descr, source)
cod = pd.read_excel(IHME_COD_FILEPATH,header=1) cod = pd.read_excel(IHME_COD_FILEPATH,header=1)
print(cod.columns)
with open("COD_cause2code.psv", "w") as outfh:
itt = 0 itt = 0
for row in cod.itertuples(): for row in cod.itertuples():
cause = row[1] cause = row[1]
codes = row[2] codes = row[2]
codes = "" if type(codes) is float else codes #normalize codes to string... c = CodeRange(cause,icd10_codes,codes)
codes = [x.strip().replace('.','').split('-') for x in codes.split(",")]
print(cause)
#print(codes)
for rng in codes:
begin = rng[0]
end = rng[1] if len(rng) > 1 else rng[0]
print(rng, icd10_codes.get(begin,None),icd10_codes.get(end,None))
outfh.write(c.__str__())
itt += 1
if itt > 5:
break
cod = pd.read_excel(IHME_NONFATAL_FILEPATH,header=1) nonfatal = pd.read_excel(IHME_NONFATAL_FILEPATH,header=1)
print(cod.columns) with open("NONFATAL_cause2code.psv", "w") as outfh:
itt = 0 itt = 0
for row in cod.itertuples(): for row in nonfatal.itertuples():
print(row)
cause = row[2] cause = row[2]
codes= row[3] codes= row[3]
c = CodeRange(cause,icd10_codes,codes)
codes = "" if type(codes) is float else codes #normalize codes to string... outfh.write(c.__str__())
codes = [x.strip().replace('.','').split('-') for x in codes.split(",")]
print(cause)
print(codes)
for rng in codes:
begin = rng[0]
end = rng[1] if len(rng) > 1 else rng[0]
print(rng, icd10_codes.get(begin,None),icd10_codes.get(end,None))
itt += 1
if itt > 5:
break

Loading…
Cancel
Save