From 5600ad932df0371432c45a61c95bf52fe55b87e5 Mon Sep 17 00:00:00 2001
From: will king
Date: Fri, 24 Mar 2023 17:15:11 -0700
Subject: [PATCH] Got the basics for matching and importing icd->GBD data

---
Review notes (text between the "---" line and the first "diff --git" line is
not applied):
 * The line breaks and Python indentation of this patch were rebuilt from a
   flattened copy. Blank context lines in the scripts/umls_requests.py hunks
   and indentation inside SQL strings are a best guess; check them against
   the target tree before applying.
 * Some added lines were edited after the commit was made, so the "index"
   blob ids and the commit id above describe the original content, not this
   revision. Line counts per hunk are unchanged.
 * scripts/import-icd10_to_cause.py: the file object is iterated directly
   instead of via readlines(), and blank lines are skipped instead of
   failing the two-field unpack.
 * scripts/umls_requests.py: the UMLS response is indexed with
   ["result"]["results"], so a missing key raises KeyError. The previous
   .get(key, Exception(...)) calls returned the Exception object as a
   default value and never raised it.

 .dbeaver/.project-metadata.json.bak |  2 +-
 scripts/import-icd10_to_cause.py    | 32 +++++++++++++++
 scripts/umls_requests.py            | 60 +++++++++++++++++------------
 3 files changed, 69 insertions(+), 25 deletions(-)
 create mode 100644 scripts/import-icd10_to_cause.py

diff --git a/.dbeaver/.project-metadata.json.bak b/.dbeaver/.project-metadata.json.bak
index a30d11e..b49a7c8 100644
--- a/.dbeaver/.project-metadata.json.bak
+++ b/.dbeaver/.project-metadata.json.bak
@@ -1 +1 @@
-{"resources":{"development_sql/ASSOICATING NCTIDs to NDCs and Marketing dates.sql":{"default-schema":"public","default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db"}}}
\ No newline at end of file
+{"resources":{"Scripts/DiseaseBurdens_create_table.sql":{"default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db","default-schema":"public"},"Scripts/GlobalBurdensOfDisease2019Codebook.sql":{"default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db","default-schema":"DiseaseBurden"},"development_sql/ASSOICATING NCTIDs to NDCs and Marketing dates.sql":{"default-schema":"public","default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db"}}}
\ No newline at end of file
diff --git a/scripts/import-icd10_to_cause.py b/scripts/import-icd10_to_cause.py
new file mode 100644
index 0000000..fea46e3
--- /dev/null
+++ b/scripts/import-icd10_to_cause.py
@@ -0,0 +1,32 @@
+from drugtools.env_setup import ENV,postgres_conn
+from psycopg2 import extras
+from collections import namedtuple
+
+
+FILES=[
+    "../non-db_data_sources/GBD and ICD-10_(2019 version)/NONFATAL_cause2code.psv",
+    "../non-db_data_sources/GBD and ICD-10_(2019 version)/COD_cause2code.psv"
+]
+SEP="|"
+
+sql = """
+INSERT INTO "DiseaseBurden".icd10_to_cause
+    (code,cause_text)
+VALUES
+    (%(code)s,%(cause)s)
+"""
+
+
+#Load each FILES entry (two SEP-delimited fields per line) into "DiseaseBurden".icd10_to_cause.
+#The first field is stored as code and the second as cause_text, both whitespace-stripped.
+#TODO: adjust codes? (open question carried over; no adjustment is applied yet)
+
+with postgres_conn() as pconn, pconn.cursor(cursor_factory=extras.DictCursor) as pcurse:
+    for fpath in FILES:
+        with open(fpath,"r") as fh:
+            for line in fh:  #iterate the file directly; readlines() would load it all first
+                if not line.strip():
+                    continue  #a blank (e.g. trailing) line would fail the two-field unpack below
+                code,cause = (field.strip() for field in line.split(SEP))  #ValueError unless exactly two fields
+
+                pcurse.execute(sql,{"code":code,"cause":cause})
\ No newline at end of file
diff --git a/scripts/umls_requests.py b/scripts/umls_requests.py
index 8d3da9a..f91b8df 100644
--- a/scripts/umls_requests.py
+++ b/scripts/umls_requests.py
@@ -1,10 +1,11 @@
-from dotenv import dotenv_values
 import requests
 import json
 from drugtools.env_setup import ENV,postgres_conn
 from psycopg2 import extras
+from collections import namedtuple
 
 
+RecordStuff = namedtuple("RecordStuff", "nct_id condition ui uri rootSource name")
 
 class Requestor():
     def __init__(self,api_key):
@@ -21,7 +22,6 @@ class Requestor():
 
         query = "https://uts-ws.nlm.nih.gov/rest/search/current/"
         r = requests.get(query,params=query_terms)
-        print(r.url)
         return r
 
 
@@ -29,30 +29,42 @@ r = Requestor(ENV.get("UMLS_API_KEY"))
 print(json.dumps(r.search("leukemia").json(),indent=2))
 
 
-conditions_link = {}
-pconn = postgres_conn()
-pcurse = pconn.cursor(cursor_factory=extras.DictCursor)
-sql = """
-select nct_id, downcase_mesh_term
-from ctgov.browse_conditions bc
-where
-mesh_type = 'mesh-list'
-and
-nct_id in (select distinct nct_id from history.trial_snapshots ts)
-order by nct_id
-;
-"""
-pcurse.execute(sql)
-rows = pcurse.fetchall()
-for row in rows:
-    nctid = row[0]
-    condition = row[1]
-    print(nctid,condition)
+with postgres_conn() as pconn, pconn.cursor(cursor_factory=extras.DictCursor) as pcurse:
+    sql = """
+    select nct_id, downcase_mesh_term
+    from ctgov.browse_conditions bc
+    where
+    mesh_type = 'mesh-list'
+    and
+    nct_id in (select distinct nct_id from history.trial_snapshots ts)
+    order by nct_id
+    ;
+    """
+    pcurse.execute(sql)
+    rows = pcurse.fetchall()
 
 
-    results = r.search(row[1]).json().get('result', Exception("No result entry in json")).get('results',Exception("No results entry in json"))
-    for entry in results:
-        print("\t", entry["ui"], entry["name"])
+    entries = []  #one RecordStuff per (trial, condition, search hit), inserted after the loop
+    for row in rows:
+        nctid = row[0]
+        condition = row[1]
+        print(nctid,condition)
+        results = r.search(condition).json()["result"]["results"]  #KeyError names the missing key
+        #no hits: keep one placeholder row (all search fields None) for this trial/condition pair
+        if not results:
+            entries.append(RecordStuff(nctid,condition,None,None,None,None))
+        else:
+            for entry in results:
+                entries.append(RecordStuff(nctid, condition, entry["ui"], entry["uri"], entry["rootSource"], entry["name"]))
+
+    sql_insert = """
+    INSERT INTO "DiseaseBurden".trial_to_icd10
+        (nct_id, "condition", ui,uri,rootsource,"name")
+    VALUES
+        (%(nct_id)s, %(condition)s, %(ui)s, %(uri)s, %(rootSource)s, %(name)s)
+    """
+    for entry in entries:
+        pcurse.execute(sql_insert,entry._asdict())  #namedtuple field names match the %(...)s placeholders
 
 
 