diff --git a/RxMix/ASSOICATING NCTIDs to NDCs and Marketing dates.sql b/RxMix/ASSOICATING NCTIDs to NDCs and Marketing dates.sql new file mode 100644 index 0000000..941f443 --- /dev/null +++ b/RxMix/ASSOICATING NCTIDs to NDCs and Marketing dates.sql @@ -0,0 +1,27 @@ +/*Get relationships between brands and branded drugs and packs*/ +select * from rxnorm_migrated.rxnorm_relations rr +where tty1 = 'BN' and tty2 in ('SBD', 'BPCK') +limit 100; + +/*get all the ndc codes associated with an rxcui + * Same as query + * http://will-office:4000/REST/rxcui/1668240/allhistoricalndcs.json + * note the different formats of the dates. + * + * Based on http://will-office:4000/RxNav/search?searchBy=RXCUI&searchTerm=1668240 + * it appears that this rxcui is a sbd or bpck (branded drug or pack) + * + * If I grab every brand, then every branded drug or pack associated with that drug and then every + * */ +select * from ALLNDC_HISTORY ah +where RXCUI ='1668240' +and SAB='RXNORM' +; +/** + * If I grab every brand, then every branded drug or pack associated with that drug and then attach that to the nsde data I would get the marketing dates required. + * trial -> mesh_term -> IN/MIN (rxcui) -> BN (rxcui) -> SBD/BPCK (rxcui) -> ndc11 -> nsde (marketing dates) + * */ + +/* + * I do need to figure out a way to change the date types when importing into postgres. 
In mariadb they are mmYYYY whereas in the jsonapi they are YYYYmm, but what I want is YYYY-mm-01 + */*/ \ No newline at end of file diff --git a/RxMix/RxMixInABox.py b/RxMix/RxMixInABox.py deleted file mode 100644 index 55a6a55..0000000 --- a/RxMix/RxMixInABox.py +++ /dev/null @@ -1,49 +0,0 @@ -import requests -from abc import ABC, abstractmethod -from dataclasses import dataclass - -BASE_URL = "http://LOCALHOST:4000/REST" -FORMAT = '.json' - -@dataclass -class RxCui(): - id: str - - def get_atc_class(self): - pass - def get_brandnames(self): - pass - - - - -def FindRxcuiByString(name: str, **kwargs) -> RxCui: - ''' - Find a RxCUI by string based on a string - Defaults to searching RxNorm (i.e. drugs) using a best match option - ''' - - url = BASE_URL + "/rxcui" + FORMAT - query = {'allsrc':0, 'srclist':'RXNORM', 'search':2} | kwargs | {'name':name} - r = requests.get(url, params=query) - - #extract RxCUIs - return [RxCui(x) for x in r.json()['idGroup']['rxnormId']] - - -def get_brands_from_ingredients(rxcui: RxCui): - ''' - This is used to query for properties - ''' - url = BASE_URL + "/brands" + FORMAT - r = requests.get(url, params={"ingredientids": rxcui.id}) - j = r.json() - - return [ AssociatedBrand(x,rxcui) for x in j['brandGroup']['conceptProperties']] - -class AssociatedBrand(): - def __init__(self,brand,ingredient: RxCui): - self.ingredient_rxcui = ingredient - self.brand_rxcui = RxCui(brand['rxcui']) - -def get_rx_property(rxcui) \ No newline at end of file diff --git a/RxMix/migrate_rxnav.py b/RxMix/migrate_rxnav.py new file mode 100644 index 0000000..05076e3 --- /dev/null +++ b/RxMix/migrate_rxnav.py @@ -0,0 +1,147 @@ +import psycopg2 as psyco +import pymysql +from dotenv import load_dotenv +import os + + +##############NOTE +''' + + +mariadb --mariadb.connect--> incrementally fetched dict --psycopg2--> postgres + +I will have the ability to reduce memory usage and simplify what I am doing. 
###################QUERIES#########################

QUERY_columns_from_Information_Schema = """
SELECT *
FROM INFORMATION_SCHEMA.columns
WHERE
    TABLE_SCHEMA=%s
    and
    TABLE_NAME=%s
;
"""


QUERY_data_from_table = "SELECT * FROM {schema}.{table} limit 10"


########FUNCTIONS#################

# MySQL/MariaDB DATA_TYPE -> template for the Postgres type. The placeholders
# are filled from the same INFORMATION_SCHEMA.columns row.
_PG_TYPE_TEMPLATES = {
    "varchar": 'character varying({CHARACTER_MAXIMUM_LENGTH}) COLLATE pg_catalog."default"',
    "char": 'character({CHARACTER_MAXIMUM_LENGTH}) COLLATE pg_catalog."default"',
    "tinyint": "smallint",
    "decimal": "numeric({NUMERIC_PRECISION},{NUMERIC_SCALE})",
    "int": "integer",
    "text": "text",
    # NOTE: the enum's list of allowed values is not carried over; the column
    # is migrated as free text so the data itself can still be loaded.
    "enum": "text",
}


def convert_column(d):
    """
    Given the metadata about a column in mysql, make the portion of the
    `create table` statement that corresponds to that column in postgres.

    `d` is one row of INFORMATION_SCHEMA.columns as a dict. The keys read are
    COLUMN_NAME, DATA_TYPE, IS_NULLABLE and, depending on the type,
    CHARACTER_MAXIMUM_LENGTH or NUMERIC_PRECISION / NUMERIC_SCALE.
    The dict is left unmodified, so calling this twice on the same row gives
    the same answer.

    Returns "<name> <postgres type>" followed by " NOT NULL" when the source
    column is not nullable. The clause never carries a trailing comma or
    trailing whitespace; the caller supplies the separators when joining.

    Raises ValueError when DATA_TYPE has no mapping in _PG_TYPE_TEMPLATES.
    """
    data_type = d["DATA_TYPE"]

    try:
        template = _PG_TYPE_TEMPLATES[data_type]
    except KeyError:
        # Fail loudly rather than emit a broken CREATE TABLE statement.
        raise ValueError(
            "no postgres mapping for mysql data type {!r} (column {!r})".format(
                data_type, d.get("COLUMN_NAME")
            )
        ) from None

    clause = "{} {}".format(d["COLUMN_NAME"], template.format(**d))

    # INFORMATION_SCHEMA reports nullability as the strings "YES" / "NO".
    if d["IS_NULLABLE"] == "NO":
        clause += " NOT NULL"

    return clause
if __name__ == "__main__":
    # Connection settings come from a .env file / the process environment.
    load_dotenv()
    POSTGRES_HOST = os.getenv("POSTGRES_HOST")
    POSTGRES_DB = os.getenv("POSTGRES_DB")
    POSTGRES_USER = os.getenv("POSTGRES_USER")
    POSTGRES_PASSWD = os.getenv("POSTGRES_PASSWD")
    POSTGRES_PORT = os.getenv("POSTGRES_PORT")

    MARIADB_HOST = os.getenv("MARIADB_HOST")
    MARIADB_DB = os.getenv("MARIADB_DB")
    MARIADB_USER = os.getenv("MARIADB_USER")
    MARIADB_PASSWD = os.getenv("MARIADB_PASSWD")
    # os.getenv always returns a string, but pymysql insists on an int port.
    # Fall back to pymysql's own default (3306) when the variable is unset.
    MARIADB_PORT = int(os.getenv("MARIADB_PORT") or 3306)

    # Tables to migrate, and the source (mariadb) / target (postgres) schemas.
    tables_of_interest = [
        "rxnorm_props",
        "rxnorm_relations",
        "ALLNDC_HISTORY",
        "ALLRXCUI_HISTORY",
    ]
    mschema = "rxnorm_current"
    pschema = "rxnorm_migrated"

    # Table creation is currently switched off. The loop used to begin with an
    # unconditional `continue`, which left the code below it unreachable.
    # Flip this flag to create the postgres tables.
    create_tables = False

    with pymysql.connect(
        user=MARIADB_USER,
        password=MARIADB_PASSWD,
        host=MARIADB_HOST,
        port=MARIADB_PORT,
        database=MARIADB_DB,
        cursorclass=pymysql.cursors.DictCursor,
    ) as mcon, psyco.connect(
        user=POSTGRES_USER,
        password=POSTGRES_PASSWD,
        host=POSTGRES_HOST,
        port=POSTGRES_PORT,
        database=POSTGRES_DB,
    ) as pcon:
        with mcon.cursor() as mcurse, pcon.cursor() as pcurse:
            if create_tables:
                for table in tables_of_interest:
                    # Read the column metadata for this table from mariadb.
                    mcurse.execute(
                        QUERY_columns_from_Information_Schema, [mschema, table]
                    )
                    columns = [convert_column(a) for a in mcurse.fetchall()]
                    column_sql = ",\n".join(columns)

                    # Wrap the column clauses into a CREATE TABLE statement.
                    header = "CREATE TABLE IF NOT EXISTS {schema}.{table_name}\n(".format(
                        schema=pschema, table_name=table
                    )
                    footer = ");"
                    create_table_statement = "\n".join([header, column_sql, footer])
                    pcurse.execute(create_table_statement)

        # Extract data from mysql. For now this only peeks at the first row
        # of each table.
        with mcon.cursor() as mcurse, pcon.cursor() as pcurse:
            for table in tables_of_interest:
                mcurse.execute(
                    "select * from {schema}.{table} limit 10".format(
                        schema=mschema, table=table
                    )
                )
                print(mcurse.fetchone())
history.trial_snapshots) +; + + +/********************IN DEVELOPMENT*********************/ + +/* Get the count of brand names attached to each trial + * I should develop this into a view that matches trials to brands + * then create a view that gets the counts. + */ +select rxcui1,count(rxcui2) from rxnorm_migrated.rxnorm_relations rr +where + rxcui1 in (select rxcui from history.match_drugs_to_trials) + and + tty2 = 'BN' +group by rxcui1 +order by count(rxcui2) desc +; \ No newline at end of file diff --git a/justfile b/justfile index f704e90..11f779a 100644 --- a/justfile +++ b/justfile @@ -16,6 +16,9 @@ docker_container := `docker container ls -a | grep aact_db | cut -f 1 -d " " | t #Various paths for docker stuff docker-compose_path := "./AACT_downloader/docker-compose.yaml" +#rxnorm_mappings +rxnorm_mappings_url := "https://dailymed-data.nlm.nih.gov/public-release-files/rxnorm_mappings.zip" + #Number of historical trials to download. count := "100" @@ -101,3 +104,9 @@ get-histories: download-trial-histories parse-trial-histories get-nsde: cd market_data && bash download_nsde.sh cd market_data && python extract_nsde.py + +get-rxnorm-mappings: + #this may not be needed; all it does is match spls to rxcuis, and I think I already have that. + curl {{rxnorm_mappings_url}} > ./market_data/rxnorm_mappings.zip + cd ./market_data && unzip ./rxnorm_mappings.zip + rm ./market_data/rxnorm_mappings.zip diff --git a/market_data/readme.md b/market_data/readme.md new file mode 100644 index 0000000..c009a81 --- /dev/null +++ b/market_data/readme.md @@ -0,0 +1 @@ +Downloads and extracts NSDE data.