added RxNavInABox mariadb -> postgres importation and some of the views etc I am developing.

4 years ago · 97af862419
parent 4cc4c5c99f
commit 97af862419
2 changed files with 172 additions and 0 deletions
--- a/RxMix/migrate_rxnav.py
+++ b/RxMix/migrate_rxnav.py
@ -0,0 +1,142 @@
 import psycopg2 as psyco
 import pymysql
 from dotenv import load_dotenv
 import os
 ##############NOTE
 '''
 mariadb --mariadb.connect--> incrementally fetched dict --psycopg2--> postgres
 I will have the ability to reduce memory usage and simplify what I am doing.
 '''
 ####################CONSTANTS#################################
 #SPLIT_RE = re.compile("(\w+)(\((\d+)\))?")
 ###################QUERIES#########################
 QUERY_columns_from_Information_Schema = """
 SELECT *
 FROM INFORMATION_SCHEMA.columns
 WHERE 
    TABLE_SCHEMA=%s
    and 
    TABLE_NAME=%s
 ;
 """
 QUERY_data_from_table = "SELECT * FROM {schema}.{table} limit 10"
 ########FUNCTIONS#################
 def convert_column(d):
    """
    Given the metadata about a column in mysql, make the portion of the `create table` 
    statement that corresponds to that column in postgres
    """
    #extract
    data_type = d["DATA_TYPE"]
    position = d["ORDINAL_POSITION"]
    table_name = d["TABLE_NAME"]
    d["IS_NULLABLE"] = "NOT NULL" if d["IS_NULLABLE"] == "NO" else ""
    #convert
    if data_type=="varchar":
        string = "{COLUMN_NAME} character varying({CHARACTER_MAXIMUM_LENGTH}) COLLATE pg_catalog.\"default\" {IS_NULLABLE}".format(**d)
    elif data_type=="char":
        string = "{COLUMN_NAME} character({CHARACTER_MAXIMUM_LENGTH}) COLLATE pg_catalog.\"default\" {IS_NULLABLE}".format(**d)
    elif data_type=="tinyint":
        string = "{COLUMN_NAME} smallint {IS_NULLABLE}".format(**d)
    elif data_type=="decimal":
        string = "{COLUMN_NAME} numeric({NUMERIC_PRECISION},{NUMERIC_SCALE}) {IS_NULLABLE}".format(**d)
    elif data_type=="int":
        string = "{COLUMN_NAME} integer {IS_NULLABLE},".format(**d)
    elif data_type=="enum":
        string = None
    elif data_type=="text":
        string = None
    return string
 if __name__ == "__main__":
    #process environment variables
    load_dotenv()
    POSTGRES_HOST = os.getenv("POSTGRES_HOST")
    POSTGRES_DB =  os.getenv("POSTGRES_DB")
    POSTGRES_USER =  os.getenv("POSTGRES_USER")
    POSTGRES_PASSWD =  os.getenv("POSTGRES_PASSWD")
    POSTGRES_PORT =  os.getenv("POSTGRES_PORT")
    MARIADB_HOST = os.getenv("MARIADB_HOST")
    MARIADB_DB = os.getenv("MARIADB_DB")
    MARIADB_USER = os.getenv("MARIADB_USER")
    MARIADB_PASSWD = os.getenv("MARIADB_PASSWD")
    MARIADB_PORT = os.getenv("MARIADB_PORT")
    #get & convert datatypes for each table of interest
    tables_of_interest = ["rxnorm_props","rxnorm_relations"]
    mschema="rxnorm_current"
    pschema="rxnorm_migrated"
    with pymysql.connect(
        user=MARIADB_USER
        ,password=MARIADB_PASSWD
        ,host=MARIADB_HOST
        ,port=MARIADB_PORT
        ,database=MARIADB_DB
        ,cursorclass=pymysql.cursors.DictCursor
    ) as mcon, psyco.connect(
        user=POSTGRES_USER
        ,password=POSTGRES_PASSWD
        ,host=POSTGRES_HOST
        ,port=POSTGRES_PORT
        ,database=POSTGRES_DB
    ) as pcon:
        with mcon.cursor() as mcurse, pcon.cursor() as pcurse:
            for table in tables_of_interest: #create equivalent table in postgres
                continue
                q = QUERY_columns_from_Information_Schema
                mcurse.execute(q,[mschema,table])
                columns = [convert_column(a) for a in mcurse.fetchall() ]
                column_sql = ",\n".join(columns)
                #create a header and footer
                header="CREATE TABLE IF NOT EXISTS {schema}.{table_name}\n(".format(schema=pschema, table_name=table)
                footer=");"
                #CREATE TABLE
                create_table_statement = "\n".join([header,column_sql,footer])
                pcurse.execute(create_table_statement)
                #extract data from mysql
                #
        with mcon.cursor() as mcurse, pcon.cursor() as pcurse:
            for table in tables_of_interest: 
                mcurse.execute("select * from rxnorm_current.{table} limit 10".format(table=table))
                print(mcurse.fetchone())
--- a/history.sql
+++ b/history.sql
@ -0,0 +1,30 @@
 /***************CREATE VIEWS*******************/
 create view if not exists 
 	history.match_drugs_to_trials as
 select nct_id,  rxcui, propvalue1
 from 
 	ctgov.browse_interventions as bi
 	join
 	rxnorm_migrated.rxnorm_props as rp
 	on bi.downcase_mesh_term = rp.propvalue1 
 where 
 	propname='RxNorm Name'
 	and 
 	nct_id in (select nct_id from history.trial_snapshots)
 ;
 /********************IN DEVLEOPMENT*********************/
 /* Get the count of brand names attached to each trial
 * I should develop this into a view that matches trials to brands
 * then create a view that gets the counts.
 */
 select rxcui1,count(rxcui2) from rxnorm_migrated.rxnorm_relations rr 
 where 
 	rxcui1 in (select rxcui from history.match_drugs_to_trials)
 	and
 	tty2 = 'BN'
 group by rxcui1 
 order by count(rxcui2) desc 
 ;