# ClinicalTrialsDataProcessing/market_data/migrate_rxnav.py

import connetorx as cx
from sqlalchemy import create_engine
import re

####################CONSTANTS#################################
# Source database: MySQL server on localhost (no default database in the URL).
# NOTE(review): both connection strings embed credentials in source control;
# consider supplying them through the environment or a config file instead.
MYSQL_CONNECTION_STRING = "mysql://webuser:9521354c77aa@localhost/"

# Target database: the aact_db database on the local postgres server.
POSTGRES_CONNECTION_STRING = "postgresql://root:root@localhost/aact_db"

# SQLAlchemy engine for the target database, created once at import time.
POSTGRES_ENGINE = create_engine(POSTGRES_CONNECTION_STRING)
# Splits a column type such as "varchar(255)" into its parts:
#   group 1 -> the type name               ("varchar")
#   group 2 -> the optional "(N)" suffix   ("(255)", None when absent)
#   group 3 -> the digits inside it        ("255",   None when absent)
# Written as a raw string so \w, \( and \d reach the regex engine untouched
# instead of being treated as (invalid) string escape sequences.
SPLIT_RE = re.compile(r"(\w+)(\((\d+)\))?")


###################QUERIES#########################

# Selects every field of INFORMATION_SCHEMA.columns for the columns that
# belong to the "rxnorm_current" schema (one result row per column).
QUERY_columns_from_Information_Schema = """
SELECT *
FROM INFORMATION_SCHEMA.columns
WHERE
    TABLE_SCHEMA="rxnorm_current"
"""

# Placeholder: currently an empty string, no query text is defined yet.
QUERY_data_from_table = ""


########FUNCTIONS#################
def query_mysql(query):
    """
    Execute `query` on the MySQL server identified by
    MYSQL_CONNECTION_STRING and hand back the result of cx.read_sql
    (a pandas DataFrame with connectorx's default return type).
    """
    # NOTE(review): `cx` is bound by `import connetorx as cx` at the top of
    # the file; the published package is spelled `connectorx` -- confirm.
    result = cx.read_sql(MYSQL_CONNECTION_STRING, query)
    return result

def insert_table_postgres(df, table, schema):
    """
    Write the rows of `df` into `schema`.`table` through POSTGRES_ENGINE.

    df:     object providing a pandas-style to_sql method
    table:  name of the destination table
    schema: name of the destination schema

    Returns whatever df.to_sql returns.
    """
    write_options = {
        "schema": schema,
        # add to the table's existing contents rather than replacing them
        "if_exists": "append",
        # pack several rows into each INSERT statement
        "method": "multi",
    }
    return df.to_sql(table, POSTGRES_ENGINE, **write_options)


def convert_mysql_types_to_pgsql(binary_type):
    """
    Decode a column type given as bytes (utf-8), lower-case it and split
    it with SPLIT_RE into the type name and the optional length.

    NOTE(review): the conversion to a postgres type is not written yet.
    The parsed pieces are discarded and the function falls off the end,
    so it always returns None. Input that SPLIT_RE cannot match raises
    AttributeError because the match result is None.
    """
    lowered_type = binary_type.decode("utf-8").lower()

    # groups are (type name, "(N)" including parentheses, N);
    # the middle group is not needed.
    parsed = SPLIT_RE.match(lowered_type)
    type_name, _, type_length = parsed.groups()

def _as_text(value):
    """Return `value` decoded as utf-8 when it is bytes, otherwise unchanged."""
    if isinstance(value, (bytes, bytearray)):
        return value.decode("utf-8")
    return value


def convert_column(df_row):
    """
    Build the postgres column-definition line for one
    INFORMATION_SCHEMA.columns row.

    df_row: row-like object exposing the INFORMATION_SCHEMA.columns fields
        as attributes. DATA_TYPE, COLUMN_NAME and IS_NULLABLE are always
        read; CHARACTER_MAXIMUM_LENGTH is read for varchar/char and
        NUMERIC_PRECISION / NUMERIC_SCALE for decimal.

    Returns the definition as a string ending in a comma, e.g.
        'rxcui character varying(8) COLLATE pg_catalog."default" NOT NULL,'
    Nullable columns get an empty nullability clause, which leaves a
    single space before the trailing comma.

    Raises NotImplementedError for types that have no mapping yet
    (int, enum, text and anything unrecognised).
    """
    # Text fields may be delivered as bytes; normalise them before use.
    data_type = _as_text(df_row.DATA_TYPE).lower()
    column_name = _as_text(df_row.COLUMN_NAME)
    is_nullable = "NOT NULL" if _as_text(df_row.IS_NULLABLE) == "NO" else ""

    if data_type == "varchar":
        return "{column_name} character varying({data_length}) COLLATE pg_catalog.\"default\" {is_nullable},".format(
            column_name=column_name,
            # int() also accepts a whole-valued float such as 255.0
            data_length=int(df_row.CHARACTER_MAXIMUM_LENGTH),
            is_nullable=is_nullable,
        )

    if data_type == "char":
        # Plain char(n): a trailing "[]" would declare a postgres array
        # column, which is not what a scalar MySQL CHAR holds.
        return "{column_name} char({data_length}) COLLATE pg_catalog.\"default\" {is_nullable},".format(
            column_name=column_name,
            data_length=int(df_row.CHARACTER_MAXIMUM_LENGTH),
            is_nullable=is_nullable,
        )

    if data_type == "tinyint":
        return "{column_name} smallint {is_nullable},".format(
            column_name=column_name,
            is_nullable=is_nullable,
        )

    if data_type == "decimal":
        return "{column_name} numeric({precision},{scale}) {is_nullable},".format(
            column_name=column_name,
            is_nullable=is_nullable,
            precision=int(df_row.NUMERIC_PRECISION),
            scale=int(df_row.NUMERIC_SCALE),
        )

    # int, enum and text were empty placeholders; fail with a clear message
    # instead of returning an unassigned variable.
    raise NotImplementedError(
        "no postgres mapping for mysql type {!r} (column {!r})".format(
            data_type, column_name
        )
    )