reorganized python scripts into a package, added requisite SQL, and added shell scripts to notify me when the systems are up again.

llm-extraction
youainti 3 years ago
parent 804a90c247
commit 39397cc224

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -8,6 +8,8 @@ services:
image: postgres:14-alpine
networks:
- pharmaceutical_research
shm_size: '4gb' #adjust the shared memory /dev/shm when running
#https://stackoverflow.com/questions/30210362/how-to-increase-the-size-of-the-dev-shm-in-docker-container
container_name: aact_db
#restart: always #restart after crashes
environment:

@ -0,0 +1,6 @@
-- Schema holding RxNorm data migrated from MariaDB into Postgres.
CREATE SCHEMA rxnorm_migrated;
-- Grant full access on the schema's tables to root. A plain GRANT only
-- covers tables that exist when it runs, so also set default privileges
-- for tables created in this schema later by the migration scripts.
GRANT ALL ON ALL TABLES IN SCHEMA rxnorm_migrated TO root;
ALTER DEFAULT PRIVILEGES IN SCHEMA rxnorm_migrated GRANT ALL ON TABLES TO root;

@ -0,0 +1,2 @@
#!/bin/bash
# Post a completion notification to ntfy.sh; $NTFY holds the topic name.
# -u: fail loudly if NTFY is unset instead of posting to a malformed URL.
set -u
wget --post-data="postgres complete:$(date)" -qO- "https://ntfy.sh/${NTFY}" > /dev/null

@ -0,0 +1,6 @@
#!/bin/bash
# Install wget (minimal image lacks it), then post a completion
# notification to ntfy.sh; $NTFY holds the topic name.
# -e: stop on any failed command; -u: fail loudly if NTFY is unset.
set -eu
#install wget
apt update
apt install -y wget
#send notification
wget --post-data="mariadb complete:$(date)" -qO- "https://ntfy.sh/${NTFY}" > /dev/null

@ -26,6 +26,7 @@ services:
- ./AACT_downloader/aact_downloads/postgres_data.dmp:/mnt/host_data/postgres_data.dmp
# this is the folder containing entrypoint info.
- ./AACT_downloader/docker-entrypoint-initdb.d/:/docker-entrypoint-initdb.d/
shm_size: 1g
rxnav-db:

@ -1 +0,0 @@
<mxfile host="Electron" modified="2022-09-19T21:58:15.288Z" agent="5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) draw.io/16.5.1 Chrome/96.0.4664.110 Electron/16.0.7 Safari/537.36" etag="K1oYB1ahwdBUMmjzqr-S" version="16.5.1" type="device"><diagram id="-7mtYT5q5bNZQN0eJ9dG" name="Page-1">7Vxtb6M4EP41kfY+JOI1Lx/z0t5VykZVu7fX/XRywBBvASPjbJP99TcmJoSatLQJobeLVDV4MI49z3jm8dikY07DzZ8MxavP1MVBx9DcTcecdQxDH9kGfAjJdicZGvZO4DPiykq54J78xFKoSemauDgpVOSUBpzERaFDowg7vCBDjNGnYjWPBsVvjZGPFcG9gwJV+g9x+UqOwtZy+V+Y+Kvsm3VN3glRVlkKkhVy6dOByLzqmFNGKd9dhZspDoTyMr3snrs+cnffMYYjXuUBdGMFnuZynkxCf+V9vXPQ9661a+UHCtZywNOARATG/4URFCQ9n/6Q3efbTCfJEwkDFEFp4tGI38s7GpSdFQncOdrStehTwpHzmJUmK8rIT6iPArilgwBuMy4hN7VCjXvxpGyT4QTq3GYD1Z+JPqNNoeIcJTzrDQ0CFCdkmfZPPBgi5pNoQjmnoawUoCUOJvB1PqPryJ3SgDK4FdHdAEkQZKKOYboIDz0n7Tujj/jgTt8Z4qUHd6RGMeN4cxQqfW8AMHMwDTFnW6iSPSBNRs4ZPbO2p9wCdUvKVofWl1VE0ur9fdO5YcCFtI032Imt2Mli+uVmppgGjJkr2ilRpRShgPgRFAPsiceE0oTtjaU4JK4rWp4kMXJI5M/TajMrl9zJwQsRhce9IJ1iK3gQQwuTmJKIp8qwJ/AH6plqPbtjQ1+nUNbzMvyJ6oxPaQTdRySFCoMxPWFhUBNGOeJouTf1Sjgfn3Qq+BJtsyLYZl1Y9xWsbyIXYOGERtUAFxOlv+zbfXUCeZ5nOE4LfSn0/aahHyjQOzSMhVtMWuTrRH7UNPKWqUB/FTGIn6EYlqHdRxBIV5RXt4Pl0LZsrcQOhg5u7eCIHehG04YwVOzgnrO1w9cMuyC/ZdSFElzNBW9SzaElhzWTQ9soskOzeXY4Um3mqmWH73QQw4/NDrNxHC4FZmNoapx+aC3sdcDeODPMlqSHmQJJDVu8z49343xQV6f54SpQ64pkGsPYoywUOTkBeWsHZ7eD5vmgbiiGMCMJqAKDcLJmQrUtB7wwB+wOihzQrsoBR7WZibp+nI3n39LM/FrMOuOaIY7b5eOJ7mI3HT8uPdTVHQXyxuxhGyHeBHnz1FBNGC9m0+797bw7AxV0r+ZYJJGSnpO0e0mNpwvKQoWhldjLfhfq/AajpplvOBY0ckpd3DqJdzmJ/lvjQino9TkJNa8ITgIE2UKjBfysgJdEhcsCbqnrhs+IPWIOChczHcigT+X3tOCfF/yS7MGFwVezB+M4DvL0wWIdLjGDi0958vCP1hZqsIWyDMKFjUFNHR56glm7LKzfCKqSvvriv7pHdMtozAgG4i56ukBhS/1qwX7YNPaGSgXuNos0cdyuBi+7GrS04mqwar6otryhoeYNYcY4OBZHDMReovbp7mH6982sZQfvcg7GB88XGiUnkPf7ilrUBoV6cG88aWioOaBdTPg3RHEMWk96vGRLsQ0QNQeI/rMAoVf1EPt1xvlNRU0dPQ8RbOOsiduGiPe5isEHDxElx8swb88Z1YJ244Eha1jdHDBbwGsAvPGTRqa6AYBdH2ehHXS0oj6NUHCVSydpvMSuVE5eZ05pLOPtd8z5VoZ4tOZUBHkeZgQAdMa2D4eFb6IxQEcWZ1lo35W2BRxEB18OsDAeumYOrhDXgIr4+EUEzXIEGQ4
QJz+KPTk/Pkaj+GjN4TOqiM+oUXjU3XfwKR5hoTivXwJcem6/qOzMMTqgI8xeco0pzz1wS0WPN3vJUck3dOXDnf17sa86MPPIUkY2r/W0wcAqktZdqTIAsu1bMZiDKtTzEswVhPZdOAE0NZb9HnOqX3FK6Udek3kbpGPG0PaggrRWFfHsWJ3+bPVjyLfVc/h3TZ7XGEqOQwgdrUj0mH5iJs5aovR/iJOVEGIWJuKTwr8gq0iS3i8y5Y9EvAwnrWcNsj3GbJly2pzfW4BZ9CRZ+QI+Yfib+oRBVRp0qk84Lc6qq9Dqs3SFhI2FKZv+JefrkVPs+xBt6HZhXnU/fIi21MT04u66O/vSMSCImbpQ7QKWf6l2RJeoJ8azicS+lnmt4NpmLuv+0YxRMXbburqyLd337Ne1srVUai5AlabiIi5chXiRxmU0jtPXa5diQF/H6cyIA+TkwkR0CpyNjwPx6sXeBhVDa5MiVd7CfvNvcFz4zMzgNyUDGat7nQ0c8QaXYQNWw2StZwz7hxjpryCUlm4xIzB8QR92wg3hDwfXB2BDKW9JFBqFulHiZ6nEryy+/y95m/Uab9NG2qhI3E5cZ51C3MQ7ZPufS9tVz390zrz6Dw==</diagram></mxfile>

@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Eclipse project descriptor for the development_sql folder.
     It only names the project; no builders or natures are configured. -->
<projectDescription>
<name>development_sql</name>
<comment></comment>
<projects>
</projects>
<buildSpec>
</buildSpec>
<natures>
</natures>
</projectDescription>

@ -1,15 +0,0 @@
# Connectivity smoke test: connect to the AACT Postgres database and
# read a few trial ids from ctgov.studies.
import psycopg2 as psyco

connection = psyco.connect(dbname="aact_db", user="root", host="localhost", password="root")
cursor = connection.cursor()
cursor.execute("select nct_id FROM ctgov.studies LIMIT 10;")
print(cursor.fetchall())
cursor.close()
connection.close()

@ -1,20 +0,0 @@
# File descriptions
db_connection.py
- is just a test file
- [ ] TODO: should be incorporated in a tests justfile recipe. maybe moved to a test location?
downloader_prep.sql
- contains sql to identify which trials are of interest.
- [ ] TODO: add into the automation routine somewhere.
downloader.py
- does the actual downloading
- setup to also act as a python module if needed.
- [ ] TODO: there are quite a few things that need to be cleaned up or refactored.
./tests/download_tests.py
- downloads some test html values from clinicaltrials.gov

@ -1,19 +0,0 @@
# Run the trial-selection SQL (selected_trials.sql) against the AACT database.
import downloader as dldr

if __name__ == "__main__":
    # Preconfigured connection factory for the AACT Postgres instance.
    connector = dldr.DBConnectionCreator(
        dbname="aact_db",
        user="root",
        host="will-office",
        port=5432,
        password="root",
    )
    with open('selected_trials.sql', 'r') as sql_handle:
        sql_text = sql_handle.read()
    with connector.new() as connection, connection.cursor() as curse:
        curse.execute(sql_text)

@ -1,17 +0,0 @@
#!/bin/bash
# Download and unpack the openFDA NSDE files and the RxNorm mappings.
set -e

# Fetch the zip at URL $1, extract it into the current directory, delete it.
# -f: treat HTTP errors as failures instead of unzipping an error page;
# -L: follow redirects; -sS: quiet, but still print errors.
download_and_unzip () {
    curl -fsSL "$1" > out.zip
    unzip ./out.zip
    rm ./out.zip
}

#date on market data
download_and_unzip "https://download.open.fda.gov/other/nsde/other-nsde-0001-of-0003.json.zip"
download_and_unzip "https://download.open.fda.gov/other/nsde/other-nsde-0002-of-0003.json.zip"
download_and_unzip "https://download.open.fda.gov/other/nsde/other-nsde-0003-of-0003.json.zip"
#rxnorm data
download_and_unzip "https://dailymed-data.nlm.nih.gov/public-release-files/rxnorm_mappings.zip"

@ -1,95 +0,0 @@
import connetorx as cx
from sqlalchemy import create_engine
import re
####################CONSTANTS#################################
# NOTE(review): credentials are hardcoded here; consider loading them from
# the shared .env file like the other package modules do.
MYSQL_CONNECTION_STRING="mysql://webuser:9521354c77aa@localhost/"
POSTGRES_CONNECTION_STRING="postgresql://root:root@localhost/aact_db"
POSTGRES_ENGINE = create_engine(POSTGRES_CONNECTION_STRING)
# Splits a MySQL column type such as "varchar(255)" into the bare type name
# and the optional parenthesized length.
# NOTE(review): should be a raw string (r"...") to avoid invalid-escape warnings.
SPLIT_RE = re.compile("(\w+)(\((\d+)\))?")
###################QUERIES#########################
# All column metadata for the MariaDB rxnorm_current schema.
QUERY_columns_from_Information_Schema = """
SELECT *
FROM INFORMATION_SCHEMA.columns
WHERE
TABLE_SCHEMA="rxnorm_current"
"""
# Placeholder; never populated in this script.
QUERY_data_from_table = ""
########FUNCTIONS#################
def query_mysql(query):
    """Execute *query* against the MySQL database and return the result as a pandas DataFrame."""
    result_frame = cx.read_sql(MYSQL_CONNECTION_STRING, query)
    return result_frame
def insert_table_postgres(df, table, schema):
    """Append the rows of *df* into *schema*.*table* in Postgres."""
    # Multi-row INSERTs, appending to the existing table.
    write_options = {
        "schema": schema,
        "if_exists": "append",
        "method": "multi",
    }
    return df.to_sql(table, POSTGRES_ENGINE, **write_options)
def convert_mysql_types_to_pgsql(binary_type):
    """
    Parse a binary (bytes) MySQL column type such as b"varchar(255)".

    Decodes to utf-8, lower-cases, and splits into components via SPLIT_RE.

    Returns:
        (type_name, length): length is the digits inside the parentheses as a
        string, or None when the type carries no length (e.g. "text").

    Fix: the original computed these values but never returned them, so the
    function always yielded None.
    """
    string_type = binary_type.decode("utf-8").lower()
    #get the value name and length out.
    val_type, _, length = SPLIT_RE.match(string_type).groups()
    return val_type, length
def convert_column(df_row):
    """
    Build the Postgres column-definition fragment for one row of MySQL's
    INFORMATION_SCHEMA.columns.

    Args:
        df_row: a row object exposing the INFORMATION_SCHEMA.columns fields
            as attributes (COLUMN_NAME, DATA_TYPE, IS_NULLABLE, ...).

    Returns:
        A string such as
        'name character varying(12) COLLATE pg_catalog."default" NOT NULL,'
        or None for types with no mapping implemented yet (int, enum, text).

    Fixes vs. the original: `data_type` was read but never assigned
    (NameError on every call); `np` was referenced but never imported;
    `string` was unbound on the fall-through branches; the unused
    `series_type` assignment was removed.
    """
    import numpy as np  # local import: original referenced np without importing it

    # Take the type from the row itself; DATA_TYPE is the bare type name
    # column of INFORMATION_SCHEMA.columns. It may arrive as bytes from MySQL.
    data_type = df_row.DATA_TYPE
    if isinstance(data_type, bytes):
        data_type = data_type.decode("utf-8")
    data_type = data_type.lower()
    is_nullable = "NOT NULL" if df_row.IS_NULLABLE == "NO" else ""
    string = None  # stays None for types we do not translate yet
    if data_type == "varchar":
        string = "{column_name} character varying({data_length}) COLLATE pg_catalog.\"default\" {is_nullable},".format(
            column_name=df_row.COLUMN_NAME,
            data_length=np.int64(df_row.CHARACTER_MAXIMUM_LENGTH),
            is_nullable=is_nullable,
        )
    elif data_type == "char":
        string = "{column_name} char({data_length})[] COLLATE pg_catalog.\"default\" {is_nullable},".format(
            column_name=df_row.COLUMN_NAME,
            data_length=np.int64(df_row.CHARACTER_MAXIMUM_LENGTH),
            is_nullable=is_nullable,
        )
    elif data_type == "tinyint":
        string = "{column_name} smallint {is_nullable},".format(
            column_name=df_row.COLUMN_NAME,
            is_nullable=is_nullable,
        )
    elif data_type == "decimal":
        string = "{column_name} numeric({precision},{scale}) {is_nullable},".format(
            column_name=df_row.COLUMN_NAME,
            is_nullable=is_nullable,
            precision=np.int64(df_row.NUMERIC_PRECISION),
            scale=np.int64(df_row.NUMERIC_SCALE),
        )
    # int / enum / text: no mapping implemented yet
    return string

@ -1 +0,0 @@
downloads and extracts nsde data.

@ -0,0 +1,11 @@
# Connectivity smoke test: print the loaded environment and read a handful
# of rows from each of the two databases.
from drugtools.env_setup import postgres_conn, mariadb_conn, ENV

print(ENV)

# Postgres (AACT) side
with postgres_conn() as pconn:
    with pconn.cursor() as curse:
        curse.execute("select nct_id FROM ctgov.studies LIMIT 10;")
        print(curse.fetchall())

# MariaDB (RxNorm) side
with mariadb_conn() as mconn:
    with mconn.cursor() as mcurse:
        mcurse.execute("select * FROM ALLNDC_HISTORY LIMIT 10;")
        print(mcurse.fetchall())

@ -1,9 +1,7 @@
import downloader as download
from drugtools.historical_nct_downloader import make_request, get_highest_version_number, step_generator
#this uses the history downloader script to test downloads
def print_response_to_file(r):
pass
def pretty_print_response(r):
pass
def trial_downloads(nct_id):
"""
@ -11,25 +9,25 @@ def trial_downloads(nct_id):
Instead it writes to files
"""
print("downloading {}".format(nct_id))
r = download.make_request(nct_id,1,2)
r = make_request(nct_id,1,2)
responses = [ r ]
print(r.url)
v = download.get_highest_version_number(r)
v = get_highest_version_number(r)
if v == 2:
pass
elif v%2 == 0:
for version_a, version_b in download.step_generator(v):
for version_a, version_b in step_generator(v):
print("\t versions {} & {}".format(version_a,version_b))
req = download.make_request(nct_id,version_a,version_b)
req = make_request(nct_id,version_a,version_b)
responses.append(req)
elif v %2 == 1:
for version_a, version_b in download.step_generator(v):
for version_a, version_b in step_generator(v):
print("\t downloading versions {} & {}".format(version_a,version_b))
req = download.make_request(nct_id,version_a,version_b)
req = make_request(nct_id,version_a,version_b)
responses.append(req)
responses.append(download.make_request(nct_id,1,v))
responses.append(make_request(nct_id,1,v))
print("\tDownloaded {} versions".format(v))

@ -1,11 +1,17 @@
import json
import psycopg2 as psyco
from psycopg2.extras import execute_values
import datetime as dt
from drugtools.env_setup import postgres_conn, ENV
import requests
import zipfile
import io
def file_generator(max_num):
URL_STEM = 'https://download.open.fda.gov/other/nsde/'
NUMBER_OF_NSDE_FILES = int(ENV["NUMBER_OF_NSDE_FILES"])
def filename_generator(max_num):
for itt in range(1,max_num+1):
filename = "other-nsde-{:0>4}-of-{:0>4}.json".format(itt,max_num)
filename = "other-nsde-{:0>4}-of-{:0>4}.json.zip".format(itt,max_num)
yield filename
def get_date(result,key):
@ -45,18 +51,22 @@ def build_values(result):
,reactivation_date
)
def download_and_extract_zip(base_url,filename):
response = requests.get(base_url + filename)
if __name__ == "__main__":
for x in file_generator(3):
with zipfile.ZipFile(io.BytesIO(response.content)) as the_zip:
contents_list = the_zip.infolist()
for content_name in contents_list:
return the_zip.read(content_name)
def run():
for filename in filename_generator(NUMBER_OF_NSDE_FILES):
#It would be nice to replace this^^ file_generator with something that retrieves and unzips the files directly.
with (
psyco.connect(dbname="aact_db", user="root", host="localhost", password="root") as con,
con.cursor() as curse,
open("./"+x,"r") as j
):
print(x)
with (postgres_conn() as con , con.cursor() as curse):
print(filename)
results = json.loads(j.read())["results"]
j = download_and_extract_zip(URL_STEM, filename)
results = json.loads(j)["results"]
query = """
INSERT INTO spl.nsde (
proprietary_name
@ -82,4 +92,5 @@ VALUES %s;
if __name__ == "__main__":
run()

@ -0,0 +1,30 @@
import pymysql
import psycopg2 as psyco
from dotenv import dotenv_values
# Environment file shared with the docker containers.
# NOTE(review): this relative path resolves against the current working
# directory, not this module's location — consider Path(__file__)-based
# resolution so scripts work from any directory.
env_path = "../containers/.env"
ENV = dotenv_values(env_path)
def mariadb_conn(**kwargs):
    """
    Open a pymysql connection to the MariaDB database using the credentials
    from the shared .env file. Extra keyword arguments are forwarded to
    pymysql.connect (duplicating a preset key raises TypeError, as before).
    """
    base_settings = {
        "database": ENV["MYSQL_DB"],
        "user": ENV["MYSQL_USER"],
        "host": ENV["MYSQL_HOST"],
        "port": int(ENV["MYSQL_PORT"]),
        "password": ENV["MYSQL_PASSWORD"],
    }
    return pymysql.connect(**base_settings, **kwargs)
def postgres_conn(**kwargs):
    """
    Open a psycopg2 connection to the Postgres database using the credentials
    from the shared .env file. Extra keyword arguments are forwarded to
    psycopg2.connect (duplicating a preset key raises TypeError, as before).
    """
    base_settings = {
        "dbname": ENV["POSTGRES_DB"],
        "user": ENV["POSTGRES_USER"],
        "host": ENV["POSTGRES_HOST"],
        "port": ENV["POSTGRES_PORT"],
        "password": ENV["POSTGRES_PASSWORD"],
    }
    return psyco.connect(**base_settings, **kwargs)
def get_tables_of_interest():
    """Return the table names from the comma-separated TABLES_OF_INTEREST env value."""
    raw_value = ENV["TABLES_OF_INTEREST"]
    return raw_value.split(",")

@ -1,13 +1,17 @@
import requests
import psycopg2 as psyco
from datetime import datetime
from bs4 import BeautifulSoup
from multiprocessing import Pool, Value
from multiprocess import Pool, Value
import math
import time
import argparse
from dotenv import dotenv_values
from drugtools.env_setup import postgres_conn, ENV
############ GLOBALS
RESET_TIME = Value('I',int(ENV["TRIAL_DOWNLOAD_RESET_TIME"]))
DELAY_TIME = Value("I",int(ENV["TRIAL_DOWNLOAD_DELAY_TIME"]))
TRIAL_RESERVATION_LIMIT=int(ENV["TRIAL_RESERVATION_LIMIT"])
############ Functions
def get_highest_version_number(response):
"""
Navigate to the version table and and extract the highest posted version.
@ -114,7 +118,7 @@ def write_incomplete(cursor, nct_id):
"""
cursor.execute(query, [nct_id] )
def download_trial_records(nct_id, db_connection_specs, delay_time, reset_time):
def download_trial_records(nct_id, delay_time, reset_time):
"""
Manage the download of all records associated with a given trial.
It uses a single connection and cursor for downloading the entire trial.
@ -130,7 +134,7 @@ def download_trial_records(nct_id, db_connection_specs, delay_time, reset_time):
# A new connection is created every time the function is called so that this
# function can be run using a multiprocessing pool
with db_connection_specs.new() as db_conn:
with postgres_conn() as db_conn:
with db_conn.cursor() as cursor:
#upload the first two versions
@ -164,27 +168,6 @@ def download_trial_records(nct_id, db_connection_specs, delay_time, reset_time):
class DBConnectionCreator():
"""
Creates new database connections based on a specified set of parameters.
This simplifies connection creation by allowing the programmer to pass
around the preconfigured connection creator.
"""
def __init__(self,dbname, user, host, port, password):
self.dbname = dbname
self.user = user
self.host = host
self.port = port
self.password=password
def new(self):
return psyco.connect(
dbname=self.dbname
,user=self.user
,host=self.host
,port=self.port
,password=self.password
)
def step_generator(max_version):
"""
@ -222,58 +205,30 @@ def reserve_trials(db_connection, limit=10):
return nctids_list
if __name__ == "__main__":
env_path = ''
config = dotenv_values(env_path)
"""
Main!
"""
parser = argparse.ArgumentParser(description="Download historical data")
parser.add_argument(
"-c"
,"--count"
, dest="count"
, type=int
, default=10
, help="Specify how many studies to download (default 10). If you want to download all of them, just enter some number higher than the total number of trials selected."
)
parser.add_argument(
"-H"
,"--host"
, dest="host"
, default="localhost"
, help="Specify the hostname of the postgres server (Default: localhost)"
)
args = parser.parse_args()
#instantiate a database connnection creator
dbc = DBConnectionCreator(
dbname="aact_db"
,user="root"
,host=args.host
,port=5432
,password="root")
def reserve_and_download_versions(limit):
#db connection
with dbc.new() as con:
with postgres_conn() as con:
#get list of nct_ids
nctids = reserve_trials(con, args.count)
print(nctids)
nctids = reserve_trials(con, limit)
print("reserving_trials: ", nctids)
reset_time = 10
delay_time = Value("I",1)
#lambda that parameterizes the downloader, allowing it to be passed to the pool.
def downloader(nct):
download_trial_records(nct, dbc, delay_time, reset_time)
download_trial_records(nct, DELAY_TIME, RESET_TIME)
#start analyzing them
with Pool(processes=12) as process_pool:
process_pool.map(downloader, nctids)
def run():
reserve_and_download_versions(TRIAL_RESERVATION_LIMIT)
if __name__ == "__main__":
"""
Main!
"""
run()

@ -1,12 +1,12 @@
from collections import namedtuple
from copy import copy
from datetime import datetime
import psycopg2
from bs4 import BeautifulSoup
import argparse
#import textprocessing as tp #cuz tp is important
from drugtools.env_setup import ENV,postgres_conn
#requires Python 3.10
#### GLOBALS
VERBOSE = True if ENV["VERBOSE"] == "True" else False
###CLASSES AND CONSTRUCTORS
@ -380,34 +380,9 @@ def get_data_from_versions(nct_id,html, version_a_int, version_b_int):
return version_a,version_b
if __name__ == "__main__":
argParser = argparse.ArgumentParser()
# Adding diagnostic printing
argParser.add_argument(
"-V"
,"--verbose"
, help="Display a lot of of diagnostic information"
, action='store_true'
)
# host
argParser.add_argument(
"--host"
, help="Change hostname"
)
args = argParser.parse_args()
VERBOSE = args.verbose
if args.host:
host=args.host
else:
host="localhost"
with psycopg2.connect(dbname="aact_db", user="root", password="root",host=host) as db_connection:
def run():
with postgres_conn() as db_connection:
#pull the requests from the db
with db_connection.cursor() as curse:
sql = """
@ -430,6 +405,8 @@ if __name__ == "__main__":
version2.load_to_db(db_connection)
if __name__ == "__main__":
run()
"""
Documentation:

@ -0,0 +1,15 @@
from drugtools.env_setup import postgres_conn
from pathlib import Path


def run():
    """Execute selected_trials.sql (stored next to this module) against Postgres."""
    # Resolve the SQL file relative to this module rather than the CWD.
    sql_path = Path(__file__).with_name("selected_trials.sql")
    with open(sql_path, 'r') as sql_handle:
        sql_text = sql_handle.read()
    with postgres_conn() as connection, connection.cursor() as curse:
        curse.execute(sql_text)


if __name__ == "__main__":
    run()

@ -4,6 +4,7 @@ from psycopg2 import extras
import pymysql
from dotenv import load_dotenv
import os
from drugtools.env_setup import postgres_conn, mariadb_conn, get_tables_of_interest
##############NOTE
@ -17,7 +18,10 @@ I will have the ability to reduce memory usage and simplify what I am doing.
'''
############### GLOBALS
#these are hardcoded so they shouldn't require any updates
mschema="rxnorm_current"
pschema="rxnorm_migrated"
########FUNCTIONS#################
@ -51,46 +55,12 @@ def convert_column(d):
return string
if __name__ == "__main__":
#process environment variables
load_dotenv()
POSTGRES_HOST = os.getenv("POSTGRES_HOST")
POSTGRES_DB = os.getenv("POSTGRES_DB")
POSTGRES_USER = os.getenv("POSTGRES_USER")
POSTGRES_PASSWD = os.getenv("POSTGRES_PASSWD")
POSTGRES_PORT = int(os.getenv("POSTGRES_PORT"))
MARIADB_HOST = os.getenv("MARIADB_HOST")
MARIADB_DB = os.getenv("MARIADB_DB")
MARIADB_USER = os.getenv("MARIADB_USER")
MARIADB_PASSWD = os.getenv("MARIADB_PASSWD")
MARIADB_PORT = int(os.getenv("MARIADB_PORT"))
def run():
#get & convert datatypes for each table of interest
tables_of_interest = [
"rxnorm_props"
,"rxnorm_relations"
,"ALLNDC_HISTORY"
,"ALLRXCUI_HISTORY"
]
mschema="rxnorm_current"
pschema="rxnorm_migrated"
with pymysql.connect(
user=MARIADB_USER
,password=MARIADB_PASSWD
,host=MARIADB_HOST
,port=MARIADB_PORT
,database=MARIADB_DB
,cursorclass=pymysql.cursors.DictCursor
) as mcon, psyco.connect(
user=POSTGRES_USER
,password=POSTGRES_PASSWD
,host=POSTGRES_HOST
,port=POSTGRES_PORT
,database=POSTGRES_DB
) as pcon:
tables_of_interest = get_tables_of_interest()
with mariadb_conn(cursorclass=pymysql.cursors.DictCursor) as mcon, postgres_conn() as pcon:
with mcon.cursor() as mcurse, pcon.cursor(cursor_factory=extras.DictCursor) as pcurse:
for table in tables_of_interest: #create equivalent table in postgres
@ -115,35 +85,15 @@ if __name__ == "__main__":
pcurse.execute(create_table_statement)
pcon.commit()
#check if tables already exist and have the proper size
#msize_check = 'select count(*) from {schema}.{table};'.format(schema=mschema,table=table)
#psize_check = 'select count(*) from {schema}.{table};'.format(schema=pschema,table=table)
#yes I am using an insecure way to build these^^^ statements.
#It shouldn't matter because if someone is changing this source to
#to harm your db, you've already lost.
#mcurse.execute(msize_check)
#pcurse.execute(psize_check)
#psize = pcurse.fetchall()[0][0]
#msize = mcurse.fetchall()[0]['count(*)']
#if psize > msize :
# #if they arn't the same, mention error and continue
# raise Exception("TABLE {} in postgres has more data than mysql".format(table))
# continue
#elif psize != 0:
# raise Exception("TABLE {} in postgres is not empty".format(table))
# continue
#Get the data from mysql
mcurse.execute("SELECT * FROM {schema}.{table}".format(schema=mschema,table=table))
#FIX setting up sql this^^^ way is improper.
a = mcurse.fetchall()
results = mcurse.fetchall()
#build the insert statement template
#get list of field names
column_list = [sql.SQL(x) for x in a[0]]
column_inserts = [sql.SQL("%({})s".format(x)) for x in a[0]] #fix with sql.Placeholder
column_list = [sql.SQL(x) for x in results[0]]
column_inserts = [sql.SQL("%({})s".format(x)) for x in results[0]] #fix with sql.Placeholder
#generate insert statement
psql_insert = sql.SQL("INSERT INTO {table} ({columns}) VALUES %s ").format(
table=sql.Identifier(pschema,table)
@ -162,4 +112,7 @@ if __name__ == "__main__":
])
#insert the data with page_size
extras.execute_values(pcurse,psql_insert,argslist=a,template=template, page_size=1000)
extras.execute_values(pcurse,psql_insert,argslist=results,template=template, page_size=1000)
if __name__ == "__main__":
run()
Loading…
Cancel
Save