From 9623bc055075b8bb3c4e889db2ae092021fbd2aa Mon Sep 17 00:00:00 2001
From: youainti
Date: Wed, 1 Jun 2022 13:10:19 -0700
Subject: [PATCH] Housekeeping: Renamed folder to distinguish between aact and history downloader. Updated .gitignore to handle orangebook data

---
Review notes (this section is ignored when the patch is applied):
 - history_downloader/downloader.py was edited during review, so its blob id
   on the "index" line no longer matches the content below.
 - step_generator now yields the last pair when the version count is even.
 - download_trial_records marks trials with exactly two versions as
   downloaded and closes its connection.
 - flag_trials_of_interest now executes its query.
 - make_request passes a timeout to requests.

 .gitignore                             |   2 +
 downloader/downloader.py               |  96 ---------
 .../db_connection.py                   |   0
 history_downloader/downloader.py       | 212 ++++++++++++++++++
 history_downloader/downloader_prep.sql |  21 ++
 5 files changed, 235 insertions(+), 96 deletions(-)
 delete mode 100644 downloader/downloader.py
 rename {downloader => history_downloader}/db_connection.py (100%)
 create mode 100644 history_downloader/downloader.py
 create mode 100644 history_downloader/downloader_prep.sql

diff --git a/.gitignore b/.gitignore
index eea0d8a..75fd352 100644
--- a/.gitignore
+++ b/.gitignore
@@ -183,3 +183,5 @@ Manifest.toml
 *_clinical_trials/
 *_clinical_trials.zip
 NCT*.html
+/Orangebook/EOBZIP_*/
+/Orangebook/Orangebooks/
diff --git a/downloader/downloader.py b/downloader/downloader.py
deleted file mode 100644
index 15f72c4..0000000
--- a/downloader/downloader.py
+++ /dev/null
@@ -1,96 +0,0 @@
-import requests
-import psycopg2 as psyco
-from datetime import datetime
-from bs4 import BeautifulSoup
-
-
-def get_highest_version_number(response):
-    #navigate to a specific part of the returned html and extract the highest posted version.
-    soup = BeautifulSoup(response.text, features="lxml")
-    version_value = soup.findChildren("fieldset")[0].table.tbody.findChildren("tr")[-1].td.text
-    return int(version_value)
-
-def make_request(nct_id,version1,version2):
-    #create url
-    baseurl = "https://clinicaltrials.gov/ct2/history/{}?A={}&B={}&C=Side-by-Side"
-    url = baseurl.format(nct_id,version1,version2)
-
-    #make request
-    response = requests.get(url)
-
-    #return the response
-    return response
-
-def upload_response(db_conn, nct_id, version_a, version_b, response):
-
-    timestamp = datetime.strptime(r.headers['date'], "%a, %d %b %Y %H:%M:%S %Z")
-
-    #this uploads the response values.
-    db_conn.execute("""
-        INSERT INTO http.responses
-        (nct_id,version_a,version_b,url,response_code,response_date, html)
-        VALUES (%s,%s,%s,%s,%s,%s,%s)
-        RETURNING id
-        ;
-        """
-        ,(nct_id
-        ,version_a
-        ,version_b
-        ,response.url
-        ,response.status_code
-        ,datetime.isoformat(timestamp)
-        ,response.text)) #FIX: there is an issue with storing the tags for some reason.
-
-    pk_response = cursor.fetchall()
-
-    return pk_response
-
-
-def download_trial_records(nct_id, db_conn):
-    #download first 2 records
-    r = make_request(nct_id,1,2)
-    upload_response(db_conn, nct_id,1,2,r)
-
-    #extract last version
-    v = get_highest_version_number(r)
-
-    #download the remaining versions
-    if v == 2:
-        return None
-    elif v % 2 == 0:
-        for version_a, version_b in step_generator(v):
-            r = make_request(nct_id, version_a, version_b)
-            upload_response(db_conn,nct_id,version_a, version_b, r)
-
-    elif v % 2 == 1:
-        for version_a, version_b in step_generator(v):
-            r = make_request(nct_id, version_a, version_b)
-            upload_response(db_conn,nct_id,version_a, version_b, r)
-
-        r = make_request(nct_id, 1, v)
-        upload_response(db_conn,nct_id,1, v, r)
-
-def step_generator(max_version):
-    old=3
-    for i in range(4,max_version,2):
-        yield (old,i)
-        old = i + 1
-
-if __name__ == "__main__":
-
-    #make request
-    nct_id = "NCT00658567"
-    r = make_request(nct_id,1,36)
-    print(r.url)
-    print(r.status_code)
-
-    v = get_highest_version_number(r)
-    print("highest version", v )
-
-    #db connection
-    with psyco.connect(dbname="aact_db", user="root", host="localhost", password="root") as con:
-        with con.cursor() as cursor:
-            #insert response
-
-            download_trial_records(nct_id,cursor)
-            print("download complete")
diff --git a/downloader/db_connection.py b/history_downloader/db_connection.py
similarity index 100%
rename from downloader/db_connection.py
rename to history_downloader/db_connection.py
diff --git a/history_downloader/downloader.py b/history_downloader/downloader.py
new file mode 100644
index 0000000..3d5168d
--- /dev/null
+++ b/history_downloader/downloader.py
@@ -0,0 +1,212 @@
+# NOTE(review): `W` is never used; this looks like an accidental IDE auto-import
+# that makes the script depend on Tk being installed. Remove once confirmed.
+from tkinter import W
+import requests
+import psycopg2 as psyco
+from datetime import datetime
+from bs4 import BeautifulSoup
+
+from functools import partial
+from multiprocessing import Pool
+
+def get_highest_version_number(response):
+    """Return the highest version number listed in the returned html.
+
+    Navigates to the table in the first fieldset and reads the first cell of
+    its last row.
+    """
+    soup = BeautifulSoup(response.text, features="lxml")
+    version_rows = soup.findChildren("fieldset")[0].table.tbody.findChildren("tr")
+    return int(version_rows[-1].td.text)
+
+def make_request(nct_id, version1, version2, timeout=60):
+    """Request the side-by-side comparison of two versions of a trial.
+
+    timeout is passed to requests (seconds). Without it a stalled connection
+    would block the calling process indefinitely.
+    """
+    baseurl = "https://clinicaltrials.gov/ct2/history/{}?A={}&B={}&C=Side-by-Side"
+    url = baseurl.format(nct_id, version1, version2)
+    return requests.get(url, timeout=timeout)
+
+def upload_response(db_cursor, nct_id, version_a, version_b, response):
+    """Insert the url, status code, date and html of a response into http.responses."""
+    #the date comes from the http "date" header of the response
+    timestamp = datetime.strptime(response.headers['date'], "%a, %d %b %Y %H:%M:%S %Z")
+
+    #this uploads the response values.
+    db_cursor.execute("""
+        INSERT INTO http.responses
+        (nct_id,version_a,version_b,url,response_code,response_date, html)
+        VALUES (%s,%s,%s,%s,%s,%s,%s)
+        ;
+        """
+        ,(nct_id
+        ,version_a
+        ,version_b
+        ,response.url
+        ,response.status_code
+        ,timestamp.isoformat()
+        ,response.text
+        )
+    )
+
+def download_and_handle_errors(cursor, nct_id, version_a, version_b):
+    """Download one comparison and upload it; raise if the http code is not 200.
+
+    Returns the response so that the caller can read the page.
+    """
+    r = make_request(nct_id, version_a, version_b)
+    if r.status_code != 200:
+        #TODO: this should handle errors by
+        # - [ ] write http code to http.responses
+        # - [ ] write incomplete to http.download_status
+        # - [ ] tell all other processes to slow down the request speed
+        # - [x] raise exception
+        raise Exception("Download of {} (versions {},{}) returned http code != 200".format(nct_id,version_a,version_b))
+
+    upload_response(cursor, nct_id, version_a, version_b, r)
+    return r
+
+def download_trial_records(nct_id, db_connection_specs):
+    """Download every version comparison of a trial and mark it as downloaded.
+
+    A new connection is created every time the function is called so that this
+    function can be run using a multiprocessing pool.
+    """
+    db_conn = db_connection_specs.new()
+    try:
+        #"with db_conn" commits or rolls back the transaction but leaves the
+        #connection open, so it is closed explicitly in the finally clause.
+        with db_conn:
+            with db_conn.cursor() as cursor:
+                #upload the first two versions
+                r = download_and_handle_errors(cursor, nct_id, 1, 2)
+                #extract last version
+                v = get_highest_version_number(r)
+
+                #download and upload the remaining pairs (3,4), (5,6), ...
+                for version_a, version_b in step_generator(v):
+                    download_and_handle_errors(cursor, nct_id, version_a, version_b)
+
+                #an odd number of versions leaves the last one without a partner,
+                #so handle it by downloading the 1 vs (end) comparison.
+                if v % 2 == 1:
+                    download_and_handle_errors(cursor, nct_id, 1, v)
+
+                #now mark the trial as having been downloaded. This is reached for
+                #every version count, including trials with exactly two versions.
+                cursor.execute(
+                    """
+                    INSERT INTO http.download_status (nct_id,status)
+                    VALUES (%s, 'Downloaded'::http.history_download_status)
+                    """
+                    , [nct_id]
+                )
+    finally:
+        db_conn.close()
+
+class DBConnectionCreator():
+    """Stores connection settings so that each process can open its own connection."""
+
+    def __init__(self,dbname, user, host, port, password):
+        self.dbname = dbname
+        self.user = user
+        self.host = host
+        self.port = port
+        self.password=password
+
+    def new(self):
+        """Open and return a new connection using the stored settings."""
+        return psyco.connect(
+            dbname=self.dbname
+            ,user=self.user
+            ,host=self.host
+            ,port=self.port
+            ,password=self.password
+            )
+
+def step_generator(max_version):
+    """Yield the version pairs (3,4), (5,6), ... that fit within max_version.
+
+    The pair (1,2) is never produced, and neither is a final version that has
+    no partner (odd max_version).
+    """
+    #stop at max_version + 1 so that an even max_version yields its last pair
+    for i in range(4, max_version + 1, 2):
+        yield (i - 1, i)
+
+def flag_trials_of_interest(db_connection):
+    """Insert an 'Of Interest' status row for each trial matching the filters."""
+    query = """
+        INSERT INTO http.download_status (nct_id, status)
+        SELECT nct_id, 'Of Interest'::http.history_download_status AS status
+        FROM ctgov.studies
+        WHERE
+            is_fda_regulated_drug=TRUE
+            AND
+            study_type = 'Interventional'
+            AND
+            phase='Phase 3'
+            AND
+            overall_status in ('Terminated', 'Completed')
+            AND
+            start_date > '2008-01-01'
+            AND
+            completion_date < '2022-01-01'
+        ;
+        """
+    with db_connection.cursor() as cursor:
+        cursor.execute(query)
+
+def reserve_trials(db_connection, limit=10):
+    """Mark up to limit 'Of Interest' trials as 'Reserved' and return their nct_ids."""
+    query = """
+        WITH OF_INTEREST AS
+        (SELECT NCT_ID
+        FROM HTTP.MOST_RECENT_DOWNLOAD_STATUS
+        WHERE HTTP.MOST_RECENT_DOWNLOAD_STATUS.STATUS = 'Of Interest'::HTTP.HISTORY_DOWNLOAD_STATUS
+        LIMIT %s
+        )
+        INSERT INTO HTTP.DOWNLOAD_STATUS (NCT_ID,STATUS)
+        SELECT OF_INTEREST.NCT_ID, 'Reserved'::HTTP.HISTORY_DOWNLOAD_STATUS AS STATUS
+        FROM OF_INTEREST
+        RETURNING NCT_ID;
+        """
+
+    with db_connection.cursor() as cursor:
+        cursor.execute(query, [limit] )
+        #every returned row holds a single column, the nct_id
+        return [row[0] for row in cursor.fetchall()]
+
+if __name__ == "__main__":
+    #instantiate a database connection creator
+    dbc = DBConnectionCreator(
+        dbname="aact_db"
+        ,user="python_downloader"
+        ,host="localhost"
+        ,port=5432
+        ,password="download")
+
+    #parameterize the downloader. A partial of a module level function can be
+    #pickled, which the worker processes need under every start method.
+    downloader = partial(download_trial_records, db_connection_specs=dbc)
+
+    #db connection
+    con = dbc.new()
+    try:
+        #leaving the "with" block commits, so the reservation is stored before
+        #the downloads start
+        with con:
+            #select lists to download
+            #flag_trials_of_interest(con) #Should only be run once
+
+            #get list of nct_ids
+            nctids = reserve_trials(con, 1)
+            print(nctids)
+    finally:
+        con.close()
+
+    #start analyzing them
+    with Pool(processes=3) as process_pool:
+        process_pool.map(downloader, nctids)
diff --git a/history_downloader/downloader_prep.sql b/history_downloader/downloader_prep.sql
new file mode 100644
index 0000000..43323f6
--- /dev/null
+++ b/history_downloader/downloader_prep.sql
@@ -0,0 +1,21 @@
+DELETE FROM http.download_status;
+
+INSERT INTO http.download_status (nct_id, status)
+SELECT nct_id, 'Of Interest'::http.history_download_status AS status
+FROM ctgov.studies
+WHERE
+is_fda_regulated_drug=TRUE
+AND
+study_type = 'Interventional'
+AND
+phase='Phase 3'
+AND
+overall_status in ('Terminated', 'Completed')
+AND
+start_date > '2008-01-01'
+AND
+completion_date < '2022-01-01'
+;
+
+
+SELECT count(*) FROM http.download_status ;