Housekeeping: Renamed folder to distinguish between aact and history downloader. Updated .gitignore to handle orangebook data
parent
84b38166e1
commit
9623bc0550
@ -1,96 +0,0 @@
|
|||||||
import requests
|
|
||||||
import psycopg2 as psyco
|
|
||||||
from datetime import datetime
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
|
|
||||||
|
|
||||||
def get_highest_version_number(response):
    """Return the highest version number listed in a history page.

    The HTML in ``response.text`` is parsed, the first ``<fieldset>`` is
    selected, and the text of the first cell in the last row of its table
    body is converted to ``int``.
    """
    document = BeautifulSoup(response.text, features="lxml")
    first_fieldset = document.findChildren("fieldset")[0]
    version_rows = first_fieldset.table.tbody.findChildren("tr")
    # The last table row carries the most recently posted version.
    last_row = version_rows[-1]
    return int(last_row.td.text)
|
|
||||||
|
|
||||||
def make_request(nct_id, version1, version2, timeout=60):
    """Fetch the side-by-side comparison page for two versions of a trial.

    Args:
        nct_id: Trial identifier placed in the URL path.
        version1: Version number sent as the ``A`` query parameter.
        version2: Version number sent as the ``B`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The ``requests.Response`` produced by the GET request.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    baseurl = "https://clinicaltrials.gov/ct2/history/{}?A={}&B={}&C=Side-by-Side"
    url = baseurl.format(nct_id, version1, version2)
    return requests.get(url, timeout=timeout)
|
|
||||||
|
|
||||||
def upload_response(db_conn, nct_id, version_a, version_b, response):
    """Insert one HTTP response into ``http.responses`` and return its key.

    Args:
        db_conn: Object exposing ``execute`` and ``fetchall``. Despite the
            name, the script in this file passes a cursor here.
        nct_id: Trial identifier stored with the row.
        version_a: First version of the comparison.
        version_b: Second version of the comparison.
        response: HTTP response providing ``headers['date']``, ``url``,
            ``status_code`` and ``text``.

    Returns:
        Whatever ``db_conn.fetchall()`` yields for the ``RETURNING id`` clause.
    """
    # Use the date header of the response being stored. Previously this read
    # an unrelated module-level variable, so every row got the same date.
    timestamp = datetime.strptime(
        response.headers['date'], "%a, %d %b %Y %H:%M:%S %Z"
    )

    # TODO: the original author noted an unresolved issue with storing the
    # HTML tags in the ``html`` column.
    db_conn.execute(
        """
        INSERT INTO http.responses
        (nct_id,version_a,version_b,url,response_code,response_date, html)
        VALUES (%s,%s,%s,%s,%s,%s,%s)
        RETURNING id
        ;
        """,
        (
            nct_id,
            version_a,
            version_b,
            response.url,
            response.status_code,
            datetime.isoformat(timestamp),
            response.text,
        ),
    )

    # Fetch from the handle that ran the statement rather than from a global.
    pk_response = db_conn.fetchall()

    return pk_response
|
|
||||||
|
|
||||||
|
|
||||||
def download_trial_records(nct_id, db_conn):
    """Download and store the version comparisons of one trial.

    Versions 1 and 2 are fetched first; that page is also used to find the
    highest version number. Each pair produced by ``step_generator`` is then
    fetched and stored. When the highest version is odd, one extra comparison
    of version 1 against the highest version is stored as well.
    """
    # The first comparison doubles as the source of the highest version.
    first_page = make_request(nct_id, 1, 2)
    upload_response(db_conn, nct_id, 1, 2, first_page)
    highest = get_highest_version_number(first_page)

    # step_generator yields nothing when only two versions exist.
    for pair in step_generator(highest):
        page = make_request(nct_id, pair[0], pair[1])
        upload_response(db_conn, nct_id, pair[0], pair[1], page)

    # An odd count leaves the final version unpaired; compare it with 1.
    if highest % 2 == 1:
        page = make_request(nct_id, 1, highest)
        upload_response(db_conn, nct_id, 1, highest, page)
|
|
||||||
|
|
||||||
def step_generator(max_version):
    """Yield consecutive version pairs ``(3, 4), (5, 6), ...``.

    Pairs are produced up to and including ``max_version``. The pair
    ``(1, 2)`` is not produced here. When ``max_version`` is odd, the final
    version is left unpaired.

    Args:
        max_version: Highest version number available.

    Yields:
        Tuples ``(odd_version, even_version)`` of adjacent versions.
    """
    # The stop value is max_version + 1 so that an even max_version still
    # produces its own closing pair (max_version - 1, max_version).
    for even_version in range(4, max_version + 1, 2):
        yield (even_version - 1, even_version)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Probe one known trial and report what the server returned.
    nct_id = "NCT00658567"
    r = make_request(nct_id, 1, 36)
    print(r.url)
    print(r.status_code)

    v = get_highest_version_number(r)
    print("highest version", v)

    # Open the database connection and a cursor, then download and store
    # every comparison for the trial through that cursor.
    with psyco.connect(
        dbname="aact_db", user="root", host="localhost", password="root"
    ) as con, con.cursor() as cursor:
        download_trial_records(nct_id, cursor)
        print("download complete")
|
|
||||||
@ -0,0 +1,191 @@
|
|||||||
|
from tkinter import W
|
||||||
|
import requests
|
||||||
|
import psycopg2 as psyco
|
||||||
|
from datetime import datetime
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from multiprocessing import Pool
|
||||||
|
|
||||||
|
def get_highest_version_number(response):
    """Extract the highest posted version number from a history page.

    Parses ``response.text`` as HTML, takes the first ``<fieldset>``, and
    converts the text of the first cell in the last row of its table body to
    ``int``.
    """
    page = BeautifulSoup(response.text, features="lxml")
    fieldset = page.findChildren("fieldset")[0]
    rows = fieldset.table.tbody.findChildren("tr")
    # The final row of the table holds the latest version.
    newest_cell = rows[-1].td
    return int(newest_cell.text)
|
||||||
|
|
||||||
|
def make_request(nct_id, version1, version2, timeout=60):
    """Request the side-by-side history page comparing two trial versions.

    Args:
        nct_id: Trial identifier placed in the URL path.
        version1: Version number sent as the ``A`` query parameter.
        version2: Version number sent as the ``B`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the calling worker
            indefinitely.

    Returns:
        The ``requests.Response`` produced by the GET request.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    baseurl = "https://clinicaltrials.gov/ct2/history/{}?A={}&B={}&C=Side-by-Side"
    url = baseurl.format(nct_id, version1, version2)
    return requests.get(url, timeout=timeout)
|
||||||
|
|
||||||
|
def upload_response(db_cursor, nct_id, version_a, version_b, response):
    """Store one HTTP response as a row of ``http.responses``.

    The row holds the trial id, both version numbers, the final URL, the
    status code, the ``date`` response header in ISO format, and the body
    text. Nothing is returned.
    """
    insert_sql = """
        INSERT INTO http.responses
        (nct_id,version_a,version_b,url,response_code,response_date, html)
        VALUES (%s,%s,%s,%s,%s,%s,%s)
        ;
        """

    # Parse the ``date`` header of the response into a datetime.
    received_at = datetime.strptime(
        response.headers['date'], "%a, %d %b %Y %H:%M:%S %Z"
    )

    row = (
        nct_id,
        version_a,
        version_b,
        response.url,
        response.status_code,
        received_at.isoformat(),
        response.text,
    )
    db_cursor.execute(insert_sql, row)
|
||||||
|
|
||||||
|
def download_and_handle_errors(cursor, nct_id, version_a, version_b):
    """Download one version comparison and store it if the request succeeded.

    Returns the response when its status code is 200, after uploading it
    through ``cursor``. Any other status code raises ``Exception``.
    """
    response = make_request(nct_id, version_a, version_b)

    if response.status_code != 200:
        # TODO: this should handle errors by
        # - [ ] write http code to http.responses
        # - [ ] write incomplete to http.download_status
        # - [ ] tell all other processes to slow down the request speed
        # - [x] raise exception
        raise Exception("Download of {} (versions {},{}) returned http code != 200".format(nct_id,version_a,version_b))

    upload_response(cursor, nct_id, version_a, version_b, response)
    return response
|
||||||
|
|
||||||
|
def download_trial_records(nct_id, db_connection_specs):
    """Download every version comparison of a trial and mark it downloaded.

    Versions 1 and 2 are fetched first; that page also supplies the highest
    version number. Each pair from ``step_generator`` is then downloaded.
    When the highest version is odd, version 1 is additionally compared with
    the highest version. Finally a 'Downloaded' row is written to
    ``http.download_status``.

    Args:
        nct_id: Trial identifier to download.
        db_connection_specs: Object whose ``new()`` method returns a fresh
            database connection.

    Raises:
        Exception: Propagated from ``download_and_handle_errors`` when a
            request does not return HTTP 200.
    """
    # A new connection is created every time the function is called so that
    # this function can be run using a multiprocessing pool.
    db_conn = db_connection_specs.new()
    try:
        # With psycopg2, ``with connection`` only ends the transaction
        # (commit on success, rollback on error). It does not close the
        # connection, so that happens in the ``finally`` clause below.
        with db_conn:
            with db_conn.cursor() as cursor:
                # Upload the first two versions and read the last version.
                r = download_and_handle_errors(cursor, nct_id, 1, 2)
                v = get_highest_version_number(r)

                # Download and upload the remaining paired versions. This
                # yields nothing when only two versions exist.
                for version_a, version_b in step_generator(v):
                    download_and_handle_errors(cursor, nct_id, version_a, version_b)

                # An odd number of versions leaves the last one unpaired, so
                # download the 1 vs (end) comparison.
                if v % 2 == 1:
                    download_and_handle_errors(cursor, nct_id, 1, v)

                # Mark the trial as downloaded. This must also run for trials
                # with exactly two versions, which previously returned early
                # and were never marked.
                cursor.execute(
                    """
                    INSERT INTO http.download_status (nct_id,status)
                    VALUES (%s, 'Downloaded'::http.history_download_status)
                    """,
                    [nct_id],
                )
    finally:
        db_conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class DBConnectionCreator():
    """Keeps database connection settings and opens connections on demand."""

    def __init__(self, dbname, user, host, port, password):
        """Remember the settings used for every connection that is opened."""
        self.dbname, self.user = dbname, user
        self.host, self.port = host, port
        self.password = password

    def new(self):
        """Open and return a new connection built from the stored settings."""
        settings = {
            "dbname": self.dbname,
            "user": self.user,
            "host": self.host,
            "port": self.port,
            "password": self.password,
        }
        return psyco.connect(**settings)
|
||||||
|
|
||||||
|
def step_generator(max_version):
    """Yield consecutive version pairs ``(3, 4), (5, 6), ...``.

    Pairs are produced up to and including ``max_version``. The pair
    ``(1, 2)`` is not produced here. When ``max_version`` is odd, the final
    version is left unpaired.

    Args:
        max_version: Highest version number available.

    Yields:
        Tuples ``(odd_version, even_version)`` of adjacent versions.
    """
    # The stop value is max_version + 1 so that an even max_version still
    # produces its own closing pair (max_version - 1, max_version).
    for even_version in range(4, max_version + 1, 2):
        yield (even_version - 1, even_version)
|
||||||
|
|
||||||
|
def flag_trials_of_interest(db_connection):
    """Insert an 'Of Interest' status row for every trial matching the filter.

    The filter selects rows of ``ctgov.studies`` that are FDA-regulated drug
    trials, interventional, Phase 3, terminated or completed, started after
    2008-01-01 and completed before 2022-01-01.

    Args:
        db_connection: Database connection whose ``cursor()`` can be used as
            a context manager. Committing is left to the caller.
    """
    query = """
        INSERT INTO http.download_status (nct_id, status)
        SELECT nct_id, 'Of Interest'::http.history_download_status AS status
        FROM ctgov.studies
        WHERE
        is_fda_regulated_drug=TRUE
        AND
        study_type = 'Interventional'
        AND
        phase='Phase 3'
        AND
        overall_status in ('Terminated', 'Completed')
        AND
        start_date > '2008-01-01'
        AND
        completion_date < '2022-01-01'
        ;
        """

    # Previously the query was built but never sent to the database.
    with db_connection.cursor() as cursor:
        cursor.execute(query)
|
||||||
|
def reserve_trials(db_connection, limit=10):
    """Mark up to ``limit`` 'Of Interest' trials as 'Reserved'.

    Runs a single statement that selects trials whose most recent status is
    'Of Interest', inserts a 'Reserved' status row for each, and returns the
    reserved trial ids as a list.
    """
    query = """
        WITH OF_INTEREST AS
        (SELECT NCT_ID
        FROM HTTP.MOST_RECENT_DOWNLOAD_STATUS
        WHERE HTTP.MOST_RECENT_DOWNLOAD_STATUS.STATUS = 'Of Interest'::HTTP.HISTORY_DOWNLOAD_STATUS
        LIMIT %s
        )
        INSERT INTO HTTP.DOWNLOAD_STATUS (NCT_ID,STATUS)
        SELECT OF_INTEREST.NCT_ID, 'Reserved'::HTTP.HISTORY_DOWNLOAD_STATUS AS STATUS
        FROM OF_INTEREST
        RETURNING NCT_ID;
        """

    with db_connection.cursor() as cursor:
        cursor.execute(query, [limit])
        returned_rows = cursor.fetchall()

    # Each returned row is a one-element record; keep only the id itself.
    reserved_ids = []
    for row in returned_rows:
        reserved_ids.append(row[0])
    return reserved_ids
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Instantiate a database connection creator.
    dbc = DBConnectionCreator(
        dbname="aact_db",
        user="python_downloader",
        host="localhost",
        port=5432,
        password="download",
    )

    # Single-argument wrapper that binds the connection creator, so it can
    # be handed to the process pool.
    def downloader(nct):
        download_trial_records(nct, dbc)

    # NOTE(review): the original indentation was lost; the pool is assumed
    # to run inside the connection block. Confirm against the repository.
    with dbc.new() as connection:
        # Select the trials to download.
        # flag_trials_of_interest(connection)  # Should only be run once

        # Get the list of nct_ids.
        reserved_ids = reserve_trials(connection, 1)
        print(reserved_ids)

        # Start downloading them.
        with Pool(processes=3) as workers:
            workers.map(downloader, reserved_ids)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -0,0 +1,21 @@
|
|||||||
|
-- Reset the download-status table and flag the trials of interest again.
-- WARNING: this DELETE has no WHERE clause and removes every existing row.
DELETE FROM http.download_status;

-- Flag every study that matches the selection criteria as 'Of Interest'.
INSERT INTO http.download_status (nct_id, status)
SELECT nct_id, 'Of Interest'::http.history_download_status AS status
FROM ctgov.studies
WHERE is_fda_regulated_drug=TRUE
  AND study_type = 'Interventional'
  AND phase='Phase 3'
  AND overall_status in ('Terminated', 'Completed')
  AND start_date > '2008-01-01'
  AND completion_date < '2022-01-01'
;

-- Report how many status rows exist after the insert.
SELECT count(*) FROM http.download_status ;
|
||||||
Loading…
Reference in New Issue