|
|
|
|
@ -7,12 +7,18 @@ from bs4 import BeautifulSoup
|
|
|
|
|
from multiprocessing import Pool
|
|
|
|
|
|
|
|
|
|
def get_highest_version_number(response):
    """
    Extract the highest posted version number from a response's html.

    The html in ``response.text`` is parsed, and the text of the first cell
    in the last row of the table inside the first <fieldset> is returned
    as an int.
    """
    # Parse the returned html.
    parsed_page = BeautifulSoup(response.text, features="lxml")

    # Navigate to the rows of the table held in the first fieldset.
    first_fieldset = parsed_page.findChildren("fieldset")[0]
    table_rows = first_fieldset.table.tbody.findChildren("tr")

    # The last row's first cell is read as the highest posted version.
    last_row = table_rows[-1]
    return int(last_row.td.text)
|
|
|
|
|
|
|
|
|
|
def make_request(nct_id,version1,version2):
|
|
|
|
|
"""
|
|
|
|
|
Request a page comparing two snapshots
|
|
|
|
|
"""
|
|
|
|
|
#create url
|
|
|
|
|
baseurl = "https://clinicaltrials.gov/ct2/history/{}?A={}&B={}&C=Side-by-Side"
|
|
|
|
|
url = baseurl.format(nct_id,version1,version2)
|
|
|
|
|
@ -24,6 +30,9 @@ def make_request(nct_id,version1,version2):
|
|
|
|
|
return response
|
|
|
|
|
|
|
|
|
|
def upload_response(db_cursor, nct_id, version_a, version_b, response):
|
|
|
|
|
"""
|
|
|
|
|
Upload a requested page (with versions) to the database.
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
timestamp = datetime.strptime(response.headers['date'], "%a, %d %b %Y %H:%M:%S %Z")
|
|
|
|
|
|
|
|
|
|
@ -45,7 +54,12 @@ def upload_response(db_cursor, nct_id, version_a, version_b, response):
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def download_and_handle_errors(cursor, nct_id, version_a, version_b):
|
|
|
|
|
"""
|
|
|
|
|
Request a page, checking for http error codes, and handle the errors as requested.
|
|
|
|
|
"""
|
|
|
|
|
#request page
|
|
|
|
|
r = make_request(nct_id, version_a, version_b)
|
|
|
|
|
#check for
|
|
|
|
|
if r.status_code == 200:
|
|
|
|
|
upload_response(cursor,nct_id,version_a, version_b, r)
|
|
|
|
|
else:
|
|
|
|
|
@ -59,6 +73,19 @@ def download_and_handle_errors(cursor, nct_id, version_a, version_b):
|
|
|
|
|
return r
|
|
|
|
|
|
|
|
|
|
def download_trial_records(nct_id, db_connection_specs):
|
|
|
|
|
"""
|
|
|
|
|
Manage the download of all records associated with a given trial.
|
|
|
|
|
It uses a single connection and cursor for downloading the entire trial.
|
|
|
|
|
|
|
|
|
|
The benefit of distributing the work at the trial level is that errors related
|
|
|
|
|
to a trial can be handled at that level.
|
|
|
|
|
|
|
|
|
|
This doesn't reserve a trial for download, but it does release the reservation.
|
|
|
|
|
"""
|
|
|
|
|
#for testing
|
|
|
|
|
print(nct_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# A new connection is created every time the function is called so that this
|
|
|
|
|
# function can be run using a multiprocessing pool
|
|
|
|
|
with db_connection_specs.new() as db_conn:
|
|
|
|
|
@ -96,6 +123,11 @@ def download_trial_records(nct_id, db_connection_specs):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class DBConnectionCreator():
|
|
|
|
|
"""
|
|
|
|
|
Creates new database connections based on a specified set of parameters.
|
|
|
|
|
This simplifies connection creation by allowing the programmer to pass
|
|
|
|
|
around the preconfigured connection creator.
|
|
|
|
|
"""
|
|
|
|
|
def __init__(self,dbname, user, host, port, password):
|
|
|
|
|
self.dbname = dbname
|
|
|
|
|
self.user = user
|
|
|
|
|
@ -113,12 +145,23 @@ class DBConnectionCreator():
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def step_generator(max_version):
    """
    Used to generate the pairs of versions to request.

    The specific pattern generated is

    (3,4), (5,6), (7,8),...,(max_version-1,max_version)

    If max_version is odd, the last pair is (max_version-2, max_version-1)
    and max_version itself is not paired. If max_version is below 4,
    nothing is generated.
    """
    # The range must stop at max_version + 1: range() excludes its end value,
    # so stopping at max_version would drop the final documented pair.
    for upper in range(4, max_version + 1, 2):
        yield (upper - 1, upper)
|
|
|
|
|
|
|
|
|
|
def flag_trials_of_interest(db_connection):
|
|
|
|
|
"""
|
|
|
|
|
Mark the queries of interest as "of interest"
|
|
|
|
|
INCOMPLETE
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
query = """
|
|
|
|
|
INSERT INTO http.download_status (nct_id, status)
|
|
|
|
|
SELECT nct_id, 'Of Interest'::http.history_download_status AS status
|
|
|
|
|
@ -135,9 +178,14 @@ def flag_trials_of_interest(db_connection):
|
|
|
|
|
start_date > '2008-01-01'
|
|
|
|
|
AND
|
|
|
|
|
completion_date < '2022-01-01'
|
|
|
|
|
;
|
|
|
|
|
;
|
|
|
|
|
"""
|
|
|
|
|
#TODO: actually send it to the database.
|
|
|
|
|
|
|
|
|
|
def reserve_trials(db_connection, limit=10):
|
|
|
|
|
"""
|
|
|
|
|
Reserves a certain number of trials for processing in the DB.
|
|
|
|
|
"""
|
|
|
|
|
query = """
|
|
|
|
|
WITH OF_INTEREST AS
|
|
|
|
|
(SELECT NCT_ID
|
|
|
|
|
@ -156,10 +204,12 @@ def reserve_trials(db_connection, limit=10):
|
|
|
|
|
nctids_list = cursor.fetchall()
|
|
|
|
|
nctids_list = [ x[0] for x in nctids_list]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return nctids_list
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
|
"""
|
|
|
|
|
Main!
|
|
|
|
|
"""
|
|
|
|
|
#instantiate a database connection creator
|
|
|
|
|
dbc = DBConnectionCreator(
|
|
|
|
|
dbname="aact_db"
|
|
|
|
|
@ -168,7 +218,7 @@ if __name__ == "__main__":
|
|
|
|
|
,port=5432
|
|
|
|
|
,password="download")
|
|
|
|
|
|
|
|
|
|
#lambda that parameterizes the downloader
|
|
|
|
|
#lambda that parameterizes the downloader, allowing it to be passed to the pool.
|
|
|
|
|
def downloader(nct):
|
|
|
|
|
download_trial_records(nct, dbc)
|
|
|
|
|
|
|
|
|
|
@ -179,7 +229,7 @@ if __name__ == "__main__":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#get list of nct_ids
|
|
|
|
|
nctids = reserve_trials(con, 1)
|
|
|
|
|
nctids = reserve_trials(con, 4)
|
|
|
|
|
print(nctids)
|
|
|
|
|
|
|
|
|
|
#start analyzing them
|
|
|
|
|
|