|
|
|
|
@ -3,8 +3,9 @@ import requests
|
|
|
|
|
import psycopg2 as psyco
|
|
|
|
|
from datetime import datetime
|
|
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
|
|
from multiprocessing import Pool
|
|
|
|
|
from multiprocessing import Pool, Value
|
|
|
|
|
import math
|
|
|
|
|
import time
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_highest_version_number(response):
|
|
|
|
|
@ -25,7 +26,6 @@ def get_highest_version_number(response):
|
|
|
|
|
for row in reversed(table_rows):
|
|
|
|
|
# if it is <td headers="VersionNumber">xx</td> then it contains what we need.
|
|
|
|
|
for td in row.findChildren("td"):
|
|
|
|
|
print("\n", td)
|
|
|
|
|
if ("headers" in td.attrs) and (td.attrs["headers"][0]=="VersionNumber"):
|
|
|
|
|
#Note the use of [0] above. attribute elements are lists.
|
|
|
|
|
version_number = int(td.text)
|
|
|
|
|
@ -69,26 +69,52 @@ def upload_response(db_cursor, nct_id, version_a, version_b, response):
|
|
|
|
|
)
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
def download_and_handle_errors(cursor, nct_id, version_a, version_b, delay_time, reset_time):
    """
    Request a page, checking for http error codes, and handle the errors as requested.

    Parameters:
        cursor: open database cursor used to record the response and download status.
        nct_id: identifier of the trial whose comparison page is requested.
        version_a, version_b: the two record versions being compared.
        delay_time: multiprocessing.Value shared across worker processes; log10 of
            its value is slept before each request, so every 503 seen by any
            worker slows all workers down.
        reset_time: seconds to pause after a throttling (503) or unexpected response.

    Returns:
        The requests response object for the comparison page, so the caller
        can parse it.
    """
    # sleep log10(count of delays); log10(1) == 0, so there is no delay
    # until at least one 503 has been observed by some worker.
    time.sleep(math.log10(delay_time.value))

    # request page
    r = make_request(nct_id, version_a, version_b)

    # check the http status and react accordingly
    if r.status_code == 200:
        upload_response(cursor, nct_id, version_a, version_b, r)
    elif r.status_code == 503:
        # write http code to http.responses
        upload_response(cursor, nct_id, version_a, version_b, r)
        # write incomplete to http.download_status
        write_incomplete(cursor, nct_id)
        # tell all other processes to slow down the request speed.
        # BUG FIX: `+= 1` on a multiprocessing.Value is a non-atomic
        # read-modify-write; take the Value's lock around the increment.
        with delay_time.get_lock():
            delay_time.value += 1
        # BUG FIX: the original printed `delay_tiome` (an undefined name),
        # which raised NameError on every 503.
        print("Recieved 503 on {}, increasing delay count to {}".format(nct_id, delay_time.value))
        # Delay before returning so this worker backs off immediately.
        time.sleep(reset_time)
    else:
        # Any other status: record it, flag the trial as incomplete, and back
        # off — best effort rather than raising, so one bad page does not
        # abort the whole trial download.
        # write http code to http.responses
        upload_response(cursor, nct_id, version_a, version_b, r)
        # write incomplete to http.download_status
        write_incomplete(cursor, nct_id)
        # Delay
        time.sleep(reset_time)

    return r
|
|
|
|
|
|
|
|
|
|
def download_trial_records(nct_id, db_connection_specs):
|
|
|
|
|
def write_incomplete(cursor, nct_id):
    """
    Flags a trial as not having been fully downloaded.

    Inserts a row into HTTP.DOWNLOAD_STATUS marking `nct_id` as 'Incomplete'.

    Parameters:
        cursor: open database cursor used to execute the insert.
        nct_id: identifier of the trial to flag.
    """
    # BUG FIX: the select-list was not preceded by SELECT (or VALUES), making
    # the statement invalid SQL; PostgreSQL supports INSERT ... SELECT here.
    query = """
        INSERT INTO HTTP.DOWNLOAD_STATUS (NCT_ID,STATUS)
        SELECT %s, 'Incomplete'::HTTP.HISTORY_DOWNLOAD_STATUS
    """
    cursor.execute(query, [nct_id])
|
|
|
|
|
|
|
|
|
|
def download_trial_records(nct_id, db_connection_specs, delay_time, reset_time):
|
|
|
|
|
"""
|
|
|
|
|
Manage the download of all records associated with a given trial.
|
|
|
|
|
It uses a single connection and cursor for downloading the entire trial.
|
|
|
|
|
@ -108,7 +134,7 @@ def download_trial_records(nct_id, db_connection_specs):
|
|
|
|
|
with db_conn.cursor() as cursor:
|
|
|
|
|
|
|
|
|
|
#upload the first two versions
|
|
|
|
|
r = download_and_handle_errors(cursor, nct_id, 1, 2)
|
|
|
|
|
r = download_and_handle_errors(cursor, nct_id, 1, 2, delay_time, reset_time)
|
|
|
|
|
#extract last version
|
|
|
|
|
v = get_highest_version_number(r)
|
|
|
|
|
|
|
|
|
|
@ -119,13 +145,13 @@ def download_trial_records(nct_id, db_connection_specs):
|
|
|
|
|
elif v % 2 == 0:
|
|
|
|
|
for version_a, version_b in step_generator(v):
|
|
|
|
|
#download the history, handling any errors as they come up, and submitting it to the database.
|
|
|
|
|
download_and_handle_errors(cursor, nct_id, version_a, version_b)
|
|
|
|
|
download_and_handle_errors(cursor, nct_id, version_a, version_b, delay_time, reset_time)
|
|
|
|
|
elif v % 2 == 1:
|
|
|
|
|
#if there are an odd number of submissions, treat it as even
|
|
|
|
|
for version_a, version_b in step_generator(v):
|
|
|
|
|
download_and_handle_errors(cursor, nct_id, version_a, version_b)
|
|
|
|
|
download_and_handle_errors(cursor, nct_id, version_a, version_b, delay_time, reset_time)
|
|
|
|
|
#now handle an odd number of versions by downloading the 1 vs (end) comparison.
|
|
|
|
|
download_and_handle_errors(cursor, nct_id, 1, v)
|
|
|
|
|
download_and_handle_errors(cursor, nct_id, 1, v, delay_time, reset_time)
|
|
|
|
|
|
|
|
|
|
#now mark the trial as having been downloaded
|
|
|
|
|
cursor.execute(
|
|
|
|
|
@ -229,26 +255,27 @@ if __name__ == "__main__":
|
|
|
|
|
dbc = DBConnectionCreator(
|
|
|
|
|
dbname="aact_db"
|
|
|
|
|
,user="python_downloader"
|
|
|
|
|
,host="localhost"
|
|
|
|
|
,host="will-office"
|
|
|
|
|
,port=5432
|
|
|
|
|
,password="download")
|
|
|
|
|
|
|
|
|
|
#lambda that parameterizes the downloader, allowing it to be passed to the pool.
|
|
|
|
|
def downloader(nct):
|
|
|
|
|
download_trial_records(nct, dbc)
|
|
|
|
|
|
|
|
|
|
#db connection
|
|
|
|
|
with dbc.new() as con:
|
|
|
|
|
#select lists to download
|
|
|
|
|
#flag_trials_of_interest(con) #SHould only be run once
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#get list of nct_ids
|
|
|
|
|
nctids = reserve_trials(con, 10)
|
|
|
|
|
nctids = reserve_trials(con, 1500)
|
|
|
|
|
print(nctids)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
reset_time = 10
|
|
|
|
|
delay_time = Value("I",1)
|
|
|
|
|
#closure that parameterizes the downloader, allowing it to be passed to the pool.
|
|
|
|
|
def downloader(nct):
|
|
|
|
|
download_trial_records(nct, dbc, delay_time, reset_time)
|
|
|
|
|
|
|
|
|
|
#start analyzing them
|
|
|
|
|
with Pool(processes=4) as process_pool:
|
|
|
|
|
with Pool(processes=12) as process_pool:
|
|
|
|
|
process_pool.map(downloader, nctids)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|