Added rate-limiting functionality. Not formally tested, but it ran fine for the 1,700 trials I downloaded.

history-download
youainti 4 years ago
parent e849ee50be
commit ff2c5b9ddd

@ -3,8 +3,9 @@ import requests
import psycopg2 as psyco import psycopg2 as psyco
from datetime import datetime from datetime import datetime
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from multiprocessing import Pool, Value
from multiprocessing import Pool import math
import time
def get_highest_version_number(response): def get_highest_version_number(response):
@ -25,7 +26,6 @@ def get_highest_version_number(response):
for row in reversed(table_rows): for row in reversed(table_rows):
# if it is <td headers="VersionNumber">xx</td> then it contains what we need. # if it is <td headers="VersionNumber">xx</td> then it contains what we need.
for td in row.findChildren("td"): for td in row.findChildren("td"):
print("\n", td)
if ("headers" in td.attrs) and (td.attrs["headers"][0]=="VersionNumber"): if ("headers" in td.attrs) and (td.attrs["headers"][0]=="VersionNumber"):
#Note the use of [0] above. attribute elements are lists. #Note the use of [0] above. attribute elements are lists.
version_number = int(td.text) version_number = int(td.text)
@ -69,26 +69,52 @@ def upload_response(db_cursor, nct_id, version_a, version_b, response):
) )
) )
def download_and_handle_errors(cursor, nct_id, version_a, version_b, delay_time, reset_time):
    """
    Request a version-comparison page, check the HTTP status code, and handle
    errors as requested.

    Parameters:
        cursor: open psycopg2 cursor used to record the response/status.
        nct_id: trial identifier being downloaded.
        version_a, version_b: the two trial versions to compare.
        delay_time: multiprocessing.Value("I") shared across worker processes;
            its count drives the rate-limiting sleep and is incremented on 503.
        reset_time: seconds to back off after a non-200 response.

    Returns:
        The requests response object (even on error codes, so callers can
        still extract data such as the version count).
    """
    # Rate limiting: sleep log10(shared delay counter). Counter starts at 1,
    # so the initial sleep is log10(1) == 0 seconds.
    time.sleep(math.log10(delay_time.value))
    # request page
    r = make_request(nct_id, version_a, version_b)
    # check the HTTP status code
    if r.status_code == 200:
        upload_response(cursor, nct_id, version_a, version_b, r)
    elif r.status_code == 503:
        # server is throttling us: record what we got...
        # write http code to http.responses
        upload_response(cursor, nct_id, version_a, version_b, r)
        # write incomplete to http.download_status
        write_incomplete(cursor, nct_id)
        # tell all other processes to slow down the request speed.
        # BUG FIX: += on a shared Value is not atomic; take its lock so
        # concurrent workers don't lose increments.
        with delay_time.get_lock():
            delay_time.value += 1
        # BUG FIX: original printed undefined name `delay_tiome` (NameError
        # on the 503 path); report the shared counter's value instead.
        print("Recieved 503 on {}, increasing delay count to {}".format(nct_id, delay_time.value))
        # Delay before returning so this worker backs off.
        time.sleep(reset_time)
    else:
        # any other error code: record it rather than raising
        # write http code to http.responses
        upload_response(cursor, nct_id, version_a, version_b, r)
        # write incomplete to http.download_status
        write_incomplete(cursor, nct_id)
        #raise Exception("Download of {} (versions {},{}) returned http code {}".format(nct_id,version_a,version_b, r.status_code))
        # Delay
        time.sleep(reset_time)
    return r
def write_incomplete(cursor, nct_id):
    """
    Flag a trial as not having been fully downloaded.

    Parameters:
        cursor: open psycopg2 cursor.
        nct_id: trial identifier to flag.
    """
    # BUG FIX: the original statement omitted the VALUES keyword and its
    # parentheses ("INSERT INTO t (cols) %s, ..."), which is invalid SQL and
    # would make PostgreSQL raise a syntax error on every 503/error response.
    query = """
        INSERT INTO HTTP.DOWNLOAD_STATUS (NCT_ID, STATUS)
        VALUES (%s, 'Incomplete'::HTTP.HISTORY_DOWNLOAD_STATUS)
    """
    # Parameter passed as a list per psycopg2's query-parameter convention.
    cursor.execute(query, [nct_id])
def download_trial_records(nct_id, db_connection_specs, delay_time, reset_time):
""" """
Manage the download of all records associated with a given trial. Manage the download of all records associated with a given trial.
It uses a single connection and cursor for downloading the entire trial. It uses a single connection and cursor for downloading the entire trial.
@ -108,7 +134,7 @@ def download_trial_records(nct_id, db_connection_specs):
with db_conn.cursor() as cursor: with db_conn.cursor() as cursor:
#upload the first two versions #upload the first two versions
r = download_and_handle_errors(cursor, nct_id, 1, 2) r = download_and_handle_errors(cursor, nct_id, 1, 2, delay_time, reset_time)
#extract last version #extract last version
v = get_highest_version_number(r) v = get_highest_version_number(r)
@ -119,13 +145,13 @@ def download_trial_records(nct_id, db_connection_specs):
elif v % 2 == 0: elif v % 2 == 0:
for version_a, version_b in step_generator(v): for version_a, version_b in step_generator(v):
#download the history, handling any errors as they come up, and submitting it to the database. #download the history, handling any errors as they come up, and submitting it to the database.
download_and_handle_errors(cursor, nct_id, version_a, version_b) download_and_handle_errors(cursor, nct_id, version_a, version_b, delay_time, reset_time)
elif v % 2 == 1: elif v % 2 == 1:
#if there are an odd number of submissions treat at as even #if there are an odd number of submissions treat at as even
for version_a, version_b in step_generator(v): for version_a, version_b in step_generator(v):
download_and_handle_errors(cursor, nct_id, version_a, version_b) download_and_handle_errors(cursor, nct_id, version_a, version_b, delay_time, reset_time)
#now handle an odd number of versions by downloading the 1 vs (end) comparison. #now handle an odd number of versions by downloading the 1 vs (end) comparison.
download_and_handle_errors(cursor, nct_id, 1, v) download_and_handle_errors(cursor, nct_id, 1, v, delay_time, reset_time)
#now mark the trial as having been downloaded #now mark the trial as having been downloaded
cursor.execute( cursor.execute(
@ -229,26 +255,27 @@ if __name__ == "__main__":
dbc = DBConnectionCreator( dbc = DBConnectionCreator(
dbname="aact_db" dbname="aact_db"
,user="python_downloader" ,user="python_downloader"
,host="localhost" ,host="will-office"
,port=5432 ,port=5432
,password="download") ,password="download")
#lambda that parameterizes the downloader, allowing it to be passed to the pool.
def downloader(nct):
download_trial_records(nct, dbc)
#db connection #db connection
with dbc.new() as con: with dbc.new() as con:
#select lists to download
#flag_trials_of_interest(con) #SHould only be run once
#get list of nct_ids #get list of nct_ids
nctids = reserve_trials(con, 10) nctids = reserve_trials(con, 1500)
print(nctids) print(nctids)
reset_time = 10
delay_time = Value("I",1)
#lambda that parameterizes the downloader, allowing it to be passed to the pool.
def downloader(nct):
download_trial_records(nct, dbc, delay_time, reset_time)
#start analyzing them #start analyzing them
with Pool(processes=4) as process_pool: with Pool(processes=12) as process_pool:
process_pool.map(downloader, nctids) process_pool.map(downloader, nctids)

Loading…
Cancel
Save