From 820f082e0daad7f9093eb3800932133b2bc4f01c Mon Sep 17 00:00:00 2001 From: youainti Date: Wed, 1 Jun 2022 14:38:42 -0700 Subject: [PATCH] added documentation comments to classes and functions --- history_downloader/downloader.py | 58 +++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 4 deletions(-) diff --git a/history_downloader/downloader.py b/history_downloader/downloader.py index 3d5168d..0a0c2d3 100644 --- a/history_downloader/downloader.py +++ b/history_downloader/downloader.py @@ -7,12 +7,18 @@ from bs4 import BeautifulSoup from multiprocessing import Pool def get_highest_version_number(response): + """ + Extract the highest version number currently available from the response. + """ #navigate to a specific part of the returned html and extract the highest posted version. soup = BeautifulSoup(response.text, features="lxml") version_value = soup.findChildren("fieldset")[0].table.tbody.findChildren("tr")[-1].td.text return int(version_value) def make_request(nct_id,version1,version2): + """ + Request a page comparing two snapshots. + """ #create url baseurl = "https://clinicaltrials.gov/ct2/history/{}?A={}&B={}&C=Side-by-Side" url = baseurl.format(nct_id,version1,version2) @@ -24,6 +30,9 @@ def make_request(nct_id,version1,version2): return response def upload_response(db_cursor, nct_id, version_a, version_b, response): + """ + Upload a requested page (with versions) to the database. + """ timestamp = datetime.strptime(response.headers['date'], "%a, %d %b %Y %H:%M:%S %Z") @@ -45,7 +54,12 @@ def upload_response(db_cursor, nct_id, version_a, version_b, response): ) def download_and_handle_errors(cursor, nct_id, version_a, version_b): + """ + Request a page, checking for HTTP error codes, and handle the errors as requested. 
+ """ + #request page r = make_request(nct_id, version_a, version_b) + #check for a successful response if r.status_code == 200: upload_response(cursor,nct_id,version_a, version_b, r) else: @@ -59,6 +73,19 @@ def download_and_handle_errors(cursor, nct_id, version_a, version_b): return r def download_trial_records(nct_id, db_connection_specs): + """ + Manage the download of all records associated with a given trial. + It uses a single connection and cursor for downloading the entire trial. + + The benefit of distributing the work at the trial level is that errors related + to a trial can be handled at that level. + + This doesn't reserve a trial for download, but it does release the reservation. + """ + #for testing + print(nct_id) + + # A new connection is created every time the function is called so that this # function can be run using a multiprocessing pool with db_connection_specs.new() as db_conn: @@ -96,6 +123,11 @@ def download_trial_records(nct_id, db_connection_specs): class DBConnectionCreator(): + """ + Creates new database connections based on a specified set of parameters. + This simplifies connection creation by allowing the programmer to pass + around the preconfigured connection creator. 
+ """ def __init__(self,dbname, user, host, port, password): self.dbname = dbname self.user = user @@ -113,12 +145,23 @@ class DBConnectionCreator(): ) def step_generator(max_version): + """ + Used to generate the pairs of versions to request + The specific pattern generated is + + (3,4), (5,6), (7,8),... where the second version of each pair is always less than max_version + """ old=3 for i in range(4,max_version,2): yield (old,i) old = i + 1 def flag_trials_of_interest(db_connection): + """ + Mark the trials of interest as "of interest" + INCOMPLETE + """ + query = """ INSERT INTO http.download_status (nct_id, status) SELECT nct_id, 'Of Interest'::http.history_download_status AS status @@ -135,9 +178,14 @@ def flag_trials_of_interest(db_connection): start_date > '2008-01-01' AND completion_date < '2022-01-01' - ; + ; """ + #TODO: actually send it to the database. + def reserve_trials(db_connection, limit=10): + """ + Reserves a certain number of trials for processing in the DB. + """ query = """ WITH OF_INTEREST AS (SELECT NCT_ID @@ -156,10 +204,12 @@ def reserve_trials(db_connection, limit=10): nctids_list = cursor.fetchall() nctids_list = [ x[0] for x in nctids_list] - return nctids_list if __name__ == "__main__": + """ + Main! + """ #instantiate a database connnection creator dbc = DBConnectionCreator( dbname="aact_db" @@ -168,7 +218,7 @@ if __name__ == "__main__": ,port=5432 ,password="download") - #lambda that parameterizes the downloader + #wrapper function that parameterizes the downloader, allowing it to be passed to the pool. def downloader(nct): download_trial_records(nct, dbc) @@ -179,7 +229,7 @@ if __name__ == "__main__": #get list of nct_ids - nctids = reserve_trials(con, 1) + nctids = reserve_trials(con, 4) print(nctids) #start analyzing them