Housekeeping: Renamed folder to distinguish between the AACT downloader and the history downloader. Updated .gitignore to handle Orange Book data

4 years ago · 9623bc0550
parent 84b38166e1
commit 9623bc0550
5 changed files with 214 additions and 96 deletions
--- a/.gitignore
+++ b/.gitignore
@ -183,3 +183,5 @@ Manifest.toml
 *_clinical_trials/
 *_clinical_trials.zip
 NCT*.html
+/Orangebook/EOBZIP_*/
+/Orangebook/Orangebooks/
--- a/downloader/downloader.py
+++ b/downloader/downloader.py
@ -1,96 +0,0 @@
-import requests
-import psycopg2 as psyco
-from datetime import datetime
-from bs4 import BeautifulSoup
-
-
-def get_highest_version_number(response):
-    #navigate to a specific part of the returned html and extract the highest posted version.
-    soup = BeautifulSoup(response.text, features="lxml")
-    version_value = soup.findChildren("fieldset")[0].table.tbody.findChildren("tr")[-1].td.text
-    return int(version_value)
-
-def make_request(nct_id,version1,version2):
-    #create url
-    baseurl = "https://clinicaltrials.gov/ct2/history/{}?A={}&B={}&C=Side-by-Side"
-    url = baseurl.format(nct_id,version1,version2)
-
-    #make request
-    response = requests.get(url)
-
-    #return the response
-    return response
-
-def upload_response(db_conn, nct_id, version_a, version_b, response):
-
-    timestamp = datetime.strptime(r.headers['date'], "%a, %d %b %Y %H:%M:%S %Z")
-
-    #this uploads the response values.
-    db_conn.execute("""
-    INSERT INTO http.responses
-    (nct_id,version_a,version_b,url,response_code,response_date, html) 
-    VALUES  (%s,%s,%s,%s,%s,%s,%s)
-    RETURNING id
-    ;
-    """
-    ,(nct_id
-        ,version_a
-        ,version_b
-        ,response.url
-        ,response.status_code
-        ,datetime.isoformat(timestamp)
-        ,response.text)) #FIX: there is an issue with storing the tags for some reason.
-
-    pk_response = cursor.fetchall()
-
-    return pk_response
-
-
-def download_trial_records(nct_id, db_conn):
-    #download first 2 records
-    r = make_request(nct_id,1,2)
-    upload_response(db_conn, nct_id,1,2,r)
-
-    #extract last version
-    v = get_highest_version_number(r)
-
-    #download the remaining versions
-    if v == 2:
-        return None
-    elif v % 2 == 0:
-        for version_a, version_b in step_generator(v):
-            r = make_request(nct_id, version_a, version_b)
-            upload_response(db_conn,nct_id,version_a, version_b, r)
-
-    elif v % 2 == 1:
-        for version_a, version_b in step_generator(v):
-            r = make_request(nct_id, version_a, version_b)
-            upload_response(db_conn,nct_id,version_a, version_b, r)
-
-        r = make_request(nct_id, 1, v)
-        upload_response(db_conn,nct_id,1, v, r)
-
-def step_generator(max_version):
-    old=3
-    for i in range(4,max_version,2):
-        yield (old,i)
-        old = i + 1
-
-if __name__ == "__main__":
-
-    #make request
-    nct_id = "NCT00658567"
-    r = make_request(nct_id,1,36)
-    print(r.url)
-    print(r.status_code)
-
-    v = get_highest_version_number(r)
-    print("highest version", v )
-
-    #db connection
-    with psyco.connect(dbname="aact_db", user="root", host="localhost", password="root") as con:
-        with con.cursor() as cursor:
-            #insert response
-
-            download_trial_records(nct_id,cursor)
-            print("download complete") 
--- a/history_downloader/db_connection.py
+++ b/history_downloader/db_connection.py
--- a/history_downloader/downloader.py
+++ b/history_downloader/downloader.py
@ -0,0 +1,191 @@
+from tkinter import W
+import requests
+import psycopg2 as psyco
+from datetime import datetime
+from bs4 import BeautifulSoup
+
+from multiprocessing import Pool
+
+def get_highest_version_number(response):
+    """Return the highest posted version number found in a history page.
+
+    The number is read from the first cell of the last row of the table
+    inside the first <fieldset> element of the parsed HTML.
+
+    Args:
+        response: object whose ``text`` attribute holds the page HTML
+            (callers in this file pass the result of ``make_request``).
+
+    Returns:
+        The text of that cell converted with ``int``.
+    """
+    page = BeautifulSoup(response.text, features="lxml")
+    first_fieldset = page.findChildren("fieldset")[0]
+    version_rows = first_fieldset.table.tbody.findChildren("tr")
+    # NOTE(review): presumably the rows are listed oldest-first, so the last
+    # row carries the newest version -- confirm against the live page.
+    newest_version_text = version_rows[-1].td.text
+    return int(newest_version_text)
+
+def make_request(nct_id,version1,version2,timeout=60):
+    """Fetch the side-by-side comparison page for two versions of a trial.
+
+    Args:
+        nct_id: trial identifier inserted into the history URL.
+        version1: version number sent as the ``A`` query parameter.
+        version2: version number sent as the ``B`` query parameter.
+        timeout: seconds handed to ``requests.get``. Pass ``None`` to wait
+            without limit, which is what this function did before the
+            parameter existed.
+
+    Returns:
+        The ``requests`` response object. The HTTP status is not checked here.
+
+    Raises:
+        requests.exceptions.Timeout: if the server does not answer in time.
+    """
+    #create url
+    baseurl = "https://clinicaltrials.gov/ct2/history/{}?A={}&B={}&C=Side-by-Side"
+    url = baseurl.format(nct_id,version1,version2)
+
+    # Without a timeout a stalled server would block the calling worker
+    # process indefinitely.
+    return requests.get(url, timeout=timeout)
+
+def upload_response(db_cursor, nct_id, version_a, version_b, response):
+    """Insert one HTTP response as a row of http.responses.
+
+    Args:
+        db_cursor: database cursor; only its ``execute`` method is used.
+        nct_id: trial identifier stored with the row.
+        version_a: first version number of the comparison.
+        version_b: second version number of the comparison.
+        response: HTTP response providing ``headers['date']``, ``url``,
+            ``status_code`` and ``text``.
+
+    Nothing is committed here; that is left to whoever owns the connection.
+    """
+    # The "date" response header is parsed with a fixed pattern of the form
+    # "Wed, 21 Oct 2015 07:28:00 GMT" and stored in ISO-8601 form.
+    received_at = datetime.strptime(response.headers['date'], "%a, %d %b %Y %H:%M:%S %Z")
+
+    insert_sql = """
+        INSERT INTO http.responses
+        (nct_id,version_a,version_b,url,response_code,response_date, html) 
+        VALUES  (%s,%s,%s,%s,%s,%s,%s)
+        ;
+        """
+    row_values = (
+        nct_id,
+        version_a,
+        version_b,
+        response.url,
+        response.status_code,
+        received_at.isoformat(),
+        response.text,
+    )
+    db_cursor.execute(insert_sql, row_values)
+
+def download_and_handle_errors(cursor, nct_id, version_a, version_b):
+    """Download one version comparison and store it, failing on a non-200 reply.
+
+    Args:
+        cursor: database cursor passed on to ``upload_response``.
+        nct_id: trial identifier.
+        version_a: first version number of the comparison.
+        version_b: second version number of the comparison.
+
+    Returns:
+        The HTTP response, so the caller can inspect the page.
+
+    Raises:
+        Exception: when the HTTP status code is anything other than 200;
+            nothing is written to the database in that case.
+    """
+    reply = make_request(nct_id, version_a, version_b)
+
+    if reply.status_code != 200:
+        #TODO: this should handle errors by
+        #  - [ ] write http code to http.responses
+        #  - [ ] write incomplete to http.download_status
+        #  - [ ] tell all other processes to slow down the request speed
+        #  - [x] raise exception
+        raise Exception("Download of {} (versions {},{}) returned http code != 200".format(nct_id,version_a,version_b))
+
+    upload_response(cursor,nct_id,version_a, version_b, reply)
+    return reply
+
+def download_trial_records(nct_id, db_connection_specs):
+    """Download every version comparison of one trial and record completion.
+
+    Comparisons are fetched in this order: (1, 2); every pair produced by
+    ``step_generator``; and, when the highest version number is odd, a final
+    (1, highest) comparison so the unpaired last version is captured too.
+    Afterwards a 'Downloaded' row is inserted into http.download_status.
+    This now also happens for trials with exactly two versions, which were
+    previously returned early and never marked.
+
+    A new connection is created on every call so that this function can be
+    run from a multiprocessing pool.
+
+    Args:
+        nct_id: trial identifier.
+        db_connection_specs: object whose ``new()`` method returns a new
+            psycopg2 connection.
+
+    Returns:
+        None.
+
+    Raises:
+        Exception: propagated from ``download_and_handle_errors`` when a
+            request does not return HTTP 200. The transaction is rolled back,
+            so no partial history is stored.
+    """
+    db_conn = db_connection_specs.new()
+    try:
+        # "with connection" commits on success and rolls back on error, but it
+        # does NOT close the connection; the finally clause below does that.
+        with db_conn:
+            with db_conn.cursor() as cursor:
+                #upload the first two versions
+                first_reply = download_and_handle_errors(cursor, nct_id, 1, 2)
+                #extract last version
+                highest_version = get_highest_version_number(first_reply)
+
+                #download and upload the remaining version pairs
+                for version_a, version_b in step_generator(highest_version):
+                    download_and_handle_errors(cursor, nct_id, version_a, version_b)
+
+                #an odd number of versions leaves the last one unpaired,
+                #so download the 1 vs (end) comparison as well.
+                if highest_version % 2 == 1:
+                    download_and_handle_errors(cursor, nct_id, 1, highest_version)
+
+                #now mark the trial as having been downloaded
+                cursor.execute(
+                    """
+                    INSERT INTO http.download_status (nct_id,status)
+                    VALUES (%s, 'Downloaded'::http.history_download_status)
+                    """
+                    , [nct_id]
+                )
+    finally:
+        db_conn.close()
+
+
+
+class DBConnectionCreator:
+    """Holds connection settings and opens a new psycopg2 connection on demand."""
+
+    def __init__(self,dbname, user, host, port, password):
+        # The settings are stored unchanged and handed to psycopg2 by new().
+        self.dbname, self.user = dbname, user
+        self.host, self.port = host, port
+        self.password = password
+
+    def new(self):
+        """Open and return a new connection built from the stored settings."""
+        connect_kwargs = {
+            "dbname": self.dbname,
+            "user": self.user,
+            "host": self.host,
+            "port": self.port,
+            "password": self.password,
+        }
+        return psyco.connect(**connect_kwargs)
+
+def step_generator(max_version):
+    """Yield consecutive version pairs (3, 4), (5, 6), ... up to max_version.
+
+    Pairs start at (3, 4); the caller in this file downloads (1, 2) itself.
+    For an odd ``max_version`` the final version has no partner and is not
+    yielded.
+
+    Args:
+        max_version: highest version number to cover.
+
+    Yields:
+        Tuples ``(odd_version, even_version)`` in increasing order.
+    """
+    # range() excludes its stop value, hence the "+ 1". Without it an even
+    # max_version lost its last pair (max_version=4 yielded nothing at all).
+    for upper in range(4, max_version + 1, 2):
+        yield (upper - 1, upper)
+
+def flag_trials_of_interest(db_connection):
+    """Insert an 'Of Interest' status row for every study matching the filter.
+
+    The filter selects rows of ctgov.studies that are FDA-regulated drug
+    studies, interventional, Phase 3, terminated or completed, started after
+    2008-01-01 and completed before 2022-01-01.
+
+    Previously the statement was only built and never sent to the database;
+    it is now executed. Committing is left to the caller.
+
+    Args:
+        db_connection: open database connection providing ``cursor()``.
+    """
+    query = """
+    INSERT INTO http.download_status (nct_id, status)
+        SELECT nct_id, 'Of Interest'::http.history_download_status AS status
+        FROM ctgov.studies
+        WHERE 
+            is_fda_regulated_drug=TRUE
+            AND
+            study_type = 'Interventional'
+            AND
+            phase='Phase 3'
+            AND 
+            overall_status in ('Terminated', 'Completed')
+            AND 
+            start_date > '2008-01-01'
+            AND
+            completion_date < '2022-01-01'
+        ;
+    """
+
+    with db_connection.cursor() as cursor:
+        cursor.execute(query)
+
+def reserve_trials(db_connection, limit=10):
+    """Mark up to ``limit`` 'Of Interest' trials as 'Reserved' and return their ids.
+
+    A single statement picks at most ``limit`` nct_ids whose status in
+    http.most_recent_download_status is 'Of Interest', inserts a 'Reserved'
+    row for each into http.download_status, and returns the reserved ids.
+    Nothing is committed here.
+
+    Args:
+        db_connection: open database connection providing ``cursor()``.
+        limit: maximum number of trials to reserve.
+
+    Returns:
+        List of the reserved nct_id values.
+    """
+    reserve_sql = """
+    WITH OF_INTEREST AS
+        (SELECT NCT_ID
+            FROM HTTP.MOST_RECENT_DOWNLOAD_STATUS
+            WHERE HTTP.MOST_RECENT_DOWNLOAD_STATUS.STATUS = 'Of Interest'::HTTP.HISTORY_DOWNLOAD_STATUS
+            LIMIT %s
+        )
+    INSERT INTO HTTP.DOWNLOAD_STATUS (NCT_ID,STATUS)
+        SELECT OF_INTEREST.NCT_ID, 'Reserved'::HTTP.HISTORY_DOWNLOAD_STATUS AS STATUS
+        FROM OF_INTEREST 
+    RETURNING NCT_ID;
+    """
+
+    with db_connection.cursor() as cursor:
+        cursor.execute(reserve_sql, [limit] )
+        reserved_rows = cursor.fetchall()
+
+    # each returned row is a one-column tuple; keep only the id
+    return [row[0] for row in reserved_rows]
+
+if __name__ == "__main__":
+    # Script entry point: reserve a batch of trials, then download their
+    # histories in parallel worker processes.
+
+    # Imported here because only the script entry point needs it.
+    from functools import partial
+
+    #instantiate a database connection creator
+    dbc = DBConnectionCreator(
+            dbname="aact_db"
+            ,user="python_downloader"
+            ,host="localhost"
+            ,port=5432
+            ,password="download")
+
+    # Bind the connection settings to the module-level download function.
+    # A partial over a top-level function can be pickled and resolved inside
+    # pool workers under every start method; a function defined inside this
+    # guard does not exist in workers started with "spawn".
+    downloader = partial(download_trial_records, db_connection_specs=dbc)
+
+    #db connection
+    with dbc.new() as con:
+        #select lists to download
+        #flag_trials_of_interest(con) #Should only be run once
+
+
+        #get list of nct_ids
+        nctids = reserve_trials(con, 1)
+        print(nctids)
+
+        #start downloading them
+        with Pool(processes=3) as process_pool:
+            process_pool.map(downloader, nctids)
+
+
+
+
--- a/history_downloader/downloader_prep.sql
+++ b/history_downloader/downloader_prep.sql
@ -0,0 +1,21 @@
+-- Reset the download-status table, then flag every study that matches the
+-- selection criteria as 'Of Interest'.
+DELETE FROM http.download_status;
+
+INSERT INTO http.download_status (nct_id, status)
+SELECT
+    nct_id,
+    'Of Interest'::http.history_download_status AS status
+FROM ctgov.studies
+WHERE is_fda_regulated_drug = TRUE
+  AND study_type = 'Interventional'
+  AND phase = 'Phase 3'
+  AND overall_status IN ('Terminated', 'Completed')
+  AND start_date > '2008-01-01'
+  AND completion_date < '2022-01-01'
+;
+
+-- Report how many rows the table now holds.
+SELECT count(*) FROM http.download_status;