Added history schema, got extraction_lib to read and write from db

3 years ago · ee3e37e834
parent 4ae3064bf2
commit ee3e37e834
5 changed files with 128 additions and 132 deletions
--- a/AACT_downloader/docker-entrypoint-initdb.d/030_HistoricalSchema.sql
+++ b/AACT_downloader/docker-entrypoint-initdb.d/030_HistoricalSchema.sql
@ -19,8 +19,98 @@ GRANT USAGE ON SCHEMA history TO history_reader;
 GRANT SELECT ON ALL TABLES IN SCHEMA http TO history_reader;


-
 /* History Tables
 Below is where I would construct the parsed trial history tables that I need.
+
+Possible fields
+    nct_id
+    version
+    --Study Status
+    overall_status^
+    primary_completion_date^
+    completion_date^
+    last_update_submitted_date
+    --SponsorCollaborators
+    sponsor (multi?)
+    collaborators (multi?)
+    --Oversight
+    fda_regulated_drug (ignore)
+    fda_regulated_device (ignore)
+    dmc (ignore)
+    --StudyDescription
+    summary    
+    detailed_description
+    --Conditions
+    Conditions
+    Keywords
+    --StudyDesign
+    Study type
+    Primary Purpose
+    Study Phase
+    Interventional Study Model
+    Number of Arms
+    Masking
+    Allocation
+    Enrollment  ^
+    --ArmsAndInterventions
+    Arms (multiple) (Ignore)
+    --ProtocolOutcomeMeasures
+    --Eligibility
+    --ContactsLocation
+    --IPDSharing
+    --References
+    --ParticipantFlow
+    --BaselineCharacteristics
+    --ROutcomeMeasures
+    --AdverseEvents
+    --LimitationsAndCaveats
+    --More Information
+
 */

+CREATE TYPE history.updatable_catetories AS ENUM
+    ('Actual', 'Anticipated', 'Expected');
+
+ALTER TYPE history.updatable_catetories
+    OWNER TO root;
+
+COMMENT ON TYPE history.updatable_catetories
+    IS 'This enum is used to capture the different types of categories that a date or enrollemnt figure may have.';
+
+
+
+CREATE TYPE history.study_statuses AS ENUM
+    ('Available', 'Withdrawn', 'Withheld', 'Temporarily not available', 'Active, not recruiting', 'Recruiting', 'Not yet recruiting', 'Enrolling by invitation', 'Suspended', 'No longer available', 'Approved for marketing', 'Unknown status', 'Completed', 'Terminated');
+
+ALTER TYPE history.study_statuses
+    OWNER TO root;
+
+COMMENT ON TYPE history.study_statuses
+    IS 'This enum is used to record study status. These are pulled from the ClinicalTrials.gov documentation.';
+
+
+-- Table: history.trial_snapshots
+
+DROP TABLE IF EXISTS history.trial_snapshots;
+
+CREATE TABLE IF NOT EXISTS history.trial_snapshots
+(
+    nct_id character varying(15) COLLATE pg_catalog."default" NOT NULL,
+    version integer NOT NULL,
+    primary_completion_date timestamp without time zone,
+    primary_completion_date_category history.updatable_catetories,
+    start_date timestamp without time zone,
+    start_date_category history.updatable_catetories,
+    completion_date timestamp without time zone,
+    completion_date_category history.updatable_catetories,
+    overall_status history.study_statuses,
+    enrollment integer,
+    enrollment_category history.updatable_catetories,
+    sponsor character varying(255) COLLATE pg_catalog."default",
+    responsible_party character varying(255) COLLATE pg_catalog."default",
+    CONSTRAINT trial_snapshots_pkey PRIMARY KEY (nct_id, version)
+);
+
+
+ALTER TABLE IF EXISTS history.trial_snapshots
+    OWNER to root;
--- a/Parser/extraction_lib.py
+++ b/Parser/extraction_lib.py
@ -108,6 +108,8 @@ class VersionData():
                #catch any error, print the applicable information, and raise the error.
                print(self)
                raise err
+            
+            db_connection.commit()
        
 def optional_strip(possible_string):
    if type(possible_string) == str:
@ -126,7 +128,6 @@ def extract_study_statuses(study_status_form, version_a,version_b):
    #iterate through rows, 
    for trow in rows:
        #matching on rowLabels
-        #print(trow.__str__()[:80])
        match tr_to_td(trow):
            case ["Primary Completion:" as row_label, tag]: 
                old,new = split_by_version(tag)
@ -178,13 +179,15 @@ def extract_study_design(study_status_form, version_a,version_b):
        match tr_to_td(trow):
            case ["Enrollment:" as row_label, tag]: 
                old,new = split_by_version(tag)
-                tagdate1 = extract_text_and_tag(old.text)
-                version_a._enrollment = tagdate1.text
-                version_a._enrollment_category = optional_strip(tagdate1.tag)
+
+                #Extract tag and text, add them to preallocated object
+                tagtext1 = extract_text_and_tag(old.text)
+                version_a._enrollment = tagtext1.text
+                version_a._enrollment_category = optional_strip(tagtext1.tag)
                
-                tagdate2 = extract_text_and_tag(new.text)
-                version_b._enrollment = tagdate2.text
-                version_b._enrollment_category = optional_strip(tagdate2.tag)
+                tagtext2 = extract_text_and_tag(new.text)
+                version_b._enrollment = tagtext2.text
+                version_b._enrollment_category = optional_strip(tagtext2.tag)

             
 def extract_sponsor_data(study_status_form, version_a,version_b):
@ -301,6 +304,8 @@ def get_forms(soup,version_a,version_b):
        if not "id" in form.attrs:
            continue

+        #for each type of form (identified by the ID field) 
+        #  extract and add the data to the preallocated objects
        match form.attrs["id"]:
            case "form_StudyStatus":
                extract_study_statuses(form,version_a,version_b)
@ -348,8 +353,12 @@ date_MMMM_DD_YYYY = "%B %d, %Y"

 def get_data_from_versions(nct_id,html, version_a_int, version_b_int):
    soup = BeautifulSoup(html,"lxml")
+
+    #preallocate version data
    version_a = VersionData(nct_id, version_a_int)
    version_b = VersionData(nct_id, version_b_int)
+
+    #extract data from html and put it in the preallocated objects
    get_forms(soup, version_a, version_b)

    return version_a,version_b
@ -357,25 +366,31 @@ def get_data_from_versions(nct_id,html, version_a_int, version_b_int):

 if __name__ == "__main__":

-    with psycopg2.connect(dbname="aact_db", user="root", password="root",host="localhost") as db_connection:
+    with psycopg2.connect(dbname="aact_db", user="root", password="root",host="will-office") as db_connection:
        #pull the requests from the db
        with db_connection.cursor() as curse:
            sql = """
            SELECT nct_id, version_a,version_b, html
            FROM http.responses
+            WHERE response_code = 200
            """
-            responses = curse.execute(sql)
-            for response in responses.fetch_all():
+            curse.execute(sql)
+            for response in curse.fetchall():
                #
                nct_id, version_a, version_b, html = response

+                print(nct_id)
                version1, version2 = get_data_from_versions(nct_id, html, version_a, version_b)
+                print(version1.nct_id)
+                print(version2._enrollment)
+            
+                if version_b == version_a + 1:
+                    version1.load_to_db(db_connection)
+                    version2.load_to_db(db_connection)
+                else:
+                    version2.load_to_db(db_connection)

-            if version_b == version_a + 1:
-                version1.load_to_db(db_connection)
-                version2.load_to_db(db_connection)
-            else:
-                version2.load_to_db(db_connection)
+                exit(1)


 """
@ -402,4 +417,4 @@ TO add a new field to extraction-lib
        - splitting into old and new versions
        - Extracting the data for both old and new
        - add the data to the passed VersionData objects
-"""
+"""
--- a/Parser/prototype_history.sql
+++ b/Parser/prototype_history.sql
@ -1,113 +0,0 @@
-/*
-Create schema history
-
-
-CREATE TABLE history.versions
-    nct_id
-    version
-    --Study Status
-    overall_status^
-    primary_completion_date^
-    completion_date^
-    last_update_submitted_date
-    --SponsorCollaborators
-    sponsor (multi?)
-    collaborators (multi?)
-    --Oversight
-    fda_regulated_drug (ignore)
-    fda_regulated_device (ignore)
-    dmc (ignore)
-    --StuldyDescription
-    summary    
-    detailed_description
-    --Conditions
-    Conditions
-    Keywords
-    --StudyDesign
-    Study type
-    Primary Purpose
-    Study Phase
-    Interventional Study Model
-    Number of Arms
-    Masking
-    Allocation
-    Enrollment  ^
-    --ArmsAndInterventions
-    Arms (multiple) (Ignore)
-    --ProtocolOutcomeMeasures
-    --Eligibility
-    --ContactsLocation
-    --IPDSharing
-    --References
-    --ParticipantFlow
-    --BaselineCharacteristics
-    --ROutcomeMeasures
-    --AdverseEvents
-    --LimitationsAndCaveats
-    --More Information
-
-
-CREATE TABLE history.colaborators
-    nct_id
-    version
-    collaborator_name
-
-CREATE TABLE history.locations
-    nct_id
-    version
-    location name
-    location contact info
-
-CREATE TABLE history.arms
-*/
-
-/*
-Create the history 
-*/
-
-CREATE TYPE history.updatable_catetories AS ENUM
-    ('Actual', 'Anticipated', 'Expected');
-
-ALTER TYPE history.updatable_catetories
-    OWNER TO root;
-
-COMMENT ON TYPE history.updatable_catetories
-    IS 'This enum is used to capture the different types of categories that a date or enrollemnt figure may have.';
-
-
-
-CREATE TYPE history.study_statuses AS ENUM
-    ('Available', 'Withdrawn', 'Withheld', 'Temporarily not available', 'Active, not recruiting', 'Recruiting', 'Not yet recruiting', 'Enrolling by invitation', 'Suspended', 'No longer available', 'Approved for marketing', 'Unknown status', 'Completed', 'Terminated');
-
-ALTER TYPE history.study_statuses
-    OWNER TO root;
-
-
-
-
-- Table: history.trial_snapshots
-
-DROP TABLE IF EXISTS history.trial_snapshots;
-
-CREATE TABLE IF NOT EXISTS history.trial_snapshots
-(
-    nct_id character varying(15) COLLATE pg_catalog."default" NOT NULL,
-    version integer NOT NULL,
-    primary_completion_date timestamp without time zone,
-    primary_completion_date_category history.updatable_catetories,
-    start_date timestamp without time zone,
-    start_date_category history.updatable_catetories,
-    completion_date timestamp without time zone,
-    completion_date_category history.updatable_catetories,
-    overall_status history.study_statuses,
-    enrollment integer,
-    enrollment_category history.updatable_catetories,
-    sponsor character varying(255) COLLATE pg_catalog."default",
-    responsible_party character varying(255) COLLATE pg_catalog."default",
-    CONSTRAINT trial_snapshots_pkey PRIMARY KEY (nct_id, version)
-);
-
-TABLESPACE pg_default;
-
-ALTER TABLE IF EXISTS history.trial_snapshots
-    OWNER to root;
--- a/README.md
+++ b/README.md
@ -4,7 +4,7 @@ This represents my

 ## Prerequisites

-> Python >= 3.8
+> Python >= 3.10 (requires match statement)
 > Docker >= 20.10
 > Curl >= 7
 > Just >= 1.9
--- a/6
+++ b/6
@ -63,7 +63,7 @@ build: check-status setup-containers

 #remove containers and rebuild based on previously downloaded data
 rebuild: clean-docker build 
-    #system will be built from scratch, including downloading data
+    #system will be built from scratch, using previously downloaded data

 #download data and create the containers
 create: check-status download-aact-data build
@ -76,6 +76,10 @@ recreate: clean-docker create
 #Register trials of interest in the database based on ./history_downloader/selected_trials.sql
 select-trials:
    cd history_downloader && python ./select_trials.py
+
 #Download trial histories based on registered trials of interest.
 download-trial-histories:
    cd history_downloader && python ./downloader.py
+
+test-db-connection:
+    cd history_downloader && python db_connection.py