Added history schema, got extraction_lib to read and write from db

3 years ago · ee3e37e834
parent 4ae3064bf2
commit ee3e37e834
5 changed files with 128 additions and 132 deletions
--- a/AACT_downloader/docker-entrypoint-initdb.d/030_HistoricalSchema.sql
+++ b/AACT_downloader/docker-entrypoint-initdb.d/030_HistoricalSchema.sql
@ -19,8 +19,98 @@ GRANT USAGE ON SCHEMA history TO history_reader;
 GRANT SELECT ON ALL TABLES IN SCHEMA http TO history_reader;
 /* History Tables
 Below is where I would construct the parsed trial history tables that I need.
 Possible fields
    nct_id
    version
    --Study Status
    overall_status^
    primary_completion_date^
    completion_date^
    last_update_submitted_date
    --SponsorCollaborators
    sponsor (multi?)
    collaborators (multi?)
    --Oversight
    fda_regulated_drug (ignore)
    fda_regulated_device (ignore)
    dmc (ignore)
    --StudyDescription
    summary    
    detailed_description
    --Conditions
    Conditions
    Keywords
    --StudyDesign
    Study type
    Primary Purpose
    Study Phase
    Interventional Study Model
    Number of Arms
    Masking
    Allocation
    Enrollment  ^
    --ArmsAndInterventions
    Arms (multiple) (Ignore)
    --ProtocolOutcomeMeasures
    --Eligibility
    --ContactsLocation
    --IPDSharing
    --References
    --ParticipantFlow
    --BaselineCharacteristics
    --ROutcomeMeasures
    --AdverseEvents
    --LimitationsAndCaveats
    --More Information
 */
 -- Enum: history.updatable_catetories
 -- Qualifier stored next to a date or enrollment value.
 -- NOTE: the identifier spelling ("catetories") is kept as-is because the
 -- *_category columns of history.trial_snapshots reference this type by name.
 CREATE TYPE history.updatable_catetories AS ENUM (
     'Actual',
     'Anticipated',
     'Expected'
 );
 ALTER TYPE history.updatable_catetories OWNER TO root;
 COMMENT ON TYPE history.updatable_catetories IS
     'This enum is used to capture the different types of categories that a date or enrollemnt figure may have.';
 -- Enum: history.study_statuses
 -- Allowed values for the overall status of a study. The declaration order
 -- below defines the enum sort order, so it must not be rearranged.
 CREATE TYPE history.study_statuses AS ENUM (
     'Available',
     'Withdrawn',
     'Withheld',
     'Temporarily not available',
     'Active, not recruiting',
     'Recruiting',
     'Not yet recruiting',
     'Enrolling by invitation',
     'Suspended',
     'No longer available',
     'Approved for marketing',
     'Unknown status',
     'Completed',
     'Terminated'
 );
 ALTER TYPE history.study_statuses OWNER TO root;
 COMMENT ON TYPE history.study_statuses IS
     'This enum is used to record study status. These are pulled from the ClinicalTrials.gov documentation.';
 -- Table: history.trial_snapshots
 -- One row per (nct_id, version) pair, enforced by the composite primary key.
 -- Every date / enrollment value is paired with a *_category column typed as
 -- history.updatable_catetories.
 -- The table is dropped first, so re-running this script discards existing rows.
 DROP TABLE IF EXISTS history.trial_snapshots;
 CREATE TABLE IF NOT EXISTS history.trial_snapshots (
     -- identity of the snapshot
     nct_id                           varchar(15) COLLATE pg_catalog."default" NOT NULL,
     version                          int NOT NULL,
     -- dates and their categories
     primary_completion_date          timestamp,
     primary_completion_date_category history.updatable_catetories,
     start_date                       timestamp,
     start_date_category              history.updatable_catetories,
     completion_date                  timestamp,
     completion_date_category         history.updatable_catetories,
     -- status and enrollment
     overall_status                   history.study_statuses,
     enrollment                       int,
     enrollment_category              history.updatable_catetories,
     -- sponsor information
     sponsor                          varchar(255) COLLATE pg_catalog."default",
     responsible_party                varchar(255) COLLATE pg_catalog."default",
     CONSTRAINT trial_snapshots_pkey PRIMARY KEY (nct_id, version)
 );
 ALTER TABLE IF EXISTS history.trial_snapshots OWNER TO root;
--- a/Parser/extraction_lib.py
+++ b/Parser/extraction_lib.py
@ -109,6 +109,8 @@ class VersionData():
                print(self)
                raise err
            db_connection.commit()
 def optional_strip(possible_string):
    if type(possible_string) == str:
        return possible_string.strip()
@ -126,7 +128,6 @@ def extract_study_statuses(study_status_form, version_a,version_b):
    #iterate through rows, 
    for trow in rows:
        #matching on rowLabels
        #print(trow.__str__()[:80])
        match tr_to_td(trow):
            case ["Primary Completion:" as row_label, tag]: 
                old,new = split_by_version(tag)
@ -178,13 +179,15 @@ def extract_study_design(study_status_form, version_a,version_b):
        match tr_to_td(trow):
            case ["Enrollment:" as row_label, tag]: 
                old,new = split_by_version(tag)
                tagdate1 = extract_text_and_tag(old.text)
                version_a._enrollment = tagdate1.text
                version_a._enrollment_category = optional_strip(tagdate1.tag)
-                tagdate2 = extract_text_and_tag(new.text)
+                #Extract tag and text, add them to preallocated object
-                version_b._enrollment = tagdate2.text
+                tagtext1 = extract_text_and_tag(old.text)
-                version_b._enrollment_category = optional_strip(tagdate2.tag)
+                version_a._enrollment = tagtext1.text
                version_a._enrollment_category = optional_strip(tagtext1.tag)
                tagtext2 = extract_text_and_tag(new.text)
                version_b._enrollment = tagtext2.text
                version_b._enrollment_category = optional_strip(tagtext2.tag)
 def extract_sponsor_data(study_status_form, version_a,version_b):
@ -301,6 +304,8 @@ def get_forms(soup,version_a,version_b):
        if not "id" in form.attrs:
            continue
        #for each type of form (identified by the ID field) 
        #  extract and add the data to the preallocated objects
        match form.attrs["id"]:
            case "form_StudyStatus":
                extract_study_statuses(form,version_a,version_b)
@ -348,8 +353,12 @@ date_MMMM_DD_YYYY = "%B %d, %Y"
 def get_data_from_versions(nct_id,html, version_a_int, version_b_int):
    soup = BeautifulSoup(html,"lxml")
    #preallocate version data
    version_a = VersionData(nct_id, version_a_int)
    version_b = VersionData(nct_id, version_b_int)
    #extract data from html and put it in the preallocated objects
    get_forms(soup, version_a, version_b)
    return version_a,version_b
@ -357,25 +366,31 @@ def get_data_from_versions(nct_id,html, version_a_int, version_b_int):
 if __name__ == "__main__":
-    with psycopg2.connect(dbname="aact_db", user="root", password="root",host="localhost") as db_connection:
+    with psycopg2.connect(dbname="aact_db", user="root", password="root",host="will-office") as db_connection:
        #pull the requests from the db
        with db_connection.cursor() as curse:
            sql = """
            SELECT nct_id, version_a,version_b, html
            FROM http.responses
            WHERE response_code = 200
            """
-            responses = curse.execute(sql)
+            curse.execute(sql)
-            for response in responses.fetch_all():
+            for response in curse.fetchall():
                #
                nct_id, version_a, version_b, html = response
                print(nct_id)
                version1, version2 = get_data_from_versions(nct_id, html, version_a, version_b)
                print(version1.nct_id)
                print(version2._enrollment)
                if version_b == version_a + 1:
                    version1.load_to_db(db_connection)
                    version2.load_to_db(db_connection)
                else:
                    version2.load_to_db(db_connection)
-            if version_b == version_a + 1:
+                exit(1)
                version1.load_to_db(db_connection)
                version2.load_to_db(db_connection)
            else:
                version2.load_to_db(db_connection)
 """
--- a/Parser/prototype_history.sql
+++ b/Parser/prototype_history.sql
@ -1,113 +0,0 @@
 /*
 Create schema history
 CREATE TABLE history.versions
    nct_id
    version
    --Study Status
    overall_status^
    primary_completion_date^
    completion_date^
    last_update_submitted_date
    --SponsorCollaborators
    sponsor (multi?)
    collaborators (multi?)
    --Oversight
    fda_regulated_drug (ignore)
    fda_regulated_device (ignore)
    dmc (ignore)
    --StudyDescription
    summary    
    detailed_description
    --Conditions
    Conditions
    Keywords
    --StudyDesign
    Study type
    Primary Purpose
    Study Phase
    Interventional Study Model
    Number of Arms
    Masking
    Allocation
    Enrollment  ^
    --ArmsAndInterventions
    Arms (multiple) (Ignore)
    --ProtocolOutcomeMeasures
    --Eligibility
    --ContactsLocation
    --IPDSharing
    --References
    --ParticipantFlow
    --BaselineCharacteristics
    --ROutcomeMeasures
    --AdverseEvents
    --LimitationsAndCaveats
    --More Information
 CREATE TABLE history.collaborators
    nct_id
    version
    collaborator_name
 CREATE TABLE history.locations
    nct_id
    version
    location name
    location contact info
 CREATE TABLE history.arms
 */
 /*
 Create the history enum types and the trial_snapshots table
 */
 -- Enum: history.updatable_catetories
 -- Qualifier stored next to a date or enrollment value.
 -- NOTE: the identifier spelling ("catetories") is kept as-is because the
 -- *_category columns of history.trial_snapshots reference this type by name.
 CREATE TYPE history.updatable_catetories AS ENUM (
     'Actual',
     'Anticipated',
     'Expected'
 );
 ALTER TYPE history.updatable_catetories OWNER TO root;
 COMMENT ON TYPE history.updatable_catetories IS
     'This enum is used to capture the different types of categories that a date or enrollemnt figure may have.';
 -- Enum: history.study_statuses
 -- Allowed values for the overall status of a study. The declaration order
 -- below defines the enum sort order, so it must not be rearranged.
 CREATE TYPE history.study_statuses AS ENUM (
     'Available',
     'Withdrawn',
     'Withheld',
     'Temporarily not available',
     'Active, not recruiting',
     'Recruiting',
     'Not yet recruiting',
     'Enrolling by invitation',
     'Suspended',
     'No longer available',
     'Approved for marketing',
     'Unknown status',
     'Completed',
     'Terminated'
 );
 ALTER TYPE history.study_statuses OWNER TO root;
 -- Table: history.trial_snapshots
 -- One row per (nct_id, version) pair, enforced by the composite primary key.
 -- Every date / enrollment value is paired with a *_category column typed as
 -- history.updatable_catetories.
 -- The table is dropped first, so re-running this script discards existing rows.
 DROP TABLE IF EXISTS history.trial_snapshots;
 CREATE TABLE IF NOT EXISTS history.trial_snapshots
 (
    nct_id character varying(15) COLLATE pg_catalog."default" NOT NULL,
    version integer NOT NULL,
    primary_completion_date timestamp without time zone,
    primary_completion_date_category history.updatable_catetories,
    start_date timestamp without time zone,
    start_date_category history.updatable_catetories,
    completion_date timestamp without time zone,
    completion_date_category history.updatable_catetories,
    overall_status history.study_statuses,
    enrollment integer,
    enrollment_category history.updatable_catetories,
    sponsor character varying(255) COLLATE pg_catalog."default",
    responsible_party character varying(255) COLLATE pg_catalog."default",
    CONSTRAINT trial_snapshots_pkey PRIMARY KEY (nct_id, version)
 )
 -- The TABLESPACE clause belongs to CREATE TABLE; it must come before the
 -- terminating semicolon, otherwise it is parsed as a separate, invalid statement.
 TABLESPACE pg_default;
 ALTER TABLE IF EXISTS history.trial_snapshots
    OWNER to root;
--- a/README.md
+++ b/README.md
@ -4,7 +4,7 @@ This represents my
 ## Prerequisites
-> Python >= 3.8
+> Python >= 3.10 (requires match statement)
 > Docker >= 20.10
 > Curl >= 7
 > Just >= 1.9
--- a/6
+++ b/6
@ -63,7 +63,7 @@ build: check-status setup-containers
 #remove containers and rebuild based on previously downloaded data
 rebuild: clean-docker build 
-    #system will be built from scratch, including downloading data
+    #system will be built from scratch, using previously downloaded data
 #download data and create the containers
 create: check-status download-aact-data build
@ -76,6 +76,10 @@ recreate: clean-docker create
 #Register trials of interest in the database based on ./history_downloader/selected_trials.sql
 select-trials:
    cd history_downloader && python ./select_trials.py
 #Download trial histories based on registered trials of interest.
 download-trial-histories:
    cd history_downloader && python ./downloader.py
 test-db-connection:
    cd history_downloader && python db_connection.py