From ee3e37e8346c8e1204a005adfb8c01f50b106fd0 Mon Sep 17 00:00:00 2001 From: youainti Date: Thu, 29 Dec 2022 17:44:24 -0800 Subject: [PATCH] Added history schema, got extraction_lib to read and write from db --- .../030_HistoricalSchema.sql | 92 +++++++++++++- .../{extraction-lib.py => extraction_lib.py} | 47 +++++--- Parser/prototype_history.sql | 113 ------------------ README.md | 2 +- justfile | 6 +- 5 files changed, 128 insertions(+), 132 deletions(-) rename Parser/{extraction-lib.py => extraction_lib.py} (92%) delete mode 100644 Parser/prototype_history.sql diff --git a/AACT_downloader/docker-entrypoint-initdb.d/030_HistoricalSchema.sql b/AACT_downloader/docker-entrypoint-initdb.d/030_HistoricalSchema.sql index 2f605f8..6d764de 100644 --- a/AACT_downloader/docker-entrypoint-initdb.d/030_HistoricalSchema.sql +++ b/AACT_downloader/docker-entrypoint-initdb.d/030_HistoricalSchema.sql @@ -19,8 +19,98 @@ GRANT USAGE ON SCHEMA history TO history_reader; GRANT SELECT ON ALL TABLES IN SCHEMA http TO history_reader; - /* History Tables Below is where I would construct the parsed trial history tables that I need. + +Possible fields + nct_id + version + --Study Status + overall_status^ + primary_completion_date^ + completion_date^ + last_update_submitted_date + --SponsorCollaborators + sponsor (multi?) + collaborators (multi?) 
+ --Oversight + fda_regulated_drug (ignore) + fda_regulated_device (ignore) + dmc (ignore) + --StudyDescription + summary + detailed_description + --Conditions + Conditions + Keywords + --StudyDesign + Study type + Primary Purpose + Study Phase + Interventional Study Model + Number of Arms + Masking + Allocation + Enrollment ^ + --ArmsAndInterventions + Arms (multiple) (Ignore) + --ProtocolOutcomeMeasures + --Eligibility + --ContactsLocation + --IPDSharing + --References + --ParticipantFlow + --BaselineCharacteristics + --ROutcomeMeasures + --AdverseEvents + --LimitationsAndCaveats + --More Information + */ +CREATE TYPE history.updatable_catetories AS ENUM + ('Actual', 'Anticipated', 'Expected'); + +ALTER TYPE history.updatable_catetories + OWNER TO root; + +COMMENT ON TYPE history.updatable_catetories + IS 'This enum is used to capture the different types of categories that a date or enrollemnt figure may have.'; + + + +CREATE TYPE history.study_statuses AS ENUM + ('Available', 'Withdrawn', 'Withheld', 'Temporarily not available', 'Active, not recruiting', 'Recruiting', 'Not yet recruiting', 'Enrolling by invitation', 'Suspended', 'No longer available', 'Approved for marketing', 'Unknown status', 'Completed', 'Terminated'); + +ALTER TYPE history.study_statuses + OWNER TO root; + +COMMENT ON TYPE history.study_statuses + IS 'This enum is used to record study status. 
These are pulled from the ClinicalTrials.gov documentation.'; + + +-- Table: history.trial_snapshots + +DROP TABLE IF EXISTS history.trial_snapshots; + +CREATE TABLE IF NOT EXISTS history.trial_snapshots +( + nct_id character varying(15) COLLATE pg_catalog."default" NOT NULL, + version integer NOT NULL, + primary_completion_date timestamp without time zone, + primary_completion_date_category history.updatable_catetories, + start_date timestamp without time zone, + start_date_category history.updatable_catetories, + completion_date timestamp without time zone, + completion_date_category history.updatable_catetories, + overall_status history.study_statuses, + enrollment integer, + enrollment_category history.updatable_catetories, + sponsor character varying(255) COLLATE pg_catalog."default", + responsible_party character varying(255) COLLATE pg_catalog."default", + CONSTRAINT trial_snapshots_pkey PRIMARY KEY (nct_id, version) +); + + +ALTER TABLE IF EXISTS history.trial_snapshots + OWNER to root; diff --git a/Parser/extraction-lib.py b/Parser/extraction_lib.py similarity index 92% rename from Parser/extraction-lib.py rename to Parser/extraction_lib.py index 8fa37e6..1d3b26b 100644 --- a/Parser/extraction-lib.py +++ b/Parser/extraction_lib.py @@ -108,6 +108,8 @@ class VersionData(): #catch any error, print the applicable information, and raise the error. 
print(self) raise err + + db_connection.commit() def optional_strip(possible_string): if type(possible_string) == str: @@ -126,7 +128,6 @@ def extract_study_statuses(study_status_form, version_a,version_b): #iterate through rows, for trow in rows: #matching on rowLabels - #print(trow.__str__()[:80]) match tr_to_td(trow): case ["Primary Completion:" as row_label, tag]: old,new = split_by_version(tag) @@ -178,13 +179,15 @@ def extract_study_design(study_status_form, version_a,version_b): match tr_to_td(trow): case ["Enrollment:" as row_label, tag]: old,new = split_by_version(tag) - tagdate1 = extract_text_and_tag(old.text) - version_a._enrollment = tagdate1.text - version_a._enrollment_category = optional_strip(tagdate1.tag) + + #Extract tag and text, add them to preallocated object + tagtext1 = extract_text_and_tag(old.text) + version_a._enrollment = tagtext1.text + version_a._enrollment_category = optional_strip(tagtext1.tag) - tagdate2 = extract_text_and_tag(new.text) - version_b._enrollment = tagdate2.text - version_b._enrollment_category = optional_strip(tagdate2.tag) + tagtext2 = extract_text_and_tag(new.text) + version_b._enrollment = tagtext2.text + version_b._enrollment_category = optional_strip(tagtext2.tag) def extract_sponsor_data(study_status_form, version_a,version_b): @@ -301,6 +304,8 @@ def get_forms(soup,version_a,version_b): if not "id" in form.attrs: continue + #for each type of form (identified by the ID field) + # extract and add the data to the preallocated objects match form.attrs["id"]: case "form_StudyStatus": extract_study_statuses(form,version_a,version_b) @@ -348,8 +353,12 @@ date_MMMM_DD_YYYY = "%B %d, %Y" def get_data_from_versions(nct_id,html, version_a_int, version_b_int): soup = BeautifulSoup(html,"lxml") + + #preallocate version data version_a = VersionData(nct_id, version_a_int) version_b = VersionData(nct_id, version_b_int) + + #extract data from html and put it in the preallocated objects get_forms(soup, version_a, version_b) 
return version_a,version_b @@ -357,25 +366,31 @@ def get_data_from_versions(nct_id,html, version_a_int, version_b_int): if __name__ == "__main__": - with psycopg2.connect(dbname="aact_db", user="root", password="root",host="localhost") as db_connection: + with psycopg2.connect(dbname="aact_db", user="root", password="root",host="will-office") as db_connection: #pull the requests from the db with db_connection.cursor() as curse: sql = """ SELECT nct_id, version_a,version_b, html FROM http.responses + WHERE response_code = 200 """ - responses = curse.execute(sql) - for response in responses.fetch_all(): + curse.execute(sql) + for response in curse.fetchall(): # nct_id, version_a, version_b, html = response + print(nct_id) version1, version2 = get_data_from_versions(nct_id, html, version_a, version_b) + print(version1.nct_id) + print(version2._enrollment) + + if version_b == version_a + 1: + version1.load_to_db(db_connection) + version2.load_to_db(db_connection) + else: + version2.load_to_db(db_connection) - if version_b == version_a + 1: - version1.load_to_db(db_connection) - version2.load_to_db(db_connection) - else: - version2.load_to_db(db_connection) + exit(1) """ @@ -402,4 +417,4 @@ TO add a new field to extraction-lib - splitting into old and new versions - Extracting the data for both old and new - add the data to the passed VersionData objects -""" \ No newline at end of file +""" diff --git a/Parser/prototype_history.sql b/Parser/prototype_history.sql deleted file mode 100644 index 334ec70..0000000 --- a/Parser/prototype_history.sql +++ /dev/null @@ -1,113 +0,0 @@ -/* -Create schema history - - -CREATE TABLE history.versions - nct_id - version - --Study Status - overall_status^ - primary_completion_date^ - completion_date^ - last_update_submitted_date - --SponsorCollaborators - sponsor (multi?) - collaborators (multi?) 
- --Oversight - fda_regulated_drug (ignore) - fda_regulated_device (ignore) - dmc (ignore) - --StuldyDescription - summary - detailed_description - --Conditions - Conditions - Keywords - --StudyDesign - Study type - Primary Purpose - Study Phase - Interventional Study Model - Number of Arms - Masking - Allocation - Enrollment ^ - --ArmsAndInterventions - Arms (multiple) (Ignore) - --ProtocolOutcomeMeasures - --Eligibility - --ContactsLocation - --IPDSharing - --References - --ParticipantFlow - --BaselineCharacteristics - --ROutcomeMeasures - --AdverseEvents - --LimitationsAndCaveats - --More Information - - -CREATE TABLE history.colaborators - nct_id - version - collaborator_name - -CREATE TABLE history.locations - nct_id - version - location name - location contact info - -CREATE TABLE history.arms -*/ - -/* -Create the history -*/ - -CREATE TYPE history.updatable_catetories AS ENUM - ('Actual', 'Anticipated', 'Expected'); - -ALTER TYPE history.updatable_catetories - OWNER TO root; - -COMMENT ON TYPE history.updatable_catetories - IS 'This enum is used to capture the different types of categories that a date or enrollemnt figure may have.'; - - - -CREATE TYPE history.study_statuses AS ENUM - ('Available', 'Withdrawn', 'Withheld', 'Temporarily not available', 'Active, not recruiting', 'Recruiting', 'Not yet recruiting', 'Enrolling by invitation', 'Suspended', 'No longer available', 'Approved for marketing', 'Unknown status', 'Completed', 'Terminated'); - -ALTER TYPE history.study_statuses - OWNER TO root; - - - - --- Table: history.trial_snapshots - -DROP TABLE IF EXISTS history.trial_snapshots; - -CREATE TABLE IF NOT EXISTS history.trial_snapshots -( - nct_id character varying(15) COLLATE pg_catalog."default" NOT NULL, - version integer NOT NULL, - primary_completion_date timestamp without time zone, - primary_completion_date_category history.updatable_catetories, - start_date timestamp without time zone, - start_date_category history.updatable_catetories, - 
completion_date timestamp without time zone, - completion_date_category history.updatable_catetories, - overall_status history.study_statuses, - enrollment integer, - enrollment_category history.updatable_catetories, - sponsor character varying(255) COLLATE pg_catalog."default", - responsible_party character varying(255) COLLATE pg_catalog."default", - CONSTRAINT trial_snapshots_pkey PRIMARY KEY (nct_id, version) -); - -TABLESPACE pg_default; - -ALTER TABLE IF EXISTS history.trial_snapshots - OWNER to root; \ No newline at end of file diff --git a/README.md b/README.md index e4842fb..ad7f0e7 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ This represents my ## Prerequisites -> Python >= 3.8 +> Python >= 3.10 (requires match statement) > Docker >= 20.10 > Curl >= 7 > Just >= 1.9 diff --git a/justfile b/justfile index 695092c..57d54e4 100644 --- a/justfile +++ b/justfile @@ -63,7 +63,7 @@ build: check-status setup-containers #remove containers and rebuild based on previously downloaded data rebuild: clean-docker build - #system will be built from scratch, including downloading data + #system will be built from scratch, using previously downloaded data #download data and create the containers create: check-status download-aact-data build @@ -76,6 +76,10 @@ recreate: clean-docker create #Register trials of interest in the database based on ./history_downloader/selected_trials.sql select-trials: cd history_downloader && python ./select_trials.py + #Download trial histories based on registered trials of interest. download-trial-histories: cd history_downloader && python ./downloader.py + +test-db-connection: + cd history_downloader && python db_connection.py