diff --git a/Parser/extraction-lib.py b/Parser/extraction-lib.py index 0e07ec6..8202d15 100644 --- a/Parser/extraction-lib.py +++ b/Parser/extraction-lib.py @@ -1,9 +1,8 @@ from collections import namedtuple from copy import copy from datetime import datetime -from ensurepip import version + from bs4 import BeautifulSoup -import abc import textprocessing as tp #cuz tp is important #requires Python 3.10 @@ -50,6 +49,9 @@ class VersionData(): #self._responsible_party_category = None #I don't believe this is included in the raw data #self._collaborators = None #currently going to ignore as I've not fount it in AACT + def load_to_db(db_cursor): + #load to initial table, then load any extra details into other tables + pass def extract_study_statuses(study_status_form, version_a,version_b): """ @@ -66,21 +68,21 @@ def extract_study_statuses(study_status_form, version_a,version_b): match tr_to_td(trow): case ["Primary Completion:" as row_label, tag]: old,new = split_by_version(tag) - tagdate1 = extract_date_and_tag(old.text,date_MMMM_YYYY) + tagdate1 = extract_date_and_tag(old.text) version_a._primary_completion_date = tagdate1.date version_a._primary_completion_date_category = tagdate1.tag - tagdate2 = extract_date_and_tag(new.text,date_MMMM_YYYY) + tagdate2 = extract_date_and_tag(new.text) version_b._primary_completion_date = tagdate2.date version_b._primary_completion_date_category = tagdate2.tag case ["Study Completion:" as row_label, tag]: old,new = split_by_version(tag) - tagdate1 = extract_date_and_tag(old.text,date_MMMM_YYYY) + tagdate1 = extract_date_and_tag(old.text) version_a._completion_date = tagdate1.date version_a._completion_date_category = tagdate1.tag - tagdate2 = extract_date_and_tag(new.text,date_MMMM_YYYY) + tagdate2 = extract_date_and_tag(new.text) version_b._completion_date = tagdate2.date version_b._completion_date_category = tagdate2.tag @@ -101,7 +103,6 @@ def extract_study_design(study_status_form, version_a,version_b): #iterate through rows, for trow in rows: #matching on rowLabels - #print(trow.__str__()[:80]) match tr_to_td(trow): case ["Enrollment:" as row_label, tag]: old,new = split_by_version(tag) @@ -125,7 +126,6 @@ def extract_sponsor_data(study_status_form, version_a,version_b): #iterate through rows, for trow in rows: #matching on rowLabels - #print(trow.__str__()[:80]) match tr_to_td(trow): case ["Sponsor:" as row_label, tag]: old, new = split_by_version(tag) @@ -157,16 +157,12 @@ def split_by_version(tag): return old,new -def extract_date_and_tag(text, date_format): +def extract_date_and_tag(text): """ Extracts a datetype according to the date format and the estimate tag based on """ - #FIX: Currently, there are multiple (mixed) data formats in use - #Theses can exist in the same data field, in two different versions - #so instead of using a single (passed) data format, I need to - #select between various data formats. text = text.strip() @@ -179,8 +175,12 @@ def extract_date_and_tag(text, date_format): estimate_tag = date_split[1].split("]")[0].strip() else: estimate_tag = None - date_object = datetime.strptime(date_split[0].strip(), date_format) + try: + date_object = datetime.strptime(date_split[0].strip(), date_MMMM_YYYY) + except ValueError as ve: + date_object = datetime.strptime(date_split[0].strip(), date_MMMM_DD_YYYY) + return TagDatePair(estimate_tag, date_object) @@ -274,14 +274,19 @@ def get_forms(soup,version_a,version_b): date_MMMM_YYYY = "%B %Y" date_MMMM_DD_YYYY = "%B %d, %Y" +def get_data_from_versions(nct_id,html, version_a_int, version_b_int): + soup = BeautifulSoup(html,"lxml") + version_a = VersionData(nct_id, version_a_int) + version_b = VersionData(nct_id, version_b_int) + get_forms(soup, version_a, version_b) + + return version_a,version_b + + if __name__ == "__main__": for file in ["./NCT00658567.html", "./NCT01303796.html"]: with open(file) as fh: - soup = BeautifulSoup(fh, "lxml") - - version1 = VersionData("NCT00658567",1) - version2 = VersionData("NCT00658567",2) - get_forms(soup, version1, version2) + version1, version2 = get_data_from_versions(file, fh.read(), 1,2) print(version1.__dict__) #order messed up somewhere:w print(version2.__dict__) #order messed up somewhere:w diff --git a/Parser/prototype_history.sql b/Parser/prototype_history.sql index b66142b..49eef82 100644 --- a/Parser/prototype_history.sql +++ b/Parser/prototype_history.sql @@ -59,4 +59,53 @@ CREATE TABLE history.locations location contact info CREATE TABLE history.arms -*/ \ No newline at end of file +*/ + +/* +Create the history +*/ + +CREATE TYPE history.updatable_catetories AS ENUM + ('Actual', 'Anticipated', 'Expected'); + +ALTER TYPE history.updatable_catetories + OWNER TO root; + +COMMENT ON TYPE history.updatable_catetories + IS 'This enum is used to capture the different types of categories that a date or enrollemnt figure may have.'; + + + +CREATE TYPE history.study_statuses AS ENUM + ('Available', 'Withdrawn', 'Withheld', 'Temporarily not available', 'Active, not recruiting', 'Recruiting', 'Not yet recruiting', 'Enrolling by invitation', 'Suspended', 'No longer available', 'Approved for marketing', 'Unknown status', 'Completed', 'Terminated'); + +ALTER TYPE history.study_statuses + OWNER TO root; + + + + +-- Table: history.trial_snapshots + +-- DROP TABLE IF EXISTS history.trial_snapshots; + +CREATE TABLE IF NOT EXISTS history.trial_snapshots +( + nct_id character varying(15) COLLATE pg_catalog."default" NOT NULL, + version integer NOT NULL, + primary_completion_date timestamp without time zone, + primary_completion_date_category history.updatable_catetories, + completion_date timestamp without time zone, + completion_date_category history.updatable_catetories, + overall_status history.study_statuses, + enrollment integer, + enrollment_category history.updatable_catetories, + sponsor character varying(255) COLLATE pg_catalog."default", + responsible_party character varying(255) COLLATE pg_catalog."default", + CONSTRAINT trial_snapshots_pkey PRIMARY KEY (nct_id, version) +) + +TABLESPACE pg_default; + +ALTER TABLE IF EXISTS history.trial_snapshots + OWNER to root; \ No newline at end of file