extraction-lib as a functional module and the associated history file.

4 years ago · 453e82974e
parent a9027c9467
commit 453e82974e
2 changed files with 74 additions and 20 deletions
--- a/Parser/extraction-lib.py
+++ b/Parser/extraction-lib.py
@ -1,9 +1,8 @@
 from collections import namedtuple
 from copy import copy
 from datetime import datetime
-from ensurepip import version
+
 from bs4 import BeautifulSoup
 import abc
 import textprocessing as tp #cuz tp is important
 #requires Python 3.10
@ -50,6 +49,9 @@ class VersionData():
        #self._responsible_party_category = None #I don't believe this is included in the raw data
        #self._collaborators = None #currently going to ignore as I've not found it in AACT
    def load_to_db(db_cursor):
        #load to initial table, then load any extra details into other tables
        pass
 def extract_study_statuses(study_status_form, version_a,version_b):
    """
@ -66,21 +68,21 @@ def extract_study_statuses(study_status_form, version_a,version_b):
        match tr_to_td(trow):
            case ["Primary Completion:" as row_label, tag]: 
                old,new = split_by_version(tag)
-                tagdate1 = extract_date_and_tag(old.text,date_MMMM_YYYY)
+                tagdate1 = extract_date_and_tag(old.text)
                version_a._primary_completion_date = tagdate1.date
                version_a._primary_completion_date_category = tagdate1.tag
-                tagdate2 = extract_date_and_tag(new.text,date_MMMM_YYYY)
+                tagdate2 = extract_date_and_tag(new.text)
                version_b._primary_completion_date = tagdate2.date
                version_b._primary_completion_date_category = tagdate2.tag
            case ["Study Completion:" as row_label, tag]:
                old,new = split_by_version(tag)
-                tagdate1 = extract_date_and_tag(old.text,date_MMMM_YYYY)
+                tagdate1 = extract_date_and_tag(old.text)
                version_a._completion_date = tagdate1.date
                version_a._completion_date_category = tagdate1.tag                
-                tagdate2 = extract_date_and_tag(new.text,date_MMMM_YYYY)
+                tagdate2 = extract_date_and_tag(new.text)
                version_b._completion_date = tagdate2.date
                version_b._completion_date_category = tagdate2.tag
@ -101,7 +103,6 @@ def extract_study_design(study_status_form, version_a,version_b):
    #iterate through rows, 
    for trow in rows:
        #matching on rowLabels
        #print(trow.__str__()[:80])
        match tr_to_td(trow):
            case ["Enrollment:" as row_label, tag]: 
                old,new = split_by_version(tag)
@ -125,7 +126,6 @@ def extract_sponsor_data(study_status_form, version_a,version_b):
    #iterate through rows, 
    for trow in rows:
        #matching on rowLabels
        #print(trow.__str__()[:80])
        match tr_to_td(trow):
            case ["Sponsor:" as row_label, tag]: 
                old, new = split_by_version(tag)
@ -157,16 +157,12 @@ def split_by_version(tag):
    return old,new
-def extract_date_and_tag(text, date_format):
+def extract_date_and_tag(text):
    """
    Extracts a date, trying each known date format in turn,
    and the estimate tag (the text before the closing ] bracket), if present.
    """
    #FIX: Currently, there are multiple (mixed) date formats in use
    #These can exist in the same data field, in two different versions
    #so instead of using a single (passed) date format, I need to
    #select between various date formats.
    text = text.strip()
@ -179,7 +175,11 @@ def extract_date_and_tag(text, date_format):
        estimate_tag = date_split[1].split("]")[0].strip()
    else:
        estimate_tag = None
-    date_object = datetime.strptime(date_split[0].strip(), date_format)
+    
    try:
        date_object = datetime.strptime(date_split[0].strip(), date_MMMM_YYYY)
    except ValueError as ve:
        date_object = datetime.strptime(date_split[0].strip(), date_MMMM_DD_YYYY)
    return TagDatePair(estimate_tag, date_object)
@ -274,14 +274,19 @@ def get_forms(soup,version_a,version_b):
 date_MMMM_YYYY = "%B %Y"
 date_MMMM_DD_YYYY = "%B %d, %Y"
 def get_data_from_versions(nct_id,html, version_a_int, version_b_int):
    soup = BeautifulSoup(html,"lxml")
    version_a = VersionData(nct_id, version_a_int)
    version_b = VersionData(nct_id, version_b_int)
    get_forms(soup, version_a, version_b)
    return version_a,version_b
 if __name__ == "__main__":
    for file in ["./NCT00658567.html", "./NCT01303796.html"]:
        with open(file) as fh:
-            soup = BeautifulSoup(fh, "lxml")
+            version1, version2 = get_data_from_versions(file, fh.read(), 1,2)
        version1 = VersionData("NCT00658567",1)
        version2 = VersionData("NCT00658567",2)
        get_forms(soup, version1, version2)
        print(version1.__dict__) #order messed up somewhere
        print(version2.__dict__) #order messed up somewhere
--- a/Parser/prototype_history.sql
+++ b/Parser/prototype_history.sql
@ -60,3 +60,52 @@ CREATE TABLE history.locations
 CREATE TABLE history.arms
 */
 /*
 Create the history 
 */
 CREATE TYPE history.updatable_catetories AS ENUM
    ('Actual', 'Anticipated', 'Expected');
 ALTER TYPE history.updatable_catetories
    OWNER TO root;
 COMMENT ON TYPE history.updatable_catetories
    IS 'This enum is used to capture the different types of categories that a date or enrollemnt figure may have.';
 CREATE TYPE history.study_statuses AS ENUM
    ('Available', 'Withdrawn', 'Withheld', 'Temporarily not available', 'Active, not recruiting', 'Recruiting', 'Not yet recruiting', 'Enrolling by invitation', 'Suspended', 'No longer available', 'Approved for marketing', 'Unknown status', 'Completed', 'Terminated');
 ALTER TYPE history.study_statuses
    OWNER TO root;
 -- Table: history.trial_snapshots
 -- DROP TABLE IF EXISTS history.trial_snapshots;
 CREATE TABLE IF NOT EXISTS history.trial_snapshots
 (
    nct_id character varying(15) COLLATE pg_catalog."default" NOT NULL,
    version integer NOT NULL,
    primary_completion_date timestamp without time zone,
    primary_completion_date_category history.updatable_catetories,
    completion_date timestamp without time zone,
    completion_date_category history.updatable_catetories,
    overall_status history.study_statuses,
    enrollment integer,
    enrollment_category history.updatable_catetories,
    sponsor character varying(255) COLLATE pg_catalog."default",
    responsible_party character varying(255) COLLATE pg_catalog."default",
    CONSTRAINT trial_snapshots_pkey PRIMARY KEY (nct_id, version)
 )
 TABLESPACE pg_default;
 ALTER TABLE IF EXISTS history.trial_snapshots
    OWNER to root;