extraction-lib as a functional module and the associated history file.

4 years ago · 453e82974e
parent a9027c9467
commit 453e82974e
2 changed files with 74 additions and 20 deletions
--- a/Parser/extraction-lib.py
+++ b/Parser/extraction-lib.py
@ -1,9 +1,8 @@
 from collections import namedtuple
 from copy import copy
 from datetime import datetime
-from ensurepip import version
+
 from bs4 import BeautifulSoup
-import abc
 import textprocessing as tp #cuz tp is important
 #requires Python 3.10

@ -50,6 +49,9 @@ class VersionData():
        #self._responsible_party_category = None #I don't believe this is included in the raw data
        #self._collaborators = None #currently going to ignore as I've not fount it in AACT

+    def load_to_db(self, db_cursor):
+        """
+        Load this version's data into the database through db_cursor.
+
+        Not implemented yet: the body is a placeholder and does nothing.
+        """
+        #load to initial table, then load any extra details into other tables
+        pass

 def extract_study_statuses(study_status_form, version_a,version_b):
    """
@ -66,21 +68,21 @@ def extract_study_statuses(study_status_form, version_a,version_b):
        match tr_to_td(trow):
            case ["Primary Completion:" as row_label, tag]: 
                old,new = split_by_version(tag)
-                tagdate1 = extract_date_and_tag(old.text,date_MMMM_YYYY)
+                tagdate1 = extract_date_and_tag(old.text)
                version_a._primary_completion_date = tagdate1.date
                version_a._primary_completion_date_category = tagdate1.tag
                
-                tagdate2 = extract_date_and_tag(new.text,date_MMMM_YYYY)
+                tagdate2 = extract_date_and_tag(new.text)
                version_b._primary_completion_date = tagdate2.date
                version_b._primary_completion_date_category = tagdate2.tag

            case ["Study Completion:" as row_label, tag]:
                old,new = split_by_version(tag)
-                tagdate1 = extract_date_and_tag(old.text,date_MMMM_YYYY)
+                tagdate1 = extract_date_and_tag(old.text)
                version_a._completion_date = tagdate1.date
                version_a._completion_date_category = tagdate1.tag                

-                tagdate2 = extract_date_and_tag(new.text,date_MMMM_YYYY)
+                tagdate2 = extract_date_and_tag(new.text)
                version_b._completion_date = tagdate2.date
                version_b._completion_date_category = tagdate2.tag

@ -101,7 +103,6 @@ def extract_study_design(study_status_form, version_a,version_b):
    #iterate through rows, 
    for trow in rows:
        #matching on rowLabels
-        #print(trow.__str__()[:80])
        match tr_to_td(trow):
            case ["Enrollment:" as row_label, tag]: 
                old,new = split_by_version(tag)
@ -125,7 +126,6 @@ def extract_sponsor_data(study_status_form, version_a,version_b):
    #iterate through rows, 
    for trow in rows:
        #matching on rowLabels
-        #print(trow.__str__()[:80])
        match tr_to_td(trow):
            case ["Sponsor:" as row_label, tag]: 
                old, new = split_by_version(tag)
@ -157,16 +157,12 @@ def split_by_version(tag):
    return old,new


-def extract_date_and_tag(text, date_format):
+def extract_date_and_tag(text):
    """
    Extracts a datetype according to the date format
    and the estimate tag based on 

    """
-    #FIX: Currently, there are multiple (mixed) data formats in use
-    #Theses can exist in the same data field, in two different versions
-    #so instead of using a single (passed) data format, I need to 
-    #select between various data formats.

    text = text.strip()

@ -179,8 +175,12 @@ def extract_date_and_tag(text, date_format):
        estimate_tag = date_split[1].split("]")[0].strip()
    else:
        estimate_tag = None
-    date_object = datetime.strptime(date_split[0].strip(), date_format)
    
+    try:
+        date_object = datetime.strptime(date_split[0].strip(), date_MMMM_YYYY)
+    except ValueError as ve:
+        date_object = datetime.strptime(date_split[0].strip(), date_MMMM_DD_YYYY)
+
    return TagDatePair(estimate_tag, date_object)


@ -274,14 +274,19 @@ def get_forms(soup,version_a,version_b):
 date_MMMM_YYYY = "%B %Y"
 date_MMMM_DD_YYYY = "%B %d, %Y"

+def get_data_from_versions(nct_id,html, version_a_int, version_b_int):
+    """
+    Parses html with the lxml parser and hands the result to get_forms
+    together with one VersionData per requested version number.
+
+    Returns the two VersionData objects as a (version_a, version_b) tuple.
+    """
+    parsed_page = BeautifulSoup(html,"lxml")
+    #build the containers in the same order the version numbers were passed
+    versions = tuple(
+        VersionData(nct_id, version_number)
+        for version_number in (version_a_int, version_b_int)
+    )
+    get_forms(parsed_page, *versions)
+
+    return versions
+
+
 if __name__ == "__main__":

     for file in ["./NCT00658567.html", "./NCT01303796.html"]:
         with open(file) as fh:
-            soup = BeautifulSoup(fh, "lxml")
-        
-        version1 = VersionData("NCT00658567",1)
-        version2 = VersionData("NCT00658567",2)
-        get_forms(soup, version1, version2)
+            #the NCT id is the file name without the leading "./" and the ".html" suffix;
+            #passing the raw path would hand "./NCT00658567.html" to VersionData as the id
+            nct_id = file.removeprefix("./").removesuffix(".html")
+            version1, version2 = get_data_from_versions(nct_id, fh.read(), 1,2)
         print(version1.__dict__) #order messed up somewhere:w
         print(version2.__dict__) #order messed up somewhere:w
--- a/Parser/prototype_history.sql
+++ b/Parser/prototype_history.sql
@ -59,4 +59,53 @@ CREATE TABLE history.locations
    location contact info

 CREATE TABLE history.arms
-*/
+*/
+
+/*
+Create the enum types and the trial_snapshots table in the history schema.
+*/
+
+-- Enum of the category labels ('Actual', 'Anticipated', 'Expected') that can
+-- qualify a date or an enrollment figure; ownership is assigned to role root.
+-- NOTE(review): the type name is spelled "catetories" (sic). The *_category
+-- columns of history.trial_snapshots reference this exact spelling, so any
+-- rename has to update those column definitions as well.
+CREATE TYPE history.updatable_catetories AS ENUM
+    ('Actual', 'Anticipated', 'Expected');
+
+ALTER TYPE history.updatable_catetories
+    OWNER TO root;
+
+COMMENT ON TYPE history.updatable_catetories
+    IS 'This enum is used to capture the different types of categories that a date or enrollemnt figure may have.';
+
+
+
+-- Enum of the values accepted by the overall_status column of
+-- history.trial_snapshots; ownership is assigned to role root.
+-- NOTE(review): presumably this list mirrors the status vocabulary of the
+-- upstream trial registry -- confirm against the data source before extending it.
+CREATE TYPE history.study_statuses AS ENUM
+    ('Available', 'Withdrawn', 'Withheld', 'Temporarily not available', 'Active, not recruiting', 'Recruiting', 'Not yet recruiting', 'Enrolling by invitation', 'Suspended', 'No longer available', 'Approved for marketing', 'Unknown status', 'Completed', 'Terminated');
+
+ALTER TYPE history.study_statuses
+    OWNER TO root;
+
+
+
+
+-- Table: history.trial_snapshots
+
+-- DROP TABLE IF EXISTS history.trial_snapshots;
+
+-- Holds one row per (nct_id, version) pair; the composite primary key enforces this.
+-- The two completion dates and the enrollment count each have a companion
+-- *_category column typed as history.updatable_catetories, and overall_status
+-- is typed as history.study_statuses. Only nct_id and version are NOT NULL.
+-- NOTE(review): the columns appear to correspond to the fields that
+-- Parser/extraction-lib.py sets on VersionData -- confirm the mapping before relying on it.
+CREATE TABLE IF NOT EXISTS history.trial_snapshots
+(
+    nct_id character varying(15) COLLATE pg_catalog."default" NOT NULL,
+    version integer NOT NULL,
+    primary_completion_date timestamp without time zone,
+    primary_completion_date_category history.updatable_catetories,
+    completion_date timestamp without time zone,
+    completion_date_category history.updatable_catetories,
+    overall_status history.study_statuses,
+    enrollment integer,
+    enrollment_category history.updatable_catetories,
+    sponsor character varying(255) COLLATE pg_catalog."default",
+    responsible_party character varying(255) COLLATE pg_catalog."default",
+    CONSTRAINT trial_snapshots_pkey PRIMARY KEY (nct_id, version)
+)
+
+TABLESPACE pg_default;
+
+-- Assign ownership of the table to role root, matching the enum types above it in this file.
+ALTER TABLE IF EXISTS history.trial_snapshots
+    OWNER to root;