Added feature to extract start_date. Deleted unused files and modified others for ease of use.

4 years ago · 9d5a726494
parent 453e82974e
commit 9d5a726494
4 changed files with 2130 additions and 1446 deletions
--- a/Parser/NCT01303796.html
+++ b/Parser/NCT01303796.html
--- a/Parser/extraction-lib.py
+++ b/Parser/extraction-lib.py
@ -1,9 +1,9 @@
 from collections import namedtuple
 from copy import copy
 from datetime import datetime
-
+import psycopg2
 from bs4 import BeautifulSoup
-import textprocessing as tp #cuz tp is important
+#import textprocessing as tp #cuz tp is important
 #requires Python 3.10


@ -28,12 +28,14 @@ class VersionData():
    """
    def __init__(self,nct_id,version_id):
        #identifiers
-        self.nct_id = nct_id
+        self.nct_id = nct_id.strip()
        self.version_id = version_id

        #Study Status
        self._primary_completion_date = None
        self._primary_completion_date_category = None
+        self._start_date = None
+        self._start_date_category = None
        self._completion_date = None
        self._completion_date_category = None
        self._overall_status = None
@ -49,9 +51,73 @@ class VersionData():
        #self._responsible_party_category = None #I don't believe this is included in the raw data
        #self._collaborators = None #currently going to ignore as I've not found it in AACT

-    def load_to_db(db_cursor):
+    def load_to_db(self,db_connection):
        #load to initial table, then load any extra details into other tables
-        pass
+        sql = """
+        INSERT INTO history.trial_snapshots
+            (
+                nct_id,
+                version,
+                primary_completion_date,
+                primary_completion_date_category,
+                start_date,
+                start_date_category,
+                completion_date,
+                completion_date_category,
+                overall_status,
+                enrollment,
+                enrollment_category,
+                sponsor,
+                responsible_party
+            )
+            VALUES
+            (
+                %s,
+                %s,
+                %s,
+                %s,
+                %s,
+                %s,
+                %s,
+                %s,
+                %s,
+                %s,
+                %s,
+                %s,
+                %s
+            )
+        """
+
+        with db_connection.cursor() as db_cursor:
+            try:
+                db_cursor.execute(
+                    sql,
+                    (
+                        self.nct_id,
+                        self.version_id,
+                        self._primary_completion_date,
+                        self._primary_completion_date_category,
+                        self._start_date,
+                        self._start_date_category,
+                        self._completion_date,
+                        self._completion_date_category,
+                        self._overall_status,
+                        self._enrollment,
+                        self._enrollment_category,
+                        self._sponsor,
+                        self._responsible_party
+                    )
+                )
+            except Exception as err:
+                #catch any error, print the applicable information, and raise the error.
+                print(self)
+                raise err
+        
+def optional_strip(possible_string):
+    if type(possible_string) == str:
+        return possible_string.strip()
+    else:
+        return possible_string

 def extract_study_statuses(study_status_form, version_a,version_b):
    """
@ -70,26 +136,36 @@ def extract_study_statuses(study_status_form, version_a,version_b):
                old,new = split_by_version(tag)
                tagdate1 = extract_date_and_tag(old.text)
                version_a._primary_completion_date = tagdate1.date
-                version_a._primary_completion_date_category = tagdate1.tag
+                version_a._primary_completion_date_category = optional_strip(tagdate1.tag)
                
                tagdate2 = extract_date_and_tag(new.text)
                version_b._primary_completion_date = tagdate2.date
-                version_b._primary_completion_date_category = tagdate2.tag
+                version_b._primary_completion_date_category = optional_strip(tagdate2.tag)
+
+            case ["Study Start:" as row_label, tag]: 
+                old,new = split_by_version(tag)
+                tagdate1 = extract_date_and_tag(old.text)
+                version_a._start_date = tagdate1.date
+                version_a._start_date_category = optional_strip(tagdate1.tag)
+                
+                tagdate2 = extract_date_and_tag(new.text)
+                version_b._start_date = tagdate2.date
+                version_b._start_date_category = optional_strip(tagdate2.tag)

            case ["Study Completion:" as row_label, tag]:
                old,new = split_by_version(tag)
                tagdate1 = extract_date_and_tag(old.text)
                version_a._completion_date = tagdate1.date
-                version_a._completion_date_category = tagdate1.tag                
+                version_a._completion_date_category = optional_strip(tagdate1.tag)              

                tagdate2 = extract_date_and_tag(new.text)
                version_b._completion_date = tagdate2.date
-                version_b._completion_date_category = tagdate2.tag
+                version_b._completion_date_category = optional_strip(tagdate2.tag)

            case ["Overall Status:" as row_label, tag]:
                old,new = split_by_version(tag)
-                version_a._overall_status = old.text
-                version_b._overall_status = new.text              
+                version_a._overall_status = optional_strip(old.text)
+                version_b._overall_status = optional_strip(new.text)         


 def extract_study_design(study_status_form, version_a,version_b):
@ -108,11 +184,11 @@ def extract_study_design(study_status_form, version_a,version_b):
                old,new = split_by_version(tag)
                tagdate1 = extract_text_and_tag(old.text)
                version_a._enrollment = tagdate1.text
-                version_a._enrollment_category = tagdate1.tag
+                version_a._enrollment_category = optional_strip(tagdate1.tag)
                
                tagdate2 = extract_text_and_tag(new.text)
                version_b._enrollment = tagdate2.text
-                version_b._enrollment_category = tagdate2.tag
+                version_b._enrollment_category = optional_strip(tagdate2.tag)

             
 def extract_sponsor_data(study_status_form, version_a,version_b):
@ -129,13 +205,13 @@ def extract_sponsor_data(study_status_form, version_a,version_b):
        match tr_to_td(trow):
            case ["Sponsor:" as row_label, tag]: 
                old, new = split_by_version(tag)
-                version_a._sponsor = old.text
-                version_b._sponsor = new.text
+                version_a._sponsor = optional_strip(old.text)
+                version_b._sponsor = optional_strip(new.text)

            case ["Responsible Party:" as row_label, tag]: 
                old, new = split_by_version(tag)
-                version_a._responsible_party = old.text
-                version_b._responsible_party = new.text
+                version_a._responsible_party = optional_strip(old.text)
+                version_b._responsible_party = optional_strip(new.text)

            case ["Collaborators:" as row_label, tag]: 
                #old, new = split_by_version(tag)
@ -285,8 +361,14 @@ def get_data_from_versions(nct_id,html, version_a_int, version_b_int):

 if __name__ == "__main__":

-    for file in ["./NCT00658567.html", "./NCT01303796.html"]:
-        with open(file) as fh:
+    for file in ["NCT00658567", "NCT01303796"]:
+        with open("./{}.html".format(file)) as fh:
            version1, version2 = get_data_from_versions(file, fh.read(), 1,2)
        print(version1.__dict__) #order messed up somewhere:w
        print(version2.__dict__) #order messed up somewhere:w
+
+
+        with psycopg2.connect(dbname="aact_db", user="root", password="root",host="will-office") as db_connection:
+            version1.load_to_db(db_connection)
+            version2.load_to_db(db_connection)
+            #print(db_connection)
--- a/Parser/prototype_history.sql
+++ b/Parser/prototype_history.sql
@ -87,7 +87,7 @@ ALTER TYPE history.study_statuses

 -- Table: history.trial_snapshots

-- DROP TABLE IF EXISTS history.trial_snapshots;
+DROP TABLE IF EXISTS history.trial_snapshots;

 CREATE TABLE IF NOT EXISTS history.trial_snapshots
 (
@ -95,6 +95,8 @@ CREATE TABLE IF NOT EXISTS history.trial_snapshots
    version integer NOT NULL,
    primary_completion_date timestamp without time zone,
    primary_completion_date_category history.updatable_catetories,
+    start_date timestamp without time zone,
+    start_date_category history.updatable_catetories,
    completion_date timestamp without time zone,
    completion_date_category history.updatable_catetories,
    overall_status history.study_statuses,
@ -103,7 +105,7 @@ CREATE TABLE IF NOT EXISTS history.trial_snapshots
    sponsor character varying(255) COLLATE pg_catalog."default",
    responsible_party character varying(255) COLLATE pg_catalog."default",
    CONSTRAINT trial_snapshots_pkey PRIMARY KEY (nct_id, version)
-)
+);

 TABLESPACE pg_default;

--- a/Parser/textprocessing.py
+++ b/Parser/textprocessing.py
@ -1,184 +0,0 @@
-from copy import copy
-from datetime import datetime
-from bs4 import BeautifulSoup
-import re
-
-form = """
-<tr>
-    <td colspan="2">
-      <form id="form_StudyStatus">
-        <div class="w3-responsive">
-          <fieldset class="entryReq" id="StudyStatus"
-            style="margin:auto;margin-bottom:1em;padding-bottom:0.5em;width:98%;">
-            <legend class="moduleLabel"> <img id="StudyStatusImg" class="toggleImage"
-                onclick="toggleModule('StudyStatus');" src="html/images/collapse.png"
-                alt='Open or close this module'>
-              Study Status</legend>
-            <div id="StudyStatusBody" class="moduleBody">
-              <table class="indent1 moduleTable resultTable">
-                <thead>
-                  <tr>
-                    <th style="width:210px;"></th>
-                    <th></th>
-                  </tr>
-                </thead>
-                <tbody>
-                  <tr>
-                    <td class="rowLabel" style="min-width: 210px;">Record Verification:</td>
-                    <td>April <span class="drop_hilite">2008</span> <span class="add_hilite">2017</span> </td>
-                  </tr>
-                  <tr>
-                    <td class="rowLabel" style="min-width: 210px;">Overall Status:</td>
-                    <td><span class="drop_hilite">Recruiting</span> <span class="add_hilite">Completed</span></td>
-                  </tr>
-                  <tr>
-                    <td class="rowLabel" style="min-width: 210px;">Study Start:</td>
-                    <td>March 2008 </td>
-                  </tr>
-                  <tr>
-                    <td class="rowLabel" style="min-width: 210px;">Primary Completion:</td>
-                    <td> <span class="add_hilite">December 2009 [Actual]</span> </td>
-                  </tr>
-                  <tr>
-                    <td class="rowLabel" style="min-width: 210px;">Study Completion:</td>
-                    <td>December 2009 [ <span class="drop_hilite">Anticipated</span> <span
-                        class="add_hilite">Actual</span>] </td>
-                  </tr>
-                  <tr style="border-bottom:1px solid lightgray">
-                    <td colspan="3"></td>
-                  </tr>
-                  <tr>
-                    <td class="rowLabel" style="min-width: 210px;">First Submitted:</td>
-                    <td>April 10, 2008 </td>
-                  </tr>
-                  <tr>
-                    <td class="rowLabel" style="min-width: 210px;">First Submitted that<br />Met QC Criteria:</td>
-                    <td>April 10, 2008 </td>
-                  </tr>
-                  <tr>
-                    <td class="rowLabel" style="min-width: 210px;">First Posted:</td>
-                    <td>April 15, 2008 [Estimate] </td>
-                  </tr>
-                  <tr style="border-bottom:1px solid lightgray">
-                    <td colspan="3"></td>
-                  </tr>
-                  <tr>
-                    <td class="rowLabel" style="min-width: 210px;">Results First Submitted:</td>
-                    <td> <span class="add_hilite">February 6, 2014</span> </td>
-                  </tr>
-                  <tr>
-                    <td class="rowLabel" style="min-width: 210px;">Results First Submitted that<br />Met QC
-                      Criteria:</td>
-                    <td> <span class="add_hilite">August 29, 2014</span> </td>
-                  </tr>
-                  <tr>
-                    <td class="rowLabel" style="min-width: 210px;">Results First Posted:</td>
-                    <td> <span class="add_hilite">September 9, 2014 [Estimate]</span> </td>
-                  </tr>
-                  <tr style="border-bottom:1px solid lightgray">
-                    <td colspan="3"></td>
-                  </tr>
-                  <tr>
-                    <td class="rowLabel" style="min-width: 210px;">Last Update Submitted that<br />Met QC Criteria:
-                    </td>
-                    <td>April <span class="drop_hilite">10</span> <span class="add_hilite">18</span>, <span
-                        class="drop_hilite">2008</span> <span class="add_hilite">2017</span> </td>
-                  </tr>
-                  <tr>
-                    <td class="rowLabel" style="min-width: 210px;">Last Update Posted:</td>
-                    <td><span class="drop_hilite">April 15, 2008 [Estimate]</span> <span class="add_hilite">May 19,
-                        2017 [Actual]</span> </td>
-                  </tr>
-                </tbody>
-              </table>
-            </div>
-          </fieldset>
-        </div>
-      </form>
-    </td>
-  </tr>
-"""
-
-
-entry1 = """
-<tr>
-<td class="rowLabel" style="min-width: 210px;">Record Verification:</td>
-<td>April <span class="drop_hilite">2008</span> <span class="add_hilite">2017</span> </td>
-</tr>
-"""
-
-
-entry2 = '<td> <span class="add_hilite">December 2009 [Actual]</span> </td>'
-
-DROP_HILITE_re = re.compile('<span class="drop_hilite">[\[\]\w]*</span>\s?')
-ADD_HILITE_re = re.compile('<span class="add_hilite">\w*</span>\s?')
-TAGS_RE = re.compile('<[=-_,.:;"/\w\s]+>')
-
-def extract_new_data(td):
-    text = td.__str__()
-    return TAGS_RE.sub("",DROP_HILITE_re.sub(" ",text)).strip()
-
-def extract_old_data(td):
-    text = td.__str__()
-    return TAGS_RE.sub("",ADD_HILITE_re.sub(" ",text)).strip()
-
-def delete_tags(td):
-    text = td.__str__()
-    return TAGS_RE.sub(" ",text).strip()
-
-
-def extract_date_and_tag(text, date_format):
-    """
-    Extracts a datetype according to the date format
-    and the estimate tag based on 
-
-    """
-    if not text:
-        return " "
-        
-    date_split = text.split("[")
-    if len(date_split) > 1:
-        estimate_tag = date_split[1].split("]")[0].strip()
-    else:
-        estimate_tag = None
-    date_object = datetime.strptime(date_split[0].strip(), date_format)
-    
-    return estimate_tag, date_object
-    #TODO: Write test
-
-def extract_text_and_tag(text):
-    """
-    
-    """
-    pass
-
-if __name__ == "__main__":
-    Entry = BeautifulSoup(entry1, "lxml")
-    Form = BeautifulSoup(form, "lxml")
-
-
-
-    print(extract_new_data(Entry.find_all("td")[1]))
-    print(extract_old_data(Entry.find_all("td")[1]))
-
-    for tr in Form.find_all("tr"):
-        data = tr.find_all("td")
-        match len(data):
-            case 0: print("no data")
-            case 1: print("1\t",data[0])
-            case _: print(len(data), "\t", extract_new_data(data[1]) ,"\t|\t", extract_old_data(data[1]))
-
-    #print(extract_date_and_tag(extract_old_data(Entry.find_all("td")[1]), "%B %Y"))
-    print(extract_date_and_tag("April 2008 [ test ]", "%B %Y"))
-
-
-    Entry2 = BeautifulSoup(entry2,"lxml")
-    print(extract_old_data(Entry2)) #error here.
-    print(extract_new_data(Entry2))
-
-
-    Entry3 = copy(Entry2)
-    print(Entry3)
-    Entry4 = Entry3.find_all(class_="add_hilite")[0].extract()
-    print(Entry3.text)
-    print(Entry4.text)