From 71e87a9abe924ec6883963580dc3f17944652492 Mon Sep 17 00:00:00 2001
From: will king <youainti@protonmail.com>
Date: Wed, 8 Jun 2022 19:04:47 -0700
Subject: [PATCH] Saving current status.

---
 Parser/.vscode/launch.json   |  16 +++++
 Parser/extraction-lib.py     | 125 ++++++++++++++++++++++++++++++++++-
 Parser/prototype_history.sql |  62 +++++++++++++++++
 Parser/textprocessing.py     | 121 +++++++++++++++++++++++++++++++++
 4 files changed, 321 insertions(+), 3 deletions(-)
 create mode 100644 Parser/.vscode/launch.json
 create mode 100644 Parser/prototype_history.sql
 create mode 100644 Parser/textprocessing.py
diff --git a/Parser/.vscode/launch.json b/Parser/.vscode/launch.json
new file mode 100644
index 0000000..306f58e
--- /dev/null
+++ b/Parser/.vscode/launch.json
@@ -0,0 +1,16 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python: Current File",
+            "type": "python",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal",
+            "justMyCode": true
+        }
+    ]
+}
\ No newline at end of file
diff --git a/Parser/extraction-lib.py b/Parser/extraction-lib.py
index 158d7f2..2d867c7 100644
--- a/Parser/extraction-lib.py
+++ b/Parser/extraction-lib.py
@@ -1,8 +1,127 @@
+from tokenize import String
 from bs4 import BeautifulSoup
+import abc
+import textprocessing as tp #cuz tp is important
+#requires Python 3.10
+
+def extract_data_from_tr(tr) -> tuple[str, str, str]:
+    """
+    Planned: take an HTML data row (<tr>) and return (row_label, old, new), all strings.
+
+    The label is to come from the first <td> (class "rowLabel"); the second <td> is to be
+    split into its old and new text. Tag stripping is meant to live in ./textprocessing.py,
+    kept separate so it can be tested on its own.
+
+    NOTE: not implemented yet -- the body is a stub and currently returns None.
+    """
+    #get list of cells
+    #for cell in cells
+    #if class_=="rowLabel", extract text
+    #else parse out new and old text
+    #return triple: row_label, old, new
+    pass
+
+#superclasses
+class VersionData(abc.ABC):
+    """
+    This abstract class holds two types of data:
+        - Data with a 1-to-1 relationship with the trial/version pair.
+        - Data with a child relationship with the trial/version pair.
+
+    Each subclass will return the 1-to-1 data for another system to add to the DB.
+    This is so that a single record can be created in one go.
+    Each subclass will load the child data to the database directly.
+    """
+    @abc.abstractmethod
+    def version_fields(self):
+        """
+        This function returns data that should be included in a standard table
+        related to version_x of the record.
+
+        Open question: should it also return the column names?
+        """
+        pass
+    @abc.abstractmethod
+    def version_records(self, foreign_key, db_cursor):
+        """
+        This function loads data that needs to be held in auxiliary tables
+        into the database.
+        For example, the list of sponsors will need to be tracked separately from
+        trial status.
+        """
+        pass
+
+
+class StudyStatusData(VersionData):  # presumably the Study Status fields of one trial/version pair
+    columns = ["primary_completion_date", "completion_date", "last_update_posted_date"]
+    # NOTE(review): the constructor below discards its arguments; the abstract methods are not overridden here.
+    def __init__(self ,primary_completion_date, completion_date, last_update_posted_date) -> None:
+        pass
+
+def extract_study_statuses(study_status_form, version_a,version_b):
+    """
+    Planned: extract data from a study_status form and return one or two StudyStatusData objects (stub: returns None for now).
+    """
+    pass
+
+class SponsorCollaboratorsData(VersionData):  # placeholder -- no fields or behavior defined yet
+    columns=[]  # empty for now
+    def __init__(self) -> None:
+        pass
+    
+
+
+def get_forms(soup):
+    """Visit each <form> under soup.body and dispatch on its id; handlers are stubs, returns None."""
+    # A document parsed without a <body> has no forms; skip it instead of failing on None.
+    forms = soup.body.find_all("form") if soup.body is not None else []
+    #extract all forms
+    for form in forms:
+        #Match forms against ID types
+        if "id" not in form.attrs:
+            continue
+
+        match form.attrs["id"]:
+            case "form_StudyStatus":
+                print("test successful 2")
+            case "form_SponsorCollaborators":
+                pass
+            case "form_Oversight":
+                pass
+            case "form_StudyDescription":
+                pass
+            case "form_Conditions":
+                pass
+            case "form_StudyDesign":
+                pass
+            case "form_ArmsandInterventions":
+                pass
+            case "form_ProtocolOutcomeMeasures":
+                pass
+            case "form_Eligibility":
+                pass
+            case "form_ContactsLocations":
+                pass
+            case "form_IPDSharing":
+                pass
+            case "form_References":
+                pass
+            case "form_ParticipantFlow":
+                pass
+            case "form_BaselineCharacteristics":
+                pass
+            case "form_ROutcomeMeasures":
+                pass
+            case "form_AdverseEvents":
+                pass
+            case "form_LimitationsandCaveats":
+                pass
+            case "form_MoreInformation":
+                pass
+            case _:
+                print(form.attrs["id"])
 
 if __name__ == "__main__":
     with open("./NCT00658567.html") as fh:
         soup = BeautifulSoup(fh, "lxml")
-
-    print(soup)
-
+    get_forms(soup)
\ No newline at end of file
diff --git a/Parser/prototype_history.sql b/Parser/prototype_history.sql
new file mode 100644
index 0000000..53e9be0
--- /dev/null
+++ b/Parser/prototype_history.sql
@@ -0,0 +1,62 @@
+/*
+Create schema history
+
+
+CREATE TABLE history.versions
+    nct_id
+    version
+    --Study Status
+    overall_status
+    primary_completion_date
+    completion_date
+    last_update_submitted_date
+    --SponsorCollaborators
+    sponsor (multi?)
+    collaborators (multi?)
+    --Oversight
+    fda_regulated_drug (ignore)
+    fda_regulated_device (ignore)
+    dmc (ignore)
+    --StudyDescription
+    summary    
+    detailed_description
+    --Conditions
+    Conditions
+    Keywords
+    --StudyDesign
+    Study type
+    Primary Purpose
+    Study Phase
+    Interventional Study Model
+    Number of Arms
+    Masking
+    Allocation
+    Enrollment
+    --ArmsAndInterventions
+    Arms (multiple) (Ignore)
+    --ProtocolOutcomeMeasures
+    --Eligibility
+    --ContactsLocation
+    --IPDSharing
+    --References
+    --ParticipantFlow
+    --BaselineCharacteristics
+    --ROutcomeMeasures
+    --AdverseEvents
+    --LimitationsAndCaveats
+    --More Information
+
+
+CREATE TABLE history.collaborators
+    nct_id
+    version
+    collaborator_name
+
+CREATE TABLE history.locations
+    nct_id
+    version
+    location name
+    location contact info
+
+CREATE TABLE history.arms
+*/
\ No newline at end of file
diff --git a/Parser/textprocessing.py b/Parser/textprocessing.py
new file mode 100644
index 0000000..8738f86
--- /dev/null
+++ b/Parser/textprocessing.py
@@ -0,0 +1,121 @@
+from cgitb import html
+import re
+
+form = """
+<tr>
+    <td colspan="2">
+      <form id="form_StudyStatus">
+        <div class="w3-responsive">
+          <fieldset class="entryReq" id="StudyStatus"
+            style="margin:auto;margin-bottom:1em;padding-bottom:0.5em;width:98%;">
+            <legend class="moduleLabel"> <img id="StudyStatusImg" class="toggleImage"
+                onclick="toggleModule('StudyStatus');" src="html/images/collapse.png"
+                alt='Open or close this module'>
+              Study Status</legend>
+            <div id="StudyStatusBody" class="moduleBody">
+              <table class="indent1 moduleTable resultTable">
+                <thead>
+                  <tr>
+                    <th style="width:210px;"></th>
+                    <th></th>
+                  </tr>
+                </thead>
+                <tbody>
+                  <tr>
+                    <td class="rowLabel" style="min-width: 210px;">Record Verification:</td>
+                    <td>April <span class="drop_hilite">2008</span> <span class="add_hilite">2017</span> </td>
+                  </tr>
+                  <tr>
+                    <td class="rowLabel" style="min-width: 210px;">Overall Status:</td>
+                    <td><span class="drop_hilite">Recruiting</span> <span class="add_hilite">Completed</span></td>
+                  </tr>
+                  <tr>
+                    <td class="rowLabel" style="min-width: 210px;">Study Start:</td>
+                    <td>March 2008 </td>
+                  </tr>
+                  <tr>
+                    <td class="rowLabel" style="min-width: 210px;">Primary Completion:</td>
+                    <td> <span class="add_hilite">December 2009 [Actual]</span> </td>
+                  </tr>
+                  <tr>
+                    <td class="rowLabel" style="min-width: 210px;">Study Completion:</td>
+                    <td>December 2009 [ <span class="drop_hilite">Anticipated</span> <span
+                        class="add_hilite">Actual</span>] </td>
+                  </tr>
+                  <tr style="border-bottom:1px solid lightgray">
+                    <td colspan="3"></td>
+                  </tr>
+                  <tr>
+                    <td class="rowLabel" style="min-width: 210px;">First Submitted:</td>
+                    <td>April 10, 2008 </td>
+                  </tr>
+                  <tr>
+                    <td class="rowLabel" style="min-width: 210px;">First Submitted that<br />Met QC Criteria:</td>
+                    <td>April 10, 2008 </td>
+                  </tr>
+                  <tr>
+                    <td class="rowLabel" style="min-width: 210px;">First Posted:</td>
+                    <td>April 15, 2008 [Estimate] </td>
+                  </tr>
+                  <tr style="border-bottom:1px solid lightgray">
+                    <td colspan="3"></td>
+                  </tr>
+                  <tr>
+                    <td class="rowLabel" style="min-width: 210px;">Results First Submitted:</td>
+                    <td> <span class="add_hilite">February 6, 2014</span> </td>
+                  </tr>
+                  <tr>
+                    <td class="rowLabel" style="min-width: 210px;">Results First Submitted that<br />Met QC
+                      Criteria:</td>
+                    <td> <span class="add_hilite">August 29, 2014</span> </td>
+                  </tr>
+                  <tr>
+                    <td class="rowLabel" style="min-width: 210px;">Results First Posted:</td>
+                    <td> <span class="add_hilite">September 9, 2014 [Estimate]</span> </td>
+                  </tr>
+                  <tr style="border-bottom:1px solid lightgray">
+                    <td colspan="3"></td>
+                  </tr>
+                  <tr>
+                    <td class="rowLabel" style="min-width: 210px;">Last Update Submitted that<br />Met QC Criteria:
+                    </td>
+                    <td>April <span class="drop_hilite">10</span> <span class="add_hilite">18</span>, <span
+                        class="drop_hilite">2008</span> <span class="add_hilite">2017</span> </td>
+                  </tr>
+                  <tr>
+                    <td class="rowLabel" style="min-width: 210px;">Last Update Posted:</td>
+                    <td><span class="drop_hilite">April 15, 2008 [Estimate]</span> <span class="add_hilite">May 19,
+                        2017 [Actual]</span> </td>
+                  </tr>
+                </tbody>
+              </table>
+            </div>
+          </fieldset>
+        </div>
+      </form>
+    </td>
+  </tr>
+"""
+
+
+entry1 = """
+<tr>
+<td class="rowLabel" style="min-width: 210px;">Record Verification:</td>
+<td>April <span class="drop_hilite">2008</span> <span class="add_hilite">2017</span> </td>
+</tr>
+"""
+
+
+# Highlight spans may hold multi-word text and wrap between "span" and "class", hence [^<]* and \s+.
+drop_old_re = re.compile(r'<span\s+class="drop_hilite">[^<]*</span>\s?')
+drop_new_re = re.compile(r'<span\s+class="add_hilite">[^<]*</span>\s?')
+# Any remaining tag: a negated class also covers "-", "'" and "(" inside attributes and stops at the first ">".
+drop_tags_re = re.compile(r'<[^<>]+>')
+
+# Demo output: old-version markup, new-version markup, all text, then old-version text for the row and the form.
+print(drop_new_re.sub("",entry1))
+print(drop_old_re.sub("",entry1))
+print(drop_tags_re.sub("",entry1))
+
+print(drop_tags_re.sub("",drop_new_re.sub("",entry1)))
+print(drop_tags_re.sub("",drop_new_re.sub("",form)))
\ No newline at end of file


Record Verification:	April 2008 2017
Overall Status:	Recruiting Completed
Study Start:	March 2008
Primary Completion:	December 2009 [Actual]
Study Completion:	December 2009 [ Anticipated Actual]

First Submitted:	April 10, 2008
First Submitted that Met QC Criteria:	April 10, 2008
First Posted:	April 15, 2008 [Estimate]

Results First Submitted:	February 6, 2014
Results First Submitted that Met QC Criteria:	August 29, 2014
Results First Posted:	September 9, 2014 [Estimate]

Last Update Submitted that Met QC Criteria:	April 10 18, 2008 2017
Last Update Posted:	April 15, 2008 [Estimate] May 19, 2017 [Actual]