From 3eb9a4130a365ed0ce8ee3df7ebdd35c658ff27a Mon Sep 17 00:00:00 2001
From: youainti <youainti@protonmail.com>
Date: Fri, 30 Dec 2022 00:26:36 -0800
Subject: [PATCH] Got parsing updated to new format

---
 Parser/extraction_lib.py | 72 +++++++++++++++++++++++-----------------
 1 file changed, 41 insertions(+), 31 deletions(-)

diff --git a/Parser/extraction_lib.py b/Parser/extraction_lib.py
index 1d3b26b..8a3628f 100644
--- a/Parser/extraction_lib.py
+++ b/Parser/extraction_lib.py
@@ -123,14 +123,17 @@ def extract_study_statuses(study_status_form, version_a,version_b):
     StudyStatusData objects, 
 
     """
+
     #get rows
     rows = study_status_form.table.tbody.find_all("tr")
     #iterate through rows, 
     for trow in rows:
         #matching on rowLabels
+
         match tr_to_td(trow):
-            case ["Primary Completion:" as row_label, tag]: 
-                old,new = split_by_version(tag)
+            case ["Primary Completion:" as row_label, old,new]: 
+                print("row matched: {}".format(row_label)) if VERBOSE else ""
+
                 tagdate1 = extract_date_and_tag(old.text)
                 version_a._primary_completion_date = tagdate1.date
                 version_a._primary_completion_date_category = optional_strip(tagdate1.tag)
@@ -139,8 +142,8 @@ def extract_study_statuses(study_status_form, version_a,version_b):
                 version_b._primary_completion_date = tagdate2.date
                 version_b._primary_completion_date_category = optional_strip(tagdate2.tag)
 
-            case ["Study Start:" as row_label, tag]: 
-                old,new = split_by_version(tag)
+            case ["Study Start:" as row_label, old, new]: 
+                print("row matched: {}".format(row_label)) if VERBOSE else ""
                 tagdate1 = extract_date_and_tag(old.text)
                 version_a._start_date = tagdate1.date
                 version_a._start_date_category = optional_strip(tagdate1.tag)
@@ -149,21 +152,23 @@ def extract_study_statuses(study_status_form, version_a,version_b):
                 version_b._start_date = tagdate2.date
                 version_b._start_date_category = optional_strip(tagdate2.tag)
 
-            case ["Study Completion:" as row_label, tag]:
-                old,new = split_by_version(tag)
+            case ["Study Completion:" as row_label, old,new]:
+                print("row matched: {}".format(row_label)) if VERBOSE else ""
                 tagdate1 = extract_date_and_tag(old.text)
                 version_a._completion_date = tagdate1.date
                 version_a._completion_date_category = optional_strip(tagdate1.tag)              
-
                 tagdate2 = extract_date_and_tag(new.text)
                 version_b._completion_date = tagdate2.date
                 version_b._completion_date_category = optional_strip(tagdate2.tag)
 
-            case ["Overall Status:" as row_label, tag]:
-                old,new = split_by_version(tag)
+            case ["Overall Status:" as row_label, old,new]:
+                print("row matched: {}".format(row_label)) if VERBOSE else ""
                 version_a._overall_status = optional_strip(old.text)
-                version_b._overall_status = optional_strip(new.text)         
+                version_b._overall_status = optional_strip(new.text)
+                #FIX: There is an issue with NCT00789633 where the overall status includes information as to why it was suspended.
 
+            case _ as row_label:
+                print("row not matched: {}".format(row_label)) if VERBOSE else ""
 
 def extract_study_design(study_status_form, version_a,version_b):
     """
@@ -177,8 +182,8 @@ def extract_study_design(study_status_form, version_a,version_b):
     for trow in rows:
         #matching on rowLabels
         match tr_to_td(trow):
-            case ["Enrollment:" as row_label, tag]: 
-                old,new = split_by_version(tag)
+            case ["Enrollment:" as row_label, old, new]: 
+                print("row matched: {}".format(row_label)) if VERBOSE else ""
 
                 #Extract tag and text, add them to preallocated object
                 tagtext1 = extract_text_and_tag(old.text)
@@ -189,6 +194,8 @@ def extract_study_design(study_status_form, version_a,version_b):
                 version_b._enrollment = tagtext2.text
                 version_b._enrollment_category = optional_strip(tagtext2.tag)
 
+            case _ as row_label:
+                print("row not matched: {}".format(row_label)) if VERBOSE else ""
              
 def extract_sponsor_data(study_status_form, version_a,version_b):
     """
@@ -202,25 +209,30 @@ def extract_sponsor_data(study_status_form, version_a,version_b):
     for trow in rows:
         #matching on rowLabels
         match tr_to_td(trow):
-            case ["Sponsor:" as row_label, tag]: 
-                old, new = split_by_version(tag)
+            case ["Sponsor:" as row_label, old, new]:
+                print("row matched: {}".format(row_label)) if VERBOSE else ""
                 version_a._sponsor = optional_strip(old.text)
                 version_b._sponsor = optional_strip(new.text)
 
-            case ["Responsible Party:" as row_label, tag]: 
-                old, new = split_by_version(tag)
+            case ["Responsible Party:" as row_label, old, new]: 
+                print("row matched: {}".format(row_label)) if VERBOSE else ""
                 version_a._responsible_party = optional_strip(old.text)
                 version_b._responsible_party = optional_strip(new.text)
 
-            case ["Collaborators:" as row_label, tag]: 
-                #old, new = split_by_version(tag)
+            case ["Collaborators:" as row_label, old, new]: 
+                print("row matched: {}".format(row_label)) if VERBOSE else ""
                 #TODO: find a trial with multiple collaborators and figure out how to identify/count them:w
                 # So far can't figure out where this is in AACT, so I'm going to ignore it.
                 pass
 
+            case _ as row_label:
+                print("row not matched: {}".format(row_label)) if VERBOSE else ""
 
 
 def split_by_version(tag):
+    '''
+    OUTDATED: With the new format that separates old and new versions, this is technically no longer needed. It is still a useful place to identify exact changes if those are ever needed, and it removes the highlights cleanly.
+    '''
     #clone elements and remove sub-tags that are not needed.
     old = copy(tag)
     for span in old.find_all(class_="add_hilite"):
@@ -269,7 +281,7 @@ def extract_text_and_tag(text):
 
     #handle various empty cases
     if not text or text == '':
-        return TagDatePair(None, None)
+        return TagTextPair(None, None)
         
     date_split = text.split("[")
     if len(date_split) > 1:
@@ -282,19 +294,19 @@ def extract_text_and_tag(text):
 
 ### FUNCTIONS
 
-def tr_to_td(tr) -> tuple[str, str]:
+def tr_to_td(tr) -> tuple[str, str, str]:
     """
     Takes an html data row of interest, extracts the record_name from the first <td>, and the data from the second <td>.
 
     For the data, it just extracts the text. 
     The text itself then needs processed separately, based on what it should contain.
     """
-    #get list of cells
+    #get list of cells 
     td_list = tr.find_all("td")
-    if len(td_list) == 2:
-        return td_list[0].text, td_list[1]
+    if len(td_list) == 3:
+        return td_list[0].text, td_list[1], td_list[2]
     else:
-        return None, None
+        return None, None, None
 
 def get_forms(soup,version_a,version_b):
 
@@ -343,8 +355,8 @@ def get_forms(soup,version_a,version_b):
                 pass
             case "form_MoreInformation":
                 pass
-            case _:
-                print(form.attrs["id"])
+            case _ as form_name:
+                print("form not matched: {}".format(form_name)) if VERBOSE else ""
 
 
 ### CONSTANTS
@@ -365,6 +377,7 @@ def get_data_from_versions(nct_id,html, version_a_int, version_b_int):
 
 
 if __name__ == "__main__":
+    VERBOSE = True 
 
     with psycopg2.connect(dbname="aact_db", user="root", password="root",host="will-office") as db_connection:
         #pull the requests from the db
@@ -376,13 +389,11 @@ if __name__ == "__main__":
             """
             curse.execute(sql)
             for response in curse.fetchall():
-                #
                 nct_id, version_a, version_b, html = response
 
-                print(nct_id)
+                print(nct_id, version_a, version_b) if VERBOSE else ""
+
                 version1, version2 = get_data_from_versions(nct_id, html, version_a, version_b)
-                print(version1.nct_id)
-                print(version2._enrollment)
             
                 if version_b == version_a + 1:
                     version1.load_to_db(db_connection)
@@ -390,7 +401,6 @@ if __name__ == "__main__":
                 else:
                     version2.load_to_db(db_connection)
 
-                exit(1)
 
 
 """