Got parsing updated to new format

3 years ago · 3eb9a4130a
parent ee3e37e834
commit 3eb9a4130a
1 changed files with 41 additions and 31 deletions
--- a/Parser/extraction_lib.py
+++ b/Parser/extraction_lib.py
@ -123,14 +123,17 @@ def extract_study_statuses(study_status_form, version_a,version_b):
    StudyStatusData objects, 
    """
    #get rows
    rows = study_status_form.table.tbody.find_all("tr")
    #iterate through rows, 
    for trow in rows:
        #matching on rowLabels
        match tr_to_td(trow):
-            case ["Primary Completion:" as row_label, tag]: 
+            case ["Primary Completion:" as row_label, old,new]: 
-                old,new = split_by_version(tag)
+                print("row matched: {}".format(row_label)) if VERBOSE else ""
                tagdate1 = extract_date_and_tag(old.text)
                version_a._primary_completion_date = tagdate1.date
                version_a._primary_completion_date_category = optional_strip(tagdate1.tag)
@ -139,8 +142,8 @@ def extract_study_statuses(study_status_form, version_a,version_b):
                version_b._primary_completion_date = tagdate2.date
                version_b._primary_completion_date_category = optional_strip(tagdate2.tag)
-            case ["Study Start:" as row_label, tag]: 
+            case ["Study Start:" as row_label, old, new]: 
-                old,new = split_by_version(tag)
+                print("row matched: {}".format(row_label)) if VERBOSE else ""
                tagdate1 = extract_date_and_tag(old.text)
                version_a._start_date = tagdate1.date
                version_a._start_date_category = optional_strip(tagdate1.tag)
@ -149,21 +152,23 @@ def extract_study_statuses(study_status_form, version_a,version_b):
                version_b._start_date = tagdate2.date
                version_b._start_date_category = optional_strip(tagdate2.tag)
-            case ["Study Completion:" as row_label, tag]:
+            case ["Study Completion:" as row_label, old,new]:
-                old,new = split_by_version(tag)
+                print("row matched: {}".format(row_label)) if VERBOSE else ""
                tagdate1 = extract_date_and_tag(old.text)
                version_a._completion_date = tagdate1.date
                version_a._completion_date_category = optional_strip(tagdate1.tag)              
                tagdate2 = extract_date_and_tag(new.text)
                version_b._completion_date = tagdate2.date
                version_b._completion_date_category = optional_strip(tagdate2.tag)
-            case ["Overall Status:" as row_label, tag]:
+            case ["Overall Status:" as row_label, old,new]:
-                old,new = split_by_version(tag)
+                print("row matched: {}".format(row_label)) if VERBOSE else ""
                version_a._overall_status = optional_strip(old.text)
-                version_b._overall_status = optional_strip(new.text)         
+                version_b._overall_status = optional_strip(new.text)
                #FIX: There is an issue with NCT00789633 where the overall status includes information as to why it was suspended.
            case _ as row_label:
                print("row not matched: {}".format(row_label)) if VERBOSE else ""
 def extract_study_design(study_status_form, version_a,version_b):
    """
@ -177,8 +182,8 @@ def extract_study_design(study_status_form, version_a,version_b):
    for trow in rows:
        #matching on rowLabels
        match tr_to_td(trow):
-            case ["Enrollment:" as row_label, tag]: 
+            case ["Enrollment:" as row_label, old, new]: 
-                old,new = split_by_version(tag)
+                print("row matched: {}".format(row_label)) if VERBOSE else ""
                #Extract tag and text, add them to preallocated object
                tagtext1 = extract_text_and_tag(old.text)
@ -189,6 +194,8 @@ def extract_study_design(study_status_form, version_a,version_b):
                version_b._enrollment = tagtext2.text
                version_b._enrollment_category = optional_strip(tagtext2.tag)
            case _ as row_label:
                print("row not matched: {}".format(row_label)) if VERBOSE else ""
 def extract_sponsor_data(study_status_form, version_a,version_b):
    """
@ -202,25 +209,30 @@ def extract_sponsor_data(study_status_form, version_a,version_b):
    for trow in rows:
        #matching on rowLabels
        match tr_to_td(trow):
-            case ["Sponsor:" as row_label, tag]: 
+            case ["Sponsor:" as row_label, old, new]:
-                old, new = split_by_version(tag)
+                print("row matched: {}".format(row_label)) if VERBOSE else ""
                version_a._sponsor = optional_strip(old.text)
                version_b._sponsor = optional_strip(new.text)
-            case ["Responsible Party:" as row_label, tag]: 
+            case ["Responsible Party:" as row_label, old, new]: 
-                old, new = split_by_version(tag)
+                print("row matched: {}".format(row_label)) if VERBOSE else ""
                version_a._responsible_party = optional_strip(old.text)
                version_b._responsible_party = optional_strip(new.text)
-            case ["Collaborators:" as row_label, tag]: 
+            case ["Collaborators:" as row_label, old, new]: 
-                #old, new = split_by_version(tag)
+                print("row matched: {}".format(row_label)) if VERBOSE else ""
                #TODO: find a trial with multiple collaborators and figure out how to identify/count them.
                # So far can't figure out where this is in AACT, so I'm going to ignore it.
                pass
            case _ as row_label:
                print("row not matched: {}".format(row_label)) if VERBOSE else ""
 def split_by_version(tag):
    '''
    OUTDATED: With the new format that separates old and new versions, I don't technically need this. It is a nice place to identify exact changes if those are ever needed, though, and it removes the highlights cleanly.
    '''
    #clone elements and remove sub-tags that are not needed.
    old = copy(tag)
    for span in old.find_all(class_="add_hilite"):
@ -269,7 +281,7 @@ def extract_text_and_tag(text):
    #handle various empty cases
    if not text or text == '':
-        return TagDatePair(None, None)
+        return TagTextPair(None, None)
    date_split = text.split("[")
    if len(date_split) > 1:
@ -282,19 +294,19 @@ def extract_text_and_tag(text):
 ### FUNCTIONS
-def tr_to_td(tr) -> tuple[str, str]:
+def tr_to_td(tr) -> tuple[str, str, str]:
    """
    Takes an html data row of interest, extracts the record_name from the first <td>, and the old- and new-version data cells from the second and third <td>.
    The data cells are returned as tags rather than as text.
    Their text then needs to be processed separately, based on what it should contain.
    """
-    #get list of cells
+    #get list of cells 
    td_list = tr.find_all("td")
-    if len(td_list) == 2:
+    if len(td_list) == 3:
-        return td_list[0].text, td_list[1]
+        return td_list[0].text, td_list[1], td_list[2]
    else:
-        return None, None
+        return None, None, None
 def get_forms(soup,version_a,version_b):
@ -343,8 +355,8 @@ def get_forms(soup,version_a,version_b):
                pass
            case "form_MoreInformation":
                pass
-            case _:
+            case _ as form_name:
-                print(form.attrs["id"])
+                print("form not matched: {}".format(form_name)) if VERBOSE else ""
 ### CONSTANTS
@ -365,6 +377,7 @@ def get_data_from_versions(nct_id,html, version_a_int, version_b_int):
 if __name__ == "__main__":
    VERBOSE = True 
    with psycopg2.connect(dbname="aact_db", user="root", password="root",host="will-office") as db_connection:
        #pull the requests from the db
@ -376,13 +389,11 @@ if __name__ == "__main__":
            """
            curse.execute(sql)
            for response in curse.fetchall():
                #
                nct_id, version_a, version_b, html = response
-                print(nct_id)
+                print(nct_id, version_a, version_b) if VERBOSE else ""
                version1, version2 = get_data_from_versions(nct_id, html, version_a, version_b)
                print(version1.nct_id)
                print(version2._enrollment)
                if version_b == version_a + 1:
                    version1.load_to_db(db_connection)
@ -390,7 +401,6 @@ if __name__ == "__main__":
                else:
                    version2.load_to_db(db_connection)
                exit(1)
 """