From 3eb9a4130a365ed0ce8ee3df7ebdd35c658ff27a Mon Sep 17 00:00:00 2001 From: youainti Date: Fri, 30 Dec 2022 00:26:36 -0800 Subject: [PATCH] Got parsing updated to new format --- Parser/extraction_lib.py | 72 +++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 31 deletions(-) diff --git a/Parser/extraction_lib.py b/Parser/extraction_lib.py index 1d3b26b..8a3628f 100644 --- a/Parser/extraction_lib.py +++ b/Parser/extraction_lib.py @@ -123,14 +123,17 @@ def extract_study_statuses(study_status_form, version_a,version_b): StudyStatusData objects, """ + #get rows rows = study_status_form.table.tbody.find_all("tr") #iterate through rows, for trow in rows: #matching on rowLabels + match tr_to_td(trow): - case ["Primary Completion:" as row_label, tag]: - old,new = split_by_version(tag) + case ["Primary Completion:" as row_label, old,new]: + print("row matched: {}".format(row_label)) if VERBOSE else "" + tagdate1 = extract_date_and_tag(old.text) version_a._primary_completion_date = tagdate1.date version_a._primary_completion_date_category = optional_strip(tagdate1.tag) @@ -139,8 +142,8 @@ def extract_study_statuses(study_status_form, version_a,version_b): version_b._primary_completion_date = tagdate2.date version_b._primary_completion_date_category = optional_strip(tagdate2.tag) - case ["Study Start:" as row_label, tag]: - old,new = split_by_version(tag) + case ["Study Start:" as row_label, old, new]: + print("row matched: {}".format(row_label)) if VERBOSE else "" tagdate1 = extract_date_and_tag(old.text) version_a._start_date = tagdate1.date version_a._start_date_category = optional_strip(tagdate1.tag) @@ -149,21 +152,23 @@ def extract_study_statuses(study_status_form, version_a,version_b): version_b._start_date = tagdate2.date version_b._start_date_category = optional_strip(tagdate2.tag) - case ["Study Completion:" as row_label, tag]: - old,new = split_by_version(tag) + case ["Study Completion:" as row_label, old,new]: + print("row matched: 
{}".format(row_label)) if VERBOSE else "" tagdate1 = extract_date_and_tag(old.text) version_a._completion_date = tagdate1.date version_a._completion_date_category = optional_strip(tagdate1.tag) - tagdate2 = extract_date_and_tag(new.text) version_b._completion_date = tagdate2.date version_b._completion_date_category = optional_strip(tagdate2.tag) - case ["Overall Status:" as row_label, tag]: - old,new = split_by_version(tag) + case ["Overall Status:" as row_label, old,new]: + print("row matched: {}".format(row_label)) if VERBOSE else "" version_a._overall_status = optional_strip(old.text) - version_b._overall_status = optional_strip(new.text) + version_b._overall_status = optional_strip(new.text) + #FIX: There is an issue with NCT00789633 where the overall status includes information as to why it was suspended. + case _ as row_label: + print("row not matched: {}".format(row_label)) if VERBOSE else "" def extract_study_design(study_status_form, version_a,version_b): """ @@ -177,8 +182,8 @@ def extract_study_design(study_status_form, version_a,version_b): for trow in rows: #matching on rowLabels match tr_to_td(trow): - case ["Enrollment:" as row_label, tag]: - old,new = split_by_version(tag) + case ["Enrollment:" as row_label, old, new]: + print("row matched: {}".format(row_label)) if VERBOSE else "" #Extract tag and text, add them to preallocated object tagtext1 = extract_text_and_tag(old.text) @@ -189,6 +194,8 @@ def extract_study_design(study_status_form, version_a,version_b): version_b._enrollment = tagtext2.text version_b._enrollment_category = optional_strip(tagtext2.tag) + case _ as row_label: + print("row not matched: {}".format(row_label)) if VERBOSE else "" def extract_sponsor_data(study_status_form, version_a,version_b): """ @@ -202,25 +209,30 @@ def extract_sponsor_data(study_status_form, version_a,version_b): for trow in rows: #matching on rowLabels match tr_to_td(trow): - case ["Sponsor:" as row_label, tag]: - old, new = split_by_version(tag) + case 
["Sponsor:" as row_label, old, new]: + print("row matched: {}".format(row_label)) if VERBOSE else "" version_a._sponsor = optional_strip(old.text) version_b._sponsor = optional_strip(new.text) - case ["Responsible Party:" as row_label, tag]: - old, new = split_by_version(tag) + case ["Responsible Party:" as row_label, old, new]: + print("row matched: {}".format(row_label)) if VERBOSE else "" version_a._responsible_party = optional_strip(old.text) version_b._responsible_party = optional_strip(new.text) - case ["Collaborators:" as row_label, tag]: - #old, new = split_by_version(tag) + case ["Collaborators:" as row_label, old, new]: + print("row matched: {}".format(row_label)) if VERBOSE else "" #TODO: find a trial with multiple collaborators and figure out how to identify/count them:w # So far can't figure out where this is in AACT, so I'm going to ignore it. pass + case _ as row_label: + print("row not matched: {}".format(row_label)) if VERBOSE else "" def split_by_version(tag): + ''' + OUTDATED: With the new format that separates old and new versions, I don't technically need this. It is a nice place to identify exact changes if those are ever needed though and it removes the highlights cleanly. + ''' #clone elements and remove sub-tags that are not needed. old = copy(tag) for span in old.find_all(class_="add_hilite"): @@ -269,7 +281,7 @@ def extract_text_and_tag(text): #handle various empty cases if not text or text == '': - return TagDatePair(None, None) + return TagTextPair(None, None) date_split = text.split("[") if len(date_split) > 1: @@ -282,19 +294,19 @@ def extract_text_and_tag(text): ### FUNCTIONS -def tr_to_td(tr) -> tuple[str, str]: +def tr_to_td(tr) -> tuple[str, str, str]: """ Takes an html data row of interest, extracts the record_name from the first , and the data from the second . For the data, it just extracts the text. The text itself then needs processed separately, based on what it should contain. 
""" - #get list of cells + #get list of cells td_list = tr.find_all("td") - if len(td_list) == 2: - return td_list[0].text, td_list[1] + if len(td_list) == 3: + return td_list[0].text, td_list[1], td_list[2] else: - return None, None + return None, None, None def get_forms(soup,version_a,version_b): @@ -343,8 +355,8 @@ def get_forms(soup,version_a,version_b): pass case "form_MoreInformation": pass - case _: - print(form.attrs["id"]) + case _ as form_name: + print("form not matched: {}".format(form_name)) if VERBOSE else "" ### CONSTANTS @@ -365,6 +377,7 @@ def get_data_from_versions(nct_id,html, version_a_int, version_b_int): if __name__ == "__main__": + VERBOSE = True with psycopg2.connect(dbname="aact_db", user="root", password="root",host="will-office") as db_connection: #pull the requests from the db @@ -376,13 +389,11 @@ if __name__ == "__main__": """ curse.execute(sql) for response in curse.fetchall(): - # nct_id, version_a, version_b, html = response - print(nct_id) + print(nct_id, version_a, version_b) if VERBOSE else "" + version1, version2 = get_data_from_versions(nct_id, html, version_a, version_b) - print(version1.nct_id) - print(version2._enrollment) if version_b == version_a + 1: version1.load_to_db(db_connection) @@ -390,7 +401,6 @@ if __name__ == "__main__": else: version2.load_to_db(db_connection) - exit(1) """