Got parsing updated to new format

llm-extraction
youainti 3 years ago
parent ee3e37e834
commit 3eb9a4130a

@ -123,14 +123,17 @@ def extract_study_statuses(study_status_form, version_a,version_b):
StudyStatusData objects, StudyStatusData objects,
""" """
#get rows #get rows
rows = study_status_form.table.tbody.find_all("tr") rows = study_status_form.table.tbody.find_all("tr")
#iterate through rows, #iterate through rows,
for trow in rows: for trow in rows:
#matching on rowLabels #matching on rowLabels
match tr_to_td(trow): match tr_to_td(trow):
case ["Primary Completion:" as row_label, tag]: case ["Primary Completion:" as row_label, old,new]:
old,new = split_by_version(tag) print("row matched: {}".format(row_label)) if VERBOSE else ""
tagdate1 = extract_date_and_tag(old.text) tagdate1 = extract_date_and_tag(old.text)
version_a._primary_completion_date = tagdate1.date version_a._primary_completion_date = tagdate1.date
version_a._primary_completion_date_category = optional_strip(tagdate1.tag) version_a._primary_completion_date_category = optional_strip(tagdate1.tag)
@ -139,8 +142,8 @@ def extract_study_statuses(study_status_form, version_a,version_b):
version_b._primary_completion_date = tagdate2.date version_b._primary_completion_date = tagdate2.date
version_b._primary_completion_date_category = optional_strip(tagdate2.tag) version_b._primary_completion_date_category = optional_strip(tagdate2.tag)
case ["Study Start:" as row_label, tag]: case ["Study Start:" as row_label, old, new]:
old,new = split_by_version(tag) print("row matched: {}".format(row_label)) if VERBOSE else ""
tagdate1 = extract_date_and_tag(old.text) tagdate1 = extract_date_and_tag(old.text)
version_a._start_date = tagdate1.date version_a._start_date = tagdate1.date
version_a._start_date_category = optional_strip(tagdate1.tag) version_a._start_date_category = optional_strip(tagdate1.tag)
@ -149,21 +152,23 @@ def extract_study_statuses(study_status_form, version_a,version_b):
version_b._start_date = tagdate2.date version_b._start_date = tagdate2.date
version_b._start_date_category = optional_strip(tagdate2.tag) version_b._start_date_category = optional_strip(tagdate2.tag)
case ["Study Completion:" as row_label, tag]: case ["Study Completion:" as row_label, old,new]:
old,new = split_by_version(tag) print("row matched: {}".format(row_label)) if VERBOSE else ""
tagdate1 = extract_date_and_tag(old.text) tagdate1 = extract_date_and_tag(old.text)
version_a._completion_date = tagdate1.date version_a._completion_date = tagdate1.date
version_a._completion_date_category = optional_strip(tagdate1.tag) version_a._completion_date_category = optional_strip(tagdate1.tag)
tagdate2 = extract_date_and_tag(new.text) tagdate2 = extract_date_and_tag(new.text)
version_b._completion_date = tagdate2.date version_b._completion_date = tagdate2.date
version_b._completion_date_category = optional_strip(tagdate2.tag) version_b._completion_date_category = optional_strip(tagdate2.tag)
case ["Overall Status:" as row_label, tag]: case ["Overall Status:" as row_label, old,new]:
old,new = split_by_version(tag) print("row matched: {}".format(row_label)) if VERBOSE else ""
version_a._overall_status = optional_strip(old.text) version_a._overall_status = optional_strip(old.text)
version_b._overall_status = optional_strip(new.text) version_b._overall_status = optional_strip(new.text)
#FIX: There is an issue with NCT00789633 where the overall status includes information as to why it was suspended.
case _ as row_label:
print("row not matched: {}".format(row_label)) if VERBOSE else ""
def extract_study_design(study_status_form, version_a,version_b): def extract_study_design(study_status_form, version_a,version_b):
""" """
@ -177,8 +182,8 @@ def extract_study_design(study_status_form, version_a,version_b):
for trow in rows: for trow in rows:
#matching on rowLabels #matching on rowLabels
match tr_to_td(trow): match tr_to_td(trow):
case ["Enrollment:" as row_label, tag]: case ["Enrollment:" as row_label, old, new]:
old,new = split_by_version(tag) print("row matched: {}".format(row_label)) if VERBOSE else ""
#Extract tag and text, add them to preallocated object #Extract tag and text, add them to preallocated object
tagtext1 = extract_text_and_tag(old.text) tagtext1 = extract_text_and_tag(old.text)
@ -189,6 +194,8 @@ def extract_study_design(study_status_form, version_a,version_b):
version_b._enrollment = tagtext2.text version_b._enrollment = tagtext2.text
version_b._enrollment_category = optional_strip(tagtext2.tag) version_b._enrollment_category = optional_strip(tagtext2.tag)
case _ as row_label:
print("row not matched: {}".format(row_label)) if VERBOSE else ""
def extract_sponsor_data(study_status_form, version_a,version_b): def extract_sponsor_data(study_status_form, version_a,version_b):
""" """
@ -202,25 +209,30 @@ def extract_sponsor_data(study_status_form, version_a,version_b):
for trow in rows: for trow in rows:
#matching on rowLabels #matching on rowLabels
match tr_to_td(trow): match tr_to_td(trow):
case ["Sponsor:" as row_label, tag]: case ["Sponsor:" as row_label, old, new]:
old, new = split_by_version(tag) print("row matched: {}".format(row_label)) if VERBOSE else ""
version_a._sponsor = optional_strip(old.text) version_a._sponsor = optional_strip(old.text)
version_b._sponsor = optional_strip(new.text) version_b._sponsor = optional_strip(new.text)
case ["Responsible Party:" as row_label, tag]: case ["Responsible Party:" as row_label, old, new]:
old, new = split_by_version(tag) print("row matched: {}".format(row_label)) if VERBOSE else ""
version_a._responsible_party = optional_strip(old.text) version_a._responsible_party = optional_strip(old.text)
version_b._responsible_party = optional_strip(new.text) version_b._responsible_party = optional_strip(new.text)
case ["Collaborators:" as row_label, tag]: case ["Collaborators:" as row_label, old, new]:
#old, new = split_by_version(tag) print("row matched: {}".format(row_label)) if VERBOSE else ""
#TODO: find a trial with multiple collaborators and figure out how to identify/count them. #TODO: find a trial with multiple collaborators and figure out how to identify/count them.
# So far can't figure out where this is in AACT, so I'm going to ignore it. # So far can't figure out where this is in AACT, so I'm going to ignore it.
pass pass
case _ as row_label:
print("row not matched: {}".format(row_label)) if VERBOSE else ""
def split_by_version(tag): def split_by_version(tag):
'''
OUTDATED: With the new format that separates old and new versions, I don't technically need this. It is still a nice place to identify exact changes if those are ever needed, though, and it removes the highlights cleanly.
'''
#clone elements and remove sub-tags that are not needed. #clone elements and remove sub-tags that are not needed.
old = copy(tag) old = copy(tag)
for span in old.find_all(class_="add_hilite"): for span in old.find_all(class_="add_hilite"):
@ -269,7 +281,7 @@ def extract_text_and_tag(text):
#handle various empty cases #handle various empty cases
if not text or text == '': if not text or text == '':
return TagDatePair(None, None) return TagTextPair(None, None)
date_split = text.split("[") date_split = text.split("[")
if len(date_split) > 1: if len(date_split) > 1:
@ -282,19 +294,19 @@ def extract_text_and_tag(text):
### FUNCTIONS ### FUNCTIONS
def tr_to_td(tr) -> tuple[str, str, str]:
    """
    Takes an html data row of interest and splits it into its record name and its two data cells.

    The row must contain exactly three <td> cells: the record name, the cell
    for the old version, and the cell for the new version (callers destructure
    the result as `[row_label, old, new]`).
    Only the record name is reduced to its text; the two data cells are
    returned untouched, so each caller still has to pull out `.text` and
    process it based on what the row should contain.

    Returns (record_name, old_cell, new_cell), or (None, None, None) when the
    row does not contain exactly three <td> cells.
    """
    # NOTE(review): the annotation says `str` for every item, but the second
    # and third items are the <td> elements themselves, and all three are None
    # for rows of any other shape -- confirm before tightening the annotation.
    #get list of cells
    td_list = tr.find_all("td")

    # Rows that do not have the label/old/new layout yield an all-None tuple,
    # which falls through to the callers' catch-all `case _`.
    if len(td_list) != 3:
        return None, None, None

    label_cell, old_cell, new_cell = td_list
    return label_cell.text, old_cell, new_cell
def get_forms(soup,version_a,version_b): def get_forms(soup,version_a,version_b):
@ -343,8 +355,8 @@ def get_forms(soup,version_a,version_b):
pass pass
case "form_MoreInformation": case "form_MoreInformation":
pass pass
case _: case _ as form_name:
print(form.attrs["id"]) print("form not matched: {}".format(form_name)) if VERBOSE else ""
### CONSTANTS ### CONSTANTS
@ -365,6 +377,7 @@ def get_data_from_versions(nct_id,html, version_a_int, version_b_int):
if __name__ == "__main__": if __name__ == "__main__":
VERBOSE = True
with psycopg2.connect(dbname="aact_db", user="root", password="root",host="will-office") as db_connection: with psycopg2.connect(dbname="aact_db", user="root", password="root",host="will-office") as db_connection:
#pull the requests from the db #pull the requests from the db
@ -376,13 +389,11 @@ if __name__ == "__main__":
""" """
curse.execute(sql) curse.execute(sql)
for response in curse.fetchall(): for response in curse.fetchall():
#
nct_id, version_a, version_b, html = response nct_id, version_a, version_b, html = response
print(nct_id) print(nct_id, version_a, version_b) if VERBOSE else ""
version1, version2 = get_data_from_versions(nct_id, html, version_a, version_b) version1, version2 = get_data_from_versions(nct_id, html, version_a, version_b)
print(version1.nct_id)
print(version2._enrollment)
if version_b == version_a + 1: if version_b == version_a + 1:
version1.load_to_db(db_connection) version1.load_to_db(db_connection)
@ -390,7 +401,6 @@ if __name__ == "__main__":
else: else:
version2.load_to_db(db_connection) version2.load_to_db(db_connection)
exit(1)
""" """

Loading…
Cancel
Save