Got parsing updated to new format

llm-extraction
youainti 3 years ago
parent ee3e37e834
commit 3eb9a4130a

@ -123,14 +123,17 @@ def extract_study_statuses(study_status_form, version_a,version_b):
StudyStatusData objects,
"""
#get rows
rows = study_status_form.table.tbody.find_all("tr")
#iterate through rows,
for trow in rows:
#matching on rowLabels
match tr_to_td(trow):
case ["Primary Completion:" as row_label, tag]:
old,new = split_by_version(tag)
case ["Primary Completion:" as row_label, old,new]:
print("row matched: {}".format(row_label)) if VERBOSE else ""
tagdate1 = extract_date_and_tag(old.text)
version_a._primary_completion_date = tagdate1.date
version_a._primary_completion_date_category = optional_strip(tagdate1.tag)
@ -139,8 +142,8 @@ def extract_study_statuses(study_status_form, version_a,version_b):
version_b._primary_completion_date = tagdate2.date
version_b._primary_completion_date_category = optional_strip(tagdate2.tag)
case ["Study Start:" as row_label, tag]:
old,new = split_by_version(tag)
case ["Study Start:" as row_label, old, new]:
print("row matched: {}".format(row_label)) if VERBOSE else ""
tagdate1 = extract_date_and_tag(old.text)
version_a._start_date = tagdate1.date
version_a._start_date_category = optional_strip(tagdate1.tag)
@ -149,21 +152,23 @@ def extract_study_statuses(study_status_form, version_a,version_b):
version_b._start_date = tagdate2.date
version_b._start_date_category = optional_strip(tagdate2.tag)
case ["Study Completion:" as row_label, tag]:
old,new = split_by_version(tag)
case ["Study Completion:" as row_label, old,new]:
print("row matched: {}".format(row_label)) if VERBOSE else ""
tagdate1 = extract_date_and_tag(old.text)
version_a._completion_date = tagdate1.date
version_a._completion_date_category = optional_strip(tagdate1.tag)
tagdate2 = extract_date_and_tag(new.text)
version_b._completion_date = tagdate2.date
version_b._completion_date_category = optional_strip(tagdate2.tag)
case ["Overall Status:" as row_label, tag]:
old,new = split_by_version(tag)
case ["Overall Status:" as row_label, old,new]:
print("row matched: {}".format(row_label)) if VERBOSE else ""
version_a._overall_status = optional_strip(old.text)
version_b._overall_status = optional_strip(new.text)
#FIX: There is an issue with NCT00789633 where the overall status includes information as to why it was suspended.
case _ as row_label:
print("row not matched: {}".format(row_label)) if VERBOSE else ""
def extract_study_design(study_status_form, version_a,version_b):
"""
@ -177,8 +182,8 @@ def extract_study_design(study_status_form, version_a,version_b):
for trow in rows:
#matching on rowLabels
match tr_to_td(trow):
case ["Enrollment:" as row_label, tag]:
old,new = split_by_version(tag)
case ["Enrollment:" as row_label, old, new]:
print("row matched: {}".format(row_label)) if VERBOSE else ""
#Extract tag and text, add them to preallocated object
tagtext1 = extract_text_and_tag(old.text)
@ -189,6 +194,8 @@ def extract_study_design(study_status_form, version_a,version_b):
version_b._enrollment = tagtext2.text
version_b._enrollment_category = optional_strip(tagtext2.tag)
case _ as row_label:
print("row not matched: {}".format(row_label)) if VERBOSE else ""
def extract_sponsor_data(study_status_form, version_a,version_b):
"""
@ -202,25 +209,30 @@ def extract_sponsor_data(study_status_form, version_a,version_b):
for trow in rows:
#matching on rowLabels
match tr_to_td(trow):
case ["Sponsor:" as row_label, tag]:
old, new = split_by_version(tag)
case ["Sponsor:" as row_label, old, new]:
print("row matched: {}".format(row_label)) if VERBOSE else ""
version_a._sponsor = optional_strip(old.text)
version_b._sponsor = optional_strip(new.text)
case ["Responsible Party:" as row_label, tag]:
old, new = split_by_version(tag)
case ["Responsible Party:" as row_label, old, new]:
print("row matched: {}".format(row_label)) if VERBOSE else ""
version_a._responsible_party = optional_strip(old.text)
version_b._responsible_party = optional_strip(new.text)
case ["Collaborators:" as row_label, tag]:
#old, new = split_by_version(tag)
case ["Collaborators:" as row_label, old, new]:
print("row matched: {}".format(row_label)) if VERBOSE else ""
#TODO: find a trial with multiple collaborators and figure out how to identify/count them.
# So far can't figure out where this is in AACT, so I'm going to ignore it.
pass
case _ as row_label:
print("row not matched: {}".format(row_label)) if VERBOSE else ""
def split_by_version(tag):
'''
OUTDATED: With the new format that separates old and new versions, I don't technically need this. It is a nice place to identify exact changes if those are ever needed, though, and it removes the highlights cleanly.
'''
#clone elements and remove sub-tags that are not needed.
old = copy(tag)
for span in old.find_all(class_="add_hilite"):
@ -269,7 +281,7 @@ def extract_text_and_tag(text):
#handle various empty cases
if not text or text == '':
return TagDatePair(None, None)
return TagTextPair(None, None)
date_split = text.split("[")
if len(date_split) > 1:
@ -282,7 +294,7 @@ def extract_text_and_tag(text):
### FUNCTIONS
def tr_to_td(tr) -> tuple[str, str]:
def tr_to_td(tr) -> tuple[str, str, str]:
"""
Takes an HTML data row of interest, extracts the record_name from the first <td>, and returns the second and third <td> cells as the data.
@ -291,10 +303,10 @@ def tr_to_td(tr) -> tuple[str, str]:
"""
#get list of cells
td_list = tr.find_all("td")
if len(td_list) == 2:
return td_list[0].text, td_list[1]
if len(td_list) == 3:
return td_list[0].text, td_list[1], td_list[2]
else:
return None, None
return None, None, None
def get_forms(soup,version_a,version_b):
@ -343,8 +355,8 @@ def get_forms(soup,version_a,version_b):
pass
case "form_MoreInformation":
pass
case _:
print(form.attrs["id"])
case _ as form_name:
print("form not matched: {}".format(form_name)) if VERBOSE else ""
### CONSTANTS
@ -365,6 +377,7 @@ def get_data_from_versions(nct_id,html, version_a_int, version_b_int):
if __name__ == "__main__":
VERBOSE = True
with psycopg2.connect(dbname="aact_db", user="root", password="root",host="will-office") as db_connection:
#pull the requests from the db
@ -376,13 +389,11 @@ if __name__ == "__main__":
"""
curse.execute(sql)
for response in curse.fetchall():
#
nct_id, version_a, version_b, html = response
print(nct_id)
print(nct_id, version_a, version_b) if VERBOSE else ""
version1, version2 = get_data_from_versions(nct_id, html, version_a, version_b)
print(version1.nct_id)
print(version2._enrollment)
if version_b == version_a + 1:
version1.load_to_db(db_connection)
@ -390,7 +401,6 @@ if __name__ == "__main__":
else:
version2.load_to_db(db_connection)
exit(1)
"""

Loading…
Cancel
Save