diff --git a/Parser/extraction-lib.py b/Parser/extraction-lib.py index 2d867c7..0e07ec6 100644 --- a/Parser/extraction-lib.py +++ b/Parser/extraction-lib.py @@ -1,79 +1,227 @@ -from tokenize import String +from collections import namedtuple +from copy import copy +from datetime import datetime +from ensurepip import version from bs4 import BeautifulSoup import abc import textprocessing as tp #cuz tp is important #requires Python 3.10 -def extract_data_from_tr(tr) -> tuple[String, String]: - """ - Takes an html data row of interest, extracts the record_name from the first , and the data from the second . - For the data, it will split between old and new data, making copies of each and returnign them. +###CLASSES AND CONSTRUCTORS - Uses functionality from ./textprocessing.py (separated because it is important to test that functionality) - to get extract data from tags. - - """ - #get list of cells - #for cell in cells - #if class_=="rowLabel", extract text - #else parse out new and old text - #return triple: row_lable, old, new - pass +TagDatePair = namedtuple("TagDatePair", ["tag","date"]) +TagTextPair = namedtuple("TagTextPair", ["tag","text"]) #superclasses -class VersionData{abc.ABC}: +class VersionData(): """ - This abstract class holds two types of data: + This class holds two types of data: - Data with a 1-to-1 relationship with the trial/version pair. - Data with a child relationship with the trial/version pair. - Each subclass will return the 1-to-1 data for another system to add to the DB. - This is so that a single record can be created in one go. - Each subclass will load the child data to the database directly. + This initializes with None attributes, and implements setter + methods to load them (just to double check types) + That way I can just pass around the VersionData instance + and add data as I go. + + It will also implement the ability to load the data to the database """ - @abc.abstractmethod - def version_fields(self): - """ - This function returns data that should be included in a standard table - related to version_x of the record. - - It also returns the columns? - """ - pass - @abc.abstractmethod - def version_records(self, foreign_key, db_cursor): - """ - This function loads data that needs to be held in auxilary tables - into the database. - For example, the list of sponsors will need to be tracked separatly from - trial status. - """ - pass - - -class StudyStatusData(VersionData): - columns = ["primary_completion_date", "completion_date", "last_update_posted_date"] - - def __init__(self ,primary_completion_date, completion_date, last_update_posted_date) -> None: - pass + def __init__(self,nct_id,version_id): + #identifiers + self.nct_id = nct_id + self.version_id = version_id + + #Study Status + self._primary_completion_date = None + self._primary_completion_date_category = None + self._completion_date = None + self._completion_date_category = None + self._overall_status = None + + #Study Design + self._enrollment = None + self._enrollment_category = None + + #Sponsors and Collaborators + self._sponsor = None + #self._sponsor_category = None #I don't believe this is included in the raw data + self._responsible_party = None + #self._responsible_party_category = None #I don't believe this is included in the raw data + #self._collaborators = None #currently going to ignore as I've not fount it in AACT + def extract_study_statuses(study_status_form, version_a,version_b): """ - This extracts data from a study_status form and returns one or two StudyStatusData objects + This extracts data from a study_status form and returns one or two + StudyStatusData objects, + + """ + #get rows + rows = study_status_form.table.tbody.find_all("tr") + #iterate through rows, + for trow in rows: + #matching on rowLabels + #print(trow.__str__()[:80]) + match tr_to_td(trow): + case ["Primary Completion:" as row_label, tag]: + old,new = split_by_version(tag) + tagdate1 = extract_date_and_tag(old.text,date_MMMM_YYYY) + version_a._primary_completion_date = tagdate1.date + version_a._primary_completion_date_category = tagdate1.tag + + tagdate2 = extract_date_and_tag(new.text,date_MMMM_YYYY) + version_b._primary_completion_date = tagdate2.date + version_b._primary_completion_date_category = tagdate2.tag + + case ["Study Completion:" as row_label, tag]: + old,new = split_by_version(tag) + tagdate1 = extract_date_and_tag(old.text,date_MMMM_YYYY) + version_a._completion_date = tagdate1.date + version_a._completion_date_category = tagdate1.tag + + tagdate2 = extract_date_and_tag(new.text,date_MMMM_YYYY) + version_b._completion_date = tagdate2.date + version_b._completion_date_category = tagdate2.tag + + case ["Overall Status:" as row_label, tag]: + old,new = split_by_version(tag) + version_a._overall_status = old.text + version_b._overall_status = new.text + + +def extract_study_design(study_status_form, version_a,version_b): + """ + This extracts data from a study_status form and returns one or two + StudyStatusData objects, + + """ + #get rows + rows = study_status_form.table.tbody.find_all("tr") + #iterate through rows, + for trow in rows: + #matching on rowLabels + #print(trow.__str__()[:80]) + match tr_to_td(trow): + case ["Enrollment:" as row_label, tag]: + old,new = split_by_version(tag) + tagdate1 = extract_text_and_tag(old.text) + version_a._enrollment = tagdate1.text + version_a._enrollment_category = tagdate1.tag + + tagdate2 = extract_text_and_tag(new.text) + version_b._enrollment = tagdate2.text + version_b._enrollment_category = tagdate2.tag + + +def extract_sponsor_data(study_status_form, version_a,version_b): + """ + This extracts data from a study_status form and returns one or two + StudyStatusData objects, + + """ + #get rows + rows = study_status_form.table.tbody.find_all("tr") + #iterate through rows, + for trow in rows: + #matching on rowLabels + #print(trow.__str__()[:80]) + match tr_to_td(trow): + case ["Sponsor:" as row_label, tag]: + old, new = split_by_version(tag) + version_a._sponsor = old.text + version_b._sponsor = new.text + + case ["Responsible Party:" as row_label, tag]: + old, new = split_by_version(tag) + version_a._responsible_party = old.text + version_b._responsible_party = new.text + + case ["Collaborators:" as row_label, tag]: + #old, new = split_by_version(tag) + #TODO: find a trial with multiple collaborators and figure out how to identify/count them:w + # So far can't figure out where this is in AACT, so I'm going to ignore it. + pass + + + +def split_by_version(tag): + #clone elements and remove sub-tags that are not needed. + old = copy(tag) + for span in old.find_all(class_="add_hilite"): + span.extract() + + new = copy(tag) + for span in new.find_all(class_="drop_hilite"): + span.extract() + return old,new + + +def extract_date_and_tag(text, date_format): + """ + Extracts a datetype according to the date format + and the estimate tag based on + + """ + #FIX: Currently, there are multiple (mixed) data formats in use + #Theses can exist in the same data field, in two different versions + #so instead of using a single (passed) data format, I need to + #select between various data formats. + + text = text.strip() + + #handle various empty cases + if not text or text == '': + return TagDatePair(None, None) + + date_split = text.split("[") + if len(date_split) > 1: + estimate_tag = date_split[1].split("]")[0].strip() + else: + estimate_tag = None + date_object = datetime.strptime(date_split[0].strip(), date_format) + + return TagDatePair(estimate_tag, date_object) + + +def extract_text_and_tag(text): + """ + Extracts a datetype according to the date format + and the estimate tag based on + """ - pass + text = text.strip() -class SponsorCollaboratorsData(VersionData): - columns=[] - def __init__(self) -> None: - pass + #handle various empty cases + if not text or text == '': + return TagDatePair(None, None) + + date_split = text.split("[") + if len(date_split) > 1: + estimate_tag = date_split[1].split("]")[0].strip() + else: + estimate_tag = None + text_object = date_split[0].strip() + return TagTextPair(estimate_tag, text_object) +### FUNCTIONS -def get_forms(soup): +def tr_to_td(tr) -> tuple[str, str]: + """ + Takes an html data row of interest, extracts the record_name from the first , and the data from the second . - data_list = [] + For the data, it just extracts the text. + The text itself then needs processed separately, based on what it should contain. + """ + #get list of cells + td_list = tr.find_all("td") + if len(td_list) == 2: + return td_list[0].text, td_list[1] + else: + return None, None + +def get_forms(soup,version_a,version_b): #extract all forms for form in soup.body.find_all("form"): @@ -83,9 +231,9 @@ def get_forms(soup): match form.attrs["id"]: case "form_StudyStatus": - print("test successful 2") + extract_study_statuses(form,version_a,version_b) case "form_SponsorCollaborators": - pass + extract_sponsor_data(form, version_a, version_b) case "form_Oversight": pass case "form_StudyDescription": @@ -93,7 +241,7 @@ def get_forms(soup): case "form_Conditions": pass case "form_StudyDesign": - pass + extract_study_design(form,version_a,version_b) case "form_ArmsandInterventions": pass case "form_ProtocolOutcomeMeasures": @@ -121,7 +269,19 @@ def get_forms(soup): case _: print(form.attrs["id"]) + +### CONSTANTS +date_MMMM_YYYY = "%B %Y" +date_MMMM_DD_YYYY = "%B %d, %Y" + if __name__ == "__main__": - with open("./NCT00658567.html") as fh: - soup = BeautifulSoup(fh, "lxml") - get_forms(soup) \ No newline at end of file + + for file in ["./NCT00658567.html", "./NCT01303796.html"]: + with open(file) as fh: + soup = BeautifulSoup(fh, "lxml") + + version1 = VersionData("NCT00658567",1) + version2 = VersionData("NCT00658567",2) + get_forms(soup, version1, version2) + print(version1.__dict__) #order messed up somewhere:w + print(version2.__dict__) #order messed up somewhere:w diff --git a/Parser/prototype_history.sql b/Parser/prototype_history.sql index 53e9be0..b66142b 100644 --- a/Parser/prototype_history.sql +++ b/Parser/prototype_history.sql @@ -6,9 +6,9 @@ CREATE TABLE history.versions nct_id version --Study Status - overall_status - primary_completion_date - completion_date + overall_status^ + primary_completion_date^ + completion_date^ last_update_submitted_date --SponsorCollaborators sponsor (multi?) @@ -31,7 +31,7 @@ CREATE TABLE history.versions Number of Arms Masking Allocation - Enrollment + Enrollment ^ --ArmsAndInterventions Arms (multiple) (Ignore) --ProtocolOutcomeMeasures diff --git a/Parser/textprocessing.py b/Parser/textprocessing.py index 8738f86..56a1aeb 100644 --- a/Parser/textprocessing.py +++ b/Parser/textprocessing.py @@ -1,4 +1,6 @@ -from cgitb import html +from copy import copy +from datetime import datetime +from bs4 import BeautifulSoup import re form = """ @@ -106,16 +108,77 @@ entry1 = """ """ -drop_old_re = re.compile('\w*\s?') -drop_new_re = re.compile('\w*\s?') -drop_tags_re = re.compile('<[=-_,.:;"/\w\s]+>') +entry2 = ' December 2009 [Actual] ' +DROP_HILITE_re = re.compile('[\[\]\w]*\s?') +ADD_HILITE_re = re.compile('\w*\s?') +TAGS_RE = re.compile('<[=-_,.:;"/\w\s]+>') -print(drop_new_re.sub("",entry1)) -print(drop_old_re.sub("",entry1)) -print(drop_tags_re.sub("",entry1)) +def extract_new_data(td): + text = td.__str__() + return TAGS_RE.sub("",DROP_HILITE_re.sub(" ",text)).strip() -print(drop_tags_re.sub("",drop_new_re.sub("",entry1))) +def extract_old_data(td): + text = td.__str__() + return TAGS_RE.sub("",ADD_HILITE_re.sub(" ",text)).strip() +def delete_tags(td): + text = td.__str__() + return TAGS_RE.sub(" ",text).strip() -print(drop_tags_re.sub("",drop_new_re.sub("",form))) \ No newline at end of file + +def extract_date_and_tag(text, date_format): + """ + Extracts a datetype according to the date format + and the estimate tag based on + + """ + if not text: + return " " + + date_split = text.split("[") + if len(date_split) > 1: + estimate_tag = date_split[1].split("]")[0].strip() + else: + estimate_tag = None + date_object = datetime.strptime(date_split[0].strip(), date_format) + + return estimate_tag, date_object + #TODO: Write test + +def extract_text_and_tag(text): + """ + + """ + pass + +if __name__ == "__main__": + Entry = BeautifulSoup(entry1, "lxml") + Form = BeautifulSoup(form, "lxml") + + + + print(extract_new_data(Entry.find_all("td")[1])) + print(extract_old_data(Entry.find_all("td")[1])) + + for tr in Form.find_all("tr"): + data = tr.find_all("td") + match len(data): + case 0: print("no data") + case 1: print("1\t",data[0]) + case _: print(len(data), "\t", extract_new_data(data[1]) ,"\t|\t", extract_old_data(data[1])) + + #print(extract_date_and_tag(extract_old_data(Entry.find_all("td")[1]), "%B %Y")) + print(extract_date_and_tag("April 2008 [ test ]", "%B %Y")) + + + Entry2 = BeautifulSoup(entry2,"lxml") + print(extract_old_data(Entry2)) #error here. + print(extract_new_data(Entry2)) + + + Entry3 = copy(Entry2) + print(Entry3) + Entry4 = Entry3.find_all(class_="add_hilite")[0].extract() + print(Entry3.text) + print(Entry4.text) \ No newline at end of file