from tokenize import String from bs4 import BeautifulSoup import abc import textprocessing as tp #cuz tp is important #requires Python 3.10 def extract_data_from_tr(tr) -> tuple[String, String]: """ Takes an html data row of interest, extracts the record_name from the first , and the data from the second . For the data, it will split between old and new data, making copies of each and returnign them. Uses functionality from ./textprocessing.py (separated because it is important to test that functionality) to get extract data from tags. """ #get list of cells #for cell in cells #if class_=="rowLabel", extract text #else parse out new and old text #return triple: row_lable, old, new pass #superclasses class VersionData{abc.ABC}: """ This abstract class holds two types of data: - Data with a 1-to-1 relationship with the trial/version pair. - Data with a child relationship with the trial/version pair. Each subclass will return the 1-to-1 data for another system to add to the DB. This is so that a single record can be created in one go. Each subclass will load the child data to the database directly. """ @abc.abstractmethod def version_fields(self): """ This function returns data that should be included in a standard table related to version_x of the record. It also returns the columns? """ pass @abc.abstractmethod def version_records(self, foreign_key, db_cursor): """ This function loads data that needs to be held in auxilary tables into the database. For example, the list of sponsors will need to be tracked separatly from trial status. """ pass class StudyStatusData(VersionData): columns = ["primary_completion_date", "completion_date", "last_update_posted_date"] def __init__(self ,primary_completion_date, completion_date, last_update_posted_date) -> None: pass def extract_study_statuses(study_status_form, version_a,version_b): """ This extracts data from a study_status form and returns one or two StudyStatusData objects """ pass class SponsorCollaboratorsData(VersionData): columns=[] def __init__(self) -> None: pass def get_forms(soup): data_list = [] #extract all forms for form in soup.body.find_all("form"): #Match forms against ID types if not "id" in form.attrs: continue match form.attrs["id"]: case "form_StudyStatus": print("test successful 2") case "form_SponsorCollaborators": pass case "form_Oversight": pass case "form_StudyDescription": pass case "form_Conditions": pass case "form_StudyDesign": pass case "form_ArmsandInterventions": pass case "form_ProtocolOutcomeMeasures": pass case "form_Eligibility": pass case "form_ContactsLocations": pass case "form_IPDSharing": pass case "form_References": pass case "form_ParticipantFlow": pass case "form_BaselineCharacteristics": pass case "form_ROutcomeMeasures": pass case "form_AdverseEvents": pass case "form_LimitationsandCaveats": pass case "form_MoreInformation": pass case _: print(form.attrs["id"]) if __name__ == "__main__": with open("./NCT00658567.html") as fh: soup = BeautifulSoup(fh, "lxml") get_forms(soup)