from tokenize import String
from bs4 import BeautifulSoup
import abc
import textprocessing as tp #cuz tp is important
#requires Python 3.10
def extract_data_from_tr(tr) -> tuple[str, str, str]:
    """
    Extract the record name and the old/new data from one HTML table row.

    Not implemented yet: the function currently does nothing and returns
    ``None``. The intended behaviour is to take an HTML data row of interest,
    read the record name from the row's label cell and the data from its
    other cell. The data is split into its old and new versions, and a copy
    of each is returned.

    Functionality from ./textprocessing.py is meant to be used to extract the
    data from tags (it lives in its own module because it is important to
    test that functionality separately).

    Args:
        tr: The table row to process (presumably a BeautifulSoup ``<tr>``
            tag -- TODO confirm once implemented).

    Returns:
        Intended to be the triple ``(row_label, old, new)``.
    """
    # NOTE(review): the three elements are assumed to be plain ``str``
    # values -- confirm when the body is written.
    #
    # Implementation plan:
    #   1. get the list of cells
    #   2. for each cell:
    #        - if class_ == "rowLabel", extract its text
    #        - otherwise parse out the new and old text
    #   3. return the triple: row_label, old, new
    pass
#superclasses
class VersionData(abc.ABC):
    """
    Abstract base for the data extracted from one section of a trial version.

    This abstract class holds two types of data:
    - Data with a 1-to-1 relationship with the trial/version pair.
    - Data with a child relationship with the trial/version pair.

    Each subclass will return the 1-to-1 data for another system to add to
    the DB. This is so that a single record can be created in one go.

    Each subclass will load the child data to the database directly.
    """

    @abc.abstractmethod
    def version_fields(self):
        """
        Return the data that belongs in the standard table for this version.

        This is the 1-to-1 data related to version_x of the record.

        NOTE(review): it is still undecided whether this should also return
        the column names.
        """
        pass

    @abc.abstractmethod
    def version_records(self, foreign_key, db_cursor):
        """
        Load the data that needs to be held in auxiliary tables into the
        database.

        For example, the list of sponsors will need to be tracked separately
        from trial status.

        Args:
            foreign_key: presumably the key of the parent trial/version row
                that the child rows should reference -- TODO confirm.
            db_cursor: presumably the database cursor used to perform the
                inserts -- TODO confirm.
        """
        pass
class StudyStatusData(VersionData):
    """
    Study-status dates for one version of a trial record.

    NOTE(review): this class does not override ``version_fields`` or
    ``version_records`` yet; if the base class declares them abstract, the
    class cannot be instantiated until it does.
    """

    # Names of the values carried by this class; they mirror the constructor
    # parameters (presumably also the DB column names -- TODO confirm).
    columns = ["primary_completion_date", "completion_date", "last_update_posted_date"]

    def __init__(self, primary_completion_date, completion_date, last_update_posted_date) -> None:
        """
        Store the three study-status dates on the instance.

        Args:
            primary_completion_date: value for the ``primary_completion_date`` field.
            completion_date: value for the ``completion_date`` field.
            last_update_posted_date: value for the ``last_update_posted_date`` field.
        """
        # Keep the values so later methods can read them; they are stored
        # unmodified under the same names listed in ``columns``.
        self.primary_completion_date = primary_completion_date
        self.completion_date = completion_date
        self.last_update_posted_date = last_update_posted_date
def extract_study_statuses(study_status_form, version_a, version_b):
    """
    Build StudyStatusData objects from a study_status form.

    Placeholder: nothing is extracted yet and ``None`` is returned. Once
    implemented, this is meant to read the given study_status form and
    return one or two StudyStatusData objects.

    Args:
        study_status_form: the study_status form to read.
        version_a: presumably identifies the first of the two versions being
            compared -- TODO confirm.
        version_b: presumably identifies the second of the two versions being
            compared -- TODO confirm.
    """
    return None
class SponsorCollaboratorsData(VersionData):
    """
    Placeholder for the sponsor/collaborator data of one trial version.

    Nothing is stored yet: the column list is empty and the constructor
    takes no data.
    """

    # No fields have been declared for this form so far.
    columns = []

    def __init__(self) -> None:
        """Create an empty instance; no state is initialised."""
        return None
def get_forms(soup):
data_list = []
#extract all forms
for form in soup.body.find_all("form"):
#Match forms against ID types
if not "id" in form.attrs:
continue
match form.attrs["id"]:
case "form_StudyStatus":
print("test successful 2")
case "form_SponsorCollaborators":
pass
case "form_Oversight":
pass
case "form_StudyDescription":
pass
case "form_Conditions":
pass
case "form_StudyDesign":
pass
case "form_ArmsandInterventions":
pass
case "form_ProtocolOutcomeMeasures":
pass
case "form_Eligibility":
pass
case "form_ContactsLocations":
pass
case "form_IPDSharing":
pass
case "form_References":
pass
case "form_ParticipantFlow":
pass
case "form_BaselineCharacteristics":
pass
case "form_ROutcomeMeasures":
pass
case "form_AdverseEvents":
pass
case "form_LimitationsandCaveats":
pass
case "form_MoreInformation":
pass
case _:
print(form.attrs["id"])
if __name__ == "__main__":
    # Manual smoke test: parse one saved record page and list its forms.
    with open("./NCT00658567.html") as fh:
        # The file is read while the soup is built, so the handle can be
        # closed before the forms are processed.
        soup = BeautifulSoup(fh, "lxml")
    get_forms(soup)