# ClinicalTrialsDataProcessing/Parser/extraction-lib.py

from tokenize import String
from bs4 import BeautifulSoup
import abc
import textprocessing as tp #cuz tp is important
# Requires Python 3.10+ (get_forms uses the match/case statement).

def extract_data_from_tr(tr) -> tuple[str, str, str]:
    """
    Extract the record name and the old/new data from one HTML data row.

    Planned behaviour (NOT implemented yet -- the body is still a stub and
    every call currently returns ``None``):

    - take the record name from the first ``<td>`` of the row;
    - take the data from the second ``<td>``, split it into the old and the
      new data, and return a copy of each.

    The tag-level text extraction is meant to come from ``./textprocessing.py``
    (kept separate because it is important to test that functionality on its
    own).

    Args:
        tr: The HTML data row of interest.

    Returns:
        Intended: a ``(row_label, old, new)`` triple of strings.
        Currently: ``None``, because nothing is implemented yet.
    """
    # TODO: implement along these lines:
    #   1. get the list of cells in the row
    #   2. for each cell: if its class is "rowLabel", extract the text;
    #      otherwise parse out the new and the old text
    #   3. return the triple (row_label, old, new)
    pass

#superclasses
class VersionData(abc.ABC):
    """
    Abstract base for the data extracted for one trial/version pair.

    This abstract class holds two types of data:
        - Data with a 1-to-1 relationship with the trial/version pair.
        - Data with a child relationship with the trial/version pair.

    Each subclass will return the 1-to-1 data for another system to add to
    the DB. This is so that a single record can be created in one go.
    Each subclass will load the child data to the database directly.
    """

    @abc.abstractmethod
    def version_fields(self):
        """
        Return the data that should be included in a standard table
        related to version_x of the record.

        Open design question (from the original author): should this also
        return the columns?
        """
        pass

    @abc.abstractmethod
    def version_records(self, foreign_key, db_cursor):
        """
        Load the data that needs to be held in auxiliary tables into the
        database.

        For example, the list of sponsors will need to be tracked separately
        from trial status.

        Args:
            foreign_key: Presumably the key of the parent trial/version
                record that the child rows refer to -- TODO confirm.
            db_cursor: Presumably the database cursor used to write the
                child rows -- TODO confirm.
        """
        pass


class StudyStatusData(VersionData):
    """
    Study-status values for one version of a trial record.

    NOTE(review): ``version_fields`` and ``version_records`` are not
    overridden here yet. Assuming ``VersionData`` declares them abstract,
    this class cannot be instantiated until they are implemented -- TODO.
    """

    # Names of the values this class carries, in constructor-argument order
    # (presumably also the database column names -- TODO confirm).
    columns = ["primary_completion_date", "completion_date", "last_update_posted_date"]

    def __init__(self, primary_completion_date, completion_date, last_update_posted_date) -> None:
        """
        Keep the three study-status values on the instance.

        Args:
            primary_completion_date: Value for the ``primary_completion_date`` column.
            completion_date: Value for the ``completion_date`` column.
            last_update_posted_date: Value for the ``last_update_posted_date`` column.
        """
        # Store the arguments so they are not lost once construction returns.
        self.primary_completion_date = primary_completion_date
        self.completion_date = completion_date
        self.last_update_posted_date = last_update_posted_date

def extract_study_statuses(study_status_form, version_a, version_b):
    """
    Build StudyStatusData objects (one or two) from a study_status form.

    Placeholder only: no extraction is implemented yet, so every call
    returns None regardless of the arguments.
    """
    return None

class SponsorCollaboratorsData(VersionData):
    """Placeholder for sponsor/collaborator data; nothing is stored yet."""

    # No columns have been defined for this section so far.
    columns = list()

    def __init__(self) -> None:
        """Create an empty instance; there is no state to initialise."""
        return None


def get_forms(soup):

    data_list = []

    #extract all forms
    for form in soup.body.find_all("form"):
        #Match forms against ID types
        if not "id" in form.attrs:
            continue

        match form.attrs["id"]:
            case "form_StudyStatus":
                print("test successful 2")
            case "form_SponsorCollaborators":
                pass
            case "form_Oversight":
                pass
            case "form_StudyDescription":
                pass
            case "form_Conditions":
                pass
            case "form_StudyDesign":
                pass
            case "form_ArmsandInterventions":
                pass
            case "form_ProtocolOutcomeMeasures":
                pass
            case "form_Eligibility":
                pass
            case "form_ContactsLocations":
                pass
            case "form_IPDSharing":
                pass
            case "form_References":
                pass
            case "form_ParticipantFlow":
                pass
            case "form_BaselineCharacteristics":
                pass
            case "form_ROutcomeMeasures":
                pass
            case "form_AdverseEvents":
                pass
            case "form_LimitationsandCaveats":
                pass
            case "form_MoreInformation":
                pass
            case _:
                print(form.attrs["id"])

if __name__ == "__main__":
    # Ad-hoc smoke run against a locally saved record page.
    # Open in binary mode so Beautiful Soup detects the document's own
    # encoding instead of decoding with the platform's locale default.
    with open("./NCT00658567.html", "rb") as fh:
        soup = BeautifulSoup(fh, "lxml")
    get_forms(soup)