You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
127 lines
3.8 KiB
Python
127 lines
3.8 KiB
Python
from tokenize import String
|
|
from bs4 import BeautifulSoup
|
|
import abc
|
|
import textprocessing as tp #cuz tp is important
|
|
#requires Python 3.10
|
|
|
|
def extract_data_from_tr(tr) -> tuple[str, str]:
    """
    Extract the row label and its data from an HTML data row of interest.

    Takes an html data row of interest, extracts the record_name from the
    first <td>, and the data from the second <td>.

    For the data, it will split between old and new data, making copies of
    each and returning them.

    Uses functionality from ./textprocessing.py (separated because it is
    important to test that functionality) to extract data from tags.

    NOTE(review): this is still a stub -- it currently returns None for any
    input. The implementation plan below returns three values (row_label,
    old, new) while the annotation declares a pair; confirm the intended
    shape when implementing.
    """
    # Implementation plan:
    # get list of cells
    # for cell in cells
    # if class_ == "rowLabel", extract text
    # else parse out new and old text
    # return triple: row_label, old, new
    pass
|
|
|
|
#superclasses
|
|
class VersionData(abc.ABC):
    """
    Abstract base for the data extracted for one trial/version pair.

    This abstract class holds two types of data:
    - Data with a 1-to-1 relationship with the trial/version pair.
    - Data with a child relationship with the trial/version pair.

    Each subclass will return the 1-to-1 data for another system to add to
    the DB. This is so that a single record can be created in one go.
    Each subclass will load the child data to the database directly.
    """

    @abc.abstractmethod
    def version_fields(self):
        """
        Return data that should be included in a standard table related to
        version_x of the record.

        Open question from the original author: should it also return the
        columns?
        """
        pass

    @abc.abstractmethod
    def version_records(self, foreign_key, db_cursor):
        """
        Load data that needs to be held in auxiliary tables into the
        database.

        For example, the list of sponsors will need to be tracked separately
        from trial status.

        NOTE(review): foreign_key presumably identifies the parent
        trial/version record and db_cursor is presumably a database cursor
        used for the inserts -- confirm against the caller.
        """
        pass
|
|
|
|
|
|
class StudyStatusData(VersionData):
    """
    Study-status values for one version of a record.

    NOTE(review): version_fields and version_records are declared abstract
    on VersionData and are not implemented here yet, so this class cannot be
    instantiated until they are added.
    """

    # Attribute names, in the same order as the constructor parameters.
    columns = ["primary_completion_date", "completion_date", "last_update_posted_date"]

    def __init__(self, primary_completion_date, completion_date, last_update_posted_date) -> None:
        """Store the three values named in ``columns`` on the instance."""
        super().__init__()
        # Previously the arguments were discarded; keep them so the instance
        # actually carries the data it was constructed with.
        self.primary_completion_date = primary_completion_date
        self.completion_date = completion_date
        self.last_update_posted_date = last_update_posted_date
|
|
|
|
def extract_study_statuses(study_status_form, version_a, version_b):
    """
    Extract data from a study_status form and return one or two
    StudyStatusData objects.

    NOTE(review): this is still a stub -- it currently returns None for any
    input. version_a / version_b presumably identify the two record versions
    being compared; confirm when implementing.
    """
    pass
|
|
|
|
class SponsorCollaboratorsData(VersionData):
    """
    Placeholder for sponsor/collaborator data for one version of a record.

    NOTE(review): version_fields and version_records are declared abstract
    on VersionData and are not implemented here yet, so this class cannot be
    instantiated until they are added.
    """

    # No columns defined yet.
    columns = []

    def __init__(self) -> None:
        """Create an empty instance; no state is stored yet."""
        pass
|
|
|
|
|
|
|
|
def get_forms(soup):
|
|
|
|
data_list = []
|
|
|
|
#extract all forms
|
|
for form in soup.body.find_all("form"):
|
|
#Match forms against ID types
|
|
if not "id" in form.attrs:
|
|
continue
|
|
|
|
match form.attrs["id"]:
|
|
case "form_StudyStatus":
|
|
print("test successful 2")
|
|
case "form_SponsorCollaborators":
|
|
pass
|
|
case "form_Oversight":
|
|
pass
|
|
case "form_StudyDescription":
|
|
pass
|
|
case "form_Conditions":
|
|
pass
|
|
case "form_StudyDesign":
|
|
pass
|
|
case "form_ArmsandInterventions":
|
|
pass
|
|
case "form_ProtocolOutcomeMeasures":
|
|
pass
|
|
case "form_Eligibility":
|
|
pass
|
|
case "form_ContactsLocations":
|
|
pass
|
|
case "form_IPDSharing":
|
|
pass
|
|
case "form_References":
|
|
pass
|
|
case "form_ParticipantFlow":
|
|
pass
|
|
case "form_BaselineCharacteristics":
|
|
pass
|
|
case "form_ROutcomeMeasures":
|
|
pass
|
|
case "form_AdverseEvents":
|
|
pass
|
|
case "form_LimitationsandCaveats":
|
|
pass
|
|
case "form_MoreInformation":
|
|
pass
|
|
case _:
|
|
print(form.attrs["id"])
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: parse one saved record page and run the form
    # dispatcher over it. The path is relative to the working directory.
    with open("./NCT00658567.html") as fh:
        # The "lxml" parser must be installed for BeautifulSoup to use it.
        soup = BeautifulSoup(fh, "lxml")
        get_forms(soup)