From 71e87a9abe924ec6883963580dc3f17944652492 Mon Sep 17 00:00:00 2001
From: will king
Date: Wed, 8 Jun 2022 19:04:47 -0700
Subject: [PATCH] Saving current status.

---
 Parser/.vscode/launch.json   |  16 +++++
 Parser/extraction-lib.py     | 125 ++++++++++++++++++++++++++++++++++-
 Parser/prototype_history.sql |  62 +++++++++++++++++
 Parser/textprocessing.py     | 121 +++++++++++++++++++++++++++++++++
 4 files changed, 321 insertions(+), 3 deletions(-)
 create mode 100644 Parser/.vscode/launch.json
 create mode 100644 Parser/prototype_history.sql
 create mode 100644 Parser/textprocessing.py

diff --git a/Parser/.vscode/launch.json b/Parser/.vscode/launch.json
new file mode 100644
index 0000000..306f58e
--- /dev/null
+++ b/Parser/.vscode/launch.json
@@ -0,0 +1,16 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python: Current File",
+            "type": "python",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal",
+            "justMyCode": true
+        }
+    ]
+}
\ No newline at end of file
diff --git a/Parser/extraction-lib.py b/Parser/extraction-lib.py
index 158d7f2..2d867c7 100644
--- a/Parser/extraction-lib.py
+++ b/Parser/extraction-lib.py
@@ -1,8 +1,127 @@
+from tokenize import String  # NOTE(review): tokenize.String is a regex-source str, not a type; unused now - safe to drop
 from bs4 import BeautifulSoup
+import abc
+import textprocessing as tp #cuz tp is important
+#requires Python 3.10
+
+def extract_data_from_tr(tr) -> tuple[str, str, str]:
+    """
+    Takes an html data row of interest, extracts the record_name from the first cell, and the data from the second cell.
+
+    For the data, it will split between old and new data, making copies of each and returning them.
+
+    Uses functionality from ./textprocessing.py (separated because it is important to test that functionality)
+    to extract data from the tags.
+
+    """
+    #get list of cells
+    #for cell in cells
+        #if class_=="rowLabel", extract text
+        #else parse out new and old text
+    #return triple: row_label, old, new
+    pass
+
+#superclasses
+class VersionData(abc.ABC):
+    """
+    This abstract class holds two types of data:
+    - Data with a 1-to-1 relationship with the trial/version pair.
+    - Data with a child relationship with the trial/version pair.
+
+    Each subclass will return the 1-to-1 data for another system to add to the DB.
+    This is so that a single record can be created in one go.
+    Each subclass will load the child data to the database directly.
+    """
+    @abc.abstractmethod
+    def version_fields(self):
+        """
+        This function returns data that should be included in a standard table
+        related to version_x of the record.
+
+        It also returns the columns?
+        """
+        pass
+    @abc.abstractmethod
+    def version_records(self, foreign_key, db_cursor):
+        """
+        This function loads data that needs to be held in auxiliary tables
+        into the database.
+        For example, the list of sponsors will need to be tracked separately from
+        trial status.
+        """
+        pass
+
+
+class StudyStatusData(VersionData):
+    columns = ["primary_completion_date", "completion_date", "last_update_posted_date"]
+
+    def __init__(self, primary_completion_date, completion_date, last_update_posted_date) -> None:
+        pass
+
+def extract_study_statuses(study_status_form, version_a, version_b):
+    """
+    This extracts data from a study_status form and returns one or two StudyStatusData objects
+    """
+    pass
+
+class SponsorCollaboratorsData(VersionData):
+    columns = []
+    def __init__(self) -> None:
+        pass
+
+
+
+def get_forms(soup):
+    """Visit every form under the document body that has an id and dispatch on that id."""
+    data_list = []
+
+    #extract all forms
+    for form in soup.body.find_all("form"):
+        #Match forms against ID types
+        if "id" not in form.attrs:
+            continue
+
+        match form.attrs["id"]:
+            case "form_StudyStatus":
+                print("test successful 2")
+            case "form_SponsorCollaborators":
+                pass
+            case "form_Oversight":
+                pass
+            case "form_StudyDescription":
+                pass
+            case "form_Conditions":
+                pass
+            case "form_StudyDesign":
+                pass
+            case "form_ArmsandInterventions":
+                pass
+            case "form_ProtocolOutcomeMeasures":
+                pass
+            case "form_Eligibility":
+                pass
+            case "form_ContactsLocations":
+                pass
+            case "form_IPDSharing":
+                pass
+            case "form_References":
+                pass
+            case "form_ParticipantFlow":
+                pass
+            case "form_BaselineCharacteristics":
+                pass
+            case "form_ROutcomeMeasures":
+                pass
+            case "form_AdverseEvents":
+                pass
+            case "form_LimitationsandCaveats":
+                pass
+            case "form_MoreInformation":
+                pass
+            case _:
+                print(form.attrs["id"])
 
 if __name__ == "__main__":
     with open("./NCT00658567.html") as fh:
         soup = BeautifulSoup(fh, "lxml")
-
-    print(soup)
-
+    get_forms(soup)
\ No newline at end of file
diff --git a/Parser/prototype_history.sql b/Parser/prototype_history.sql
new file mode 100644
index 0000000..53e9be0
--- /dev/null
+++ b/Parser/prototype_history.sql
@@ -0,0 +1,62 @@
+/*
+Create schema history
+
+
+CREATE TABLE history.versions
+    nct_id
+    version
+    --Study Status
+    overall_status
+    primary_completion_date
+    completion_date
+    
last_update_submitted_date
+    --SponsorCollaborators
+    sponsor (multi?)
+    collaborators (multi?)
+    --Oversight
+    fda_regulated_drug (ignore)
+    fda_regulated_device (ignore)
+    dmc (ignore)
+    --StudyDescription
+    summary
+    detailed_description
+    --Conditions
+    Conditions
+    Keywords
+    --StudyDesign
+    Study type
+    Primary Purpose
+    Study Phase
+    Interventional Study Model
+    Number of Arms
+    Masking
+    Allocation
+    Enrollment
+    --ArmsAndInterventions
+    Arms (multiple) (Ignore)
+    --ProtocolOutcomeMeasures
+    --Eligibility
+    --ContactsLocation
+    --IPDSharing
+    --References
+    --ParticipantFlow
+    --BaselineCharacteristics
+    --ROutcomeMeasures
+    --AdverseEvents
+    --LimitationsAndCaveats
+    --More Information
+
+
+CREATE TABLE history.collaborators
+    nct_id
+    version
+    collaborator_name
+
+CREATE TABLE history.locations
+    nct_id
+    version
+    location name
+    location contact info
+
+CREATE TABLE history.arms
+*/
\ No newline at end of file
diff --git a/Parser/textprocessing.py b/Parser/textprocessing.py
new file mode 100644
index 0000000..8738f86
--- /dev/null
+++ b/Parser/textprocessing.py
@@ -0,0 +1,121 @@
+from cgitb import html  # NOTE(review): `html` looks unused here, and cgitb was removed in Python 3.13 (PEP 594) - confirm and drop
+import re
+
+form = """
+
+
+
+
+
+ Open or close this module + Study Status +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
Record Verification:April 2008 2017
Overall Status:Recruiting Completed
Study Start:March 2008
Primary Completion: December 2009 [Actual]
Study Completion:December 2009 [ Anticipated Actual]
First Submitted:April 10, 2008
First Submitted that
Met QC Criteria:
April 10, 2008
First Posted:April 15, 2008 [Estimate]
Results First Submitted: February 6, 2014
Results First Submitted that
Met QC + Criteria:
August 29, 2014
Results First Posted: September 9, 2014 [Estimate]
Last Update Submitted that
Met QC Criteria: +
April 10 18, 2008 2017
Last Update Posted:April 15, 2008 [Estimate] May 19, + 2017 [Actual]
+
+
+
+
+
+
+"""
+
+
+entry1 = """
+
+Record Verification:
+April 2008 2017
+
+"""
+
+
+drop_old_re = re.compile(r'\w*\s?')  # NOTE(review): identical to drop_new_re as written - confirm the intended pattern
+drop_new_re = re.compile(r'\w*\s?')
+drop_tags_re = re.compile(r'<[=\-_,.:;"/\w\s]+>')  # "-" escaped: the bare "=-_" was a range that matched ">" and missed "-"
+
+
+print(drop_new_re.sub("",entry1))
+print(drop_old_re.sub("",entry1))
+print(drop_tags_re.sub("",entry1))
+
+print(drop_tags_re.sub("",drop_new_re.sub("",entry1)))
+
+
+print(drop_tags_re.sub("",drop_new_re.sub("",form)))
\ No newline at end of file