diff --git a/Parser/.vscode/launch.json b/Parser/.vscode/launch.json
new file mode 100644
index 0000000..306f58e
--- /dev/null
+++ b/Parser/.vscode/launch.json
@@ -0,0 +1,16 @@
+{
+ // Use IntelliSense to learn about possible attributes.
+ // Hover to view descriptions of existing attributes.
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+ "version": "0.2.0",
+ "configurations": [
+ {
+ "name": "Python: Current File",
+ "type": "python",
+ "request": "launch",
+ "program": "${file}",
+ "console": "integratedTerminal",
+ "justMyCode": true
+ }
+ ]
+}
\ No newline at end of file
diff --git a/Parser/extraction-lib.py b/Parser/extraction-lib.py
index 158d7f2..2d867c7 100644
--- a/Parser/extraction-lib.py
+++ b/Parser/extraction-lib.py
@@ -1,8 +1,127 @@
+from __future__ import annotations
from bs4 import BeautifulSoup
+import abc
+import textprocessing as tp  # text-extraction helpers, kept separate so they can be unit tested
+#requires Python 3.10
+
+def extract_data_from_tr(tr) -> tuple[str, str, str]:
+ """
+    Takes an html data row of interest, extracts the record_name from the first
+    cell, and the data from the second cell.
+
+    For the data, it will split between old and new data, making copies of each and returning them.
+
+ Uses functionality from ./textprocessing.py (separated because it is important to test that functionality)
+ to get extract data from tags.
+
+ """
+ #get list of cells
+ #for cell in cells
+ #if class_=="rowLabel", extract text
+ #else parse out new and old text
+    #return triple: row_label, old, new
+ pass
+
+#superclasses
+class VersionData(abc.ABC):
+ """
+ This abstract class holds two types of data:
+ - Data with a 1-to-1 relationship with the trial/version pair.
+ - Data with a child relationship with the trial/version pair.
+
+ Each subclass will return the 1-to-1 data for another system to add to the DB.
+ This is so that a single record can be created in one go.
+ Each subclass will load the child data to the database directly.
+ """
+ @abc.abstractmethod
+ def version_fields(self):
+ """
+ This function returns data that should be included in a standard table
+ related to version_x of the record.
+
+ It also returns the columns?
+ """
+ pass
+ @abc.abstractmethod
+ def version_records(self, foreign_key, db_cursor):
+ """
+        This function loads data that needs to be held in auxiliary tables
+ into the database.
+        For example, the list of sponsors will need to be tracked separately from
+ trial status.
+ """
+ pass
+
+
+class StudyStatusData(VersionData):
+ columns = ["primary_completion_date", "completion_date", "last_update_posted_date"]
+
+    def __init__(self, primary_completion_date, completion_date, last_update_posted_date) -> None:
+ pass
+
+def extract_study_statuses(study_status_form, version_a,version_b):
+ """
+ This extracts data from a study_status form and returns one or two StudyStatusData objects
+ """
+ pass
+
+class SponsorCollaboratorsData(VersionData):
+ columns=[]
+ def __init__(self) -> None:
+ pass
+
+
+
+def get_forms(soup):
+
+ data_list = []
+
+ #extract all forms
+ for form in soup.body.find_all("form"):
+ #Match forms against ID types
+ if not "id" in form.attrs:
+ continue
+
+ match form.attrs["id"]:
+ case "form_StudyStatus":
+ print("test successful 2")
+ case "form_SponsorCollaborators":
+ pass
+ case "form_Oversight":
+ pass
+ case "form_StudyDescription":
+ pass
+ case "form_Conditions":
+ pass
+ case "form_StudyDesign":
+ pass
+ case "form_ArmsandInterventions":
+ pass
+ case "form_ProtocolOutcomeMeasures":
+ pass
+ case "form_Eligibility":
+ pass
+ case "form_ContactsLocations":
+ pass
+ case "form_IPDSharing":
+ pass
+ case "form_References":
+ pass
+ case "form_ParticipantFlow":
+ pass
+ case "form_BaselineCharacteristics":
+ pass
+ case "form_ROutcomeMeasures":
+ pass
+ case "form_AdverseEvents":
+ pass
+ case "form_LimitationsandCaveats":
+ pass
+ case "form_MoreInformation":
+ pass
+ case _:
+ print(form.attrs["id"])
if __name__ == "__main__":
with open("./NCT00658567.html") as fh:
soup = BeautifulSoup(fh, "lxml")
-
- print(soup)
-
+ get_forms(soup)
\ No newline at end of file
diff --git a/Parser/prototype_history.sql b/Parser/prototype_history.sql
new file mode 100644
index 0000000..53e9be0
--- /dev/null
+++ b/Parser/prototype_history.sql
@@ -0,0 +1,62 @@
+/*
+Create schema history
+
+
+CREATE TABLE history.versions
+ nct_id
+ version
+ --Study Status
+ overall_status
+ primary_completion_date
+ completion_date
+ last_update_submitted_date
+ --SponsorCollaborators
+ sponsor (multi?)
+ collaborators (multi?)
+ --Oversight
+ fda_regulated_drug (ignore)
+ fda_regulated_device (ignore)
+ dmc (ignore)
+ --StudyDescription
+ summary
+ detailed_description
+ --Conditions
+ Conditions
+ Keywords
+ --StudyDesign
+ Study type
+ Primary Purpose
+ Study Phase
+ Interventional Study Model
+ Number of Arms
+ Masking
+ Allocation
+ Enrollment
+ --ArmsAndInterventions
+ Arms (multiple) (Ignore)
+ --ProtocolOutcomeMeasures
+ --Eligibility
+ --ContactsLocation
+ --IPDSharing
+ --References
+ --ParticipantFlow
+ --BaselineCharacteristics
+ --ROutcomeMeasures
+ --AdverseEvents
+ --LimitationsAndCaveats
+ --More Information
+
+
+CREATE TABLE history.colaborators
+ nct_id
+ version
+ collaborator_name
+
+CREATE TABLE history.locations
+ nct_id
+ version
+ location name
+ location contact info
+
+CREATE TABLE history.arms
+*/
\ No newline at end of file
diff --git a/Parser/textprocessing.py b/Parser/textprocessing.py
new file mode 100644
index 0000000..8738f86
--- /dev/null
+++ b/Parser/textprocessing.py
@@ -0,0 +1,121 @@
+from cgitb import html
+import re
+
+form = """
+ |
+ |
+
+ |
+
+"""
+
+
+entry1 = """
+
+| Record Verification: |
+April 2008 2017 |
+
+"""
+
+
+drop_old_re = re.compile('\w*\s?')
+drop_new_re = re.compile('\w*\s?')
+drop_tags_re = re.compile('<[=-_,.:;"/\w\s]+>')
+
+
+print(drop_new_re.sub("",entry1))
+print(drop_old_re.sub("",entry1))
+print(drop_tags_re.sub("",entry1))
+
+print(drop_tags_re.sub("",drop_new_re.sub("",entry1)))
+
+
+print(drop_tags_re.sub("",drop_new_re.sub("",form)))
\ No newline at end of file