Saving current status.

parser
will king 4 years ago
parent d8d00101fa
commit 71e87a9abe

@ -0,0 +1,16 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python: Current File",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": true
}
]
}

@ -1,8 +1,127 @@
from tokenize import String
from bs4 import BeautifulSoup
import abc
import textprocessing as tp #cuz tp is important
#requires Python 3.10
def extract_data_from_tr(tr) -> tuple[str, str, str]:
    """
    Extract the row label and the old/new values from one HTML data row.

    The record name is taken from the first <td> (the cell whose class is
    "rowLabel") and the data from the second <td>. The data is split into
    its old and its new version, and a copy of each is returned.
    Tag handling is meant to be delegated to ./textprocessing.py, which is
    kept separate because that functionality is important to test on its own.

    Planned return value: the triple (row_label, old, new).

    NOTE: not implemented yet. The body is a placeholder, so the function
    currently returns None regardless of the input.
    """
    # Planned steps:
    #   1. get the list of cells in the row
    #   2. for each cell: if its class is "rowLabel", extract the text;
    #      otherwise parse out the new and the old text
    #   3. return the triple (row_label, old, new)
    # TODO: implement the steps above.
    pass
#superclasses
class VersionData(abc.ABC):
    """
    Abstract base for the data extracted from one trial/version pair.

    It covers two kinds of data:
    - Data with a 1-to-1 relationship with the trial/version pair.
    - Data with a child relationship with the trial/version pair.

    Each subclass returns its 1-to-1 data so that another system can add it
    to the DB; this lets a single record be created in one go.
    Each subclass loads its child data into the database directly.
    """

    @abc.abstractmethod
    def version_fields(self):
        """
        Return the data that belongs in the standard table row for this
        version of the record.

        Open question from the original design: should this also return
        the column names?
        """

    @abc.abstractmethod
    def version_records(self, foreign_key, db_cursor):
        """
        Load the data that has to live in auxiliary tables into the database.

        For example, the list of sponsors needs to be tracked separately
        from the trial status.

        foreign_key and db_cursor are supplied by the caller; how they are
        used is up to each subclass.
        """
class StudyStatusData(VersionData):
    """
    Placeholder for the study-status values of one record version.

    NOTE(review): this class does not override the abstract methods declared
    on VersionData yet, so it is still work in progress.
    """

    # Field names handled by this class; they mirror the __init__ parameters.
    columns = [
        "primary_completion_date",
        "completion_date",
        "last_update_posted_date",
    ]

    def __init__(
        self,
        primary_completion_date,
        completion_date,
        last_update_posted_date,
    ) -> None:
        """Accept the three date values; nothing is stored yet."""
def extract_study_statuses(study_status_form, version_a, version_b):
    """
    Pull the data out of a study_status form.

    Intended to hand back one or two StudyStatusData objects.
    Not implemented yet: the function currently returns None.
    """
    return None
class SponsorCollaboratorsData(VersionData):
    """
    Placeholder for the sponsor/collaborator data of one record version.

    NOTE(review): no columns are defined and the abstract methods declared
    on VersionData are not overridden yet.
    """

    # No 1-to-1 columns have been defined for this form so far.
    columns = []

    def __init__(self) -> None:
        """Create the (currently empty) container."""
# Form ids that are recognised but for which no extraction exists yet.
_PENDING_FORM_IDS = frozenset({
    "form_SponsorCollaborators",
    "form_Oversight",
    "form_StudyDescription",
    "form_Conditions",
    "form_StudyDesign",
    "form_ArmsandInterventions",
    "form_ProtocolOutcomeMeasures",
    "form_Eligibility",
    "form_ContactsLocations",
    "form_IPDSharing",
    "form_References",
    "form_ParticipantFlow",
    "form_BaselineCharacteristics",
    "form_ROutcomeMeasures",
    "form_AdverseEvents",
    "form_LimitationsandCaveats",
    "form_MoreInformation",
})


def get_forms(soup):
    """
    Walk every <form> in the document body and dispatch on its id.

    soup must expose ``body.find_all("form")`` and each returned form must
    expose an ``attrs`` mapping (the script entry point passes a
    BeautifulSoup document).

    Current behaviour, which is still scaffolding:
    - forms without an "id" attribute are skipped;
    - "form_StudyStatus" prints a marker line;
    - the other known form ids are recognised but ignored for now;
    - any unknown id is printed so that it can be noticed and handled.

    Returns None. A document without a <body> is treated as having no forms.
    """
    body = soup.body
    if body is None:
        # Nothing to scan; avoids an AttributeError on body.find_all.
        return

    for form in body.find_all("form"):
        if "id" not in form.attrs:
            continue

        form_id = form.attrs["id"]
        if form_id == "form_StudyStatus":
            print("test successful 2")
        elif form_id in _PENDING_FORM_IDS:
            # Known form, extraction not written yet.
            continue
        else:
            # Surface ids we have not planned for.
            print(form_id)
if __name__ == "__main__":
    # Manual run: parse the local sample page, dump it, then list its forms.
    with open("./NCT00658567.html") as html_file:
        soup = BeautifulSoup(html_file, "lxml")
    print(soup)
    get_forms(soup)

@ -0,0 +1,62 @@
/*
Create schema history
CREATE TABLE history.versions
nct_id
version
--Study Status
overall_status
primary_completion_date
completion_date
last_update_submitted_date
--SponsorCollaborators
sponsor (multi?)
collaborators (multi?)
--Oversight
fda_regulated_drug (ignore)
fda_regulated_device (ignore)
dmc (ignore)
--StudyDescription
summary
detailed_description
--Conditions
Conditions
Keywords
--StudyDesign
Study type
Primary Purpose
Study Phase
Interventional Study Model
Number of Arms
Masking
Allocation
Enrollment
--ArmsAndInterventions
Arms (multiple) (Ignore)
--ProtocolOutcomeMeasures
--Eligibility
--ContactsLocation
--IPDSharing
--References
--ParticipantFlow
--BaselineCharacteristics
--ROutcomeMeasures
--AdverseEvents
--LimitationsAndCaveats
--More Information
CREATE TABLE history.collaborators
nct_id
version
collaborator_name
CREATE TABLE history.locations
nct_id
version
location name
location contact info
CREATE TABLE history.arms
*/

@ -0,0 +1,121 @@
from cgitb import html
import re
form = """
<tr>
<td colspan="2">
<form id="form_StudyStatus">
<div class="w3-responsive">
<fieldset class="entryReq" id="StudyStatus"
style="margin:auto;margin-bottom:1em;padding-bottom:0.5em;width:98%;">
<legend class="moduleLabel"> <img id="StudyStatusImg" class="toggleImage"
onclick="toggleModule('StudyStatus');" src="html/images/collapse.png"
alt='Open or close this module'>
Study Status</legend>
<div id="StudyStatusBody" class="moduleBody">
<table class="indent1 moduleTable resultTable">
<thead>
<tr>
<th style="width:210px;"></th>
<th></th>
</tr>
</thead>
<tbody>
<tr>
<td class="rowLabel" style="min-width: 210px;">Record Verification:</td>
<td>April <span class="drop_hilite">2008</span> <span class="add_hilite">2017</span> </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Overall Status:</td>
<td><span class="drop_hilite">Recruiting</span> <span class="add_hilite">Completed</span></td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Study Start:</td>
<td>March 2008 </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Primary Completion:</td>
<td> <span class="add_hilite">December 2009 [Actual]</span> </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Study Completion:</td>
<td>December 2009 [ <span class="drop_hilite">Anticipated</span> <span
class="add_hilite">Actual</span>] </td>
</tr>
<tr style="border-bottom:1px solid lightgray">
<td colspan="3"></td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">First Submitted:</td>
<td>April 10, 2008 </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">First Submitted that<br />Met QC Criteria:</td>
<td>April 10, 2008 </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">First Posted:</td>
<td>April 15, 2008 [Estimate] </td>
</tr>
<tr style="border-bottom:1px solid lightgray">
<td colspan="3"></td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Results First Submitted:</td>
<td> <span class="add_hilite">February 6, 2014</span> </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Results First Submitted that<br />Met QC
Criteria:</td>
<td> <span class="add_hilite">August 29, 2014</span> </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Results First Posted:</td>
<td> <span class="add_hilite">September 9, 2014 [Estimate]</span> </td>
</tr>
<tr style="border-bottom:1px solid lightgray">
<td colspan="3"></td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Last Update Submitted that<br />Met QC Criteria:
</td>
<td>April <span class="drop_hilite">10</span> <span class="add_hilite">18</span>, <span
class="drop_hilite">2008</span> <span class="add_hilite">2017</span> </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Last Update Posted:</td>
<td><span class="drop_hilite">April 15, 2008 [Estimate]</span> <span class="add_hilite">May 19,
2017 [Actual]</span> </td>
</tr>
</tbody>
</table>
</div>
</fieldset>
</div>
</form>
</td>
</tr>
"""
# Single-row sample: the "Record Verification" row of the form above, with one
# dropped ("drop_hilite") and one added ("add_hilite") value. Used as a small
# input for the regexes below.
entry1 = """
<tr>
<td class="rowLabel" style="min-width: 210px;">Record Verification:</td>
<td>April <span class="drop_hilite">2008</span> <span class="add_hilite">2017</span> </td>
</tr>
"""
# Remove a whole highlighted span (tags AND content) plus one trailing
# whitespace character.
#   - raw strings keep \s from being read as a string escape;
#   - \s+ between "span" and "class" also matches spans whose attributes were
#     wrapped onto the next line;
#   - [^<]* matches multi-word content such as "December 2009 [Actual]",
#     not just a single word.
drop_old_re = re.compile(r'<span\s+class="drop_hilite">[^<]*</span>\s?')
drop_new_re = re.compile(r'<span\s+class="add_hilite">[^<]*</span>\s?')

# Remove any remaining tag but keep its text content. Matching "anything up
# to the closing bracket" also covers tags whose attributes contain
# characters such as '-', '%', quotes or parentheses.
drop_tags_re = re.compile(r'<[^<>]+>')

# Row without the added values (old spans still tagged).
print(drop_new_re.sub("", entry1))
# Row without the dropped values (new spans still tagged).
print(drop_old_re.sub("", entry1))
# Row with every tag stripped (old and new text both kept).
print(drop_tags_re.sub("", entry1))
# Old version of the row as plain text.
print(drop_tags_re.sub("", drop_new_re.sub("", entry1)))
# Old version of the whole form as plain text.
print(drop_tags_re.sub("", drop_new_re.sub("", form)))
Loading…
Cancel
Save