Added feature to extract start_date. Deleted unused files and modified others for ease of use.

parser
youainti 4 years ago
parent 453e82974e
commit 9d5a726494

File diff suppressed because it is too large Load Diff

@ -1,9 +1,9 @@
from collections import namedtuple from collections import namedtuple
from copy import copy from copy import copy
from datetime import datetime from datetime import datetime
import psycopg2
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import textprocessing as tp #cuz tp is important #import textprocessing as tp #cuz tp is important
#requires Python 3.10 #requires Python 3.10
@ -28,12 +28,14 @@ class VersionData():
""" """
def __init__(self,nct_id,version_id): def __init__(self,nct_id,version_id):
#identifiers #identifiers
self.nct_id = nct_id self.nct_id = nct_id.strip()
self.version_id = version_id self.version_id = version_id
#Study Status #Study Status
self._primary_completion_date = None self._primary_completion_date = None
self._primary_completion_date_category = None self._primary_completion_date_category = None
self._start_date = None
self._start_date_category = None
self._completion_date = None self._completion_date = None
self._completion_date_category = None self._completion_date_category = None
self._overall_status = None self._overall_status = None
@ -49,9 +51,73 @@ class VersionData():
#self._responsible_party_category = None #I don't believe this is included in the raw data #self._responsible_party_category = None #I don't believe this is included in the raw data
#self._collaborators = None #currently going to ignore as I've not found it in AACT #self._collaborators = None #currently going to ignore as I've not found it in AACT
def load_to_db(db_cursor): def load_to_db(self,db_connection):
#load to initial table, then load any extra details into other tables #load to initial table, then load any extra details into other tables
pass sql = """
INSERT INTO history.trial_snapshots
(
nct_id,
version,
primary_completion_date,
primary_completion_date_category,
start_date,
start_date_category,
completion_date,
completion_date_category,
overall_status,
enrollment,
enrollment_category,
sponsor,
responsible_party
)
VALUES
(
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s,
%s
)
"""
with db_connection.cursor() as db_cursor:
try:
db_cursor.execute(
sql,
(
self.nct_id,
self.version_id,
self._primary_completion_date,
self._primary_completion_date_category,
self._start_date,
self._start_date_category,
self._completion_date,
self._completion_date_category,
self._overall_status,
self._enrollment,
self._enrollment_category,
self._sponsor,
self._responsible_party
)
)
except Exception as err:
#catch any error, print the applicable information, and raise the error.
print(self)
raise err
def optional_strip(possible_string):
    """Return ``possible_string`` with surrounding whitespace removed.

    Values that are not strings (``None``, numbers, dates, ...) are returned
    unchanged, so optional fields can be passed straight through.
    """
    # isinstance rather than ``type(...) == str`` so that str subclasses
    # are stripped as well instead of silently passing through untouched.
    if isinstance(possible_string, str):
        return possible_string.strip()
    return possible_string
def extract_study_statuses(study_status_form, version_a,version_b): def extract_study_statuses(study_status_form, version_a,version_b):
""" """
@ -70,26 +136,36 @@ def extract_study_statuses(study_status_form, version_a,version_b):
old,new = split_by_version(tag) old,new = split_by_version(tag)
tagdate1 = extract_date_and_tag(old.text) tagdate1 = extract_date_and_tag(old.text)
version_a._primary_completion_date = tagdate1.date version_a._primary_completion_date = tagdate1.date
version_a._primary_completion_date_category = tagdate1.tag version_a._primary_completion_date_category = optional_strip(tagdate1.tag)
tagdate2 = extract_date_and_tag(new.text) tagdate2 = extract_date_and_tag(new.text)
version_b._primary_completion_date = tagdate2.date version_b._primary_completion_date = tagdate2.date
version_b._primary_completion_date_category = tagdate2.tag version_b._primary_completion_date_category = optional_strip(tagdate2.tag)
case ["Study Start:" as row_label, tag]:
old,new = split_by_version(tag)
tagdate1 = extract_date_and_tag(old.text)
version_a._start_date = tagdate1.date
version_a._start_date_category = optional_strip(tagdate1.tag)
tagdate2 = extract_date_and_tag(new.text)
version_b._start_date = tagdate2.date
version_b._start_date_category = optional_strip(tagdate2.tag)
case ["Study Completion:" as row_label, tag]: case ["Study Completion:" as row_label, tag]:
old,new = split_by_version(tag) old,new = split_by_version(tag)
tagdate1 = extract_date_and_tag(old.text) tagdate1 = extract_date_and_tag(old.text)
version_a._completion_date = tagdate1.date version_a._completion_date = tagdate1.date
version_a._completion_date_category = tagdate1.tag version_a._completion_date_category = optional_strip(tagdate1.tag)
tagdate2 = extract_date_and_tag(new.text) tagdate2 = extract_date_and_tag(new.text)
version_b._completion_date = tagdate2.date version_b._completion_date = tagdate2.date
version_b._completion_date_category = tagdate2.tag version_b._completion_date_category = optional_strip(tagdate2.tag)
case ["Overall Status:" as row_label, tag]: case ["Overall Status:" as row_label, tag]:
old,new = split_by_version(tag) old,new = split_by_version(tag)
version_a._overall_status = old.text version_a._overall_status = optional_strip(old.text)
version_b._overall_status = new.text version_b._overall_status = optional_strip(new.text)
def extract_study_design(study_status_form, version_a,version_b): def extract_study_design(study_status_form, version_a,version_b):
@ -108,11 +184,11 @@ def extract_study_design(study_status_form, version_a,version_b):
old,new = split_by_version(tag) old,new = split_by_version(tag)
tagdate1 = extract_text_and_tag(old.text) tagdate1 = extract_text_and_tag(old.text)
version_a._enrollment = tagdate1.text version_a._enrollment = tagdate1.text
version_a._enrollment_category = tagdate1.tag version_a._enrollment_category = optional_strip(tagdate1.tag)
tagdate2 = extract_text_and_tag(new.text) tagdate2 = extract_text_and_tag(new.text)
version_b._enrollment = tagdate2.text version_b._enrollment = tagdate2.text
version_b._enrollment_category = tagdate2.tag version_b._enrollment_category = optional_strip(tagdate2.tag)
def extract_sponsor_data(study_status_form, version_a,version_b): def extract_sponsor_data(study_status_form, version_a,version_b):
@ -129,13 +205,13 @@ def extract_sponsor_data(study_status_form, version_a,version_b):
match tr_to_td(trow): match tr_to_td(trow):
case ["Sponsor:" as row_label, tag]: case ["Sponsor:" as row_label, tag]:
old, new = split_by_version(tag) old, new = split_by_version(tag)
version_a._sponsor = old.text version_a._sponsor = optional_strip(old.text)
version_b._sponsor = new.text version_b._sponsor = optional_strip(new.text)
case ["Responsible Party:" as row_label, tag]: case ["Responsible Party:" as row_label, tag]:
old, new = split_by_version(tag) old, new = split_by_version(tag)
version_a._responsible_party = old.text version_a._responsible_party = optional_strip(old.text)
version_b._responsible_party = new.text version_b._responsible_party = optional_strip(new.text)
case ["Collaborators:" as row_label, tag]: case ["Collaborators:" as row_label, tag]:
#old, new = split_by_version(tag) #old, new = split_by_version(tag)
@ -285,8 +361,14 @@ def get_data_from_versions(nct_id,html, version_a_int, version_b_int):
if __name__ == "__main__": if __name__ == "__main__":
for file in ["./NCT00658567.html", "./NCT01303796.html"]: for file in ["NCT00658567", "NCT01303796"]:
with open(file) as fh: with open("./{}.html".format(file)) as fh:
version1, version2 = get_data_from_versions(file, fh.read(), 1,2) version1, version2 = get_data_from_versions(file, fh.read(), 1,2)
print(version1.__dict__) #order messed up somewhere:w print(version1.__dict__) #order messed up somewhere:w
print(version2.__dict__) #order messed up somewhere:w print(version2.__dict__) #order messed up somewhere:w
with psycopg2.connect(dbname="aact_db", user="root", password="root",host="will-office") as db_connection:
version1.load_to_db(db_connection)
version2.load_to_db(db_connection)
#print(db_connection)

@ -87,7 +87,7 @@ ALTER TYPE history.study_statuses
-- Table: history.trial_snapshots -- Table: history.trial_snapshots
-- DROP TABLE IF EXISTS history.trial_snapshots; DROP TABLE IF EXISTS history.trial_snapshots;
CREATE TABLE IF NOT EXISTS history.trial_snapshots CREATE TABLE IF NOT EXISTS history.trial_snapshots
( (
@ -95,6 +95,8 @@ CREATE TABLE IF NOT EXISTS history.trial_snapshots
version integer NOT NULL, version integer NOT NULL,
primary_completion_date timestamp without time zone, primary_completion_date timestamp without time zone,
primary_completion_date_category history.updatable_catetories, primary_completion_date_category history.updatable_catetories,
start_date timestamp without time zone,
start_date_category history.updatable_catetories,
completion_date timestamp without time zone, completion_date timestamp without time zone,
completion_date_category history.updatable_catetories, completion_date_category history.updatable_catetories,
overall_status history.study_statuses, overall_status history.study_statuses,
@ -103,7 +105,7 @@ CREATE TABLE IF NOT EXISTS history.trial_snapshots
sponsor character varying(255) COLLATE pg_catalog."default", sponsor character varying(255) COLLATE pg_catalog."default",
responsible_party character varying(255) COLLATE pg_catalog."default", responsible_party character varying(255) COLLATE pg_catalog."default",
CONSTRAINT trial_snapshots_pkey PRIMARY KEY (nct_id, version) CONSTRAINT trial_snapshots_pkey PRIMARY KEY (nct_id, version)
) );
TABLESPACE pg_default; TABLESPACE pg_default;

@ -1,184 +0,0 @@
from copy import copy
from datetime import datetime
from bs4 import BeautifulSoup
import re
# Sample HTML fixture: a table row wrapping the "form_StudyStatus" form.
# Changed values are marked with "drop_hilite" and "add_hilite" spans.
# Parsed by the __main__ block of this module.
form = """
<tr>
<td colspan="2">
<form id="form_StudyStatus">
<div class="w3-responsive">
<fieldset class="entryReq" id="StudyStatus"
style="margin:auto;margin-bottom:1em;padding-bottom:0.5em;width:98%;">
<legend class="moduleLabel"> <img id="StudyStatusImg" class="toggleImage"
onclick="toggleModule('StudyStatus');" src="html/images/collapse.png"
alt='Open or close this module'>
Study Status</legend>
<div id="StudyStatusBody" class="moduleBody">
<table class="indent1 moduleTable resultTable">
<thead>
<tr>
<th style="width:210px;"></th>
<th></th>
</tr>
</thead>
<tbody>
<tr>
<td class="rowLabel" style="min-width: 210px;">Record Verification:</td>
<td>April <span class="drop_hilite">2008</span> <span class="add_hilite">2017</span> </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Overall Status:</td>
<td><span class="drop_hilite">Recruiting</span> <span class="add_hilite">Completed</span></td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Study Start:</td>
<td>March 2008 </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Primary Completion:</td>
<td> <span class="add_hilite">December 2009 [Actual]</span> </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Study Completion:</td>
<td>December 2009 [ <span class="drop_hilite">Anticipated</span> <span
class="add_hilite">Actual</span>] </td>
</tr>
<tr style="border-bottom:1px solid lightgray">
<td colspan="3"></td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">First Submitted:</td>
<td>April 10, 2008 </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">First Submitted that<br />Met QC Criteria:</td>
<td>April 10, 2008 </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">First Posted:</td>
<td>April 15, 2008 [Estimate] </td>
</tr>
<tr style="border-bottom:1px solid lightgray">
<td colspan="3"></td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Results First Submitted:</td>
<td> <span class="add_hilite">February 6, 2014</span> </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Results First Submitted that<br />Met QC
Criteria:</td>
<td> <span class="add_hilite">August 29, 2014</span> </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Results First Posted:</td>
<td> <span class="add_hilite">September 9, 2014 [Estimate]</span> </td>
</tr>
<tr style="border-bottom:1px solid lightgray">
<td colspan="3"></td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Last Update Submitted that<br />Met QC Criteria:
</td>
<td>April <span class="drop_hilite">10</span> <span class="add_hilite">18</span>, <span
class="drop_hilite">2008</span> <span class="add_hilite">2017</span> </td>
</tr>
<tr>
<td class="rowLabel" style="min-width: 210px;">Last Update Posted:</td>
<td><span class="drop_hilite">April 15, 2008 [Estimate]</span> <span class="add_hilite">May 19,
2017 [Actual]</span> </td>
</tr>
</tbody>
</table>
</div>
</fieldset>
</div>
</form>
</td>
</tr>
"""
# Single-row fixture: a label cell followed by a value cell that contains
# one drop_hilite span and one add_hilite span.
entry1 = """
<tr>
<td class="rowLabel" style="min-width: 210px;">Record Verification:</td>
<td>April <span class="drop_hilite">2008</span> <span class="add_hilite">2017</span> </td>
</tr>
"""
# Single-cell fixture whose whole value sits inside one add_hilite span.
entry2 = '<td> <span class="add_hilite">December 2009 [Actual]</span> </td>'
# Raw strings are used so that \[, \w and \s reach the regex engine without
# relying on Python's handling of unknown string escapes (a SyntaxWarning on
# Python 3.12+). The compiled patterns are unchanged.

# A drop_hilite span whose content is only word characters and square
# brackets, plus at most one trailing whitespace character.
DROP_HILITE_re = re.compile(r'<span class="drop_hilite">[\[\]\w]*</span>\s?')
# An add_hilite span whose content is only word characters, plus at most one
# trailing whitespace character.
ADD_HILITE_re = re.compile(r'<span class="add_hilite">\w*</span>\s?')
# Any "<...>" run made of the listed characters.
# NOTE(review): "=-_" inside the class is a character RANGE ('=' through '_'),
# so it admits '>', '?', '@', '[', '\', ']' and '^' but NOT a literal '-'.
# A literal hyphen was probably intended -- confirm before changing.
TAGS_RE = re.compile(r'<[=-_,.:;"/\w\s]+>')
def extract_new_data(td):
    """Return the text of ``td`` with dropped (old-version) spans removed.

    ``td`` is rendered to a string, every ``DROP_HILITE_re`` match is replaced
    by a single space, every ``TAGS_RE`` match is deleted, and the result is
    stripped of surrounding whitespace.
    """
    markup = str(td)
    without_dropped = DROP_HILITE_re.sub(" ", markup)
    untagged = TAGS_RE.sub("", without_dropped)
    return untagged.strip()
def extract_old_data(td):
    """Return the text of ``td`` with added (new-version) spans removed.

    ``td`` is rendered to a string, every ``ADD_HILITE_re`` match is replaced
    by a single space, every ``TAGS_RE`` match is deleted, and the result is
    stripped of surrounding whitespace.
    """
    markup = str(td)
    without_added = ADD_HILITE_re.sub(" ", markup)
    untagged = TAGS_RE.sub("", without_added)
    return untagged.strip()
def delete_tags(td):
    """Return ``td`` rendered to a string with each ``TAGS_RE`` match replaced by a space.

    Surrounding whitespace is stripped from the result.
    """
    markup = str(td)
    spaced = TAGS_RE.sub(" ", markup)
    return spaced.strip()
def extract_date_and_tag(text, date_format):
    """Split a date string into its bracketed estimate tag and its date.

    Args:
        text: String such as ``"December 2009 [Actual]"``; the bracketed tag
            is optional.
        date_format: ``datetime.strptime`` format describing the part of
            ``text`` that precedes the first ``[``.

    Returns:
        A ``(estimate_tag, date_object)`` tuple. ``estimate_tag`` is the
        stripped text that follows the first ``[`` up to the next ``]``, or
        ``None`` when ``text`` has no ``[``. When ``text`` is empty, ``None``
        or only whitespace, both elements are ``None``.

    Raises:
        ValueError: if the date part does not match ``date_format``.
    """
    if not text or not text.strip():
        # Always hand back a 2-tuple so callers can unpack the result
        # without special-casing blank cells.
        return None, None

    pieces = text.split("[")
    if len(pieces) > 1:
        estimate_tag = pieces[1].split("]")[0].strip()
    else:
        estimate_tag = None
    date_object = datetime.strptime(pieces[0].strip(), date_format)
    return estimate_tag, date_object
#TODO: Write test
def extract_text_and_tag(text):
    """Placeholder with no implementation yet: ``text`` is ignored and ``None`` is returned."""
    return None
if __name__ == "__main__":
# Manual smoke run: parse the sample fixtures and print what the helpers return.
Entry = BeautifulSoup(entry1, "lxml")
Form = BeautifulSoup(form, "lxml")
# The second <td> of entry1 is the value cell; the first holds the row label.
print(extract_new_data(Entry.find_all("td")[1]))
print(extract_old_data(Entry.find_all("td")[1]))
# For every <tr> in the form, print according to how many <td> cells it has;
# with two or more cells, the new and old text of the second cell is shown.
for tr in Form.find_all("tr"):
data = tr.find_all("td")
match len(data):
case 0: print("no data")
case 1: print("1\t",data[0])
case _: print(len(data), "\t", extract_new_data(data[1]) ,"\t|\t", extract_old_data(data[1]))
#print(extract_date_and_tag(extract_old_data(Entry.find_all("td")[1]), "%B %Y"))
print(extract_date_and_tag("April 2008 [ test ]", "%B %Y"))
# entry2 is a single cell whose whole value sits inside an add_hilite span.
Entry2 = BeautifulSoup(entry2,"lxml")
print(extract_old_data(Entry2)) #error here.
print(extract_new_data(Entry2))
# NOTE(review): presumably copied so the extract() call below leaves Entry2 intact -- confirm.
Entry3 = copy(Entry2)
print(Entry3)
# Take the first add_hilite element out of Entry3, then print what is left
# in Entry3 and the text of the element that was taken out.
Entry4 = Entry3.find_all(class_="add_hilite")[0].extract()
print(Entry3.text)
print(Entry4.text)
Loading…
Cancel
Save