extraction-lib as a functional module and the associated history file.

parser
youainti 4 years ago
parent a9027c9467
commit 453e82974e

@ -1,9 +1,8 @@
from collections import namedtuple
from copy import copy
from datetime import datetime
from ensurepip import version
from bs4 import BeautifulSoup
import abc
import textprocessing as tp #cuz tp is important
#requires Python 3.10
@ -50,6 +49,9 @@ class VersionData():
#self._responsible_party_category = None #I don't believe this is included in the raw data
#self._collaborators = None #currently going to ignore as I've not found it in AACT
def load_to_db(db_cursor):
    """Placeholder for loading this data into the database; not implemented yet.

    NOTE(review): the enclosing scope is not visible in this excerpt. If this
    is meant to be a method of VersionData, it is missing a ``self``
    parameter -- confirm against the full file.
    """
    #load to initial table, then load any extra details into other tables
    pass
def extract_study_statuses(study_status_form, version_a,version_b):
"""
@ -66,21 +68,21 @@ def extract_study_statuses(study_status_form, version_a,version_b):
match tr_to_td(trow):
case ["Primary Completion:" as row_label, tag]:
old,new = split_by_version(tag)
tagdate1 = extract_date_and_tag(old.text,date_MMMM_YYYY)
tagdate1 = extract_date_and_tag(old.text)
version_a._primary_completion_date = tagdate1.date
version_a._primary_completion_date_category = tagdate1.tag
tagdate2 = extract_date_and_tag(new.text,date_MMMM_YYYY)
tagdate2 = extract_date_and_tag(new.text)
version_b._primary_completion_date = tagdate2.date
version_b._primary_completion_date_category = tagdate2.tag
case ["Study Completion:" as row_label, tag]:
old,new = split_by_version(tag)
tagdate1 = extract_date_and_tag(old.text,date_MMMM_YYYY)
tagdate1 = extract_date_and_tag(old.text)
version_a._completion_date = tagdate1.date
version_a._completion_date_category = tagdate1.tag
tagdate2 = extract_date_and_tag(new.text,date_MMMM_YYYY)
tagdate2 = extract_date_and_tag(new.text)
version_b._completion_date = tagdate2.date
version_b._completion_date_category = tagdate2.tag
@ -101,7 +103,6 @@ def extract_study_design(study_status_form, version_a,version_b):
#iterate through rows,
for trow in rows:
#matching on rowLabels
#print(trow.__str__()[:80])
match tr_to_td(trow):
case ["Enrollment:" as row_label, tag]:
old,new = split_by_version(tag)
@ -125,7 +126,6 @@ def extract_sponsor_data(study_status_form, version_a,version_b):
#iterate through rows,
for trow in rows:
#matching on rowLabels
#print(trow.__str__()[:80])
match tr_to_td(trow):
case ["Sponsor:" as row_label, tag]:
old, new = split_by_version(tag)
@ -157,16 +157,12 @@ def split_by_version(tag):
return old,new
def extract_date_and_tag(text, date_format):
def extract_date_and_tag(text):
"""
Extracts a datetype according to the date format
and the estimate tag based on
"""
#FIX: Currently, there are multiple (mixed) data formats in use
#These can exist in the same data field, in two different versions
#so instead of using a single (passed) data format, I need to
#select between various data formats.
text = text.strip()
@ -179,8 +175,12 @@ def extract_date_and_tag(text, date_format):
estimate_tag = date_split[1].split("]")[0].strip()
else:
estimate_tag = None
date_object = datetime.strptime(date_split[0].strip(), date_format)
try:
date_object = datetime.strptime(date_split[0].strip(), date_MMMM_YYYY)
except ValueError as ve:
date_object = datetime.strptime(date_split[0].strip(), date_MMMM_DD_YYYY)
return TagDatePair(estimate_tag, date_object)
@ -274,14 +274,19 @@ def get_forms(soup,version_a,version_b):
date_MMMM_YYYY = "%B %Y"
date_MMMM_DD_YYYY = "%B %d, %Y"
def get_data_from_versions(nct_id, html, version_a_int, version_b_int):
    """Parse a history-comparison page and return the two populated versions.

    The markup in ``html`` is parsed with the lxml backend, one VersionData is
    created per requested version number (both sharing ``nct_id``), and
    ``get_forms`` fills them in from the parsed page.

    Returns a ``(version_a, version_b)`` tuple.
    """
    parsed_page = BeautifulSoup(html, "lxml")
    snapshots = tuple(
        VersionData(nct_id, version_number)
        for version_number in (version_a_int, version_b_int)
    )
    get_forms(parsed_page, *snapshots)
    return snapshots
if __name__ == "__main__":
    # Ad-hoc smoke test: parse two saved history pages and dump what was extracted.
    for file in ["./NCT00658567.html", "./NCT01303796.html"]:
        # Derive the NCT ID from the file name instead of hard-coding one ID
        # for every file (or passing the path itself as the ID).
        nct_id = file.rsplit("/", 1)[-1].removesuffix(".html")
        # Read the file exactly once; parsing straight from the handle and then
        # calling fh.read() again would hand an empty string to the parser.
        with open(file) as fh:
            html = fh.read()
        version1, version2 = get_data_from_versions(nct_id, html, 1, 2)
        print(version1.__dict__) #order messed up somewhere
        print(version2.__dict__) #order messed up somewhere

@ -59,4 +59,53 @@ CREATE TABLE history.locations
location contact info
CREATE TABLE history.arms
*/
*/
/*
Create the enum types and the trial_snapshots table used by the history schema
*/
-- Enum of the qualifiers that can accompany a date or an enrollment figure.
-- NOTE: the type name is misspelled ("catetories"). It is kept unchanged
-- because history.trial_snapshots references the type by this exact name.
CREATE TYPE history.updatable_catetories AS ENUM
    ('Actual', 'Anticipated', 'Expected');

ALTER TYPE history.updatable_catetories
    OWNER TO root;

-- Catalog description shown by psql / pgAdmin (spelling of "enrollment" fixed).
COMMENT ON TYPE history.updatable_catetories
    IS 'This enum is used to capture the different types of categories that a date or enrollment figure may have.';
-- Enum of the values allowed in history.trial_snapshots.overall_status.
-- Enum labels are case- and whitespace-sensitive, and their declaration order
-- defines the sort order of the type, so do not reorder or re-case them.
CREATE TYPE history.study_statuses AS ENUM
('Available', 'Withdrawn', 'Withheld', 'Temporarily not available', 'Active, not recruiting', 'Recruiting', 'Not yet recruiting', 'Enrolling by invitation', 'Suspended', 'No longer available', 'Approved for marketing', 'Unknown status', 'Completed', 'Terminated');
-- Hand ownership of the type to the root role.
ALTER TYPE history.study_statuses
OWNER TO root;
-- Table: history.trial_snapshots
-- Holds one row per (nct_id, version) pair, as enforced by the primary key.
-- NOTE(review): the columns appear to mirror the attributes set on the Python
-- VersionData objects (dates, categories, enrollment, sponsor) -- confirm
-- against the loader once load_to_db is implemented.
-- DROP TABLE IF EXISTS history.trial_snapshots;
CREATE TABLE IF NOT EXISTS history.trial_snapshots
(
-- Trial identifier and the version number of the snapshot.
nct_id character varying(15) COLLATE pg_catalog."default" NOT NULL,
version integer NOT NULL,
-- Each date is paired with a category column typed as updatable_catetories.
primary_completion_date timestamp without time zone,
primary_completion_date_category history.updatable_catetories,
completion_date timestamp without time zone,
completion_date_category history.updatable_catetories,
-- Restricted to the labels of the history.study_statuses enum.
overall_status history.study_statuses,
-- Enrollment figure and its category.
enrollment integer,
enrollment_category history.updatable_catetories,
sponsor character varying(255) COLLATE pg_catalog."default",
responsible_party character varying(255) COLLATE pg_catalog."default",
CONSTRAINT trial_snapshots_pkey PRIMARY KEY (nct_id, version)
)
TABLESPACE pg_default;
-- Hand ownership of the table to the root role.
ALTER TABLE IF EXISTS history.trial_snapshots
OWNER to root;
Loading…
Cancel
Save