extraction-lib as a functional module and the associated history file.

parser
youainti 4 years ago
parent a9027c9467
commit 453e82974e

@ -1,9 +1,8 @@
from collections import namedtuple from collections import namedtuple
from copy import copy from copy import copy
from datetime import datetime from datetime import datetime
from ensurepip import version
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import abc
import textprocessing as tp #cuz tp is important import textprocessing as tp #cuz tp is important
#requires Python 3.10 #requires Python 3.10
@ -50,6 +49,9 @@ class VersionData():
#self._responsible_party_category = None #I don't believe this is included in the raw data #self._responsible_party_category = None #I don't believe this is included in the raw data
#self._collaborators = None #currently going to ignore as I've not found it in AACT #self._collaborators = None #currently going to ignore as I've not found it in AACT
def load_to_db(db_cursor):
    """Placeholder for persisting extracted data; currently does nothing.

    Planned behaviour: load to the initial table first, then load any
    extra details into the other tables.

    NOTE(review): the signature has no ``self``. If this is meant to be a
    method of ``VersionData``, an instance call would bind the instance to
    ``db_cursor`` -- confirm the intended scope before implementing.
    """
    return None
def extract_study_statuses(study_status_form, version_a,version_b): def extract_study_statuses(study_status_form, version_a,version_b):
""" """
@ -66,21 +68,21 @@ def extract_study_statuses(study_status_form, version_a,version_b):
match tr_to_td(trow): match tr_to_td(trow):
case ["Primary Completion:" as row_label, tag]: case ["Primary Completion:" as row_label, tag]:
old,new = split_by_version(tag) old,new = split_by_version(tag)
tagdate1 = extract_date_and_tag(old.text,date_MMMM_YYYY) tagdate1 = extract_date_and_tag(old.text)
version_a._primary_completion_date = tagdate1.date version_a._primary_completion_date = tagdate1.date
version_a._primary_completion_date_category = tagdate1.tag version_a._primary_completion_date_category = tagdate1.tag
tagdate2 = extract_date_and_tag(new.text,date_MMMM_YYYY) tagdate2 = extract_date_and_tag(new.text)
version_b._primary_completion_date = tagdate2.date version_b._primary_completion_date = tagdate2.date
version_b._primary_completion_date_category = tagdate2.tag version_b._primary_completion_date_category = tagdate2.tag
case ["Study Completion:" as row_label, tag]: case ["Study Completion:" as row_label, tag]:
old,new = split_by_version(tag) old,new = split_by_version(tag)
tagdate1 = extract_date_and_tag(old.text,date_MMMM_YYYY) tagdate1 = extract_date_and_tag(old.text)
version_a._completion_date = tagdate1.date version_a._completion_date = tagdate1.date
version_a._completion_date_category = tagdate1.tag version_a._completion_date_category = tagdate1.tag
tagdate2 = extract_date_and_tag(new.text,date_MMMM_YYYY) tagdate2 = extract_date_and_tag(new.text)
version_b._completion_date = tagdate2.date version_b._completion_date = tagdate2.date
version_b._completion_date_category = tagdate2.tag version_b._completion_date_category = tagdate2.tag
@ -101,7 +103,6 @@ def extract_study_design(study_status_form, version_a,version_b):
#iterate through rows, #iterate through rows,
for trow in rows: for trow in rows:
#matching on rowLabels #matching on rowLabels
#print(trow.__str__()[:80])
match tr_to_td(trow): match tr_to_td(trow):
case ["Enrollment:" as row_label, tag]: case ["Enrollment:" as row_label, tag]:
old,new = split_by_version(tag) old,new = split_by_version(tag)
@ -125,7 +126,6 @@ def extract_sponsor_data(study_status_form, version_a,version_b):
#iterate through rows, #iterate through rows,
for trow in rows: for trow in rows:
#matching on rowLabels #matching on rowLabels
#print(trow.__str__()[:80])
match tr_to_td(trow): match tr_to_td(trow):
case ["Sponsor:" as row_label, tag]: case ["Sponsor:" as row_label, tag]:
old, new = split_by_version(tag) old, new = split_by_version(tag)
@ -157,16 +157,12 @@ def split_by_version(tag):
return old,new return old,new
def extract_date_and_tag(text, date_format): def extract_date_and_tag(text):
""" """
Extracts a datetype according to the date format Extracts a datetype according to the date format
and the estimate tag based on and the estimate tag based on
""" """
#FIX: Currently, there are multiple (mixed) date formats in use.
#These can exist in the same data field, in two different versions,
#so instead of using a single (passed) date format, I need to
#select between the various date formats.
text = text.strip() text = text.strip()
@ -179,7 +175,11 @@ def extract_date_and_tag(text, date_format):
estimate_tag = date_split[1].split("]")[0].strip() estimate_tag = date_split[1].split("]")[0].strip()
else: else:
estimate_tag = None estimate_tag = None
date_object = datetime.strptime(date_split[0].strip(), date_format)
try:
date_object = datetime.strptime(date_split[0].strip(), date_MMMM_YYYY)
except ValueError as ve:
date_object = datetime.strptime(date_split[0].strip(), date_MMMM_DD_YYYY)
return TagDatePair(estimate_tag, date_object) return TagDatePair(estimate_tag, date_object)
@ -274,14 +274,19 @@ def get_forms(soup,version_a,version_b):
date_MMMM_YYYY = "%B %Y" date_MMMM_YYYY = "%B %Y"
date_MMMM_DD_YYYY = "%B %d, %Y" date_MMMM_DD_YYYY = "%B %d, %Y"
def get_data_from_versions(nct_id,html, version_a_int, version_b_int):
    """Parse a comparison page and return a VersionData object per version.

    The markup in ``html`` is parsed with BeautifulSoup using the "lxml"
    parser. One ``VersionData`` is created for each of the two version
    numbers, both tagged with ``nct_id``, and the parsed page plus both
    objects are handed to ``get_forms``.

    Returns a 2-tuple ``(version_a, version_b)`` in the same order as the
    version numbers were passed.
    """
    parsed_page = BeautifulSoup(html, "lxml")
    snapshots = tuple(
        VersionData(nct_id, version_number)
        for version_number in (version_a_int, version_b_int)
    )
    # get_forms' return value is ignored; presumably it fills in both
    # objects in place -- verify against get_forms.
    get_forms(parsed_page, *snapshots)
    return snapshots
if __name__ == "__main__": if __name__ == "__main__":
for file in ["./NCT00658567.html", "./NCT01303796.html"]: for file in ["./NCT00658567.html", "./NCT01303796.html"]:
with open(file) as fh: with open(file) as fh:
soup = BeautifulSoup(fh, "lxml") version1, version2 = get_data_from_versions(file, fh.read(), 1,2)
version1 = VersionData("NCT00658567",1)
version2 = VersionData("NCT00658567",2)
get_forms(soup, version1, version2)
print(version1.__dict__) #order messed up somewhere:w print(version1.__dict__) #order messed up somewhere:w
print(version2.__dict__) #order messed up somewhere:w print(version2.__dict__) #order messed up somewhere:w

@ -60,3 +60,52 @@ CREATE TABLE history.locations
CREATE TABLE history.arms CREATE TABLE history.arms
*/ */
/*
Create the enum types and the trial_snapshots table in the history schema.
*/
-- Category labels that can be attached to a date or an enrollment figure
-- (see the COMMENT ON TYPE statement below).
-- NOTE(review): the type name is spelled "catetories" (presumably "categories").
-- The trial_snapshots columns below reference this exact spelling, so a rename
-- would have to change both places together.
-- NOTE(review): unlike the CREATE TABLE below, the CREATE TYPE statements have
-- no existence guard -- confirm this script is only meant to be run once.
CREATE TYPE history.updatable_catetories AS ENUM
('Actual', 'Anticipated', 'Expected');
ALTER TYPE history.updatable_catetories
OWNER TO root;
COMMENT ON TYPE history.updatable_catetories
IS 'This enum is used to capture the different types of categories that a date or enrollemnt figure may have.';
-- Allowed values for the overall_status column of history.trial_snapshots.
CREATE TYPE history.study_statuses AS ENUM
('Available', 'Withdrawn', 'Withheld', 'Temporarily not available', 'Active, not recruiting', 'Recruiting', 'Not yet recruiting', 'Enrolling by invitation', 'Suspended', 'No longer available', 'Approved for marketing', 'Unknown status', 'Completed', 'Terminated');
ALTER TYPE history.study_statuses
OWNER TO root;
-- Table: history.trial_snapshots
-- One row per (nct_id, version) pair, as enforced by the primary key.
-- Only nct_id and version are NOT NULL; every other column may be NULL.
-- DROP TABLE IF EXISTS history.trial_snapshots;
CREATE TABLE IF NOT EXISTS history.trial_snapshots
(
nct_id character varying(15) COLLATE pg_catalog."default" NOT NULL,
version integer NOT NULL,
-- each date and the enrollment figure is paired with a *_category column
primary_completion_date timestamp without time zone,
primary_completion_date_category history.updatable_catetories,
completion_date timestamp without time zone,
completion_date_category history.updatable_catetories,
overall_status history.study_statuses,
enrollment integer,
enrollment_category history.updatable_catetories,
sponsor character varying(255) COLLATE pg_catalog."default",
responsible_party character varying(255) COLLATE pg_catalog."default",
CONSTRAINT trial_snapshots_pkey PRIMARY KEY (nct_id, version)
)
TABLESPACE pg_default;
ALTER TABLE IF EXISTS history.trial_snapshots
OWNER to root;
Loading…
Cancel
Save