Added history schema, got extraction_lib to read and write from db

llm-extraction
youainti 3 years ago
parent 4ae3064bf2
commit ee3e37e834

@ -19,8 +19,98 @@ GRANT USAGE ON SCHEMA history TO history_reader;
GRANT SELECT ON ALL TABLES IN SCHEMA http TO history_reader;
/* History Tables
Below is where I would construct the parsed trial history tables that I need.
Possible fields
nct_id
version
--Study Status
overall_status^
primary_completion_date^
completion_date^
last_update_submitted_date
--SponsorCollaborators
sponsor (multi?)
collaborators (multi?)
--Oversight
fda_regulated_drug (ignore)
fda_regulated_device (ignore)
dmc (ignore)
--StudyDescription
summary
detailed_description
--Conditions
Conditions
Keywords
--StudyDesign
Study type
Primary Purpose
Study Phase
Interventional Study Model
Number of Arms
Masking
Allocation
Enrollment ^
--ArmsAndInterventions
Arms (multiple) (Ignore)
--ProtocolOutcomeMeasures
--Eligibility
--ContactsLocation
--IPDSharing
--References
--ParticipantFlow
--BaselineCharacteristics
--ROutcomeMeasures
--AdverseEvents
--LimitationsAndCaveats
--More Information
*/
-- Enum types used by the columns of history.trial_snapshots.
--
-- PostgreSQL has no CREATE TYPE ... IF NOT EXISTS, so each CREATE TYPE is
-- wrapped in a DO block that ignores duplicate_object. This keeps the script
-- re-runnable, matching the table section, which already drops and recreates
-- history.trial_snapshots on every run.
-- NOTE: an already-existing type is left untouched, so an edited label list is
-- NOT picked up on re-run; use ALTER TYPE ... ADD VALUE for that.

-- Qualifier attached to a date or an enrollment figure.
-- The type name keeps its original spelling because the table columns reference it.
DO $$
BEGIN
    CREATE TYPE history.updatable_catetories AS ENUM
        ('Actual', 'Anticipated', 'Expected');
EXCEPTION
    WHEN duplicate_object THEN
        NULL; -- type already exists: nothing to do
END
$$;

ALTER TYPE history.updatable_catetories
    OWNER TO root;

COMMENT ON TYPE history.updatable_catetories
    IS 'This enum is used to capture the different types of categories that a date or enrollemnt figure may have.';

-- Overall status of a study.
DO $$
BEGIN
    CREATE TYPE history.study_statuses AS ENUM
        ('Available', 'Withdrawn', 'Withheld', 'Temporarily not available', 'Active, not recruiting', 'Recruiting', 'Not yet recruiting', 'Enrolling by invitation', 'Suspended', 'No longer available', 'Approved for marketing', 'Unknown status', 'Completed', 'Terminated');
EXCEPTION
    WHEN duplicate_object THEN
        NULL; -- type already exists: nothing to do
END
$$;

ALTER TYPE history.study_statuses
    OWNER TO root;

COMMENT ON TYPE history.study_statuses
    IS 'This enum is used to record study status. These are pulled from the ClinicalTrials.gov documentation.';
-- Table: history.trial_snapshots
-- One row per (nct_id, version) pair, enforced by the primary key.
-- The table is dropped and rebuilt each time this script runs, so any rows
-- already stored in it are discarded.
DROP TABLE IF EXISTS history.trial_snapshots;

CREATE TABLE IF NOT EXISTS history.trial_snapshots
(
    -- Snapshot identity
    nct_id                           varchar(15) COLLATE pg_catalog."default" NOT NULL,
    version                          integer NOT NULL,

    -- Dates, each paired with its history.updatable_catetories qualifier
    primary_completion_date          timestamp,
    primary_completion_date_category history.updatable_catetories,
    start_date                       timestamp,
    start_date_category              history.updatable_catetories,
    completion_date                  timestamp,
    completion_date_category         history.updatable_catetories,

    -- Status and enrollment
    overall_status                   history.study_statuses,
    enrollment                       integer,
    enrollment_category              history.updatable_catetories,

    -- Sponsor information
    sponsor                          varchar(255) COLLATE pg_catalog."default",
    responsible_party                varchar(255) COLLATE pg_catalog."default",

    CONSTRAINT trial_snapshots_pkey PRIMARY KEY (nct_id, version)
);

ALTER TABLE IF EXISTS history.trial_snapshots
    OWNER TO root;

@ -108,6 +108,8 @@ class VersionData():
#catch any error, print the applicable information, and raise the error.
print(self)
raise err
db_connection.commit()
def optional_strip(possible_string):
if type(possible_string) == str:
@ -126,7 +128,6 @@ def extract_study_statuses(study_status_form, version_a,version_b):
#iterate through rows,
for trow in rows:
#matching on rowLabels
#print(trow.__str__()[:80])
match tr_to_td(trow):
case ["Primary Completion:" as row_label, tag]:
old,new = split_by_version(tag)
@ -178,13 +179,15 @@ def extract_study_design(study_status_form, version_a,version_b):
match tr_to_td(trow):
case ["Enrollment:" as row_label, tag]:
old,new = split_by_version(tag)
tagdate1 = extract_text_and_tag(old.text)
version_a._enrollment = tagdate1.text
version_a._enrollment_category = optional_strip(tagdate1.tag)
#Extract tag and text, add them to preallocated object
tagtext1 = extract_text_and_tag(old.text)
version_a._enrollment = tagtext1.text
version_a._enrollment_category = optional_strip(tagtext1.tag)
tagdate2 = extract_text_and_tag(new.text)
version_b._enrollment = tagdate2.text
version_b._enrollment_category = optional_strip(tagdate2.tag)
tagtext2 = extract_text_and_tag(new.text)
version_b._enrollment = tagtext2.text
version_b._enrollment_category = optional_strip(tagtext2.tag)
def extract_sponsor_data(study_status_form, version_a,version_b):
@ -301,6 +304,8 @@ def get_forms(soup,version_a,version_b):
if not "id" in form.attrs:
continue
#for each type of form (identified by the ID field)
# extract and add the data to the preallocated objects
match form.attrs["id"]:
case "form_StudyStatus":
extract_study_statuses(form,version_a,version_b)
@ -348,8 +353,12 @@ date_MMMM_DD_YYYY = "%B %d, %Y"
def get_data_from_versions(nct_id,html, version_a_int, version_b_int):
    """
    Parse one page of html and return the extracted data for both versions.

    nct_id: identifier passed to each VersionData object
    html: markup handed to BeautifulSoup using the "lxml" parser
    version_a_int, version_b_int: version numbers passed to VersionData

    Returns a (version_a, version_b) tuple of VersionData objects, after
    get_forms has been run over the parsed page with both of them.
    """
    parsed_page = BeautifulSoup(html, "lxml")

    #one preallocated VersionData per version; get_forms puts the extracted data in them
    first_version = VersionData(nct_id, version_a_int)
    second_version = VersionData(nct_id, version_b_int)
    get_forms(parsed_page, first_version, second_version)

    return first_version, second_version
@ -357,25 +366,31 @@ def get_data_from_versions(nct_id,html, version_a_int, version_b_int):
if __name__ == "__main__":
with psycopg2.connect(dbname="aact_db", user="root", password="root",host="localhost") as db_connection:
with psycopg2.connect(dbname="aact_db", user="root", password="root",host="will-office") as db_connection:
#pull the requests from the db
with db_connection.cursor() as curse:
sql = """
SELECT nct_id, version_a,version_b, html
FROM http.responses
WHERE response_code = 200
"""
responses = curse.execute(sql)
for response in responses.fetch_all():
curse.execute(sql)
for response in curse.fetchall():
#
nct_id, version_a, version_b, html = response
print(nct_id)
version1, version2 = get_data_from_versions(nct_id, html, version_a, version_b)
print(version1.nct_id)
print(version2._enrollment)
if version_b == version_a + 1:
version1.load_to_db(db_connection)
version2.load_to_db(db_connection)
else:
version2.load_to_db(db_connection)
if version_b == version_a + 1:
version1.load_to_db(db_connection)
version2.load_to_db(db_connection)
else:
version2.load_to_db(db_connection)
exit(1)
"""
@ -402,4 +417,4 @@ TO add a new field to extraction-lib
- splitting into old and new versions
- Extracting the data for both old and new
- add the data to the passed VersionData objects
"""
"""

@ -1,113 +0,0 @@
/*
Create schema history
CREATE TABLE history.versions
nct_id
version
--Study Status
overall_status^
primary_completion_date^
completion_date^
last_update_submitted_date
--SponsorCollaborators
sponsor (multi?)
collaborators (multi?)
--Oversight
fda_regulated_drug (ignore)
fda_regulated_device (ignore)
dmc (ignore)
--StudyDescription
summary
detailed_description
--Conditions
Conditions
Keywords
--StudyDesign
Study type
Primary Purpose
Study Phase
Interventional Study Model
Number of Arms
Masking
Allocation
Enrollment ^
--ArmsAndInterventions
Arms (multiple) (Ignore)
--ProtocolOutcomeMeasures
--Eligibility
--ContactsLocation
--IPDSharing
--References
--ParticipantFlow
--BaselineCharacteristics
--ROutcomeMeasures
--AdverseEvents
--LimitationsAndCaveats
--More Information
CREATE TABLE history.collaborators
nct_id
version
collaborator_name
CREATE TABLE history.locations
nct_id
version
location name
location contact info
CREATE TABLE history.arms
*/
/*
Create the history enum types and the trial_snapshots table
*/
-- Enum types used by the columns of history.trial_snapshots.
--
-- PostgreSQL has no CREATE TYPE ... IF NOT EXISTS, so each CREATE TYPE is
-- wrapped in a DO block that ignores duplicate_object. This keeps the script
-- re-runnable, matching the DROP TABLE IF EXISTS / CREATE TABLE IF NOT EXISTS
-- pair used for history.trial_snapshots.
-- NOTE: an already-existing type is left untouched, so an edited label list is
-- NOT picked up on re-run; use ALTER TYPE ... ADD VALUE for that.

-- Qualifier attached to a date or an enrollment figure.
-- The type name keeps its original spelling because the table columns reference it.
DO $$
BEGIN
    CREATE TYPE history.updatable_catetories AS ENUM
        ('Actual', 'Anticipated', 'Expected');
EXCEPTION
    WHEN duplicate_object THEN
        NULL; -- type already exists: nothing to do
END
$$;

ALTER TYPE history.updatable_catetories
    OWNER TO root;

COMMENT ON TYPE history.updatable_catetories
    IS 'This enum is used to capture the different types of categories that a date or enrollemnt figure may have.';

-- Overall status of a study.
DO $$
BEGIN
    CREATE TYPE history.study_statuses AS ENUM
        ('Available', 'Withdrawn', 'Withheld', 'Temporarily not available', 'Active, not recruiting', 'Recruiting', 'Not yet recruiting', 'Enrolling by invitation', 'Suspended', 'No longer available', 'Approved for marketing', 'Unknown status', 'Completed', 'Terminated');
EXCEPTION
    WHEN duplicate_object THEN
        NULL; -- type already exists: nothing to do
END
$$;

ALTER TYPE history.study_statuses
    OWNER TO root;
-- Table: history.trial_snapshots
-- One row per (nct_id, version) pair, enforced by the primary key.
-- The table is dropped and rebuilt each time this script runs, so any rows
-- already stored in it are discarded.
DROP TABLE IF EXISTS history.trial_snapshots;

CREATE TABLE IF NOT EXISTS history.trial_snapshots
(
    -- Snapshot identity
    nct_id character varying(15) COLLATE pg_catalog."default" NOT NULL,
    version integer NOT NULL,

    -- Dates, each paired with its history.updatable_catetories qualifier
    primary_completion_date timestamp without time zone,
    primary_completion_date_category history.updatable_catetories,
    start_date timestamp without time zone,
    start_date_category history.updatable_catetories,
    completion_date timestamp without time zone,
    completion_date_category history.updatable_catetories,

    -- Status and enrollment
    overall_status history.study_statuses,
    enrollment integer,
    enrollment_category history.updatable_catetories,

    -- Sponsor information
    sponsor character varying(255) COLLATE pg_catalog."default",
    responsible_party character varying(255) COLLATE pg_catalog."default",

    CONSTRAINT trial_snapshots_pkey PRIMARY KEY (nct_id, version)
)
-- TABLESPACE is a clause of CREATE TABLE, so it must come before the
-- terminating semicolon; as a separate statement it is a syntax error.
TABLESPACE pg_default;

ALTER TABLE IF EXISTS history.trial_snapshots
    OWNER TO root;

@ -4,7 +4,7 @@ This represents my
## Prerequisites
> Python >= 3.8
> Python >= 3.10 (requires match statement)
> Docker >= 20.10
> Curl >= 7
> Just >= 1.9

@ -63,7 +63,7 @@ build: check-status setup-containers
#remove containers and rebuild based on previously downloaded data
rebuild: clean-docker build
#system will be built from scratch, including downloading data
#system will be built from scratch, using previously downloaded data
#download data and create the containers
create: check-status download-aact-data build
@ -76,6 +76,10 @@ recreate: clean-docker create
#Register trials of interest in the database based on ./history_downloader/selected_trials.sql
select-trials:
cd history_downloader && python ./select_trials.py
#Download trial histories based on registered trials of interest.
download-trial-histories:
cd history_downloader && python ./downloader.py
test-db-connection:
cd history_downloader && python db_connection.py

Loading…
Cancel
Save