Added history schema, got extraction_lib to read and write from db

llm-extraction
youainti 3 years ago
parent 4ae3064bf2
commit ee3e37e834

@ -19,8 +19,98 @@ GRANT USAGE ON SCHEMA history TO history_reader;
GRANT SELECT ON ALL TABLES IN SCHEMA http TO history_reader; GRANT SELECT ON ALL TABLES IN SCHEMA http TO history_reader;
/* History Tables /* History Tables
Below is where I would construct the parsed trial history tables that I need. Below is where I would construct the parsed trial history tables that I need.
Possible fields
nct_id
version
--Study Status
overall_status^
primary_completion_date^
completion_date^
last_update_submitted_date
--SponsorCollaborators
sponsor (multi?)
collaborators (multi?)
--Oversight
fda_regulated_drug (ignore)
fda_regulated_device (ignore)
dmc (ignore)
--StudyDescription
summary
detailed_description
--Conditions
Conditions
Keywords
--StudyDesign
Study type
Primary Purpose
Study Phase
Interventional Study Model
Number of Arms
Masking
Allocation
Enrollment ^
--ArmsAndInterventions
Arms (multiple) (Ignore)
--ProtocolOutcomeMeasures
--Eligibility
--ContactsLocation
--IPDSharing
--References
--ParticipantFlow
--BaselineCharacteristics
--ROutcomeMeasures
--AdverseEvents
--LimitationsAndCaveats
--More Information
*/ */
-- Enum: history.updatable_catetories
-- Qualifier stored next to each date / enrollment value in
-- history.trial_snapshots (the *_category columns).
-- NOTE: the type name is misspelled ("catetories"); it is kept as-is because
-- the trial_snapshots columns reference it by this name.
--
-- PostgreSQL has no CREATE TYPE IF NOT EXISTS, so the duplicate_object error
-- is trapped to keep this script re-runnable, in line with the
-- DROP ... IF EXISTS / CREATE ... IF NOT EXISTS used for trial_snapshots.
DO $$
BEGIN
    CREATE TYPE history.updatable_catetories AS ENUM
        ('Actual', 'Anticipated', 'Expected');
EXCEPTION
    WHEN duplicate_object THEN
        NULL;  -- type already exists; leave it untouched
END
$$;

ALTER TYPE history.updatable_catetories
    OWNER TO root;

COMMENT ON TYPE history.updatable_catetories
    IS 'This enum is used to capture the different types of categories that a date or enrollment figure may have.';
-- Enum: history.study_statuses
-- Allowed values for history.trial_snapshots.overall_status.
-- The label order below is preserved exactly, since enum ordering in
-- PostgreSQL follows declaration order.
--
-- PostgreSQL has no CREATE TYPE IF NOT EXISTS, so the duplicate_object error
-- is trapped to keep this script re-runnable, in line with the
-- DROP ... IF EXISTS / CREATE ... IF NOT EXISTS used for trial_snapshots.
DO $$
BEGIN
    CREATE TYPE history.study_statuses AS ENUM (
        'Available',
        'Withdrawn',
        'Withheld',
        'Temporarily not available',
        'Active, not recruiting',
        'Recruiting',
        'Not yet recruiting',
        'Enrolling by invitation',
        'Suspended',
        'No longer available',
        'Approved for marketing',
        'Unknown status',
        'Completed',
        'Terminated'
    );
EXCEPTION
    WHEN duplicate_object THEN
        NULL;  -- type already exists; leave it untouched
END
$$;

ALTER TYPE history.study_statuses
    OWNER TO root;

COMMENT ON TYPE history.study_statuses
    IS 'This enum is used to record study status. These are pulled from the ClinicalTrials.gov documentation.';
-- Table: history.trial_snapshots
-- One row per (nct_id, version) pair, enforced by the primary key.
-- The table is dropped first, so any rows already stored are discarded each
-- time this script runs; the IF NOT EXISTS on CREATE is therefore only a
-- safeguard.
DROP TABLE IF EXISTS history.trial_snapshots;

CREATE TABLE IF NOT EXISTS history.trial_snapshots (
    -- identity of the snapshot
    nct_id                           varchar(15) COLLATE pg_catalog."default" NOT NULL,
    version                          integer NOT NULL,

    -- dates, each paired with its category qualifier
    primary_completion_date          timestamp,
    primary_completion_date_category history.updatable_catetories,
    start_date                       timestamp,
    start_date_category              history.updatable_catetories,
    completion_date                  timestamp,
    completion_date_category         history.updatable_catetories,

    -- status and enrollment
    overall_status                   history.study_statuses,
    enrollment                       integer,
    enrollment_category              history.updatable_catetories,

    -- sponsor information
    sponsor                          varchar(255) COLLATE pg_catalog."default",
    responsible_party                varchar(255) COLLATE pg_catalog."default",

    CONSTRAINT trial_snapshots_pkey PRIMARY KEY (nct_id, version)
);

ALTER TABLE IF EXISTS history.trial_snapshots
    OWNER TO root;

@ -109,6 +109,8 @@ class VersionData():
print(self) print(self)
raise err raise err
db_connection.commit()
def optional_strip(possible_string): def optional_strip(possible_string):
if type(possible_string) == str: if type(possible_string) == str:
return possible_string.strip() return possible_string.strip()
@ -126,7 +128,6 @@ def extract_study_statuses(study_status_form, version_a,version_b):
#iterate through rows, #iterate through rows,
for trow in rows: for trow in rows:
#matching on rowLabels #matching on rowLabels
#print(trow.__str__()[:80])
match tr_to_td(trow): match tr_to_td(trow):
case ["Primary Completion:" as row_label, tag]: case ["Primary Completion:" as row_label, tag]:
old,new = split_by_version(tag) old,new = split_by_version(tag)
@ -178,13 +179,15 @@ def extract_study_design(study_status_form, version_a,version_b):
match tr_to_td(trow): match tr_to_td(trow):
case ["Enrollment:" as row_label, tag]: case ["Enrollment:" as row_label, tag]:
old,new = split_by_version(tag) old,new = split_by_version(tag)
tagdate1 = extract_text_and_tag(old.text)
version_a._enrollment = tagdate1.text
version_a._enrollment_category = optional_strip(tagdate1.tag)
tagdate2 = extract_text_and_tag(new.text) #Extract tag and text, add them to preallocated object
version_b._enrollment = tagdate2.text tagtext1 = extract_text_and_tag(old.text)
version_b._enrollment_category = optional_strip(tagdate2.tag) version_a._enrollment = tagtext1.text
version_a._enrollment_category = optional_strip(tagtext1.tag)
tagtext2 = extract_text_and_tag(new.text)
version_b._enrollment = tagtext2.text
version_b._enrollment_category = optional_strip(tagtext2.tag)
def extract_sponsor_data(study_status_form, version_a,version_b): def extract_sponsor_data(study_status_form, version_a,version_b):
@ -301,6 +304,8 @@ def get_forms(soup,version_a,version_b):
if not "id" in form.attrs: if not "id" in form.attrs:
continue continue
#for each type of form (identified by the ID field)
# extract and add the data to the preallocated objects
match form.attrs["id"]: match form.attrs["id"]:
case "form_StudyStatus": case "form_StudyStatus":
extract_study_statuses(form,version_a,version_b) extract_study_statuses(form,version_a,version_b)
@ -348,8 +353,12 @@ date_MMMM_DD_YYYY = "%B %d, %Y"
def get_data_from_versions(nct_id,html, version_a_int, version_b_int): def get_data_from_versions(nct_id,html, version_a_int, version_b_int):
soup = BeautifulSoup(html,"lxml") soup = BeautifulSoup(html,"lxml")
#preallocate version data
version_a = VersionData(nct_id, version_a_int) version_a = VersionData(nct_id, version_a_int)
version_b = VersionData(nct_id, version_b_int) version_b = VersionData(nct_id, version_b_int)
#extract data from html and put it in the preallocated objects
get_forms(soup, version_a, version_b) get_forms(soup, version_a, version_b)
return version_a,version_b return version_a,version_b
@ -357,25 +366,31 @@ def get_data_from_versions(nct_id,html, version_a_int, version_b_int):
if __name__ == "__main__": if __name__ == "__main__":
with psycopg2.connect(dbname="aact_db", user="root", password="root",host="localhost") as db_connection: with psycopg2.connect(dbname="aact_db", user="root", password="root",host="will-office") as db_connection:
#pull the requests from the db #pull the requests from the db
with db_connection.cursor() as curse: with db_connection.cursor() as curse:
sql = """ sql = """
SELECT nct_id, version_a,version_b, html SELECT nct_id, version_a,version_b, html
FROM http.responses FROM http.responses
WHERE response_code = 200
""" """
responses = curse.execute(sql) curse.execute(sql)
for response in responses.fetch_all(): for response in curse.fetchall():
# #
nct_id, version_a, version_b, html = response nct_id, version_a, version_b, html = response
print(nct_id)
version1, version2 = get_data_from_versions(nct_id, html, version_a, version_b) version1, version2 = get_data_from_versions(nct_id, html, version_a, version_b)
print(version1.nct_id)
print(version2._enrollment)
if version_b == version_a + 1:
version1.load_to_db(db_connection)
version2.load_to_db(db_connection)
else:
version2.load_to_db(db_connection)
if version_b == version_a + 1: exit(1)
version1.load_to_db(db_connection)
version2.load_to_db(db_connection)
else:
version2.load_to_db(db_connection)
""" """

@ -1,113 +0,0 @@
/*
Create schema history
CREATE TABLE history.versions
nct_id
version
--Study Status
overall_status^
primary_completion_date^
completion_date^
last_update_submitted_date
--SponsorCollaborators
sponsor (multi?)
collaborators (multi?)
--Oversight
fda_regulated_drug (ignore)
fda_regulated_device (ignore)
dmc (ignore)
--StudyDescription
summary
detailed_description
--Conditions
Conditions
Keywords
--StudyDesign
Study type
Primary Purpose
Study Phase
Interventional Study Model
Number of Arms
Masking
Allocation
Enrollment ^
--ArmsAndInterventions
Arms (multiple) (Ignore)
--ProtocolOutcomeMeasures
--Eligibility
--ContactsLocation
--IPDSharing
--References
--ParticipantFlow
--BaselineCharacteristics
--ROutcomeMeasures
--AdverseEvents
--LimitationsAndCaveats
--More Information
CREATE TABLE history.collaborators
nct_id
version
collaborator_name
CREATE TABLE history.locations
nct_id
version
location name
location contact info
CREATE TABLE history.arms
*/
/*
Create the history
*/
-- Enum: history.updatable_catetories
-- Qualifier stored next to each date / enrollment value (the *_category
-- columns of history.trial_snapshots).
-- NOTE: the type name is misspelled ("catetories"); it is kept as-is because
-- the table columns below reference it by this name.
-- PostgreSQL has no CREATE TYPE IF NOT EXISTS, so duplicate_object is trapped
-- to keep the script re-runnable like the table DDL further down.
DO $$
BEGIN
    CREATE TYPE history.updatable_catetories AS ENUM
        ('Actual', 'Anticipated', 'Expected');
EXCEPTION
    WHEN duplicate_object THEN
        NULL;  -- type already exists; leave it untouched
END
$$;

ALTER TYPE history.updatable_catetories
    OWNER TO root;

COMMENT ON TYPE history.updatable_catetories
    IS 'This enum is used to capture the different types of categories that a date or enrollment figure may have.';

-- Enum: history.study_statuses
-- Allowed values for trial_snapshots.overall_status; label order is kept
-- exactly as originally declared.
DO $$
BEGIN
    CREATE TYPE history.study_statuses AS ENUM
        ('Available', 'Withdrawn', 'Withheld', 'Temporarily not available', 'Active, not recruiting', 'Recruiting', 'Not yet recruiting', 'Enrolling by invitation', 'Suspended', 'No longer available', 'Approved for marketing', 'Unknown status', 'Completed', 'Terminated');
EXCEPTION
    WHEN duplicate_object THEN
        NULL;  -- type already exists; leave it untouched
END
$$;

ALTER TYPE history.study_statuses
    OWNER TO root;

-- Table: history.trial_snapshots
-- One row per (nct_id, version) pair, enforced by the primary key.
-- The table is dropped first, so existing rows are discarded on every run.
DROP TABLE IF EXISTS history.trial_snapshots;

CREATE TABLE IF NOT EXISTS history.trial_snapshots
(
    nct_id character varying(15) COLLATE pg_catalog."default" NOT NULL,
    version integer NOT NULL,
    primary_completion_date timestamp without time zone,
    primary_completion_date_category history.updatable_catetories,
    start_date timestamp without time zone,
    start_date_category history.updatable_catetories,
    completion_date timestamp without time zone,
    completion_date_category history.updatable_catetories,
    overall_status history.study_statuses,
    enrollment integer,
    enrollment_category history.updatable_catetories,
    sponsor character varying(255) COLLATE pg_catalog."default",
    responsible_party character varying(255) COLLATE pg_catalog."default",
    CONSTRAINT trial_snapshots_pkey PRIMARY KEY (nct_id, version)
)
-- TABLESPACE is a clause of CREATE TABLE; the statement must not be
-- terminated before it, otherwise "TABLESPACE pg_default;" is parsed as a
-- separate (invalid) statement.
TABLESPACE pg_default;

ALTER TABLE IF EXISTS history.trial_snapshots
    OWNER TO root;

@ -4,7 +4,7 @@ This represents my
## Prerequisites ## Prerequisites
> Python >= 3.8 > Python >= 3.10 (requires match statement)
> Docker >= 20.10 > Docker >= 20.10
> Curl >= 7 > Curl >= 7
> Just >= 1.9 > Just >= 1.9

@ -63,7 +63,7 @@ build: check-status setup-containers
#remove containers and rebuild based on previously downloaded data #remove containers and rebuild based on previously downloaded data
rebuild: clean-docker build rebuild: clean-docker build
#system will be built from scratch, including downloading data #system will be built from scratch, using previously downloaded data
#download data and create the containers #download data and create the containers
create: check-status download-aact-data build create: check-status download-aact-data build
@ -76,6 +76,10 @@ recreate: clean-docker create
#Register trials of interest in the database based on ./history_downloader/selected_trials.sql #Register trials of interest in the database based on ./history_downloader/selected_trials.sql
select-trials: select-trials:
cd history_downloader && python ./select_trials.py cd history_downloader && python ./select_trials.py
#Download trial histories based on registered trials of interest. #Download trial histories based on registered trials of interest.
download-trial-histories: download-trial-histories:
cd history_downloader && python ./downloader.py cd history_downloader && python ./downloader.py
test-db-connection:
cd history_downloader && python db_connection.py

Loading…
Cancel
Save