You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ClinicalTrialsDataProcessing/Scripts/ExtractingData.sql

637 lines
18 KiB
SQL

-- Install the tablefunc extension. "if not exists" lets this script be re-run
-- without failing when the extension is already installed in the database.
create extension if not exists tablefunc;
/*Getting Trial Data all together
* There are 3 main datasets to join per trial:
*
* - Trial Data (still need to stick it together)
*     - Duration and enrollment data
* - Compound Marketing (can get for any trial)
*     - how many individual brands per compound at the start of the trial
* - Disease Data (can get for verified trials)
*     - Population upper limit (Global Burdens of Disease)
*     - Category (ICD10 2nd level groups)
*/
/* Disease data */
-- ICD10 category and GBD data.
-- For every trial, keep only the cause rows at the trial's highest "level",
-- match the first three characters of the ICD10 code (ui) to a category range,
-- and return the distinct (trial, cause, category) combinations.
with deepest_level as (
    -- highest cause level recorded for each trial
    select
        t.nct_id,
        max(t."level") as max_level
    from trial_to_cause as t
    group by t.nct_id
),
deepest_causes as (
    -- distinct trial/cause rows that sit at that highest level
    select distinct
        ttc.nct_id,
        ttc.ui,
        ttc."condition",
        ttc.cause_text,
        ttc.cause_id,
        dl.max_level
    from trial_to_cause as ttc
    inner join deepest_level as dl
        on dl.nct_id = ttc.nct_id
        and dl.max_level = ttc."level"
    order by nct_id, ui
),
categorised as (
    -- attach every ICD10 category whose [start_code, end_code] range
    -- contains the 3-character code prefix
    select
        dc.nct_id,
        substring(dc.ui for 3) as code,
        dc."condition",
        dc.cause_text,
        dc.cause_id,
        ic.id as category_id,
        ic.group_name
    from deepest_causes as dc
    inner join "DiseaseBurden".icd10_categories as ic
        on substring(dc.ui for 3) between ic.start_code and ic.end_code
)
select distinct
    nct_id,
    cause_id,
    category_id
from categorised
;
--still need to link to actual disease burdens.
/* Compound marketing data */
-- Per trial: number of distinct application numbers / citations among the
-- marketing rows whose "min" value is later than 2012-06-01.
select
    m.nct_id,
    count(distinct m.application_number_or_citation)
from public.match_trial_to_marketing_start_date as m
where m."min" > '2012-06-01'
group by m.nct_id
;
/*Get versions*/
/* Ignore this version
with cte1 as (
select nct_id,min("version") over (partition by nct_id) as min_version
from history.trial_snapshots ts
where
ts.start_date < ts.submission_date
), cte2 as (
select * from cte1
group by nct_id, min_version
order by nct_id
), cte3 as (
select
ts2.nct_id,
ts2."version",
ts2.overall_status,
ts2.submission_date,
ts2.start_date,
ts2.enrollment,
ts2.enrollment_category,
ts2.primary_completion_date,
ts2.primary_completion_date_category ,
--mv.nct_id,
mv.min_version
from history.trial_snapshots ts2
join cte2 mv
on mv.nct_id = ts2.nct_id
where
ts2."version" = mv.min_version
order by ts2.nct_id
), cte4 as (
select cte3.nct_id, cte3.submission_date - cte3.start_date as submission_presecence
from cte3
)
select avg(submission_presecence)
from cte4
;
--avg 61 day difference
*/
--use this version
-- For each trial, pick the highest snapshot version among the snapshots that
-- were submitted before the start date, then list the trials whose chosen
-- snapshot is missing any of the fields of interest.
with cte2 as (
    -- NOTE: the column keeps its historical name min_version, but it holds the
    -- MAX version among snapshots with start_date > submission_date.
    select
        ts.nct_id,
        max(ts."version") as min_version
    from history.trial_snapshots ts
    where ts.start_date > ts.submission_date
    group by ts.nct_id
), cte3 as (
    -- full snapshot row for the version selected above
    select
        ts2.nct_id,
        ts2."version",
        ts2.overall_status,
        ts2.submission_date,
        ts2.start_date,
        ts2.enrollment,
        ts2.enrollment_category,
        ts2.primary_completion_date,
        ts2.primary_completion_date_category,
        mv.min_version
    from history.trial_snapshots ts2
    join cte2 mv
        on mv.nct_id = ts2.nct_id
    where ts2."version" = mv.min_version
)
select
    nct_id,
    "version",
    overall_status,
    submission_date,
    start_date,
    enrollment,
    enrollment_category,
    primary_completion_date,
    primary_completion_date_category,
    min_version
from cte3
where
    enrollment is null
    or enrollment_category is null
    or primary_completion_date is null
    or primary_completion_date_category is null
    or start_date is null
order by nct_id
; -- terminator was missing, which merged this statement with the next "with"
/*, cte4 as (
select cte3.nct_id, cte3.submission_date - cte3.start_date as submission_presecence
from cte3
)
select avg(submission_presecence)
from cte4
; -- -33 day difference on average
*/
-- Per trial, compare
--   min_version: lowest version among snapshots with start_date <= submission_date
--   max_version: highest version among snapshots with start_date >= submission_date
-- and return the trials where min_version >= max_version.
with cte2_min as (
    select
        ts.nct_id,
        min(ts."version") as min_version
    from history.trial_snapshots ts
    where ts.start_date <= ts.submission_date
    group by ts.nct_id
), cte2_max as (
    select
        ts.nct_id,
        max(ts."version") as max_version
    from history.trial_snapshots ts
    where ts.start_date >= ts.submission_date
    group by ts.nct_id
)
select *
from cte2_min
join cte2_max
    on cte2_min.nct_id = cte2_max.nct_id
where min_version >= max_version
; -- terminator was missing, which merged this statement with the next "with"
/* Need to take a different tack in filling out this version of the data.
* The idea is that we need the latest of each major category
* before the start date.
* */
-- For trials that appear in "DiseaseBurden".trial_to_icd10, pair the enrollment
-- and primary completion date reported closest to the start date ("planned")
-- with the values held in ctgov.studies.
with cte as (
    /* Get the absolute difference between the start date and the
     * submission_date for each version of the trial (measured in days).
     */
    select
        s.nct_id,
        s.start_date,
        ts."version",
        ts.submission_date,
        abs(extract(epoch from ts.submission_date - s.start_date)::float/(24*60*60)) as start_deviance
    from ctgov.studies s
    join history.trial_snapshots ts
        on s.nct_id = ts.nct_id
    where s.nct_id in (select distinct nct_id from "DiseaseBurden".trial_to_icd10 tti)
), cte2 as (
    /* Rank each version based on its proximity to the start date
     * (rownum 1 = closest). "version" is a tie-breaker so that snapshots
     * equally far from the start date are ranked the same way on every run
     * (assumes "version" is unique per trial -- TODO confirm).
     */
    select
        cte.nct_id,
        cte."version",
        row_number() over (
            partition by cte.nct_id
            order by cte.start_deviance, cte."version"
        ) as rownum,
        cte.submission_date,
        cte.start_deviance,
        cte.start_date,
        ts.primary_completion_date,
        ts.primary_completion_date_category,
        ts.overall_status,
        ts.enrollment,
        ts.enrollment_category
    from cte
    join history.trial_snapshots ts
        on cte.nct_id = ts.nct_id and cte."version" = ts."version"
), cte3_primary_completion as (
    /* For each trial, the rank of the version closest to the start date
     * that has primary_completion_date filled out.
     */
    select cte2.nct_id, min(cte2.rownum) as primary_completion_source
    from cte2
    where cte2.primary_completion_date is not null
    group by cte2.nct_id
), cte3_enrollment as (
    /* For each trial, the rank of the version closest to the start date
     * that has enrollment filled out.
     */
    select cte2.nct_id, min(cte2.rownum) as enrollment_source
    from cte2
    where cte2.enrollment is not null
    group by cte2.nct_id
), cte4 as (
    /* Join the best options together to get the data of interest.
     *
     * On further inspection there are just a few trials where the two
     * values come from different versions, with many of them having a
     * 7+ month difference between the two versions.
     * I think I am going to drop them.
     */
    select
        c3e.nct_id,
        c2a."version" as version_a,
        c2a.enrollment,
        c2a.enrollment_category,
        c2b."version" as version_b,
        c2b.primary_completion_date,
        c2b.primary_completion_date_category
    from cte3_enrollment c3e
    join cte2 c2a
        on c3e.nct_id = c2a.nct_id and c3e.enrollment_source = c2a.rownum
    join cte3_primary_completion c3p
        on c3e.nct_id = c3p.nct_id
    join cte2 c2b
        on c3p.nct_id = c2b.nct_id and c3p.primary_completion_source = c2b.rownum
), cte5 as (
    -- trials whose enrollment and primary completion date come from different versions
    select nct_id
    from cte4 where version_a != version_b
)
select
    c.nct_id,
    s2.overall_status,
    c.enrollment as planned_enrollment,
    s2.enrollment,
    s2.start_date,
    c.primary_completion_date as planned_primary_completion_date,
    s2.primary_completion_date,
    -- seconds converted to days
    extract(epoch from c.primary_completion_date - s2.start_date)/(24*60*60) as planned_duration,
    s2.primary_completion_date - s2.start_date as actual_duration
from cte4 c
join ctgov.studies s2
    on c.nct_id = s2.nct_id
where c.nct_id not in (select nct_id from cte5)
;
/*
* Concern about causal inference
*
* When putting the data together for CBO it looked like we got occasional updates about
* the status of trials that included enrollment updates.
* That doesn't appear to be the case, but that messes with the ability to causally identify
* any results. I need to be careful about how this data is used.
*
* I created the statements below to get the data that I need.
*/
----get the set of trial snapshots
-- One row per distinct combination of a trial's reported fields, drawn from
-- snapshots that were submitted on or after the start date while the status
-- was neither 'Completed' nor 'Terminated', limited to trials with an
-- accepted ICD10 mapping. earliest_date_observed is the first submission date
-- carrying that combination.
create or replace view public.view_cte as
select
    ts.nct_id,
    ts.primary_completion_date,
    ts.primary_completion_date_category,
    ts.enrollment,
    ts.start_date,
    ts.enrollment_category,
    ts.overall_status,
    min(ts.submission_date) as earliest_date_observed
from history.trial_snapshots as ts
where
    ts.submission_date >= ts.start_date
    and ts.overall_status <> 'Completed'
    and ts.overall_status <> 'Terminated'
    and exists (
        select 1
        from "DiseaseBurden".trial_to_icd10 as tti
        where tti.nct_id = ts.nct_id
          and tti.approved = 'accepted'
    )
group by
    ts.nct_id,
    ts.start_date,
    ts.overall_status,
    ts.enrollment,
    ts.enrollment_category,
    ts.primary_completion_date,
    ts.primary_completion_date_category
;
-- Accepted trial -> ICD10 code -> cause, with the cause's level in the
-- cause hierarchy. ICD10 codes are compared after stripping '-' and '.'
-- from both sides.
create or replace view public.view_disbur_cte0 as
select
    trial_icd.nct_id,
    trial_icd.ui,
    trial_icd."condition",
    icd_cause.cause_text,
    hier.cause_id,
    hier."level"
from "DiseaseBurden".trial_to_icd10 as trial_icd
inner join "DiseaseBurden".icd10_to_cause as icd_cause
    on replace(replace(trial_icd.ui, '-', ''), '.', '')
     = replace(replace(icd_cause.code, '-', ''), '.', '')
inner join "DiseaseBurden".cause_hierarchy as hier
    on hier.cause_name = icd_cause.cause_text
where trial_icd.approved = 'accepted'
;
-- Same rows and columns as public.view_disbur_cte0, ordered by nct_id.
-- This view used to repeat that view's three-table join verbatim; it now
-- selects from it so the trial -> ICD10 -> cause mapping is defined once.
create or replace view public.view_trial_to_cause as
select
    d.nct_id,
    d.ui,
    d."condition",
    d.cause_text,
    d.cause_id,
    d."level"
from public.view_disbur_cte0 as d
order by d.nct_id
;
-- Highest cause-hierarchy "level" present for each trial.
create or replace view public.view_disbur_cte as
select
    causes.nct_id,
    max(causes."level") as max_level
from view_disbur_cte0 as causes
group by causes.nct_id
;
-- Distinct trial/cause rows restricted to each trial's highest cause level.
create or replace view public.view_disbur_cte2 as
select distinct
    ttc.nct_id,
    ttc.ui,
    ttc."condition",
    ttc.cause_text,
    ttc.cause_id,
    deepest.max_level
from view_trial_to_cause as ttc
inner join view_disbur_cte as deepest
    on deepest.nct_id = ttc.nct_id
    and deepest.max_level = ttc."level"
order by nct_id, ui
;
-- Attach the level-1 ICD10 category whose [start_code, end_code] range
-- contains the first three characters of the trial's ICD10 code.
create or replace view public.view_disbur_cte3 as
select
    src.nct_id,
    substring(src.ui for 3) as code,
    src."condition",
    src.cause_text,
    src.cause_id,
    ic.chapter_code as category_id,
    ic.group_name,
    src.max_level
from view_disbur_cte2 as src
inner join "DiseaseBurden".icd10_categories as ic
    on ic."level" = 1
    and substring(src.ui for 3) between ic.start_code and ic.end_code
;
-- Burden rows narrowed to a single slice: DALYs, as counts, both sexes, all ages.
create or replace view public.view_burdens_cte as
select b.*
from "DiseaseBurden".burdens as b
where
    b.measure_id = 2      -- DALYs
    and b.metric_id = 1   -- number/count
    and b.sex_id = 3      -- both sexes
    and b.age_id = 22     -- all ages
;
-- Pivot the burden slice so each (cause, year) row carries the value and 95%
-- bounds for five locations side by side. Only (cause, year) pairs present
-- for all five locations are returned (inner joins).
create or replace view public.view_burdens_cte2 as
select
    hi.cause_id,
    hi."year",
    -- high sdi (location 44635)
    hi.val as h_sdi_val,
    hi.upper_95 as h_sdi_u95,
    hi.lower_95 as h_sdi_l95,
    -- high-middle sdi (location 44634)
    hi_mid.val as hm_sdi_val,
    hi_mid.upper_95 as hm_sdi_u95,
    hi_mid.lower_95 as hm_sdi_l95,
    -- middle sdi (location 44639)
    mid.val as m_sdi_val,
    mid.upper_95 as m_sdi_u95,
    mid.lower_95 as m_sdi_l95,
    -- low-middle sdi (location 44636)
    lo_mid.val as lm_sdi_val,
    lo_mid.upper_95 as lm_sdi_u95,
    lo_mid.lower_95 as lm_sdi_l95,
    -- low sdi (location 44637)
    lo.val as l_sdi_val,
    lo.upper_95 as l_sdi_u95,
    lo.lower_95 as l_sdi_l95
from view_burdens_cte as hi
inner join view_burdens_cte as hi_mid
    on hi_mid.cause_id = hi.cause_id
    and hi_mid."year" = hi."year"
    and hi_mid.location_id = 44634
inner join view_burdens_cte as mid
    on mid.cause_id = hi.cause_id
    and mid."year" = hi."year"
    and mid.location_id = 44639
inner join view_burdens_cte as lo_mid
    on lo_mid.cause_id = hi.cause_id
    and lo_mid."year" = hi."year"
    and lo_mid.location_id = 44636
inner join view_burdens_cte as lo
    on lo.cause_id = hi.cause_id
    and lo."year" = hi."year"
    and lo.location_id = 44637
where hi.location_id = 44635
;
--drop view if exists public.formatted_data cascade;
-- Analysis dataset: one row per observed trial state (from view_cte) joined to
--   * n_brands: distinct application numbers / citations whose marketing
--     "min" value is on or before the date the state was first observed,
--   * the trial's cause and ICD10 category (view_disbur_cte3),
--   * burden values for that cause in the year the state was first observed.
-- Inner joins throughout: trials lacking marketing, cause or burden rows drop out.
create or replace view public.formatted_data as
select
    cte.nct_id,
    cte.start_date,
    cte.enrollment as current_enrollment,
    cte.enrollment_category,
    cte.overall_status as current_status,
    cte.earliest_date_observed,
    -- Fraction of the start -> primary-completion span that had elapsed when
    -- this state was first observed. nullif() yields NULL instead of raising
    -- "division by zero" when primary_completion_date equals start_date.
    extract( epoch from (cte.earliest_date_observed - cte.start_date))
        / nullif(extract( epoch from (cte.primary_completion_date - cte.start_date)), 0) as elapsed_duration
    ,count(distinct mttmsd."application_number_or_citation") as n_brands
    ,dbc3.code
    ,dbc3."condition"
    ,dbc3.cause_text
    ,dbc3.cause_id
    ,dbc3.category_id
    ,dbc3.group_name
    ,dbc3.max_level
    ,b."year",
    --high sdi
    b.h_sdi_val,
    b.h_sdi_u95,
    b.h_sdi_l95,
    --high-middle sdi
    b.hm_sdi_val,
    b.hm_sdi_u95,
    b.hm_sdi_l95,
    --middle sdi
    b.m_sdi_val,
    b.m_sdi_u95,
    b.m_sdi_l95,
    --low-middle sdi
    b.lm_sdi_val,
    b.lm_sdi_u95,
    b.lm_sdi_l95,
    --low sdi
    b.l_sdi_val,
    b.l_sdi_u95,
    b.l_sdi_l95
from view_cte as cte
join public.match_trial_to_marketing_start_date mttmsd
    on cte.nct_id = mttmsd."nct_id"
join view_disbur_cte3 dbc3
    on dbc3.nct_id = cte.nct_id
join view_burdens_cte2 b
    on b.cause_id = dbc3.cause_id
    and extract(year from b."year") = extract(year from cte.earliest_date_observed)
where
    mttmsd."min" <= cte.earliest_date_observed
group by
    cte.nct_id,
    cte.start_date,
    cte.enrollment,
    cte.enrollment_category,
    cte.overall_status,
    cte.earliest_date_observed,
    elapsed_duration -- output-column alias of the expression above
    ,dbc3.code
    ,dbc3."condition"
    ,dbc3.cause_text
    ,dbc3.cause_id
    ,dbc3.category_id
    ,dbc3.group_name
    ,dbc3.max_level
    ,b.cause_id,
    b."year",
    --high sdi
    b.h_sdi_val,
    b.h_sdi_u95,
    b.h_sdi_l95,
    --high-middle sdi
    b.hm_sdi_val,
    b.hm_sdi_u95,
    b.hm_sdi_l95,
    --middle sdi
    b.m_sdi_val,
    b.m_sdi_u95,
    b.m_sdi_l95,
    --low-middle sdi
    b.lm_sdi_val,
    b.lm_sdi_u95,
    b.lm_sdi_l95,
    --low sdi
    b.l_sdi_val,
    b.l_sdi_u95,
    b.l_sdi_l95
order by cte.nct_id ,cte.earliest_date_observed
;--used this one 2023-04-05
--get the planned enrollment
create or replace view public.time_between_submission_and_start_view as
/* For every snapshot version of every trial that appears in
 * "DiseaseBurden".trial_to_icd10: the absolute gap, in days, between the
 * snapshot's submission date and the start date held in ctgov.studies.
 */
select
    study.nct_id,
    study.start_date,
    snap."version",
    snap.submission_date,
    -- 86400 = seconds per day
    abs(extract(epoch from snap.submission_date - study.start_date)::float / 86400) as start_deviance
from history.trial_snapshots as snap
inner join ctgov.studies as study
    on study.nct_id = snap.nct_id
where exists (
    select 1
    from "DiseaseBurden".trial_to_icd10 as tti
    where tti.nct_id = study.nct_id
)
;
create or replace view rank_proximity_to_start_time_view as
/* Rank each snapshot version of a trial by its proximity to the start date
 * (rownum 1 = closest) and attach that snapshot's reported fields.
 *
 * "version" is a tie-breaker: this view is expanded more than once
 * downstream (min(rownum) in one place, a join on rownum in another), so
 * versions equally far from the start date must receive the same rownum in
 * every expansion (assumes "version" is unique per trial -- TODO confirm).
 */
select
    cte.nct_id,
    cte."version",
    row_number() over (
        partition by cte.nct_id
        order by cte.start_deviance, cte."version"
    ) as rownum,
    cte.submission_date,
    cte.start_deviance,
    cte.start_date,
    ts.primary_completion_date,
    ts.primary_completion_date_category,
    ts.overall_status,
    ts.enrollment,
    ts.enrollment_category
from public.time_between_submission_and_start_view cte
join history.trial_snapshots ts
    on cte.nct_id = ts.nct_id and cte."version" = ts."version"
;
create or replace view enrollment_closest_to_start_view as
/* Per trial: the smallest proximity rank (rownum) among the snapshot
 * versions that have enrollment filled out, i.e. the version closest to
 * the start date that reports an enrollment figure.
 */
select
    ranked.nct_id,
    min(ranked.rownum) as enrollment_source
from rank_proximity_to_start_time_view as ranked
where ranked.enrollment is not null
group by ranked.nct_id
;
--drop view public.formatted_data_with_planned_enrollment ;
-- formatted_data plus the status from ctgov.studies (final_status) and the
-- enrollment reported by the snapshot version closest to the start date
-- (planned_enrollment, with that snapshot's "version").
create or replace view formatted_data_with_planned_enrollment as
select
    fd.*,
    study.overall_status as final_status,
    ranked."version",
    ranked.enrollment as planned_enrollment
from formatted_data as fd
inner join ctgov.studies as study
    on study.nct_id = fd.nct_id
inner join enrollment_closest_to_start_view as pick
    on pick.nct_id = fd.nct_id
inner join rank_proximity_to_start_time_view as ranked
    on ranked.nct_id = pick.nct_id
    and ranked.rownum = pick.enrollment_source
;
-- Inspect the final dataset. The terminating semicolon was missing, which
-- merged this statement with the first count query below.
select * from formatted_data_with_planned_enrollment;
------------------- GET COUNTS ------------------
-- Number of distinct trials in each view; the trailing comment on each
-- statement is the count the author recorded.
select count(distinct v.nct_id)
from public.view_cte as v; --88

select count(distinct v.nct_id)
from public.view_disbur_cte0 as v; --130

select count(distinct v.nct_id)
from public.view_trial_to_cause as v; --130

select count(distinct v.nct_id)
from public.view_disbur_cte as v; --130

select count(distinct v.nct_id)
from public.view_disbur_cte2 as v; --130

select count(distinct v.nct_id)
from public.view_disbur_cte3 as v; --130

select count(distinct v.nct_id)
from public.formatted_data as v; --48 probably because there are so many trials that don't fall into a GBD category/cause

select count(distinct v.nct_id)
from public.time_between_submission_and_start_view as v; --1067

select count(distinct v.nct_id)
from rank_proximity_to_start_time_view as v; --1067

select count(distinct v.nct_id)
from enrollment_closest_to_start_view as v; --1067

select count(distinct v.nct_id)
from formatted_data_with_planned_enrollment as v; --48