You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
637 lines
18 KiB
SQL
637 lines
18 KiB
SQL
-- Load the tablefunc extension. "if not exists" lets the script be re-run
-- without failing when the extension is already installed in the database.
create extension if not exists tablefunc;
|
|
|
|
/* Getting trial data all together
 * There are 3 main datasets to join per trial:
 *
 * - Trial Data (still need to stick it together)
 *     - Duration and enrollment data
 * - Compound Marketing (can get for any trial)
 *     - how many individual brands per compound at the start of the trial
 * - Disease Data (can get for verified trials)
 *     - Population upper limit (Global Burden of Disease)
 *     - Category (ICD10 2nd level groups)
 */
|
|
|
|
/* Disease data */
-- ICD10 category and GBD cause per trial
with highest_level as (
    -- largest "level" value recorded for each trial in trial_to_cause
    select
        nct_id,
        max("level") as max_level
    from trial_to_cause
    group by nct_id
),
highest_level_causes as (
    -- the distinct code / condition / cause rows of each trial at that level
    select distinct
        ttc.nct_id,
        ttc.ui,
        ttc."condition",
        ttc.cause_text,
        ttc.cause_id,
        hl.max_level
    from trial_to_cause as ttc
    inner join highest_level as hl
        on hl.nct_id = ttc.nct_id
        and hl.max_level = ttc."level"
    order by ttc.nct_id, ttc.ui
),
categorised as (
    -- attach every ICD10 category whose [start_code, end_code] range contains
    -- the first three characters of the trial's code
    select
        hlc.nct_id,
        substring(hlc.ui for 3) as code,
        hlc."condition",
        hlc.cause_text,
        hlc.cause_id,
        ic.id as category_id,
        ic.group_name
    from highest_level_causes as hlc
    inner join "DiseaseBurden".icd10_categories as ic
        on substring(hlc.ui for 3) between ic.start_code and ic.end_code
)
-- one row per distinct (trial, cause, category) combination
select distinct
    nct_id,
    cause_id,
    category_id
from categorised
;
--still need to link to actual disease burdens.
|
|
|
|
/* Compound marketing data */
-- Per trial: number of distinct applications/citations among the matched rows
-- whose "min" value is later than 2012-06-01.
-- NOTE(review): "min" is presumably the earliest marketing start date - confirm.
select
    mttmsd.nct_id,
    count(distinct mttmsd.application_number_or_citation)
from public.match_trial_to_marketing_start_date as mttmsd
where mttmsd."min" > '2012-06-01'
group by mttmsd.nct_id
;
|
|
|
|
/*Get versions*/
|
|
/* Ignore this version
|
|
with cte1 as (
|
|
select nct_id,min("version") over (partition by nct_id) as min_version
|
|
from history.trial_snapshots ts
|
|
where
|
|
ts.start_date < ts.submission_date
|
|
), cte2 as (
|
|
select * from cte1
|
|
group by nct_id, min_version
|
|
order by nct_id
|
|
), cte3 as (
|
|
select
|
|
ts2.nct_id,
|
|
ts2."version",
|
|
ts2.overall_status,
|
|
ts2.submission_date,
|
|
ts2.start_date,
|
|
ts2.enrollment,
|
|
ts2.enrollment_category,
|
|
ts2.primary_completion_date,
|
|
ts2.primary_completion_date_category ,
|
|
--mv.nct_id,
|
|
mv.min_version
|
|
from history.trial_snapshots ts2
|
|
join cte2 mv
|
|
on mv.nct_id = ts2.nct_id
|
|
where
|
|
ts2."version" = mv.min_version
|
|
order by ts2.nct_id
|
|
), cte4 as (
|
|
select cte3.nct_id, cte3.submission_date - cte3.start_date as submission_presecence
|
|
from cte3
|
|
)
|
|
select avg(submission_presecence)
|
|
from cte4
|
|
;
|
|
--avg 61 day difference
|
|
*/
|
|
|
|
--use this version
/* For every trial, pick the highest-numbered snapshot that was submitted
 * before the trial's start date, then list the picked snapshots that are
 * missing any of the fields of interest.
 */
with latest_prestart_version as (
    -- Highest version among the snapshots with start_date > submission_date.
    -- The column keeps its historical name min_version even though it holds
    -- a max(), so the output columns stay the same as before.
    select
        ts.nct_id,
        max(ts."version") as min_version
    from history.trial_snapshots as ts
    where ts.start_date > ts.submission_date
    group by ts.nct_id
),
prestart_snapshot as (
    -- full snapshot row for the version picked above
    select
        ts2.nct_id,
        ts2."version",
        ts2.overall_status,
        ts2.submission_date,
        ts2.start_date,
        ts2.enrollment,
        ts2.enrollment_category,
        ts2.primary_completion_date,
        ts2.primary_completion_date_category,
        mv.min_version
    from history.trial_snapshots as ts2
    inner join latest_prestart_version as mv
        on mv.nct_id = ts2.nct_id
        and mv.min_version = ts2."version"
)
select
    nct_id,
    "version",
    overall_status,
    submission_date,
    start_date,
    enrollment,
    enrollment_category,
    primary_completion_date,
    primary_completion_date_category,
    min_version
from prestart_snapshot
where
    enrollment is null
    or enrollment_category is null
    or primary_completion_date is null
    or primary_completion_date_category is null
    or start_date is null
; -- terminator added: the statement previously ran into the next "with"
|
|
/*, cte4 as (
|
|
select cte3.nct_id, cte3.submission_date - cte3.start_date as submission_presecence
|
|
from cte3
|
|
)
|
|
select avg(submission_presecence)
|
|
from cte4
|
|
; -- -33 day difference on average
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* Compare, per trial, the lowest version submitted on or after the start date
 * with the highest version submitted on or before it, and keep the trials
 * where the former is not smaller than the latter.
 */
with first_on_or_after_start as (
    -- lowest version with start_date <= submission_date
    select
        ts.nct_id,
        min(ts."version") as min_version
    from history.trial_snapshots as ts
    where ts.start_date <= ts.submission_date
    group by ts.nct_id
),
last_on_or_before_start as (
    -- highest version with start_date >= submission_date
    select
        ts.nct_id,
        max(ts."version") as max_version
    from history.trial_snapshots as ts
    where ts.start_date >= ts.submission_date
    group by ts.nct_id
)
select
    -- both nct_id copies are kept so the output has the same four columns
    -- the former "select *" produced
    post_start.nct_id,
    post_start.min_version,
    pre_start.nct_id,
    pre_start.max_version
from first_on_or_after_start as post_start
inner join last_on_or_before_start as pre_start
    on pre_start.nct_id = post_start.nct_id
where post_start.min_version >= pre_start.max_version
; -- terminator added: the statement previously ran into the next "with"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* Need to take a different tack in filling out this version of the data.
 * The idea is that we need the latest of each major category
 * before the start date.
 */
|
|
|
|
/* Planned vs. actual enrollment and primary completion for every trial that
 * has a row in "DiseaseBurden".trial_to_icd10.
 * "Planned" values come from the snapshot version submitted closest to the
 * start date recorded in ctgov.studies; "actual" values come from ctgov.studies.
 */
with snapshot_deviance as (
    /* Absolute difference between the start date and the submission_date
     * for each version of the trial (measured in days).
     */
    select
        s.nct_id,
        s.start_date,
        ts."version",
        ts.submission_date,
        abs(extract(epoch from ts.submission_date - s.start_date)::float/(24*60*60)) as start_deviance
    from ctgov.studies as s
    inner join history.trial_snapshots as ts
        on s.nct_id = ts.nct_id
    where s.nct_id in (select distinct tti.nct_id from "DiseaseBurden".trial_to_icd10 as tti)
),
ranked_snapshots as (
    /* Rank each version by its proximity to the start date (1 = closest).
     * The version number is a tie-break so that two versions equally far from
     * the start date always receive the same ranks.
     */
    select
        sd.nct_id,
        sd."version",
        row_number() over (
            partition by sd.nct_id
            order by sd.start_deviance, sd."version"
        ) as rownum,
        sd.submission_date,
        sd.start_deviance,
        sd.start_date,
        ts.primary_completion_date,
        ts.primary_completion_date_category,
        ts.overall_status,
        ts.enrollment,
        ts.enrollment_category
    from snapshot_deviance as sd
    inner join history.trial_snapshots as ts
        on sd.nct_id = ts.nct_id
        and sd."version" = ts."version"
),
primary_completion_pick as (
    /* For each trial, the rank of the version closest to the start date
     * that has a filled-out primary_completion_date.
     */
    select
        rs.nct_id,
        min(rs.rownum) as primary_completion_source
    from ranked_snapshots as rs
    where rs.primary_completion_date is not null
    group by rs.nct_id
),
enrollment_pick as (
    /* For each trial, the rank of the version closest to the start date
     * that has a filled-out enrollment.
     */
    select
        rs.nct_id,
        min(rs.rownum) as enrollment_source
    from ranked_snapshots as rs
    where rs.enrollment is not null
    group by rs.nct_id
),
planned_values as (
    /* Join the best options together to get the data of interest.
     *
     * On further inspection there are just a few trials whose two values come
     * from different versions, with many of them having a 7+ month difference
     * between the two versions. They are dropped by the final filter.
     */
    select
        ep.nct_id,
        enr."version" as version_a,
        enr.enrollment,
        enr.enrollment_category,
        pcd."version" as version_b,
        pcd.primary_completion_date,
        pcd.primary_completion_date_category
    from enrollment_pick as ep
    inner join ranked_snapshots as enr
        on ep.nct_id = enr.nct_id
        and ep.enrollment_source = enr.rownum
    inner join primary_completion_pick as pp
        on ep.nct_id = pp.nct_id
    inner join ranked_snapshots as pcd
        on pp.nct_id = pcd.nct_id
        and pp.primary_completion_source = pcd.rownum
),
mismatched_sources as (
    -- trials whose enrollment and primary completion date were taken from
    -- different versions
    select pv.nct_id
    from planned_values as pv
    where pv.version_a != pv.version_b
)
select
    c.nct_id,
    s2.overall_status,
    c.enrollment as planned_enrollment,
    s2.enrollment,
    s2.start_date,
    c.primary_completion_date as planned_primary_completion_date,
    s2.primary_completion_date,
    -- seconds converted to days
    extract(epoch from c.primary_completion_date - s2.start_date)/(24*60*60) as planned_duration,
    -- NOTE(review): the unit here depends on the column types in ctgov.studies
    -- (date - date yields whole days) - confirm it matches planned_duration
    s2.primary_completion_date - s2.start_date as actual_duration
from planned_values as c
inner join ctgov.studies as s2
    on c.nct_id = s2.nct_id
-- not exists instead of not in: a null nct_id in the subquery can no longer
-- empty the whole result
where not exists (
    select 1
    from mismatched_sources as ms
    where ms.nct_id = c.nct_id
)
;
|
|
|
|
|
|
/*
 * Concern about causal inference
 *
 * When putting the data together for CBO it looked like we got occasional updates about
 * the status of trials that included enrollment updates.
 * That doesn't appear to be the case, but that messes with the ability to causally identify
 * any results. I need to be careful about how this data is used.
 *
 * I created the statements below to get the data that I need.
 */
|
|
|
|
|
|
|
|
----get the set of trial snapshots
-- One row per distinct combination of the listed snapshot fields for each
-- trial with an accepted ICD-10 mapping, restricted to snapshots submitted on
-- or after the start date whose status is neither Completed nor Terminated.
-- earliest_date_observed is the first submission date showing that combination.
create or replace view public.view_cte as
select
    ts.nct_id,
    ts.primary_completion_date,
    ts.primary_completion_date_category,
    ts.enrollment,
    ts.start_date,
    ts.enrollment_category,
    ts.overall_status,
    min(ts.submission_date) as earliest_date_observed
from history.trial_snapshots as ts
where
    ts.submission_date >= ts.start_date
    and ts.overall_status not in ('Completed','Terminated')
    and exists (
        select 1
        from "DiseaseBurden".trial_to_icd10 as tti
        where tti.nct_id = ts.nct_id
            and tti.approved = 'accepted'
    )
group by
    ts.nct_id,
    ts.primary_completion_date,
    ts.primary_completion_date_category,
    ts.enrollment,
    ts.start_date,
    ts.enrollment_category,
    ts.overall_status
;
|
|
-- Accepted trial-to-ICD10 matches joined to their cause and the cause's
-- "level" in cause_hierarchy. Codes are compared with '-' and '.' removed
-- from both sides.
create or replace view public.view_disbur_cte0 as
select
    tti.nct_id,
    tti.ui,
    tti."condition",
    itc.cause_text,
    ch.cause_id,
    ch."level"
from "DiseaseBurden".trial_to_icd10 as tti
inner join "DiseaseBurden".icd10_to_cause as itc
    -- translate with an empty replacement set deletes every '-' and '.'
    on translate(tti.ui, '-.', '') = translate(itc.code, '-.', '')
inner join "DiseaseBurden".cause_hierarchy as ch
    on ch.cause_name = itc.cause_text
where tti.approved = 'accepted'
;
|
|
-- Same rows and columns as public.view_disbur_cte0, ordered by nct_id.
-- The two views previously carried identical copies of the join; this one now
-- reads from view_disbur_cte0 so the matching logic lives in one place.
create or replace view public.view_trial_to_cause as
select
    base.nct_id,
    base.ui,
    base."condition",
    base.cause_text,
    base.cause_id,
    base."level"
from public.view_disbur_cte0 as base
order by base.nct_id
;
|
|
|
|
-- Largest cause-hierarchy "level" matched for each trial.
create or replace view public.view_disbur_cte as
select
    matched.nct_id,
    max(matched."level") as max_level
from view_disbur_cte0 as matched
group by matched.nct_id
;
|
|
-- Distinct code / condition / cause rows of each trial, keeping only the rows
-- whose "level" equals the trial's largest matched level.
create or replace view public.view_disbur_cte2 as
select distinct
    ttc.nct_id,
    ttc.ui,
    ttc."condition",
    ttc.cause_text,
    ttc.cause_id,
    top_level.max_level
from view_trial_to_cause as ttc
inner join view_disbur_cte as top_level
    on top_level.nct_id = ttc.nct_id
    and top_level.max_level = ttc."level"
order by ttc.nct_id, ttc.ui
;
|
|
-- Attach the level-1 ICD10 category whose [start_code, end_code] range contains
-- the first three characters of each trial's code.
create or replace view public.view_disbur_cte3 as
select
    src.nct_id,
    substring(src.ui for 3) as code,
    src."condition",
    src.cause_text,
    src.cause_id,
    ic.chapter_code as category_id,
    ic.group_name,
    src.max_level
from view_disbur_cte2 as src
inner join "DiseaseBurden".icd10_categories as ic
    on substring(src.ui for 3) between ic.start_code and ic.end_code
where ic."level" = 1
;
|
|
-- Burden rows narrowed to a single measure / metric / sex / age combination.
create or replace view public.view_burdens_cte as
select burden.*
from "DiseaseBurden".burdens as burden
where
    burden.measure_id = 2     --DALYs
    and burden.metric_id = 1  --number/count
    and burden.sex_id = 3     --both sexes
    and burden.age_id = 22    --all ages
;
|
|
-- Pivot view_burdens_cte from one row per location into one row per
-- (cause_id, year), with val / upper_95 / lower_95 for five location ids side
-- by side. Inner joins: a cause/year appears only if all five locations have it.
create or replace view public.view_burdens_cte2 as
select
    hi.cause_id,
    hi."year",
    --high sdi (location 44635)
    hi.val          as h_sdi_val,
    hi.upper_95     as h_sdi_u95,
    hi.lower_95     as h_sdi_l95,
    --high-middle sdi (location 44634)
    hi_mid.val      as hm_sdi_val,
    hi_mid.upper_95 as hm_sdi_u95,
    hi_mid.lower_95 as hm_sdi_l95,
    --middle sdi (location 44639)
    mid.val         as m_sdi_val,
    mid.upper_95    as m_sdi_u95,
    mid.lower_95    as m_sdi_l95,
    --low-middle sdi (location 44636)
    lo_mid.val      as lm_sdi_val,
    lo_mid.upper_95 as lm_sdi_u95,
    lo_mid.lower_95 as lm_sdi_l95,
    --low sdi (location 44637)
    lo.val          as l_sdi_val,
    lo.upper_95     as l_sdi_u95,
    lo.lower_95     as l_sdi_l95
from view_burdens_cte as hi
inner join view_burdens_cte as hi_mid
    on hi_mid.cause_id = hi.cause_id
    and hi_mid."year" = hi."year"
    and hi_mid.location_id = 44634
inner join view_burdens_cte as mid
    on mid.cause_id = hi.cause_id
    and mid."year" = hi."year"
    and mid.location_id = 44639
inner join view_burdens_cte as lo_mid
    on lo_mid.cause_id = hi.cause_id
    and lo_mid."year" = hi."year"
    and lo_mid.location_id = 44636
inner join view_burdens_cte as lo
    on lo.cause_id = hi.cause_id
    and lo."year" = hi."year"
    and lo.location_id = 44637
where hi.location_id = 44635
;
|
|
--drop view if exists public.formatted_data cascade;
/* One row per observed trial state (from view_cte) and matched cause, carrying:
 *   - elapsed_duration: (earliest_date_observed - start_date) divided by
 *     (primary_completion_date - start_date);
 *   - n_brands: distinct applications/citations whose "min" value is on or
 *     before earliest_date_observed;
 *   - the ICD10 / cause columns from view_disbur_cte3;
 *   - the burden columns from view_burdens_cte2 for the calendar year of
 *     earliest_date_observed.
 * All joins are inner joins, so a trial state without a marketing match, a
 * cause match or a burden row for that year is absent from the view.
 */
create or replace view public.formatted_data as
select
    cte.nct_id,
    cte.start_date,
    cte.enrollment as current_enrollment,
    cte.enrollment_category,
    cte.overall_status as current_status,
    cte.earliest_date_observed,
    -- nullif guards the denominator: when primary_completion_date equals
    -- start_date the ratio is NULL instead of a division-by-zero error that
    -- would make the whole view unreadable
    extract(epoch from (cte.earliest_date_observed - cte.start_date))
        / nullif(extract(epoch from (cte.primary_completion_date - cte.start_date)), 0) as elapsed_duration,
    count(distinct mttmsd."application_number_or_citation") as n_brands,
    dbc3.code,
    dbc3."condition",
    dbc3.cause_text,
    dbc3.cause_id,
    dbc3.category_id,
    dbc3.group_name,
    dbc3.max_level,
    b."year",
    --high sdi
    b.h_sdi_val,
    b.h_sdi_u95,
    b.h_sdi_l95,
    --high-middle sdi
    b.hm_sdi_val,
    b.hm_sdi_u95,
    b.hm_sdi_l95,
    --middle sdi
    b.m_sdi_val,
    b.m_sdi_u95,
    b.m_sdi_l95,
    --low-middle sdi
    b.lm_sdi_val,
    b.lm_sdi_u95,
    b.lm_sdi_l95,
    --low sdi
    b.l_sdi_val,
    b.l_sdi_u95,
    b.l_sdi_l95
from view_cte as cte
inner join public.match_trial_to_marketing_start_date as mttmsd
    on cte.nct_id = mttmsd."nct_id"
inner join view_disbur_cte3 as dbc3
    on dbc3.nct_id = cte.nct_id
inner join view_burdens_cte2 as b
    on b.cause_id = dbc3.cause_id
    and extract(year from b."year") = extract(year from cte.earliest_date_observed)
where
    mttmsd."min" <= cte.earliest_date_observed
group by
    cte.nct_id,
    cte.start_date,
    cte.enrollment,
    cte.enrollment_category,
    cte.overall_status,
    cte.earliest_date_observed,
    elapsed_duration, -- the output column defined above
    dbc3.code,
    dbc3."condition",
    dbc3.cause_text,
    dbc3.cause_id,
    dbc3.category_id,
    dbc3.group_name,
    dbc3.max_level,
    b.cause_id,
    b."year",
    b.h_sdi_val,
    b.h_sdi_u95,
    b.h_sdi_l95,
    b.hm_sdi_val,
    b.hm_sdi_u95,
    b.hm_sdi_l95,
    b.m_sdi_val,
    b.m_sdi_u95,
    b.m_sdi_l95,
    b.lm_sdi_val,
    b.lm_sdi_u95,
    b.lm_sdi_l95,
    b.l_sdi_val,
    b.l_sdi_u95,
    b.l_sdi_l95
order by cte.nct_id, cte.earliest_date_observed
;--used this one 2023-04-05
|
|
--get the planned enrollment
/* For every snapshot version of a trial that has a row in
 * "DiseaseBurden".trial_to_icd10: the absolute gap, in days, between the
 * snapshot's submission_date and the start date recorded in ctgov.studies.
 */
create or replace view public.time_between_submission_and_start_view as
select
    study.nct_id,
    study.start_date,
    snap."version",
    snap.submission_date,
    abs(extract(epoch from snap.submission_date - study.start_date)::float / (24*60*60)) as start_deviance
from history.trial_snapshots as snap
inner join ctgov.studies as study
    on study.nct_id = snap.nct_id
where exists (
    select 1
    from "DiseaseBurden".trial_to_icd10 as tti
    where tti.nct_id = study.nct_id
)
;
|
|
create or replace view rank_proximity_to_start_time_view as
/* Rank each snapshot version of a trial by its proximity to the start date
 * (rownum = 1 is the closest) and attach that snapshot's status, enrollment
 * and primary-completion fields.
 *
 * The version number is a tie-break. This view is read both directly and
 * through enrollment_closest_to_start_view, and the two reads are matched on
 * rownum; without a tie-break, two versions equally far from the start date
 * could be numbered differently in each read and the wrong snapshot matched.
 */
select
    cte.nct_id,
    cte."version",
    row_number() over (
        partition by cte.nct_id
        order by cte.start_deviance, cte."version"
    ) as rownum,
    cte.submission_date,
    cte.start_deviance,
    cte.start_date,
    ts.primary_completion_date,
    ts.primary_completion_date_category,
    ts.overall_status,
    ts.enrollment,
    ts.enrollment_category
from public.time_between_submission_and_start_view as cte
inner join history.trial_snapshots as ts
    on cte.nct_id = ts.nct_id
    and cte."version" = ts."version"
;
|
|
create or replace view enrollment_closest_to_start_view as
/* Per trial: the smallest proximity rank (rownum) among the snapshot versions
 * that have a non-null enrollment, i.e. the version closest to the start date
 * with enrollment filled out.
 */
select
    ranked.nct_id,
    min(ranked.rownum) as enrollment_source
from rank_proximity_to_start_time_view as ranked
where ranked.enrollment is not null
group by ranked.nct_id
;
|
|
--drop view public.formatted_data_with_planned_enrollment ;
/* formatted_data extended with the trial's overall_status from ctgov.studies
 * (final_status) and with the version number and enrollment of the snapshot
 * picked by enrollment_closest_to_start_view (planned_enrollment).
 */
create or replace view formatted_data_with_planned_enrollment as
select
    f.*,
    study.overall_status as final_status,
    ranked."version",
    ranked.enrollment as planned_enrollment
from formatted_data as f
inner join enrollment_closest_to_start_view as pick
    on pick.nct_id = f.nct_id
inner join rank_proximity_to_start_time_view as ranked
    on ranked.nct_id = pick.nct_id
    and ranked.rownum = pick.enrollment_source
inner join ctgov.studies as study
    on study.nct_id = f.nct_id
;
|
|
-- terminator added: without it this statement ran into the first count query
select * from formatted_data_with_planned_enrollment;

-------------------GET COUNTS------------------
-- Distinct trials present in each view; the trailing numbers are the counts
-- the author noted next to each query.
select count(distinct nct_id) from public.view_cte; --88
select count(distinct nct_id) from public.view_disbur_cte0; --130
select count(distinct nct_id) from public.view_trial_to_cause; --130
select count(distinct nct_id) from public.view_disbur_cte;--130
select count(distinct nct_id) from public.view_disbur_cte2;--130
select count(distinct nct_id) from public.view_disbur_cte3;--130
select count(distinct nct_id) from public.formatted_data; --48 probably because there are so many trials that don't fall into a GBD category/cause
select count(distinct nct_id) from public.time_between_submission_and_start_view;--1067
select count(distinct nct_id) from rank_proximity_to_start_time_view;--1067
select count(distinct nct_id) from enrollment_closest_to_start_view;--1067
select count(distinct nct_id) from formatted_data_with_planned_enrollment;--48
|
|
|
|
|
|
|
|
|