-- Idempotent so the script can be re-run without failing on an existing extension.
create extension if not exists tablefunc;

/* Getting Trial Data all together
 * There are 3 main datasets to join per trial:
 *
 * - Trial Data (still need to stick it together)
 *   - Duration and enrollment data
 * - Compound Marketing (can get for any trial)
 *   - how many individual brands per compound at the start of the trial
 * - Disease Data (can get for verified trials)
 *   - Population upper limit (Global Burdens of Disease)
 *   - Category (ICD10 2nd level groups)
 */

/*Disease Data*/
-- ICD10 Category and GBD data
--
-- For every trial, keep only the cause mappings whose "level" equals the
-- highest "level" recorded for that trial, then map the first three characters
-- of the code (ui) onto the ICD-10 category whose [start_code, end_code] range
-- contains it. The final select returns the distinct
-- (nct_id, cause_id, category_id) combinations.
with cte as (
    -- highest cause "level" available per trial
    select
        nct_id,
        max("level") as max_level
    from trial_to_cause
    group by nct_id
),
cte2 as (
    -- de-duplicated cause mappings at that highest level
    select
        ttc.nct_id,
        ttc.ui,
        ttc."condition",
        ttc.cause_text,
        ttc.cause_id,
        cte.max_level
    from trial_to_cause ttc
    join cte
        on cte.nct_id = ttc.nct_id
    where ttc."level" = cte.max_level
    group by
        ttc.nct_id,
        ttc.ui,
        ttc."condition",
        ttc.cause_text,
        ttc.cause_id,
        cte.max_level
    order by
        nct_id,
        ui
),
cte3 as (
    -- attach the ICD-10 category whose code range covers the 3-character prefix
    -- NOTE(review): no filter on ic."level" here, so a prefix may match
    -- categories at several levels -- confirm this is intended.
    select
        nct_id,
        substring(cte2.ui for 3) as code,
        cte2."condition",
        cte2.cause_text,
        cte2.cause_id,
        ic.id as category_id,
        ic.group_name
    from cte2
    join "DiseaseBurden".icd10_categories ic
        on substring(cte2.ui for 3) <= ic.end_code
        and substring(cte2.ui for 3) >= ic.start_code
)
select
    nct_id,
    cause_id,
    category_id
from cte3
group by
    nct_id,
    cause_id,
    category_id
;
--still need to link to actual disease burdens.
/*Compound Marketing Data*/
---Number of trials after a certain date
-- Per trial, counts the distinct application numbers/citations whose "min"
-- value is later than 2012-06-01.
-- NOTE(review): "min" is presumably the earliest marketing start date -- confirm.
with marketing_cte as (
    select
        nct_id,
        count(distinct application_number_or_citation)
    from public.match_trial_to_marketing_start_date mttmsd
    where "min" > '2012-06-01'
    group by nct_id
)
select *
from marketing_cte
;

/*Get versions*/
/* Ignore this version
with cte1 as (
    select nct_id, min("version") over (partition by nct_id) as min_version
    from history.trial_snapshots ts
    where ts.start_date < ts.submission_date
), cte2 as (
    select *
    from cte1
    group by nct_id, min_version
    order by nct_id
), cte3 as (
    select
        ts2.nct_id,
        ts2."version",
        ts2.overall_status,
        ts2.submission_date,
        ts2.start_date,
        ts2.enrollment,
        ts2.enrollment_category,
        ts2.primary_completion_date,
        ts2.primary_completion_date_category,
        --mv.nct_id,
        mv.min_version
    from history.trial_snapshots ts2
    join cte2 mv
        on mv.nct_id = ts2.nct_id
    where ts2."version" = mv.min_version
    order by ts2.nct_id
), cte4 as (
    select
        cte3.nct_id,
        cte3.submission_date - cte3.start_date as submission_presecence
    from cte3
)
select avg(submission_presecence)
from cte4
;
--avg 61 day difference
*/

--use this version
-- Lists the latest snapshot submitted before the trial's start date whenever
-- that snapshot is missing enrollment, primary completion or start-date data.
with cte1 as ( --get trials
    -- The window runs after the WHERE filter, so despite the alias this is the
    -- HIGHEST version among snapshots submitted before the start date.
    select
        nct_id,
        max("version") over (partition by nct_id) as min_version
    from history.trial_snapshots ts
    where ts.start_date > ts.submission_date
),
cte2 as (
    -- collapse the per-snapshot window rows to one row per trial
    select *
    from cte1
    group by
        nct_id,
        min_version
    order by nct_id
),
cte3 as (
    -- full snapshot row for the selected version
    select
        ts2.nct_id,
        ts2."version",
        ts2.overall_status,
        ts2.submission_date,
        ts2.start_date,
        ts2.enrollment,
        ts2.enrollment_category,
        ts2.primary_completion_date,
        ts2.primary_completion_date_category,
        --mv.nct_id,
        mv.min_version
    from history.trial_snapshots ts2
    join cte2 mv
        on mv.nct_id = ts2.nct_id
    where ts2."version" = mv.min_version
    order by ts2.nct_id
)
select *
from cte3
where enrollment is null
    or enrollment_category is null
    or primary_completion_date is null
    or primary_completion_date_category is null
    or start_date is null
; -- terminator added: the only ';' used to sit inside the comment block below
/*, cte4 as (
    select
        cte3.nct_id,
        cte3.submission_date - cte3.start_date as submission_presecence
    from cte3
)
select avg(submission_presecence)
from cte4
;
-- -33 day difference on average
*/

-- Trials whose earliest snapshot submitted on/after the start date has a
-- version number >= the latest snapshot submitted on/before the start date.
with cte1_min as (
    select
        nct_id,
        min("version") over (partition by nct_id) as min_version
    from history.trial_snapshots ts
    where ts.start_date <= ts.submission_date
),
cte1_max as (
    select
        nct_id,
        max("version") over (partition by nct_id) as max_version
    from history.trial_snapshots ts
    where ts.start_date >= ts.submission_date
),
cte2_min as (
    select *
    from cte1_min
    group by
        nct_id,
        min_version
),
cte2_max as (
    select *
    from cte1_max
    group by
        nct_id,
        max_version
)
select *
from cte2_min
join cte2_max
    on cte2_min.nct_id = cte2_max.nct_id
where min_version >= max_version
; -- terminator added: this statement previously ran into the next "with"

/* Need to take a different tack in filling out this version of the data.
 * The idea is that we need the latest of each major category
 * before the start date.
 * */
--get the set of trials which have
with cte as (
    /* Get the absolute difference between the start date and the
     * submission_date for each version of the trial (measured in days)
     * */
    select
        s.nct_id,
        s.start_date,
        ts."version",
        ts.submission_date,
        abs(extract(epoch from ts.submission_date - s.start_date)::float/(24*60*60)) as start_deviance
    from ctgov.studies s
    join history.trial_snapshots ts
        on s.nct_id = ts.nct_id
    where s.nct_id in (select distinct nct_id from "DiseaseBurden".trial_to_icd10 tti)
),
cte2 as (
    /* Rank each version based on its proximity to the start date
     * */
    select
        cte.nct_id,
        cte."version",
        -- "version" breaks ties so the ranking is deterministic when two
        -- snapshots are equally far from the start date
        row_number() over (
            partition by cte.nct_id
            order by cte.start_deviance, cte."version"
        ) as rownum,
        cte.submission_date,
        cte.start_deviance,
        cte.start_date,
        ts.primary_completion_date,
        ts.primary_completion_date_category,
        ts.overall_status,
        ts.enrollment,
        ts.enrollment_category
    from cte
    join history.trial_snapshots ts
        on cte.nct_id = ts.nct_id
        and cte."version" = ts."version"
),
cte3_primary_completion as (
    /* for each trial
     * select the version with a filled out primary_completion_date
     * that is closest to the start date.
     * */
    select
        cte2.nct_id,
        min(cte2.rownum) as primary_completion_source
    from cte2
    where cte2.primary_completion_date is not null
    group by cte2.nct_id
),
cte3_enrollment as (
    /* for each trial
     * select the version with a filled out enrollment
     * that is closest to the start date.
     * */
    select
        cte2.nct_id,
        min(cte2.rownum) as enrollment_source
    from cte2
    where cte2.enrollment is not null
    group by cte2.nct_id
),
cte4 as (
    /* join the best options together to get the data of interest.
     *
     * On further inspection there are just a few of those, with
     * many of them having a 7+ month difference between the two versions.
     * I think I am going to drop them.
     * */
    select
        c3e.nct_id,
        --c2a.submission_date as submission_date_a,
        --c2b.submission_date as submission_date_b,
        --c3e.enrollment_source,
        c2a."version" as version_a,
        c2a.enrollment,
        c2a.enrollment_category,
        --c3p.primary_completion_source ,
        c2b."version" as version_b,
        c2b.primary_completion_date,
        c2b.primary_completion_date_category
    from cte3_enrollment c3e
    join cte2 c2a
        on c3e.nct_id = c2a.nct_id
        and c3e.enrollment_source = c2a.rownum
    join cte3_primary_completion c3p
        on c3e.nct_id = c3p.nct_id
    join cte2 c2b
        on c3p.nct_id = c2b.nct_id
        and c3p.primary_completion_source = c2b.rownum
),
cte5 as (
    -- trials whose enrollment and primary completion date come from
    -- different snapshot versions; these are excluded below
    select nct_id
    from cte4
    where version_a != version_b
)
select
    c.nct_id,
    s2.overall_status,
    c.enrollment as planned_enrollment,
    s2.enrollment,
    s2.start_date,
    c.primary_completion_date as planned_primary_completion_date,
    s2.primary_completion_date,
    -- planned duration in days (epoch seconds / seconds per day)
    extract(epoch from c.primary_completion_date - s2.start_date)/(24*60*60) as planned_duration,
    s2.primary_completion_date - s2.start_date as actual_duration
from cte4 c
join ctgov.studies s2
    on c.nct_id = s2.nct_id
-- NOT EXISTS instead of NOT IN: a NULL nct_id in cte5 would make NOT IN
-- return no rows at all
where not exists (
    select 1
    from cte5
    where cte5.nct_id = c.nct_id
)
;

/*
 * Concern about causal inference
 *
 * When putting the data together for CBO it looked like we got occasional updates about
 * the status of trials that included enrollment updates.
 * That doesn't appear to be the case, but that messes with the ability to causally identify
 * any results. I need to be careful about how this data is used.
 *
 * I created the statements below to get the data that I need.
 */

----get the set of trial snapshots
-- One row per distinct combination of dates, enrollment and status for each
-- accepted trial, taken from snapshots submitted on or after the start date
-- and not yet 'Completed' or 'Terminated'. earliest_date_observed is the first
-- submission date on which that combination appeared.
create or replace view public.view_cte as
select
    nct_id,
    primary_completion_date,
    primary_completion_date_category,
    enrollment,
    start_date,
    enrollment_category,
    overall_status,
    --count("version"),
    min(submission_date) as earliest_date_observed
from history.trial_snapshots ts
where nct_id in (
        select distinct nct_id
        from "DiseaseBurden".trial_to_icd10 tti
        where tti.approved = 'accepted'
    )
    and submission_date >= start_date
    and overall_status not in ('Completed','Terminated')
group by
    nct_id,
    primary_completion_date,
    primary_completion_date_category,
    start_date,
    enrollment,
    enrollment_category,
    overall_status
;

-- Accepted trial -> ICD-10 code -> cause mapping. Codes are compared after
-- stripping '-' and '.', and causes are matched to the hierarchy by name.
create or replace view public.view_disbur_cte0 as
select
    tti.nct_id,
    tti.ui,
    tti."condition",
    itc.cause_text,
    ch.cause_id,
    ch."level"
from "DiseaseBurden".trial_to_icd10 tti
join "DiseaseBurden".icd10_to_cause itc
    on replace(replace(tti.ui,'-',''),'.','') = replace(replace(itc.code,'-',''),'.','')
join "DiseaseBurden".cause_hierarchy ch
    on itc.cause_text = ch.cause_name
where tti.approved = 'accepted'
;

-- Same select list, joins and filter as public.view_disbur_cte0; the only
-- difference is the trailing "order by nct_id".
create or replace view public.view_trial_to_cause as
select
    tti.nct_id,
    tti.ui,
    tti."condition",
    itc.cause_text,
    ch.cause_id,
    ch."level"
from "DiseaseBurden".trial_to_icd10 tti
join "DiseaseBurden".icd10_to_cause itc
    on replace(replace(tti.ui,'-',''),'.','') = replace(replace(itc.code,'-',''),'.','')
join "DiseaseBurden".cause_hierarchy ch
    on itc.cause_text = ch.cause_name
where tti.approved = 'accepted'
order by nct_id
;--does this duplicate the view above?
-- Highest cause-hierarchy "level" available for each trial.
create or replace view public.view_disbur_cte as
select
    nct_id,
    max("level") as max_level
from view_disbur_cte0
group by nct_id
;

-- De-duplicated cause mappings restricted to each trial's highest level.
create or replace view public.view_disbur_cte2 as
select
    ttc.nct_id,
    ttc.ui,
    ttc."condition",
    ttc.cause_text,
    ttc.cause_id,
    disbur_cte.max_level
from view_trial_to_cause ttc
join view_disbur_cte as disbur_cte
    on disbur_cte.nct_id = ttc.nct_id
where ttc."level" = disbur_cte.max_level
group by
    ttc.nct_id,
    ttc.ui,
    ttc."condition",
    ttc.cause_text,
    ttc.cause_id,
    disbur_cte.max_level
order by
    nct_id,
    ui
;

-- Attaches the level-1 ICD-10 category whose [start_code, end_code] range
-- contains the first three characters of the trial's code.
create or replace view public.view_disbur_cte3 as
select
    nct_id,
    substring(disbur_cte2.ui for 3) as code,
    disbur_cte2."condition",
    disbur_cte2.cause_text,
    disbur_cte2.cause_id,
    ic.chapter_code as category_id,
    ic.group_name,
    disbur_cte2.max_level
from view_disbur_cte2 as disbur_cte2
join "DiseaseBurden".icd10_categories ic
    on substring(disbur_cte2.ui for 3) <= ic.end_code
    and substring(disbur_cte2.ui for 3) >= ic.start_code
where ic."level" = 1
;

-- Burden rows restricted to a single sex/metric/measure/age slice.
create or replace view public.view_burdens_cte as
select *
from "DiseaseBurden".burdens b
where b.sex_id = 3 --both sexes
    and b.metric_id = 1 --number/count
    and b.measure_id = 2 --DALYs
    and b.age_id = 22 --all ages
;

-- Pivots the burden slice to one row per (cause_id, year) with the value and
-- 95% bounds for each of the five SDI location groups as separate columns.
-- A (cause_id, year) appears only when all five locations have a row.
create or replace view public.view_burdens_cte2 as
select
    --c1.location_id,
    c1.cause_id,
    c1."year",
    --high sdi
    c1.val as h_sdi_val,
    c1.upper_95 as h_sdi_u95,
    c1.lower_95 as h_sdi_l95,
    --high-middle sdi
    c2.val as hm_sdi_val,
    c2.upper_95 as hm_sdi_u95,
    c2.lower_95 as hm_sdi_l95,
    --middle sdi
    c3.val as m_sdi_val,
    c3.upper_95 as m_sdi_u95,
    c3.lower_95 as m_sdi_l95,
    --low-middle sdi
    c4.val as lm_sdi_val,
    c4.upper_95 as lm_sdi_u95,
    c4.lower_95 as lm_sdi_l95,
    --low sdi
    c5.val as l_sdi_val,
    c5.upper_95 as l_sdi_u95,
    c5.lower_95 as l_sdi_l95
from view_burdens_cte c1
join view_burdens_cte c2
    on c1.cause_id = c2.cause_id
    and c1."year" = c2."year"
join view_burdens_cte c3
    on c1.cause_id = c3.cause_id
    and c1."year" = c3."year"
join view_burdens_cte c4
    on c1.cause_id = c4.cause_id
    and c1."year" = c4."year"
join view_burdens_cte c5
    on c1.cause_id = c5.cause_id
    and c1."year" = c5."year"
where c1.location_id = 44635 --high sdi
    and c2.location_id = 44634 --high-middle sdi
    and c3.location_id = 44639 --middle sdi
    and c4.location_id = 44636 --low-middle sdi
    and c5.location_id = 44637 --low sdi
;

--drop view if exists public.formatted_data cascade;
-- One row per observed trial state (from view_cte) and matched cause/category,
-- with the number of distinct marketed applications whose "min" date is on or
-- before the observation date, plus the burden figures for the same calendar
-- year as the observation.
create or replace view public.formatted_data as
select
    cte.nct_id,
    cte.start_date,
    cte.enrollment as current_enrollment,
    cte.enrollment_category,
    cte.overall_status as current_status,
    cte.earliest_date_observed,
    -- share of the start -> primary-completion span that had elapsed when the
    -- state was first observed; nullif turns a zero-length span into NULL
    -- instead of raising a division-by-zero error for the whole view
    extract(epoch from (cte.earliest_date_observed - cte.start_date))
        / nullif(extract(epoch from (cte.primary_completion_date - cte.start_date)), 0) as elapsed_duration,
    count(distinct mttmsd."application_number_or_citation") as n_brands,
    dbc3.code,
    dbc3."condition",
    dbc3.cause_text,
    dbc3.cause_id,
    dbc3.category_id,
    dbc3.group_name,
    dbc3.max_level,
    --c1.location_id,
    --b.cause_id,
    b."year",
    --high sdi
    b.h_sdi_val,
    b.h_sdi_u95,
    b.h_sdi_l95,
    --high-middle sdi
    b.hm_sdi_val,
    b.hm_sdi_u95,
    b.hm_sdi_l95,
    --middle sdi
    b.m_sdi_val,
    b.m_sdi_u95,
    b.m_sdi_l95,
    --low-middle sdi
    b.lm_sdi_val,
    b.lm_sdi_u95,
    b.lm_sdi_l95,
    --low sdi
    b.l_sdi_val,
    b.l_sdi_u95,
    b.l_sdi_l95
from view_cte as cte
join public.match_trial_to_marketing_start_date mttmsd
    on cte.nct_id = mttmsd."nct_id"
join view_disbur_cte3 dbc3
    on dbc3.nct_id = cte.nct_id
join view_burdens_cte2 b
    on b.cause_id = dbc3.cause_id
    and extract(year from b."year") = extract(year from cte.earliest_date_observed)
where mttmsd."min" <= cte.earliest_date_observed
group by
    cte.nct_id,
    cte.start_date,
    cte.enrollment,
    cte.enrollment_category,
    cte.overall_status,
    cte.earliest_date_observed,
    elapsed_duration, -- output-column alias: groups by the ratio expression above
    dbc3.code,
    dbc3."condition",
    dbc3.cause_text,
    dbc3.cause_id,
    dbc3.category_id,
    dbc3.group_name,
    dbc3.max_level,
    --c1.location_id,
    b.cause_id,
    b."year",
    --high sdi
    b.h_sdi_val,
    b.h_sdi_u95,
    b.h_sdi_l95,
    --high-middle sdi
    b.hm_sdi_val,
    b.hm_sdi_u95,
    b.hm_sdi_l95,
    --middle sdi
    b.m_sdi_val,
    b.m_sdi_u95,
    b.m_sdi_l95,
    --low-middle sdi
    b.lm_sdi_val,
    b.lm_sdi_u95,
    b.lm_sdi_l95,
    --low sdi
    b.l_sdi_val,
    b.l_sdi_u95,
    b.l_sdi_l95
order by
    cte.nct_id,
    cte.earliest_date_observed
;--used this one 2023-04-05

--get the planned enrollment
create or replace view public.time_between_submission_and_start_view as
/* Get the absolute difference between the start date and the
 * submission_date for each version of the trial (measured in days)
 * */
select
    s.nct_id,
    s.start_date,
    ts."version",
    ts.submission_date,
    abs(extract(epoch from ts.submission_date - s.start_date)::float/(24*60*60)) as start_deviance
from ctgov.studies s
join history.trial_snapshots ts
    on s.nct_id = ts.nct_id
where s.nct_id in (select distinct nct_id from "DiseaseBurden".trial_to_icd10 tti)
;

create or replace view rank_proximity_to_start_time_view as
/* Rank each version based on its proximity to the start date
 * */
select
    cte.nct_id,
    cte."version",
    -- "version" breaks ties: this view is expanded twice by
    -- formatted_data_with_planned_enrollment (directly and through
    -- enrollment_closest_to_start_view), and both expansions must number
    -- equally-distant snapshots identically for the rownum join to be valid
    row_number() over (
        partition by cte.nct_id
        order by cte.start_deviance, cte."version"
    ) as rownum,
    cte.submission_date,
    cte.start_deviance,
    cte.start_date,
    ts.primary_completion_date,
    ts.primary_completion_date_category,
    ts.overall_status,
    ts.enrollment,
    ts.enrollment_category
from public.time_between_submission_and_start_view cte
join history.trial_snapshots ts
    on cte.nct_id = ts.nct_id
    and cte."version" = ts."version"
;

create or replace view enrollment_closest_to_start_view as
/* for each trial
 * select the version with a filled out enrollment
 * that is closest to the start date.
 * */
select
    cte2.nct_id,
    min(cte2.rownum) as enrollment_source
from rank_proximity_to_start_time_view cte2
where cte2.enrollment is not null
group by cte2.nct_id
;

--drop view public.formatted_data_with_planned_enrollment ;
-- formatted_data plus the trial's final status from ctgov.studies and the
-- enrollment figure (with its snapshot version) from the snapshot closest to
-- the start date that has enrollment filled in.
create or replace view formatted_data_with_planned_enrollment as
select
    f.*,
    s.overall_status as final_status,
    c2a."version",
    c2a.enrollment as planned_enrollment
from formatted_data f
join ctgov.studies s
    on f.nct_id = s.nct_id
join enrollment_closest_to_start_view c3e
    on c3e.nct_id = f.nct_id
join rank_proximity_to_start_time_view c2a
    on c3e.nct_id = c2a.nct_id
    and c3e.enrollment_source = c2a.rownum
;

select *
from formatted_data_with_planned_enrollment
; -- terminator added: this statement previously ran into the count queries below

-------------------GET COUNTS------------------
-- Distinct trials surviving each stage; the trailing numbers are the counts
-- the author recorded when this was last run.
select count(distinct nct_id) from public.view_cte; --88
select count(distinct nct_id) from public.view_disbur_cte0; --130
select count(distinct nct_id) from public.view_trial_to_cause; --130
select count(distinct nct_id) from public.view_disbur_cte;--130
select count(distinct nct_id) from public.view_disbur_cte2;--130
select count(distinct nct_id) from public.view_disbur_cte3;--130
select count(distinct nct_id) from public.formatted_data; --48 probably because there are so many trials that don't fall into a GBD category/cause
select count(distinct nct_id) from public.time_between_submission_and_start_view;--1067
select count(distinct nct_id) from rank_proximity_to_start_time_view;--1067
select count(distinct nct_id) from enrollment_closest_to_start_view;--1067
select count(distinct nct_id) from formatted_data_with_planned_enrollment;--48
select count(distinct nct_id) from public.view_trial_to_cause; --130
select count(distinct nct_id) from formatted_data_with_planned_enrollment;--48

--get durations and count snapshots per trial
with cte1 as (
    -- actual duration (primary completion - start) for every trial that has
    -- an entry in http.download_status
    select
        nct_id,
        start_date,
        primary_completion_date,
        overall_status,
        primary_completion_date - start_date as duration
    from ctgov.studies s
    where nct_id in (select distinct nct_id from http.download_status ds)
),
cte2 as (
    -- number of rows each trial contributes to the formatted data set
    select
        nct_id,
        count(*) as snapshot_count
    from formatted_data_with_planned_enrollment fdwpe
    group by nct_id
)
select
    a.nct_id,
    a.overall_status,
    a.duration,
    b.snapshot_count
from cte1 as a
join cte2 as b
    on a.nct_id = b.nct_id
;