-- ClinicalTrialsDataProcessing/Scripts/ExtractingData.sql

-- Install the tablefunc extension if it is not already present, so the script
-- can be re-run without failing on an "extension already exists" error.
-- NOTE(review): nothing visible in this script calls a tablefunc function;
-- presumably it is needed by other scripts -- confirm before removing.
create extension if not exists tablefunc;

/*Getting Trial Data all together
 * There are 3 main datasets to join per trial:
 *
 * - Trial Data (still need to stick it together)
 *   - Duration and enrollment data
 * - Compound Marketing (can get for any trial)
 *   - how many individual brands per compound at the start of the trial
 * - Disease Data (can get for verified trials)
 *   - Population upper limit (Global Burdens of Disease)
 *   - Category (ICD10 2nd level groups)
 */

/*Disease Data*/
-- ICD10 Category and GBD data
-- For every trial keep only the causes sitting at the deepest "level" matched
-- for that trial, then map the first three characters of each ICD-10 code onto
-- every category whose [start_code, end_code] range contains it.
-- Result: the distinct (nct_id, cause_id, category_id) combinations.
with deepest_level as (
    -- deepest cause "level" seen per trial
    select
        t.nct_id,
        max(t."level") as max_level
    from trial_to_cause as t
    group by t.nct_id
), deepest_causes as (
    -- distinct cause rows that sit at that deepest level
    select distinct
        t.nct_id,
        t.ui,
        t."condition",
        t.cause_text,
        t.cause_id,
        d.max_level
    from trial_to_cause as t
        join deepest_level as d
            on d.nct_id = t.nct_id
            and t."level" = d.max_level
    order by nct_id, ui
), categorized as (
    -- attach the ICD-10 category whose code range covers the 3-character code
    select
        dc.nct_id,
        substring(dc.ui for 3) as code,
        dc."condition",
        dc.cause_text,
        dc.cause_id,
        ic.id as category_id,
        ic.group_name
    from deepest_causes as dc
        join "DiseaseBurden".icd10_categories as ic
            on substring(dc.ui for 3) between ic.start_code and ic.end_code
)
select distinct
    nct_id,
    cause_id,
    category_id
from categorized
;
--still need to link to actual disease burdens.

/*Compound Marketing Data*/
---Number of trials after a certain date
-- Per trial, count the distinct application numbers / citations among the
-- rows whose "min" date is later than 2012-06-01.
-- NOTE(review): "min" is presumably the marketing start date -- confirm
-- against the definition of match_trial_to_marketing_start_date.
select
    m.nct_id,
    count(distinct m.application_number_or_citation)
from public.match_trial_to_marketing_start_date as m
where m."min" > '2012-06-01'
group by m.nct_id
;

/*Get versions*/
/* Ignore this version
with cte1 as (
    select nct_id,min("version") over (partition by nct_id) as min_version
    from history.trial_snapshots ts
    where
        ts.start_date < ts.submission_date
), cte2 as (
    select * from cte1
    group by nct_id, min_version
    order by nct_id
), cte3 as (
    select
        ts2.nct_id,
        ts2."version",
        ts2.overall_status,
        ts2.submission_date,
        ts2.start_date,
        ts2.enrollment,
        ts2.enrollment_category,
        ts2.primary_completion_date,
        ts2.primary_completion_date_category ,
        --mv.nct_id,
        mv.min_version
    from history.trial_snapshots ts2
        join cte2 mv
            on mv.nct_id = ts2.nct_id
    where
        ts2."version" = mv.min_version
    order by ts2.nct_id
), cte4 as (
    select cte3.nct_id, cte3.submission_date - cte3.start_date as submission_presecence
    from cte3
)
select avg(submission_presecence)
from cte4
;
--avg 61 day difference
 */

--use this version
-- Data-quality check: for each trial take the latest snapshot that was
-- submitted before the recorded start date, and list those snapshots that are
-- missing enrollment, primary-completion or start-date information.
with latest_prestart_version as (
    -- Highest snapshot version per trial among snapshots with
    -- start_date > submission_date.
    -- NOTE: the column keeps its historical name min_version although it
    -- holds the MAX version, so the output column names stay unchanged.
    select
        ts.nct_id,
        max(ts."version") as min_version
    from history.trial_snapshots ts
    where ts.start_date > ts.submission_date
    group by ts.nct_id
), prestart_snapshot as (
    -- Full snapshot row for that version.
    select
        ts2.nct_id,
        ts2."version",
        ts2.overall_status,
        ts2.submission_date,
        ts2.start_date,
        ts2.enrollment,
        ts2.enrollment_category,
        ts2.primary_completion_date,
        ts2.primary_completion_date_category,
        mv.min_version
    from history.trial_snapshots ts2
        join latest_prestart_version mv
            on mv.nct_id = ts2.nct_id
    where
        ts2."version" = mv.min_version
    order by ts2.nct_id
)
select *
from prestart_snapshot
    where
        enrollment is null
        or enrollment_category is null
        or primary_completion_date is null
        or primary_completion_date_category is null
        or start_date is null
; -- terminator was missing: the statement used to run into the next "with"
/*, cte4 as (
    select cte3.nct_id, cte3.submission_date - cte3.start_date as submission_presecence
    from cte3
)
select avg(submission_presecence)
from cte4
; -- -33 day difference on average
*/


-- Sanity check on version ordering around the start date. Per trial:
--   min_version = lowest version submitted on or after the start date
--   max_version = highest version submitted on or before the start date
-- Lists the trials that have both and where min_version >= max_version.
with first_version_from_start as (
    select
        ts.nct_id,
        min(ts."version") as min_version
    from history.trial_snapshots ts
    where ts.start_date <= ts.submission_date
    group by ts.nct_id
), last_version_up_to_start as (
    select
        ts.nct_id,
        max(ts."version") as max_version
    from history.trial_snapshots ts
    where ts.start_date >= ts.submission_date
    group by ts.nct_id
)
select *
from first_version_from_start cte2_min
    join last_version_up_to_start cte2_max
        on cte2_min.nct_id = cte2_max.nct_id
where min_version >= max_version
; -- terminator was missing: the statement used to run into the next "with"


/* Need to take a different tack in filling out this version of the data.
 * The idea is that we need the latest of each major category
 * before the start date.
 * */

-- For the trials that have an ICD-10 mapping, compare the planned enrollment
-- and primary completion date (taken from the snapshot closest to the start
-- date that has each value filled in) with the current values in
-- ctgov.studies.
with cte as (
    /* Absolute difference, measured in days, between the study start date
     * and the submission_date of each snapshot version of the trial.
     */
    select
        s.nct_id,
        s.start_date,
        ts."version",
        ts.submission_date,
        abs(extract(epoch from ts.submission_date - s.start_date)::float/(24*60*60)) as start_deviance
    from ctgov.studies s
        join history.trial_snapshots ts
            on s.nct_id = ts.nct_id
    where s.nct_id in (select distinct nct_id from "DiseaseBurden".trial_to_icd10 tti)
),cte2 as (
    /* Rank each version by its proximity to the start date (1 = closest).
     * "version" is the tie-breaker: without it, two versions equally far
     * from the start date would be ranked in an arbitrary order.
     */
    select
        cte.nct_id,
        cte."version",
        row_number() over (
            partition by cte.nct_id
            order by cte.start_deviance, cte."version"
        ) as rownum,
        cte.submission_date,
        cte.start_deviance,
        cte.start_date,
        ts.primary_completion_date ,
        ts.primary_completion_date_category ,
        ts.overall_status ,
        ts.enrollment ,
        ts.enrollment_category
    from cte
        join history.trial_snapshots ts
            on cte.nct_id=ts.nct_id and cte."version"=ts."version"
), cte3_primary_completion as (
    /* For each trial, the rank of the version closest to the start date
     * that has a primary_completion_date filled in.
     */
    select cte2.nct_id, min(cte2.rownum) as primary_completion_source
    from cte2
    where cte2.primary_completion_date is not null
    group by cte2.nct_id
), cte3_enrollment as (
    /* For each trial, the rank of the version closest to the start date
     * that has an enrollment filled in.
     */
    select cte2.nct_id, min(cte2.rownum) as enrollment_source
    from cte2
    where cte2.enrollment is not null
    group by cte2.nct_id
), cte4 as (
/* Join the best options together to get the data of interest.
 *
 * On further inspection there are just a few trials where the two source
 * versions differ, with many of them having a 7+ month difference between
 * the two versions. They are dropped below (see cte5).
 */
select
    c3e.nct_id,
    --c2a.submission_date as submission_date_a,
    --c2b.submission_date as submission_date_b,
    --c3e.enrollment_source,
    c2a."version" as version_a,
    c2a.enrollment,
    c2a.enrollment_category,
    --c3p.primary_completion_source ,
    c2b."version" as version_b,
    c2b.primary_completion_date,
    c2b.primary_completion_date_category
from cte3_enrollment c3e
    join cte2 c2a
        on c3e.nct_id = c2a.nct_id and c3e.enrollment_source = c2a.rownum
    join cte3_primary_completion c3p
        on c3e.nct_id = c3p.nct_id
    join cte2 c2b
        on c3p.nct_id=c2b.nct_id and c3p.primary_completion_source = c2b.rownum
), cte5 as (
    -- trials whose enrollment and primary completion date come from
    -- different versions
    select nct_id
    from cte4 where version_a != version_b
)
select
    c.nct_id,
    s2.overall_status,
    c.enrollment as planned_enrollment,
    s2.enrollment,
    s2.start_date,
    c.primary_completion_date as planned_primary_completion_date,
    s2.primary_completion_date,
    -- planned_duration is expressed in days (epoch seconds / 86400);
    -- actual_duration is the raw difference of the two ctgov.studies columns.
    extract(epoch from c.primary_completion_date - s2.start_date)/(24*60*60) as planned_duration,
    s2.primary_completion_date - s2.start_date as actual_duration
from cte4 c
    join ctgov.studies s2
        on c.nct_id = s2.nct_id
-- NOT EXISTS rather than NOT IN: same rows here, but immune to the
-- NOT IN / NULL trap should a NULL ever reach cte5.
where not exists (
    select 1
    from cte5
    where cte5.nct_id = c.nct_id
)
;


/*
 * Concern about causal inference
 *
 * When putting the data together for CBO it looked like we got occasional updates about
 * the status of trials that included enrollment updates.
 * That doesn't appear to be the case, but that messes with the ability to causally identify
 * any results. I need to be careful about how this data is used.
 *
 * I created the statements below to get the data that I need.
 */


----get the set of trial snapshots
-- One row per distinct observed state of a trial (completion date, enrollment,
-- start date, status), together with the earliest submission date on which
-- that state was seen. Restricted to:
--   * trials with an 'accepted' ICD-10 mapping,
--   * snapshots submitted on or after the start date,
--   * snapshots whose status is neither 'Completed' nor 'Terminated'.
create or replace view public.view_cte as
select
    snap.nct_id,
    snap.primary_completion_date,
    snap.primary_completion_date_category,
    snap.enrollment,
    snap.start_date,
    snap.enrollment_category,
    snap.overall_status,
    min(snap.submission_date) as earliest_date_observed
from history.trial_snapshots as snap
where
    snap.overall_status not in ('Completed','Terminated')
    and snap.submission_date >= snap.start_date
    and exists (
        select 1
        from "DiseaseBurden".trial_to_icd10 as accepted
        where accepted.approved = 'accepted'
            and accepted.nct_id = snap.nct_id
    )
group by
    snap.nct_id,
    snap.primary_completion_date,
    snap.primary_completion_date_category,
    snap.enrollment,
    snap.start_date,
    snap.enrollment_category,
    snap.overall_status
;
-- Accepted trial -> ICD-10 mappings joined to their cause and to that cause's
-- position in the cause hierarchy. ICD-10 codes are compared after stripping
-- '-' and '.' from both sides.
create or replace view public.view_disbur_cte0 as
select
    mapping.nct_id,
    mapping.ui,
    mapping."condition",
    cause_map.cause_text,
    hier.cause_id,
    hier."level"
from "DiseaseBurden".trial_to_icd10 as mapping
    inner join "DiseaseBurden".icd10_to_cause as cause_map
        on replace(replace(mapping.ui, '.', ''), '-', '')
            = replace(replace(cause_map.code, '.', ''), '-', '')
    inner join "DiseaseBurden".cause_hierarchy as hier
        on hier.cause_name = cause_map.cause_text
where mapping.approved = 'accepted'
;
-- Accepted trial -> cause rows, ordered by trial.
-- This view used to repeat the whole definition of public.view_disbur_cte0
-- verbatim (plus the ORDER BY). It is now built on top of that view so the
-- ICD-10 -> cause matching logic lives in a single place; the columns and
-- rows are unchanged.
create or replace view  public.view_trial_to_cause as
select
    c0.nct_id,
    c0.ui,
    c0."condition",
    c0.cause_text,
    c0.cause_id,
    c0."level"
from public.view_disbur_cte0 as c0
order by nct_id
;

-- Deepest cause-hierarchy "level" matched for each trial.
create or replace view  public.view_disbur_cte as
select
    c0.nct_id,
    max(c0."level") as max_level
from view_disbur_cte0 as c0
group by c0.nct_id
;
-- Distinct cause rows per trial, restricted to the causes that sit at the
-- deepest level matched for that trial (see view_disbur_cte).
create or replace view  public.view_disbur_cte2 as
select distinct
    cause_row.nct_id,
    cause_row.ui,
    cause_row."condition",
    cause_row.cause_text,
    cause_row.cause_id,
    deepest.max_level
from view_trial_to_cause as cause_row
    join view_disbur_cte as deepest
        on deepest.nct_id = cause_row.nct_id
        and cause_row."level" = deepest.max_level
order by nct_id, ui
;
-- Attach the level-1 ICD-10 category to each deepest-level cause row: a row
-- matches a category when the first three characters of its ICD-10 code fall
-- inside the category's [start_code, end_code] range.
create or replace view  public.view_disbur_cte3 as
select
    deep.nct_id,
    substring(deep.ui for 3) as code,
    deep."condition",
    deep.cause_text,
    deep.cause_id,
    cat.chapter_code as category_id,
    cat.group_name,
    deep.max_level
from view_disbur_cte2 as deep
    join "DiseaseBurden".icd10_categories as cat
        on substring(deep.ui for 3) between cat.start_code and cat.end_code
where cat."level" = 1
;
-- Disease-burden rows narrowed to one slice:
-- DALYs, as counts, all ages, both sexes.
create or replace view  public.view_burdens_cte as
select *
from "DiseaseBurden".burdens as b
where
    b.measure_id = 2    -- DALYs
    and b.metric_id = 1 -- number/count
    and b.age_id = 22   -- all ages
    and b.sex_id = 3    -- both sexes
;
-- Pivot the burden slice into one row per (cause_id, year), with the value and
-- its 95% bounds for each of the five SDI groupings side by side. Each SDI
-- grouping is one location_id of view_burdens_cte; a (cause, year) appears
-- only when all five locations have a row for it.
create or replace view  public.view_burdens_cte2 as
select
    high.cause_id,
    high."year",
    -- high sdi
    high.val          as h_sdi_val,
    high.upper_95     as h_sdi_u95,
    high.lower_95     as h_sdi_l95,
    -- high-middle sdi
    high_mid.val      as hm_sdi_val,
    high_mid.upper_95 as hm_sdi_u95,
    high_mid.lower_95 as hm_sdi_l95,
    -- middle sdi
    mid.val           as m_sdi_val,
    mid.upper_95      as m_sdi_u95,
    mid.lower_95      as m_sdi_l95,
    -- low-middle sdi
    low_mid.val       as lm_sdi_val,
    low_mid.upper_95  as lm_sdi_u95,
    low_mid.lower_95  as lm_sdi_l95,
    -- low sdi
    low.val           as l_sdi_val,
    low.upper_95      as l_sdi_u95,
    low.lower_95      as l_sdi_l95
from view_burdens_cte as high
    join view_burdens_cte as high_mid
        on high_mid.cause_id = high.cause_id
        and high_mid."year" = high."year"
        and high_mid.location_id = 44634
    join view_burdens_cte as mid
        on mid.cause_id = high.cause_id
        and mid."year" = high."year"
        and mid.location_id = 44639
    join view_burdens_cte as low_mid
        on low_mid.cause_id = high.cause_id
        and low_mid."year" = high."year"
        and low_mid.location_id = 44636
    join view_burdens_cte as low
        on low.cause_id = high.cause_id
        and low."year" = high."year"
        and low.location_id = 44637
where high.location_id = 44635
;
--drop view if exists public.formatted_data cascade;
-- Analysis table: one row per observed trial state (from view_cte) and
-- matched cause/category (from view_disbur_cte3), with
--   * elapsed_duration: share of the planned duration
--     (primary_completion_date - start_date) that had elapsed when the state
--     was first observed,
--   * n_brands: distinct application numbers / citations whose "min" date is
--     on or before that first observation,
--   * the SDI-grouped burden figures for the cause in the year of that first
--     observation.
-- Trials with no qualifying marketing row, cause row or burden row are
-- dropped, because every join is an inner join.
create or replace view public.formatted_data as
select
    cte.nct_id,
    cte.start_date,
    cte.enrollment as current_enrollment,
    cte.enrollment_category,
    cte.overall_status  as current_status,
    cte.earliest_date_observed,
    -- nullif guards the division: a snapshot whose primary_completion_date
    -- equals its start_date has a zero planned duration, which would raise
    -- "division by zero" and fail the whole view; it now yields NULL instead.
    extract( epoch from (cte.earliest_date_observed - cte.start_date))
        / nullif(extract( epoch from (cte.primary_completion_date - cte.start_date)), 0) as elapsed_duration
    ,count(distinct mttmsd."application_number_or_citation") as n_brands
    ,dbc3.code
    ,dbc3."condition"
    ,dbc3.cause_text
    ,dbc3.cause_id
    ,dbc3.category_id
    ,dbc3.group_name
    ,dbc3.max_level
    --c1.location_id,
    --,b.cause_id
    ,b."year",
    --high sdi
    b.h_sdi_val,
    b.h_sdi_u95,
    b.h_sdi_l95,
    --high-middle sdi
    b.hm_sdi_val,
    b.hm_sdi_u95,
    b.hm_sdi_l95,
    --middle sdi
    b.m_sdi_val,
    b.m_sdi_u95,
    b.m_sdi_l95,
    --low-middle sdi
    b.lm_sdi_val,
    b.lm_sdi_u95,
    b.lm_sdi_l95,
    --low sdi
    b.l_sdi_val,
    b.l_sdi_u95,
    b.l_sdi_l95
from view_cte as cte
    join public.match_trial_to_marketing_start_date mttmsd
        on cte.nct_id = mttmsd."nct_id"
    join view_disbur_cte3 dbc3
        on dbc3.nct_id = cte.nct_id
    join view_burdens_cte2 b
        on b.cause_id = dbc3.cause_id and extract(year from b."year") = extract(year from cte.earliest_date_observed)
where
    mttmsd."min" <= cte.earliest_date_observed
group by
    cte.nct_id,
    cte.start_date,
    cte.enrollment,
    cte.enrollment_category,
    cte.overall_status,
    cte.earliest_date_observed,
    elapsed_duration -- output-column alias of the ratio computed above
    ,dbc3.code
    ,dbc3."condition"
    ,dbc3.cause_text
    ,dbc3.cause_id
    ,dbc3.category_id
    ,dbc3.group_name
    ,dbc3.max_level
    --c1.location_id,
    ,b.cause_id,
    b."year",
    --high sdi
    b.h_sdi_val,
    b.h_sdi_u95,
    b.h_sdi_l95,
    --high-middle sdi
    b.hm_sdi_val,
    b.hm_sdi_u95,
    b.hm_sdi_l95,
    --middle sdi
    b.m_sdi_val,
    b.m_sdi_u95,
    b.m_sdi_l95,
    --low-middle sdi
    b.lm_sdi_val,
    b.lm_sdi_u95,
    b.lm_sdi_l95,
    --low sdi
    b.l_sdi_val,
    b.l_sdi_u95,
    b.l_sdi_l95
order by cte.nct_id ,cte.earliest_date_observed
;--used this one 2023-04-05
--get the planned enrollment
-- For every snapshot version of a trial that has an ICD-10 mapping: the
-- absolute distance, in days, between the snapshot's submission_date and the
-- study's start date.
create or replace view public.time_between_submission_and_start_view as
select
    study.nct_id,
    study.start_date,
    snap."version",
    snap.submission_date,
    abs(
        cast(extract(epoch from snap.submission_date - study.start_date) as float)
        / (24 * 60 * 60)
    ) as start_deviance
from ctgov.studies as study
    join history.trial_snapshots as snap
        on snap.nct_id = study.nct_id
where exists (
    select 1
    from "DiseaseBurden".trial_to_icd10 as mapped
    where mapped.nct_id = study.nct_id
)
;
create or replace view rank_proximity_to_start_time_view as
    /* Rank each snapshot version of a trial by its proximity to the start
     * date (rownum 1 = closest), and carry along the snapshot's completion,
     * status and enrollment fields.
     *
     * "version" is used as a tie-breaker. A view is re-evaluated every time
     * it is referenced, and formatted_data_with_planned_enrollment reads this
     * view twice (directly and via enrollment_closest_to_start_view), joining
     * the two results on rownum. Ordering by start_deviance alone leaves the
     * rank of equally distant versions unspecified, so the two evaluations
     * were not guaranteed to agree.
     * */
    select
        cte.nct_id,
        cte."version",
        row_number() over (
            partition by cte.nct_id
            order by cte.start_deviance, cte."version"
        ) as rownum,
        cte.submission_date,
        cte.start_deviance,
        cte.start_date,
        ts.primary_completion_date ,
        ts.primary_completion_date_category ,
        ts.overall_status ,
        ts.enrollment ,
        ts.enrollment_category
    from public.time_between_submission_and_start_view cte
        join history.trial_snapshots ts
            on cte.nct_id=ts.nct_id and cte."version"=ts."version"
;
-- Per trial: the best (lowest) proximity rank among the snapshot versions
-- that have an enrollment value, i.e. the version closest to the start date
-- with enrollment filled in.
create or replace view enrollment_closest_to_start_view as
select
    ranked.nct_id,
    min(ranked.rownum) as enrollment_source
from rank_proximity_to_start_time_view as ranked
where ranked.enrollment is not null
group by ranked.nct_id
;
--drop view public.formatted_data_with_planned_enrollment ;
-- formatted_data extended with:
--   * final_status: overall_status currently recorded in ctgov.studies,
--   * "version" / planned_enrollment: the snapshot version closest to the
--     start date that has an enrollment value, and that enrollment.
create or replace view formatted_data_with_planned_enrollment as
select
    f.*,
    study.overall_status as final_status,
    planned."version",
    planned.enrollment as planned_enrollment
from formatted_data as f
    join enrollment_closest_to_start_view as closest
        on closest.nct_id = f.nct_id
    join rank_proximity_to_start_time_view as planned
        on planned.nct_id = closest.nct_id
        and planned.rownum = closest.enrollment_source
    join ctgov.studies as study
        on study.nct_id = f.nct_id
;
-- Inspect the final dataset. The terminating semicolon was missing, so the
-- statement used to run into the first count query further down.
select * from formatted_data_with_planned_enrollment;

-------------------GET COUNTS------------------
-- Distinct-trial counts per view; the number in each comment is the count
-- recorded when the script was last run.

-- public.view_cte: 88
select count(distinct nct_id)
from public.view_cte;

-- public.view_disbur_cte0: 130
select count(distinct nct_id)
from public.view_disbur_cte0;

-- public.view_trial_to_cause: 130
select count(distinct nct_id)
from public.view_trial_to_cause;

-- public.view_disbur_cte: 130
select count(distinct nct_id)
from public.view_disbur_cte;

-- public.view_disbur_cte2: 130
select count(distinct nct_id)
from public.view_disbur_cte2;

-- public.view_disbur_cte3: 130
select count(distinct nct_id)
from public.view_disbur_cte3;

-- public.formatted_data: 48, probably because there are so many trials that
-- don't fall into a GBD category/cause
select count(distinct nct_id)
from public.formatted_data;

-- public.time_between_submission_and_start_view: 1067
select count(distinct nct_id)
from public.time_between_submission_and_start_view;

-- rank_proximity_to_start_time_view: 1067
select count(distinct nct_id)
from rank_proximity_to_start_time_view;

-- enrollment_closest_to_start_view: 1067
select count(distinct nct_id)
from enrollment_closest_to_start_view;

-- formatted_data_with_planned_enrollment: 48
select count(distinct nct_id)
from formatted_data_with_planned_enrollment;