-- Scripts/VariousDevelopmentsForAnalysis.sql
--
-- Exploratory queries over formatted_data_mat plus the RxNorm / USP DC lookups.
-- Dialect: PostgreSQL ('::' casts, materialized views, DISTINCT ON).
-- Every statement is terminated with ';' so the script can be run top to
-- bottom as well as one statement at a time.


-- ---------------------------------------------------------------------------
-- Quick looks at the base tables
-- ---------------------------------------------------------------------------
select *
from formatted_data_with_planned_enrollment fdwpe
;

select *
from formatted_data_mat fdm
;

-- how many distinct conditions are present
select count(distinct fdm.condition) as n_conditions
from formatted_data_mat fdm
;

-- rows per trial and status
select
    fdm.nct_id
    , fdm.current_status
    , count(*) as n_rows
from formatted_data_mat fdm
group by
    fdm.nct_id
    , fdm.current_status
order by fdm.nct_id
;

-- how many distinct categories are present, and which ones
select count(distinct fdm.category_id) as n_categories
from formatted_data_mat fdm
;

select distinct fdm.category_id
from formatted_data_mat fdm
;


-- ---------------------------------------------------------------------------
-- Group with trial split
-- Trials that were observed in more than one status, one row per distinct
-- combination of the listed columns.
-- TODO: join to usp dc dataset
-- ---------------------------------------------------------------------------
with multi_status_trials as (
    -- trials with at least two different values of current_status
    select nct_id
    from formatted_data_mat
    group by nct_id
    having count(distinct current_status) > 1
)
select distinct  -- same effect as grouping by every selected column
    fdm.nct_id
    , fdm.current_status
    , fdm.earliest_date_observed
    , fdm.elapsed_duration
    , fdm.n_brands
    , fdm.category_id
    , fdm.h_sdi_val
    , fdm.h_sdi_u95
    , fdm.h_sdi_l95
    , fdm.hm_sdi_val
    , fdm.hm_sdi_u95
    , fdm.hm_sdi_l95
    , fdm.m_sdi_val
    , fdm.m_sdi_u95
    , fdm.m_sdi_l95
    , fdm.lm_sdi_val
    , fdm.lm_sdi_u95
    , fdm.lm_sdi_l95
    , fdm.l_sdi_val
    , fdm.l_sdi_u95
    , fdm.l_sdi_l95
from formatted_data_mat fdm
    inner join multi_status_trials mst
        on mst.nct_id = fdm.nct_id
order by
    nct_id
    , earliest_date_observed
;


-- ---------------------------------------------------------------------------
-- USP category / class per trial
-- mesh term -> RxNorm active ingredient -> related rxcui -> usp_dc row
-- ---------------------------------------------------------------------------
with trialncts as (
    select distinct ts.nct_id
    from history.trial_snapshots ts
), nct_to_cui as (
    -- The where clause filters on rp and rr, so rows without a match in those
    -- tables can never survive; inner joins state that directly.
    select
        bi.nct_id
        , bi.downcase_mesh_term
        , rr.tty2
        , rr.rxcui2 as approved_drug_rxcui
        , count(*) as n_matches
    from ctgov.browse_interventions bi
        inner join rxnorm_migrated.rxnorm_props rp
            on bi.downcase_mesh_term::text = rp.propvalue1::text
        inner join rxnorm_migrated.rxnorm_relations rr
            on rr.rxcui1 = rp.rxcui
    where
        bi.nct_id::text in (select trialncts.nct_id from trialncts)
        and bi.mesh_type::text = 'mesh-list'
        and rp.propname::text = 'Active_ingredient_name'
        and rr.tty2::text in ('BPCK', 'SCD', 'SBD', 'GPCK')
    group by
        bi.nct_id
        , bi.downcase_mesh_term
        , rr.tty2
        , rr.rxcui2
)
select
    nct_to_cui.nct_id
    , ud."USP Category"
    , ud."USP Class"
from nct_to_cui
    inner join "Formularies".usp_dc ud
        on ud.rxcui::bpchar = nct_to_cui.approved_drug_rxcui
group by
    nct_to_cui.nct_id
    , ud."USP Category"
    , ud."USP Class"
order by nct_to_cui.nct_id
;


-- ---------------------------------------------------------------------------
-- Brand-name count per trial
-- "if not exists" keeps the script re-runnable. Drop and recreate the view to
-- change its definition, or refresh it to pick up new data.
-- ---------------------------------------------------------------------------
create materialized view if not exists "Formularies".nct_to_brands_through_uspdc
as
with trialncts as (
    select distinct ts.nct_id
    from history.trial_snapshots ts
)
select
    bi.nct_id
    , count(distinct rr2.rxcui2) as brand_name_count
from ctgov.browse_interventions bi
    -- match mesh terms to rxcui
    inner join rxnorm_migrated.rxnorm_props rp
        on bi.downcase_mesh_term::text = rp.propvalue1::text
    -- match rxcui to relations between rxcuis (first hop)
    inner join rxnorm_migrated.rxnorm_relations rr
        on rr.rxcui1 = rp.rxcui
    -- follow the related rxcui one more hop
    inner join rxnorm_migrated.rxnorm_relations rr2
        on rr.rxcui2 = rr2.rxcui1
where
    -- the nct_id is in our list of trials
    bi.nct_id::text in (select trialncts.nct_id from trialncts)
    -- only mesh "list" terms
    and bi.mesh_type::text = 'mesh-list'
    -- only properties that name an active ingredient
    and rp.propname::text = 'Active_ingredient_name'
    -- first hop: from the active ingredient to these term types
    and rr.tty2::text in ('BPCK', 'SCD', 'SBD', 'GPCK')
    -- second hop: from those back to brand names
    and rr2.tty2::text = 'BN'
group by bi.nct_id  -- one row per trial
;


-- ---------------------------------------------------------------------------
-- Analysis columns joined to the brand-name count
-- Left out on purpose: start_date, current_enrollment, enrollment_category
-- and the *_sdi_u95 / *_sdi_l95 bounds.
-- ---------------------------------------------------------------------------
select
    fdqpe.nct_id
    , fdqpe.current_status
    , fdqpe.earliest_date_observed
    , fdqpe.elapsed_duration
    , fdqpe.n_brands as identical_brands
    , ntbtu.brand_name_count
    , fdqpe.category_id
    , fdqpe.final_status
    , fdqpe.h_sdi_val
    , fdqpe.hm_sdi_val
    , fdqpe.m_sdi_val
    , fdqpe.lm_sdi_val
    , fdqpe.l_sdi_val
from formatted_data_mat fdqpe
    inner join "Formularies".nct_to_brands_through_uspdc ntbtu
        on fdqpe.nct_id = ntbtu.nct_id
;


-- ---------------------------------------------------------------------------
-- Example of multiple reopenings
-- ---------------------------------------------------------------------------
select *
from formatted_data_mat fdm
where fdm.nct_id = 'NCT01239797'
;


-- ---------------------------------------------------------------------------
-- Attempt to automatically find transition periods
-- One row per trial: first date seen as 'Active, not recruiting' versus the
-- last date seen in any other status. Plain aggregates are used so that each
-- side has exactly one row per trial and the join cannot multiply rows.
-- ---------------------------------------------------------------------------
with closed_enrollment as (
    select
        nct_id
        , min(earliest_date_observed) as earliest_closed_enrollment
    from formatted_data_mat
    where current_status = 'Active, not recruiting'
    group by nct_id
), open_enrollment as (
    select
        nct_id
        , max(earliest_date_observed) as latest_open_enrollment
    from formatted_data_mat
    where current_status <> 'Active, not recruiting'
    group by nct_id
)
select
    ce.nct_id
    , ce.earliest_closed_enrollment
    , oe.latest_open_enrollment
    , ce.earliest_closed_enrollment - oe.latest_open_enrollment as transition_gap
from closed_enrollment ce
    inner join open_enrollment oe
        on ce.nct_id = oe.nct_id
;


/* Occasionally a study reopens enrollment.
 * If that never happened, the first row matching X and/or the last row
 * matching Y would be enough to find the transitions.
 * Instead, build the status shift between consecutive snapshots and drop the
 * shifts where nothing changed.
 *
 * Better yet, just get the last shift to ANR ('Active, not recruiting').
 */

/* Take each entry and get the status from the previous (lagged) snapshot,
 * keep the snapshots that move from some other status to ANR,
 * and keep only the latest such snapshot per trial.
 */
with status_shifts as (
    select
        nct_id
        , lag(current_status, 1) over (
            partition by nct_id
            order by earliest_date_observed
        ) as previous_status
        , current_status
        , earliest_date_observed as date_current
    from formatted_data_mat
), cte2 as (
    -- distinct on + descending date keeps exactly one row per trial: the most
    -- recent change into ANR.
    -- NOTE: '<>' is never true when previous_status is null, so a trial whose
    -- first row is already ANR contributes no shift for that row.
    select distinct on (nct_id)
        nct_id
        , previous_status
        , current_status
        , date_current as date_current_max
    from status_shifts
    where
        previous_status <> current_status
        and current_status = 'Active, not recruiting'
    order by
        nct_id
        , date_current desc
)
-- join back into the full rows of that snapshot
select *
from formatted_data_mat fdm
    inner join cte2
        on cte2.nct_id = fdm.nct_id
        and cte2.date_current_max = fdm.earliest_date_observed
;