Adding sql that was sitting on home computer

2 years ago · dfb041d12b
parent 9aaf007791
commit dfb041d12b
1 changed files with 308 additions and 0 deletions
--- a/Scripts/VariousDevelopmentsForAnalysis.sql
+++ b/Scripts/VariousDevelopmentsForAnalysis.sql
@ -0,0 +1,308 @@
+-- Quick look at the two formatted datasets.
+select * from formatted_data_with_planned_enrollment fdwpe 
+;
+
+
+select * from formatted_data_mat fdm
+;
+
+-- Number of distinct values in the condition column.
+-- (This statement previously had no terminator and ran into the next select.)
+select count(distinct condition ) from formatted_data_mat fdm
+;
+
+-- Row count per trial and status.
+select nct_id, fdm.current_status , count(*) 
+from formatted_data_mat fdm 
+group by nct_id  , fdm.current_status 
+order by nct_id 
+;
+
+select * from formatted_data_mat fdm ;
+
+
+-- group with trial split
+-- Restricts formatted_data_mat to trials that appear under more than one
+-- distinct current_status and returns each distinct combination of the
+-- listed columns for those trials.
+with multi_status_trials as (
+    select nct_id
+    from formatted_data_mat
+    group by nct_id
+    having count(distinct current_status) > 1
+)
+select distinct
+    fdm.nct_id,
+    current_status,
+    earliest_date_observed,
+    elapsed_duration,
+    n_brands,
+    category_id,
+    h_sdi_val,
+    h_sdi_u95,
+    h_sdi_l95,
+    hm_sdi_val,
+    hm_sdi_u95,
+    hm_sdi_l95,
+    m_sdi_val,
+    m_sdi_u95,
+    m_sdi_l95,
+    lm_sdi_val,
+    lm_sdi_u95,
+    lm_sdi_l95,
+    l_sdi_val,
+    l_sdi_u95,
+    l_sdi_l95
+from formatted_data_mat fdm
+-- the CTE holds one row per trial, so membership is equivalent to joining on it
+where fdm.nct_id in (select mst.nct_id from multi_status_trials mst)
+order by nct_id, earliest_date_observed
+;
+
+-- Number of distinct category_id values.
+-- The statement was left unfinished (no table, no terminator); completed against
+-- formatted_data_mat, the table used by the companion query below.
+select count(distinct category_id ) from formatted_data_mat fdm
+;
+
+-- The distinct category_id values themselves.
+select distinct category_id  from formatted_data_mat fdm 
+;
+
+
+
+
+
+-- group with trial split
+-- Keeps only trials observed under more than one distinct current_status,
+-- then collapses duplicate rows over the listed columns.
+WITH split_trials AS (
+    SELECT nct_id
+    FROM formatted_data_mat
+    GROUP BY nct_id
+    HAVING COUNT(DISTINCT current_status) > 1
+)
+SELECT
+    fdm.nct_id,
+    fdm.current_status,
+    fdm.earliest_date_observed,
+    fdm.elapsed_duration,
+    fdm.n_brands,
+    fdm.category_id,
+    fdm.h_sdi_val,
+    fdm.h_sdi_u95,
+    fdm.h_sdi_l95,
+    fdm.hm_sdi_val,
+    fdm.hm_sdi_u95,
+    fdm.hm_sdi_l95,
+    fdm.m_sdi_val,
+    fdm.m_sdi_u95,
+    fdm.m_sdi_l95,
+    fdm.lm_sdi_val,
+    fdm.lm_sdi_u95,
+    fdm.lm_sdi_l95,
+    fdm.l_sdi_val,
+    fdm.l_sdi_u95,
+    fdm.l_sdi_l95
+FROM formatted_data_mat AS fdm
+INNER JOIN split_trials AS st
+    ON st.nct_id = fdm.nct_id
+-- grouping by every selected column only removes exact duplicate rows
+GROUP BY
+    fdm.nct_id,
+    fdm.current_status,
+    fdm.earliest_date_observed,
+    fdm.elapsed_duration,
+    fdm.n_brands,
+    fdm.category_id,
+    fdm.h_sdi_val,
+    fdm.h_sdi_u95,
+    fdm.h_sdi_l95,
+    fdm.hm_sdi_val,
+    fdm.hm_sdi_u95,
+    fdm.hm_sdi_l95,
+    fdm.m_sdi_val,
+    fdm.m_sdi_u95,
+    fdm.m_sdi_l95,
+    fdm.lm_sdi_val,
+    fdm.lm_sdi_u95,
+    fdm.lm_sdi_l95,
+    fdm.l_sdi_val,
+    fdm.l_sdi_u95,
+    fdm.l_sdi_l95
+ORDER BY fdm.nct_id, fdm.earliest_date_observed
+; --TODO: join to usp dc dataset
+
+
+
+
+-- For every trial present in history.trial_snapshots, list the distinct
+-- "USP Category" / "USP Class" pairs reached by matching the trial's MeSH
+-- intervention terms to RxNorm properties, following relations whose tty2 is
+-- BPCK, SCD, SBD or GPCK, and joining the related rxcui to usp_dc.
+WITH trialncts AS (
+    SELECT DISTINCT ts.nct_id
+    FROM history.trial_snapshots AS ts
+),
+-- one row per (trial, mesh term, tty2, related rxcui)
+nct_to_cui AS (
+    SELECT
+        bi.nct_id,
+        bi.downcase_mesh_term,
+        rr.tty2,
+        rr.rxcui2 AS approved_drug_rxcui,
+        count(*) AS count
+    FROM ctgov.browse_interventions AS bi
+    -- The WHERE clause filters on rp.propname and rr.tty2, which rejects
+    -- unmatched rows, so inner joins express the same result.
+    INNER JOIN rxnorm_migrated.rxnorm_props AS rp
+        ON bi.downcase_mesh_term::text = rp.propvalue1::text
+    INNER JOIN rxnorm_migrated.rxnorm_relations AS rr
+        ON rr.rxcui1 = rp.rxcui
+    WHERE bi.nct_id::text IN (SELECT trialncts.nct_id FROM trialncts)
+        AND bi.mesh_type::text = 'mesh-list'::text
+        AND rp.propname::text = 'Active_ingredient_name'::text
+        AND rr.tty2::text IN ('BPCK', 'SCD', 'SBD', 'GPCK')
+    GROUP BY
+        bi.nct_id,
+        bi.downcase_mesh_term,
+        rr.tty2,
+        rr.rxcui2
+)
+SELECT DISTINCT
+    nct_to_cui.nct_id,
+    ud."USP Category",
+    ud."USP Class"
+FROM nct_to_cui
+INNER JOIN "Formularies".usp_dc AS ud
+    ON ud.rxcui::bpchar = nct_to_cui.approved_drug_rxcui
+ORDER BY nct_to_cui.nct_id;
+
+  
+  
+  
+-- Materialized view: per trial, the number of distinct rxcuis reached by
+-- walking MeSH term -> RxNorm property -> relation (tty2 in BPCK/SCD/SBD/GPCK)
+-- -> relation (tty2 = 'BN').
+CREATE MATERIALIZED VIEW "Formularies".nct_to_brands_through_uspdc
+AS
+WITH trialncts AS (
+    SELECT DISTINCT ts.nct_id
+    FROM history.trial_snapshots AS ts
+)
+SELECT
+    bi.nct_id,
+    count(DISTINCT rr2.rxcui2) AS brand_name_count
+FROM ctgov.browse_interventions AS bi
+-- The WHERE filters on rp, rr and rr2 reject unmatched rows, so inner joins
+-- express the same result.
+INNER JOIN rxnorm_migrated.rxnorm_props AS rp
+    ON bi.downcase_mesh_term::text = rp.propvalue1::text  -- match mesh terms to rxcui
+INNER JOIN rxnorm_migrated.rxnorm_relations AS rr
+    ON rr.rxcui1 = rp.rxcui                               -- match rxcui to relations between rxcuis
+INNER JOIN rxnorm_migrated.rxnorm_relations AS rr2
+    ON rr.rxcui2 = rr2.rxcui1                             -- second hop through the relations table
+WHERE bi.nct_id::text IN (SELECT trialncts.nct_id FROM trialncts)  -- the nct_id is in our list
+    AND bi.mesh_type::text = 'mesh-list'::text                     -- only mesh "list" terms
+    AND rp.propname::text = 'Active_ingredient_name'::text         -- only active ingredients
+    AND rr.tty2::text IN ('BPCK', 'SCD', 'SBD', 'GPCK')            -- first hop: from active ingredients to packs / drug products
+    AND rr2.tty2::text = 'BN'                                      -- second hop: back to brand names
+GROUP BY bi.nct_id  -- one row per trial
+;
+
+
+
+/* 
+ * 
+ */
+
+
+-- Snapshot rows from formatted_data_mat joined to the per-trial brand-name
+-- count from the nct_to_brands_through_uspdc materialized view.
+-- Only the *_sdi_val columns are returned; start_date, current_enrollment,
+-- enrollment_category and the *_sdi_u95 / *_sdi_l95 columns are left out.
+select
+    fdm.nct_id,
+    fdm.current_status,
+    fdm.earliest_date_observed,
+    fdm.elapsed_duration,
+    fdm.n_brands as identical_brands,
+    brands.brand_name_count,
+    fdm.category_id,
+    fdm.final_status,
+    fdm.h_sdi_val,
+    fdm.hm_sdi_val,
+    fdm.m_sdi_val,
+    fdm.lm_sdi_val,
+    fdm.l_sdi_val
+from formatted_data_mat as fdm
+inner join "Formularies".nct_to_brands_through_uspdc as brands
+    on brands.nct_id = fdm.nct_id
+;
+
+--example of multiple reopenings
+-- All snapshot rows for a single trial; terminated so it does not run into
+-- the following statement.
+select * 
+from formatted_data_mat fdm 
+where nct_id = 'NCT01239797'
+;
+
+--attempt to automatically find transition periods
+-- One row per trial: the earliest snapshot date with status
+-- 'Active, not recruiting' next to the latest snapshot date with any other
+-- status, and the difference between the two.
+-- Each CTE aggregates with GROUP BY so it yields a single row per nct_id;
+-- the earlier window-function form repeated the value on every source row,
+-- which made the join return many duplicate rows per trial.
+with first_closed as (
+	select nct_id, min(earliest_date_observed) as earliest_closed_enrollment
+	from formatted_data_mat fdm 
+	where current_status = 'Active, not recruiting'
+	group by nct_id
+), last_open as (
+	select nct_id, max(earliest_date_observed) as latest_open_enrollment
+	from formatted_data_mat fdm 
+	where current_status != 'Active, not recruiting'  -- rows with a null status are excluded too
+	group by nct_id
+)
+select 
+	first_closed.nct_id
+	,first_closed.earliest_closed_enrollment
+	,last_open.latest_open_enrollment
+	,first_closed.earliest_closed_enrollment - last_open.latest_open_enrollment as closed_minus_open
+from first_closed
+	join last_open on first_closed.nct_id = last_open.nct_id
+;
+
+	
+
+/* So occasionally a study reopens enrollment.
+ * If that didn't happen, then I could just find the first enrollment matching X and/or the last enrollment matching Y
+ * to get the transitions.
+ * Instead I need to create shifts of statuses between snapshots, and then remove all of those that did not change. 
+ * 
+ * Better yet, just get the last shift to ANR (Active, not recruiting).
+ * */
+
+	
+/* Take each entry and get the status from a lagged snapshot.
+ * Then select each snapshot moving from previous_status to ANR
+ * ('Active, not recruiting') and keep only the last such shift per trial.
+ *
+ * NOTE(review): if a trial has several rows with the same
+ * earliest_date_observed, the lag() ordering among them is not fixed -- confirm
+ * whether formatted_data_mat can contain such ties.
+ * */
+with status_shifts as (
+select 
+	nct_id
+	,lag(current_status, 1) over (partition by nct_id order by earliest_date_observed)  as previous_status
+	,current_status
+	,earliest_date_observed as date_current
+from formatted_data_mat fdm
+), last_shift_to_anr as (
+-- distinct on (nct_id) with date_current descending keeps exactly one row per
+-- trial: its most recent shift into ANR. Grouping by date_current, as before,
+-- kept every shift because max() was then taken over a single date.
+select distinct on (nct_id)
+	nct_id 
+	,previous_status
+	,current_status 
+	,date_current as date_current_max
+from status_shifts
+where 
+	previous_status != current_status  -- also drops the first snapshot, whose previous_status is null
+	and
+	current_status = 'Active, not recruiting'
+order by 
+	nct_id
+	,date_current desc
+)
+select * 
+from formatted_data_mat fdm
+	join last_shift_to_anr 
+		on last_shift_to_anr.nct_id = fdm.nct_id 
+		and last_shift_to_anr.date_current_max = fdm.earliest_date_observed 
+; --join back into formatted_data_mat to pick up the full snapshot row
+