From 6fcc8cda226be573d661308cbe9dd21e8e664060 Mon Sep 17 00:00:00 2001 From: Will King Date: Sat, 1 Feb 2025 19:59:36 +0000 Subject: [PATCH] Fixed column issue in enrollment delay --- r-analysis/EffectsOfEnrollmentDelay.qmd | 54 +- r-analysis/EffectsOfEnrollmentDelay_const.qmd | 1214 ------- .../EffectsOfEnrollmentDelay_rebased.html | 3086 ----------------- .../EffectsOfEnrollmentDelay_rebased.qmd | 1224 ------- .../Hierarchal_Logistic_with_status_diff.stan | 102 - 5 files changed, 27 insertions(+), 5653 deletions(-) delete mode 100644 r-analysis/EffectsOfEnrollmentDelay_const.qmd delete mode 100644 r-analysis/EffectsOfEnrollmentDelay_rebased.html delete mode 100644 r-analysis/EffectsOfEnrollmentDelay_rebased.qmd delete mode 100644 r-analysis/Hierarchal_Logistic_with_status_diff.stan diff --git a/r-analysis/EffectsOfEnrollmentDelay.qmd b/r-analysis/EffectsOfEnrollmentDelay.qmd index 1aea9e5..0602f22 100644 --- a/r-analysis/EffectsOfEnrollmentDelay.qmd +++ b/r-analysis/EffectsOfEnrollmentDelay.qmd @@ -238,14 +238,14 @@ cf_categories <- df_counterfact_base$category_id ################################# FIT MODEL ######################################### inherited_cols <- c( "elapsed_duration" - #,"identical_brands" - #,"brand_name_counts" + ,"identical_brands" + ,"brand_name_counts" ,"h_sdi_val" ,"hm_sdi_val" ,"m_sdi_val" ,"lm_sdi_val" ,"l_sdi_val" - ,"status_NYR"# TODO: may need to remove + ,"status_NYR" ,"status_EBI" ,"status_Rec" ,"status_ANR" @@ -404,7 +404,7 @@ parameter_mcmc_areas <- function( Plan: select all snapshots that are the first to have closed enrollment (Rec -> ANR) ```{r} #delay intervention -intervention_enrollment <- x_cf_base[c(inherited_cols,"brand_name_counts", "identical_brands")] +intervention_enrollment <- x_cf_base[c(inherited_cols)] intervention_enrollment["status_ANR"] <- 0 intervention_enrollment["status_Rec"] <- 1 ``` @@ -419,12 +419,14 @@ counterfact_delay <- list( x = as.matrix(x), mu_mean = 0, mu_stdev = 0.05, - sigma_shape = 4, - sigma_rate = 20, + sigma_shape = 8, + sigma_rate = , Nx = nrow(x_cf_base), llx = as.vector(cf_categories), counterfact_x_tilde = as.matrix(intervention_enrollment), - counterfact_x = as.matrix(x_cf_base) + counterfact_x = as.matrix(x_cf_base), + status_indexes = c(11,12) #subtract anr from recruiting to get movement from anr to recruiting + ) ``` @@ -432,14 +434,24 @@ counterfact_delay <- list( fit <- stan( file='Hierarchal_Logistic.stan', data = counterfact_delay, - chains = 4, - iter = 5000, + chains = 8, + iter = 12000, + warmup = 4000, seed = 11021585 ) ``` +## Fit Results +```{r} +print(check_hmc_diagnostics(fit)) +print(get_bfmi(fit)) +``` + +```{r} +print(fit) +``` @@ -564,13 +576,6 @@ sum(df5$snapshot_count) -## Fit Results - - -```{r} -################################# ANALYZE ##################################### -print(fit) -``` # Parameter Distributions @@ -639,13 +644,13 @@ Note these have 95% outer CI and 80% inner (shaded) ```{r} -#get the generic and uspdc parameters -print(px[4]$plot + px[7]$plot) -ggsave(paste0(image_parameters_across_groups,"2+3_generic_and_uspdc.png")) +##get the generic and uspdc parameters +#print(px[4]$plot + px[7]$plot) +#ggsave(paste0(image_parameters_across_groups,"2+3_generic_and_uspdc.png")) #get the parameters associated with duration -px[16]$plot + px[19]$plot -ggsave(paste0(image_parameters_across_groups,"11+12_statusREC_and_statusANR.png")) +#px[16]$plot + px[19]$plot +#ggsave(paste0(image_parameters_across_groups,"11+12_statusREC_and_statusANR.png")) ``` @@ -718,11 +723,6 @@ ggsave(paste0(image_dist_diff_analysis,"/prior_sigma.png")) -```{r} -check_hmc_diagnostics(fit) -``` - - @@ -1207,4 +1207,4 @@ for (i in 1:3) { - \ No newline at end of file + diff --git a/r-analysis/EffectsOfEnrollmentDelay_const.qmd b/r-analysis/EffectsOfEnrollmentDelay_const.qmd deleted file mode 100644 index d50f988..0000000 --- a/r-analysis/EffectsOfEnrollmentDelay_const.qmd +++ /dev/null @@ -1,1214 +0,0 @@ ---- -title: "The Effects of Recruitment status on completion of clinical trials" -author: "Will King" -format: html -editor: source ---- - - -# Setup - -```{r} -library(knitr) -library(bayesplot) -available_mcmc(pattern = "_nuts_") -library(ggplot2) -library(patchwork) -library(tidyverse) -library(rstan) -library(tidyr) -library(ghibli) -library(xtable) -#Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started - -#save unchanged models instead of recompiling -rstan_options(auto_write = TRUE) -#allow for multithreaded sampling -options(mc.cores = parallel::detectCores()) - -#test installation, shouldn't get any errors -#example(stan_model, package = "rstan", run.dontrun = TRUE) - - -image_root <- "./output/const_anr/EffectsOfEnrollmentDelay" -image_dist_diff_analysis <- paste0(image_root,"/dist_diff_analysis") -image_trial_details <-paste0(image_root,"/trials_details") -image_parameters_by_groups <-paste0(image_root,"/betas/parameters_by_group") -image_parameters_across_groups <-paste0(image_root,"/betas/parameter_across_groups") - -``` - -```{r} -################ Pull data from database ###################### -library(RPostgreSQL) -host <- 'aact_db-restored-2025-01-07' - -driver <- dbDriver("PostgreSQL") - -get_data <- function(driver) { - -con <- dbConnect( - driver, - user='root', - password='root', - dbname='aact_db', - host=host - ) -on.exit(dbDisconnect(con)) - -query <- dbSendQuery( - con, -# "select * from formatted_data_with_planned_enrollment;" -" -select - fdqpe.nct_id - --,fdqpe.start_date - --,fdqpe.current_enrollment - --,fdqpe.enrollment_category - ,fdqpe.current_status - ,fdqpe.earliest_date_observed - ,fdqpe.elapsed_duration - ,fdqpe.n_brands as identical_brands - ,ntbtu.brand_name_counts - ,fdqpe.category_id - ,fdqpe.final_status - ,fdqpe.h_sdi_val - --,fdqpe.h_sdi_u95 - --,fdqpe.h_sdi_l95 - ,fdqpe.hm_sdi_val - --,fdqpe.hm_sdi_u95 - --,fdqpe.hm_sdi_l95 - ,fdqpe.m_sdi_val - --,fdqpe.m_sdi_u95 - --,fdqpe.m_sdi_l95 - ,fdqpe.lm_sdi_val - --,fdqpe.lm_sdi_u95 - --,fdqpe.lm_sdi_l95 - ,fdqpe.l_sdi_val - --,fdqpe.l_sdi_u95 - --,fdqpe.l_sdi_l95 -from formatted_data_with_planned_enrollment fdqpe - join \"Formularies\".nct_to_brand_counts_through_uspdc ntbtu - on fdqpe.nct_id = ntbtu.nct_id -order by fdqpe.nct_id, fdqpe.earliest_date_observed -; -" - ) -df <- fetch(query, n = -1) -df <- na.omit(df) - -query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;") -n_categories <- fetch(query2, n = -1) - -return(list(data=df,ncat=n_categories)) -} - - -get_counterfact_base <- function(driver) { - -con <- dbConnect( - driver, - user='root', - password='root', - dbname='aact_db', - host=host - ) -on.exit(dbDisconnect(con)) - -query <- dbSendQuery( - con, - " - with cte as ( - --get last recruiting state - select fd.nct_id, max(fd.earliest_date_observed),min(fd2.earliest_date_observed) as tmstmp - from formatted_data fd - join formatted_data fd2 - on fd.nct_id=fd2.nct_id and fd.earliest_date_observed < fd2.earliest_date_observed - where fd.current_status = 'Recruiting' - and fd2.current_status = 'Active, not recruiting' - group by fd.nct_id - ) - select - fdqpe.nct_id - --,fdqpe.start_date - --,fdqpe.current_enrollment - --,fdqpe.enrollment_category - ,fdqpe.current_status - ,fdqpe.earliest_date_observed - ,fdqpe.elapsed_duration - ,fdqpe.n_brands as identical_brands - ,ntbtu.brand_name_counts - ,fdqpe.category_id - ,fdqpe.final_status - ,fdqpe.h_sdi_val - --,fdqpe.h_sdi_u95 - --,fdqpe.h_sdi_l95 - ,fdqpe.hm_sdi_val - --,fdqpe.hm_sdi_u95 - --,fdqpe.hm_sdi_l95 - ,fdqpe.m_sdi_val - --,fdqpe.m_sdi_u95 - --,fdqpe.m_sdi_l95 - ,fdqpe.lm_sdi_val - --,fdqpe.lm_sdi_u95 - --,fdqpe.lm_sdi_l95 - ,fdqpe.l_sdi_val - --,fdqpe.l_sdi_u95 - --,fdqpe.l_sdi_l95 - from formatted_data_with_planned_enrollment fdqpe - join \"Formularies\".nct_to_brand_counts_through_uspdc ntbtu - on fdqpe.nct_id = ntbtu.nct_id - join cte - on fdqpe.nct_id = cte.nct_id - and fdqpe.earliest_date_observed = cte.tmstmp - order by fdqpe.nct_id, fdqpe.earliest_date_observed - ; - " - ) -df <- fetch(query, n = -1) -df <- na.omit(df) - -query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;") -n_categories <- fetch(query2, n = -1) - -return(list(data=df,ncat=n_categories)) -} - - -d <- get_data(driver) -df <- d$data -n_categories <- d$ncat - -cf <- get_counterfact_base(driver) -df_counterfact_base <- cf$data - - - -################ Format Data ########################### - -data_formatter <- function(df) { -categories <- df["category_id"] - -x <- df["elapsed_duration"] -x["identical_brands"] <- asinh(df$identical_brands) -x["brand_name_counts"] <- asinh(df$brand_name_count) -x["h_sdi_val"] <- asinh(df$h_sdi_val) -x["hm_sdi_val"] <- asinh(df$hm_sdi_val) -x["m_sdi_val"] <- asinh(df$m_sdi_val) -x["lm_sdi_val"] <- asinh(df$lm_sdi_val) -x["l_sdi_val"] <- asinh(df$l_sdi_val) - - -#Setup fixed effects - -x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0) -x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0) -x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0) - -#set base condition -x["status_ANR"] <- 1 #ifelse(df["current_status"]=="Active, not recruiting",1,0) # - - -y <- ifelse(df["final_status"]=="Terminated",1,0) - -#get category list - - -return(list(x=x,y=y)) -} - -train <- data_formatter(df) -counterfact_base <- data_formatter(df_counterfact_base) - -categories <- df$category_id - -x <- train$x -y <- train$y - -x_cf_base <- counterfact_base$x -y_cf_base <- counterfact_base$y -cf_categories <- df_counterfact_base$category_id -``` - - - -# Fit Model - - - - -```{r} -################################# FIT MODEL ######################################### -inherited_cols <- c( - "elapsed_duration" - #,"identical_brands" - #,"brand_name_counts" - ,"h_sdi_val" - ,"hm_sdi_val" - ,"m_sdi_val" - ,"lm_sdi_val" - ,"l_sdi_val" - ,"status_NYR"# TODO: may need to remove - ,"status_EBI" - ,"status_Rec" - ,"status_ANR" -) -``` - - - -```{r} -beta_list <- list( - groups = c( - `1`="Infections & Parasites", - `2`="Neoplasms", - `3`="Blood & Immune system", - `4`="Endocrine, Nutritional, and Metabolic", - `5`="Mental & Behavioral", - `6`="Nervous System", - `7`="Eye and Adnexa", - `8`="Ear and Mastoid", - `9`="Circulatory", - `10`="Respiratory", - `11`="Digestive", - `12`="Skin & Subcutaneaous tissue", - `13`="Musculoskeletal", - `14`="Genitourinary", - `15`="Pregancy, Childbirth, & Puerperium", - `16`="Perinatal Period", - `17`="Congential", - `18`="Symptoms, Signs etc.", - `19`="Injury etc.", - `20`="External Causes", - `21`="Contact with Healthcare", - `22`="Special Purposes" - ), - parameters = c( - `1`="Elapsed Duration", - # brands - `2`="asinh(Generic Brands)", - `3`="asinh(Competitors USPDC)", - # population - `4`="asinh(High SDI)", - `5`="asinh(High-Medium SDI)", - `6`="asinh(Medium SDI)", - `7`="asinh(Low-Medium SDI)", - `8`="asinh(Low SDI)", - #Status - `9`="status_NYR", - `10`="status_EBI", - `11`="status_Rec", - `12`="status_ANR" - ) -) - -get_parameters <- function(stem,class_list) { - #get categories and lengths - named <- names(class_list) - lengths <- sapply(named, (function (x) length(class_list[[x]]))) - - #describe the grid needed - iter_list <- sapply(named, (function (x) 1:lengths[x])) - - #generate the list of parameters - pardf <- generate_parameter_df(stem, iter_list) - - #add columns with appropriate human-readable names - for (name in named) { - pardf[paste(name,"_hr",sep="")] <- as.factor( - sapply(pardf[name], (function (i) class_list[[name]][i])) - ) - } - - return(pardf) -} - -generate_parameter_df <- function(stem, iter_list) { - grid <- expand.grid(iter_list) - grid["param_name"] <- grid %>% unite(x,colnames(grid),sep=",") - grid["param_name"] <- paste(stem,"[",grid$param_name,"]",sep="") - return(grid) -} - -group_mcmc_areas <- function( - stem,# = "beta" - class_list,# = beta_list - stanfit,# = fit - group_id,# = 2 - rename=TRUE, - filter=NULL - ) { - - #get all parameter names - params <- get_parameters(stem,class_list) - - #filter down to parameters of interest - params <- filter(params,groups == group_id) - #Get dataframe with only the rows of interest - filtdata <- as.data.frame(stanfit)[params$param_name] - #rename columns - if (rename) dimnames(filtdata)[[2]] <- params$parameters_hr - #get group name for title - group_name <- class_list$groups[group_id] - #create area plot with appropriate title - p <- mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) + - ggtitle(paste("Parameter distributions for ICD-10 class:",group_name)) + - geom_vline(xintercept=seq(-2,2,0.5),color="grey",alpha=0.750) - - d <- pivot_longer(filtdata, everything()) |> - group_by(name) |> - summarize( - mean=mean(value) - ,q025 = quantile(value,probs = 0.025) - ,q975 = quantile(value,probs = 0.975) - ,q05 = quantile(value,probs = 0.05) - ,q95 = quantile(value,probs = 0.95) - ) - return(list(plot=p,quantiles=d,name=group_name)) -} - -parameter_mcmc_areas <- function( - stem,# = "beta" - class_list,# = beta_list - stanfit,# = fit - parameter_id,# = 2 - rename=TRUE - ) { - #get all parameter names - params <- get_parameters(stem,class_list) - #filter down to parameters of interest - params <- filter(params,parameters == parameter_id) - #Get dataframe with only the rows of interest - filtdata <- as.data.frame(stanfit)[params$param_name] - #rename columns - if (rename) dimnames(filtdata)[[2]] <- params$groups_hr - #get group name for title - parameter_name <- class_list$parameters[parameter_id] - #create area plot with appropriate title - p <- mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) + - ggtitle(parameter_name,"Parameter Distribution") + - geom_vline(xintercept=seq(-2,2,0.5),color="grey",alpha=0.750) - - d <- pivot_longer(filtdata, everything()) |> - group_by(name) |> - summarize( - mean=mean(value) - ,q025 = quantile(value,probs = 0.025) - ,q975 = quantile(value,probs = 0.975) - ,q05 = quantile(value,probs = 0.05) - ,q95 = quantile(value,probs = 0.95) - ) - return(list(plot=p,quantiles=d,name=parameter_name)) -} - - -``` - -Plan: select all snapshots that are the first to have closed -enrollment (Rec -> ANR when comparing across snapshots), and then -```{r} -#delay intervention -intervention_enrollment <- x_cf_base[c(inherited_cols,"brand_name_counts", "identical_brands")] -intervention_enrollment["status_ANR"] <- 0 -intervention_enrollment["status_Rec"] <- 1 -``` - -```{r} -counterfact_delay <- list( - D = ncol(x),# - N = nrow(x), - L = n_categories$count, - y = as.vector(y), - ll = as.vector(categories), - x = as.matrix(x), - mu_mean = 0, - mu_stdev = 0.05, - sigma_shape = 4, - sigma_rate = 20, - Nx = nrow(x_cf_base), - llx = as.vector(cf_categories), - counterfact_x_tilde = as.matrix(intervention_enrollment), - counterfact_x = as.matrix(x_cf_base), - status_indexes = c(11,12) #subtract anr from recruiting to get movement from anr to recruiting -) -``` - -```{r} -fit <- stan( - file='Hierarchal_Logistic.stan', - data = counterfact_delay, - chains = 4, - iter = 5000, - seed = 11021585 - ) -``` - - - -## Calculate relative difference in parameters between recruiting and active not recruiting states - - - -## Explore data - - -```{r} -#get number of trials and snapshots in each category -group_trials_by_category <- as.data.frame(aggregate(category_id ~ nct_id, df, max)) -group_trials_by_category <- as.data.frame(group_trials_by_category) - -category_count <- group_trials_by_category |> group_by(category_id) |> count() - -``` - - -```{r} -################################# DATA EXPLORATION ############################ -driver <- dbDriver("PostgreSQL") - -con <- dbConnect( - driver, - user='root', - password='root', - dbname='aact_db', - host=host - ) -#Plot histogram of count of snapshots -df3 <- dbGetQuery( - con, - "select nct_id,final_status,count(*) from formatted_data_with_planned_enrollment fdwpe - group by nct_id,final_status ;" - ) -#df3 <- fetch(query3, n = -1) - -ggplot(data=df3, aes(x=count, fill=final_status)) + - geom_histogram(binwidth=1) + - ggtitle("Histogram of snapshots per trial (matched trials)") + - xlab("Snapshots per trial") -ggsave(paste0(image_trial_details,"/HistSnapshots.png")) - -#Plot duration for terminated vs completed -df4 <- dbGetQuery( - con, - " - select - nct_id, - start_date , - primary_completion_date, - overall_status , - primary_completion_date - start_date as duration - from ctgov.studies s - where nct_id in (select distinct nct_id from http.download_status ds) - ;" - ) -#df4 <- fetch(query4, n = -1) - -ggplot(data=df4, aes(x=duration,fill=overall_status)) + - geom_histogram()+ - ggtitle("Histogram of trial durations") + - xlab("duration")+ - facet_wrap(~overall_status) -ggsave(paste0(image_trial_details,"/HistTrialDurations_Faceted.png")) - -df5 <- dbGetQuery( - con, - " - with cte1 as ( - select - nct_id, - start_date , - primary_completion_date, - overall_status , - primary_completion_date - start_date as duration - from ctgov.studies s - where nct_id in (select distinct nct_id from http.download_status ds) - ), cte2 as ( - select nct_id,count(*) as snapshot_count from formatted_data_with_planned_enrollment fdwpe - group by nct_id - ) - select a.nct_id, a.overall_status, a.duration,b.snapshot_count - from cte1 as a - join cte2 as b - on a.nct_id=b.nct_id - ;" - ) -df5$overall_status <- as.factor(df5$overall_status) - -ggplot(data=df5, aes(x=duration,y=snapshot_count,color=overall_status)) + - geom_jitter() + - ggtitle("Comparison of duration, status, and snapshot_count") + - xlab("duration") + - ylab("snapshot count") -ggsave(paste0(image_trial_details,"/SnapshotsVsDurationVsTermination.png")) - -dbDisconnect(con) - -#get number of trials and snapshots in each category -group_trials_by_category <- as.data.frame(aggregate(category_id ~ nct_id, df, max)) -group_trials_by_category <- as.data.frame(group_trials_by_category) - -ggplot(data = group_trials_by_category, aes(x=category_id)) + - geom_bar(binwidth=1,color="black",fill="lightblue") + - scale_x_continuous(breaks=scales::pretty_breaks(n=22)) + - labs( - title="bar chart of trial categories" - ,x="Category ID" - ,y="Count" - ) -ggsave(paste0(image_trial_details,"/CategoryCounts.png")) - - - -summary(df5) - -cor(df5$duration,df5$snapshot_count) -sum(df5$snapshot_count) -``` - - - - -## Fit Results - - -```{r} -################################# ANALYZE ##################################### -print(fit) -``` - -# Parameter Distributions - - -```{r} -#g1 <- group_mcmc_areas("beta",beta_list,fit,1) - - -gx <- c() - -#grab parameters for every category with more than 8 observations -for (i in category_count$category_id[category_count$n >= 0]) { - print(i) - - #Print parameter distributions - gi <- group_mcmc_areas("beta",beta_list,fit,i) #add way to filter groups - ggsave( - paste0(image_parameters_by_groups,"/group_",i,"_",gi$name,".png") - ,plot=gi$plot - ) - gx <- c(gx,gi) - - #Get Quantiles and means for parameters -# table <- xtable(gi$quantiles, -# floating=FALSE -# ,latex.environments = NULL -# ,booktabs = TRUE -# ,zap=getOption("digits") -# ) -# write_lines(table,paste0("./latex_output/DirectEffects/group_",gi$name,".tex")) -} -``` - - - -```{r} -px <- c() - - -for (i in c(1,2,3,9,10,11,12)) { - - #Print parameter distributions - pi <- parameter_mcmc_areas("beta",beta_list,fit,i) #add way to filter groups - ggsave( - paste0(image_parameters_across_groups,"/parameters_",i,"_",pi$name,".png") - ,plot=pi$plot - ) - px <- c(px,pi) - - #Get Quantiles and means for parameters -# table <- xtable(pi$quantiles, -# floating=FALSE -# ,latex.environments = NULL -# ,booktabs = TRUE -# ,zap=getOption("digits") -# ) -# write_lines(table,paste0("./latex_output/DirectEffects/parameters_",i,"_",pi$name,".tex")) - -} -``` - -Note these have 95% outer CI and 80% inner (shaded) - - - - - -```{r} -#get the generic and uspdc parameters -print(px[4]$plot + px[7]$plot) -ggsave(paste0(image_parameters_across_groups,"2+3_generic_and_uspdc.png")) - -#get the parameters associated with duration -px[16]$plot + px[19]$plot -ggsave(paste0(image_parameters_across_groups,"11+12_statusREC_and_statusANR.png")) -``` - - -# Counterfactuals - -```{r} -generated_ib <- gqs( - fit@stanmodel, - data=counterfact_delay, - draws=as.matrix(fit), - seed=11021585 - ) -``` - -```{r} -df_ib_p <- data.frame( - p_prior=as.vector(extract(generated_ib, pars="p_prior")$p_prior) - ,p_predicted = as.vector(extract(generated_ib, pars="p_predicted")$p_predicted) -) - -df_ib_prior <- data.frame( - mu_prior = as.vector(extract(generated_ib, pars="mu_prior")$mu_prior) - ,sigma_prior = as.vector(extract(generated_ib, pars="sigma_prior")$sigma_prior) -) - -#p_prior -ggplot(df_ib_p, aes(x=p_prior)) + - geom_density() + - labs( - title="Implied Prior Distribution P" - ,subtitle="" - ,x="Probability Domain 'p'" - ,y="Probability Density" - ) -ggsave(paste0(image_dist_diff_analysis,"/prior_p.png")) - -#p_posterior -ggplot(df_ib_p, aes(x=p_predicted)) + - geom_density() + - labs( - title="Implied Posterior Distribution P" - ,subtitle="" - ,x="Probability Domain 'p'" - ,y="Probability Density" - ) -ggsave(paste0(image_dist_diff_analysis,"/posterior_p.png")) - -#mu_prior -ggplot(df_ib_prior) + - geom_density(aes(x=mu_prior)) + - labs( - title="Prior - Mu" - ,subtitle="same prior for all Mu values" - ,x="Mu" - ,y="Probability" - ) -ggsave(paste0(image_dist_diff_analysis,"/prior_mu.png")) - -#sigma_posterior -ggplot(df_ib_prior) + - geom_density(aes(x=sigma_prior)) + - labs( - title="Prior - Sigma" - ,subtitle="same prior for all Sigma values" - ,x="Sigma" - ,y="Probability" - ) -ggsave(paste0(image_dist_diff_analysis,"/prior_sigma.png")) -``` - - - -```{r} -check_hmc_diagnostics(fit) -``` - - - - - -### Intervention: Delay close of enrollment - -```{r} -counterfact_predicted_ib <- data.frame( - p_predicted_default = as.vector(extract(generated_ib, pars="p_predicted_default")$p_predicted_default) - ,p_predicted_intervention = as.vector(extract(generated_ib, pars="p_predicted_intervention")$p_predicted_intervention) - ,predicted_difference = as.vector(extract(generated_ib, pars="predicted_difference")$predicted_difference) -) - - -ggplot(counterfact_predicted_ib, aes(x=p_predicted_default)) + - geom_density() + - labs( - title="Predicted Distribution of 'p'" - ,subtitle="Intervention: None" - ,x="Probability Domain 'p'" - ,y="Probability Density" - ) -ggsave(paste0(image_dist_diff_analysis,"p_no_intervention.png")) - -ggplot(counterfact_predicted_ib, aes(x=p_predicted_intervention)) + - geom_density() + - labs( - title="Predicted Distribution of 'p'" - ,subtitle="Intervention: Delay close of enrollment" - ,x="Probability Domain 'p'" - ,y="Probability Density" - ) -ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention.png")) - -ggplot(counterfact_predicted_ib, aes(x=predicted_difference)) + - geom_density() + - labs( - title="Predicted Distribution of differences 'p'" - ,subtitle="Intervention: Delay close of enrollment" - ,x="Difference in 'p' under treatment" - ,y="Probability Density" - ) -ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_1.png")) -``` - - -```{r} -get_category_count <- function(tbl, id) { - result <- tbl$n[tbl$category_id == id] - if(length(result) == 0) 0 else result -} - -category_names <- sapply(1:length(beta_list$groups), - function(i) sprintf("ICD-10 #%d: %s (n=%d)", - i, - beta_list$groups[i], - get_category_count(category_count, i))) - -``` - -```{r} -pddf_ib <- data.frame(extract(generated_ib, pars="predicted_difference")$predicted_difference) |> - pivot_longer(X1:X168) #CHANGE_NOTE: moved from X169 to X168 - -pddf_ib["entry_idx"] <- as.numeric(gsub("\\D","",pddf_ib$name)) -pddf_ib["category"] <- sapply(pddf_ib$entry_idx, function(i) df$category_id[i]) -pddf_ib["category_name"] <- sapply( - pddf_ib$category, - function(i) category_names[i] - ) - - - -ggplot(pddf_ib, aes(x=value,)) + - geom_density(adjust=1/5) + - labs( - title = "Distribution of predicted differences" - ,subtitle = "Intervention: Delay close of enrollment" - ,x = "Difference in probability due to intervention" - ,y = "Probability Density" - ) + - geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") -ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_styled.png")) - -ggplot(pddf_ib, aes(x=value,)) + - geom_density(adjust=1/5) + - facet_wrap( - ~factor( - category_name, - levels=category_names - ) - , labeller = label_wrap_gen(multi_line = TRUE) - , ncol=4) + - labs( - title = "Distribution of predicted differences | By Group" - ,subtitle = "Intervention: Delay close of enrollment" - ,x = "Difference in probability due to intervention" - ,y = "Probability Density" - ) + - geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + - theme(strip.text.x = element_text(size = 8)) -ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_by_group.png")) - -ggplot(pddf_ib, aes(x=value,)) + - geom_histogram(bins=300) + - facet_wrap( - ~factor( - category_name, - levels=category_names - ) - , labeller = label_wrap_gen(multi_line = TRUE) - , ncol=4) + - labs( - title = "Histogram of predicted differences | By Group" - ,subtitle = "Intervention: Delay close of enrollment" - ,x = "Difference in probability due to intervention" - ,y = "Predicted counts" - ) + - geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + - theme(strip.text.x = element_text(size = 8)) -ggsave(paste0(image_dist_diff_analysis,"p_delay_intervention_histdiff_by_group.png")) -``` - - -```{r} -p3 <- ggplot(pddf_ib, aes(x=value,)) + - geom_histogram(bins=500) + - labs( - title = "Distribution of predicted differences" - ,subtitle = "Intervention: Delay close of enrollment" - ,x = "Difference in probability due to intervention" - ,y = "Probability Density" - ,caption = "Vertical marks: 5/10/25/50/75/90/95th percentiles. Dot shows mean." - ) + - geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") - -density_hight <- max(density(pddf_ib$value)$y) - -stats <- list( - p5 = quantile(pddf_ib$value, 0.05), - p10 = quantile(pddf_ib$value, 0.10), - q1 = quantile(pddf_ib$value, 0.25), - med = median(pddf_ib$value), - mean = mean(pddf_ib$value), - q3 = quantile(pddf_ib$value, 0.75), - p90 = quantile(pddf_ib$value, 0.90), - p95 = quantile(pddf_ib$value, 0.95), - max_height = max(ggplot_build(p3)$data[[1]]$count), - y_offset = -max(ggplot_build(p3)$data[[1]]$count) * 0.05, - y_offset_density = -density_hight * 0.15 -) - -p3 + - # Box - geom_segment(data = data.frame( - x = c(stats$q1, stats$q3, stats$med), - xend = c(stats$q1, stats$q3, stats$med), - y = rep(stats$y_offset, 3), - yend = rep(stats$y_offset * 2, 3) - ), aes(x = x, xend = xend, y = y, yend = yend)) + - geom_segment(data = data.frame( - x = rep(stats$q1, 2), - xend = rep(stats$q3, 2), - y = c(stats$y_offset, stats$y_offset * 2), - yend = c(stats$y_offset, stats$y_offset * 2) - ), aes(x = x, xend = xend, y = y, yend = yend)) + - # Inner whiskers (Q1->P10, Q3->P90) - geom_segment(data = data.frame( - x = c(stats$q1, stats$q3), - xend = c(stats$p10, stats$p90), - y = rep(stats$y_offset * 1.5, 2), - yend = rep(stats$y_offset * 1.5, 2) - ), aes(x = x, xend = xend, y = y, yend = yend)) + - # Crossbars at P10/P90 - geom_segment(data = data.frame( - x = c(stats$p10, stats$p90), - xend = c(stats$p10, stats$p90), - y = stats$y_offset * 1.3, - yend = stats$y_offset * 1.7 - ), aes(x = x, xend = xend, y = y, yend = yend)) + - # Outer whiskers (P10->P5, P90->P95) - geom_segment(data = data.frame( - x = c(stats$p10, stats$p90), - xend = c(stats$p5, stats$p95), - y = rep(stats$y_offset * 1.5, 2), - yend = rep(stats$y_offset * 1.5, 2) - ), aes(x = x, xend = xend, y = y, yend = yend)) + - # Crossbars at P5/P95 - geom_segment(data = data.frame( - x = c(stats$p5, stats$p95), - xend = c(stats$p5, stats$p95), - y = stats$y_offset * 1.3, - yend = stats$y_offset * 1.7 - ), aes(x = x, xend = xend, y = y, yend = yend)) + - # Mean dot - geom_point(data = data.frame( - x = stats$mean, - y = stats$y_offset * 1.5 - ), aes(x = x, y = y)) -ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_histdiff_boxplot.png")) - - -p4 <- ggplot(pddf_ib, aes(x = value)) + - geom_density() + - labs( - title = "Distribution of predicted differences" - ,subtitle = "Intervention: Delay close of enrollment" - ,x = "Difference in probability due to intervention" - ,y = "Probability Density" - ,caption = "Vertical marks: 5/10/25/50/75/90/95th percentiles. Dot shows mean." - ) + - geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + - geom_segment(data = data.frame( - x = c(stats$q1, stats$q3, stats$med), - xend = c(stats$q1, stats$q3, stats$med), - y = rep(stats$y_offset_density, 3), - yend = rep(stats$y_offset_density * 2, 3) - ), aes(x = x, xend = xend, y = y, yend = yend)) + - geom_segment(data = data.frame( - x = rep(stats$q1, 2), - xend = rep(stats$q3, 2), - y = c(stats$y_offset_density, stats$y_offset_density * 2), - yend = c(stats$y_offset_density, stats$y_offset_density * 2) - ), aes(x = x, xend = xend, y = y, yend = yend)) + - # Inner whiskers (Q1->P10, Q3->P90) - geom_segment(data = data.frame( - x = c(stats$q1, stats$q3), - xend = c(stats$p10, stats$p90), - y = rep(stats$y_offset_density * 1.5, 2), - yend = rep(stats$y_offset_density * 1.5, 2) - ), aes(x = x, xend = xend, y = y, yend = yend)) + - # Crossbars at P10/P90 - geom_segment(data = data.frame( - x = c(stats$p10, stats$p90), - xend = c(stats$p10, stats$p90), - y = stats$y_offset_density * 1.3, - yend = stats$y_offset_density * 1.7 - ), aes(x = x, xend = xend, y = y, yend = yend)) + - # Outer whiskers (P10->P5, P90->P95) - geom_segment(data = data.frame( - x = c(stats$p10, stats$p90), - xend = c(stats$p5, stats$p95), - y = rep(stats$y_offset_density * 1.5, 2), - yend = rep(stats$y_offset_density * 1.5, 2) - ), aes(x = x, xend = xend, y = y, yend = yend)) + - # Crossbars at P5/P95 - geom_segment(data = data.frame( - x = c(stats$p5, stats$p95), - xend = c(stats$p5, stats$p95), - y = stats$y_offset_density * 1.3, - yend = stats$y_offset_density * 1.7 - ), aes(x = x, xend = xend, y = y, yend = yend)) + - # Mean dot - geom_point(data = data.frame( - x = stats$mean, - y = stats$y_offset_density * 1.5 - ), aes(x = x, y = y)) -ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_boxplot.png")) -``` - -```{r} - ggplot(pddf_ib, aes(x=value)) + - stat_ecdf(geom='step') + - labs( - title = "Cumulative distribution of predicted differences", - subtitle = "Intervention: Delay close of enrollment", - x = "Difference in probability of termination due to intervention", - y = "Cumulative Proportion" - ) - -ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_cumulative_distdiff.png")) -``` - - -Get the % of differences in the spike around zero -```{r} -# get values around and above/below spike -width <- 0.02 -spike_band_centered_zero <- mean( pddf_ib$value >= -width/2 & pddf_ib$value <= width/2) -above_spike_band <- mean( pddf_ib$value >= width/2) -below_spike_band <- mean( pddf_ib$value <= -width/2) - -# get mass above and mass below zero -mass_below_zero <- mean( pddf_ib$value <= 0) -``` -Looking at the spike around zero, we find that `r spike_band_centered_zero*100`% -of the probability mass is contained within the band from -[`r -width*100/2`,`r width*100/2`]. -Additionally, there was `r above_spike_band*100`% of the probability above that --- representing those with a general increase in the probability of termination -- -and `r below_spike_band*100`% of the probability mass below the band --- representing a decrease in the probability of termination. - -On average, if you keep the trial open instead of closing it, -`r mass_below_zero`% of trials will see a decrease in the probability of termination, -but, due to the high increase in probability of termination given termination was increased, -the mean probability of termination increases by `r stats$mean`. - -```{r} -# 5%-iles - -summary(pddf_ib$value) - -# Create your quantiles -quants <- quantile(pddf_ib$value, probs = seq(0,1,0.05), type=4) - -# Convert to a data frame -quant_df <- data.frame( - Percentile = names(quants), - Value = quants -) -kable(quant_df) -``` - -There seems to be some trials that are highly suceptable to this enrollment delay. Specifically, there were some - -```{r} -n = length(counterfact_predicted_ib$p_predicted_intervention) -k = 100 -simulated_terminations_intervention <- mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_intervention))) -simulated_terminations_base <-mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_default))) - -simulated_percentages <- (simulated_terminations_intervention - simulated_terminations_base)/k -``` - -The simulation above shows that this results in a percentage-point increase of about -`r simulated_percentages * 100`. - - - - -# Diagnostics - -```{r} -#| eval: true -#trace plots -image_diagnostics <- paste0(image_root,"/diagnostics") - -plot(fit, pars=c("mu"), plotfun="trace") -ggsave(paste0(image_diagnostics,"/trace_plot_mu.png")) - - -for (i in 1:3) { - print( - mcmc_rank_overlay( - fit, - pars=c( - paste0("mu[",4*i-3,"]"), - paste0("mu[",4*i-2,"]"), - paste0("mu[",4*i-1,"]"), - paste0("mu[",4*i,"]") - ), - n_bins=100 - )+ legend_move("top") + - scale_colour_ghibli_d("KikiMedium") - ) - mu_range <- paste0(4*i-3,"-",4*i) - filename <- paste0(image_diagnostics,"/trace_rank_plot_mu_",mu_range,".png") - ggsave(filename) -} -``` - -```{r} -#| eval: true -plot(fit, pars=c("sigma"), plotfun="trace") -ggsave(paste0(image_diagnostics,"/traceplot_sigma.png")) - -for (i in 1:3) { - print( - mcmc_rank_overlay( - fit, - pars=c( - paste0("sigma[",4*i-3,"]"), - paste0("sigma[",4*i-2,"]"), - paste0("sigma[",4*i-1,"]"), - paste0("sigma[",4*i,"]") - ), - n_bins=100 - )+ legend_move("top") + - scale_colour_ghibli_d("KikiMedium") - ) - sigma_range <- paste0(4*i-3,"-",4*i) - filename <- paste0(image_diagnostics,"/trace_rank_plot_sigma_",sigma_range,".png") - ggsave(filename) -} -``` - -```{r} -#| eval: true -#other diagnostics -logpost <- log_posterior(fit) -nuts_prmts <- nuts_params(fit) -posterior <- as.array(fit) - -``` - -```{r} -#| eval: true -color_scheme_set("darkgray") -div_style <- parcoord_style_np(div_color = "green", div_size = 0.05, div_alpha = 0.4) -mcmc_parcoord(posterior, regex_pars = "mu", np=nuts_prmts, np_style = div_style, alpha = 0.05) -ggsave(paste0(image_diagnostics,"/parcoord_mu.png")) -``` - -```{r} -#| eval: true -for (i in 1:3) { - mus = sapply(3:0, function(j) paste0("mu[",4*i-j ,"]")) - print( - mcmc_pairs( - posterior, - np = nuts_prmts, - pars=c( - mus, - "lp__" - ), - off_diag_args = list(size = 0.75) - ) - ) - mu_range <- paste0(4*i-3,"-",4*i) - filename <- paste0(image_diagnostics,"/correlation_plot_mu_",mu_range,".png") - ggsave(filename) -} - - - -``` - -```{r} -#| eval: true -mcmc_parcoord(posterior,regex_pars = "sigma", np=nuts_prmts, alpha=0.05) -ggsave(paste0(image_diagnostics,"/parcoord_sigma.png")) -``` - -```{r} -#| eval: true - -for (i in 1:3) { - params = sapply(3:0, function(j) paste0("sigma[",4*i-j ,"]")) - print( - mcmc_pairs( - posterior, - np = nuts_prmts, - pars=c( - params, - "lp__" - ), - off_diag_args = list(size = 0.75) - ) - ) - sigma_range <- paste0(4*i-3,"-",4*i) - filename <- paste0(image_diagnostics,"/correlation_plot_sigma_",sigma_range,".png") - ggsave(filename) -} -``` - - -```{r} -#| eval: true -for (k in 1:22) { -for (i in 1:3) { - params = sapply(3:0, function(j) paste0("beta[",k,",",4*i-j ,"]")) - print( - mcmc_pairs( - posterior, - np = nuts_prmts, - pars=c( - params, - "lp__" - ), - off_diag_args = list(size = 0.75) - ) - ) - - beta_range <- paste0("k_",k,"_i_",4*i-3,"-",4*i) - filename <- paste0(image_diagnostics,"/correlation_plot_beta_",beta_range,".png") - ggsave(filename) -}} -``` - - - - - - - \ No newline at end of file diff --git a/r-analysis/EffectsOfEnrollmentDelay_rebased.html b/r-analysis/EffectsOfEnrollmentDelay_rebased.html deleted file mode 100644 index ff8ee80..0000000 --- a/r-analysis/EffectsOfEnrollmentDelay_rebased.html +++ /dev/null @@ -1,3086 +0,0 @@ - - - - - - - - - - -The Effects of Recruitment status on completion of clinical trials - - - - - - - - - - - - - - - - - - - -
- -
- -
-
-

The Effects of Recruitment status on completion of clinical trials

-
- - - -
- -
-
Author
-
-

Will King

-
-
- - - -
- - - -
- - -
-

Setup

-
-
library(knitr)
-library(bayesplot)
-
-
This is bayesplot version 1.11.1
-
-
-
- Online documentation and vignettes at mc-stan.org/bayesplot
-
-
-
- bayesplot theme set to bayesplot::theme_default()
-
-
-
   * Does _not_ affect other ggplot2 plots
-
-
-
   * See ?bayesplot_theme_set for details on theme setting
-
-
available_mcmc(pattern = "_nuts_")
-
-
bayesplot MCMC module:
-(matching pattern '_nuts_') 
-  mcmc_nuts_acceptance
-  mcmc_nuts_divergence
-  mcmc_nuts_energy
-  mcmc_nuts_stepsize
-  mcmc_nuts_treedepth
-
-
library(ggplot2)
-library(patchwork)
-library(tidyverse)
-
-
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
-✔ dplyr     1.1.4     ✔ readr     2.1.5
-✔ forcats   1.0.0     ✔ stringr   1.5.1
-✔ lubridate 1.9.4     ✔ tibble    3.2.1
-✔ purrr     1.0.2     ✔ tidyr     1.3.1
-
-
-
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
-✖ dplyr::filter() masks stats::filter()
-✖ dplyr::lag()    masks stats::lag()
-ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
-
-
library(rstan)
-
-
Loading required package: StanHeaders
-
-rstan version 2.32.6 (Stan version 2.32.2)
-
-For execution on a local, multicore CPU with excess RAM we recommend calling
-options(mc.cores = parallel::detectCores()).
-To avoid recompilation of unchanged Stan programs, we recommend calling
-rstan_options(auto_write = TRUE)
-For within-chain threading using `reduce_sum()` or `map_rect()` Stan functions,
-change `threads_per_chain` option:
-rstan_options(threads_per_chain = 1)
-
-
-Attaching package: 'rstan'
-
-The following object is masked from 'package:tidyr':
-
-    extract
-
-
library(tidyr)
-library(ghibli)
-library(xtable)
-#Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
-
-#save unchanged models instead of recompiling
-rstan_options(auto_write = TRUE)
-#allow for multithreaded sampling
-options(mc.cores = parallel::detectCores())
-
-#test installation, shouldn't get any errors
-#example(stan_model, package = "rstan", run.dontrun = TRUE)
-
-
-image_root <- "./output/withdiff/EffectsOfEnrollmentDelay"
-image_dist_diff_analysis <- paste0(image_root,"/dist_diff_analysis")
-image_trial_details <-paste0(image_root,"/trials_details")
-image_parameters_by_groups <-paste0(image_root,"/betas/parameters_by_group")
-image_parameters_across_groups <-paste0(image_root,"/betas/parameter_across_groups")
-
-
-
################ Pull data from database ######################
-library(RPostgreSQL)
-
-
Loading required package: DBI
-
-
host <- 'aact_db-restored-2025-01-07'
-
-driver <- dbDriver("PostgreSQL")
-
-get_data <- function(driver) {
-
-con <- dbConnect(
-    driver,
-    user='root',
-    password='root',
-    dbname='aact_db',
-    host=host
-    )
-on.exit(dbDisconnect(con))
-
-query <- dbSendQuery(
-    con,
-#    "select * from formatted_data_with_planned_enrollment;"
-"
-select 
-    fdqpe.nct_id
-    --,fdqpe.start_date
-    --,fdqpe.current_enrollment
-    --,fdqpe.enrollment_category
-    ,fdqpe.current_status 
-    ,fdqpe.earliest_date_observed 
-    ,fdqpe.elapsed_duration
-    ,fdqpe.n_brands as identical_brands
-    ,ntbtu.brand_name_counts 
-    ,fdqpe.category_id
-    ,fdqpe.final_status
-    ,fdqpe.h_sdi_val
-    --,fdqpe.h_sdi_u95
-    --,fdqpe.h_sdi_l95
-    ,fdqpe.hm_sdi_val
-    --,fdqpe.hm_sdi_u95
-    --,fdqpe.hm_sdi_l95
-    ,fdqpe.m_sdi_val
-    --,fdqpe.m_sdi_u95
-    --,fdqpe.m_sdi_l95
-    ,fdqpe.lm_sdi_val
-    --,fdqpe.lm_sdi_u95
-    --,fdqpe.lm_sdi_l95
-    ,fdqpe.l_sdi_val
-    --,fdqpe.l_sdi_u95
-    --,fdqpe.l_sdi_l95
-from formatted_data_with_planned_enrollment fdqpe
-    join \"Formularies\".nct_to_brand_counts_through_uspdc ntbtu
-        on fdqpe.nct_id = ntbtu.nct_id 
-order by fdqpe.nct_id, fdqpe.earliest_date_observed 
-;
-"
-    )
-df <- fetch(query, n = -1)
-df <- na.omit(df)
-
-query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;")
-n_categories <- fetch(query2, n = -1)
-
-return(list(data=df,ncat=n_categories))
-}
-
-
-get_counterfact_base <- function(driver) {
-
-con <- dbConnect(
-    driver,
-    user='root',
-    password='root',
-    dbname='aact_db',
-    host=host
-    )
-on.exit(dbDisconnect(con))
-
-query <- dbSendQuery(
-    con,
-    "
-    with cte as (
-    --get last recruiting state
-    select fd.nct_id, max(fd.earliest_date_observed),min(fd2.earliest_date_observed) as tmstmp
-    from formatted_data fd 
-        join formatted_data fd2 
-        on fd.nct_id=fd2.nct_id and fd.earliest_date_observed < fd2.earliest_date_observed 
-    where fd.current_status = 'Recruiting'
-        and fd2.current_status = 'Active, not recruiting'
-    group by fd.nct_id 
-    )
-    select 
-        fdqpe.nct_id
-        --,fdqpe.start_date
-        --,fdqpe.current_enrollment
-        --,fdqpe.enrollment_category
-        ,fdqpe.current_status 
-        ,fdqpe.earliest_date_observed 
-        ,fdqpe.elapsed_duration
-        ,fdqpe.n_brands as identical_brands
-        ,ntbtu.brand_name_counts 
-        ,fdqpe.category_id
-        ,fdqpe.final_status
-        ,fdqpe.h_sdi_val
-        --,fdqpe.h_sdi_u95
-        --,fdqpe.h_sdi_l95
-        ,fdqpe.hm_sdi_val
-        --,fdqpe.hm_sdi_u95
-        --,fdqpe.hm_sdi_l95
-        ,fdqpe.m_sdi_val
-        --,fdqpe.m_sdi_u95
-        --,fdqpe.m_sdi_l95
-        ,fdqpe.lm_sdi_val
-        --,fdqpe.lm_sdi_u95
-        --,fdqpe.lm_sdi_l95
-        ,fdqpe.l_sdi_val
-        --,fdqpe.l_sdi_u95
-        --,fdqpe.l_sdi_l95
-    from formatted_data_with_planned_enrollment fdqpe
-        join \"Formularies\".nct_to_brand_counts_through_uspdc ntbtu
-            on fdqpe.nct_id = ntbtu.nct_id 
-        join cte 
-            on fdqpe.nct_id = cte.nct_id 
-                and fdqpe.earliest_date_observed = cte.tmstmp
-    order by fdqpe.nct_id, fdqpe.earliest_date_observed 
-    ;
-    "
-    )
-df <- fetch(query, n = -1)
-df <- na.omit(df)
-
-query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;")
-n_categories <- fetch(query2, n = -1)
-
-return(list(data=df,ncat=n_categories))
-}
-
-
-d <- get_data(driver)
-df <- d$data
-n_categories <- d$ncat
-
-cf <- get_counterfact_base(driver)
-df_counterfact_base <- cf$data
-
-
-
-################ Format Data ###########################
-
-data_formatter <- function(df) {
-categories <- df["category_id"]
-
-x <- df["elapsed_duration"]
-x["identical_brands"] <- asinh(df$identical_brands)
-x["brand_name_counts"] <- asinh(df$brand_name_count)
-x["h_sdi_val"] <- asinh(df$h_sdi_val)
-x["hm_sdi_val"] <- asinh(df$hm_sdi_val)
-x["m_sdi_val"] <- asinh(df$m_sdi_val)
-x["lm_sdi_val"] <- asinh(df$lm_sdi_val)
-x["l_sdi_val"] <- asinh(df$l_sdi_val)
-
-
-#Setup fixed effects
-
-x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0)
-x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0)
-x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0) 
-x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0)
-
-
-y <- ifelse(df["final_status"]=="Terminated",1,0)
-
-#get category list
-
-
-return(list(x=x,y=y))
-}
-
-train <- data_formatter(df)
-counterfact_base <- data_formatter(df_counterfact_base)
-
-categories <- df$category_id
-
-x <- train$x
-y <- train$y
-
-x_cf_base <- counterfact_base$x
-y_cf_base <- counterfact_base$y
-cf_categories <- df_counterfact_base$category_id
-
-
-
-

Fit Model

-
-
################################# FIT MODEL #########################################
-inherited_cols <- c(
-    "elapsed_duration"
-    ,"identical_brands"
-    ,"brand_name_counts"
-    ,"h_sdi_val"
-    ,"hm_sdi_val"
-    ,"m_sdi_val"
-    ,"lm_sdi_val"
-    ,"l_sdi_val"
-    ,"status_NYR"
-    ,"status_EBI"
-    ,"status_Rec"
-    ,"status_ANR"
-)
-
-
-
beta_list <- list(
-        groups = c(
-        `1`="Infections & Parasites",
-        `2`="Neoplasms",
-        `3`="Blood & Immune system",
-        `4`="Endocrine, Nutritional, and Metabolic",
-        `5`="Mental & Behavioral",
-        `6`="Nervous System",
-        `7`="Eye and Adnexa",
-        `8`="Ear and Mastoid",
-        `9`="Circulatory",
-        `10`="Respiratory",
-        `11`="Digestive",
-        `12`="Skin & Subcutaneaous tissue",
-        `13`="Musculoskeletal",
-        `14`="Genitourinary",
-        `15`="Pregancy, Childbirth, & Puerperium",
-        `16`="Perinatal Period",
-        `17`="Congential",
-        `18`="Symptoms, Signs etc.",
-        `19`="Injury etc.",
-        `20`="External Causes",
-        `21`="Contact with Healthcare",
-        `22`="Special Purposes"
-    ),
-    parameters = c(
-        `1`="Elapsed Duration",
-        # brands
-        `2`="asinh(Generic Brands)",
-        `3`="asinh(Competitors USPDC)",
-        # population
-        `4`="asinh(High SDI)",
-        `5`="asinh(High-Medium SDI)",
-        `6`="asinh(Medium SDI)",
-        `7`="asinh(Low-Medium SDI)",
-        `8`="asinh(Low SDI)",
-        #Status
-        `9`="status_NYR",
-        `10`="status_EBI",
-        `11`="status_Rec",
-        `12`="status_ANR"
-    )
-)
-
-get_parameters <- function(stem,class_list) {
-    #get categories and lengths
-    named <- names(class_list)
-    lengths <- sapply(named, (function (x) length(class_list[[x]])))
-    
-    #describe the grid needed
-    iter_list <- sapply(named, (function (x) 1:lengths[x]))
-    
-    #generate the list of parameters
-    pardf <- generate_parameter_df(stem, iter_list)
-    
-    #add columns with appropriate human-readable names
-    for (name in named) {
-        pardf[paste(name,"_hr",sep="")] <- as.factor(
-            sapply(pardf[name], (function (i) class_list[[name]][i]))
-        )
-    }
-     
-    return(pardf)   
-}
-
-generate_parameter_df <- function(stem, iter_list) {
-    grid <- expand.grid(iter_list)
-    grid["param_name"] <- grid %>% unite(x,colnames(grid),sep=",")
-    grid["param_name"] <- paste(stem,"[",grid$param_name,"]",sep="")
-    return(grid)
-}
-
-group_mcmc_areas <- function(
-        stem,# = "beta"
-        class_list,# = beta_list
-        stanfit,# = fit
-        group_id,# = 2
-        rename=TRUE,
-        filter=NULL
-        ) {
-    
-    #get all parameter names
-    params <- get_parameters(stem,class_list)
-    
-    #filter down to parameters of interest
-    params <- filter(params,groups == group_id)
-    #Get dataframe with only the rows of interest
-    filtdata <- as.data.frame(stanfit)[params$param_name]
-    #rename columns
-    if (rename) dimnames(filtdata)[[2]] <- params$parameters_hr
-    #get group name for title
-    group_name <- class_list$groups[group_id]
-    #create area plot with appropriate title
-    p <- mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
-        ggtitle(paste("Parameter distributions for ICD-10 class:",group_name)) +
-        geom_vline(xintercept=seq(-2,2,0.5),color="grey",alpha=0.750)  
-    
-    d <- pivot_longer(filtdata, everything()) |> 
-        group_by(name) |> 
-        summarize(
-            mean=mean(value)
-            ,q025 = quantile(value,probs = 0.025)
-            ,q975 = quantile(value,probs = 0.975)
-            ,q05 = quantile(value,probs = 0.05)
-            ,q95 = quantile(value,probs = 0.95)
-            )
-    return(list(plot=p,quantiles=d,name=group_name))
-}
-
-parameter_mcmc_areas <- function(
-        stem,# = "beta"
-        class_list,# = beta_list
-        stanfit,# = fit
-        parameter_id,# = 2
-        rename=TRUE
-        ) {
-    #get all parameter names
-    params <- get_parameters(stem,class_list)
-    #filter down to parameters of interest
-    params <- filter(params,parameters == parameter_id)
-    #Get dataframe with only the rows of interest
-    filtdata <- as.data.frame(stanfit)[params$param_name]
-    #rename columns
-    if (rename) dimnames(filtdata)[[2]] <- params$groups_hr
-    #get group name for title
-    parameter_name <- class_list$parameters[parameter_id]
-    #create area plot with appropriate title
-    p <- mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
-        ggtitle(parameter_name,"Parameter Distribution") +
-        geom_vline(xintercept=seq(-2,2,0.5),color="grey",alpha=0.750)  
-    
-    d <- pivot_longer(filtdata, everything()) |> 
-        group_by(name) |> 
-        summarize(
-            mean=mean(value)
-            ,q025 = quantile(value,probs = 0.025)
-            ,q975 = quantile(value,probs = 0.975)
-            ,q05 = quantile(value,probs = 0.05)
-            ,q95 = quantile(value,probs = 0.95)
-            )
-    return(list(plot=p,quantiles=d,name=parameter_name))
-}
-
-

Plan: select all snapshots that are the first to have closed enrollment (Rec -> ANR when comparing across snapshots), and then

-
-
#delay intervention
-intervention_enrollment <- x_cf_base[c(inherited_cols)]
-#TOFIX: ^^^ This ordering of columns is
-intervention_enrollment["status_ANR"] <- 0
-intervention_enrollment["status_Rec"] <- 1
-
-
-
counterfact_delay <- list(
-    D = ncol(x),#
-    N = nrow(x),
-    L = n_categories$count,
-    y = as.vector(y),
-    ll = as.vector(categories),
-    x = as.matrix(x),
-    mu_mean = 0,
-    mu_stdev = 0.05,
-    sigma_shape = 4,
-    sigma_rate = 20,
-    Nx = nrow(x_cf_base),
-    llx = as.vector(cf_categories),
-    counterfact_x_tilde = as.matrix(intervention_enrollment),
-    counterfact_x = as.matrix(x_cf_base),
-    status_indexes = c(11,12) #subtract anr from recruiting to get movement from anr to recruiting
-)
-
-
-
fit <- stan(
-    file='Hierarchal_Logistic.stan', 
-    data = counterfact_delay,
-    chains = 4,
-    iter = 5000,
-    seed = 11021585
-    )
-
-
Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
-Running the chains for more iterations may help. See
-https://mc-stan.org/misc/warnings.html#bulk-ess
-
-
-
-

Calculate relative difference in parameters between recruiting and active not recruiting states

-
-
filt_data <- as.data.frame(extract(fit,pars="status_diff"))
-dimnames(filt_data)[[2]] <- beta_list$groups
-mcmc_areas(filt_data,prob = 0.8, prob_outer = 0.95) +
-  ggtitle("Relative fixed effects across groups", subtitle ="moving from `Active, not recruiting` to `Recruiting`") +
-        geom_vline(xintercept=seq(-2,2,0.5),color="grey",alpha=0.750) 
-
-
Warning: Removed 3 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
-
-

-
-
-
-
-

I’ve got an issue here, because this should be a movement from ANR -> recruiting but that is suggesting that

-
-
-

Explore data

-
-
#get number of trials and snapshots in each category
-group_trials_by_category <- as.data.frame(aggregate(category_id ~ nct_id, df, max))
-group_trials_by_category <- as.data.frame(group_trials_by_category)
-
-category_count <- group_trials_by_category |> group_by(category_id) |> count()
-
-
-
################################# DATA EXPLORATION ############################
-driver <- dbDriver("PostgreSQL")
-
-con <- dbConnect(
-    driver,
-    user='root',
-    password='root',
-    dbname='aact_db',
-    host=host
-    )
-#Plot histogram of count of snapshots
-df3 <- dbGetQuery(
-    con,
-    "select nct_id,final_status,count(*) from formatted_data_with_planned_enrollment fdwpe 
-    group by nct_id,final_status ;"
-    )
-#df3 <- fetch(query3, n = -1)
-
-ggplot(data=df3, aes(x=count, fill=final_status)) + 
-    geom_histogram(binwidth=1) +
-    ggtitle("Histogram of snapshots per trial (matched trials)") +
-    xlab("Snapshots per trial")
-
-
-
-

-
-
-
-
ggsave(paste0(image_trial_details,"/HistSnapshots.png"))
-
-
Saving 7 x 5 in image
-
-
#Plot duration for terminated vs completed
-df4 <- dbGetQuery(
-    con,
-    "
-    select 
-        nct_id, 
-        start_date , 
-        primary_completion_date, 
-        overall_status ,
-        primary_completion_date - start_date as duration
-    from ctgov.studies s 
-    where nct_id in (select distinct nct_id from http.download_status ds)
-    ;"
-    )
-#df4 <- fetch(query4, n = -1)
-
-ggplot(data=df4, aes(x=duration,fill=overall_status)) +
-    geom_histogram()+
-    ggtitle("Histogram of trial durations") +
-    xlab("duration")+
-    facet_wrap(~overall_status)
-
-
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
-
-
-
-
-

-
-
-
-
ggsave(paste0(image_trial_details,"/HistTrialDurations_Faceted.png"))
-
-
Saving 7 x 5 in image
-`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
-
-
df5 <- dbGetQuery(
-    con,
-    "
-    with cte1 as (
-    select 
-        nct_id, 
-        start_date , 
-        primary_completion_date, 
-        overall_status ,
-        primary_completion_date - start_date as duration
-    from ctgov.studies s 
-    where nct_id in (select distinct nct_id from http.download_status ds)
-    ), cte2 as (
-    select nct_id,count(*) as snapshot_count from formatted_data_with_planned_enrollment fdwpe
-    group by nct_id
-    )
-    select a.nct_id, a.overall_status, a.duration,b.snapshot_count
-    from cte1 as a
-        join cte2 as b  
-            on a.nct_id=b.nct_id
-    ;"
-    )
-df5$overall_status <- as.factor(df5$overall_status)
-
-ggplot(data=df5, aes(x=duration,y=snapshot_count,color=overall_status)) +
-    geom_jitter() +
-    ggtitle("Comparison of duration, status, and snapshot_count") +
-    xlab("duration") +
-    ylab("snapshot count") 
-
-
-
-

-
-
-
-
ggsave(paste0(image_trial_details,"/SnapshotsVsDurationVsTermination.png"))
-
-
Saving 7 x 5 in image
-
-
dbDisconnect(con)
-
-
[1] TRUE
-
-
#get number of trials and snapshots in each category
-group_trials_by_category <- as.data.frame(aggregate(category_id ~ nct_id, df, max))
-group_trials_by_category <- as.data.frame(group_trials_by_category)
-
-ggplot(data = group_trials_by_category, aes(x=category_id)) +
-    geom_bar(binwidth=1,color="black",fill="lightblue") +
-    scale_x_continuous(breaks=scales::pretty_breaks(n=22)) + 
-    labs(
-        title="bar chart of trial categories"
-        ,x="Category ID"
-        ,y="Count"
-    )
-
-
Warning in geom_bar(binwidth = 1, color = "black", fill = "lightblue"):
-Ignoring unknown parameters: `binwidth`
-
-
-
-
-

-
-
-
-
ggsave(paste0(image_trial_details,"/CategoryCounts.png"))
-
-
Saving 7 x 5 in image
-
-
summary(df5)
-
-
    nct_id             overall_status    duration      snapshot_count  
- Length:162         Completed :134    Min.   :  61.0   Min.   : 1.000  
- Class :character   Terminated: 28    1st Qu.: 618.5   1st Qu.: 4.000  
- Mode  :character                     Median :1022.5   Median : 6.000  
-                                      Mean   :1202.4   Mean   : 8.315  
-                                      3rd Qu.:1637.0   3rd Qu.:11.000  
-                                      Max.   :3332.0   Max.   :48.000  
-
-
cor(df5$duration,df5$snapshot_count)
-
-
[1] 0.3356006
-
-
sum(df5$snapshot_count)
-
-
[1] 1347
-
-
-
-
-

Fit Results

-
-
################################# ANALYZE #####################################
-#print(fit)
-
-
-
-
-

Parameter Distributions

-
-
#g1 <- group_mcmc_areas("beta",beta_list,fit,1)
-
-
-gx <- c()
-
-#grab parameters for every category with more than 8 observations
-for (i in category_count$category_id[category_count$n >= 0]) {
-    print(i)
-    
-    #Print parameter distributions
-    gi <- group_mcmc_areas("beta",beta_list,fit,i) #add way to filter groups
-    ggsave(
-        paste0(image_parameters_by_groups,"/group_",i,"_",gi$name,".png")
-        ,plot=gi$plot
-        )
-    gx <- c(gx,gi)
-
-    #Get Quantiles and means for parameters
-#    table <- xtable(gi$quantiles,
-#      floating=FALSE
-#      ,latex.environments = NULL
-#      ,booktabs = TRUE
-#      ,zap=getOption("digits")
-#      )
-#    write_lines(table,paste0("./latex_output/DirectEffects/group_",gi$name,".tex"))
-}
-
-
[1] 1
-
-
-
Saving 7 x 5 in image
-
-
-
Warning: Removed 3 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
[1] 2
-
-
-
Saving 7 x 5 in image
-
-
-
Warning: Removed 2 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
[1] 3
-
-
-
Saving 7 x 5 in image
-
-
-
Warning: Removed 2 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
[1] 4
-
-
-
Saving 7 x 5 in image
-
-
-
Warning: Removed 2 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
[1] 5
-
-
-
Saving 7 x 5 in image
-
-
-
Warning: Removed 1 row containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
[1] 6
-
-
-
Saving 7 x 5 in image
-
-
-
Warning: Removed 2 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
[1] 7
-
-
-
Saving 7 x 5 in image
-
-
-
Warning: Removed 3 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
[1] 9
-
-
-
Saving 7 x 5 in image
-
-
-
Warning: Removed 3 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
[1] 10
-
-
-
Saving 7 x 5 in image
-
-
-
Warning: Removed 3 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
[1] 11
-
-
-
Saving 7 x 5 in image
-
-
-
Warning: Removed 2 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
[1] 12
-
-
-
Saving 7 x 5 in image
-
-
-
Warning: Removed 1 row containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
[1] 13
-
-
-
Saving 7 x 5 in image
-
-
-
[1] 14
-
-
-
Saving 7 x 5 in image
-
-
-
Warning: Removed 3 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
[1] 17
-
-
-
Saving 7 x 5 in image
-
-
-
Warning: Removed 2 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
[1] 18
-
-
-
Saving 7 x 5 in image
-
-
-
Warning: Removed 2 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
-
px <- c()
-
-
-for (i in c(1,2,3,9,10,11,12)) {
-    
-    #Print parameter distributions
-    pi <- parameter_mcmc_areas("beta",beta_list,fit,i) #add way to filter groups
-    ggsave(
-        paste0(image_parameters_across_groups,"/parameters_",i,"_",pi$name,".png")
-        ,plot=pi$plot
-        )
-    px <- c(px,pi)
-
-    #Get Quantiles and means for parameters
-#    table <- xtable(pi$quantiles,
-#      floating=FALSE
-#      ,latex.environments = NULL
-#      ,booktabs = TRUE
-#      ,zap=getOption("digits")
-#      )
-#    write_lines(table,paste0("./latex_output/DirectEffects/parameters_",i,"_",pi$name,".tex"))
-    
-}
-
-
Saving 7 x 5 in image
-
-
-
Warning: Removed 6 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
Saving 7 x 5 in image
-Saving 7 x 5 in image
-Saving 7 x 5 in image
-
-
-
Warning: Removed 3 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
Saving 7 x 5 in image
-
-
-
Warning: Removed 6 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
Saving 7 x 5 in image
-
-
-
Warning: Removed 5 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
Saving 7 x 5 in image
-
-
-
Warning: Removed 5 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-

Note these have 95% outer CI and 80% inner (shaded)

-
-
#get the generic and uspdc parameters
-print(px[4]$plot + px[7]$plot)
-
-
-
-

-
-
-
-
ggsave(paste0(image_parameters_across_groups,"2+3_generic_and_uspdc.png"))
-
-
Saving 7 x 5 in image
-
-
#get the parameters associated with duration
-px[16]$plot + px[19]$plot
-
-
Warning: Removed 5 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-Removed 5 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
-
-

-
-
-
-
ggsave(paste0(image_parameters_across_groups,"11+12_statusREC_and_statusANR.png"))
-
-
Saving 7 x 5 in image
-
-
-
Warning: Removed 5 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-Removed 5 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
-
-

Counterfactuals

-
-
generated_ib <- gqs(
-    fit@stanmodel,
-    data=counterfact_delay,
-    draws=as.matrix(fit),
-    seed=11021585
-    )
-
-
-

Get priors

-
-
df_ib_p <- data.frame(
-    p_prior=as.vector(extract(generated_ib, pars="p_prior")$p_prior)
-    ,p_predicted = as.vector(extract(generated_ib, pars="p_predicted")$p_predicted)
-)
-
-df_ib_prior <- data.frame(
-    mu_prior = as.vector(extract(generated_ib, pars="mu_prior")$mu_prior)
-    ,sigma_prior = as.vector(extract(generated_ib, pars="sigma_prior")$sigma_prior)
-)
-
-#p_prior
-ggplot(df_ib_p, aes(x=p_prior)) +
-    geom_density() + 
-    labs(
-        title="implied prior distribution p"
-        ,subtitle=""
-        ,x="probability domain 'p'"
-        ,y="probability density"
-    )
-
-
-
-

-
-
-
-
ggsave(paste0(image_dist_diff_analysis,"/prior_p.png"))
-
-
Saving 7 x 5 in image
-
-
#p_posterior
-ggplot(df_ib_p, aes(x=p_predicted)) +
-    geom_density() + 
-    labs(
-        title="implied posterior distribution p"
-        ,subtitle=""
-        ,x="probability domain 'p'"
-        ,y="Probability Density"
-    )
-
-
-
-

-
-
-
-
ggsave(paste0(image_dist_diff_analysis,"/posterior_p.png"))
-
-
Saving 7 x 5 in image
-
-
#mu_prior
-ggplot(df_ib_prior) +
-    geom_density(aes(x=mu_prior)) + 
-    labs(
-        title="Prior - Mu"
-        ,subtitle="same prior for all Mu values"
-        ,x="Mu"
-        ,y="Probability"
-    )
-
-
-
-

-
-
-
-
ggsave(paste0(image_dist_diff_analysis,"/prior_mu.png"))
-
-
Saving 7 x 5 in image
-
-
#sigma_posterior
-ggplot(df_ib_prior) +
-    geom_density(aes(x=sigma_prior)) + 
-    labs(
-        title="Prior - Sigma"
-        ,subtitle="same prior for all Sigma values"
-        ,x="Sigma"
-        ,y="Probability"
-    )
-
-
-
-

-
-
-
-
ggsave(paste0(image_dist_diff_analysis,"/prior_sigma.png"))
-
-
Saving 7 x 5 in image
-
-
-
-
check_hmc_diagnostics(fit)
-
-

-Divergences:
-
-
-
0 of 10000 iterations ended with a divergence.
-
-
-

-Tree depth:
-
-
-
0 of 10000 iterations saturated the maximum tree depth of 10.
-
-
-

-Energy:
-
-
-
E-BFMI indicated no pathological behavior.
-
-
-
-
-

Intervention: Delay close of enrollment

-
-
counterfact_predicted_ib <- data.frame(
-    p_predicted_default = as.vector(extract(generated_ib, pars="p_predicted_default")$p_predicted_default)
-    ,p_predicted_intervention = as.vector(extract(generated_ib, pars="p_predicted_intervention")$p_predicted_intervention)
-    ,predicted_difference = as.vector(extract(generated_ib, pars="predicted_difference")$predicted_difference)
-)
-
-
-ggplot(counterfact_predicted_ib, aes(x=p_predicted_default)) +
-    geom_density() + 
-    labs(
-        title="Predicted Distribution of 'p'"
-        ,subtitle="Intervention: None"
-        ,x="Probability Domain 'p'"
-        ,y="Probability Density"
-    )
-
-
-
-

-
-
-
-
ggsave(paste0(image_dist_diff_analysis,"p_no_intervention.png"))
-
-
Saving 7 x 5 in image
-
-
ggplot(counterfact_predicted_ib, aes(x=p_predicted_intervention)) +
-    geom_density() + 
-    labs(
-        title="Predicted Distribution of 'p'"
-        ,subtitle="Intervention: Delay close of enrollment"
-        ,x="Probability Domain 'p'"
-        ,y="Probability Density"
-    )
-
-
-
-

-
-
-
-
ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention.png"))
-
-
Saving 7 x 5 in image
-
-
ggplot(counterfact_predicted_ib, aes(x=predicted_difference)) +
-    geom_density() + 
-    labs(
-        title="Predicted Distribution of differences 'p'"
-        ,subtitle="Intervention: Delay close of enrollment"
-        ,x="Difference in 'p' under treatment"
-        ,y="Probability Density"
-    )
-
-
-
-

-
-
-
-
ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_1.png"))
-
-
Saving 7 x 5 in image
-
-
-
-
get_category_count <- function(tbl, id) {
-  result <- tbl$n[tbl$category_id == id]
-  if(length(result) == 0) 0 else result
-}
-
-category_names <- sapply(1:length(beta_list$groups), 
-    function(i) sprintf("ICD-10 #%d: %s (n=%d)", 
-                       i, 
-                       beta_list$groups[i],
-                       get_category_count(category_count, i)))
-
-
-
pddf_ib <- data.frame(extract(generated_ib, pars="predicted_difference")$predicted_difference) |>
-    pivot_longer(X1:X168) #CHANGE_NOTE: moved from X169 to X168
-
-pddf_ib["entry_idx"] <- as.numeric(gsub("\\D","",pddf_ib$name))
-pddf_ib["category"] <-  sapply(pddf_ib$entry_idx, function(i) df$category_id[i])
-pddf_ib["category_name"] <- sapply(
-    pddf_ib$category, 
-    function(i) category_names[i]
-    )
-  
-
-
-ggplot(pddf_ib, aes(x=value,)) +
-    geom_density(adjust=1/5) +
-    labs(
-        title = "Distribution of predicted differences"
-        ,subtitle = "Intervention: Delay close of enrollment"
-        ,x = "Difference in probability due to intervention"
-        ,y = "Probability Density"
-    ) + 
-    geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") 
-
-
-
-

-
-
-
-
ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_styled.png"))
-
-
Saving 7 x 5 in image
-
-
ggplot(pddf_ib, aes(x=value,)) +
-    geom_density(adjust=1/5) +
-    facet_wrap(
-        ~factor(
-            category_name, 
-            levels=category_names
-            )
-        , labeller = label_wrap_gen(multi_line = TRUE)
-        , ncol=4) +
-    labs(
-        title = "Distribution of predicted differences | By Group"
-        ,subtitle = "Intervention: Delay close of enrollment"
-        ,x = "Difference in probability due to intervention"
-        ,y = "Probability Density"
-    ) + 
-    geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
-    theme(strip.text.x = element_text(size = 8))
-
-
-
-

-
-
-
-
ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_by_group.png"))
-
-
Saving 7 x 5 in image
-
-
ggplot(pddf_ib, aes(x=value,)) +
-    geom_histogram(bins=300) +
-    facet_wrap(
-        ~factor(
-            category_name, 
-            levels=category_names
-            )
-        , labeller = label_wrap_gen(multi_line = TRUE)
-        , ncol=4) +
-    labs(
-        title = "Histogram of predicted differences | By Group"
-        ,subtitle = "Intervention: Delay close of enrollment"
-        ,x = "Difference in probability due to intervention"
-        ,y = "Predicted counts"
-    ) + 
-    geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
-    theme(strip.text.x = element_text(size = 8))
-
-
-
-

-
-
-
-
ggsave(paste0(image_dist_diff_analysis,"p_delay_intervention_histdiff_by_group.png"))
-
-
Saving 7 x 5 in image
-
-
-
-
p3 <- ggplot(pddf_ib, aes(x=value,)) +
-    geom_histogram(bins=500) +
-    labs(
-        title = "Distribution of predicted differences"
-        ,subtitle = "Intervention: Delay close of enrollment"
-        ,x = "Difference in probability due to intervention"
-        ,y = "Probability Density"
-        ,caption = "Vertical marks: 5/10/25/50/75/90/95th percentiles. Dot shows mean."
-    ) + 
-    geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") 
-
-density_hight <- max(density(pddf_ib$value)$y)
-
-stats <- list(
- p5 = quantile(pddf_ib$value, 0.05),
- p10 = quantile(pddf_ib$value, 0.10),
- q1 = quantile(pddf_ib$value, 0.25),
- med = median(pddf_ib$value),
- mean = mean(pddf_ib$value),
- q3 = quantile(pddf_ib$value, 0.75),
- p90 = quantile(pddf_ib$value, 0.90),
- p95 = quantile(pddf_ib$value, 0.95),
- max_height = max(ggplot_build(p3)$data[[1]]$count),
- y_offset = -max(ggplot_build(p3)$data[[1]]$count) * 0.05,
- y_offset_density = -density_hight * 0.15
-)
-
-p3 + 
- # Box
- geom_segment(data = data.frame(
-   x = c(stats$q1, stats$q3, stats$med),
-   xend = c(stats$q1, stats$q3, stats$med),
-   y = rep(stats$y_offset, 3), 
-   yend = rep(stats$y_offset * 2, 3)
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- geom_segment(data = data.frame(
-   x = rep(stats$q1, 2),
-   xend = rep(stats$q3, 2),
-   y = c(stats$y_offset, stats$y_offset * 2),
-   yend = c(stats$y_offset, stats$y_offset * 2)
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Inner whiskers (Q1->P10, Q3->P90)
- geom_segment(data = data.frame(
-   x = c(stats$q1, stats$q3),
-   xend = c(stats$p10, stats$p90),
-   y = rep(stats$y_offset * 1.5, 2),
-   yend = rep(stats$y_offset * 1.5, 2)
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Crossbars at P10/P90
- geom_segment(data = data.frame(
-   x = c(stats$p10, stats$p90),
-   xend = c(stats$p10, stats$p90),
-   y = stats$y_offset * 1.3,
-   yend = stats$y_offset * 1.7
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Outer whiskers (P10->P5, P90->P95)
- geom_segment(data = data.frame(
-   x = c(stats$p10, stats$p90),
-   xend = c(stats$p5, stats$p95),
-   y = rep(stats$y_offset * 1.5, 2),
-   yend = rep(stats$y_offset * 1.5, 2)
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Crossbars at P5/P95
- geom_segment(data = data.frame(
-   x = c(stats$p5, stats$p95),
-   xend = c(stats$p5, stats$p95),
-   y = stats$y_offset * 1.3,
-   yend = stats$y_offset * 1.7
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Mean dot
- geom_point(data = data.frame(
-   x = stats$mean,
-   y = stats$y_offset * 1.5
- ), aes(x = x, y = y))
-
-
-
-

-
-
-
-
ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_histdiff_boxplot.png"))
-
-
Saving 7 x 5 in image
-
-
p4 <- ggplot(pddf_ib, aes(x = value)) +
-  geom_density() +
-    labs(
-        title = "Distribution of predicted differences"
-        ,subtitle = "Intervention: Delay close of enrollment"
-        ,x = "Difference in probability due to intervention"
-        ,y = "Probability Density"
-        ,caption = "Vertical marks: 5/10/25/50/75/90/95th percentiles. Dot shows mean."
-    ) + 
-    geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
- geom_segment(data = data.frame(
-   x = c(stats$q1, stats$q3, stats$med),
-   xend = c(stats$q1, stats$q3, stats$med),
-   y = rep(stats$y_offset_density, 3), 
-   yend = rep(stats$y_offset_density * 2, 3)
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- geom_segment(data = data.frame(
-   x = rep(stats$q1, 2),
-   xend = rep(stats$q3, 2),
-   y = c(stats$y_offset_density, stats$y_offset_density * 2),
-   yend = c(stats$y_offset_density, stats$y_offset_density * 2)
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Inner whiskers (Q1->P10, Q3->P90)
- geom_segment(data = data.frame(
-   x = c(stats$q1, stats$q3),
-   xend = c(stats$p10, stats$p90),
-   y = rep(stats$y_offset_density * 1.5, 2),
-   yend = rep(stats$y_offset_density * 1.5, 2)
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Crossbars at P10/P90
- geom_segment(data = data.frame(
-   x = c(stats$p10, stats$p90),
-   xend = c(stats$p10, stats$p90),
-   y = stats$y_offset_density * 1.3,
-   yend = stats$y_offset_density * 1.7
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Outer whiskers (P10->P5, P90->P95)
- geom_segment(data = data.frame(
-   x = c(stats$p10, stats$p90),
-   xend = c(stats$p5, stats$p95),
-   y = rep(stats$y_offset_density * 1.5, 2),
-   yend = rep(stats$y_offset_density * 1.5, 2)
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Crossbars at P5/P95
- geom_segment(data = data.frame(
-   x = c(stats$p5, stats$p95),
-   xend = c(stats$p5, stats$p95),
-   y = stats$y_offset_density * 1.3,
-   yend = stats$y_offset_density * 1.7
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Mean dot
- geom_point(data = data.frame(
-   x = stats$mean,
-   y = stats$y_offset_density * 1.5
- ), aes(x = x, y = y))
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_boxplot.png"))
-
-
Saving 7 x 5 in image
-
-
-
-
 ggplot(pddf_ib, aes(x=value)) +
-    stat_ecdf(geom='step') +
-    labs(
-        title = "Cumulative distribution of predicted differences",
-        subtitle = "Intervention: Delay close of enrollment",
-        x = "Difference in probability of termination due to intervention",
-        y = "Cumulative Proportion"
-    ) 
-
-
-
-

-
-
-
-
ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_cumulative_distdiff.png"))
-
-
Saving 7 x 5 in image
-
-
-

Get the % of differences in the spike around zero

-
-
# get values around and above/below spike
-width <- 0.02
-spike_band_centered_zero <- mean( pddf_ib$value >= -width/2 & pddf_ib$value <= width/2)
-above_spike_band <- mean( pddf_ib$value >= width/2)
-below_spike_band <- mean( pddf_ib$value <= -width/2)
-
-# get mass above and mass below zero
-mass_below_zero <- mean( pddf_ib$value <= 0)
-
-

Looking at the spike around zero, we find that 39.193869% of the probability mass is contained within the band from [-1,1]. Additionally, there was 51.937619% of the probability above that – representing those with a general increase in the probability of termination – and 8.8685119% of the probability mass below the band – representing a decrease in the probability of termination.

-

On average, if you keep the trial open instead of closing it, 0.2488518% of trials will see a decrease in the probability of termination, but, due to the high increase in probability of termination given termination was increased, the mean probability of termination increases by 0.0249444.

-
-
# 5%-iles
-
-summary(pddf_ib$value)
-
-
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
--0.42755  0.00000  0.01164  0.02494  0.04213  0.54813 
-
-
# Create your quantiles
-quants <- quantile(pddf_ib$value, probs = seq(0,1,0.05), type=4)
-
-# Convert to a data frame
-quant_df <- data.frame(
-  Percentile = names(quants),
-  Value = quants
-)
-kable(quant_df)
-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
PercentileValue
0%0%-0.4275511
5%5%-0.0213008
10%10%-0.0079768
15%15%-0.0021932
20%20%-0.0001065
25%25%0.0000000
30%30%0.0000535
35%35%0.0012179
40%40%0.0040158
45%45%0.0075646
50%50%0.0116358
55%55%0.0162235
60%60%0.0214039
65%65%0.0272582
70%70%0.0340562
75%75%0.0421259
80%80%0.0519438
85%85%0.0645430
90%90%0.0819636
95%95%0.1102529
100%100%0.5481257
-
-
-

There seems to be some trials that are highly suceptable to this enrollment delay. Specifically, there were some

-
-
n = length(counterfact_predicted_ib$p_predicted_intervention)
-k = 100
-simulated_terminations_intervention <- mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_intervention)))
-simulated_terminations_base <-mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_default)))
-
-simulated_percentages <- (simulated_terminations_intervention - simulated_terminations_base)/k
-
-

The simulation above shows that this results in a percentage-point increase of about 2.4922006.

-
-
-
-

Diagnostics

-
-
#trace plots
-image_diagnostics <- paste0(image_root,"/diagnostics")
-
-plot(fit, pars=c("mu"), plotfun="trace")
-
-
-
-

-
-
-
-
ggsave(paste0(image_diagnostics,"/trace_plot_mu.png"))
-
-
Saving 7 x 5 in image
-
-
for (i in 1:3) {
-    print(
-        mcmc_rank_overlay(
-        fit, 
-        pars=c(
-            paste0("mu[",4*i-3,"]"),
-            paste0("mu[",4*i-2,"]"),
-            paste0("mu[",4*i-1,"]"),
-            paste0("mu[",4*i,"]")
-            ), 
-        n_bins=100
-        )+  legend_move("top") +
-             scale_colour_ghibli_d("KikiMedium")
-    )
-    mu_range <- paste0(4*i-3,"-",4*i)
-    filename <- paste0(image_diagnostics,"/trace_rank_plot_mu_",mu_range,".png")
-    ggsave(filename)
-}
-
-
Scale for colour is already present.
-Adding another scale for colour, which will replace the existing scale.
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-Scale for colour is already present.
-Adding another scale for colour, which will replace the existing scale.
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-Scale for colour is already present.
-Adding another scale for colour, which will replace the existing scale.
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
plot(fit, pars=c("sigma"), plotfun="trace")
-
-
-
-

-
-
-
-
ggsave(paste0(image_diagnostics,"/traceplot_sigma.png"))
-
-
Saving 7 x 5 in image
-
-
for (i in 1:3) {
-    print(
-        mcmc_rank_overlay(
-        fit, 
-        pars=c(
-            paste0("sigma[",4*i-3,"]"),
-            paste0("sigma[",4*i-2,"]"),
-            paste0("sigma[",4*i-1,"]"),
-            paste0("sigma[",4*i,"]")
-            ), 
-        n_bins=100
-        )+  legend_move("top") +
-             scale_colour_ghibli_d("KikiMedium")
-    )
-    sigma_range <- paste0(4*i-3,"-",4*i)
-    filename <- paste0(image_diagnostics,"/trace_rank_plot_sigma_",sigma_range,".png")
-    ggsave(filename)
-}
-
-
Scale for colour is already present.
-Adding another scale for colour, which will replace the existing scale.
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-Scale for colour is already present.
-Adding another scale for colour, which will replace the existing scale.
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-Scale for colour is already present.
-Adding another scale for colour, which will replace the existing scale.
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
#other diagnostics
-logpost <- log_posterior(fit)
-nuts_prmts <- nuts_params(fit)
-posterior <- as.array(fit)
-
-
-
color_scheme_set("darkgray")
-div_style <- parcoord_style_np(div_color = "green", div_size = 0.05, div_alpha = 0.4)
-mcmc_parcoord(posterior, regex_pars = "mu", np=nuts_prmts, np_style = div_style, alpha = 0.05)
-
-
-
-

-
-
-
-
ggsave(paste0(image_diagnostics,"/parcoord_mu.png"))
-
-
Saving 7 x 5 in image
-
-
-
-
for (i in 1:3) {
-    mus = sapply(3:0, function(j) paste0("mu[",4*i-j ,"]"))
-    print(
-        mcmc_pairs(
-            posterior,
-            np = nuts_prmts,
-            pars=c(
-                mus,
-                "lp__"
-            ),
-            off_diag_args = list(size = 0.75)
-        )
-    )
-    mu_range <- paste0(4*i-3,"-",4*i)
-    filename <- paste0(image_diagnostics,"/correlation_plot_mu_",mu_range,".png")
-    ggsave(filename)
-}
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
mcmc_parcoord(posterior,regex_pars = "sigma", np=nuts_prmts, alpha=0.05)
-
-
-
-

-
-
-
-
ggsave(paste0(image_diagnostics,"/parcoord_sigma.png"))
-
-
Saving 7 x 5 in image
-
-
-
-
for (i in 1:3) {
-    params = sapply(3:0, function(j) paste0("sigma[",4*i-j ,"]"))
-    print(
-        mcmc_pairs(
-            posterior,
-            np = nuts_prmts,
-            pars=c(
-                params,
-                "lp__"
-            ),
-            off_diag_args = list(size = 0.75)
-        )
-    )
-    sigma_range <- paste0(4*i-3,"-",4*i)
-    filename <- paste0(image_diagnostics,"/correlation_plot_sigma_",sigma_range,".png")
-    ggsave(filename)
-}
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
for (k in 1:22) {
-for (i in 1:3) {
-    params = sapply(3:0, function(j) paste0("beta[",k,",",4*i-j ,"]"))
-    print(
-        mcmc_pairs(
-            posterior,
-            np = nuts_prmts,
-            pars=c(
-                params,
-                "lp__"
-            ),
-            off_diag_args = list(size = 0.75)
-        )
-    )
-    
-    beta_range <- paste0("k_",k,"_i_",4*i-3,"-",4*i)
-    filename <- paste0(image_diagnostics,"/correlation_plot_beta_",beta_range,".png")
-    ggsave(filename)
-}}
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
-
-

-
-
-
-
-
Saving 7 x 5 in image
-
-
-
- -
- - -
- - - - - \ No newline at end of file diff --git a/r-analysis/EffectsOfEnrollmentDelay_rebased.qmd b/r-analysis/EffectsOfEnrollmentDelay_rebased.qmd deleted file mode 100644 index 46b6f20..0000000 --- a/r-analysis/EffectsOfEnrollmentDelay_rebased.qmd +++ /dev/null @@ -1,1224 +0,0 @@ ---- -title: "The Effects of Recruitment status on completion of clinical trials" -author: "Will King" -format: html -editor: source ---- - - -# Setup - -```{r} -library(knitr) -library(bayesplot) -available_mcmc(pattern = "_nuts_") -library(ggplot2) -library(patchwork) -library(tidyverse) -library(rstan) -library(tidyr) -library(ghibli) -library(xtable) -#Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started - -#save unchanged models instead of recompiling -rstan_options(auto_write = TRUE) -#allow for multithreaded sampling -options(mc.cores = parallel::detectCores()) - -#test installation, shouldn't get any errors -#example(stan_model, package = "rstan", run.dontrun = TRUE) - - -image_root <- "./output/withdiff/EffectsOfEnrollmentDelay" -image_dist_diff_analysis <- paste0(image_root,"/dist_diff_analysis") -image_trial_details <-paste0(image_root,"/trials_details") -image_parameters_by_groups <-paste0(image_root,"/betas/parameters_by_group") -image_parameters_across_groups <-paste0(image_root,"/betas/parameter_across_groups") - -``` - -```{r} -################ Pull data from database ###################### -library(RPostgreSQL) -host <- 'aact_db-restored-2025-01-07' - -driver <- dbDriver("PostgreSQL") - -get_data <- function(driver) { - -con <- dbConnect( - driver, - user='root', - password='root', - dbname='aact_db', - host=host - ) -on.exit(dbDisconnect(con)) - -query <- dbSendQuery( - con, -# "select * from formatted_data_with_planned_enrollment;" -" -select - fdqpe.nct_id - --,fdqpe.start_date - --,fdqpe.current_enrollment - --,fdqpe.enrollment_category - ,fdqpe.current_status - ,fdqpe.earliest_date_observed - ,fdqpe.elapsed_duration - ,fdqpe.n_brands as identical_brands - ,ntbtu.brand_name_counts - ,fdqpe.category_id - ,fdqpe.final_status - ,fdqpe.h_sdi_val - --,fdqpe.h_sdi_u95 - --,fdqpe.h_sdi_l95 - ,fdqpe.hm_sdi_val - --,fdqpe.hm_sdi_u95 - --,fdqpe.hm_sdi_l95 - ,fdqpe.m_sdi_val - --,fdqpe.m_sdi_u95 - --,fdqpe.m_sdi_l95 - ,fdqpe.lm_sdi_val - --,fdqpe.lm_sdi_u95 - --,fdqpe.lm_sdi_l95 - ,fdqpe.l_sdi_val - --,fdqpe.l_sdi_u95 - --,fdqpe.l_sdi_l95 -from formatted_data_with_planned_enrollment fdqpe - join \"Formularies\".nct_to_brand_counts_through_uspdc ntbtu - on fdqpe.nct_id = ntbtu.nct_id -order by fdqpe.nct_id, fdqpe.earliest_date_observed -; -" - ) -df <- fetch(query, n = -1) -df <- na.omit(df) - -query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;") -n_categories <- fetch(query2, n = -1) - -return(list(data=df,ncat=n_categories)) -} - - -get_counterfact_base <- function(driver) { - -con <- dbConnect( - driver, - user='root', - password='root', - dbname='aact_db', - host=host - ) -on.exit(dbDisconnect(con)) - -query <- dbSendQuery( - con, - " - with cte as ( - --get last recruiting state - select fd.nct_id, max(fd.earliest_date_observed),min(fd2.earliest_date_observed) as tmstmp - from formatted_data fd - join formatted_data fd2 - on fd.nct_id=fd2.nct_id and fd.earliest_date_observed < fd2.earliest_date_observed - where fd.current_status = 'Recruiting' - and fd2.current_status = 'Active, not recruiting' - group by fd.nct_id - ) - select - fdqpe.nct_id - --,fdqpe.start_date - --,fdqpe.current_enrollment - --,fdqpe.enrollment_category - ,fdqpe.current_status - ,fdqpe.earliest_date_observed - ,fdqpe.elapsed_duration - ,fdqpe.n_brands as identical_brands - ,ntbtu.brand_name_counts - ,fdqpe.category_id - ,fdqpe.final_status - ,fdqpe.h_sdi_val - --,fdqpe.h_sdi_u95 - --,fdqpe.h_sdi_l95 - ,fdqpe.hm_sdi_val - --,fdqpe.hm_sdi_u95 - --,fdqpe.hm_sdi_l95 - ,fdqpe.m_sdi_val - --,fdqpe.m_sdi_u95 - --,fdqpe.m_sdi_l95 - ,fdqpe.lm_sdi_val - --,fdqpe.lm_sdi_u95 - --,fdqpe.lm_sdi_l95 - ,fdqpe.l_sdi_val - --,fdqpe.l_sdi_u95 - --,fdqpe.l_sdi_l95 - from formatted_data_with_planned_enrollment fdqpe - join \"Formularies\".nct_to_brand_counts_through_uspdc ntbtu - on fdqpe.nct_id = ntbtu.nct_id - join cte - on fdqpe.nct_id = cte.nct_id - and fdqpe.earliest_date_observed = cte.tmstmp - order by fdqpe.nct_id, fdqpe.earliest_date_observed - ; - " - ) -df <- fetch(query, n = -1) -df <- na.omit(df) - -query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;") -n_categories <- fetch(query2, n = -1) - -return(list(data=df,ncat=n_categories)) -} - - -d <- get_data(driver) -df <- d$data -n_categories <- d$ncat - -cf <- get_counterfact_base(driver) -df_counterfact_base <- cf$data - - - -################ Format Data ########################### - -data_formatter <- function(df) { -categories <- df["category_id"] - -x <- df["elapsed_duration"] -x["identical_brands"] <- asinh(df$identical_brands) -x["brand_name_counts"] <- asinh(df$brand_name_count) -x["h_sdi_val"] <- asinh(df$h_sdi_val) -x["hm_sdi_val"] <- asinh(df$hm_sdi_val) -x["m_sdi_val"] <- asinh(df$m_sdi_val) -x["lm_sdi_val"] <- asinh(df$lm_sdi_val) -x["l_sdi_val"] <- asinh(df$l_sdi_val) - - -#Setup fixed effects - -x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0) -x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0) -x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0) -x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0) - - -y <- ifelse(df["final_status"]=="Terminated",1,0) - -#get category list - - -return(list(x=x,y=y)) -} - -train <- data_formatter(df) -counterfact_base <- data_formatter(df_counterfact_base) - -categories <- df$category_id - -x <- train$x -y <- train$y - -x_cf_base <- counterfact_base$x -y_cf_base <- counterfact_base$y -cf_categories <- df_counterfact_base$category_id -``` - - - -# Fit Model - - - - -```{r} -################################# FIT MODEL ######################################### -inherited_cols <- c( - "elapsed_duration" - ,"identical_brands" - ,"brand_name_counts" - ,"h_sdi_val" - ,"hm_sdi_val" - ,"m_sdi_val" - ,"lm_sdi_val" - ,"l_sdi_val" - ,"status_NYR" - ,"status_EBI" - ,"status_Rec" - ,"status_ANR" -) -``` - - - -```{r} -beta_list <- list( - groups = c( - `1`="Infections & Parasites", - `2`="Neoplasms", - `3`="Blood & Immune system", - `4`="Endocrine, Nutritional, and Metabolic", - `5`="Mental & Behavioral", - `6`="Nervous System", - `7`="Eye and Adnexa", - `8`="Ear and Mastoid", - `9`="Circulatory", - `10`="Respiratory", - `11`="Digestive", - `12`="Skin & Subcutaneaous tissue", - `13`="Musculoskeletal", - `14`="Genitourinary", - `15`="Pregancy, Childbirth, & Puerperium", - `16`="Perinatal Period", - `17`="Congential", - `18`="Symptoms, Signs etc.", - `19`="Injury etc.", - `20`="External Causes", - `21`="Contact with Healthcare", - `22`="Special Purposes" - ), - parameters = c( - `1`="Elapsed Duration", - # brands - `2`="asinh(Generic Brands)", - `3`="asinh(Competitors USPDC)", - # population - `4`="asinh(High SDI)", - `5`="asinh(High-Medium SDI)", - `6`="asinh(Medium SDI)", - `7`="asinh(Low-Medium SDI)", - `8`="asinh(Low SDI)", - #Status - `9`="status_NYR", - `10`="status_EBI", - `11`="status_Rec", - `12`="status_ANR" - ) -) - -get_parameters <- function(stem,class_list) { - #get categories and lengths - named <- names(class_list) - lengths <- sapply(named, (function (x) length(class_list[[x]]))) - - #describe the grid needed - iter_list <- sapply(named, (function (x) 1:lengths[x])) - - #generate the list of parameters - pardf <- generate_parameter_df(stem, iter_list) - - #add columns with appropriate human-readable names - for (name in named) { - pardf[paste(name,"_hr",sep="")] <- as.factor( - sapply(pardf[name], (function (i) class_list[[name]][i])) - ) - } - - return(pardf) -} - -generate_parameter_df <- function(stem, iter_list) { - grid <- expand.grid(iter_list) - grid["param_name"] <- grid %>% unite(x,colnames(grid),sep=",") - grid["param_name"] <- paste(stem,"[",grid$param_name,"]",sep="") - return(grid) -} - -group_mcmc_areas <- function( - stem,# = "beta" - class_list,# = beta_list - stanfit,# = fit - group_id,# = 2 - rename=TRUE, - filter=NULL - ) { - - #get all parameter names - params <- get_parameters(stem,class_list) - - #filter down to parameters of interest - params <- filter(params,groups == group_id) - #Get dataframe with only the rows of interest - filtdata <- as.data.frame(stanfit)[params$param_name] - #rename columns - if (rename) dimnames(filtdata)[[2]] <- params$parameters_hr - #get group name for title - group_name <- class_list$groups[group_id] - #create area plot with appropriate title - p <- mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) + - ggtitle(paste("Parameter distributions for ICD-10 class:",group_name)) + - geom_vline(xintercept=seq(-2,2,0.5),color="grey",alpha=0.750) - - d <- pivot_longer(filtdata, everything()) |> - group_by(name) |> - summarize( - mean=mean(value) - ,q025 = quantile(value,probs = 0.025) - ,q975 = quantile(value,probs = 0.975) - ,q05 = quantile(value,probs = 0.05) - ,q95 = quantile(value,probs = 0.95) - ) - return(list(plot=p,quantiles=d,name=group_name)) -} - -parameter_mcmc_areas <- function( - stem,# = "beta" - class_list,# = beta_list - stanfit,# = fit - parameter_id,# = 2 - rename=TRUE - ) { - #get all parameter names - params <- get_parameters(stem,class_list) - #filter down to parameters of interest - params <- filter(params,parameters == parameter_id) - #Get dataframe with only the rows of interest - filtdata <- as.data.frame(stanfit)[params$param_name] - #rename columns - if (rename) dimnames(filtdata)[[2]] <- params$groups_hr - #get group name for title - parameter_name <- class_list$parameters[parameter_id] - #create area plot with appropriate title - p <- mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) + - ggtitle(parameter_name,"Parameter Distribution") + - geom_vline(xintercept=seq(-2,2,0.5),color="grey",alpha=0.750) - - d <- pivot_longer(filtdata, everything()) |> - group_by(name) |> - summarize( - mean=mean(value) - ,q025 = quantile(value,probs = 0.025) - ,q975 = quantile(value,probs = 0.975) - ,q05 = quantile(value,probs = 0.05) - ,q95 = quantile(value,probs = 0.95) - ) - return(list(plot=p,quantiles=d,name=parameter_name)) -} - - -``` - -Plan: select all snapshots that are the first to have closed -enrollment (Rec -> ANR when comparing across snapshots), and then -```{r} -#delay intervention -intervention_enrollment <- x_cf_base[c(inherited_cols)] -#TOFIX: ^^^ This ordering of columns is -intervention_enrollment["status_ANR"] <- 0 -intervention_enrollment["status_Rec"] <- 1 -``` - -```{r} -counterfact_delay <- list( - D = ncol(x),# - N = nrow(x), - L = n_categories$count, - y = as.vector(y), - ll = as.vector(categories), - x = as.matrix(x), - mu_mean = 0, - mu_stdev = 0.05, - sigma_shape = 4, - sigma_rate = 20, - Nx = nrow(x_cf_base), - llx = as.vector(cf_categories), - counterfact_x_tilde = as.matrix(intervention_enrollment), - counterfact_x = as.matrix(x_cf_base), - status_indexes = c(11,12) #subtract anr from recruiting to get movement from anr to recruiting -) -``` - -```{r} -fit <- stan( - file='Hierarchal_Logistic.stan', - data = counterfact_delay, - chains = 4, - iter = 5000, - seed = 11021585 - ) -``` - - - -## Calculate relative difference in parameters between recruiting and active not recruiting states - - -```{r} -filt_data <- as.data.frame(extract(fit,pars="status_diff")) -dimnames(filt_data)[[2]] <- beta_list$groups -mcmc_areas(filt_data,prob = 0.8, prob_outer = 0.95) + - ggtitle("Relative fixed effects across groups", subtitle ="moving from `Active, not recruiting` to `Recruiting`") + - geom_vline(xintercept=seq(-2,2,0.5),color="grey",alpha=0.750) -``` - -I've got an issue here, because this should be a movement from ANR -> recruiting -but that is suggesting that - -## Explore data - - -```{r} -#get number of trials and snapshots in each category -group_trials_by_category <- as.data.frame(aggregate(category_id ~ nct_id, df, max)) -group_trials_by_category <- as.data.frame(group_trials_by_category) - -category_count <- group_trials_by_category |> group_by(category_id) |> count() - -``` - - -```{r} -################################# DATA EXPLORATION ############################ -driver <- dbDriver("PostgreSQL") - -con <- dbConnect( - driver, - user='root', - password='root', - dbname='aact_db', - host=host - ) -#Plot histogram of count of snapshots -df3 <- dbGetQuery( - con, - "select nct_id,final_status,count(*) from formatted_data_with_planned_enrollment fdwpe - group by nct_id,final_status ;" - ) -#df3 <- fetch(query3, n = -1) - -ggplot(data=df3, aes(x=count, fill=final_status)) + - geom_histogram(binwidth=1) + - ggtitle("Histogram of snapshots per trial (matched trials)") + - xlab("Snapshots per trial") -ggsave(paste0(image_trial_details,"/HistSnapshots.png")) - -#Plot duration for terminated vs completed -df4 <- dbGetQuery( - con, - " - select - nct_id, - start_date , - primary_completion_date, - overall_status , - primary_completion_date - start_date as duration - from ctgov.studies s - where nct_id in (select distinct nct_id from http.download_status ds) - ;" - ) -#df4 <- fetch(query4, n = -1) - -ggplot(data=df4, aes(x=duration,fill=overall_status)) + - geom_histogram()+ - ggtitle("Histogram of trial durations") + - xlab("duration")+ - facet_wrap(~overall_status) -ggsave(paste0(image_trial_details,"/HistTrialDurations_Faceted.png")) - -df5 <- dbGetQuery( - con, - " - with cte1 as ( - select - nct_id, - start_date , - primary_completion_date, - overall_status , - primary_completion_date - start_date as duration - from ctgov.studies s - where nct_id in (select distinct nct_id from http.download_status ds) - ), cte2 as ( - select nct_id,count(*) as snapshot_count from formatted_data_with_planned_enrollment fdwpe - group by nct_id - ) - select a.nct_id, a.overall_status, a.duration,b.snapshot_count - from cte1 as a - join cte2 as b - on a.nct_id=b.nct_id - ;" - ) -df5$overall_status <- as.factor(df5$overall_status) - -ggplot(data=df5, aes(x=duration,y=snapshot_count,color=overall_status)) + - geom_jitter() + - ggtitle("Comparison of duration, status, and snapshot_count") + - xlab("duration") + - ylab("snapshot count") -ggsave(paste0(image_trial_details,"/SnapshotsVsDurationVsTermination.png")) - -dbDisconnect(con) - -#get number of trials and snapshots in each category -group_trials_by_category <- as.data.frame(aggregate(category_id ~ nct_id, df, max)) -group_trials_by_category <- as.data.frame(group_trials_by_category) - -ggplot(data = group_trials_by_category, aes(x=category_id)) + - geom_bar(binwidth=1,color="black",fill="lightblue") + - scale_x_continuous(breaks=scales::pretty_breaks(n=22)) + - labs( - title="bar chart of trial categories" - ,x="Category ID" - ,y="Count" - ) -ggsave(paste0(image_trial_details,"/CategoryCounts.png")) - - - -summary(df5) - -cor(df5$duration,df5$snapshot_count) -sum(df5$snapshot_count) -``` - - - - -## Fit Results - - -```{r} -################################# ANALYZE ##################################### -#print(fit) -``` - -# Parameter Distributions - - -```{r} -#g1 <- group_mcmc_areas("beta",beta_list,fit,1) - - -gx <- c() - -#grab parameters for every category with more than 8 observations -for (i in category_count$category_id[category_count$n >= 0]) { - print(i) - - #Print parameter distributions - gi <- group_mcmc_areas("beta",beta_list,fit,i) #add way to filter groups - ggsave( - paste0(image_parameters_by_groups,"/group_",i,"_",gi$name,".png") - ,plot=gi$plot - ) - gx <- c(gx,gi) - - #Get Quantiles and means for parameters -# table <- xtable(gi$quantiles, -# floating=FALSE -# ,latex.environments = NULL -# ,booktabs = TRUE -# ,zap=getOption("digits") -# ) -# write_lines(table,paste0("./latex_output/DirectEffects/group_",gi$name,".tex")) -} -``` - - - -```{r} -px <- c() - - -for (i in c(1,2,3,9,10,11,12)) { - - #Print parameter distributions - pi <- parameter_mcmc_areas("beta",beta_list,fit,i) #add way to filter groups - ggsave( - paste0(image_parameters_across_groups,"/parameters_",i,"_",pi$name,".png") - ,plot=pi$plot - ) - px <- c(px,pi) - - #Get Quantiles and means for parameters -# table <- xtable(pi$quantiles, -# floating=FALSE -# ,latex.environments = NULL -# ,booktabs = TRUE -# ,zap=getOption("digits") -# ) -# write_lines(table,paste0("./latex_output/DirectEffects/parameters_",i,"_",pi$name,".tex")) - -} -``` - -Note these have 95% outer CI and 80% inner (shaded) - - - - - -```{r} -#get the generic and uspdc parameters -print(px[4]$plot + px[7]$plot) -ggsave(paste0(image_parameters_across_groups,"2+3_generic_and_uspdc.png")) - -#get the parameters associated with duration -px[16]$plot + px[19]$plot -ggsave(paste0(image_parameters_across_groups,"11+12_statusREC_and_statusANR.png")) -``` - - -# Counterfactuals - -```{r} -generated_ib <- gqs( - fit@stanmodel, - data=counterfact_delay, - draws=as.matrix(fit), - seed=11021585 - ) -``` - -### Get priors -```{r} -df_ib_p <- data.frame( - p_prior=as.vector(extract(generated_ib, pars="p_prior")$p_prior) - ,p_predicted = as.vector(extract(generated_ib, pars="p_predicted")$p_predicted) -) - -df_ib_prior <- data.frame( - mu_prior = as.vector(extract(generated_ib, pars="mu_prior")$mu_prior) - ,sigma_prior = as.vector(extract(generated_ib, pars="sigma_prior")$sigma_prior) -) - -#p_prior -ggplot(df_ib_p, aes(x=p_prior)) + - geom_density() + - labs( - title="implied prior distribution p" - ,subtitle="" - ,x="probability domain 'p'" - ,y="probability density" - ) -ggsave(paste0(image_dist_diff_analysis,"/prior_p.png")) - -#p_posterior -ggplot(df_ib_p, aes(x=p_predicted)) + - geom_density() + - labs( - title="implied posterior distribution p" - ,subtitle="" - ,x="probability domain 'p'" - ,y="Probability Density" - ) -ggsave(paste0(image_dist_diff_analysis,"/posterior_p.png")) - -#mu_prior -ggplot(df_ib_prior) + - geom_density(aes(x=mu_prior)) + - labs( - title="Prior - Mu" - ,subtitle="same prior for all Mu values" - ,x="Mu" - ,y="Probability" - ) -ggsave(paste0(image_dist_diff_analysis,"/prior_mu.png")) - -#sigma_posterior -ggplot(df_ib_prior) + - geom_density(aes(x=sigma_prior)) + - labs( - title="Prior - Sigma" - ,subtitle="same prior for all Sigma values" - ,x="Sigma" - ,y="Probability" - ) -ggsave(paste0(image_dist_diff_analysis,"/prior_sigma.png")) -``` - - - -```{r} -check_hmc_diagnostics(fit) -``` - - - - - -### Intervention: Delay close of enrollment - -```{r} -counterfact_predicted_ib <- data.frame( - p_predicted_default = as.vector(extract(generated_ib, pars="p_predicted_default")$p_predicted_default) - ,p_predicted_intervention = as.vector(extract(generated_ib, pars="p_predicted_intervention")$p_predicted_intervention) - ,predicted_difference = as.vector(extract(generated_ib, pars="predicted_difference")$predicted_difference) -) - - -ggplot(counterfact_predicted_ib, aes(x=p_predicted_default)) + - geom_density() + - labs( - title="Predicted Distribution of 'p'" - ,subtitle="Intervention: None" - ,x="Probability Domain 'p'" - ,y="Probability Density" - ) -ggsave(paste0(image_dist_diff_analysis,"p_no_intervention.png")) - -ggplot(counterfact_predicted_ib, aes(x=p_predicted_intervention)) + - geom_density() + - labs( - title="Predicted Distribution of 'p'" - ,subtitle="Intervention: Delay close of enrollment" - ,x="Probability Domain 'p'" - ,y="Probability Density" - ) -ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention.png")) - -ggplot(counterfact_predicted_ib, aes(x=predicted_difference)) + - geom_density() + - labs( - title="Predicted Distribution of differences 'p'" - ,subtitle="Intervention: Delay close of enrollment" - ,x="Difference in 'p' under treatment" - ,y="Probability Density" - ) -ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_1.png")) -``` - - -```{r} -get_category_count <- function(tbl, id) { - result <- tbl$n[tbl$category_id == id] - if(length(result) == 0) 0 else result -} - -category_names <- sapply(1:length(beta_list$groups), - function(i) sprintf("ICD-10 #%d: %s (n=%d)", - i, - beta_list$groups[i], - get_category_count(category_count, i))) - -``` - -```{r} -pddf_ib <- data.frame(extract(generated_ib, pars="predicted_difference")$predicted_difference) |> - pivot_longer(X1:X168) #CHANGE_NOTE: moved from X169 to X168 - -pddf_ib["entry_idx"] <- as.numeric(gsub("\\D","",pddf_ib$name)) -pddf_ib["category"] <- sapply(pddf_ib$entry_idx, function(i) df$category_id[i]) -pddf_ib["category_name"] <- sapply( - pddf_ib$category, - function(i) category_names[i] - ) - - - -ggplot(pddf_ib, aes(x=value,)) + - geom_density(adjust=1/5) + - labs( - title = "Distribution of predicted differences" - ,subtitle = "Intervention: Delay close of enrollment" - ,x = "Difference in probability due to intervention" - ,y = "Probability Density" - ) + - geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") -ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_styled.png")) - -ggplot(pddf_ib, aes(x=value,)) + - geom_density(adjust=1/5) + - facet_wrap( - ~factor( - category_name, - levels=category_names - ) - , labeller = label_wrap_gen(multi_line = TRUE) - , ncol=4) + - labs( - title = "Distribution of predicted differences | By Group" - ,subtitle = "Intervention: Delay close of enrollment" - ,x = "Difference in probability due to intervention" - ,y = "Probability Density" - ) + - geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + - theme(strip.text.x = element_text(size = 8)) -ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_by_group.png")) - -ggplot(pddf_ib, aes(x=value,)) + - geom_histogram(bins=300) + - facet_wrap( - ~factor( - category_name, - levels=category_names - ) - , labeller = label_wrap_gen(multi_line = TRUE) - , ncol=4) + - labs( - title = "Histogram of predicted differences | By Group" - ,subtitle = "Intervention: Delay close of enrollment" - ,x = "Difference in probability due to intervention" - ,y = "Predicted counts" - ) + - geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + - theme(strip.text.x = element_text(size = 8)) -ggsave(paste0(image_dist_diff_analysis,"p_delay_intervention_histdiff_by_group.png")) -``` - - -```{r} -p3 <- ggplot(pddf_ib, aes(x=value,)) + - geom_histogram(bins=500) + - labs( - title = "Distribution of predicted differences" - ,subtitle = "Intervention: Delay close of enrollment" - ,x = "Difference in probability due to intervention" - ,y = "Probability Density" - ,caption = "Vertical marks: 5/10/25/50/75/90/95th percentiles. Dot shows mean." - ) + - geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") - -density_hight <- max(density(pddf_ib$value)$y) - -stats <- list( - p5 = quantile(pddf_ib$value, 0.05), - p10 = quantile(pddf_ib$value, 0.10), - q1 = quantile(pddf_ib$value, 0.25), - med = median(pddf_ib$value), - mean = mean(pddf_ib$value), - q3 = quantile(pddf_ib$value, 0.75), - p90 = quantile(pddf_ib$value, 0.90), - p95 = quantile(pddf_ib$value, 0.95), - max_height = max(ggplot_build(p3)$data[[1]]$count), - y_offset = -max(ggplot_build(p3)$data[[1]]$count) * 0.05, - y_offset_density = -density_hight * 0.15 -) - -p3 + - # Box - geom_segment(data = data.frame( - x = c(stats$q1, stats$q3, stats$med), - xend = c(stats$q1, stats$q3, stats$med), - y = rep(stats$y_offset, 3), - yend = rep(stats$y_offset * 2, 3) - ), aes(x = x, xend = xend, y = y, yend = yend)) + - geom_segment(data = data.frame( - x = rep(stats$q1, 2), - xend = rep(stats$q3, 2), - y = c(stats$y_offset, stats$y_offset * 2), - yend = c(stats$y_offset, stats$y_offset * 2) - ), aes(x = x, xend = xend, y = y, yend = yend)) + - # Inner whiskers (Q1->P10, Q3->P90) - geom_segment(data = data.frame( - x = c(stats$q1, stats$q3), - xend = c(stats$p10, stats$p90), - y = rep(stats$y_offset * 1.5, 2), - yend = rep(stats$y_offset * 1.5, 2) - ), aes(x = x, xend = xend, y = y, yend = yend)) + - # Crossbars at P10/P90 - geom_segment(data = data.frame( - x = c(stats$p10, stats$p90), - xend = c(stats$p10, stats$p90), - y = stats$y_offset * 1.3, - yend = stats$y_offset * 1.7 - ), aes(x = x, xend = xend, y = y, yend = yend)) + - # Outer whiskers (P10->P5, P90->P95) - geom_segment(data = data.frame( - x = c(stats$p10, stats$p90), - xend = c(stats$p5, stats$p95), - y = rep(stats$y_offset * 1.5, 2), - yend = rep(stats$y_offset * 1.5, 2) - ), aes(x = x, xend = xend, y = y, yend = yend)) + - # Crossbars at P5/P95 - geom_segment(data = data.frame( - x = c(stats$p5, stats$p95), - xend = c(stats$p5, stats$p95), - y = stats$y_offset * 1.3, - yend = stats$y_offset * 1.7 - ), aes(x = x, xend = xend, y = y, yend = yend)) + - # Mean dot - geom_point(data = data.frame( - x = stats$mean, - y = stats$y_offset * 1.5 - ), aes(x = x, y = y)) -ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_histdiff_boxplot.png")) - - -p4 <- ggplot(pddf_ib, aes(x = value)) + - geom_density() + - labs( - title = "Distribution of predicted differences" - ,subtitle = "Intervention: Delay close of enrollment" - ,x = "Difference in probability due to intervention" - ,y = "Probability Density" - ,caption = "Vertical marks: 5/10/25/50/75/90/95th percentiles. Dot shows mean." - ) + - geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + - geom_segment(data = data.frame( - x = c(stats$q1, stats$q3, stats$med), - xend = c(stats$q1, stats$q3, stats$med), - y = rep(stats$y_offset_density, 3), - yend = rep(stats$y_offset_density * 2, 3) - ), aes(x = x, xend = xend, y = y, yend = yend)) + - geom_segment(data = data.frame( - x = rep(stats$q1, 2), - xend = rep(stats$q3, 2), - y = c(stats$y_offset_density, stats$y_offset_density * 2), - yend = c(stats$y_offset_density, stats$y_offset_density * 2) - ), aes(x = x, xend = xend, y = y, yend = yend)) + - # Inner whiskers (Q1->P10, Q3->P90) - geom_segment(data = data.frame( - x = c(stats$q1, stats$q3), - xend = c(stats$p10, stats$p90), - y = rep(stats$y_offset_density * 1.5, 2), - yend = rep(stats$y_offset_density * 1.5, 2) - ), aes(x = x, xend = xend, y = y, yend = yend)) + - # Crossbars at P10/P90 - geom_segment(data = data.frame( - x = c(stats$p10, stats$p90), - xend = c(stats$p10, stats$p90), - y = stats$y_offset_density * 1.3, - yend = stats$y_offset_density * 1.7 - ), aes(x = x, xend = xend, y = y, yend = yend)) + - # Outer whiskers (P10->P5, P90->P95) - geom_segment(data = data.frame( - x = c(stats$p10, stats$p90), - xend = c(stats$p5, stats$p95), - y = rep(stats$y_offset_density * 1.5, 2), - yend = rep(stats$y_offset_density * 1.5, 2) - ), aes(x = x, xend = xend, y = y, yend = yend)) + - # Crossbars at P5/P95 - geom_segment(data = data.frame( - x = c(stats$p5, stats$p95), - xend = c(stats$p5, stats$p95), - y = stats$y_offset_density * 1.3, - yend = stats$y_offset_density * 1.7 - ), aes(x = x, xend = xend, y = y, yend = yend)) + - # Mean dot - geom_point(data = data.frame( - x = stats$mean, - y = stats$y_offset_density * 1.5 - ), aes(x = x, y = y)) -ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_boxplot.png")) -``` - -```{r} - ggplot(pddf_ib, aes(x=value)) + - stat_ecdf(geom='step') + - labs( - title = "Cumulative distribution of predicted differences", - subtitle = "Intervention: Delay close of enrollment", - x = "Difference in probability of termination due to intervention", - y = "Cumulative Proportion" - ) - -ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_cumulative_distdiff.png")) -``` - - -Get the % of differences in the spike around zero -```{r} -# get values around and above/below spike -width <- 0.02 -spike_band_centered_zero <- mean( pddf_ib$value >= -width/2 & pddf_ib$value <= width/2) -above_spike_band <- mean( pddf_ib$value >= width/2) -below_spike_band <- mean( pddf_ib$value <= -width/2) - -# get mass above and mass below zero -mass_below_zero <- mean( pddf_ib$value <= 0) -``` -Looking at the spike around zero, we find that `r spike_band_centered_zero*100`% -of the probability mass is contained within the band from -[`r -width*100/2`,`r width*100/2`]. -Additionally, there was `r above_spike_band*100`% of the probability above that --- representing those with a general increase in the probability of termination -- -and `r below_spike_band*100`% of the probability mass below the band --- representing a decrease in the probability of termination. - -On average, if you keep the trial open instead of closing it, -`r mass_below_zero`% of trials will see a decrease in the probability of termination, -but, due to the high increase in probability of termination given termination was increased, -the mean probability of termination increases by `r stats$mean`. - -```{r} -# 5%-iles - -summary(pddf_ib$value) - -# Create your quantiles -quants <- quantile(pddf_ib$value, probs = seq(0,1,0.05), type=4) - -# Convert to a data frame -quant_df <- data.frame( - Percentile = names(quants), - Value = quants -) -kable(quant_df) -``` - -There seems to be some trials that are highly suceptable to this enrollment delay. Specifically, there were some - -```{r} -n = length(counterfact_predicted_ib$p_predicted_intervention) -k = 100 -simulated_terminations_intervention <- mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_intervention))) -simulated_terminations_base <-mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_default))) - -simulated_percentages <- (simulated_terminations_intervention - simulated_terminations_base)/k -``` - -The simulation above shows that this results in a percentage-point increase of about -`r simulated_percentages * 100`. - - - - -# Diagnostics - -```{r} -#| eval: true -#trace plots -image_diagnostics <- paste0(image_root,"/diagnostics") - -plot(fit, pars=c("mu"), plotfun="trace") -ggsave(paste0(image_diagnostics,"/trace_plot_mu.png")) - - -for (i in 1:3) { - print( - mcmc_rank_overlay( - fit, - pars=c( - paste0("mu[",4*i-3,"]"), - paste0("mu[",4*i-2,"]"), - paste0("mu[",4*i-1,"]"), - paste0("mu[",4*i,"]") - ), - n_bins=100 - )+ legend_move("top") + - scale_colour_ghibli_d("KikiMedium") - ) - mu_range <- paste0(4*i-3,"-",4*i) - filename <- paste0(image_diagnostics,"/trace_rank_plot_mu_",mu_range,".png") - ggsave(filename) -} -``` - -```{r} -#| eval: true -plot(fit, pars=c("sigma"), plotfun="trace") -ggsave(paste0(image_diagnostics,"/traceplot_sigma.png")) - -for (i in 1:3) { - print( - mcmc_rank_overlay( - fit, - pars=c( - paste0("sigma[",4*i-3,"]"), - paste0("sigma[",4*i-2,"]"), - paste0("sigma[",4*i-1,"]"), - paste0("sigma[",4*i,"]") - ), - n_bins=100 - )+ legend_move("top") + - scale_colour_ghibli_d("KikiMedium") - ) - sigma_range <- paste0(4*i-3,"-",4*i) - filename <- paste0(image_diagnostics,"/trace_rank_plot_sigma_",sigma_range,".png") - ggsave(filename) -} -``` - -```{r} -#| eval: true -#other diagnostics -logpost <- log_posterior(fit) -nuts_prmts <- nuts_params(fit) -posterior <- as.array(fit) - -``` - -```{r} -#| eval: true -color_scheme_set("darkgray") -div_style <- parcoord_style_np(div_color = "green", div_size = 0.05, div_alpha = 0.4) -mcmc_parcoord(posterior, regex_pars = "mu", np=nuts_prmts, np_style = div_style, alpha = 0.05) -ggsave(paste0(image_diagnostics,"/parcoord_mu.png")) -``` - -```{r} -#| eval: true -for (i in 1:3) { - mus = sapply(3:0, function(j) paste0("mu[",4*i-j ,"]")) - print( - mcmc_pairs( - posterior, - np = nuts_prmts, - pars=c( - mus, - "lp__" - ), - off_diag_args = list(size = 0.75) - ) - ) - mu_range <- paste0(4*i-3,"-",4*i) - filename <- paste0(image_diagnostics,"/correlation_plot_mu_",mu_range,".png") - ggsave(filename) -} - - - -``` - -```{r} -#| eval: true -mcmc_parcoord(posterior,regex_pars = "sigma", np=nuts_prmts, alpha=0.05) -ggsave(paste0(image_diagnostics,"/parcoord_sigma.png")) -``` - -```{r} -#| eval: true - -for (i in 1:3) { - params = sapply(3:0, function(j) paste0("sigma[",4*i-j ,"]")) - print( - mcmc_pairs( - posterior, - np = nuts_prmts, - pars=c( - params, - "lp__" - ), - off_diag_args = list(size = 0.75) - ) - ) - sigma_range <- paste0(4*i-3,"-",4*i) - filename <- paste0(image_diagnostics,"/correlation_plot_sigma_",sigma_range,".png") - ggsave(filename) -} -``` - - -```{r} -#| eval: true -for (k in 1:22) { -for (i in 1:3) { - params = sapply(3:0, function(j) paste0("beta[",k,",",4*i-j ,"]")) - print( - mcmc_pairs( - posterior, - np = nuts_prmts, - pars=c( - params, - "lp__" - ), - off_diag_args = list(size = 0.75) - ) - ) - - beta_range <- paste0("k_",k,"_i_",4*i-3,"-",4*i) - filename <- paste0(image_diagnostics,"/correlation_plot_beta_",beta_range,".png") - ggsave(filename) -}} -``` - - - - - - - diff --git a/r-analysis/Hierarchal_Logistic_with_status_diff.stan b/r-analysis/Hierarchal_Logistic_with_status_diff.stan deleted file mode 100644 index 8aa7a04..0000000 --- a/r-analysis/Hierarchal_Logistic_with_status_diff.stan +++ /dev/null @@ -1,102 +0,0 @@ -// -// This Stan program defines a simple model, with a -// vector of values 'y' modeled as normally distributed -// with mean 'mu' and standard deviation 'sigma'. -// -// Learn more about model development with Stan at: -// -// http://mc-stan.org/users/interfaces/rstan.html -// https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started -// - -// The input data is a vector 'y' of length 'N'. -data { - int D; // number of features - int N; // number of observations - int L; // number of categories - array[N] int y; // vector of dependent variables - array[N] int ll; // vector of categories - array[N] row_vector[D] x; // independent variables - real mu_mean; //hyperprior - real mu_stdev; //hyperprior - real sigma_shape; //hyperprior - real sigma_rate; //hyperprior - //counterfactuals - int Nx; - array[Nx] int llx;//vec of categories - array[Nx] row_vector[D] counterfact_x_tilde; // Posterior Prediction intervention - array[Nx] row_vector[D] counterfact_x; // Posterior Prediction intervention - //the two statuses to catch relative difference - array[2] int status_indexes; //the two status indexes to compare (order is x[1] - x[2]) -} -parameters { - array[D] real mu; - array[D] real sigma; - array[L] vector[D] beta; -} -model { - sigma ~ gamma(sigma_shape,sigma_rate); //hyperprior for stdev: shape and inverse scale - mu ~ normal(mu_mean, mu_stdev); //hyperprior for mean //TODO: convert to mvnormal - for (l in 1:L) { - beta[l] ~ normal(mu, sigma); - } - { - vector[N] x_beta_ll; - for (n in 1:N) { - x_beta_ll[n] = x[n] * beta[ll[n]]; - } - y ~ bernoulli_logit(x_beta_ll); - } -} -generated quantities { - //SETUP PRIOR PREDICTION - //preallocate - real mu_prior[D]; - real sigma_prior[D]; - real p_prior[N]; // what I have priors about - real p_predicted[N]; //predicted p_values - //intervention - real p_predicted_default[Nx]; //predicted p_values - real p_predicted_intervention[Nx]; //predicted p_values - real predicted_difference[Nx]; //difference in predicted values - - //collect array of relative status differences between - array[L] real status_diff; - //GENERATE RELATIVE DIFFERENCES BETWEEN STATUSES - for (l in 1:L) { - status_diff[l] = beta[l,status_indexes[1]] - beta[l,status_indexes[2]]; - } - - //GENERATE PRIOR PREDICTIONS - { - vector[D] beta_prior[L];//local var - //sample parameters - for (d in 1:D) { - mu_prior[d] = normal_rng(mu_mean,mu_stdev); - sigma_prior[d] = gamma_rng(sigma_shape,sigma_rate); - } - for (l in 1:L) { - for (d in 1:D) { - beta_prior[l,d] = normal_rng(mu_prior[d],sigma_prior[d]); - } - } - //generate probabilities - vector[D] b_prior[N];//local var - for (n in 1:N){ - b_prior[n] = beta_prior[ll[n]]; - p_prior[n] = inv_logit( x[n] * b_prior[n] ); - } - } - //GENERATE POSTERIOR PREDICTIONS - for (n in 1:N) { - p_predicted[n] = inv_logit( x[n] * beta[ll[n]] ); - } - //GENERATE POSTERIOR DISTRIBUTION OF DIFFERENCES - for (n in 1:Nx) { - p_predicted_default[n] = inv_logit( counterfact_x[n] * beta[llx[n]] ); - p_predicted_intervention[n] = inv_logit( counterfact_x_tilde[n] * beta[llx[n]] ); //intervention - - //intervention - base case - predicted_difference[n] = p_predicted_intervention[n] - p_predicted_default[n]; - } -}