--- title: "The Effects of Recruitment status on completion of clinical trials" author: "Will King" format: html editor: source --- # Setup ```{r} library(knitr) library(bayesplot) available_mcmc(pattern = "_nuts_") library(ggplot2) library(patchwork) library(tidyverse) library(rstan) library(tidyr) library(ghibli) library(xtable) #Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started #save unchanged models instead of recompiling rstan_options(auto_write = TRUE) #allow for multithreaded sampling options(mc.cores = parallel::detectCores()) #test installation, shouldn't get any errors #example(stan_model, package = "rstan", run.dontrun = TRUE) image_root <- "./output/EffectsOfEnrollmentDelay" image_dist_diff_analysis <- paste0(image_root,"/dist_diff_analysis") image_trial_details <-paste0(image_root,"/trials_details") image_parameters_by_groups <-paste0(image_root,"/betas/parameters_by_group") image_parameters_across_groups <-paste0(image_root,"/betas/parameter_across_groups") ``` ```{r} ################ Pull data from database ###################### library(RPostgreSQL) host <- 'aact_db-restored-2025-01-07' driver <- dbDriver("PostgreSQL") get_data <- function(driver) { con <- dbConnect( driver, user='root', password='root', dbname='aact_db', host=host ) on.exit(dbDisconnect(con)) query <- dbSendQuery( con, # "select * from formatted_data_with_planned_enrollment;" " select fdqpe.nct_id --,fdqpe.start_date --,fdqpe.current_enrollment --,fdqpe.enrollment_category ,fdqpe.current_status ,fdqpe.earliest_date_observed ,fdqpe.elapsed_duration ,fdqpe.n_brands as identical_brands ,ntbtu.brand_name_counts ,fdqpe.category_id ,fdqpe.final_status ,fdqpe.h_sdi_val --,fdqpe.h_sdi_u95 --,fdqpe.h_sdi_l95 ,fdqpe.hm_sdi_val --,fdqpe.hm_sdi_u95 --,fdqpe.hm_sdi_l95 ,fdqpe.m_sdi_val --,fdqpe.m_sdi_u95 --,fdqpe.m_sdi_l95 ,fdqpe.lm_sdi_val --,fdqpe.lm_sdi_u95 --,fdqpe.lm_sdi_l95 ,fdqpe.l_sdi_val --,fdqpe.l_sdi_u95 --,fdqpe.l_sdi_l95 from formatted_data_with_planned_enrollment fdqpe join \"Formularies\".nct_to_brand_counts_through_uspdc ntbtu on fdqpe.nct_id = ntbtu.nct_id order by fdqpe.nct_id, fdqpe.earliest_date_observed ; " ) df <- fetch(query, n = -1) df <- na.omit(df) query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;") n_categories <- fetch(query2, n = -1) return(list(data=df,ncat=n_categories)) } get_counterfact_base <- function(driver) { con <- dbConnect( driver, user='root', password='root', dbname='aact_db', host=host ) on.exit(dbDisconnect(con)) query <- dbSendQuery( con, " with cte as ( --get last recruiting state select fd.nct_id, max(fd.earliest_date_observed),min(fd2.earliest_date_observed) as tmstmp from formatted_data fd join formatted_data fd2 on fd.nct_id=fd2.nct_id and fd.earliest_date_observed < fd2.earliest_date_observed where fd.current_status = 'Recruiting' and fd2.current_status = 'Active, not recruiting' group by fd.nct_id ) select fdqpe.nct_id --,fdqpe.start_date --,fdqpe.current_enrollment --,fdqpe.enrollment_category ,fdqpe.current_status ,fdqpe.earliest_date_observed ,fdqpe.elapsed_duration ,fdqpe.n_brands as identical_brands ,ntbtu.brand_name_counts ,fdqpe.category_id ,fdqpe.final_status ,fdqpe.h_sdi_val --,fdqpe.h_sdi_u95 --,fdqpe.h_sdi_l95 ,fdqpe.hm_sdi_val --,fdqpe.hm_sdi_u95 --,fdqpe.hm_sdi_l95 ,fdqpe.m_sdi_val --,fdqpe.m_sdi_u95 --,fdqpe.m_sdi_l95 ,fdqpe.lm_sdi_val --,fdqpe.lm_sdi_u95 --,fdqpe.lm_sdi_l95 ,fdqpe.l_sdi_val --,fdqpe.l_sdi_u95 --,fdqpe.l_sdi_l95 from formatted_data_with_planned_enrollment fdqpe join \"Formularies\".nct_to_brand_counts_through_uspdc ntbtu on fdqpe.nct_id = ntbtu.nct_id join cte on fdqpe.nct_id = cte.nct_id and fdqpe.earliest_date_observed = cte.tmstmp order by fdqpe.nct_id, fdqpe.earliest_date_observed ; " ) df <- fetch(query, n = -1) df <- na.omit(df) query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;") n_categories <- fetch(query2, n = -1) return(list(data=df,ncat=n_categories)) } d <- get_data(driver) df <- d$data n_categories <- d$ncat cf <- get_counterfact_base(driver) df_counterfact_base <- cf$data ################ Format Data ########################### data_formatter <- function(df) { categories <- df["category_id"] x <- df["elapsed_duration"] x["identical_brands"] <- asinh(df$identical_brands) x["brand_name_counts"] <- asinh(df$brand_name_count) x["h_sdi_val"] <- asinh(df$h_sdi_val) x["hm_sdi_val"] <- asinh(df$hm_sdi_val) x["m_sdi_val"] <- asinh(df$m_sdi_val) x["lm_sdi_val"] <- asinh(df$lm_sdi_val) x["l_sdi_val"] <- asinh(df$l_sdi_val) #Setup fixed effects x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0) x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0) x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0) x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0) y <- ifelse(df["final_status"]=="Terminated",1,0) #get category list return(list(x=x,y=y)) } train <- data_formatter(df) counterfact_base <- data_formatter(df_counterfact_base) categories <- df$category_id x <- as.matrix(train$x) y <- as.vector(train$y) x_cf_base <- counterfact_base$x y_cf_base <- counterfact_base$y cf_categories <- df_counterfact_base$category_id ``` # Fit Model ```{r} ################################# FIT MODEL ######################################### inherited_cols <- c( "elapsed_duration" ,"identical_brands" ,"brand_name_counts" ,"h_sdi_val" ,"hm_sdi_val" ,"m_sdi_val" ,"lm_sdi_val" ,"l_sdi_val" ,"status_NYR" ,"status_EBI" ,"status_Rec" ,"status_ANR" ) ``` ```{r} beta_list <- list( groups = c( `1`="Infections & Parasites", `2`="Neoplasms", `3`="Blood & Immune system", `4`="Endocrine, Nutritional, and Metabolic", `5`="Mental & Behavioral", `6`="Nervous System", `7`="Eye and Adnexa", `8`="Ear and Mastoid", `9`="Circulatory", `10`="Respiratory", `11`="Digestive", `12`="Skin & Subcutaneaous tissue", `13`="Musculoskeletal", `14`="Genitourinary", `15`="Pregancy, Childbirth, & Puerperium", `16`="Perinatal Period", `17`="Congential", `18`="Symptoms, Signs etc.", `19`="Injury etc.", `20`="External Causes", `21`="Contact with Healthcare", `22`="Special Purposes" ), parameters = c( `1`="Elapsed Duration", # brands `2`="asinh(Generic Brands)", `3`="asinh(Competitors USPDC)", # population `4`="asinh(High SDI)", `5`="asinh(High-Medium SDI)", `6`="asinh(Medium SDI)", `7`="asinh(Low-Medium SDI)", `8`="asinh(Low SDI)", #Status `9`="status_NYR", `10`="status_EBI", `11`="status_Rec", `12`="status_ANR" ) ) get_parameters <- function(stem,class_list) { #get categories and lengths named <- names(class_list) lengths <- sapply(named, (function (x) length(class_list[[x]]))) #describe the grid needed iter_list <- sapply(named, (function (x) 1:lengths[x])) #generate the list of parameters pardf <- generate_parameter_df(stem, iter_list) #add columns with appropriate human-readable names for (name in named) { pardf[paste(name,"_hr",sep="")] <- as.factor( sapply(pardf[name], (function (i) class_list[[name]][i])) ) } return(pardf) } generate_parameter_df <- function(stem, iter_list) { grid <- expand.grid(iter_list) grid["param_name"] <- grid %>% unite(x,colnames(grid),sep=",") grid["param_name"] <- paste(stem,"[",grid$param_name,"]",sep="") return(grid) } group_mcmc_areas <- function( stem,# = "beta" class_list,# = beta_list stanfit,# = fit group_id,# = 2 rename=TRUE, filter=NULL ) { #get all parameter names params <- get_parameters(stem,class_list) #filter down to parameters of interest params <- filter(params,groups == group_id) #Get dataframe with only the rows of interest filtdata <- as.data.frame(stanfit)[params$param_name] #rename columns if (rename) dimnames(filtdata)[[2]] <- params$parameters_hr #get group name for title group_name <- class_list$groups[group_id] #create area plot with appropriate title p <- mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) + ggtitle(paste("Parameter distributions for ICD-10 class:",group_name)) + geom_vline(xintercept=seq(-2,2,0.5),color="grey",alpha=0.750) d <- pivot_longer(filtdata, everything()) |> group_by(name) |> summarize( mean=mean(value) ,q025 = quantile(value,probs = 0.025) ,q975 = quantile(value,probs = 0.975) ,q05 = quantile(value,probs = 0.05) ,q95 = quantile(value,probs = 0.95) ) return(list(plot=p,quantiles=d,name=group_name)) } parameter_mcmc_areas <- function( stem,# = "beta" class_list,# = beta_list stanfit,# = fit parameter_id,# = 2 rename=TRUE ) { #get all parameter names params <- get_parameters(stem,class_list) #filter down to parameters of interest params <- filter(params,parameters == parameter_id) #Get dataframe with only the rows of interest filtdata <- as.data.frame(stanfit)[params$param_name] #rename columns if (rename) dimnames(filtdata)[[2]] <- params$groups_hr #get group name for title parameter_name <- class_list$parameters[parameter_id] #create area plot with appropriate title p <- mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) + ggtitle(parameter_name,"Parameter Distribution") + geom_vline(xintercept=seq(-2,2,0.5),color="grey",alpha=0.750) d <- pivot_longer(filtdata, everything()) |> group_by(name) |> summarize( mean=mean(value) ,q025 = quantile(value,probs = 0.025) ,q975 = quantile(value,probs = 0.975) ,q05 = quantile(value,probs = 0.05) ,q95 = quantile(value,probs = 0.95) ) return(list(plot=p,quantiles=d,name=parameter_name)) } ``` Plan: select all snapshots that are the first to have closed enrollment (Rec -> ANR) ```{r} #delay intervention intervention_enrollment <- x_cf_base[c(inherited_cols)] intervention_enrollment["status_ANR"] <- 0 intervention_enrollment["status_Rec"] <- 1 ``` ```{r} counterfact_delay <- list( D = ncol(x), N = nrow(x), L = n_categories$count, y = y, ll = as.vector(categories), x = x, mu_mean = 0, mu_stdev = 0.05, sigma_location = -2.1, sigma_scale = 0.2, Nx = nrow(x_cf_base), llx = as.vector(cf_categories), counterfact_x_tilde = as.matrix(intervention_enrollment), counterfact_x = as.matrix(x_cf_base), status_indexes = c(11,12) #subtract anr from recruiting to get movement from anr to recruiting ) ``` ```{r} fit <- stan( file='Hierarchal_Logistic.stan', data = counterfact_delay, chains = 8, iter = 12000, warmup = 4000, seed = 11021585 ) ``` ## Fit Results ```{r} print(check_hmc_diagnostics(fit)) print(get_bfmi(fit)) ``` ```{r} print(fit) ``` ## Explore data ```{r} #get number of trials and snapshots in each category group_trials_by_category <- as.data.frame(aggregate(category_id ~ nct_id, df, max)) group_trials_by_category <- as.data.frame(group_trials_by_category) category_count <- group_trials_by_category |> group_by(category_id) |> count() ``` ```{r} ################################# DATA EXPLORATION ############################ driver <- dbDriver("PostgreSQL") con <- dbConnect( driver, user='root', password='root', dbname='aact_db', host=host ) #Plot histogram of count of snapshots df3 <- dbGetQuery( con, "select nct_id,final_status,count(*) from formatted_data_with_planned_enrollment fdwpe group by nct_id,final_status ;" ) #df3 <- fetch(query3, n = -1) ggplot(data=df3, aes(x=count, fill=final_status)) + geom_histogram(binwidth=1) + ggtitle("Histogram of snapshots per trial (matched trials)") + xlab("Snapshots per trial") ggsave(paste0(image_trial_details,"/HistSnapshots.png")) #Plot duration for terminated vs completed df4 <- dbGetQuery( con, " select nct_id, start_date , primary_completion_date, overall_status , primary_completion_date - start_date as duration from ctgov.studies s where nct_id in (select distinct nct_id from http.download_status ds) ;" ) #df4 <- fetch(query4, n = -1) ggplot(data=df4, aes(x=duration,fill=overall_status)) + geom_histogram()+ ggtitle("Histogram of trial durations") + xlab("duration")+ facet_wrap(~overall_status) ggsave(paste0(image_trial_details,"/HistTrialDurations_Faceted.png")) df5 <- dbGetQuery( con, " with cte1 as ( select nct_id, start_date , primary_completion_date, overall_status , primary_completion_date - start_date as duration from ctgov.studies s where nct_id in (select distinct nct_id from http.download_status ds) ), cte2 as ( select nct_id,count(*) as snapshot_count from formatted_data_with_planned_enrollment fdwpe group by nct_id ) select a.nct_id, a.overall_status, a.duration,b.snapshot_count from cte1 as a join cte2 as b on a.nct_id=b.nct_id ;" ) df5$overall_status <- as.factor(df5$overall_status) ggplot(data=df5, aes(x=duration,y=snapshot_count,color=overall_status)) + geom_jitter() + ggtitle("Comparison of duration, status, and snapshot_count") + xlab("duration") + ylab("snapshot count") ggsave(paste0(image_trial_details,"/SnapshotsVsDurationVsTermination.png")) dbDisconnect(con) #get number of trials and snapshots in each category group_trials_by_category <- as.data.frame(aggregate(category_id ~ nct_id, df, max)) group_trials_by_category <- as.data.frame(group_trials_by_category) ggplot(data = group_trials_by_category, aes(x=category_id)) + geom_bar(binwidth=1,color="black",fill="lightblue") + scale_x_continuous(breaks=scales::pretty_breaks(n=22)) + labs( title="bar chart of trial categories" ,x="Category ID" ,y="Count" ) ggsave(paste0(image_trial_details,"/CategoryCounts.png")) summary(df5) cor_dur_count <- cor(df5$duration,df5$snapshot_count) count_snapshots <- sum(df5$snapshot_count) ``` the correlation value is `r cor_dur_count` between duration and snapshot count. There are `r count_snapshots` snapshots in total. # Parameter Distributions ```{r} #g1 <- group_mcmc_areas("beta",beta_list,fit,1) gx <- c() #grab parameters for every category with more than 8 observations for (i in category_count$category_id[category_count$n >= 0]) { print(i) #Print parameter distributions gi <- group_mcmc_areas("beta",beta_list,fit,i) #add way to filter groups ggsave( paste0(image_parameters_by_groups,"/group_",i,"_",gi$name,".png") ,plot=gi$plot ) gx <- c(gx,gi) #Get Quantiles and means for parameters table <- xtable(gi$quantiles, floating=FALSE ,latex.environments = NULL ,booktabs = TRUE ,zap=getOption("digits") ) write_lines( table, paste0(image_parameters_by_groups,"/group_table_",i,"_",gi$name,".tex") ) } ``` ```{r} px <- c() for (i in c(1,2,3,9,10,11,12)) { #Print parameter distributions pi <- parameter_mcmc_areas("beta",beta_list,fit,i) #add way to filter groups ggsave( paste0(image_parameters_across_groups,"/parameters_",i,"_",pi$name,".png") ,plot=pi$plot ) px <- c(px,pi) #Get Quantiles and means for parameters table <- xtable(pi$quantiles, floating=FALSE ,latex.environments = NULL ,booktabs = TRUE ,zap=getOption("digits") ) write_lines( table, paste0(image_parameters_across_groups,"/parameters_tables_",i,"_",pi$name,".tex") ) } ``` Note these have 95% outer CI and 80% inner (shaded) # Counterfactuals calculation ```{r} generated_ib <- gqs( fit@stanmodel, data=counterfact_delay, draws=as.matrix(fit), seed=11021585 ) ``` # Priors ```{r} df_ib_p <- data.frame( p_prior=as.vector(extract(generated_ib, pars="p_prior")$p_prior) ,p_predicted = as.vector(extract(generated_ib, pars="p_predicted")$p_predicted) ) df_ib_prior <- data.frame( mu_prior = as.vector(extract(generated_ib, pars="mu_prior")$mu_prior) ,sigma_prior = as.vector(extract(generated_ib, pars="sigma_prior")$sigma_prior) ) ``` ```{r} #p_prior ggplot(df_ib_p, aes(x=p_prior)) + geom_density() + labs( title="Implied Prior Distribution P" ,subtitle="" ,x="Probability Domain 'p'" ,y="Probability Density" ) ggsave(paste0(image_dist_diff_analysis,"/prior_p.png")) #p_posterior ggplot(df_ib_p, aes(x=p_predicted)) + geom_density() + labs( title="Implied Posterior Distribution P" ,subtitle="" ,x="Probability Domain 'p'" ,y="Probability Density" ) ggsave(paste0(image_dist_diff_analysis,"/posterior_p.png")) #mu_prior ggplot(df_ib_prior) + geom_density(aes(x=mu_prior)) + labs( title="Prior - Mu" ,subtitle="same prior for all Mu values" ,x="Mu" ,y="Probability" ) ggsave(paste0(image_dist_diff_analysis,"/prior_mu.png")) #sigma_posterior ggplot(df_ib_prior) + geom_density(aes(x=sigma_prior)) + labs( title="Prior - Sigma" ,subtitle="same prior for all Sigma values" ,x="Sigma" ,y="Probability" ) ggsave(paste0(image_dist_diff_analysis,"/prior_sigma.png")) ``` ### Intervention: Delay close of enrollment ```{r} counterfact_predicted_ib <- data.frame( p_predicted_default = as.vector(extract(generated_ib, pars="p_predicted_default")$p_predicted_default) ,p_predicted_intervention = as.vector(extract(generated_ib, pars="p_predicted_intervention")$p_predicted_intervention) ,predicted_difference = as.vector(extract(generated_ib, pars="predicted_difference")$predicted_difference) ) ggplot(counterfact_predicted_ib, aes(x=p_predicted_default)) + geom_density() + labs( title="Predicted Distribution of 'p'" ,subtitle="Intervention: None" ,x="Probability Domain 'p'" ,y="Probability Density" ) ggsave(paste0(image_dist_diff_analysis,"p_no_intervention.png")) ggplot(counterfact_predicted_ib, aes(x=p_predicted_intervention)) + geom_density() + labs( title="Predicted Distribution of 'p'" ,subtitle="Intervention: Delay close of enrollment" ,x="Probability Domain 'p'" ,y="Probability Density" ) ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention.png")) ggplot(counterfact_predicted_ib, aes(x=predicted_difference)) + geom_density() + labs( title="Predicted Distribution of differences 'p'" ,subtitle="Intervention: Delay close of enrollment" ,x="Difference in 'p' under treatment" ,y="Probability Density" ) ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_1.png")) ``` ```{r} get_category_count <- function(tbl, id) { result <- tbl$n[tbl$category_id == id] if(length(result) == 0) 0 else result } category_names <- sapply(1:length(beta_list$groups), function(i) sprintf("ICD-10 #%d: %s (n=%d)", i, beta_list$groups[i], get_category_count(category_count, i))) ``` ```{r} pddf_ib <- data.frame(extract(generated_ib, pars="predicted_difference")$predicted_difference) |> pivot_longer(X1:X168) #CHANGE_NOTE: moved from X169 to X168 pddf_ib["entry_idx"] <- as.numeric(gsub("\\D","",pddf_ib$name)) pddf_ib["category"] <- sapply(pddf_ib$entry_idx, function(i) df$category_id[i]) pddf_ib["category_name"] <- sapply( pddf_ib$category, function(i) category_names[i] ) ``` ```{r} ggplot(pddf_ib, aes(x=value,)) + geom_density(adjust=1/5) + labs( title = "Distribution of predicted differences" ,subtitle = "Intervention: Delay close of enrollment" ,x = "Difference in probability due to intervention" ,y = "Probability Density" ) + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_styled.png")) ``` ```{r} ggplot(pddf_ib, aes(x=value,)) + geom_density(adjust=1/5) + facet_wrap( ~factor( category_name, levels=category_names ) , labeller = label_wrap_gen(multi_line = TRUE) , ncol=4) + labs( title = "Distribution of predicted differences | By Group" ,subtitle = "Intervention: Delay close of enrollment" ,x = "Difference in probability due to intervention" ,y = "Probability Density" ) + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + theme(strip.text.x = element_text(size = 8)) ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_by_group.png")) ``` ```{r} ggplot(pddf_ib, aes(x=value,)) + geom_histogram(bins=300) + facet_wrap( ~factor( category_name, levels=category_names ) , labeller = label_wrap_gen(multi_line = TRUE) , ncol=4) + labs( title = "Histogram of predicted differences | By Group" ,subtitle = "Intervention: Delay close of enrollment" ,x = "Difference in probability due to intervention" ,y = "Predicted counts" ) + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + theme(strip.text.x = element_text(size = 8)) ggsave(paste0(image_dist_diff_analysis,"p_delay_intervention_histdiff_by_group.png")) ``` ```{r} p3 <- ggplot(pddf_ib, aes(x=value,)) + geom_histogram(bins=500) + labs( title = "Distribution of predicted differences" ,subtitle = "Intervention: Delay close of enrollment" ,x = "Difference in probability due to intervention" ,y = "Probability Density" ,caption = "Vertical marks: 5/10/25/50/75/90/95th percentiles. Dot shows mean." ) + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") density_hight <- max(density(pddf_ib$value)$y) stats <- list( p5 = quantile(pddf_ib$value, 0.05), p10 = quantile(pddf_ib$value, 0.10), q1 = quantile(pddf_ib$value, 0.25), med = median(pddf_ib$value), mean = mean(pddf_ib$value), q3 = quantile(pddf_ib$value, 0.75), p90 = quantile(pddf_ib$value, 0.90), p95 = quantile(pddf_ib$value, 0.95), max_height = max(ggplot_build(p3)$data[[1]]$count), y_offset = -max(ggplot_build(p3)$data[[1]]$count) * 0.05, y_offset_density = -density_hight * 0.15 ) p3 + # Box geom_segment(data = data.frame( x = c(stats$q1, stats$q3, stats$med), xend = c(stats$q1, stats$q3, stats$med), y = rep(stats$y_offset, 3), yend = rep(stats$y_offset * 2, 3) ), aes(x = x, xend = xend, y = y, yend = yend)) + geom_segment(data = data.frame( x = rep(stats$q1, 2), xend = rep(stats$q3, 2), y = c(stats$y_offset, stats$y_offset * 2), yend = c(stats$y_offset, stats$y_offset * 2) ), aes(x = x, xend = xend, y = y, yend = yend)) + # Inner whiskers (Q1->P10, Q3->P90) geom_segment(data = data.frame( x = c(stats$q1, stats$q3), xend = c(stats$p10, stats$p90), y = rep(stats$y_offset * 1.5, 2), yend = rep(stats$y_offset * 1.5, 2) ), aes(x = x, xend = xend, y = y, yend = yend)) + # Crossbars at P10/P90 geom_segment(data = data.frame( x = c(stats$p10, stats$p90), xend = c(stats$p10, stats$p90), y = stats$y_offset * 1.3, yend = stats$y_offset * 1.7 ), aes(x = x, xend = xend, y = y, yend = yend)) + # Outer whiskers (P10->P5, P90->P95) geom_segment(data = data.frame( x = c(stats$p10, stats$p90), xend = c(stats$p5, stats$p95), y = rep(stats$y_offset * 1.5, 2), yend = rep(stats$y_offset * 1.5, 2) ), aes(x = x, xend = xend, y = y, yend = yend)) + # Crossbars at P5/P95 geom_segment(data = data.frame( x = c(stats$p5, stats$p95), xend = c(stats$p5, stats$p95), y = stats$y_offset * 1.3, yend = stats$y_offset * 1.7 ), aes(x = x, xend = xend, y = y, yend = yend)) + # Mean dot geom_point(data = data.frame( x = stats$mean, y = stats$y_offset * 1.5 ), aes(x = x, y = y)) ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_histdiff_boxplot.png")) ``` ```{r} p4 <- ggplot(pddf_ib, aes(x = value)) + geom_density() + labs( title = "Distribution of predicted differences" ,subtitle = "Intervention: Delay close of enrollment" ,x = "Difference in probability due to intervention" ,y = "Probability Density" ,caption = "Vertical marks: 5/10/25/50/75/90/95th percentiles. Dot shows mean." ) + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + geom_segment(data = data.frame( x = c(stats$q1, stats$q3, stats$med), xend = c(stats$q1, stats$q3, stats$med), y = rep(stats$y_offset_density, 3), yend = rep(stats$y_offset_density * 2, 3) ), aes(x = x, xend = xend, y = y, yend = yend)) + geom_segment(data = data.frame( x = rep(stats$q1, 2), xend = rep(stats$q3, 2), y = c(stats$y_offset_density, stats$y_offset_density * 2), yend = c(stats$y_offset_density, stats$y_offset_density * 2) ), aes(x = x, xend = xend, y = y, yend = yend)) + # Inner whiskers (Q1->P10, Q3->P90) geom_segment(data = data.frame( x = c(stats$q1, stats$q3), xend = c(stats$p10, stats$p90), y = rep(stats$y_offset_density * 1.5, 2), yend = rep(stats$y_offset_density * 1.5, 2) ), aes(x = x, xend = xend, y = y, yend = yend)) + # Crossbars at P10/P90 geom_segment(data = data.frame( x = c(stats$p10, stats$p90), xend = c(stats$p10, stats$p90), y = stats$y_offset_density * 1.3, yend = stats$y_offset_density * 1.7 ), aes(x = x, xend = xend, y = y, yend = yend)) + # Outer whiskers (P10->P5, P90->P95) geom_segment(data = data.frame( x = c(stats$p10, stats$p90), xend = c(stats$p5, stats$p95), y = rep(stats$y_offset_density * 1.5, 2), yend = rep(stats$y_offset_density * 1.5, 2) ), aes(x = x, xend = xend, y = y, yend = yend)) + # Crossbars at P5/P95 geom_segment(data = data.frame( x = c(stats$p5, stats$p95), xend = c(stats$p5, stats$p95), y = stats$y_offset_density * 1.3, yend = stats$y_offset_density * 1.7 ), aes(x = x, xend = xend, y = y, yend = yend)) + # Mean dot geom_point(data = data.frame( x = stats$mean, y = stats$y_offset_density * 1.5 ), aes(x = x, y = y)) ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_boxplot.png")) p4 ``` ```{r} ggplot(pddf_ib, aes(x=value)) + stat_ecdf(geom='step') + labs( title = "Cumulative distribution of predicted differences", subtitle = "Intervention: Delay close of enrollment", x = "Difference in probability of termination due to intervention", y = "Cumulative Proportion" ) ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_cumulative_distdiff.png")) ``` Get the % of differences in the spike around zero ```{r} # get values around and above/below spike width <- 0.02 spike_band_centered_zero <- mean( pddf_ib$value >= -width/2 & pddf_ib$value <= width/2) above_spike_band <- mean( pddf_ib$value >= width/2) below_spike_band <- mean( pddf_ib$value <= -width/2) # get mass above and mass below zero mass_below_zero <- mean( pddf_ib$value <= 0) ``` Looking at the spike around zero, we find that `r spike_band_centered_zero*100`% of the probability mass is contained within the band from [`r -width*100/2`,`r width*100/2`]. Additionally, there was `r above_spike_band*100`% of the probability above that -- representing those with a general increase in the probability of termination -- and `r below_spike_band*100`% of the probability mass below the band -- representing a decrease in the probability of termination. On average, if you keep the trial open instead of closing it, `r mass_below_zero`% of trials will see a decrease in the probability of termination, but, due to the high increase in probability of termination given termination was increased, the mean probability of termination increases by `r stats$mean`. ```{r} # 5%-iles summary(pddf_ib$value) # Create your quantiles quants <- quantile(pddf_ib$value, probs = seq(0,1,0.05), type=4) # Convert to a data frame quant_df <- data.frame( #Percentile = names(quants), Value = quants ) # Convert to LaTeX table <- xtable(quant_df, digits = rep(3, ncol(d) + 1), floating = FALSE, latex.environments = NULL, booktabs = TRUE ) # Write to file write_lines( print(table, include.rownames = FALSE), paste0(image_root,"/distdiff_5percentile_table.tex") ) kable(quant_df) proportion_increase <- mean(pddf_ib$value >= 0) ``` about `r proportion_increase * 100` percent probability increase in the probability of terminations ```{r} n = length(counterfact_predicted_ib$p_predicted_intervention) k = 100 simulated_terminations_intervention <- mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_intervention))) simulated_terminations_base <-mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_default))) simulated_percentages <- (simulated_terminations_intervention - simulated_terminations_base)/k ``` The simulation above shows that this results in a percentage-point change in terminations of about `r simulated_percentages * 100`. ## fixed effects distributions ```{r} #Get dataframe with only the rows of interest filtdata <- as.data.frame(extract(fit, pars="status_diff")) #rename columns dimnames(filtdata)[[2]] <- beta_list$groups #create area plot with appropriate title mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) + ggtitle("Differences in Fixed Effects | By ICD-10 Category", subtitle = "Moving from 'Active, not recruiting' to 'Recruiting'" ) + geom_vline(xintercept=seq(-0.25,0.5,0.25),color="grey",alpha=0.750) ggsave(paste0(image_parameters_across_groups,"/fixed_effects_anr_vs_rec_by_group.png")) d <- pivot_longer(filtdata, everything()) |> group_by(name) |> summarize( mean = mean(value), `P(≥0)` = mean(value >= 0), `2.5%` = quantile(value, probs = 0.025), `5%` = quantile(value, probs = 0.05), `25%` = quantile(value, probs = 0.25), `50% median` = quantile(value, probs = 0.5), `75%` = quantile(value, probs = 0.75), `95%` = quantile(value, probs = 0.95), `97.5%` = quantile(value, probs = 0.975) ) # Rename the name column names(d)[1] <- "ICD-10 Category" # Convert to LaTeX table <- xtable(d, digits = rep(3, ncol(d) + 1), floating = FALSE, latex.environments = NULL, booktabs = TRUE ) # Write to file write_lines( print(table, include.rownames = FALSE), paste0(image_parameters_across_groups,"/fixed_effects_anr_vs_rec_by_group.tex") ) ``` # Diagnostics ```{r} #| eval: true #trace plots image_diagnostics <- paste0(image_root,"/diagnostics") plot(fit, pars=c("mu"), plotfun="trace") ggsave(paste0(image_diagnostics,"/trace_plot_mu.png")) for (i in 1:3) { print( mcmc_rank_overlay( fit, pars=c( paste0("mu[",4*i-3,"]"), paste0("mu[",4*i-2,"]"), paste0("mu[",4*i-1,"]"), paste0("mu[",4*i,"]") ), n_bins=100 )+ legend_move("top") + scale_colour_ghibli_d("KikiMedium") ) mu_range <- paste0(4*i-3,"-",4*i) filename <- paste0(image_diagnostics,"/trace_rank_plot_mu_",mu_range,".png") ggsave(filename) } ``` ```{r} #| eval: true plot(fit, pars=c("sigma"), plotfun="trace") ggsave(paste0(image_diagnostics,"/traceplot_sigma.png")) for (i in 1:3) { print( mcmc_rank_overlay( fit, pars=c( paste0("sigma[",4*i-3,"]"), paste0("sigma[",4*i-2,"]"), paste0("sigma[",4*i-1,"]"), paste0("sigma[",4*i,"]") ), n_bins=100 )+ legend_move("top") + scale_colour_ghibli_d("KikiMedium") ) sigma_range <- paste0(4*i-3,"-",4*i) filename <- paste0(image_diagnostics,"/trace_rank_plot_sigma_",sigma_range,".png") ggsave(filename) } ``` ```{r} #| eval: true #other diagnostics logpost <- log_posterior(fit) nuts_prmts <- nuts_params(fit) posterior <- as.array(fit) ``` ```{r} #| eval: true color_scheme_set("darkgray") div_style <- parcoord_style_np(div_color = "green", div_size = 0.05, div_alpha = 0.4) mcmc_parcoord(posterior, regex_pars = "mu", np=nuts_prmts, np_style = div_style, alpha = 0.05) ggsave(paste0(image_diagnostics,"/parcoord_mu.png")) ``` ```{r} #| eval: true for (i in 1:3) { mus = sapply(3:0, function(j) paste0("mu[",4*i-j ,"]")) print( mcmc_pairs( posterior, np = nuts_prmts, pars=c( mus, "lp__" ), off_diag_args = list(size = 0.75) ) ) mu_range <- paste0(4*i-3,"-",4*i) filename <- paste0(image_diagnostics,"/correlation_plot_mu_",mu_range,".png") ggsave(filename) } ``` ```{r} #| eval: true mcmc_parcoord(posterior,regex_pars = "sigma", np=nuts_prmts, alpha=0.05) ggsave(paste0(image_diagnostics,"/parcoord_sigma.png")) ``` ```{r} #| eval: true for (i in 1:3) { params = sapply(3:0, function(j) paste0("sigma[",4*i-j ,"]")) print( mcmc_pairs( posterior, np = nuts_prmts, pars=c( params, "lp__" ), off_diag_args = list(size = 0.75) ) ) sigma_range <- paste0(4*i-3,"-",4*i) filename <- paste0(image_diagnostics,"/correlation_plot_sigma_",sigma_range,".png") ggsave(filename) } ``` ```{r} #| eval: true for (k in 1:22) { for (i in 1:3) { params = sapply(3:0, function(j) paste0("beta[",k,",",4*i-j ,"]")) print( mcmc_pairs( posterior, np = nuts_prmts, pars=c( params, "lp__" ), off_diag_args = list(size = 0.75) ) ) beta_range <- paste0("k_",k,"_i_",4*i-3,"-",4*i) filename <- paste0(image_diagnostics,"/correlation_plot_beta_",beta_range,".png") ggsave(filename) }} ```