── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rstan)
Loading required package: StanHeaders
rstan version 2.32.6 (Stan version 2.32.2)
For execution on a local, multicore CPU with excess RAM we recommend calling
options(mc.cores = parallel::detectCores()).
To avoid recompilation of unchanged Stan programs, we recommend calling
rstan_options(auto_write = TRUE)
For within-chain threading using `reduce_sum()` or `map_rect()` Stan functions,
change `threads_per_chain` option:
rstan_options(threads_per_chain = 1)
Attaching package: 'rstan'
The following object is masked from 'package:tidyr':
extract
library(tidyr)library(ghibli)library(xtable)#Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started#save unchanged models instead of recompilingrstan_options(auto_write =TRUE)#allow for multithreaded samplingoptions(mc.cores = parallel::detectCores())#test installation, shouldn't get any errors#example(stan_model, package = "rstan", run.dontrun = TRUE)image_root <-"./output/withdiff/EffectsOfEnrollmentDelay"image_dist_diff_analysis <-paste0(image_root,"/dist_diff_analysis")image_trial_details <-paste0(image_root,"/trials_details")image_parameters_by_groups <-paste0(image_root,"/betas/parameters_by_group")image_parameters_across_groups <-paste0(image_root,"/betas/parameter_across_groups")
################ Pull data from database ######################library(RPostgreSQL)
Loading required package: DBI
host <-'aact_db-restored-2025-01-07'driver <-dbDriver("PostgreSQL")get_data <-function(driver) {con <-dbConnect( driver,user='root',password='root',dbname='aact_db',host=host )on.exit(dbDisconnect(con))query <-dbSendQuery( con,# "select * from formatted_data_with_planned_enrollment;""select fdqpe.nct_id --,fdqpe.start_date --,fdqpe.current_enrollment --,fdqpe.enrollment_category ,fdqpe.current_status ,fdqpe.earliest_date_observed ,fdqpe.elapsed_duration ,fdqpe.n_brands as identical_brands ,ntbtu.brand_name_counts ,fdqpe.category_id ,fdqpe.final_status ,fdqpe.h_sdi_val --,fdqpe.h_sdi_u95 --,fdqpe.h_sdi_l95 ,fdqpe.hm_sdi_val --,fdqpe.hm_sdi_u95 --,fdqpe.hm_sdi_l95 ,fdqpe.m_sdi_val --,fdqpe.m_sdi_u95 --,fdqpe.m_sdi_l95 ,fdqpe.lm_sdi_val --,fdqpe.lm_sdi_u95 --,fdqpe.lm_sdi_l95 ,fdqpe.l_sdi_val --,fdqpe.l_sdi_u95 --,fdqpe.l_sdi_l95from formatted_data_with_planned_enrollment fdqpe join \"Formularies\".nct_to_brand_counts_through_uspdc ntbtu on fdqpe.nct_id = ntbtu.nct_id order by fdqpe.nct_id, fdqpe.earliest_date_observed ;" )df <-fetch(query, n =-1)df <-na.omit(df)query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;")n_categories <-fetch(query2, n =-1)return(list(data=df,ncat=n_categories))}get_counterfact_base <-function(driver) {con <-dbConnect( driver,user='root',password='root',dbname='aact_db',host=host )on.exit(dbDisconnect(con))query <-dbSendQuery( con," with cte as ( --get last recruiting state select fd.nct_id, max(fd.earliest_date_observed),min(fd2.earliest_date_observed) as tmstmp from formatted_data fd join formatted_data fd2 on fd.nct_id=fd2.nct_id and fd.earliest_date_observed < fd2.earliest_date_observed where fd.current_status = 'Recruiting' and fd2.current_status = 'Active, not recruiting' group by fd.nct_id ) select fdqpe.nct_id --,fdqpe.start_date --,fdqpe.current_enrollment --,fdqpe.enrollment_category ,fdqpe.current_status ,fdqpe.earliest_date_observed ,fdqpe.elapsed_duration ,fdqpe.n_brands as identical_brands ,ntbtu.brand_name_counts ,fdqpe.category_id ,fdqpe.final_status ,fdqpe.h_sdi_val --,fdqpe.h_sdi_u95 --,fdqpe.h_sdi_l95 ,fdqpe.hm_sdi_val --,fdqpe.hm_sdi_u95 --,fdqpe.hm_sdi_l95 ,fdqpe.m_sdi_val --,fdqpe.m_sdi_u95 --,fdqpe.m_sdi_l95 ,fdqpe.lm_sdi_val --,fdqpe.lm_sdi_u95 --,fdqpe.lm_sdi_l95 ,fdqpe.l_sdi_val --,fdqpe.l_sdi_u95 --,fdqpe.l_sdi_l95 from formatted_data_with_planned_enrollment fdqpe join \"Formularies\".nct_to_brand_counts_through_uspdc ntbtu on fdqpe.nct_id = ntbtu.nct_id join cte on fdqpe.nct_id = cte.nct_id and fdqpe.earliest_date_observed = cte.tmstmp order by fdqpe.nct_id, fdqpe.earliest_date_observed ; " )df <-fetch(query, n =-1)df <-na.omit(df)query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;")n_categories <-fetch(query2, n =-1)return(list(data=df,ncat=n_categories))}d <-get_data(driver)df <- d$datan_categories <- d$ncatcf <-get_counterfact_base(driver)df_counterfact_base <- cf$data################ Format Data ###########################data_formatter <-function(df) {categories <- df["category_id"]x <- df["elapsed_duration"]x["identical_brands"] <-asinh(df$identical_brands)x["brand_name_counts"] <-asinh(df$brand_name_count)x["h_sdi_val"] <-asinh(df$h_sdi_val)x["hm_sdi_val"] <-asinh(df$hm_sdi_val)x["m_sdi_val"] <-asinh(df$m_sdi_val)x["lm_sdi_val"] <-asinh(df$lm_sdi_val)x["l_sdi_val"] <-asinh(df$l_sdi_val)#Setup fixed effectsx["status_NYR"] <-ifelse(df["current_status"]=="Not yet recruiting",1,0)x["status_EBI"] <-ifelse(df["current_status"]=="Enrolling by invitation",1,0)x["status_Rec"] <-ifelse(df["current_status"]=="Recruiting",1,0) x["status_ANR"] <-ifelse(df["current_status"]=="Active, not recruiting",1,0)y <-ifelse(df["final_status"]=="Terminated",1,0)#get category listreturn(list(x=x,y=y))}train <-data_formatter(df)counterfact_base <-data_formatter(df_counterfact_base)categories <- df$category_idx <- train$xy <- train$yx_cf_base <- counterfact_base$xy_cf_base <- counterfact_base$ycf_categories <- df_counterfact_base$category_id
Fit Model
################################# FIT MODEL #########################################inherited_cols <-c("elapsed_duration" ,"identical_brands" ,"brand_name_counts" ,"h_sdi_val" ,"hm_sdi_val" ,"m_sdi_val" ,"lm_sdi_val" ,"l_sdi_val" ,"status_NYR" ,"status_EBI" ,"status_Rec" ,"status_ANR")
beta_list <-list(groups =c(`1`="Infections & Parasites",`2`="Neoplasms",`3`="Blood & Immune system",`4`="Endocrine, Nutritional, and Metabolic",`5`="Mental & Behavioral",`6`="Nervous System",`7`="Eye and Adnexa",`8`="Ear and Mastoid",`9`="Circulatory",`10`="Respiratory",`11`="Digestive",`12`="Skin & Subcutaneaous tissue",`13`="Musculoskeletal",`14`="Genitourinary",`15`="Pregancy, Childbirth, & Puerperium",`16`="Perinatal Period",`17`="Congential",`18`="Symptoms, Signs etc.",`19`="Injury etc.",`20`="External Causes",`21`="Contact with Healthcare",`22`="Special Purposes" ),parameters =c(`1`="Elapsed Duration",# brands`2`="asinh(Generic Brands)",`3`="asinh(Competitors USPDC)",# population`4`="asinh(High SDI)",`5`="asinh(High-Medium SDI)",`6`="asinh(Medium SDI)",`7`="asinh(Low-Medium SDI)",`8`="asinh(Low SDI)",#Status`9`="status_NYR",`10`="status_EBI",`11`="status_Rec",`12`="status_ANR" ))get_parameters <-function(stem,class_list) {#get categories and lengths named <-names(class_list) lengths <-sapply(named, (function (x) length(class_list[[x]])))#describe the grid needed iter_list <-sapply(named, (function (x) 1:lengths[x]))#generate the list of parameters pardf <-generate_parameter_df(stem, iter_list)#add columns with appropriate human-readable namesfor (name in named) { pardf[paste(name,"_hr",sep="")] <-as.factor(sapply(pardf[name], (function (i) class_list[[name]][i])) ) }return(pardf) }generate_parameter_df <-function(stem, iter_list) { grid <-expand.grid(iter_list) grid["param_name"] <- grid %>%unite(x,colnames(grid),sep=",") grid["param_name"] <-paste(stem,"[",grid$param_name,"]",sep="")return(grid)}group_mcmc_areas <-function( stem,# = "beta" class_list,# = beta_list stanfit,# = fit group_id,# = 2rename=TRUE,filter=NULL ) {#get all parameter names params <-get_parameters(stem,class_list)#filter down to parameters of interest params <-filter(params,groups == group_id)#Get dataframe with only the rows of interest filtdata <-as.data.frame(stanfit)[params$param_name]#rename columnsif (rename) dimnames(filtdata)[[2]] <- params$parameters_hr#get group name for title group_name <- class_list$groups[group_id]#create area plot with appropriate title p <-mcmc_areas(filtdata,prob =0.8, prob_outer =0.95) +ggtitle(paste("Parameter distributions for ICD-10 class:",group_name)) +geom_vline(xintercept=seq(-2,2,0.5),color="grey",alpha=0.750) d <-pivot_longer(filtdata, everything()) |>group_by(name) |>summarize(mean=mean(value) ,q025 =quantile(value,probs =0.025) ,q975 =quantile(value,probs =0.975) ,q05 =quantile(value,probs =0.05) ,q95 =quantile(value,probs =0.95) )return(list(plot=p,quantiles=d,name=group_name))}parameter_mcmc_areas <-function( stem,# = "beta" class_list,# = beta_list stanfit,# = fit parameter_id,# = 2rename=TRUE ) {#get all parameter names params <-get_parameters(stem,class_list)#filter down to parameters of interest params <-filter(params,parameters == parameter_id)#Get dataframe with only the rows of interest filtdata <-as.data.frame(stanfit)[params$param_name]#rename columnsif (rename) dimnames(filtdata)[[2]] <- params$groups_hr#get group name for title parameter_name <- class_list$parameters[parameter_id]#create area plot with appropriate title p <-mcmc_areas(filtdata,prob =0.8, prob_outer =0.95) +ggtitle(parameter_name,"Parameter Distribution") +geom_vline(xintercept=seq(-2,2,0.5),color="grey",alpha=0.750) d <-pivot_longer(filtdata, everything()) |>group_by(name) |>summarize(mean=mean(value) ,q025 =quantile(value,probs =0.025) ,q975 =quantile(value,probs =0.975) ,q05 =quantile(value,probs =0.05) ,q95 =quantile(value,probs =0.95) )return(list(plot=p,quantiles=d,name=parameter_name))}
Plan: select all snapshots that are the first to have closed enrollment (Rec -> ANR when comparing across snapshots), and then
#delay interventionintervention_enrollment <- x_cf_base[c(inherited_cols)]#TOFIX: ^^^ This ordering of columns isintervention_enrollment["status_ANR"] <-0intervention_enrollment["status_Rec"] <-1
counterfact_delay <-list(D =ncol(x),#N =nrow(x),L = n_categories$count,y =as.vector(y),ll =as.vector(categories),x =as.matrix(x),mu_mean =0,mu_stdev =0.05,sigma_shape =4,sigma_rate =20,Nx =nrow(x_cf_base),llx =as.vector(cf_categories),counterfact_x_tilde =as.matrix(intervention_enrollment),counterfact_x =as.matrix(x_cf_base),status_indexes =c(11,12) #subtract anr from recruiting to get movement from anr to recruiting)
fit <-stan(file='Hierarchal_Logistic.stan', data = counterfact_delay,chains =4,iter =5000,seed =11021585 )
Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
Running the chains for more iterations may help. See
https://mc-stan.org/misc/warnings.html#bulk-ess
Calculate relative difference in parameters between recruiting and active not recruiting states
filt_data <-as.data.frame(extract(fit,pars="status_diff"))dimnames(filt_data)[[2]] <- beta_list$groupsmcmc_areas(filt_data,prob =0.8, prob_outer =0.95) +ggtitle("Relative fixed effects across groups", subtitle ="moving from `Active, not recruiting` to `Recruiting`") +geom_vline(xintercept=seq(-2,2,0.5),color="grey",alpha=0.750)
Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_vline()`).
I’ve got an issue here, because this should be a movement from ANR -> recruiting but that is suggesting that
Explore data
#get number of trials and snapshots in each categorygroup_trials_by_category <-as.data.frame(aggregate(category_id ~ nct_id, df, max))group_trials_by_category <-as.data.frame(group_trials_by_category)category_count <- group_trials_by_category |>group_by(category_id) |>count()
################################# DATA EXPLORATION ############################driver <-dbDriver("PostgreSQL")con <-dbConnect( driver,user='root',password='root',dbname='aact_db',host=host )#Plot histogram of count of snapshotsdf3 <-dbGetQuery( con,"select nct_id,final_status,count(*) from formatted_data_with_planned_enrollment fdwpe group by nct_id,final_status ;" )#df3 <- fetch(query3, n = -1)ggplot(data=df3, aes(x=count, fill=final_status)) +geom_histogram(binwidth=1) +ggtitle("Histogram of snapshots per trial (matched trials)") +xlab("Snapshots per trial")
#Plot duration for terminated vs completeddf4 <-dbGetQuery( con," select nct_id, start_date , primary_completion_date, overall_status , primary_completion_date - start_date as duration from ctgov.studies s where nct_id in (select distinct nct_id from http.download_status ds) ;" )#df4 <- fetch(query4, n = -1)ggplot(data=df4, aes(x=duration,fill=overall_status)) +geom_histogram()+ggtitle("Histogram of trial durations") +xlab("duration")+facet_wrap(~overall_status)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Saving 7 x 5 in image
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
df5 <-dbGetQuery( con," with cte1 as ( select nct_id, start_date , primary_completion_date, overall_status , primary_completion_date - start_date as duration from ctgov.studies s where nct_id in (select distinct nct_id from http.download_status ds) ), cte2 as ( select nct_id,count(*) as snapshot_count from formatted_data_with_planned_enrollment fdwpe group by nct_id ) select a.nct_id, a.overall_status, a.duration,b.snapshot_count from cte1 as a join cte2 as b on a.nct_id=b.nct_id ;" )df5$overall_status <-as.factor(df5$overall_status)ggplot(data=df5, aes(x=duration,y=snapshot_count,color=overall_status)) +geom_jitter() +ggtitle("Comparison of duration, status, and snapshot_count") +xlab("duration") +ylab("snapshot count")
#get number of trials and snapshots in each categorygroup_trials_by_category <-as.data.frame(aggregate(category_id ~ nct_id, df, max))group_trials_by_category <-as.data.frame(group_trials_by_category)ggplot(data = group_trials_by_category, aes(x=category_id)) +geom_bar(binwidth=1,color="black",fill="lightblue") +scale_x_continuous(breaks=scales::pretty_breaks(n=22)) +labs(title="bar chart of trial categories" ,x="Category ID" ,y="Count" )
Warning in geom_bar(binwidth = 1, color = "black", fill = "lightblue"):
Ignoring unknown parameters: `binwidth`
#g1 <- group_mcmc_areas("beta",beta_list,fit,1)gx <-c()#grab parameters for every category with more than 8 observationsfor (i in category_count$category_id[category_count$n >=0]) {print(i)#Print parameter distributions gi <-group_mcmc_areas("beta",beta_list,fit,i) #add way to filter groupsggsave(paste0(image_parameters_by_groups,"/group_",i,"_",gi$name,".png") ,plot=gi$plot ) gx <-c(gx,gi)#Get Quantiles and means for parameters# table <- xtable(gi$quantiles,# floating=FALSE# ,latex.environments = NULL# ,booktabs = TRUE# ,zap=getOption("digits")# )# write_lines(table,paste0("./latex_output/DirectEffects/group_",gi$name,".tex"))}
[1] 1
Saving 7 x 5 in image
Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_vline()`).
[1] 2
Saving 7 x 5 in image
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_vline()`).
[1] 3
Saving 7 x 5 in image
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_vline()`).
[1] 4
Saving 7 x 5 in image
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_vline()`).
[1] 5
Saving 7 x 5 in image
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_vline()`).
[1] 6
Saving 7 x 5 in image
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_vline()`).
[1] 7
Saving 7 x 5 in image
Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_vline()`).
[1] 9
Saving 7 x 5 in image
Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_vline()`).
[1] 10
Saving 7 x 5 in image
Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_vline()`).
[1] 11
Saving 7 x 5 in image
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_vline()`).
[1] 12
Saving 7 x 5 in image
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_vline()`).
[1] 13
Saving 7 x 5 in image
[1] 14
Saving 7 x 5 in image
Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_vline()`).
[1] 17
Saving 7 x 5 in image
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_vline()`).
[1] 18
Saving 7 x 5 in image
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_vline()`).
px <-c()for (i inc(1,2,3,9,10,11,12)) {#Print parameter distributions pi <-parameter_mcmc_areas("beta",beta_list,fit,i) #add way to filter groupsggsave(paste0(image_parameters_across_groups,"/parameters_",i,"_",pi$name,".png") ,plot=pi$plot ) px <-c(px,pi)#Get Quantiles and means for parameters# table <- xtable(pi$quantiles,# floating=FALSE# ,latex.environments = NULL# ,booktabs = TRUE# ,zap=getOption("digits")# )# write_lines(table,paste0("./latex_output/DirectEffects/parameters_",i,"_",pi$name,".tex"))}
Saving 7 x 5 in image
Warning: Removed 6 rows containing missing values or values outside the scale range
(`geom_vline()`).
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_vline()`).
Saving 7 x 5 in image
Warning: Removed 6 rows containing missing values or values outside the scale range
(`geom_vline()`).
Saving 7 x 5 in image
Warning: Removed 5 rows containing missing values or values outside the scale range
(`geom_vline()`).
Saving 7 x 5 in image
Warning: Removed 5 rows containing missing values or values outside the scale range
(`geom_vline()`).
Note these have 95% outer CI and 80% inner (shaded)
#get the generic and uspdc parametersprint(px[4]$plot + px[7]$plot)
#get the parameters associated with durationpx[16]$plot + px[19]$plot
Warning: Removed 5 rows containing missing values or values outside the scale range
(`geom_vline()`).
Removed 5 rows containing missing values or values outside the scale range
(`geom_vline()`).
Warning: Removed 5 rows containing missing values or values outside the scale range
(`geom_vline()`).
Removed 5 rows containing missing values or values outside the scale range
(`geom_vline()`).
#mu_priorggplot(df_ib_prior) +geom_density(aes(x=mu_prior)) +labs(title="Prior - Mu" ,subtitle="same prior for all Mu values" ,x="Mu" ,y="Probability" )
ggplot(counterfact_predicted_ib, aes(x=p_predicted_intervention)) +geom_density() +labs(title="Predicted Distribution of 'p'" ,subtitle="Intervention: Delay close of enrollment" ,x="Probability Domain 'p'" ,y="Probability Density" )
ggplot(counterfact_predicted_ib, aes(x=predicted_difference)) +geom_density() +labs(title="Predicted Distribution of differences 'p'" ,subtitle="Intervention: Delay close of enrollment" ,x="Difference in 'p' under treatment" ,y="Probability Density" )
pddf_ib <-data.frame(extract(generated_ib, pars="predicted_difference")$predicted_difference) |>pivot_longer(X1:X168) #CHANGE_NOTE: moved from X169 to X168pddf_ib["entry_idx"] <-as.numeric(gsub("\\D","",pddf_ib$name))pddf_ib["category"] <-sapply(pddf_ib$entry_idx, function(i) df$category_id[i])pddf_ib["category_name"] <-sapply( pddf_ib$category, function(i) category_names[i] )ggplot(pddf_ib, aes(x=value,)) +geom_density(adjust=1/5) +labs(title ="Distribution of predicted differences" ,subtitle ="Intervention: Delay close of enrollment" ,x ="Difference in probability due to intervention" ,y ="Probability Density" ) +geom_vline(aes(xintercept =0), color ="skyblue", linetype="dashed")
p4 <-ggplot(pddf_ib, aes(x = value)) +geom_density() +labs(title ="Distribution of predicted differences" ,subtitle ="Intervention: Delay close of enrollment" ,x ="Difference in probability due to intervention" ,y ="Probability Density" ,caption ="Vertical marks: 5/10/25/50/75/90/95th percentiles. Dot shows mean." ) +geom_vline(aes(xintercept =0), color ="skyblue", linetype="dashed") +geom_segment(data =data.frame(x =c(stats$q1, stats$q3, stats$med),xend =c(stats$q1, stats$q3, stats$med),y =rep(stats$y_offset_density, 3), yend =rep(stats$y_offset_density *2, 3) ), aes(x = x, xend = xend, y = y, yend = yend)) +geom_segment(data =data.frame(x =rep(stats$q1, 2),xend =rep(stats$q3, 2),y =c(stats$y_offset_density, stats$y_offset_density *2),yend =c(stats$y_offset_density, stats$y_offset_density *2) ), aes(x = x, xend = xend, y = y, yend = yend)) +# Inner whiskers (Q1->P10, Q3->P90)geom_segment(data =data.frame(x =c(stats$q1, stats$q3),xend =c(stats$p10, stats$p90),y =rep(stats$y_offset_density *1.5, 2),yend =rep(stats$y_offset_density *1.5, 2) ), aes(x = x, xend = xend, y = y, yend = yend)) +# Crossbars at P10/P90geom_segment(data =data.frame(x =c(stats$p10, stats$p90),xend =c(stats$p10, stats$p90),y = stats$y_offset_density *1.3,yend = stats$y_offset_density *1.7 ), aes(x = x, xend = xend, y = y, yend = yend)) +# Outer whiskers (P10->P5, P90->P95)geom_segment(data =data.frame(x =c(stats$p10, stats$p90),xend =c(stats$p5, stats$p95),y =rep(stats$y_offset_density *1.5, 2),yend =rep(stats$y_offset_density *1.5, 2) ), aes(x = x, xend = xend, y = y, yend = yend)) +# Crossbars at P5/P95geom_segment(data =data.frame(x =c(stats$p5, stats$p95),xend =c(stats$p5, stats$p95),y = stats$y_offset_density *1.3,yend = stats$y_offset_density *1.7 ), aes(x = x, xend = xend, y = y, yend = yend)) +# Mean dotgeom_point(data =data.frame(x = stats$mean,y = stats$y_offset_density *1.5 ), aes(x = x, y = y))ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_boxplot.png"))
Saving 7 x 5 in image
ggplot(pddf_ib, aes(x=value)) +stat_ecdf(geom='step') +labs(title ="Cumulative distribution of predicted differences",subtitle ="Intervention: Delay close of enrollment",x ="Difference in probability of termination due to intervention",y ="Cumulative Proportion" )
# get values around and above/below spikewidth <-0.02spike_band_centered_zero <-mean( pddf_ib$value >=-width/2& pddf_ib$value <= width/2)above_spike_band <-mean( pddf_ib$value >= width/2)below_spike_band <-mean( pddf_ib$value <=-width/2)# get mass above and mass below zeromass_below_zero <-mean( pddf_ib$value <=0)
Looking at the spike around zero, we find that 39.193869% of the probability mass is contained within the band from [-1,1]. Additionally, there was 51.937619% of the probability above that – representing those with a general increase in the probability of termination – and 8.8685119% of the probability mass below the band – representing a decrease in the probability of termination.
On average, if you keep the trial open instead of closing it, 0.2488518% of trials will see a decrease in the probability of termination, but, due to the high increase in probability of termination given termination was increased, the mean probability of termination increases by 0.0249444.
# 5%-ilessummary(pddf_ib$value)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.42755 0.00000 0.01164 0.02494 0.04213 0.54813
# Create your quantilesquants <-quantile(pddf_ib$value, probs =seq(0,1,0.05), type=4)# Convert to a data framequant_df <-data.frame(Percentile =names(quants),Value = quants)kable(quant_df)
Percentile
Value
0%
0%
-0.4275511
5%
5%
-0.0213008
10%
10%
-0.0079768
15%
15%
-0.0021932
20%
20%
-0.0001065
25%
25%
0.0000000
30%
30%
0.0000535
35%
35%
0.0012179
40%
40%
0.0040158
45%
45%
0.0075646
50%
50%
0.0116358
55%
55%
0.0162235
60%
60%
0.0214039
65%
65%
0.0272582
70%
70%
0.0340562
75%
75%
0.0421259
80%
80%
0.0519438
85%
85%
0.0645430
90%
90%
0.0819636
95%
95%
0.1102529
100%
100%
0.5481257
There seems to be some trials that are highly suceptable to this enrollment delay. Specifically, there were some
n =length(counterfact_predicted_ib$p_predicted_intervention)k =100simulated_terminations_intervention <-mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_intervention)))simulated_terminations_base <-mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_default)))simulated_percentages <- (simulated_terminations_intervention - simulated_terminations_base)/k
The simulation above shows that this results in a percentage-point increase of about 2.4922006.