You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ClinicalTrialsEstimation/r-analysis/EffectsOfEnrollmentDelay.qmd

1211 lines
33 KiB
Plaintext

---
title: "The Effects of Recruitment status on completion of clinical trials"
author: "Will King"
format: html
editor: source
---
# Setup
```{r}
library(knitr)
library(bayesplot)
available_mcmc(pattern = "_nuts_")
library(ggplot2)
library(patchwork)
library(tidyverse)
library(rstan)
library(tidyr)
library(ghibli)
library(xtable)
#Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
#save unchanged models instead of recompiling
rstan_options(auto_write = TRUE)
#allow for multithreaded sampling
options(mc.cores = parallel::detectCores())
#test installation, shouldn't get any errors
#example(stan_model, package = "rstan", run.dontrun = TRUE)
image_root <- "./output/EffectsOfEnrollmentDelay"
image_dist_diff_analysis <- paste0(image_root,"/dist_diff_analysis")
image_trial_details <-paste0(image_root,"/trials_details")
image_parameters_by_groups <-paste0(image_root,"/betas/parameters_by_group")
image_parameters_across_groups <-paste0(image_root,"/betas/parameter_across_groups")
```
```{r}
################ Pull data from database ######################
library(RPostgreSQL)
host <- 'aact_db-restored-2025-01-07'
driver <- dbDriver("PostgreSQL")
get_data <- function(driver) {
con <- dbConnect(
driver,
user='root',
password='root',
dbname='aact_db',
host=host
)
on.exit(dbDisconnect(con))
query <- dbSendQuery(
con,
# "select * from formatted_data_with_planned_enrollment;"
"
select
fdqpe.nct_id
--,fdqpe.start_date
--,fdqpe.current_enrollment
--,fdqpe.enrollment_category
,fdqpe.current_status
,fdqpe.earliest_date_observed
,fdqpe.elapsed_duration
,fdqpe.n_brands as identical_brands
,ntbtu.brand_name_counts
,fdqpe.category_id
,fdqpe.final_status
,fdqpe.h_sdi_val
--,fdqpe.h_sdi_u95
--,fdqpe.h_sdi_l95
,fdqpe.hm_sdi_val
--,fdqpe.hm_sdi_u95
--,fdqpe.hm_sdi_l95
,fdqpe.m_sdi_val
--,fdqpe.m_sdi_u95
--,fdqpe.m_sdi_l95
,fdqpe.lm_sdi_val
--,fdqpe.lm_sdi_u95
--,fdqpe.lm_sdi_l95
,fdqpe.l_sdi_val
--,fdqpe.l_sdi_u95
--,fdqpe.l_sdi_l95
from formatted_data_with_planned_enrollment fdqpe
join \"Formularies\".nct_to_brand_counts_through_uspdc ntbtu
on fdqpe.nct_id = ntbtu.nct_id
order by fdqpe.nct_id, fdqpe.earliest_date_observed
;
"
)
df <- fetch(query, n = -1)
df <- na.omit(df)
query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;")
n_categories <- fetch(query2, n = -1)
return(list(data=df,ncat=n_categories))
}
get_counterfact_base <- function(driver) {
con <- dbConnect(
driver,
user='root',
password='root',
dbname='aact_db',
host=host
)
on.exit(dbDisconnect(con))
query <- dbSendQuery(
con,
"
with cte as (
--get last recruiting state
select fd.nct_id, max(fd.earliest_date_observed),min(fd2.earliest_date_observed) as tmstmp
from formatted_data fd
join formatted_data fd2
on fd.nct_id=fd2.nct_id and fd.earliest_date_observed < fd2.earliest_date_observed
where fd.current_status = 'Recruiting'
and fd2.current_status = 'Active, not recruiting'
group by fd.nct_id
)
select
fdqpe.nct_id
--,fdqpe.start_date
--,fdqpe.current_enrollment
--,fdqpe.enrollment_category
,fdqpe.current_status
,fdqpe.earliest_date_observed
,fdqpe.elapsed_duration
,fdqpe.n_brands as identical_brands
,ntbtu.brand_name_counts
,fdqpe.category_id
,fdqpe.final_status
,fdqpe.h_sdi_val
--,fdqpe.h_sdi_u95
--,fdqpe.h_sdi_l95
,fdqpe.hm_sdi_val
--,fdqpe.hm_sdi_u95
--,fdqpe.hm_sdi_l95
,fdqpe.m_sdi_val
--,fdqpe.m_sdi_u95
--,fdqpe.m_sdi_l95
,fdqpe.lm_sdi_val
--,fdqpe.lm_sdi_u95
--,fdqpe.lm_sdi_l95
,fdqpe.l_sdi_val
--,fdqpe.l_sdi_u95
--,fdqpe.l_sdi_l95
from formatted_data_with_planned_enrollment fdqpe
join \"Formularies\".nct_to_brand_counts_through_uspdc ntbtu
on fdqpe.nct_id = ntbtu.nct_id
join cte
on fdqpe.nct_id = cte.nct_id
and fdqpe.earliest_date_observed = cte.tmstmp
order by fdqpe.nct_id, fdqpe.earliest_date_observed
;
"
)
df <- fetch(query, n = -1)
df <- na.omit(df)
query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;")
n_categories <- fetch(query2, n = -1)
return(list(data=df,ncat=n_categories))
}
d <- get_data(driver)
df <- d$data
n_categories <- d$ncat
cf <- get_counterfact_base(driver)
df_counterfact_base <- cf$data
################ Format Data ###########################
data_formatter <- function(df) {
categories <- df["category_id"]
x <- df["elapsed_duration"]
x["identical_brands"] <- asinh(df$identical_brands)
x["brand_name_counts"] <- asinh(df$brand_name_count)
x["h_sdi_val"] <- asinh(df$h_sdi_val)
x["hm_sdi_val"] <- asinh(df$hm_sdi_val)
x["m_sdi_val"] <- asinh(df$m_sdi_val)
x["lm_sdi_val"] <- asinh(df$lm_sdi_val)
x["l_sdi_val"] <- asinh(df$l_sdi_val)
#Setup fixed effects
x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0)
x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0)
x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0)
x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0)
y <- ifelse(df["final_status"]=="Terminated",1,0)
#get category list
return(list(x=x,y=y))
}
train <- data_formatter(df)
counterfact_base <- data_formatter(df_counterfact_base)
categories <- df$category_id
x <- train$x
y <- train$y
x_cf_base <- counterfact_base$x
y_cf_base <- counterfact_base$y
cf_categories <- df_counterfact_base$category_id
```
# Fit Model
```{r}
################################# FIT MODEL #########################################
inherited_cols <- c(
"elapsed_duration"
,"identical_brands"
,"brand_name_counts"
,"h_sdi_val"
,"hm_sdi_val"
,"m_sdi_val"
,"lm_sdi_val"
,"l_sdi_val"
,"status_NYR"
,"status_EBI"
,"status_Rec"
,"status_ANR"
)
```
```{r}
beta_list <- list(
groups = c(
`1`="Infections & Parasites",
`2`="Neoplasms",
`3`="Blood & Immune system",
`4`="Endocrine, Nutritional, and Metabolic",
`5`="Mental & Behavioral",
`6`="Nervous System",
`7`="Eye and Adnexa",
`8`="Ear and Mastoid",
`9`="Circulatory",
`10`="Respiratory",
`11`="Digestive",
`12`="Skin & Subcutaneaous tissue",
`13`="Musculoskeletal",
`14`="Genitourinary",
`15`="Pregancy, Childbirth, & Puerperium",
`16`="Perinatal Period",
`17`="Congential",
`18`="Symptoms, Signs etc.",
`19`="Injury etc.",
`20`="External Causes",
`21`="Contact with Healthcare",
`22`="Special Purposes"
),
parameters = c(
`1`="Elapsed Duration",
# brands
`2`="asinh(Generic Brands)",
`3`="asinh(Competitors USPDC)",
# population
`4`="asinh(High SDI)",
`5`="asinh(High-Medium SDI)",
`6`="asinh(Medium SDI)",
`7`="asinh(Low-Medium SDI)",
`8`="asinh(Low SDI)",
#Status
`9`="status_NYR",
`10`="status_EBI",
`11`="status_Rec",
`12`="status_ANR"
)
)
get_parameters <- function(stem,class_list) {
#get categories and lengths
named <- names(class_list)
lengths <- sapply(named, (function (x) length(class_list[[x]])))
#describe the grid needed
iter_list <- sapply(named, (function (x) 1:lengths[x]))
#generate the list of parameters
pardf <- generate_parameter_df(stem, iter_list)
#add columns with appropriate human-readable names
for (name in named) {
pardf[paste(name,"_hr",sep="")] <- as.factor(
sapply(pardf[name], (function (i) class_list[[name]][i]))
)
}
return(pardf)
}
generate_parameter_df <- function(stem, iter_list) {
grid <- expand.grid(iter_list)
grid["param_name"] <- grid %>% unite(x,colnames(grid),sep=",")
grid["param_name"] <- paste(stem,"[",grid$param_name,"]",sep="")
return(grid)
}
group_mcmc_areas <- function(
stem,# = "beta"
class_list,# = beta_list
stanfit,# = fit
group_id,# = 2
rename=TRUE,
filter=NULL
) {
#get all parameter names
params <- get_parameters(stem,class_list)
#filter down to parameters of interest
params <- filter(params,groups == group_id)
#Get dataframe with only the rows of interest
filtdata <- as.data.frame(stanfit)[params$param_name]
#rename columns
if (rename) dimnames(filtdata)[[2]] <- params$parameters_hr
#get group name for title
group_name <- class_list$groups[group_id]
#create area plot with appropriate title
p <- mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
ggtitle(paste("Parameter distributions for ICD-10 class:",group_name)) +
geom_vline(xintercept=seq(-2,2,0.5),color="grey",alpha=0.750)
d <- pivot_longer(filtdata, everything()) |>
group_by(name) |>
summarize(
mean=mean(value)
,q025 = quantile(value,probs = 0.025)
,q975 = quantile(value,probs = 0.975)
,q05 = quantile(value,probs = 0.05)
,q95 = quantile(value,probs = 0.95)
)
return(list(plot=p,quantiles=d,name=group_name))
}
parameter_mcmc_areas <- function(
stem,# = "beta"
class_list,# = beta_list
stanfit,# = fit
parameter_id,# = 2
rename=TRUE
) {
#get all parameter names
params <- get_parameters(stem,class_list)
#filter down to parameters of interest
params <- filter(params,parameters == parameter_id)
#Get dataframe with only the rows of interest
filtdata <- as.data.frame(stanfit)[params$param_name]
#rename columns
if (rename) dimnames(filtdata)[[2]] <- params$groups_hr
#get group name for title
parameter_name <- class_list$parameters[parameter_id]
#create area plot with appropriate title
p <- mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
ggtitle(parameter_name,"Parameter Distribution") +
geom_vline(xintercept=seq(-2,2,0.5),color="grey",alpha=0.750)
d <- pivot_longer(filtdata, everything()) |>
group_by(name) |>
summarize(
mean=mean(value)
,q025 = quantile(value,probs = 0.025)
,q975 = quantile(value,probs = 0.975)
,q05 = quantile(value,probs = 0.05)
,q95 = quantile(value,probs = 0.95)
)
return(list(plot=p,quantiles=d,name=parameter_name))
}
```
Plan: select all snapshots that are the first to have closed enrollment (Rec -> ANR)
```{r}
#delay intervention
intervention_enrollment <- x_cf_base[c(inherited_cols)]
intervention_enrollment["status_ANR"] <- 0
intervention_enrollment["status_Rec"] <- 1
```
```{r}
counterfact_delay <- list(
D = ncol(x),#
N = nrow(x),
L = n_categories$count,
y = as.vector(y),
ll = as.vector(categories),
x = as.matrix(x),
mu_mean = 0,
mu_stdev = 0.05,
sigma_shape = 8,
sigma_rate = ,
Nx = nrow(x_cf_base),
llx = as.vector(cf_categories),
counterfact_x_tilde = as.matrix(intervention_enrollment),
counterfact_x = as.matrix(x_cf_base),
status_indexes = c(11,12) #subtract anr from recruiting to get movement from anr to recruiting
)
```
```{r}
fit <- stan(
file='Hierarchal_Logistic.stan',
data = counterfact_delay,
chains = 8,
iter = 12000,
warmup = 4000,
seed = 11021585
)
```
## Fit Results
```{r}
print(check_hmc_diagnostics(fit))
print(get_bfmi(fit))
```
```{r}
print(fit)
```
## Explore data
```{r}
#get number of trials and snapshots in each category
group_trials_by_category <- as.data.frame(aggregate(category_id ~ nct_id, df, max))
group_trials_by_category <- as.data.frame(group_trials_by_category)
category_count <- group_trials_by_category |> group_by(category_id) |> count()
```
```{r}
################################# DATA EXPLORATION ############################
driver <- dbDriver("PostgreSQL")
con <- dbConnect(
driver,
user='root',
password='root',
dbname='aact_db',
host=host
)
#Plot histogram of count of snapshots
df3 <- dbGetQuery(
con,
"select nct_id,final_status,count(*) from formatted_data_with_planned_enrollment fdwpe
group by nct_id,final_status ;"
)
#df3 <- fetch(query3, n = -1)
ggplot(data=df3, aes(x=count, fill=final_status)) +
geom_histogram(binwidth=1) +
ggtitle("Histogram of snapshots per trial (matched trials)") +
xlab("Snapshots per trial")
ggsave(paste0(image_trial_details,"/HistSnapshots.png"))
#Plot duration for terminated vs completed
df4 <- dbGetQuery(
con,
"
select
nct_id,
start_date ,
primary_completion_date,
overall_status ,
primary_completion_date - start_date as duration
from ctgov.studies s
where nct_id in (select distinct nct_id from http.download_status ds)
;"
)
#df4 <- fetch(query4, n = -1)
ggplot(data=df4, aes(x=duration,fill=overall_status)) +
geom_histogram()+
ggtitle("Histogram of trial durations") +
xlab("duration")+
facet_wrap(~overall_status)
ggsave(paste0(image_trial_details,"/HistTrialDurations_Faceted.png"))
df5 <- dbGetQuery(
con,
"
with cte1 as (
select
nct_id,
start_date ,
primary_completion_date,
overall_status ,
primary_completion_date - start_date as duration
from ctgov.studies s
where nct_id in (select distinct nct_id from http.download_status ds)
), cte2 as (
select nct_id,count(*) as snapshot_count from formatted_data_with_planned_enrollment fdwpe
group by nct_id
)
select a.nct_id, a.overall_status, a.duration,b.snapshot_count
from cte1 as a
join cte2 as b
on a.nct_id=b.nct_id
;"
)
df5$overall_status <- as.factor(df5$overall_status)
ggplot(data=df5, aes(x=duration,y=snapshot_count,color=overall_status)) +
geom_jitter() +
ggtitle("Comparison of duration, status, and snapshot_count") +
xlab("duration") +
ylab("snapshot count")
ggsave(paste0(image_trial_details,"/SnapshotsVsDurationVsTermination.png"))
dbDisconnect(con)
#get number of trials and snapshots in each category
group_trials_by_category <- as.data.frame(aggregate(category_id ~ nct_id, df, max))
group_trials_by_category <- as.data.frame(group_trials_by_category)
ggplot(data = group_trials_by_category, aes(x=category_id)) +
geom_bar(binwidth=1,color="black",fill="lightblue") +
scale_x_continuous(breaks=scales::pretty_breaks(n=22)) +
labs(
title="bar chart of trial categories"
,x="Category ID"
,y="Count"
)
ggsave(paste0(image_trial_details,"/CategoryCounts.png"))
summary(df5)
cor(df5$duration,df5$snapshot_count)
sum(df5$snapshot_count)
```
# Parameter Distributions
```{r}
#g1 <- group_mcmc_areas("beta",beta_list,fit,1)
gx <- c()
#grab parameters for every category with more than 8 observations
for (i in category_count$category_id[category_count$n >= 0]) {
print(i)
#Print parameter distributions
gi <- group_mcmc_areas("beta",beta_list,fit,i) #add way to filter groups
ggsave(
paste0(image_parameters_by_groups,"/group_",i,"_",gi$name,".png")
,plot=gi$plot
)
gx <- c(gx,gi)
#Get Quantiles and means for parameters
# table <- xtable(gi$quantiles,
# floating=FALSE
# ,latex.environments = NULL
# ,booktabs = TRUE
# ,zap=getOption("digits")
# )
# write_lines(table,paste0("./latex_output/DirectEffects/group_",gi$name,".tex"))
}
```
```{r}
px <- c()
for (i in c(1,2,3,9,10,11,12)) {
#Print parameter distributions
pi <- parameter_mcmc_areas("beta",beta_list,fit,i) #add way to filter groups
ggsave(
paste0(image_parameters_across_groups,"/parameters_",i,"_",pi$name,".png")
,plot=pi$plot
)
px <- c(px,pi)
#Get Quantiles and means for parameters
# table <- xtable(pi$quantiles,
# floating=FALSE
# ,latex.environments = NULL
# ,booktabs = TRUE
# ,zap=getOption("digits")
# )
# write_lines(table,paste0("./latex_output/DirectEffects/parameters_",i,"_",pi$name,".tex"))
}
```
Note these have 95% outer CI and 80% inner (shaded)
```{r}
##get the generic and uspdc parameters
#print(px[4]$plot + px[7]$plot)
#ggsave(paste0(image_parameters_across_groups,"2+3_generic_and_uspdc.png"))
#get the parameters associated with duration
#px[16]$plot + px[19]$plot
#ggsave(paste0(image_parameters_across_groups,"11+12_statusREC_and_statusANR.png"))
```
# Counterfactuals
```{r}
generated_ib <- gqs(
fit@stanmodel,
data=counterfact_delay,
draws=as.matrix(fit),
seed=11021585
)
```
```{r}
df_ib_p <- data.frame(
p_prior=as.vector(extract(generated_ib, pars="p_prior")$p_prior)
,p_predicted = as.vector(extract(generated_ib, pars="p_predicted")$p_predicted)
)
df_ib_prior <- data.frame(
mu_prior = as.vector(extract(generated_ib, pars="mu_prior")$mu_prior)
,sigma_prior = as.vector(extract(generated_ib, pars="sigma_prior")$sigma_prior)
)
#p_prior
ggplot(df_ib_p, aes(x=p_prior)) +
geom_density() +
labs(
title="Implied Prior Distribution P"
,subtitle=""
,x="Probability Domain 'p'"
,y="Probability Density"
)
ggsave(paste0(image_dist_diff_analysis,"/prior_p.png"))
#p_posterior
ggplot(df_ib_p, aes(x=p_predicted)) +
geom_density() +
labs(
title="Implied Posterior Distribution P"
,subtitle=""
,x="Probability Domain 'p'"
,y="Probability Density"
)
ggsave(paste0(image_dist_diff_analysis,"/posterior_p.png"))
#mu_prior
ggplot(df_ib_prior) +
geom_density(aes(x=mu_prior)) +
labs(
title="Prior - Mu"
,subtitle="same prior for all Mu values"
,x="Mu"
,y="Probability"
)
ggsave(paste0(image_dist_diff_analysis,"/prior_mu.png"))
#sigma_posterior
ggplot(df_ib_prior) +
geom_density(aes(x=sigma_prior)) +
labs(
title="Prior - Sigma"
,subtitle="same prior for all Sigma values"
,x="Sigma"
,y="Probability"
)
ggsave(paste0(image_dist_diff_analysis,"/prior_sigma.png"))
```
### Intervention: Delay close of enrollment
```{r}
counterfact_predicted_ib <- data.frame(
p_predicted_default = as.vector(extract(generated_ib, pars="p_predicted_default")$p_predicted_default)
,p_predicted_intervention = as.vector(extract(generated_ib, pars="p_predicted_intervention")$p_predicted_intervention)
,predicted_difference = as.vector(extract(generated_ib, pars="predicted_difference")$predicted_difference)
)
ggplot(counterfact_predicted_ib, aes(x=p_predicted_default)) +
geom_density() +
labs(
title="Predicted Distribution of 'p'"
,subtitle="Intervention: None"
,x="Probability Domain 'p'"
,y="Probability Density"
)
ggsave(paste0(image_dist_diff_analysis,"p_no_intervention.png"))
ggplot(counterfact_predicted_ib, aes(x=p_predicted_intervention)) +
geom_density() +
labs(
title="Predicted Distribution of 'p'"
,subtitle="Intervention: Delay close of enrollment"
,x="Probability Domain 'p'"
,y="Probability Density"
)
ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention.png"))
ggplot(counterfact_predicted_ib, aes(x=predicted_difference)) +
geom_density() +
labs(
title="Predicted Distribution of differences 'p'"
,subtitle="Intervention: Delay close of enrollment"
,x="Difference in 'p' under treatment"
,y="Probability Density"
)
ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_1.png"))
```
```{r}
get_category_count <- function(tbl, id) {
result <- tbl$n[tbl$category_id == id]
if(length(result) == 0) 0 else result
}
category_names <- sapply(1:length(beta_list$groups),
function(i) sprintf("ICD-10 #%d: %s (n=%d)",
i,
beta_list$groups[i],
get_category_count(category_count, i)))
```
```{r}
pddf_ib <- data.frame(extract(generated_ib, pars="predicted_difference")$predicted_difference) |>
pivot_longer(X1:X168) #CHANGE_NOTE: moved from X169 to X168
pddf_ib["entry_idx"] <- as.numeric(gsub("\\D","",pddf_ib$name))
pddf_ib["category"] <- sapply(pddf_ib$entry_idx, function(i) df$category_id[i])
pddf_ib["category_name"] <- sapply(
pddf_ib$category,
function(i) category_names[i]
)
ggplot(pddf_ib, aes(x=value,)) +
geom_density(adjust=1/5) +
labs(
title = "Distribution of predicted differences"
,subtitle = "Intervention: Delay close of enrollment"
,x = "Difference in probability due to intervention"
,y = "Probability Density"
) +
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed")
ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_styled.png"))
ggplot(pddf_ib, aes(x=value,)) +
geom_density(adjust=1/5) +
facet_wrap(
~factor(
category_name,
levels=category_names
)
, labeller = label_wrap_gen(multi_line = TRUE)
, ncol=4) +
labs(
title = "Distribution of predicted differences | By Group"
,subtitle = "Intervention: Delay close of enrollment"
,x = "Difference in probability due to intervention"
,y = "Probability Density"
) +
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
theme(strip.text.x = element_text(size = 8))
ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_by_group.png"))
ggplot(pddf_ib, aes(x=value,)) +
geom_histogram(bins=300) +
facet_wrap(
~factor(
category_name,
levels=category_names
)
, labeller = label_wrap_gen(multi_line = TRUE)
, ncol=4) +
labs(
title = "Histogram of predicted differences | By Group"
,subtitle = "Intervention: Delay close of enrollment"
,x = "Difference in probability due to intervention"
,y = "Predicted counts"
) +
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
theme(strip.text.x = element_text(size = 8))
ggsave(paste0(image_dist_diff_analysis,"p_delay_intervention_histdiff_by_group.png"))
```
```{r}
p3 <- ggplot(pddf_ib, aes(x=value,)) +
geom_histogram(bins=500) +
labs(
title = "Distribution of predicted differences"
,subtitle = "Intervention: Delay close of enrollment"
,x = "Difference in probability due to intervention"
,y = "Probability Density"
,caption = "Vertical marks: 5/10/25/50/75/90/95th percentiles. Dot shows mean."
) +
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed")
density_hight <- max(density(pddf_ib$value)$y)
stats <- list(
p5 = quantile(pddf_ib$value, 0.05),
p10 = quantile(pddf_ib$value, 0.10),
q1 = quantile(pddf_ib$value, 0.25),
med = median(pddf_ib$value),
mean = mean(pddf_ib$value),
q3 = quantile(pddf_ib$value, 0.75),
p90 = quantile(pddf_ib$value, 0.90),
p95 = quantile(pddf_ib$value, 0.95),
max_height = max(ggplot_build(p3)$data[[1]]$count),
y_offset = -max(ggplot_build(p3)$data[[1]]$count) * 0.05,
y_offset_density = -density_hight * 0.15
)
p3 +
# Box
geom_segment(data = data.frame(
x = c(stats$q1, stats$q3, stats$med),
xend = c(stats$q1, stats$q3, stats$med),
y = rep(stats$y_offset, 3),
yend = rep(stats$y_offset * 2, 3)
), aes(x = x, xend = xend, y = y, yend = yend)) +
geom_segment(data = data.frame(
x = rep(stats$q1, 2),
xend = rep(stats$q3, 2),
y = c(stats$y_offset, stats$y_offset * 2),
yend = c(stats$y_offset, stats$y_offset * 2)
), aes(x = x, xend = xend, y = y, yend = yend)) +
# Inner whiskers (Q1->P10, Q3->P90)
geom_segment(data = data.frame(
x = c(stats$q1, stats$q3),
xend = c(stats$p10, stats$p90),
y = rep(stats$y_offset * 1.5, 2),
yend = rep(stats$y_offset * 1.5, 2)
), aes(x = x, xend = xend, y = y, yend = yend)) +
# Crossbars at P10/P90
geom_segment(data = data.frame(
x = c(stats$p10, stats$p90),
xend = c(stats$p10, stats$p90),
y = stats$y_offset * 1.3,
yend = stats$y_offset * 1.7
), aes(x = x, xend = xend, y = y, yend = yend)) +
# Outer whiskers (P10->P5, P90->P95)
geom_segment(data = data.frame(
x = c(stats$p10, stats$p90),
xend = c(stats$p5, stats$p95),
y = rep(stats$y_offset * 1.5, 2),
yend = rep(stats$y_offset * 1.5, 2)
), aes(x = x, xend = xend, y = y, yend = yend)) +
# Crossbars at P5/P95
geom_segment(data = data.frame(
x = c(stats$p5, stats$p95),
xend = c(stats$p5, stats$p95),
y = stats$y_offset * 1.3,
yend = stats$y_offset * 1.7
), aes(x = x, xend = xend, y = y, yend = yend)) +
# Mean dot
geom_point(data = data.frame(
x = stats$mean,
y = stats$y_offset * 1.5
), aes(x = x, y = y))
ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_histdiff_boxplot.png"))
p4 <- ggplot(pddf_ib, aes(x = value)) +
geom_density() +
labs(
title = "Distribution of predicted differences"
,subtitle = "Intervention: Delay close of enrollment"
,x = "Difference in probability due to intervention"
,y = "Probability Density"
,caption = "Vertical marks: 5/10/25/50/75/90/95th percentiles. Dot shows mean."
) +
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
geom_segment(data = data.frame(
x = c(stats$q1, stats$q3, stats$med),
xend = c(stats$q1, stats$q3, stats$med),
y = rep(stats$y_offset_density, 3),
yend = rep(stats$y_offset_density * 2, 3)
), aes(x = x, xend = xend, y = y, yend = yend)) +
geom_segment(data = data.frame(
x = rep(stats$q1, 2),
xend = rep(stats$q3, 2),
y = c(stats$y_offset_density, stats$y_offset_density * 2),
yend = c(stats$y_offset_density, stats$y_offset_density * 2)
), aes(x = x, xend = xend, y = y, yend = yend)) +
# Inner whiskers (Q1->P10, Q3->P90)
geom_segment(data = data.frame(
x = c(stats$q1, stats$q3),
xend = c(stats$p10, stats$p90),
y = rep(stats$y_offset_density * 1.5, 2),
yend = rep(stats$y_offset_density * 1.5, 2)
), aes(x = x, xend = xend, y = y, yend = yend)) +
# Crossbars at P10/P90
geom_segment(data = data.frame(
x = c(stats$p10, stats$p90),
xend = c(stats$p10, stats$p90),
y = stats$y_offset_density * 1.3,
yend = stats$y_offset_density * 1.7
), aes(x = x, xend = xend, y = y, yend = yend)) +
# Outer whiskers (P10->P5, P90->P95)
geom_segment(data = data.frame(
x = c(stats$p10, stats$p90),
xend = c(stats$p5, stats$p95),
y = rep(stats$y_offset_density * 1.5, 2),
yend = rep(stats$y_offset_density * 1.5, 2)
), aes(x = x, xend = xend, y = y, yend = yend)) +
# Crossbars at P5/P95
geom_segment(data = data.frame(
x = c(stats$p5, stats$p95),
xend = c(stats$p5, stats$p95),
y = stats$y_offset_density * 1.3,
yend = stats$y_offset_density * 1.7
), aes(x = x, xend = xend, y = y, yend = yend)) +
# Mean dot
geom_point(data = data.frame(
x = stats$mean,
y = stats$y_offset_density * 1.5
), aes(x = x, y = y))
ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_boxplot.png"))
```
```{r}
ggplot(pddf_ib, aes(x=value)) +
stat_ecdf(geom='step') +
labs(
title = "Cumulative distribution of predicted differences",
subtitle = "Intervention: Delay close of enrollment",
x = "Difference in probability of termination due to intervention",
y = "Cumulative Proportion"
)
ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_cumulative_distdiff.png"))
```
Get the % of differences in the spike around zero
```{r}
# get values around and above/below spike
width <- 0.02
spike_band_centered_zero <- mean( pddf_ib$value >= -width/2 & pddf_ib$value <= width/2)
above_spike_band <- mean( pddf_ib$value >= width/2)
below_spike_band <- mean( pddf_ib$value <= -width/2)
# get mass above and mass below zero
mass_below_zero <- mean( pddf_ib$value <= 0)
```
Looking at the spike around zero, we find that `r spike_band_centered_zero*100`%
of the probability mass is contained within the band from
[`r -width*100/2`,`r width*100/2`].
Additionally, there was `r above_spike_band*100`% of the probability above that
-- representing those with a general increase in the probability of termination --
and `r below_spike_band*100`% of the probability mass below the band
-- representing a decrease in the probability of termination.
On average, if you keep the trial open instead of closing it,
`r mass_below_zero`% of trials will see a decrease in the probability of termination,
but, due to the high increase in probability of termination given termination was increased,
the mean probability of termination increases by `r stats$mean`.
```{r}
# 5%-iles
summary(pddf_ib$value)
# Create your quantiles
quants <- quantile(pddf_ib$value, probs = seq(0,1,0.05), type=4)
# Convert to a data frame
quant_df <- data.frame(
Percentile = names(quants),
Value = quants
)
kable(quant_df)
```
There seems to be some trials that are highly suceptable to this enrollment delay. Specifically, there were some
```{r}
n = length(counterfact_predicted_ib$p_predicted_intervention)
k = 100
simulated_terminations_intervention <- mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_intervention)))
simulated_terminations_base <-mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_default)))
simulated_percentages <- (simulated_terminations_intervention - simulated_terminations_base)/k
```
The simulation above shows that this results in a percentage-point increase of about
`r simulated_percentages * 100`.
# Diagnostics
```{r}
#| eval: true
#trace plots
image_diagnostics <- paste0(image_root,"/diagnostics")
plot(fit, pars=c("mu"), plotfun="trace")
ggsave(paste0(image_diagnostics,"/trace_plot_mu.png"))
for (i in 1:3) {
print(
mcmc_rank_overlay(
fit,
pars=c(
paste0("mu[",4*i-3,"]"),
paste0("mu[",4*i-2,"]"),
paste0("mu[",4*i-1,"]"),
paste0("mu[",4*i,"]")
),
n_bins=100
)+ legend_move("top") +
scale_colour_ghibli_d("KikiMedium")
)
mu_range <- paste0(4*i-3,"-",4*i)
filename <- paste0(image_diagnostics,"/trace_rank_plot_mu_",mu_range,".png")
ggsave(filename)
}
```
```{r}
#| eval: true
plot(fit, pars=c("sigma"), plotfun="trace")
ggsave(paste0(image_diagnostics,"/traceplot_sigma.png"))
for (i in 1:3) {
print(
mcmc_rank_overlay(
fit,
pars=c(
paste0("sigma[",4*i-3,"]"),
paste0("sigma[",4*i-2,"]"),
paste0("sigma[",4*i-1,"]"),
paste0("sigma[",4*i,"]")
),
n_bins=100
)+ legend_move("top") +
scale_colour_ghibli_d("KikiMedium")
)
sigma_range <- paste0(4*i-3,"-",4*i)
filename <- paste0(image_diagnostics,"/trace_rank_plot_sigma_",sigma_range,".png")
ggsave(filename)
}
```
```{r}
#| eval: true
#other diagnostics
logpost <- log_posterior(fit)
nuts_prmts <- nuts_params(fit)
posterior <- as.array(fit)
```
```{r}
#| eval: true
color_scheme_set("darkgray")
div_style <- parcoord_style_np(div_color = "green", div_size = 0.05, div_alpha = 0.4)
mcmc_parcoord(posterior, regex_pars = "mu", np=nuts_prmts, np_style = div_style, alpha = 0.05)
ggsave(paste0(image_diagnostics,"/parcoord_mu.png"))
```
```{r}
#| eval: true
for (i in 1:3) {
mus = sapply(3:0, function(j) paste0("mu[",4*i-j ,"]"))
print(
mcmc_pairs(
posterior,
np = nuts_prmts,
pars=c(
mus,
"lp__"
),
off_diag_args = list(size = 0.75)
)
)
mu_range <- paste0(4*i-3,"-",4*i)
filename <- paste0(image_diagnostics,"/correlation_plot_mu_",mu_range,".png")
ggsave(filename)
}
```
```{r}
#| eval: true
mcmc_parcoord(posterior,regex_pars = "sigma", np=nuts_prmts, alpha=0.05)
ggsave(paste0(image_diagnostics,"/parcoord_sigma.png"))
```
```{r}
#| eval: true
for (i in 1:3) {
params = sapply(3:0, function(j) paste0("sigma[",4*i-j ,"]"))
print(
mcmc_pairs(
posterior,
np = nuts_prmts,
pars=c(
params,
"lp__"
),
off_diag_args = list(size = 0.75)
)
)
sigma_range <- paste0(4*i-3,"-",4*i)
filename <- paste0(image_diagnostics,"/correlation_plot_sigma_",sigma_range,".png")
ggsave(filename)
}
```
```{r}
#| eval: true
for (k in 1:22) {
for (i in 1:3) {
params = sapply(3:0, function(j) paste0("beta[",k,",",4*i-j ,"]"))
print(
mcmc_pairs(
posterior,
np = nuts_prmts,
pars=c(
params,
"lp__"
),
off_diag_args = list(size = 0.75)
)
)
beta_range <- paste0("k_",k,"_i_",4*i-3,"-",4*i)
filename <- paste0(image_diagnostics,"/correlation_plot_beta_",beta_range,".png")
ggsave(filename)
}}
```