@ -1,983 +0,0 @@
|
||||
---
|
||||
title: "The Effects of Recruitment status on completion of clinical trials"
|
||||
author: "Will King"
|
||||
format: html
|
||||
editor: source
|
||||
---
|
||||
|
||||
|
||||
|
||||
|
||||
# Setup
|
||||
|
||||
|
||||
|
||||
```{r}
|
||||
library(knitr)
|
||||
library(bayesplot)
|
||||
available_mcmc(pattern = "_nuts_")
|
||||
library(ggplot2)
|
||||
library(patchwork)
|
||||
library(tidyverse)
|
||||
library(rstan)
|
||||
library(tidyr)
|
||||
library(ghibli)
|
||||
library(xtable)
|
||||
#Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
|
||||
|
||||
#save unchanged models instead of recompiling
|
||||
rstan_options(auto_write = TRUE)
|
||||
#allow for multithreaded sampling
|
||||
options(mc.cores = parallel::detectCores())
|
||||
|
||||
#test installation, shouldn't get any errors
|
||||
#example(stan_model, package = "rstan", run.dontrun = TRUE)
|
||||
```
|
||||
|
||||
```{r}
|
||||
################ Pull data from database ######################
|
||||
library(RPostgreSQL)
|
||||
host <- 'aact_db-restored-2025-01-07'
|
||||
|
||||
driver <- dbDriver("PostgreSQL")
|
||||
|
||||
get_data <- function(driver) {
|
||||
|
||||
con <- dbConnect(
|
||||
driver,
|
||||
user='root',
|
||||
password='root',
|
||||
dbname='aact_db',
|
||||
host=host
|
||||
)
|
||||
on.exit(dbDisconnect(con))
|
||||
|
||||
query <- dbSendQuery(
|
||||
con,
|
||||
# "select * from formatted_data_with_planned_enrollment;"
|
||||
"
|
||||
select
|
||||
fdqpe.nct_id
|
||||
--,fdqpe.start_date
|
||||
--,fdqpe.current_enrollment
|
||||
--,fdqpe.enrollment_category
|
||||
,fdqpe.current_status
|
||||
,fdqpe.earliest_date_observed
|
||||
,fdqpe.elapsed_duration
|
||||
,fdqpe.n_brands as identical_brands
|
||||
,ntbtu.brand_name_counts
|
||||
,fdqpe.category_id
|
||||
,fdqpe.final_status
|
||||
,fdqpe.h_sdi_val
|
||||
--,fdqpe.h_sdi_u95
|
||||
--,fdqpe.h_sdi_l95
|
||||
,fdqpe.hm_sdi_val
|
||||
--,fdqpe.hm_sdi_u95
|
||||
--,fdqpe.hm_sdi_l95
|
||||
,fdqpe.m_sdi_val
|
||||
--,fdqpe.m_sdi_u95
|
||||
--,fdqpe.m_sdi_l95
|
||||
,fdqpe.lm_sdi_val
|
||||
--,fdqpe.lm_sdi_u95
|
||||
--,fdqpe.lm_sdi_l95
|
||||
,fdqpe.l_sdi_val
|
||||
--,fdqpe.l_sdi_u95
|
||||
--,fdqpe.l_sdi_l95
|
||||
from formatted_data_with_planned_enrollment fdqpe
|
||||
join \"Formularies\".nct_to_brand_counts_through_uspdc ntbtu
|
||||
on fdqpe.nct_id = ntbtu.nct_id
|
||||
order by fdqpe.nct_id, fdqpe.earliest_date_observed
|
||||
;
|
||||
"
|
||||
)
|
||||
df <- fetch(query, n = -1)
|
||||
df <- na.omit(df)
|
||||
|
||||
query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;")
|
||||
n_categories <- fetch(query2, n = -1)
|
||||
|
||||
return(list(data=df,ncat=n_categories))
|
||||
}
|
||||
|
||||
|
||||
get_counterfact_base <- function(driver) {
|
||||
|
||||
con <- dbConnect(
|
||||
driver,
|
||||
user='root',
|
||||
password='root',
|
||||
dbname='aact_db',
|
||||
host=host
|
||||
)
|
||||
on.exit(dbDisconnect(con))
|
||||
|
||||
query <- dbSendQuery(
|
||||
con,
|
||||
"
|
||||
with cte as (
|
||||
--get last recruiting state
|
||||
select fd.nct_id, max(fd.earliest_date_observed),min(fd2.earliest_date_observed) as tmstmp
|
||||
from formatted_data fd
|
||||
join formatted_data fd2
|
||||
on fd.nct_id=fd2.nct_id and fd.earliest_date_observed < fd2.earliest_date_observed
|
||||
where fd.current_status = 'Recruiting'
|
||||
and fd2.current_status = 'Active, not recruiting'
|
||||
group by fd.nct_id
|
||||
)
|
||||
select
|
||||
fdqpe.nct_id
|
||||
--,fdqpe.start_date
|
||||
--,fdqpe.current_enrollment
|
||||
--,fdqpe.enrollment_category
|
||||
,fdqpe.current_status
|
||||
,fdqpe.earliest_date_observed
|
||||
,fdqpe.elapsed_duration
|
||||
,fdqpe.n_brands as identical_brands
|
||||
,ntbtu.brand_name_counts
|
||||
,fdqpe.category_id
|
||||
,fdqpe.final_status
|
||||
,fdqpe.h_sdi_val
|
||||
--,fdqpe.h_sdi_u95
|
||||
--,fdqpe.h_sdi_l95
|
||||
,fdqpe.hm_sdi_val
|
||||
--,fdqpe.hm_sdi_u95
|
||||
--,fdqpe.hm_sdi_l95
|
||||
,fdqpe.m_sdi_val
|
||||
--,fdqpe.m_sdi_u95
|
||||
--,fdqpe.m_sdi_l95
|
||||
,fdqpe.lm_sdi_val
|
||||
--,fdqpe.lm_sdi_u95
|
||||
--,fdqpe.lm_sdi_l95
|
||||
,fdqpe.l_sdi_val
|
||||
--,fdqpe.l_sdi_u95
|
||||
--,fdqpe.l_sdi_l95
|
||||
from formatted_data_with_planned_enrollment fdqpe
|
||||
join \"Formularies\".nct_to_brand_counts_through_uspdc ntbtu
|
||||
on fdqpe.nct_id = ntbtu.nct_id
|
||||
join cte
|
||||
on fdqpe.nct_id = cte.nct_id
|
||||
and fdqpe.earliest_date_observed = cte.tmstmp
|
||||
order by fdqpe.nct_id, fdqpe.earliest_date_observed
|
||||
;
|
||||
"
|
||||
)
|
||||
df <- fetch(query, n = -1)
|
||||
df <- na.omit(df)
|
||||
|
||||
query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;")
|
||||
n_categories <- fetch(query2, n = -1)
|
||||
|
||||
return(list(data=df,ncat=n_categories))
|
||||
}
|
||||
|
||||
|
||||
d <- get_data(driver)
|
||||
df <- d$data
|
||||
n_categories <- d$ncat
|
||||
|
||||
cf <- get_counterfact_base(driver)
|
||||
df_counterfact_base <- cf$data
|
||||
|
||||
|
||||
|
||||
################ Format Data ###########################
|
||||
|
||||
data_formatter <- function(df) {
|
||||
categories <- df["category_id"]
|
||||
|
||||
x <- df["elapsed_duration"]
|
||||
x["identical_brands"] <- asinh(df$identical_brands)
|
||||
x["brand_name_counts"] <- asinh(df$brand_name_count)
|
||||
x["h_sdi_val"] <- asinh(df$h_sdi_val)
|
||||
x["hm_sdi_val"] <- asinh(df$hm_sdi_val)
|
||||
x["m_sdi_val"] <- asinh(df$m_sdi_val)
|
||||
x["lm_sdi_val"] <- asinh(df$lm_sdi_val)
|
||||
x["l_sdi_val"] <- asinh(df$l_sdi_val)
|
||||
|
||||
|
||||
#Setup fixed effects
|
||||
x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0)
|
||||
x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0)
|
||||
x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0)
|
||||
x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0)
|
||||
|
||||
|
||||
y <- ifelse(df["final_status"]=="Terminated",1,0)
|
||||
|
||||
#get category list
|
||||
|
||||
|
||||
return(list(x=x,y=y))
|
||||
}
|
||||
|
||||
train <- data_formatter(df)
|
||||
counterfact_base <- data_formatter(df_counterfact_base)
|
||||
|
||||
categories <- df$category_id
|
||||
|
||||
x <- train$x
|
||||
y <- train$y
|
||||
|
||||
x_cf_base <- counterfact_base$x
|
||||
y_cf_base <- counterfact_base$y
|
||||
cf_categories <- df_counterfact_base$category_id
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# Fit Model
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
```{r}
|
||||
################################# FIT MODEL #########################################
|
||||
inherited_cols <- c(
|
||||
"elapsed_duration"
|
||||
#,"identical_brands"
|
||||
#,"brand_name_counts"
|
||||
,"h_sdi_val"
|
||||
,"hm_sdi_val"
|
||||
,"m_sdi_val"
|
||||
,"lm_sdi_val"
|
||||
,"l_sdi_val"
|
||||
,"status_NYR"# TODO: may need to remove
|
||||
,"status_EBI"
|
||||
,"status_Rec"
|
||||
,"status_ANR"
|
||||
)
|
||||
```
|
||||
|
||||
```{r}
|
||||
beta_list <- list(
|
||||
groups = c(
|
||||
`1`="Infections & Parasites",
|
||||
`2`="Neoplasms",
|
||||
`3`="Blood & Immune system",
|
||||
`4`="Endocrine, Nutritional, and Metabolic",
|
||||
`5`="Mental & Behavioral",
|
||||
`6`="Nervous System",
|
||||
`7`="Eye and Adnexa",
|
||||
`8`="Ear and Mastoid",
|
||||
`9`="Circulatory",
|
||||
`10`="Respiratory",
|
||||
`11`="Digestive",
|
||||
`12`="Skin & Subcutaneaous tissue",
|
||||
`13`="Musculoskeletal",
|
||||
`14`="Genitourinary",
|
||||
`15`="Pregancy, Childbirth, & Puerperium",
|
||||
`16`="Perinatal Period",
|
||||
`17`="Congential",
|
||||
`18`="Symptoms, Signs etc.",
|
||||
`19`="Injury etc.",
|
||||
`20`="External Causes",
|
||||
`21`="Contact with Healthcare",
|
||||
`22`="Special Purposes"
|
||||
),
|
||||
parameters = c(
|
||||
`1`="Elapsed Duration",
|
||||
# brands
|
||||
`2`="asinh(Generic Brands)",
|
||||
`3`="asinh(Competitors USPDC)",
|
||||
# population
|
||||
`4`="asinh(High SDI)",
|
||||
`5`="asinh(High-Medium SDI)",
|
||||
`6`="asinh(Medium SDI)",
|
||||
`7`="asinh(Low-Medium SDI)",
|
||||
`8`="asinh(Low SDI)",
|
||||
#Status
|
||||
`9`="status_NYR",
|
||||
`10`="status_EBI",
|
||||
`11`="status_Rec",
|
||||
`12`="status_ANR"
|
||||
)
|
||||
)
|
||||
|
||||
get_parameters <- function(stem,class_list) {
|
||||
#get categories and lengths
|
||||
named <- names(class_list)
|
||||
lengths <- sapply(named, (function (x) length(class_list[[x]])))
|
||||
|
||||
#describe the grid needed
|
||||
iter_list <- sapply(named, (function (x) 1:lengths[x]))
|
||||
|
||||
#generate the list of parameters
|
||||
pardf <- generate_parameter_df(stem, iter_list)
|
||||
|
||||
#add columns with appropriate human-readable names
|
||||
for (name in named) {
|
||||
pardf[paste(name,"_hr",sep="")] <- as.factor(
|
||||
sapply(pardf[name], (function (i) class_list[[name]][i]))
|
||||
)
|
||||
}
|
||||
|
||||
return(pardf)
|
||||
}
|
||||
|
||||
generate_parameter_df <- function(stem, iter_list) {
|
||||
grid <- expand.grid(iter_list)
|
||||
grid["param_name"] <- grid %>% unite(x,colnames(grid),sep=",")
|
||||
grid["param_name"] <- paste(stem,"[",grid$param_name,"]",sep="")
|
||||
return(grid)
|
||||
}
|
||||
|
||||
group_mcmc_areas <- function(
|
||||
stem,# = "beta"
|
||||
class_list,# = beta_list
|
||||
stanfit,# = fit
|
||||
group_id,# = 2
|
||||
rename=TRUE,
|
||||
filter=NULL
|
||||
) {
|
||||
|
||||
#get all parameter names
|
||||
params <- get_parameters(stem,class_list)
|
||||
|
||||
#filter down to parameters of interest
|
||||
params <- filter(params,groups == group_id)
|
||||
#Get dataframe with only the rows of interest
|
||||
filtdata <- as.data.frame(stanfit)[params$param_name]
|
||||
#rename columns
|
||||
if (rename) dimnames(filtdata)[[2]] <- params$parameters_hr
|
||||
#get group name for title
|
||||
group_name <- class_list$groups[group_id]
|
||||
#create area plot with appropriate title
|
||||
p <- mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
|
||||
ggtitle(paste("Parameter distributions for ICD-10 class:",group_name)) +
|
||||
geom_vline(xintercept=0,color="grey",alpha=0.75)
|
||||
|
||||
d <- pivot_longer(filtdata, everything()) |>
|
||||
group_by(name) |>
|
||||
summarize(
|
||||
mean=mean(value)
|
||||
,q025 = quantile(value,probs = 0.025)
|
||||
,q975 = quantile(value,probs = 0.975)
|
||||
,q05 = quantile(value,probs = 0.05)
|
||||
,q95 = quantile(value,probs = 0.95)
|
||||
)
|
||||
return(list(plot=p,quantiles=d,name=group_name))
|
||||
}
|
||||
|
||||
parameter_mcmc_areas <- function(
|
||||
stem,# = "beta"
|
||||
class_list,# = beta_list
|
||||
stanfit,# = fit
|
||||
parameter_id,# = 2
|
||||
rename=TRUE
|
||||
) {
|
||||
#get all parameter names
|
||||
params <- get_parameters(stem,class_list)
|
||||
#filter down to parameters of interest
|
||||
params <- filter(params,parameters == parameter_id)
|
||||
#Get dataframe with only the rows of interest
|
||||
filtdata <- as.data.frame(stanfit)[params$param_name]
|
||||
#rename columns
|
||||
if (rename) dimnames(filtdata)[[2]] <- params$groups_hr
|
||||
#get group name for title
|
||||
parameter_name <- class_list$parameters[parameter_id]
|
||||
#create area plot with appropriate title
|
||||
p <- mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
|
||||
ggtitle(parameter_name,"Parameter Distribution")
|
||||
|
||||
d <- pivot_longer(filtdata, everything()) |>
|
||||
group_by(name) |>
|
||||
summarize(
|
||||
mean=mean(value)
|
||||
,q025 = quantile(value,probs = 0.025)
|
||||
,q975 = quantile(value,probs = 0.975)
|
||||
,q05 = quantile(value,probs = 0.05)
|
||||
,q95 = quantile(value,probs = 0.95)
|
||||
)
|
||||
return(list(plot=p,quantiles=d,name=parameter_name))
|
||||
}
|
||||
|
||||
|
||||
```
|
||||
|
||||
|
||||
|
||||
Plan: select all snapshots that are the first to have closed enrollment (Rec -> ANR)
|
||||
|
||||
|
||||
```{r}
|
||||
#delay intervention
|
||||
intervention_enrollment <- x_cf_base[c(inherited_cols,"brand_name_counts", "identical_brands")]
|
||||
intervention_enrollment["status_ANR"] <- 0
|
||||
intervention_enrollment["status_Rec"] <- 1
|
||||
```
|
||||
|
||||
```{r}
|
||||
counterfact_delay <- list(
|
||||
D = ncol(x),#
|
||||
N = nrow(x),
|
||||
L = n_categories$count,
|
||||
y = as.vector(y),
|
||||
ll = as.vector(categories),
|
||||
x = as.matrix(x),
|
||||
mu_mean = 0,
|
||||
mu_stdev = 0.05,
|
||||
sigma_shape = 4,
|
||||
sigma_rate = 20,
|
||||
Nx = nrow(x_cf_base),
|
||||
llx = as.vector(cf_categories),
|
||||
counterfact_x_tilde = as.matrix(intervention_enrollment),
|
||||
counterfact_x = as.matrix(x_cf_base)
|
||||
)
|
||||
```
|
||||
|
||||
```{r}
|
||||
fit <- stan(
|
||||
file='Hierarchal_Logistic.stan',
|
||||
data = counterfact_delay,
|
||||
chains = 4,
|
||||
iter = 5000,
|
||||
seed = 11021585
|
||||
)
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## Explore data
|
||||
|
||||
|
||||
|
||||
|
||||
```{r}
|
||||
#get number of trials and snapshots in each category
|
||||
group_trials_by_category <- as.data.frame(aggregate(category_id ~ nct_id, df, max))
|
||||
group_trials_by_category <- as.data.frame(group_trials_by_category)
|
||||
|
||||
category_count <- group_trials_by_category |> group_by(category_id) |> count()
|
||||
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## Fit Results
|
||||
|
||||
|
||||
|
||||
|
||||
```{r}
|
||||
################################# ANALYZE #####################################
|
||||
print(fit)
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# Counterfactuals
|
||||
|
||||
|
||||
|
||||
```{r}
|
||||
generated_ib <- gqs(
|
||||
fit@stanmodel,
|
||||
data=counterfact_delay,
|
||||
draws=as.matrix(fit),
|
||||
seed=11021585
|
||||
)
|
||||
```
|
||||
|
||||
```{r}
|
||||
df_ib_p <- data.frame(
|
||||
p_prior=as.vector(extract(generated_ib, pars="p_prior")$p_prior)
|
||||
,p_predicted = as.vector(extract(generated_ib, pars="p_predicted")$p_predicted)
|
||||
)
|
||||
|
||||
df_ib_prior <- data.frame(
|
||||
mu_prior = as.vector(extract(generated_ib, pars="mu_prior")$mu_prior)
|
||||
,sigma_prior = as.vector(extract(generated_ib, pars="sigma_prior")$sigma_prior)
|
||||
)
|
||||
|
||||
#p_prior
|
||||
ggplot(df_ib_p, aes(x=p_prior)) +
|
||||
geom_density() +
|
||||
labs(
|
||||
title="Implied Prior Distribution P"
|
||||
,subtitle=""
|
||||
,x="Probability Domain 'p'"
|
||||
,y="Probability Density"
|
||||
)
|
||||
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/prior_p.png")
|
||||
|
||||
#p_posterior
|
||||
ggplot(df_ib_p, aes(x=p_predicted)) +
|
||||
geom_density() +
|
||||
labs(
|
||||
title="Implied Posterior Distribution P"
|
||||
,subtitle=""
|
||||
,x="Probability Domain 'p'"
|
||||
,y="Probability Density"
|
||||
)
|
||||
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/posterior_p.png")
|
||||
|
||||
#mu_prior
|
||||
ggplot(df_ib_prior) +
|
||||
geom_density(aes(x=mu_prior)) +
|
||||
labs(
|
||||
title="Prior - Mu"
|
||||
,subtitle="same prior for all Mu values"
|
||||
,x="Mu"
|
||||
,y="Probability"
|
||||
)
|
||||
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/prior_mu.png")
|
||||
|
||||
#sigma_posterior
|
||||
ggplot(df_ib_prior) +
|
||||
geom_density(aes(x=sigma_prior)) +
|
||||
labs(
|
||||
title="Prior - Sigma"
|
||||
,subtitle="same prior for all Sigma values"
|
||||
,x="Sigma"
|
||||
,y="Probability"
|
||||
)
|
||||
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/prior_sigma.png")
|
||||
```
|
||||
|
||||
```{r}
|
||||
check_hmc_diagnostics(fit)
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
### Intervention: Delay close of enrollment
|
||||
|
||||
|
||||
|
||||
```{r}
|
||||
counterfact_predicted_ib <- data.frame(
|
||||
p_predicted_default = as.vector(extract(generated_ib, pars="p_predicted_default")$p_predicted_default)
|
||||
,p_predicted_intervention = as.vector(extract(generated_ib, pars="p_predicted_intervention")$p_predicted_intervention)
|
||||
,predicted_difference = as.vector(extract(generated_ib, pars="predicted_difference")$predicted_difference)
|
||||
)
|
||||
|
||||
|
||||
ggplot(counterfact_predicted_ib, aes(x=p_predicted_default)) +
|
||||
geom_density() +
|
||||
labs(
|
||||
title="Predicted Distribution of 'p'"
|
||||
,subtitle="Intervention: None"
|
||||
,x="Probability Domain 'p'"
|
||||
,y="Probability Density"
|
||||
)
|
||||
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/default_p_generic_intervention_base.png")
|
||||
|
||||
ggplot(counterfact_predicted_ib, aes(x=p_predicted_intervention)) +
|
||||
geom_density() +
|
||||
labs(
|
||||
title="Predicted Distribution of 'p'"
|
||||
,subtitle="Intervention: Delay close of enrollment"
|
||||
,x="Probability Domain 'p'"
|
||||
,y="Probability Density"
|
||||
)
|
||||
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/default_p_generic_intervention_interv.png")
|
||||
|
||||
ggplot(counterfact_predicted_ib, aes(x=predicted_difference)) +
|
||||
geom_density() +
|
||||
labs(
|
||||
title="Predicted Distribution of differences 'p'"
|
||||
,subtitle="Intervention: Delay close of enrollment"
|
||||
,x="Difference in 'p' under treatment"
|
||||
,y="Probability Density"
|
||||
)
|
||||
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/default_p_generic_intervention_distdiff.png")
|
||||
```
|
||||
|
||||
```{r}
|
||||
get_category_count <- function(tbl, id) {
|
||||
result <- tbl$n[tbl$category_id == id]
|
||||
if(length(result) == 0) 0 else result
|
||||
}
|
||||
|
||||
category_names <- sapply(1:length(beta_list$groups),
|
||||
function(i) sprintf("ICD-10 #%d: %s (n=%d)",
|
||||
i,
|
||||
beta_list$groups[i],
|
||||
get_category_count(category_count, i)))
|
||||
|
||||
```
|
||||
|
||||
```{r}
|
||||
pddf_ib <- data.frame(extract(generated_ib, pars="predicted_difference")$predicted_difference) |>
|
||||
pivot_longer(X1:X168) #CHANGE_NOTE: moved from X169 to X168
|
||||
|
||||
#TODO: Fix Category names
|
||||
pddf_ib["entry_idx"] <- as.numeric(gsub("\\D","",pddf_ib$name))
|
||||
pddf_ib["category"] <- sapply(pddf_ib$entry_idx, function(i) df$category_id[i])
|
||||
pddf_ib["category_name"] <- sapply(
|
||||
pddf_ib$category,
|
||||
function(i) category_names[i]
|
||||
)
|
||||
|
||||
|
||||
|
||||
ggplot(pddf_ib, aes(x=value,)) +
|
||||
geom_density(adjust=1/5) +
|
||||
labs(
|
||||
title = "Distribution of predicted differences"
|
||||
,subtitle = "Intervention: Delay close of enrollment"
|
||||
,x = "Difference in probability due to intervention"
|
||||
,y = "Probability Density"
|
||||
) +
|
||||
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed")
|
||||
#todo: add median, mean, 40/60 quantiles as well as
|
||||
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/p_delay_intervention_distdiff_styled.png")
|
||||
|
||||
ggplot(pddf_ib, aes(x=value,)) +
|
||||
geom_density(adjust=1/5) +
|
||||
facet_wrap(
|
||||
~factor(
|
||||
category_name,
|
||||
levels=category_names
|
||||
)
|
||||
, labeller = label_wrap_gen(multi_line = TRUE)
|
||||
, ncol=4) +
|
||||
labs(
|
||||
title = "Distribution of predicted differences | By Group"
|
||||
,subtitle = "Intervention: Delay close of enrollment"
|
||||
,x = "Difference in probability due to intervention"
|
||||
,y = "Probability Density"
|
||||
) +
|
||||
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
|
||||
theme(strip.text.x = element_text(size = 8))
|
||||
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/p_delay_intervention_distdiff_by_group.png")
|
||||
|
||||
ggplot(pddf_ib, aes(x=value,)) +
|
||||
geom_histogram(bins=300) +
|
||||
facet_wrap(
|
||||
~factor(
|
||||
category_name,
|
||||
levels=category_names
|
||||
)
|
||||
, labeller = label_wrap_gen(multi_line = TRUE)
|
||||
, ncol=4) +
|
||||
labs(
|
||||
title = "Histogram of predicted differences | By Group"
|
||||
,subtitle = "Intervention: Delay close of enrollment"
|
||||
,x = "Difference in probability due to intervention"
|
||||
,y = "Predicted counts"
|
||||
) +
|
||||
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
|
||||
theme(strip.text.x = element_text(size = 8))
|
||||
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/p_delay_intervention_histdiff_by_group.png")
|
||||
```
|
||||
|
||||
```{r}
|
||||
p3 <- ggplot(pddf_ib, aes(x=value,)) +
|
||||
geom_histogram(bins=500) +
|
||||
labs(
|
||||
title = "Distribution of predicted differences"
|
||||
,subtitle = "Intervention: Delay close of enrollment"
|
||||
,x = "Difference in probability due to intervention"
|
||||
,y = "Probability Density"
|
||||
,caption = "Vertical marks: 5/10/25/50/75/90/95th percentiles. Dot shows mean."
|
||||
) +
|
||||
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed")
|
||||
|
||||
stats <- list(
|
||||
p5 = quantile(pddf_ib$value, 0.05),
|
||||
p10 = quantile(pddf_ib$value, 0.10),
|
||||
q1 = quantile(pddf_ib$value, 0.25),
|
||||
med = median(pddf_ib$value),
|
||||
mean = mean(pddf_ib$value),
|
||||
q3 = quantile(pddf_ib$value, 0.75),
|
||||
p90 = quantile(pddf_ib$value, 0.90),
|
||||
p95 = quantile(pddf_ib$value, 0.95),
|
||||
max_height = max(ggplot_build(p3)$data[[1]]$count),
|
||||
y_offset = -max(ggplot_build(p3)$data[[1]]$count) * 0.05
|
||||
)
|
||||
|
||||
p3 +
|
||||
# Box
|
||||
geom_segment(data = data.frame(
|
||||
x = c(stats$q1, stats$q3, stats$med),
|
||||
xend = c(stats$q1, stats$q3, stats$med),
|
||||
y = rep(stats$y_offset, 3),
|
||||
yend = rep(stats$y_offset * 2, 3)
|
||||
), aes(x = x, xend = xend, y = y, yend = yend)) +
|
||||
geom_segment(data = data.frame(
|
||||
x = rep(stats$q1, 2),
|
||||
xend = rep(stats$q3, 2),
|
||||
y = c(stats$y_offset, stats$y_offset * 2),
|
||||
yend = c(stats$y_offset, stats$y_offset * 2)
|
||||
), aes(x = x, xend = xend, y = y, yend = yend)) +
|
||||
# Inner whiskers (Q1->P10, Q3->P90)
|
||||
geom_segment(data = data.frame(
|
||||
x = c(stats$q1, stats$q3),
|
||||
xend = c(stats$p10, stats$p90),
|
||||
y = rep(stats$y_offset * 1.5, 2),
|
||||
yend = rep(stats$y_offset * 1.5, 2)
|
||||
), aes(x = x, xend = xend, y = y, yend = yend)) +
|
||||
# Crossbars at P10/P90
|
||||
geom_segment(data = data.frame(
|
||||
x = c(stats$p10, stats$p90),
|
||||
xend = c(stats$p10, stats$p90),
|
||||
y = stats$y_offset * 1.3,
|
||||
yend = stats$y_offset * 1.7
|
||||
), aes(x = x, xend = xend, y = y, yend = yend)) +
|
||||
# Outer whiskers (P10->P5, P90->P95)
|
||||
geom_segment(data = data.frame(
|
||||
x = c(stats$p10, stats$p90),
|
||||
xend = c(stats$p5, stats$p95),
|
||||
y = rep(stats$y_offset * 1.5, 2),
|
||||
yend = rep(stats$y_offset * 1.5, 2)
|
||||
), aes(x = x, xend = xend, y = y, yend = yend)) +
|
||||
# Crossbars at P5/P95
|
||||
geom_segment(data = data.frame(
|
||||
x = c(stats$p5, stats$p95),
|
||||
xend = c(stats$p5, stats$p95),
|
||||
y = stats$y_offset * 1.3,
|
||||
yend = stats$y_offset * 1.7
|
||||
), aes(x = x, xend = xend, y = y, yend = yend)) +
|
||||
# Mean dot
|
||||
geom_point(data = data.frame(
|
||||
x = stats$mean,
|
||||
y = stats$y_offset * 1.5
|
||||
), aes(x = x, y = y))
|
||||
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/p_delay_intervention_histdiff_boxplot.png")
|
||||
```
|
||||
|
||||
```{r}
|
||||
ggplot(pddf_ib, aes(x=value)) +
|
||||
stat_ecdf(geom='step') +
|
||||
labs(
|
||||
title = "Cumulative distribution of predicted differences",
|
||||
subtitle = "Intervention: Delay close of enrollment",
|
||||
x = "Difference in probability of termination due to intervention",
|
||||
y = "Cumulative Proportion"
|
||||
)
|
||||
|
||||
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/p_delay_intervention_cumulative_distdiff.png")
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
Get the % of differences in the spike around zero
|
||||
|
||||
|
||||
```{r}
|
||||
# get values around and above/below spike
|
||||
width <- 0.02
|
||||
spike_band_centered_zero <- mean( pddf_ib$value >= -width/2 & pddf_ib$value <= width/2)
|
||||
above_spike_band <- mean( pddf_ib$value >= width/2)
|
||||
below_spike_band <- mean( pddf_ib$value <= -width/2)
|
||||
|
||||
# get mass above and mass below zero
|
||||
mass_below_zero <- mean( pddf_ib$value <= 0)
|
||||
```
|
||||
|
||||
|
||||
Looking at the spike around zero, we find that `r spike_band_centered_zero*100`%
|
||||
of the probability mass is contained within the band from
|
||||
[`r -width*100/2`,`r width*100/2`].
|
||||
Additionally, there was `r above_spike_band*100`% of the probability above that
|
||||
-- representing those with a general increase in the probability of termination --
|
||||
and `r below_spike_band*100`% of the probability mass below the band
|
||||
-- representing a decrease in the probability of termination.
|
||||
|
||||
On average, if you keep the trial open instead of closing it,
|
||||
`r mass_below_zero`% of trials will see a decrease in the probability of termination,
|
||||
but, due to the high increase in probability of termination given termination was increased,
|
||||
the mean probability of termination increases by `r stats$mean`.
|
||||
|
||||
|
||||
|
||||
```{r}
|
||||
# 5%-iles
|
||||
|
||||
summary(pddf_ib$value)
|
||||
|
||||
# Create your quantiles
|
||||
quants <- quantile(pddf_ib$value, probs = seq(0,1,0.05), type=4)
|
||||
|
||||
# Convert to a data frame
|
||||
quant_df <- data.frame(
|
||||
Percentile = names(quants),
|
||||
Value = quants
|
||||
)
|
||||
kable(quant_df)
|
||||
```
|
||||
|
||||
```{r}
|
||||
n = length(counterfact_predicted_ib$p_predicted_intervention)
|
||||
k = 100
|
||||
simulated_terminations_intervention <- mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_intervention)))
|
||||
simulated_terminations_base <-mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_default)))
|
||||
|
||||
simulated_percentages <- (simulated_terminations_intervention - simulated_terminations_base)/k
|
||||
```
|
||||
|
||||
|
||||
|
||||
The simulation above shows that this results in a percentage-point increase of about
|
||||
`r simulated_percentages * 100`.
|
||||
|
||||
|
||||
|
||||
|
||||
# Diagnostics
|
||||
|
||||
|
||||
|
||||
```{r}
|
||||
#| eval: true
|
||||
#trace plots
|
||||
plot(fit, pars=c("mu"), plotfun="trace")
|
||||
|
||||
|
||||
for (i in 1:3) {
|
||||
print(
|
||||
mcmc_rank_overlay(
|
||||
fit,
|
||||
pars=c(
|
||||
paste0("mu[",4*i-3,"]"),
|
||||
paste0("mu[",4*i-2,"]"),
|
||||
paste0("mu[",4*i-1,"]"),
|
||||
paste0("mu[",4*i,"]")
|
||||
),
|
||||
n_bins=100
|
||||
)+ legend_move("top") +
|
||||
scale_colour_ghibli_d("KikiMedium")
|
||||
)
|
||||
}
|
||||
```
|
||||
|
||||
```{r}
|
||||
#| eval: true
|
||||
plot(fit, pars=c("sigma"), plotfun="trace")
|
||||
|
||||
for (i in 1:3) {
|
||||
print(
|
||||
mcmc_rank_overlay(
|
||||
fit,
|
||||
pars=c(
|
||||
paste0("sigma[",4*i-3,"]"),
|
||||
paste0("sigma[",4*i-2,"]"),
|
||||
paste0("sigma[",4*i-1,"]"),
|
||||
paste0("sigma[",4*i,"]")
|
||||
),
|
||||
n_bins=100
|
||||
)+ legend_move("top") +
|
||||
scale_colour_ghibli_d("KikiMedium")
|
||||
)
|
||||
}
|
||||
```
|
||||
|
||||
```{r}
|
||||
#| eval: true
|
||||
#other diagnostics
|
||||
logpost <- log_posterior(fit)
|
||||
nuts_prmts <- nuts_params(fit)
|
||||
posterior <- as.array(fit)
|
||||
|
||||
```
|
||||
|
||||
```{r}
|
||||
#| eval: true
|
||||
color_scheme_set("darkgray")
|
||||
div_style <- parcoord_style_np(div_color = "green", div_size = 0.05, div_alpha = 0.4)
|
||||
mcmc_parcoord(posterior, regex_pars = "mu", np=nuts_prmts, np_style = div_style, alpha = 0.05)
|
||||
```
|
||||
|
||||
```{r}
|
||||
#| eval: true
|
||||
for (i in 1:3) {
|
||||
mus = sapply(3:0, function(j) paste0("mu[",4*i-j ,"]"))
|
||||
print(
|
||||
mcmc_pairs(
|
||||
posterior,
|
||||
np = nuts_prmts,
|
||||
pars=c(
|
||||
mus,
|
||||
"lp__"
|
||||
),
|
||||
off_diag_args = list(size = 0.75)
|
||||
)
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
|
||||
```
|
||||
|
||||
```{r}
|
||||
#| eval: true
|
||||
mcmc_parcoord(posterior,regex_pars = "sigma", np=nuts_prmts, alpha=0.05)
|
||||
```
|
||||
|
||||
```{r}
|
||||
#| eval: true
|
||||
|
||||
for (i in 1:3) {
|
||||
params = sapply(3:0, function(j) paste0("sigma[",4*i-j ,"]"))
|
||||
print(
|
||||
mcmc_pairs(
|
||||
posterior,
|
||||
np = nuts_prmts,
|
||||
pars=c(
|
||||
params,
|
||||
"lp__"
|
||||
),
|
||||
off_diag_args = list(size = 0.75)
|
||||
)
|
||||
)
|
||||
}
|
||||
```
|
||||
|
||||
```{r}
|
||||
#| eval: true
|
||||
for (k in 1:22) {
|
||||
for (i in 1:3) {
|
||||
params = sapply(3:0, function(j) paste0("beta[",k,",",4*i-j ,"]"))
|
||||
print(
|
||||
mcmc_pairs(
|
||||
posterior,
|
||||
np = nuts_prmts,
|
||||
pars=c(
|
||||
params,
|
||||
"lp__"
|
||||
),
|
||||
off_diag_args = list(size = 0.75)
|
||||
)
|
||||
)
|
||||
}}
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# TODO
|
||||
- [ ] Double check data flow. (Write summary of this in human readable form)
|
||||
- Is it the data we want from the database
|
||||
- Training
|
||||
- Counterfactual Evaluation
|
||||
- choose a single snapshot per trial.
|
||||
- Is the model in STAN well specified.
|
||||
- [ ] work on LOO validation of model
|
||||
|
||||
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 22 KiB |
|
After Width: | Height: | Size: 1.9 MiB |
|
After Width: | Height: | Size: 1.4 MiB |
|
After Width: | Height: | Size: 323 KiB |
|
After Width: | Height: | Size: 181 KiB |
|
After Width: | Height: | Size: 184 KiB |
|
After Width: | Height: | Size: 184 KiB |
|
After Width: | Height: | Size: 183 KiB |
|
After Width: | Height: | Size: 186 KiB |
|
After Width: | Height: | Size: 191 KiB |
|
After Width: | Height: | Size: 377 KiB |
|
Before Width: | Height: | Size: 60 KiB After Width: | Height: | Size: 56 KiB |
|
Before Width: | Height: | Size: 262 KiB After Width: | Height: | Size: 289 KiB |
|
Before Width: | Height: | Size: 184 KiB After Width: | Height: | Size: 196 KiB |
|
Before Width: | Height: | Size: 178 KiB After Width: | Height: | Size: 188 KiB |
|
Before Width: | Height: | Size: 168 KiB After Width: | Height: | Size: 180 KiB |
|
Before Width: | Height: | Size: 182 KiB After Width: | Height: | Size: 187 KiB |
|
Before Width: | Height: | Size: 183 KiB After Width: | Height: | Size: 182 KiB |
|
Before Width: | Height: | Size: 192 KiB After Width: | Height: | Size: 192 KiB |
|
Before Width: | Height: | Size: 171 KiB After Width: | Height: | Size: 186 KiB |
|
Before Width: | Height: | Size: 165 KiB After Width: | Height: | Size: 180 KiB |
|
Before Width: | Height: | Size: 177 KiB After Width: | Height: | Size: 198 KiB |
|
Before Width: | Height: | Size: 349 KiB After Width: | Height: | Size: 356 KiB |
|
Before Width: | Height: | Size: 309 KiB After Width: | Height: | Size: 323 KiB |
|
Before Width: | Height: | Size: 329 KiB After Width: | Height: | Size: 342 KiB |
|
After Width: | Height: | Size: 17 KiB |