diff --git a/r-analysis/EffectsOfEnrollmentDelay.qmd b/r-analysis/EffectsOfEnrollmentDelay.qmd
index 1aea9e5..0602f22 100644
--- a/r-analysis/EffectsOfEnrollmentDelay.qmd
+++ b/r-analysis/EffectsOfEnrollmentDelay.qmd
@@ -238,14 +238,14 @@ cf_categories <- df_counterfact_base$category_id
################################# FIT MODEL #########################################
inherited_cols <- c(
"elapsed_duration"
- #,"identical_brands"
- #,"brand_name_counts"
+ ,"identical_brands"
+ ,"brand_name_counts"
,"h_sdi_val"
,"hm_sdi_val"
,"m_sdi_val"
,"lm_sdi_val"
,"l_sdi_val"
- ,"status_NYR"# TODO: may need to remove
+ ,"status_NYR"
,"status_EBI"
,"status_Rec"
,"status_ANR"
@@ -404,7 +404,7 @@ parameter_mcmc_areas <- function(
Plan: select all snapshots that are the first to have closed enrollment (Rec -> ANR)
```{r}
#delay intervention
-intervention_enrollment <- x_cf_base[c(inherited_cols,"brand_name_counts", "identical_brands")]
+intervention_enrollment <- x_cf_base[c(inherited_cols)]
intervention_enrollment["status_ANR"] <- 0
intervention_enrollment["status_Rec"] <- 1
```
@@ -419,12 +419,14 @@ counterfact_delay <- list(
x = as.matrix(x),
mu_mean = 0,
mu_stdev = 0.05,
- sigma_shape = 4,
- sigma_rate = 20,
+ sigma_shape = 8,
+ sigma_rate = ,
Nx = nrow(x_cf_base),
llx = as.vector(cf_categories),
counterfact_x_tilde = as.matrix(intervention_enrollment),
- counterfact_x = as.matrix(x_cf_base)
+ counterfact_x = as.matrix(x_cf_base),
+ status_indexes = c(11,12) #subtract anr from recruiting to get movement from anr to recruiting
+
)
```
@@ -432,14 +434,24 @@ counterfact_delay <- list(
fit <- stan(
file='Hierarchal_Logistic.stan',
data = counterfact_delay,
- chains = 4,
- iter = 5000,
+ chains = 8,
+ iter = 12000,
+ warmup = 4000,
seed = 11021585
)
```
+## Fit Results
+```{r}
+print(check_hmc_diagnostics(fit))
+print(get_bfmi(fit))
+```
+
+```{r}
+print(fit)
+```
@@ -564,13 +576,6 @@ sum(df5$snapshot_count)
-## Fit Results
-
-
-```{r}
-################################# ANALYZE #####################################
-print(fit)
-```
# Parameter Distributions
@@ -639,13 +644,13 @@ Note these have 95% outer CI and 80% inner (shaded)
```{r}
-#get the generic and uspdc parameters
-print(px[4]$plot + px[7]$plot)
-ggsave(paste0(image_parameters_across_groups,"2+3_generic_and_uspdc.png"))
+##get the generic and uspdc parameters
+#print(px[4]$plot + px[7]$plot)
+#ggsave(paste0(image_parameters_across_groups,"2+3_generic_and_uspdc.png"))
#get the parameters associated with duration
-px[16]$plot + px[19]$plot
-ggsave(paste0(image_parameters_across_groups,"11+12_statusREC_and_statusANR.png"))
+#px[16]$plot + px[19]$plot
+#ggsave(paste0(image_parameters_across_groups,"11+12_statusREC_and_statusANR.png"))
```
@@ -718,11 +723,6 @@ ggsave(paste0(image_dist_diff_analysis,"/prior_sigma.png"))
-```{r}
-check_hmc_diagnostics(fit)
-```
-
-
@@ -1207,4 +1207,4 @@ for (i in 1:3) {
-
\ No newline at end of file
+
diff --git a/r-analysis/EffectsOfEnrollmentDelay_const.qmd b/r-analysis/EffectsOfEnrollmentDelay_const.qmd
deleted file mode 100644
index d50f988..0000000
--- a/r-analysis/EffectsOfEnrollmentDelay_const.qmd
+++ /dev/null
@@ -1,1214 +0,0 @@
----
-title: "The Effects of Recruitment status on completion of clinical trials"
-author: "Will King"
-format: html
-editor: source
----
-
-
-# Setup
-
-```{r}
-library(knitr)
-library(bayesplot)
-available_mcmc(pattern = "_nuts_")
-library(ggplot2)
-library(patchwork)
-library(tidyverse)
-library(rstan)
-library(tidyr)
-library(ghibli)
-library(xtable)
-#Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
-
-#save unchanged models instead of recompiling
-rstan_options(auto_write = TRUE)
-#allow for multithreaded sampling
-options(mc.cores = parallel::detectCores())
-
-#test installation, shouldn't get any errors
-#example(stan_model, package = "rstan", run.dontrun = TRUE)
-
-
-image_root <- "./output/const_anr/EffectsOfEnrollmentDelay"
-image_dist_diff_analysis <- paste0(image_root,"/dist_diff_analysis")
-image_trial_details <-paste0(image_root,"/trials_details")
-image_parameters_by_groups <-paste0(image_root,"/betas/parameters_by_group")
-image_parameters_across_groups <-paste0(image_root,"/betas/parameter_across_groups")
-
-```
-
-```{r}
-################ Pull data from database ######################
-library(RPostgreSQL)
-host <- 'aact_db-restored-2025-01-07'
-
-driver <- dbDriver("PostgreSQL")
-
-get_data <- function(driver) {
-
-con <- dbConnect(
- driver,
- user='root',
- password='root',
- dbname='aact_db',
- host=host
- )
-on.exit(dbDisconnect(con))
-
-query <- dbSendQuery(
- con,
-# "select * from formatted_data_with_planned_enrollment;"
-"
-select
- fdqpe.nct_id
- --,fdqpe.start_date
- --,fdqpe.current_enrollment
- --,fdqpe.enrollment_category
- ,fdqpe.current_status
- ,fdqpe.earliest_date_observed
- ,fdqpe.elapsed_duration
- ,fdqpe.n_brands as identical_brands
- ,ntbtu.brand_name_counts
- ,fdqpe.category_id
- ,fdqpe.final_status
- ,fdqpe.h_sdi_val
- --,fdqpe.h_sdi_u95
- --,fdqpe.h_sdi_l95
- ,fdqpe.hm_sdi_val
- --,fdqpe.hm_sdi_u95
- --,fdqpe.hm_sdi_l95
- ,fdqpe.m_sdi_val
- --,fdqpe.m_sdi_u95
- --,fdqpe.m_sdi_l95
- ,fdqpe.lm_sdi_val
- --,fdqpe.lm_sdi_u95
- --,fdqpe.lm_sdi_l95
- ,fdqpe.l_sdi_val
- --,fdqpe.l_sdi_u95
- --,fdqpe.l_sdi_l95
-from formatted_data_with_planned_enrollment fdqpe
- join \"Formularies\".nct_to_brand_counts_through_uspdc ntbtu
- on fdqpe.nct_id = ntbtu.nct_id
-order by fdqpe.nct_id, fdqpe.earliest_date_observed
-;
-"
- )
-df <- fetch(query, n = -1)
-df <- na.omit(df)
-
-query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;")
-n_categories <- fetch(query2, n = -1)
-
-return(list(data=df,ncat=n_categories))
-}
-
-
-get_counterfact_base <- function(driver) {
-
-con <- dbConnect(
- driver,
- user='root',
- password='root',
- dbname='aact_db',
- host=host
- )
-on.exit(dbDisconnect(con))
-
-query <- dbSendQuery(
- con,
- "
- with cte as (
- --get last recruiting state
- select fd.nct_id, max(fd.earliest_date_observed),min(fd2.earliest_date_observed) as tmstmp
- from formatted_data fd
- join formatted_data fd2
- on fd.nct_id=fd2.nct_id and fd.earliest_date_observed < fd2.earliest_date_observed
- where fd.current_status = 'Recruiting'
- and fd2.current_status = 'Active, not recruiting'
- group by fd.nct_id
- )
- select
- fdqpe.nct_id
- --,fdqpe.start_date
- --,fdqpe.current_enrollment
- --,fdqpe.enrollment_category
- ,fdqpe.current_status
- ,fdqpe.earliest_date_observed
- ,fdqpe.elapsed_duration
- ,fdqpe.n_brands as identical_brands
- ,ntbtu.brand_name_counts
- ,fdqpe.category_id
- ,fdqpe.final_status
- ,fdqpe.h_sdi_val
- --,fdqpe.h_sdi_u95
- --,fdqpe.h_sdi_l95
- ,fdqpe.hm_sdi_val
- --,fdqpe.hm_sdi_u95
- --,fdqpe.hm_sdi_l95
- ,fdqpe.m_sdi_val
- --,fdqpe.m_sdi_u95
- --,fdqpe.m_sdi_l95
- ,fdqpe.lm_sdi_val
- --,fdqpe.lm_sdi_u95
- --,fdqpe.lm_sdi_l95
- ,fdqpe.l_sdi_val
- --,fdqpe.l_sdi_u95
- --,fdqpe.l_sdi_l95
- from formatted_data_with_planned_enrollment fdqpe
- join \"Formularies\".nct_to_brand_counts_through_uspdc ntbtu
- on fdqpe.nct_id = ntbtu.nct_id
- join cte
- on fdqpe.nct_id = cte.nct_id
- and fdqpe.earliest_date_observed = cte.tmstmp
- order by fdqpe.nct_id, fdqpe.earliest_date_observed
- ;
- "
- )
-df <- fetch(query, n = -1)
-df <- na.omit(df)
-
-query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;")
-n_categories <- fetch(query2, n = -1)
-
-return(list(data=df,ncat=n_categories))
-}
-
-
-d <- get_data(driver)
-df <- d$data
-n_categories <- d$ncat
-
-cf <- get_counterfact_base(driver)
-df_counterfact_base <- cf$data
-
-
-
-################ Format Data ###########################
-
-data_formatter <- function(df) {
-categories <- df["category_id"]
-
-x <- df["elapsed_duration"]
-x["identical_brands"] <- asinh(df$identical_brands)
-x["brand_name_counts"] <- asinh(df$brand_name_count)
-x["h_sdi_val"] <- asinh(df$h_sdi_val)
-x["hm_sdi_val"] <- asinh(df$hm_sdi_val)
-x["m_sdi_val"] <- asinh(df$m_sdi_val)
-x["lm_sdi_val"] <- asinh(df$lm_sdi_val)
-x["l_sdi_val"] <- asinh(df$l_sdi_val)
-
-
-#Setup fixed effects
-
-x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0)
-x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0)
-x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0)
-
-#set base condition
-x["status_ANR"] <- 1 #ifelse(df["current_status"]=="Active, not recruiting",1,0) #
-
-
-y <- ifelse(df["final_status"]=="Terminated",1,0)
-
-#get category list
-
-
-return(list(x=x,y=y))
-}
-
-train <- data_formatter(df)
-counterfact_base <- data_formatter(df_counterfact_base)
-
-categories <- df$category_id
-
-x <- train$x
-y <- train$y
-
-x_cf_base <- counterfact_base$x
-y_cf_base <- counterfact_base$y
-cf_categories <- df_counterfact_base$category_id
-```
-
-
-
-# Fit Model
-
-
-
-
-```{r}
-################################# FIT MODEL #########################################
-inherited_cols <- c(
- "elapsed_duration"
- #,"identical_brands"
- #,"brand_name_counts"
- ,"h_sdi_val"
- ,"hm_sdi_val"
- ,"m_sdi_val"
- ,"lm_sdi_val"
- ,"l_sdi_val"
- ,"status_NYR"# TODO: may need to remove
- ,"status_EBI"
- ,"status_Rec"
- ,"status_ANR"
-)
-```
-
-
-
-```{r}
-beta_list <- list(
- groups = c(
- `1`="Infections & Parasites",
- `2`="Neoplasms",
- `3`="Blood & Immune system",
- `4`="Endocrine, Nutritional, and Metabolic",
- `5`="Mental & Behavioral",
- `6`="Nervous System",
- `7`="Eye and Adnexa",
- `8`="Ear and Mastoid",
- `9`="Circulatory",
- `10`="Respiratory",
- `11`="Digestive",
- `12`="Skin & Subcutaneaous tissue",
- `13`="Musculoskeletal",
- `14`="Genitourinary",
- `15`="Pregancy, Childbirth, & Puerperium",
- `16`="Perinatal Period",
- `17`="Congential",
- `18`="Symptoms, Signs etc.",
- `19`="Injury etc.",
- `20`="External Causes",
- `21`="Contact with Healthcare",
- `22`="Special Purposes"
- ),
- parameters = c(
- `1`="Elapsed Duration",
- # brands
- `2`="asinh(Generic Brands)",
- `3`="asinh(Competitors USPDC)",
- # population
- `4`="asinh(High SDI)",
- `5`="asinh(High-Medium SDI)",
- `6`="asinh(Medium SDI)",
- `7`="asinh(Low-Medium SDI)",
- `8`="asinh(Low SDI)",
- #Status
- `9`="status_NYR",
- `10`="status_EBI",
- `11`="status_Rec",
- `12`="status_ANR"
- )
-)
-
-get_parameters <- function(stem,class_list) {
- #get categories and lengths
- named <- names(class_list)
- lengths <- sapply(named, (function (x) length(class_list[[x]])))
-
- #describe the grid needed
- iter_list <- sapply(named, (function (x) 1:lengths[x]))
-
- #generate the list of parameters
- pardf <- generate_parameter_df(stem, iter_list)
-
- #add columns with appropriate human-readable names
- for (name in named) {
- pardf[paste(name,"_hr",sep="")] <- as.factor(
- sapply(pardf[name], (function (i) class_list[[name]][i]))
- )
- }
-
- return(pardf)
-}
-
-generate_parameter_df <- function(stem, iter_list) {
- grid <- expand.grid(iter_list)
- grid["param_name"] <- grid %>% unite(x,colnames(grid),sep=",")
- grid["param_name"] <- paste(stem,"[",grid$param_name,"]",sep="")
- return(grid)
-}
-
-group_mcmc_areas <- function(
- stem,# = "beta"
- class_list,# = beta_list
- stanfit,# = fit
- group_id,# = 2
- rename=TRUE,
- filter=NULL
- ) {
-
- #get all parameter names
- params <- get_parameters(stem,class_list)
-
- #filter down to parameters of interest
- params <- filter(params,groups == group_id)
- #Get dataframe with only the rows of interest
- filtdata <- as.data.frame(stanfit)[params$param_name]
- #rename columns
- if (rename) dimnames(filtdata)[[2]] <- params$parameters_hr
- #get group name for title
- group_name <- class_list$groups[group_id]
- #create area plot with appropriate title
- p <- mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
- ggtitle(paste("Parameter distributions for ICD-10 class:",group_name)) +
- geom_vline(xintercept=seq(-2,2,0.5),color="grey",alpha=0.750)
-
- d <- pivot_longer(filtdata, everything()) |>
- group_by(name) |>
- summarize(
- mean=mean(value)
- ,q025 = quantile(value,probs = 0.025)
- ,q975 = quantile(value,probs = 0.975)
- ,q05 = quantile(value,probs = 0.05)
- ,q95 = quantile(value,probs = 0.95)
- )
- return(list(plot=p,quantiles=d,name=group_name))
-}
-
-parameter_mcmc_areas <- function(
- stem,# = "beta"
- class_list,# = beta_list
- stanfit,# = fit
- parameter_id,# = 2
- rename=TRUE
- ) {
- #get all parameter names
- params <- get_parameters(stem,class_list)
- #filter down to parameters of interest
- params <- filter(params,parameters == parameter_id)
- #Get dataframe with only the rows of interest
- filtdata <- as.data.frame(stanfit)[params$param_name]
- #rename columns
- if (rename) dimnames(filtdata)[[2]] <- params$groups_hr
- #get group name for title
- parameter_name <- class_list$parameters[parameter_id]
- #create area plot with appropriate title
- p <- mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
- ggtitle(parameter_name,"Parameter Distribution") +
- geom_vline(xintercept=seq(-2,2,0.5),color="grey",alpha=0.750)
-
- d <- pivot_longer(filtdata, everything()) |>
- group_by(name) |>
- summarize(
- mean=mean(value)
- ,q025 = quantile(value,probs = 0.025)
- ,q975 = quantile(value,probs = 0.975)
- ,q05 = quantile(value,probs = 0.05)
- ,q95 = quantile(value,probs = 0.95)
- )
- return(list(plot=p,quantiles=d,name=parameter_name))
-}
-
-
-```
-
-Plan: select all snapshots that are the first to have closed
-enrollment (Rec -> ANR when comparing across snapshots), and then
-```{r}
-#delay intervention
-intervention_enrollment <- x_cf_base[c(inherited_cols,"brand_name_counts", "identical_brands")]
-intervention_enrollment["status_ANR"] <- 0
-intervention_enrollment["status_Rec"] <- 1
-```
-
-```{r}
-counterfact_delay <- list(
- D = ncol(x),#
- N = nrow(x),
- L = n_categories$count,
- y = as.vector(y),
- ll = as.vector(categories),
- x = as.matrix(x),
- mu_mean = 0,
- mu_stdev = 0.05,
- sigma_shape = 4,
- sigma_rate = 20,
- Nx = nrow(x_cf_base),
- llx = as.vector(cf_categories),
- counterfact_x_tilde = as.matrix(intervention_enrollment),
- counterfact_x = as.matrix(x_cf_base),
- status_indexes = c(11,12) #subtract anr from recruiting to get movement from anr to recruiting
-)
-```
-
-```{r}
-fit <- stan(
- file='Hierarchal_Logistic.stan',
- data = counterfact_delay,
- chains = 4,
- iter = 5000,
- seed = 11021585
- )
-```
-
-
-
-## Calculate relative difference in parameters between recruiting and active not recruiting states
-
-
-
-## Explore data
-
-
-```{r}
-#get number of trials and snapshots in each category
-group_trials_by_category <- as.data.frame(aggregate(category_id ~ nct_id, df, max))
-group_trials_by_category <- as.data.frame(group_trials_by_category)
-
-category_count <- group_trials_by_category |> group_by(category_id) |> count()
-
-```
-
-
-```{r}
-################################# DATA EXPLORATION ############################
-driver <- dbDriver("PostgreSQL")
-
-con <- dbConnect(
- driver,
- user='root',
- password='root',
- dbname='aact_db',
- host=host
- )
-#Plot histogram of count of snapshots
-df3 <- dbGetQuery(
- con,
- "select nct_id,final_status,count(*) from formatted_data_with_planned_enrollment fdwpe
- group by nct_id,final_status ;"
- )
-#df3 <- fetch(query3, n = -1)
-
-ggplot(data=df3, aes(x=count, fill=final_status)) +
- geom_histogram(binwidth=1) +
- ggtitle("Histogram of snapshots per trial (matched trials)") +
- xlab("Snapshots per trial")
-ggsave(paste0(image_trial_details,"/HistSnapshots.png"))
-
-#Plot duration for terminated vs completed
-df4 <- dbGetQuery(
- con,
- "
- select
- nct_id,
- start_date ,
- primary_completion_date,
- overall_status ,
- primary_completion_date - start_date as duration
- from ctgov.studies s
- where nct_id in (select distinct nct_id from http.download_status ds)
- ;"
- )
-#df4 <- fetch(query4, n = -1)
-
-ggplot(data=df4, aes(x=duration,fill=overall_status)) +
- geom_histogram()+
- ggtitle("Histogram of trial durations") +
- xlab("duration")+
- facet_wrap(~overall_status)
-ggsave(paste0(image_trial_details,"/HistTrialDurations_Faceted.png"))
-
-df5 <- dbGetQuery(
- con,
- "
- with cte1 as (
- select
- nct_id,
- start_date ,
- primary_completion_date,
- overall_status ,
- primary_completion_date - start_date as duration
- from ctgov.studies s
- where nct_id in (select distinct nct_id from http.download_status ds)
- ), cte2 as (
- select nct_id,count(*) as snapshot_count from formatted_data_with_planned_enrollment fdwpe
- group by nct_id
- )
- select a.nct_id, a.overall_status, a.duration,b.snapshot_count
- from cte1 as a
- join cte2 as b
- on a.nct_id=b.nct_id
- ;"
- )
-df5$overall_status <- as.factor(df5$overall_status)
-
-ggplot(data=df5, aes(x=duration,y=snapshot_count,color=overall_status)) +
- geom_jitter() +
- ggtitle("Comparison of duration, status, and snapshot_count") +
- xlab("duration") +
- ylab("snapshot count")
-ggsave(paste0(image_trial_details,"/SnapshotsVsDurationVsTermination.png"))
-
-dbDisconnect(con)
-
-#get number of trials and snapshots in each category
-group_trials_by_category <- as.data.frame(aggregate(category_id ~ nct_id, df, max))
-group_trials_by_category <- as.data.frame(group_trials_by_category)
-
-ggplot(data = group_trials_by_category, aes(x=category_id)) +
- geom_bar(binwidth=1,color="black",fill="lightblue") +
- scale_x_continuous(breaks=scales::pretty_breaks(n=22)) +
- labs(
- title="bar chart of trial categories"
- ,x="Category ID"
- ,y="Count"
- )
-ggsave(paste0(image_trial_details,"/CategoryCounts.png"))
-
-
-
-summary(df5)
-
-cor(df5$duration,df5$snapshot_count)
-sum(df5$snapshot_count)
-```
-
-
-
-
-## Fit Results
-
-
-```{r}
-################################# ANALYZE #####################################
-print(fit)
-```
-
-# Parameter Distributions
-
-
-```{r}
-#g1 <- group_mcmc_areas("beta",beta_list,fit,1)
-
-
-gx <- c()
-
-#grab parameters for every category with more than 8 observations
-for (i in category_count$category_id[category_count$n >= 0]) {
- print(i)
-
- #Print parameter distributions
- gi <- group_mcmc_areas("beta",beta_list,fit,i) #add way to filter groups
- ggsave(
- paste0(image_parameters_by_groups,"/group_",i,"_",gi$name,".png")
- ,plot=gi$plot
- )
- gx <- c(gx,gi)
-
- #Get Quantiles and means for parameters
-# table <- xtable(gi$quantiles,
-# floating=FALSE
-# ,latex.environments = NULL
-# ,booktabs = TRUE
-# ,zap=getOption("digits")
-# )
-# write_lines(table,paste0("./latex_output/DirectEffects/group_",gi$name,".tex"))
-}
-```
-
-
-
-```{r}
-px <- c()
-
-
-for (i in c(1,2,3,9,10,11,12)) {
-
- #Print parameter distributions
- pi <- parameter_mcmc_areas("beta",beta_list,fit,i) #add way to filter groups
- ggsave(
- paste0(image_parameters_across_groups,"/parameters_",i,"_",pi$name,".png")
- ,plot=pi$plot
- )
- px <- c(px,pi)
-
- #Get Quantiles and means for parameters
-# table <- xtable(pi$quantiles,
-# floating=FALSE
-# ,latex.environments = NULL
-# ,booktabs = TRUE
-# ,zap=getOption("digits")
-# )
-# write_lines(table,paste0("./latex_output/DirectEffects/parameters_",i,"_",pi$name,".tex"))
-
-}
-```
-
-Note these have 95% outer CI and 80% inner (shaded)
-
-
-
-
-
-```{r}
-#get the generic and uspdc parameters
-print(px[4]$plot + px[7]$plot)
-ggsave(paste0(image_parameters_across_groups,"2+3_generic_and_uspdc.png"))
-
-#get the parameters associated with duration
-px[16]$plot + px[19]$plot
-ggsave(paste0(image_parameters_across_groups,"11+12_statusREC_and_statusANR.png"))
-```
-
-
-# Counterfactuals
-
-```{r}
-generated_ib <- gqs(
- fit@stanmodel,
- data=counterfact_delay,
- draws=as.matrix(fit),
- seed=11021585
- )
-```
-
-```{r}
-df_ib_p <- data.frame(
- p_prior=as.vector(extract(generated_ib, pars="p_prior")$p_prior)
- ,p_predicted = as.vector(extract(generated_ib, pars="p_predicted")$p_predicted)
-)
-
-df_ib_prior <- data.frame(
- mu_prior = as.vector(extract(generated_ib, pars="mu_prior")$mu_prior)
- ,sigma_prior = as.vector(extract(generated_ib, pars="sigma_prior")$sigma_prior)
-)
-
-#p_prior
-ggplot(df_ib_p, aes(x=p_prior)) +
- geom_density() +
- labs(
- title="Implied Prior Distribution P"
- ,subtitle=""
- ,x="Probability Domain 'p'"
- ,y="Probability Density"
- )
-ggsave(paste0(image_dist_diff_analysis,"/prior_p.png"))
-
-#p_posterior
-ggplot(df_ib_p, aes(x=p_predicted)) +
- geom_density() +
- labs(
- title="Implied Posterior Distribution P"
- ,subtitle=""
- ,x="Probability Domain 'p'"
- ,y="Probability Density"
- )
-ggsave(paste0(image_dist_diff_analysis,"/posterior_p.png"))
-
-#mu_prior
-ggplot(df_ib_prior) +
- geom_density(aes(x=mu_prior)) +
- labs(
- title="Prior - Mu"
- ,subtitle="same prior for all Mu values"
- ,x="Mu"
- ,y="Probability"
- )
-ggsave(paste0(image_dist_diff_analysis,"/prior_mu.png"))
-
-#sigma_posterior
-ggplot(df_ib_prior) +
- geom_density(aes(x=sigma_prior)) +
- labs(
- title="Prior - Sigma"
- ,subtitle="same prior for all Sigma values"
- ,x="Sigma"
- ,y="Probability"
- )
-ggsave(paste0(image_dist_diff_analysis,"/prior_sigma.png"))
-```
-
-
-
-```{r}
-check_hmc_diagnostics(fit)
-```
-
-
-
-
-
-### Intervention: Delay close of enrollment
-
-```{r}
-counterfact_predicted_ib <- data.frame(
- p_predicted_default = as.vector(extract(generated_ib, pars="p_predicted_default")$p_predicted_default)
- ,p_predicted_intervention = as.vector(extract(generated_ib, pars="p_predicted_intervention")$p_predicted_intervention)
- ,predicted_difference = as.vector(extract(generated_ib, pars="predicted_difference")$predicted_difference)
-)
-
-
-ggplot(counterfact_predicted_ib, aes(x=p_predicted_default)) +
- geom_density() +
- labs(
- title="Predicted Distribution of 'p'"
- ,subtitle="Intervention: None"
- ,x="Probability Domain 'p'"
- ,y="Probability Density"
- )
-ggsave(paste0(image_dist_diff_analysis,"p_no_intervention.png"))
-
-ggplot(counterfact_predicted_ib, aes(x=p_predicted_intervention)) +
- geom_density() +
- labs(
- title="Predicted Distribution of 'p'"
- ,subtitle="Intervention: Delay close of enrollment"
- ,x="Probability Domain 'p'"
- ,y="Probability Density"
- )
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention.png"))
-
-ggplot(counterfact_predicted_ib, aes(x=predicted_difference)) +
- geom_density() +
- labs(
- title="Predicted Distribution of differences 'p'"
- ,subtitle="Intervention: Delay close of enrollment"
- ,x="Difference in 'p' under treatment"
- ,y="Probability Density"
- )
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_1.png"))
-```
-
-
-```{r}
-get_category_count <- function(tbl, id) {
- result <- tbl$n[tbl$category_id == id]
- if(length(result) == 0) 0 else result
-}
-
-category_names <- sapply(1:length(beta_list$groups),
- function(i) sprintf("ICD-10 #%d: %s (n=%d)",
- i,
- beta_list$groups[i],
- get_category_count(category_count, i)))
-
-```
-
-```{r}
-pddf_ib <- data.frame(extract(generated_ib, pars="predicted_difference")$predicted_difference) |>
- pivot_longer(X1:X168) #CHANGE_NOTE: moved from X169 to X168
-
-pddf_ib["entry_idx"] <- as.numeric(gsub("\\D","",pddf_ib$name))
-pddf_ib["category"] <- sapply(pddf_ib$entry_idx, function(i) df$category_id[i])
-pddf_ib["category_name"] <- sapply(
- pddf_ib$category,
- function(i) category_names[i]
- )
-
-
-
-ggplot(pddf_ib, aes(x=value,)) +
- geom_density(adjust=1/5) +
- labs(
- title = "Distribution of predicted differences"
- ,subtitle = "Intervention: Delay close of enrollment"
- ,x = "Difference in probability due to intervention"
- ,y = "Probability Density"
- ) +
- geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed")
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_styled.png"))
-
-ggplot(pddf_ib, aes(x=value,)) +
- geom_density(adjust=1/5) +
- facet_wrap(
- ~factor(
- category_name,
- levels=category_names
- )
- , labeller = label_wrap_gen(multi_line = TRUE)
- , ncol=4) +
- labs(
- title = "Distribution of predicted differences | By Group"
- ,subtitle = "Intervention: Delay close of enrollment"
- ,x = "Difference in probability due to intervention"
- ,y = "Probability Density"
- ) +
- geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
- theme(strip.text.x = element_text(size = 8))
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_by_group.png"))
-
-ggplot(pddf_ib, aes(x=value,)) +
- geom_histogram(bins=300) +
- facet_wrap(
- ~factor(
- category_name,
- levels=category_names
- )
- , labeller = label_wrap_gen(multi_line = TRUE)
- , ncol=4) +
- labs(
- title = "Histogram of predicted differences | By Group"
- ,subtitle = "Intervention: Delay close of enrollment"
- ,x = "Difference in probability due to intervention"
- ,y = "Predicted counts"
- ) +
- geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
- theme(strip.text.x = element_text(size = 8))
-ggsave(paste0(image_dist_diff_analysis,"p_delay_intervention_histdiff_by_group.png"))
-```
-
-
-```{r}
-p3 <- ggplot(pddf_ib, aes(x=value,)) +
- geom_histogram(bins=500) +
- labs(
- title = "Distribution of predicted differences"
- ,subtitle = "Intervention: Delay close of enrollment"
- ,x = "Difference in probability due to intervention"
- ,y = "Probability Density"
- ,caption = "Vertical marks: 5/10/25/50/75/90/95th percentiles. Dot shows mean."
- ) +
- geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed")
-
-density_hight <- max(density(pddf_ib$value)$y)
-
-stats <- list(
- p5 = quantile(pddf_ib$value, 0.05),
- p10 = quantile(pddf_ib$value, 0.10),
- q1 = quantile(pddf_ib$value, 0.25),
- med = median(pddf_ib$value),
- mean = mean(pddf_ib$value),
- q3 = quantile(pddf_ib$value, 0.75),
- p90 = quantile(pddf_ib$value, 0.90),
- p95 = quantile(pddf_ib$value, 0.95),
- max_height = max(ggplot_build(p3)$data[[1]]$count),
- y_offset = -max(ggplot_build(p3)$data[[1]]$count) * 0.05,
- y_offset_density = -density_hight * 0.15
-)
-
-p3 +
- # Box
- geom_segment(data = data.frame(
- x = c(stats$q1, stats$q3, stats$med),
- xend = c(stats$q1, stats$q3, stats$med),
- y = rep(stats$y_offset, 3),
- yend = rep(stats$y_offset * 2, 3)
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- geom_segment(data = data.frame(
- x = rep(stats$q1, 2),
- xend = rep(stats$q3, 2),
- y = c(stats$y_offset, stats$y_offset * 2),
- yend = c(stats$y_offset, stats$y_offset * 2)
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Inner whiskers (Q1->P10, Q3->P90)
- geom_segment(data = data.frame(
- x = c(stats$q1, stats$q3),
- xend = c(stats$p10, stats$p90),
- y = rep(stats$y_offset * 1.5, 2),
- yend = rep(stats$y_offset * 1.5, 2)
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Crossbars at P10/P90
- geom_segment(data = data.frame(
- x = c(stats$p10, stats$p90),
- xend = c(stats$p10, stats$p90),
- y = stats$y_offset * 1.3,
- yend = stats$y_offset * 1.7
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Outer whiskers (P10->P5, P90->P95)
- geom_segment(data = data.frame(
- x = c(stats$p10, stats$p90),
- xend = c(stats$p5, stats$p95),
- y = rep(stats$y_offset * 1.5, 2),
- yend = rep(stats$y_offset * 1.5, 2)
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Crossbars at P5/P95
- geom_segment(data = data.frame(
- x = c(stats$p5, stats$p95),
- xend = c(stats$p5, stats$p95),
- y = stats$y_offset * 1.3,
- yend = stats$y_offset * 1.7
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Mean dot
- geom_point(data = data.frame(
- x = stats$mean,
- y = stats$y_offset * 1.5
- ), aes(x = x, y = y))
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_histdiff_boxplot.png"))
-
-
-p4 <- ggplot(pddf_ib, aes(x = value)) +
- geom_density() +
- labs(
- title = "Distribution of predicted differences"
- ,subtitle = "Intervention: Delay close of enrollment"
- ,x = "Difference in probability due to intervention"
- ,y = "Probability Density"
- ,caption = "Vertical marks: 5/10/25/50/75/90/95th percentiles. Dot shows mean."
- ) +
- geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
- geom_segment(data = data.frame(
- x = c(stats$q1, stats$q3, stats$med),
- xend = c(stats$q1, stats$q3, stats$med),
- y = rep(stats$y_offset_density, 3),
- yend = rep(stats$y_offset_density * 2, 3)
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- geom_segment(data = data.frame(
- x = rep(stats$q1, 2),
- xend = rep(stats$q3, 2),
- y = c(stats$y_offset_density, stats$y_offset_density * 2),
- yend = c(stats$y_offset_density, stats$y_offset_density * 2)
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Inner whiskers (Q1->P10, Q3->P90)
- geom_segment(data = data.frame(
- x = c(stats$q1, stats$q3),
- xend = c(stats$p10, stats$p90),
- y = rep(stats$y_offset_density * 1.5, 2),
- yend = rep(stats$y_offset_density * 1.5, 2)
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Crossbars at P10/P90
- geom_segment(data = data.frame(
- x = c(stats$p10, stats$p90),
- xend = c(stats$p10, stats$p90),
- y = stats$y_offset_density * 1.3,
- yend = stats$y_offset_density * 1.7
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Outer whiskers (P10->P5, P90->P95)
- geom_segment(data = data.frame(
- x = c(stats$p10, stats$p90),
- xend = c(stats$p5, stats$p95),
- y = rep(stats$y_offset_density * 1.5, 2),
- yend = rep(stats$y_offset_density * 1.5, 2)
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Crossbars at P5/P95
- geom_segment(data = data.frame(
- x = c(stats$p5, stats$p95),
- xend = c(stats$p5, stats$p95),
- y = stats$y_offset_density * 1.3,
- yend = stats$y_offset_density * 1.7
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Mean dot
- geom_point(data = data.frame(
- x = stats$mean,
- y = stats$y_offset_density * 1.5
- ), aes(x = x, y = y))
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_boxplot.png"))
-```
-
-```{r}
- ggplot(pddf_ib, aes(x=value)) +
- stat_ecdf(geom='step') +
- labs(
- title = "Cumulative distribution of predicted differences",
- subtitle = "Intervention: Delay close of enrollment",
- x = "Difference in probability of termination due to intervention",
- y = "Cumulative Proportion"
- )
-
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_cumulative_distdiff.png"))
-```
-
-
-Get the % of differences in the spike around zero
-```{r}
-# get values around and above/below spike
-width <- 0.02
-spike_band_centered_zero <- mean( pddf_ib$value >= -width/2 & pddf_ib$value <= width/2)
-above_spike_band <- mean( pddf_ib$value >= width/2)
-below_spike_band <- mean( pddf_ib$value <= -width/2)
-
-# get mass above and mass below zero
-mass_below_zero <- mean( pddf_ib$value <= 0)
-```
-Looking at the spike around zero, we find that `r spike_band_centered_zero*100`%
-of the probability mass is contained within the band from
-[`r -width*100/2`,`r width*100/2`].
-Additionally, there was `r above_spike_band*100`% of the probability above that
--- representing those with a general increase in the probability of termination --
-and `r below_spike_band*100`% of the probability mass below the band
--- representing a decrease in the probability of termination.
-
-On average, if you keep the trial open instead of closing it,
-`r mass_below_zero`% of trials will see a decrease in the probability of termination,
-but, due to the high increase in probability of termination given termination was increased,
-the mean probability of termination increases by `r stats$mean`.
-
-```{r}
-# 5%-iles
-
-summary(pddf_ib$value)
-
-# Create your quantiles
-quants <- quantile(pddf_ib$value, probs = seq(0,1,0.05), type=4)
-
-# Convert to a data frame
-quant_df <- data.frame(
- Percentile = names(quants),
- Value = quants
-)
-kable(quant_df)
-```
-
-There seems to be some trials that are highly suceptable to this enrollment delay. Specifically, there were some
-
-```{r}
-n = length(counterfact_predicted_ib$p_predicted_intervention)
-k = 100
-simulated_terminations_intervention <- mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_intervention)))
-simulated_terminations_base <-mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_default)))
-
-simulated_percentages <- (simulated_terminations_intervention - simulated_terminations_base)/k
-```
-
-The simulation above shows that this results in a percentage-point increase of about
-`r simulated_percentages * 100`.
-
-
-
-
-# Diagnostics
-
-```{r}
-#| eval: true
-#trace plots
-image_diagnostics <- paste0(image_root,"/diagnostics")
-
-plot(fit, pars=c("mu"), plotfun="trace")
-ggsave(paste0(image_diagnostics,"/trace_plot_mu.png"))
-
-
-for (i in 1:3) {
- print(
- mcmc_rank_overlay(
- fit,
- pars=c(
- paste0("mu[",4*i-3,"]"),
- paste0("mu[",4*i-2,"]"),
- paste0("mu[",4*i-1,"]"),
- paste0("mu[",4*i,"]")
- ),
- n_bins=100
- )+ legend_move("top") +
- scale_colour_ghibli_d("KikiMedium")
- )
- mu_range <- paste0(4*i-3,"-",4*i)
- filename <- paste0(image_diagnostics,"/trace_rank_plot_mu_",mu_range,".png")
- ggsave(filename)
-}
-```
-
-```{r}
-#| eval: true
-plot(fit, pars=c("sigma"), plotfun="trace")
-ggsave(paste0(image_diagnostics,"/traceplot_sigma.png"))
-
-for (i in 1:3) {
- print(
- mcmc_rank_overlay(
- fit,
- pars=c(
- paste0("sigma[",4*i-3,"]"),
- paste0("sigma[",4*i-2,"]"),
- paste0("sigma[",4*i-1,"]"),
- paste0("sigma[",4*i,"]")
- ),
- n_bins=100
- )+ legend_move("top") +
- scale_colour_ghibli_d("KikiMedium")
- )
- sigma_range <- paste0(4*i-3,"-",4*i)
- filename <- paste0(image_diagnostics,"/trace_rank_plot_sigma_",sigma_range,".png")
- ggsave(filename)
-}
-```
-
-```{r}
-#| eval: true
-#other diagnostics
-logpost <- log_posterior(fit)
-nuts_prmts <- nuts_params(fit)
-posterior <- as.array(fit)
-
-```
-
-```{r}
-#| eval: true
-color_scheme_set("darkgray")
-div_style <- parcoord_style_np(div_color = "green", div_size = 0.05, div_alpha = 0.4)
-mcmc_parcoord(posterior, regex_pars = "mu", np=nuts_prmts, np_style = div_style, alpha = 0.05)
-ggsave(paste0(image_diagnostics,"/parcoord_mu.png"))
-```
-
-```{r}
-#| eval: true
-for (i in 1:3) {
- mus = sapply(3:0, function(j) paste0("mu[",4*i-j ,"]"))
- print(
- mcmc_pairs(
- posterior,
- np = nuts_prmts,
- pars=c(
- mus,
- "lp__"
- ),
- off_diag_args = list(size = 0.75)
- )
- )
- mu_range <- paste0(4*i-3,"-",4*i)
- filename <- paste0(image_diagnostics,"/correlation_plot_mu_",mu_range,".png")
- ggsave(filename)
-}
-
-
-
-```
-
-```{r}
-#| eval: true
-mcmc_parcoord(posterior,regex_pars = "sigma", np=nuts_prmts, alpha=0.05)
-ggsave(paste0(image_diagnostics,"/parcoord_sigma.png"))
-```
-
-```{r}
-#| eval: true
-
-for (i in 1:3) {
- params = sapply(3:0, function(j) paste0("sigma[",4*i-j ,"]"))
- print(
- mcmc_pairs(
- posterior,
- np = nuts_prmts,
- pars=c(
- params,
- "lp__"
- ),
- off_diag_args = list(size = 0.75)
- )
- )
- sigma_range <- paste0(4*i-3,"-",4*i)
- filename <- paste0(image_diagnostics,"/correlation_plot_sigma_",sigma_range,".png")
- ggsave(filename)
-}
-```
-
-
-```{r}
-#| eval: true
-for (k in 1:22) {
-for (i in 1:3) {
- params = sapply(3:0, function(j) paste0("beta[",k,",",4*i-j ,"]"))
- print(
- mcmc_pairs(
- posterior,
- np = nuts_prmts,
- pars=c(
- params,
- "lp__"
- ),
- off_diag_args = list(size = 0.75)
- )
- )
-
- beta_range <- paste0("k_",k,"_i_",4*i-3,"-",4*i)
- filename <- paste0(image_diagnostics,"/correlation_plot_beta_",beta_range,".png")
- ggsave(filename)
-}}
-```
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/r-analysis/EffectsOfEnrollmentDelay_rebased.html b/r-analysis/EffectsOfEnrollmentDelay_rebased.html
deleted file mode 100644
index ff8ee80..0000000
--- a/r-analysis/EffectsOfEnrollmentDelay_rebased.html
+++ /dev/null
@@ -1,3086 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-The Effects of Recruitment status on completion of clinical trials
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-Setup
-
-
library (knitr)
-library (bayesplot)
-
-
This is bayesplot version 1.11.1
-
-
-
- Online documentation and vignettes at mc-stan.org/bayesplot
-
-
-
- bayesplot theme set to bayesplot::theme_default()
-
-
-
* Does _not_ affect other ggplot2 plots
-
-
-
* See ?bayesplot_theme_set for details on theme setting
-
-
available_mcmc (pattern = "_nuts_" )
-
-
bayesplot MCMC module:
-(matching pattern '_nuts_')
- mcmc_nuts_acceptance
- mcmc_nuts_divergence
- mcmc_nuts_energy
- mcmc_nuts_stepsize
- mcmc_nuts_treedepth
-
-
library (ggplot2)
-library (patchwork)
-library (tidyverse)
-
-
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
-✔ dplyr 1.1.4 ✔ readr 2.1.5
-✔ forcats 1.0.0 ✔ stringr 1.5.1
-✔ lubridate 1.9.4 ✔ tibble 3.2.1
-✔ purrr 1.0.2 ✔ tidyr 1.3.1
-
-
-
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
-✖ dplyr::filter() masks stats::filter()
-✖ dplyr::lag() masks stats::lag()
-ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
-
-
-
-
Loading required package: StanHeaders
-
-rstan version 2.32.6 (Stan version 2.32.2)
-
-For execution on a local, multicore CPU with excess RAM we recommend calling
-options(mc.cores = parallel::detectCores()).
-To avoid recompilation of unchanged Stan programs, we recommend calling
-rstan_options(auto_write = TRUE)
-For within-chain threading using `reduce_sum()` or `map_rect()` Stan functions,
-change `threads_per_chain` option:
-rstan_options(threads_per_chain = 1)
-
-
-Attaching package: 'rstan'
-
-The following object is masked from 'package:tidyr':
-
- extract
-
-
library (tidyr)
-library (ghibli)
-library (xtable)
-#Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
-
-#save unchanged models instead of recompiling
-rstan_options (auto_write = TRUE )
-#allow for multithreaded sampling
-options (mc.cores = parallel:: detectCores ())
-
-#test installation, shouldn't get any errors
-#example(stan_model, package = "rstan", run.dontrun = TRUE)
-
-
- image_root <- "./output/withdiff/EffectsOfEnrollmentDelay"
- image_dist_diff_analysis <- paste0 (image_root,"/dist_diff_analysis" )
- image_trial_details <- paste0 (image_root,"/trials_details" )
- image_parameters_by_groups <- paste0 (image_root,"/betas/parameters_by_group" )
- image_parameters_across_groups <- paste0 (image_root,"/betas/parameter_across_groups" )
-
-
-
################ Pull data from database ######################
-library (RPostgreSQL)
-
-
Loading required package: DBI
-
-
host <- 'aact_db-restored-2025-01-07'
-
- driver <- dbDriver ("PostgreSQL" )
-
- get_data <- function (driver) {
-
- con <- dbConnect (
- driver,
- user= 'root' ,
- password= 'root' ,
- dbname= 'aact_db' ,
- host= host
- )
-on.exit (dbDisconnect (con))
-
- query <- dbSendQuery (
- con,
-# "select * from formatted_data_with_planned_enrollment;"
-"
-select
- fdqpe.nct_id
- --,fdqpe.start_date
- --,fdqpe.current_enrollment
- --,fdqpe.enrollment_category
- ,fdqpe.current_status
- ,fdqpe.earliest_date_observed
- ,fdqpe.elapsed_duration
- ,fdqpe.n_brands as identical_brands
- ,ntbtu.brand_name_counts
- ,fdqpe.category_id
- ,fdqpe.final_status
- ,fdqpe.h_sdi_val
- --,fdqpe.h_sdi_u95
- --,fdqpe.h_sdi_l95
- ,fdqpe.hm_sdi_val
- --,fdqpe.hm_sdi_u95
- --,fdqpe.hm_sdi_l95
- ,fdqpe.m_sdi_val
- --,fdqpe.m_sdi_u95
- --,fdqpe.m_sdi_l95
- ,fdqpe.lm_sdi_val
- --,fdqpe.lm_sdi_u95
- --,fdqpe.lm_sdi_l95
- ,fdqpe.l_sdi_val
- --,fdqpe.l_sdi_u95
- --,fdqpe.l_sdi_l95
-from formatted_data_with_planned_enrollment fdqpe
- join \" Formularies \" .nct_to_brand_counts_through_uspdc ntbtu
- on fdqpe.nct_id = ntbtu.nct_id
-order by fdqpe.nct_id, fdqpe.earliest_date_observed
-;
-"
- )
- df <- fetch (query, n = - 1 )
- df <- na.omit (df)
-
- query2 <- dbSendQuery (con,"select count(*) from \" DiseaseBurden \" .icd10_categories ic where \" level \" =1;" )
- n_categories <- fetch (query2, n = - 1 )
-
-return (list (data= df,ncat= n_categories))
- }
-
-
- get_counterfact_base <- function (driver) {
-
- con <- dbConnect (
- driver,
- user= 'root' ,
- password= 'root' ,
- dbname= 'aact_db' ,
- host= host
- )
-on.exit (dbDisconnect (con))
-
- query <- dbSendQuery (
- con,
- "
- with cte as (
- --get last recruiting state
- select fd.nct_id, max(fd.earliest_date_observed),min(fd2.earliest_date_observed) as tmstmp
- from formatted_data fd
- join formatted_data fd2
- on fd.nct_id=fd2.nct_id and fd.earliest_date_observed < fd2.earliest_date_observed
- where fd.current_status = 'Recruiting'
- and fd2.current_status = 'Active, not recruiting'
- group by fd.nct_id
- )
- select
- fdqpe.nct_id
- --,fdqpe.start_date
- --,fdqpe.current_enrollment
- --,fdqpe.enrollment_category
- ,fdqpe.current_status
- ,fdqpe.earliest_date_observed
- ,fdqpe.elapsed_duration
- ,fdqpe.n_brands as identical_brands
- ,ntbtu.brand_name_counts
- ,fdqpe.category_id
- ,fdqpe.final_status
- ,fdqpe.h_sdi_val
- --,fdqpe.h_sdi_u95
- --,fdqpe.h_sdi_l95
- ,fdqpe.hm_sdi_val
- --,fdqpe.hm_sdi_u95
- --,fdqpe.hm_sdi_l95
- ,fdqpe.m_sdi_val
- --,fdqpe.m_sdi_u95
- --,fdqpe.m_sdi_l95
- ,fdqpe.lm_sdi_val
- --,fdqpe.lm_sdi_u95
- --,fdqpe.lm_sdi_l95
- ,fdqpe.l_sdi_val
- --,fdqpe.l_sdi_u95
- --,fdqpe.l_sdi_l95
- from formatted_data_with_planned_enrollment fdqpe
- join \" Formularies \" .nct_to_brand_counts_through_uspdc ntbtu
- on fdqpe.nct_id = ntbtu.nct_id
- join cte
- on fdqpe.nct_id = cte.nct_id
- and fdqpe.earliest_date_observed = cte.tmstmp
- order by fdqpe.nct_id, fdqpe.earliest_date_observed
- ;
- "
- )
- df <- fetch (query, n = - 1 )
- df <- na.omit (df)
-
- query2 <- dbSendQuery (con,"select count(*) from \" DiseaseBurden \" .icd10_categories ic where \" level \" =1;" )
- n_categories <- fetch (query2, n = - 1 )
-
-return (list (data= df,ncat= n_categories))
- }
-
-
- d <- get_data (driver)
- df <- d$ data
- n_categories <- d$ ncat
-
- cf <- get_counterfact_base (driver)
- df_counterfact_base <- cf$ data
-
-
-
-################ Format Data ###########################
-
- data_formatter <- function (df) {
- categories <- df["category_id" ]
-
- x <- df["elapsed_duration" ]
- x["identical_brands" ] <- asinh (df$ identical_brands)
- x["brand_name_counts" ] <- asinh (df$ brand_name_count)
- x["h_sdi_val" ] <- asinh (df$ h_sdi_val)
- x["hm_sdi_val" ] <- asinh (df$ hm_sdi_val)
- x["m_sdi_val" ] <- asinh (df$ m_sdi_val)
- x["lm_sdi_val" ] <- asinh (df$ lm_sdi_val)
- x["l_sdi_val" ] <- asinh (df$ l_sdi_val)
-
-
-#Setup fixed effects
-
- x["status_NYR" ] <- ifelse (df["current_status" ]== "Not yet recruiting" ,1 ,0 )
- x["status_EBI" ] <- ifelse (df["current_status" ]== "Enrolling by invitation" ,1 ,0 )
- x["status_Rec" ] <- ifelse (df["current_status" ]== "Recruiting" ,1 ,0 )
- x["status_ANR" ] <- ifelse (df["current_status" ]== "Active, not recruiting" ,1 ,0 )
-
-
- y <- ifelse (df["final_status" ]== "Terminated" ,1 ,0 )
-
-#get category list
-
-
-return (list (x= x,y= y))
- }
-
- train <- data_formatter (df)
- counterfact_base <- data_formatter (df_counterfact_base)
-
- categories <- df$ category_id
-
- x <- train$ x
- y <- train$ y
-
- x_cf_base <- counterfact_base$ x
- y_cf_base <- counterfact_base$ y
- cf_categories <- df_counterfact_base$ category_id
-
-
-
-Fit Model
-
-
################################# FIT MODEL #########################################
- inherited_cols <- c (
- "elapsed_duration"
- ,"identical_brands"
- ,"brand_name_counts"
- ,"h_sdi_val"
- ,"hm_sdi_val"
- ,"m_sdi_val"
- ,"lm_sdi_val"
- ,"l_sdi_val"
- ,"status_NYR"
- ,"status_EBI"
- ,"status_Rec"
- ,"status_ANR"
- )
-
-
-
beta_list <- list (
- groups = c (
- ` 1 ` = "Infections & Parasites" ,
- ` 2 ` = "Neoplasms" ,
- ` 3 ` = "Blood & Immune system" ,
- ` 4 ` = "Endocrine, Nutritional, and Metabolic" ,
- ` 5 ` = "Mental & Behavioral" ,
- ` 6 ` = "Nervous System" ,
- ` 7 ` = "Eye and Adnexa" ,
- ` 8 ` = "Ear and Mastoid" ,
- ` 9 ` = "Circulatory" ,
- ` 10 ` = "Respiratory" ,
- ` 11 ` = "Digestive" ,
- ` 12 ` = "Skin & Subcutaneaous tissue" ,
- ` 13 ` = "Musculoskeletal" ,
- ` 14 ` = "Genitourinary" ,
- ` 15 ` = "Pregancy, Childbirth, & Puerperium" ,
- ` 16 ` = "Perinatal Period" ,
- ` 17 ` = "Congential" ,
- ` 18 ` = "Symptoms, Signs etc." ,
- ` 19 ` = "Injury etc." ,
- ` 20 ` = "External Causes" ,
- ` 21 ` = "Contact with Healthcare" ,
- ` 22 ` = "Special Purposes"
- ),
- parameters = c (
- ` 1 ` = "Elapsed Duration" ,
- # brands
- ` 2 ` = "asinh(Generic Brands)" ,
- ` 3 ` = "asinh(Competitors USPDC)" ,
- # population
- ` 4 ` = "asinh(High SDI)" ,
- ` 5 ` = "asinh(High-Medium SDI)" ,
- ` 6 ` = "asinh(Medium SDI)" ,
- ` 7 ` = "asinh(Low-Medium SDI)" ,
- ` 8 ` = "asinh(Low SDI)" ,
- #Status
- ` 9 ` = "status_NYR" ,
- ` 10 ` = "status_EBI" ,
- ` 11 ` = "status_Rec" ,
- ` 12 ` = "status_ANR"
- )
- )
-
- get_parameters <- function (stem,class_list) {
- #get categories and lengths
- named <- names (class_list)
- lengths <- sapply (named, (function (x) length (class_list[[x]])))
-
- #describe the grid needed
- iter_list <- sapply (named, (function (x) 1 : lengths[x]))
-
- #generate the list of parameters
- pardf <- generate_parameter_df (stem, iter_list)
-
- #add columns with appropriate human-readable names
- for (name in named) {
- pardf[paste (name,"_hr" ,sep= "" )] <- as.factor (
- sapply (pardf[name], (function (i) class_list[[name]][i]))
- )
- }
-
- return (pardf)
- }
-
- generate_parameter_df <- function (stem, iter_list) {
- grid <- expand.grid (iter_list)
- grid["param_name" ] <- grid %>% unite (x,colnames (grid),sep= "," )
- grid["param_name" ] <- paste (stem,"[" ,grid$ param_name,"]" ,sep= "" )
- return (grid)
- }
-
- group_mcmc_areas <- function (
- stem,# = "beta"
- class_list,# = beta_list
- stanfit,# = fit
- group_id,# = 2
- rename= TRUE ,
- filter= NULL
- ) {
-
- #get all parameter names
- params <- get_parameters (stem,class_list)
-
- #filter down to parameters of interest
- params <- filter (params,groups == group_id)
- #Get dataframe with only the rows of interest
- filtdata <- as.data.frame (stanfit)[params$ param_name]
- #rename columns
- if (rename) dimnames (filtdata)[[2 ]] <- params$ parameters_hr
- #get group name for title
- group_name <- class_list$ groups[group_id]
- #create area plot with appropriate title
- p <- mcmc_areas (filtdata,prob = 0.8 , prob_outer = 0.95 ) +
- ggtitle (paste ("Parameter distributions for ICD-10 class:" ,group_name)) +
- geom_vline (xintercept= seq (- 2 ,2 ,0.5 ),color= "grey" ,alpha= 0.750 )
-
- d <- pivot_longer (filtdata, everything ()) |>
- group_by (name) |>
- summarize (
- mean= mean (value)
- ,q025 = quantile (value,probs = 0.025 )
- ,q975 = quantile (value,probs = 0.975 )
- ,q05 = quantile (value,probs = 0.05 )
- ,q95 = quantile (value,probs = 0.95 )
- )
- return (list (plot= p,quantiles= d,name= group_name))
- }
-
- parameter_mcmc_areas <- function (
- stem,# = "beta"
- class_list,# = beta_list
- stanfit,# = fit
- parameter_id,# = 2
- rename= TRUE
- ) {
- #get all parameter names
- params <- get_parameters (stem,class_list)
- #filter down to parameters of interest
- params <- filter (params,parameters == parameter_id)
- #Get dataframe with only the rows of interest
- filtdata <- as.data.frame (stanfit)[params$ param_name]
- #rename columns
- if (rename) dimnames (filtdata)[[2 ]] <- params$ groups_hr
- #get group name for title
- parameter_name <- class_list$ parameters[parameter_id]
- #create area plot with appropriate title
- p <- mcmc_areas (filtdata,prob = 0.8 , prob_outer = 0.95 ) +
- ggtitle (parameter_name,"Parameter Distribution" ) +
- geom_vline (xintercept= seq (- 2 ,2 ,0.5 ),color= "grey" ,alpha= 0.750 )
-
- d <- pivot_longer (filtdata, everything ()) |>
- group_by (name) |>
- summarize (
- mean= mean (value)
- ,q025 = quantile (value,probs = 0.025 )
- ,q975 = quantile (value,probs = 0.975 )
- ,q05 = quantile (value,probs = 0.05 )
- ,q95 = quantile (value,probs = 0.95 )
- )
- return (list (plot= p,quantiles= d,name= parameter_name))
- }
-
-Plan: select all snapshots that are the first to have closed enrollment (Rec -> ANR when comparing across snapshots), and then
-
-
#delay intervention
- intervention_enrollment <- x_cf_base[c (inherited_cols)]
-#TOFIX: ^^^ This ordering of columns is
- intervention_enrollment["status_ANR" ] <- 0
- intervention_enrollment["status_Rec" ] <- 1
-
-
-
counterfact_delay <- list (
- D = ncol (x),#
- N = nrow (x),
- L = n_categories$ count,
- y = as.vector (y),
- ll = as.vector (categories),
- x = as.matrix (x),
- mu_mean = 0 ,
- mu_stdev = 0.05 ,
- sigma_shape = 4 ,
- sigma_rate = 20 ,
- Nx = nrow (x_cf_base),
- llx = as.vector (cf_categories),
- counterfact_x_tilde = as.matrix (intervention_enrollment),
- counterfact_x = as.matrix (x_cf_base),
- status_indexes = c (11 ,12 ) #subtract anr from recruiting to get movement from anr to recruiting
- )
-
-
-
fit <- stan (
- file= 'Hierarchal_Logistic.stan' ,
- data = counterfact_delay,
- chains = 4 ,
- iter = 5000 ,
- seed = 11021585
- )
-
-
Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
-Running the chains for more iterations may help. See
-https://mc-stan.org/misc/warnings.html#bulk-ess
-
-
-
-Calculate relative difference in parameters between recruiting and active not recruiting states
-
-
filt_data <- as.data.frame (extract (fit,pars= "status_diff" ))
-dimnames (filt_data)[[2 ]] <- beta_list$ groups
-mcmc_areas (filt_data,prob = 0.8 , prob_outer = 0.95 ) +
- ggtitle ("Relative fixed effects across groups" , subtitle = "moving from `Active, not recruiting` to `Recruiting`" ) +
- geom_vline (xintercept= seq (- 2 ,2 ,0.5 ),color= "grey" ,alpha= 0.750 )
-
-
Warning: Removed 3 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
-
-
-
-
-
-
-I’ve got an issue here, because this should be a movement from ANR -> recruiting but that is suggesting that
-
-
-Explore data
-
-
#get number of trials and snapshots in each category
- group_trials_by_category <- as.data.frame (aggregate (category_id ~ nct_id, df, max))
- group_trials_by_category <- as.data.frame (group_trials_by_category)
-
- category_count <- group_trials_by_category |> group_by (category_id) |> count ()
-
-
-
################################# DATA EXPLORATION ############################
- driver <- dbDriver ("PostgreSQL" )
-
- con <- dbConnect (
- driver,
- user= 'root' ,
- password= 'root' ,
- dbname= 'aact_db' ,
- host= host
- )
-#Plot histogram of count of snapshots
- df3 <- dbGetQuery (
- con,
- "select nct_id,final_status,count(*) from formatted_data_with_planned_enrollment fdwpe
- group by nct_id,final_status ;"
- )
-#df3 <- fetch(query3, n = -1)
-
-ggplot (data= df3, aes (x= count, fill= final_status)) +
- geom_histogram (binwidth= 1 ) +
- ggtitle ("Histogram of snapshots per trial (matched trials)" ) +
- xlab ("Snapshots per trial" )
-
-
-
-
-
-
-
-
ggsave (paste0 (image_trial_details,"/HistSnapshots.png" ))
-
-
#Plot duration for terminated vs completed
- df4 <- dbGetQuery (
- con,
- "
- select
- nct_id,
- start_date ,
- primary_completion_date,
- overall_status ,
- primary_completion_date - start_date as duration
- from ctgov.studies s
- where nct_id in (select distinct nct_id from http.download_status ds)
- ;"
- )
-#df4 <- fetch(query4, n = -1)
-
-ggplot (data= df4, aes (x= duration,fill= overall_status)) +
- geom_histogram ()+
- ggtitle ("Histogram of trial durations" ) +
- xlab ("duration" )+
- facet_wrap (~ overall_status)
-
-
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
-
-
-
-
-
-
-
-
-
ggsave (paste0 (image_trial_details,"/HistTrialDurations_Faceted.png" ))
-
-
Saving 7 x 5 in image
-`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
-
-
df5 <- dbGetQuery (
- con,
- "
- with cte1 as (
- select
- nct_id,
- start_date ,
- primary_completion_date,
- overall_status ,
- primary_completion_date - start_date as duration
- from ctgov.studies s
- where nct_id in (select distinct nct_id from http.download_status ds)
- ), cte2 as (
- select nct_id,count(*) as snapshot_count from formatted_data_with_planned_enrollment fdwpe
- group by nct_id
- )
- select a.nct_id, a.overall_status, a.duration,b.snapshot_count
- from cte1 as a
- join cte2 as b
- on a.nct_id=b.nct_id
- ;"
- )
- df5$ overall_status <- as.factor (df5$ overall_status)
-
-ggplot (data= df5, aes (x= duration,y= snapshot_count,color= overall_status)) +
- geom_jitter () +
- ggtitle ("Comparison of duration, status, and snapshot_count" ) +
- xlab ("duration" ) +
- ylab ("snapshot count" )
-
-
-
-
-
-
-
-
ggsave (paste0 (image_trial_details,"/SnapshotsVsDurationVsTermination.png" ))
-
-
-
-
#get number of trials and snapshots in each category
- group_trials_by_category <- as.data.frame (aggregate (category_id ~ nct_id, df, max))
- group_trials_by_category <- as.data.frame (group_trials_by_category)
-
-ggplot (data = group_trials_by_category, aes (x= category_id)) +
- geom_bar (binwidth= 1 ,color= "black" ,fill= "lightblue" ) +
- scale_x_continuous (breaks= scales:: pretty_breaks (n= 22 )) +
- labs (
- title= "bar chart of trial categories"
- ,x= "Category ID"
- ,y= "Count"
- )
-
-
Warning in geom_bar(binwidth = 1, color = "black", fill = "lightblue"):
-Ignoring unknown parameters: `binwidth`
-
-
-
-
-
-
-
-
-
ggsave (paste0 (image_trial_details,"/CategoryCounts.png" ))
-
-
-
-
nct_id overall_status duration snapshot_count
- Length:162 Completed :134 Min. : 61.0 Min. : 1.000
- Class :character Terminated: 28 1st Qu.: 618.5 1st Qu.: 4.000
- Mode :character Median :1022.5 Median : 6.000
- Mean :1202.4 Mean : 8.315
- 3rd Qu.:1637.0 3rd Qu.:11.000
- Max. :3332.0 Max. :48.000
-
-
cor (df5$ duration,df5$ snapshot_count)
-
-
-
-
-
-
-Fit Results
-
-
################################# ANALYZE #####################################
-#print(fit)
-
-
-
-
-Parameter Distributions
-
-
#g1 <- group_mcmc_areas("beta",beta_list,fit,1)
-
-
- gx <- c ()
-
-#grab parameters for every category with more than 8 observations
-for (i in category_count$ category_id[category_count$ n >= 0 ]) {
- print (i)
-
- #Print parameter distributions
- gi <- group_mcmc_areas ("beta" ,beta_list,fit,i) #add way to filter groups
- ggsave (
- paste0 (image_parameters_by_groups,"/group_" ,i,"_" ,gi$ name,".png" )
- ,plot= gi$ plot
- )
- gx <- c (gx,gi)
-
- #Get Quantiles and means for parameters
-# table <- xtable(gi$quantiles,
-# floating=FALSE
-# ,latex.environments = NULL
-# ,booktabs = TRUE
-# ,zap=getOption("digits")
-# )
-# write_lines(table,paste0("./latex_output/DirectEffects/group_",gi$name,".tex"))
- }
-
-
-
-
Warning: Removed 3 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
-
-
Warning: Removed 2 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
-
-
Warning: Removed 2 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
-
-
Warning: Removed 2 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
-
-
Warning: Removed 1 row containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
-
-
Warning: Removed 2 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
-
-
Warning: Removed 3 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
-
-
Warning: Removed 3 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
-
-
Warning: Removed 3 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
-
-
Warning: Removed 2 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
-
-
Warning: Removed 1 row containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
-
-
-
-
Warning: Removed 3 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
-
-
Warning: Removed 2 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
-
-
Warning: Removed 2 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
-
px <- c ()
-
-
-for (i in c (1 ,2 ,3 ,9 ,10 ,11 ,12 )) {
-
- #Print parameter distributions
- pi <- parameter_mcmc_areas ("beta" ,beta_list,fit,i) #add way to filter groups
- ggsave (
- paste0 (image_parameters_across_groups,"/parameters_" ,i,"_" ,pi$ name,".png" )
- ,plot= pi$ plot
- )
- px <- c (px,pi)
-
- #Get Quantiles and means for parameters
-# table <- xtable(pi$quantiles,
-# floating=FALSE
-# ,latex.environments = NULL
-# ,booktabs = TRUE
-# ,zap=getOption("digits")
-# )
-# write_lines(table,paste0("./latex_output/DirectEffects/parameters_",i,"_",pi$name,".tex"))
-
- }
-
-
-
Warning: Removed 6 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
Saving 7 x 5 in image
-Saving 7 x 5 in image
-Saving 7 x 5 in image
-
-
-
Warning: Removed 3 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
-
Warning: Removed 6 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
-
Warning: Removed 5 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
-
Warning: Removed 5 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-Note these have 95% outer CI and 80% inner (shaded)
-
-
#get the generic and uspdc parameters
-print (px[4 ]$ plot + px[7 ]$ plot)
-
-
-
-
-
-
-
-
ggsave (paste0 (image_parameters_across_groups,"2+3_generic_and_uspdc.png" ))
-
-
#get the parameters associated with duration
- px[16 ]$ plot + px[19 ]$ plot
-
-
Warning: Removed 5 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-Removed 5 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
-
-
-
-
-
-
ggsave (paste0 (image_parameters_across_groups,"11+12_statusREC_and_statusANR.png" ))
-
-
-
Warning: Removed 5 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-Removed 5 rows containing missing values or values outside the scale range
-(`geom_vline()`).
-
-
-
-
-Counterfactuals
-
-
generated_ib <- gqs (
- fit@ stanmodel,
- data= counterfact_delay,
- draws= as.matrix (fit),
- seed= 11021585
- )
-
-
-Get priors
-
-
df_ib_p <- data.frame (
- p_prior= as.vector (extract (generated_ib, pars= "p_prior" )$ p_prior)
- ,p_predicted = as.vector (extract (generated_ib, pars= "p_predicted" )$ p_predicted)
- )
-
- df_ib_prior <- data.frame (
- mu_prior = as.vector (extract (generated_ib, pars= "mu_prior" )$ mu_prior)
- ,sigma_prior = as.vector (extract (generated_ib, pars= "sigma_prior" )$ sigma_prior)
- )
-
-#p_prior
-ggplot (df_ib_p, aes (x= p_prior)) +
- geom_density () +
- labs (
- title= "implied prior distribution p"
- ,subtitle= ""
- ,x= "probability domain 'p'"
- ,y= "probability density"
- )
-
-
-
-
-
-
-
-
ggsave (paste0 (image_dist_diff_analysis,"/prior_p.png" ))
-
-
#p_posterior
-ggplot (df_ib_p, aes (x= p_predicted)) +
- geom_density () +
- labs (
- title= "implied posterior distribution p"
- ,subtitle= ""
- ,x= "probability domain 'p'"
- ,y= "Probability Density"
- )
-
-
-
-
-
-
-
-
ggsave (paste0 (image_dist_diff_analysis,"/posterior_p.png" ))
-
-
#mu_prior
-ggplot (df_ib_prior) +
- geom_density (aes (x= mu_prior)) +
- labs (
- title= "Prior - Mu"
- ,subtitle= "same prior for all Mu values"
- ,x= "Mu"
- ,y= "Probability"
- )
-
-
-
-
-
-
-
-
ggsave (paste0 (image_dist_diff_analysis,"/prior_mu.png" ))
-
-
#sigma_posterior
-ggplot (df_ib_prior) +
- geom_density (aes (x= sigma_prior)) +
- labs (
- title= "Prior - Sigma"
- ,subtitle= "same prior for all Sigma values"
- ,x= "Sigma"
- ,y= "Probability"
- )
-
-
-
-
-
-
-
-
ggsave (paste0 (image_dist_diff_analysis,"/prior_sigma.png" ))
-
-
-
-
check_hmc_diagnostics (fit)
-
-
-
0 of 10000 iterations ended with a divergence.
-
-
-
-
0 of 10000 iterations saturated the maximum tree depth of 10.
-
-
-
-
E-BFMI indicated no pathological behavior.
-
-
-
-
-Intervention: Delay close of enrollment
-
-
counterfact_predicted_ib <- data.frame (
- p_predicted_default = as.vector (extract (generated_ib, pars= "p_predicted_default" )$ p_predicted_default)
- ,p_predicted_intervention = as.vector (extract (generated_ib, pars= "p_predicted_intervention" )$ p_predicted_intervention)
- ,predicted_difference = as.vector (extract (generated_ib, pars= "predicted_difference" )$ predicted_difference)
- )
-
-
-ggplot (counterfact_predicted_ib, aes (x= p_predicted_default)) +
- geom_density () +
- labs (
- title= "Predicted Distribution of 'p'"
- ,subtitle= "Intervention: None"
- ,x= "Probability Domain 'p'"
- ,y= "Probability Density"
- )
-
-
-
-
-
-
-
-
ggsave (paste0 (image_dist_diff_analysis,"p_no_intervention.png" ))
-
-
ggplot (counterfact_predicted_ib, aes (x= p_predicted_intervention)) +
- geom_density () +
- labs (
- title= "Predicted Distribution of 'p'"
- ,subtitle= "Intervention: Delay close of enrollment"
- ,x= "Probability Domain 'p'"
- ,y= "Probability Density"
- )
-
-
-
-
-
-
-
-
ggsave (paste0 (image_dist_diff_analysis,"/p_delay_intervention.png" ))
-
-
ggplot (counterfact_predicted_ib, aes (x= predicted_difference)) +
- geom_density () +
- labs (
- title= "Predicted Distribution of differences 'p'"
- ,subtitle= "Intervention: Delay close of enrollment"
- ,x= "Difference in 'p' under treatment"
- ,y= "Probability Density"
- )
-
-
-
-
-
-
-
-
ggsave (paste0 (image_dist_diff_analysis,"/p_delay_intervention_distdiff_1.png" ))
-
-
-
-
get_category_count <- function (tbl, id) {
- result <- tbl$ n[tbl$ category_id == id]
- if (length (result) == 0 ) 0 else result
- }
-
- category_names <- sapply (1 : length (beta_list$ groups),
- function (i) sprintf ("ICD-10 #%d: %s (n=%d)" ,
- i,
- beta_list$ groups[i],
- get_category_count (category_count, i)))
-
-
-
pddf_ib <- data.frame (extract (generated_ib, pars= "predicted_difference" )$ predicted_difference) |>
- pivot_longer (X1: X168) #CHANGE_NOTE: moved from X169 to X168
-
- pddf_ib["entry_idx" ] <- as.numeric (gsub (" \\ D" ,"" ,pddf_ib$ name))
- pddf_ib["category" ] <- sapply (pddf_ib$ entry_idx, function (i) df$ category_id[i])
- pddf_ib["category_name" ] <- sapply (
- pddf_ib$ category,
- function (i) category_names[i]
- )
-
-
-
-ggplot (pddf_ib, aes (x= value,)) +
- geom_density (adjust= 1 / 5 ) +
- labs (
- title = "Distribution of predicted differences"
- ,subtitle = "Intervention: Delay close of enrollment"
- ,x = "Difference in probability due to intervention"
- ,y = "Probability Density"
- ) +
- geom_vline (aes (xintercept = 0 ), color = "skyblue" , linetype= "dashed" )
-
-
-
-
-
-
-
-
ggsave (paste0 (image_dist_diff_analysis,"/p_delay_intervention_distdiff_styled.png" ))
-
-
ggplot (pddf_ib, aes (x= value,)) +
- geom_density (adjust= 1 / 5 ) +
- facet_wrap (
- ~ factor (
- category_name,
- levels= category_names
- )
- , labeller = label_wrap_gen (multi_line = TRUE )
- , ncol= 4 ) +
- labs (
- title = "Distribution of predicted differences | By Group"
- ,subtitle = "Intervention: Delay close of enrollment"
- ,x = "Difference in probability due to intervention"
- ,y = "Probability Density"
- ) +
- geom_vline (aes (xintercept = 0 ), color = "skyblue" , linetype= "dashed" ) +
- theme (strip.text.x = element_text (size = 8 ))
-
-
-
-
-
-
-
-
ggsave (paste0 (image_dist_diff_analysis,"/p_delay_intervention_distdiff_by_group.png" ))
-
-
ggplot (pddf_ib, aes (x= value,)) +
- geom_histogram (bins= 300 ) +
- facet_wrap (
- ~ factor (
- category_name,
- levels= category_names
- )
- , labeller = label_wrap_gen (multi_line = TRUE )
- , ncol= 4 ) +
- labs (
- title = "Histogram of predicted differences | By Group"
- ,subtitle = "Intervention: Delay close of enrollment"
- ,x = "Difference in probability due to intervention"
- ,y = "Predicted counts"
- ) +
- geom_vline (aes (xintercept = 0 ), color = "skyblue" , linetype= "dashed" ) +
- theme (strip.text.x = element_text (size = 8 ))
-
-
-
-
-
-
-
-
ggsave (paste0 (image_dist_diff_analysis,"p_delay_intervention_histdiff_by_group.png" ))
-
-
-
-
p3 <- ggplot (pddf_ib, aes (x= value,)) +
- geom_histogram (bins= 500 ) +
- labs (
- title = "Distribution of predicted differences"
- ,subtitle = "Intervention: Delay close of enrollment"
- ,x = "Difference in probability due to intervention"
- ,y = "Probability Density"
- ,caption = "Vertical marks: 5/10/25/50/75/90/95th percentiles. Dot shows mean."
- ) +
- geom_vline (aes (xintercept = 0 ), color = "skyblue" , linetype= "dashed" )
-
- density_hight <- max (density (pddf_ib$ value)$ y)
-
- stats <- list (
- p5 = quantile (pddf_ib$ value, 0.05 ),
- p10 = quantile (pddf_ib$ value, 0.10 ),
- q1 = quantile (pddf_ib$ value, 0.25 ),
- med = median (pddf_ib$ value),
- mean = mean (pddf_ib$ value),
- q3 = quantile (pddf_ib$ value, 0.75 ),
- p90 = quantile (pddf_ib$ value, 0.90 ),
- p95 = quantile (pddf_ib$ value, 0.95 ),
- max_height = max (ggplot_build (p3)$ data[[1 ]]$ count),
- y_offset = - max (ggplot_build (p3)$ data[[1 ]]$ count) * 0.05 ,
- y_offset_density = - density_hight * 0.15
- )
-
- p3 +
- # Box
- geom_segment (data = data.frame (
- x = c (stats$ q1, stats$ q3, stats$ med),
- xend = c (stats$ q1, stats$ q3, stats$ med),
- y = rep (stats$ y_offset, 3 ),
- yend = rep (stats$ y_offset * 2 , 3 )
- ), aes (x = x, xend = xend, y = y, yend = yend)) +
- geom_segment (data = data.frame (
- x = rep (stats$ q1, 2 ),
- xend = rep (stats$ q3, 2 ),
- y = c (stats$ y_offset, stats$ y_offset * 2 ),
- yend = c (stats$ y_offset, stats$ y_offset * 2 )
- ), aes (x = x, xend = xend, y = y, yend = yend)) +
- # Inner whiskers (Q1->P10, Q3->P90)
- geom_segment (data = data.frame (
- x = c (stats$ q1, stats$ q3),
- xend = c (stats$ p10, stats$ p90),
- y = rep (stats$ y_offset * 1.5 , 2 ),
- yend = rep (stats$ y_offset * 1.5 , 2 )
- ), aes (x = x, xend = xend, y = y, yend = yend)) +
- # Crossbars at P10/P90
- geom_segment (data = data.frame (
- x = c (stats$ p10, stats$ p90),
- xend = c (stats$ p10, stats$ p90),
- y = stats$ y_offset * 1.3 ,
- yend = stats$ y_offset * 1.7
- ), aes (x = x, xend = xend, y = y, yend = yend)) +
- # Outer whiskers (P10->P5, P90->P95)
- geom_segment (data = data.frame (
- x = c (stats$ p10, stats$ p90),
- xend = c (stats$ p5, stats$ p95),
- y = rep (stats$ y_offset * 1.5 , 2 ),
- yend = rep (stats$ y_offset * 1.5 , 2 )
- ), aes (x = x, xend = xend, y = y, yend = yend)) +
- # Crossbars at P5/P95
- geom_segment (data = data.frame (
- x = c (stats$ p5, stats$ p95),
- xend = c (stats$ p5, stats$ p95),
- y = stats$ y_offset * 1.3 ,
- yend = stats$ y_offset * 1.7
- ), aes (x = x, xend = xend, y = y, yend = yend)) +
- # Mean dot
- geom_point (data = data.frame (
- x = stats$ mean,
- y = stats$ y_offset * 1.5
- ), aes (x = x, y = y))
-
-
-
-
-
-
-
-
ggsave (paste0 (image_dist_diff_analysis,"/p_delay_intervention_histdiff_boxplot.png" ))
-
-
p4 <- ggplot (pddf_ib, aes (x = value)) +
- geom_density () +
- labs (
- title = "Distribution of predicted differences"
- ,subtitle = "Intervention: Delay close of enrollment"
- ,x = "Difference in probability due to intervention"
- ,y = "Probability Density"
- ,caption = "Vertical marks: 5/10/25/50/75/90/95th percentiles. Dot shows mean."
- ) +
- geom_vline (aes (xintercept = 0 ), color = "skyblue" , linetype= "dashed" ) +
- geom_segment (data = data.frame (
- x = c (stats$ q1, stats$ q3, stats$ med),
- xend = c (stats$ q1, stats$ q3, stats$ med),
- y = rep (stats$ y_offset_density, 3 ),
- yend = rep (stats$ y_offset_density * 2 , 3 )
- ), aes (x = x, xend = xend, y = y, yend = yend)) +
- geom_segment (data = data.frame (
- x = rep (stats$ q1, 2 ),
- xend = rep (stats$ q3, 2 ),
- y = c (stats$ y_offset_density, stats$ y_offset_density * 2 ),
- yend = c (stats$ y_offset_density, stats$ y_offset_density * 2 )
- ), aes (x = x, xend = xend, y = y, yend = yend)) +
- # Inner whiskers (Q1->P10, Q3->P90)
- geom_segment (data = data.frame (
- x = c (stats$ q1, stats$ q3),
- xend = c (stats$ p10, stats$ p90),
- y = rep (stats$ y_offset_density * 1.5 , 2 ),
- yend = rep (stats$ y_offset_density * 1.5 , 2 )
- ), aes (x = x, xend = xend, y = y, yend = yend)) +
- # Crossbars at P10/P90
- geom_segment (data = data.frame (
- x = c (stats$ p10, stats$ p90),
- xend = c (stats$ p10, stats$ p90),
- y = stats$ y_offset_density * 1.3 ,
- yend = stats$ y_offset_density * 1.7
- ), aes (x = x, xend = xend, y = y, yend = yend)) +
- # Outer whiskers (P10->P5, P90->P95)
- geom_segment (data = data.frame (
- x = c (stats$ p10, stats$ p90),
- xend = c (stats$ p5, stats$ p95),
- y = rep (stats$ y_offset_density * 1.5 , 2 ),
- yend = rep (stats$ y_offset_density * 1.5 , 2 )
- ), aes (x = x, xend = xend, y = y, yend = yend)) +
- # Crossbars at P5/P95
- geom_segment (data = data.frame (
- x = c (stats$ p5, stats$ p95),
- xend = c (stats$ p5, stats$ p95),
- y = stats$ y_offset_density * 1.3 ,
- yend = stats$ y_offset_density * 1.7
- ), aes (x = x, xend = xend, y = y, yend = yend)) +
- # Mean dot
- geom_point (data = data.frame (
- x = stats$ mean,
- y = stats$ y_offset_density * 1.5
- ), aes (x = x, y = y))
-ggsave (paste0 (image_dist_diff_analysis,"/p_delay_intervention_distdiff_boxplot.png" ))
-
-
-
-
ggplot (pddf_ib, aes (x= value)) +
- stat_ecdf (geom= 'step' ) +
- labs (
- title = "Cumulative distribution of predicted differences" ,
- subtitle = "Intervention: Delay close of enrollment" ,
- x = "Difference in probability of termination due to intervention" ,
- y = "Cumulative Proportion"
- )
-
-
-
-
-
-
-
-
ggsave (paste0 (image_dist_diff_analysis,"/p_delay_intervention_cumulative_distdiff.png" ))
-
-
-Get the % of differences in the spike around zero
-
-
# get values around and above/below spike
- width <- 0.02
- spike_band_centered_zero <- mean ( pddf_ib$ value >= - width/ 2 & pddf_ib$ value <= width/ 2 )
- above_spike_band <- mean ( pddf_ib$ value >= width/ 2 )
- below_spike_band <- mean ( pddf_ib$ value <= - width/ 2 )
-
-# get mass above and mass below zero
- mass_below_zero <- mean ( pddf_ib$ value <= 0 )
-
-Looking at the spike around zero, we find that 39.193869% of the probability mass is contained within the band from [-1,1]. Additionally, there was 51.937619% of the probability above that – representing those with a general increase in the probability of termination – and 8.8685119% of the probability mass below the band – representing a decrease in the probability of termination.
-On average, if you keep the trial open instead of closing it, 0.2488518% of trials will see a decrease in the probability of termination, but, due to the high increase in probability of termination given termination was increased, the mean probability of termination increases by 0.0249444.
-
-
# 5%-iles
-
-summary (pddf_ib$ value)
-
-
Min. 1st Qu. Median Mean 3rd Qu. Max.
--0.42755 0.00000 0.01164 0.02494 0.04213 0.54813
-
-
# Create your quantiles
- quants <- quantile (pddf_ib$ value, probs = seq (0 ,1 ,0.05 ), type= 4 )
-
-# Convert to a data frame
- quant_df <- data.frame (
- Percentile = names (quants),
- Value = quants
- )
-kable (quant_df)
-
-
-
-
-
-
-
-0%
-0%
--0.4275511
-
-
-5%
-5%
--0.0213008
-
-
-10%
-10%
--0.0079768
-
-
-15%
-15%
--0.0021932
-
-
-20%
-20%
--0.0001065
-
-
-25%
-25%
-0.0000000
-
-
-30%
-30%
-0.0000535
-
-
-35%
-35%
-0.0012179
-
-
-40%
-40%
-0.0040158
-
-
-45%
-45%
-0.0075646
-
-
-50%
-50%
-0.0116358
-
-
-55%
-55%
-0.0162235
-
-
-60%
-60%
-0.0214039
-
-
-65%
-65%
-0.0272582
-
-
-70%
-70%
-0.0340562
-
-
-75%
-75%
-0.0421259
-
-
-80%
-80%
-0.0519438
-
-
-85%
-85%
-0.0645430
-
-
-90%
-90%
-0.0819636
-
-
-95%
-95%
-0.1102529
-
-
-100%
-100%
-0.5481257
-
-
-
-
-
-There seems to be some trials that are highly suceptable to this enrollment delay. Specifically, there were some
-
-
n = length (counterfact_predicted_ib$ p_predicted_intervention)
- k = 100
- simulated_terminations_intervention <- mean (rbinom (n,k,as.vector (counterfact_predicted_ib$ p_predicted_intervention)))
- simulated_terminations_base <- mean (rbinom (n,k,as.vector (counterfact_predicted_ib$ p_predicted_default)))
-
- simulated_percentages <- (simulated_terminations_intervention - simulated_terminations_base)/ k
-
-The simulation above shows that this results in a percentage-point increase of about 2.4922006.
-
-
-
-Diagnostics
-
-
#trace plots
- image_diagnostics <- paste0 (image_root,"/diagnostics" )
-
-plot (fit, pars= c ("mu" ), plotfun= "trace" )
-
-
-
-
-
-
-
-
ggsave (paste0 (image_diagnostics,"/trace_plot_mu.png" ))
-
-
for (i in 1 : 3 ) {
- print (
- mcmc_rank_overlay (
- fit,
- pars= c (
- paste0 ("mu[" ,4 * i-3 ,"]" ),
- paste0 ("mu[" ,4 * i-2 ,"]" ),
- paste0 ("mu[" ,4 * i-1 ,"]" ),
- paste0 ("mu[" ,4 * i,"]" )
- ),
- n_bins= 100
- )+ legend_move ("top" ) +
- scale_colour_ghibli_d ("KikiMedium" )
- )
- mu_range <- paste0 (4 * i-3 ,"-" ,4 * i)
- filename <- paste0 (image_diagnostics,"/trace_rank_plot_mu_" ,mu_range,".png" )
- ggsave (filename)
- }
-
-
Scale for colour is already present.
-Adding another scale for colour, which will replace the existing scale.
-
-
-
-
-
-
-
-
-
-
Saving 7 x 5 in image
-Scale for colour is already present.
-Adding another scale for colour, which will replace the existing scale.
-
-
-
-
-
-
-
-
-
-
Saving 7 x 5 in image
-Scale for colour is already present.
-Adding another scale for colour, which will replace the existing scale.
-
-
-
-
-
-
-
-
-
-
-
-
plot (fit, pars= c ("sigma" ), plotfun= "trace" )
-
-
-
-
-
-
-
-
ggsave (paste0 (image_diagnostics,"/traceplot_sigma.png" ))
-
-
for (i in 1 : 3 ) {
- print (
- mcmc_rank_overlay (
- fit,
- pars= c (
- paste0 ("sigma[" ,4 * i-3 ,"]" ),
- paste0 ("sigma[" ,4 * i-2 ,"]" ),
- paste0 ("sigma[" ,4 * i-1 ,"]" ),
- paste0 ("sigma[" ,4 * i,"]" )
- ),
- n_bins= 100
- )+ legend_move ("top" ) +
- scale_colour_ghibli_d ("KikiMedium" )
- )
- sigma_range <- paste0 (4 * i-3 ,"-" ,4 * i)
- filename <- paste0 (image_diagnostics,"/trace_rank_plot_sigma_" ,sigma_range,".png" )
- ggsave (filename)
- }
-
-
Scale for colour is already present.
-Adding another scale for colour, which will replace the existing scale.
-
-
-
-
-
-
-
-
-
-
Saving 7 x 5 in image
-Scale for colour is already present.
-Adding another scale for colour, which will replace the existing scale.
-
-
-
-
-
-
-
-
-
-
Saving 7 x 5 in image
-Scale for colour is already present.
-Adding another scale for colour, which will replace the existing scale.
-
-
-
-
-
-
-
-
-
-
-
-
#other diagnostics
- logpost <- log_posterior (fit)
- nuts_prmts <- nuts_params (fit)
- posterior <- as.array (fit)
-
-
-
color_scheme_set ("darkgray" )
- div_style <- parcoord_style_np (div_color = "green" , div_size = 0.05 , div_alpha = 0.4 )
-mcmc_parcoord (posterior, regex_pars = "mu" , np= nuts_prmts, np_style = div_style, alpha = 0.05 )
-
-
-
-
-
-
-
-
ggsave (paste0 (image_diagnostics,"/parcoord_mu.png" ))
-
-
-
-
for (i in 1 : 3 ) {
- mus = sapply (3 : 0 , function (j) paste0 ("mu[" ,4 * i- j ,"]" ))
- print (
- mcmc_pairs (
- posterior,
- np = nuts_prmts,
- pars= c (
- mus,
- "lp__"
- ),
- off_diag_args = list (size = 0.75 )
- )
- )
- mu_range <- paste0 (4 * i-3 ,"-" ,4 * i)
- filename <- paste0 (image_diagnostics,"/correlation_plot_mu_" ,mu_range,".png" )
- ggsave (filename)
- }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
mcmc_parcoord (posterior,regex_pars = "sigma" , np= nuts_prmts, alpha= 0.05 )
-
-
-
-
-
-
-
-
ggsave (paste0 (image_diagnostics,"/parcoord_sigma.png" ))
-
-
-
-
for (i in 1 : 3 ) {
- params = sapply (3 : 0 , function (j) paste0 ("sigma[" ,4 * i- j ,"]" ))
- print (
- mcmc_pairs (
- posterior,
- np = nuts_prmts,
- pars= c (
- params,
- "lp__"
- ),
- off_diag_args = list (size = 0.75 )
- )
- )
- sigma_range <- paste0 (4 * i-3 ,"-" ,4 * i)
- filename <- paste0 (image_diagnostics,"/correlation_plot_sigma_" ,sigma_range,".png" )
- ggsave (filename)
- }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
for (k in 1 : 22 ) {
-for (i in 1 : 3 ) {
- params = sapply (3 : 0 , function (j) paste0 ("beta[" ,k,"," ,4 * i- j ,"]" ))
- print (
- mcmc_pairs (
- posterior,
- np = nuts_prmts,
- pars= c (
- params,
- "lp__"
- ),
- off_diag_args = list (size = 0.75 )
- )
- )
-
- beta_range <- paste0 ("k_" ,k,"_i_" ,4 * i-3 ,"-" ,4 * i)
- filename <- paste0 (image_diagnostics,"/correlation_plot_beta_" ,beta_range,".png" )
- ggsave (filename)
- }}
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/r-analysis/EffectsOfEnrollmentDelay_rebased.qmd b/r-analysis/EffectsOfEnrollmentDelay_rebased.qmd
deleted file mode 100644
index 46b6f20..0000000
--- a/r-analysis/EffectsOfEnrollmentDelay_rebased.qmd
+++ /dev/null
@@ -1,1224 +0,0 @@
----
-title: "The Effects of Recruitment status on completion of clinical trials"
-author: "Will King"
-format: html
-editor: source
----
-
-
-# Setup
-
-```{r}
-library(knitr)
-library(bayesplot)
-available_mcmc(pattern = "_nuts_")
-library(ggplot2)
-library(patchwork)
-library(tidyverse)
-library(rstan)
-library(tidyr)
-library(ghibli)
-library(xtable)
-#Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
-
-#save unchanged models instead of recompiling
-rstan_options(auto_write = TRUE)
-#allow for multithreaded sampling
-options(mc.cores = parallel::detectCores())
-
-#test installation, shouldn't get any errors
-#example(stan_model, package = "rstan", run.dontrun = TRUE)
-
-
-image_root <- "./output/withdiff/EffectsOfEnrollmentDelay"
-image_dist_diff_analysis <- paste0(image_root,"/dist_diff_analysis")
-image_trial_details <-paste0(image_root,"/trials_details")
-image_parameters_by_groups <-paste0(image_root,"/betas/parameters_by_group")
-image_parameters_across_groups <-paste0(image_root,"/betas/parameter_across_groups")
-
-```
-
-```{r}
-################ Pull data from database ######################
-library(RPostgreSQL)
-host <- 'aact_db-restored-2025-01-07'
-
-driver <- dbDriver("PostgreSQL")
-
-get_data <- function(driver) {
-
-con <- dbConnect(
- driver,
- user='root',
- password='root',
- dbname='aact_db',
- host=host
- )
-on.exit(dbDisconnect(con))
-
-query <- dbSendQuery(
- con,
-# "select * from formatted_data_with_planned_enrollment;"
-"
-select
- fdqpe.nct_id
- --,fdqpe.start_date
- --,fdqpe.current_enrollment
- --,fdqpe.enrollment_category
- ,fdqpe.current_status
- ,fdqpe.earliest_date_observed
- ,fdqpe.elapsed_duration
- ,fdqpe.n_brands as identical_brands
- ,ntbtu.brand_name_counts
- ,fdqpe.category_id
- ,fdqpe.final_status
- ,fdqpe.h_sdi_val
- --,fdqpe.h_sdi_u95
- --,fdqpe.h_sdi_l95
- ,fdqpe.hm_sdi_val
- --,fdqpe.hm_sdi_u95
- --,fdqpe.hm_sdi_l95
- ,fdqpe.m_sdi_val
- --,fdqpe.m_sdi_u95
- --,fdqpe.m_sdi_l95
- ,fdqpe.lm_sdi_val
- --,fdqpe.lm_sdi_u95
- --,fdqpe.lm_sdi_l95
- ,fdqpe.l_sdi_val
- --,fdqpe.l_sdi_u95
- --,fdqpe.l_sdi_l95
-from formatted_data_with_planned_enrollment fdqpe
- join \"Formularies\".nct_to_brand_counts_through_uspdc ntbtu
- on fdqpe.nct_id = ntbtu.nct_id
-order by fdqpe.nct_id, fdqpe.earliest_date_observed
-;
-"
- )
-df <- fetch(query, n = -1)
-df <- na.omit(df)
-
-query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;")
-n_categories <- fetch(query2, n = -1)
-
-return(list(data=df,ncat=n_categories))
-}
-
-
-get_counterfact_base <- function(driver) {
-
-con <- dbConnect(
- driver,
- user='root',
- password='root',
- dbname='aact_db',
- host=host
- )
-on.exit(dbDisconnect(con))
-
-query <- dbSendQuery(
- con,
- "
- with cte as (
- --get last recruiting state
- select fd.nct_id, max(fd.earliest_date_observed),min(fd2.earliest_date_observed) as tmstmp
- from formatted_data fd
- join formatted_data fd2
- on fd.nct_id=fd2.nct_id and fd.earliest_date_observed < fd2.earliest_date_observed
- where fd.current_status = 'Recruiting'
- and fd2.current_status = 'Active, not recruiting'
- group by fd.nct_id
- )
- select
- fdqpe.nct_id
- --,fdqpe.start_date
- --,fdqpe.current_enrollment
- --,fdqpe.enrollment_category
- ,fdqpe.current_status
- ,fdqpe.earliest_date_observed
- ,fdqpe.elapsed_duration
- ,fdqpe.n_brands as identical_brands
- ,ntbtu.brand_name_counts
- ,fdqpe.category_id
- ,fdqpe.final_status
- ,fdqpe.h_sdi_val
- --,fdqpe.h_sdi_u95
- --,fdqpe.h_sdi_l95
- ,fdqpe.hm_sdi_val
- --,fdqpe.hm_sdi_u95
- --,fdqpe.hm_sdi_l95
- ,fdqpe.m_sdi_val
- --,fdqpe.m_sdi_u95
- --,fdqpe.m_sdi_l95
- ,fdqpe.lm_sdi_val
- --,fdqpe.lm_sdi_u95
- --,fdqpe.lm_sdi_l95
- ,fdqpe.l_sdi_val
- --,fdqpe.l_sdi_u95
- --,fdqpe.l_sdi_l95
- from formatted_data_with_planned_enrollment fdqpe
- join \"Formularies\".nct_to_brand_counts_through_uspdc ntbtu
- on fdqpe.nct_id = ntbtu.nct_id
- join cte
- on fdqpe.nct_id = cte.nct_id
- and fdqpe.earliest_date_observed = cte.tmstmp
- order by fdqpe.nct_id, fdqpe.earliest_date_observed
- ;
- "
- )
-df <- fetch(query, n = -1)
-df <- na.omit(df)
-
-query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;")
-n_categories <- fetch(query2, n = -1)
-
-return(list(data=df,ncat=n_categories))
-}
-
-
-d <- get_data(driver)
-df <- d$data
-n_categories <- d$ncat
-
-cf <- get_counterfact_base(driver)
-df_counterfact_base <- cf$data
-
-
-
-################ Format Data ###########################
-
-data_formatter <- function(df) {
-categories <- df["category_id"]
-
-x <- df["elapsed_duration"]
-x["identical_brands"] <- asinh(df$identical_brands)
-x["brand_name_counts"] <- asinh(df$brand_name_count)
-x["h_sdi_val"] <- asinh(df$h_sdi_val)
-x["hm_sdi_val"] <- asinh(df$hm_sdi_val)
-x["m_sdi_val"] <- asinh(df$m_sdi_val)
-x["lm_sdi_val"] <- asinh(df$lm_sdi_val)
-x["l_sdi_val"] <- asinh(df$l_sdi_val)
-
-
-#Setup fixed effects
-
-x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0)
-x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0)
-x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0)
-x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0)
-
-
-y <- ifelse(df["final_status"]=="Terminated",1,0)
-
-#get category list
-
-
-return(list(x=x,y=y))
-}
-
-train <- data_formatter(df)
-counterfact_base <- data_formatter(df_counterfact_base)
-
-categories <- df$category_id
-
-x <- train$x
-y <- train$y
-
-x_cf_base <- counterfact_base$x
-y_cf_base <- counterfact_base$y
-cf_categories <- df_counterfact_base$category_id
-```
-
-
-
-# Fit Model
-
-
-
-
-```{r}
-################################# FIT MODEL #########################################
-inherited_cols <- c(
- "elapsed_duration"
- ,"identical_brands"
- ,"brand_name_counts"
- ,"h_sdi_val"
- ,"hm_sdi_val"
- ,"m_sdi_val"
- ,"lm_sdi_val"
- ,"l_sdi_val"
- ,"status_NYR"
- ,"status_EBI"
- ,"status_Rec"
- ,"status_ANR"
-)
-```
-
-
-
-```{r}
-beta_list <- list(
- groups = c(
- `1`="Infections & Parasites",
- `2`="Neoplasms",
- `3`="Blood & Immune system",
- `4`="Endocrine, Nutritional, and Metabolic",
- `5`="Mental & Behavioral",
- `6`="Nervous System",
- `7`="Eye and Adnexa",
- `8`="Ear and Mastoid",
- `9`="Circulatory",
- `10`="Respiratory",
- `11`="Digestive",
- `12`="Skin & Subcutaneaous tissue",
- `13`="Musculoskeletal",
- `14`="Genitourinary",
- `15`="Pregancy, Childbirth, & Puerperium",
- `16`="Perinatal Period",
- `17`="Congential",
- `18`="Symptoms, Signs etc.",
- `19`="Injury etc.",
- `20`="External Causes",
- `21`="Contact with Healthcare",
- `22`="Special Purposes"
- ),
- parameters = c(
- `1`="Elapsed Duration",
- # brands
- `2`="asinh(Generic Brands)",
- `3`="asinh(Competitors USPDC)",
- # population
- `4`="asinh(High SDI)",
- `5`="asinh(High-Medium SDI)",
- `6`="asinh(Medium SDI)",
- `7`="asinh(Low-Medium SDI)",
- `8`="asinh(Low SDI)",
- #Status
- `9`="status_NYR",
- `10`="status_EBI",
- `11`="status_Rec",
- `12`="status_ANR"
- )
-)
-
-get_parameters <- function(stem,class_list) {
- #get categories and lengths
- named <- names(class_list)
- lengths <- sapply(named, (function (x) length(class_list[[x]])))
-
- #describe the grid needed
- iter_list <- sapply(named, (function (x) 1:lengths[x]))
-
- #generate the list of parameters
- pardf <- generate_parameter_df(stem, iter_list)
-
- #add columns with appropriate human-readable names
- for (name in named) {
- pardf[paste(name,"_hr",sep="")] <- as.factor(
- sapply(pardf[name], (function (i) class_list[[name]][i]))
- )
- }
-
- return(pardf)
-}
-
-generate_parameter_df <- function(stem, iter_list) {
- grid <- expand.grid(iter_list)
- grid["param_name"] <- grid %>% unite(x,colnames(grid),sep=",")
- grid["param_name"] <- paste(stem,"[",grid$param_name,"]",sep="")
- return(grid)
-}
-
-group_mcmc_areas <- function(
- stem,# = "beta"
- class_list,# = beta_list
- stanfit,# = fit
- group_id,# = 2
- rename=TRUE,
- filter=NULL
- ) {
-
- #get all parameter names
- params <- get_parameters(stem,class_list)
-
- #filter down to parameters of interest
- params <- filter(params,groups == group_id)
- #Get dataframe with only the rows of interest
- filtdata <- as.data.frame(stanfit)[params$param_name]
- #rename columns
- if (rename) dimnames(filtdata)[[2]] <- params$parameters_hr
- #get group name for title
- group_name <- class_list$groups[group_id]
- #create area plot with appropriate title
- p <- mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
- ggtitle(paste("Parameter distributions for ICD-10 class:",group_name)) +
- geom_vline(xintercept=seq(-2,2,0.5),color="grey",alpha=0.750)
-
- d <- pivot_longer(filtdata, everything()) |>
- group_by(name) |>
- summarize(
- mean=mean(value)
- ,q025 = quantile(value,probs = 0.025)
- ,q975 = quantile(value,probs = 0.975)
- ,q05 = quantile(value,probs = 0.05)
- ,q95 = quantile(value,probs = 0.95)
- )
- return(list(plot=p,quantiles=d,name=group_name))
-}
-
-parameter_mcmc_areas <- function(
- stem,# = "beta"
- class_list,# = beta_list
- stanfit,# = fit
- parameter_id,# = 2
- rename=TRUE
- ) {
- #get all parameter names
- params <- get_parameters(stem,class_list)
- #filter down to parameters of interest
- params <- filter(params,parameters == parameter_id)
- #Get dataframe with only the rows of interest
- filtdata <- as.data.frame(stanfit)[params$param_name]
- #rename columns
- if (rename) dimnames(filtdata)[[2]] <- params$groups_hr
- #get group name for title
- parameter_name <- class_list$parameters[parameter_id]
- #create area plot with appropriate title
- p <- mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
- ggtitle(parameter_name,"Parameter Distribution") +
- geom_vline(xintercept=seq(-2,2,0.5),color="grey",alpha=0.750)
-
- d <- pivot_longer(filtdata, everything()) |>
- group_by(name) |>
- summarize(
- mean=mean(value)
- ,q025 = quantile(value,probs = 0.025)
- ,q975 = quantile(value,probs = 0.975)
- ,q05 = quantile(value,probs = 0.05)
- ,q95 = quantile(value,probs = 0.95)
- )
- return(list(plot=p,quantiles=d,name=parameter_name))
-}
-
-
-```
-
-Plan: select all snapshots that are the first to have closed
-enrollment (Rec -> ANR when comparing across snapshots), and then
-```{r}
-#delay intervention
-intervention_enrollment <- x_cf_base[c(inherited_cols)]
-#TOFIX: ^^^ This ordering of columns is
-intervention_enrollment["status_ANR"] <- 0
-intervention_enrollment["status_Rec"] <- 1
-```
-
-```{r}
-counterfact_delay <- list(
- D = ncol(x),#
- N = nrow(x),
- L = n_categories$count,
- y = as.vector(y),
- ll = as.vector(categories),
- x = as.matrix(x),
- mu_mean = 0,
- mu_stdev = 0.05,
- sigma_shape = 4,
- sigma_rate = 20,
- Nx = nrow(x_cf_base),
- llx = as.vector(cf_categories),
- counterfact_x_tilde = as.matrix(intervention_enrollment),
- counterfact_x = as.matrix(x_cf_base),
- status_indexes = c(11,12) #subtract anr from recruiting to get movement from anr to recruiting
-)
-```
-
-```{r}
-fit <- stan(
- file='Hierarchal_Logistic.stan',
- data = counterfact_delay,
- chains = 4,
- iter = 5000,
- seed = 11021585
- )
-```
-
-
-
-## Calculate relative difference in parameters between recruiting and active not recruiting states
-
-
-```{r}
-filt_data <- as.data.frame(extract(fit,pars="status_diff"))
-dimnames(filt_data)[[2]] <- beta_list$groups
-mcmc_areas(filt_data,prob = 0.8, prob_outer = 0.95) +
- ggtitle("Relative fixed effects across groups", subtitle ="moving from `Active, not recruiting` to `Recruiting`") +
- geom_vline(xintercept=seq(-2,2,0.5),color="grey",alpha=0.750)
-```
-
-I've got an issue here, because this should be a movement from ANR -> recruiting
-but that is suggesting that
-
-## Explore data
-
-
-```{r}
-#get number of trials and snapshots in each category
-group_trials_by_category <- as.data.frame(aggregate(category_id ~ nct_id, df, max))
-group_trials_by_category <- as.data.frame(group_trials_by_category)
-
-category_count <- group_trials_by_category |> group_by(category_id) |> count()
-
-```
-
-
-```{r}
-################################# DATA EXPLORATION ############################
-driver <- dbDriver("PostgreSQL")
-
-con <- dbConnect(
- driver,
- user='root',
- password='root',
- dbname='aact_db',
- host=host
- )
-#Plot histogram of count of snapshots
-df3 <- dbGetQuery(
- con,
- "select nct_id,final_status,count(*) from formatted_data_with_planned_enrollment fdwpe
- group by nct_id,final_status ;"
- )
-#df3 <- fetch(query3, n = -1)
-
-ggplot(data=df3, aes(x=count, fill=final_status)) +
- geom_histogram(binwidth=1) +
- ggtitle("Histogram of snapshots per trial (matched trials)") +
- xlab("Snapshots per trial")
-ggsave(paste0(image_trial_details,"/HistSnapshots.png"))
-
-#Plot duration for terminated vs completed
-df4 <- dbGetQuery(
- con,
- "
- select
- nct_id,
- start_date ,
- primary_completion_date,
- overall_status ,
- primary_completion_date - start_date as duration
- from ctgov.studies s
- where nct_id in (select distinct nct_id from http.download_status ds)
- ;"
- )
-#df4 <- fetch(query4, n = -1)
-
-ggplot(data=df4, aes(x=duration,fill=overall_status)) +
- geom_histogram()+
- ggtitle("Histogram of trial durations") +
- xlab("duration")+
- facet_wrap(~overall_status)
-ggsave(paste0(image_trial_details,"/HistTrialDurations_Faceted.png"))
-
-df5 <- dbGetQuery(
- con,
- "
- with cte1 as (
- select
- nct_id,
- start_date ,
- primary_completion_date,
- overall_status ,
- primary_completion_date - start_date as duration
- from ctgov.studies s
- where nct_id in (select distinct nct_id from http.download_status ds)
- ), cte2 as (
- select nct_id,count(*) as snapshot_count from formatted_data_with_planned_enrollment fdwpe
- group by nct_id
- )
- select a.nct_id, a.overall_status, a.duration,b.snapshot_count
- from cte1 as a
- join cte2 as b
- on a.nct_id=b.nct_id
- ;"
- )
-df5$overall_status <- as.factor(df5$overall_status)
-
-ggplot(data=df5, aes(x=duration,y=snapshot_count,color=overall_status)) +
- geom_jitter() +
- ggtitle("Comparison of duration, status, and snapshot_count") +
- xlab("duration") +
- ylab("snapshot count")
-ggsave(paste0(image_trial_details,"/SnapshotsVsDurationVsTermination.png"))
-
-dbDisconnect(con)
-
-#get number of trials and snapshots in each category
-group_trials_by_category <- as.data.frame(aggregate(category_id ~ nct_id, df, max))
-group_trials_by_category <- as.data.frame(group_trials_by_category)
-
-ggplot(data = group_trials_by_category, aes(x=category_id)) +
- geom_bar(binwidth=1,color="black",fill="lightblue") +
- scale_x_continuous(breaks=scales::pretty_breaks(n=22)) +
- labs(
- title="bar chart of trial categories"
- ,x="Category ID"
- ,y="Count"
- )
-ggsave(paste0(image_trial_details,"/CategoryCounts.png"))
-
-
-
-summary(df5)
-
-cor(df5$duration,df5$snapshot_count)
-sum(df5$snapshot_count)
-```
-
-
-
-
-## Fit Results
-
-
-```{r}
-################################# ANALYZE #####################################
-#print(fit)
-```
-
-# Parameter Distributions
-
-
-```{r}
-#g1 <- group_mcmc_areas("beta",beta_list,fit,1)
-
-
-gx <- c()
-
-#grab parameters for every category with more than 8 observations
-for (i in category_count$category_id[category_count$n >= 0]) {
- print(i)
-
- #Print parameter distributions
- gi <- group_mcmc_areas("beta",beta_list,fit,i) #add way to filter groups
- ggsave(
- paste0(image_parameters_by_groups,"/group_",i,"_",gi$name,".png")
- ,plot=gi$plot
- )
- gx <- c(gx,gi)
-
- #Get Quantiles and means for parameters
-# table <- xtable(gi$quantiles,
-# floating=FALSE
-# ,latex.environments = NULL
-# ,booktabs = TRUE
-# ,zap=getOption("digits")
-# )
-# write_lines(table,paste0("./latex_output/DirectEffects/group_",gi$name,".tex"))
-}
-```
-
-
-
-```{r}
-px <- c()
-
-
-for (i in c(1,2,3,9,10,11,12)) {
-
- #Print parameter distributions
- pi <- parameter_mcmc_areas("beta",beta_list,fit,i) #add way to filter groups
- ggsave(
- paste0(image_parameters_across_groups,"/parameters_",i,"_",pi$name,".png")
- ,plot=pi$plot
- )
- px <- c(px,pi)
-
- #Get Quantiles and means for parameters
-# table <- xtable(pi$quantiles,
-# floating=FALSE
-# ,latex.environments = NULL
-# ,booktabs = TRUE
-# ,zap=getOption("digits")
-# )
-# write_lines(table,paste0("./latex_output/DirectEffects/parameters_",i,"_",pi$name,".tex"))
-
-}
-```
-
-Note these have 95% outer CI and 80% inner (shaded)
-
-
-
-
-
-```{r}
-#get the generic and uspdc parameters
-print(px[4]$plot + px[7]$plot)
-ggsave(paste0(image_parameters_across_groups,"2+3_generic_and_uspdc.png"))
-
-#get the parameters associated with duration
-px[16]$plot + px[19]$plot
-ggsave(paste0(image_parameters_across_groups,"11+12_statusREC_and_statusANR.png"))
-```
-
-
-# Counterfactuals
-
-```{r}
-generated_ib <- gqs(
- fit@stanmodel,
- data=counterfact_delay,
- draws=as.matrix(fit),
- seed=11021585
- )
-```
-
-### Get priors
-```{r}
-df_ib_p <- data.frame(
- p_prior=as.vector(extract(generated_ib, pars="p_prior")$p_prior)
- ,p_predicted = as.vector(extract(generated_ib, pars="p_predicted")$p_predicted)
-)
-
-df_ib_prior <- data.frame(
- mu_prior = as.vector(extract(generated_ib, pars="mu_prior")$mu_prior)
- ,sigma_prior = as.vector(extract(generated_ib, pars="sigma_prior")$sigma_prior)
-)
-
-#p_prior
-ggplot(df_ib_p, aes(x=p_prior)) +
- geom_density() +
- labs(
- title="implied prior distribution p"
- ,subtitle=""
- ,x="probability domain 'p'"
- ,y="probability density"
- )
-ggsave(paste0(image_dist_diff_analysis,"/prior_p.png"))
-
-#p_posterior
-ggplot(df_ib_p, aes(x=p_predicted)) +
- geom_density() +
- labs(
- title="implied posterior distribution p"
- ,subtitle=""
- ,x="probability domain 'p'"
- ,y="Probability Density"
- )
-ggsave(paste0(image_dist_diff_analysis,"/posterior_p.png"))
-
-#mu_prior
-ggplot(df_ib_prior) +
- geom_density(aes(x=mu_prior)) +
- labs(
- title="Prior - Mu"
- ,subtitle="same prior for all Mu values"
- ,x="Mu"
- ,y="Probability"
- )
-ggsave(paste0(image_dist_diff_analysis,"/prior_mu.png"))
-
-#sigma_posterior
-ggplot(df_ib_prior) +
- geom_density(aes(x=sigma_prior)) +
- labs(
- title="Prior - Sigma"
- ,subtitle="same prior for all Sigma values"
- ,x="Sigma"
- ,y="Probability"
- )
-ggsave(paste0(image_dist_diff_analysis,"/prior_sigma.png"))
-```
-
-
-
-```{r}
-check_hmc_diagnostics(fit)
-```
-
-
-
-
-
-### Intervention: Delay close of enrollment
-
-```{r}
-counterfact_predicted_ib <- data.frame(
- p_predicted_default = as.vector(extract(generated_ib, pars="p_predicted_default")$p_predicted_default)
- ,p_predicted_intervention = as.vector(extract(generated_ib, pars="p_predicted_intervention")$p_predicted_intervention)
- ,predicted_difference = as.vector(extract(generated_ib, pars="predicted_difference")$predicted_difference)
-)
-
-
-ggplot(counterfact_predicted_ib, aes(x=p_predicted_default)) +
- geom_density() +
- labs(
- title="Predicted Distribution of 'p'"
- ,subtitle="Intervention: None"
- ,x="Probability Domain 'p'"
- ,y="Probability Density"
- )
-ggsave(paste0(image_dist_diff_analysis,"p_no_intervention.png"))
-
-ggplot(counterfact_predicted_ib, aes(x=p_predicted_intervention)) +
- geom_density() +
- labs(
- title="Predicted Distribution of 'p'"
- ,subtitle="Intervention: Delay close of enrollment"
- ,x="Probability Domain 'p'"
- ,y="Probability Density"
- )
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention.png"))
-
-ggplot(counterfact_predicted_ib, aes(x=predicted_difference)) +
- geom_density() +
- labs(
- title="Predicted Distribution of differences 'p'"
- ,subtitle="Intervention: Delay close of enrollment"
- ,x="Difference in 'p' under treatment"
- ,y="Probability Density"
- )
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_1.png"))
-```
-
-
-```{r}
-get_category_count <- function(tbl, id) {
- result <- tbl$n[tbl$category_id == id]
- if(length(result) == 0) 0 else result
-}
-
-category_names <- sapply(1:length(beta_list$groups),
- function(i) sprintf("ICD-10 #%d: %s (n=%d)",
- i,
- beta_list$groups[i],
- get_category_count(category_count, i)))
-
-```
-
-```{r}
-pddf_ib <- data.frame(extract(generated_ib, pars="predicted_difference")$predicted_difference) |>
- pivot_longer(X1:X168) #CHANGE_NOTE: moved from X169 to X168
-
-pddf_ib["entry_idx"] <- as.numeric(gsub("\\D","",pddf_ib$name))
-pddf_ib["category"] <- sapply(pddf_ib$entry_idx, function(i) df$category_id[i])
-pddf_ib["category_name"] <- sapply(
- pddf_ib$category,
- function(i) category_names[i]
- )
-
-
-
-ggplot(pddf_ib, aes(x=value,)) +
- geom_density(adjust=1/5) +
- labs(
- title = "Distribution of predicted differences"
- ,subtitle = "Intervention: Delay close of enrollment"
- ,x = "Difference in probability due to intervention"
- ,y = "Probability Density"
- ) +
- geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed")
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_styled.png"))
-
-ggplot(pddf_ib, aes(x=value,)) +
- geom_density(adjust=1/5) +
- facet_wrap(
- ~factor(
- category_name,
- levels=category_names
- )
- , labeller = label_wrap_gen(multi_line = TRUE)
- , ncol=4) +
- labs(
- title = "Distribution of predicted differences | By Group"
- ,subtitle = "Intervention: Delay close of enrollment"
- ,x = "Difference in probability due to intervention"
- ,y = "Probability Density"
- ) +
- geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
- theme(strip.text.x = element_text(size = 8))
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_by_group.png"))
-
-ggplot(pddf_ib, aes(x=value,)) +
- geom_histogram(bins=300) +
- facet_wrap(
- ~factor(
- category_name,
- levels=category_names
- )
- , labeller = label_wrap_gen(multi_line = TRUE)
- , ncol=4) +
- labs(
- title = "Histogram of predicted differences | By Group"
- ,subtitle = "Intervention: Delay close of enrollment"
- ,x = "Difference in probability due to intervention"
- ,y = "Predicted counts"
- ) +
- geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
- theme(strip.text.x = element_text(size = 8))
-ggsave(paste0(image_dist_diff_analysis,"p_delay_intervention_histdiff_by_group.png"))
-```
-
-
-```{r}
-p3 <- ggplot(pddf_ib, aes(x=value,)) +
- geom_histogram(bins=500) +
- labs(
- title = "Distribution of predicted differences"
- ,subtitle = "Intervention: Delay close of enrollment"
- ,x = "Difference in probability due to intervention"
- ,y = "Probability Density"
- ,caption = "Vertical marks: 5/10/25/50/75/90/95th percentiles. Dot shows mean."
- ) +
- geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed")
-
-density_hight <- max(density(pddf_ib$value)$y)
-
-stats <- list(
- p5 = quantile(pddf_ib$value, 0.05),
- p10 = quantile(pddf_ib$value, 0.10),
- q1 = quantile(pddf_ib$value, 0.25),
- med = median(pddf_ib$value),
- mean = mean(pddf_ib$value),
- q3 = quantile(pddf_ib$value, 0.75),
- p90 = quantile(pddf_ib$value, 0.90),
- p95 = quantile(pddf_ib$value, 0.95),
- max_height = max(ggplot_build(p3)$data[[1]]$count),
- y_offset = -max(ggplot_build(p3)$data[[1]]$count) * 0.05,
- y_offset_density = -density_hight * 0.15
-)
-
-p3 +
- # Box
- geom_segment(data = data.frame(
- x = c(stats$q1, stats$q3, stats$med),
- xend = c(stats$q1, stats$q3, stats$med),
- y = rep(stats$y_offset, 3),
- yend = rep(stats$y_offset * 2, 3)
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- geom_segment(data = data.frame(
- x = rep(stats$q1, 2),
- xend = rep(stats$q3, 2),
- y = c(stats$y_offset, stats$y_offset * 2),
- yend = c(stats$y_offset, stats$y_offset * 2)
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Inner whiskers (Q1->P10, Q3->P90)
- geom_segment(data = data.frame(
- x = c(stats$q1, stats$q3),
- xend = c(stats$p10, stats$p90),
- y = rep(stats$y_offset * 1.5, 2),
- yend = rep(stats$y_offset * 1.5, 2)
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Crossbars at P10/P90
- geom_segment(data = data.frame(
- x = c(stats$p10, stats$p90),
- xend = c(stats$p10, stats$p90),
- y = stats$y_offset * 1.3,
- yend = stats$y_offset * 1.7
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Outer whiskers (P10->P5, P90->P95)
- geom_segment(data = data.frame(
- x = c(stats$p10, stats$p90),
- xend = c(stats$p5, stats$p95),
- y = rep(stats$y_offset * 1.5, 2),
- yend = rep(stats$y_offset * 1.5, 2)
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Crossbars at P5/P95
- geom_segment(data = data.frame(
- x = c(stats$p5, stats$p95),
- xend = c(stats$p5, stats$p95),
- y = stats$y_offset * 1.3,
- yend = stats$y_offset * 1.7
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Mean dot
- geom_point(data = data.frame(
- x = stats$mean,
- y = stats$y_offset * 1.5
- ), aes(x = x, y = y))
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_histdiff_boxplot.png"))
-
-
-p4 <- ggplot(pddf_ib, aes(x = value)) +
- geom_density() +
- labs(
- title = "Distribution of predicted differences"
- ,subtitle = "Intervention: Delay close of enrollment"
- ,x = "Difference in probability due to intervention"
- ,y = "Probability Density"
- ,caption = "Vertical marks: 5/10/25/50/75/90/95th percentiles. Dot shows mean."
- ) +
- geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
- geom_segment(data = data.frame(
- x = c(stats$q1, stats$q3, stats$med),
- xend = c(stats$q1, stats$q3, stats$med),
- y = rep(stats$y_offset_density, 3),
- yend = rep(stats$y_offset_density * 2, 3)
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- geom_segment(data = data.frame(
- x = rep(stats$q1, 2),
- xend = rep(stats$q3, 2),
- y = c(stats$y_offset_density, stats$y_offset_density * 2),
- yend = c(stats$y_offset_density, stats$y_offset_density * 2)
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Inner whiskers (Q1->P10, Q3->P90)
- geom_segment(data = data.frame(
- x = c(stats$q1, stats$q3),
- xend = c(stats$p10, stats$p90),
- y = rep(stats$y_offset_density * 1.5, 2),
- yend = rep(stats$y_offset_density * 1.5, 2)
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Crossbars at P10/P90
- geom_segment(data = data.frame(
- x = c(stats$p10, stats$p90),
- xend = c(stats$p10, stats$p90),
- y = stats$y_offset_density * 1.3,
- yend = stats$y_offset_density * 1.7
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Outer whiskers (P10->P5, P90->P95)
- geom_segment(data = data.frame(
- x = c(stats$p10, stats$p90),
- xend = c(stats$p5, stats$p95),
- y = rep(stats$y_offset_density * 1.5, 2),
- yend = rep(stats$y_offset_density * 1.5, 2)
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Crossbars at P5/P95
- geom_segment(data = data.frame(
- x = c(stats$p5, stats$p95),
- xend = c(stats$p5, stats$p95),
- y = stats$y_offset_density * 1.3,
- yend = stats$y_offset_density * 1.7
- ), aes(x = x, xend = xend, y = y, yend = yend)) +
- # Mean dot
- geom_point(data = data.frame(
- x = stats$mean,
- y = stats$y_offset_density * 1.5
- ), aes(x = x, y = y))
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_boxplot.png"))
-```
-
-```{r}
- ggplot(pddf_ib, aes(x=value)) +
- stat_ecdf(geom='step') +
- labs(
- title = "Cumulative distribution of predicted differences",
- subtitle = "Intervention: Delay close of enrollment",
- x = "Difference in probability of termination due to intervention",
- y = "Cumulative Proportion"
- )
-
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_cumulative_distdiff.png"))
-```
-
-
-Get the % of differences in the spike around zero
-```{r}
-# get values around and above/below spike
-width <- 0.02
-spike_band_centered_zero <- mean( pddf_ib$value >= -width/2 & pddf_ib$value <= width/2)
-above_spike_band <- mean( pddf_ib$value >= width/2)
-below_spike_band <- mean( pddf_ib$value <= -width/2)
-
-# get mass above and mass below zero
-mass_below_zero <- mean( pddf_ib$value <= 0)
-```
-Looking at the spike around zero, we find that `r spike_band_centered_zero*100`%
-of the probability mass is contained within the band from
-[`r -width*100/2`,`r width*100/2`].
-Additionally, there was `r above_spike_band*100`% of the probability above that
--- representing those with a general increase in the probability of termination --
-and `r below_spike_band*100`% of the probability mass below the band
--- representing a decrease in the probability of termination.
-
-On average, if you keep the trial open instead of closing it,
-`r mass_below_zero`% of trials will see a decrease in the probability of termination,
-but, due to the high increase in probability of termination given termination was increased,
-the mean probability of termination increases by `r stats$mean`.
-
-```{r}
-# 5%-iles
-
-summary(pddf_ib$value)
-
-# Create your quantiles
-quants <- quantile(pddf_ib$value, probs = seq(0,1,0.05), type=4)
-
-# Convert to a data frame
-quant_df <- data.frame(
- Percentile = names(quants),
- Value = quants
-)
-kable(quant_df)
-```
-
-There seems to be some trials that are highly suceptable to this enrollment delay. Specifically, there were some
-
-```{r}
-n = length(counterfact_predicted_ib$p_predicted_intervention)
-k = 100
-simulated_terminations_intervention <- mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_intervention)))
-simulated_terminations_base <-mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_default)))
-
-simulated_percentages <- (simulated_terminations_intervention - simulated_terminations_base)/k
-```
-
-The simulation above shows that this results in a percentage-point increase of about
-`r simulated_percentages * 100`.
-
-
-
-
-# Diagnostics
-
-```{r}
-#| eval: true
-#trace plots
-image_diagnostics <- paste0(image_root,"/diagnostics")
-
-plot(fit, pars=c("mu"), plotfun="trace")
-ggsave(paste0(image_diagnostics,"/trace_plot_mu.png"))
-
-
-for (i in 1:3) {
- print(
- mcmc_rank_overlay(
- fit,
- pars=c(
- paste0("mu[",4*i-3,"]"),
- paste0("mu[",4*i-2,"]"),
- paste0("mu[",4*i-1,"]"),
- paste0("mu[",4*i,"]")
- ),
- n_bins=100
- )+ legend_move("top") +
- scale_colour_ghibli_d("KikiMedium")
- )
- mu_range <- paste0(4*i-3,"-",4*i)
- filename <- paste0(image_diagnostics,"/trace_rank_plot_mu_",mu_range,".png")
- ggsave(filename)
-}
-```
-
-```{r}
-#| eval: true
-plot(fit, pars=c("sigma"), plotfun="trace")
-ggsave(paste0(image_diagnostics,"/traceplot_sigma.png"))
-
-for (i in 1:3) {
- print(
- mcmc_rank_overlay(
- fit,
- pars=c(
- paste0("sigma[",4*i-3,"]"),
- paste0("sigma[",4*i-2,"]"),
- paste0("sigma[",4*i-1,"]"),
- paste0("sigma[",4*i,"]")
- ),
- n_bins=100
- )+ legend_move("top") +
- scale_colour_ghibli_d("KikiMedium")
- )
- sigma_range <- paste0(4*i-3,"-",4*i)
- filename <- paste0(image_diagnostics,"/trace_rank_plot_sigma_",sigma_range,".png")
- ggsave(filename)
-}
-```
-
-```{r}
-#| eval: true
-#other diagnostics
-logpost <- log_posterior(fit)
-nuts_prmts <- nuts_params(fit)
-posterior <- as.array(fit)
-
-```
-
-```{r}
-#| eval: true
-color_scheme_set("darkgray")
-div_style <- parcoord_style_np(div_color = "green", div_size = 0.05, div_alpha = 0.4)
-mcmc_parcoord(posterior, regex_pars = "mu", np=nuts_prmts, np_style = div_style, alpha = 0.05)
-ggsave(paste0(image_diagnostics,"/parcoord_mu.png"))
-```
-
-```{r}
-#| eval: true
-for (i in 1:3) {
- mus = sapply(3:0, function(j) paste0("mu[",4*i-j ,"]"))
- print(
- mcmc_pairs(
- posterior,
- np = nuts_prmts,
- pars=c(
- mus,
- "lp__"
- ),
- off_diag_args = list(size = 0.75)
- )
- )
- mu_range <- paste0(4*i-3,"-",4*i)
- filename <- paste0(image_diagnostics,"/correlation_plot_mu_",mu_range,".png")
- ggsave(filename)
-}
-
-
-
-```
-
-```{r}
-#| eval: true
-mcmc_parcoord(posterior,regex_pars = "sigma", np=nuts_prmts, alpha=0.05)
-ggsave(paste0(image_diagnostics,"/parcoord_sigma.png"))
-```
-
-```{r}
-#| eval: true
-
-for (i in 1:3) {
- params = sapply(3:0, function(j) paste0("sigma[",4*i-j ,"]"))
- print(
- mcmc_pairs(
- posterior,
- np = nuts_prmts,
- pars=c(
- params,
- "lp__"
- ),
- off_diag_args = list(size = 0.75)
- )
- )
- sigma_range <- paste0(4*i-3,"-",4*i)
- filename <- paste0(image_diagnostics,"/correlation_plot_sigma_",sigma_range,".png")
- ggsave(filename)
-}
-```
-
-
-```{r}
-#| eval: true
-for (k in 1:22) {
-for (i in 1:3) {
- params = sapply(3:0, function(j) paste0("beta[",k,",",4*i-j ,"]"))
- print(
- mcmc_pairs(
- posterior,
- np = nuts_prmts,
- pars=c(
- params,
- "lp__"
- ),
- off_diag_args = list(size = 0.75)
- )
- )
-
- beta_range <- paste0("k_",k,"_i_",4*i-3,"-",4*i)
- filename <- paste0(image_diagnostics,"/correlation_plot_beta_",beta_range,".png")
- ggsave(filename)
-}}
-```
-
-
-
-
-
-
-
diff --git a/r-analysis/Hierarchal_Logistic_with_status_diff.stan b/r-analysis/Hierarchal_Logistic_with_status_diff.stan
deleted file mode 100644
index 8aa7a04..0000000
--- a/r-analysis/Hierarchal_Logistic_with_status_diff.stan
+++ /dev/null
@@ -1,102 +0,0 @@
-//
-// This Stan program defines a simple model, with a
-// vector of values 'y' modeled as normally distributed
-// with mean 'mu' and standard deviation 'sigma'.
-//
-// Learn more about model development with Stan at:
-//
-// http://mc-stan.org/users/interfaces/rstan.html
-// https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
-//
-
-// The input data is a vector 'y' of length 'N'.
-data {
- int D; // number of features
- int N; // number of observations
- int L; // number of categories
- array[N] int y; // vector of dependent variables
- array[N] int ll; // vector of categories
- array[N] row_vector[D] x; // independent variables
- real mu_mean; //hyperprior
- real mu_stdev; //hyperprior
- real sigma_shape; //hyperprior
- real sigma_rate; //hyperprior
- //counterfactuals
- int Nx;
- array[Nx] int llx;//vec of categories
- array[Nx] row_vector[D] counterfact_x_tilde; // Posterior Prediction intervention
- array[Nx] row_vector[D] counterfact_x; // Posterior Prediction intervention
- //the two statuses to catch relative difference
- array[2] int status_indexes; //the two status indexes to compare (order is x[1] - x[2])
-}
-parameters {
- array[D] real mu;
- array[D] real sigma;
- array[L] vector[D] beta;
-}
-model {
- sigma ~ gamma(sigma_shape,sigma_rate); //hyperprior for stdev: shape and inverse scale
- mu ~ normal(mu_mean, mu_stdev); //hyperprior for mean //TODO: convert to mvnormal
- for (l in 1:L) {
- beta[l] ~ normal(mu, sigma);
- }
- {
- vector[N] x_beta_ll;
- for (n in 1:N) {
- x_beta_ll[n] = x[n] * beta[ll[n]];
- }
- y ~ bernoulli_logit(x_beta_ll);
- }
-}
-generated quantities {
- //SETUP PRIOR PREDICTION
- //preallocate
- real mu_prior[D];
- real sigma_prior[D];
- real p_prior[N]; // what I have priors about
- real p_predicted[N]; //predicted p_values
- //intervention
- real p_predicted_default[Nx]; //predicted p_values
- real p_predicted_intervention[Nx]; //predicted p_values
- real predicted_difference[Nx]; //difference in predicted values
-
- //collect array of relative status differences between
- array[L] real status_diff;
- //GENERATE RELATIVE DIFFERENCES BETWEEN STATUSES
- for (l in 1:L) {
- status_diff[l] = beta[l,status_indexes[1]] - beta[l,status_indexes[2]];
- }
-
- //GENERATE PRIOR PREDICTIONS
- {
- vector[D] beta_prior[L];//local var
- //sample parameters
- for (d in 1:D) {
- mu_prior[d] = normal_rng(mu_mean,mu_stdev);
- sigma_prior[d] = gamma_rng(sigma_shape,sigma_rate);
- }
- for (l in 1:L) {
- for (d in 1:D) {
- beta_prior[l,d] = normal_rng(mu_prior[d],sigma_prior[d]);
- }
- }
- //generate probabilities
- vector[D] b_prior[N];//local var
- for (n in 1:N){
- b_prior[n] = beta_prior[ll[n]];
- p_prior[n] = inv_logit( x[n] * b_prior[n] );
- }
- }
- //GENERATE POSTERIOR PREDICTIONS
- for (n in 1:N) {
- p_predicted[n] = inv_logit( x[n] * beta[ll[n]] );
- }
- //GENERATE POSTERIOR DISTRIBUTION OF DIFFERENCES
- for (n in 1:Nx) {
- p_predicted_default[n] = inv_logit( counterfact_x[n] * beta[llx[n]] );
- p_predicted_intervention[n] = inv_logit( counterfact_x_tilde[n] * beta[llx[n]] ); //intervention
-
- //intervention - base case
- predicted_difference[n] = p_predicted_intervention[n] - p_predicted_default[n];
- }
-}