The Effects of Recruitment status on completion of clinical trials

Author

Will King

Setup

library(knitr)
library(bayesplot)
This is bayesplot version 1.11.1
- Online documentation and vignettes at mc-stan.org/bayesplot
- bayesplot theme set to bayesplot::theme_default()
   * Does _not_ affect other ggplot2 plots
   * See ?bayesplot_theme_set for details on theme setting
available_mcmc(pattern = "_nuts_")
bayesplot MCMC module:
(matching pattern '_nuts_') 
  mcmc_nuts_acceptance
  mcmc_nuts_divergence
  mcmc_nuts_energy
  mcmc_nuts_stepsize
  mcmc_nuts_treedepth
library(ggplot2)
library(patchwork)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ lubridate 1.9.4     ✔ tibble    3.2.1
✔ purrr     1.0.2     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rstan)
Loading required package: StanHeaders

rstan version 2.32.6 (Stan version 2.32.2)

For execution on a local, multicore CPU with excess RAM we recommend calling
options(mc.cores = parallel::detectCores()).
To avoid recompilation of unchanged Stan programs, we recommend calling
rstan_options(auto_write = TRUE)
For within-chain threading using `reduce_sum()` or `map_rect()` Stan functions,
change `threads_per_chain` option:
rstan_options(threads_per_chain = 1)


Attaching package: 'rstan'

The following object is masked from 'package:tidyr':

    extract
library(tidyr)
library(ghibli)
library(xtable)
#Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started

#save unchanged models instead of recompiling
rstan_options(auto_write = TRUE)
#allow for multithreaded sampling
options(mc.cores = parallel::detectCores())

#test installation, shouldn't get any errors
#example(stan_model, package = "rstan", run.dontrun = TRUE)


image_root <- "./output/withdiff/EffectsOfEnrollmentDelay"
image_dist_diff_analysis <- paste0(image_root,"/dist_diff_analysis")
image_trial_details <-paste0(image_root,"/trials_details")
image_parameters_by_groups <-paste0(image_root,"/betas/parameters_by_group")
image_parameters_across_groups <-paste0(image_root,"/betas/parameter_across_groups")
################ Pull data from database ######################
library(RPostgreSQL)
Loading required package: DBI
host <- 'aact_db-restored-2025-01-07'

driver <- dbDriver("PostgreSQL")

get_data <- function(driver) {

con <- dbConnect(
    driver,
    user='root',
    password='root',
    dbname='aact_db',
    host=host
    )
on.exit(dbDisconnect(con))

query <- dbSendQuery(
    con,
#    "select * from formatted_data_with_planned_enrollment;"
"
select 
    fdqpe.nct_id
    --,fdqpe.start_date
    --,fdqpe.current_enrollment
    --,fdqpe.enrollment_category
    ,fdqpe.current_status 
    ,fdqpe.earliest_date_observed 
    ,fdqpe.elapsed_duration
    ,fdqpe.n_brands as identical_brands
    ,ntbtu.brand_name_counts 
    ,fdqpe.category_id
    ,fdqpe.final_status
    ,fdqpe.h_sdi_val
    --,fdqpe.h_sdi_u95
    --,fdqpe.h_sdi_l95
    ,fdqpe.hm_sdi_val
    --,fdqpe.hm_sdi_u95
    --,fdqpe.hm_sdi_l95
    ,fdqpe.m_sdi_val
    --,fdqpe.m_sdi_u95
    --,fdqpe.m_sdi_l95
    ,fdqpe.lm_sdi_val
    --,fdqpe.lm_sdi_u95
    --,fdqpe.lm_sdi_l95
    ,fdqpe.l_sdi_val
    --,fdqpe.l_sdi_u95
    --,fdqpe.l_sdi_l95
from formatted_data_with_planned_enrollment fdqpe
    join \"Formularies\".nct_to_brand_counts_through_uspdc ntbtu
        on fdqpe.nct_id = ntbtu.nct_id 
order by fdqpe.nct_id, fdqpe.earliest_date_observed 
;
"
    )
df <- fetch(query, n = -1)
df <- na.omit(df)

query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;")
n_categories <- fetch(query2, n = -1)

return(list(data=df,ncat=n_categories))
}


get_counterfact_base <- function(driver) {

con <- dbConnect(
    driver,
    user='root',
    password='root',
    dbname='aact_db',
    host=host
    )
on.exit(dbDisconnect(con))

query <- dbSendQuery(
    con,
    "
    with cte as (
    --get last recruiting state
    select fd.nct_id, max(fd.earliest_date_observed),min(fd2.earliest_date_observed) as tmstmp
    from formatted_data fd 
        join formatted_data fd2 
        on fd.nct_id=fd2.nct_id and fd.earliest_date_observed < fd2.earliest_date_observed 
    where fd.current_status = 'Recruiting'
        and fd2.current_status = 'Active, not recruiting'
    group by fd.nct_id 
    )
    select 
        fdqpe.nct_id
        --,fdqpe.start_date
        --,fdqpe.current_enrollment
        --,fdqpe.enrollment_category
        ,fdqpe.current_status 
        ,fdqpe.earliest_date_observed 
        ,fdqpe.elapsed_duration
        ,fdqpe.n_brands as identical_brands
        ,ntbtu.brand_name_counts 
        ,fdqpe.category_id
        ,fdqpe.final_status
        ,fdqpe.h_sdi_val
        --,fdqpe.h_sdi_u95
        --,fdqpe.h_sdi_l95
        ,fdqpe.hm_sdi_val
        --,fdqpe.hm_sdi_u95
        --,fdqpe.hm_sdi_l95
        ,fdqpe.m_sdi_val
        --,fdqpe.m_sdi_u95
        --,fdqpe.m_sdi_l95
        ,fdqpe.lm_sdi_val
        --,fdqpe.lm_sdi_u95
        --,fdqpe.lm_sdi_l95
        ,fdqpe.l_sdi_val
        --,fdqpe.l_sdi_u95
        --,fdqpe.l_sdi_l95
    from formatted_data_with_planned_enrollment fdqpe
        join \"Formularies\".nct_to_brand_counts_through_uspdc ntbtu
            on fdqpe.nct_id = ntbtu.nct_id 
        join cte 
            on fdqpe.nct_id = cte.nct_id 
                and fdqpe.earliest_date_observed = cte.tmstmp
    order by fdqpe.nct_id, fdqpe.earliest_date_observed 
    ;
    "
    )
df <- fetch(query, n = -1)
df <- na.omit(df)

query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;")
n_categories <- fetch(query2, n = -1)

return(list(data=df,ncat=n_categories))
}


d <- get_data(driver)
df <- d$data
n_categories <- d$ncat

cf <- get_counterfact_base(driver)
df_counterfact_base <- cf$data



################ Format Data ###########################

data_formatter <- function(df) {
categories <- df["category_id"]

x <- df["elapsed_duration"]
x["identical_brands"] <- asinh(df$identical_brands)
x["brand_name_counts"] <- asinh(df$brand_name_count)
x["h_sdi_val"] <- asinh(df$h_sdi_val)
x["hm_sdi_val"] <- asinh(df$hm_sdi_val)
x["m_sdi_val"] <- asinh(df$m_sdi_val)
x["lm_sdi_val"] <- asinh(df$lm_sdi_val)
x["l_sdi_val"] <- asinh(df$l_sdi_val)


#Setup fixed effects

x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0)
x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0)
x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0) 
x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0)


y <- ifelse(df["final_status"]=="Terminated",1,0)

#get category list


return(list(x=x,y=y))
}

train <- data_formatter(df)
counterfact_base <- data_formatter(df_counterfact_base)

categories <- df$category_id

x <- train$x
y <- train$y

x_cf_base <- counterfact_base$x
y_cf_base <- counterfact_base$y
cf_categories <- df_counterfact_base$category_id

Fit Model

################################# FIT MODEL #########################################
inherited_cols <- c(
    "elapsed_duration"
    ,"identical_brands"
    ,"brand_name_counts"
    ,"h_sdi_val"
    ,"hm_sdi_val"
    ,"m_sdi_val"
    ,"lm_sdi_val"
    ,"l_sdi_val"
    ,"status_NYR"
    ,"status_EBI"
    ,"status_Rec"
    ,"status_ANR"
)
beta_list <- list(
        groups = c(
        `1`="Infections & Parasites",
        `2`="Neoplasms",
        `3`="Blood & Immune system",
        `4`="Endocrine, Nutritional, and Metabolic",
        `5`="Mental & Behavioral",
        `6`="Nervous System",
        `7`="Eye and Adnexa",
        `8`="Ear and Mastoid",
        `9`="Circulatory",
        `10`="Respiratory",
        `11`="Digestive",
        `12`="Skin & Subcutaneaous tissue",
        `13`="Musculoskeletal",
        `14`="Genitourinary",
        `15`="Pregancy, Childbirth, & Puerperium",
        `16`="Perinatal Period",
        `17`="Congential",
        `18`="Symptoms, Signs etc.",
        `19`="Injury etc.",
        `20`="External Causes",
        `21`="Contact with Healthcare",
        `22`="Special Purposes"
    ),
    parameters = c(
        `1`="Elapsed Duration",
        # brands
        `2`="asinh(Generic Brands)",
        `3`="asinh(Competitors USPDC)",
        # population
        `4`="asinh(High SDI)",
        `5`="asinh(High-Medium SDI)",
        `6`="asinh(Medium SDI)",
        `7`="asinh(Low-Medium SDI)",
        `8`="asinh(Low SDI)",
        #Status
        `9`="status_NYR",
        `10`="status_EBI",
        `11`="status_Rec",
        `12`="status_ANR"
    )
)

get_parameters <- function(stem,class_list) {
    #get categories and lengths
    named <- names(class_list)
    lengths <- sapply(named, (function (x) length(class_list[[x]])))
    
    #describe the grid needed
    iter_list <- sapply(named, (function (x) 1:lengths[x]))
    
    #generate the list of parameters
    pardf <- generate_parameter_df(stem, iter_list)
    
    #add columns with appropriate human-readable names
    for (name in named) {
        pardf[paste(name,"_hr",sep="")] <- as.factor(
            sapply(pardf[name], (function (i) class_list[[name]][i]))
        )
    }
     
    return(pardf)   
}

generate_parameter_df <- function(stem, iter_list) {
    grid <- expand.grid(iter_list)
    grid["param_name"] <- grid %>% unite(x,colnames(grid),sep=",")
    grid["param_name"] <- paste(stem,"[",grid$param_name,"]",sep="")
    return(grid)
}

group_mcmc_areas <- function(
        stem,# = "beta"
        class_list,# = beta_list
        stanfit,# = fit
        group_id,# = 2
        rename=TRUE,
        filter=NULL
        ) {
    
    #get all parameter names
    params <- get_parameters(stem,class_list)
    
    #filter down to parameters of interest
    params <- filter(params,groups == group_id)
    #Get dataframe with only the rows of interest
    filtdata <- as.data.frame(stanfit)[params$param_name]
    #rename columns
    if (rename) dimnames(filtdata)[[2]] <- params$parameters_hr
    #get group name for title
    group_name <- class_list$groups[group_id]
    #create area plot with appropriate title
    p <- mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
        ggtitle(paste("Parameter distributions for ICD-10 class:",group_name)) +
        geom_vline(xintercept=seq(-2,2,0.5),color="grey",alpha=0.750)  
    
    d <- pivot_longer(filtdata, everything()) |> 
        group_by(name) |> 
        summarize(
            mean=mean(value)
            ,q025 = quantile(value,probs = 0.025)
            ,q975 = quantile(value,probs = 0.975)
            ,q05 = quantile(value,probs = 0.05)
            ,q95 = quantile(value,probs = 0.95)
            )
    return(list(plot=p,quantiles=d,name=group_name))
}

parameter_mcmc_areas <- function(
        stem,# = "beta"
        class_list,# = beta_list
        stanfit,# = fit
        parameter_id,# = 2
        rename=TRUE
        ) {
    #get all parameter names
    params <- get_parameters(stem,class_list)
    #filter down to parameters of interest
    params <- filter(params,parameters == parameter_id)
    #Get dataframe with only the rows of interest
    filtdata <- as.data.frame(stanfit)[params$param_name]
    #rename columns
    if (rename) dimnames(filtdata)[[2]] <- params$groups_hr
    #get group name for title
    parameter_name <- class_list$parameters[parameter_id]
    #create area plot with appropriate title
    p <- mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
        ggtitle(parameter_name,"Parameter Distribution") +
        geom_vline(xintercept=seq(-2,2,0.5),color="grey",alpha=0.750)  
    
    d <- pivot_longer(filtdata, everything()) |> 
        group_by(name) |> 
        summarize(
            mean=mean(value)
            ,q025 = quantile(value,probs = 0.025)
            ,q975 = quantile(value,probs = 0.975)
            ,q05 = quantile(value,probs = 0.05)
            ,q95 = quantile(value,probs = 0.95)
            )
    return(list(plot=p,quantiles=d,name=parameter_name))
}

Plan: select all snapshots that are the first to have closed enrollment (Rec -> ANR when comparing across snapshots), and then

#delay intervention
intervention_enrollment <- x_cf_base[c(inherited_cols)]
#TOFIX: ^^^ This ordering of columns is
intervention_enrollment["status_ANR"] <- 0
intervention_enrollment["status_Rec"] <- 1
counterfact_delay <- list(
    D = ncol(x),#
    N = nrow(x),
    L = n_categories$count,
    y = as.vector(y),
    ll = as.vector(categories),
    x = as.matrix(x),
    mu_mean = 0,
    mu_stdev = 0.05,
    sigma_shape = 4,
    sigma_rate = 20,
    Nx = nrow(x_cf_base),
    llx = as.vector(cf_categories),
    counterfact_x_tilde = as.matrix(intervention_enrollment),
    counterfact_x = as.matrix(x_cf_base),
    status_indexes = c(11,12) #subtract anr from recruiting to get movement from anr to recruiting
)
fit <- stan(
    file='Hierarchal_Logistic.stan', 
    data = counterfact_delay,
    chains = 4,
    iter = 5000,
    seed = 11021585
    )
Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
Running the chains for more iterations may help. See
https://mc-stan.org/misc/warnings.html#bulk-ess

Calculate relative difference in parameters between recruiting and active not recruiting states

filt_data <- as.data.frame(extract(fit,pars="status_diff"))
dimnames(filt_data)[[2]] <- beta_list$groups
mcmc_areas(filt_data,prob = 0.8, prob_outer = 0.95) +
  ggtitle("Relative fixed effects across groups", subtitle ="moving from `Active, not recruiting` to `Recruiting`") +
        geom_vline(xintercept=seq(-2,2,0.5),color="grey",alpha=0.750) 
Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_vline()`).

I’ve got an issue here, because this should be a movement from ANR -> recruiting but that is suggesting that

Explore data

#get number of trials and snapshots in each category
group_trials_by_category <- as.data.frame(aggregate(category_id ~ nct_id, df, max))
group_trials_by_category <- as.data.frame(group_trials_by_category)

category_count <- group_trials_by_category |> group_by(category_id) |> count()
################################# DATA EXPLORATION ############################
driver <- dbDriver("PostgreSQL")

con <- dbConnect(
    driver,
    user='root',
    password='root',
    dbname='aact_db',
    host=host
    )
#Plot histogram of count of snapshots
df3 <- dbGetQuery(
    con,
    "select nct_id,final_status,count(*) from formatted_data_with_planned_enrollment fdwpe 
    group by nct_id,final_status ;"
    )
#df3 <- fetch(query3, n = -1)

ggplot(data=df3, aes(x=count, fill=final_status)) + 
    geom_histogram(binwidth=1) +
    ggtitle("Histogram of snapshots per trial (matched trials)") +
    xlab("Snapshots per trial")

ggsave(paste0(image_trial_details,"/HistSnapshots.png"))
Saving 7 x 5 in image
#Plot duration for terminated vs completed
df4 <- dbGetQuery(
    con,
    "
    select 
        nct_id, 
        start_date , 
        primary_completion_date, 
        overall_status ,
        primary_completion_date - start_date as duration
    from ctgov.studies s 
    where nct_id in (select distinct nct_id from http.download_status ds)
    ;"
    )
#df4 <- fetch(query4, n = -1)

ggplot(data=df4, aes(x=duration,fill=overall_status)) +
    geom_histogram()+
    ggtitle("Histogram of trial durations") +
    xlab("duration")+
    facet_wrap(~overall_status)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggsave(paste0(image_trial_details,"/HistTrialDurations_Faceted.png"))
Saving 7 x 5 in image
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
df5 <- dbGetQuery(
    con,
    "
    with cte1 as (
    select 
        nct_id, 
        start_date , 
        primary_completion_date, 
        overall_status ,
        primary_completion_date - start_date as duration
    from ctgov.studies s 
    where nct_id in (select distinct nct_id from http.download_status ds)
    ), cte2 as (
    select nct_id,count(*) as snapshot_count from formatted_data_with_planned_enrollment fdwpe
    group by nct_id
    )
    select a.nct_id, a.overall_status, a.duration,b.snapshot_count
    from cte1 as a
        join cte2 as b  
            on a.nct_id=b.nct_id
    ;"
    )
df5$overall_status <- as.factor(df5$overall_status)

ggplot(data=df5, aes(x=duration,y=snapshot_count,color=overall_status)) +
    geom_jitter() +
    ggtitle("Comparison of duration, status, and snapshot_count") +
    xlab("duration") +
    ylab("snapshot count") 

ggsave(paste0(image_trial_details,"/SnapshotsVsDurationVsTermination.png"))
Saving 7 x 5 in image
dbDisconnect(con)
[1] TRUE
#get number of trials and snapshots in each category
group_trials_by_category <- as.data.frame(aggregate(category_id ~ nct_id, df, max))
group_trials_by_category <- as.data.frame(group_trials_by_category)

ggplot(data = group_trials_by_category, aes(x=category_id)) +
    geom_bar(binwidth=1,color="black",fill="lightblue") +
    scale_x_continuous(breaks=scales::pretty_breaks(n=22)) + 
    labs(
        title="bar chart of trial categories"
        ,x="Category ID"
        ,y="Count"
    )
Warning in geom_bar(binwidth = 1, color = "black", fill = "lightblue"):
Ignoring unknown parameters: `binwidth`

ggsave(paste0(image_trial_details,"/CategoryCounts.png"))
Saving 7 x 5 in image
summary(df5)
    nct_id             overall_status    duration      snapshot_count  
 Length:162         Completed :134    Min.   :  61.0   Min.   : 1.000  
 Class :character   Terminated: 28    1st Qu.: 618.5   1st Qu.: 4.000  
 Mode  :character                     Median :1022.5   Median : 6.000  
                                      Mean   :1202.4   Mean   : 8.315  
                                      3rd Qu.:1637.0   3rd Qu.:11.000  
                                      Max.   :3332.0   Max.   :48.000  
cor(df5$duration,df5$snapshot_count)
[1] 0.3356006
sum(df5$snapshot_count)
[1] 1347

Fit Results

################################# ANALYZE #####################################
#print(fit)

Parameter Distributions

#g1 <- group_mcmc_areas("beta",beta_list,fit,1)


gx <- c()

#grab parameters for every category with more than 8 observations
for (i in category_count$category_id[category_count$n >= 0]) {
    print(i)
    
    #Print parameter distributions
    gi <- group_mcmc_areas("beta",beta_list,fit,i) #add way to filter groups
    ggsave(
        paste0(image_parameters_by_groups,"/group_",i,"_",gi$name,".png")
        ,plot=gi$plot
        )
    gx <- c(gx,gi)

    #Get Quantiles and means for parameters
#    table <- xtable(gi$quantiles,
#      floating=FALSE
#      ,latex.environments = NULL
#      ,booktabs = TRUE
#      ,zap=getOption("digits")
#      )
#    write_lines(table,paste0("./latex_output/DirectEffects/group_",gi$name,".tex"))
}
[1] 1
Saving 7 x 5 in image
Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_vline()`).
[1] 2
Saving 7 x 5 in image
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_vline()`).
[1] 3
Saving 7 x 5 in image
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_vline()`).
[1] 4
Saving 7 x 5 in image
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_vline()`).
[1] 5
Saving 7 x 5 in image
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_vline()`).
[1] 6
Saving 7 x 5 in image
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_vline()`).
[1] 7
Saving 7 x 5 in image
Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_vline()`).
[1] 9
Saving 7 x 5 in image
Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_vline()`).
[1] 10
Saving 7 x 5 in image
Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_vline()`).
[1] 11
Saving 7 x 5 in image
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_vline()`).
[1] 12
Saving 7 x 5 in image
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_vline()`).
[1] 13
Saving 7 x 5 in image
[1] 14
Saving 7 x 5 in image
Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_vline()`).
[1] 17
Saving 7 x 5 in image
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_vline()`).
[1] 18
Saving 7 x 5 in image
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_vline()`).
px <- c()


for (i in c(1,2,3,9,10,11,12)) {
    
    #Print parameter distributions
    pi <- parameter_mcmc_areas("beta",beta_list,fit,i) #add way to filter groups
    ggsave(
        paste0(image_parameters_across_groups,"/parameters_",i,"_",pi$name,".png")
        ,plot=pi$plot
        )
    px <- c(px,pi)

    #Get Quantiles and means for parameters
#    table <- xtable(pi$quantiles,
#      floating=FALSE
#      ,latex.environments = NULL
#      ,booktabs = TRUE
#      ,zap=getOption("digits")
#      )
#    write_lines(table,paste0("./latex_output/DirectEffects/parameters_",i,"_",pi$name,".tex"))
    
}
Saving 7 x 5 in image
Warning: Removed 6 rows containing missing values or values outside the scale range
(`geom_vline()`).
Saving 7 x 5 in image
Saving 7 x 5 in image
Saving 7 x 5 in image
Warning: Removed 3 rows containing missing values or values outside the scale range
(`geom_vline()`).
Saving 7 x 5 in image
Warning: Removed 6 rows containing missing values or values outside the scale range
(`geom_vline()`).
Saving 7 x 5 in image
Warning: Removed 5 rows containing missing values or values outside the scale range
(`geom_vline()`).
Saving 7 x 5 in image
Warning: Removed 5 rows containing missing values or values outside the scale range
(`geom_vline()`).

Note these have 95% outer CI and 80% inner (shaded)

#get the generic and uspdc parameters
print(px[4]$plot + px[7]$plot)

ggsave(paste0(image_parameters_across_groups,"2+3_generic_and_uspdc.png"))
Saving 7 x 5 in image
#get the parameters associated with duration
px[16]$plot + px[19]$plot
Warning: Removed 5 rows containing missing values or values outside the scale range
(`geom_vline()`).
Removed 5 rows containing missing values or values outside the scale range
(`geom_vline()`).

ggsave(paste0(image_parameters_across_groups,"11+12_statusREC_and_statusANR.png"))
Saving 7 x 5 in image
Warning: Removed 5 rows containing missing values or values outside the scale range
(`geom_vline()`).
Removed 5 rows containing missing values or values outside the scale range
(`geom_vline()`).

Counterfactuals

generated_ib <- gqs(
    fit@stanmodel,
    data=counterfact_delay,
    draws=as.matrix(fit),
    seed=11021585
    )

Get priors

df_ib_p <- data.frame(
    p_prior=as.vector(extract(generated_ib, pars="p_prior")$p_prior)
    ,p_predicted = as.vector(extract(generated_ib, pars="p_predicted")$p_predicted)
)

df_ib_prior <- data.frame(
    mu_prior = as.vector(extract(generated_ib, pars="mu_prior")$mu_prior)
    ,sigma_prior = as.vector(extract(generated_ib, pars="sigma_prior")$sigma_prior)
)

#p_prior
ggplot(df_ib_p, aes(x=p_prior)) +
    geom_density() + 
    labs(
        title="implied prior distribution p"
        ,subtitle=""
        ,x="probability domain 'p'"
        ,y="probability density"
    )

ggsave(paste0(image_dist_diff_analysis,"/prior_p.png"))
Saving 7 x 5 in image
#p_posterior
ggplot(df_ib_p, aes(x=p_predicted)) +
    geom_density() + 
    labs(
        title="implied posterior distribution p"
        ,subtitle=""
        ,x="probability domain 'p'"
        ,y="Probability Density"
    )

ggsave(paste0(image_dist_diff_analysis,"/posterior_p.png"))
Saving 7 x 5 in image
#mu_prior
ggplot(df_ib_prior) +
    geom_density(aes(x=mu_prior)) + 
    labs(
        title="Prior - Mu"
        ,subtitle="same prior for all Mu values"
        ,x="Mu"
        ,y="Probability"
    )

ggsave(paste0(image_dist_diff_analysis,"/prior_mu.png"))
Saving 7 x 5 in image
#sigma_posterior
ggplot(df_ib_prior) +
    geom_density(aes(x=sigma_prior)) + 
    labs(
        title="Prior - Sigma"
        ,subtitle="same prior for all Sigma values"
        ,x="Sigma"
        ,y="Probability"
    )

ggsave(paste0(image_dist_diff_analysis,"/prior_sigma.png"))
Saving 7 x 5 in image
check_hmc_diagnostics(fit)

Divergences:
0 of 10000 iterations ended with a divergence.

Tree depth:
0 of 10000 iterations saturated the maximum tree depth of 10.

Energy:
E-BFMI indicated no pathological behavior.

Intervention: Delay close of enrollment

counterfact_predicted_ib <- data.frame(
    p_predicted_default = as.vector(extract(generated_ib, pars="p_predicted_default")$p_predicted_default)
    ,p_predicted_intervention = as.vector(extract(generated_ib, pars="p_predicted_intervention")$p_predicted_intervention)
    ,predicted_difference = as.vector(extract(generated_ib, pars="predicted_difference")$predicted_difference)
)


ggplot(counterfact_predicted_ib, aes(x=p_predicted_default)) +
    geom_density() + 
    labs(
        title="Predicted Distribution of 'p'"
        ,subtitle="Intervention: None"
        ,x="Probability Domain 'p'"
        ,y="Probability Density"
    )

ggsave(paste0(image_dist_diff_analysis,"p_no_intervention.png"))
Saving 7 x 5 in image
ggplot(counterfact_predicted_ib, aes(x=p_predicted_intervention)) +
    geom_density() + 
    labs(
        title="Predicted Distribution of 'p'"
        ,subtitle="Intervention: Delay close of enrollment"
        ,x="Probability Domain 'p'"
        ,y="Probability Density"
    )

ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention.png"))
Saving 7 x 5 in image
ggplot(counterfact_predicted_ib, aes(x=predicted_difference)) +
    geom_density() + 
    labs(
        title="Predicted Distribution of differences 'p'"
        ,subtitle="Intervention: Delay close of enrollment"
        ,x="Difference in 'p' under treatment"
        ,y="Probability Density"
    )

ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_1.png"))
Saving 7 x 5 in image
get_category_count <- function(tbl, id) {
  result <- tbl$n[tbl$category_id == id]
  if(length(result) == 0) 0 else result
}

category_names <- sapply(1:length(beta_list$groups), 
    function(i) sprintf("ICD-10 #%d: %s (n=%d)", 
                       i, 
                       beta_list$groups[i],
                       get_category_count(category_count, i)))
pddf_ib <- data.frame(extract(generated_ib, pars="predicted_difference")$predicted_difference) |>
    pivot_longer(X1:X168) #CHANGE_NOTE: moved from X169 to X168

pddf_ib["entry_idx"] <- as.numeric(gsub("\\D","",pddf_ib$name))
pddf_ib["category"] <-  sapply(pddf_ib$entry_idx, function(i) df$category_id[i])
pddf_ib["category_name"] <- sapply(
    pddf_ib$category, 
    function(i) category_names[i]
    )
  


ggplot(pddf_ib, aes(x=value,)) +
    geom_density(adjust=1/5) +
    labs(
        title = "Distribution of predicted differences"
        ,subtitle = "Intervention: Delay close of enrollment"
        ,x = "Difference in probability due to intervention"
        ,y = "Probability Density"
    ) + 
    geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") 

ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_styled.png"))
Saving 7 x 5 in image
ggplot(pddf_ib, aes(x=value,)) +
    geom_density(adjust=1/5) +
    facet_wrap(
        ~factor(
            category_name, 
            levels=category_names
            )
        , labeller = label_wrap_gen(multi_line = TRUE)
        , ncol=4) +
    labs(
        title = "Distribution of predicted differences | By Group"
        ,subtitle = "Intervention: Delay close of enrollment"
        ,x = "Difference in probability due to intervention"
        ,y = "Probability Density"
    ) + 
    geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
    theme(strip.text.x = element_text(size = 8))

ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_by_group.png"))
Saving 7 x 5 in image
ggplot(pddf_ib, aes(x=value,)) +
    geom_histogram(bins=300) +
    facet_wrap(
        ~factor(
            category_name, 
            levels=category_names
            )
        , labeller = label_wrap_gen(multi_line = TRUE)
        , ncol=4) +
    labs(
        title = "Histogram of predicted differences | By Group"
        ,subtitle = "Intervention: Delay close of enrollment"
        ,x = "Difference in probability due to intervention"
        ,y = "Predicted counts"
    ) + 
    geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
    theme(strip.text.x = element_text(size = 8))

ggsave(paste0(image_dist_diff_analysis,"p_delay_intervention_histdiff_by_group.png"))
Saving 7 x 5 in image
p3 <- ggplot(pddf_ib, aes(x=value,)) +
    geom_histogram(bins=500) +
    labs(
        title = "Distribution of predicted differences"
        ,subtitle = "Intervention: Delay close of enrollment"
        ,x = "Difference in probability due to intervention"
        ,y = "Probability Density"
        ,caption = "Vertical marks: 5/10/25/50/75/90/95th percentiles. Dot shows mean."
    ) + 
    geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") 

density_hight <- max(density(pddf_ib$value)$y)

stats <- list(
 p5 = quantile(pddf_ib$value, 0.05),
 p10 = quantile(pddf_ib$value, 0.10),
 q1 = quantile(pddf_ib$value, 0.25),
 med = median(pddf_ib$value),
 mean = mean(pddf_ib$value),
 q3 = quantile(pddf_ib$value, 0.75),
 p90 = quantile(pddf_ib$value, 0.90),
 p95 = quantile(pddf_ib$value, 0.95),
 max_height = max(ggplot_build(p3)$data[[1]]$count),
 y_offset = -max(ggplot_build(p3)$data[[1]]$count) * 0.05,
 y_offset_density = -density_hight * 0.15
)

p3 + 
 # Box
 geom_segment(data = data.frame(
   x = c(stats$q1, stats$q3, stats$med),
   xend = c(stats$q1, stats$q3, stats$med),
   y = rep(stats$y_offset, 3), 
   yend = rep(stats$y_offset * 2, 3)
 ), aes(x = x, xend = xend, y = y, yend = yend)) +
 geom_segment(data = data.frame(
   x = rep(stats$q1, 2),
   xend = rep(stats$q3, 2),
   y = c(stats$y_offset, stats$y_offset * 2),
   yend = c(stats$y_offset, stats$y_offset * 2)
 ), aes(x = x, xend = xend, y = y, yend = yend)) +
 # Inner whiskers (Q1->P10, Q3->P90)
 geom_segment(data = data.frame(
   x = c(stats$q1, stats$q3),
   xend = c(stats$p10, stats$p90),
   y = rep(stats$y_offset * 1.5, 2),
   yend = rep(stats$y_offset * 1.5, 2)
 ), aes(x = x, xend = xend, y = y, yend = yend)) +
 # Crossbars at P10/P90
 geom_segment(data = data.frame(
   x = c(stats$p10, stats$p90),
   xend = c(stats$p10, stats$p90),
   y = stats$y_offset * 1.3,
   yend = stats$y_offset * 1.7
 ), aes(x = x, xend = xend, y = y, yend = yend)) +
 # Outer whiskers (P10->P5, P90->P95)
 geom_segment(data = data.frame(
   x = c(stats$p10, stats$p90),
   xend = c(stats$p5, stats$p95),
   y = rep(stats$y_offset * 1.5, 2),
   yend = rep(stats$y_offset * 1.5, 2)
 ), aes(x = x, xend = xend, y = y, yend = yend)) +
 # Crossbars at P5/P95
 geom_segment(data = data.frame(
   x = c(stats$p5, stats$p95),
   xend = c(stats$p5, stats$p95),
   y = stats$y_offset * 1.3,
   yend = stats$y_offset * 1.7
 ), aes(x = x, xend = xend, y = y, yend = yend)) +
 # Mean dot
 geom_point(data = data.frame(
   x = stats$mean,
   y = stats$y_offset * 1.5
 ), aes(x = x, y = y))

ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_histdiff_boxplot.png"))
Saving 7 x 5 in image
p4 <- ggplot(pddf_ib, aes(x = value)) +
  geom_density() +
    labs(
        title = "Distribution of predicted differences"
        ,subtitle = "Intervention: Delay close of enrollment"
        ,x = "Difference in probability due to intervention"
        ,y = "Probability Density"
        ,caption = "Vertical marks: 5/10/25/50/75/90/95th percentiles. Dot shows mean."
    ) + 
    geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
 geom_segment(data = data.frame(
   x = c(stats$q1, stats$q3, stats$med),
   xend = c(stats$q1, stats$q3, stats$med),
   y = rep(stats$y_offset_density, 3), 
   yend = rep(stats$y_offset_density * 2, 3)
 ), aes(x = x, xend = xend, y = y, yend = yend)) +
 geom_segment(data = data.frame(
   x = rep(stats$q1, 2),
   xend = rep(stats$q3, 2),
   y = c(stats$y_offset_density, stats$y_offset_density * 2),
   yend = c(stats$y_offset_density, stats$y_offset_density * 2)
 ), aes(x = x, xend = xend, y = y, yend = yend)) +
 # Inner whiskers (Q1->P10, Q3->P90)
 geom_segment(data = data.frame(
   x = c(stats$q1, stats$q3),
   xend = c(stats$p10, stats$p90),
   y = rep(stats$y_offset_density * 1.5, 2),
   yend = rep(stats$y_offset_density * 1.5, 2)
 ), aes(x = x, xend = xend, y = y, yend = yend)) +
 # Crossbars at P10/P90
 geom_segment(data = data.frame(
   x = c(stats$p10, stats$p90),
   xend = c(stats$p10, stats$p90),
   y = stats$y_offset_density * 1.3,
   yend = stats$y_offset_density * 1.7
 ), aes(x = x, xend = xend, y = y, yend = yend)) +
 # Outer whiskers (P10->P5, P90->P95)
 geom_segment(data = data.frame(
   x = c(stats$p10, stats$p90),
   xend = c(stats$p5, stats$p95),
   y = rep(stats$y_offset_density * 1.5, 2),
   yend = rep(stats$y_offset_density * 1.5, 2)
 ), aes(x = x, xend = xend, y = y, yend = yend)) +
 # Crossbars at P5/P95
 geom_segment(data = data.frame(
   x = c(stats$p5, stats$p95),
   xend = c(stats$p5, stats$p95),
   y = stats$y_offset_density * 1.3,
   yend = stats$y_offset_density * 1.7
 ), aes(x = x, xend = xend, y = y, yend = yend)) +
 # Mean dot
 geom_point(data = data.frame(
   x = stats$mean,
   y = stats$y_offset_density * 1.5
 ), aes(x = x, y = y))
ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_boxplot.png"))
Saving 7 x 5 in image
 ggplot(pddf_ib, aes(x=value)) +
    stat_ecdf(geom='step') +
    labs(
        title = "Cumulative distribution of predicted differences",
        subtitle = "Intervention: Delay close of enrollment",
        x = "Difference in probability of termination due to intervention",
        y = "Cumulative Proportion"
    ) 

ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_cumulative_distdiff.png"))
Saving 7 x 5 in image

Get the % of differences in the spike around zero

# get values around and above/below spike
width <- 0.02
spike_band_centered_zero <- mean( pddf_ib$value >= -width/2 & pddf_ib$value <= width/2)
above_spike_band <- mean( pddf_ib$value >= width/2)
below_spike_band <- mean( pddf_ib$value <= -width/2)

# get mass above and mass below zero
mass_below_zero <- mean( pddf_ib$value <= 0)

Looking at the spike around zero, we find that 39.193869% of the probability mass is contained within the band from [-1,1]. Additionally, there was 51.937619% of the probability above that – representing those with a general increase in the probability of termination – and 8.8685119% of the probability mass below the band – representing a decrease in the probability of termination.

On average, if you keep the trial open instead of closing it, 0.2488518% of trials will see a decrease in the probability of termination, but, due to the high increase in probability of termination given termination was increased, the mean probability of termination increases by 0.0249444.

# 5%-iles

summary(pddf_ib$value)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-0.42755  0.00000  0.01164  0.02494  0.04213  0.54813 
# Create your quantiles
quants <- quantile(pddf_ib$value, probs = seq(0,1,0.05), type=4)

# Convert to a data frame
quant_df <- data.frame(
  Percentile = names(quants),
  Value = quants
)
kable(quant_df)
Percentile Value
0% 0% -0.4275511
5% 5% -0.0213008
10% 10% -0.0079768
15% 15% -0.0021932
20% 20% -0.0001065
25% 25% 0.0000000
30% 30% 0.0000535
35% 35% 0.0012179
40% 40% 0.0040158
45% 45% 0.0075646
50% 50% 0.0116358
55% 55% 0.0162235
60% 60% 0.0214039
65% 65% 0.0272582
70% 70% 0.0340562
75% 75% 0.0421259
80% 80% 0.0519438
85% 85% 0.0645430
90% 90% 0.0819636
95% 95% 0.1102529
100% 100% 0.5481257

There seems to be some trials that are highly suceptable to this enrollment delay. Specifically, there were some

n = length(counterfact_predicted_ib$p_predicted_intervention)
k = 100
simulated_terminations_intervention <- mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_intervention)))
simulated_terminations_base <-mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_default)))

simulated_percentages <- (simulated_terminations_intervention - simulated_terminations_base)/k

The simulation above shows that this results in a percentage-point increase of about 2.4922006.

Diagnostics

#trace plots
image_diagnostics <- paste0(image_root,"/diagnostics")

plot(fit, pars=c("mu"), plotfun="trace")

ggsave(paste0(image_diagnostics,"/trace_plot_mu.png"))
Saving 7 x 5 in image
for (i in 1:3) {
    print(
        mcmc_rank_overlay(
        fit, 
        pars=c(
            paste0("mu[",4*i-3,"]"),
            paste0("mu[",4*i-2,"]"),
            paste0("mu[",4*i-1,"]"),
            paste0("mu[",4*i,"]")
            ), 
        n_bins=100
        )+  legend_move("top") +
             scale_colour_ghibli_d("KikiMedium")
    )
    mu_range <- paste0(4*i-3,"-",4*i)
    filename <- paste0(image_diagnostics,"/trace_rank_plot_mu_",mu_range,".png")
    ggsave(filename)
}
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.

Saving 7 x 5 in image
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.

Saving 7 x 5 in image
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.

Saving 7 x 5 in image
plot(fit, pars=c("sigma"), plotfun="trace")

ggsave(paste0(image_diagnostics,"/traceplot_sigma.png"))
Saving 7 x 5 in image
for (i in 1:3) {
    print(
        mcmc_rank_overlay(
        fit, 
        pars=c(
            paste0("sigma[",4*i-3,"]"),
            paste0("sigma[",4*i-2,"]"),
            paste0("sigma[",4*i-1,"]"),
            paste0("sigma[",4*i,"]")
            ), 
        n_bins=100
        )+  legend_move("top") +
             scale_colour_ghibli_d("KikiMedium")
    )
    sigma_range <- paste0(4*i-3,"-",4*i)
    filename <- paste0(image_diagnostics,"/trace_rank_plot_sigma_",sigma_range,".png")
    ggsave(filename)
}
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.

Saving 7 x 5 in image
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.

Saving 7 x 5 in image
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.

Saving 7 x 5 in image
#other diagnostics
logpost <- log_posterior(fit)
nuts_prmts <- nuts_params(fit)
posterior <- as.array(fit)
color_scheme_set("darkgray")
div_style <- parcoord_style_np(div_color = "green", div_size = 0.05, div_alpha = 0.4)
mcmc_parcoord(posterior, regex_pars = "mu", np=nuts_prmts, np_style = div_style, alpha = 0.05)

ggsave(paste0(image_diagnostics,"/parcoord_mu.png"))
Saving 7 x 5 in image
for (i in 1:3) {
    mus = sapply(3:0, function(j) paste0("mu[",4*i-j ,"]"))
    print(
        mcmc_pairs(
            posterior,
            np = nuts_prmts,
            pars=c(
                mus,
                "lp__"
            ),
            off_diag_args = list(size = 0.75)
        )
    )
    mu_range <- paste0(4*i-3,"-",4*i)
    filename <- paste0(image_diagnostics,"/correlation_plot_mu_",mu_range,".png")
    ggsave(filename)
}

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image
mcmc_parcoord(posterior,regex_pars = "sigma", np=nuts_prmts, alpha=0.05)

ggsave(paste0(image_diagnostics,"/parcoord_sigma.png"))
Saving 7 x 5 in image
for (i in 1:3) {
    params = sapply(3:0, function(j) paste0("sigma[",4*i-j ,"]"))
    print(
        mcmc_pairs(
            posterior,
            np = nuts_prmts,
            pars=c(
                params,
                "lp__"
            ),
            off_diag_args = list(size = 0.75)
        )
    )
    sigma_range <- paste0(4*i-3,"-",4*i)
    filename <- paste0(image_diagnostics,"/correlation_plot_sigma_",sigma_range,".png")
    ggsave(filename)
}

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image
for (k in 1:22) {
for (i in 1:3) {
    params = sapply(3:0, function(j) paste0("beta[",k,",",4*i-j ,"]"))
    print(
        mcmc_pairs(
            posterior,
            np = nuts_prmts,
            pars=c(
                params,
                "lp__"
            ),
            off_diag_args = list(size = 0.75)
        )
    )
    
    beta_range <- paste0("k_",k,"_i_",4*i-3,"-",4*i)
    filename <- paste0(image_diagnostics,"/correlation_plot_beta_",beta_range,".png")
    ggsave(filename)
}}

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image

Saving 7 x 5 in image