Updated analysis graphs and interpretation.

Also renamed some images to match what they represent.
The new .png files are from that. The renamed ones are from running the
analysis earlier.
exploring_fixed_effects
Will King 1 year ago
parent d25f5c2a0e
commit 26b3ce9b17

@ -9,6 +9,7 @@ editor: source
# Setup
```{r}
library(knitr)
library(bayesplot)
available_mcmc(pattern = "_nuts_")
library(ggplot2)
@ -32,6 +33,7 @@ options(mc.cores = parallel::detectCores())
```{r}
################ Pull data from database ######################
library(RPostgreSQL)
host <- 'aact_db-restored-2025-01-07'
driver <- dbDriver("PostgreSQL")
@ -42,7 +44,7 @@ con <- dbConnect(
user='root',
password='root',
dbname='aact_db',
host='will-office'
host=host
)
on.exit(dbDisconnect(con))
@ -59,7 +61,7 @@ select
,fdqpe.earliest_date_observed
,fdqpe.elapsed_duration
,fdqpe.n_brands as identical_brands
,ntbtu.brand_name_count
,ntbtu.brand_name_counts
,fdqpe.category_id
,fdqpe.final_status
,fdqpe.h_sdi_val
@ -78,7 +80,7 @@ select
--,fdqpe.l_sdi_u95
--,fdqpe.l_sdi_l95
from formatted_data_with_planned_enrollment fdqpe
join \"Formularies\".nct_to_brands_through_uspdc ntbtu
join \"Formularies\".nct_to_brand_counts_through_uspdc ntbtu
on fdqpe.nct_id = ntbtu.nct_id
order by fdqpe.nct_id, fdqpe.earliest_date_observed
;
@ -101,7 +103,7 @@ con <- dbConnect(
user='root',
password='root',
dbname='aact_db',
host='will-office'
host=host
)
on.exit(dbDisconnect(con))
@ -127,7 +129,7 @@ query <- dbSendQuery(
,fdqpe.earliest_date_observed
,fdqpe.elapsed_duration
,fdqpe.n_brands as identical_brands
,ntbtu.brand_name_count
,ntbtu.brand_name_counts
,fdqpe.category_id
,fdqpe.final_status
,fdqpe.h_sdi_val
@ -146,7 +148,7 @@ query <- dbSendQuery(
--,fdqpe.l_sdi_u95
--,fdqpe.l_sdi_l95
from formatted_data_with_planned_enrollment fdqpe
join \"Formularies\".nct_to_brands_through_uspdc ntbtu
join \"Formularies\".nct_to_brand_counts_through_uspdc ntbtu
on fdqpe.nct_id = ntbtu.nct_id
join cte
on fdqpe.nct_id = cte.nct_id
@ -235,7 +237,7 @@ inherited_cols <- c(
,"m_sdi_val"
,"lm_sdi_val"
,"l_sdi_val"
,"status_NYR"
,"status_NYR"# TODO: may need to remove
,"status_EBI"
,"status_Rec"
,"status_ANR"
@ -325,6 +327,7 @@ group_mcmc_areas <- function(
rename=TRUE,
filter=NULL
) {
#get all parameter names
params <- get_parameters(stem,class_list)
@ -579,19 +582,35 @@ ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/default_p_generic_interv
```{r}
get_category_count <- function(tbl, id) {
result <- tbl$n[tbl$category_id == id]
if(length(result) == 0) 0 else result
}
category_names <- sapply(1:length(beta_list$groups),
function(i) sprintf("ICD-10 #%d: %s (n=%d)",
i,
beta_list$groups[i],
get_category_count(category_count, i)))
```
```{r}
pddf_ib <- data.frame(extract(generated_ib, pars="predicted_difference")$predicted_difference) |>
pivot_longer(X1:X169)
pivot_longer(X1:X168) #CHANGE_NOTE: moved from X169 to X168
#TODO: Fix Category names
pddf_ib["entry_idx"] <- as.numeric(gsub("\\D","",pddf_ib$name))
pddf_ib["category"] <- sapply(pddf_ib$entry_idx, function(i) df$category_id[i])
pddf_ib["category_name"] <- sapply(pddf_ib$category, function(i) beta_list$groups[i])
pddf_ib["category_name"] <- sapply(
pddf_ib$category,
function(i) category_names[i]
)
ggplot(pddf_ib, aes(x=value,)) +
geom_density(bins=100) +
geom_density(adjust=1/5) +
labs(
title = "Distribution of predicted differences"
,subtitle = "Intervention: Delay close of enrollment"
@ -599,14 +618,15 @@ ggplot(pddf_ib, aes(x=value,)) +
,y = "Probability Density"
) +
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed")
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/p_generic_intervention_distdiff_styled.png")
#todo: add median, mean, 40/60 quantiles as well as
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/p_delay_intervention_distdiff_styled.png")
ggplot(pddf_ib, aes(x=value,)) +
geom_density(bins=100) +
geom_density(adjust=1/5) +
facet_wrap(
~factor(
category_name,
levels=beta_list$groups
levels=category_names
)
, labeller = label_wrap_gen(multi_line = TRUE)
, ncol=4) +
@ -618,57 +638,184 @@ ggplot(pddf_ib, aes(x=value,)) +
) +
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
theme(strip.text.x = element_text(size = 8))
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/p_generic_intervention_distdiff_by_group.png")
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/p_delay_intervention_distdiff_by_group.png")
ggplot(pddf_ib, aes(x=value,)) +
geom_histogram(bins=100) +
geom_histogram(bins=300) +
facet_wrap(
~factor(
category_name,
levels=beta_list$groups
levels=category_names
)
, labeller = label_wrap_gen(multi_line = TRUE)
, ncol=5) +
, ncol=4) +
labs(
title = "Histogram of predicted differences | By Group"
,subtitle = "Intervention: Delay close of enrollment"
,x = "Difference in probability due to intervention"
,y = "Predicted counts"
) +
#xlim(-0.25,0.1) +
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
theme(strip.text.x = element_text(size = 8))
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/p_generic_intervention_histdiff_by_group.png")
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/p_delay_intervention_histdiff_by_group.png")
```
```{r}
p3 <- ggplot(pddf_ib, aes(x=value,)) +
geom_histogram(bins=500) +
labs(
title = "Distribution of predicted differences"
,subtitle = "Intervention: Delay close of enrollment"
,x = "Difference in probability due to intervention"
,y = "Probability Density"
,caption = "Vertical marks: 5/10/25/50/75/90/95th percentiles. Dot shows mean."
) +
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed")
stats <- list(
p5 = quantile(pddf_ib$value, 0.05),
p10 = quantile(pddf_ib$value, 0.10),
q1 = quantile(pddf_ib$value, 0.25),
med = median(pddf_ib$value),
mean = mean(pddf_ib$value),
q3 = quantile(pddf_ib$value, 0.75),
p90 = quantile(pddf_ib$value, 0.90),
p95 = quantile(pddf_ib$value, 0.95),
max_height = max(ggplot_build(p3)$data[[1]]$count),
y_offset = -max(ggplot_build(p3)$data[[1]]$count) * 0.05
)
p3 +
# Box
geom_segment(data = data.frame(
x = c(stats$q1, stats$q3, stats$med),
xend = c(stats$q1, stats$q3, stats$med),
y = rep(stats$y_offset, 3),
yend = rep(stats$y_offset * 2, 3)
), aes(x = x, xend = xend, y = y, yend = yend)) +
geom_segment(data = data.frame(
x = rep(stats$q1, 2),
xend = rep(stats$q3, 2),
y = c(stats$y_offset, stats$y_offset * 2),
yend = c(stats$y_offset, stats$y_offset * 2)
), aes(x = x, xend = xend, y = y, yend = yend)) +
# Inner whiskers (Q1->P10, Q3->P90)
geom_segment(data = data.frame(
x = c(stats$q1, stats$q3),
xend = c(stats$p10, stats$p90),
y = rep(stats$y_offset * 1.5, 2),
yend = rep(stats$y_offset * 1.5, 2)
), aes(x = x, xend = xend, y = y, yend = yend)) +
# Crossbars at P10/P90
geom_segment(data = data.frame(
x = c(stats$p10, stats$p90),
xend = c(stats$p10, stats$p90),
y = stats$y_offset * 1.3,
yend = stats$y_offset * 1.7
), aes(x = x, xend = xend, y = y, yend = yend)) +
# Outer whiskers (P10->P5, P90->P95)
geom_segment(data = data.frame(
x = c(stats$p10, stats$p90),
xend = c(stats$p5, stats$p95),
y = rep(stats$y_offset * 1.5, 2),
yend = rep(stats$y_offset * 1.5, 2)
), aes(x = x, xend = xend, y = y, yend = yend)) +
# Crossbars at P5/P95
geom_segment(data = data.frame(
x = c(stats$p5, stats$p95),
xend = c(stats$p5, stats$p95),
y = stats$y_offset * 1.3,
yend = stats$y_offset * 1.7
), aes(x = x, xend = xend, y = y, yend = yend)) +
# Mean dot
geom_point(data = data.frame(
x = stats$mean,
y = stats$y_offset * 1.5
), aes(x = x, y = y))
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/p_delay_intervention_histdiff_boxplot.png")
```
```{r}
ggplot(pddf_ib, aes(x=value)) +
stat_ecdf(geom='step') +
labs(
title = "Cumulative distribution of predicted differences",
subtitle = "Intervention: Delay close of enrollment",
x = "Difference in probability of termination due to intervention",
y = "Cumulative Proportion"
)
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/p_delay_intervention_cumulative_distdiff.png")
```
Get the probability of increase over probability of a decrease
Get the % of differences in the spike around zero
```{r}
mean(counterfact_predicted_ib$predicted_difference)
# get values around and above/below spike
width <- 0.02
spike_band_centered_zero <- mean( pddf_ib$value >= -width/2 & pddf_ib$value <= width/2)
above_spike_band <- mean( pddf_ib$value >= width/2)
below_spike_band <- mean( pddf_ib$value <= -width/2)
# get mass above and mass below zero
mass_below_zero <- mean( pddf_ib$value <= 0)
```
Thus adding a Delay close of enrollment increases the probability of termination by 16.72% on average for
the snapshots investigated.
Looking at the spike around zero, we find that `r spike_band_centered_zero*100`%
of the probability mass is contained within the band from
[`r -width*100/2`,`r width*100/2`].
Additionally, there was `r above_spike_band*100`% of the probability above that
-- representing those with a general increase in the probability of termination --
and `r below_spike_band*100`% of the probability mass below the band
-- representing a decrease in the probability of termination.
On average, if you keep the trial open instead of closing it,
`r mass_below_zero`% of trials will see a decrease in the probability of termination,
but, due to the high increase in probability of termination given termination was increased,
the mean probability of termination increases by `r stats$mean`.
```{r}
# 5%-iles
summary(pddf_ib$value)
# Create your quantiles
quants <- quantile(pddf_ib$value, probs = seq(0,1,0.05), type=4)
# Convert to a data frame
quant_df <- data.frame(
Percentile = names(quants),
Value = quants
)
kable(quant_df)
```
There seems to be some trials that are highly suceptable to this enrollment delay. Specifically, there were some
```{r}
n = length(counterfact_predicted_ib$p_predicted_intervention)
mean(rbinom(n,1,as.vector(counterfact_predicted_ib$p_predicted_intervention)))
mean(rbinom(n,1,as.vector(counterfact_predicted_ib$p_predicted_default)))
k = 100
simulated_terminations_intervention <- mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_intervention)))
simulated_terminations_base <-mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_default)))
simulated_percentages <- (simulated_terminations_intervention - simulated_terminations_base)/k
```
The simulation above shows that this results in a percentage-point increase of about
`r simulated_percentages * 100`.
# Diagnostics
```{r}
#| eval: false
#| eval: true
#trace plots
plot(fit, pars=c("mu"), plotfun="trace")
for (i in 1:4) {
for (i in 1:3) {
print(
mcmc_rank_overlay(
fit,
@ -686,10 +833,10 @@ for (i in 1:4) {
```
```{r}
#| eval: false
#| eval: true
plot(fit, pars=c("sigma"), plotfun="trace")
for (i in 1:4) {
for (i in 1:3) {
print(
mcmc_rank_overlay(
fit,
@ -707,7 +854,7 @@ for (i in 1:4) {
```
```{r}
#| eval: false
#| eval: true
#other diagnostics
logpost <- log_posterior(fit)
nuts_prmts <- nuts_params(fit)
@ -716,15 +863,15 @@ posterior <- as.array(fit)
```
```{r}
#| eval: false
#| eval: true
color_scheme_set("darkgray")
div_style <- parcoord_style_np(div_color = "green", div_size = 0.05, div_alpha = 0.4)
mcmc_parcoord(posterior, regex_pars = "mu", np=nuts_prmts, np_style = div_style, alpha = 0.05)
```
```{r}
#| eval: false
for (i in 1:4) {
#| eval: true
for (i in 1:3) {
mus = sapply(3:0, function(j) paste0("mu[",4*i-j ,"]"))
print(
mcmc_pairs(
@ -744,14 +891,14 @@ for (i in 1:4) {
```
```{r}
#| eval: false
#| eval: true
mcmc_parcoord(posterior,regex_pars = "sigma", np=nuts_prmts, alpha=0.05)
```
```{r}
#| eval: false
#| eval: true
for (i in 1:4) {
for (i in 1:3) {
params = sapply(3:0, function(j) paste0("sigma[",4*i-j ,"]"))
print(
mcmc_pairs(
@ -769,9 +916,9 @@ for (i in 1:4) {
```{r}
#| eval: false
#| eval: true
for (k in 1:22) {
for (i in 1:4) {
for (i in 1:3) {
params = sapply(3:0, function(j) paste0("beta[",k,",",4*i-j ,"]"))
print(
mcmc_pairs(

@ -0,0 +1,983 @@
---
title: "The Effects of Recruitment status on completion of clinical trials"
author: "Will King"
format: html
editor: source
---
# Setup
```{r}
library(knitr)
library(bayesplot)
available_mcmc(pattern = "_nuts_")
library(ggplot2)
library(patchwork)
library(tidyverse)
library(rstan)
library(tidyr)
library(ghibli)
library(xtable)
#Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
#save unchanged models instead of recompiling
rstan_options(auto_write = TRUE)
#allow for multithreaded sampling
options(mc.cores = parallel::detectCores())
#test installation, shouldn't get any errors
#example(stan_model, package = "rstan", run.dontrun = TRUE)
```
```{r}
################ Pull data from database ######################
library(RPostgreSQL)
host <- 'aact_db-restored-2025-01-07'
driver <- dbDriver("PostgreSQL")
get_data <- function(driver) {
con <- dbConnect(
driver,
user='root',
password='root',
dbname='aact_db',
host=host
)
on.exit(dbDisconnect(con))
query <- dbSendQuery(
con,
# "select * from formatted_data_with_planned_enrollment;"
"
select
fdqpe.nct_id
--,fdqpe.start_date
--,fdqpe.current_enrollment
--,fdqpe.enrollment_category
,fdqpe.current_status
,fdqpe.earliest_date_observed
,fdqpe.elapsed_duration
,fdqpe.n_brands as identical_brands
,ntbtu.brand_name_counts
,fdqpe.category_id
,fdqpe.final_status
,fdqpe.h_sdi_val
--,fdqpe.h_sdi_u95
--,fdqpe.h_sdi_l95
,fdqpe.hm_sdi_val
--,fdqpe.hm_sdi_u95
--,fdqpe.hm_sdi_l95
,fdqpe.m_sdi_val
--,fdqpe.m_sdi_u95
--,fdqpe.m_sdi_l95
,fdqpe.lm_sdi_val
--,fdqpe.lm_sdi_u95
--,fdqpe.lm_sdi_l95
,fdqpe.l_sdi_val
--,fdqpe.l_sdi_u95
--,fdqpe.l_sdi_l95
from formatted_data_with_planned_enrollment fdqpe
join \"Formularies\".nct_to_brand_counts_through_uspdc ntbtu
on fdqpe.nct_id = ntbtu.nct_id
order by fdqpe.nct_id, fdqpe.earliest_date_observed
;
"
)
df <- fetch(query, n = -1)
df <- na.omit(df)
query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;")
n_categories <- fetch(query2, n = -1)
return(list(data=df,ncat=n_categories))
}
get_counterfact_base <- function(driver) {
con <- dbConnect(
driver,
user='root',
password='root',
dbname='aact_db',
host=host
)
on.exit(dbDisconnect(con))
query <- dbSendQuery(
con,
"
with cte as (
--get last recruiting state
select fd.nct_id, max(fd.earliest_date_observed),min(fd2.earliest_date_observed) as tmstmp
from formatted_data fd
join formatted_data fd2
on fd.nct_id=fd2.nct_id and fd.earliest_date_observed < fd2.earliest_date_observed
where fd.current_status = 'Recruiting'
and fd2.current_status = 'Active, not recruiting'
group by fd.nct_id
)
select
fdqpe.nct_id
--,fdqpe.start_date
--,fdqpe.current_enrollment
--,fdqpe.enrollment_category
,fdqpe.current_status
,fdqpe.earliest_date_observed
,fdqpe.elapsed_duration
,fdqpe.n_brands as identical_brands
,ntbtu.brand_name_counts
,fdqpe.category_id
,fdqpe.final_status
,fdqpe.h_sdi_val
--,fdqpe.h_sdi_u95
--,fdqpe.h_sdi_l95
,fdqpe.hm_sdi_val
--,fdqpe.hm_sdi_u95
--,fdqpe.hm_sdi_l95
,fdqpe.m_sdi_val
--,fdqpe.m_sdi_u95
--,fdqpe.m_sdi_l95
,fdqpe.lm_sdi_val
--,fdqpe.lm_sdi_u95
--,fdqpe.lm_sdi_l95
,fdqpe.l_sdi_val
--,fdqpe.l_sdi_u95
--,fdqpe.l_sdi_l95
from formatted_data_with_planned_enrollment fdqpe
join \"Formularies\".nct_to_brand_counts_through_uspdc ntbtu
on fdqpe.nct_id = ntbtu.nct_id
join cte
on fdqpe.nct_id = cte.nct_id
and fdqpe.earliest_date_observed = cte.tmstmp
order by fdqpe.nct_id, fdqpe.earliest_date_observed
;
"
)
df <- fetch(query, n = -1)
df <- na.omit(df)
query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;")
n_categories <- fetch(query2, n = -1)
return(list(data=df,ncat=n_categories))
}
d <- get_data(driver)
df <- d$data
n_categories <- d$ncat
cf <- get_counterfact_base(driver)
df_counterfact_base <- cf$data
################ Format Data ###########################
data_formatter <- function(df) {
categories <- df["category_id"]
x <- df["elapsed_duration"]
x["identical_brands"] <- asinh(df$identical_brands)
x["brand_name_counts"] <- asinh(df$brand_name_count)
x["h_sdi_val"] <- asinh(df$h_sdi_val)
x["hm_sdi_val"] <- asinh(df$hm_sdi_val)
x["m_sdi_val"] <- asinh(df$m_sdi_val)
x["lm_sdi_val"] <- asinh(df$lm_sdi_val)
x["l_sdi_val"] <- asinh(df$l_sdi_val)
#Setup fixed effects
x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0)
x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0)
x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0)
x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0)
y <- ifelse(df["final_status"]=="Terminated",1,0)
#get category list
return(list(x=x,y=y))
}
train <- data_formatter(df)
counterfact_base <- data_formatter(df_counterfact_base)
categories <- df$category_id
x <- train$x
y <- train$y
x_cf_base <- counterfact_base$x
y_cf_base <- counterfact_base$y
cf_categories <- df_counterfact_base$category_id
```
# Fit Model
```{r}
################################# FIT MODEL #########################################
inherited_cols <- c(
"elapsed_duration"
#,"identical_brands"
#,"brand_name_counts"
,"h_sdi_val"
,"hm_sdi_val"
,"m_sdi_val"
,"lm_sdi_val"
,"l_sdi_val"
,"status_NYR"# TODO: may need to remove
,"status_EBI"
,"status_Rec"
,"status_ANR"
)
```
```{r}
beta_list <- list(
groups = c(
`1`="Infections & Parasites",
`2`="Neoplasms",
`3`="Blood & Immune system",
`4`="Endocrine, Nutritional, and Metabolic",
`5`="Mental & Behavioral",
`6`="Nervous System",
`7`="Eye and Adnexa",
`8`="Ear and Mastoid",
`9`="Circulatory",
`10`="Respiratory",
`11`="Digestive",
`12`="Skin & Subcutaneaous tissue",
`13`="Musculoskeletal",
`14`="Genitourinary",
`15`="Pregancy, Childbirth, & Puerperium",
`16`="Perinatal Period",
`17`="Congential",
`18`="Symptoms, Signs etc.",
`19`="Injury etc.",
`20`="External Causes",
`21`="Contact with Healthcare",
`22`="Special Purposes"
),
parameters = c(
`1`="Elapsed Duration",
# brands
`2`="asinh(Generic Brands)",
`3`="asinh(Competitors USPDC)",
# population
`4`="asinh(High SDI)",
`5`="asinh(High-Medium SDI)",
`6`="asinh(Medium SDI)",
`7`="asinh(Low-Medium SDI)",
`8`="asinh(Low SDI)",
#Status
`9`="status_NYR",
`10`="status_EBI",
`11`="status_Rec",
`12`="status_ANR"
)
)
get_parameters <- function(stem,class_list) {
#get categories and lengths
named <- names(class_list)
lengths <- sapply(named, (function (x) length(class_list[[x]])))
#describe the grid needed
iter_list <- sapply(named, (function (x) 1:lengths[x]))
#generate the list of parameters
pardf <- generate_parameter_df(stem, iter_list)
#add columns with appropriate human-readable names
for (name in named) {
pardf[paste(name,"_hr",sep="")] <- as.factor(
sapply(pardf[name], (function (i) class_list[[name]][i]))
)
}
return(pardf)
}
generate_parameter_df <- function(stem, iter_list) {
grid <- expand.grid(iter_list)
grid["param_name"] <- grid %>% unite(x,colnames(grid),sep=",")
grid["param_name"] <- paste(stem,"[",grid$param_name,"]",sep="")
return(grid)
}
group_mcmc_areas <- function(
stem,# = "beta"
class_list,# = beta_list
stanfit,# = fit
group_id,# = 2
rename=TRUE,
filter=NULL
) {
#get all parameter names
params <- get_parameters(stem,class_list)
#filter down to parameters of interest
params <- filter(params,groups == group_id)
#Get dataframe with only the rows of interest
filtdata <- as.data.frame(stanfit)[params$param_name]
#rename columns
if (rename) dimnames(filtdata)[[2]] <- params$parameters_hr
#get group name for title
group_name <- class_list$groups[group_id]
#create area plot with appropriate title
p <- mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
ggtitle(paste("Parameter distributions for ICD-10 class:",group_name)) +
geom_vline(xintercept=0,color="grey",alpha=0.75)
d <- pivot_longer(filtdata, everything()) |>
group_by(name) |>
summarize(
mean=mean(value)
,q025 = quantile(value,probs = 0.025)
,q975 = quantile(value,probs = 0.975)
,q05 = quantile(value,probs = 0.05)
,q95 = quantile(value,probs = 0.95)
)
return(list(plot=p,quantiles=d,name=group_name))
}
parameter_mcmc_areas <- function(
stem,# = "beta"
class_list,# = beta_list
stanfit,# = fit
parameter_id,# = 2
rename=TRUE
) {
#get all parameter names
params <- get_parameters(stem,class_list)
#filter down to parameters of interest
params <- filter(params,parameters == parameter_id)
#Get dataframe with only the rows of interest
filtdata <- as.data.frame(stanfit)[params$param_name]
#rename columns
if (rename) dimnames(filtdata)[[2]] <- params$groups_hr
#get group name for title
parameter_name <- class_list$parameters[parameter_id]
#create area plot with appropriate title
p <- mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
ggtitle(parameter_name,"Parameter Distribution")
d <- pivot_longer(filtdata, everything()) |>
group_by(name) |>
summarize(
mean=mean(value)
,q025 = quantile(value,probs = 0.025)
,q975 = quantile(value,probs = 0.975)
,q05 = quantile(value,probs = 0.05)
,q95 = quantile(value,probs = 0.95)
)
return(list(plot=p,quantiles=d,name=parameter_name))
}
```
Plan: select all snapshots that are the first to have closed enrollment (Rec -> ANR)
```{r}
#delay intervention
intervention_enrollment <- x_cf_base[c(inherited_cols,"brand_name_counts", "identical_brands")]
intervention_enrollment["status_ANR"] <- 0
intervention_enrollment["status_Rec"] <- 1
```
```{r}
counterfact_delay <- list(
D = ncol(x),#
N = nrow(x),
L = n_categories$count,
y = as.vector(y),
ll = as.vector(categories),
x = as.matrix(x),
mu_mean = 0,
mu_stdev = 0.05,
sigma_shape = 4,
sigma_rate = 20,
Nx = nrow(x_cf_base),
llx = as.vector(cf_categories),
counterfact_x_tilde = as.matrix(intervention_enrollment),
counterfact_x = as.matrix(x_cf_base)
)
```
```{r}
fit <- stan(
file='Hierarchal_Logistic.stan',
data = counterfact_delay,
chains = 4,
iter = 5000,
seed = 11021585
)
```
## Explore data
```{r}
#get number of trials and snapshots in each category
group_trials_by_category <- as.data.frame(aggregate(category_id ~ nct_id, df, max))
group_trials_by_category <- as.data.frame(group_trials_by_category)
category_count <- group_trials_by_category |> group_by(category_id) |> count()
```
## Fit Results
```{r}
################################# ANALYZE #####################################
print(fit)
```
# Counterfactuals
```{r}
generated_ib <- gqs(
fit@stanmodel,
data=counterfact_delay,
draws=as.matrix(fit),
seed=11021585
)
```
```{r}
df_ib_p <- data.frame(
p_prior=as.vector(extract(generated_ib, pars="p_prior")$p_prior)
,p_predicted = as.vector(extract(generated_ib, pars="p_predicted")$p_predicted)
)
df_ib_prior <- data.frame(
mu_prior = as.vector(extract(generated_ib, pars="mu_prior")$mu_prior)
,sigma_prior = as.vector(extract(generated_ib, pars="sigma_prior")$sigma_prior)
)
#p_prior
ggplot(df_ib_p, aes(x=p_prior)) +
geom_density() +
labs(
title="Implied Prior Distribution P"
,subtitle=""
,x="Probability Domain 'p'"
,y="Probability Density"
)
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/prior_p.png")
#p_posterior
ggplot(df_ib_p, aes(x=p_predicted)) +
geom_density() +
labs(
title="Implied Posterior Distribution P"
,subtitle=""
,x="Probability Domain 'p'"
,y="Probability Density"
)
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/posterior_p.png")
#mu_prior
ggplot(df_ib_prior) +
geom_density(aes(x=mu_prior)) +
labs(
title="Prior - Mu"
,subtitle="same prior for all Mu values"
,x="Mu"
,y="Probability"
)
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/prior_mu.png")
#sigma_posterior
ggplot(df_ib_prior) +
geom_density(aes(x=sigma_prior)) +
labs(
title="Prior - Sigma"
,subtitle="same prior for all Sigma values"
,x="Sigma"
,y="Probability"
)
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/prior_sigma.png")
```
```{r}
check_hmc_diagnostics(fit)
```
### Intervention: Delay close of enrollment
```{r}
counterfact_predicted_ib <- data.frame(
p_predicted_default = as.vector(extract(generated_ib, pars="p_predicted_default")$p_predicted_default)
,p_predicted_intervention = as.vector(extract(generated_ib, pars="p_predicted_intervention")$p_predicted_intervention)
,predicted_difference = as.vector(extract(generated_ib, pars="predicted_difference")$predicted_difference)
)
ggplot(counterfact_predicted_ib, aes(x=p_predicted_default)) +
geom_density() +
labs(
title="Predicted Distribution of 'p'"
,subtitle="Intervention: None"
,x="Probability Domain 'p'"
,y="Probability Density"
)
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/default_p_generic_intervention_base.png")
ggplot(counterfact_predicted_ib, aes(x=p_predicted_intervention)) +
geom_density() +
labs(
title="Predicted Distribution of 'p'"
,subtitle="Intervention: Delay close of enrollment"
,x="Probability Domain 'p'"
,y="Probability Density"
)
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/default_p_generic_intervention_interv.png")
ggplot(counterfact_predicted_ib, aes(x=predicted_difference)) +
geom_density() +
labs(
title="Predicted Distribution of differences 'p'"
,subtitle="Intervention: Delay close of enrollment"
,x="Difference in 'p' under treatment"
,y="Probability Density"
)
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/default_p_generic_intervention_distdiff.png")
```
```{r}
get_category_count <- function(tbl, id) {
result <- tbl$n[tbl$category_id == id]
if(length(result) == 0) 0 else result
}
category_names <- sapply(1:length(beta_list$groups),
function(i) sprintf("ICD-10 #%d: %s (n=%d)",
i,
beta_list$groups[i],
get_category_count(category_count, i)))
```
```{r}
pddf_ib <- data.frame(extract(generated_ib, pars="predicted_difference")$predicted_difference) |>
pivot_longer(X1:X168) #CHANGE_NOTE: moved from X169 to X168
#TODO: Fix Category names
pddf_ib["entry_idx"] <- as.numeric(gsub("\\D","",pddf_ib$name))
pddf_ib["category"] <- sapply(pddf_ib$entry_idx, function(i) df$category_id[i])
pddf_ib["category_name"] <- sapply(
pddf_ib$category,
function(i) category_names[i]
)
ggplot(pddf_ib, aes(x=value,)) +
geom_density(adjust=1/5) +
labs(
title = "Distribution of predicted differences"
,subtitle = "Intervention: Delay close of enrollment"
,x = "Difference in probability due to intervention"
,y = "Probability Density"
) +
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed")
#todo: add median, mean, 40/60 quantiles as well as
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/p_delay_intervention_distdiff_styled.png")
ggplot(pddf_ib, aes(x=value,)) +
geom_density(adjust=1/5) +
facet_wrap(
~factor(
category_name,
levels=category_names
)
, labeller = label_wrap_gen(multi_line = TRUE)
, ncol=4) +
labs(
title = "Distribution of predicted differences | By Group"
,subtitle = "Intervention: Delay close of enrollment"
,x = "Difference in probability due to intervention"
,y = "Probability Density"
) +
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
theme(strip.text.x = element_text(size = 8))
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/p_delay_intervention_distdiff_by_group.png")
ggplot(pddf_ib, aes(x=value,)) +
geom_histogram(bins=300) +
facet_wrap(
~factor(
category_name,
levels=category_names
)
, labeller = label_wrap_gen(multi_line = TRUE)
, ncol=4) +
labs(
title = "Histogram of predicted differences | By Group"
,subtitle = "Intervention: Delay close of enrollment"
,x = "Difference in probability due to intervention"
,y = "Predicted counts"
) +
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
theme(strip.text.x = element_text(size = 8))
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/p_delay_intervention_histdiff_by_group.png")
```
```{r}
p3 <- ggplot(pddf_ib, aes(x=value,)) +
geom_histogram(bins=500) +
labs(
title = "Distribution of predicted differences"
,subtitle = "Intervention: Delay close of enrollment"
,x = "Difference in probability due to intervention"
,y = "Probability Density"
,caption = "Vertical marks: 5/10/25/50/75/90/95th percentiles. Dot shows mean."
) +
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed")
stats <- list(
p5 = quantile(pddf_ib$value, 0.05),
p10 = quantile(pddf_ib$value, 0.10),
q1 = quantile(pddf_ib$value, 0.25),
med = median(pddf_ib$value),
mean = mean(pddf_ib$value),
q3 = quantile(pddf_ib$value, 0.75),
p90 = quantile(pddf_ib$value, 0.90),
p95 = quantile(pddf_ib$value, 0.95),
max_height = max(ggplot_build(p3)$data[[1]]$count),
y_offset = -max(ggplot_build(p3)$data[[1]]$count) * 0.05
)
p3 +
# Box
geom_segment(data = data.frame(
x = c(stats$q1, stats$q3, stats$med),
xend = c(stats$q1, stats$q3, stats$med),
y = rep(stats$y_offset, 3),
yend = rep(stats$y_offset * 2, 3)
), aes(x = x, xend = xend, y = y, yend = yend)) +
geom_segment(data = data.frame(
x = rep(stats$q1, 2),
xend = rep(stats$q3, 2),
y = c(stats$y_offset, stats$y_offset * 2),
yend = c(stats$y_offset, stats$y_offset * 2)
), aes(x = x, xend = xend, y = y, yend = yend)) +
# Inner whiskers (Q1->P10, Q3->P90)
geom_segment(data = data.frame(
x = c(stats$q1, stats$q3),
xend = c(stats$p10, stats$p90),
y = rep(stats$y_offset * 1.5, 2),
yend = rep(stats$y_offset * 1.5, 2)
), aes(x = x, xend = xend, y = y, yend = yend)) +
# Crossbars at P10/P90
geom_segment(data = data.frame(
x = c(stats$p10, stats$p90),
xend = c(stats$p10, stats$p90),
y = stats$y_offset * 1.3,
yend = stats$y_offset * 1.7
), aes(x = x, xend = xend, y = y, yend = yend)) +
# Outer whiskers (P10->P5, P90->P95)
geom_segment(data = data.frame(
x = c(stats$p10, stats$p90),
xend = c(stats$p5, stats$p95),
y = rep(stats$y_offset * 1.5, 2),
yend = rep(stats$y_offset * 1.5, 2)
), aes(x = x, xend = xend, y = y, yend = yend)) +
# Crossbars at P5/P95
geom_segment(data = data.frame(
x = c(stats$p5, stats$p95),
xend = c(stats$p5, stats$p95),
y = stats$y_offset * 1.3,
yend = stats$y_offset * 1.7
), aes(x = x, xend = xend, y = y, yend = yend)) +
# Mean dot
geom_point(data = data.frame(
x = stats$mean,
y = stats$y_offset * 1.5
), aes(x = x, y = y))
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/p_delay_intervention_histdiff_boxplot.png")
```
```{r}
ggplot(pddf_ib, aes(x=value)) +
stat_ecdf(geom='step') +
labs(
title = "Cumulative distribution of predicted differences",
subtitle = "Intervention: Delay close of enrollment",
x = "Difference in probability of termination due to intervention",
y = "Cumulative Proportion"
)
ggsave("./EffectsOfEnrollmentDelay/Images/DirectEffects/p_delay_intervention_cumulative_distdiff.png")
```
Get the % of differences in the spike around zero
```{r}
# get values around and above/below spike
width <- 0.02
spike_band_centered_zero <- mean( pddf_ib$value >= -width/2 & pddf_ib$value <= width/2)
above_spike_band <- mean( pddf_ib$value >= width/2)
below_spike_band <- mean( pddf_ib$value <= -width/2)
# get mass above and mass below zero
mass_below_zero <- mean( pddf_ib$value <= 0)
```
Looking at the spike around zero, we find that `r spike_band_centered_zero*100`%
of the probability mass is contained within the band from
[`r -width*100/2`,`r width*100/2`].
Additionally, there was `r above_spike_band*100`% of the probability above that
-- representing those with a general increase in the probability of termination --
and `r below_spike_band*100`% of the probability mass below the band
-- representing a decrease in the probability of termination.
On average, if you keep the trial open instead of closing it,
`r mass_below_zero`% of trials will see a decrease in the probability of termination,
but, due to the high increase in probability of termination given termination was increased,
the mean probability of termination increases by `r stats$mean`.
```{r}
# 5%-iles
summary(pddf_ib$value)
# Create your quantiles
quants <- quantile(pddf_ib$value, probs = seq(0,1,0.05), type=4)
# Convert to a data frame
quant_df <- data.frame(
Percentile = names(quants),
Value = quants
)
kable(quant_df)
```
```{r}
n = length(counterfact_predicted_ib$p_predicted_intervention)
k = 100
simulated_terminations_intervention <- mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_intervention)))
simulated_terminations_base <-mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_default)))
simulated_percentages <- (simulated_terminations_intervention - simulated_terminations_base)/k
```
The simulation above shows that this results in a percentage-point increase of about
`r simulated_percentages * 100`.
# Diagnostics
```{r}
#| eval: true
#trace plots
plot(fit, pars=c("mu"), plotfun="trace")
for (i in 1:3) {
print(
mcmc_rank_overlay(
fit,
pars=c(
paste0("mu[",4*i-3,"]"),
paste0("mu[",4*i-2,"]"),
paste0("mu[",4*i-1,"]"),
paste0("mu[",4*i,"]")
),
n_bins=100
)+ legend_move("top") +
scale_colour_ghibli_d("KikiMedium")
)
}
```
```{r}
#| eval: true
plot(fit, pars=c("sigma"), plotfun="trace")
for (i in 1:3) {
print(
mcmc_rank_overlay(
fit,
pars=c(
paste0("sigma[",4*i-3,"]"),
paste0("sigma[",4*i-2,"]"),
paste0("sigma[",4*i-1,"]"),
paste0("sigma[",4*i,"]")
),
n_bins=100
)+ legend_move("top") +
scale_colour_ghibli_d("KikiMedium")
)
}
```
```{r}
#| eval: true
#other diagnostics
logpost <- log_posterior(fit)
nuts_prmts <- nuts_params(fit)
posterior <- as.array(fit)
```
```{r}
#| eval: true
color_scheme_set("darkgray")
div_style <- parcoord_style_np(div_color = "green", div_size = 0.05, div_alpha = 0.4)
mcmc_parcoord(posterior, regex_pars = "mu", np=nuts_prmts, np_style = div_style, alpha = 0.05)
```
```{r}
#| eval: true
for (i in 1:3) {
mus = sapply(3:0, function(j) paste0("mu[",4*i-j ,"]"))
print(
mcmc_pairs(
posterior,
np = nuts_prmts,
pars=c(
mus,
"lp__"
),
off_diag_args = list(size = 0.75)
)
)
}
```
```{r}
#| eval: true
mcmc_parcoord(posterior,regex_pars = "sigma", np=nuts_prmts, alpha=0.05)
```
```{r}
#| eval: true
for (i in 1:3) {
params = sapply(3:0, function(j) paste0("sigma[",4*i-j ,"]"))
print(
mcmc_pairs(
posterior,
np = nuts_prmts,
pars=c(
params,
"lp__"
),
off_diag_args = list(size = 0.75)
)
)
}
```
```{r}
#| eval: true
for (k in 1:22) {
for (i in 1:3) {
params = sapply(3:0, function(j) paste0("beta[",k,",",4*i-j ,"]"))
print(
mcmc_pairs(
posterior,
np = nuts_prmts,
pars=c(
params,
"lp__"
),
off_diag_args = list(size = 0.75)
)
)
}}
```
# TODO
- [ ] Double check data flow. (Write summary of this in human readable form)
- Is it the data we want from the database
- Training
- Counterfactual Evaluation
- choose a single snapshot per trial.
- Is the model in STAN well specified.
- [ ] work on LOO validation of model

Binary file not shown.

Before

Width:  |  Height:  |  Size: 79 KiB

After

Width:  |  Height:  |  Size: 75 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 93 KiB

After

Width:  |  Height:  |  Size: 85 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 79 KiB

After

Width:  |  Height:  |  Size: 73 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 173 KiB

After

Width:  |  Height:  |  Size: 194 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 96 KiB

After

Width:  |  Height:  |  Size: 80 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 128 KiB

After

Width:  |  Height:  |  Size: 147 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 76 KiB

After

Width:  |  Height:  |  Size: 67 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 81 KiB

After

Width:  |  Height:  |  Size: 81 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 74 KiB

After

Width:  |  Height:  |  Size: 74 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 85 KiB

After

Width:  |  Height:  |  Size: 84 KiB

Loading…
Cancel
Save