You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1114 lines
28 KiB
Plaintext
1114 lines
28 KiB
Plaintext
---
|
|
title: "The Effects of market conditions on recruitment and completion of clinical trials"
|
|
author: "Will King"
|
|
format: html
|
|
editor: source
|
|
---
|
|
|
|
IMPORTANT: Not setup yet
|
|
|
|
|
|
# Setup
|
|
|
|
```{r}
|
|
library(bayesplot)
|
|
available_mcmc(pattern = "_nuts_")
|
|
library(ggplot2)
|
|
library(patchwork)
|
|
library(tidyverse)
|
|
library(rstan)
|
|
library(tidyr)
|
|
library(ghibli)
|
|
library(xtable)
|
|
#Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
|
|
|
|
#save unchanged models instead of recompiling
|
|
rstan_options(auto_write = TRUE)
|
|
#allow for multithreaded sampling
|
|
options(mc.cores = parallel::detectCores())
|
|
|
|
#test installation, shouldn't get any errors
|
|
#example(stan_model, package = "rstan", run.dontrun = TRUE)
|
|
```
|
|
|
|
```{r}
|
|
################ Pull data from database ######################
|
|
library(RPostgreSQL)
|
|
|
|
driver <- dbDriver("PostgreSQL")
|
|
|
|
get_data <- function(driver) {
|
|
|
|
con <- dbConnect(
|
|
driver,
|
|
user='root',
|
|
password='root',
|
|
dbname='aact_db',
|
|
host='will-office'
|
|
)
|
|
on.exit(dbDisconnect(con))
|
|
|
|
query <- dbSendQuery(
|
|
con,
|
|
# "select * from formatted_data_with_planned_enrollment;"
|
|
"
|
|
select
|
|
fdqpe.nct_id
|
|
--,fdqpe.start_date
|
|
--,fdqpe.current_enrollment
|
|
--,fdqpe.enrollment_category
|
|
,fdqpe.current_status
|
|
,fdqpe.earliest_date_observed
|
|
,fdqpe.elapsed_duration
|
|
,fdqpe.n_brands as identical_brands
|
|
,ntbtu.brand_name_count
|
|
,fdqpe.category_id
|
|
,fdqpe.final_status
|
|
,fdqpe.h_sdi_val
|
|
--,fdqpe.h_sdi_u95
|
|
--,fdqpe.h_sdi_l95
|
|
,fdqpe.hm_sdi_val
|
|
--,fdqpe.hm_sdi_u95
|
|
--,fdqpe.hm_sdi_l95
|
|
,fdqpe.m_sdi_val
|
|
--,fdqpe.m_sdi_u95
|
|
--,fdqpe.m_sdi_l95
|
|
,fdqpe.lm_sdi_val
|
|
--,fdqpe.lm_sdi_u95
|
|
--,fdqpe.lm_sdi_l95
|
|
,fdqpe.l_sdi_val
|
|
--,fdqpe.l_sdi_u95
|
|
--,fdqpe.l_sdi_l95
|
|
from formatted_data_with_planned_enrollment fdqpe
|
|
join \"Formularies\".nct_to_brands_through_uspdc ntbtu
|
|
on fdqpe.nct_id = ntbtu.nct_id
|
|
order by fdqpe.nct_id, fdqpe.earliest_date_observed
|
|
;
|
|
"
|
|
)
|
|
df <- fetch(query, n = -1)
|
|
df <- na.omit(df)
|
|
|
|
query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;")
|
|
n_categories <- fetch(query2, n = -1)
|
|
|
|
return(list(data=df,ncat=n_categories))
|
|
}
|
|
|
|
d <- get_data(driver)
|
|
df <- d$data
|
|
n_categories <- d$ncat
|
|
|
|
|
|
|
|
|
|
################ Format Data ###########################
|
|
|
|
data_formatter <- function(df) {
|
|
categories <- df["category_id"]
|
|
|
|
x <- df["elapsed_duration"]
|
|
x["identical_brands"] <- asinh(df$identical_brands)
|
|
x["brand_name_counts"] <- asinh(df$brand_name_count)
|
|
x["h_sdi_val"] <- asinh(df$h_sdi_val)
|
|
x["hm_sdi_val"] <- asinh(df$hm_sdi_val)
|
|
x["m_sdi_val"] <- asinh(df$m_sdi_val)
|
|
x["lm_sdi_val"] <- asinh(df$lm_sdi_val)
|
|
x["l_sdi_val"] <- asinh(df$l_sdi_val)
|
|
|
|
|
|
#Setup fixed effects
|
|
x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0)
|
|
x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0)
|
|
x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0)
|
|
x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0)
|
|
|
|
|
|
y <- ifelse(df["final_status"]=="Terminated",1,0)
|
|
|
|
#get category list
|
|
|
|
|
|
return(list(x=x,y=y))
|
|
}
|
|
|
|
train <- data_formatter(df)
|
|
|
|
categories <- df$category_id
|
|
|
|
x <- train$x
|
|
y <- train$y
|
|
```
|
|
|
|
|
|
|
|
# Fit Model
|
|
|
|
|
|
|
|
|
|
```{r}
|
|
################################# FIT MODEL #########################################
|
|
inherited_cols <- c(
|
|
"elapsed_duration"
|
|
#,"identical_brands"
|
|
#,"brand_name_counts"
|
|
,"h_sdi_val"
|
|
,"hm_sdi_val"
|
|
,"m_sdi_val"
|
|
,"lm_sdi_val"
|
|
,"l_sdi_val"
|
|
,"status_NYR"
|
|
,"status_EBI"
|
|
,"status_Rec"
|
|
,"status_ANR"
|
|
)
|
|
```
|
|
|
|
|
|
|
|
```{r}
|
|
beta_list <- list(
|
|
groups = c(
|
|
`1`="Infections & Parasites",
|
|
`2`="Neoplasms",
|
|
`3`="Blood & Immune system",
|
|
`4`="Endocrine, Nutritional, and Metabolic",
|
|
`5`="Mental & Behavioral",
|
|
`6`="Nervous System",
|
|
`7`="Eye and Adnexa",
|
|
`8`="Ear and Mastoid",
|
|
`9`="Circulatory",
|
|
`10`="Respiratory",
|
|
`11`="Digestive",
|
|
`12`="Skin & Subcutaneaous tissue",
|
|
`13`="Musculoskeletal",
|
|
`14`="Genitourinary",
|
|
`15`="Pregancy, Childbirth, & Puerperium",
|
|
`16`="Perinatal Period",
|
|
`17`="Congential",
|
|
`18`="Symptoms, Signs etc.",
|
|
`19`="Injury etc.",
|
|
`20`="External Causes",
|
|
`21`="Contact with Healthcare",
|
|
`22`="Special Purposes"
|
|
),
|
|
parameters = c(
|
|
`1`="Elapsed Duration",
|
|
# brands
|
|
`2`="asinh(Generic Brands)",
|
|
`3`="asinh(Competitors USPDC)",
|
|
# population
|
|
`4`="asinh(High SDI)",
|
|
`5`="asinh(High-Medium SDI)",
|
|
`6`="asinh(Medium SDI)",
|
|
`7`="asinh(Low-Medium SDI)",
|
|
`8`="asinh(Low SDI)",
|
|
#Status
|
|
`9`="status_NYR",
|
|
`10`="status_EBI",
|
|
`11`="status_Rec",
|
|
`12`="status_ANR"
|
|
)
|
|
)
|
|
|
|
get_parameters <- function(stem,class_list) {
|
|
#get categories and lengths
|
|
named <- names(class_list)
|
|
lengths <- sapply(named, (function (x) length(class_list[[x]])))
|
|
|
|
#describe the grid needed
|
|
iter_list <- sapply(named, (function (x) 1:lengths[x]))
|
|
|
|
#generate the list of parameters
|
|
pardf <- generate_parameter_df(stem, iter_list)
|
|
|
|
#add columns with appropriate human-readable names
|
|
for (name in named) {
|
|
pardf[paste(name,"_hr",sep="")] <- as.factor(
|
|
sapply(pardf[name], (function (i) class_list[[name]][i]))
|
|
)
|
|
}
|
|
|
|
return(pardf)
|
|
}
|
|
|
|
generate_parameter_df <- function(stem, iter_list) {
|
|
grid <- expand.grid(iter_list)
|
|
grid["param_name"] <- grid %>% unite(x,colnames(grid),sep=",")
|
|
grid["param_name"] <- paste(stem,"[",grid$param_name,"]",sep="")
|
|
return(grid)
|
|
}
|
|
|
|
group_mcmc_areas <- function(
|
|
stem,# = "beta"
|
|
class_list,# = beta_list
|
|
stanfit,# = fit
|
|
group_id,# = 2
|
|
rename=TRUE,
|
|
filter=NULL
|
|
) {
|
|
#get all parameter names
|
|
params <- get_parameters(stem,class_list)
|
|
|
|
#filter down to parameters of interest
|
|
params <- filter(params,groups == group_id)
|
|
#Get dataframe with only the rows of interest
|
|
filtdata <- as.data.frame(stanfit)[params$param_name]
|
|
#rename columns
|
|
if (rename) dimnames(filtdata)[[2]] <- params$parameters_hr
|
|
#get group name for title
|
|
group_name <- class_list$groups[group_id]
|
|
#create area plot with appropriate title
|
|
p <- mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
|
|
ggtitle(paste("Parameter distributions for ICD-10 class:",group_name)) +
|
|
geom_vline(xintercept=0,color="grey",alpha=0.75)
|
|
|
|
d <- pivot_longer(filtdata, everything()) |>
|
|
group_by(name) |>
|
|
summarize(
|
|
mean=mean(value)
|
|
,q025 = quantile(value,probs = 0.025)
|
|
,q975 = quantile(value,probs = 0.975)
|
|
,q05 = quantile(value,probs = 0.05)
|
|
,q95 = quantile(value,probs = 0.95)
|
|
)
|
|
return(list(plot=p,quantiles=d,name=group_name))
|
|
}
|
|
|
|
parameter_mcmc_areas <- function(
|
|
stem,# = "beta"
|
|
class_list,# = beta_list
|
|
stanfit,# = fit
|
|
parameter_id,# = 2
|
|
rename=TRUE
|
|
) {
|
|
#get all parameter names
|
|
params <- get_parameters(stem,class_list)
|
|
#filter down to parameters of interest
|
|
params <- filter(params,parameters == parameter_id)
|
|
#Get dataframe with only the rows of interest
|
|
filtdata <- as.data.frame(stanfit)[params$param_name]
|
|
#rename columns
|
|
if (rename) dimnames(filtdata)[[2]] <- params$groups_hr
|
|
#get group name for title
|
|
parameter_name <- class_list$parameters[parameter_id]
|
|
#create area plot with appropriate title
|
|
p <- mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
|
|
ggtitle(parameter_name,"Parameter Distribution")
|
|
|
|
d <- pivot_longer(filtdata, everything()) |>
|
|
group_by(name) |>
|
|
summarize(
|
|
mean=mean(value)
|
|
,q025 = quantile(value,probs = 0.025)
|
|
,q975 = quantile(value,probs = 0.975)
|
|
,q05 = quantile(value,probs = 0.05)
|
|
,q95 = quantile(value,probs = 0.95)
|
|
)
|
|
return(list(plot=p,quantiles=d,name=parameter_name))
|
|
}
|
|
|
|
|
|
```
|
|
|
|
|
|
```{r}
|
|
#generics intervention
|
|
brand_intervention_ib <- x[c(inherited_cols,"brand_name_counts")]
|
|
brand_intervention_ib["identical_brands"] <- asinh(sinh(x$identical_brands)+1) #add a single generic brand
|
|
```
|
|
|
|
```{r}
|
|
counterfact_marketing_ib <- list(
|
|
D = ncol(x),#
|
|
N = nrow(x),
|
|
L = n_categories$count,
|
|
y = as.vector(y),
|
|
ll = as.vector(categories),
|
|
x = as.matrix(x),
|
|
mu_mean = 0,
|
|
mu_stdev = 0.05,
|
|
sigma_shape = 4,
|
|
sigma_rate = 20,
|
|
Nx = nrow(x),
|
|
llx = as.vector(categories),
|
|
counterfact_x_tilde = as.matrix(brand_intervention_ib),
|
|
counterfact_x = as.matrix(x)
|
|
)
|
|
```
|
|
|
|
```{r}
|
|
fit <- stan(
|
|
file='Hierarchal_Logistic.stan',
|
|
data = counterfact_marketing_ib,
|
|
chains = 4,
|
|
iter = 5000,
|
|
seed = 11021585
|
|
)
|
|
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
## Explore data
|
|
|
|
```{r}
|
|
################################# DATA EXPLORATION ############################
|
|
driver <- dbDriver("PostgreSQL")
|
|
|
|
con <- dbConnect(
|
|
driver,
|
|
user='root',
|
|
password='root',
|
|
dbname='aact_db',
|
|
host='will-office'
|
|
)
|
|
#Plot histogram of count of snapshots
|
|
df3 <- dbGetQuery(
|
|
con,
|
|
"select nct_id,final_status,count(*) from formatted_data_with_planned_enrollment fdwpe
|
|
group by nct_id,final_status ;"
|
|
)
|
|
#df3 <- fetch(query3, n = -1)
|
|
|
|
ggplot(data=df3, aes(x=count, fill=final_status)) +
|
|
geom_histogram(binwidth=1) +
|
|
ggtitle("Histogram of snapshots per trial (matched trials)") +
|
|
xlab("Snapshots per trial")
|
|
ggsave("./Images/HistSnapshots.png")
|
|
|
|
#Plot duration for terminated vs completed
|
|
df4 <- dbGetQuery(
|
|
con,
|
|
"
|
|
select
|
|
nct_id,
|
|
start_date ,
|
|
primary_completion_date,
|
|
overall_status ,
|
|
primary_completion_date - start_date as duration
|
|
from ctgov.studies s
|
|
where nct_id in (select distinct nct_id from http.download_status ds)
|
|
;"
|
|
)
|
|
#df4 <- fetch(query4, n = -1)
|
|
|
|
ggplot(data=df4, aes(x=duration,fill=overall_status)) +
|
|
geom_histogram()+
|
|
ggtitle("Histogram of trial durations") +
|
|
xlab("duration")+
|
|
facet_wrap(~overall_status)
|
|
ggsave("./Images/HistTrialDurations_Faceted.png")
|
|
|
|
df5 <- dbGetQuery(
|
|
con,
|
|
"
|
|
with cte1 as (
|
|
select
|
|
nct_id,
|
|
start_date ,
|
|
primary_completion_date,
|
|
overall_status ,
|
|
primary_completion_date - start_date as duration
|
|
from ctgov.studies s
|
|
where nct_id in (select distinct nct_id from http.download_status ds)
|
|
), cte2 as (
|
|
select nct_id,count(*) as snapshot_count from formatted_data_with_planned_enrollment fdwpe
|
|
group by nct_id
|
|
)
|
|
select a.nct_id, a.overall_status, a.duration,b.snapshot_count
|
|
from cte1 as a
|
|
join cte2 as b
|
|
on a.nct_id=b.nct_id
|
|
;"
|
|
)
|
|
df5$overall_status <- as.factor(df5$overall_status)
|
|
|
|
ggplot(data=df5, aes(x=duration,y=snapshot_count,color=overall_status)) +
|
|
geom_jitter() +
|
|
ggtitle("Comparison of duration, status, and snapshot_count") +
|
|
xlab("duration") +
|
|
ylab("snapshot count")
|
|
ggsave("./Images/SnapshotsVsDurationVsTermination.png")
|
|
|
|
dbDisconnect(con)
|
|
|
|
#get number of trials and snapshots in each category
|
|
group_trials_by_category <- as.data.frame(aggregate(category_id ~ nct_id, df, max))
|
|
group_trials_by_category <- as.data.frame(group_trials_by_category)
|
|
|
|
ggplot(data = group_trials_by_category, aes(x=category_id)) +
|
|
geom_bar(binwidth=1,color="black",fill="seagreen") +
|
|
scale_x_continuous(breaks=scales::pretty_breaks(n=22)) +
|
|
labs(
|
|
title="bar chart of trial categories"
|
|
,x="Category ID"
|
|
,y="Count"
|
|
)
|
|
ggsave("./Images/CategoryCounts.png")
|
|
|
|
|
|
|
|
summary(df5)
|
|
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
```{r}
|
|
category_count <- group_trials_by_category |> group_by(category_id) |> count()
|
|
|
|
```
|
|
|
|
|
|
|
|
|
|
|
|
## Fit Results
|
|
|
|
|
|
```{r}
|
|
################################# ANALYZE #####################################
|
|
print(fit)
|
|
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
### Investigating parameter distributions
|
|
|
|
```{r}
|
|
#g1 <- group_mcmc_areas("beta",beta_list,fit,1)
|
|
|
|
|
|
gx <- c()
|
|
|
|
#grab parameters for every category with more than 8 observations
|
|
for (i in category_count$category_id[category_count$n >= 8]) {
|
|
print(i)
|
|
|
|
#Print parameter distributions
|
|
gi <- group_mcmc_areas("beta",beta_list,fit,i) #add way to filter groups
|
|
ggsave(
|
|
paste0("./Images/DirectEffects/Parameters/group_",i,"_",gi$name,".png")
|
|
,plot=gi$plot
|
|
)
|
|
gx <- c(gx,gi)
|
|
|
|
#Get Quantiles and means for parameters
|
|
table <- xtable(gi$quantiles,
|
|
floating=FALSE
|
|
,latex.environments = NULL
|
|
,booktabs = TRUE
|
|
,zap=getOption("digits")
|
|
)
|
|
write_lines(table,paste0("./latex_output/DirectEffects/group_",gi$name,".tex"))
|
|
}
|
|
```
|
|
|
|
|
|
|
|
```{r}
|
|
px <- c()
|
|
|
|
|
|
for (i in c(1,2,3,9,10,11,12)) {
|
|
|
|
#Print parameter distributions
|
|
pi <- parameter_mcmc_areas("beta",beta_list,fit,i) #add way to filter groups
|
|
ggsave(
|
|
paste0("./Images/DirectEffects/Parameters/parameters_",i,"_",pi$name,".png")
|
|
,plot=pi$plot
|
|
)
|
|
px <- c(px,pi)
|
|
|
|
#Get Quantiles and means for parameters
|
|
table <- xtable(pi$quantiles,
|
|
floating=FALSE
|
|
,latex.environments = NULL
|
|
,booktabs = TRUE
|
|
,zap=getOption("digits")
|
|
)
|
|
write_lines(table,paste0("./latex_output/DirectEffects/parameters_",i,"_",pi$name,".tex"))
|
|
|
|
}
|
|
```
|
|
|
|
Note these have 95% outer CI and 80% inner (shaded)
|
|
|
|
|
|
1) "Elapsed Duration",
|
|
2) "asinh(Generic Brands)",
|
|
3) "asinh(Competitors USPDC)",
|
|
4) "asinh(High SDI)",
|
|
5) "asinh(High-Medium SDI)",
|
|
6) "asinh(Medium SDI)",
|
|
7) "asinh(Low-Medium SDI)",
|
|
8) "asinh(Low SDI)",
|
|
9) "status_NYR",
|
|
10) "status_EBI",
|
|
11) "status_Rec",
|
|
12) "status_ANR",
|
|
|
|
|
|
|
|
|
|
```{r}
|
|
print(px[4]$plot + px[7]$plot)
|
|
ggsave("./Images/DirectEffects/Parameters/2+3_generic_and_uspdc.png")
|
|
```
|
|
|
|
|
|
|
|
# Counterfactuals
|
|
|
|
```{r}
|
|
generated_ib <- gqs(
|
|
fit@stanmodel,
|
|
data=counterfact_marketing_ib,
|
|
draws=as.matrix(fit),
|
|
seed=11021585
|
|
)
|
|
```
|
|
|
|
```{r}
|
|
df_ib_p <- data.frame(
|
|
p_prior=as.vector(extract(generated_ib, pars="p_prior")$p_prior)
|
|
,p_predicted = as.vector(extract(generated_ib, pars="p_predicted")$p_predicted)
|
|
)
|
|
|
|
df_ib_prior <- data.frame(
|
|
mu_prior = as.vector(extract(generated_ib, pars="mu_prior")$mu_prior)
|
|
,sigma_prior = as.vector(extract(generated_ib, pars="sigma_prior")$sigma_prior)
|
|
)
|
|
|
|
#p_prior
|
|
ggplot(df_ib_p, aes(x=p_prior)) +
|
|
geom_density() +
|
|
labs(
|
|
title="Implied Prior Distribution P"
|
|
,subtitle=""
|
|
,x="Probability Domain 'p'"
|
|
,y="Probability Density"
|
|
)
|
|
ggsave("./Images/DirectEffects/prior_p.png")
|
|
|
|
#p_posterior
|
|
ggplot(df_ib_p, aes(x=p_predicted)) +
|
|
geom_density() +
|
|
labs(
|
|
title="Implied Posterior Distribution P"
|
|
,subtitle=""
|
|
,x="Probability Domain 'p'"
|
|
,y="Probability Density"
|
|
)
|
|
ggsave("./Images/DirectEffects/posterior_p.png")
|
|
|
|
#mu_prior
|
|
ggplot(df_ib_prior) +
|
|
geom_density(aes(x=mu_prior)) +
|
|
labs(
|
|
title="Prior - Mu"
|
|
,subtitle="same prior for all Mu values"
|
|
,x="Mu"
|
|
,y="Probability"
|
|
)
|
|
ggsave("./Images/DirectEffects/prior_mu.png")
|
|
|
|
#sigma_posterior
|
|
ggplot(df_ib_prior) +
|
|
geom_density(aes(x=sigma_prior)) +
|
|
labs(
|
|
title="Prior - Sigma"
|
|
,subtitle="same prior for all Sigma values"
|
|
,x="Sigma"
|
|
,y="Probability"
|
|
)
|
|
ggsave("./Images/DirectEffects/prior_sigma.png")
|
|
```
|
|
|
|
|
|
|
|
```{r}
|
|
check_hmc_diagnostics(fit)
|
|
```
|
|
|
|
|
|
|
|
|
|
|
|
### Intervention: Alternatives
|
|
|
|
#### Generic Alternative
|
|
|
|
```{r}
|
|
counterfact_predicted_ib <- data.frame(
|
|
p_predicted_default = as.vector(extract(generated_ib, pars="p_predicted_default")$p_predicted_default)
|
|
,p_predicted_intervention = as.vector(extract(generated_ib, pars="p_predicted_intervention")$p_predicted_intervention)
|
|
,predicted_difference = as.vector(extract(generated_ib, pars="predicted_difference")$predicted_difference)
|
|
)
|
|
|
|
|
|
ggplot(counterfact_predicted_ib, aes(x=p_predicted_default)) +
|
|
geom_density() +
|
|
labs(
|
|
title="Predicted Distribution of 'p'"
|
|
,subtitle="Intervention: None"
|
|
,x="Probability Domain 'p'"
|
|
,y="Probability Density"
|
|
)
|
|
ggsave("./Images/DirectEffects/default_p_generic_intervention_base.png")
|
|
|
|
ggplot(counterfact_predicted_ib, aes(x=p_predicted_intervention)) +
|
|
geom_density() +
|
|
labs(
|
|
title="Predicted Distribution of 'p'"
|
|
,subtitle="Intervention: Add a single generic competitor"
|
|
,x="Probability Domain 'p'"
|
|
,y="Probability Density"
|
|
)
|
|
ggsave("./Images/DirectEffects/default_p_generic_intervention_interv.png")
|
|
|
|
ggplot(counterfact_predicted_ib, aes(x=predicted_difference)) +
|
|
geom_density() +
|
|
labs(
|
|
title="Predicted Distribution of differences 'p'"
|
|
,subtitle="Intervention: Add a single generic competitor"
|
|
,x="Difference in 'p' under treatment"
|
|
,y="Probability Density"
|
|
)
|
|
ggsave("./Images/DirectEffects/default_p_generic_intervention_distdiff.png")
|
|
```
|
|
|
|
|
|
```{r}
|
|
|
|
|
|
pddf_ib <- data.frame(extract(generated_ib, pars="predicted_difference")$predicted_difference) |>
|
|
pivot_longer(X1:X1343)
|
|
|
|
#TODO: Fix Category names
|
|
pddf_ib["entry_idx"] <- as.numeric(gsub("\\D","",pddf_ib$name))
|
|
pddf_ib["category"] <- sapply(pddf_ib$entry_idx, function(i) df$category_id[i])
|
|
pddf_ib["category_name"] <- sapply(pddf_ib$category, function(i) beta_list$groups[i])
|
|
|
|
|
|
ggplot(pddf_ib, aes(x=value,)) +
|
|
geom_density(bins=100) +
|
|
labs(
|
|
title = "Distribution of predicted differences"
|
|
,subtitle = "Intervention: add a single generic competitor"
|
|
,x = "Difference in probability due to intervention"
|
|
,y = "Probability Density"
|
|
) +
|
|
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed")
|
|
ggsave("./Images/DirectEffects/p_generic_intervention_distdiff_styled.png")
|
|
|
|
ggplot(pddf_ib, aes(x=value,)) +
|
|
geom_density(bins=100) +
|
|
facet_wrap(
|
|
~factor(
|
|
category_name,
|
|
levels=beta_list$groups
|
|
)
|
|
, labeller = label_wrap_gen(multi_line = TRUE)
|
|
, ncol=5) +
|
|
labs(
|
|
title = "Distribution of predicted differences | By Group"
|
|
,subtitle = "Intervention: add a single generic competitor"
|
|
,x = "Difference in probability due to intervention"
|
|
,y = "Probability Density"
|
|
) +
|
|
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
|
|
theme(strip.text.x = element_text(size = 8))
|
|
ggsave("./Images/DirectEffects/p_generic_intervention_distdiff_by_group.png")
|
|
|
|
ggplot(pddf_ib, aes(x=value,)) +
|
|
geom_histogram(bins=100) +
|
|
facet_wrap(
|
|
~factor(
|
|
category_name,
|
|
levels=beta_list$groups
|
|
)
|
|
, labeller = label_wrap_gen(multi_line = TRUE)
|
|
, ncol=5) +
|
|
labs(
|
|
title = "Histogram of predicted differences | By Group"
|
|
,subtitle = "Intervention: add a single generic competitor"
|
|
,x = "Difference in probability due to intervention"
|
|
,y = "Predicted counts"
|
|
) +
|
|
#xlim(-0.25,0.1) +
|
|
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
|
|
theme(strip.text.x = element_text(size = 8))
|
|
ggsave("./Images/DirectEffects/p_generic_intervention_histdiff_by_group.png")
|
|
```
|
|
|
|
Get the probability of increase over probability of a decrease
|
|
|
|
```{r}
|
|
mean(counterfact_predicted_ib$predicted_difference)
|
|
```
|
|
Thus adding a single generic competitor increases the probability of termination by 16.55% on average for
|
|
the snapshots investigated.
|
|
|
|
|
|
|
|
```{r}
|
|
n = length(counterfact_predicted_ib$p_predicted_intervention)
|
|
mean(rbinom(n,1,as.vector(counterfact_predicted_ib$p_predicted_intervention)))
|
|
mean(rbinom(n,1,as.vector(counterfact_predicted_ib$p_predicted_default)))
|
|
```
|
|
|
|
|
|
#### USP DC Alternative
|
|
|
|
```{r}
|
|
#formulary intervention
|
|
brand_intervention_bnc <- x[c(inherited_cols,"identical_brands")]
|
|
brand_intervention_bnc["brand_name_counts"] <- asinh(sinh(x$brand_name_counts)+1) #add a single formulary competitor brand
|
|
```
|
|
|
|
```{r}
|
|
counterfact_marketing_bnc <- list(
|
|
D = ncol(x),#
|
|
N = nrow(x),
|
|
L = n_categories$count,
|
|
y = as.vector(y),
|
|
ll = as.vector(categories),
|
|
x = as.matrix(x),
|
|
mu_mean = 0,
|
|
mu_stdev = 0.05,
|
|
sigma_shape = 4,
|
|
sigma_rate = 20,
|
|
Nx = nrow(x),
|
|
llx = as.vector(categories),
|
|
counterfact_x_tilde = as.matrix(brand_intervention_bnc),
|
|
counterfact_x = as.matrix(x)
|
|
)
|
|
```
|
|
|
|
|
|
```{r}
|
|
generated_bnc <- gqs(
|
|
fit@stanmodel,
|
|
data=counterfact_marketing_bnc,
|
|
draws=as.matrix(fit),
|
|
seed=11021585
|
|
)
|
|
```
|
|
|
|
|
|
```{r}
|
|
counterfact_predicted_bnc <- data.frame(
|
|
p_predicted_default = as.vector(extract(generated_bnc, pars="p_predicted_default")$p_predicted_default)
|
|
,p_predicted_intervention = as.vector(extract(generated_bnc, pars="p_predicted_intervention")$p_predicted_intervention)
|
|
,predicted_difference = as.vector(extract(generated_bnc, pars="predicted_difference")$predicted_difference)
|
|
)
|
|
|
|
|
|
ggplot(counterfact_predicted_bnc, aes(x=p_predicted_default)) +
|
|
geom_density() +
|
|
labs(
|
|
title="Predicted Distribution of 'p'"
|
|
,subtitle="Intervention: None"
|
|
,x="Probability Domain 'p'"
|
|
,y="Probability Density"
|
|
)
|
|
ggsave("./Images/DirectEffects/default_p_uspdc_intervention_base.png")
|
|
|
|
ggplot(counterfact_predicted_bnc, aes(x=p_predicted_intervention)) +
|
|
geom_density() +
|
|
labs(
|
|
title="Predicted Distribution of 'p'"
|
|
,subtitle="Intervention: Add a single USP DC competitor"
|
|
,x="Probability Domain 'p'"
|
|
,y="Probability Density"
|
|
)
|
|
ggsave("./Images/DirectEffects/default_p_uspdc_intervention_interv.png")
|
|
|
|
ggplot(counterfact_predicted_bnc, aes(x=predicted_difference)) +
|
|
geom_density() +
|
|
labs(
|
|
title="Predicted Distribution of differences 'p'"
|
|
,subtitle="Intervention: Add a single USP DC competitor"
|
|
,x="Difference in 'p' under treatment"
|
|
,y="Probability Density"
|
|
)
|
|
ggsave("./Images/DirectEffects/default_p_uspdc_intervention_distdiff.png")
|
|
```
|
|
|
|
|
|
```{r}
|
|
pddf_bnc <- data.frame(extract(generated_bnc, pars="predicted_difference")$predicted_difference) |>
|
|
pivot_longer(X1:X1343)
|
|
|
|
#Add Category names
|
|
pddf_bnc["entry_idx"] <- as.numeric(gsub("\\D","",pddf_bnc$name))
|
|
pddf_bnc["category"] <- sapply(pddf_bnc$entry_idx, function(i) df$category_id[i])
|
|
pddf_bnc["category_name"] <- sapply(pddf_bnc$category, function(i) beta_list$groups[i])
|
|
|
|
|
|
|
|
ggplot(pddf_bnc, aes(x=value,)) +
|
|
geom_density(bins=100) +
|
|
labs(
|
|
title = "Distribution of predicted differences"
|
|
,subtitle = "Intervention: add a single USP DC competitor"
|
|
,x = "Difference in probability due to intervention"
|
|
,y = "Probability Density"
|
|
) +
|
|
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed")
|
|
ggsave("./Images/DirectEffects/p_uspdc_intervention_distdiff_styled.png")
|
|
|
|
ggplot(pddf_bnc, aes(x=value,)) +
|
|
geom_density(bins=100) +
|
|
facet_wrap(
|
|
~factor(
|
|
category_name,
|
|
levels=beta_list$groups
|
|
)
|
|
, labeller = label_wrap_gen(multi_line = TRUE)
|
|
, ncol=5) +
|
|
labs(
|
|
title = "Distribution of predicted differences in 'p' | By Group"
|
|
,subtitle = "Intervention: add a single USP DC competitor"
|
|
,x = "Difference in probability due to intervention"
|
|
,y = "Probability Density"
|
|
) +
|
|
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
|
|
theme(strip.text.x = element_text(size = 8))
|
|
ggsave("./Images/DirectEffects/p_uspdc_intervention_distdiff_by_group.png")
|
|
|
|
ggplot(pddf_bnc, aes(x=value,)) +
|
|
geom_histogram(bins=100) +
|
|
facet_wrap(
|
|
~factor(
|
|
category_name,
|
|
levels=beta_list$groups
|
|
)
|
|
, labeller = label_wrap_gen(multi_line = TRUE)
|
|
, ncol=5) +
|
|
labs(
|
|
title = "Histogram of predicted differences in 'p' | By Group"
|
|
,subtitle = "Intervention: add a single USP DC competitor"
|
|
,x = "Difference in probability due to intervention"
|
|
,y = "Predicted counts"
|
|
) +
|
|
#xlim(-0.25,0.1) +
|
|
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
|
|
theme(strip.text.x = element_text(size = 8))
|
|
ggsave("./Images/DirectEffects/p_uspdc_intervention_histdiff_by_group.png")
|
|
```
|
|
|
|
|
|
TODO: add density plot of (x,y,z) (date,value,counts)
|
|
- with and without faceting
|
|
|
|
|
|
```{r}
|
|
mean(counterfact_predicted_bnc$predicted_difference)
|
|
```
|
|
Addin a single USP DC competitor increases/reduces the probability of completion by 16.47% on average
|
|
for the snapshots of trials that we have.
|
|
|
|
|
|
|
|
|
|
### Intervention: Marginal increase in time to finish enrollment
|
|
|
|
```{r}
|
|
#| eval: false
|
|
|
|
pddf <- data.frame(extract(generated, pars="predicted_difference")$predicted_difference) |> pivot_longer(X1:X189)
|
|
pddf["entry_idx"] <- as.numeric(gsub("\\D","",pddf$name))
|
|
|
|
pddf["category"] <- sapply(pddf$entry_idx, function(i) counterfact_categories[i])
|
|
pddf["category_name"] <- sapply(pddf$category, function(i) beta_list$groups[i])
|
|
|
|
ggplot(pddf, aes(x=value,)) +
|
|
geom_histogram(bins=100) +
|
|
labs(
|
|
title = "Distribution of predicted differences"
|
|
,x = "Difference in probability due to intervention"
|
|
,y = "Predicted counts"
|
|
) +
|
|
xlim(-0.3,0.1) +
|
|
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed")
|
|
|
|
ggplot(pddf, aes(x=value,)) +
|
|
geom_histogram(bins=100) +
|
|
facet_wrap(
|
|
~factor(
|
|
category_name,
|
|
levels=beta_list$groups
|
|
)
|
|
, labeller = label_wrap_gen(multi_line = TRUE)
|
|
, ncol=5) +
|
|
labs(
|
|
title = "Distribution of predicted differences",
|
|
subtitle = "By group"
|
|
,x = "Difference in probability due to intervention"
|
|
,y = "Predicted counts"
|
|
) +
|
|
xlim(-0.25,0.1) +
|
|
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
|
|
theme(strip.text.x = element_text(size = 8))
|
|
|
|
```
|
|
|
|
|
|
Recall that we had really tight zero priors.
|
|
|
|
|
|
|
|
# Diagnostics
|
|
|
|
```{r}
|
|
#| eval: false
|
|
#trace plots
|
|
plot(fit, pars=c("mu"), plotfun="trace")
|
|
|
|
|
|
for (i in 1:4) {
|
|
print(
|
|
mcmc_rank_overlay(
|
|
fit,
|
|
pars=c(
|
|
paste0("mu[",4*i-3,"]"),
|
|
paste0("mu[",4*i-2,"]"),
|
|
paste0("mu[",4*i-1,"]"),
|
|
paste0("mu[",4*i,"]")
|
|
),
|
|
n_bins=100
|
|
)+ legend_move("top") +
|
|
scale_colour_ghibli_d("KikiMedium")
|
|
)
|
|
}
|
|
```
|
|
|
|
```{r}
|
|
#| eval: false
|
|
plot(fit, pars=c("sigma"), plotfun="trace")
|
|
|
|
for (i in 1:4) {
|
|
print(
|
|
mcmc_rank_overlay(
|
|
fit,
|
|
pars=c(
|
|
paste0("sigma[",4*i-3,"]"),
|
|
paste0("sigma[",4*i-2,"]"),
|
|
paste0("sigma[",4*i-1,"]"),
|
|
paste0("sigma[",4*i,"]")
|
|
),
|
|
n_bins=100
|
|
)+ legend_move("top") +
|
|
scale_colour_ghibli_d("KikiMedium")
|
|
)
|
|
}
|
|
```
|
|
|
|
```{r}
|
|
#| eval: false
|
|
#other diagnostics
|
|
logpost <- log_posterior(fit)
|
|
nuts_prmts <- nuts_params(fit)
|
|
posterior <- as.array(fit)
|
|
|
|
```
|
|
|
|
```{r}
|
|
#| eval: false
|
|
color_scheme_set("darkgray")
|
|
div_style <- parcoord_style_np(div_color = "green", div_size = 0.05, div_alpha = 0.4)
|
|
mcmc_parcoord(posterior, regex_pars = "mu", np=nuts_prmts, np_style = div_style, alpha = 0.05)
|
|
```
|
|
|
|
```{r}
|
|
#| eval: false
|
|
for (i in 1:4) {
|
|
mus = sapply(3:0, function(j) paste0("mu[",4*i-j ,"]"))
|
|
print(
|
|
mcmc_pairs(
|
|
posterior,
|
|
np = nuts_prmts,
|
|
pars=c(
|
|
mus,
|
|
"lp__"
|
|
),
|
|
off_diag_args = list(size = 0.75)
|
|
)
|
|
)
|
|
}
|
|
|
|
|
|
|
|
```
|
|
|
|
```{r}
|
|
#| eval: false
|
|
mcmc_parcoord(posterior,regex_pars = "sigma", np=nuts_prmts, alpha=0.05)
|
|
```
|
|
|
|
```{r}
|
|
#| eval: false
|
|
|
|
for (i in 1:4) {
|
|
params = sapply(3:0, function(j) paste0("sigma[",4*i-j ,"]"))
|
|
print(
|
|
mcmc_pairs(
|
|
posterior,
|
|
np = nuts_prmts,
|
|
pars=c(
|
|
params,
|
|
"lp__"
|
|
),
|
|
off_diag_args = list(size = 0.75)
|
|
)
|
|
)
|
|
}
|
|
```
|
|
|
|
|
|
```{r}
|
|
#| eval: false
|
|
for (k in 1:22) {
|
|
for (i in 1:4) {
|
|
params = sapply(3:0, function(j) paste0("beta[",k,",",4*i-j ,"]"))
|
|
print(
|
|
mcmc_pairs(
|
|
posterior,
|
|
np = nuts_prmts,
|
|
pars=c(
|
|
params,
|
|
"lp__"
|
|
),
|
|
off_diag_args = list(size = 0.75)
|
|
)
|
|
)
|
|
}}
|
|
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# TODO
|
|
- [ ] Double check data flow. (Write summary of this in human readable form)
|
|
- Is it the data we want from the database
|
|
- Training
|
|
- Counterfactual Evaluation
|
|
- choose a single snapshot per trial.
|
|
- Is the model in STAN well specified.
|
|
- [ ] work on LOO validation of model
|