You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ClinicalTrialsEstimation/r-analysis/EffectsOfMarketConditions_n...

851 lines
21 KiB
Plaintext

---
title: "The Effects of market conditions on recruitment and completion of clinical trials"
author: "Will King"
format: html
editor: source
---
# Setup
```{r}
library(bayesplot)
available_mcmc(pattern = "_nuts_")
library(ggplot2)
library(patchwork)
library(tidyverse)
library(rstan)
library(tidyr)
library(ghibli)
#Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
#save unchanged models instead of recompiling
rstan_options(auto_write = TRUE)
#allow for multithreaded sampling
options(mc.cores = parallel::detectCores())
#test installation, shouldn't get any errors
#example(stan_model, package = "rstan", run.dontrun = TRUE)
```
```{r}
################ Pull data from database ######################
library(RPostgreSQL)
driver <- dbDriver("PostgreSQL")
get_data <- function(driver) {
con <- dbConnect(
driver,
user='root',
password='root',
dbname='aact_db',
host='will-office'
)
on.exit(dbDisconnect(con))
query <- dbSendQuery(
con,
# "select * from formatted_data_with_planned_enrollment;"
"
select
fdqpe.nct_id
--,fdqpe.start_date
--,fdqpe.current_enrollment
--,fdqpe.enrollment_category
,fdqpe.current_status
,fdqpe.earliest_date_observed
,fdqpe.elapsed_duration
,fdqpe.n_brands as identical_brands
,ntbtu.brand_name_count
,fdqpe.category_id
,fdqpe.final_status
,fdqpe.h_sdi_val
--,fdqpe.h_sdi_u95
--,fdqpe.h_sdi_l95
,fdqpe.hm_sdi_val
--,fdqpe.hm_sdi_u95
--,fdqpe.hm_sdi_l95
,fdqpe.m_sdi_val
--,fdqpe.m_sdi_u95
--,fdqpe.m_sdi_l95
,fdqpe.lm_sdi_val
--,fdqpe.lm_sdi_u95
--,fdqpe.lm_sdi_l95
,fdqpe.l_sdi_val
--,fdqpe.l_sdi_u95
--,fdqpe.l_sdi_l95
from formatted_data_with_planned_enrollment fdqpe
join \"Formularies\".nct_to_brands_through_uspdc ntbtu
on fdqpe.nct_id = ntbtu.nct_id
order by fdqpe.nct_id, fdqpe.earliest_date_observed
;
"
)
df <- fetch(query, n = -1)
df <- na.omit(df)
query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;")
n_categories <- fetch(query2, n = -1)
return(list(data=df,ncat=n_categories))
}
d <- get_data(driver)
df <- d$data
n_categories <- d$ncat
################ Format Data ###########################
data_formatter <- function(df) {
x <- df["category_id"]
x["identical_brands"] <- asinh(df$identical_brands)
x["brand_name_counts"] <- asinh(df$brand_name_count)
x["h_sdi_val"] <- asinh(df$h_sdi_val)
x["hm_sdi_val"] <- asinh(df$hm_sdi_val)
x["m_sdi_val"] <- asinh(df$m_sdi_val)
x["lm_sdi_val"] <- asinh(df$lm_sdi_val)
x["l_sdi_val"] <- asinh(df$l_sdi_val)
y <- ifelse(df["final_status"]=="Terminated",1,0)
return(list(x=x,y=y))
}
train <- data_formatter(df)
categories <- df$category_id
x <- train$x
y <- train$y
x$category_id <- NULL
```
### Intervention: Adding a single competitor
```{r}
inherited_cols <- c(
"identical_brands"
,"brand_name_counts"
,"h_sdi_val"
,"hm_sdi_val"
,"m_sdi_val"
,"lm_sdi_val"
,"l_sdi_val"
)
```
#### Generics
```{r}
#generics intervention
brand_intervention_ib <- x[inherited_cols]
brand_intervention_ib["identical_brands"] <- asinh(sinh(x$identical_brands)+1) #add a single generic brand
```
```{r}
counterfact_marketing_ib <- list(
D = ncol(x),#
N = nrow(x),
L = n_categories$count,
y = as.vector(y),
ll = as.vector(categories),
x = as.matrix(x),
mu_mean = 0,
mu_stdev = 0.05,
sigma_shape = 4,
sigma_rate = 20,
Nx = nrow(x),
llx = as.vector(categories),
counterfact_x_tilde = as.matrix(brand_intervention_ib),
counterfact_x = as.matrix(x)
)
```
### USP DC
```{r}
#formulary intervention
brand_intervention_bnc <- x[inherited_cols]
brand_intervention_bnc["brand_name_counts"] <- asinh(sinh(x$brand_name_counts)+1) #add a single formulary competitor brand
```
```{r}
counterfact_marketing_bnc <- list(
D = ncol(x),#
N = nrow(x),
L = n_categories$count,
y = as.vector(y),
ll = as.vector(categories),
x = as.matrix(x),
mu_mean = 0,
mu_stdev = 0.05,
sigma_shape = 4,
sigma_rate = 20,
Nx = nrow(x),
llx = as.vector(categories),
counterfact_x_tilde = as.matrix(brand_intervention_bnc),
counterfact_x = as.matrix(x)
)
```
# Fit Model
```{r}
################################# FIT MODEL #########################################
fit <- stan(
file='Hierarchal_Logistic.stan',
data = counterfact_marketing_ib,
chains = 4,
iter = 5000,
seed = 11021585
)
```
```{r}
generated_bi <- gqs(
fit@stanmodel,
data=counterfact_marketing_ib,
draws=as.matrix(fit),
seed=11021585
)
```
## Priors
```{r}
#| eval: false
hist(as.vector(extract(generated_bi, pars="p_prior")$p_prior))
hist(as.vector(extract(generated_bi, pars="mu_prior")$mu_prior), )
hist(as.vector(extract(generated_bi, pars="sigma_prior")$sigma_prior))
```
```{r}
df_ib_p <- data.frame(
p_prior=as.vector(extract(generated_bi, pars="p_prior")$p_prior)
,p_predicted = as.vector(extract(generated_bi, pars="p_predicted")$p_predicted)
)
df_ib_prior <- data.frame(
mu_prior = as.vector(extract(generated_bi, pars="mu_prior")$mu_prior)
,sigma_prior = as.vector(extract(generated_bi, pars="sigma_prior")$sigma_prior)
)
#p_prior
ggplot(df_ib_p, aes(x=p_prior)) +
geom_density() +
labs(
title="Implied Prior Distribution P"
,subtitle=""
,x="Probability Domain 'p'"
,y="Probability Density"
)
ggsave("./Images/TotalEffects/prior_p.png")
#p_posterior
ggplot(df_ib_p, aes(x=p_predicted)) +
geom_density() +
labs(
title="Implied Posterior Distribution P"
,subtitle=""
,x="Probability Domain 'p'"
,y="Probability Density"
)
ggsave("./Images/TotalEffects/posterior_p.png")
#mu_prior
ggplot(df_ib_prior) +
geom_density(aes(x=mu_prior)) +
labs(
title="Prior - Mu"
,subtitle="same prior for all Mu values"
,x="Mu"
,y="Probability"
)
ggsave("./Images/TotalEffects/prior_mu.png")
#sigma_posterior
ggplot(df_ib_prior) +
geom_density(aes(x=sigma_prior)) +
labs(
title="Prior - Sigma"
,subtitle="same prior for all Sigma values"
,x="Sigma"
,y="Probability"
)
ggsave("./Images/TotalEffects/prior_sigma.png")
```
```{r}
check_hmc_diagnostics(fit)
#hist(as.vector(extract(generated_bi, pars="p_predicted")$p_predicted))
```
# Diagnostics
```{r}
#| eval: false
#trace plots
plot(fit, pars=c("mu"), plotfun="trace")
mcmc_rank_overlay(
fit,
pars=sapply(1:7, function(i) paste0("mu[",i,"]"))
,n_bins=100
)+ legend_move("top") +
scale_colour_ghibli_d("KikiMedium")
```
```{r}
#| eval: false
plot(fit, pars=c("sigma"), plotfun="trace")
mcmc_rank_overlay(
fit,
pars=sapply(1:7, function(i) paste0("sigma[",i,"]"))
,n_bins=100
)+ legend_move("top") +
scale_colour_ghibli_d("KikiMedium")
```
```{r}
#| eval: false
#other diagnostics
logpost <- log_posterior(fit)
nuts_prmts <- nuts_params(fit)
posterior <- as.array(fit)
```
```{r}
#| eval: false
color_scheme_set("darkgray")
div_style <- parcoord_style_np(div_color = "green", div_size = 0.05, div_alpha = 0.4)
mcmc_parcoord(posterior, regex_pars = "mu", np=nuts_prmts, np_style = div_style, alpha = 0.05)
```
```{r}
#| eval: false
mus = sapply(1:7, function(j) paste0("mu[",j ,"]"))
mcmc_pairs(
posterior,
np = nuts_prmts,
pars=c(
mus,
"lp__"
),
off_diag_args = list(size = 0.75)
)
```
```{r}
#| eval: false
mcmc_parcoord(posterior,regex_pars = "sigma", np=nuts_prmts, alpha=0.05)
```
```{r}
#| eval: false
sigmas = sapply(1:7, function(j) paste0("sigma[",j ,"]"))
mcmc_pairs(
posterior,
np = nuts_prmts,
pars=c(
sigmas,
"lp__"
),
off_diag_args = list(size = 0.75)
)
```
```{r}
#| eval: false
#for (k in 1:22) {
# params = sapply(1:7, function(j) paste0("beta[",k,",",j ,"]"))
# print(
# mcmc_pairs(
# posterior,
# np = nuts_prmts,
# pars=c(
# params,
# "lp__"
# ),
# off_diag_args = list(size = 0.75)
# )
# )
#}
```
# Results
```{r}
################################# ANALYZE #####################################
print(fit)
```
## Result Plots
Note the regular large difference in variance.
I would guess those are the beta[1:22,2] values.
I wonder if a lot of the variance is due to the 2 values that are sitting out.
```{r}
beta_list <- list(
groups = c(
`1`="Infections & Parasites",
`2`="Neoplasms",
`3`="Blood & Immune system",
`4`="Endocrine, Nutritional, and Metabolic",
`5`="Mental & Behavioral",
`6`="Nervous System",
`7`="Eye and Adnexa",
`8`="Ear and Mastoid",
`9`="Circulatory",
`10`="Respiratory",
`11`="Digestive",
`12`="Skin & Subcutaneaous tissue",
`13`="Musculoskeletal",
`14`="Genitourinary",
`15`="Pregancy, Childbirth, & Puerperium",
`16`="Perinatal Period",
`17`="Congential",
`18`="Symptoms, Signs etc.",
`19`="Injury etc.",
`20`="External Causes",
`21`="Contact with Healthcare",
`22`="Special Purposes"
),
parameters = c(
# brands
`1`="asinh(Generic Brands)",
`2`="asinh(Competitors USPDC)",
# population
`3`="asinh(High SDI)",
`4`="asinh(High-Medium SDI)",
`5`="asinh(Medium SDI)",
`6`="asinh(Low-Medium SDI)",
`7`="asinh(Low SDI)"
)
)
get_parameters <- function(stem,class_list) {
#get categories and lengths
named <- names(class_list)
lengths <- sapply(named, (function (x) length(class_list[[x]])))
#describe the grid needed
iter_list <- sapply(named, (function (x) 1:lengths[x]))
#generate the list of parameters
pardf <- generate_parameter_df(stem, iter_list)
#add columns with appropriate human-readable names
for (name in named) {
pardf[paste(name,"_hr",sep="")] <- as.factor(
sapply(pardf[name], (function (i) class_list[[name]][i]))
)
}
return(pardf)
}
generate_parameter_df <- function(stem, iter_list) {
grid <- expand.grid(iter_list)
grid["param_name"] <- grid %>% unite(x,colnames(grid),sep=",")
grid["param_name"] <- paste(stem,"[",grid$param_name,"]",sep="")
return(grid)
}
group_mcmc_areas <- function(
stem,# = "beta"
class_list,# = beta_list
stanfit,# = fit
group_id,# = 2
rename=TRUE
) {
#get all parameter names
params <- get_parameters(stem,class_list)
#filter down to parameters of interest
params <- filter(params,groups == group_id)
#Get dataframe with only the rows of interest
filtdata <- as.data.frame(stanfit)[params$param_name]
#rename columns
if (rename) dimnames(filtdata)[[2]] <- params$parameters_hr
#get group name for title
group_name <- class_list$groups[group_id]
#create area plot with appropriate title
mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
ggtitle(paste("Parameter distributions for ICD-10 class:",group_name))
}
parameter_mcmc_areas <- function(
stem,# = "beta"
class_list,# = beta_list
stanfit,# = fit
parameter_id,# = 2
rename=TRUE
) {
#get all parameter names
params <- get_parameters(stem,class_list)
#filter down to parameters of interest
params <- filter(params,parameters == parameter_id)
#Get dataframe with only the rows of interest
filtdata <- as.data.frame(stanfit)[params$param_name]
#rename columns
if (rename) dimnames(filtdata)[[2]] <- params$groups_hr
#get group name for title
parameter_name <- class_list$parameters[parameter_id]
#create area plot with appropriate title
mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
ggtitle(parameter_name,"Parameter Distribution")
}
```
```{r}
#mcmc_intervals(fit, pars=get_parameters("beta",beta_list)$param_name)
```
### Investigating parameter distributions
```{r}
#g1 <- group_mcmc_areas("beta",beta_list,fit,2)
#g2 <- group_mcmc_areas("beta",beta_list,fit,2)
#g3 <- group_mcmc_areas("beta",beta_list,fit,2)
#g4 <- group_mcmc_areas("beta",beta_list,fit,2)
#g5 <- group_mcmc_areas("beta",beta_list,fit,2)
#g6 <- group_mcmc_areas("beta",beta_list,fit,2)
#g7 <- group_mcmc_areas("beta",beta_list,fit,2)
#g8 <- group_mcmc_areas("beta",beta_list,fit,2)
#g9 <- group_mcmc_areas("beta",beta_list,fit,2)
#g10 <- group_mcmc_areas("beta",beta_list,fit,2)
#g11 <- group_mcmc_areas("beta",beta_list,fit,2)
#g12 <- group_mcmc_areas("beta",beta_list,fit,2)
#g13 <- group_mcmc_areas("beta",beta_list,fit,2)
#g14 <- group_mcmc_areas("beta",beta_list,fit,2)
#g15 <- group_mcmc_areas("beta",beta_list,fit,2)
#g16 <- group_mcmc_areas("beta",beta_list,fit,2)
#g17 <- group_mcmc_areas("beta",beta_list,fit,2)
#g18 <- group_mcmc_areas("beta",beta_list,fit,2)
#g19 <- group_mcmc_areas("beta",beta_list,fit,2)
#g20 <- group_mcmc_areas("beta",beta_list,fit,2)
#g21 <- group_mcmc_areas("beta",beta_list,fit,2)
#g22 <- group_mcmc_areas("beta",beta_list,fit,2)
p1 <- parameter_mcmc_areas("beta",beta_list,fit,1)
ggsave("./Images/TotalEffects/Parameters/01_generics.png")
p2 <- parameter_mcmc_areas("beta",beta_list,fit,2)
ggsave("./Images/TotalEffects/Parameters/02_uspdc.png")
#p3 <- parameter_mcmc_areas("beta",beta_list,fit,3)
#p4 <- parameter_mcmc_areas("beta",beta_list,fit,4)
#p5 <- parameter_mcmc_areas("beta",beta_list,fit,5)
#p6 <- parameter_mcmc_areas("beta",beta_list,fit,6)
#p7 <- parameter_mcmc_areas("beta",beta_list,fit,7)
```
Note these have 95% outer CI and 80% inner (shaded)
1) "asinh(Generic Brands)",
2) "asinh(Competitors USPDC)",
3) "asinh(High SDI)",
4) "asinh(High-Medium SDI)",
5) "asinh(Medium SDI)",
6) "asinh(Low-Medium SDI)",
7) "asinh(Low SDI)",
of interest
- p1 + p2
```{r}
p1 + p2
ggsave("./Images/TotalEffects/Parameters/1&2_generics_and_uspdc.png")
```
# Posterior Prediction
## Distribution of Predicted Differences
### Intervention: Adding a single competitor
#### Generics
```{r}
#| eval: false
#TODO: Convert to ggplot, stabilize y axis
hist(as.vector(extract(generated_bi, pars="p_predicted_default")$p_predicted_default))
hist(as.vector(extract(generated_bi, pars="p_predicted_intervention")$p_predicted_intervention))
hist(as.vector(extract(generated_bi, pars="predicted_difference")$predicted_difference))
```
```{r}
counterfact_predicted_ib <- data.frame(
p_predicted_default = as.vector(extract(generated_bi, pars="p_predicted_default")$p_predicted_default)
,p_predicted_intervention = as.vector(extract(generated_bi, pars="p_predicted_intervention")$p_predicted_intervention)
,predicted_difference = as.vector(extract(generated_bi, pars="predicted_difference")$predicted_difference)
)
```
```{r}
ggplot(counterfact_predicted_ib, aes(x=p_predicted_default)) +
geom_density() +
labs(
title="Predicted Distribution of 'p'"
,subtitle="Intervention: None"
,x="Probability Domain 'p'"
,y="Probability Density"
)
ggsave("./Images/TotalEffects/default_p_generic_intervention_base.png")
ggplot(counterfact_predicted_ib, aes(x=p_predicted_intervention)) +
geom_density() +
labs(
title="Predicted Distribution of 'p'"
,subtitle="Intervention: Add a single generic competitor"
,x="Probability Domain 'p'"
,y="Probability Density"
)
ggsave("./Images/TotalEffects/default_p_generic_intervention_interv.png")
ggplot(counterfact_predicted_ib, aes(x=predicted_difference)) +
geom_density() +
labs(
title="Predicted Distribution of differences 'p'"
,subtitle="Intervention: Add a single generic competitor"
,x="Difference in 'p' under treatment"
,y="Probability Density"
)
ggsave("./Images/TotalEffects/default_p_generic_intervention_distdiff.png")
```
```{r}
pddf_ib <- data.frame(extract(generated_bi, pars="predicted_difference")$predicted_difference) |>
pivot_longer(X1:X1343)
#TODO: Fix Category names
pddf_ib["entry_idx"] <- as.numeric(gsub("\\D","",pddf_ib$name))
pddf_ib["category"] <- sapply(pddf_ib$entry_idx, function(i) df$category_id[i])
pddf_ib["category_name"] <- sapply(pddf_ib$category, function(i) beta_list$groups[i])
```
```{r}
ggplot(pddf_ib, aes(x=value,)) +
geom_density() +
labs(
title = "Distribution of predicted differences"
,subtitle = "Intervention: add a single generic competitor"
,x = "Difference in probability due to intervention"
,y = "Probability Density"
) +
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed")
ggsave("./Images/TotalEffects/p_generic_intervention_distdiff_styled.png")
ggplot(pddf_ib, aes(x=value,)) +
geom_density() +
facet_wrap(
~factor(
category_name,
levels=beta_list$groups
)
, labeller = label_wrap_gen(multi_line = TRUE)
, ncol=5
,scales="free"
) +
xlim(-1,1)+
labs(
title = "Distribution of predicted differences | By Group"
,subtitle = "Intervention: add a single generic competitor"
,x = "Difference in probability due to intervention"
,y = "Probability Density"
) +
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
theme(strip.text.x = element_text(size = 8))
ggsave("./Images/TotalEffects/p_generic_intervention_distdiff_by_group.png")
ggplot(pddf_ib, aes(x=value,)) +
geom_histogram(bins=100) +
facet_wrap(
~factor(
category_name,
levels=beta_list$groups
)
, labeller = label_wrap_gen(multi_line = TRUE)
, ncol=5) +
labs(
title = "Histogram of predicted differences | By Group"
,subtitle = "Intervention: add a single generic competitor"
,x = "Difference in probability due to intervention"
,y = "Predicted counts"
) +
#xlim(-0.25,0.1) +
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
theme(strip.text.x = element_text(size = 8))
ggsave("./Images/TotalEffects/p_generic_intervention_histdiff_by_group.png")
```
#### USP DC
```{r}
generated_bnc <- gqs(
fit@stanmodel,
data=counterfact_marketing_bnc,
draws=as.matrix(fit),
seed=11021585
)
```
```{r}
#| eval: false
#TODO: Convert to ggplot, stabilize y axis
hist(as.vector(extract(generated_bnc, pars="p_predicted_default")$p_predicted_default), bins=100)
hist(as.vector(extract(generated_bnc, pars="p_predicted_intervention")$p_predicted_intervention), bins=100)
hist(as.vector(extract(generated_bnc, pars="predicted_difference")$predicted_difference), bins=100)
```
```{r}
counterfact_predicted_bnc <- data.frame(
p_predicted_default = as.vector(extract(generated_bnc, pars="p_predicted_default")$p_predicted_default)
,p_predicted_intervention = as.vector(extract(generated_bnc, pars="p_predicted_intervention")$p_predicted_intervention)
,predicted_difference = as.vector(extract(generated_bnc, pars="predicted_difference")$predicted_difference)
)
```
```{r}
pddf_bnc <- data.frame(extract(generated_bnc, pars="predicted_difference")$predicted_difference) |>
pivot_longer(X1:X1343)
#TODO: Fix Category names
pddf_bnc["entry_idx"] <- as.numeric(gsub("\\D","",pddf_bnc$name))
pddf_bnc["category"] <- sapply(pddf_bnc$entry_idx, function(i) df$category_id[i])
pddf_bnc["category_name"] <- sapply(pddf_bnc$category, function(i) beta_list$groups[i])
```
```{r}
ggplot(pddf_bnc, aes(x=value,)) +
geom_density() +
labs(
title = "Distribution of predicted differences"
,subtitle = "Intervention: add a single USP DC competitor"
,x = "Difference in probability due to intervention"
,y = "Probability Density"
) +
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed")
ggsave("./Images/TotalEffects/p_uspdc_intervention_distdiff_styled.png")
ggplot(pddf_bnc, aes(x=value,)) +
geom_density() +
facet_wrap(
~factor(
category_name,
levels=beta_list$groups
)
, labeller = label_wrap_gen(multi_line = TRUE)
, ncol=5
,scales="free"
) +
labs(
title = "Distribution of predicted differences | By Group"
,subtitle = "Intervention: add a single USP DC competitor"
,x = "Difference in probability due to intervention"
,y = "Probability Density"
) +
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
theme(strip.text.x = element_text(size = 8))
ggsave("./Images/TotalEffects/p_uspdc_intervention_distdiff_by_group.png")
ggplot(pddf_bnc, aes(x=value,)) +
geom_histogram(bins=100) +
facet_wrap(
~factor(
category_name,
levels=beta_list$groups
)
, labeller = label_wrap_gen(multi_line = TRUE)
, ncol=5) +
labs(
title = "Histogram of predicted differences | By Group"
,subtitle = "Intervention: add a single USP DC competitor"
,x = "Difference in probability due to intervention"
,y = "Predicted counts"
) +
#xlim(-0.25,0.1) +
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
theme(strip.text.x = element_text(size = 8))
ggsave("./Images/TotalEffects/p_uspdc_intervention_histdiff_by_group.png")
```