Updated to keep current analysis

check_reversion
Will King 3 years ago
parent 96319edca3
commit 63229d3a99

@ -5,12 +5,18 @@ format: html
editor: source editor: source
--- ---
# Setup
```{r} ```{r}
library(bayesplot) library(bayesplot)
available_mcmc(pattern = "_nuts_") available_mcmc(pattern = "_nuts_")
library(ggplot2) library(ggplot2)
library(patchwork)
library(tidyverse)
library(rstan) library(rstan)
library(tidyr)
library(ghibli)
#Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started #Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
#save unchanged models instead of recompiling #save unchanged models instead of recompiling
@ -41,7 +47,41 @@ on.exit(dbDisconnect(con))
query <- dbSendQuery( query <- dbSendQuery(
con, con,
"select * from formatted_data_with_planned_enrollment;" # "select * from formatted_data_with_planned_enrollment;"
"
select
fdqpe.nct_id
--,fdqpe.start_date
--,fdqpe.current_enrollment
--,fdqpe.enrollment_category
,fdqpe.current_status
,fdqpe.earliest_date_observed
,fdqpe.elapsed_duration
,fdqpe.n_brands as identical_brands
,ntbtu.brand_name_count
,fdqpe.category_id
,fdqpe.final_status
,fdqpe.h_sdi_val
--,fdqpe.h_sdi_u95
--,fdqpe.h_sdi_l95
,fdqpe.hm_sdi_val
--,fdqpe.hm_sdi_u95
--,fdqpe.hm_sdi_l95
,fdqpe.m_sdi_val
--,fdqpe.m_sdi_u95
--,fdqpe.m_sdi_l95
,fdqpe.lm_sdi_val
--,fdqpe.lm_sdi_u95
--,fdqpe.lm_sdi_l95
,fdqpe.l_sdi_val
--,fdqpe.l_sdi_u95
--,fdqpe.l_sdi_l95
from formatted_data_with_planned_enrollment fdqpe
join \"Formularies\".nct_to_brands_through_uspdc ntbtu
on fdqpe.nct_id = ntbtu.nct_id
order by fdqpe.nct_id, fdqpe.earliest_date_observed
;
"
) )
df <- fetch(query, n = -1) df <- fetch(query, n = -1)
df <- na.omit(df) df <- na.omit(df)
@ -58,32 +98,152 @@ n_categories <- d$ncat
################ Format Data ###########################
################ Format Data ###########################
data_formatter <- function(df) {
categories <- df["category_id"] categories <- df["category_id"]
x <- df["elapsed_duration"] x <- df["elapsed_duration"]
x["log_n_brands"] <- asinh(df$n_brands) #todo - convert to arcsinh() transform x["identical_brands"] <- asinh(df$identical_brands)
x["brand_name_counts"] <- asinh(df$brand_name_count)
x["h_sdi_val"] <- asinh(df$h_sdi_val) x["h_sdi_val"] <- asinh(df$h_sdi_val)
x["hm_sdi_val"] <- asinh(df$hm_sdi_val) x["hm_sdi_val"] <- asinh(df$hm_sdi_val)
x["m_sdi_val"] <- asinh(df$m_sdi_val) x["m_sdi_val"] <- asinh(df$m_sdi_val)
x["lm_sdi_val"] <- asinh(df$lm_sdi_val) x["lm_sdi_val"] <- asinh(df$lm_sdi_val)
x["l_sdi_val"] <- asinh(df$l_sdi_val) x["l_sdi_val"] <- asinh(df$l_sdi_val)
#x["enrollment"] <- df["current_enrollment"] / df["planned_enrollment"]
#square terms
#x["enrollment^2"] <- x["enrollment"]^2 #Setup fixed effects
#x["elapsed_duration^2"] <- x["elapsed_duration"]^2 x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0)
#x["n_brands^2"] <- x["n_brands"]^2 x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0)
#break out #x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0) # Base case is Recruiting
#x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0) x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0)
#x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0)
#x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0)
#x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0) #interaction terms for competitors
x["ib*elapsed"] <- x["elapsed_duration"]*x["identical_brands"]
x["bnc*elapsed"] <- x["elapsed_duration"] * x["brand_name_counts"]
#interaction terms for status effects
x["sNYR*elapsed"] <- x["elapsed_duration"]*x["status_NYR"]
x["sEBI*elapsed"] <- x["elapsed_duration"]*x["status_EBI"]
x["sANR*elapsed"] <- x["elapsed_duration"]*x["status_ANR"]
y <- ifelse(df["final_status"]=="Terminated",1,0) y <- ifelse(df["final_status"]=="Terminated",1,0)
#get category list
return(list(x=x,y=y))
}
train <- data_formatter(df)
categories <- df$category_id
x <- train$x
y <- train$y
```
```{r}
# get data for counterfactuals
get_counterfactuals <- function(driver) {
con <- dbConnect(
driver,
user='root',
password='root',
dbname='aact_db',
host='will-office'
)
on.exit(dbDisconnect(con))
query <- dbSendQuery(
con,
"
with cte as (
select
nct_id
,lag(current_status, 1) over (partition by nct_id order by earliest_date_observed) as previous_status
,current_status
,earliest_date_observed as date_current
from formatted_data_mat fdm
), cte2 as (
select
nct_id
,previous_status
,current_status
,max(date_current) as date_current_max
from cte
where
previous_status != current_status
and
current_status = 'Active, not recruiting'
group by
nct_id
,previous_status
,current_status
,date_current
)
select
fdqpe.nct_id
--,fdqpe.start_date
--,fdqpe.current_enrollment
--,fdqpe.enrollment_category
,fdqpe.current_status
,fdqpe.earliest_date_observed
,fdqpe.elapsed_duration
,fdqpe.n_brands as identical_brands
,ntbtu.brand_name_count
,fdqpe.category_id
,fdqpe.final_status
,fdqpe.h_sdi_val
--,fdqpe.h_sdi_u95
--,fdqpe.h_sdi_l95
,fdqpe.hm_sdi_val
--,fdqpe.hm_sdi_u95
--,fdqpe.hm_sdi_l95
,fdqpe.m_sdi_val
--,fdqpe.m_sdi_u95
--,fdqpe.m_sdi_l95
,fdqpe.lm_sdi_val
--,fdqpe.lm_sdi_u95
--,fdqpe.lm_sdi_l95
,fdqpe.l_sdi_val
--,fdqpe.l_sdi_u95
--,fdqpe.l_sdi_l95
from formatted_data_with_planned_enrollment fdqpe
join \"Formularies\".nct_to_brands_through_uspdc ntbtu
on fdqpe.nct_id = ntbtu.nct_id
join cte2
on cte2.nct_id = fdqpe.nct_id
and cte2.date_current_max = fdqpe.earliest_date_observed
order by fdqpe.nct_id, fdqpe.earliest_date_observed
;
"
)
df <- fetch(query, n = -1)
df <- na.omit(df)
return(df)
}
counterfact_raw <- get_counterfactuals(driver)
#extract data
counterfact_list <- data_formatter(counterfact_raw)
counterfact_list$y <- NULL #remove the chance of accidentally training on the wrong data
counterfact_categories <- counterfact_raw$category_id
#setup the two counterfactuals
counterfact_x <- counterfact_list$x #no change
counterfact_x_tilde <- counterfact_list$x #to be changed
#make changes, set it to Recruiting #TODO: change this so it matches previous state.
counterfact_x_tilde$status_ANR <- 0
counterfact_x_tilde["sANR*elapsed"] <- 0
``` ```
## Explore data
```{r} ```{r}
################################# DATA EXPLORATION ############################ ################################# DATA EXPLORATION ############################
driver <- dbDriver("PostgreSQL") driver <- dbDriver("PostgreSQL")
@ -152,20 +312,37 @@ df5 <- dbGetQuery(
on a.nct_id=b.nct_id on a.nct_id=b.nct_id
;" ;"
) )
df5$overall_status <- as.factor(df5$overall_status)
#df5 <- fetch(query5, n = -1) #df5 <- fetch(query5, n = -1)
ggplot(data=df5, aes(x=duration,y=snapshot_count,color=overall_status)) + ggplot(data=df5, aes(x=duration,y=snapshot_count,color=overall_status)) +
geom_point() + geom_jitter() +
ggtitle("duration, status, and snapshot_count") + ggtitle("duration, status, and snapshot_count") +
xlab("duration") + xlab("duration") +
ylab("snapshot count") ylab("snapshot count")
dbDisconnect(con) dbDisconnect(con)
#get number of trials and snapshots in each category
group_trials_by_category <- as.data.frame(aggregate(category_id ~ nct_id, df, max))
group_trials_by_category <- as.data.frame(group_trials_by_category)
ggplot(data = group_trials_by_category, aes(x=category_id)) +
geom_histogram(binwidth=1,color="black",fill="seagreen") +
scale_x_continuous(breaks=scales::pretty_breaks(n=22))
summary(df5)
``` ```
# Fit Model
```{r} ```{r}
################################# FIT ######################################### ################################# FIT MODEL #########################################
#setup data (named list) #setup data (named list)
trials_data <- list( trials_data <- list(
@ -173,11 +350,19 @@ trials_data <- list(
N = nrow(x), N = nrow(x),
L = n_categories$count, L = n_categories$count,
y = as.vector(y), y = as.vector(y),
ll = as.vector(categories$category_id), ll = as.vector(categories),
x = as.matrix(x) x = as.matrix(x),
mu_mean = 0,
mu_stdev = 0.05,
sigma_shape = 4,
sigma_rate = 20,
Nx = 189,
llx = as.vector(counterfact_categories),
counterfact_x_tilde = as.matrix(counterfact_x_tilde),
counterfact_x = as.matrix(counterfact_x)
) )
#fit <- stan(file='Logistic.stan', data = trials_data) #attempt at simple model
fit <- stan( fit <- stan(
file='Hierarchal_Logistic.stan', file='Hierarchal_Logistic.stan',
data = trials_data, data = trials_data,
@ -187,49 +372,81 @@ fit <- stan(
) )
``` ```
```{r}
#pull out prior predictions
generated <- gqs(
fit@stanmodel,
data=trials_data,
draws=as.matrix(fit),
seed=11021585
)
# to implement distribution of differences:
# create two datasets with interventions
# simulate both with the same seed and draws
# figure out how to difference the posterior (and maybe look at prior) probabilities
```
```{r} ```{r}
################################# ANALYZE ##################################### hist(as.vector(extract(generated, pars="p_prior")$p_prior))
print(fit) hist(as.vector(extract(generated, pars="mu_prior")$mu_prior), )
hist(as.vector(extract(generated, pars="sigma_prior")$sigma_prior))
``` ```
```{r}
check_hmc_diagnostics(fit)
hist(as.vector(extract(generated, pars="p_predicted")$p_predicted))
```
# Diagnostics
```{r} ```{r}
color_scheme_set("darkgray")
#trace plots #trace plots
plot(fit, pars=c("mu"), plotfun="trace") plot(fit, pars=c("mu"), plotfun="trace")
color_scheme_set("viridis")
mcmc_rank_overlay(
fit, for (i in 1:4) {
pars=c( print(
"mu[1]", mcmc_rank_overlay(
"mu[2]", fit,
"mu[3]", pars=c(
"mu[4]", paste0("mu[",4*i-3,"]"),
"mu[5]", paste0("mu[",4*i-2,"]"),
"mu[6]", paste0("mu[",4*i-1,"]"),
"mu[7]" paste0("mu[",4*i,"]")
), ),
n_bins=100 n_bins=100
)+ legend_move("top") )+ legend_move("top") +
``` scale_colour_ghibli_d("KikiMedium")
)
```{r} }
color_scheme_set("viridis") ```
```{r}
plot(fit, pars=c("sigma"), plotfun="trace") plot(fit, pars=c("sigma"), plotfun="trace")
color_scheme_set("viridis")
mcmc_rank_overlay( for (i in 1:4) {
fit, print(
pars=c( mcmc_rank_overlay(
"sigma[1]", fit,
"sigma[2]", pars=c(
"sigma[3]", paste0("sigma[",4*i-3,"]"),
"sigma[4]", paste0("sigma[",4*i-2,"]"),
"sigma[5]", paste0("sigma[",4*i-1,"]"),
"sigma[6]", paste0("sigma[",4*i,"]")
"sigma[7]" ),
), n_bins=100
n_bins=100 )+ legend_move("top") +
)+ legend_move("top") scale_colour_ghibli_d("KikiMedium")
)
}
``` ```
```{r} ```{r}
@ -242,81 +459,513 @@ posterior <- as.array(fit)
```{r} ```{r}
color_scheme_set("darkgray") color_scheme_set("darkgray")
mcmc_parcoord(posterior,pars=c( div_style <- parcoord_style_np(div_color = "green", div_size = 0.05, div_alpha = 0.4)
"mu[1]", mcmc_parcoord(posterior, regex_pars = "mu", np=nuts_prmts, np_style = div_style, alpha = 0.05)
"mu[2]",
"mu[3]",
"mu[4]",
"mu[5]",
"mu[6]",
"mu[7]"
),
np=nuts_prmts
)
``` ```
```{r} ```{r}
mcmc_pairs( for (i in 1:4) {
posterior, mus = sapply(3:0, function(j) paste0("mu[",4*i-j ,"]"))
np = nuts_prmts, print(
pars=c( mcmc_pairs(
"mu[1]", posterior,
"mu[2]", np = nuts_prmts,
"mu[3]", pars=c(
"mu[4]", mus,
"mu[5]", "lp__"
"mu[6]", ),
"mu[7]" off_diag_args = list(size = 0.75)
)
)
}
```
```{r}
mcmc_parcoord(posterior,regex_pars = "sigma", np=nuts_prmts, alpha=0.05)
```
```{r}
for (i in 1:4) {
params = sapply(3:0, function(j) paste0("sigma[",4*i-j ,"]"))
print(
mcmc_pairs(
posterior,
np = nuts_prmts,
pars=c(
params,
"lp__"
),
off_diag_args = list(size = 0.75)
)
)
}
```
```{r}
for (k in 1:22) {
for (i in 1:4) {
params = sapply(3:0, function(j) paste0("beta[",k,",",4*i-j ,"]"))
print(
mcmc_pairs(
posterior,
np = nuts_prmts,
pars=c(
params,
"lp__"
),
off_diag_args = list(size = 0.75)
)
)
}}
```
# Results
```{r}
################################# ANALYZE #####################################
print(fit)
```
## Result Plots
Note the regular large difference in variance.
I would guess those are the beta[1:22,2] values.
I wonder if a lot of the variance is due to the 2 values that are sitting out.
```{r}
beta_list <- list(
groups = c(
`1`="Infections & Parasites",
`2`="Neoplasms",
`3`="Blood & Immune system",
`4`="Endocrine, Nutritional, and Metabolic",
`5`="Mental & Behavioral",
`6`="Nervous System",
`7`="Eye and Adnexa",
`8`="Ear and Mastoid",
`9`="Circulatory",
`10`="Respiratory",
`11`="Digestive",
`12`="Skin & Subcutaneaous tissue",
`13`="Musculoskeletal",
`14`="Genitourinary",
`15`="Pregancy, Childbirth, & Puerperium",
`16`="Perinatal Period",
`17`="Congential",
`18`="Symptoms, Signs etc.",
`19`="Injury etc.",
`20`="External Causes",
`21`="Contact with Healthcare",
`22`="Special Purposes"
), ),
off_diag_args = list(size = 0.75) parameters = c(
`1`="Elapsed Duration",
# brands
`2`="asinh(Generic Brands)",
`3`="asinh(Competitors USPDC)",
# population
`4`="asinh(High SDI)",
`5`="asinh(High-Medium SDI)",
`6`="asinh(Medium SDI)",
`7`="asinh(Low-Medium SDI)",
`8`="asinh(Low SDI)",
#Status
`9`="status_NYR",
`10`="status_EBI",
`11`="status_ANR",
#interactions for brands
`12`="ib*elapsed",
`13`="bnc*elapsed",
# interactions for status
`14`="sNYR*elapsed",
`15`="sEBI*elapsed",
`16`="sANR*elapsed"
)
) )
get_parameters <- function(stem,class_list) {
#get categories and lengths
named <- names(class_list)
lengths <- sapply(named, (function (x) length(class_list[[x]])))
#describe the grid needed
iter_list <- sapply(named, (function (x) 1:lengths[x]))
#generate the list of parameters
pardf <- generate_parameter_df(stem, iter_list)
#add columns with appropriate human-readable names
for (name in named) {
pardf[paste(name,"_hr",sep="")] <- as.factor(
sapply(pardf[name], (function (i) class_list[[name]][i]))
)
}
return(pardf)
}
generate_parameter_df <- function(stem, iter_list) {
grid <- expand.grid(iter_list)
grid["param_name"] <- grid %>% unite(x,colnames(grid),sep=",")
grid["param_name"] <- paste(stem,"[",grid$param_name,"]",sep="")
return(grid)
}
group_mcmc_areas <- function(
stem,# = "beta"
class_list,# = beta_list
stanfit,# = fit
group_id,# = 2
rename=TRUE
) {
#get all parameter names
params <- get_parameters(stem,class_list)
#filter down to parameters of interest
params <- filter(params,groups == group_id)
#Get dataframe with only the rows of interest
filtdata <- as.data.frame(stanfit)[params$param_name]
#rename columns
if (rename) dimnames(filtdata)[[2]] <- params$parameters_hr
#get group name for title
group_name <- class_list$groups[group_id]
#create area plot with appropriate title
mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
ggtitle(paste("Parameter distributions for ICD-10 class:",group_name))
}
parameter_mcmc_areas <- function(
stem,# = "beta"
class_list,# = beta_list
stanfit,# = fit
parameter_id,# = 2
rename=TRUE
) {
#get all parameter names
params <- get_parameters(stem,class_list)
#filter down to parameters of interest
params <- filter(params,parameters == parameter_id)
#Get dataframe with only the rows of interest
filtdata <- as.data.frame(stanfit)[params$param_name]
#rename columns
if (rename) dimnames(filtdata)[[2]] <- params$groups_hr
#get group name for title
parameter_name <- class_list$parameters[parameter_id]
#create area plot with appropriate title
mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
ggtitle(parameter_name,"Parameter Distribution")
}
```
```{r}
#mcmc_intervals(fit, pars=get_parameters("beta",beta_list)$param_name)
```
### Investigating parameter distributions
```{r}
#g1 <- group_mcmc_areas("beta",beta_list,fit,2)
#g2 <- group_mcmc_areas("beta",beta_list,fit,2)
#g3 <- group_mcmc_areas("beta",beta_list,fit,2)
#g4 <- group_mcmc_areas("beta",beta_list,fit,2)
#g5 <- group_mcmc_areas("beta",beta_list,fit,2)
#g6 <- group_mcmc_areas("beta",beta_list,fit,2)
#g7 <- group_mcmc_areas("beta",beta_list,fit,2)
#g8 <- group_mcmc_areas("beta",beta_list,fit,2)
#g9 <- group_mcmc_areas("beta",beta_list,fit,2)
#g10 <- group_mcmc_areas("beta",beta_list,fit,2)
#g11 <- group_mcmc_areas("beta",beta_list,fit,2)
#g12 <- group_mcmc_areas("beta",beta_list,fit,2)
#g13 <- group_mcmc_areas("beta",beta_list,fit,2)
#g14 <- group_mcmc_areas("beta",beta_list,fit,2)
#g15 <- group_mcmc_areas("beta",beta_list,fit,2)
#g16 <- group_mcmc_areas("beta",beta_list,fit,2)
#g17 <- group_mcmc_areas("beta",beta_list,fit,2)
#g18 <- group_mcmc_areas("beta",beta_list,fit,2)
#g19 <- group_mcmc_areas("beta",beta_list,fit,2)
#g20 <- group_mcmc_areas("beta",beta_list,fit,2)
#g21 <- group_mcmc_areas("beta",beta_list,fit,2)
#g22 <- group_mcmc_areas("beta",beta_list,fit,2)
p1 <- parameter_mcmc_areas("beta",beta_list,fit,1)
p2 <- parameter_mcmc_areas("beta",beta_list,fit,2)
p3 <- parameter_mcmc_areas("beta",beta_list,fit,3)
#p4 <- parameter_mcmc_areas("beta",beta_list,fit,4)
#p5 <- parameter_mcmc_areas("beta",beta_list,fit,5)
#p6 <- parameter_mcmc_areas("beta",beta_list,fit,6)
#p7 <- parameter_mcmc_areas("beta",beta_list,fit,7)
#p8 <- parameter_mcmc_areas("beta",beta_list,fit,8)
p9 <- parameter_mcmc_areas("beta",beta_list,fit,9)
p10 <- parameter_mcmc_areas("beta",beta_list,fit,10)
p11 <- parameter_mcmc_areas("beta",beta_list,fit,11)
p12 <- parameter_mcmc_areas("beta",beta_list,fit,12)
p13 <- parameter_mcmc_areas("beta",beta_list,fit,13)
p14 <- parameter_mcmc_areas("beta",beta_list,fit,14)
p15 <- parameter_mcmc_areas("beta",beta_list,fit,15)
p16 <- parameter_mcmc_areas("beta",beta_list,fit,16)
```
Note these have 95% outer CI and 80% inner (shaded)
1) "Elapsed Duration",
2) "asinh(Generic Brands)",
3) "asinh(Competitors USPDC)",
4) "asinh(High SDI)",
5) "asinh(High-Medium SDI)",
6) "asinh(Medium SDI)",
7) "asinh(Low-Medium SDI)",
8) "asinh(Low SDI)",
9) "status_NYR",
10) "status_EBI",
11) "status_ANR",
12) "ib*elapsed",
13) "bnc*elapsed",
14) "sNYR*elapsed",
15) "sEBI*elapsed",
16) "sANR*elapsed"
of interest
- p1 + p2
- p3 + p2
- p2 + p12
- p3 + p13
- p9 + p14
- p10 + p15
- p11 + p16
```{r}
p1 + p2
```
```{r}
p2 + p3
```
```{r}
p2 + p12
```
```{r}
p3 + p13
```
```{r}
p9 + p14
```
```{r}
p10 + p15
```
```{r}
p11 + p16
```
# Posterior Prediction
```{r}
#TODO: Convert to ggplot, stabilize y axis
hist(as.vector(extract(generated, pars="p_predicted_default")$p_predicted_default))
hist(as.vector(extract(generated, pars="p_predicted_intervention")$p_predicted_intervention))
``` ```
## Distribution of Predicted Differences
### Intervention: Marginal increase in time to finish enrollment
```{r} ```{r}
mcmc_parcoord(posterior,pars=c(
"sigma[1]", pddf <- data.frame(extract(generated, pars="predicted_difference")$predicted_difference) |> pivot_longer(X1:X189)
"sigma[2]", pddf["entry_idx"] <- as.numeric(gsub("\\D","",pddf$name))
"sigma[3]",
"sigma[4]", pddf["category"] <- sapply(pddf$entry_idx, function(i) counterfact_categories[i])
"sigma[5]", pddf["category_name"] <- sapply(pddf$category, function(i) beta_list$groups[i])
"sigma[6]",
"sigma[7]" ggplot(pddf, aes(x=value,)) +
), geom_histogram(bins=100) +
np=nuts_prmts labs(
title = "Distribution of predicted differences"
,x = "Difference in probability due to intervention"
,y = "Predicted counts"
) +
xlim(-0.3,0.1) +
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed")
ggplot(pddf, aes(x=value,)) +
geom_histogram(bins=100) +
facet_wrap(
~factor(
category_name,
levels=beta_list$groups
)
, labeller = label_wrap_gen(multi_line = TRUE)
, ncol=5) +
labs(
title = "Distribution of predicted differences",
subtitle = "By group"
,x = "Difference in probability due to intervention"
,y = "Predicted counts"
) +
xlim(-0.25,0.1) +
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
theme(strip.text.x = element_text(size = 8))
```
Recall that we had really tight zero priors.
### Intervention: Adding a single competitor
```{r}
inherited_cols <- c(
"elapsed_duration"
#,"identical_brands"
#,"brand_name_counts"
,"h_sdi_val"
,"hm_sdi_val"
,"m_sdi_val"
,"lm_sdi_val"
,"l_sdi_val"
,"status_NYR"
,"status_EBI"
,"status_ANR"
#,"ib*elapsed"
#,"bnc*elapsed"
,"sNYR*elapsed"
,"sEBI*elapsed"
,"sANR*elapsed"
) )
``` ```
#### Generics
```{r} ```{r}
mcmc_pairs( #generics intervention
posterior, brand_intervention_ib <- x[c(inherited_cols,"brand_name_counts","bnc*elapsed")]
np = nuts_prmts, brand_intervention_ib["identical_brands"] <- asinh(sinh(x$identical_brands)+1) #add a single generic brand
pars=c( brand_intervention_ib["ib*elapsed"] <- brand_intervention_ib$identical_brands * brand_intervention_ib$elapsed_duration
"sigma[1]", ```
"sigma[2]",
"sigma[3]", ```{r}
"sigma[4]", counterfact_marketing_ib <- list(
"sigma[5]", D = ncol(x),#
"sigma[6]", N = nrow(x),
"sigma[7]" L = n_categories$count,
), y = as.vector(y),
off_diag_args = list(size = 0.75) ll = as.vector(categories),
x = as.matrix(x),
mu_mean = 0,
mu_stdev = 0.05,
sigma_shape = 4,
sigma_rate = 20,
Nx = nrow(x),
llx = as.vector(categories),
counterfact_x_tilde = as.matrix(brand_intervention_ib),
counterfact_x = as.matrix(x)
) )
```
```{r}
generated_ib <- gqs(
fit@stanmodel,
data=counterfact_marketing_ib,
draws=as.matrix(fit),
seed=11021585
)
```
```{r}
#TODO: Convert to ggplot, stabilize y axis
hist(as.vector(extract(generated_ib, pars="p_predicted_default")$p_predicted_default), bins=100)
hist(as.vector(extract(generated_ib, pars="p_predicted_intervention")$p_predicted_intervention), bins=100)
hist(as.vector(extract(generated_ib, pars="predicted_difference")$predicted_difference), bins=100)
``` ```
```{r} ```{r}
# check model estimates
#mu
plot(fit, pars=c("mu"), show_density =TRUE, ci_level=0.8) pddf_ib <- data.frame(extract(generated_ib, pars="predicted_difference")$predicted_difference) |>
plot(fit, pars=c("mu"), plotfun = "hist", bins=50) pivot_longer(X1:X1343)
#TODO: Fix Category names
pddf_ib["entry_idx"] <- as.numeric(gsub("\\D","",pddf_ib$name))
pddf_ib["category"] <- sapply(pddf_ib$entry_idx, function(i) df$category_id[i])
pddf_ib["category_name"] <- sapply(pddf_ib$category, function(i) beta_list$groups[i])
``` ```
```{r} ```{r}
#sigma
plot(fit, pars=c("sigma"), show_density =TRUE, ci_level=0.8)
plot(fit, pars=c("sigma"), plotfun = "hist",bins=50)
ggplot(pddf_ib, aes(x=value,)) +
geom_histogram(bins=100) +
labs(
title = "Distribution of predicted differences"
,x = "Difference in probability due to intervention"
,y = "Predicted counts"
) +
#xlim(-0.3,0.1) +
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed")
ggplot(pddf_ib, aes(x=value,)) +
geom_histogram(bins=100) +
facet_wrap(
~factor(
category_name,
levels=beta_list$groups
)
, labeller = label_wrap_gen(multi_line = TRUE)
, ncol=5) +
labs(
title = "Distribution of predicted differences",
subtitle = "By group"
,x = "Difference in probability due to intervention"
,y = "Predicted counts"
) +
#xlim(-0.25,0.1) +
geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
theme(strip.text.x = element_text(size = 8))
```
#### USP DC
```{r}
#formulary intervention
brand_intervention_bnc <- x[c(inherited_cols,"identical_brands","ib*elapsed")]
brand_intervention_bnc["brand_name_counts"] <- asinh(sinh(x$brand_name_counts)+1) #add a single formulary competitor brand
brand_intervention_bnc["bnc*elapsed"] <- brand_intervention_bnc$brand_name_counts * brand_intervention_bnc$elapsed_duration
``` ```

@ -11,21 +11,30 @@
// The input data is a vector 'y' of length 'N'. // The input data is a vector 'y' of length 'N'.
data { data {
int<lower=1> D; int<lower=1> D; // number of features
int<lower=1> N; int<lower=1> N; // number of observations
int<lower=1> L; int<lower=1> L; // number of categories
int<lower=0, upper=1> y[N]; array[N] int<lower=0, upper=1> y; // vector of dependent variables
int<lower=1, upper=L> ll[N]; array[N] int<lower=1, upper=L> ll; // vector of categories
row_vector[D] x[N]; array[N] row_vector[D] x; // independent variables
real mu_mean; //hyperprior
real<lower=0> mu_stdev; //hyperprior
real sigma_shape; //hyperprior
real sigma_rate; //hyperprior
//counterfactuals
int<lower=0> Nx;
array[Nx] int<lower=1, upper=L> llx;
array[Nx] row_vector[D] counterfact_x_tilde; // Posterior Prediction intervention
array[Nx] row_vector[D] counterfact_x; // Posterior Prediction intervention
} }
parameters { parameters {
real mu[D]; array[D] real mu;
real<lower=0> sigma[D]; array[D] real<lower=0> sigma;
vector[D] beta[L]; array[L] vector[D] beta;
} }
model { model {
sigma ~ gamma(2,1); //shape and inverse scale sigma ~ gamma(sigma_shape,sigma_rate); //hyperprior for stdev: shape and inverse scale
mu ~ normal(0, 1); //convert to mvnormal mu ~ normal(mu_mean, mu_stdev); //hyperprior for mean //TODO: convert to mvnormal
for (l in 1:L) { for (l in 1:L) {
beta[l] ~ normal(mu, sigma); beta[l] ~ normal(mu, sigma);
} }
@ -37,4 +46,47 @@ model {
y ~ bernoulli_logit(x_beta_ll); y ~ bernoulli_logit(x_beta_ll);
} }
} }
generated quantities {
//SETUP PRIOR PREDICTION
//preallocate
real mu_prior[D];
real sigma_prior[D];
real p_prior[N]; // what I have priors about
real p_predicted[N]; //predicted p_values
//intervention
real p_predicted_default[Nx]; //predicted p_values
real p_predicted_intervention[Nx]; //predicted p_values
real predicted_difference[Nx]; //difference in predicted values
//GENERATE PRIOR PREDICTIONS
{
vector[D] beta_prior[L];//local var
//sample parameters
for (d in 1:D) {
mu_prior[d] = normal_rng(mu_mean,mu_stdev);
sigma_prior[d] = gamma_rng(sigma_shape,sigma_rate);
}
for (l in 1:L) {
for (d in 1:D) {
beta_prior[l,d] = normal_rng(mu_prior[d],sigma_prior[d]);
}
}
//generate probabilities
vector[D] b_prior[N];//local var
for (n in 1:N){
b_prior[n] = beta_prior[ll[n]];
p_prior[n] = inv_logit( x[n] * b_prior[n] );
}
}
//GENERATE POSTERIOR PREDICTIONS
for (n in 1:N) {
p_predicted[n] = inv_logit( x[n] * beta[ll[n]] );
}
//GENERATE POSTERIOR DISTRIBUTION OF DIFFERENCES
for (n in 1:Nx) {
p_predicted_default[n] = inv_logit( counterfact_x[n] * beta[llx[n]] );
p_predicted_intervention[n] = inv_logit( counterfact_x_tilde[n] * beta[llx[n]] ); //intervention
predicted_difference[n] = p_predicted_default[n] - p_predicted_intervention[n];
}
}

Loading…
Cancel
Save