From 63229d3a99264d64e046a9303f456204b58295f5 Mon Sep 17 00:00:00 2001 From: Will King Date: Wed, 4 Oct 2023 05:18:43 +0000 Subject: [PATCH] Updated to keep current analysis --- r-analysis/EffectsOfMarketConditions.qmd | 863 ++++++++++++++++++++--- r-analysis/Hierarchal_Logistic.stan | 74 +- 2 files changed, 819 insertions(+), 118 deletions(-) diff --git a/r-analysis/EffectsOfMarketConditions.qmd b/r-analysis/EffectsOfMarketConditions.qmd index fd0f24c..48779e7 100644 --- a/r-analysis/EffectsOfMarketConditions.qmd +++ b/r-analysis/EffectsOfMarketConditions.qmd @@ -5,12 +5,18 @@ format: html editor: source --- + +# Setup + ```{r} library(bayesplot) available_mcmc(pattern = "_nuts_") library(ggplot2) - +library(patchwork) +library(tidyverse) library(rstan) +library(tidyr) +library(ghibli) #Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started #save unchanged models instead of recompiling @@ -41,7 +47,41 @@ on.exit(dbDisconnect(con)) query <- dbSendQuery( con, - "select * from formatted_data_with_planned_enrollment;" +# "select * from formatted_data_with_planned_enrollment;" +" +select + fdqpe.nct_id + --,fdqpe.start_date + --,fdqpe.current_enrollment + --,fdqpe.enrollment_category + ,fdqpe.current_status + ,fdqpe.earliest_date_observed + ,fdqpe.elapsed_duration + ,fdqpe.n_brands as identical_brands + ,ntbtu.brand_name_count + ,fdqpe.category_id + ,fdqpe.final_status + ,fdqpe.h_sdi_val + --,fdqpe.h_sdi_u95 + --,fdqpe.h_sdi_l95 + ,fdqpe.hm_sdi_val + --,fdqpe.hm_sdi_u95 + --,fdqpe.hm_sdi_l95 + ,fdqpe.m_sdi_val + --,fdqpe.m_sdi_u95 + --,fdqpe.m_sdi_l95 + ,fdqpe.lm_sdi_val + --,fdqpe.lm_sdi_u95 + --,fdqpe.lm_sdi_l95 + ,fdqpe.l_sdi_val + --,fdqpe.l_sdi_u95 + --,fdqpe.l_sdi_l95 +from formatted_data_with_planned_enrollment fdqpe + join \"Formularies\".nct_to_brands_through_uspdc ntbtu + on fdqpe.nct_id = ntbtu.nct_id +order by fdqpe.nct_id, fdqpe.earliest_date_observed +; +" ) df <- fetch(query, n = -1) df <- na.omit(df) @@ -58,32 +98,152 @@ n_categories <- d$ncat -################ Format Data ########################### +################ Format Data ########################### +data_formatter <- function(df) { categories <- df["category_id"] x <- df["elapsed_duration"] -x["log_n_brands"] <- asinh(df$n_brands) #todo - convert to arcsinh() transform +x["identical_brands"] <- asinh(df$identical_brands) +x["brand_name_counts"] <- asinh(df$brand_name_count) x["h_sdi_val"] <- asinh(df$h_sdi_val) x["hm_sdi_val"] <- asinh(df$hm_sdi_val) x["m_sdi_val"] <- asinh(df$m_sdi_val) x["lm_sdi_val"] <- asinh(df$lm_sdi_val) x["l_sdi_val"] <- asinh(df$l_sdi_val) -#x["enrollment"] <- df["current_enrollment"] / df["planned_enrollment"] -#square terms -#x["enrollment^2"] <- x["enrollment"]^2 -#x["elapsed_duration^2"] <- x["elapsed_duration"]^2 -#x["n_brands^2"] <- x["n_brands"]^2 -#break out -#x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0) -#x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0) -#x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0) -#x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0) + + +#Setup fixed effects +x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0) +x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0) +#x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0) # Base case is Recruiting +x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0) + + +#interaction terms for competitors +x["ib*elapsed"] <- x["elapsed_duration"]*x["identical_brands"] +x["bnc*elapsed"] <- x["elapsed_duration"] * x["brand_name_counts"] + +#interaction terms for status effects +x["sNYR*elapsed"] <- x["elapsed_duration"]*x["status_NYR"] +x["sEBI*elapsed"] <- x["elapsed_duration"]*x["status_EBI"] +x["sANR*elapsed"] <- x["elapsed_duration"]*x["status_ANR"] y <- ifelse(df["final_status"]=="Terminated",1,0) + +#get category list + + +return(list(x=x,y=y)) +} + +train <- data_formatter(df) + +categories <- df$category_id + +x <- train$x +y <- train$y +``` + + +```{r} +# get data for counterfactuals +get_counterfactuals <- function(driver) { + con <- dbConnect( + driver, + user='root', + password='root', + dbname='aact_db', + host='will-office' + ) + on.exit(dbDisconnect(con)) + + query <- dbSendQuery( + con, + " + with cte as ( + select + nct_id + ,lag(current_status, 1) over (partition by nct_id order by earliest_date_observed) as previous_status + ,current_status + ,earliest_date_observed as date_current + from formatted_data_mat fdm + ), cte2 as ( + select + nct_id + ,previous_status + ,current_status + ,max(date_current) as date_current_max + from cte + where + previous_status != current_status + and + current_status = 'Active, not recruiting' + group by + nct_id + ,previous_status + ,current_status + ,date_current + ) + select + fdqpe.nct_id + --,fdqpe.start_date + --,fdqpe.current_enrollment + --,fdqpe.enrollment_category + ,fdqpe.current_status + ,fdqpe.earliest_date_observed + ,fdqpe.elapsed_duration + ,fdqpe.n_brands as identical_brands + ,ntbtu.brand_name_count + ,fdqpe.category_id + ,fdqpe.final_status + ,fdqpe.h_sdi_val + --,fdqpe.h_sdi_u95 + --,fdqpe.h_sdi_l95 + ,fdqpe.hm_sdi_val + --,fdqpe.hm_sdi_u95 + --,fdqpe.hm_sdi_l95 + ,fdqpe.m_sdi_val + --,fdqpe.m_sdi_u95 + --,fdqpe.m_sdi_l95 + ,fdqpe.lm_sdi_val + --,fdqpe.lm_sdi_u95 + --,fdqpe.lm_sdi_l95 + ,fdqpe.l_sdi_val + --,fdqpe.l_sdi_u95 + --,fdqpe.l_sdi_l95 + from formatted_data_with_planned_enrollment fdqpe + join \"Formularies\".nct_to_brands_through_uspdc ntbtu + on fdqpe.nct_id = ntbtu.nct_id + join cte2 + on cte2.nct_id = fdqpe.nct_id + and cte2.date_current_max = fdqpe.earliest_date_observed + order by fdqpe.nct_id, fdqpe.earliest_date_observed + ; + " + ) + df <- fetch(query, n = -1) + df <- na.omit(df) + + return(df) +} +counterfact_raw <- get_counterfactuals(driver) +#extract data +counterfact_list <- data_formatter(counterfact_raw) +counterfact_list$y <- NULL #remove the chance of accidentally training on the wrong data +counterfact_categories <- counterfact_raw$category_id +#setup the two counterfactuals +counterfact_x <- counterfact_list$x #no change +counterfact_x_tilde <- counterfact_list$x #to be changed +#make changes, set it to Recruiting #TODO: change this so it matches previous state. +counterfact_x_tilde$status_ANR <- 0 +counterfact_x_tilde["sANR*elapsed"] <- 0 ``` + +## Explore data + ```{r} ################################# DATA EXPLORATION ############################ driver <- dbDriver("PostgreSQL") @@ -152,20 +312,37 @@ df5 <- dbGetQuery( on a.nct_id=b.nct_id ;" ) +df5$overall_status <- as.factor(df5$overall_status) #df5 <- fetch(query5, n = -1) ggplot(data=df5, aes(x=duration,y=snapshot_count,color=overall_status)) + - geom_point() + + geom_jitter() + ggtitle("duration, status, and snapshot_count") + xlab("duration") + ylab("snapshot count") dbDisconnect(con) + +#get number of trials and snapshots in each category +group_trials_by_category <- as.data.frame(aggregate(category_id ~ nct_id, df, max)) +group_trials_by_category <- as.data.frame(group_trials_by_category) +ggplot(data = group_trials_by_category, aes(x=category_id)) + + geom_histogram(binwidth=1,color="black",fill="seagreen") + + scale_x_continuous(breaks=scales::pretty_breaks(n=22)) + + + +summary(df5) ``` + + +# Fit Model + + ```{r} -################################# FIT ######################################### +################################# FIT MODEL ######################################### #setup data (named list) trials_data <- list( @@ -173,11 +350,19 @@ trials_data <- list( N = nrow(x), L = n_categories$count, y = as.vector(y), - ll = as.vector(categories$category_id), - x = as.matrix(x) + ll = as.vector(categories), + x = as.matrix(x), + mu_mean = 0, + mu_stdev = 0.05, + sigma_shape = 4, + sigma_rate = 20, + Nx = 189, + llx = as.vector(counterfact_categories), + counterfact_x_tilde = as.matrix(counterfact_x_tilde), + counterfact_x = as.matrix(counterfact_x) ) -#fit <- stan(file='Logistic.stan', data = trials_data) #attempt at simple model + fit <- stan( file='Hierarchal_Logistic.stan', data = trials_data, @@ -187,49 +372,81 @@ fit <- stan( ) ``` +```{r} +#pull out prior predictions +generated <- gqs( + fit@stanmodel, + data=trials_data, + draws=as.matrix(fit), + seed=11021585 + ) +# to implement distribution of differences: +# create two datasets with interventions +# simulate both with the same seed and draws +# figure out how to difference the posterior (and maybe look at prior) probabilities +``` + ```{r} -################################# ANALYZE ##################################### -print(fit) +hist(as.vector(extract(generated, pars="p_prior")$p_prior)) +hist(as.vector(extract(generated, pars="mu_prior")$mu_prior), ) +hist(as.vector(extract(generated, pars="sigma_prior")$sigma_prior)) ``` + + +```{r} +check_hmc_diagnostics(fit) +hist(as.vector(extract(generated, pars="p_predicted")$p_predicted)) +``` + + + + + + +# Diagnostics + ```{r} -color_scheme_set("darkgray") #trace plots plot(fit, pars=c("mu"), plotfun="trace") -color_scheme_set("viridis") -mcmc_rank_overlay( - fit, - pars=c( - "mu[1]", - "mu[2]", - "mu[3]", - "mu[4]", - "mu[5]", - "mu[6]", - "mu[7]" - ), - n_bins=100 - )+ legend_move("top") -``` - -```{r} -color_scheme_set("viridis") + + +for (i in 1:4) { + print( + mcmc_rank_overlay( + fit, + pars=c( + paste0("mu[",4*i-3,"]"), + paste0("mu[",4*i-2,"]"), + paste0("mu[",4*i-1,"]"), + paste0("mu[",4*i,"]") + ), + n_bins=100 + )+ legend_move("top") + + scale_colour_ghibli_d("KikiMedium") + ) +} +``` + +```{r} plot(fit, pars=c("sigma"), plotfun="trace") -color_scheme_set("viridis") -mcmc_rank_overlay( - fit, - pars=c( - "sigma[1]", - "sigma[2]", - "sigma[3]", - "sigma[4]", - "sigma[5]", - "sigma[6]", - "sigma[7]" - ), - n_bins=100 - )+ legend_move("top") + +for (i in 1:4) { + print( + mcmc_rank_overlay( + fit, + pars=c( + paste0("sigma[",4*i-3,"]"), + paste0("sigma[",4*i-2,"]"), + paste0("sigma[",4*i-1,"]"), + paste0("sigma[",4*i,"]") + ), + n_bins=100 + )+ legend_move("top") + + scale_colour_ghibli_d("KikiMedium") + ) +} ``` ```{r} @@ -242,81 +459,513 @@ posterior <- as.array(fit) ```{r} color_scheme_set("darkgray") -mcmc_parcoord(posterior,pars=c( - "mu[1]", - "mu[2]", - "mu[3]", - "mu[4]", - "mu[5]", - "mu[6]", - "mu[7]" - ), - np=nuts_prmts -) +div_style <- parcoord_style_np(div_color = "green", div_size = 0.05, div_alpha = 0.4) +mcmc_parcoord(posterior, regex_pars = "mu", np=nuts_prmts, np_style = div_style, alpha = 0.05) ``` ```{r} -mcmc_pairs( - posterior, - np = nuts_prmts, - pars=c( - "mu[1]", - "mu[2]", - "mu[3]", - "mu[4]", - "mu[5]", - "mu[6]", - "mu[7]" +for (i in 1:4) { + mus = sapply(3:0, function(j) paste0("mu[",4*i-j ,"]")) + print( + mcmc_pairs( + posterior, + np = nuts_prmts, + pars=c( + mus, + "lp__" + ), + off_diag_args = list(size = 0.75) + ) + ) +} + + + +``` + +```{r} +mcmc_parcoord(posterior,regex_pars = "sigma", np=nuts_prmts, alpha=0.05) +``` + +```{r} + +for (i in 1:4) { + params = sapply(3:0, function(j) paste0("sigma[",4*i-j ,"]")) + print( + mcmc_pairs( + posterior, + np = nuts_prmts, + pars=c( + params, + "lp__" + ), + off_diag_args = list(size = 0.75) + ) + ) +} +``` + + +```{r} +for (k in 1:22) { +for (i in 1:4) { + params = sapply(3:0, function(j) paste0("beta[",k,",",4*i-j ,"]")) + print( + mcmc_pairs( + posterior, + np = nuts_prmts, + pars=c( + params, + "lp__" + ), + off_diag_args = list(size = 0.75) + ) + ) +}} +``` + + +# Results + + +```{r} +################################# ANALYZE ##################################### +print(fit) +``` + +## Result Plots + + +Note the regular large difference in variance. +I would guess those are the beta[1:22,2] values. +I wonder if a lot of the variance is due to the 2 values that are sitting out. + + + +```{r} +beta_list <- list( + groups = c( + `1`="Infections & Parasites", + `2`="Neoplasms", + `3`="Blood & Immune system", + `4`="Endocrine, Nutritional, and Metabolic", + `5`="Mental & Behavioral", + `6`="Nervous System", + `7`="Eye and Adnexa", + `8`="Ear and Mastoid", + `9`="Circulatory", + `10`="Respiratory", + `11`="Digestive", + `12`="Skin & Subcutaneaous tissue", + `13`="Musculoskeletal", + `14`="Genitourinary", + `15`="Pregancy, Childbirth, & Puerperium", + `16`="Perinatal Period", + `17`="Congential", + `18`="Symptoms, Signs etc.", + `19`="Injury etc.", + `20`="External Causes", + `21`="Contact with Healthcare", + `22`="Special Purposes" ), - off_diag_args = list(size = 0.75) + parameters = c( + `1`="Elapsed Duration", + # brands + `2`="asinh(Generic Brands)", + `3`="asinh(Competitors USPDC)", + # population + `4`="asinh(High SDI)", + `5`="asinh(High-Medium SDI)", + `6`="asinh(Medium SDI)", + `7`="asinh(Low-Medium SDI)", + `8`="asinh(Low SDI)", + #Status + `9`="status_NYR", + `10`="status_EBI", + `11`="status_ANR", + #interactions for brands + `12`="ib*elapsed", + `13`="bnc*elapsed", + # interactions for status + `14`="sNYR*elapsed", + `15`="sEBI*elapsed", + `16`="sANR*elapsed" + + ) ) +get_parameters <- function(stem,class_list) { + #get categories and lengths + named <- names(class_list) + lengths <- sapply(named, (function (x) length(class_list[[x]]))) + + #describe the grid needed + iter_list <- sapply(named, (function (x) 1:lengths[x])) + + #generate the list of parameters + pardf <- generate_parameter_df(stem, iter_list) + + #add columns with appropriate human-readable names + for (name in named) { + pardf[paste(name,"_hr",sep="")] <- as.factor( + sapply(pardf[name], (function (i) class_list[[name]][i])) + ) + } + + return(pardf) +} + +generate_parameter_df <- function(stem, iter_list) { + grid <- expand.grid(iter_list) + grid["param_name"] <- grid %>% unite(x,colnames(grid),sep=",") + grid["param_name"] <- paste(stem,"[",grid$param_name,"]",sep="") + return(grid) +} + +group_mcmc_areas <- function( + stem,# = "beta" + class_list,# = beta_list + stanfit,# = fit + group_id,# = 2 + rename=TRUE + ) { + #get all parameter names + params <- get_parameters(stem,class_list) + #filter down to parameters of interest + params <- filter(params,groups == group_id) + #Get dataframe with only the rows of interest + filtdata <- as.data.frame(stanfit)[params$param_name] + #rename columns + if (rename) dimnames(filtdata)[[2]] <- params$parameters_hr + #get group name for title + group_name <- class_list$groups[group_id] + #create area plot with appropriate title + mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) + + ggtitle(paste("Parameter distributions for ICD-10 class:",group_name)) +} + +parameter_mcmc_areas <- function( + stem,# = "beta" + class_list,# = beta_list + stanfit,# = fit + parameter_id,# = 2 + rename=TRUE + ) { + #get all parameter names + params <- get_parameters(stem,class_list) + #filter down to parameters of interest + params <- filter(params,parameters == parameter_id) + #Get dataframe with only the rows of interest + filtdata <- as.data.frame(stanfit)[params$param_name] + #rename columns + if (rename) dimnames(filtdata)[[2]] <- params$groups_hr + #get group name for title + parameter_name <- class_list$parameters[parameter_id] + #create area plot with appropriate title + mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) + + ggtitle(parameter_name,"Parameter Distribution") +} + + +``` + +```{r} +#mcmc_intervals(fit, pars=get_parameters("beta",beta_list)$param_name) +``` + +### Investigating parameter distributions + +```{r} +#g1 <- group_mcmc_areas("beta",beta_list,fit,2) +#g2 <- group_mcmc_areas("beta",beta_list,fit,2) +#g3 <- group_mcmc_areas("beta",beta_list,fit,2) +#g4 <- group_mcmc_areas("beta",beta_list,fit,2) +#g5 <- group_mcmc_areas("beta",beta_list,fit,2) +#g6 <- group_mcmc_areas("beta",beta_list,fit,2) +#g7 <- group_mcmc_areas("beta",beta_list,fit,2) +#g8 <- group_mcmc_areas("beta",beta_list,fit,2) +#g9 <- group_mcmc_areas("beta",beta_list,fit,2) +#g10 <- group_mcmc_areas("beta",beta_list,fit,2) +#g11 <- group_mcmc_areas("beta",beta_list,fit,2) +#g12 <- group_mcmc_areas("beta",beta_list,fit,2) +#g13 <- group_mcmc_areas("beta",beta_list,fit,2) +#g14 <- group_mcmc_areas("beta",beta_list,fit,2) +#g15 <- group_mcmc_areas("beta",beta_list,fit,2) +#g16 <- group_mcmc_areas("beta",beta_list,fit,2) +#g17 <- group_mcmc_areas("beta",beta_list,fit,2) +#g18 <- group_mcmc_areas("beta",beta_list,fit,2) +#g19 <- group_mcmc_areas("beta",beta_list,fit,2) +#g20 <- group_mcmc_areas("beta",beta_list,fit,2) +#g21 <- group_mcmc_areas("beta",beta_list,fit,2) +#g22 <- group_mcmc_areas("beta",beta_list,fit,2) + + +p1 <- parameter_mcmc_areas("beta",beta_list,fit,1) +p2 <- parameter_mcmc_areas("beta",beta_list,fit,2) +p3 <- parameter_mcmc_areas("beta",beta_list,fit,3) +#p4 <- parameter_mcmc_areas("beta",beta_list,fit,4) +#p5 <- parameter_mcmc_areas("beta",beta_list,fit,5) +#p6 <- parameter_mcmc_areas("beta",beta_list,fit,6) +#p7 <- parameter_mcmc_areas("beta",beta_list,fit,7) +#p8 <- parameter_mcmc_areas("beta",beta_list,fit,8) +p9 <- parameter_mcmc_areas("beta",beta_list,fit,9) +p10 <- parameter_mcmc_areas("beta",beta_list,fit,10) +p11 <- parameter_mcmc_areas("beta",beta_list,fit,11) +p12 <- parameter_mcmc_areas("beta",beta_list,fit,12) +p13 <- parameter_mcmc_areas("beta",beta_list,fit,13) +p14 <- parameter_mcmc_areas("beta",beta_list,fit,14) +p15 <- parameter_mcmc_areas("beta",beta_list,fit,15) +p16 <- parameter_mcmc_areas("beta",beta_list,fit,16) +``` + +Note these have 95% outer CI and 80% inner (shaded) + + + 1) "Elapsed Duration", + 2) "asinh(Generic Brands)", + 3) "asinh(Competitors USPDC)", + 4) "asinh(High SDI)", + 5) "asinh(High-Medium SDI)", + 6) "asinh(Medium SDI)", + 7) "asinh(Low-Medium SDI)", + 8) "asinh(Low SDI)", + 9) "status_NYR", + 10) "status_EBI", + 11) "status_ANR", + 12) "ib*elapsed", + 13) "bnc*elapsed", + 14) "sNYR*elapsed", + 15) "sEBI*elapsed", + 16) "sANR*elapsed" + +of interest +- p1 + p2 +- p3 + p2 +- p2 + p12 +- p3 + p13 +- p9 + p14 +- p10 + p15 +- p11 + p16 + +```{r} +p1 + p2 +``` + + +```{r} +p2 + p3 +``` + +```{r} +p2 + p12 +``` + +```{r} +p3 + p13 +``` + +```{r} +p9 + p14 +``` + + + +```{r} +p10 + p15 +``` + + +```{r} +p11 + p16 +``` + +# Posterior Prediction + + +```{r} +#TODO: Convert to ggplot, stabilize y axis +hist(as.vector(extract(generated, pars="p_predicted_default")$p_predicted_default)) +hist(as.vector(extract(generated, pars="p_predicted_intervention")$p_predicted_intervention)) ``` + + + + +## Distribution of Predicted Differences + +### Intervention: Marginal increase in time to finish enrollment + ```{r} -mcmc_parcoord(posterior,pars=c( - "sigma[1]", - "sigma[2]", - "sigma[3]", - "sigma[4]", - "sigma[5]", - "sigma[6]", - "sigma[7]" -), -np=nuts_prmts + +pddf <- data.frame(extract(generated, pars="predicted_difference")$predicted_difference) |> pivot_longer(X1:X189) +pddf["entry_idx"] <- as.numeric(gsub("\\D","",pddf$name)) + +pddf["category"] <- sapply(pddf$entry_idx, function(i) counterfact_categories[i]) +pddf["category_name"] <- sapply(pddf$category, function(i) beta_list$groups[i]) + +ggplot(pddf, aes(x=value,)) + + geom_histogram(bins=100) + + labs( + title = "Distribution of predicted differences" + ,x = "Difference in probability due to intervention" + ,y = "Predicted counts" + ) + + xlim(-0.3,0.1) + + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + +ggplot(pddf, aes(x=value,)) + + geom_histogram(bins=100) + + facet_wrap( + ~factor( + category_name, + levels=beta_list$groups + ) + , labeller = label_wrap_gen(multi_line = TRUE) + , ncol=5) + + labs( + title = "Distribution of predicted differences", + subtitle = "By group" + ,x = "Difference in probability due to intervention" + ,y = "Predicted counts" + ) + + xlim(-0.25,0.1) + + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + + theme(strip.text.x = element_text(size = 8)) + +``` + + +Recall that we had really tight zero priors. + + +### Intervention: Adding a single competitor +```{r} +inherited_cols <- c( + "elapsed_duration" + #,"identical_brands" + #,"brand_name_counts" + ,"h_sdi_val" + ,"hm_sdi_val" + ,"m_sdi_val" + ,"lm_sdi_val" + ,"l_sdi_val" + ,"status_NYR" + ,"status_EBI" + ,"status_ANR" + #,"ib*elapsed" + #,"bnc*elapsed" + ,"sNYR*elapsed" + ,"sEBI*elapsed" + ,"sANR*elapsed" ) ``` + + + + +#### Generics + ```{r} -mcmc_pairs( - posterior, - np = nuts_prmts, - pars=c( - "sigma[1]", - "sigma[2]", - "sigma[3]", - "sigma[4]", - "sigma[5]", - "sigma[6]", - "sigma[7]" - ), - off_diag_args = list(size = 0.75) +#generics intervention +brand_intervention_ib <- x[c(inherited_cols,"brand_name_counts","bnc*elapsed")] +brand_intervention_ib["identical_brands"] <- asinh(sinh(x$identical_brands)+1) #add a single generic brand +brand_intervention_ib["ib*elapsed"] <- brand_intervention_ib$identical_brands * brand_intervention_ib$elapsed_duration +``` + +```{r} +counterfact_marketing_ib <- list( + D = ncol(x),# + N = nrow(x), + L = n_categories$count, + y = as.vector(y), + ll = as.vector(categories), + x = as.matrix(x), + mu_mean = 0, + mu_stdev = 0.05, + sigma_shape = 4, + sigma_rate = 20, + Nx = nrow(x), + llx = as.vector(categories), + counterfact_x_tilde = as.matrix(brand_intervention_ib), + counterfact_x = as.matrix(x) ) +``` +```{r} +generated_ib <- gqs( + fit@stanmodel, + data=counterfact_marketing_ib, + draws=as.matrix(fit), + seed=11021585 + ) +``` + +```{r} +#TODO: Convert to ggplot, stabilize y axis +hist(as.vector(extract(generated_ib, pars="p_predicted_default")$p_predicted_default), bins=100) +hist(as.vector(extract(generated_ib, pars="p_predicted_intervention")$p_predicted_intervention), bins=100) +hist(as.vector(extract(generated_ib, pars="predicted_difference")$predicted_difference), bins=100) ``` + + ```{r} -# check model estimates -#mu -plot(fit, pars=c("mu"), show_density =TRUE, ci_level=0.8) -plot(fit, pars=c("mu"), plotfun = "hist", bins=50) + + +pddf_ib <- data.frame(extract(generated_ib, pars="predicted_difference")$predicted_difference) |> + pivot_longer(X1:X1343) + +#TODO: Fix Category names +pddf_ib["entry_idx"] <- as.numeric(gsub("\\D","",pddf_ib$name)) +pddf_ib["category"] <- sapply(pddf_ib$entry_idx, function(i) df$category_id[i]) +pddf_ib["category_name"] <- sapply(pddf_ib$category, function(i) beta_list$groups[i]) ``` + ```{r} -#sigma -plot(fit, pars=c("sigma"), show_density =TRUE, ci_level=0.8) -plot(fit, pars=c("sigma"), plotfun = "hist",bins=50) +ggplot(pddf_ib, aes(x=value,)) + + geom_histogram(bins=100) + + labs( + title = "Distribution of predicted differences" + ,x = "Difference in probability due to intervention" + ,y = "Predicted counts" + ) + + #xlim(-0.3,0.1) + + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + +ggplot(pddf_ib, aes(x=value,)) + + geom_histogram(bins=100) + + facet_wrap( + ~factor( + category_name, + levels=beta_list$groups + ) + , labeller = label_wrap_gen(multi_line = TRUE) + , ncol=5) + + labs( + title = "Distribution of predicted differences", + subtitle = "By group" + ,x = "Difference in probability due to intervention" + ,y = "Predicted counts" + ) + + #xlim(-0.25,0.1) + + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + + theme(strip.text.x = element_text(size = 8)) + +``` + + + +#### USP DC + +```{r} +#formulary intervention +brand_intervention_bnc <- x[c(inherited_cols,"identical_brands","ib*elapsed")] +brand_intervention_bnc["brand_name_counts"] <- asinh(sinh(x$brand_name_counts)+1) #add a single formulary competitor brand +brand_intervention_bnc["bnc*elapsed"] <- brand_intervention_bnc$brand_name_counts * brand_intervention_bnc$elapsed_duration ``` diff --git a/r-analysis/Hierarchal_Logistic.stan b/r-analysis/Hierarchal_Logistic.stan index 1bff9a1..3c13726 100644 --- a/r-analysis/Hierarchal_Logistic.stan +++ b/r-analysis/Hierarchal_Logistic.stan @@ -11,21 +11,30 @@ // The input data is a vector 'y' of length 'N'. data { - int D; - int N; - int L; - int y[N]; - int ll[N]; - row_vector[D] x[N]; + int D; // number of features + int N; // number of observations + int L; // number of categories + array[N] int y; // vector of dependent variables + array[N] int ll; // vector of categories + array[N] row_vector[D] x; // independent variables + real mu_mean; //hyperprior + real mu_stdev; //hyperprior + real sigma_shape; //hyperprior + real sigma_rate; //hyperprior + //counterfactuals + int Nx; + array[Nx] int llx; + array[Nx] row_vector[D] counterfact_x_tilde; // Posterior Prediction intervention + array[Nx] row_vector[D] counterfact_x; // Posterior Prediction intervention } parameters { - real mu[D]; - real sigma[D]; - vector[D] beta[L]; + array[D] real mu; + array[D] real sigma; + array[L] vector[D] beta; } model { - sigma ~ gamma(2,1); //shape and inverse scale - mu ~ normal(0, 1); //convert to mvnormal + sigma ~ gamma(sigma_shape,sigma_rate); //hyperprior for stdev: shape and inverse scale + mu ~ normal(mu_mean, mu_stdev); //hyperprior for mean //TODO: convert to mvnormal for (l in 1:L) { beta[l] ~ normal(mu, sigma); } @@ -37,4 +46,47 @@ model { y ~ bernoulli_logit(x_beta_ll); } } +generated quantities { + //SETUP PRIOR PREDICTION + //preallocate + real mu_prior[D]; + real sigma_prior[D]; + real p_prior[N]; // what I have priors about + real p_predicted[N]; //predicted p_values + //intervention + real p_predicted_default[Nx]; //predicted p_values + real p_predicted_intervention[Nx]; //predicted p_values + real predicted_difference[Nx]; //difference in predicted values + //GENERATE PRIOR PREDICTIONS + { + vector[D] beta_prior[L];//local var + //sample parameters + for (d in 1:D) { + mu_prior[d] = normal_rng(mu_mean,mu_stdev); + sigma_prior[d] = gamma_rng(sigma_shape,sigma_rate); + } + for (l in 1:L) { + for (d in 1:D) { + beta_prior[l,d] = normal_rng(mu_prior[d],sigma_prior[d]); + } + } + //generate probabilities + vector[D] b_prior[N];//local var + for (n in 1:N){ + b_prior[n] = beta_prior[ll[n]]; + p_prior[n] = inv_logit( x[n] * b_prior[n] ); + } + } + //GENERATE POSTERIOR PREDICTIONS + for (n in 1:N) { + p_predicted[n] = inv_logit( x[n] * beta[ll[n]] ); + } + //GENERATE POSTERIOR DISTRIBUTION OF DIFFERENCES + for (n in 1:Nx) { + p_predicted_default[n] = inv_logit( counterfact_x[n] * beta[llx[n]] ); + p_predicted_intervention[n] = inv_logit( counterfact_x_tilde[n] * beta[llx[n]] ); //intervention + + predicted_difference[n] = p_predicted_default[n] - p_predicted_intervention[n]; + } +}