From eabf8589558aa73858a3de72649c25240ec281fc Mon Sep 17 00:00:00 2001 From: Will King Date: Fri, 6 Oct 2023 07:06:02 +0000 Subject: [PATCH] Got direct effects model working --- r-analysis/EffectsOfMarketConditions.qmd | 937 ++++++++++------------- 1 file changed, 401 insertions(+), 536 deletions(-) diff --git a/r-analysis/EffectsOfMarketConditions.qmd b/r-analysis/EffectsOfMarketConditions.qmd index 259ee78..20d7688 100644 --- a/r-analysis/EffectsOfMarketConditions.qmd +++ b/r-analysis/EffectsOfMarketConditions.qmd @@ -122,13 +122,13 @@ x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0) #interaction terms for competitors -x["ib*elapsed"] <- x["elapsed_duration"]*x["identical_brands"] -x["bnc*elapsed"] <- x["elapsed_duration"] * x["brand_name_counts"] +#x["ib*elapsed"] <- x["elapsed_duration"]*x["identical_brands"] +#x["bnc*elapsed"] <- x["elapsed_duration"] * x["brand_name_counts"] #interaction terms for status effects -x["sNYR*elapsed"] <- x["elapsed_duration"]*x["status_NYR"] -x["sEBI*elapsed"] <- x["elapsed_duration"]*x["status_EBI"] -x["sANR*elapsed"] <- x["elapsed_duration"]*x["status_ANR"] +#x["sNYR*elapsed"] <- x["elapsed_duration"]*x["status_NYR"] +#x["sEBI*elapsed"] <- x["elapsed_duration"]*x["status_EBI"] +#x["sANR*elapsed"] <- x["elapsed_duration"]*x["status_ANR"] y <- ifelse(df["final_status"]=="Terminated",1,0) @@ -147,100 +147,372 @@ y <- train$y ``` + +# Fit Model + + + + +```{r} +################################# FIT MODEL ######################################### +inherited_cols <- c( + "elapsed_duration" + #,"identical_brands" + #,"brand_name_counts" + ,"h_sdi_val" + ,"hm_sdi_val" + ,"m_sdi_val" + ,"lm_sdi_val" + ,"l_sdi_val" + ,"status_NYR" + ,"status_EBI" + ,"status_ANR" +) +``` + + + +```{r} +beta_list <- list( + groups = c( + `1`="Infections & Parasites", + `2`="Neoplasms", + `3`="Blood & Immune system", + `4`="Endocrine, Nutritional, and Metabolic", + `5`="Mental & Behavioral", + `6`="Nervous System", + `7`="Eye and Adnexa", + `8`="Ear and Mastoid", + `9`="Circulatory", + `10`="Respiratory", + `11`="Digestive", + `12`="Skin & Subcutaneaous tissue", + `13`="Musculoskeletal", + `14`="Genitourinary", + `15`="Pregancy, Childbirth, & Puerperium", + `16`="Perinatal Period", + `17`="Congential", + `18`="Symptoms, Signs etc.", + `19`="Injury etc.", + `20`="External Causes", + `21`="Contact with Healthcare", + `22`="Special Purposes" + ), + parameters = c( + `1`="Elapsed Duration", + # brands + `2`="asinh(Generic Brands)", + `3`="asinh(Competitors USPDC)", + # population + `4`="asinh(High SDI)", + `5`="asinh(High-Medium SDI)", + `6`="asinh(Medium SDI)", + `7`="asinh(Low-Medium SDI)", + `8`="asinh(Low SDI)", + #Status + `9`="status_NYR", + `10`="status_EBI", + `11`="status_ANR", + #interactions for brands + `12`="ib*elapsed", + `13`="bnc*elapsed", + # interactions for status + `14`="sNYR*elapsed", + `15`="sEBI*elapsed", + `16`="sANR*elapsed" + + ) +) + +get_parameters <- function(stem,class_list) { + #get categories and lengths + named <- names(class_list) + lengths <- sapply(named, (function (x) length(class_list[[x]]))) + + #describe the grid needed + iter_list <- sapply(named, (function (x) 1:lengths[x])) + + #generate the list of parameters + pardf <- generate_parameter_df(stem, iter_list) + + #add columns with appropriate human-readable names + for (name in named) { + pardf[paste(name,"_hr",sep="")] <- as.factor( + sapply(pardf[name], (function (i) class_list[[name]][i])) + ) + } + + return(pardf) +} + +generate_parameter_df <- function(stem, iter_list) { + grid <- expand.grid(iter_list) + grid["param_name"] <- grid %>% unite(x,colnames(grid),sep=",") + grid["param_name"] <- paste(stem,"[",grid$param_name,"]",sep="") + return(grid) +} + +group_mcmc_areas <- function( + stem,# = "beta" + class_list,# = beta_list + stanfit,# = fit + group_id,# = 2 + rename=TRUE + ) { + #get all parameter names + params <- get_parameters(stem,class_list) + #filter down to parameters of interest + params <- filter(params,groups == group_id) + #Get dataframe with only the rows of interest + filtdata <- as.data.frame(stanfit)[params$param_name] + #rename columns + if (rename) dimnames(filtdata)[[2]] <- params$parameters_hr + #get group name for title + group_name <- class_list$groups[group_id] + #create area plot with appropriate title + mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) + + ggtitle(paste("Parameter distributions for ICD-10 class:",group_name)) +} + +parameter_mcmc_areas <- function( + stem,# = "beta" + class_list,# = beta_list + stanfit,# = fit + parameter_id,# = 2 + rename=TRUE + ) { + #get all parameter names + params <- get_parameters(stem,class_list) + #filter down to parameters of interest + params <- filter(params,parameters == parameter_id) + #Get dataframe with only the rows of interest + filtdata <- as.data.frame(stanfit)[params$param_name] + #rename columns + if (rename) dimnames(filtdata)[[2]] <- params$groups_hr + #get group name for title + parameter_name <- class_list$parameters[parameter_id] + #create area plot with appropriate title + mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) + + ggtitle(parameter_name,"Parameter Distribution") +} + + +``` + + +```{r} +#generics intervention +brand_intervention_ib <- x[c(inherited_cols,"brand_name_counts")] +brand_intervention_ib["identical_brands"] <- asinh(sinh(x$identical_brands)+1) #add a single generic brand +``` + +```{r} +counterfact_marketing_ib <- list( + D = ncol(x),# + N = nrow(x), + L = n_categories$count, + y = as.vector(y), + ll = as.vector(categories), + x = as.matrix(x), + mu_mean = 0, + mu_stdev = 0.05, + sigma_shape = 4, + sigma_rate = 20, + Nx = nrow(x), + llx = as.vector(categories), + counterfact_x_tilde = as.matrix(brand_intervention_ib), + counterfact_x = as.matrix(x) +) +``` + +```{r} +fit <- stan( + file='Hierarchal_Logistic.stan', + data = counterfact_marketing_ib, + chains = 4, + iter = 5000, + seed = 11021585 + ) +``` + +```{r} +generated_ib <- gqs( + fit@stanmodel, + data=counterfact_marketing_ib, + draws=as.matrix(fit), + seed=11021585 + ) +``` + + + +```{r} +hist(as.vector(extract(generated_ib, pars="p_prior")$p_prior)) +hist(as.vector(extract(generated_ib, pars="mu_prior")$mu_prior), ) +hist(as.vector(extract(generated_ib, pars="sigma_prior")$sigma_prior)) +``` + + + +```{r} +check_hmc_diagnostics(fit) +hist(as.vector(extract(generated_ib, pars="p_predicted")$p_predicted)) +``` + + + + +### Intervention: Adding a single competitor + +```{r} +#TODO: Convert to ggplot, stabilize y axis +hist(as.vector(extract(generated_ib, pars="p_predicted_default")$p_predicted_default)) +hist(as.vector(extract(generated_ib, pars="p_predicted_intervention")$p_predicted_intervention)) +hist(as.vector(extract(generated_ib, pars="predicted_difference")$predicted_difference)) +``` + + + +```{r} + + +pddf_ib <- data.frame(extract(generated_ib, pars="predicted_difference")$predicted_difference) |> + pivot_longer(X1:X1343) + +#TODO: Fix Category names +pddf_ib["entry_idx"] <- as.numeric(gsub("\\D","",pddf_ib$name)) +pddf_ib["category"] <- sapply(pddf_ib$entry_idx, function(i) df$category_id[i]) +pddf_ib["category_name"] <- sapply(pddf_ib$category, function(i) beta_list$groups[i]) +``` + + +```{r} + +ggplot(pddf_ib, aes(x=value,)) + + geom_histogram(bins=100) + + labs( + title = "Distribution of predicted differences" + ,x = "Difference in probability due to intervention" + ,y = "Predicted counts" + ) + + #xlim(-0.3,0.1) + + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + +ggplot(pddf_ib, aes(x=value,)) + + geom_histogram(bins=100) + + facet_wrap( + ~factor( + category_name, + levels=beta_list$groups + ) + , labeller = label_wrap_gen(multi_line = TRUE) + , ncol=5) + + labs( + title = "Distribution of predicted differences", + subtitle = "By group" + ,x = "Difference in probability due to intervention" + ,y = "Predicted counts" + ) + + #xlim(-0.25,0.1) + + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + + theme(strip.text.x = element_text(size = 8)) + +``` + + + +#### USP DC + +```{r} +#formulary intervention +brand_intervention_bnc <- x[c(inherited_cols,"identical_brands")] +brand_intervention_bnc["brand_name_counts"] <- asinh(sinh(x$brand_name_counts)+1) #add a single formulary competitor brand +``` + +```{r} +counterfact_marketing_bnc <- list( + D = ncol(x),# + N = nrow(x), + L = n_categories$count, + y = as.vector(y), + ll = as.vector(categories), + x = as.matrix(x), + mu_mean = 0, + mu_stdev = 0.05, + sigma_shape = 4, + sigma_rate = 20, + Nx = nrow(x), + llx = as.vector(categories), + counterfact_x_tilde = as.matrix(brand_intervention_bnc), + counterfact_x = as.matrix(x) +) +``` + + ```{r} -# get data for counterfactuals -get_counterfactuals <- function(driver) { - con <- dbConnect( - driver, - user='root', - password='root', - dbname='aact_db', - host='will-office' - ) - on.exit(dbDisconnect(con)) - - query <- dbSendQuery( - con, - " - with cte as ( - select - nct_id - ,lag(current_status, 1) over (partition by nct_id order by earliest_date_observed) as previous_status - ,current_status - ,earliest_date_observed as date_current - from formatted_data_mat fdm - ), cte2 as ( - select - nct_id - ,previous_status - ,current_status - ,max(date_current) as date_current_max - from cte - where - previous_status != current_status - and - current_status = 'Active, not recruiting' - group by - nct_id - ,previous_status - ,current_status - ,date_current +generated_bnc <- gqs( + fit@stanmodel, + data=counterfact_marketing_bnc, + draws=as.matrix(fit), + seed=11021585 ) - select - fdqpe.nct_id - --,fdqpe.start_date - --,fdqpe.current_enrollment - --,fdqpe.enrollment_category - ,fdqpe.current_status - ,fdqpe.earliest_date_observed - ,fdqpe.elapsed_duration - ,fdqpe.n_brands as identical_brands - ,ntbtu.brand_name_count - ,fdqpe.category_id - ,fdqpe.final_status - ,fdqpe.h_sdi_val - --,fdqpe.h_sdi_u95 - --,fdqpe.h_sdi_l95 - ,fdqpe.hm_sdi_val - --,fdqpe.hm_sdi_u95 - --,fdqpe.hm_sdi_l95 - ,fdqpe.m_sdi_val - --,fdqpe.m_sdi_u95 - --,fdqpe.m_sdi_l95 - ,fdqpe.lm_sdi_val - --,fdqpe.lm_sdi_u95 - --,fdqpe.lm_sdi_l95 - ,fdqpe.l_sdi_val - --,fdqpe.l_sdi_u95 - --,fdqpe.l_sdi_l95 - from formatted_data_with_planned_enrollment fdqpe - join \"Formularies\".nct_to_brands_through_uspdc ntbtu - on fdqpe.nct_id = ntbtu.nct_id - join cte2 - on cte2.nct_id = fdqpe.nct_id - and cte2.date_current_max = fdqpe.earliest_date_observed - order by fdqpe.nct_id, fdqpe.earliest_date_observed - ; - " - ) - df <- fetch(query, n = -1) - df <- na.omit(df) - - return(df) -} -counterfact_raw <- get_counterfactuals(driver) -#extract data -counterfact_list <- data_formatter(counterfact_raw) -counterfact_list$y <- NULL #remove the chance of accidentally training on the wrong data -counterfact_categories <- counterfact_raw$category_id -#setup the two counterfactuals -counterfact_x <- counterfact_list$x #no change -counterfact_x_tilde <- counterfact_list$x #to be changed -#make changes, set it to Recruiting #TODO: change this so it matches previous state. -counterfact_x_tilde$status_ANR <- 0 -counterfact_x_tilde["sANR*elapsed"] <- 0 ``` +```{r} +#TODO: Convert to ggplot, stabilize y axis +hist(as.vector(extract(generated_bnc, pars="p_predicted_default")$p_predicted_default), bins=100) +hist(as.vector(extract(generated_bnc, pars="p_predicted_intervention")$p_predicted_intervention), bins=100) +hist(as.vector(extract(generated_bnc, pars="predicted_difference")$predicted_difference), bins=100) +``` + + +```{r} +pddf_bnc <- data.frame(extract(generated_bnc, pars="predicted_difference")$predicted_difference) |> + pivot_longer(X1:X1343) + +#Add Category names +pddf_bnc["entry_idx"] <- as.numeric(gsub("\\D","",pddf_bnc$name)) +pddf_bnc["category"] <- sapply(pddf_bnc$entry_idx, function(i) df$category_id[i]) +pddf_bnc["category_name"] <- sapply(pddf_bnc$category, function(i) beta_list$groups[i]) + +``` + + +```{r} + +ggplot(pddf_bnc, aes(x=value,)) + + geom_histogram(bins=100) + + labs( + title = "Distribution of predicted differences" + ,x = "Difference in probability due to intervention" + ,y = "Predicted counts" + ) + + #xlim(-0.3,0.1) + + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + +ggplot(pddf_bnc, aes(x=value,)) + + geom_histogram(bins=100) + + facet_wrap( + ~factor( + category_name, + levels=beta_list$groups + ) + , labeller = label_wrap_gen(multi_line = TRUE) + , ncol=5) + + labs( + title = "Distribution of predicted differences", + subtitle = "By group" + ,x = "Difference in probability due to intervention" + ,y = "Predicted counts" + ) + + #xlim(-0.25,0.1) + + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + + theme(strip.text.x = element_text(size = 8)) + +``` + +TODO: add density plot of (x,y,z) (date,value,counts) + - with and without faceting ## Explore data @@ -338,70 +610,6 @@ summary(df5) -# Fit Model - - -```{r} -################################# FIT MODEL ######################################### - -#setup data (named list) -trials_data <- list( - D = ncol(x),# - N = nrow(x), - L = n_categories$count, - y = as.vector(y), - ll = as.vector(categories), - x = as.matrix(x), - mu_mean = 0, - mu_stdev = 0.05, - sigma_shape = 4, - sigma_rate = 20, - Nx = 189, - llx = as.vector(counterfact_categories), - counterfact_x_tilde = as.matrix(counterfact_x_tilde), - counterfact_x = as.matrix(counterfact_x) -) - - -fit <- stan( - file='Hierarchal_Logistic.stan', - data = trials_data, - chains = 4, - iter = 5000, - seed = 11021585 - ) -``` - -```{r} -#pull out prior predictions -generated <- gqs( - fit@stanmodel, - data=trials_data, - draws=as.matrix(fit), - seed=11021585 - ) -# to implement distribution of differences: -# create two datasets with interventions -# simulate both with the same seed and draws -# figure out how to difference the posterior (and maybe look at prior) probabilities -``` - - -```{r} -hist(as.vector(extract(generated, pars="p_prior")$p_prior)) -hist(as.vector(extract(generated, pars="mu_prior")$mu_prior), ) -hist(as.vector(extract(generated, pars="sigma_prior")$sigma_prior)) -``` - - - -```{r} -check_hmc_diagnostics(fit) -hist(as.vector(extract(generated, pars="p_predicted")$p_predicted)) -``` - - - @@ -503,172 +711,45 @@ for (i in 1:4) { ) ) } -``` - - -```{r} -for (k in 1:22) { -for (i in 1:4) { - params = sapply(3:0, function(j) paste0("beta[",k,",",4*i-j ,"]")) - print( - mcmc_pairs( - posterior, - np = nuts_prmts, - pars=c( - params, - "lp__" - ), - off_diag_args = list(size = 0.75) - ) - ) -}} -``` - - -# Results - - -```{r} -################################# ANALYZE ##################################### -print(fit) -``` - -## Result Plots - - -Note the regular large difference in variance. -I would guess those are the beta[1:22,2] values. -I wonder if a lot of the variance is due to the 2 values that are sitting out. - - - -```{r} -beta_list <- list( - groups = c( - `1`="Infections & Parasites", - `2`="Neoplasms", - `3`="Blood & Immune system", - `4`="Endocrine, Nutritional, and Metabolic", - `5`="Mental & Behavioral", - `6`="Nervous System", - `7`="Eye and Adnexa", - `8`="Ear and Mastoid", - `9`="Circulatory", - `10`="Respiratory", - `11`="Digestive", - `12`="Skin & Subcutaneaous tissue", - `13`="Musculoskeletal", - `14`="Genitourinary", - `15`="Pregancy, Childbirth, & Puerperium", - `16`="Perinatal Period", - `17`="Congential", - `18`="Symptoms, Signs etc.", - `19`="Injury etc.", - `20`="External Causes", - `21`="Contact with Healthcare", - `22`="Special Purposes" - ), - parameters = c( - `1`="Elapsed Duration", - # brands - `2`="asinh(Generic Brands)", - `3`="asinh(Competitors USPDC)", - # population - `4`="asinh(High SDI)", - `5`="asinh(High-Medium SDI)", - `6`="asinh(Medium SDI)", - `7`="asinh(Low-Medium SDI)", - `8`="asinh(Low SDI)", - #Status - `9`="status_NYR", - `10`="status_EBI", - `11`="status_ANR", - #interactions for brands - `12`="ib*elapsed", - `13`="bnc*elapsed", - # interactions for status - `14`="sNYR*elapsed", - `15`="sEBI*elapsed", - `16`="sANR*elapsed" - - ) -) - -get_parameters <- function(stem,class_list) { - #get categories and lengths - named <- names(class_list) - lengths <- sapply(named, (function (x) length(class_list[[x]]))) - - #describe the grid needed - iter_list <- sapply(named, (function (x) 1:lengths[x])) - - #generate the list of parameters - pardf <- generate_parameter_df(stem, iter_list) - - #add columns with appropriate human-readable names - for (name in named) { - pardf[paste(name,"_hr",sep="")] <- as.factor( - sapply(pardf[name], (function (i) class_list[[name]][i])) - ) - } - - return(pardf) -} - -generate_parameter_df <- function(stem, iter_list) { - grid <- expand.grid(iter_list) - grid["param_name"] <- grid %>% unite(x,colnames(grid),sep=",") - grid["param_name"] <- paste(stem,"[",grid$param_name,"]",sep="") - return(grid) -} - -group_mcmc_areas <- function( - stem,# = "beta" - class_list,# = beta_list - stanfit,# = fit - group_id,# = 2 - rename=TRUE - ) { - #get all parameter names - params <- get_parameters(stem,class_list) - #filter down to parameters of interest - params <- filter(params,groups == group_id) - #Get dataframe with only the rows of interest - filtdata <- as.data.frame(stanfit)[params$param_name] - #rename columns - if (rename) dimnames(filtdata)[[2]] <- params$parameters_hr - #get group name for title - group_name <- class_list$groups[group_id] - #create area plot with appropriate title - mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) + - ggtitle(paste("Parameter distributions for ICD-10 class:",group_name)) -} +``` -parameter_mcmc_areas <- function( - stem,# = "beta" - class_list,# = beta_list - stanfit,# = fit - parameter_id,# = 2 - rename=TRUE - ) { - #get all parameter names - params <- get_parameters(stem,class_list) - #filter down to parameters of interest - params <- filter(params,parameters == parameter_id) - #Get dataframe with only the rows of interest - filtdata <- as.data.frame(stanfit)[params$param_name] - #rename columns - if (rename) dimnames(filtdata)[[2]] <- params$groups_hr - #get group name for title - parameter_name <- class_list$parameters[parameter_id] - #create area plot with appropriate title - mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) + - ggtitle(parameter_name,"Parameter Distribution") -} + +```{r} +for (k in 1:22) { +for (i in 1:4) { + params = sapply(3:0, function(j) paste0("beta[",k,",",4*i-j ,"]")) + print( + mcmc_pairs( + posterior, + np = nuts_prmts, + pars=c( + params, + "lp__" + ), + off_diag_args = list(size = 0.75) + ) + ) +}} +``` +# Results + + +```{r} +################################# ANALYZE ##################################### +print(fit) ``` +## Result Plots + + +Note the regular large difference in variance. +I would guess those are the beta[1:22,2] values. +I wonder if a lot of the variance is due to the 2 values that are sitting out. + + + ```{r} #mcmc_intervals(fit, pars=get_parameters("beta",beta_list)$param_name) ``` @@ -840,219 +921,3 @@ ggplot(pddf, aes(x=value,)) + Recall that we had really tight zero priors. -### Intervention: Adding a single competitor -```{r} -inherited_cols <- c( - "elapsed_duration" - #,"identical_brands" - #,"brand_name_counts" - ,"h_sdi_val" - ,"hm_sdi_val" - ,"m_sdi_val" - ,"lm_sdi_val" - ,"l_sdi_val" - ,"status_NYR" - ,"status_EBI" - ,"status_ANR" - #,"ib*elapsed" - #,"bnc*elapsed" - ,"sNYR*elapsed" - ,"sEBI*elapsed" - ,"sANR*elapsed" -) -``` - - - - - -#### Generics - -```{r} -#generics intervention -brand_intervention_ib <- x[c(inherited_cols,"brand_name_counts","bnc*elapsed")] -brand_intervention_ib["identical_brands"] <- asinh(sinh(x$identical_brands)+1) #add a single generic brand -brand_intervention_ib["ib*elapsed"] <- brand_intervention_ib$identical_brands * brand_intervention_ib$elapsed_duration -``` - -```{r} -counterfact_marketing_ib <- list( - D = ncol(x),# - N = nrow(x), - L = n_categories$count, - y = as.vector(y), - ll = as.vector(categories), - x = as.matrix(x), - mu_mean = 0, - mu_stdev = 0.05, - sigma_shape = 4, - sigma_rate = 20, - Nx = nrow(x), - llx = as.vector(categories), - counterfact_x_tilde = as.matrix(brand_intervention_ib), - counterfact_x = as.matrix(x) -) -``` - - -```{r} -generated_ib <- gqs( - fit@stanmodel, - data=counterfact_marketing_ib, - draws=as.matrix(fit), - seed=11021585 - ) -``` - -```{r} -#TODO: Convert to ggplot, stabilize y axis -hist(as.vector(extract(generated_ib, pars="p_predicted_default")$p_predicted_default), bins=100) -hist(as.vector(extract(generated_ib, pars="p_predicted_intervention")$p_predicted_intervention), bins=100) -hist(as.vector(extract(generated_ib, pars="predicted_difference")$predicted_difference), bins=100) -``` - - - -```{r} - - -pddf_ib <- data.frame(extract(generated_ib, pars="predicted_difference")$predicted_difference) |> - pivot_longer(X1:X1343) - -#TODO: Fix Category names -pddf_ib["entry_idx"] <- as.numeric(gsub("\\D","",pddf_ib$name)) -pddf_ib["category"] <- sapply(pddf_ib$entry_idx, function(i) df$category_id[i]) -pddf_ib["category_name"] <- sapply(pddf_ib$category, function(i) beta_list$groups[i]) -``` - - -```{r} - -ggplot(pddf_ib, aes(x=value,)) + - geom_histogram(bins=100) + - labs( - title = "Distribution of predicted differences" - ,x = "Difference in probability due to intervention" - ,y = "Predicted counts" - ) + - #xlim(-0.3,0.1) + - geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") - -ggplot(pddf_ib, aes(x=value,)) + - geom_histogram(bins=100) + - facet_wrap( - ~factor( - category_name, - levels=beta_list$groups - ) - , labeller = label_wrap_gen(multi_line = TRUE) - , ncol=5) + - labs( - title = "Distribution of predicted differences", - subtitle = "By group" - ,x = "Difference in probability due to intervention" - ,y = "Predicted counts" - ) + - #xlim(-0.25,0.1) + - geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + - theme(strip.text.x = element_text(size = 8)) - -``` - - - -#### USP DC - -```{r} -#formulary intervention -brand_intervention_bnc <- x[c(inherited_cols,"identical_brands","ib*elapsed")] -brand_intervention_bnc["brand_name_counts"] <- asinh(sinh(x$brand_name_counts)+1) #add a single formulary competitor brand -brand_intervention_bnc["bnc*elapsed"] <- brand_intervention_bnc$brand_name_counts * brand_intervention_bnc$elapsed_duration -``` - -```{r} -counterfact_marketing_bnc <- list( - D = ncol(x),# - N = nrow(x), - L = n_categories$count, - y = as.vector(y), - ll = as.vector(categories), - x = as.matrix(x), - mu_mean = 0, - mu_stdev = 0.05, - sigma_shape = 4, - sigma_rate = 20, - Nx = nrow(x), - llx = as.vector(categories), - counterfact_x_tilde = as.matrix(brand_intervention_bnc), - counterfact_x = as.matrix(x) -) -``` - - -```{r} -generated_bnc <- gqs( - fit@stanmodel, - data=counterfact_marketing_bnc, - draws=as.matrix(fit), - seed=11021585 - ) -``` - -```{r} -#TODO: Convert to ggplot, stabilize y axis -hist(as.vector(extract(generated_bnc, pars="p_predicted_default")$p_predicted_default), bins=100) -hist(as.vector(extract(generated_bnc, pars="p_predicted_intervention")$p_predicted_intervention), bins=100) -hist(as.vector(extract(generated_bnc, pars="predicted_difference")$predicted_difference), bins=100) -``` - - -```{r} -pddf_bnc <- data.frame(extract(generated_bnc, pars="predicted_difference")$predicted_difference) |> - pivot_longer(X1:X1343) - -#Add Category names -pddf_bnc["entry_idx"] <- as.numeric(gsub("\\D","",pddf_bnc$name)) -pddf_bnc["category"] <- sapply(pddf_bnc$entry_idx, function(i) df$category_id[i]) -pddf_bnc["category_name"] <- sapply(pddf_bnc$category, function(i) beta_list$groups[i]) - -#add snapshot date -pddf_bnc["snapshot_date"] <- sapply(pddf_bnc$entry_idx, function(i) df$earliest_date_observed[i])#changed values -``` - - -```{r} - -ggplot(pddf_bnc, aes(x=value,)) + - geom_histogram(bins=100) + - labs( - title = "Distribution of predicted differences" - ,x = "Difference in probability due to intervention" - ,y = "Predicted counts" - ) + - #xlim(-0.3,0.1) + - geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") - -ggplot(pddf_bnc, aes(x=value,)) + - geom_histogram(bins=100) + - facet_wrap( - ~factor( - category_name, - levels=beta_list$groups - ) - , labeller = label_wrap_gen(multi_line = TRUE) - , ncol=5) + - labs( - title = "Distribution of predicted differences", - subtitle = "By group" - ,x = "Difference in probability due to intervention" - ,y = "Predicted counts" - ) + - #xlim(-0.25,0.1) + - geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + - theme(strip.text.x = element_text(size = 8)) - -``` - -TODO: add density plot of (x,y,z) (date,value,counts) - - with and without faceting