Got direct effects model working

3 years ago · eabf858955
parent 98fc18e200
commit eabf858955
1 changed files with 401 additions and 536 deletions
--- a/r-analysis/EffectsOfMarketConditions.qmd
+++ b/r-analysis/EffectsOfMarketConditions.qmd
@ -122,13 +122,13 @@ x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0)


 #interaction terms for competitors
-x["ib*elapsed"] <- x["elapsed_duration"]*x["identical_brands"]
-x["bnc*elapsed"] <- x["elapsed_duration"] * x["brand_name_counts"]
+#x["ib*elapsed"] <- x["elapsed_duration"]*x["identical_brands"]
+#x["bnc*elapsed"] <- x["elapsed_duration"] * x["brand_name_counts"]

 #interaction terms for status effects
-x["sNYR*elapsed"] <- x["elapsed_duration"]*x["status_NYR"] 
-x["sEBI*elapsed"] <- x["elapsed_duration"]*x["status_EBI"] 
-x["sANR*elapsed"] <- x["elapsed_duration"]*x["status_ANR"] 
+#x["sNYR*elapsed"] <- x["elapsed_duration"]*x["status_NYR"] 
+#x["sEBI*elapsed"] <- x["elapsed_duration"]*x["status_EBI"] 
+#x["sANR*elapsed"] <- x["elapsed_duration"]*x["status_ANR"] 

 y <- ifelse(df["final_status"]=="Terminated",1,0)

@ -147,100 +147,372 @@ y <- train$y
 ```


+
+# Fit Model
+
+
+
+
+```{r}
+################################# FIT MODEL #########################################
+inherited_cols <- c(
+    "elapsed_duration"
+    #,"identical_brands"
+    #,"brand_name_counts"
+    ,"h_sdi_val"
+    ,"hm_sdi_val"
+    ,"m_sdi_val"
+    ,"lm_sdi_val"
+    ,"l_sdi_val"
+    ,"status_NYR"
+    ,"status_EBI"
+    ,"status_ANR"
+)
+```
+
+
+
+```{r}
+beta_list <- list(
+        groups = c(
+        `1`="Infections & Parasites",
+        `2`="Neoplasms",
+        `3`="Blood & Immune system",
+        `4`="Endocrine, Nutritional, and Metabolic",
+        `5`="Mental & Behavioral",
+        `6`="Nervous System",
+        `7`="Eye and Adnexa",
+        `8`="Ear and Mastoid",
+        `9`="Circulatory",
+        `10`="Respiratory",
+        `11`="Digestive",
+        `12`="Skin & Subcutaneaous tissue",
+        `13`="Musculoskeletal",
+        `14`="Genitourinary",
+        `15`="Pregancy, Childbirth, & Puerperium",
+        `16`="Perinatal Period",
+        `17`="Congential",
+        `18`="Symptoms, Signs etc.",
+        `19`="Injury etc.",
+        `20`="External Causes",
+        `21`="Contact with Healthcare",
+        `22`="Special Purposes"
+    ),
+    parameters = c(
+        `1`="Elapsed Duration",
+        # brands
+        `2`="asinh(Generic Brands)",
+        `3`="asinh(Competitors USPDC)",
+        # population
+        `4`="asinh(High SDI)",
+        `5`="asinh(High-Medium SDI)",
+        `6`="asinh(Medium SDI)",
+        `7`="asinh(Low-Medium SDI)",
+        `8`="asinh(Low SDI)",
+        #Status
+        `9`="status_NYR",
+        `10`="status_EBI",
+        `11`="status_ANR",
+        #interactions for brands
+        `12`="ib*elapsed",
+        `13`="bnc*elapsed",
+        # interactions for status
+        `14`="sNYR*elapsed",
+        `15`="sEBI*elapsed",
+        `16`="sANR*elapsed"
+        
+    )
+)
+
+get_parameters <- function(stem,class_list) {
+    #get categories and lengths
+    named <- names(class_list)
+    lengths <- sapply(named, (function (x) length(class_list[[x]])))
+    
+    #describe the grid needed
+    iter_list <- sapply(named, (function (x) 1:lengths[x]))
+    
+    #generate the list of parameters
+    pardf <- generate_parameter_df(stem, iter_list)
+    
+    #add columns with appropriate human-readable names
+    for (name in named) {
+        pardf[paste(name,"_hr",sep="")] <- as.factor(
+            sapply(pardf[name], (function (i) class_list[[name]][i]))
+        )
+    }
+     
+    return(pardf)   
+}
+
+generate_parameter_df <- function(stem, iter_list) {
+    grid <- expand.grid(iter_list)
+    grid["param_name"] <- grid %>% unite(x,colnames(grid),sep=",")
+    grid["param_name"] <- paste(stem,"[",grid$param_name,"]",sep="")
+    return(grid)
+}
+
+group_mcmc_areas <- function(
+        stem,# = "beta"
+        class_list,# = beta_list
+        stanfit,# = fit
+        group_id,# = 2
+        rename=TRUE
+        ) {
+    #get all parameter names
+    params <- get_parameters(stem,class_list)
+    #filter down to parameters of interest
+    params <- filter(params,groups == group_id)
+    #Get dataframe with only the rows of interest
+    filtdata <- as.data.frame(stanfit)[params$param_name]
+    #rename columns
+    if (rename) dimnames(filtdata)[[2]] <- params$parameters_hr
+    #get group name for title
+    group_name <- class_list$groups[group_id]
+    #create area plot with appropriate title
+    mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
+        ggtitle(paste("Parameter distributions for ICD-10 class:",group_name))
+}
+
+parameter_mcmc_areas <- function(
+        stem,# = "beta"
+        class_list,# = beta_list
+        stanfit,# = fit
+        parameter_id,# = 2
+        rename=TRUE
+        ) {
+    #get all parameter names
+    params <- get_parameters(stem,class_list)
+    #filter down to parameters of interest
+    params <- filter(params,parameters == parameter_id)
+    #Get dataframe with only the rows of interest
+    filtdata <- as.data.frame(stanfit)[params$param_name]
+    #rename columns
+    if (rename) dimnames(filtdata)[[2]] <- params$groups_hr
+    #get group name for title
+    parameter_name <- class_list$parameters[parameter_id]
+    #create area plot with appropriate title
+    mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
+        ggtitle(parameter_name,"Parameter Distribution")
+}
+
+
+```
+
+
+```{r}
+#generics intervention
+brand_intervention_ib <- x[c(inherited_cols,"brand_name_counts")]
+brand_intervention_ib["identical_brands"] <- asinh(sinh(x$identical_brands)+1) #add a single generic brand
+```
+
+```{r}
+counterfact_marketing_ib <- list(
+    D = ncol(x),#
+    N = nrow(x),
+    L = n_categories$count,
+    y = as.vector(y),
+    ll = as.vector(categories),
+    x = as.matrix(x),
+    mu_mean = 0,
+    mu_stdev = 0.05,
+    sigma_shape = 4,
+    sigma_rate = 20,
+    Nx = nrow(x),
+    llx = as.vector(categories),
+    counterfact_x_tilde = as.matrix(brand_intervention_ib),
+    counterfact_x = as.matrix(x)
+)
+```
+
+```{r}
+fit <- stan(
+    file='Hierarchal_Logistic.stan', 
+    data = counterfact_marketing_ib,
+    chains = 4,
+    iter = 5000,
+    seed = 11021585
+    )
+```
+
+```{r}
+generated_ib <- gqs(
+    fit@stanmodel,
+    data=counterfact_marketing_ib,
+    draws=as.matrix(fit),
+    seed=11021585
+    )
+```
+
+
+
+```{r}
+hist(as.vector(extract(generated_ib, pars="p_prior")$p_prior)) 
+hist(as.vector(extract(generated_ib, pars="mu_prior")$mu_prior), )
+hist(as.vector(extract(generated_ib, pars="sigma_prior")$sigma_prior))
+```
+
+
+
+```{r}
+check_hmc_diagnostics(fit)
+hist(as.vector(extract(generated_ib, pars="p_predicted")$p_predicted))
+```
+
+
+
+
+### Intervention: Adding a single competitor
+
+```{r}
+#TODO: Convert to ggplot, stabilize y axis
+hist(as.vector(extract(generated_ib, pars="p_predicted_default")$p_predicted_default))
+hist(as.vector(extract(generated_ib, pars="p_predicted_intervention")$p_predicted_intervention))
+hist(as.vector(extract(generated_ib, pars="predicted_difference")$predicted_difference))
+```
+
+
+
+```{r}
+
+
+pddf_ib <- data.frame(extract(generated_ib, pars="predicted_difference")$predicted_difference) |>
+    pivot_longer(X1:X1343)
+
+#TODO: Fix Category names
+pddf_ib["entry_idx"] <- as.numeric(gsub("\\D","",pddf_ib$name))
+pddf_ib["category"] <-  sapply(pddf_ib$entry_idx, function(i) df$category_id[i])
+pddf_ib["category_name"] <- sapply(pddf_ib$category, function(i) beta_list$groups[i])
+```
+
+
+```{r}
+
+ggplot(pddf_ib, aes(x=value,)) +
+    geom_histogram(bins=100) +
+    labs(
+        title = "Distribution of predicted differences"
+        ,x = "Difference in probability due to intervention"
+        ,y = "Predicted counts"
+    ) + 
+    #xlim(-0.3,0.1) +
+    geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") 
+
+ggplot(pddf_ib, aes(x=value,)) +
+    geom_histogram(bins=100) +
+    facet_wrap(
+        ~factor(
+            category_name, 
+            levels=beta_list$groups
+            )
+        , labeller = label_wrap_gen(multi_line = TRUE)
+        , ncol=5) +
+    labs(
+        title = "Distribution of predicted differences",
+        subtitle = "By group"
+        ,x = "Difference in probability due to intervention"
+        ,y = "Predicted counts"
+    ) + 
+    #xlim(-0.25,0.1) +
+    geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
+    theme(strip.text.x = element_text(size = 8))
+
+```
+
+
+
+#### USP DC
+
+```{r}
+#formulary intervention
+brand_intervention_bnc <- x[c(inherited_cols,"identical_brands")]
+brand_intervention_bnc["brand_name_counts"] <- asinh(sinh(x$brand_name_counts)+1) #add a single formulary competitor brand
+```
+
+```{r}
+counterfact_marketing_bnc <- list(
+    D = ncol(x),#
+    N = nrow(x),
+    L = n_categories$count,
+    y = as.vector(y),
+    ll = as.vector(categories),
+    x = as.matrix(x),
+    mu_mean = 0,
+    mu_stdev = 0.05,
+    sigma_shape = 4,
+    sigma_rate = 20,
+    Nx = nrow(x),
+    llx = as.vector(categories),
+    counterfact_x_tilde = as.matrix(brand_intervention_bnc),
+    counterfact_x = as.matrix(x)
+)
+```
+
+
 ```{r}
-# get data for counterfactuals
-get_counterfactuals <- function(driver) {
-    con <- dbConnect(
-        driver,
-        user='root',
-        password='root',
-        dbname='aact_db',
-        host='will-office'
-        )
-    on.exit(dbDisconnect(con))
-    
-    query <- dbSendQuery(
-        con,
-    "
-    with cte as (
-    select 
-    	nct_id
-    	,lag(current_status, 1) over (partition by nct_id order by earliest_date_observed)  as previous_status
-    	,current_status
-    	,earliest_date_observed as date_current
-    from formatted_data_mat fdm
-    ), cte2 as (
-    select 	
-    	nct_id 
-    	,previous_status
-    	,current_status 
-    	,max(date_current) as date_current_max
-    from cte
-    where 
-    	previous_status != current_status 
-    	and
-    	current_status = 'Active, not recruiting'
-    group by 
-    	nct_id
-    	,previous_status
-    	,current_status
-    	,date_current
+generated_bnc <- gqs(
+    fit@stanmodel,
+    data=counterfact_marketing_bnc,
+    draws=as.matrix(fit),
+    seed=11021585
    )
-    select 
-        fdqpe.nct_id
-        --,fdqpe.start_date
-        --,fdqpe.current_enrollment
-        --,fdqpe.enrollment_category
-        ,fdqpe.current_status 
-        ,fdqpe.earliest_date_observed 
-        ,fdqpe.elapsed_duration
-        ,fdqpe.n_brands as identical_brands
-        ,ntbtu.brand_name_count 
-        ,fdqpe.category_id
-        ,fdqpe.final_status
-        ,fdqpe.h_sdi_val
-        --,fdqpe.h_sdi_u95
-        --,fdqpe.h_sdi_l95
-        ,fdqpe.hm_sdi_val
-        --,fdqpe.hm_sdi_u95
-        --,fdqpe.hm_sdi_l95
-        ,fdqpe.m_sdi_val
-        --,fdqpe.m_sdi_u95
-        --,fdqpe.m_sdi_l95
-        ,fdqpe.lm_sdi_val
-        --,fdqpe.lm_sdi_u95
-        --,fdqpe.lm_sdi_l95
-        ,fdqpe.l_sdi_val
-        --,fdqpe.l_sdi_u95
-        --,fdqpe.l_sdi_l95
-    from formatted_data_with_planned_enrollment fdqpe
-        join \"Formularies\".nct_to_brands_through_uspdc ntbtu
-            on fdqpe.nct_id = ntbtu.nct_id 
-        join cte2 
-		    on cte2.nct_id = fdqpe.nct_id 
-		    and cte2.date_current_max = fdqpe.earliest_date_observed 
-    order by fdqpe.nct_id, fdqpe.earliest_date_observed 
-    ;
-    "
-        )
-    df <- fetch(query, n = -1)
-    df <- na.omit(df)
-    
-    return(df)
-}
-counterfact_raw <- get_counterfactuals(driver)
-#extract data
-counterfact_list <- data_formatter(counterfact_raw)
-counterfact_list$y <- NULL #remove the chance of accidentally training on the wrong data
-counterfact_categories <- counterfact_raw$category_id
-#setup the two counterfactuals
-counterfact_x <- counterfact_list$x #no change
-counterfact_x_tilde <- counterfact_list$x #to be changed
-#make changes, set it to Recruiting #TODO: change this so it matches previous state.
-counterfact_x_tilde$status_ANR <- 0
-counterfact_x_tilde["sANR*elapsed"] <- 0
 ```

+```{r}
+#TODO: Convert to ggplot, stabilize y axis
+hist(as.vector(extract(generated_bnc, pars="p_predicted_default")$p_predicted_default), bins=100)
+hist(as.vector(extract(generated_bnc, pars="p_predicted_intervention")$p_predicted_intervention), bins=100)
+hist(as.vector(extract(generated_bnc, pars="predicted_difference")$predicted_difference), bins=100)
+```
+
+
+```{r}
+pddf_bnc <- data.frame(extract(generated_bnc, pars="predicted_difference")$predicted_difference) |>
+    pivot_longer(X1:X1343)
+
+#Add Category names
+pddf_bnc["entry_idx"] <- as.numeric(gsub("\\D","",pddf_bnc$name))
+pddf_bnc["category"] <-  sapply(pddf_bnc$entry_idx, function(i) df$category_id[i])
+pddf_bnc["category_name"] <- sapply(pddf_bnc$category, function(i) beta_list$groups[i])
+
+```
+
+
+```{r}
+
+ggplot(pddf_bnc, aes(x=value,)) +
+    geom_histogram(bins=100) +
+    labs(
+        title = "Distribution of predicted differences"
+        ,x = "Difference in probability due to intervention"
+        ,y = "Predicted counts"
+    ) + 
+    #xlim(-0.3,0.1) +
+    geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") 
+
+ggplot(pddf_bnc, aes(x=value,)) +
+    geom_histogram(bins=100) +
+    facet_wrap(
+        ~factor(
+            category_name, 
+            levels=beta_list$groups
+            )
+        , labeller = label_wrap_gen(multi_line = TRUE)
+        , ncol=5) +
+    labs(
+        title = "Distribution of predicted differences",
+        subtitle = "By group"
+        ,x = "Difference in probability due to intervention"
+        ,y = "Predicted counts"
+    ) + 
+    #xlim(-0.25,0.1) +
+    geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
+    theme(strip.text.x = element_text(size = 8))
+
+```
+
+TODO: add density plot of (x,y,z) (date,value,counts)
+    - with and without faceting

 ## Explore data

@ -338,70 +610,6 @@ summary(df5)



-# Fit Model
-
-
-```{r}
-################################# FIT MODEL #########################################
-
-#setup data (named list)
-trials_data <- list(
-    D = ncol(x),#
-    N = nrow(x),
-    L = n_categories$count,
-    y = as.vector(y),
-    ll = as.vector(categories),
-    x = as.matrix(x),
-    mu_mean = 0,
-    mu_stdev = 0.05,
-    sigma_shape = 4,
-    sigma_rate = 20,
-    Nx = 189,
-    llx = as.vector(counterfact_categories),
-    counterfact_x_tilde = as.matrix(counterfact_x_tilde),
-    counterfact_x = as.matrix(counterfact_x)
-)
-
-
-fit <- stan(
-    file='Hierarchal_Logistic.stan', 
-    data = trials_data,
-    chains = 4,
-    iter = 5000,
-    seed = 11021585
-    )
-```
-
-```{r}
-#pull out prior predictions
-generated <- gqs(
-    fit@stanmodel,
-    data=trials_data,
-    draws=as.matrix(fit),
-    seed=11021585
-    )
-# to implement distribution of differences:
-# create two datasets with interventions
-# simulate both with the same seed and draws
-# figure out how to difference the posterior (and maybe look at prior) probabilities
-```
-
-
-```{r}
-hist(as.vector(extract(generated, pars="p_prior")$p_prior)) 
-hist(as.vector(extract(generated, pars="mu_prior")$mu_prior), )
-hist(as.vector(extract(generated, pars="sigma_prior")$sigma_prior))
-```
-
-
-
-```{r}
-check_hmc_diagnostics(fit)
-hist(as.vector(extract(generated, pars="p_predicted")$p_predicted))
-```
-
-
-



@ -503,172 +711,45 @@ for (i in 1:4) {
        )
    )
 }
-```
-
-
-```{r}
-for (k in 1:22) {
-for (i in 1:4) {
-    params = sapply(3:0, function(j) paste0("beta[",k,",",4*i-j ,"]"))
-    print(
-        mcmc_pairs(
-            posterior,
-            np = nuts_prmts,
-            pars=c(
-                params,
-                "lp__"
-            ),
-            off_diag_args = list(size = 0.75)
-        )
-    )
-}}
-```
-
-
-# Results
-
-
-```{r}
-################################# ANALYZE #####################################
-print(fit)
-```
-
-## Result Plots
-
-
-Note the regular large difference in variance. 
-I would guess those are the beta[1:22,2] values.
-I wonder if a lot of the variance is due to the 2 values that are sitting out.
-
-
-
-```{r}
-beta_list <- list(
-        groups = c(
-        `1`="Infections & Parasites",
-        `2`="Neoplasms",
-        `3`="Blood & Immune system",
-        `4`="Endocrine, Nutritional, and Metabolic",
-        `5`="Mental & Behavioral",
-        `6`="Nervous System",
-        `7`="Eye and Adnexa",
-        `8`="Ear and Mastoid",
-        `9`="Circulatory",
-        `10`="Respiratory",
-        `11`="Digestive",
-        `12`="Skin & Subcutaneaous tissue",
-        `13`="Musculoskeletal",
-        `14`="Genitourinary",
-        `15`="Pregancy, Childbirth, & Puerperium",
-        `16`="Perinatal Period",
-        `17`="Congential",
-        `18`="Symptoms, Signs etc.",
-        `19`="Injury etc.",
-        `20`="External Causes",
-        `21`="Contact with Healthcare",
-        `22`="Special Purposes"
-    ),
-    parameters = c(
-        `1`="Elapsed Duration",
-        # brands
-        `2`="asinh(Generic Brands)",
-        `3`="asinh(Competitors USPDC)",
-        # population
-        `4`="asinh(High SDI)",
-        `5`="asinh(High-Medium SDI)",
-        `6`="asinh(Medium SDI)",
-        `7`="asinh(Low-Medium SDI)",
-        `8`="asinh(Low SDI)",
-        #Status
-        `9`="status_NYR",
-        `10`="status_EBI",
-        `11`="status_ANR",
-        #interactions for brands
-        `12`="ib*elapsed",
-        `13`="bnc*elapsed",
-        # interactions for status
-        `14`="sNYR*elapsed",
-        `15`="sEBI*elapsed",
-        `16`="sANR*elapsed"
-        
-    )
-)
-
-get_parameters <- function(stem,class_list) {
-    #get categories and lengths
-    named <- names(class_list)
-    lengths <- sapply(named, (function (x) length(class_list[[x]])))
-    
-    #describe the grid needed
-    iter_list <- sapply(named, (function (x) 1:lengths[x]))
-    
-    #generate the list of parameters
-    pardf <- generate_parameter_df(stem, iter_list)
-    
-    #add columns with appropriate human-readable names
-    for (name in named) {
-        pardf[paste(name,"_hr",sep="")] <- as.factor(
-            sapply(pardf[name], (function (i) class_list[[name]][i]))
-        )
-    }
-     
-    return(pardf)   
-}
-
-generate_parameter_df <- function(stem, iter_list) {
-    grid <- expand.grid(iter_list)
-    grid["param_name"] <- grid %>% unite(x,colnames(grid),sep=",")
-    grid["param_name"] <- paste(stem,"[",grid$param_name,"]",sep="")
-    return(grid)
-}
-
-group_mcmc_areas <- function(
-        stem,# = "beta"
-        class_list,# = beta_list
-        stanfit,# = fit
-        group_id,# = 2
-        rename=TRUE
-        ) {
-    #get all parameter names
-    params <- get_parameters(stem,class_list)
-    #filter down to parameters of interest
-    params <- filter(params,groups == group_id)
-    #Get dataframe with only the rows of interest
-    filtdata <- as.data.frame(stanfit)[params$param_name]
-    #rename columns
-    if (rename) dimnames(filtdata)[[2]] <- params$parameters_hr
-    #get group name for title
-    group_name <- class_list$groups[group_id]
-    #create area plot with appropriate title
-    mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
-        ggtitle(paste("Parameter distributions for ICD-10 class:",group_name))
-}
+```

-parameter_mcmc_areas <- function(
-        stem,# = "beta"
-        class_list,# = beta_list
-        stanfit,# = fit
-        parameter_id,# = 2
-        rename=TRUE
-        ) {
-    #get all parameter names
-    params <- get_parameters(stem,class_list)
-    #filter down to parameters of interest
-    params <- filter(params,parameters == parameter_id)
-    #Get dataframe with only the rows of interest
-    filtdata <- as.data.frame(stanfit)[params$param_name]
-    #rename columns
-    if (rename) dimnames(filtdata)[[2]] <- params$groups_hr
-    #get group name for title
-    parameter_name <- class_list$parameters[parameter_id]
-    #create area plot with appropriate title
-    mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
-        ggtitle(parameter_name,"Parameter Distribution")
-}
+
+```{r}
+for (k in 1:22) {
+for (i in 1:4) {
+    params = sapply(3:0, function(j) paste0("beta[",k,",",4*i-j ,"]"))
+    print(
+        mcmc_pairs(
+            posterior,
+            np = nuts_prmts,
+            pars=c(
+                params,
+                "lp__"
+            ),
+            off_diag_args = list(size = 0.75)
+        )
+    )
+}}
+```


+# Results
+
+
+```{r}
+################################# ANALYZE #####################################
+print(fit)
 ```

+## Result Plots
+
+
+Note the regular large difference in variance. 
+I would guess those are the beta[1:22,2] values.
+I wonder if a lot of the variance is due to the 2 values that are sitting out.
+
+
+
 ```{r}
 #mcmc_intervals(fit, pars=get_parameters("beta",beta_list)$param_name)
 ```
@ -840,219 +921,3 @@ ggplot(pddf, aes(x=value,)) +
 Recall that we had really tight zero priors.


-### Intervention: Adding a single competitor
-```{r}
-inherited_cols <- c(
-    "elapsed_duration"
-    #,"identical_brands"
-    #,"brand_name_counts"
-    ,"h_sdi_val"
-    ,"hm_sdi_val"
-    ,"m_sdi_val"
-    ,"lm_sdi_val"
-    ,"l_sdi_val"
-    ,"status_NYR"
-    ,"status_EBI"
-    ,"status_ANR"
-    #,"ib*elapsed"
-    #,"bnc*elapsed"
-    ,"sNYR*elapsed"
-    ,"sEBI*elapsed"
-    ,"sANR*elapsed"
-)
-```
-
-
-
-
-
-#### Generics
-
-```{r}
-#generics intervention
-brand_intervention_ib <- x[c(inherited_cols,"brand_name_counts","bnc*elapsed")]
-brand_intervention_ib["identical_brands"] <- asinh(sinh(x$identical_brands)+1) #add a single generic brand
-brand_intervention_ib["ib*elapsed"] <- brand_intervention_ib$identical_brands * brand_intervention_ib$elapsed_duration
-```
-
-```{r}
-counterfact_marketing_ib <- list(
-    D = ncol(x),#
-    N = nrow(x),
-    L = n_categories$count,
-    y = as.vector(y),
-    ll = as.vector(categories),
-    x = as.matrix(x),
-    mu_mean = 0,
-    mu_stdev = 0.05,
-    sigma_shape = 4,
-    sigma_rate = 20,
-    Nx = nrow(x),
-    llx = as.vector(categories),
-    counterfact_x_tilde = as.matrix(brand_intervention_ib),
-    counterfact_x = as.matrix(x)
-)
-```
-
-
-```{r}
-generated_ib <- gqs(
-    fit@stanmodel,
-    data=counterfact_marketing_ib,
-    draws=as.matrix(fit),
-    seed=11021585
-    )
-```
-
-```{r}
-#TODO: Convert to ggplot, stabilize y axis
-hist(as.vector(extract(generated_ib, pars="p_predicted_default")$p_predicted_default), bins=100)
-hist(as.vector(extract(generated_ib, pars="p_predicted_intervention")$p_predicted_intervention), bins=100)
-hist(as.vector(extract(generated_ib, pars="predicted_difference")$predicted_difference), bins=100)
-```
-
-
-
-```{r}
-
-
-pddf_ib <- data.frame(extract(generated_ib, pars="predicted_difference")$predicted_difference) |>
-    pivot_longer(X1:X1343)
-
-#TODO: Fix Category names
-pddf_ib["entry_idx"] <- as.numeric(gsub("\\D","",pddf_ib$name))
-pddf_ib["category"] <-  sapply(pddf_ib$entry_idx, function(i) df$category_id[i])
-pddf_ib["category_name"] <- sapply(pddf_ib$category, function(i) beta_list$groups[i])
-```
-
-
-```{r}
-
-ggplot(pddf_ib, aes(x=value,)) +
-    geom_histogram(bins=100) +
-    labs(
-        title = "Distribution of predicted differences"
-        ,x = "Difference in probability due to intervention"
-        ,y = "Predicted counts"
-    ) + 
-    #xlim(-0.3,0.1) +
-    geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") 
-
-ggplot(pddf_ib, aes(x=value,)) +
-    geom_histogram(bins=100) +
-    facet_wrap(
-        ~factor(
-            category_name, 
-            levels=beta_list$groups
-            )
-        , labeller = label_wrap_gen(multi_line = TRUE)
-        , ncol=5) +
-    labs(
-        title = "Distribution of predicted differences",
-        subtitle = "By group"
-        ,x = "Difference in probability due to intervention"
-        ,y = "Predicted counts"
-    ) + 
-    #xlim(-0.25,0.1) +
-    geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
-    theme(strip.text.x = element_text(size = 8))
-
-```
-
-
-
-#### USP DC
-
-```{r}
-#formulary intervention
-brand_intervention_bnc <- x[c(inherited_cols,"identical_brands","ib*elapsed")]
-brand_intervention_bnc["brand_name_counts"] <- asinh(sinh(x$brand_name_counts)+1) #add a single formulary competitor brand
-brand_intervention_bnc["bnc*elapsed"] <- brand_intervention_bnc$brand_name_counts * brand_intervention_bnc$elapsed_duration
-```
-
-```{r}
-counterfact_marketing_bnc <- list(
-    D = ncol(x),#
-    N = nrow(x),
-    L = n_categories$count,
-    y = as.vector(y),
-    ll = as.vector(categories),
-    x = as.matrix(x),
-    mu_mean = 0,
-    mu_stdev = 0.05,
-    sigma_shape = 4,
-    sigma_rate = 20,
-    Nx = nrow(x),
-    llx = as.vector(categories),
-    counterfact_x_tilde = as.matrix(brand_intervention_bnc),
-    counterfact_x = as.matrix(x)
-)
-```
-
-
-```{r}
-generated_bnc <- gqs(
-    fit@stanmodel,
-    data=counterfact_marketing_bnc,
-    draws=as.matrix(fit),
-    seed=11021585
-    )
-```
-
-```{r}
-#TODO: Convert to ggplot, stabilize y axis
-hist(as.vector(extract(generated_bnc, pars="p_predicted_default")$p_predicted_default), bins=100)
-hist(as.vector(extract(generated_bnc, pars="p_predicted_intervention")$p_predicted_intervention), bins=100)
-hist(as.vector(extract(generated_bnc, pars="predicted_difference")$predicted_difference), bins=100)
-```
-
-
-```{r}
-pddf_bnc <- data.frame(extract(generated_bnc, pars="predicted_difference")$predicted_difference) |>
-    pivot_longer(X1:X1343)
-
-#Add Category names
-pddf_bnc["entry_idx"] <- as.numeric(gsub("\\D","",pddf_bnc$name))
-pddf_bnc["category"] <-  sapply(pddf_bnc$entry_idx, function(i) df$category_id[i])
-pddf_bnc["category_name"] <- sapply(pddf_bnc$category, function(i) beta_list$groups[i])
-
-#add snapshot date
-pddf_bnc["snapshot_date"] <- sapply(pddf_bnc$entry_idx, function(i) df$earliest_date_observed[i])#changed values
-```
-
-
-```{r}
-
-ggplot(pddf_bnc, aes(x=value,)) +
-    geom_histogram(bins=100) +
-    labs(
-        title = "Distribution of predicted differences"
-        ,x = "Difference in probability due to intervention"
-        ,y = "Predicted counts"
-    ) + 
-    #xlim(-0.3,0.1) +
-    geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") 
-
-ggplot(pddf_bnc, aes(x=value,)) +
-    geom_histogram(bins=100) +
-    facet_wrap(
-        ~factor(
-            category_name, 
-            levels=beta_list$groups
-            )
-        , labeller = label_wrap_gen(multi_line = TRUE)
-        , ncol=5) +
-    labs(
-        title = "Distribution of predicted differences",
-        subtitle = "By group"
-        ,x = "Difference in probability due to intervention"
-        ,y = "Predicted counts"
-    ) + 
-    #xlim(-0.25,0.1) +
-    geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
-    theme(strip.text.x = element_text(size = 8))
-
-```
-
-TODO: add density plot of (x,y,z) (date,value,counts)
-    - with and without faceting