ClinicalTrialsEstimation/r-analysis/EffectsOfMarketConditions.qmd

---
title: "The Effects of market conditions on recruitment and completion of clinical trials"
author: "Will King"
format: html
editor: source
---


# Setup

```{r}
library(bayesplot)
available_mcmc(pattern = "_nuts_")
library(ggplot2)
library(patchwork)
library(tidyverse)
library(rstan)
library(tidyr)
library(ghibli)
#Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started

#save unchanged models instead of recompiling
rstan_options(auto_write = TRUE)
#allow for multithreaded sampling
options(mc.cores = parallel::detectCores())

#test installation, shouldn't get any errors
#example(stan_model, package = "rstan", run.dontrun = TRUE)
```

```{r}
################ Pull data from database ######################
library(RPostgreSQL)

driver <- dbDriver("PostgreSQL")

get_data <- function(driver) {

con <- dbConnect(
    driver,
    user='root',
    password='root',
    dbname='aact_db',
    host='will-office'
    )
on.exit(dbDisconnect(con))

query <- dbSendQuery(
    con,
#    "select * from formatted_data_with_planned_enrollment;"
"
select
    fdqpe.nct_id
    --,fdqpe.start_date
    --,fdqpe.current_enrollment
    --,fdqpe.enrollment_category
    ,fdqpe.current_status
    ,fdqpe.earliest_date_observed
    ,fdqpe.elapsed_duration
    ,fdqpe.n_brands as identical_brands
    ,ntbtu.brand_name_count
    ,fdqpe.category_id
    ,fdqpe.final_status
    ,fdqpe.h_sdi_val
    --,fdqpe.h_sdi_u95
    --,fdqpe.h_sdi_l95
    ,fdqpe.hm_sdi_val
    --,fdqpe.hm_sdi_u95
    --,fdqpe.hm_sdi_l95
    ,fdqpe.m_sdi_val
    --,fdqpe.m_sdi_u95
    --,fdqpe.m_sdi_l95
    ,fdqpe.lm_sdi_val
    --,fdqpe.lm_sdi_u95
    --,fdqpe.lm_sdi_l95
    ,fdqpe.l_sdi_val
    --,fdqpe.l_sdi_u95
    --,fdqpe.l_sdi_l95
from formatted_data_with_planned_enrollment fdqpe
    join \"Formularies\".nct_to_brands_through_uspdc ntbtu
        on fdqpe.nct_id = ntbtu.nct_id
order by fdqpe.nct_id, fdqpe.earliest_date_observed
;
"
    )
df <- fetch(query, n = -1)
df <- na.omit(df)

query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;")
n_categories <- fetch(query2, n = -1)

return(list(data=df,ncat=n_categories))
}

d <- get_data(driver)
df <- d$data
n_categories <- d$ncat


################ Format Data ###########################

data_formatter <- function(df) {
categories <- df["category_id"]

x <- df["elapsed_duration"]
x["identical_brands"] <- asinh(df$identical_brands)
x["brand_name_counts"] <- asinh(df$brand_name_count)
x["h_sdi_val"] <- asinh(df$h_sdi_val)
x["hm_sdi_val"] <- asinh(df$hm_sdi_val)
x["m_sdi_val"] <- asinh(df$m_sdi_val)
x["lm_sdi_val"] <- asinh(df$lm_sdi_val)
x["l_sdi_val"] <- asinh(df$l_sdi_val)


#Setup fixed effects
x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0)
x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0)
#x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0) # Base case is Recruiting
x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0)


#interaction terms for competitors
x["ib*elapsed"] <- x["elapsed_duration"]*x["identical_brands"]
x["bnc*elapsed"] <- x["elapsed_duration"] * x["brand_name_counts"]

#interaction terms for status effects
x["sNYR*elapsed"] <- x["elapsed_duration"]*x["status_NYR"]
x["sEBI*elapsed"] <- x["elapsed_duration"]*x["status_EBI"]
x["sANR*elapsed"] <- x["elapsed_duration"]*x["status_ANR"]

y <- ifelse(df["final_status"]=="Terminated",1,0)

#get category list


return(list(x=x,y=y))
}

train <- data_formatter(df)

categories <- df$category_id

x <- train$x
y <- train$y
```


```{r}
# get data for counterfactuals
get_counterfactuals <- function(driver) {
    con <- dbConnect(
        driver,
        user='root',
        password='root',
        dbname='aact_db',
        host='will-office'
        )
    on.exit(dbDisconnect(con))

    query <- dbSendQuery(
        con,
    "
    with cte as (
    select
    	nct_id
    	,lag(current_status, 1) over (partition by nct_id order by earliest_date_observed)  as previous_status
    	,current_status
    	,earliest_date_observed as date_current
    from formatted_data_mat fdm
    ), cte2 as (
    select
    	nct_id
    	,previous_status
    	,current_status
    	,max(date_current) as date_current_max
    from cte
    where
    	previous_status != current_status
    	and
    	current_status = 'Active, not recruiting'
    group by
    	nct_id
    	,previous_status
    	,current_status
    	,date_current
    )
    select
        fdqpe.nct_id
        --,fdqpe.start_date
        --,fdqpe.current_enrollment
        --,fdqpe.enrollment_category
        ,fdqpe.current_status
        ,fdqpe.earliest_date_observed
        ,fdqpe.elapsed_duration
        ,fdqpe.n_brands as identical_brands
        ,ntbtu.brand_name_count
        ,fdqpe.category_id
        ,fdqpe.final_status
        ,fdqpe.h_sdi_val
        --,fdqpe.h_sdi_u95
        --,fdqpe.h_sdi_l95
        ,fdqpe.hm_sdi_val
        --,fdqpe.hm_sdi_u95
        --,fdqpe.hm_sdi_l95
        ,fdqpe.m_sdi_val
        --,fdqpe.m_sdi_u95
        --,fdqpe.m_sdi_l95
        ,fdqpe.lm_sdi_val
        --,fdqpe.lm_sdi_u95
        --,fdqpe.lm_sdi_l95
        ,fdqpe.l_sdi_val
        --,fdqpe.l_sdi_u95
        --,fdqpe.l_sdi_l95
    from formatted_data_with_planned_enrollment fdqpe
        join \"Formularies\".nct_to_brands_through_uspdc ntbtu
            on fdqpe.nct_id = ntbtu.nct_id
        join cte2
		    on cte2.nct_id = fdqpe.nct_id
		    and cte2.date_current_max = fdqpe.earliest_date_observed
    order by fdqpe.nct_id, fdqpe.earliest_date_observed
    ;
    "
        )
    df <- fetch(query, n = -1)
    df <- na.omit(df)

    return(df)
}
counterfact_raw <- get_counterfactuals(driver)
#extract data
counterfact_list <- data_formatter(counterfact_raw)
counterfact_list$y <- NULL #remove the chance of accidentally training on the wrong data
counterfact_categories <- counterfact_raw$category_id
#setup the two counterfactuals
counterfact_x <- counterfact_list$x #no change
counterfact_x_tilde <- counterfact_list$x #to be changed
#make changes, set it to Recruiting #TODO: change this so it matches previous state.
counterfact_x_tilde$status_ANR <- 0
counterfact_x_tilde["sANR*elapsed"] <- 0
```


## Explore data

```{r}
################################# DATA EXPLORATION ############################
driver <- dbDriver("PostgreSQL")

con <- dbConnect(
    driver,
    user='root',
    password='root',
    dbname='aact_db',
    host='will-office'
    )
#Plot histogram of count of snapshots
df3 <- dbGetQuery(
    con,
    "select nct_id,final_status,count(*) from formatted_data_with_planned_enrollment fdwpe
    group by nct_id,final_status ;"
    )
#df3 <- fetch(query3, n = -1)

ggplot(data=df3, aes(x=count, fill=final_status)) +
    geom_histogram(binwidth=1) +
    ggtitle("Histogram of snapshots per trial (matched trials)") +
    xlab("Snapshots per trial")

#Plot duration for terminated vs completed
df4 <- dbGetQuery(
    con,
    "
    select
        nct_id,
        start_date ,
        primary_completion_date,
        overall_status ,
        primary_completion_date - start_date as duration
    from ctgov.studies s
    where nct_id in (select distinct nct_id from http.download_status ds)
    ;"
    )
#df4 <- fetch(query4, n = -1)

ggplot(data=df4, aes(x=duration,fill=overall_status)) +
    geom_histogram()+
    ggtitle("Histogram of trial durations") +
    xlab("duration")+
    facet_wrap(~overall_status)

df5 <- dbGetQuery(
    con,
    "
    with cte1 as (
    select
        nct_id,
        start_date ,
        primary_completion_date,
        overall_status ,
        primary_completion_date - start_date as duration
    from ctgov.studies s
    where nct_id in (select distinct nct_id from http.download_status ds)
    ), cte2 as (
    select nct_id,count(*) as snapshot_count from formatted_data_with_planned_enrollment fdwpe
    group by nct_id
    )
    select a.nct_id, a.overall_status, a.duration,b.snapshot_count
    from cte1 as a
        join cte2 as b
            on a.nct_id=b.nct_id
    ;"
    )
df5$overall_status <- as.factor(df5$overall_status)
#df5 <- fetch(query5, n = -1)

ggplot(data=df5, aes(x=duration,y=snapshot_count,color=overall_status)) +
    geom_jitter() +
    ggtitle("duration, status, and snapshot_count") +
    xlab("duration") +
    ylab("snapshot count")


dbDisconnect(con)

#get number of trials and snapshots in each category
group_trials_by_category <- as.data.frame(aggregate(category_id ~ nct_id, df, max))
group_trials_by_category <- as.data.frame(group_trials_by_category)
ggplot(data = group_trials_by_category, aes(x=category_id)) +
    geom_histogram(binwidth=1,color="black",fill="seagreen") +
    scale_x_continuous(breaks=scales::pretty_breaks(n=22))


summary(df5)
```


# Fit Model


```{r}
################################# FIT MODEL #########################################

#setup data (named list)
trials_data <- list(
    D = ncol(x),#
    N = nrow(x),
    L = n_categories$count,
    y = as.vector(y),
    ll = as.vector(categories),
    x = as.matrix(x),
    mu_mean = 0,
    mu_stdev = 0.05,
    sigma_shape = 4,
    sigma_rate = 20,
    Nx = 189,
    llx = as.vector(counterfact_categories),
    counterfact_x_tilde = as.matrix(counterfact_x_tilde),
    counterfact_x = as.matrix(counterfact_x)
)


fit <- stan(
    file='Hierarchal_Logistic.stan',
    data = trials_data,
    chains = 4,
    iter = 5000,
    seed = 11021585
    )
```

```{r}
#pull out prior predictions
generated <- gqs(
    fit@stanmodel,
    data=trials_data,
    draws=as.matrix(fit),
    seed=11021585
    )
# to implement distribution of differences:
# create two datasets with interventions
# simulate both with the same seed and draws
# figure out how to difference the posterior (and maybe look at prior) probabilities
```


```{r}
hist(as.vector(extract(generated, pars="p_prior")$p_prior))
hist(as.vector(extract(generated, pars="mu_prior")$mu_prior), )
hist(as.vector(extract(generated, pars="sigma_prior")$sigma_prior))
```


```{r}
check_hmc_diagnostics(fit)
hist(as.vector(extract(generated, pars="p_predicted")$p_predicted))
```


# Diagnostics

```{r}
#trace plots
plot(fit, pars=c("mu"), plotfun="trace")


for (i in 1:4) {
    print(
        mcmc_rank_overlay(
        fit,
        pars=c(
            paste0("mu[",4*i-3,"]"),
            paste0("mu[",4*i-2,"]"),
            paste0("mu[",4*i-1,"]"),
            paste0("mu[",4*i,"]")
            ),
        n_bins=100
        )+  legend_move("top") +
             scale_colour_ghibli_d("KikiMedium")
    )
}
```

```{r}
plot(fit, pars=c("sigma"), plotfun="trace")

for (i in 1:4) {
    print(
        mcmc_rank_overlay(
        fit,
        pars=c(
            paste0("sigma[",4*i-3,"]"),
            paste0("sigma[",4*i-2,"]"),
            paste0("sigma[",4*i-1,"]"),
            paste0("sigma[",4*i,"]")
            ),
        n_bins=100
        )+  legend_move("top") +
             scale_colour_ghibli_d("KikiMedium")
    )
}
```

```{r}
#other diagnostics
logpost <- log_posterior(fit)
nuts_prmts <- nuts_params(fit)
posterior <- as.array(fit)

```

```{r}
color_scheme_set("darkgray")
div_style <- parcoord_style_np(div_color = "green", div_size = 0.05, div_alpha = 0.4)
mcmc_parcoord(posterior, regex_pars = "mu", np=nuts_prmts, np_style = div_style, alpha = 0.05)
```

```{r}
for (i in 1:4) {
    mus = sapply(3:0, function(j) paste0("mu[",4*i-j ,"]"))
    print(
        mcmc_pairs(
            posterior,
            np = nuts_prmts,
            pars=c(
                mus,
                "lp__"
            ),
            off_diag_args = list(size = 0.75)
        )
    )
}


```

```{r}
mcmc_parcoord(posterior,regex_pars = "sigma", np=nuts_prmts, alpha=0.05)
```

```{r}

for (i in 1:4) {
    params = sapply(3:0, function(j) paste0("sigma[",4*i-j ,"]"))
    print(
        mcmc_pairs(
            posterior,
            np = nuts_prmts,
            pars=c(
                params,
                "lp__"
            ),
            off_diag_args = list(size = 0.75)
        )
    )
}
```


```{r}
for (k in 1:22) {
for (i in 1:4) {
    params = sapply(3:0, function(j) paste0("beta[",k,",",4*i-j ,"]"))
    print(
        mcmc_pairs(
            posterior,
            np = nuts_prmts,
            pars=c(
                params,
                "lp__"
            ),
            off_diag_args = list(size = 0.75)
        )
    )
}}
```


# Results


```{r}
################################# ANALYZE #####################################
print(fit)
```

## Result Plots


Note the regular large difference in variance.
I would guess those are the beta[1:22,2] values.
I wonder if a lot of the variance is due to the 2 values that are sitting out.


```{r}
beta_list <- list(
        groups = c(
        `1`="Infections & Parasites",
        `2`="Neoplasms",
        `3`="Blood & Immune system",
        `4`="Endocrine, Nutritional, and Metabolic",
        `5`="Mental & Behavioral",
        `6`="Nervous System",
        `7`="Eye and Adnexa",
        `8`="Ear and Mastoid",
        `9`="Circulatory",
        `10`="Respiratory",
        `11`="Digestive",
        `12`="Skin & Subcutaneaous tissue",
        `13`="Musculoskeletal",
        `14`="Genitourinary",
        `15`="Pregancy, Childbirth, & Puerperium",
        `16`="Perinatal Period",
        `17`="Congential",
        `18`="Symptoms, Signs etc.",
        `19`="Injury etc.",
        `20`="External Causes",
        `21`="Contact with Healthcare",
        `22`="Special Purposes"
    ),
    parameters = c(
        `1`="Elapsed Duration",
        # brands
        `2`="asinh(Generic Brands)",
        `3`="asinh(Competitors USPDC)",
        # population
        `4`="asinh(High SDI)",
        `5`="asinh(High-Medium SDI)",
        `6`="asinh(Medium SDI)",
        `7`="asinh(Low-Medium SDI)",
        `8`="asinh(Low SDI)",
        #Status
        `9`="status_NYR",
        `10`="status_EBI",
        `11`="status_ANR",
        #interactions for brands
        `12`="ib*elapsed",
        `13`="bnc*elapsed",
        # interactions for status
        `14`="sNYR*elapsed",
        `15`="sEBI*elapsed",
        `16`="sANR*elapsed"

    )
)

get_parameters <- function(stem,class_list) {
    #get categories and lengths
    named <- names(class_list)
    lengths <- sapply(named, (function (x) length(class_list[[x]])))

    #describe the grid needed
    iter_list <- sapply(named, (function (x) 1:lengths[x]))

    #generate the list of parameters
    pardf <- generate_parameter_df(stem, iter_list)

    #add columns with appropriate human-readable names
    for (name in named) {
        pardf[paste(name,"_hr",sep="")] <- as.factor(
            sapply(pardf[name], (function (i) class_list[[name]][i]))
        )
    }

    return(pardf)
}

generate_parameter_df <- function(stem, iter_list) {
    grid <- expand.grid(iter_list)
    grid["param_name"] <- grid %>% unite(x,colnames(grid),sep=",")
    grid["param_name"] <- paste(stem,"[",grid$param_name,"]",sep="")
    return(grid)
}

group_mcmc_areas <- function(
        stem,# = "beta"
        class_list,# = beta_list
        stanfit,# = fit
        group_id,# = 2
        rename=TRUE
        ) {
    #get all parameter names
    params <- get_parameters(stem,class_list)
    #filter down to parameters of interest
    params <- filter(params,groups == group_id)
    #Get dataframe with only the rows of interest
    filtdata <- as.data.frame(stanfit)[params$param_name]
    #rename columns
    if (rename) dimnames(filtdata)[[2]] <- params$parameters_hr
    #get group name for title
    group_name <- class_list$groups[group_id]
    #create area plot with appropriate title
    mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
        ggtitle(paste("Parameter distributions for ICD-10 class:",group_name))
}

parameter_mcmc_areas <- function(
        stem,# = "beta"
        class_list,# = beta_list
        stanfit,# = fit
        parameter_id,# = 2
        rename=TRUE
        ) {
    #get all parameter names
    params <- get_parameters(stem,class_list)
    #filter down to parameters of interest
    params <- filter(params,parameters == parameter_id)
    #Get dataframe with only the rows of interest
    filtdata <- as.data.frame(stanfit)[params$param_name]
    #rename columns
    if (rename) dimnames(filtdata)[[2]] <- params$groups_hr
    #get group name for title
    parameter_name <- class_list$parameters[parameter_id]
    #create area plot with appropriate title
    mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
        ggtitle(parameter_name,"Parameter Distribution")
}


```

```{r}
#mcmc_intervals(fit, pars=get_parameters("beta",beta_list)$param_name)
```

### Investigating parameter distributions

```{r}
#g1 <- group_mcmc_areas("beta",beta_list,fit,2)
#g2 <- group_mcmc_areas("beta",beta_list,fit,2)
#g3 <- group_mcmc_areas("beta",beta_list,fit,2)
#g4 <- group_mcmc_areas("beta",beta_list,fit,2)
#g5 <- group_mcmc_areas("beta",beta_list,fit,2)
#g6 <- group_mcmc_areas("beta",beta_list,fit,2)
#g7 <- group_mcmc_areas("beta",beta_list,fit,2)
#g8 <- group_mcmc_areas("beta",beta_list,fit,2)
#g9 <- group_mcmc_areas("beta",beta_list,fit,2)
#g10 <- group_mcmc_areas("beta",beta_list,fit,2)
#g11 <- group_mcmc_areas("beta",beta_list,fit,2)
#g12 <- group_mcmc_areas("beta",beta_list,fit,2)
#g13 <- group_mcmc_areas("beta",beta_list,fit,2)
#g14 <- group_mcmc_areas("beta",beta_list,fit,2)
#g15 <- group_mcmc_areas("beta",beta_list,fit,2)
#g16 <- group_mcmc_areas("beta",beta_list,fit,2)
#g17 <- group_mcmc_areas("beta",beta_list,fit,2)
#g18 <- group_mcmc_areas("beta",beta_list,fit,2)
#g19 <- group_mcmc_areas("beta",beta_list,fit,2)
#g20 <- group_mcmc_areas("beta",beta_list,fit,2)
#g21 <- group_mcmc_areas("beta",beta_list,fit,2)
#g22 <- group_mcmc_areas("beta",beta_list,fit,2)


p1 <- parameter_mcmc_areas("beta",beta_list,fit,1)
p2 <- parameter_mcmc_areas("beta",beta_list,fit,2)
p3 <- parameter_mcmc_areas("beta",beta_list,fit,3)
#p4 <- parameter_mcmc_areas("beta",beta_list,fit,4)
#p5 <- parameter_mcmc_areas("beta",beta_list,fit,5)
#p6 <- parameter_mcmc_areas("beta",beta_list,fit,6)
#p7 <- parameter_mcmc_areas("beta",beta_list,fit,7)
#p8 <- parameter_mcmc_areas("beta",beta_list,fit,8)
p9 <- parameter_mcmc_areas("beta",beta_list,fit,9)
p10 <- parameter_mcmc_areas("beta",beta_list,fit,10)
p11 <- parameter_mcmc_areas("beta",beta_list,fit,11)
p12 <- parameter_mcmc_areas("beta",beta_list,fit,12)
p13 <- parameter_mcmc_areas("beta",beta_list,fit,13)
p14 <- parameter_mcmc_areas("beta",beta_list,fit,14)
p15 <- parameter_mcmc_areas("beta",beta_list,fit,15)
p16 <- parameter_mcmc_areas("beta",beta_list,fit,16)
```

Note these have 95% outer CI and 80% inner (shaded)


    1) "Elapsed Duration",
    2) "asinh(Generic Brands)",
    3) "asinh(Competitors USPDC)",
    4) "asinh(High SDI)",
    5) "asinh(High-Medium SDI)",
    6) "asinh(Medium SDI)",
    7) "asinh(Low-Medium SDI)",
    8) "asinh(Low SDI)",
    9) "status_NYR",
    10) "status_EBI",
    11) "status_ANR",
    12) "ib*elapsed",
    13) "bnc*elapsed",
    14) "sNYR*elapsed",
    15) "sEBI*elapsed",
    16) "sANR*elapsed"

of interest
- p1 + p2
- p3 + p2
- p2 + p12
- p3 + p13
- p9 + p14
- p10 + p15
- p11 + p16

```{r}
p1 + p2
```


```{r}
p2 + p3
```

```{r}
p2 + p12
```

```{r}
p3 + p13
```

```{r}
p9 + p14
```


```{r}
p10 + p15
```


```{r}
p11 + p16
```


# Posterior Prediction


```{r}
#TODO: Convert to ggplot, stabilize y axis
hist(as.vector(extract(generated, pars="p_predicted_default")$p_predicted_default))
hist(as.vector(extract(generated, pars="p_predicted_intervention")$p_predicted_intervention))
```


## Distribution of Predicted Differences

### Intervention: Marginal increase in time to finish enrollment

```{r}

pddf <- data.frame(extract(generated, pars="predicted_difference")$predicted_difference)  |> pivot_longer(X1:X189)
pddf["entry_idx"] <- as.numeric(gsub("\\D","",pddf$name))

pddf["category"] <-  sapply(pddf$entry_idx, function(i) counterfact_categories[i])
pddf["category_name"] <- sapply(pddf$category, function(i) beta_list$groups[i])

ggplot(pddf, aes(x=value,)) +
    geom_histogram(bins=100) +
    labs(
        title = "Distribution of predicted differences"
        ,x = "Difference in probability due to intervention"
        ,y = "Predicted counts"
    ) +
    xlim(-0.3,0.1) +
    geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed")

ggplot(pddf, aes(x=value,)) +
    geom_histogram(bins=100) +
    facet_wrap(
        ~factor(
            category_name,
            levels=beta_list$groups
            )
        , labeller = label_wrap_gen(multi_line = TRUE)
        , ncol=5) +
    labs(
        title = "Distribution of predicted differences",
        subtitle = "By group"
        ,x = "Difference in probability due to intervention"
        ,y = "Predicted counts"
    ) +
    xlim(-0.25,0.1) +
    geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
    theme(strip.text.x = element_text(size = 8))

```


Recall that we had really tight zero priors.


### Intervention: Adding a single competitor
```{r}
inherited_cols <- c(
    "elapsed_duration"
    #,"identical_brands"
    #,"brand_name_counts"
    ,"h_sdi_val"
    ,"hm_sdi_val"
    ,"m_sdi_val"
    ,"lm_sdi_val"
    ,"l_sdi_val"
    ,"status_NYR"
    ,"status_EBI"
    ,"status_ANR"
    #,"ib*elapsed"
    #,"bnc*elapsed"
    ,"sNYR*elapsed"
    ,"sEBI*elapsed"
    ,"sANR*elapsed"
)
```


#### Generics

```{r}
#generics intervention
brand_intervention_ib <- x[c(inherited_cols,"brand_name_counts","bnc*elapsed")]
brand_intervention_ib["identical_brands"] <- asinh(sinh(x$identical_brands)+1) #add a single generic brand
brand_intervention_ib["ib*elapsed"] <- brand_intervention_ib$identical_brands * brand_intervention_ib$elapsed_duration
```

```{r}
counterfact_marketing_ib <- list(
    D = ncol(x),#
    N = nrow(x),
    L = n_categories$count,
    y = as.vector(y),
    ll = as.vector(categories),
    x = as.matrix(x),
    mu_mean = 0,
    mu_stdev = 0.05,
    sigma_shape = 4,
    sigma_rate = 20,
    Nx = nrow(x),
    llx = as.vector(categories),
    counterfact_x_tilde = as.matrix(brand_intervention_ib),
    counterfact_x = as.matrix(x)
)
```


```{r}
generated_ib <- gqs(
    fit@stanmodel,
    data=counterfact_marketing_ib,
    draws=as.matrix(fit),
    seed=11021585
    )
```

```{r}
#TODO: Convert to ggplot, stabilize y axis
hist(as.vector(extract(generated_ib, pars="p_predicted_default")$p_predicted_default), bins=100)
hist(as.vector(extract(generated_ib, pars="p_predicted_intervention")$p_predicted_intervention), bins=100)
hist(as.vector(extract(generated_ib, pars="predicted_difference")$predicted_difference), bins=100)
```


```{r}


pddf_ib <- data.frame(extract(generated_ib, pars="predicted_difference")$predicted_difference) |>
    pivot_longer(X1:X1343)

#TODO: Fix Category names
pddf_ib["entry_idx"] <- as.numeric(gsub("\\D","",pddf_ib$name))
pddf_ib["category"] <-  sapply(pddf_ib$entry_idx, function(i) df$category_id[i])
pddf_ib["category_name"] <- sapply(pddf_ib$category, function(i) beta_list$groups[i])
```


```{r}

ggplot(pddf_ib, aes(x=value,)) +
    geom_histogram(bins=100) +
    labs(
        title = "Distribution of predicted differences"
        ,x = "Difference in probability due to intervention"
        ,y = "Predicted counts"
    ) +
    #xlim(-0.3,0.1) +
    geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed")

ggplot(pddf_ib, aes(x=value,)) +
    geom_histogram(bins=100) +
    facet_wrap(
        ~factor(
            category_name,
            levels=beta_list$groups
            )
        , labeller = label_wrap_gen(multi_line = TRUE)
        , ncol=5) +
    labs(
        title = "Distribution of predicted differences",
        subtitle = "By group"
        ,x = "Difference in probability due to intervention"
        ,y = "Predicted counts"
    ) +
    #xlim(-0.25,0.1) +
    geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
    theme(strip.text.x = element_text(size = 8))

```


#### USP DC

```{r}
#formulary intervention
brand_intervention_bnc <- x[c(inherited_cols,"identical_brands","ib*elapsed")]
brand_intervention_bnc["brand_name_counts"] <- asinh(sinh(x$brand_name_counts)+1) #add a single formulary competitor brand
brand_intervention_bnc["bnc*elapsed"] <- brand_intervention_bnc$brand_name_counts * brand_intervention_bnc$elapsed_duration
```

```{r}
counterfact_marketing_bnc <- list(
    D = ncol(x),#
    N = nrow(x),
    L = n_categories$count,
    y = as.vector(y),
    ll = as.vector(categories),
    x = as.matrix(x),
    mu_mean = 0,
    mu_stdev = 0.05,
    sigma_shape = 4,
    sigma_rate = 20,
    Nx = nrow(x),
    llx = as.vector(categories),
    counterfact_x_tilde = as.matrix(brand_intervention_bnc),
    counterfact_x = as.matrix(x)
)
```


```{r}
generated_bnc <- gqs(
    fit@stanmodel,
    data=counterfact_marketing_bnc,
    draws=as.matrix(fit),
    seed=11021585
    )
```

```{r}
#TODO: Convert to ggplot, stabilize y axis
hist(as.vector(extract(generated_bnc, pars="p_predicted_default")$p_predicted_default), bins=100)
hist(as.vector(extract(generated_bnc, pars="p_predicted_intervention")$p_predicted_intervention), bins=100)
hist(as.vector(extract(generated_bnc, pars="predicted_difference")$predicted_difference), bins=100)
```


```{r}
pddf_bnc <- data.frame(extract(generated_bnc, pars="predicted_difference")$predicted_difference) |>
    pivot_longer(X1:X1343)

#Add Category names
pddf_bnc["entry_idx"] <- as.numeric(gsub("\\D","",pddf_bnc$name))
pddf_bnc["category"] <-  sapply(pddf_bnc$entry_idx, function(i) df$category_id[i])
pddf_bnc["category_name"] <- sapply(pddf_bnc$category, function(i) beta_list$groups[i])

#add snapshot date
pddf_bnc["snapshot_date"] <- sapply(pddf_bnc$entry_idx, function(i) df$earliest_date_observed[i])#changed values
```


```{r}

ggplot(pddf_bnc, aes(x=value,)) +
    geom_histogram(bins=100) +
    labs(
        title = "Distribution of predicted differences"
        ,x = "Difference in probability due to intervention"
        ,y = "Predicted counts"
    ) +
    #xlim(-0.3,0.1) +
    geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed")

ggplot(pddf_bnc, aes(x=value,)) +
    geom_histogram(bins=100) +
    facet_wrap(
        ~factor(
            category_name,
            levels=beta_list$groups
            )
        , labeller = label_wrap_gen(multi_line = TRUE)
        , ncol=5) +
    labs(
        title = "Distribution of predicted differences",
        subtitle = "By group"
        ,x = "Difference in probability due to intervention"
        ,y = "Predicted counts"
    ) +
    #xlim(-0.25,0.1) +
    geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
    theme(strip.text.x = element_text(size = 8))

```

TODO: add density plot of (x,y,z) (date,value,counts)
    - with and without faceting