From 98fc18e2003421b309b9f9390fd5817aaa94a19b Mon Sep 17 00:00:00 2001 From: Will King Date: Wed, 4 Oct 2023 16:52:44 -0700 Subject: [PATCH] updated analysis --- .gitignore | 6 + r-analysis/.Rhistory | 0 r-analysis/EffectsOfMarketConditions.qmd | 89 +- ...fMarketConditions_no_enrollment_status.qmd | 783 ++++++++++++++++++ .../{DataMangement.R => FullAnalysis.R} | 181 +++- 5 files changed, 1012 insertions(+), 47 deletions(-) delete mode 100644 r-analysis/.Rhistory create mode 100644 r-analysis/EffectsOfMarketConditions_no_enrollment_status.qmd rename r-analysis/{DataMangement.R => FullAnalysis.R} (52%) diff --git a/.gitignore b/.gitignore index 3be90e4..42c04a4 100644 --- a/.gitignore +++ b/.gitignore @@ -166,3 +166,9 @@ Manifest.toml .Rproj.user */*_files/* + + + +#R stuff +.Rhistory +.RData diff --git a/r-analysis/.Rhistory b/r-analysis/.Rhistory deleted file mode 100644 index e69de29..0000000 diff --git a/r-analysis/EffectsOfMarketConditions.qmd b/r-analysis/EffectsOfMarketConditions.qmd index 48779e7..259ee78 100644 --- a/r-analysis/EffectsOfMarketConditions.qmd +++ b/r-analysis/EffectsOfMarketConditions.qmd @@ -366,7 +366,7 @@ trials_data <- list( fit <- stan( file='Hierarchal_Logistic.stan', data = trials_data, - chains = 6, + chains = 4, iter = 5000, seed = 11021585 ) @@ -969,3 +969,90 @@ brand_intervention_bnc <- x[c(inherited_cols,"identical_brands","ib*elapsed")] brand_intervention_bnc["brand_name_counts"] <- asinh(sinh(x$brand_name_counts)+1) #add a single formulary competitor brand brand_intervention_bnc["bnc*elapsed"] <- brand_intervention_bnc$brand_name_counts * brand_intervention_bnc$elapsed_duration ``` + +```{r} +counterfact_marketing_bnc <- list( + D = ncol(x),# + N = nrow(x), + L = n_categories$count, + y = as.vector(y), + ll = as.vector(categories), + x = as.matrix(x), + mu_mean = 0, + mu_stdev = 0.05, + sigma_shape = 4, + sigma_rate = 20, + Nx = nrow(x), + llx = as.vector(categories), + counterfact_x_tilde = as.matrix(brand_intervention_bnc), + counterfact_x = as.matrix(x) +) +``` + + +```{r} +generated_bnc <- gqs( + fit@stanmodel, + data=counterfact_marketing_bnc, + draws=as.matrix(fit), + seed=11021585 + ) +``` + +```{r} +#TODO: Convert to ggplot, stabilize y axis +hist(as.vector(extract(generated_bnc, pars="p_predicted_default")$p_predicted_default), bins=100) +hist(as.vector(extract(generated_bnc, pars="p_predicted_intervention")$p_predicted_intervention), bins=100) +hist(as.vector(extract(generated_bnc, pars="predicted_difference")$predicted_difference), bins=100) +``` + + +```{r} +pddf_bnc <- data.frame(extract(generated_bnc, pars="predicted_difference")$predicted_difference) |> + pivot_longer(X1:X1343) + +#Add Category names +pddf_bnc["entry_idx"] <- as.numeric(gsub("\\D","",pddf_bnc$name)) +pddf_bnc["category"] <- sapply(pddf_bnc$entry_idx, function(i) df$category_id[i]) +pddf_bnc["category_name"] <- sapply(pddf_bnc$category, function(i) beta_list$groups[i]) + +#add snapshot date +pddf_bnc["snapshot_date"] <- sapply(pddf_bnc$entry_idx, function(i) df$earliest_date_observed[i])#changed values +``` + + +```{r} + +ggplot(pddf_bnc, aes(x=value,)) + + geom_histogram(bins=100) + + labs( + title = "Distribution of predicted differences" + ,x = "Difference in probability due to intervention" + ,y = "Predicted counts" + ) + + #xlim(-0.3,0.1) + + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + +ggplot(pddf_bnc, aes(x=value,)) + + geom_histogram(bins=100) + + facet_wrap( + ~factor( + category_name, + levels=beta_list$groups + ) + , labeller = label_wrap_gen(multi_line = TRUE) + , ncol=5) + + labs( + title = "Distribution of predicted differences", + subtitle = "By group" + ,x = "Difference in probability due to intervention" + ,y = "Predicted counts" + ) + + #xlim(-0.25,0.1) + + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + + theme(strip.text.x = element_text(size = 8)) + +``` + +TODO: add density plot of (x,y,z) (date,value,counts) + - with and without faceting diff --git a/r-analysis/EffectsOfMarketConditions_no_enrollment_status.qmd b/r-analysis/EffectsOfMarketConditions_no_enrollment_status.qmd new file mode 100644 index 0000000..cdccd16 --- /dev/null +++ b/r-analysis/EffectsOfMarketConditions_no_enrollment_status.qmd @@ -0,0 +1,783 @@ +--- +title: "The Effects of market conditions on recruitment and completion of clinical trials" +author: "Will King" +format: html +editor: source +--- + + +# Setup + +```{r} +library(bayesplot) +available_mcmc(pattern = "_nuts_") +library(ggplot2) +library(patchwork) +library(tidyverse) +library(rstan) +library(tidyr) +library(ghibli) +#Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started + +#save unchanged models instead of recompiling +rstan_options(auto_write = TRUE) +#allow for multithreaded sampling +options(mc.cores = parallel::detectCores()) + +#test installation, shouldn't get any errors +#example(stan_model, package = "rstan", run.dontrun = TRUE) +``` + +```{r} +################ Pull data from database ###################### +library(RPostgreSQL) + +driver <- dbDriver("PostgreSQL") + +get_data <- function(driver) { + +con <- dbConnect( + driver, + user='root', + password='root', + dbname='aact_db', + host='will-office' + ) +on.exit(dbDisconnect(con)) + +query <- dbSendQuery( + con, +# "select * from formatted_data_with_planned_enrollment;" +" +select + fdqpe.nct_id + --,fdqpe.start_date + --,fdqpe.current_enrollment + --,fdqpe.enrollment_category + ,fdqpe.current_status + ,fdqpe.earliest_date_observed + ,fdqpe.elapsed_duration + ,fdqpe.n_brands as identical_brands + ,ntbtu.brand_name_count + ,fdqpe.category_id + ,fdqpe.final_status + ,fdqpe.h_sdi_val + --,fdqpe.h_sdi_u95 + --,fdqpe.h_sdi_l95 + ,fdqpe.hm_sdi_val + --,fdqpe.hm_sdi_u95 + --,fdqpe.hm_sdi_l95 + ,fdqpe.m_sdi_val + --,fdqpe.m_sdi_u95 + --,fdqpe.m_sdi_l95 + ,fdqpe.lm_sdi_val + --,fdqpe.lm_sdi_u95 + --,fdqpe.lm_sdi_l95 + ,fdqpe.l_sdi_val + --,fdqpe.l_sdi_u95 + --,fdqpe.l_sdi_l95 +from formatted_data_with_planned_enrollment fdqpe + join \"Formularies\".nct_to_brands_through_uspdc ntbtu + on fdqpe.nct_id = ntbtu.nct_id +order by fdqpe.nct_id, fdqpe.earliest_date_observed +; +" + ) +df <- fetch(query, n = -1) +df <- na.omit(df) + +query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;") +n_categories <- fetch(query2, n = -1) + +return(list(data=df,ncat=n_categories)) +} + +d <- get_data(driver) +df <- d$data +n_categories <- d$ncat + + + + +################ Format Data ########################### + +data_formatter <- function(df) { +categories <- df["category_id"] + +x <- df["elapsed_duration"] +x["identical_brands"] <- asinh(df$identical_brands) +x["brand_name_counts"] <- asinh(df$brand_name_count) +x["h_sdi_val"] <- asinh(df$h_sdi_val) +x["hm_sdi_val"] <- asinh(df$hm_sdi_val) +x["m_sdi_val"] <- asinh(df$m_sdi_val) +x["lm_sdi_val"] <- asinh(df$lm_sdi_val) +x["l_sdi_val"] <- asinh(df$l_sdi_val) + + +#interaction terms for competitors +x["ib*elapsed"] <- x["elapsed_duration"]*x["identical_brands"] +x["bnc*elapsed"] <- x["elapsed_duration"] * x["brand_name_counts"] + +y <- ifelse(df["final_status"]=="Terminated",1,0) + + + +return(list(x=x,y=y)) +} + +train <- data_formatter(df) + +categories <- df$category_id + +x <- train$x +y <- train$y +``` + + + +### Intervention: Adding a single competitor +```{r} +inherited_cols <- c( + "elapsed_duration" + ,"identical_brands" + ,"brand_name_counts" + ,"h_sdi_val" + ,"hm_sdi_val" + ,"m_sdi_val" + ,"lm_sdi_val" + ,"l_sdi_val" + ,"status_NYR" + ,"status_EBI" + ,"status_ANR" + ,"ib*elapsed" + ,"bnc*elapsed" +) +``` + + + +#### Generics + +```{r} +#generics intervention +brand_intervention_ib <- x[inherited_cols] +brand_intervention_ib["identical_brands"] <- asinh(sinh(x$identical_brands)+1) #add a single generic brand +brand_intervention_ib["ib*elapsed"] <- brand_intervention_ib$identical_brands * brand_intervention_ib$elapsed_duration +``` + +```{r} +counterfact_marketing_ib <- list( + D = ncol(x),# + N = nrow(x), + L = n_categories$count, + y = as.vector(y), + ll = as.vector(categories), + x = as.matrix(x), + mu_mean = 0, + mu_stdev = 0.05, + sigma_shape = 4, + sigma_rate = 20, + Nx = nrow(x), + llx = as.vector(categories), + counterfact_x_tilde = as.matrix(brand_intervention_ib), + counterfact_x = as.matrix(x) +) +``` + +```{r} +generated_bi <- gqs( + fit@stanmodel, + data=counterfact_marketing_ib, + draws=as.matrix(fit), + seed=11021585 + ) +``` + +### USP DC + + +```{r} +#formulary intervention +brand_intervention_bnc <- x[c(inherited_cols,"identical_brands","ib*elapsed")] +brand_intervention_bnc["brand_name_counts"] <- asinh(sinh(x$brand_name_counts)+1) #add a single formulary competitor brand +brand_intervention_bnc["bnc*elapsed"] <- brand_intervention_bnc$brand_name_counts * brand_intervention_bnc$elapsed_duration +``` + +```{r} +counterfact_marketing_bnc <- list( + D = ncol(x),# + N = nrow(x), + L = n_categories$count, + y = as.vector(y), + ll = as.vector(categories), + x = as.matrix(x), + mu_mean = 0, + mu_stdev = 0.05, + sigma_shape = 4, + sigma_rate = 20, + Nx = nrow(x), + llx = as.vector(categories), + counterfact_x_tilde = as.matrix(brand_intervention_bnc), + counterfact_x = as.matrix(x) +) +``` + + +```{r} +generated_bnc <- gqs( + fit@stanmodel, + data=counterfact_marketing_bnc, + draws=as.matrix(fit), + seed=11021585 + ) +``` + + +# Fit Model + + +```{r} +################################# FIT MODEL ######################################### + + + +fit <- stan( + file='Hierarchal_Logistic.stan', + data = counterfact_marketing_ib, + chains = 4, + iter = 5000, + seed = 11021585 + ) +``` + + + +```{r} +hist(as.vector(extract(generated, pars="p_prior")$p_prior)) +hist(as.vector(extract(generated, pars="mu_prior")$mu_prior), ) +hist(as.vector(extract(generated, pars="sigma_prior")$sigma_prior)) +``` + + + +```{r} +check_hmc_diagnostics(fit) +hist(as.vector(extract(generated, pars="p_predicted")$p_predicted)) +``` + + + + + + +# Diagnostics + +```{r} +#trace plots +plot(fit, pars=c("mu"), plotfun="trace") + + +for (i in 1:4) { + print( + mcmc_rank_overlay( + fit, + pars=c( + paste0("mu[",4*i-3,"]"), + paste0("mu[",4*i-2,"]"), + paste0("mu[",4*i-1,"]"), + paste0("mu[",4*i,"]") + ), + n_bins=100 + )+ legend_move("top") + + scale_colour_ghibli_d("KikiMedium") + ) +} +``` + +```{r} +plot(fit, pars=c("sigma"), plotfun="trace") + +for (i in 1:4) { + print( + mcmc_rank_overlay( + fit, + pars=c( + paste0("sigma[",4*i-3,"]"), + paste0("sigma[",4*i-2,"]"), + paste0("sigma[",4*i-1,"]"), + paste0("sigma[",4*i,"]") + ), + n_bins=100 + )+ legend_move("top") + + scale_colour_ghibli_d("KikiMedium") + ) +} +``` + +```{r} +#other diagnostics +logpost <- log_posterior(fit) +nuts_prmts <- nuts_params(fit) +posterior <- as.array(fit) + +``` + +```{r} +color_scheme_set("darkgray") +div_style <- parcoord_style_np(div_color = "green", div_size = 0.05, div_alpha = 0.4) +mcmc_parcoord(posterior, regex_pars = "mu", np=nuts_prmts, np_style = div_style, alpha = 0.05) +``` + +```{r} +for (i in 1:4) { + mus = sapply(3:0, function(j) paste0("mu[",4*i-j ,"]")) + print( + mcmc_pairs( + posterior, + np = nuts_prmts, + pars=c( + mus, + "lp__" + ), + off_diag_args = list(size = 0.75) + ) + ) +} + + + +``` + +```{r} +mcmc_parcoord(posterior,regex_pars = "sigma", np=nuts_prmts, alpha=0.05) +``` + +```{r} + +for (i in 1:4) { + params = sapply(3:0, function(j) paste0("sigma[",4*i-j ,"]")) + print( + mcmc_pairs( + posterior, + np = nuts_prmts, + pars=c( + params, + "lp__" + ), + off_diag_args = list(size = 0.75) + ) + ) +} +``` + + +```{r} +for (k in 1:22) { +for (i in 1:4) { + params = sapply(3:0, function(j) paste0("beta[",k,",",4*i-j ,"]")) + print( + mcmc_pairs( + posterior, + np = nuts_prmts, + pars=c( + params, + "lp__" + ), + off_diag_args = list(size = 0.75) + ) + ) +}} +``` + + +# Results + + +```{r} +################################# ANALYZE ##################################### +print(fit) +``` + +## Result Plots + + +Note the regular large difference in variance. +I would guess those are the beta[1:22,2] values. +I wonder if a lot of the variance is due to the 2 values that are sitting out. + + + +```{r} +beta_list <- list( + groups = c( + `1`="Infections & Parasites", + `2`="Neoplasms", + `3`="Blood & Immune system", + `4`="Endocrine, Nutritional, and Metabolic", + `5`="Mental & Behavioral", + `6`="Nervous System", + `7`="Eye and Adnexa", + `8`="Ear and Mastoid", + `9`="Circulatory", + `10`="Respiratory", + `11`="Digestive", + `12`="Skin & Subcutaneaous tissue", + `13`="Musculoskeletal", + `14`="Genitourinary", + `15`="Pregancy, Childbirth, & Puerperium", + `16`="Perinatal Period", + `17`="Congential", + `18`="Symptoms, Signs etc.", + `19`="Injury etc.", + `20`="External Causes", + `21`="Contact with Healthcare", + `22`="Special Purposes" + ), + parameters = c( + `1`="Elapsed Duration", + # brands + `2`="asinh(Generic Brands)", + `3`="asinh(Competitors USPDC)", + # population + `4`="asinh(High SDI)", + `5`="asinh(High-Medium SDI)", + `6`="asinh(Medium SDI)", + `7`="asinh(Low-Medium SDI)", + `8`="asinh(Low SDI)", + #Status + `9`="status_NYR", + `10`="status_EBI", + `11`="status_ANR", + #interactions for brands + `12`="ib*elapsed", + `13`="bnc*elapsed", + # interactions for status + `14`="sNYR*elapsed", + `15`="sEBI*elapsed", + `16`="sANR*elapsed" + + ) +) + +get_parameters <- function(stem,class_list) { + #get categories and lengths + named <- names(class_list) + lengths <- sapply(named, (function (x) length(class_list[[x]]))) + + #describe the grid needed + iter_list <- sapply(named, (function (x) 1:lengths[x])) + + #generate the list of parameters + pardf <- generate_parameter_df(stem, iter_list) + + #add columns with appropriate human-readable names + for (name in named) { + pardf[paste(name,"_hr",sep="")] <- as.factor( + sapply(pardf[name], (function (i) class_list[[name]][i])) + ) + } + + return(pardf) +} + +generate_parameter_df <- function(stem, iter_list) { + grid <- expand.grid(iter_list) + grid["param_name"] <- grid %>% unite(x,colnames(grid),sep=",") + grid["param_name"] <- paste(stem,"[",grid$param_name,"]",sep="") + return(grid) +} + +group_mcmc_areas <- function( + stem,# = "beta" + class_list,# = beta_list + stanfit,# = fit + group_id,# = 2 + rename=TRUE + ) { + #get all parameter names + params <- get_parameters(stem,class_list) + #filter down to parameters of interest + params <- filter(params,groups == group_id) + #Get dataframe with only the rows of interest + filtdata <- as.data.frame(stanfit)[params$param_name] + #rename columns + if (rename) dimnames(filtdata)[[2]] <- params$parameters_hr + #get group name for title + group_name <- class_list$groups[group_id] + #create area plot with appropriate title + mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) + + ggtitle(paste("Parameter distributions for ICD-10 class:",group_name)) +} + +parameter_mcmc_areas <- function( + stem,# = "beta" + class_list,# = beta_list + stanfit,# = fit + parameter_id,# = 2 + rename=TRUE + ) { + #get all parameter names + params <- get_parameters(stem,class_list) + #filter down to parameters of interest + params <- filter(params,parameters == parameter_id) + #Get dataframe with only the rows of interest + filtdata <- as.data.frame(stanfit)[params$param_name] + #rename columns + if (rename) dimnames(filtdata)[[2]] <- params$groups_hr + #get group name for title + parameter_name <- class_list$parameters[parameter_id] + #create area plot with appropriate title + mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) + + ggtitle(parameter_name,"Parameter Distribution") +} + + +``` + +```{r} +#mcmc_intervals(fit, pars=get_parameters("beta",beta_list)$param_name) +``` + +### Investigating parameter distributions + +```{r} +#g1 <- group_mcmc_areas("beta",beta_list,fit,2) +#g2 <- group_mcmc_areas("beta",beta_list,fit,2) +#g3 <- group_mcmc_areas("beta",beta_list,fit,2) +#g4 <- group_mcmc_areas("beta",beta_list,fit,2) +#g5 <- group_mcmc_areas("beta",beta_list,fit,2) +#g6 <- group_mcmc_areas("beta",beta_list,fit,2) +#g7 <- group_mcmc_areas("beta",beta_list,fit,2) +#g8 <- group_mcmc_areas("beta",beta_list,fit,2) +#g9 <- group_mcmc_areas("beta",beta_list,fit,2) +#g10 <- group_mcmc_areas("beta",beta_list,fit,2) +#g11 <- group_mcmc_areas("beta",beta_list,fit,2) +#g12 <- group_mcmc_areas("beta",beta_list,fit,2) +#g13 <- group_mcmc_areas("beta",beta_list,fit,2) +#g14 <- group_mcmc_areas("beta",beta_list,fit,2) +#g15 <- group_mcmc_areas("beta",beta_list,fit,2) +#g16 <- group_mcmc_areas("beta",beta_list,fit,2) +#g17 <- group_mcmc_areas("beta",beta_list,fit,2) +#g18 <- group_mcmc_areas("beta",beta_list,fit,2) +#g19 <- group_mcmc_areas("beta",beta_list,fit,2) +#g20 <- group_mcmc_areas("beta",beta_list,fit,2) +#g21 <- group_mcmc_areas("beta",beta_list,fit,2) +#g22 <- group_mcmc_areas("beta",beta_list,fit,2) + + +p1 <- parameter_mcmc_areas("beta",beta_list,fit,1) +p2 <- parameter_mcmc_areas("beta",beta_list,fit,2) +p3 <- parameter_mcmc_areas("beta",beta_list,fit,3) +#p4 <- parameter_mcmc_areas("beta",beta_list,fit,4) +#p5 <- parameter_mcmc_areas("beta",beta_list,fit,5) +#p6 <- parameter_mcmc_areas("beta",beta_list,fit,6) +#p7 <- parameter_mcmc_areas("beta",beta_list,fit,7) +#p8 <- parameter_mcmc_areas("beta",beta_list,fit,8) +p9 <- parameter_mcmc_areas("beta",beta_list,fit,9) +p10 <- parameter_mcmc_areas("beta",beta_list,fit,10) +p11 <- parameter_mcmc_areas("beta",beta_list,fit,11) +p12 <- parameter_mcmc_areas("beta",beta_list,fit,12) +p13 <- parameter_mcmc_areas("beta",beta_list,fit,13) +p14 <- parameter_mcmc_areas("beta",beta_list,fit,14) +p15 <- parameter_mcmc_areas("beta",beta_list,fit,15) +p16 <- parameter_mcmc_areas("beta",beta_list,fit,16) +``` + +Note these have 95% outer CI and 80% inner (shaded) + + + 1) "Elapsed Duration", + 2) "asinh(Generic Brands)", + 3) "asinh(Competitors USPDC)", + 4) "asinh(High SDI)", + 5) "asinh(High-Medium SDI)", + 6) "asinh(Medium SDI)", + 7) "asinh(Low-Medium SDI)", + 8) "asinh(Low SDI)", + 9) "status_NYR", + 10) "status_EBI", + 11) "status_ANR", + 12) "ib*elapsed", + 13) "bnc*elapsed", + 14) "sNYR*elapsed", + 15) "sEBI*elapsed", + 16) "sANR*elapsed" + +of interest +- p1 + p2 +- p3 + p2 +- p2 + p12 +- p3 + p13 +- p9 + p14 +- p10 + p15 +- p11 + p16 + +```{r} +p1 + p2 +``` + + +```{r} +p2 + p3 +``` + +```{r} +p2 + p12 +``` + +```{r} +p3 + p13 +``` + +```{r} +p9 + p14 +``` + + + +```{r} +p10 + p15 +``` + + +```{r} +p11 + p16 +``` + + +# Posterior Prediction + + +```{r} +#TODO: Convert to ggplot, stabilize y axis +hist(as.vector(extract(generated, pars="p_predicted_default")$p_predicted_default)) +hist(as.vector(extract(generated, pars="p_predicted_intervention")$p_predicted_intervention)) +``` + + + + + +## Distribution of Predicted Differences + + + +### Intervention: Adding a single competitor + + +#### Generics + + +```{r} +#TODO: Convert to ggplot, stabilize y axis +hist(as.vector(extract(generated_ib, pars="p_predicted_default")$p_predicted_default), bins=100) +hist(as.vector(extract(generated_ib, pars="p_predicted_intervention")$p_predicted_intervention), bins=100) +hist(as.vector(extract(generated_ib, pars="predicted_difference")$predicted_difference), bins=100) +``` + + + +```{r} + + +pddf_ib <- data.frame(extract(generated_ib, pars="predicted_difference")$predicted_difference) |> + pivot_longer(X1:X1343) + +#TODO: Fix Category names +pddf_ib["entry_idx"] <- as.numeric(gsub("\\D","",pddf_ib$name)) +pddf_ib["category"] <- sapply(pddf_ib$entry_idx, function(i) df$category_id[i]) +pddf_ib["category_name"] <- sapply(pddf_ib$category, function(i) beta_list$groups[i]) +``` + + +```{r} + +ggplot(pddf_ib, aes(x=value,)) + + geom_histogram(bins=100) + + labs( + title = "Distribution of predicted differences" + ,x = "Difference in probability due to intervention" + ,y = "Predicted counts" + ) + + #xlim(-0.3,0.1) + + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + +ggplot(pddf_ib, aes(x=value,)) + + geom_histogram(bins=100) + + facet_wrap( + ~factor( + category_name, + levels=beta_list$groups + ) + , labeller = label_wrap_gen(multi_line = TRUE) + , ncol=5) + + labs( + title = "Distribution of predicted differences", + subtitle = "By group" + ,x = "Difference in probability due to intervention" + ,y = "Predicted counts" + ) + + #xlim(-0.25,0.1) + + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + + theme(strip.text.x = element_text(size = 8)) + +``` + + + +#### USP DC +```{r} +#TODO: Convert to ggplot, stabilize y axis +hist(as.vector(extract(generated_bnc, pars="p_predicted_default")$p_predicted_default), bins=100) +hist(as.vector(extract(generated_bnc, pars="p_predicted_intervention")$p_predicted_intervention), bins=100) +hist(as.vector(extract(generated_bnc, pars="predicted_difference")$predicted_difference), bins=100) +``` + + +```{r} +pddf_bnc <- data.frame(extract(generated_bnc, pars="predicted_difference")$predicted_difference) |> + pivot_longer(X1:X1343) + +#Add Category names +pddf_bnc["entry_idx"] <- as.numeric(gsub("\\D","",pddf_bnc$name)) +pddf_bnc["category"] <- sapply(pddf_bnc$entry_idx, function(i) df$category_id[i]) +pddf_bnc["category_name"] <- sapply(pddf_bnc$category, function(i) beta_list$groups[i]) + +#add snapshot date +pddf_bnc["snapshot_date"] <- sapply(pddf_bnc$entry_idx, function(i) as_date(df$earliest_date_observed[i])) +``` + + +```{r} + +ggplot(pddf_bnc, aes(x=value,)) + + geom_histogram(bins=100) + + labs( + title = "Distribution of predicted differences" + ,x = "Difference in probability due to intervention" + ,y = "Predicted counts" + ) + + #xlim(-0.3,0.1) + + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + +ggplot(pddf_bnc, aes(x=value,)) + + geom_histogram(bins=100) + + facet_wrap( + ~factor( + category_name, + levels=beta_list$groups + ) + , labeller = label_wrap_gen(multi_line = TRUE) + , ncol=5) + + labs( + title = "Distribution of predicted differences", + subtitle = "By group" + ,x = "Difference in probability due to intervention" + ,y = "Predicted counts" + ) + + #xlim(-0.25,0.1) + + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + + theme(strip.text.x = element_text(size = 8)) + +``` + diff --git a/r-analysis/DataMangement.R b/r-analysis/FullAnalysis.R similarity index 52% rename from r-analysis/DataMangement.R rename to r-analysis/FullAnalysis.R index d5fac77..27bd5d6 100644 --- a/r-analysis/DataMangement.R +++ b/r-analysis/FullAnalysis.R @@ -1,8 +1,9 @@ library(bayesplot) available_mcmc(pattern = "_nuts_") library(ggplot2) - -library(rstan) +library(posterior) +library(cmdstanr) +library(rstan) # for stanfit #Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started #save unchanged models instead of recompiling @@ -30,17 +31,18 @@ query <- dbSendQuery( con, "select * from formatted_data_with_planned_enrollment;" ) -df <- fetch(query) +df <- fetch(query,Inf) df <- na.omit(df) -query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic ;") -n_categories <- fetch(query2) +n_categories <- 22 #number of categories in ICD-10 ################ Format Data ########################### categories <- df["category_id"] +#Convert log(x+1) to arcsinh + x <- df["elapsed_duration"] x["log_n_brands"] <- log(df$n_brands + 1) x["h_sdi_val"] <- log(df$h_sdi_val + 1) @@ -48,7 +50,7 @@ x["hm_sdi_val"] <- log(df$hm_sdi_val + 1) x["m_sdi_val"] <- log(df$m_sdi_val + 1) x["lm_sdi_val"] <- log(df$lm_sdi_val + 1) x["l_sdi_val"] <- log(df$l_sdi_val + 1) -x["enrollment"] <- df["current_enrollment"] / df["planned_enrollment"] +#x["enrollment"] <- df["current_enrollment"] / df["planned_enrollment"] #square terms #x["enrollment^2"] <- x["enrollment"]^2 #x["elapsed_duration^2"] <- x["elapsed_duration"]^2 @@ -72,29 +74,39 @@ y <- ifelse(df["final_status"]=="Terminated",1,0) trials_data <- list( D = ncol(x),# N = nrow(x), -# L = n_categories$count, + L = n_categories, y = as.vector(y), -# ll = as.vector(categories$category_id), - x = as.matrix(x) + ll = categories$category_id, + x = as.matrix(x), + mu_mean = 0, + mu_stdev = 0.5, + sigma_shape = 6, + sigma_rate = 12 ) -#fit <- stan(file='Logistic.stan', data = trials_data) #attempt at simple model -fit <- stan(file='Hierarchal_Logistic.stan', data = trials_data) +model <- cmdstan_model(file.path("Hierarchal_Logistic.stan")) -################################# ANALYZE ##################################### -print(fit) +fit <- model$sample( + data = trials_data, + seed = 11021585, + chains = 4, + parallel_chains = 4, + refresh = 500 +) +################################# ANALYZE ##################################### color_scheme_set("darkgray") -#trace plots -plot(fit, pars=c("mu"), plotfun="trace") -plot(fit, pars=c("sigma"), plotfun="trace") +div_style <- parcoord_style_np(div_color = "green", div_size = 0.05, div_alpha = 0.4) -#other diagnostics -logpost <- log_posterior(fit) -nuts_prmts <- nuts_params(fit) -posterior <- as.array(fit) +print(fit$summary(),n=265) +stanfitted <- rstan::read_stan_csv(fit$output_files()) +#analyze mu values +draw_mu <- fit$draws("mu") +mcmc_hist(draw_mu) +mcmc_trace(draw_mu) +mcmc_pairs(draw_mu) mcmc_parcoord(posterior,pars=c( "mu[1]", "mu[2]", @@ -106,23 +118,17 @@ mcmc_parcoord(posterior,pars=c( "mu[8]", "mu[9]", "mu[10]", - "mu[11]", - "mu[12]" - ), - np=nuts_prmts + "mu[11]" +), +np=nuts_prmts, +np_style = div_style ) -mcmc_pairs( - posterior, - np = nuts_prmts, - pars=c( - "mu[1]", - "mu[2]", - "sigma[1]", - "sigma[2]" - ), - off_diag_args = list(size = 0.75) -) + +#check sigma +draw_sigma <- fit$draws("sigma") +mcmc_hist(draw_sigma) +mcmc_trace(draw_sigma) mcmc_parcoord(posterior,pars=c( "sigma[1]", @@ -135,16 +141,99 @@ mcmc_parcoord(posterior,pars=c( "sigma[8]", "sigma[9]", "sigma[10]", - "sigma[11]", - "sigma[12]" + "sigma[11]" ), -np=nuts_prmts +np=nuts_prmts, +np_style = div_style +) + +#other diagnostics +logpost <- log_posterior(fit) +nuts_prmts <- nuts_params(fit) +posterior <- fit$draws() + +mcmc_pairs( + posterior, + np = nuts_prmts, + pars=c( + "mu[1]", + "mu[2]", + "mu[3]", + "mu[4]", + "mu[5]", + "mu[6]", + "mu[7]", + "mu[8]", + "mu[9]", + "mu[10]", + "mu[11]" + ), + off_diag_args = list(size = 0.75) ) -# check model estimates -#mu -plot(fit, pars=c("mu"), show_density =TRUE, ci_level=0.8) -plot(fit, pars=c("mu"), plotfun = "hist") - -#sigma -plot(fit, pars=c("sigma"), show_density =TRUE, ci_level=0.8) -plot(fit, pars=c("sigma"), plotfun = "hist") + +#Interpretation +mcmc_areas(posterior, + pars=c( + "mu[1]", + "mu[2]", + "mu[3]", + "mu[4]", + "mu[5]", + "mu[6]", + "mu[7]", + "mu[8]", + "mu[9]", + "mu[10]", + "mu[11]" + ), + prob = 0.95 +) + +#Interpretation +mcmc_areas(posterior, + pars=c( + "sigma[1]", + "sigma[2]", + "sigma[3]", + "sigma[4]", + "sigma[5]", + "sigma[6]", + "sigma[7]", + "sigma[8]", + "sigma[9]", + "sigma[10]", + "sigma[11]" + ), + prob = 0.95 +) + +#iterate through betas + + +mcmc_areas(posterior, + pars=c( + "beta[1,*]" + ), + prob = 0.95 +) + +#generate array of betas +betas_array <- sapply(1:11, function(param) sapply(1:22, function(group) paste0("beta[",group,",",param,"]"))) + +for (group in 1:22) { + print( + mcmc_areas(posterior, + pars=betas_array[group,], + prob = 0.95 + ) + ) +} + +for (param in 1:11) { + print( + mcmc_areas(posterior, + pars=betas_array[,param], + prob = 0.95 + ) + ) +}