From d753db81c709ec53bbbe8bfbfa4153769051ddd6 Mon Sep 17 00:00:00 2001
From: Will King <will.king.git@youainti.com>
Date: Sat, 21 Mar 2026 15:20:44 -0700
Subject: [PATCH] Changes to enrollment delay without ICD10Group fix

I wanted to fix the ICD-10 groups with a record of other changes I made.
---
 r-analysis/EffectsOfEnrollmentDelay.qmd | 161 ++++++++++++++++--------
 1 file changed, 110 insertions(+), 51 deletions(-)

diff --git a/r-analysis/EffectsOfEnrollmentDelay.qmd b/r-analysis/EffectsOfEnrollmentDelay.qmd
index 5df7ec3..10d021d 100644
--- a/r-analysis/EffectsOfEnrollmentDelay.qmd
+++ b/r-analysis/EffectsOfEnrollmentDelay.qmd
@@ -8,6 +8,8 @@ editor: source
 
 # Setup
 
+
+
 ```{r}
 library(knitr)
 library(bayesplot)
@@ -37,10 +39,15 @@ image_parameters_by_groups <-paste0(image_root,"/betas/parameters_by_group")
 image_parameters_across_groups <-paste0(image_root,"/betas/parameter_across_groups")
 ```
 
+run on `{r} now(tz='UTC')`
+
+
+
 ```{r}
 ################ Pull data from database ######################
 library(RPostgreSQL)
-host <- 'aact_db-restored-2025-01-07'
+#host <-'aact_db-restored-2025-01-07'
+host <- '10.89.0.6'
 
 driver <- dbDriver("PostgreSQL")
 
@@ -431,6 +438,7 @@ counterfact_delay <- list(
 ```
 
 ```{r}
+#| label: Fitting
 fit <- stan(
     file='Hierarchal_Logistic.stan', 
     data = counterfact_delay,
@@ -493,7 +501,7 @@ ggplot(data=df3, aes(x=count, fill=final_status)) +
     geom_histogram(binwidth=1) +
     ggtitle("Histogram of snapshots per trial (matched trials)") +
     xlab("Snapshots per trial")
-ggsave(paste0(image_trial_details,"/HistSnapshots.png"))
+ggsave(paste0(image_trial_details,"/HistSnapshots.png"), create.dir = TRUE)
 
 #Plot duration for terminated vs completed
 df4 <- dbGetQuery(
@@ -516,7 +524,7 @@ ggplot(data=df4, aes(x=duration,fill=overall_status)) +
     ggtitle("Histogram of trial durations") +
     xlab("duration")+
     facet_wrap(~overall_status)
-ggsave(paste0(image_trial_details,"/HistTrialDurations_Faceted.png"))
+ggsave(paste0(image_trial_details,"/HistTrialDurations_Faceted.png"), create.dir = TRUE)
 
 df5 <- dbGetQuery(
     con,
@@ -547,7 +555,7 @@ ggplot(data=df5, aes(x=duration,y=snapshot_count,color=overall_status)) +
     ggtitle("Comparison of duration, status, and snapshot_count") +
     xlab("duration") +
     ylab("snapshot count") 
-ggsave(paste0(image_trial_details,"/SnapshotsVsDurationVsTermination.png"))
+ggsave(paste0(image_trial_details,"/SnapshotsVsDurationVsTermination.png"), create.dir = TRUE)
 
 dbDisconnect(con)
 
@@ -563,7 +571,7 @@ ggplot(data = group_trials_by_category, aes(x=category_id)) +
         ,x="Category ID"
         ,y="Count"
     )
-ggsave(paste0(image_trial_details,"/CategoryCounts.png"))
+ggsave(paste0(image_trial_details,"/CategoryCounts.png"), create.dir = TRUE)
     
 
 
@@ -573,8 +581,16 @@ cor_dur_count <- cor(df5$duration,df5$snapshot_count)
 count_snapshots <- sum(df5$snapshot_count)
 ```
 
-the correlation value is `r cor_dur_count` between duration and snapshot count.
-There are `r count_snapshots` snapshots in total.
+the correlation value is `{r} cor_dur_count` between duration and snapshot count.
+There are `{r} count_snapshots` snapshots in total, spread over 
+`{r} length(df5%nct_id)` trials.
+
+The number of categories by trial are
+TODO: add in category names, 
+```{r}
+group_trials_by_category |> count(category_id)
+```
+
 
 
 
@@ -595,7 +611,7 @@ for (i in category_count$category_id[category_count$n >= 0]) {
     gi <- group_mcmc_areas("beta",beta_list,fit,i) #add way to filter groups
     ggsave(
         paste0(image_parameters_by_groups,"/group_",i,"_",gi$name,".png")
-        ,plot=gi$plot
+        ,plot=gi$plot, create.dir = TRUE
         )
     gx <- c(gx,gi)
 
@@ -625,7 +641,7 @@ for (i in c(1,2,3,9,10,11,12)) {
     pi <- parameter_mcmc_areas("beta",beta_list,fit,i) #add way to filter groups
     ggsave(
         paste0(image_parameters_across_groups,"/parameters_",i,"_",pi$name,".png")
-        ,plot=pi$plot
+        ,plot=pi$plot, create.dir = TRUE
         )
     px <- c(px,pi)
 
@@ -685,7 +701,7 @@ ggplot(df_ib_p, aes(x=p_prior)) +
         ,x="Probability Domain 'p'"
         ,y="Probability Density"
     )
-ggsave(paste0(image_dist_diff_analysis,"/prior_p.png"))
+ggsave(paste0(image_dist_diff_analysis,"/prior_p.png"), create.dir = TRUE)
 
 #p_posterior
 ggplot(df_ib_p, aes(x=p_predicted)) +
@@ -696,7 +712,7 @@ ggplot(df_ib_p, aes(x=p_predicted)) +
         ,x="Probability Domain 'p'"
         ,y="Probability Density"
     )
-ggsave(paste0(image_dist_diff_analysis,"/posterior_p.png"))
+ggsave(paste0(image_dist_diff_analysis,"/posterior_p.png"), create.dir = TRUE)
 
 #mu_prior
 ggplot(df_ib_prior) +
@@ -707,7 +723,7 @@ ggplot(df_ib_prior) +
         ,x="Mu"
         ,y="Probability"
     )
-ggsave(paste0(image_dist_diff_analysis,"/prior_mu.png"))
+ggsave(paste0(image_dist_diff_analysis,"/prior_mu.png"), create.dir = TRUE)
 
 #sigma_posterior
 ggplot(df_ib_prior) +
@@ -718,7 +734,7 @@ ggplot(df_ib_prior) +
         ,x="Sigma"
         ,y="Probability"
     )
-ggsave(paste0(image_dist_diff_analysis,"/prior_sigma.png"))
+ggsave(paste0(image_dist_diff_analysis,"/prior_sigma.png"), create.dir = TRUE)
 ```
 
 
@@ -726,7 +742,7 @@ ggsave(paste0(image_dist_diff_analysis,"/prior_sigma.png"))
 
 
 
-### Intervention: Delay close of enrollment
+# Intervention: Delay close of enrollment
 
 ```{r}
 counterfact_predicted_ib <- data.frame(
@@ -744,7 +760,7 @@ ggplot(counterfact_predicted_ib, aes(x=p_predicted_default)) +
         ,x="Probability Domain 'p'"
         ,y="Probability Density"
     )
-ggsave(paste0(image_dist_diff_analysis,"/p_no_intervention.png"))
+ggsave(paste0(image_dist_diff_analysis,"/p_no_intervention.png"), create.dir = TRUE)
 
 ggplot(counterfact_predicted_ib, aes(x=p_predicted_intervention)) +
     geom_density() + 
@@ -754,7 +770,7 @@ ggplot(counterfact_predicted_ib, aes(x=p_predicted_intervention)) +
         ,x="Probability Domain 'p'"
         ,y="Probability Density"
     )
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention.png"))
+ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention.png"), create.dir = TRUE)
 
 ggplot(counterfact_predicted_ib, aes(x=predicted_difference)) +
     geom_density() + 
@@ -764,7 +780,7 @@ ggplot(counterfact_predicted_ib, aes(x=predicted_difference)) +
         ,x="Difference in 'p' under treatment"
         ,y="Probability Density"
     )
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_1.png"))
+ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_1.png"), create.dir = TRUE)
 ```
 
 
@@ -805,7 +821,7 @@ ggplot(pddf_ib, aes(x=value,)) +
         ,y = "Probability Density"
     ) + 
     geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") 
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_styled.png"))
+ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_styled.png"), create.dir = TRUE)
 ```
 
 
@@ -827,7 +843,7 @@ ggplot(pddf_ib, aes(x=value,)) +
     ) + 
     geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
     theme(strip.text.x = element_text(size = 8))
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_by_group.png"))
+ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_by_group.png"), create.dir = TRUE)
 ```
 
 
@@ -849,7 +865,7 @@ ggplot(pddf_ib, aes(x=value,)) +
     ) + 
     geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
     theme(strip.text.x = element_text(size = 8))
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_histdiff_by_group.png"))
+ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_histdiff_by_group.png"), create.dir = TRUE)
 ```
 
 
@@ -873,6 +889,7 @@ stats <- list(
  q1 = quantile(pddf_ib$value, 0.25),
  med = median(pddf_ib$value),
  mean = mean(pddf_ib$value),
+ stdev = sd(pddf_ib$value),
  q3 = quantile(pddf_ib$value, 0.75),
  p90 = quantile(pddf_ib$value, 0.90),
  p95 = quantile(pddf_ib$value, 0.95),
@@ -928,7 +945,7 @@ p3 +
    x = stats$mean,
    y = stats$y_offset * 1.5
  ), aes(x = x, y = y))
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_histdiff_boxplot.png"))
+ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_histdiff_boxplot.png"), create.dir = TRUE)
 ```
 
 
@@ -988,7 +1005,7 @@ p4 <- ggplot(pddf_ib, aes(x = value)) +
    x = stats$mean,
    y = stats$y_offset_density * 1.5
  ), aes(x = x, y = y))
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_boxplot.png"))
+ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_boxplot.png"), create.dir = TRUE)
 
 p4
 ```
@@ -1003,7 +1020,7 @@ p4
         y = "Cumulative Proportion"
     ) 
 
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_cumulative_distdiff.png"))
+ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_cumulative_distdiff.png"), create.dir = TRUE)
 ```
 
 
@@ -1018,18 +1035,18 @@ below_spike_band <- mean( pddf_ib$value <= -width/2)
 # get mass above and mass below zero
 mass_below_zero <- mean( pddf_ib$value <= 0)
 ```
-Looking at the spike around zero, we find that `r spike_band_centered_zero*100`% 
+Looking at the spike around zero, we find that `{r} spike_band_centered_zero*100`% 
 of the probability mass is contained within the band from 
-[`r -width*100/2`,`r width*100/2`].
-Additionally, there was `r above_spike_band*100`% of the probability above that 
+[`{r} -width/2`,`{r} width/2`].
+Additionally, there was `{r} above_spike_band*100`% of the probability above that 
 -- representing those with a general increase in the probability of termination --
-and `r below_spike_band*100`% of the probability mass below the band
+and `{r} below_spike_band*100`% of the probability mass below the band
 -- representing a decrease in the probability of termination.
 
 On average, if you keep the trial open instead of closing it,
-`r mass_below_zero`% of trials will see a decrease in the probability of termination,
+`{r} mass_below_zero * 100`% of trials will see a decrease in the probability of termination,
 but, due to the high increase in probability of termination given termination was increased,
-the mean probability of termination increases by `r stats$mean`.
+the mean probability of termination increases by `{r} stats$mean (stats$stdev)`.
 
 ```{r}
 # 5%-iles
@@ -1063,7 +1080,7 @@ kable(quant_df)
 proportion_increase <- mean(pddf_ib$value >= 0)
 ```
 
-about `r proportion_increase * 100` percent probability increase in the probability of terminations
+about `{r} proportion_increase * 100` percent probability increase in the probability of terminations
 
 ```{r}
 n = length(counterfact_predicted_ib$p_predicted_intervention)
@@ -1071,11 +1088,45 @@ k = 100
 simulated_terminations_intervention <- mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_intervention)))
 simulated_terminations_base <-mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_default)))
 
-simulated_percentages <- (simulated_terminations_intervention - simulated_terminations_base)/k
+simulated_percentages <- (simulated_terminations_intervention - simulated_terminations_base)
 ```
 
 The simulation above shows that this results in a percentage-point change in terminations of about 
-`r simulated_percentages * 100`.
+`{r} simulated_percentages * 100`.
+
+
+```{r}
+distdiff_by_group <- pddf_ib %>% 
+    group_by(category_name, category) %>%
+    summarize(
+        mean = mean(value, na.rm=TRUE) *100
+        ,"P(p>0)" = mean(value>0) *100
+        ,"5%"    = quantile( value, 0.05, na.rm = TRUE)*100
+        ,"25%"   = quantile( value, 0.25, na.rm = TRUE)*100
+        ,"median" = quantile( value, 0.5, na.rm = TRUE)*100
+        ,"75%"   = quantile( value, 0.75, na.rm = TRUE)*100
+        ,"95%"   = quantile( value, 0.95, na.rm = TRUE)*100
+    ) %>% arrange(category) %>% select(!category)
+    
+distdif_by_group_latex<- xtable(distdiff_by_group,
+    digits = rep(2, ncol(distdiff_by_group) +1),
+    floating = FALSE,
+    align = "llccccccc",
+    latex.environments = "tabularx",
+    booktabs = TRUE
+)
+
+# Write to file
+write_lines(
+    print(distdif_by_group_latex, include.rownames = FALSE),
+    paste0(image_root,"/distdiff_anr_vs_rec_by_group.tex")
+)
+
+print(distdif_by_group_latex)
+distdiff_by_group
+```
+
+
 
 ## fixed effects distributions
 
@@ -1090,7 +1141,7 @@ mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
             subtitle = "Moving from 'Active, not recruiting' to 'Recruiting'"
         ) +
     geom_vline(xintercept=seq(-0.25,0.5,0.25),color="grey",alpha=0.750)  
-ggsave(paste0(image_parameters_across_groups,"/fixed_effects_anr_vs_rec_by_group.png"))
+ggsave(paste0(image_parameters_across_groups,"/fixed_effects_anr_vs_rec_by_group.png"), create.dir = TRUE)
     
 fixed_effects_table <- pivot_longer(filtdata, everything()) |> 
     group_by(name) |> 
@@ -1118,7 +1169,7 @@ names(fixed_effects_table)[1] <- "ICD-10 Category"
 
 # Convert to LaTeX
 table <- xtable(fixed_effects_table,
-    digits = rep(3, ncol(fixed_effects_table) + 1),
+    digits = rep(2, ncol(fixed_effects_table) + 1),
     floating = FALSE,
     align = "lccccccccccc",
     latex.environments = "tabularx",
@@ -1141,12 +1192,13 @@ write_lines(
 # Diagnostics
 
 ```{r}
-#| eval: true
+#| label: diagnostics 1
+#| eval: false
 #trace plots
 image_diagnostics <- paste0(image_root,"/diagnostics")
 
 plot(fit, pars=c("mu"), plotfun="trace")
-ggsave(paste0(image_diagnostics,"/trace_plot_mu.png"))
+ggsave(paste0(image_diagnostics,"/trace_plot_mu.png"), create.dir = TRUE)
 
 
 for (i in 1:3) {
@@ -1165,14 +1217,15 @@ for (i in 1:3) {
     )
     mu_range <- paste0(4*i-3,"-",4*i)
     filename <- paste0(image_diagnostics,"/trace_rank_plot_mu_",mu_range,".png")
-    ggsave(filename)
+    ggsave(filename, create.dir = TRUE)
 }
 ```
 
 ```{r}
-#| eval: true
+#| label: diagnostics 2
+#| eval: false
 plot(fit, pars=c("sigma"), plotfun="trace")
-ggsave(paste0(image_diagnostics,"/traceplot_sigma.png"))
+ggsave(paste0(image_diagnostics,"/traceplot_sigma.png"), create.dir = TRUE)
 
 for (i in 1:3) {
     print(
@@ -1190,12 +1243,13 @@ for (i in 1:3) {
     )
     sigma_range <- paste0(4*i-3,"-",4*i)
     filename <- paste0(image_diagnostics,"/trace_rank_plot_sigma_",sigma_range,".png")
-    ggsave(filename)
+    ggsave(filename, create.dir = TRUE)
 }
 ```
 
 ```{r}
-#| eval: true
+#| label: diagnostics 3
+#| eval: false
 #other diagnostics
 logpost <- log_posterior(fit)
 nuts_prmts <- nuts_params(fit)
@@ -1204,15 +1258,17 @@ posterior <- as.array(fit)
 ```
 
 ```{r}
-#| eval: true
+#| label: diagnostics 4
+#| eval: false
 color_scheme_set("darkgray")
 div_style <- parcoord_style_np(div_color = "green", div_size = 0.05, div_alpha = 0.4)
 mcmc_parcoord(posterior, regex_pars = "mu", np=nuts_prmts, np_style = div_style, alpha = 0.05)
-ggsave(paste0(image_diagnostics,"/parcoord_mu.png"))
+ggsave(paste0(image_diagnostics,"/parcoord_mu.png"), create.dir = TRUE)
 ```
 
 ```{r}
-#| eval: true
+#| label: diagnostics 5
+#| eval: false
 for (i in 1:3) {
     mus = sapply(3:0, function(j) paste0("mu[",4*i-j ,"]"))
     print(
@@ -1228,7 +1284,7 @@ for (i in 1:3) {
     )
     mu_range <- paste0(4*i-3,"-",4*i)
     filename <- paste0(image_diagnostics,"/correlation_plot_mu_",mu_range,".png")
-    ggsave(filename)
+    ggsave(filename, create.dir = TRUE)
 }
 
 
@@ -1236,13 +1292,15 @@ for (i in 1:3) {
 ```
 
 ```{r}
-#| eval: true
+#| label: diagnostics 6
+#| eval: false
 mcmc_parcoord(posterior,regex_pars = "sigma", np=nuts_prmts, alpha=0.05)
-ggsave(paste0(image_diagnostics,"/parcoord_sigma.png"))
+ggsave(paste0(image_diagnostics,"/parcoord_sigma.png"), create.dir = TRUE)
 ```
 
 ```{r}
-#| eval: true
+#| label: diagnostics 7
+#| eval: false
 
 for (i in 1:3) {
     params = sapply(3:0, function(j) paste0("sigma[",4*i-j ,"]"))
@@ -1259,13 +1317,14 @@ for (i in 1:3) {
     )
     sigma_range <- paste0(4*i-3,"-",4*i)
     filename <- paste0(image_diagnostics,"/correlation_plot_sigma_",sigma_range,".png")
-    ggsave(filename)
+    ggsave(filename, create.dir = TRUE)
 }
 ```
 
 
 ```{r}
-#| eval: true
+#| eval: false
+#| label: diagnostics 8
 for (k in 1:22) {
 for (i in 1:3) {
     params = sapply(3:0, function(j) paste0("beta[",k,",",4*i-j ,"]"))
@@ -1283,7 +1342,7 @@ for (i in 1:3) {
     
     beta_range <- paste0("k_",k,"_i_",4*i-3,"-",4*i)
     filename <- paste0(image_diagnostics,"/correlation_plot_beta_",beta_range,".png")
-    ggsave(filename)
+    ggsave(filename, create.dir = TRUE)
 }}
 ```