Changes to enrollment delay without ICD10Group fix

I wanted to fix the ICD-10 groups with a record of other changes I made.
4 months ago · d753db81c7
parent c9ce21a815
commit d753db81c7
1 changed files with 110 additions and 51 deletions
--- a/r-analysis/EffectsOfEnrollmentDelay.qmd
+++ b/r-analysis/EffectsOfEnrollmentDelay.qmd
@ -8,6 +8,8 @@ editor: source

 # Setup

+
+
 ```{r}
 library(knitr)
 library(bayesplot)
@ -37,10 +39,15 @@ image_parameters_by_groups <-paste0(image_root,"/betas/parameters_by_group")
 image_parameters_across_groups <-paste0(image_root,"/betas/parameter_across_groups")
 ```

+run on `{r} now(tz='UTC')`
+
+
+
 ```{r}
 ################ Pull data from database ######################
 library(RPostgreSQL)
-host <- 'aact_db-restored-2025-01-07'
+#host <-'aact_db-restored-2025-01-07'
+host <- '10.89.0.6'

 driver <- dbDriver("PostgreSQL")

@ -431,6 +438,7 @@ counterfact_delay <- list(
 ```

 ```{r}
+#| label: Fitting
 fit <- stan(
    file='Hierarchal_Logistic.stan', 
    data = counterfact_delay,
@ -493,7 +501,7 @@ ggplot(data=df3, aes(x=count, fill=final_status)) +
    geom_histogram(binwidth=1) +
    ggtitle("Histogram of snapshots per trial (matched trials)") +
    xlab("Snapshots per trial")
-ggsave(paste0(image_trial_details,"/HistSnapshots.png"))
+ggsave(paste0(image_trial_details,"/HistSnapshots.png"), create.dir = TRUE)

 #Plot duration for terminated vs completed
 df4 <- dbGetQuery(
@ -516,7 +524,7 @@ ggplot(data=df4, aes(x=duration,fill=overall_status)) +
    ggtitle("Histogram of trial durations") +
    xlab("duration")+
    facet_wrap(~overall_status)
-ggsave(paste0(image_trial_details,"/HistTrialDurations_Faceted.png"))
+ggsave(paste0(image_trial_details,"/HistTrialDurations_Faceted.png"), create.dir = TRUE)

 df5 <- dbGetQuery(
    con,
@ -547,7 +555,7 @@ ggplot(data=df5, aes(x=duration,y=snapshot_count,color=overall_status)) +
    ggtitle("Comparison of duration, status, and snapshot_count") +
    xlab("duration") +
    ylab("snapshot count") 
-ggsave(paste0(image_trial_details,"/SnapshotsVsDurationVsTermination.png"))
+ggsave(paste0(image_trial_details,"/SnapshotsVsDurationVsTermination.png"), create.dir = TRUE)

 dbDisconnect(con)

@ -563,7 +571,7 @@ ggplot(data = group_trials_by_category, aes(x=category_id)) +
        ,x="Category ID"
        ,y="Count"
    )
-ggsave(paste0(image_trial_details,"/CategoryCounts.png"))
+ggsave(paste0(image_trial_details,"/CategoryCounts.png"), create.dir = TRUE)
    


@ -573,8 +581,16 @@ cor_dur_count <- cor(df5$duration,df5$snapshot_count)
 count_snapshots <- sum(df5$snapshot_count)
 ```

-the correlation value is `r cor_dur_count` between duration and snapshot count.
-There are `r count_snapshots` snapshots in total.
+the correlation value is `{r} cor_dur_count` between duration and snapshot count.
+There are `{r} count_snapshots` snapshots in total, spread over 
+`{r} length(df5%nct_id)` trials.
+
+The number of categories by trial are
+TODO: add in category names, 
+```{r}
+group_trials_by_category |> count(category_id)
+```
+



@ -595,7 +611,7 @@ for (i in category_count$category_id[category_count$n >= 0]) {
    gi <- group_mcmc_areas("beta",beta_list,fit,i) #add way to filter groups
    ggsave(
        paste0(image_parameters_by_groups,"/group_",i,"_",gi$name,".png")
-        ,plot=gi$plot
+        ,plot=gi$plot, create.dir = TRUE
        )
    gx <- c(gx,gi)

@ -625,7 +641,7 @@ for (i in c(1,2,3,9,10,11,12)) {
    pi <- parameter_mcmc_areas("beta",beta_list,fit,i) #add way to filter groups
    ggsave(
        paste0(image_parameters_across_groups,"/parameters_",i,"_",pi$name,".png")
-        ,plot=pi$plot
+        ,plot=pi$plot, create.dir = TRUE
        )
    px <- c(px,pi)

@ -685,7 +701,7 @@ ggplot(df_ib_p, aes(x=p_prior)) +
        ,x="Probability Domain 'p'"
        ,y="Probability Density"
    )
-ggsave(paste0(image_dist_diff_analysis,"/prior_p.png"))
+ggsave(paste0(image_dist_diff_analysis,"/prior_p.png"), create.dir = TRUE)

 #p_posterior
 ggplot(df_ib_p, aes(x=p_predicted)) +
@ -696,7 +712,7 @@ ggplot(df_ib_p, aes(x=p_predicted)) +
        ,x="Probability Domain 'p'"
        ,y="Probability Density"
    )
-ggsave(paste0(image_dist_diff_analysis,"/posterior_p.png"))
+ggsave(paste0(image_dist_diff_analysis,"/posterior_p.png"), create.dir = TRUE)

 #mu_prior
 ggplot(df_ib_prior) +
@ -707,7 +723,7 @@ ggplot(df_ib_prior) +
        ,x="Mu"
        ,y="Probability"
    )
-ggsave(paste0(image_dist_diff_analysis,"/prior_mu.png"))
+ggsave(paste0(image_dist_diff_analysis,"/prior_mu.png"), create.dir = TRUE)

 #sigma_posterior
 ggplot(df_ib_prior) +
@ -718,7 +734,7 @@ ggplot(df_ib_prior) +
        ,x="Sigma"
        ,y="Probability"
    )
-ggsave(paste0(image_dist_diff_analysis,"/prior_sigma.png"))
+ggsave(paste0(image_dist_diff_analysis,"/prior_sigma.png"), create.dir = TRUE)
 ```


@ -726,7 +742,7 @@ ggsave(paste0(image_dist_diff_analysis,"/prior_sigma.png"))



-### Intervention: Delay close of enrollment
+# Intervention: Delay close of enrollment

 ```{r}
 counterfact_predicted_ib <- data.frame(
@ -744,7 +760,7 @@ ggplot(counterfact_predicted_ib, aes(x=p_predicted_default)) +
        ,x="Probability Domain 'p'"
        ,y="Probability Density"
    )
-ggsave(paste0(image_dist_diff_analysis,"/p_no_intervention.png"))
+ggsave(paste0(image_dist_diff_analysis,"/p_no_intervention.png"), create.dir = TRUE)

 ggplot(counterfact_predicted_ib, aes(x=p_predicted_intervention)) +
    geom_density() + 
@ -754,7 +770,7 @@ ggplot(counterfact_predicted_ib, aes(x=p_predicted_intervention)) +
        ,x="Probability Domain 'p'"
        ,y="Probability Density"
    )
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention.png"))
+ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention.png"), create.dir = TRUE)

 ggplot(counterfact_predicted_ib, aes(x=predicted_difference)) +
    geom_density() + 
@ -764,7 +780,7 @@ ggplot(counterfact_predicted_ib, aes(x=predicted_difference)) +
        ,x="Difference in 'p' under treatment"
        ,y="Probability Density"
    )
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_1.png"))
+ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_1.png"), create.dir = TRUE)
 ```


@ -805,7 +821,7 @@ ggplot(pddf_ib, aes(x=value,)) +
        ,y = "Probability Density"
    ) + 
    geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") 
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_styled.png"))
+ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_styled.png"), create.dir = TRUE)
 ```


@ -827,7 +843,7 @@ ggplot(pddf_ib, aes(x=value,)) +
    ) + 
    geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
    theme(strip.text.x = element_text(size = 8))
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_by_group.png"))
+ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_by_group.png"), create.dir = TRUE)
 ```


@ -849,7 +865,7 @@ ggplot(pddf_ib, aes(x=value,)) +
    ) + 
    geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +
    theme(strip.text.x = element_text(size = 8))
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_histdiff_by_group.png"))
+ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_histdiff_by_group.png"), create.dir = TRUE)
 ```


@ -873,6 +889,7 @@ stats <- list(
 q1 = quantile(pddf_ib$value, 0.25),
 med = median(pddf_ib$value),
 mean = mean(pddf_ib$value),
+ stdev = sd(pddf_ib$value),
 q3 = quantile(pddf_ib$value, 0.75),
 p90 = quantile(pddf_ib$value, 0.90),
 p95 = quantile(pddf_ib$value, 0.95),
@ -928,7 +945,7 @@ p3 +
   x = stats$mean,
   y = stats$y_offset * 1.5
 ), aes(x = x, y = y))
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_histdiff_boxplot.png"))
+ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_histdiff_boxplot.png"), create.dir = TRUE)
 ```


@ -988,7 +1005,7 @@ p4 <- ggplot(pddf_ib, aes(x = value)) +
   x = stats$mean,
   y = stats$y_offset_density * 1.5
 ), aes(x = x, y = y))
-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_boxplot.png"))
+ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_boxplot.png"), create.dir = TRUE)

 p4
 ```
@ -1003,7 +1020,7 @@ p4
        y = "Cumulative Proportion"
    ) 

-ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_cumulative_distdiff.png"))
+ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_cumulative_distdiff.png"), create.dir = TRUE)
 ```


@ -1018,18 +1035,18 @@ below_spike_band <- mean( pddf_ib$value <= -width/2)
 # get mass above and mass below zero
 mass_below_zero <- mean( pddf_ib$value <= 0)
 ```
-Looking at the spike around zero, we find that `r spike_band_centered_zero*100`% 
+Looking at the spike around zero, we find that `{r} spike_band_centered_zero*100`% 
 of the probability mass is contained within the band from 
-[`r -width*100/2`,`r width*100/2`].
-Additionally, there was `r above_spike_band*100`% of the probability above that 
+[`{r} -width/2`,`{r} width/2`].
+Additionally, there was `{r} above_spike_band*100`% of the probability above that 
 -- representing those with a general increase in the probability of termination --
-and `r below_spike_band*100`% of the probability mass below the band
+and `{r} below_spike_band*100`% of the probability mass below the band
 -- representing a decrease in the probability of termination.

 On average, if you keep the trial open instead of closing it,
-`r mass_below_zero`% of trials will see a decrease in the probability of termination,
+`{r} mass_below_zero * 100`% of trials will see a decrease in the probability of termination,
 but, due to the high increase in probability of termination given termination was increased,
-the mean probability of termination increases by `r stats$mean`.
+the mean probability of termination increases by `{r} stats$mean (stats$stdev)`.

 ```{r}
 # 5%-iles
@ -1063,7 +1080,7 @@ kable(quant_df)
 proportion_increase <- mean(pddf_ib$value >= 0)
 ```

-about `r proportion_increase * 100` percent probability increase in the probability of terminations
+about `{r} proportion_increase * 100` percent probability increase in the probability of terminations

 ```{r}
 n = length(counterfact_predicted_ib$p_predicted_intervention)
@ -1071,11 +1088,45 @@ k = 100
 simulated_terminations_intervention <- mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_intervention)))
 simulated_terminations_base <-mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_default)))

-simulated_percentages <- (simulated_terminations_intervention - simulated_terminations_base)/k
+simulated_percentages <- (simulated_terminations_intervention - simulated_terminations_base)
 ```

 The simulation above shows that this results in a percentage-point change in terminations of about 
-`r simulated_percentages * 100`.
+`{r} simulated_percentages * 100`.
+
+
+```{r}
+distdiff_by_group <- pddf_ib %>% 
+    group_by(category_name, category) %>%
+    summarize(
+        mean = mean(value, na.rm=TRUE) *100
+        ,"P(p>0)" = mean(value>0) *100
+        ,"5%"    = quantile( value, 0.05, na.rm = TRUE)*100
+        ,"25%"   = quantile( value, 0.25, na.rm = TRUE)*100
+        ,"median" = quantile( value, 0.5, na.rm = TRUE)*100
+        ,"75%"   = quantile( value, 0.75, na.rm = TRUE)*100
+        ,"95%"   = quantile( value, 0.95, na.rm = TRUE)*100
+    ) %>% arrange(category) %>% select(!category)
+    
+distdif_by_group_latex<- xtable(distdiff_by_group,
+    digits = rep(2, ncol(distdiff_by_group) +1),
+    floating = FALSE,
+    align = "llccccccc",
+    latex.environments = "tabularx",
+    booktabs = TRUE
+)
+
+# Write to file
+write_lines(
+    print(distdif_by_group_latex, include.rownames = FALSE),
+    paste0(image_root,"/distdiff_anr_vs_rec_by_group.tex")
+)
+
+print(distdif_by_group_latex)
+distdiff_by_group
+```
+
+

 ## fixed effects distributions

@ -1090,7 +1141,7 @@ mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) +
            subtitle = "Moving from 'Active, not recruiting' to 'Recruiting'"
        ) +
    geom_vline(xintercept=seq(-0.25,0.5,0.25),color="grey",alpha=0.750)  
-ggsave(paste0(image_parameters_across_groups,"/fixed_effects_anr_vs_rec_by_group.png"))
+ggsave(paste0(image_parameters_across_groups,"/fixed_effects_anr_vs_rec_by_group.png"), create.dir = TRUE)
    
 fixed_effects_table <- pivot_longer(filtdata, everything()) |> 
    group_by(name) |> 
@ -1118,7 +1169,7 @@ names(fixed_effects_table)[1] <- "ICD-10 Category"

 # Convert to LaTeX
 table <- xtable(fixed_effects_table,
-    digits = rep(3, ncol(fixed_effects_table) + 1),
+    digits = rep(2, ncol(fixed_effects_table) + 1),
    floating = FALSE,
    align = "lccccccccccc",
    latex.environments = "tabularx",
@ -1141,12 +1192,13 @@ write_lines(
 # Diagnostics

 ```{r}
-#| eval: true
+#| label: diagnostics 1
+#| eval: false
 #trace plots
 image_diagnostics <- paste0(image_root,"/diagnostics")

 plot(fit, pars=c("mu"), plotfun="trace")
-ggsave(paste0(image_diagnostics,"/trace_plot_mu.png"))
+ggsave(paste0(image_diagnostics,"/trace_plot_mu.png"), create.dir = TRUE)


 for (i in 1:3) {
@ -1165,14 +1217,15 @@ for (i in 1:3) {
    )
    mu_range <- paste0(4*i-3,"-",4*i)
    filename <- paste0(image_diagnostics,"/trace_rank_plot_mu_",mu_range,".png")
-    ggsave(filename)
+    ggsave(filename, create.dir = TRUE)
 }
 ```

 ```{r}
-#| eval: true
+#| label: diagnostics 2
+#| eval: false
 plot(fit, pars=c("sigma"), plotfun="trace")
-ggsave(paste0(image_diagnostics,"/traceplot_sigma.png"))
+ggsave(paste0(image_diagnostics,"/traceplot_sigma.png"), create.dir = TRUE)

 for (i in 1:3) {
    print(
@ -1190,12 +1243,13 @@ for (i in 1:3) {
    )
    sigma_range <- paste0(4*i-3,"-",4*i)
    filename <- paste0(image_diagnostics,"/trace_rank_plot_sigma_",sigma_range,".png")
-    ggsave(filename)
+    ggsave(filename, create.dir = TRUE)
 }
 ```

 ```{r}
-#| eval: true
+#| label: diagnostics 3
+#| eval: false
 #other diagnostics
 logpost <- log_posterior(fit)
 nuts_prmts <- nuts_params(fit)
@ -1204,15 +1258,17 @@ posterior <- as.array(fit)
 ```

 ```{r}
-#| eval: true
+#| label: diagnostics 4
+#| eval: false
 color_scheme_set("darkgray")
 div_style <- parcoord_style_np(div_color = "green", div_size = 0.05, div_alpha = 0.4)
 mcmc_parcoord(posterior, regex_pars = "mu", np=nuts_prmts, np_style = div_style, alpha = 0.05)
-ggsave(paste0(image_diagnostics,"/parcoord_mu.png"))
+ggsave(paste0(image_diagnostics,"/parcoord_mu.png"), create.dir = TRUE)
 ```

 ```{r}
-#| eval: true
+#| label: diagnostics 5
+#| eval: false
 for (i in 1:3) {
    mus = sapply(3:0, function(j) paste0("mu[",4*i-j ,"]"))
    print(
@ -1228,7 +1284,7 @@ for (i in 1:3) {
    )
    mu_range <- paste0(4*i-3,"-",4*i)
    filename <- paste0(image_diagnostics,"/correlation_plot_mu_",mu_range,".png")
-    ggsave(filename)
+    ggsave(filename, create.dir = TRUE)
 }


@ -1236,13 +1292,15 @@ for (i in 1:3) {
 ```

 ```{r}
-#| eval: true
+#| label: diagnostics 6
+#| eval: false
 mcmc_parcoord(posterior,regex_pars = "sigma", np=nuts_prmts, alpha=0.05)
-ggsave(paste0(image_diagnostics,"/parcoord_sigma.png"))
+ggsave(paste0(image_diagnostics,"/parcoord_sigma.png"), create.dir = TRUE)
 ```

 ```{r}
-#| eval: true
+#| label: diagnostics 7
+#| eval: false

 for (i in 1:3) {
    params = sapply(3:0, function(j) paste0("sigma[",4*i-j ,"]"))
@ -1259,13 +1317,14 @@ for (i in 1:3) {
    )
    sigma_range <- paste0(4*i-3,"-",4*i)
    filename <- paste0(image_diagnostics,"/correlation_plot_sigma_",sigma_range,".png")
-    ggsave(filename)
+    ggsave(filename, create.dir = TRUE)
 }
 ```


 ```{r}
-#| eval: true
+#| eval: false
+#| label: diagnostics 8
 for (k in 1:22) {
 for (i in 1:3) {
    params = sapply(3:0, function(j) paste0("beta[",k,",",4*i-j ,"]"))
@ -1283,7 +1342,7 @@ for (i in 1:3) {
    
    beta_range <- paste0("k_",k,"_i_",4*i-3,"-",4*i)
    filename <- paste0(image_diagnostics,"/correlation_plot_beta_",beta_range,".png")
-    ggsave(filename)
+    ggsave(filename, create.dir = TRUE)
 }}
 ```