From d753db81c709ec53bbbe8bfbfa4153769051ddd6 Mon Sep 17 00:00:00 2001 From: Will King Date: Sat, 21 Mar 2026 15:20:44 -0700 Subject: [PATCH] Changes to enrollment delay without ICD10Group fix I wanted to fix the ICD-10 groups with a record of other changes I made. --- r-analysis/EffectsOfEnrollmentDelay.qmd | 161 ++++++++++++++++-------- 1 file changed, 110 insertions(+), 51 deletions(-) diff --git a/r-analysis/EffectsOfEnrollmentDelay.qmd b/r-analysis/EffectsOfEnrollmentDelay.qmd index 5df7ec3..10d021d 100644 --- a/r-analysis/EffectsOfEnrollmentDelay.qmd +++ b/r-analysis/EffectsOfEnrollmentDelay.qmd @@ -8,6 +8,8 @@ editor: source # Setup + + ```{r} library(knitr) library(bayesplot) @@ -37,10 +39,15 @@ image_parameters_by_groups <-paste0(image_root,"/betas/parameters_by_group") image_parameters_across_groups <-paste0(image_root,"/betas/parameter_across_groups") ``` +run on `{r} now(tz='UTC')` + + + ```{r} ################ Pull data from database ###################### library(RPostgreSQL) -host <- 'aact_db-restored-2025-01-07' +#host <-'aact_db-restored-2025-01-07' +host <- '10.89.0.6' driver <- dbDriver("PostgreSQL") @@ -431,6 +438,7 @@ counterfact_delay <- list( ``` ```{r} +#| label: Fitting fit <- stan( file='Hierarchal_Logistic.stan', data = counterfact_delay, @@ -493,7 +501,7 @@ ggplot(data=df3, aes(x=count, fill=final_status)) + geom_histogram(binwidth=1) + ggtitle("Histogram of snapshots per trial (matched trials)") + xlab("Snapshots per trial") -ggsave(paste0(image_trial_details,"/HistSnapshots.png")) +ggsave(paste0(image_trial_details,"/HistSnapshots.png"), create.dir = TRUE) #Plot duration for terminated vs completed df4 <- dbGetQuery( @@ -516,7 +524,7 @@ ggplot(data=df4, aes(x=duration,fill=overall_status)) + ggtitle("Histogram of trial durations") + xlab("duration")+ facet_wrap(~overall_status) -ggsave(paste0(image_trial_details,"/HistTrialDurations_Faceted.png")) +ggsave(paste0(image_trial_details,"/HistTrialDurations_Faceted.png"), create.dir = TRUE) df5 <- dbGetQuery( con, @@ -547,7 +555,7 @@ ggplot(data=df5, aes(x=duration,y=snapshot_count,color=overall_status)) + ggtitle("Comparison of duration, status, and snapshot_count") + xlab("duration") + ylab("snapshot count") -ggsave(paste0(image_trial_details,"/SnapshotsVsDurationVsTermination.png")) +ggsave(paste0(image_trial_details,"/SnapshotsVsDurationVsTermination.png"), create.dir = TRUE) dbDisconnect(con) @@ -563,7 +571,7 @@ ggplot(data = group_trials_by_category, aes(x=category_id)) + ,x="Category ID" ,y="Count" ) -ggsave(paste0(image_trial_details,"/CategoryCounts.png")) +ggsave(paste0(image_trial_details,"/CategoryCounts.png"), create.dir = TRUE) @@ -573,8 +581,16 @@ cor_dur_count <- cor(df5$duration,df5$snapshot_count) count_snapshots <- sum(df5$snapshot_count) ``` -the correlation value is `r cor_dur_count` between duration and snapshot count. -There are `r count_snapshots` snapshots in total. +the correlation value is `{r} cor_dur_count` between duration and snapshot count. +There are `{r} count_snapshots` snapshots in total, spread over +`{r} length(df5%nct_id)` trials. + +The number of categories by trial are +TODO: add in category names, +```{r} +group_trials_by_category |> count(category_id) +``` + @@ -595,7 +611,7 @@ for (i in category_count$category_id[category_count$n >= 0]) { gi <- group_mcmc_areas("beta",beta_list,fit,i) #add way to filter groups ggsave( paste0(image_parameters_by_groups,"/group_",i,"_",gi$name,".png") - ,plot=gi$plot + ,plot=gi$plot, create.dir = TRUE ) gx <- c(gx,gi) @@ -625,7 +641,7 @@ for (i in c(1,2,3,9,10,11,12)) { pi <- parameter_mcmc_areas("beta",beta_list,fit,i) #add way to filter groups ggsave( paste0(image_parameters_across_groups,"/parameters_",i,"_",pi$name,".png") - ,plot=pi$plot + ,plot=pi$plot, create.dir = TRUE ) px <- c(px,pi) @@ -685,7 +701,7 @@ ggplot(df_ib_p, aes(x=p_prior)) + ,x="Probability Domain 'p'" ,y="Probability Density" ) -ggsave(paste0(image_dist_diff_analysis,"/prior_p.png")) +ggsave(paste0(image_dist_diff_analysis,"/prior_p.png"), create.dir = TRUE) #p_posterior ggplot(df_ib_p, aes(x=p_predicted)) + @@ -696,7 +712,7 @@ ggplot(df_ib_p, aes(x=p_predicted)) + ,x="Probability Domain 'p'" ,y="Probability Density" ) -ggsave(paste0(image_dist_diff_analysis,"/posterior_p.png")) +ggsave(paste0(image_dist_diff_analysis,"/posterior_p.png"), create.dir = TRUE) #mu_prior ggplot(df_ib_prior) + @@ -707,7 +723,7 @@ ggplot(df_ib_prior) + ,x="Mu" ,y="Probability" ) -ggsave(paste0(image_dist_diff_analysis,"/prior_mu.png")) +ggsave(paste0(image_dist_diff_analysis,"/prior_mu.png"), create.dir = TRUE) #sigma_posterior ggplot(df_ib_prior) + @@ -718,7 +734,7 @@ ggplot(df_ib_prior) + ,x="Sigma" ,y="Probability" ) -ggsave(paste0(image_dist_diff_analysis,"/prior_sigma.png")) +ggsave(paste0(image_dist_diff_analysis,"/prior_sigma.png"), create.dir = TRUE) ``` @@ -726,7 +742,7 @@ ggsave(paste0(image_dist_diff_analysis,"/prior_sigma.png")) -### Intervention: Delay close of enrollment +# Intervention: Delay close of enrollment ```{r} counterfact_predicted_ib <- data.frame( @@ -744,7 +760,7 @@ ggplot(counterfact_predicted_ib, aes(x=p_predicted_default)) + ,x="Probability Domain 'p'" ,y="Probability Density" ) -ggsave(paste0(image_dist_diff_analysis,"/p_no_intervention.png")) +ggsave(paste0(image_dist_diff_analysis,"/p_no_intervention.png"), create.dir = TRUE) ggplot(counterfact_predicted_ib, aes(x=p_predicted_intervention)) + geom_density() + @@ -754,7 +770,7 @@ ggplot(counterfact_predicted_ib, aes(x=p_predicted_intervention)) + ,x="Probability Domain 'p'" ,y="Probability Density" ) -ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention.png")) +ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention.png"), create.dir = TRUE) ggplot(counterfact_predicted_ib, aes(x=predicted_difference)) + geom_density() + @@ -764,7 +780,7 @@ ggplot(counterfact_predicted_ib, aes(x=predicted_difference)) + ,x="Difference in 'p' under treatment" ,y="Probability Density" ) -ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_1.png")) +ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_1.png"), create.dir = TRUE) ``` @@ -805,7 +821,7 @@ ggplot(pddf_ib, aes(x=value,)) + ,y = "Probability Density" ) + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") -ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_styled.png")) +ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_styled.png"), create.dir = TRUE) ``` @@ -827,7 +843,7 @@ ggplot(pddf_ib, aes(x=value,)) + ) + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + theme(strip.text.x = element_text(size = 8)) -ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_by_group.png")) +ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_by_group.png"), create.dir = TRUE) ``` @@ -849,7 +865,7 @@ ggplot(pddf_ib, aes(x=value,)) + ) + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + theme(strip.text.x = element_text(size = 8)) -ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_histdiff_by_group.png")) +ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_histdiff_by_group.png"), create.dir = TRUE) ``` @@ -873,6 +889,7 @@ stats <- list( q1 = quantile(pddf_ib$value, 0.25), med = median(pddf_ib$value), mean = mean(pddf_ib$value), + stdev = sd(pddf_ib$value), q3 = quantile(pddf_ib$value, 0.75), p90 = quantile(pddf_ib$value, 0.90), p95 = quantile(pddf_ib$value, 0.95), @@ -928,7 +945,7 @@ p3 + x = stats$mean, y = stats$y_offset * 1.5 ), aes(x = x, y = y)) -ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_histdiff_boxplot.png")) +ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_histdiff_boxplot.png"), create.dir = TRUE) ``` @@ -988,7 +1005,7 @@ p4 <- ggplot(pddf_ib, aes(x = value)) + x = stats$mean, y = stats$y_offset_density * 1.5 ), aes(x = x, y = y)) -ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_boxplot.png")) +ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_distdiff_boxplot.png"), create.dir = TRUE) p4 ``` @@ -1003,7 +1020,7 @@ p4 y = "Cumulative Proportion" ) -ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_cumulative_distdiff.png")) +ggsave(paste0(image_dist_diff_analysis,"/p_delay_intervention_cumulative_distdiff.png"), create.dir = TRUE) ``` @@ -1018,18 +1035,18 @@ below_spike_band <- mean( pddf_ib$value <= -width/2) # get mass above and mass below zero mass_below_zero <- mean( pddf_ib$value <= 0) ``` -Looking at the spike around zero, we find that `r spike_band_centered_zero*100`% +Looking at the spike around zero, we find that `{r} spike_band_centered_zero*100`% of the probability mass is contained within the band from -[`r -width*100/2`,`r width*100/2`]. -Additionally, there was `r above_spike_band*100`% of the probability above that +[`{r} -width/2`,`{r} width/2`]. +Additionally, there was `{r} above_spike_band*100`% of the probability above that -- representing those with a general increase in the probability of termination -- -and `r below_spike_band*100`% of the probability mass below the band +and `{r} below_spike_band*100`% of the probability mass below the band -- representing a decrease in the probability of termination. On average, if you keep the trial open instead of closing it, -`r mass_below_zero`% of trials will see a decrease in the probability of termination, +`{r} mass_below_zero * 100`% of trials will see a decrease in the probability of termination, but, due to the high increase in probability of termination given termination was increased, -the mean probability of termination increases by `r stats$mean`. +the mean probability of termination increases by `{r} stats$mean (stats$stdev)`. ```{r} # 5%-iles @@ -1063,7 +1080,7 @@ kable(quant_df) proportion_increase <- mean(pddf_ib$value >= 0) ``` -about `r proportion_increase * 100` percent probability increase in the probability of terminations +about `{r} proportion_increase * 100` percent probability increase in the probability of terminations ```{r} n = length(counterfact_predicted_ib$p_predicted_intervention) @@ -1071,11 +1088,45 @@ k = 100 simulated_terminations_intervention <- mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_intervention))) simulated_terminations_base <-mean(rbinom(n,k,as.vector(counterfact_predicted_ib$p_predicted_default))) -simulated_percentages <- (simulated_terminations_intervention - simulated_terminations_base)/k +simulated_percentages <- (simulated_terminations_intervention - simulated_terminations_base) ``` The simulation above shows that this results in a percentage-point change in terminations of about -`r simulated_percentages * 100`. +`{r} simulated_percentages * 100`. + + +```{r} +distdiff_by_group <- pddf_ib %>% + group_by(category_name, category) %>% + summarize( + mean = mean(value, na.rm=TRUE) *100 + ,"P(p>0)" = mean(value>0) *100 + ,"5%" = quantile( value, 0.05, na.rm = TRUE)*100 + ,"25%" = quantile( value, 0.25, na.rm = TRUE)*100 + ,"median" = quantile( value, 0.5, na.rm = TRUE)*100 + ,"75%" = quantile( value, 0.75, na.rm = TRUE)*100 + ,"95%" = quantile( value, 0.95, na.rm = TRUE)*100 + ) %>% arrange(category) %>% select(!category) + +distdif_by_group_latex<- xtable(distdiff_by_group, + digits = rep(2, ncol(distdiff_by_group) +1), + floating = FALSE, + align = "llccccccc", + latex.environments = "tabularx", + booktabs = TRUE +) + +# Write to file +write_lines( + print(distdif_by_group_latex, include.rownames = FALSE), + paste0(image_root,"/distdiff_anr_vs_rec_by_group.tex") +) + +print(distdif_by_group_latex) +distdiff_by_group +``` + + ## fixed effects distributions @@ -1090,7 +1141,7 @@ mcmc_areas(filtdata,prob = 0.8, prob_outer = 0.95) + subtitle = "Moving from 'Active, not recruiting' to 'Recruiting'" ) + geom_vline(xintercept=seq(-0.25,0.5,0.25),color="grey",alpha=0.750) -ggsave(paste0(image_parameters_across_groups,"/fixed_effects_anr_vs_rec_by_group.png")) +ggsave(paste0(image_parameters_across_groups,"/fixed_effects_anr_vs_rec_by_group.png"), create.dir = TRUE) fixed_effects_table <- pivot_longer(filtdata, everything()) |> group_by(name) |> @@ -1118,7 +1169,7 @@ names(fixed_effects_table)[1] <- "ICD-10 Category" # Convert to LaTeX table <- xtable(fixed_effects_table, - digits = rep(3, ncol(fixed_effects_table) + 1), + digits = rep(2, ncol(fixed_effects_table) + 1), floating = FALSE, align = "lccccccccccc", latex.environments = "tabularx", @@ -1141,12 +1192,13 @@ write_lines( # Diagnostics ```{r} -#| eval: true +#| label: diagnostics 1 +#| eval: false #trace plots image_diagnostics <- paste0(image_root,"/diagnostics") plot(fit, pars=c("mu"), plotfun="trace") -ggsave(paste0(image_diagnostics,"/trace_plot_mu.png")) +ggsave(paste0(image_diagnostics,"/trace_plot_mu.png"), create.dir = TRUE) for (i in 1:3) { @@ -1165,14 +1217,15 @@ for (i in 1:3) { ) mu_range <- paste0(4*i-3,"-",4*i) filename <- paste0(image_diagnostics,"/trace_rank_plot_mu_",mu_range,".png") - ggsave(filename) + ggsave(filename, create.dir = TRUE) } ``` ```{r} -#| eval: true +#| label: diagnostics 2 +#| eval: false plot(fit, pars=c("sigma"), plotfun="trace") -ggsave(paste0(image_diagnostics,"/traceplot_sigma.png")) +ggsave(paste0(image_diagnostics,"/traceplot_sigma.png"), create.dir = TRUE) for (i in 1:3) { print( @@ -1190,12 +1243,13 @@ for (i in 1:3) { ) sigma_range <- paste0(4*i-3,"-",4*i) filename <- paste0(image_diagnostics,"/trace_rank_plot_sigma_",sigma_range,".png") - ggsave(filename) + ggsave(filename, create.dir = TRUE) } ``` ```{r} -#| eval: true +#| label: diagnostics 3 +#| eval: false #other diagnostics logpost <- log_posterior(fit) nuts_prmts <- nuts_params(fit) @@ -1204,15 +1258,17 @@ posterior <- as.array(fit) ``` ```{r} -#| eval: true +#| label: diagnostics 4 +#| eval: false color_scheme_set("darkgray") div_style <- parcoord_style_np(div_color = "green", div_size = 0.05, div_alpha = 0.4) mcmc_parcoord(posterior, regex_pars = "mu", np=nuts_prmts, np_style = div_style, alpha = 0.05) -ggsave(paste0(image_diagnostics,"/parcoord_mu.png")) +ggsave(paste0(image_diagnostics,"/parcoord_mu.png"), create.dir = TRUE) ``` ```{r} -#| eval: true +#| label: diagnostics 5 +#| eval: false for (i in 1:3) { mus = sapply(3:0, function(j) paste0("mu[",4*i-j ,"]")) print( @@ -1228,7 +1284,7 @@ for (i in 1:3) { ) mu_range <- paste0(4*i-3,"-",4*i) filename <- paste0(image_diagnostics,"/correlation_plot_mu_",mu_range,".png") - ggsave(filename) + ggsave(filename, create.dir = TRUE) } @@ -1236,13 +1292,15 @@ for (i in 1:3) { ``` ```{r} -#| eval: true +#| label: diagnostics 6 +#| eval: false mcmc_parcoord(posterior,regex_pars = "sigma", np=nuts_prmts, alpha=0.05) -ggsave(paste0(image_diagnostics,"/parcoord_sigma.png")) +ggsave(paste0(image_diagnostics,"/parcoord_sigma.png"), create.dir = TRUE) ``` ```{r} -#| eval: true +#| label: diagnostics 7 +#| eval: false for (i in 1:3) { params = sapply(3:0, function(j) paste0("sigma[",4*i-j ,"]")) @@ -1259,13 +1317,14 @@ for (i in 1:3) { ) sigma_range <- paste0(4*i-3,"-",4*i) filename <- paste0(image_diagnostics,"/correlation_plot_sigma_",sigma_range,".png") - ggsave(filename) + ggsave(filename, create.dir = TRUE) } ``` ```{r} -#| eval: true +#| eval: false +#| label: diagnostics 8 for (k in 1:22) { for (i in 1:3) { params = sapply(3:0, function(j) paste0("beta[",k,",",4*i-j ,"]")) @@ -1283,7 +1342,7 @@ for (i in 1:3) { beta_range <- paste0("k_",k,"_i_",4*i-3,"-",4*i) filename <- paste0(image_diagnostics,"/correlation_plot_beta_",beta_range,".png") - ggsave(filename) + ggsave(filename, create.dir = TRUE) }} ```