From 0d0225ecdb93960da54e68c9053ff86b806eb8e4 Mon Sep 17 00:00:00 2001 From: Will King Date: Fri, 6 Oct 2023 11:16:01 -0700 Subject: [PATCH] recording current model --- r-analysis/EffectsOfMarketConditions.qmd | 331 +++++++++++++++-------- 1 file changed, 215 insertions(+), 116 deletions(-) diff --git a/r-analysis/EffectsOfMarketConditions.qmd b/r-analysis/EffectsOfMarketConditions.qmd index 20d7688..7fef5f3 100644 --- a/r-analysis/EffectsOfMarketConditions.qmd +++ b/r-analysis/EffectsOfMarketConditions.qmd @@ -117,19 +117,10 @@ x["l_sdi_val"] <- asinh(df$l_sdi_val) #Setup fixed effects x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0) x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0) -#x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0) # Base case is Recruiting +x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0) x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0) -#interaction terms for competitors -#x["ib*elapsed"] <- x["elapsed_duration"]*x["identical_brands"] -#x["bnc*elapsed"] <- x["elapsed_duration"] * x["brand_name_counts"] - -#interaction terms for status effects -#x["sNYR*elapsed"] <- x["elapsed_duration"]*x["status_NYR"] -#x["sEBI*elapsed"] <- x["elapsed_duration"]*x["status_EBI"] -#x["sANR*elapsed"] <- x["elapsed_duration"]*x["status_ANR"] - y <- ifelse(df["final_status"]=="Terminated",1,0) #get category list @@ -166,6 +157,7 @@ inherited_cols <- c( ,"l_sdi_val" ,"status_NYR" ,"status_EBI" + ,"status_Rec" ,"status_ANR" ) ``` @@ -212,15 +204,8 @@ beta_list <- list( #Status `9`="status_NYR", `10`="status_EBI", - `11`="status_ANR", - #interactions for brands - `12`="ib*elapsed", - `13`="bnc*elapsed", - # interactions for status - `14`="sNYR*elapsed", - `15`="sEBI*elapsed", - `16`="sANR*elapsed" - + `11`="status_Rec", + `12`="status_ANR" ) ) @@ -344,19 +329,66 @@ generated_ib <- gqs( ) ``` +```{r} +df_ib_p <- data.frame( + p_prior=as.vector(extract(generated_ib, pars="p_prior")$p_prior) + ,p_predicted = as.vector(extract(generated_ib, pars="p_predicted")$p_predicted) +) +df_ib_prior <- data.frame( + mu_prior = as.vector(extract(generated_ib, pars="mu_prior")$mu_prior) + ,sigma_prior = as.vector(extract(generated_ib, pars="sigma_prior")$sigma_prior) +) -```{r} -hist(as.vector(extract(generated_ib, pars="p_prior")$p_prior)) -hist(as.vector(extract(generated_ib, pars="mu_prior")$mu_prior), ) -hist(as.vector(extract(generated_ib, pars="sigma_prior")$sigma_prior)) +#p_prior +ggplot(df_ib_p, aes(x=p_prior)) + + geom_density() + + labs( + title="Implied Prior Distribution P" + ,subtitle="" + ,x="Probability Domain 'p'" + ,y="Probability Density" + ) +ggsave("./Images/DirectEffects/prior_p.png") + +#p_posterior +ggplot(df_ib_p, aes(x=p_predicted)) + + geom_density() + + labs( + title="Implied Posterior Distribution P" + ,subtitle="" + ,x="Probability Domain 'p'" + ,y="Probability Density" + ) +ggsave("./Images/DirectEffects/posterior_p.png") + +#mu_prior +ggplot(df_ib_prior) + + geom_density(aes(x=mu_prior)) + + labs( + title="Prior - Mu" + ,subtitle="same prior for all Mu values" + ,x="Mu" + ,y="Probability" + ) +ggsave("./Images/DirectEffects/prior_mu.png") + +#sigma_posterior +ggplot(df_ib_prior) + + geom_density(aes(x=sigma_prior)) + + labs( + title="Prior - Sigma" + ,subtitle="same prior for all Sigma values" + ,x="Sigma" + ,y="Probability" + ) +ggsave("./Images/DirectEffects/prior_sigma.png") ``` ```{r} check_hmc_diagnostics(fit) -hist(as.vector(extract(generated_ib, pars="p_predicted")$p_predicted)) ``` @@ -365,13 +397,44 @@ hist(as.vector(extract(generated_ib, pars="p_predicted")$p_predicted)) ### Intervention: Adding a single competitor ```{r} -#TODO: Convert to ggplot, stabilize y axis -hist(as.vector(extract(generated_ib, pars="p_predicted_default")$p_predicted_default)) -hist(as.vector(extract(generated_ib, pars="p_predicted_intervention")$p_predicted_intervention)) -hist(as.vector(extract(generated_ib, pars="predicted_difference")$predicted_difference)) -``` +counterfact_predicted_ib <- data.frame( + p_predicted_default = as.vector(extract(generated_ib, pars="p_predicted_default")$p_predicted_default) + ,p_predicted_intervention = as.vector(extract(generated_ib, pars="p_predicted_intervention")$p_predicted_intervention) + ,predicted_difference = as.vector(extract(generated_ib, pars="predicted_difference")$predicted_difference) +) +ggplot(counterfact_predicted_ib, aes(x=p_predicted_default)) + + geom_density() + + labs( + title="Predicted Distribution of 'p'" + ,subtitle="Intervention: None" + ,x="Probability Domain 'p'" + ,y="Probability Density" + ) +ggsave("./Images/DirectEffects/default_p_generic_intervention_base.png") + +ggplot(counterfact_predicted_ib, aes(x=p_predicted_intervention)) + + geom_density() + + labs( + title="Predicted Distribution of 'p'" + ,subtitle="Intervention: Add a single generic competitor" + ,x="Probability Domain 'p'" + ,y="Probability Density" + ) +ggsave("./Images/DirectEffects/default_p_generic_intervention_interv.png") + +ggplot(counterfact_predicted_ib, aes(x=predicted_difference)) + + geom_density() + + labs( + title="Predicted Distribution of differences 'p'" + ,subtitle="Intervention: Add a single generic competitor" + ,x="Difference in 'p' under treatment" + ,y="Probability Density" + ) +ggsave("./Images/DirectEffects/default_p_generic_intervention_distdiff.png") +``` + ```{r} @@ -383,20 +446,37 @@ pddf_ib <- data.frame(extract(generated_ib, pars="predicted_difference")$predict pddf_ib["entry_idx"] <- as.numeric(gsub("\\D","",pddf_ib$name)) pddf_ib["category"] <- sapply(pddf_ib$entry_idx, function(i) df$category_id[i]) pddf_ib["category_name"] <- sapply(pddf_ib$category, function(i) beta_list$groups[i]) -``` -```{r} - ggplot(pddf_ib, aes(x=value,)) + - geom_histogram(bins=100) + + geom_density(bins=100) + labs( title = "Distribution of predicted differences" + ,subtitle = "Intervention: add a single generic competitor" ,x = "Difference in probability due to intervention" - ,y = "Predicted counts" + ,y = "Probability Density" ) + - #xlim(-0.3,0.1) + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +ggsave("./Images/DirectEffects/p_generic_intervention_distdiff_styled.png") + +ggplot(pddf_ib, aes(x=value,)) + + geom_density(bins=100) + + facet_wrap( + ~factor( + category_name, + levels=beta_list$groups + ) + , labeller = label_wrap_gen(multi_line = TRUE) + , ncol=5) + + labs( + title = "Distribution of predicted differences | By Group" + ,subtitle = "Intervention: add a single generic competitor" + ,x = "Difference in probability due to intervention" + ,y = "Probability Density" + ) + + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + + theme(strip.text.x = element_text(size = 8)) +ggsave("./Images/DirectEffects/p_generic_intervention_distdiff_by_group.png") ggplot(pddf_ib, aes(x=value,)) + geom_histogram(bins=100) + @@ -408,15 +488,15 @@ ggplot(pddf_ib, aes(x=value,)) + , labeller = label_wrap_gen(multi_line = TRUE) , ncol=5) + labs( - title = "Distribution of predicted differences", - subtitle = "By group" + title = "Histogram of predicted differences | By Group" + ,subtitle = "Intervention: add a single generic competitor" ,x = "Difference in probability due to intervention" ,y = "Predicted counts" ) + #xlim(-0.25,0.1) + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + theme(strip.text.x = element_text(size = 8)) - +ggsave("./Images/DirectEffects/p_generic_intervention_histdiff_by_group.png") ``` @@ -458,11 +538,44 @@ generated_bnc <- gqs( ) ``` + ```{r} -#TODO: Convert to ggplot, stabilize y axis -hist(as.vector(extract(generated_bnc, pars="p_predicted_default")$p_predicted_default), bins=100) -hist(as.vector(extract(generated_bnc, pars="p_predicted_intervention")$p_predicted_intervention), bins=100) -hist(as.vector(extract(generated_bnc, pars="predicted_difference")$predicted_difference), bins=100) +counterfact_predicted_bnc <- data.frame( + p_predicted_default = as.vector(extract(generated_bnc, pars="p_predicted_default")$p_predicted_default) + ,p_predicted_intervention = as.vector(extract(generated_bnc, pars="p_predicted_intervention")$p_predicted_intervention) + ,predicted_difference = as.vector(extract(generated_bnc, pars="predicted_difference")$predicted_difference) +) + + +ggplot(counterfact_predicted_bnc, aes(x=p_predicted_default)) + + geom_density() + + labs( + title="Predicted Distribution of 'p'" + ,subtitle="Intervention: None" + ,x="Probability Domain 'p'" + ,y="Probability Density" + ) +ggsave("./Images/DirectEffects/default_p_uspdc_intervention_base.png") + +ggplot(counterfact_predicted_bnc, aes(x=p_predicted_intervention)) + + geom_density() + + labs( + title="Predicted Distribution of 'p'" + ,subtitle="Intervention: Add a single USP DC competitor" + ,x="Probability Domain 'p'" + ,y="Probability Density" + ) +ggsave("./Images/DirectEffects/default_p_uspdc_intervention_interv.png") + +ggplot(counterfact_predicted_bnc, aes(x=predicted_difference)) + + geom_density() + + labs( + title="Predicted Distribution of differences 'p'" + ,subtitle="Intervention: Add a single USP DC competitor" + ,x="Difference in 'p' under treatment" + ,y="Probability Density" + ) +ggsave("./Images/DirectEffects/default_p_uspdc_intervention_distdiff.png") ``` @@ -475,20 +588,37 @@ pddf_bnc["entry_idx"] <- as.numeric(gsub("\\D","",pddf_bnc$name)) pddf_bnc["category"] <- sapply(pddf_bnc$entry_idx, function(i) df$category_id[i]) pddf_bnc["category_name"] <- sapply(pddf_bnc$category, function(i) beta_list$groups[i]) -``` - -```{r} ggplot(pddf_bnc, aes(x=value,)) + - geom_histogram(bins=100) + + geom_density(bins=100) + labs( title = "Distribution of predicted differences" + ,subtitle = "Intervention: add a single USP DC competitor" ,x = "Difference in probability due to intervention" - ,y = "Predicted counts" + ,y = "Probability Density" ) + - #xlim(-0.3,0.1) + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") +ggsave("./Images/DirectEffects/p_uspdc_intervention_distdiff_styled.png") + +ggplot(pddf_bnc, aes(x=value,)) + + geom_density(bins=100) + + facet_wrap( + ~factor( + category_name, + levels=beta_list$groups + ) + , labeller = label_wrap_gen(multi_line = TRUE) + , ncol=5) + + labs( + title = "Distribution of predicted differences in 'p' | By Group" + ,subtitle = "Intervention: add a single USP DC competitor" + ,x = "Difference in probability due to intervention" + ,y = "Probability Density" + ) + + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + + theme(strip.text.x = element_text(size = 8)) +ggsave("./Images/DirectEffects/p_uspdc_intervention_distdiff_by_group.png") ggplot(pddf_bnc, aes(x=value,)) + geom_histogram(bins=100) + @@ -500,17 +630,18 @@ ggplot(pddf_bnc, aes(x=value,)) + , labeller = label_wrap_gen(multi_line = TRUE) , ncol=5) + labs( - title = "Distribution of predicted differences", - subtitle = "By group" + title = "Histogram of predicted differences in 'p' | By Group" + ,subtitle = "Intervention: add a single USP DC competitor" ,x = "Difference in probability due to intervention" ,y = "Predicted counts" ) + #xlim(-0.25,0.1) + geom_vline(aes(xintercept = 0), color = "skyblue", linetype="dashed") + theme(strip.text.x = element_text(size = 8)) - +ggsave("./Images/DirectEffects/p_uspdc_intervention_histdiff_by_group.png") ``` + TODO: add density plot of (x,y,z) (date,value,counts) - with and without faceting @@ -539,6 +670,7 @@ ggplot(data=df3, aes(x=count, fill=final_status)) + geom_histogram(binwidth=1) + ggtitle("Histogram of snapshots per trial (matched trials)") + xlab("Snapshots per trial") +ggsave("./Images/HistSnapshots.png") #Plot duration for terminated vs completed df4 <- dbGetQuery( @@ -561,6 +693,7 @@ ggplot(data=df4, aes(x=duration,fill=overall_status)) + ggtitle("Histogram of trial durations") + xlab("duration")+ facet_wrap(~overall_status) +ggsave("./Images/HistTrialDurations_Faceted.png") df5 <- dbGetQuery( con, @@ -585,23 +718,29 @@ df5 <- dbGetQuery( ;" ) df5$overall_status <- as.factor(df5$overall_status) -#df5 <- fetch(query5, n = -1) ggplot(data=df5, aes(x=duration,y=snapshot_count,color=overall_status)) + geom_jitter() + - ggtitle("duration, status, and snapshot_count") + + ggtitle("Comparison of duration, status, and snapshot_count") + xlab("duration") + ylab("snapshot count") - +ggsave("./Images/SnapshotsVsDurationVsTermination.png") dbDisconnect(con) #get number of trials and snapshots in each category group_trials_by_category <- as.data.frame(aggregate(category_id ~ nct_id, df, max)) group_trials_by_category <- as.data.frame(group_trials_by_category) + ggplot(data = group_trials_by_category, aes(x=category_id)) + - geom_histogram(binwidth=1,color="black",fill="seagreen") + - scale_x_continuous(breaks=scales::pretty_breaks(n=22)) + geom_bar(binwidth=1,color="black",fill="seagreen") + + scale_x_continuous(breaks=scales::pretty_breaks(n=22)) + + labs( + title="bar chart of trial categories" + ,x="Category ID" + ,y="Count" + ) +ggsave("./Images/CategoryCounts.png") @@ -616,6 +755,7 @@ summary(df5) # Diagnostics ```{r} +#| eval: false #trace plots plot(fit, pars=c("mu"), plotfun="trace") @@ -638,6 +778,7 @@ for (i in 1:4) { ``` ```{r} +#| eval: false plot(fit, pars=c("sigma"), plotfun="trace") for (i in 1:4) { @@ -658,6 +799,7 @@ for (i in 1:4) { ``` ```{r} +#| eval: false #other diagnostics logpost <- log_posterior(fit) nuts_prmts <- nuts_params(fit) @@ -666,12 +808,14 @@ posterior <- as.array(fit) ``` ```{r} +#| eval: false color_scheme_set("darkgray") div_style <- parcoord_style_np(div_color = "green", div_size = 0.05, div_alpha = 0.4) mcmc_parcoord(posterior, regex_pars = "mu", np=nuts_prmts, np_style = div_style, alpha = 0.05) ``` ```{r} +#| eval: false for (i in 1:4) { mus = sapply(3:0, function(j) paste0("mu[",4*i-j ,"]")) print( @@ -692,10 +836,12 @@ for (i in 1:4) { ``` ```{r} +#| eval: false mcmc_parcoord(posterior,regex_pars = "sigma", np=nuts_prmts, alpha=0.05) ``` ```{r} +#| eval: false for (i in 1:4) { params = sapply(3:0, function(j) paste0("sigma[",4*i-j ,"]")) @@ -715,6 +861,7 @@ for (i in 1:4) { ```{r} +#| eval: false for (k in 1:22) { for (i in 1:4) { params = sapply(3:0, function(j) paste0("beta[",k,",",4*i-j ,"]")) @@ -750,9 +897,6 @@ I wonder if a lot of the variance is due to the 2 values that are sitting out. -```{r} -#mcmc_intervals(fit, pars=get_parameters("beta",beta_list)$param_name) -``` ### Investigating parameter distributions @@ -782,21 +926,24 @@ I wonder if a lot of the variance is due to the 2 values that are sitting out. p1 <- parameter_mcmc_areas("beta",beta_list,fit,1) +ggsave("./Images/DirectEffects/Parameters/01_elapsed_duration.png") p2 <- parameter_mcmc_areas("beta",beta_list,fit,2) +ggsave("./Images/DirectEffects/Parameters/02_generic.png") p3 <- parameter_mcmc_areas("beta",beta_list,fit,3) +ggsave("./Images/DirectEffects/Parameters/03_uspdc.png") #p4 <- parameter_mcmc_areas("beta",beta_list,fit,4) #p5 <- parameter_mcmc_areas("beta",beta_list,fit,5) #p6 <- parameter_mcmc_areas("beta",beta_list,fit,6) #p7 <- parameter_mcmc_areas("beta",beta_list,fit,7) #p8 <- parameter_mcmc_areas("beta",beta_list,fit,8) p9 <- parameter_mcmc_areas("beta",beta_list,fit,9) +ggsave("./Images/DirectEffects/Parameters/09_NYR.png") p10 <- parameter_mcmc_areas("beta",beta_list,fit,10) +ggsave("./Images/DirectEffects/Parameters/10_EBI.png") p11 <- parameter_mcmc_areas("beta",beta_list,fit,11) +ggsave("./Images/DirectEffects/Parameters/11_Rec.png") p12 <- parameter_mcmc_areas("beta",beta_list,fit,12) -p13 <- parameter_mcmc_areas("beta",beta_list,fit,13) -p14 <- parameter_mcmc_areas("beta",beta_list,fit,14) -p15 <- parameter_mcmc_areas("beta",beta_list,fit,15) -p16 <- parameter_mcmc_areas("beta",beta_list,fit,16) +ggsave("./Images/DirectEffects/Parameters/12_ANR.png") ``` Note these have 95% outer CI and 80% inner (shaded) @@ -812,73 +959,25 @@ Note these have 95% outer CI and 80% inner (shaded) 8) "asinh(Low SDI)", 9) "status_NYR", 10) "status_EBI", - 11) "status_ANR", - 12) "ib*elapsed", - 13) "bnc*elapsed", - 14) "sNYR*elapsed", - 15) "sEBI*elapsed", - 16) "sANR*elapsed" - -of interest -- p1 + p2 -- p3 + p2 -- p2 + p12 -- p3 + p13 -- p9 + p14 -- p10 + p15 -- p11 + p16 - -```{r} -p1 + p2 -``` - - -```{r} -p2 + p3 -``` - -```{r} -p2 + p12 -``` - -```{r} -p3 + p13 -``` - -```{r} -p9 + p14 -``` + 11) "status_Rec", + 12) "status_ANR", -```{r} -p10 + p15 -``` - ```{r} -p11 + p16 -``` - - -# Posterior Prediction - - -```{r} -#TODO: Convert to ggplot, stabilize y axis -hist(as.vector(extract(generated, pars="p_predicted_default")$p_predicted_default)) -hist(as.vector(extract(generated, pars="p_predicted_intervention")$p_predicted_intervention)) +p2 + p3 +ggsave("./Images/DirectEffects/Parameters/2+3_generic_and_uspdc.png") ``` -## Distribution of Predicted Differences - ### Intervention: Marginal increase in time to finish enrollment ```{r} +#| eval: false pddf <- data.frame(extract(generated, pars="predicted_difference")$predicted_difference) |> pivot_longer(X1:X189) pddf["entry_idx"] <- as.numeric(gsub("\\D","",pddf$name))