diff --git a/.gitignore b/.gitignore index 729f10d..3be90e4 100644 --- a/.gitignore +++ b/.gitignore @@ -165,3 +165,4 @@ docs/site/ Manifest.toml .Rproj.user +*/*_files/* diff --git a/r-analysis/.Rhistory b/r-analysis/.Rhistory new file mode 100644 index 0000000..e69de29 diff --git a/r-analysis/DataMangement.R b/r-analysis/DataMangement.R new file mode 100644 index 0000000..d5fac77 --- /dev/null +++ b/r-analysis/DataMangement.R @@ -0,0 +1,150 @@ +library(bayesplot) +available_mcmc(pattern = "_nuts_") +library(ggplot2) + +library(rstan) +#Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started + +#save unchanged models instead of recompiling +rstan_options(auto_write = TRUE) +#allow for multithreaded sampling +options(mc.cores = parallel::detectCores()) + +#test installation, shouldn't get any errors +#example(stan_model, package = "rstan", run.dontrun = TRUE) + +################ Pull data from database ###################### +library(RPostgreSQL) + +driver <- dbDriver("PostgreSQL") + +con <- dbConnect( + driver, + user='root', + password='root', + dbname='aact_db', + host='will-office' + ) + +query <- dbSendQuery( + con, + "select * from formatted_data_with_planned_enrollment;" + ) +df <- fetch(query) +df <- na.omit(df) + +query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic ;") +n_categories <- fetch(query2) + +################ Format Data ########################### + + +categories <- df["category_id"] + +x <- df["elapsed_duration"] +x["log_n_brands"] <- log(df$n_brands + 1) +x["h_sdi_val"] <- log(df$h_sdi_val + 1) +x["hm_sdi_val"] <- log(df$hm_sdi_val + 1) +x["m_sdi_val"] <- log(df$m_sdi_val + 1) +x["lm_sdi_val"] <- log(df$lm_sdi_val + 1) +x["l_sdi_val"] <- log(df$l_sdi_val + 1) +x["enrollment"] <- df["current_enrollment"] / df["planned_enrollment"] +#square terms +#x["enrollment^2"] <- x["enrollment"]^2 +#x["elapsed_duration^2"] <- x["elapsed_duration"]^2 +#x["n_brands^2"] <- x["n_brands"]^2 +#break out +x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0) +x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0) +x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0) +x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0) + +y <- ifelse(df["final_status"]=="Terminated",1,0) + +################################# DATA EXPLORATION ############################ + +#Plot terminated vs completed +#Plot duration for terminated vs completed +#Plot different times of + +################################# FIT ######################################### +#setup data (named list) +trials_data <- list( + D = ncol(x),# + N = nrow(x), +# L = n_categories$count, + y = as.vector(y), +# ll = as.vector(categories$category_id), + x = as.matrix(x) +) + +#fit <- stan(file='Logistic.stan', data = trials_data) #attempt at simple model +fit <- stan(file='Hierarchal_Logistic.stan', data = trials_data) + +################################# ANALYZE ##################################### +print(fit) + +color_scheme_set("darkgray") +#trace plots +plot(fit, pars=c("mu"), plotfun="trace") +plot(fit, pars=c("sigma"), plotfun="trace") + +#other diagnostics +logpost <- log_posterior(fit) +nuts_prmts <- nuts_params(fit) +posterior <- as.array(fit) + + +mcmc_parcoord(posterior,pars=c( + "mu[1]", + "mu[2]", + "mu[3]", + "mu[4]", + "mu[5]", + "mu[6]", + "mu[7]", + "mu[8]", + "mu[9]", + "mu[10]", + "mu[11]", + "mu[12]" + ), + np=nuts_prmts +) + +mcmc_pairs( + posterior, + np = nuts_prmts, + pars=c( + "mu[1]", + "mu[2]", + "sigma[1]", + "sigma[2]" + ), + off_diag_args = list(size = 0.75) +) + +mcmc_parcoord(posterior,pars=c( + "sigma[1]", + "sigma[2]", + "sigma[3]", + "sigma[4]", + "sigma[5]", + "sigma[6]", + "sigma[7]", + "sigma[8]", + "sigma[9]", + "sigma[10]", + "sigma[11]", + "sigma[12]" +), +np=nuts_prmts +) +# check model estimates +#mu +plot(fit, pars=c("mu"), show_density =TRUE, ci_level=0.8) +plot(fit, pars=c("mu"), plotfun = "hist") + +#sigma +plot(fit, pars=c("sigma"), show_density =TRUE, ci_level=0.8) +plot(fit, pars=c("sigma"), plotfun = "hist") diff --git a/r-analysis/EffectsOfMarketConditions.qmd b/r-analysis/EffectsOfMarketConditions.qmd new file mode 100644 index 0000000..fd0f24c --- /dev/null +++ b/r-analysis/EffectsOfMarketConditions.qmd @@ -0,0 +1,322 @@ +--- +title: "The Effects of market conditions on recruitment and completion of clinical trials" +author: "Will King" +format: html +editor: source +--- + +```{r} +library(bayesplot) +available_mcmc(pattern = "_nuts_") +library(ggplot2) + +library(rstan) +#Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started + +#save unchanged models instead of recompiling +rstan_options(auto_write = TRUE) +#allow for multithreaded sampling +options(mc.cores = parallel::detectCores()) + +#test installation, shouldn't get any errors +#example(stan_model, package = "rstan", run.dontrun = TRUE) +``` + +```{r} +################ Pull data from database ###################### +library(RPostgreSQL) + +driver <- dbDriver("PostgreSQL") + +get_data <- function(driver) { + +con <- dbConnect( + driver, + user='root', + password='root', + dbname='aact_db', + host='will-office' + ) +on.exit(dbDisconnect(con)) + +query <- dbSendQuery( + con, + "select * from formatted_data_with_planned_enrollment;" + ) +df <- fetch(query, n = -1) +df <- na.omit(df) + +query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;") +n_categories <- fetch(query2, n = -1) + +return(list(data=df,ncat=n_categories)) +} + +d <- get_data(driver) +df <- d$data +n_categories <- d$ncat + + + +################ Format Data ########################### + + +categories <- df["category_id"] + +x <- df["elapsed_duration"] +x["log_n_brands"] <- asinh(df$n_brands) #todo - convert to arcsinh() transform +x["h_sdi_val"] <- asinh(df$h_sdi_val) +x["hm_sdi_val"] <- asinh(df$hm_sdi_val) +x["m_sdi_val"] <- asinh(df$m_sdi_val) +x["lm_sdi_val"] <- asinh(df$lm_sdi_val) +x["l_sdi_val"] <- asinh(df$l_sdi_val) +#x["enrollment"] <- df["current_enrollment"] / df["planned_enrollment"] +#square terms +#x["enrollment^2"] <- x["enrollment"]^2 +#x["elapsed_duration^2"] <- x["elapsed_duration"]^2 +#x["n_brands^2"] <- x["n_brands"]^2 +#break out +#x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0) +#x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0) +#x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0) +#x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0) + +y <- ifelse(df["final_status"]=="Terminated",1,0) +``` + +```{r} +################################# DATA EXPLORATION ############################ +driver <- dbDriver("PostgreSQL") + +con <- dbConnect( + driver, + user='root', + password='root', + dbname='aact_db', + host='will-office' + ) +#Plot histogram of count of snapshots +df3 <- dbGetQuery( + con, + "select nct_id,final_status,count(*) from formatted_data_with_planned_enrollment fdwpe + group by nct_id,final_status ;" + ) +#df3 <- fetch(query3, n = -1) + +ggplot(data=df3, aes(x=count, fill=final_status)) + + geom_histogram(binwidth=1) + + ggtitle("Histogram of snapshots per trial (matched trials)") + + xlab("Snapshots per trial") + +#Plot duration for terminated vs completed +df4 <- dbGetQuery( + con, + " + select + nct_id, + start_date , + primary_completion_date, + overall_status , + primary_completion_date - start_date as duration + from ctgov.studies s + where nct_id in (select distinct nct_id from http.download_status ds) + ;" + ) +#df4 <- fetch(query4, n = -1) + +ggplot(data=df4, aes(x=duration,fill=overall_status)) + + geom_histogram()+ + ggtitle("Histogram of trial durations") + + xlab("duration")+ + facet_wrap(~overall_status) + +df5 <- dbGetQuery( + con, + " + with cte1 as ( + select + nct_id, + start_date , + primary_completion_date, + overall_status , + primary_completion_date - start_date as duration + from ctgov.studies s + where nct_id in (select distinct nct_id from http.download_status ds) + ), cte2 as ( + select nct_id,count(*) as snapshot_count from formatted_data_with_planned_enrollment fdwpe + group by nct_id + ) + select a.nct_id, a.overall_status, a.duration,b.snapshot_count + from cte1 as a + join cte2 as b + on a.nct_id=b.nct_id + ;" + ) +#df5 <- fetch(query5, n = -1) + +ggplot(data=df5, aes(x=duration,y=snapshot_count,color=overall_status)) + + geom_point() + + ggtitle("duration, status, and snapshot_count") + + xlab("duration") + + ylab("snapshot count") + + +dbDisconnect(con) +``` + +```{r} +################################# FIT ######################################### + +#setup data (named list) +trials_data <- list( + D = ncol(x),# + N = nrow(x), + L = n_categories$count, + y = as.vector(y), + ll = as.vector(categories$category_id), + x = as.matrix(x) +) + +#fit <- stan(file='Logistic.stan', data = trials_data) #attempt at simple model +fit <- stan( + file='Hierarchal_Logistic.stan', + data = trials_data, + chains = 6, + iter = 5000, + seed = 11021585 + ) +``` + + +```{r} +################################# ANALYZE ##################################### +print(fit) +``` + +```{r} +color_scheme_set("darkgray") +#trace plots +plot(fit, pars=c("mu"), plotfun="trace") +color_scheme_set("viridis") +mcmc_rank_overlay( + fit, + pars=c( + "mu[1]", + "mu[2]", + "mu[3]", + "mu[4]", + "mu[5]", + "mu[6]", + "mu[7]" + ), + n_bins=100 + )+ legend_move("top") +``` + +```{r} +color_scheme_set("viridis") +plot(fit, pars=c("sigma"), plotfun="trace") +color_scheme_set("viridis") +mcmc_rank_overlay( + fit, + pars=c( + "sigma[1]", + "sigma[2]", + "sigma[3]", + "sigma[4]", + "sigma[5]", + "sigma[6]", + "sigma[7]" + ), + n_bins=100 + )+ legend_move("top") +``` + +```{r} +#other diagnostics +logpost <- log_posterior(fit) +nuts_prmts <- nuts_params(fit) +posterior <- as.array(fit) + +``` + +```{r} +color_scheme_set("darkgray") +mcmc_parcoord(posterior,pars=c( + "mu[1]", + "mu[2]", + "mu[3]", + "mu[4]", + "mu[5]", + "mu[6]", + "mu[7]" + ), + np=nuts_prmts +) +``` + +```{r} +mcmc_pairs( + posterior, + np = nuts_prmts, + pars=c( + "mu[1]", + "mu[2]", + "mu[3]", + "mu[4]", + "mu[5]", + "mu[6]", + "mu[7]" + ), + off_diag_args = list(size = 0.75) +) + + +``` + +```{r} +mcmc_parcoord(posterior,pars=c( + "sigma[1]", + "sigma[2]", + "sigma[3]", + "sigma[4]", + "sigma[5]", + "sigma[6]", + "sigma[7]" +), +np=nuts_prmts +) +``` + +```{r} +mcmc_pairs( + posterior, + np = nuts_prmts, + pars=c( + "sigma[1]", + "sigma[2]", + "sigma[3]", + "sigma[4]", + "sigma[5]", + "sigma[6]", + "sigma[7]" + ), + off_diag_args = list(size = 0.75) +) + + +``` + +```{r} +# check model estimates +#mu +plot(fit, pars=c("mu"), show_density =TRUE, ci_level=0.8) +plot(fit, pars=c("mu"), plotfun = "hist", bins=50) +``` + +```{r} +#sigma +plot(fit, pars=c("sigma"), show_density =TRUE, ci_level=0.8) +plot(fit, pars=c("sigma"), plotfun = "hist",bins=50) + +``` diff --git a/r-analysis/Hierarchal_Logistic.rds b/r-analysis/Hierarchal_Logistic.rds new file mode 100644 index 0000000..8d9afa6 Binary files /dev/null and b/r-analysis/Hierarchal_Logistic.rds differ diff --git a/r-analysis/Hierarchal_Logistic.stan b/r-analysis/Hierarchal_Logistic.stan new file mode 100644 index 0000000..1bff9a1 --- /dev/null +++ b/r-analysis/Hierarchal_Logistic.stan @@ -0,0 +1,40 @@ +// +// This Stan program defines a simple model, with a +// vector of values 'y' modeled as normally distributed +// with mean 'mu' and standard deviation 'sigma'. +// +// Learn more about model development with Stan at: +// +// http://mc-stan.org/users/interfaces/rstan.html +// https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started +// + +// The input data is a vector 'y' of length 'N'. +data { + int D; + int N; + int L; + int y[N]; + int ll[N]; + row_vector[D] x[N]; +} +parameters { + real mu[D]; + real sigma[D]; + vector[D] beta[L]; +} +model { + sigma ~ gamma(2,1); //shape and inverse scale + mu ~ normal(0, 1); //convert to mvnormal + for (l in 1:L) { + beta[l] ~ normal(mu, sigma); + } + { + vector[N] x_beta_ll; + for (n in 1:N) { + x_beta_ll[n] = x[n] * beta[ll[n]]; + } + y ~ bernoulli_logit(x_beta_ll); + } +} + diff --git a/r-analysis/Logistic.stan b/r-analysis/Logistic.stan new file mode 100644 index 0000000..fe342bb --- /dev/null +++ b/r-analysis/Logistic.stan @@ -0,0 +1,30 @@ +// +// This Stan program defines a simple model, with a +// vector of values 'y' modeled as normally distributed +// with mean 'mu' and standard deviation 'sigma'. +// +// Learn more about model development with Stan at: +// +// http://mc-stan.org/users/interfaces/rstan.html +// https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started +// + +// The input data is a vector 'y' of length 'N'. +data { + int D; + int N; + int y[N]; + row_vector[D] x[N]; +} +parameters { + real mu[D]; + real sigma[D]; +} +model { + sigma ~ gamma(2,0.1); + mu ~ normal(0, sigma); //convert to mvnormal + + for (n in 1:N) { + y[n] ~ bernoulli_logit(x[n] * mu); + } +} diff --git a/r-analysis/basic_logit.stan b/r-analysis/basic_logit.stan new file mode 100644 index 0000000..5b619e9 --- /dev/null +++ b/r-analysis/basic_logit.stan @@ -0,0 +1,32 @@ +// +// This Stan program defines a simple model, with a +// vector of values 'y' modeled as normally distributed +// with mean 'mu' and standard deviation 'sigma'. +// +// Learn more about model development with Stan at: +// +// http://mc-stan.org/users/interfaces/rstan.html +// https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started +// + +// The input data is a vector 'y' of length 'N'. +data { + int N; + int k; + matrix[N,k] X; + vector[N] int y; +} + +// The parameters accepted by the model. Our model +// accepts two parameters 'mu' and 'sigma'. +parameters { + vector[k] beta; +} + +// The model to be estimated. We model the output +// 'y' to be normally distributed with mean 'mu' +// and standard deviation 'sigma'. +model { + y ~ bernoulli_logit( X * beta); +} + diff --git a/r-analysis/r-analysis.Rproj b/r-analysis/r-analysis.Rproj new file mode 100644 index 0000000..8a81d49 --- /dev/null +++ b/r-analysis/r-analysis.Rproj @@ -0,0 +1,17 @@ +Version: 1.0 + +RestoreWorkspace: Yes +SaveWorkspace: Ask +AlwaysSaveHistory: Yes + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 4 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX + +AutoAppendNewline: Yes +StripTrailingWhitespace: Yes +LineEndingConversion: Native