--- title: "The Effects of market conditions on recruitment and completion of clinical trials" author: "Will King" format: html editor: source --- ```{r} library(bayesplot) available_mcmc(pattern = "_nuts_") library(ggplot2) library(rstan) #Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started #save unchanged models instead of recompiling rstan_options(auto_write = TRUE) #allow for multithreaded sampling options(mc.cores = parallel::detectCores()) #test installation, shouldn't get any errors #example(stan_model, package = "rstan", run.dontrun = TRUE) ``` ```{r} ################ Pull data from database ###################### library(RPostgreSQL) driver <- dbDriver("PostgreSQL") get_data <- function(driver) { con <- dbConnect( driver, user='root', password='root', dbname='aact_db', host='will-office' ) on.exit(dbDisconnect(con)) query <- dbSendQuery( con, "select * from formatted_data_with_planned_enrollment;" ) df <- fetch(query, n = -1) df <- na.omit(df) query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;") n_categories <- fetch(query2, n = -1) return(list(data=df,ncat=n_categories)) } d <- get_data(driver) df <- d$data n_categories <- d$ncat ################ Format Data ########################### categories <- df["category_id"] x <- df["elapsed_duration"] x["log_n_brands"] <- asinh(df$n_brands) #todo - convert to arcsinh() transform x["h_sdi_val"] <- asinh(df$h_sdi_val) x["hm_sdi_val"] <- asinh(df$hm_sdi_val) x["m_sdi_val"] <- asinh(df$m_sdi_val) x["lm_sdi_val"] <- asinh(df$lm_sdi_val) x["l_sdi_val"] <- asinh(df$l_sdi_val) #x["enrollment"] <- df["current_enrollment"] / df["planned_enrollment"] #square terms #x["enrollment^2"] <- x["enrollment"]^2 #x["elapsed_duration^2"] <- x["elapsed_duration"]^2 #x["n_brands^2"] <- x["n_brands"]^2 #break out #x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0) #x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0) #x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0) #x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0) y <- ifelse(df["final_status"]=="Terminated",1,0) ``` ```{r} ################################# DATA EXPLORATION ############################ driver <- dbDriver("PostgreSQL") con <- dbConnect( driver, user='root', password='root', dbname='aact_db', host='will-office' ) #Plot histogram of count of snapshots df3 <- dbGetQuery( con, "select nct_id,final_status,count(*) from formatted_data_with_planned_enrollment fdwpe group by nct_id,final_status ;" ) #df3 <- fetch(query3, n = -1) ggplot(data=df3, aes(x=count, fill=final_status)) + geom_histogram(binwidth=1) + ggtitle("Histogram of snapshots per trial (matched trials)") + xlab("Snapshots per trial") #Plot duration for terminated vs completed df4 <- dbGetQuery( con, " select nct_id, start_date , primary_completion_date, overall_status , primary_completion_date - start_date as duration from ctgov.studies s where nct_id in (select distinct nct_id from http.download_status ds) ;" ) #df4 <- fetch(query4, n = -1) ggplot(data=df4, aes(x=duration,fill=overall_status)) + geom_histogram()+ ggtitle("Histogram of trial durations") + xlab("duration")+ facet_wrap(~overall_status) df5 <- dbGetQuery( con, " with cte1 as ( select nct_id, start_date , primary_completion_date, overall_status , primary_completion_date - start_date as duration from ctgov.studies s where nct_id in (select distinct nct_id from http.download_status ds) ), cte2 as ( select nct_id,count(*) as snapshot_count from formatted_data_with_planned_enrollment fdwpe group by nct_id ) select a.nct_id, a.overall_status, a.duration,b.snapshot_count from cte1 as a join cte2 as b on a.nct_id=b.nct_id ;" ) #df5 <- fetch(query5, n = -1) ggplot(data=df5, aes(x=duration,y=snapshot_count,color=overall_status)) + geom_point() + ggtitle("duration, status, and snapshot_count") + xlab("duration") + ylab("snapshot count") dbDisconnect(con) ``` ```{r} ################################# FIT ######################################### #setup data (named list) trials_data <- list( D = ncol(x),# N = nrow(x), L = n_categories$count, y = as.vector(y), ll = as.vector(categories$category_id), x = as.matrix(x) ) #fit <- stan(file='Logistic.stan', data = trials_data) #attempt at simple model fit <- stan( file='Hierarchal_Logistic.stan', data = trials_data, chains = 6, iter = 5000, seed = 11021585 ) ``` ```{r} ################################# ANALYZE ##################################### print(fit) ``` ```{r} color_scheme_set("darkgray") #trace plots plot(fit, pars=c("mu"), plotfun="trace") color_scheme_set("viridis") mcmc_rank_overlay( fit, pars=c( "mu[1]", "mu[2]", "mu[3]", "mu[4]", "mu[5]", "mu[6]", "mu[7]" ), n_bins=100 )+ legend_move("top") ``` ```{r} color_scheme_set("viridis") plot(fit, pars=c("sigma"), plotfun="trace") color_scheme_set("viridis") mcmc_rank_overlay( fit, pars=c( "sigma[1]", "sigma[2]", "sigma[3]", "sigma[4]", "sigma[5]", "sigma[6]", "sigma[7]" ), n_bins=100 )+ legend_move("top") ``` ```{r} #other diagnostics logpost <- log_posterior(fit) nuts_prmts <- nuts_params(fit) posterior <- as.array(fit) ``` ```{r} color_scheme_set("darkgray") mcmc_parcoord(posterior,pars=c( "mu[1]", "mu[2]", "mu[3]", "mu[4]", "mu[5]", "mu[6]", "mu[7]" ), np=nuts_prmts ) ``` ```{r} mcmc_pairs( posterior, np = nuts_prmts, pars=c( "mu[1]", "mu[2]", "mu[3]", "mu[4]", "mu[5]", "mu[6]", "mu[7]" ), off_diag_args = list(size = 0.75) ) ``` ```{r} mcmc_parcoord(posterior,pars=c( "sigma[1]", "sigma[2]", "sigma[3]", "sigma[4]", "sigma[5]", "sigma[6]", "sigma[7]" ), np=nuts_prmts ) ``` ```{r} mcmc_pairs( posterior, np = nuts_prmts, pars=c( "sigma[1]", "sigma[2]", "sigma[3]", "sigma[4]", "sigma[5]", "sigma[6]", "sigma[7]" ), off_diag_args = list(size = 0.75) ) ``` ```{r} # check model estimates #mu plot(fit, pars=c("mu"), show_density =TRUE, ci_level=0.8) plot(fit, pars=c("mu"), plotfun = "hist", bins=50) ``` ```{r} #sigma plot(fit, pars=c("sigma"), show_density =TRUE, ci_level=0.8) plot(fit, pars=c("sigma"), plotfun = "hist",bins=50) ```