You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ClinicalTrialsEstimation/r-analysis/FullAnalysis.R

240 lines
5.1 KiB
R

library(bayesplot)
available_mcmc(pattern = "_nuts_")
library(ggplot2)
library(posterior)
library(cmdstanr)
library(rstan) # for stanfit
#Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
#save unchanged models instead of recompiling
rstan_options(auto_write = TRUE)
#allow for multithreaded sampling
options(mc.cores = parallel::detectCores())
#test installation, shouldn't get any errors
#example(stan_model, package = "rstan", run.dontrun = TRUE)
################ Pull data from database ######################
library(RPostgreSQL)
driver <- dbDriver("PostgreSQL")
con <- dbConnect(
driver,
user='root',
password='root',
dbname='aact_db',
host='will-office'
)
query <- dbSendQuery(
con,
"select * from formatted_data_with_planned_enrollment;"
)
df <- fetch(query,Inf)
df <- na.omit(df)
n_categories <- 22 #number of categories in ICD-10
################ Format Data ###########################
categories <- df["category_id"]
#Convert log(x+1) to arcsinh
x <- df["elapsed_duration"]
x["log_n_brands"] <- log(df$n_brands + 1)
x["h_sdi_val"] <- log(df$h_sdi_val + 1)
x["hm_sdi_val"] <- log(df$hm_sdi_val + 1)
x["m_sdi_val"] <- log(df$m_sdi_val + 1)
x["lm_sdi_val"] <- log(df$lm_sdi_val + 1)
x["l_sdi_val"] <- log(df$l_sdi_val + 1)
#x["enrollment"] <- df["current_enrollment"] / df["planned_enrollment"]
#square terms
#x["enrollment^2"] <- x["enrollment"]^2
#x["elapsed_duration^2"] <- x["elapsed_duration"]^2
#x["n_brands^2"] <- x["n_brands"]^2
#break out
x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0)
x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0)
x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0)
x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0)
y <- ifelse(df["final_status"]=="Terminated",1,0)
################################# DATA EXPLORATION ############################
#Plot terminated vs completed
#Plot duration for terminated vs completed
#Plot different times of
################################# FIT #########################################
#setup data (named list)
trials_data <- list(
D = ncol(x),#
N = nrow(x),
L = n_categories,
y = as.vector(y),
ll = categories$category_id,
x = as.matrix(x),
mu_mean = 0,
mu_stdev = 0.5,
sigma_shape = 6,
sigma_rate = 12
)
model <- cmdstan_model(file.path("Hierarchal_Logistic.stan"))
fit <- model$sample(
data = trials_data,
seed = 11021585,
chains = 4,
parallel_chains = 4,
refresh = 500
)
################################# ANALYZE #####################################
color_scheme_set("darkgray")
div_style <- parcoord_style_np(div_color = "green", div_size = 0.05, div_alpha = 0.4)
print(fit$summary(),n=265)
stanfitted <- rstan::read_stan_csv(fit$output_files())
#analyze mu values
draw_mu <- fit$draws("mu")
mcmc_hist(draw_mu)
mcmc_trace(draw_mu)
mcmc_pairs(draw_mu)
mcmc_parcoord(posterior,pars=c(
"mu[1]",
"mu[2]",
"mu[3]",
"mu[4]",
"mu[5]",
"mu[6]",
"mu[7]",
"mu[8]",
"mu[9]",
"mu[10]",
"mu[11]"
),
np=nuts_prmts,
np_style = div_style
)
#check sigma
draw_sigma <- fit$draws("sigma")
mcmc_hist(draw_sigma)
mcmc_trace(draw_sigma)
mcmc_parcoord(posterior,pars=c(
"sigma[1]",
"sigma[2]",
"sigma[3]",
"sigma[4]",
"sigma[5]",
"sigma[6]",
"sigma[7]",
"sigma[8]",
"sigma[9]",
"sigma[10]",
"sigma[11]"
),
np=nuts_prmts,
np_style = div_style
)
#other diagnostics
logpost <- log_posterior(fit)
nuts_prmts <- nuts_params(fit)
posterior <- fit$draws()
mcmc_pairs(
posterior,
np = nuts_prmts,
pars=c(
"mu[1]",
"mu[2]",
"mu[3]",
"mu[4]",
"mu[5]",
"mu[6]",
"mu[7]",
"mu[8]",
"mu[9]",
"mu[10]",
"mu[11]"
),
off_diag_args = list(size = 0.75)
)
#Interpretation
mcmc_areas(posterior,
pars=c(
"mu[1]",
"mu[2]",
"mu[3]",
"mu[4]",
"mu[5]",
"mu[6]",
"mu[7]",
"mu[8]",
"mu[9]",
"mu[10]",
"mu[11]"
),
prob = 0.95
)
#Interpretation
mcmc_areas(posterior,
pars=c(
"sigma[1]",
"sigma[2]",
"sigma[3]",
"sigma[4]",
"sigma[5]",
"sigma[6]",
"sigma[7]",
"sigma[8]",
"sigma[9]",
"sigma[10]",
"sigma[11]"
),
prob = 0.95
)
#iterate through betas
mcmc_areas(posterior,
pars=c(
"beta[1,*]"
),
prob = 0.95
)
#generate array of betas
betas_array <- sapply(1:11, function(param) sapply(1:22, function(group) paste0("beta[",group,",",param,"]")))
for (group in 1:22) {
print(
mcmc_areas(posterior,
pars=betas_array[group,],
prob = 0.95
)
)
}
for (param in 1:11) {
print(
mcmc_areas(posterior,
pars=betas_array[,param],
prob = 0.95
)
)
}