You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
323 lines
6.7 KiB
Plaintext
323 lines
6.7 KiB
Plaintext
---
|
|
title: "The Effects of market conditions on recruitment and completion of clinical trials"
|
|
author: "Will King"
|
|
format: html
|
|
editor: source
|
|
---
|
|
|
|
```{r}
|
|
library(bayesplot)
|
|
available_mcmc(pattern = "_nuts_")
|
|
library(ggplot2)
|
|
|
|
library(rstan)
|
|
#Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
|
|
|
|
#save unchanged models instead of recompiling
|
|
rstan_options(auto_write = TRUE)
|
|
#allow for multithreaded sampling
|
|
options(mc.cores = parallel::detectCores())
|
|
|
|
#test installation, shouldn't get any errors
|
|
#example(stan_model, package = "rstan", run.dontrun = TRUE)
|
|
```
|
|
|
|
```{r}
|
|
################ Pull data from database ######################
|
|
library(RPostgreSQL)
|
|
|
|
driver <- dbDriver("PostgreSQL")
|
|
|
|
get_data <- function(driver) {
|
|
|
|
con <- dbConnect(
|
|
driver,
|
|
user='root',
|
|
password='root',
|
|
dbname='aact_db',
|
|
host='will-office'
|
|
)
|
|
on.exit(dbDisconnect(con))
|
|
|
|
query <- dbSendQuery(
|
|
con,
|
|
"select * from formatted_data_with_planned_enrollment;"
|
|
)
|
|
df <- fetch(query, n = -1)
|
|
df <- na.omit(df)
|
|
|
|
query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;")
|
|
n_categories <- fetch(query2, n = -1)
|
|
|
|
return(list(data=df,ncat=n_categories))
|
|
}
|
|
|
|
d <- get_data(driver)
|
|
df <- d$data
|
|
n_categories <- d$ncat
|
|
|
|
|
|
|
|
################ Format Data ###########################
|
|
|
|
|
|
categories <- df["category_id"]
|
|
|
|
x <- df["elapsed_duration"]
|
|
x["log_n_brands"] <- asinh(df$n_brands) #todo - convert to arcsinh() transform
|
|
x["h_sdi_val"] <- asinh(df$h_sdi_val)
|
|
x["hm_sdi_val"] <- asinh(df$hm_sdi_val)
|
|
x["m_sdi_val"] <- asinh(df$m_sdi_val)
|
|
x["lm_sdi_val"] <- asinh(df$lm_sdi_val)
|
|
x["l_sdi_val"] <- asinh(df$l_sdi_val)
|
|
#x["enrollment"] <- df["current_enrollment"] / df["planned_enrollment"]
|
|
#square terms
|
|
#x["enrollment^2"] <- x["enrollment"]^2
|
|
#x["elapsed_duration^2"] <- x["elapsed_duration"]^2
|
|
#x["n_brands^2"] <- x["n_brands"]^2
|
|
#break out
|
|
#x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0)
|
|
#x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0)
|
|
#x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0)
|
|
#x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0)
|
|
|
|
y <- ifelse(df["final_status"]=="Terminated",1,0)
|
|
```
|
|
|
|
```{r}
|
|
################################# DATA EXPLORATION ############################
|
|
driver <- dbDriver("PostgreSQL")
|
|
|
|
con <- dbConnect(
|
|
driver,
|
|
user='root',
|
|
password='root',
|
|
dbname='aact_db',
|
|
host='will-office'
|
|
)
|
|
#Plot histogram of count of snapshots
|
|
df3 <- dbGetQuery(
|
|
con,
|
|
"select nct_id,final_status,count(*) from formatted_data_with_planned_enrollment fdwpe
|
|
group by nct_id,final_status ;"
|
|
)
|
|
#df3 <- fetch(query3, n = -1)
|
|
|
|
ggplot(data=df3, aes(x=count, fill=final_status)) +
|
|
geom_histogram(binwidth=1) +
|
|
ggtitle("Histogram of snapshots per trial (matched trials)") +
|
|
xlab("Snapshots per trial")
|
|
|
|
#Plot duration for terminated vs completed
|
|
df4 <- dbGetQuery(
|
|
con,
|
|
"
|
|
select
|
|
nct_id,
|
|
start_date ,
|
|
primary_completion_date,
|
|
overall_status ,
|
|
primary_completion_date - start_date as duration
|
|
from ctgov.studies s
|
|
where nct_id in (select distinct nct_id from http.download_status ds)
|
|
;"
|
|
)
|
|
#df4 <- fetch(query4, n = -1)
|
|
|
|
ggplot(data=df4, aes(x=duration,fill=overall_status)) +
|
|
geom_histogram()+
|
|
ggtitle("Histogram of trial durations") +
|
|
xlab("duration")+
|
|
facet_wrap(~overall_status)
|
|
|
|
df5 <- dbGetQuery(
|
|
con,
|
|
"
|
|
with cte1 as (
|
|
select
|
|
nct_id,
|
|
start_date ,
|
|
primary_completion_date,
|
|
overall_status ,
|
|
primary_completion_date - start_date as duration
|
|
from ctgov.studies s
|
|
where nct_id in (select distinct nct_id from http.download_status ds)
|
|
), cte2 as (
|
|
select nct_id,count(*) as snapshot_count from formatted_data_with_planned_enrollment fdwpe
|
|
group by nct_id
|
|
)
|
|
select a.nct_id, a.overall_status, a.duration,b.snapshot_count
|
|
from cte1 as a
|
|
join cte2 as b
|
|
on a.nct_id=b.nct_id
|
|
;"
|
|
)
|
|
#df5 <- fetch(query5, n = -1)
|
|
|
|
ggplot(data=df5, aes(x=duration,y=snapshot_count,color=overall_status)) +
|
|
geom_point() +
|
|
ggtitle("duration, status, and snapshot_count") +
|
|
xlab("duration") +
|
|
ylab("snapshot count")
|
|
|
|
|
|
dbDisconnect(con)
|
|
```
|
|
|
|
```{r}
|
|
################################# FIT #########################################
|
|
|
|
#setup data (named list)
|
|
trials_data <- list(
|
|
D = ncol(x),#
|
|
N = nrow(x),
|
|
L = n_categories$count,
|
|
y = as.vector(y),
|
|
ll = as.vector(categories$category_id),
|
|
x = as.matrix(x)
|
|
)
|
|
|
|
#fit <- stan(file='Logistic.stan', data = trials_data) #attempt at simple model
|
|
fit <- stan(
|
|
file='Hierarchal_Logistic.stan',
|
|
data = trials_data,
|
|
chains = 6,
|
|
iter = 5000,
|
|
seed = 11021585
|
|
)
|
|
```
|
|
|
|
|
|
```{r}
|
|
################################# ANALYZE #####################################
|
|
print(fit)
|
|
```
|
|
|
|
```{r}
|
|
color_scheme_set("darkgray")
|
|
#trace plots
|
|
plot(fit, pars=c("mu"), plotfun="trace")
|
|
color_scheme_set("viridis")
|
|
mcmc_rank_overlay(
|
|
fit,
|
|
pars=c(
|
|
"mu[1]",
|
|
"mu[2]",
|
|
"mu[3]",
|
|
"mu[4]",
|
|
"mu[5]",
|
|
"mu[6]",
|
|
"mu[7]"
|
|
),
|
|
n_bins=100
|
|
)+ legend_move("top")
|
|
```
|
|
|
|
```{r}
|
|
color_scheme_set("viridis")
|
|
plot(fit, pars=c("sigma"), plotfun="trace")
|
|
color_scheme_set("viridis")
|
|
mcmc_rank_overlay(
|
|
fit,
|
|
pars=c(
|
|
"sigma[1]",
|
|
"sigma[2]",
|
|
"sigma[3]",
|
|
"sigma[4]",
|
|
"sigma[5]",
|
|
"sigma[6]",
|
|
"sigma[7]"
|
|
),
|
|
n_bins=100
|
|
)+ legend_move("top")
|
|
```
|
|
|
|
```{r}
|
|
#other diagnostics
|
|
logpost <- log_posterior(fit)
|
|
nuts_prmts <- nuts_params(fit)
|
|
posterior <- as.array(fit)
|
|
|
|
```
|
|
|
|
```{r}
|
|
color_scheme_set("darkgray")
|
|
mcmc_parcoord(posterior,pars=c(
|
|
"mu[1]",
|
|
"mu[2]",
|
|
"mu[3]",
|
|
"mu[4]",
|
|
"mu[5]",
|
|
"mu[6]",
|
|
"mu[7]"
|
|
),
|
|
np=nuts_prmts
|
|
)
|
|
```
|
|
|
|
```{r}
|
|
mcmc_pairs(
|
|
posterior,
|
|
np = nuts_prmts,
|
|
pars=c(
|
|
"mu[1]",
|
|
"mu[2]",
|
|
"mu[3]",
|
|
"mu[4]",
|
|
"mu[5]",
|
|
"mu[6]",
|
|
"mu[7]"
|
|
),
|
|
off_diag_args = list(size = 0.75)
|
|
)
|
|
|
|
|
|
```
|
|
|
|
```{r}
|
|
mcmc_parcoord(posterior,pars=c(
|
|
"sigma[1]",
|
|
"sigma[2]",
|
|
"sigma[3]",
|
|
"sigma[4]",
|
|
"sigma[5]",
|
|
"sigma[6]",
|
|
"sigma[7]"
|
|
),
|
|
np=nuts_prmts
|
|
)
|
|
```
|
|
|
|
```{r}
|
|
mcmc_pairs(
|
|
posterior,
|
|
np = nuts_prmts,
|
|
pars=c(
|
|
"sigma[1]",
|
|
"sigma[2]",
|
|
"sigma[3]",
|
|
"sigma[4]",
|
|
"sigma[5]",
|
|
"sigma[6]",
|
|
"sigma[7]"
|
|
),
|
|
off_diag_args = list(size = 0.75)
|
|
)
|
|
|
|
|
|
```
|
|
|
|
```{r}
|
|
# check model estimates
|
|
#mu
|
|
plot(fit, pars=c("mu"), show_density =TRUE, ci_level=0.8)
|
|
plot(fit, pars=c("mu"), plotfun = "hist", bins=50)
|
|
```
|
|
|
|
```{r}
|
|
#sigma
|
|
plot(fit, pars=c("sigma"), show_density =TRUE, ci_level=0.8)
|
|
plot(fit, pars=c("sigma"), plotfun = "hist",bins=50)
|
|
|
|
```
|