You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ClinicalTrialsEstimation/r-analysis/EffectsOfMarketConditions.qmd

323 lines
6.7 KiB
Plaintext

---
title: "The Effects of market conditions on recruitment and completion of clinical trials"
author: "Will King"
format: html
editor: source
---
```{r}
library(bayesplot)
available_mcmc(pattern = "_nuts_")
library(ggplot2)
library(rstan)
#Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
#save unchanged models instead of recompiling
rstan_options(auto_write = TRUE)
#allow for multithreaded sampling
options(mc.cores = parallel::detectCores())
#test installation, shouldn't get any errors
#example(stan_model, package = "rstan", run.dontrun = TRUE)
```
```{r}
################ Pull data from database ######################
library(RPostgreSQL)
driver <- dbDriver("PostgreSQL")
get_data <- function(driver) {
con <- dbConnect(
driver,
user='root',
password='root',
dbname='aact_db',
host='will-office'
)
on.exit(dbDisconnect(con))
query <- dbSendQuery(
con,
"select * from formatted_data_with_planned_enrollment;"
)
df <- fetch(query, n = -1)
df <- na.omit(df)
query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;")
n_categories <- fetch(query2, n = -1)
return(list(data=df,ncat=n_categories))
}
d <- get_data(driver)
df <- d$data
n_categories <- d$ncat
################ Format Data ###########################
categories <- df["category_id"]
x <- df["elapsed_duration"]
x["log_n_brands"] <- asinh(df$n_brands) #todo - convert to arcsinh() transform
x["h_sdi_val"] <- asinh(df$h_sdi_val)
x["hm_sdi_val"] <- asinh(df$hm_sdi_val)
x["m_sdi_val"] <- asinh(df$m_sdi_val)
x["lm_sdi_val"] <- asinh(df$lm_sdi_val)
x["l_sdi_val"] <- asinh(df$l_sdi_val)
#x["enrollment"] <- df["current_enrollment"] / df["planned_enrollment"]
#square terms
#x["enrollment^2"] <- x["enrollment"]^2
#x["elapsed_duration^2"] <- x["elapsed_duration"]^2
#x["n_brands^2"] <- x["n_brands"]^2
#break out
#x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0)
#x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0)
#x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0)
#x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0)
y <- ifelse(df["final_status"]=="Terminated",1,0)
```
```{r}
################################# DATA EXPLORATION ############################
driver <- dbDriver("PostgreSQL")
con <- dbConnect(
driver,
user='root',
password='root',
dbname='aact_db',
host='will-office'
)
#Plot histogram of count of snapshots
df3 <- dbGetQuery(
con,
"select nct_id,final_status,count(*) from formatted_data_with_planned_enrollment fdwpe
group by nct_id,final_status ;"
)
#df3 <- fetch(query3, n = -1)
ggplot(data=df3, aes(x=count, fill=final_status)) +
geom_histogram(binwidth=1) +
ggtitle("Histogram of snapshots per trial (matched trials)") +
xlab("Snapshots per trial")
#Plot duration for terminated vs completed
df4 <- dbGetQuery(
con,
"
select
nct_id,
start_date ,
primary_completion_date,
overall_status ,
primary_completion_date - start_date as duration
from ctgov.studies s
where nct_id in (select distinct nct_id from http.download_status ds)
;"
)
#df4 <- fetch(query4, n = -1)
ggplot(data=df4, aes(x=duration,fill=overall_status)) +
geom_histogram()+
ggtitle("Histogram of trial durations") +
xlab("duration")+
facet_wrap(~overall_status)
df5 <- dbGetQuery(
con,
"
with cte1 as (
select
nct_id,
start_date ,
primary_completion_date,
overall_status ,
primary_completion_date - start_date as duration
from ctgov.studies s
where nct_id in (select distinct nct_id from http.download_status ds)
), cte2 as (
select nct_id,count(*) as snapshot_count from formatted_data_with_planned_enrollment fdwpe
group by nct_id
)
select a.nct_id, a.overall_status, a.duration,b.snapshot_count
from cte1 as a
join cte2 as b
on a.nct_id=b.nct_id
;"
)
#df5 <- fetch(query5, n = -1)
ggplot(data=df5, aes(x=duration,y=snapshot_count,color=overall_status)) +
geom_point() +
ggtitle("duration, status, and snapshot_count") +
xlab("duration") +
ylab("snapshot count")
dbDisconnect(con)
```
```{r}
################################# FIT #########################################
#setup data (named list)
trials_data <- list(
D = ncol(x),#
N = nrow(x),
L = n_categories$count,
y = as.vector(y),
ll = as.vector(categories$category_id),
x = as.matrix(x)
)
#fit <- stan(file='Logistic.stan', data = trials_data) #attempt at simple model
fit <- stan(
file='Hierarchal_Logistic.stan',
data = trials_data,
chains = 6,
iter = 5000,
seed = 11021585
)
```
```{r}
################################# ANALYZE #####################################
print(fit)
```
```{r}
color_scheme_set("darkgray")
#trace plots
plot(fit, pars=c("mu"), plotfun="trace")
color_scheme_set("viridis")
mcmc_rank_overlay(
fit,
pars=c(
"mu[1]",
"mu[2]",
"mu[3]",
"mu[4]",
"mu[5]",
"mu[6]",
"mu[7]"
),
n_bins=100
)+ legend_move("top")
```
```{r}
color_scheme_set("viridis")
plot(fit, pars=c("sigma"), plotfun="trace")
color_scheme_set("viridis")
mcmc_rank_overlay(
fit,
pars=c(
"sigma[1]",
"sigma[2]",
"sigma[3]",
"sigma[4]",
"sigma[5]",
"sigma[6]",
"sigma[7]"
),
n_bins=100
)+ legend_move("top")
```
```{r}
#other diagnostics
logpost <- log_posterior(fit)
nuts_prmts <- nuts_params(fit)
posterior <- as.array(fit)
```
```{r}
color_scheme_set("darkgray")
mcmc_parcoord(posterior,pars=c(
"mu[1]",
"mu[2]",
"mu[3]",
"mu[4]",
"mu[5]",
"mu[6]",
"mu[7]"
),
np=nuts_prmts
)
```
```{r}
mcmc_pairs(
posterior,
np = nuts_prmts,
pars=c(
"mu[1]",
"mu[2]",
"mu[3]",
"mu[4]",
"mu[5]",
"mu[6]",
"mu[7]"
),
off_diag_args = list(size = 0.75)
)
```
```{r}
mcmc_parcoord(posterior,pars=c(
"sigma[1]",
"sigma[2]",
"sigma[3]",
"sigma[4]",
"sigma[5]",
"sigma[6]",
"sigma[7]"
),
np=nuts_prmts
)
```
```{r}
mcmc_pairs(
posterior,
np = nuts_prmts,
pars=c(
"sigma[1]",
"sigma[2]",
"sigma[3]",
"sigma[4]",
"sigma[5]",
"sigma[6]",
"sigma[7]"
),
off_diag_args = list(size = 0.75)
)
```
```{r}
# check model estimates
#mu
plot(fit, pars=c("mu"), show_density =TRUE, ci_level=0.8)
plot(fit, pars=c("mu"), plotfun = "hist", bins=50)
```
```{r}
#sigma
plot(fit, pars=c("sigma"), show_density =TRUE, ci_level=0.8)
plot(fit, pars=c("sigma"), plotfun = "hist",bins=50)
```