Used for version 1.0.0 analysis

check_reversion
youainti 3 years ago
parent 8a8c49f9ca
commit 4551fba165

@ -0,0 +1,150 @@
library(bayesplot)
available_mcmc(pattern = "_nuts_")
library(ggplot2)
library(rstan)
#Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
#save unchanged models instead of recompiling
rstan_options(auto_write = TRUE)
#allow for multithreaded sampling
options(mc.cores = parallel::detectCores())
#test installation, shouldn't get any errors
#example(stan_model, package = "rstan", run.dontrun = TRUE)
################ Pull data from database ######################
library(RPostgreSQL)
driver <- dbDriver("PostgreSQL")
con <- dbConnect(
driver,
user='root',
password='root',
dbname='aact_db',
host='will-office'
)
query <- dbSendQuery(
con,
"select * from formatted_data_with_planned_enrollment;"
)
df <- fetch(query)
df <- na.omit(df)
query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic ;")
n_categories <- fetch(query2)
################ Format Data ###########################
categories <- df["category_id"]
x <- df["elapsed_duration"]
x["log_n_brands"] <- log(df$n_brands + 1)
x["h_sdi_val"] <- log(df$h_sdi_val + 1)
x["hm_sdi_val"] <- log(df$hm_sdi_val + 1)
x["m_sdi_val"] <- log(df$m_sdi_val + 1)
x["lm_sdi_val"] <- log(df$lm_sdi_val + 1)
x["l_sdi_val"] <- log(df$l_sdi_val + 1)
x["enrollment"] <- df["current_enrollment"] / df["planned_enrollment"]
#square terms
#x["enrollment^2"] <- x["enrollment"]^2
#x["elapsed_duration^2"] <- x["elapsed_duration"]^2
#x["n_brands^2"] <- x["n_brands"]^2
#break out
x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0)
x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0)
x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0)
x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0)
y <- ifelse(df["final_status"]=="Terminated",1,0)
################################# DATA EXPLORATION ############################
#Plot terminated vs completed
#Plot duration for terminated vs completed
#Plot different times of
################################# FIT #########################################
#setup data (named list)
trials_data <- list(
D = ncol(x),#
N = nrow(x),
# L = n_categories$count,
y = as.vector(y),
# ll = as.vector(categories$category_id),
x = as.matrix(x)
)
#fit <- stan(file='Logistic.stan', data = trials_data) #attempt at simple model
fit <- stan(file='Hierarchal_Logistic.stan', data = trials_data)
################################# ANALYZE #####################################
print(fit)
color_scheme_set("darkgray")
#trace plots
plot(fit, pars=c("mu"), plotfun="trace")
plot(fit, pars=c("sigma"), plotfun="trace")
#other diagnostics
logpost <- log_posterior(fit)
nuts_prmts <- nuts_params(fit)
posterior <- as.array(fit)
mcmc_parcoord(posterior,pars=c(
"mu[1]",
"mu[2]",
"mu[3]",
"mu[4]",
"mu[5]",
"mu[6]",
"mu[7]",
"mu[8]",
"mu[9]",
"mu[10]",
"mu[11]",
"mu[12]"
),
np=nuts_prmts
)
mcmc_pairs(
posterior,
np = nuts_prmts,
pars=c(
"mu[1]",
"mu[2]",
"sigma[1]",
"sigma[2]"
),
off_diag_args = list(size = 0.75)
)
mcmc_parcoord(posterior,pars=c(
"sigma[1]",
"sigma[2]",
"sigma[3]",
"sigma[4]",
"sigma[5]",
"sigma[6]",
"sigma[7]",
"sigma[8]",
"sigma[9]",
"sigma[10]",
"sigma[11]",
"sigma[12]"
),
np=nuts_prmts
)
# check model estimates
#mu
plot(fit, pars=c("mu"), show_density =TRUE, ci_level=0.8)
plot(fit, pars=c("mu"), plotfun = "hist")
#sigma
plot(fit, pars=c("sigma"), show_density =TRUE, ci_level=0.8)
plot(fit, pars=c("sigma"), plotfun = "hist")

@ -0,0 +1,322 @@
---
title: "The Effects of market conditions on recruitment and completion of clinical trials"
author: "Will King"
format: html
editor: source
---
```{r}
library(bayesplot)
available_mcmc(pattern = "_nuts_")
library(ggplot2)
library(rstan)
#Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
#save unchanged models instead of recompiling
rstan_options(auto_write = TRUE)
#allow for multithreaded sampling
options(mc.cores = parallel::detectCores())
#test installation, shouldn't get any errors
#example(stan_model, package = "rstan", run.dontrun = TRUE)
```
```{r}
################ Pull data from database ######################
library(RPostgreSQL)
driver <- dbDriver("PostgreSQL")
get_data <- function(driver) {
con <- dbConnect(
driver,
user='root',
password='root',
dbname='aact_db',
host='will-office'
)
on.exit(dbDisconnect(con))
query <- dbSendQuery(
con,
"select * from formatted_data_with_planned_enrollment;"
)
df <- fetch(query, n = -1)
df <- na.omit(df)
query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;")
n_categories <- fetch(query2, n = -1)
return(list(data=df,ncat=n_categories))
}
d <- get_data(driver)
df <- d$data
n_categories <- d$ncat
################ Format Data ###########################
categories <- df["category_id"]
x <- df["elapsed_duration"]
x["log_n_brands"] <- asinh(df$n_brands) #todo - convert to arcsinh() transform
x["h_sdi_val"] <- asinh(df$h_sdi_val)
x["hm_sdi_val"] <- asinh(df$hm_sdi_val)
x["m_sdi_val"] <- asinh(df$m_sdi_val)
x["lm_sdi_val"] <- asinh(df$lm_sdi_val)
x["l_sdi_val"] <- asinh(df$l_sdi_val)
#x["enrollment"] <- df["current_enrollment"] / df["planned_enrollment"]
#square terms
#x["enrollment^2"] <- x["enrollment"]^2
#x["elapsed_duration^2"] <- x["elapsed_duration"]^2
#x["n_brands^2"] <- x["n_brands"]^2
#break out
#x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0)
#x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0)
#x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0)
#x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0)
y <- ifelse(df["final_status"]=="Terminated",1,0)
```
```{r}
################################# DATA EXPLORATION ############################
driver <- dbDriver("PostgreSQL")
con <- dbConnect(
driver,
user='root',
password='root',
dbname='aact_db',
host='will-office'
)
#Plot histogram of count of snapshots
df3 <- dbGetQuery(
con,
"select nct_id,final_status,count(*) from formatted_data_with_planned_enrollment fdwpe
group by nct_id,final_status ;"
)
#df3 <- fetch(query3, n = -1)
ggplot(data=df3, aes(x=count, fill=final_status)) +
geom_histogram(binwidth=1) +
ggtitle("Histogram of snapshots per trial (matched trials)") +
xlab("Snapshots per trial")
#Plot duration for terminated vs completed
df4 <- dbGetQuery(
con,
"
select
nct_id,
start_date ,
primary_completion_date,
overall_status ,
primary_completion_date - start_date as duration
from ctgov.studies s
where nct_id in (select distinct nct_id from http.download_status ds)
;"
)
#df4 <- fetch(query4, n = -1)
ggplot(data=df4, aes(x=duration,fill=overall_status)) +
geom_histogram()+
ggtitle("Histogram of trial durations") +
xlab("duration")+
facet_wrap(~overall_status)
df5 <- dbGetQuery(
con,
"
with cte1 as (
select
nct_id,
start_date ,
primary_completion_date,
overall_status ,
primary_completion_date - start_date as duration
from ctgov.studies s
where nct_id in (select distinct nct_id from http.download_status ds)
), cte2 as (
select nct_id,count(*) as snapshot_count from formatted_data_with_planned_enrollment fdwpe
group by nct_id
)
select a.nct_id, a.overall_status, a.duration,b.snapshot_count
from cte1 as a
join cte2 as b
on a.nct_id=b.nct_id
;"
)
#df5 <- fetch(query5, n = -1)
ggplot(data=df5, aes(x=duration,y=snapshot_count,color=overall_status)) +
geom_point() +
ggtitle("duration, status, and snapshot_count") +
xlab("duration") +
ylab("snapshot count")
dbDisconnect(con)
```
```{r}
################################# FIT #########################################
#setup data (named list)
trials_data <- list(
D = ncol(x),#
N = nrow(x),
L = n_categories$count,
y = as.vector(y),
ll = as.vector(categories$category_id),
x = as.matrix(x)
)
#fit <- stan(file='Logistic.stan', data = trials_data) #attempt at simple model
fit <- stan(
file='Hierarchal_Logistic.stan',
data = trials_data,
chains = 6,
iter = 5000,
seed = 11021585
)
```
```{r}
################################# ANALYZE #####################################
print(fit)
```
```{r}
color_scheme_set("darkgray")
#trace plots
plot(fit, pars=c("mu"), plotfun="trace")
color_scheme_set("viridis")
mcmc_rank_overlay(
fit,
pars=c(
"mu[1]",
"mu[2]",
"mu[3]",
"mu[4]",
"mu[5]",
"mu[6]",
"mu[7]"
),
n_bins=100
)+ legend_move("top")
```
```{r}
color_scheme_set("viridis")
plot(fit, pars=c("sigma"), plotfun="trace")
color_scheme_set("viridis")
mcmc_rank_overlay(
fit,
pars=c(
"sigma[1]",
"sigma[2]",
"sigma[3]",
"sigma[4]",
"sigma[5]",
"sigma[6]",
"sigma[7]"
),
n_bins=100
)+ legend_move("top")
```
```{r}
#other diagnostics
logpost <- log_posterior(fit)
nuts_prmts <- nuts_params(fit)
posterior <- as.array(fit)
```
```{r}
color_scheme_set("darkgray")
mcmc_parcoord(posterior,pars=c(
"mu[1]",
"mu[2]",
"mu[3]",
"mu[4]",
"mu[5]",
"mu[6]",
"mu[7]"
),
np=nuts_prmts
)
```
```{r}
mcmc_pairs(
posterior,
np = nuts_prmts,
pars=c(
"mu[1]",
"mu[2]",
"mu[3]",
"mu[4]",
"mu[5]",
"mu[6]",
"mu[7]"
),
off_diag_args = list(size = 0.75)
)
```
```{r}
mcmc_parcoord(posterior,pars=c(
"sigma[1]",
"sigma[2]",
"sigma[3]",
"sigma[4]",
"sigma[5]",
"sigma[6]",
"sigma[7]"
),
np=nuts_prmts
)
```
```{r}
mcmc_pairs(
posterior,
np = nuts_prmts,
pars=c(
"sigma[1]",
"sigma[2]",
"sigma[3]",
"sigma[4]",
"sigma[5]",
"sigma[6]",
"sigma[7]"
),
off_diag_args = list(size = 0.75)
)
```
```{r}
# check model estimates
#mu
plot(fit, pars=c("mu"), show_density =TRUE, ci_level=0.8)
plot(fit, pars=c("mu"), plotfun = "hist", bins=50)
```
```{r}
#sigma
plot(fit, pars=c("sigma"), show_density =TRUE, ci_level=0.8)
plot(fit, pars=c("sigma"), plotfun = "hist",bins=50)
```

@ -0,0 +1,40 @@
//
// This Stan program defines a simple model, with a
// vector of values 'y' modeled as normally distributed
// with mean 'mu' and standard deviation 'sigma'.
//
// Learn more about model development with Stan at:
//
// http://mc-stan.org/users/interfaces/rstan.html
// https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
//
// The input data is a vector 'y' of length 'N'.
data {
int<lower=1> D;
int<lower=1> N;
int<lower=1> L;
int<lower=0, upper=1> y[N];
int<lower=1, upper=L> ll[N];
row_vector[D] x[N];
}
parameters {
real mu[D];
real<lower=0> sigma[D];
vector[D] beta[L];
}
model {
sigma ~ gamma(2,1); //shape and inverse scale
mu ~ normal(0, 1); //convert to mvnormal
for (l in 1:L) {
beta[l] ~ normal(mu, sigma);
}
{
vector[N] x_beta_ll;
for (n in 1:N) {
x_beta_ll[n] = x[n] * beta[ll[n]];
}
y ~ bernoulli_logit(x_beta_ll);
}
}

@ -0,0 +1,30 @@
//
// This Stan program defines a simple model, with a
// vector of values 'y' modeled as normally distributed
// with mean 'mu' and standard deviation 'sigma'.
//
// Learn more about model development with Stan at:
//
// http://mc-stan.org/users/interfaces/rstan.html
// https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
//
// The input data is a vector 'y' of length 'N'.
data {
int<lower=1> D;
int<lower=1> N;
int<lower=0, upper=1> y[N];
row_vector[D] x[N];
}
parameters {
real mu[D];
real<lower=0> sigma[D];
}
model {
sigma ~ gamma(2,0.1);
mu ~ normal(0, sigma); //convert to mvnormal
for (n in 1:N) {
y[n] ~ bernoulli_logit(x[n] * mu);
}
}

@ -0,0 +1,32 @@
//
// This Stan program defines a simple model, with a
// vector of values 'y' modeled as normally distributed
// with mean 'mu' and standard deviation 'sigma'.
//
// Learn more about model development with Stan at:
//
// http://mc-stan.org/users/interfaces/rstan.html
// https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
//
// The input data is a vector 'y' of length 'N'.
data {
int<lower=0> N;
int<lower=0> k;
matrix[N,k] X;
vector[N] int<lower=0, upper=1> y;
}
// The parameters accepted by the model. Our model
// accepts two parameters 'mu' and 'sigma'.
parameters {
vector[k] beta;
}
// The model to be estimated. We model the output
// 'y' to be normally distributed with mean 'mu'
// and standard deviation 'sigma'.
model {
y ~ bernoulli_logit( X * beta);
}

@ -0,0 +1,17 @@
Version: 1.0
RestoreWorkspace: Yes
SaveWorkspace: Ask
AlwaysSaveHistory: Yes
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 4
Encoding: UTF-8
RnwWeave: Sweave
LaTeX: pdfLaTeX
AutoAppendNewline: Yes
StripTrailingWhitespace: Yes
LineEndingConversion: Native
Loading…
Cancel
Save