Merge branch 'main' of ssh://git.youainti.com:3022/youainti/ClinicalTrialsEstimation

3 years ago · f2cfb776a0
parent 05b1b0c2ef 96319edca3
commit f2cfb776a0
9 changed files with 592 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -165,3 +165,4 @@ docs/site/
 Manifest.toml

 .Rproj.user
+*/*_files/*
--- a/r-analysis/.Rhistory
+++ b/r-analysis/.Rhistory
--- a/r-analysis/DataMangement.R
+++ b/r-analysis/DataMangement.R
@ -0,0 +1,150 @@
+library(bayesplot)
+available_mcmc(pattern = "_nuts_")
+library(ggplot2)
+
+library(rstan)
+#Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
+
+#save unchanged models instead of recompiling
+rstan_options(auto_write = TRUE)
+#allow for multithreaded sampling
+options(mc.cores = parallel::detectCores())
+
+#test installation, shouldn't get any errors
+#example(stan_model, package = "rstan", run.dontrun = TRUE)
+
+################ Pull data from database ######################
+library(RPostgreSQL)
+
+driver <- dbDriver("PostgreSQL")
+
+con <- dbConnect(
+    driver,
+    user='root',
+    password='root',
+    dbname='aact_db',
+    host='will-office'
+    )
+
+query <- dbSendQuery(
+    con,
+    "select * from formatted_data_with_planned_enrollment;"
+    )
+df <- fetch(query)
+df <- na.omit(df)
+
+query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic ;")
+n_categories <- fetch(query2)
+
+################ Format Data ###########################
+
+
+categories <- df["category_id"]
+
+x <- df["elapsed_duration"]
+x["log_n_brands"] <- log(df$n_brands + 1)
+x["h_sdi_val"] <- log(df$h_sdi_val + 1)
+x["hm_sdi_val"] <- log(df$hm_sdi_val + 1)
+x["m_sdi_val"] <- log(df$m_sdi_val + 1)
+x["lm_sdi_val"] <- log(df$lm_sdi_val + 1)
+x["l_sdi_val"] <- log(df$l_sdi_val + 1)
+x["enrollment"] <- df["current_enrollment"] / df["planned_enrollment"]
+#square terms
+#x["enrollment^2"] <- x["enrollment"]^2
+#x["elapsed_duration^2"] <- x["elapsed_duration"]^2
+#x["n_brands^2"] <- x["n_brands"]^2
+#break out
+x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0)
+x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0)
+x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0)
+x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0)
+
+y <- ifelse(df["final_status"]=="Terminated",1,0)
+
+################################# DATA EXPLORATION ############################
+
+#Plot terminated vs completed
+#Plot duration for terminated vs completed
+#Plot different times of
+
+################################# FIT #########################################
+#setup data (named list)
+trials_data <- list(
+    D = ncol(x),#
+    N = nrow(x),
+#    L = n_categories$count,
+    y = as.vector(y),
+#    ll = as.vector(categories$category_id),
+    x = as.matrix(x)
+)
+
+#fit <- stan(file='Logistic.stan', data = trials_data) #attempt at simple model
+fit <- stan(file='Hierarchal_Logistic.stan', data = trials_data)
+
+################################# ANALYZE #####################################
+print(fit)
+
+color_scheme_set("darkgray")
+#trace plots
+plot(fit, pars=c("mu"), plotfun="trace")
+plot(fit, pars=c("sigma"), plotfun="trace")
+
+#other diagnostics
+logpost <- log_posterior(fit)
+nuts_prmts <- nuts_params(fit)
+posterior <- as.array(fit)
+
+
+mcmc_parcoord(posterior,pars=c(
+    "mu[1]",
+    "mu[2]",
+    "mu[3]",
+    "mu[4]",
+    "mu[5]",
+    "mu[6]",
+    "mu[7]",
+    "mu[8]",
+    "mu[9]",
+    "mu[10]",
+    "mu[11]",
+    "mu[12]"
+    ),
+    np=nuts_prmts
+)
+
+mcmc_pairs(
+    posterior,
+    np = nuts_prmts,
+    pars=c(
+        "mu[1]",
+        "mu[2]",
+        "sigma[1]",
+        "sigma[2]"
+    ),
+    off_diag_args = list(size = 0.75)
+)
+
+mcmc_parcoord(posterior,pars=c(
+    "sigma[1]",
+    "sigma[2]",
+    "sigma[3]",
+    "sigma[4]",
+    "sigma[5]",
+    "sigma[6]",
+    "sigma[7]",
+    "sigma[8]",
+    "sigma[9]",
+    "sigma[10]",
+    "sigma[11]",
+    "sigma[12]"
+),
+np=nuts_prmts
+)
+# check model estimates
+#mu
+plot(fit, pars=c("mu"), show_density =TRUE, ci_level=0.8)
+plot(fit, pars=c("mu"), plotfun = "hist")
+
+#sigma
+plot(fit, pars=c("sigma"), show_density =TRUE, ci_level=0.8)
+plot(fit, pars=c("sigma"), plotfun = "hist")
--- a/r-analysis/EffectsOfMarketConditions.qmd
+++ b/r-analysis/EffectsOfMarketConditions.qmd
@ -0,0 +1,322 @@
+---
+title: "The Effects of market conditions on recruitment and completion of clinical trials"
+author: "Will King"
+format: html
+editor: source
+---
+
+```{r}
+library(bayesplot)
+available_mcmc(pattern = "_nuts_")
+library(ggplot2)
+
+library(rstan)
+#Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
+
+#save unchanged models instead of recompiling
+rstan_options(auto_write = TRUE)
+#allow for multithreaded sampling
+options(mc.cores = parallel::detectCores())
+
+#test installation, shouldn't get any errors
+#example(stan_model, package = "rstan", run.dontrun = TRUE)
+```
+
+```{r}
+################ Pull data from database ######################
+library(RPostgreSQL)
+
+driver <- dbDriver("PostgreSQL")
+
+get_data <- function(driver) {
+
+con <- dbConnect(
+    driver,
+    user='root',
+    password='root',
+    dbname='aact_db',
+    host='will-office'
+    )
+on.exit(dbDisconnect(con))
+
+query <- dbSendQuery(
+    con,
+    "select * from formatted_data_with_planned_enrollment;"
+    )
+df <- fetch(query, n = -1)
+df <- na.omit(df)
+
+query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;")
+n_categories <- fetch(query2, n = -1)
+
+return(list(data=df,ncat=n_categories))
+}
+
+d <- get_data(driver)
+df <- d$data
+n_categories <- d$ncat
+
+
+
+################ Format Data ###########################
+
+
+categories <- df["category_id"]
+
+x <- df["elapsed_duration"]
+x["log_n_brands"] <- asinh(df$n_brands) #todo - convert to arcsinh() transform
+x["h_sdi_val"] <- asinh(df$h_sdi_val)
+x["hm_sdi_val"] <- asinh(df$hm_sdi_val)
+x["m_sdi_val"] <- asinh(df$m_sdi_val)
+x["lm_sdi_val"] <- asinh(df$lm_sdi_val)
+x["l_sdi_val"] <- asinh(df$l_sdi_val)
+#x["enrollment"] <- df["current_enrollment"] / df["planned_enrollment"]
+#square terms
+#x["enrollment^2"] <- x["enrollment"]^2
+#x["elapsed_duration^2"] <- x["elapsed_duration"]^2
+#x["n_brands^2"] <- x["n_brands"]^2
+#break out
+#x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0)
+#x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0)
+#x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0)
+#x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0)
+
+y <- ifelse(df["final_status"]=="Terminated",1,0)
+```
+
+```{r}
+################################# DATA EXPLORATION ############################
+driver <- dbDriver("PostgreSQL")
+
+con <- dbConnect(
+    driver,
+    user='root',
+    password='root',
+    dbname='aact_db',
+    host='will-office'
+    )
+#Plot histogram of count of snapshots
+df3 <- dbGetQuery(
+    con,
+    "select nct_id,final_status,count(*) from formatted_data_with_planned_enrollment fdwpe 
+    group by nct_id,final_status ;"
+    )
+#df3 <- fetch(query3, n = -1)
+
+ggplot(data=df3, aes(x=count, fill=final_status)) + 
+    geom_histogram(binwidth=1) +
+    ggtitle("Histogram of snapshots per trial (matched trials)") +
+    xlab("Snapshots per trial")
+
+#Plot duration for terminated vs completed
+df4 <- dbGetQuery(
+    con,
+    "
+    select 
+        nct_id, 
+        start_date , 
+        primary_completion_date, 
+        overall_status ,
+        primary_completion_date - start_date as duration
+    from ctgov.studies s 
+    where nct_id in (select distinct nct_id from http.download_status ds)
+    ;"
+    )
+#df4 <- fetch(query4, n = -1)
+
+ggplot(data=df4, aes(x=duration,fill=overall_status)) +
+    geom_histogram()+
+    ggtitle("Histogram of trial durations") +
+    xlab("duration")+
+    facet_wrap(~overall_status)
+
+df5 <- dbGetQuery(
+    con,
+    "
+    with cte1 as (
+    select 
+        nct_id, 
+        start_date , 
+        primary_completion_date, 
+        overall_status ,
+        primary_completion_date - start_date as duration
+    from ctgov.studies s 
+    where nct_id in (select distinct nct_id from http.download_status ds)
+    ), cte2 as (
+    select nct_id,count(*) as snapshot_count from formatted_data_with_planned_enrollment fdwpe
+    group by nct_id
+    )
+    select a.nct_id, a.overall_status, a.duration,b.snapshot_count
+    from cte1 as a
+        join cte2 as b
+            on a.nct_id=b.nct_id
+    ;"
+    )
+#df5 <- fetch(query5, n = -1)
+
+ggplot(data=df5, aes(x=duration,y=snapshot_count,color=overall_status)) +
+    geom_point() +
+    ggtitle("duration, status, and snapshot_count") +
+    xlab("duration") +
+    ylab("snapshot count") 
+
+
+dbDisconnect(con)
+```
+
+```{r}
+################################# FIT #########################################
+
+#setup data (named list)
+trials_data <- list(
+    D = ncol(x),#
+    N = nrow(x),
+    L = n_categories$count,
+    y = as.vector(y),
+    ll = as.vector(categories$category_id),
+    x = as.matrix(x)
+)
+
+#fit <- stan(file='Logistic.stan', data = trials_data) #attempt at simple model
+fit <- stan(
+    file='Hierarchal_Logistic.stan', 
+    data = trials_data,
+    chains = 6,
+    iter = 5000,
+    seed = 11021585
+    )
+```
+
+
+```{r}
+################################# ANALYZE #####################################
+print(fit)
+```
+
+```{r}
+color_scheme_set("darkgray")
+#trace plots
+plot(fit, pars=c("mu"), plotfun="trace")
+color_scheme_set("viridis")
+mcmc_rank_overlay(
+    fit, 
+    pars=c(
+        "mu[1]",
+        "mu[2]",
+        "mu[3]",
+        "mu[4]",
+        "mu[5]",
+        "mu[6]",
+        "mu[7]"
+        ), 
+    n_bins=100
+    )+  legend_move("top")
+```
+
+```{r}
+color_scheme_set("viridis")
+plot(fit, pars=c("sigma"), plotfun="trace")
+color_scheme_set("viridis")
+mcmc_rank_overlay(
+    fit, 
+    pars=c(
+        "sigma[1]",
+        "sigma[2]",
+        "sigma[3]",
+        "sigma[4]",
+        "sigma[5]",
+        "sigma[6]",
+        "sigma[7]"
+        ), 
+    n_bins=100
+    )+  legend_move("top")
+```
+
+```{r}
+#other diagnostics
+logpost <- log_posterior(fit)
+nuts_prmts <- nuts_params(fit)
+posterior <- as.array(fit)
+
+```
+
+```{r}
+color_scheme_set("darkgray")
+mcmc_parcoord(posterior,pars=c(
+    "mu[1]",
+    "mu[2]",
+    "mu[3]",
+    "mu[4]",
+    "mu[5]",
+    "mu[6]",
+    "mu[7]"
+    ),
+    np=nuts_prmts
+)
+```
+
+```{r}
+mcmc_pairs(
+    posterior,
+    np = nuts_prmts,
+    pars=c(
+        "mu[1]",
+        "mu[2]",
+        "mu[3]",
+        "mu[4]",
+        "mu[5]",
+        "mu[6]",
+        "mu[7]"
+    ),
+    off_diag_args = list(size = 0.75)
+)
+
+
+```
+
+```{r}
+mcmc_parcoord(posterior,pars=c(
+    "sigma[1]",
+    "sigma[2]",
+    "sigma[3]",
+    "sigma[4]",
+    "sigma[5]",
+    "sigma[6]",
+    "sigma[7]"
+),
+np=nuts_prmts
+)
+```
+
+```{r}
+mcmc_pairs(
+    posterior,
+    np = nuts_prmts,
+    pars=c(
+        "sigma[1]",
+        "sigma[2]",
+        "sigma[3]",
+        "sigma[4]",
+        "sigma[5]",
+        "sigma[6]",
+        "sigma[7]"
+    ),
+    off_diag_args = list(size = 0.75)
+)
+
+
+```
+
+```{r}
+# check model estimates
+#mu
+plot(fit, pars=c("mu"), show_density =TRUE, ci_level=0.8)
+plot(fit, pars=c("mu"), plotfun = "hist", bins=50)
+```
+
+```{r}
+#sigma
+plot(fit, pars=c("sigma"), show_density =TRUE, ci_level=0.8)
+plot(fit, pars=c("sigma"), plotfun = "hist",bins=50)
+
+```
--- a/r-analysis/Hierarchal_Logistic.rds
+++ b/r-analysis/Hierarchal_Logistic.rds
--- a/r-analysis/Hierarchal_Logistic.stan
+++ b/r-analysis/Hierarchal_Logistic.stan
@ -0,0 +1,40 @@
+//
+// This Stan program defines a simple model, with a
+// vector of values 'y' modeled as normally distributed
+// with mean 'mu' and standard deviation 'sigma'.
+//
+// Learn more about model development with Stan at:
+//
+//    http://mc-stan.org/users/interfaces/rstan.html
+//    https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
+//
+
+// The input data is a vector 'y' of length 'N'.
+data {
+  int<lower=1> D;
+  int<lower=1> N;
+  int<lower=1> L;
+  int<lower=0, upper=1> y[N];
+  int<lower=1, upper=L> ll[N];
+  row_vector[D] x[N];
+}
+parameters {
+  real mu[D];
+  real<lower=0> sigma[D];
+  vector[D] beta[L];
+}
+model {
+  sigma ~ gamma(2,1); //shape and inverse scale
+  mu ~ normal(0, 1); //convert to mvnormal
+  for (l in 1:L) {
+    beta[l] ~ normal(mu, sigma);
+  }
+  {
+    vector[N] x_beta_ll;
+    for (n in 1:N) {
+      x_beta_ll[n] = x[n] * beta[ll[n]];
+    }
+    y ~ bernoulli_logit(x_beta_ll);
+  }
+}
+
--- a/r-analysis/Logistic.stan
+++ b/r-analysis/Logistic.stan
@ -0,0 +1,30 @@
+//
+// This Stan program defines a simple model, with a
+// vector of values 'y' modeled as normally distributed
+// with mean 'mu' and standard deviation 'sigma'.
+//
+// Learn more about model development with Stan at:
+//
+//    http://mc-stan.org/users/interfaces/rstan.html
+//    https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
+//
+
+// The input data is a vector 'y' of length 'N'.
+data {
+  int<lower=1> D;
+  int<lower=1> N;
+  int<lower=0, upper=1> y[N];
+  row_vector[D] x[N];
+}
+parameters {
+  real mu[D];
+  real<lower=0> sigma[D];
+}
+model {
+  sigma ~ gamma(2,0.1);
+  mu ~ normal(0, sigma); //convert to mvnormal
+
+  for (n in 1:N) {
+    y[n] ~ bernoulli_logit(x[n] * mu);
+  }
+}
--- a/r-analysis/basic_logit.stan
+++ b/r-analysis/basic_logit.stan
@ -0,0 +1,32 @@
+//
+// This Stan program defines a simple model, with a
+// vector of values 'y' modeled as normally distributed
+// with mean 'mu' and standard deviation 'sigma'.
+//
+// Learn more about model development with Stan at:
+//
+//    http://mc-stan.org/users/interfaces/rstan.html
+//    https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
+//
+
+// The input data is a vector 'y' of length 'N'.
+data {
+  int<lower=0> N;
+  int<lower=0> k;
+  matrix[N,k] X;
+  vector[N] int<lower=0, upper=1> y;
+}
+
+// The parameters accepted by the model. Our model
+// accepts two parameters 'mu' and 'sigma'.
+parameters {
+  vector[k] beta;
+}
+
+// The model to be estimated. We model the output
+// 'y' to be normally distributed with mean 'mu'
+// and standard deviation 'sigma'.
+model {
+  y ~ bernoulli_logit( X * beta);
+}
+
--- a/r-analysis/r-analysis.Rproj
+++ b/r-analysis/r-analysis.Rproj
@ -0,0 +1,17 @@
+Version: 1.0
+
+RestoreWorkspace: Yes
+SaveWorkspace: Ask
+AlwaysSaveHistory: Yes
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 4
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX
+
+AutoAppendNewline: Yes
+StripTrailingWhitespace: Yes
+LineEndingConversion: Native