Used for version 1.0.0 analysis

3 years ago · 4551fba165
parent 8a8c49f9ca
commit 4551fba165
8 changed files with 591 additions and 0 deletions
--- a/r-analysis/.Rhistory
+++ b/r-analysis/.Rhistory
--- a/r-analysis/DataMangement.R
+++ b/r-analysis/DataMangement.R
@ -0,0 +1,150 @@
 library(bayesplot)
 available_mcmc(pattern = "_nuts_")
 library(ggplot2)
 library(rstan)
 #Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
 #save unchanged models instead of recompiling
 rstan_options(auto_write = TRUE)
 #allow for multithreaded sampling
 options(mc.cores = parallel::detectCores())
 #test installation, shouldn't get any errors
 #example(stan_model, package = "rstan", run.dontrun = TRUE)
 ################ Pull data from database ######################
 library(RPostgreSQL)
 driver <- dbDriver("PostgreSQL")
 con <- dbConnect(
    driver,
    user='root',
    password='root',
    dbname='aact_db',
    host='will-office'
    )
 query <- dbSendQuery(
    con,
    "select * from formatted_data_with_planned_enrollment;"
    )
 df <- fetch(query)
 df <- na.omit(df)
 query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic ;")
 n_categories <- fetch(query2)
 ################ Format Data ###########################
 categories <- df["category_id"]
 x <- df["elapsed_duration"]
 x["log_n_brands"] <- log(df$n_brands + 1)
 x["h_sdi_val"] <- log(df$h_sdi_val + 1)
 x["hm_sdi_val"] <- log(df$hm_sdi_val + 1)
 x["m_sdi_val"] <- log(df$m_sdi_val + 1)
 x["lm_sdi_val"] <- log(df$lm_sdi_val + 1)
 x["l_sdi_val"] <- log(df$l_sdi_val + 1)
 x["enrollment"] <- df["current_enrollment"] / df["planned_enrollment"]
 #square terms
 #x["enrollment^2"] <- x["enrollment"]^2
 #x["elapsed_duration^2"] <- x["elapsed_duration"]^2
 #x["n_brands^2"] <- x["n_brands"]^2
 #break out
 x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0)
 x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0)
 x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0)
 x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0)
 y <- ifelse(df["final_status"]=="Terminated",1,0)
 ################################# DATA EXPLORATION ############################
 #Plot terminated vs completed
 #Plot duration for terminated vs completed
 #Plot different times of
 ################################# FIT #########################################
 #setup data (named list)
 trials_data <- list(
    D = ncol(x),#
    N = nrow(x),
 #    L = n_categories$count,
    y = as.vector(y),
 #    ll = as.vector(categories$category_id),
    x = as.matrix(x)
 )
 #fit <- stan(file='Logistic.stan', data = trials_data) #attempt at simple model
 fit <- stan(file='Hierarchal_Logistic.stan', data = trials_data)
 ################################# ANALYZE #####################################
 print(fit)
 color_scheme_set("darkgray")
 #trace plots
 plot(fit, pars=c("mu"), plotfun="trace")
 plot(fit, pars=c("sigma"), plotfun="trace")
 #other diagnostics
 logpost <- log_posterior(fit)
 nuts_prmts <- nuts_params(fit)
 posterior <- as.array(fit)
 mcmc_parcoord(posterior,pars=c(
    "mu[1]",
    "mu[2]",
    "mu[3]",
    "mu[4]",
    "mu[5]",
    "mu[6]",
    "mu[7]",
    "mu[8]",
    "mu[9]",
    "mu[10]",
    "mu[11]",
    "mu[12]"
    ),
    np=nuts_prmts
 )
 mcmc_pairs(
    posterior,
    np = nuts_prmts,
    pars=c(
        "mu[1]",
        "mu[2]",
        "sigma[1]",
        "sigma[2]"
    ),
    off_diag_args = list(size = 0.75)
 )
 mcmc_parcoord(posterior,pars=c(
    "sigma[1]",
    "sigma[2]",
    "sigma[3]",
    "sigma[4]",
    "sigma[5]",
    "sigma[6]",
    "sigma[7]",
    "sigma[8]",
    "sigma[9]",
    "sigma[10]",
    "sigma[11]",
    "sigma[12]"
 ),
 np=nuts_prmts
 )
 # check model estimates
 #mu
 plot(fit, pars=c("mu"), show_density =TRUE, ci_level=0.8)
 plot(fit, pars=c("mu"), plotfun = "hist")
 #sigma
 plot(fit, pars=c("sigma"), show_density =TRUE, ci_level=0.8)
 plot(fit, pars=c("sigma"), plotfun = "hist")
--- a/r-analysis/EffectsOfMarketConditions.qmd
+++ b/r-analysis/EffectsOfMarketConditions.qmd
@ -0,0 +1,322 @@
 ---
 title: "The Effects of market conditions on recruitment and completion of clinical trials"
 author: "Will King"
 format: html
 editor: source
 ---
 ```{r}
 library(bayesplot)
 available_mcmc(pattern = "_nuts_")
 library(ggplot2)
 library(rstan)
 #Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
 #save unchanged models instead of recompiling
 rstan_options(auto_write = TRUE)
 #allow for multithreaded sampling
 options(mc.cores = parallel::detectCores())
 #test installation, shouldn't get any errors
 #example(stan_model, package = "rstan", run.dontrun = TRUE)
 ```
 ```{r}
 ################ Pull data from database ######################
 library(RPostgreSQL)
 driver <- dbDriver("PostgreSQL")
 get_data <- function(driver) {
 con <- dbConnect(
    driver,
    user='root',
    password='root',
    dbname='aact_db',
    host='will-office'
    )
 on.exit(dbDisconnect(con))
 query <- dbSendQuery(
    con,
    "select * from formatted_data_with_planned_enrollment;"
    )
 df <- fetch(query, n = -1)
 df <- na.omit(df)
 query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;")
 n_categories <- fetch(query2, n = -1)
 return(list(data=df,ncat=n_categories))
 }
 d <- get_data(driver)
 df <- d$data
 n_categories <- d$ncat
 ################ Format Data ###########################
 categories <- df["category_id"]
 x <- df["elapsed_duration"]
 x["log_n_brands"] <- asinh(df$n_brands) #todo - convert to arcsinh() transform
 x["h_sdi_val"] <- asinh(df$h_sdi_val)
 x["hm_sdi_val"] <- asinh(df$hm_sdi_val)
 x["m_sdi_val"] <- asinh(df$m_sdi_val)
 x["lm_sdi_val"] <- asinh(df$lm_sdi_val)
 x["l_sdi_val"] <- asinh(df$l_sdi_val)
 #x["enrollment"] <- df["current_enrollment"] / df["planned_enrollment"]
 #square terms
 #x["enrollment^2"] <- x["enrollment"]^2
 #x["elapsed_duration^2"] <- x["elapsed_duration"]^2
 #x["n_brands^2"] <- x["n_brands"]^2
 #break out
 #x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0)
 #x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0)
 #x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0)
 #x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0)
 y <- ifelse(df["final_status"]=="Terminated",1,0)
 ```
 ```{r}
 ################################# DATA EXPLORATION ############################
 driver <- dbDriver("PostgreSQL")
 con <- dbConnect(
    driver,
    user='root',
    password='root',
    dbname='aact_db',
    host='will-office'
    )
 #Plot histogram of count of snapshots
 df3 <- dbGetQuery(
    con,
    "select nct_id,final_status,count(*) from formatted_data_with_planned_enrollment fdwpe 
    group by nct_id,final_status ;"
    )
 #df3 <- fetch(query3, n = -1)
 ggplot(data=df3, aes(x=count, fill=final_status)) + 
    geom_histogram(binwidth=1) +
    ggtitle("Histogram of snapshots per trial (matched trials)") +
    xlab("Snapshots per trial")
 #Plot duration for terminated vs completed
 df4 <- dbGetQuery(
    con,
    "
    select 
        nct_id, 
        start_date , 
        primary_completion_date, 
        overall_status ,
        primary_completion_date - start_date as duration
    from ctgov.studies s 
    where nct_id in (select distinct nct_id from http.download_status ds)
    ;"
    )
 #df4 <- fetch(query4, n = -1)
 ggplot(data=df4, aes(x=duration,fill=overall_status)) +
    geom_histogram()+
    ggtitle("Histogram of trial durations") +
    xlab("duration")+
    facet_wrap(~overall_status)
 df5 <- dbGetQuery(
    con,
    "
    with cte1 as (
    select 
        nct_id, 
        start_date , 
        primary_completion_date, 
        overall_status ,
        primary_completion_date - start_date as duration
    from ctgov.studies s 
    where nct_id in (select distinct nct_id from http.download_status ds)
    ), cte2 as (
    select nct_id,count(*) as snapshot_count from formatted_data_with_planned_enrollment fdwpe
    group by nct_id
    )
    select a.nct_id, a.overall_status, a.duration,b.snapshot_count
    from cte1 as a
        join cte2 as b
            on a.nct_id=b.nct_id
    ;"
    )
 #df5 <- fetch(query5, n = -1)
 ggplot(data=df5, aes(x=duration,y=snapshot_count,color=overall_status)) +
    geom_point() +
    ggtitle("duration, status, and snapshot_count") +
    xlab("duration") +
    ylab("snapshot count") 
 dbDisconnect(con)
 ```
 ```{r}
 ################################# FIT #########################################
 #setup data (named list)
 trials_data <- list(
    D = ncol(x),#
    N = nrow(x),
    L = n_categories$count,
    y = as.vector(y),
    ll = as.vector(categories$category_id),
    x = as.matrix(x)
 )
 #fit <- stan(file='Logistic.stan', data = trials_data) #attempt at simple model
 fit <- stan(
    file='Hierarchal_Logistic.stan', 
    data = trials_data,
    chains = 6,
    iter = 5000,
    seed = 11021585
    )
 ```
 ```{r}
 ################################# ANALYZE #####################################
 print(fit)
 ```
 ```{r}
 color_scheme_set("darkgray")
 #trace plots
 plot(fit, pars=c("mu"), plotfun="trace")
 color_scheme_set("viridis")
 mcmc_rank_overlay(
    fit, 
    pars=c(
        "mu[1]",
        "mu[2]",
        "mu[3]",
        "mu[4]",
        "mu[5]",
        "mu[6]",
        "mu[7]"
        ), 
    n_bins=100
    )+  legend_move("top")
 ```
 ```{r}
 color_scheme_set("viridis")
 plot(fit, pars=c("sigma"), plotfun="trace")
 color_scheme_set("viridis")
 mcmc_rank_overlay(
    fit, 
    pars=c(
        "sigma[1]",
        "sigma[2]",
        "sigma[3]",
        "sigma[4]",
        "sigma[5]",
        "sigma[6]",
        "sigma[7]"
        ), 
    n_bins=100
    )+  legend_move("top")
 ```
 ```{r}
 #other diagnostics
 logpost <- log_posterior(fit)
 nuts_prmts <- nuts_params(fit)
 posterior <- as.array(fit)
 ```
 ```{r}
 color_scheme_set("darkgray")
 mcmc_parcoord(posterior,pars=c(
    "mu[1]",
    "mu[2]",
    "mu[3]",
    "mu[4]",
    "mu[5]",
    "mu[6]",
    "mu[7]"
    ),
    np=nuts_prmts
 )
 ```
 ```{r}
 mcmc_pairs(
    posterior,
    np = nuts_prmts,
    pars=c(
        "mu[1]",
        "mu[2]",
        "mu[3]",
        "mu[4]",
        "mu[5]",
        "mu[6]",
        "mu[7]"
    ),
    off_diag_args = list(size = 0.75)
 )
 ```
 ```{r}
 mcmc_parcoord(posterior,pars=c(
    "sigma[1]",
    "sigma[2]",
    "sigma[3]",
    "sigma[4]",
    "sigma[5]",
    "sigma[6]",
    "sigma[7]"
 ),
 np=nuts_prmts
 )
 ```
 ```{r}
 mcmc_pairs(
    posterior,
    np = nuts_prmts,
    pars=c(
        "sigma[1]",
        "sigma[2]",
        "sigma[3]",
        "sigma[4]",
        "sigma[5]",
        "sigma[6]",
        "sigma[7]"
    ),
    off_diag_args = list(size = 0.75)
 )
 ```
 ```{r}
 # check model estimates
 #mu
 plot(fit, pars=c("mu"), show_density =TRUE, ci_level=0.8)
 plot(fit, pars=c("mu"), plotfun = "hist", bins=50)
 ```
 ```{r}
 #sigma
 plot(fit, pars=c("sigma"), show_density =TRUE, ci_level=0.8)
 plot(fit, pars=c("sigma"), plotfun = "hist",bins=50)
 ```
--- a/r-analysis/Hierarchal_Logistic.rds
+++ b/r-analysis/Hierarchal_Logistic.rds
--- a/r-analysis/Hierarchal_Logistic.stan
+++ b/r-analysis/Hierarchal_Logistic.stan
@ -0,0 +1,40 @@
 //
 // This Stan program defines a simple model, with a
 // vector of values 'y' modeled as normally distributed
 // with mean 'mu' and standard deviation 'sigma'.
 //
 // Learn more about model development with Stan at:
 //
 //    http://mc-stan.org/users/interfaces/rstan.html
 //    https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
 //
 // The input data is a vector 'y' of length 'N'.
 data {
  int<lower=1> D;
  int<lower=1> N;
  int<lower=1> L;
  int<lower=0, upper=1> y[N];
  int<lower=1, upper=L> ll[N];
  row_vector[D] x[N];
 }
 parameters {
  real mu[D];
  real<lower=0> sigma[D];
  vector[D] beta[L];
 }
 model {
  sigma ~ gamma(2,1); //shape and inverse scale
  mu ~ normal(0, 1); //convert to mvnormal
  for (l in 1:L) {
    beta[l] ~ normal(mu, sigma);
  }
  {
    vector[N] x_beta_ll;
    for (n in 1:N) {
      x_beta_ll[n] = x[n] * beta[ll[n]];
    }
    y ~ bernoulli_logit(x_beta_ll);
  }
 }
--- a/r-analysis/Logistic.stan
+++ b/r-analysis/Logistic.stan
@ -0,0 +1,30 @@
 //
 // This Stan program defines a simple model, with a
 // vector of values 'y' modeled as normally distributed
 // with mean 'mu' and standard deviation 'sigma'.
 //
 // Learn more about model development with Stan at:
 //
 //    http://mc-stan.org/users/interfaces/rstan.html
 //    https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
 //
 // The input data is a vector 'y' of length 'N'.
 data {
  int<lower=1> D;
  int<lower=1> N;
  int<lower=0, upper=1> y[N];
  row_vector[D] x[N];
 }
 parameters {
  real mu[D];
  real<lower=0> sigma[D];
 }
 model {
  sigma ~ gamma(2,0.1);
  mu ~ normal(0, sigma); //convert to mvnormal
  for (n in 1:N) {
    y[n] ~ bernoulli_logit(x[n] * mu);
  }
 }
--- a/r-analysis/basic_logit.stan
+++ b/r-analysis/basic_logit.stan
@ -0,0 +1,32 @@
 //
 // This Stan program defines a simple model, with a
 // vector of values 'y' modeled as normally distributed
 // with mean 'mu' and standard deviation 'sigma'.
 //
 // Learn more about model development with Stan at:
 //
 //    http://mc-stan.org/users/interfaces/rstan.html
 //    https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started
 //
 // The input data is a vector 'y' of length 'N'.
 data {
  int<lower=0> N;
  int<lower=0> k;
  matrix[N,k] X;
  vector[N] int<lower=0, upper=1> y;
 }
 // The parameters accepted by the model. Our model
 // accepts two parameters 'mu' and 'sigma'.
 parameters {
  vector[k] beta;
 }
 // The model to be estimated. We model the output
 // 'y' to be normally distributed with mean 'mu'
 // and standard deviation 'sigma'.
 model {
  y ~ bernoulli_logit( X * beta);
 }
--- a/r-analysis/r-analysis.Rproj
+++ b/r-analysis/r-analysis.Rproj
@ -0,0 +1,17 @@
 Version: 1.0
 RestoreWorkspace: Yes
 SaveWorkspace: Ask
 AlwaysSaveHistory: Yes
 EnableCodeIndexing: Yes
 UseSpacesForTab: Yes
 NumSpacesForTab: 4
 Encoding: UTF-8
 RnwWeave: Sweave
 LaTeX: pdfLaTeX
 AutoAppendNewline: Yes
 StripTrailingWhitespace: Yes
 LineEndingConversion: Native