---
title: "The Effects of market conditions on recruitment and completion of clinical trials"
author: "Will King"
format: html
editor: source
---

```{r}
library(bayesplot)
available_mcmc(pattern = "_nuts_")
library(ggplot2)

library(rstan)
#Resources: https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started

#save unchanged models instead of recompiling
rstan_options(auto_write = TRUE)
#allow for multithreaded sampling
options(mc.cores = parallel::detectCores())

#test installation, shouldn't get any errors
#example(stan_model, package = "rstan", run.dontrun = TRUE)
```

```{r}
################ Pull data from database ######################
library(RPostgreSQL)

driver <- dbDriver("PostgreSQL")

get_data <- function(driver) {

con <- dbConnect(
    driver,
    user='root',
    password='root',
    dbname='aact_db',
    host='will-office'
    )
on.exit(dbDisconnect(con))

query <- dbSendQuery(
    con,
    "select * from formatted_data_with_planned_enrollment;"
    )
df <- fetch(query, n = -1)
df <- na.omit(df)

query2 <-dbSendQuery(con,"select count(*) from \"DiseaseBurden\".icd10_categories ic where \"level\"=1;")
n_categories <- fetch(query2, n = -1)

return(list(data=df,ncat=n_categories))
}

d <- get_data(driver)
df <- d$data
n_categories <- d$ncat


################ Format Data ###########################


categories <- df["category_id"]

x <- df["elapsed_duration"]
x["log_n_brands"] <- asinh(df$n_brands) #todo - convert to arcsinh() transform
x["h_sdi_val"] <- asinh(df$h_sdi_val)
x["hm_sdi_val"] <- asinh(df$hm_sdi_val)
x["m_sdi_val"] <- asinh(df$m_sdi_val)
x["lm_sdi_val"] <- asinh(df$lm_sdi_val)
x["l_sdi_val"] <- asinh(df$l_sdi_val)
#x["enrollment"] <- df["current_enrollment"] / df["planned_enrollment"]
#square terms
#x["enrollment^2"] <- x["enrollment"]^2
#x["elapsed_duration^2"] <- x["elapsed_duration"]^2
#x["n_brands^2"] <- x["n_brands"]^2
#break out
#x["status_NYR"] <- ifelse(df["current_status"]=="Not yet recruiting",1,0)
#x["status_Rec"] <- ifelse(df["current_status"]=="Recruiting",1,0)
#x["status_ANR"] <- ifelse(df["current_status"]=="Active, not recruiting",1,0)
#x["status_EBI"] <- ifelse(df["current_status"]=="Enrolling by invitation",1,0)

y <- ifelse(df["final_status"]=="Terminated",1,0)
```

```{r}
################################# DATA EXPLORATION ############################
driver <- dbDriver("PostgreSQL")

con <- dbConnect(
    driver,
    user='root',
    password='root',
    dbname='aact_db',
    host='will-office'
    )
#Plot histogram of count of snapshots
df3 <- dbGetQuery(
    con,
    "select nct_id,final_status,count(*) from formatted_data_with_planned_enrollment fdwpe 
    group by nct_id,final_status ;"
    )
#df3 <- fetch(query3, n = -1)

ggplot(data=df3, aes(x=count, fill=final_status)) + 
    geom_histogram(binwidth=1) +
    ggtitle("Histogram of snapshots per trial (matched trials)") +
    xlab("Snapshots per trial")

#Plot duration for terminated vs completed
df4 <- dbGetQuery(
    con,
    "
    select 
        nct_id, 
        start_date , 
        primary_completion_date, 
        overall_status ,
        primary_completion_date - start_date as duration
    from ctgov.studies s 
    where nct_id in (select distinct nct_id from http.download_status ds)
    ;"
    )
#df4 <- fetch(query4, n = -1)

ggplot(data=df4, aes(x=duration,fill=overall_status)) +
    geom_histogram()+
    ggtitle("Histogram of trial durations") +
    xlab("duration")+
    facet_wrap(~overall_status)

df5 <- dbGetQuery(
    con,
    "
    with cte1 as (
    select 
        nct_id, 
        start_date , 
        primary_completion_date, 
        overall_status ,
        primary_completion_date - start_date as duration
    from ctgov.studies s 
    where nct_id in (select distinct nct_id from http.download_status ds)
    ), cte2 as (
    select nct_id,count(*) as snapshot_count from formatted_data_with_planned_enrollment fdwpe
    group by nct_id
    )
    select a.nct_id, a.overall_status, a.duration,b.snapshot_count
    from cte1 as a
        join cte2 as b
            on a.nct_id=b.nct_id
    ;"
    )
#df5 <- fetch(query5, n = -1)

ggplot(data=df5, aes(x=duration,y=snapshot_count,color=overall_status)) +
    geom_point() +
    ggtitle("duration, status, and snapshot_count") +
    xlab("duration") +
    ylab("snapshot count") 


dbDisconnect(con)
```

```{r}
################################# FIT #########################################

#setup data (named list)
trials_data <- list(
    D = ncol(x),#
    N = nrow(x),
    L = n_categories$count,
    y = as.vector(y),
    ll = as.vector(categories$category_id),
    x = as.matrix(x)
)

#fit <- stan(file='Logistic.stan', data = trials_data) #attempt at simple model
fit <- stan(
    file='Hierarchal_Logistic.stan', 
    data = trials_data,
    chains = 6,
    iter = 5000,
    seed = 11021585
    )
```


```{r}
################################# ANALYZE #####################################
print(fit)
```

```{r}
color_scheme_set("darkgray")
#trace plots
plot(fit, pars=c("mu"), plotfun="trace")
color_scheme_set("viridis")
mcmc_rank_overlay(
    fit, 
    pars=c(
        "mu[1]",
        "mu[2]",
        "mu[3]",
        "mu[4]",
        "mu[5]",
        "mu[6]",
        "mu[7]"
        ), 
    n_bins=100
    )+  legend_move("top")
```

```{r}
color_scheme_set("viridis")
plot(fit, pars=c("sigma"), plotfun="trace")
color_scheme_set("viridis")
mcmc_rank_overlay(
    fit, 
    pars=c(
        "sigma[1]",
        "sigma[2]",
        "sigma[3]",
        "sigma[4]",
        "sigma[5]",
        "sigma[6]",
        "sigma[7]"
        ), 
    n_bins=100
    )+  legend_move("top")
```

```{r}
#other diagnostics
logpost <- log_posterior(fit)
nuts_prmts <- nuts_params(fit)
posterior <- as.array(fit)

```

```{r}
color_scheme_set("darkgray")
mcmc_parcoord(posterior,pars=c(
    "mu[1]",
    "mu[2]",
    "mu[3]",
    "mu[4]",
    "mu[5]",
    "mu[6]",
    "mu[7]"
    ),
    np=nuts_prmts
)
```

```{r}
mcmc_pairs(
    posterior,
    np = nuts_prmts,
    pars=c(
        "mu[1]",
        "mu[2]",
        "mu[3]",
        "mu[4]",
        "mu[5]",
        "mu[6]",
        "mu[7]"
    ),
    off_diag_args = list(size = 0.75)
)


```

```{r}
mcmc_parcoord(posterior,pars=c(
    "sigma[1]",
    "sigma[2]",
    "sigma[3]",
    "sigma[4]",
    "sigma[5]",
    "sigma[6]",
    "sigma[7]"
),
np=nuts_prmts
)
```

```{r}
mcmc_pairs(
    posterior,
    np = nuts_prmts,
    pars=c(
        "sigma[1]",
        "sigma[2]",
        "sigma[3]",
        "sigma[4]",
        "sigma[5]",
        "sigma[6]",
        "sigma[7]"
    ),
    off_diag_args = list(size = 0.75)
)


```

```{r}
# check model estimates
#mu
plot(fit, pars=c("mu"), show_density =TRUE, ci_level=0.8)
plot(fit, pars=c("mu"), plotfun = "hist", bins=50)
```

```{r}
#sigma
plot(fit, pars=c("sigma"), show_density =TRUE, ci_level=0.8)
plot(fit, pars=c("sigma"), plotfun = "hist",bins=50)

```