Squashed commit of the following:

commit 963293fc2b Author: Will King <will.king.git@youainti.com> Date: Mon Jan 13 21:15:44 2025 -0800 Added diagnostics appendix, notes in results commit d6d2360206 Author: Will King <will.king.git@youainti.com> Date: Mon Jan 13 20:53:18 2025 -0800 Finally got all the needed images correct. Adjusted directory to make it easier to find images. commit 37d35377b3 Author: Will King <will.king.git@youainti.com> Date: Mon Jan 13 16:29:00 2025 -0800 Added more images to assets & included in results. commit becefe15e0 Author: Will King <will.king.git@youainti.com> Date: Mon Jan 13 12:56:15 2025 -0800 Updated images commit 86f9b8dfc9 Author: will king <youainti@protonmail.com> Date: Mon Jan 13 09:24:20 2025 -0800 finished drafting results commit 64f3d14f7b Author: will king <youainti@protonmail.com> Date: Mon Jan 6 12:48:48 2025 -0800 Midday updates from writing commit 1630af2928 Author: will king <youainti@protonmail.com> Date: Tue Dec 3 17:08:43 2024 -0800 more updates commit 5d9640ab8d Author: will king <youainti@protonmail.com> Date: Thu Nov 28 23:39:04 2024 -0800 saving work commit 3e6a8f10d4 Author: will king <youainti@protonmail.com> Date: Tue Nov 26 17:18:24 2024 -0800 tweaked econometrics presentation, added todos commit 7d51cb10b3 Author: will king <youainti@protonmail.com> Date: Tue Nov 26 15:57:50 2024 -0800 updated layout, added gitignore
2 years ago · 907214e359
parent 9eab4b48e2
commit 907214e359
174 changed files with 412 additions and 196 deletions
--- a/Paper/.gitignore
+++ b/Paper/.gitignore
@ -0,0 +1,5 @@
+*.pdf
+*.aux
+*.lof
+*.lot
+*.idx
--- a/Paper/Main.tex
+++ b/Paper/Main.tex
@ -68,10 +68,10 @@ Section \ref{SEC:Results} discusses the results of the analysis.
 \subfile{sections/10_CausalStory}
 \subfile{sections/02_data}

-%---------------------------------------------------------------
-\section{Causal Identification}\label{SEC:CausalIdentification}
-%---------------------------------------------------------------
-\subfile{sections/03_CausalIdentification}
+% %---------------------------------------------------------------
+% \section{Causal Identification}\label{SEC:CausalIdentification}
+% %---------------------------------------------------------------
+% \subfile{sections/03_CausalIdentification}

 %---------------------------------------------------------------
 \section{Econometric Model}\label{SEC:EconometricModel}
@ -103,6 +103,7 @@ Section \ref{SEC:Results} discusses the results of the analysis.
 %---------------------------------------------------------------
 \section{Appendicies}
 %---------------------------------------------------------------
+\subfile{sections/21_appendix_diagnostics}

 \newpage
 \tableofcontents
--- a/Paper/jmp_layout.kdl
+++ b/Paper/jmp_layout.kdl
@ -1,5 +1,5 @@
 layout {
-    tab name="Main and Compile" cwd="~/research/PhD_Deliverables/jmp/Latex/Paper/" hide_floating_panes=true focus=true {
+    tab name="Main and Compile" cwd="./" hide_floating_panes=true focus=true {
    // This tab is where I manage main from. 
    // it opens up Main.txt for my JMP, opens the pdf in okular (in a floating tab), and then get's ready to build the pdf.
        pane size=1 borderless=true {
@ -33,7 +33,7 @@ layout {
        }
    }

-    tab name="sections" cwd="~/research/PhD_Deliverables/jmp/Latex/Paper/sections/" {
+    tab name="sections" cwd="sections/" {
        pane size=1 borderless=true {
            plugin location="tab-bar"
        }
@ -56,7 +56,7 @@ layout {
        }
    }

-    tab name="git" cwd="~/research/PhD_Deliverables/jmp/Latex/Paper/" {
+    tab name="git" cwd="./" {
        pane size=1 borderless=true {
            plugin location="tab-bar"
        }
@ -73,7 +73,7 @@ layout {
            }

            pane command="git" {
-                args "log" "-n 10" "--all" "--oneline" "--graph" "--stat" "--decorate"
+                args "log" "--all" "--oneline" "--graph" "--stat" "--decorate"
            }
        }

--- a/Paper/sections/04_EconometricModel.tex
+++ b/Paper/sections/04_EconometricModel.tex
@ -3,66 +3,133 @@

 \begin{document}
 %% Describe goal
-%   Estimate probability distribution of normalized durations and conclusion statuses.
-%   Explain why this answers questions well.
-%   How do I propose estimating that?

+The model I use is a 
+hierarchical logistic regression model where the 
+hierarchies are based on disease categories.
 %%NOTATION
+% change notation
+% i indexes trials for y and d 
+% n indexes snapshots within the trial

 First, some notation:
 \begin{itemize}
+    \item $i$: indexes trials
    \item $n$: indexes trial snapshots.
-    \item $y_n$: whether each trial terminated (true) or completed (false).
-    \item $d$: indexes ICD-10 disease categories.
-    \item $d_n$: represents the disease category of the trial associated with the snapshot $n$.
-    \item $x_n$: represents the other dependent variables associated to the snapshot.
-        This includes\footnote{No trials in the current dataset are ever suspended.}:
-        \begin{enumerate} 
-            \item Elapsed duration
-            \item arcsinh of the number of brands
-            \item arcsinh of the DALYs from high SDI countries
-            \item arcsinh of the DALYs from high-medium SDI countries
-            \item Enrollment (no distinction between anticipated or actual) 
-            \item Dummy Status: Not yet recruiting
-            \item Dummy Status: Recruiting
-            \item Dummy Status: Active, not recruiting
-            \item Dummy Status: Enrolling by invitation 
-        \end{enumerate} 
+    \item $y_i$: whether each trial 
+        terminated (true, 1) or completed (false, 0).
+    \item $d_i$: indexes the ICD-10 disease category of the trial.
+    \item $x_{i,n}$: represents the independent 
+        variables associated with the snapshot.
 \end{itemize} 
-The arcsinh transform is used because it is similar to a log transform but
-maps $\text{arcsinh}(0)=0$.
-

-The bayesian model to measure the direct effects of enrollment and the number 
-of other brands is easily specified as a hierarchal logistic regression.
+The goal is to take each snapshot and predict whether the trial terminates.
+The actual specification of the model to measure 
+the direct effect of enrollment is:
 \begin{align}
-    y_n \sim \text{Bernoulli}(p_n) \\
-	p_n = \text{logit}(x_n \vec \beta(d_n))
+    y_i \sim \text{Bernoulli}(p_{i,n}) \\
+        p_{i,n} = \text{logit}^{-1}(x_{i,n} \vec \beta(d_i))
 \end{align}
-Where beta is indexed by $k$ for each parameter in $x$, and by
-$d \in \{1,2,\dots,21,22\}$ for each general ICD-10 category.
+Where beta is indexed by 
+$d \in \{1,2,\dots,21,22\}$ 
+for each general ICD-10 category.
 The betas are distributed
 \begin{align}
-    \beta_k(d) \sim \text{Normal}(\mu_k,\sigma_k)
+    \beta(d_i) \sim \text{Normal}(\mu_i,\sigma_i I)
 \end{align}
-With hyperparameters
+With hyperpriors
+%Checked on 2024-11-27. Is corrrect. \todo{Double check that these are the priors I used.}
 \begin{align}
    \mu_k \sim \text{Normal}(0,0.05) \\
    \sigma_k \sim \text{Gamma}(4,20)
 \end{align}
+\todo{Double check actual spec}


-Other variables are implicitly conditioned on as they were used 
-to select trials of interest.
+The independent variables include: 
+\todo{Make sure data is described before this point.}
+\begin{subequations}
+\begin{align}
+    x_{i,n}\beta(d_i) 
+        = & \bx{1}{\text{Elapsed Duration}} \\
+        &+ \bx{2}{\arcsinh \left(\text{\# Generic compounds}\right)} \\
+        &+ \bx{3}{\arcsinh \left(\text{\# Branded compounds}\right)} \\ 
+        &+ \bx{4}{\text{\# DALYs in High SDI Countries}} \\
+        &+ \bx{5}{\text{\# DALYs in High-Medium SDI Countries}} \\
+        &+ \bx{6}{\text{\# DALYs in Medium SDI Countries}} \\
+        &+ \bx{7}{\text{\# DALYs in Low-Medium SDI Countries}} \\
+        &+ \bx{8}{\text{\# DALYs in Low SDI Countries}} \\
+        &+ \bxi{9}{\text{Not yet Recruiting}}{\text{Trial Status}}\\
+        &+ \bxi{10}{\text{Recruiting}}{\text{Trial Status}}\\
+        &+ \bxi{11}{\text{Enrolling by Invitation Only}}{\text{Trial Status}}\\
+        &+ \bxi{12}{\text{Active, not recruiting}}{\text{Trial Status}}
+\end{align}
+\end{subequations}
+The arcsinh transform is used because it is similar to a log transform but
+differentiably handles counts of zero since 
+$\text{arcsinh}(0) = \ln (0 + \sqrt{0^2 + 1}) =0$.
+Note that because this is a hierarchical model, each ICD-10 disease category 
+gets its own set of parameters, and that is why the $\beta$s are parameterized
+by $d_i$.
+%%%% Not sure if space should go here. I think these work well together.
+Other variables are implicitly controlled for as they are used 
+to select the trials of interest.
 These include:
+        \todo{double check these in the code.}
 \begin{itemize}
-    \item Is the trial Phase 3?\footnote{
-       Conditioning on phase 3 is equivalent to asserting that previous trials 
-       occured and had acceptable safety and efficacy results.
+    \item The trial is Phase 3.
+    \item The trial has a Data Monitoring Committee.
+    \item The compounds are FDA-regulated drugs.
+    \item The trial was never suspended\footnote{
+        This was because I wasn't sure how to handle it in the model
+        when I started scraping the data. 
+        Later the website changed.
+        This is technically post selection. 
+        \todo{double check where this happened in the code. 
+        I may have only done it in the CBO analysis.}
    }
-    \item Does the trial have a Data Monitoring Committee?
-    \item Are the compounds an FDA regulated drug?
 \end{itemize}
-%TODO: double check the sql used to select trials of interest.
+
+\subsection{Interpretation}
+% Explain 
+% - What do we care about? Changes in the probability of 
+% - distribution of differences -> relate to E(\delta Y)
+% - How do we obtain this distribution of differences?
+%   - from the model, we pay attention to P under treatment and control
+%   - We obtain this by fitting the model, then simulating under treatment and control, and taking the difference in the probability.
+%   - 
+
+The specific measure of interest is how much a delay in 
+closing enrollment changes the probability of terminating a trial
+$p_{i,n}$ in the model.
+
+In the standard reduced form causal inference, the treatment effect
+of interest for outcome $Z$ is measured as 
+\begin{align}
+    E(Z(\text{Treatment}) - Z(\text{Control})) 
+    = E(Z(\text{Treatment})) - E(Z(\text{Control}))
+\end{align}
+Because $Z(\text{Treatment})$ and $Z(\text{Control})$ are random variables,
+$Z(\text{Treatment}) - Z(\text{Control}) = \delta_Z$ is also a random variable. 
+In the Bayesian framework, this parameter has a distribution, and so 
+we can calculate the distribution of differences in 
+the probability of termination due to a given delay in 
+closing recruitment,
+$p_{i,n}(T) - p_{i,n}(C) = \delta_{p_{i,n}}$.
+
+I calculate the posterior distribution of $\delta_{p_{i,n}}$ by estimating the 
+posterior distributions of the $\beta$s and then simulating $\delta_{p_{i,n}}$.
+This involves taking a draw from the $\beta$s distribution, calculating
+$p_{i,n}(C)$ 
+for the underlying trials at the snapshot when they close enrollment
+and then calculating 
+$p_{i,n}(T)$ 
+under the counterfactual where enrollment had not yet closed.
+The difference 
+$\delta_{p_{i,n}}$ 
+is then calculated for each trial, and saved. 
+After repeating this for all the posterior samples, we have an estimate 
+for the posterior distribution of differences between treatment and control.
+

 \end{document}
--- a/Paper/sections/06_Results.tex
+++ b/Paper/sections/06_Results.tex
@ -1,119 +1,206 @@
 \documentclass[../Main.tex]{subfiles}

 \begin{document}
-%\subsection{Data Exploration} %TODO: fill this out later.
-%look at trial 
-\subsection{Model Fitting}
-In this section we examine the results from fitting the econometric model using
-mc-stan (\cite{mc-stan}) through the rstan (\cite{rstan}) interface.

-%describe 
-The model was based on the hierarchal logistic regression model 
-presented in the Stan Users Guide (\cite{mc-stan}), 
-and was run with 2,500 warmup iterations and
-2,500 sampling iterations in six chains.
-There were various issues, including 160 divergent transitions and the R-hat 
-measure was 1.49. 
-Overall these suggest that the econometric model is incorrect as 
-written or requires reparameterization.
-%TODO: and info about how I learned about these diagnostics
-
-
-% \subsubsection{Diagnostics}
-% %Examine trank plots
-% To identify which parameters were problematic, I first looked at trace rank 
-% histograms.
-% Under idea circumstances, each line (representing a chain) should exchange 
-% places with the other lines frequently.
-% In both \cref{fig:mu_trank} and \cref{fig:sigma_trank}, most parameters seem
-% to mix well but there are a couple of exceptions.
-% This warrants further investigation.
-%
-% \begin{figure}[H]
-%     \includegraphics[width=\textwidth]{../assets/img/mu_trank.png}
-%     \caption{Trace Rank Histogram: Mu values}
-% 	\label{fig:mu_trank}
-% \end{figure}
-%
-% \begin{figure}[H]
-%     \includegraphics[width=\textwidth]{../assets/img/sigma_trank.png}
-%     \caption{Trace Rank Histogram: Sigma values}
-% 	\label{fig:sigma_trank}
-% \end{figure}
-%
-% %Take a look at batman and points for mu
-% In the case of the Mu values, a parallel coordinates plot 
-% doesn't seem to indicate any parameters as likely candidates
-% for causing the issues with divergent transitions.
-% \begin{figure}[H]
-%     \includegraphics[width=\textwidth]{../assets/img/mu_batman.png}
-%     \caption{Parallel Coordinate Plot: Mu values}
-% 	\label{fig:mu_batman}
-% \end{figure}
-% Note that at each parameter, there is some level of dispersion between 
-% values that diverged.
-%
-% On the other hand, in the parallel coordinates plot for sigma values,
-% it appears that most divergent transitions occur with values of 
-% sigma[1], sigma[3], sigma[6], and sigma[7] close to zero.
-% \begin{figure}[H]
-%     \includegraphics[width=\textwidth]{../assets/img/sigma_batman.png}
-%     \caption{Parallel Coordinate Plot: Sigma values}
-% 	\label{fig:sigma_batman}
-% \end{figure}
-% Overall this suggests that there is an issue with the specification
-% of the covariance structures of the hyperparameters.
-%
-% Additional evidence that the covariance structure is incorrect comes from 
-% plotting pairs of parameter values and examining the chains with divergent
-% transitions.
-%
-% \begin{figure}[H]
-%     \includegraphics[width=\textwidth]{../assets/img/sigma_pairs_5-9.png}
-% 	\caption{Parameter Pairs plots: Sigma[5] through Sigma[9]}
-% 	\label{fig:sigma_pairs_5-9.png}
-% \end{figure}
-% From this we can see that divergent pairs are highly correlated with the cases
-% where sigma[6] or sigma[7] are equal to zero.
-% This has an impact on the shape of both of those estimated parameters, causing
-% both to be bimodal.
-
-
-\subsection{Interpretation}
-
-The key results so far are related to the distribution of differences in $p$.
-
-In figure \ref{fig:pred_dist_dif_delay} we see that there while most trials do not see any increased risk 
-from a delay in closing enrollment, there is a small group that does experience this.
+In this section 
+I describe the model fitting, the posteriors of the parameters of interest,
+and interpret the results.
+
+
+\subsection{Data Summaries and Estimation Procedure}
+
+% Data Summaries
+Overall, I successfully processed 162 trials, with 1,347 snapshots between them.
+Figure \ref{fig:snapshot_counts} shows the histogram of snapshots per trial.
+Most trials lasted less than 1,500 days, as can be seen in 
+Figure~\ref{fig:trial_durations}. 
+Although there are a large number of snapshots that will be used to fit the 
+model, the number of trials -- the unit of observation -- is quite low. 
+Add to this the fact that these are spread over multiple ICD-10 categories,
+and the overall quantity of trials is quite low. 
+
+To continue, we can use a scatterplot to get a rough idea of the observed
+relationship between the number of snapshots and the duration of trials. 
+We can see this in Figure \ref{fig:snapshot_duration_scatter}, where
+the correlation (measured at $0.34$) is apparent.
+

 \begin{figure}[H]
-    \includegraphics[width=\textwidth]{../assets/img/current/pred_dist_diff-delay}
-	\caption{}
-	\label{fig:pred_dist_diff_delay}
+    \includegraphics[width=\textwidth]{../assets/img/trials_details/HistTrialDurations_Faceted}
+    \todo{Replace this graphic with the histogram of trial durations}
+    \caption{Histograms of Trial Durations}
+    \label{fig:trial_durations}
 \end{figure}

-Figure \ref{fig:pred_dist_dif_delay2} shows how this varies across disease categories
 \begin{figure}[H]
-    \includegraphics[width=\textwidth]{../assets/img/current/pred_dist_diff-delay-group}
-	\caption{}
-	\label{fig:pred_dist_dif_delay2}
+    \includegraphics[width=\textwidth]{../assets/img/trials_details/HistSnapshots}
+    \todo{Replace this graphic with the histogram of snapshots}
+    \caption{Histogram of the count of Snapshots}
+    \label{fig:snapshot_counts}
+\end{figure}
+
+\begin{figure}[H]
+    \includegraphics[width=\textwidth]{../assets/img/trials_details/SnapshotsVsDurationVsTermination}
+    \todo{Replace this graphic with the scatterplot comparing durations and snapshots}
+    \caption{Scatterplot comparing the Count of Snapshots and Trial Duration}
+    \label{fig:snapshot_duration_scatter}
+\end{figure}
+
+% Estimation Procedure
+I fit the econometric model using mc-stan 
+\cite{standevelopmentteam_StanModelling_2022}
+through the rstan 
+\cite{standevelopmentteam_RStanInterface_2023}
+interface using 4 chains with 
+%describe  
+2,500
+warmup iterations and
+2,500
+sampling iterations each.
+
+Two of the chains experienced a low 
+Estimated Bayesian Fraction of Missing Information (E-BFMI),
+suggesting that there are some parts of the posterior distribution
+that were not explored well during the model fitting. 
+I presume this is due to the low number of trials in some of the 
+ICD-10 categories.
+We can see in Figure \ref{fig:barchart_idc_categories} that some of these 
+disease categories had a single trial represented while others were 
+not represented at all.
+
+\begin{figure}[H]
+    \includegraphics[width=\textwidth]{../assets/img/trials_details/CategoryCounts}
+    \caption{Bar chart of trials by ICD-10 categories}
+    \label{fig:barchart_idc_categories}
 \end{figure}

-We can also examine the direct effect from adding a single generic competitior drug.
+
+\subsection{Primary Results}
+
+The primary, causally-identified value we can estimate is the change in 
+the probability of termination caused by (counterfactually) keeping enrollment
+open instead of closing enrollment when observed. 
+In Figure~\ref{fig:pred_dist_diff_delay} below, we see this impact of 
+keeping enrollment open.
+

 \begin{figure}[H]
-    \includegraphics[width=\textwidth]{../assets/img/current/pred_dist_diff-generic}
-	\caption{}
-	\label{fig:pred_dist_diff_generic}
+    \includegraphics[width=\textwidth]{../assets/img/dist_diff_analysis/p_delay_intervention_distdiff_boxplot}
+    \todo{Replace this graphic with the histdiff with boxplot}
+    {\small
+        Values near 1 indicate a near perfect increase in the probability 
+        of termination. 
+        Values near 0 indicate little change in probability,
+        while values near -1, represent a decrease in the probability
+        of termination. 
+        The scale is in probability points, thus a value near 1 is a change 
+        from unlikely to terminate under control, to highly likely to 
+        terminate.
+    \par}
+    \caption{Histogram of the Distribution of Predicted Differences}
+    \label{fig:pred_dist_diff_delay}
 \end{figure}

-Figure \ref{fig:pred_dist_dif_generic2} shows how this varies across disease categories
+There are a few interesting things to point out here. 
+Let's start by getting acquainted with the details of the distribution above.
+% - spike at 0
+% - the boxplot
+% - 63% of mass below 0 : find better way to say that
+%   - For a random trial, there is a 63% chance that the impact is to reduce the probability of a termination.
+% - 2 pctg-point wide band centered on 0 has ~13% of the masss
+% - mean represents 9.x% increase in probability of termination. A quick simulation gives about the same pctg-point increase in terminated trials.
+
+A few interesting interpretation bits come out of this.
+% - there are 3 regimes: low impact (near zero), medium impact (concentrated in decreased probability of termination), and high impact (concentrated in increased probability of termination). 
+The first is that there appear to be three different regimes. 
+The first regime consists of the low impact results, i.e. those values of $\delta_p$ 
+near zero. 
+About 13\% of trials lie within a single percentage point change of zero, 
+suggesting that there is a reasonable chance that delaying 
+a close of enrollment has no impact. 
+The second regime consists of the moderate impact on clinical trials'
+probabilities of termination, say values in the interval $[-0.5, 0.5]$ 
+on the graph.
+Most of this probability mass represents a decrease in the probability of 
+a termination, some of it rather large.
+Finally, there exists the high impact region, almost exclusively concentrated 
+around increases in the probability of termination at $\delta_p > 0.75$. 
+These represent cases where delaying the close of enrollment changes a trial
+from a case where they were highly likely to complete their primary objectives to 
+a case where they were likely or almost certain to terminate the trial early.
+%   - the high impact regime is strange because it consists of trials that moved from unlikely (<20% chance) of termination to a high chance (>80% chance) of termination. Something like 5% of all trials have a greater than 98 percentage point increase in termination. Not sure what this is doing. 
+
+%   - Potential Explanations for high impact regime:
+How could this intervention have such a wide range in the intensity 
+and direction of impacts?
+A few explanations include that some trials are susceptible or that this is a 
+result of too little data.
+%       - Some trials are highly susceptible. This is the face value effect
+One option is that some categories are more susceptible to 
+issues with participant enrollment. 
+If this is the case, we should be able to isolate categories that contribute
+the most to this effect.
+Another is that this might be a modelling artefact, due to the relatively
+low number of trials in certain ICD-10 categories. 
+In short, there might be high levels of uncertainty in some parameter values,
+which manifest as fat tails in the distributions of the $\beta$ parameters. 
+Because of the logistic format of the model, these fat tails lead to 
+extreme values of $p$, and potentially large changes $\delta_p$. 
+%       - Could be uncertainty. If the model is highly uncertain, e.g. there isn't enough data, we could have a small percentage of large increases. This could be in general or just for a few categories with low amounts of data.
+% - 
+% - 
+
+I believe that this second explanation -- a model artifact due to uncertainty --
+is likely to be the cause. 
+Four points lead me to believe this:
+\begin{itemize}
+    \item The low fractions of E-BFMI suggest that the sampler is struggling 
+        to explore some regions of the posterior. 
+        According to \cite{standevelopmentteam_RuntimeWarnings_2022} this is 
+        often due to thick tails of posterior distributions.
+    \item When we examine the results across different ICD-10 groups 
+        (Figure~\ref{fig:pred_dist_dif_delay2}),
+        \todo{move figure from below}
+        we note this same issue.
+    \item In Figure \ref{fig:betas_delay}, we see that some ICD-10 categories
+        \todo{add figure}
+        have \todo{note fat tails}.
+    \item There are few trials available, particularly among some specific 
+        ICD-10 categories.
+\end{itemize}
+%           - take a look at beta values and then discuss if that lines up with results from dist-diff by group. 
+%       - My initial thought is that there is not enough data/too uncertain. I think this because it happens for most/all of the categories.
+% - 
+% - 
+% - 
+Overall, it is hard to escape the conclusion that more data is needed across
+many -- if not all -- of the disease categories.
+
+
+
+Figure \ref{fig:pred_dist_dif_delay2} shows how this overall
+result comes from different disease categories.
 \begin{figure}[H]
-    \includegraphics[width=\textwidth]{../assets/img/current/pred_dist_diff-generic-group}
-	\caption{}
-	\label{fig:pred_dist_dif_generic2}
+    \includegraphics[width=\textwidth]{../assets/img/dist_diff_analysis/p_delay_intervention_distdiff_by_group}
+    \caption{Distribution of Predicted differences by Disease Group}
+    \label{fig:pred_dist_dif_delay2}
 \end{figure}


+\subsection{Secondary Results}
+
+% Examine beta parameters 
+% - Little movement except where data is strong, general negative movement. Still really wide 
+% - Note how they all learned (partial pooling) reduction in \beta from ANR?
+% - Need to discuss the 5 different states. Can't remember which one is dropped for the life of me. May need to fix parameterization.
+% - 
+
+\begin{figure}[H]
+    \includegraphics[width=\textwidth]{../assets/img/betas/parameter_across_groups/parameters_12_status_ANR}
+    \caption{Distribution of parameters associated with ``Active, not recruiting'' status, by ICD-10 Category}
+    \label{fig:parameters_ANR_by_group}
+\end{figure}
+% - 
+

 \end{document}
--- a/Paper/sections/08_PotentialImprovements.tex
+++ b/Paper/sections/08_PotentialImprovements.tex
@ -12,40 +12,40 @@ The most important step is to increase the number of observations available.
 Currently this requires matching trials to ICD-10 codes by hand, but
 there are certainly some steps that can be taken to improve the speed with which
 this can be done.
-
-\subsection{Covariance Structure}
-
-As noted in the diagnostics section, many of the convergence issues seem
-to occure in the covariance structure. 
-Instead of representing the parameters $\beta$ as independently normal:
-\begin{align}
-    \beta_k(d) \sim \text{Normal}(\mu_k, \sigma_k)
-\end{align}
-I propose using a multivariate normal distribution:
-\begin{align}
-    \beta(d) \sim \text{MvNormal}(\mu, \Sigma)
-\end{align}
-I am not familiar with typical approaches to priors on the covariance matrix,
-so this will require a further literature search as to best practices.
-
-\subsection{Finding Reasonable Priors}
-
-In standard bayesian regression, heavy tailed priors are common. 
-When working with a bayesian bernoulli-logit model, this is not appropriate as 
-heavy tails cause the estimated probabilities $p_n$ to concentrate around the 
-values $0$ and $1$, and away from values such as $\frac{1}{2}$ as discussed in
-\cite{mcelreath_statistical_2020}. %TODO: double check the chapter for this.
-
-I indend to take the general approach recommended in \cite{mcelreath_statistical_2020} of using
-prior predictive checks to evaluate the implications of different priors
-on the distribution on $p_n$.
-This would consist of taking the independent variables and predicting the values
-of $p_n$ based on a proposed set of priors. 
-By plotting these predictions, I can ensure that the specific parameter priors 
-used are consistent with my prior beliefs on how $p_n$ behaves.
-Currently I believe that $p_n$ should be roughly uniform or unimodal, centered 
-around $p_n = \frac{1}{2}$.
-
+%
+% \subsection{Covariance Structure}
+%
+% As noted in the diagnostics section, many of the convergence issues seem
+% to occure in the covariance structure. 
+% Instead of representing the parameters $\beta$ as independently normal:
+% \begin{align}
+%     \beta_k(d) \sim \text{Normal}(\mu_k, \sigma_k)
+% \end{align}
+% I propose using a multivariate normal distribution:
+% \begin{align}
+%     \beta(d) \sim \text{MvNormal}(\mu, \Sigma)
+% \end{align}
+% I am not familiar with typical approaches to priors on the covariance matrix,
+% so this will require a further literature search as to best practices.
+
+% \subsection{Finding Reasonable Priors}
+%
+% In standard bayesian regression, heavy tailed priors are common. 
+% When working with a bayesian bernoulli-logit model, this is not appropriate as 
+% heavy tails cause the estimated probabilities $p_n$ to concentrate around the 
+% values $0$ and $1$, and away from values such as $\frac{1}{2}$ as discussed in
+% \cite{mcelreath_statistical_2020}. %TODO: double check the chapter for this.
+%
+% I indend to take the general approach recommended in \cite{mcelreath_statistical_2020} of using
+% prior predictive checks to evaluate the implications of different priors
+% on the distribution on $p_n$.
+% This would consist of taking the independent variables and predicting the values
+% of $p_n$ based on a proposed set of priors. 
+% By plotting these predictions, I can ensure that the specific parameter priors 
+% used are consistent with my prior beliefs on how $p_n$ behaves.
+% Currently I believe that $p_n$ should be roughly uniform or unimodal, centered 
+% around $p_n = \frac{1}{2}$.
+%

 \subsection{Imputing Enrollment}

@ -81,21 +81,17 @@ found a way to do so.

 \subsection{Improving Measures of Market Conditions}

-Finally, the currently employed measure of market conditions -- the number of 
-brands using the same active ingredients -- is not a very good measure of 
-the options available to potential participants of a clinical trial.
-The ideal measures would capture the alternatives available to treat a given
-disease (drug meeting the given indication) at the time of the trial snapshot, 
-but this data is hard to come by.
 In addition to the fact that many diseases may be treated by non-pharmaceutical 
 means, off-label prescription of pharmaceuticals is legal at the federal level 
 (\cite{commissioner_understanding_2019}).
 These two facts both complicate measuring market conditions.
-
-One dataset that I have only investigated briefly is the \url{DrugCentral.org}
-database which tracks official indications and some off-label indications as 
-well
-(\cite{ursu_drugcentral_2017}).
+One way to address non-pharmaceutical treatments is to concentrate on domains
+that are primarily treated by pharmaceuticals.
+This requires domain knowledge that I don't have.
+% One dataset that I have only investigated briefly is the \url{DrugCentral.org}
+% database which tracks official indications and some off-label indications as 
+% well
+% (\cite{ursu_drugcentral_2017}).


 \end{document}
--- a/Paper/sections/10_CausalStory.tex
+++ b/Paper/sections/10_CausalStory.tex
@ -69,7 +69,7 @@ in the first place while currently observed safety and efficiency results
 help the sponsor judge whether or not to continue the trial.

 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\subsection{Clinical Trials Data Sources}
+\subsection{Data Summary}
 %% Describe data here
 Since Sep 27th, 2007 those who conduct clinical trials of FDA controlled 
 drugs or devices on human subjects must register 
--- a/Paper/sections/12_clinical_trial_background.tex
+++ b/Paper/sections/12_clinical_trial_background.tex
@ -58,10 +58,29 @@ purpose of the clinical trials process.
 On the other hand, when a trial terminates early due to reasons 
 other than safety or efficacy concerns, the trial operator does not learn
 if the drug is effective or safe. 
-This is a true failure in that we did not learn if the drug was effective or not.
-Unfortunately, although termination documentation typically includes a 
-description of a reason for the clinical trial termination, this doesn't necessarily
-list all the reasons contributing to the trial termination and may not exist for a given trial.
+This is a knowledge-gathering failure where the trial operator 
+did not learn if the drug was effective or not.
+I prefer describing a clinical trial as being terminated for 
+\begin{itemize}
+    \item Safety or Efficacy concerns
+    \item Strategic concerns
+    \item Operational concerns.
+\end{itemize}
+
+Unfortunately, it can be difficult to know why a given trial was terminated,
+in spite of the fact that upon termination, trials typically record a 
+description of \textit{a single} reason for the clinical trial termination. 
+This doesn't necessarily list all the reasons contributing to the trial termination and may not exist for a given trial.
+For example, if a Principal Investigator leaves for another institution 
+(terminating the trial), is this decision affected by 
+a safety or efficacy concern, 
+a new competitor on the market, 
+difficulty recruiting participants,
+or a lack of financial support from the study sponsor? 
+Estimating the impact of different problems that trials face from these 
+low-information, post-hoc signals is insufficient.
+For this reason, I use clinical trial progression to estimate effects. 
+\todo{not sure if this is the best place for this.}

 As a trial goes through the different stages of recruitment, the investigators
 update the records on ClinicalTrials.gov. 
--- a/Paper/sections/21_appendix_diagnostics.tex
+++ b/Paper/sections/21_appendix_diagnostics.tex
@ -0,0 +1,38 @@
+\documentclass[../Main.tex]{subfiles}
+\graphicspath{{\subfix{Assets/img/}}}
+
+\begin{document}
+\subsection{Diagnostics}
+Reported low E-BFMI scores (low is considered below $0.2$)
+\todo{Fill these out based on what is in the rendered report.}
+\begin{itemize}
+    \item Chain 1: 0.178
+    \item Chain 2: 0.189
+\end{itemize}
+
+No other reported issues.
+
+\begin{figure}[H]
+    \caption{Diagnostics: Trace Rank Plots - $\mu$}
+    \label{fig:diagnostics_trace_rank_mu}
+    \includegraphics[width=\textwidth]{../assets/img/diagnostics/trace_rank_plot_mu_1-4}
+    \\[\smallskipamount]
+    \includegraphics[width=\textwidth]{../assets/img/diagnostics/trace_rank_plot_mu_5-8}
+    \\[\smallskipamount]
+    \includegraphics[width=\textwidth]{../assets/img/diagnostics/trace_rank_plot_mu_9-12}
+\end{figure}
+Mixing seems to be fine.
+
+\begin{figure}[H]
+    \caption{Diagnostics: Trace Rank Plots - $\sigma$}
+    \label{fig:diagnostics_trace_rank_sigma}
+    \includegraphics[width=\textwidth]{../assets/img/diagnostics/trace_rank_plot_sigma_1-4}
+    \\[\smallskipamount]
+    \includegraphics[width=\textwidth]{../assets/img/diagnostics/trace_rank_plot_sigma_5-8}
+    \\[\smallskipamount]
+    \includegraphics[width=\textwidth]{../assets/img/diagnostics/trace_rank_plot_sigma_9-12}
+\end{figure}
+Mixing is slower than for the $\mu$ values, but doesn't seem too problematic in light of
+other deficiencies such as the low number of observations.
+
+\end{document}
--- a/assets/img/Images/CategoryCounts.png
+++ b/assets/img/Images/CategoryCounts.png
--- a/assets/img/Images/DirectEffects/Parameters/01_elapsed_duration.png
+++ b/assets/img/Images/DirectEffects/Parameters/01_elapsed_duration.png
--- a/assets/img/Images/DirectEffects/Parameters/02_generic.png
+++ b/assets/img/Images/DirectEffects/Parameters/02_generic.png
--- a/assets/img/Images/DirectEffects/Parameters/03_uspdc.png
+++ b/assets/img/Images/DirectEffects/Parameters/03_uspdc.png
--- a/assets/img/Images/DirectEffects/Parameters/09_NYR.png
+++ b/assets/img/Images/DirectEffects/Parameters/09_NYR.png
--- a/assets/img/Images/DirectEffects/Parameters/10_EBI.png
+++ b/assets/img/Images/DirectEffects/Parameters/10_EBI.png
--- a/assets/img/Images/DirectEffects/Parameters/11_Rec.png
+++ b/assets/img/Images/DirectEffects/Parameters/11_Rec.png
--- a/assets/img/Images/DirectEffects/Parameters/12_ANR.png
+++ b/assets/img/Images/DirectEffects/Parameters/12_ANR.png
--- a/assets/img/Images/DirectEffects/Parameters/2+3_generic_and_uspdc.png
+++ b/assets/img/Images/DirectEffects/Parameters/2+3_generic_and_uspdc.png
--- a/assets/img/Images/DirectEffects/default_p_generic_intervention_base.png
+++ b/assets/img/Images/DirectEffects/default_p_generic_intervention_base.png
--- a/assets/img/Images/DirectEffects/default_p_generic_intervention_distdiff.png
+++ b/assets/img/Images/DirectEffects/default_p_generic_intervention_distdiff.png
--- a/assets/img/Images/DirectEffects/default_p_generic_intervention_interv.png
+++ b/assets/img/Images/DirectEffects/default_p_generic_intervention_interv.png
--- a/assets/img/Images/DirectEffects/default_p_uspdc_intervention_base.png
+++ b/assets/img/Images/DirectEffects/default_p_uspdc_intervention_base.png
--- a/assets/img/Images/DirectEffects/default_p_uspdc_intervention_distdiff.png
+++ b/assets/img/Images/DirectEffects/default_p_uspdc_intervention_distdiff.png
--- a/assets/img/Images/DirectEffects/default_p_uspdc_intervention_interv.png
+++ b/assets/img/Images/DirectEffects/default_p_uspdc_intervention_interv.png
--- a/assets/img/Images/DirectEffects/p_generic_intervention_distdiff_by_group.png
+++ b/assets/img/Images/DirectEffects/p_generic_intervention_distdiff_by_group.png
--- a/assets/img/Images/DirectEffects/p_generic_intervention_distdiff_styled.png
+++ b/assets/img/Images/DirectEffects/p_generic_intervention_distdiff_styled.png
--- a/assets/img/Images/DirectEffects/p_generic_intervention_histdiff_by_group.png
+++ b/assets/img/Images/DirectEffects/p_generic_intervention_histdiff_by_group.png
--- a/assets/img/Images/DirectEffects/p_uspdc_intervention_distdiff_by_group.png
+++ b/assets/img/Images/DirectEffects/p_uspdc_intervention_distdiff_by_group.png
--- a/assets/img/Images/DirectEffects/p_uspdc_intervention_distdiff_styled.png
+++ b/assets/img/Images/DirectEffects/p_uspdc_intervention_distdiff_styled.png
--- a/assets/img/Images/DirectEffects/p_uspdc_intervention_histdiff_by_group.png
+++ b/assets/img/Images/DirectEffects/p_uspdc_intervention_histdiff_by_group.png
--- a/assets/img/Images/DirectEffects/posterior_p.png
+++ b/assets/img/Images/DirectEffects/posterior_p.png
--- a/assets/img/Images/DirectEffects/prior_mu.png
+++ b/assets/img/Images/DirectEffects/prior_mu.png
--- a/assets/img/Images/DirectEffects/prior_p.png
+++ b/assets/img/Images/DirectEffects/prior_p.png
--- a/assets/img/Images/DirectEffects/prior_sigma.png
+++ b/assets/img/Images/DirectEffects/prior_sigma.png
--- a/assets/img/Images/HistSnapshots.png
+++ b/assets/img/Images/HistSnapshots.png
--- a/assets/img/Images/HistTrialDurations_Faceted.png
+++ b/assets/img/Images/HistTrialDurations_Faceted.png
--- a/assets/img/Images/SnapshotsVsDurationVsTermination.png
+++ b/assets/img/Images/SnapshotsVsDurationVsTermination.png
--- a/assets/img/Images/TotalEffects/Parameters/01_generics.png
+++ b/assets/img/Images/TotalEffects/Parameters/01_generics.png
--- a/assets/img/Images/TotalEffects/Parameters/02_uspdc.png
+++ b/assets/img/Images/TotalEffects/Parameters/02_uspdc.png
--- a/assets/img/Images/TotalEffects/Parameters/1_2_generics_and_uspdc.png
+++ b/assets/img/Images/TotalEffects/Parameters/1_2_generics_and_uspdc.png
--- a/assets/img/Images/TotalEffects/default_p_generic_intervention_base.png
+++ b/assets/img/Images/TotalEffects/default_p_generic_intervention_base.png
--- a/assets/img/Images/TotalEffects/default_p_generic_intervention_distdiff.png
+++ b/assets/img/Images/TotalEffects/default_p_generic_intervention_distdiff.png
--- a/assets/img/Images/TotalEffects/default_p_generic_intervention_interv.png
+++ b/assets/img/Images/TotalEffects/default_p_generic_intervention_interv.png
--- a/assets/img/Images/TotalEffects/p_generic_intervention_distdiff_by_group.png
+++ b/assets/img/Images/TotalEffects/p_generic_intervention_distdiff_by_group.png
--- a/assets/img/Images/TotalEffects/p_generic_intervention_distdiff_styled.png
+++ b/assets/img/Images/TotalEffects/p_generic_intervention_distdiff_styled.png
--- a/assets/img/Images/TotalEffects/p_generic_intervention_histdiff_by_group.png
+++ b/assets/img/Images/TotalEffects/p_generic_intervention_histdiff_by_group.png
--- a/assets/img/Images/TotalEffects/p_uspdc_intervention_distdiff_by_group.png
+++ b/assets/img/Images/TotalEffects/p_uspdc_intervention_distdiff_by_group.png
--- a/assets/img/Images/TotalEffects/p_uspdc_intervention_distdiff_styled.png
+++ b/assets/img/Images/TotalEffects/p_uspdc_intervention_distdiff_styled.png
--- a/assets/img/Images/TotalEffects/p_uspdc_intervention_histdiff_by_group.png
+++ b/assets/img/Images/TotalEffects/p_uspdc_intervention_histdiff_by_group.png
--- a/assets/img/Images/TotalEffects/posterior_p.png
+++ b/assets/img/Images/TotalEffects/posterior_p.png
--- a/assets/img/Images/TotalEffects/prior_mu.png
+++ b/assets/img/Images/TotalEffects/prior_mu.png
--- a/assets/img/Images/TotalEffects/prior_sigma.png
+++ b/assets/img/Images/TotalEffects/prior_sigma.png
--- a/assets/img/Images/TotalEffects/priorp.png
+++ b/assets/img/Images/TotalEffects/priorp.png
--- a/assets/img/betas/parameter_across_groups/parameters_10_status_EBI.png
+++ b/assets/img/betas/parameter_across_groups/parameters_10_status_EBI.png
--- a/assets/img/betas/parameter_across_groups/parameters_11_status_Rec.png
+++ b/assets/img/betas/parameter_across_groups/parameters_11_status_Rec.png
--- a/assets/img/betas/parameter_across_groups/parameters_12_status_ANR.png
+++ b/assets/img/betas/parameter_across_groups/parameters_12_status_ANR.png
--- a/assets/img/betas/parameter_across_groups/parameters_1_Elapsed
+++ b/assets/img/betas/parameter_across_groups/parameters_1_Elapsed
--- a/assets/img/betas/parameter_across_groups/parameters_2_asinh(Generic
+++ b/assets/img/betas/parameter_across_groups/parameters_2_asinh(Generic
--- a/assets/img/betas/parameter_across_groups/parameters_3_asinh(Competitors
+++ b/assets/img/betas/parameter_across_groups/parameters_3_asinh(Competitors
--- a/assets/img/betas/parameter_across_groups/parameters_9_status_NYR.png
+++ b/assets/img/betas/parameter_across_groups/parameters_9_status_NYR.png
--- a/assets/img/betas/parameter_across_groups11+12_statusREC_and_statusANR.png
+++ b/assets/img/betas/parameter_across_groups11+12_statusREC_and_statusANR.png
--- a/assets/img/betas/parameter_across_groups2+3_generic_and_uspdc.png
+++ b/assets/img/betas/parameter_across_groups2+3_generic_and_uspdc.png
--- a/assets/img/betas/parameters_by_group/group_10_Respiratory.png
+++ b/assets/img/betas/parameters_by_group/group_10_Respiratory.png
--- a/assets/img/betas/parameters_by_group/group_11_Digestive.png
+++ b/assets/img/betas/parameters_by_group/group_11_Digestive.png
--- a/assets/img/betas/parameters_by_group/group_12_Skin
+++ b/assets/img/betas/parameters_by_group/group_12_Skin
--- a/assets/img/betas/parameters_by_group/group_13_Musculoskeletal.png
+++ b/assets/img/betas/parameters_by_group/group_13_Musculoskeletal.png
--- a/assets/img/betas/parameters_by_group/group_14_Genitourinary.png
+++ b/assets/img/betas/parameters_by_group/group_14_Genitourinary.png
--- a/assets/img/betas/parameters_by_group/group_17_Congential.png
+++ b/assets/img/betas/parameters_by_group/group_17_Congential.png
--- a/assets/img/betas/parameters_by_group/group_18_Symptoms,
+++ b/assets/img/betas/parameters_by_group/group_18_Symptoms,
--- a/assets/img/betas/parameters_by_group/group_1_Infections
+++ b/assets/img/betas/parameters_by_group/group_1_Infections
--- a/assets/img/betas/parameters_by_group/group_2_Neoplasms.png
+++ b/assets/img/betas/parameters_by_group/group_2_Neoplasms.png
--- a/assets/img/betas/parameters_by_group/group_3_Blood
+++ b/assets/img/betas/parameters_by_group/group_3_Blood
--- a/assets/img/betas/parameters_by_group/group_4_Endocrine,
+++ b/assets/img/betas/parameters_by_group/group_4_Endocrine,
--- a/assets/img/betas/parameters_by_group/group_5_Mental
+++ b/assets/img/betas/parameters_by_group/group_5_Mental
--- a/assets/img/betas/parameters_by_group/group_6_Nervous
+++ b/assets/img/betas/parameters_by_group/group_6_Nervous
--- a/assets/img/betas/parameters_by_group/group_7_Eye
+++ b/assets/img/betas/parameters_by_group/group_7_Eye
--- a/assets/img/betas/parameters_by_group/group_9_Circulatory.png
+++ b/assets/img/betas/parameters_by_group/group_9_Circulatory.png
--- a/assets/img/current/pred_dist_diff-delay-group.png
+++ b/assets/img/current/pred_dist_diff-delay-group.png
--- a/assets/img/current/pred_dist_diff-delay.png
+++ b/assets/img/current/pred_dist_diff-delay.png
--- a/assets/img/current/pred_dist_diff-generic-group.png
+++ b/assets/img/current/pred_dist_diff-generic-group.png
--- a/assets/img/current/pred_dist_diff-generic.png
+++ b/assets/img/current/pred_dist_diff-generic.png
--- a/assets/img/diagnostics/correlation_plot_beta_k_10_i_1-4.png
+++ b/assets/img/diagnostics/correlation_plot_beta_k_10_i_1-4.png
--- a/assets/img/diagnostics/correlation_plot_beta_k_10_i_5-8.png
+++ b/assets/img/diagnostics/correlation_plot_beta_k_10_i_5-8.png
--- a/assets/img/diagnostics/correlation_plot_beta_k_10_i_9-12.png
+++ b/assets/img/diagnostics/correlation_plot_beta_k_10_i_9-12.png
--- a/assets/img/diagnostics/correlation_plot_beta_k_11_i_1-4.png
+++ b/assets/img/diagnostics/correlation_plot_beta_k_11_i_1-4.png
--- a/assets/img/diagnostics/correlation_plot_beta_k_11_i_5-8.png
+++ b/assets/img/diagnostics/correlation_plot_beta_k_11_i_5-8.png
--- a/assets/img/diagnostics/correlation_plot_beta_k_11_i_9-12.png
+++ b/assets/img/diagnostics/correlation_plot_beta_k_11_i_9-12.png
--- a/assets/img/diagnostics/correlation_plot_beta_k_12_i_1-4.png
+++ b/assets/img/diagnostics/correlation_plot_beta_k_12_i_1-4.png
--- a/assets/img/diagnostics/correlation_plot_beta_k_12_i_5-8.png
+++ b/assets/img/diagnostics/correlation_plot_beta_k_12_i_5-8.png
--- a/assets/img/diagnostics/correlation_plot_beta_k_12_i_9-12.png
+++ b/assets/img/diagnostics/correlation_plot_beta_k_12_i_9-12.png
--- a/assets/img/diagnostics/correlation_plot_beta_k_13_i_1-4.png
+++ b/assets/img/diagnostics/correlation_plot_beta_k_13_i_1-4.png
--- a/assets/img/diagnostics/correlation_plot_beta_k_13_i_5-8.png
+++ b/assets/img/diagnostics/correlation_plot_beta_k_13_i_5-8.png
--- a/assets/img/diagnostics/correlation_plot_beta_k_13_i_9-12.png
+++ b/assets/img/diagnostics/correlation_plot_beta_k_13_i_9-12.png
--- a/assets/img/diagnostics/correlation_plot_beta_k_14_i_1-4.png
+++ b/assets/img/diagnostics/correlation_plot_beta_k_14_i_1-4.png
--- a/assets/img/diagnostics/correlation_plot_beta_k_14_i_5-8.png
+++ b/assets/img/diagnostics/correlation_plot_beta_k_14_i_5-8.png
--- a/assets/img/diagnostics/correlation_plot_beta_k_14_i_9-12.png
+++ b/assets/img/diagnostics/correlation_plot_beta_k_14_i_9-12.png
--- a/assets/img/diagnostics/correlation_plot_beta_k_15_i_1-4.png
+++ b/assets/img/diagnostics/correlation_plot_beta_k_15_i_1-4.png
--- a/assets/img/diagnostics/correlation_plot_beta_k_15_i_5-8.png
+++ b/assets/img/diagnostics/correlation_plot_beta_k_15_i_5-8.png
--- a/assets/img/diagnostics/correlation_plot_beta_k_15_i_9-12.png
+++ b/assets/img/diagnostics/correlation_plot_beta_k_15_i_9-12.png
--- a/assets/img/diagnostics/correlation_plot_beta_k_16_i_1-4.png
+++ b/assets/img/diagnostics/correlation_plot_beta_k_16_i_1-4.png
--- a/Show More
+++ b/Show More