saving work

1 year ago · 5d9640ab8d
parent 3e6a8f10d4
commit 5d9640ab8d
2 changed files with 47 additions and 36 deletions
--- a/Paper/sections/04_EconometricModel.tex
+++ b/Paper/sections/04_EconometricModel.tex
@ -3,10 +3,10 @@

 \begin{document}
 %% Describe goal
-%   Estimate probability distribution of normalized durations and conclusion statuses.
-%   Explain why this answers questions well.
-%   How do I propose estimating that?

+The model I use is a 
+hierarchical logistic regression model where the 
+hierarchies are based on disease categories.
 %%NOTATION
 % change notation
 % i indexes trials for y and d 
@ -16,51 +16,65 @@ First, some notation:
 \begin{itemize}
    \item $i$: indexes trials
    \item $n$: indexes trial snapshots.
-    \item $y_i$: whether each trial terminated (true) or completed (false).
-    \item $d_i$: indexes the ICD-10 disease categories per trial.
+    \item $y_i$: whether each trial 
+        terminated (true, 1) or completed (false, 0).
+    \item $d_i$: indexes the ICD-10 disease category of the trial.
    \item $x_{i,n}$: represents the independent 
        variables associated with the snapshot.
-        % This includes\footnote{No trials in the current dataset are ever suspended.}:
-        % \begin{enumerate} 
-        %     \item Elapsed duration
-        %     \item arcsinh of the number of brands
-        %     \item arcsinh of the DALYs from high SDI countries
-        %     \item arcsinh of the DALYs from high-medium SDI countries
-        %     \item Enrollment (no distinction between anticipated or actual) 
-        %     \item Dummy Status: Not yet recruiting
-        %     \item Dummy Status: Recruiting
-        %     \item Dummy Status: Active, not recruiting
-        %     \item Dummy Status: Enrolling by invitation 
-        % \end{enumerate} 
 \end{itemize} 
-% The arcsinh transform is used because it is similar to a log transform but
-% maps $\text{arcsinh}(0)=0$.

-
-The bayesian model to measure the direct effect of enrollment
-is specified as a hierarchal logistic regression.
+The goal is to take each snapshot and predict whether its trial terminates.
+The actual specification of the model to measure 
+the direct effect of enrollment is:
 \begin{align}
    y_i \sim \text{Bernoulli}(p_{i,n}) \\
-        p_{i,n} = \text{logit}(x_{i,n} \vec \beta(d_n))
+        p_{i,n} = \text{logit}^{-1}(x_{i,n} \vec \beta(d_i))
 \end{align}
 Where beta is indexed by 
 $d \in \{1,2,\dots,21,22\}$ 
 for each general ICD-10 category.
 The betas are distributed
 \begin{align}
-    \beta(d) \sim \text{Normal}(\mu,\sigma I)
+    \beta_k(d_i) \sim \text{Normal}(\mu_k,\sigma_k)
 \end{align}
 With hyperpriors
+%Checked on 2024-11-27. Is correct. \todo{Double check that these are the priors I used.}
 \begin{align}
    \mu_k \sim \text{Normal}(0,0.05) \\
    \sigma_k \sim \text{Gamma}(4,20)
 \end{align}
-\todo{Double check that these are the priors I used.}
+\todo{Double check actual spec}


-Other variables are implicitly conditioned-on as they are used 
+The independent variables include: 
+\todo{Make sure data is described before this point.}
+\begin{subequations}
+\begin{align}
+    x_{i,n}\beta(d_i) 
+        = & \bx{1}{\text{Elapsed Duration}} \\
+        &+ \bx{2}{\arcsinh \left(\text{\# Generic compounds}\right)} \\
+        &+ \bx{3}{\arcsinh \left(\text{\# Branded compounds}\right)} \\ 
+        &+ \bx{4}{\text{\# DALYs in High SDI Countries}} \\
+        &+ \bx{5}{\text{\# DALYs in High-Medium SDI Countries}} \\
+        &+ \bx{6}{\text{\# DALYs in Medium SDI Countries}} \\
+        &+ \bx{7}{\text{\# DALYs in Low-Medium SDI Countries}} \\
+        &+ \bx{8}{\text{\# DALYs in Low SDI Countries}} \\
+        &+ \bxi{9}{\text{Not yet Recruiting}}{\text{Trial Status}}\\
+        &+ \bxi{10}{\text{Recruiting}}{\text{Trial Status}}\\
+        &+ \bxi{11}{\text{Enrolling by Invitation Only}}{\text{Trial Status}}\\
+        &+ \bxi{12}{\text{Active, not recruiting}}{\text{Trial Status}}
+\end{align}
+\end{subequations}
+The arcsinh transform is used because it is similar to a log transform but
+differentiably handles counts of zero since 
+$\text{arcsinh}(0) = \ln (0 + \sqrt{0^2 + 1}) =0$.
+Note that because this is a hierarchical model, each ICD-10 disease category 
+gets its own set of parameters, and that is why the $\beta$s are parameterized
+by $d_i$.
+%%%% Not sure if space should go here. I think these work well together.
+Other variables are implicitly controlled for as they are used 
 to select the trials of interest.
-I ensured that:
+These include:
        \todo{double check these in the code.}
 \begin{itemize}
    \item The trial is Phase 3.
@ -70,15 +84,9 @@ I ensured that:
        This was because I wasn't sure how to handle it in the model
        when I started scraping the data. 
        Later the website changed.
-        This is technically post selection in some cases.
+        This is technically post selection. 
+        \todo{double check where this happened in the code. 
+        I may have only done it in the CBO analysis.}
    }
 \end{itemize}
-
-
-\todo{Make sure data is described before this point.}
-\todo{Put in a standard econometrics model}
-\begin{equation}
-    x\beta = \beta_0 + \beta_1 \times \text{test}
-    \label{eq:test}
-\end{equation}
 \end{document}
--- a/assets/preambles/MathPreamble.tex
+++ b/assets/preambles/MathPreamble.tex
@ -10,6 +10,9 @@
 %%%%%%%%%%%% MATH FORMATTING %%%%%%%%%%%%%%%%%%%%%
 %Helpful bits
 \newcommand{\bb}[1]{\mathbb{#1}}
+\newcommand{\bx}[2]{\beta_{#1}(d_i) \times #2}
+\newcommand{\bxi}[3]{\beta_{#1}(d_i) \times I_{#2}(#3)}
+\newcommand{\arcsinh}{\text{arcsinh}}

 %Derivatives etc.
 \newcommand{\parder}[3]{\ensuremath{ \frac{\partial^{#3} #1}{\partial #2~^{#3}}}}