From 5d9640ab8d1599e33599a58dc23e6de47c412adb Mon Sep 17 00:00:00 2001 From: will king Date: Thu, 28 Nov 2024 23:39:04 -0800 Subject: [PATCH] saving work --- Paper/sections/04_EconometricModel.tex | 80 ++++++++++++++------------ assets/preambles/MathPreamble.tex | 3 + 2 files changed, 47 insertions(+), 36 deletions(-) diff --git a/Paper/sections/04_EconometricModel.tex b/Paper/sections/04_EconometricModel.tex index de205f9..5c509fd 100644 --- a/Paper/sections/04_EconometricModel.tex +++ b/Paper/sections/04_EconometricModel.tex @@ -3,10 +3,10 @@ \begin{document} %% Describe goal -% Estimate probability distribution of normalized durations and conclusion statuses. -% Explain why this answers questions well. -% How do I propose estimating that? +The model I use is a +hierarchical logistic regression model where the +hierarchies are based on disease categories. %%NOTATION % change notation % i indexes trials for y and d @@ -16,51 +16,65 @@ First, some notation: \begin{itemize} \item $i$: indexes trials \item $n$: indexes trial snapshots. - \item $y_i$: whether each trial terminated (true) or completed (false). - \item $d_i$: indexes the ICD-10 disease categories per trial. + \item $y_i$: whether each trial + terminated (true, 1) or completed (false, 0). + \item $d_i$: indexes the ICD-10 disease category of the trial. \item $x_{i,n}$: represents the other dependent variables associated with the snapshot. 
- % This includes\footnote{No trials in the current dataset are ever suspended.}: - % \begin{enumerate} - % \item Elapsed duration - % \item arcsinh of the number of brands - % \item arcsinh of the DALYs from high SDI countries - % \item arcsinh of the DALYs from high-medium SDI countries - % \item Enrollment (no distinction between anticipated or actual) - % \item Dummy Status: Not yet recruiting - % \item Dummy Status: Recruiting - % \item Dummy Status: Active, not recruiting - % \item Dummy Status: Enrolling by invitation - % \end{enumerate} \end{itemize} -% The arcsinh transform is used because it is similar to a log transform but -% maps $\text{arcsinh}(0)=0$. - -The bayesian model to measure the direct effect of enrollment -is specified as a hierarchal logistic regression. +The goal is to take each snapshot and predict whether the trial terminates or completes. +The actual specification of the model to measure +the direct effect of enrollment is: \begin{align} y_i \sim \text{Bernoulli}(p_{i,n}) \\ - p_{i,n} = \text{logit}(x_{i,n} \vec \beta(d_n)) + p_{i,n} = \text{logit}^{-1}(x_{i,n} \vec \beta(d_i)) \end{align} Where beta is indexed by $d \in \{1,2,\dots,21,22\}$ for each general ICD-10 category. The betas are distributed \begin{align} - \beta(d) \sim \text{Normal}(\mu,\sigma I) + \beta(d_i) \sim \text{Normal}(\mu_i,\sigma_i I) \end{align} With hyperpriors +%Checked on 2024-11-27. Is correct. 
\todo{Double check that these are the priors I used.} \begin{align} \mu_k \sim \text{Normal}(0,0.05) \\ \sigma_k \sim \text{Gamma}(4,20) \end{align} -\todo{Double check that these are the priors I used.} +\todo{Double check actual spec} -Other variables are implicitly conditioned-on as they are used +The independent variables include: +\todo{Make sure data is described before this point.} +\begin{subequations} +\begin{align} + x_{i,n}\beta(d_i) + = & \bx{1}{\text{Elapsed Duration}} \\ + &+ \bx{2}{\arcsinh \left(\text{\# Generic compounds}\right)} \\ + &+ \bx{3}{\arcsinh \left(\text{\# Branded compounds}\right)} \\ + &+ \bx{4}{\text{\# DALYs in High SDI Countries}} \\ + &+ \bx{5}{\text{\# DALYs in High-Medium SDI Countries}} \\ + &+ \bx{6}{\text{\# DALYs in Medium SDI Countries}} \\ + &+ \bx{7}{\text{\# DALYs in Low-Medium SDI Countries}} \\ + &+ \bx{8}{\text{\# DALYs in Low SDI Countries}} \\ + &+ \bxi{9}{\text{Not yet Recruiting}}{\text{Trial Status}}\\ + &+ \bxi{10}{\text{Recruiting}}{\text{Trial Status}}\\ + &+ \bxi{11}{\text{Enrolling by Invitation Only}}{\text{Trial Status}}\\ + &+ \bxi{12}{\text{Active, not recruiting}}{\text{Trial Status}} +\end{align} +\end{subequations} +The arcsinh transform is used because it is similar to a log transform but +differentiably handles counts of zero since +$\text{arcsinh}(0) = \ln (0 + \sqrt{0^2 + 1}) =0$. +Note that because this is a hierarchical model, each ICD-10 disease category +gets its own set of parameters, and that is why the $\beta$s are parameterized +by $d_i$. +%%%% Not sure if space should go here. I think these work well together. +Other variables are implicitly controlled for as they are used to select the trials of interest. -I ensured that: +These include: \todo{double check these in the code.} \begin{itemize} \item The trial is Phase 3. @@ -70,15 +84,9 @@ I ensured that: This was because I wasn't sure how to handle it in the model when I started scraping the data. Later the website changed. 
- This is technically post selection in some cases. + This is technically post selection. + \todo{double check where this happened in the code. + I may have only done it in the CBO analysis.} } \end{itemize} - - -\todo{Make sure data is described before this point.} -\todo{Put in a standard econometrics model} -\begin{equation} - x\beta = \beta_0 + \beta_1 \times \text{test} - \label{eq:test} -\end{equation} \end{document} diff --git a/assets/preambles/MathPreamble.tex b/assets/preambles/MathPreamble.tex index 0b39634..22dd50c 100644 --- a/assets/preambles/MathPreamble.tex +++ b/assets/preambles/MathPreamble.tex @@ -10,6 +10,9 @@ %%%%%%%%%%%% MATH FORMATTING %%%%%%%%%%%%%%%%%%%%% %Helpful bits \newcommand{\bb}[1]{\mathbb{#1}} +\newcommand{\bx}[2]{\beta_{#1}(d_i) \times #2} +\newcommand{\bxi}[3]{\beta_{#1}(d_i) \times I_{#2}(#3)} +\newcommand{\arcsinh}{\text{arcsinh}} %Derivatives etc. \newcommand{\parder}[3]{\ensuremath{ \frac{\partial^{#3} #1}{\partial #2~^{#3}}}}