From 142670d08a44975427b583e41758a29dac79982c Mon Sep 17 00:00:00 2001
From: Will King <will@youainti.com>
Date: Sat, 24 Aug 2024 13:33:48 -0700
Subject: [PATCH 1/5] added some data that had been left untracked

---
 .gitattributes                                |  4 ++
 2023-09-06_aactdb_with_matches.sql.gzip       |  3 --
 .../2023-09-06_aactdb_with_matches.sql.gz     |  3 ++
 .../AACT_Reloader/StartRestoreContainer.sh    | 23 +++++++++
 justfile                                      | 47 ++++++++-----------
 ..._dc_pub_2023_release_2.0_updated_final.csv |  3 ++
 .../USP MMG/MMG_v8.0_Alignment_File.csv       |  3 ++
 ...yProductSystem_NationalDrugCodeExtract.csv |  3 ++
 scripts/drugtools/historical_nct_extractor.py | 16 +++----
 9 files changed, 66 insertions(+), 39 deletions(-)
 delete mode 100644 2023-09-06_aactdb_with_matches.sql.gzip
 create mode 100644 containers/AACT_Reloader/2023-09-06_aactdb_with_matches.sql.gz
 create mode 100755 containers/AACT_Reloader/StartRestoreContainer.sh
 create mode 100644 other_data/USP DC/usp_dc_pub_2023_release_2.0_updated_final.csv
 create mode 100644 other_data/USP MMG/MMG_v8.0_Alignment_File.csv
 create mode 100644 other_data/VA Formulary/PharmacyProductSystem_NationalDrugCodeExtract.csv

diff --git a/.gitattributes b/.gitattributes
index 20425b7..b720f8a 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,2 +1,6 @@
 *.sql.gzip filter=lfs diff=lfs merge=lfs -text
 *.xlsx filter=lfs diff=lfs merge=lfs -text
+containers/AACT_Reloader/2023-09-06_aactdb_with_matches.sql.gz filter=lfs diff=lfs merge=lfs -text
+other_data/USP[[:space:]]DC/usp_dc_pub_2023_release_2.0_updated_final.csv filter=lfs diff=lfs merge=lfs -text
+other_data/USP[[:space:]]MMG/MMG_v8.0_Alignment_File.csv filter=lfs diff=lfs merge=lfs -text
+other_data/VA[[:space:]]Formulary/PharmacyProductSystem_NationalDrugCodeExtract.csv filter=lfs diff=lfs merge=lfs -text
diff --git a/2023-09-06_aactdb_with_matches.sql.gzip b/2023-09-06_aactdb_with_matches.sql.gzip
deleted file mode 100644
index c7282ee..0000000
--- a/2023-09-06_aactdb_with_matches.sql.gzip
+++ /dev/null
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9ac5e99d27f53b81380d98f546c656332fac2be05dabea0ade16fe692a1334d6
-size 1897211526
diff --git a/containers/AACT_Reloader/2023-09-06_aactdb_with_matches.sql.gz b/containers/AACT_Reloader/2023-09-06_aactdb_with_matches.sql.gz
new file mode 100644
index 0000000..7ff43a7
--- /dev/null
+++ b/containers/AACT_Reloader/2023-09-06_aactdb_with_matches.sql.gz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd5142f25ff9ef65048c02d5173b5e2a90f4a07513480d5e8c399cf9da39e678
+size 1897211561
diff --git a/containers/AACT_Reloader/StartRestoreContainer.sh b/containers/AACT_Reloader/StartRestoreContainer.sh
new file mode 100755
index 0000000..d495fbf
--- /dev/null
+++ b/containers/AACT_Reloader/StartRestoreContainer.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+RESTORE_DUMP_GZ=2023-09-06_aactdb_with_matches.sql.gz
+POSTGRES_USER=root
+POSTGRES_PASSWORD=root
+
+#start container
+podman run \
+    -e POSTGRES_PASSWORD="${POSTGRES_PASSWORD}" \
+    -e POSTGRES_USER="${POSTGRES_USER}" \
+    -e POSTGRES_DB=aact_db \
+    --name aact_db \
+    --detatch \
+    --shm-size=512mb \
+    --volume ${RESTORE_DUMP_GZ}:/backup/${RESTORE_DUMP_GZ} \
+    --ports 5432:5432\
+    postgres:14-alpine
+
+
+
+#execute within container
+podman exec aact_db \
+    "gzip --keep --stdout --decompress /backup/2023-09-06_aactdb_with_matches.sql.gz | psql -U ${POSTGRES_USER}"
diff --git a/justfile b/justfile
index a3691ed..0be52ea 100644
--- a/justfile
+++ b/justfile
@@ -5,23 +5,19 @@
 #   - move postgress login credentials (allow them to be printed from just while setting up)
 
 
-#paths for aact_db (postgres)
-aact_download_link := "https://ctti-aact.nyc3.digitaloceanspaces.com/27grtsnhtccplxapj2o8ak9aotvv"
-aact_download_file := "2022-12-23_postgres_data.zip"
-aact_download_path := "./containers/AACT_downloader/aact_downloads"
-aact_zipped_data_filepath := aact_download_path / aact_download_file
+data_link := "https://ctti-aact.nyc3.digitaloceanspaces.com/27grtsnhtccplxapj2o8ak9aotvv"
+data_file := "2022-12-23_postgres_data.zip"
+data_path := "./containers/AACT_downloader/aact_downloads"
+data_filepath := data_path / data_file
 
 #must match the 'container name: aact_db' in the docker-compose.yaml
-docker_container := `docker container ls -a | grep "aact_db|rxnav_db" | cut -f 1 -d " " | tr "\n" " "`
-
-#paths for rxnavinabox
-rxnav_path := "./containers/RxNav-In-a-box"
-rxnav_version := "rxnav-in-a-box-20230103"
-rxnav_data_path := rxnav_path / rxnav_version / "mysql" / "02_data.sql"
+docker_container := `docker container ls -a | grep aact_db | cut -f 1 -d " " | tr "\n" " "`
 
 #Various paths for docker stuff
-docker-compose_path := "./containers/docker-compose.yaml"
+docker-compose_path := "./AACT_downloader/docker-compose.yaml"
 
+#rxnorm_mappings
+rxnorm_mappings_url := "https://dailymed-data.nlm.nih.gov/public-release-files/rxnorm_mappings.zip"
 
 #Number of historical trials to download.
 count := "100"
@@ -32,23 +28,18 @@ check-status:
     docker --version
     #check if python version > 3.10. 
     python --version
-    #python -c 'import sys; exit(sys.hexversion >= 50859504)'
+    python -c 'import sys; sys.exit(sys.hexversion < 50859504)'  # fail the recipe when python is older than 3.8.13 (hexversion 0x03080DF0)
     curl --version
     echo "current docker containers:{{docker_container}}"
 
-
+#Setup the AACT container
 setup-containers: 
-    echo "todo"
     @echo "Check for downloaded data"
-    #aact
-    [ -s {{aact_download_path}}/postgres_data.dmp ]
-    #rxnav
-    [ -s {{rxnav_data_path}} ]
+    [ -s {{data_path}}/postgres_data.dmp ]
 
     #run docker compose
-    @echo "Setting up AACT_db & RxNav_db container"
+    @echo "Setting up AACT container"
     docker-compose -f {{docker-compose_path}} up -d
-    
 
 #Stop the appropriate docker container
 stop-containers:
@@ -69,13 +60,10 @@ clean-docker: stop-containers
 
 #Download the AACT data
 download-aact-data:
-    #download
-    curl {{aact_download_link}} > {{aact_zipped_data_filepath}}
-    unzip {{aact_zipped_data_filepath}} -d {{aact_download_path}}
-    rm {{aact_zipped_data_filepath}}
+    curl {{data_link}} > {{data_filepath}}  # write the archive to the same path the unzip and rm steps below read
+    unzip {{data_filepath}} -d {{data_path}}
+    rm {{data_filepath}}
 
-download-rxnav-data:
-    echo "Currently manually downloaded."
 
 #build based on previously downloaded data
 build: check-status setup-containers
@@ -117,3 +105,8 @@ get-nsde:
     cd market_data && bash download_nsde.sh
     cd market_data && python extract_nsde.py
 
+get-rxnorm-mappings:
+    #this may not be needed, all it does is match spls to rxcuis and I think I already have that.
+    curl {{rxnorm_mappings_url}} > ./market_data/rxnorm_mappings.zip
+    cd ./market_data && unzip ./rxnorm_mappings.zip
+    rm ./market_data/rxnorm_mappings.zip
diff --git a/other_data/USP DC/usp_dc_pub_2023_release_2.0_updated_final.csv b/other_data/USP DC/usp_dc_pub_2023_release_2.0_updated_final.csv
new file mode 100644
index 0000000..6cd7c22
--- /dev/null
+++ b/other_data/USP DC/usp_dc_pub_2023_release_2.0_updated_final.csv	
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d08d3944a859c0b1f6bbd466ca027fc46c86ef5bb0328cb005fa002b7b61e70b
+size 2451625
diff --git a/other_data/USP MMG/MMG_v8.0_Alignment_File.csv b/other_data/USP MMG/MMG_v8.0_Alignment_File.csv
new file mode 100644
index 0000000..f12587c
--- /dev/null
+++ b/other_data/USP MMG/MMG_v8.0_Alignment_File.csv	
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:85859dae3971460d36e0643ee1396cb646dba158b75862d557210cb2c50707a9
+size 874058
diff --git a/other_data/VA Formulary/PharmacyProductSystem_NationalDrugCodeExtract.csv b/other_data/VA Formulary/PharmacyProductSystem_NationalDrugCodeExtract.csv
new file mode 100644
index 0000000..e383df4
--- /dev/null
+++ b/other_data/VA Formulary/PharmacyProductSystem_NationalDrugCodeExtract.csv	
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c877a461be9e75565f78d0552e76f597b5cce709c82a3f9ad30dcc0f26ddafc
+size 32481883
diff --git a/scripts/drugtools/historical_nct_extractor.py b/scripts/drugtools/historical_nct_extractor.py
index ab62bf3..326d4fd 100644
--- a/scripts/drugtools/historical_nct_extractor.py
+++ b/scripts/drugtools/historical_nct_extractor.py
@@ -126,17 +126,18 @@ def extract_submission_dates(soup):
 
     version_date_dict = {}
 
-    for row in reversed(table_rows):
+    for row in table_rows:
         # if it is <td headers="VersionNumber">xx</td> then it contains what we need.
+        version_number = None
+        version_date = None
         for td in row.findChildren("td"):
             if ("headers" in td.attrs):
                 if (td.attrs["headers"][0]=="VersionNumber"):
                     version_number = int(td.text)
                 elif (td.attrs["headers"][0]=="VersionDate"):
-                    version_date = td.text
-            version_date_dict[version_number] = datetime.strptime(version_date , "%B %d, %Y")
+                    version_date = datetime.strptime(td.text.strip() , "%B %d, %Y")
 
-    print(version_date_dict)
+        if version_number is not None: version_date_dict[version_number] = version_date  # skip rows that have no VersionNumber cell
     return version_date_dict
 
 def optional_strip(possible_string):
@@ -396,14 +397,12 @@ date_MMMM_DD_YYYY = "%B %d, %Y"
 
 def get_data_from_versions(nct_id,html, version_a_int, version_b_int):
     soup = BeautifulSoup(html,"lxml")
-    print(getting_data_from_versions)
 
     version_date_dict = extract_submission_dates(soup)
-    print(version_date_dict)
 
     #preallocate version data
-    version_a = VersionData(nct_id, version_a_int, version_date_dict[version_a_int])
-    version_b = VersionData(nct_id, version_b_int, version_date_dict[version_b_int])
+    version_a = VersionData(nct_id, version_a_int, version_date_dict.get(version_a_int))
+    version_b = VersionData(nct_id, version_b_int, version_date_dict.get(version_b_int))
 
     #extract data from html and put it in the preallocated objects
     get_forms(soup, version_a, version_b)
@@ -424,7 +423,6 @@ def run():
             curse.execute(sql)
             for response in tqdm(curse.fetchall()):
                 nct_id, version_a, version_b, html = response
-                print(nct_id)
 
                 print(nct_id, version_a, version_b) if VERBOSE else ""
 

From d90539a679d30b96fed21414bd6f69d1690a241a Mon Sep 17 00:00:00 2001
From: Will King <will@youainti.com>
Date: Sat, 24 Aug 2024 16:19:40 -0700
Subject: [PATCH 2/5] updated data-restoration script. Currently working

---
 containers/AACT_Reloader/.gitattributes       |  1 +
 .../AACT_Reloader/StartRestoreContainer.sh    | 32 ++++++++++++++-----
 .../2023-09-06_aactdb_with_matches.sql.gz     |  0
 3 files changed, 25 insertions(+), 8 deletions(-)
 create mode 100644 containers/AACT_Reloader/.gitattributes
 rename containers/AACT_Reloader/{ => backup}/2023-09-06_aactdb_with_matches.sql.gz (100%)

diff --git a/containers/AACT_Reloader/.gitattributes b/containers/AACT_Reloader/.gitattributes
new file mode 100644
index 0000000..a20c5a3
--- /dev/null
+++ b/containers/AACT_Reloader/.gitattributes
@@ -0,0 +1 @@
+backup/2023-09-06_aactdb_with_matches.sql.gz filter=lfs diff=lfs merge=lfs -text
diff --git a/containers/AACT_Reloader/StartRestoreContainer.sh b/containers/AACT_Reloader/StartRestoreContainer.sh
index d495fbf..e647c84 100755
--- a/containers/AACT_Reloader/StartRestoreContainer.sh
+++ b/containers/AACT_Reloader/StartRestoreContainer.sh
@@ -3,21 +3,37 @@
 RESTORE_DUMP_GZ=2023-09-06_aactdb_with_matches.sql.gz
 POSTGRES_USER=root
 POSTGRES_PASSWORD=root
+POSTGRES_DB=aact_db
 
 #start container
 podman run \
     -e POSTGRES_PASSWORD="${POSTGRES_PASSWORD}" \
     -e POSTGRES_USER="${POSTGRES_USER}" \
-    -e POSTGRES_DB=aact_db \
-    --name aact_db \
-    --detatch \
+    -e POSTGRES_DB="${POSTGRES_DB}" \
+    --name "${POSTGRES_DB}" \
+    --detach \
     --shm-size=512mb \
-    --volume ${RESTORE_DUMP_GZ}:/backup/${RESTORE_DUMP_GZ} \
-    --ports 5432:5432\
+    --volume ./backup/:/backup/ \
+    -p 5432:5432\
     postgres:14-alpine
 
 
+sleep 10
 
-#execute within container
-podman exec aact_db \
-    "gzip --keep --stdout --decompress /backup/2023-09-06_aactdb_with_matches.sql.gz | psql -U ${POSTGRES_USER}"
+# Function to check if PostgreSQL is ready
+function check_postgres {
+  podman exec -i "${POSTGRES_DB}" psql -h localhost -U "${POSTGRES_USER}" -d "${POSTGRES_DB}" -c '\q' > /dev/null 2>&1
+}
+
+# Wait for PostgreSQL to be ready
+until check_postgres; do
+  echo "Waiting for PostgreSQL to be ready..."
+  sleep 4
+done
+
+echo "PostgreSQL is ready. Restoring the database..."
+
+# Decompress the dump file and restore it to the database
+podman exec -i "${POSTGRES_DB}" sh -c "gunzip -c /backup/${RESTORE_DUMP_GZ} | psql -h localhost -U ${POSTGRES_USER} -d ${POSTGRES_DB}"
+
+echo "Database restoration complete."
diff --git a/containers/AACT_Reloader/2023-09-06_aactdb_with_matches.sql.gz b/containers/AACT_Reloader/backup/2023-09-06_aactdb_with_matches.sql.gz
similarity index 100%
rename from containers/AACT_Reloader/2023-09-06_aactdb_with_matches.sql.gz
rename to containers/AACT_Reloader/backup/2023-09-06_aactdb_with_matches.sql.gz

From 9aaf0077919c5da7bc5bb20ea3549adc3e096daf Mon Sep 17 00:00:00 2001
From: will king <youainti@protonmail.com>
Date: Sat, 7 Sep 2024 09:59:10 -0700
Subject: [PATCH 3/5] removed accidental inclusion of .dbeaver configuration

---
 .dbeaver/.credentials-config.json.bak | Bin 336 -> 0 bytes
 .dbeaver/.data-sources.json.bak       |  64 --------------------------
 .dbeaver/.project-metadata.json.bak   |   1 -
 .gitignore                            |   2 +-
 4 files changed, 1 insertion(+), 66 deletions(-)
 delete mode 100644 .dbeaver/.credentials-config.json.bak
 delete mode 100644 .dbeaver/.data-sources.json.bak
 delete mode 100644 .dbeaver/.project-metadata.json.bak

diff --git a/.dbeaver/.credentials-config.json.bak b/.dbeaver/.credentials-config.json.bak
deleted file mode 100644
index 5be47bd4b59987ff12f333b2280f8a3ba977cc42..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 336
zcmV-W0k8h~=exeU`Wsxb{O+H>=V;Jx{r!L+X$}Pf!~)(rxt%%my>5}$b|q<ws{YSs
z%2{1wWB|G$R36Ng7p~xpBzu=n^AHHi%4YCQ=BQkV4UiHqL*xYIhMmsZr+BnhG#;Fr
zeme}}AB@o7`W6-`R84lJ`K+~p;r308R?kF=$g*)bw-gh~-Es*FFj4wP&|YXW!E4QV
z2=@78m1*e&SWn7RDvt(>q@;NbTcu^cFoPVYT+L!fatnvNWAQ_5W5XVmj5;0r14wO%
zkq3-p0rNVA;TYwc03xJ<8>&*Ox%{4v4+&%-9T3duT#;(w<FK}p`%k<Gu*jSzd?Pj`
zRh{y`PSwcqn7px=)!lCRFxOME+puP^*7k6i7ve|_$X~Ww9Sl7xO`clBA`0&T#=YzZ
i@s?_S(14SdtD64tthNN~#<CS>wI2GWCmHxVt#y*vMXMhG

diff --git a/.dbeaver/.data-sources.json.bak b/.dbeaver/.data-sources.json.bak
deleted file mode 100644
index 1ed4cce..0000000
--- a/.dbeaver/.data-sources.json.bak
+++ /dev/null
@@ -1,64 +0,0 @@
-{
-	"folders": {},
-	"connections": {
-		"mariaDB-186c896820e-6ff11b5b802d8b82": {
-			"provider": "mysql",
-			"driver": "mariaDB",
-			"name": "rxnav",
-			"save-password": true,
-			"configuration": {
-				"host": "will-office",
-				"port": "3306",
-				"url": "jdbc:mariadb://will-office:3306/",
-				"configurationType": "MANUAL",
-				"type": "dev",
-				"auth-model": "native"
-			}
-		},
-		"postgres-jdbc-186c896a347-2a3d946d2dea4df7": {
-			"provider": "postgresql",
-			"driver": "postgres-jdbc",
-			"name": "aact_db",
-			"save-password": true,
-			"configuration": {
-				"host": "100.95.169.11",
-				"port": "5432",
-				"database": "aact_db",
-				"url": "jdbc:postgresql://100.95.169.11:5432/aact_db",
-				"configurationType": "MANUAL",
-				"type": "dev",
-				"provider-properties": {},
-				"auth-model": "native"
-			},
-			"custom-properties": {
-				"resultset.maxrows": "500"
-			}
-		},
-		"postgres-jdbc-186cd8f479f-6cc3c10c8adc3359": {
-			"provider": "postgresql",
-			"driver": "postgres-jdbc",
-			"name": "drugcentral",
-			"save-password": true,
-			"configuration": {
-				"host": "localhost",
-				"port": "54320",
-				"database": "postgres",
-				"url": "jdbc:postgresql://localhost:54320/postgres",
-				"configurationType": "MANUAL",
-				"type": "dev",
-				"auth-model": "native"
-			}
-		}
-	},
-	"connection-types": {
-		"dev": {
-			"name": "Development",
-			"color": "255,255,255",
-			"description": "Regular development database",
-			"auto-commit": true,
-			"confirm-execute": false,
-			"confirm-data-change": false,
-			"auto-close-transactions": false
-		}
-	}
-}
\ No newline at end of file
diff --git a/.dbeaver/.project-metadata.json.bak b/.dbeaver/.project-metadata.json.bak
deleted file mode 100644
index 1334ab3..0000000
--- a/.dbeaver/.project-metadata.json.bak
+++ /dev/null
@@ -1 +0,0 @@
-{"resources":{"Scripts/ASSOICATING NCTIDs to NDCs and Marketing dates.sql":{"default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db","default-schema":"public"},"Scripts/Data_summaries.sql":{"default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db","default-schema":"public"},"Scripts/DevelopingLinks.sql":{"default-schema":"public","default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db"},"Scripts/DiseaseBurdens_create_table.sql":{"default-schema":"public","default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db"},"Scripts/GlobalBurdensOfDisease2019Codebook.sql":{"default-schema":"DiseaseBurden","default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db"},"Scripts/GroupingTrials.sql":{"default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db","default-schema":"public"},"Scripts/Script.sql":{"default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db","default-schema":"public"},"Scripts/TablesAndViews_Public.sql":{"default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db","default-schema":"public"},"development_sql/ASSOICATING NCTIDs to NDCs and Marketing dates.sql":{"default-schema":"public","default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db"}}}
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 0e6b8e4..dd8f5aa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -194,4 +194,4 @@ containers/drugcentral/docker-entrypoint-initdb.d/*.sql
 containers/drugcentral/docker-entrypoint-initdb.d/*.sql.gz
 containers/drugcentral/db_store/*
 
-
+.dbeaver/

From dfb041d12b7f8ad6ece1d8d16d0d8ffe0d3101c1 Mon Sep 17 00:00:00 2001
From: will king <youainti@protonmail.com>
Date: Sat, 7 Sep 2024 10:00:38 -0700
Subject: [PATCH 4/5] Adding sql that was sitting on home computer

---
 Scripts/VariousDevelopmentsForAnalysis.sql | 308 +++++++++++++++++++++
 1 file changed, 308 insertions(+)
 create mode 100644 Scripts/VariousDevelopmentsForAnalysis.sql

diff --git a/Scripts/VariousDevelopmentsForAnalysis.sql b/Scripts/VariousDevelopmentsForAnalysis.sql
new file mode 100644
index 0000000..9e312fc
--- /dev/null
+++ b/Scripts/VariousDevelopmentsForAnalysis.sql
@@ -0,0 +1,308 @@
+select * from formatted_data_with_planned_enrollment fdwpe 
+;
+
+
+select * from formatted_data_mat fdm
+;
+
+select count(distinct condition ) from formatted_data_mat fdm ; -- terminated so it does not run into the next select
+
+select nct_id, fdm.current_status , count(*) 
+from formatted_data_mat fdm 
+group by nct_id  , fdm.current_status 
+order by nct_id 
+;
+
+select * from formatted_data_mat fdm ;
+
+
+-- group with trial split
+with cte as (
+select nct_id
+from formatted_data_mat fdm 
+group by nct_id
+having count(distinct current_status) > 1
+order by nct_id
+)
+select 
+    fdm.nct_id
+    , current_status
+    , earliest_date_observed 
+    , elapsed_duration 
+    , n_brands
+    , category_id
+    , h_sdi_val 
+    , h_sdi_u95
+    , h_sdi_l95
+    , hm_sdi_val 
+    , hm_sdi_u95
+    , hm_sdi_l95
+    , m_sdi_val 
+    , m_sdi_u95
+    , m_sdi_l95
+    , lm_sdi_val 
+    , lm_sdi_u95
+    , lm_sdi_l95
+    , l_sdi_val 
+    , l_sdi_u95
+    , l_sdi_l95
+from formatted_data_mat fdm
+    join cte on cte.nct_id = fdm.nct_id 
+group by 
+    fdm.nct_id
+    , current_status
+    , earliest_date_observed 
+    , elapsed_duration 
+    , n_brands
+    , category_id
+    , h_sdi_val 
+    , h_sdi_u95
+    , h_sdi_l95
+    , hm_sdi_val 
+    , hm_sdi_u95
+    , hm_sdi_l95
+    , m_sdi_val 
+    , m_sdi_u95
+    , m_sdi_l95
+    , lm_sdi_val 
+    , lm_sdi_u95
+    , lm_sdi_l95
+    , l_sdi_val 
+    , l_sdi_u95
+    , l_sdi_l95
+order by nct_id , earliest_date_observed 
+;
+
+select count(distinct category_id ) from formatted_data_mat fdm ; -- NOTE(review): relation was missing; assumed to match the query below
+
+
+select distinct category_id  from formatted_data_mat fdm 
+;
+
+
+
+
+
+-- group with trial split
+with cte as (
+select nct_id
+from formatted_data_mat fdm 
+group by nct_id
+having count(distinct current_status) > 1
+order by nct_id
+)
+select 
+    fdm.nct_id
+    , current_status
+    , earliest_date_observed 
+    , elapsed_duration 
+    , n_brands
+    , category_id
+    , h_sdi_val 
+    , h_sdi_u95
+    , h_sdi_l95
+    , hm_sdi_val 
+    , hm_sdi_u95
+    , hm_sdi_l95
+    , m_sdi_val 
+    , m_sdi_u95
+    , m_sdi_l95
+    , lm_sdi_val 
+    , lm_sdi_u95
+    , lm_sdi_l95
+    , l_sdi_val 
+    , l_sdi_u95
+    , l_sdi_l95
+from formatted_data_mat fdm
+    join cte on cte.nct_id = fdm.nct_id 
+group by 
+    fdm.nct_id
+    , current_status
+    , earliest_date_observed 
+    , elapsed_duration 
+    , n_brands
+    , category_id
+    , h_sdi_val 
+    , h_sdi_u95
+    , h_sdi_l95
+    , hm_sdi_val 
+    , hm_sdi_u95
+    , hm_sdi_l95
+    , m_sdi_val 
+    , m_sdi_u95
+    , m_sdi_l95
+    , lm_sdi_val 
+    , lm_sdi_u95
+    , lm_sdi_l95
+    , l_sdi_val 
+    , l_sdi_u95
+    , l_sdi_l95
+order by nct_id , earliest_date_observed 
+; --TODO: join to usp dc dataset
+
+
+
+
+WITH trialncts AS (
+         SELECT DISTINCT ts.nct_id
+           FROM history.trial_snapshots ts
+), nct_to_cui AS (
+         SELECT bi.nct_id,
+            bi.downcase_mesh_term,
+            rr.tty2,
+            rr.rxcui2 AS approved_drug_rxcui,
+            count(*) AS count
+           FROM ctgov.browse_interventions bi
+             LEFT JOIN rxnorm_migrated.rxnorm_props rp ON bi.downcase_mesh_term::text = rp.propvalue1::text
+             LEFT JOIN rxnorm_migrated.rxnorm_relations rr ON rr.rxcui1 = rp.rxcui
+          WHERE (bi.nct_id::text IN ( SELECT trialncts.nct_id
+                   FROM trialncts)) AND bi.mesh_type::text = 'mesh-list'::text AND rp.propname::text = 'Active_ingredient_name'::text AND (rr.tty2::text = ANY (ARRAY['BPCK'::text, 'SCD'::text, 'SBD'::text, 'GPCK'::text]))
+          GROUP BY bi.nct_id, bi.downcase_mesh_term, rr.tty2, rr.rxcui2
+        )   
+ SELECT nct_to_cui.nct_id,
+    ud."USP Category",
+    ud."USP Class"
+   FROM nct_to_cui
+     JOIN "Formularies".usp_dc ud ON ud.rxcui::bpchar = nct_to_cui.approved_drug_rxcui
+  GROUP BY nct_to_cui.nct_id, ud."USP Category", ud."USP Class"
+  ORDER BY nct_to_cui.nct_id;
+
+  
+  
+  
+CREATE MATERIALIZED VIEW "Formularies".nct_to_brands_through_uspdc
+AS
+WITH trialncts AS (
+         SELECT DISTINCT ts.nct_id
+           FROM history.trial_snapshots ts
+)
+SELECT 
+    bi.nct_id,
+    count( distinct rr2.rxcui2 ) as brand_name_count
+    FROM ctgov.browse_interventions bi
+      LEFT JOIN rxnorm_migrated.rxnorm_props rp ON bi.downcase_mesh_term::text = rp.propvalue1::text --match mesh terms to rxcui
+      LEFT JOIN rxnorm_migrated.rxnorm_relations rr ON rr.rxcui1 = rp.rxcui -- match rxcui to relations between rxcuis
+      LEFT JOIN rxnorm_migrated.rxnorm_relations rr2 ON rr.rxcui2 = rr2.rxcui1  -- match rxcui to relations between rxcuis
+WHERE 
+    (bi.nct_id::text IN (SELECT trialncts.nct_id FROM trialncts)) --check the nct_id is in our list 
+    AND 
+    bi.mesh_type::text = 'mesh-list'::text --we are only looking at mesh "list" rxcuis
+    AND rp.propname::text = 'Active_ingredient_name'::text  --and we only care about active ingredients linked to \/\/\/\/\/
+    AND (rr.tty2::text = ANY (ARRAY['BPCK'::text, 'SCD'::text, 'SBD'::text, 'GPCK'::text])) --and we are linking from active ingredients ^^^^ to branded packs
+    AND (rr2.tty2::text = 'BN') --and from branded packs back to brand names
+GROUP BY bi.nct_id --remove duplicates
+;
+
+
+
+/* 
+ * 
+ */
+
+
+select 
+    fdqpe.nct_id
+    --,fdqpe.start_date
+    --,fdqpe.current_enrollment
+    --,fdqpe.enrollment_category
+    ,fdqpe.current_status 
+    ,fdqpe.earliest_date_observed 
+    ,fdqpe.elapsed_duration
+    ,fdqpe.n_brands as identical_brands
+    ,ntbtu.brand_name_count 
+    ,fdqpe.category_id
+    ,fdqpe.final_status
+    ,fdqpe.h_sdi_val
+    --,fdqpe.h_sdi_u95
+    --,fdqpe.h_sdi_l95
+    ,fdqpe.hm_sdi_val
+    --,fdqpe.hm_sdi_u95
+    --,fdqpe.hm_sdi_l95
+    ,fdqpe.m_sdi_val
+    --,fdqpe.m_sdi_u95
+    --,fdqpe.m_sdi_l95
+    ,fdqpe.lm_sdi_val
+    --,fdqpe.lm_sdi_u95
+    --,fdqpe.lm_sdi_l95
+    ,fdqpe.l_sdi_val
+    --,fdqpe.l_sdi_u95
+    --,fdqpe.l_sdi_l95
+from formatted_data_mat fdqpe
+    join "Formularies".nct_to_brands_through_uspdc ntbtu
+        on fdqpe.nct_id = ntbtu.nct_id 
+;
+
+--example of multiple reopenings
+select * 
+from formatted_data_mat fdm 
+where nct_id = 'NCT01239797' ;
+
+--attempt to automatically find transition periods
+with cte1 as (
+	select nct_id, min(earliest_date_observed) over (partition by nct_id) as earliest_closed_enrollment
+	from formatted_data_mat fdm 
+	where current_status = 'Active, not recruiting'
+), cte2 as (
+	select nct_id, max(earliest_date_observed) over (partition by nct_id) latest_open_enrollment
+	from formatted_data_mat fdm 
+	where current_status != 'Active, not recruiting'
+)
+select 
+	cte1.nct_id
+	,cte1.earliest_closed_enrollment
+	,cte2.latest_open_enrollment
+	,cte1.earliest_closed_enrollment - cte2.latest_open_enrollment 
+from cte1
+	join cte2 on cte1.nct_id = cte2.nct_id ; -- statement ends here; the group by below is disabled
+/*group by 
+	cte1.nct_id
+	,cte1.earliest_closed_enrollment
+	,cte2.latest_open_enrollment
+*/
+
+	
+
+/* So occasionally a study reopens enrollment.
+ * If that didn't happen, then I could just find the first enrollment matching X and/or last enrollment matching Y
+ * to get the transitions
+ * Instead I need to create shifts of statuses between snapshots, and then remove all of those that did not change. 
+ * 
+ * Better yet, just get the last shift to ANR.
+ * */
+
+	
+/* Take each entry and get the status from a lagged snapshot
+ * Then select each snapshot moving from previous_state to ANR
+ * and filter out everything except the last one.
+ * */
+with cte as (
+select 
+	nct_id
+	,lag(current_status, 1) over (partition by nct_id order by earliest_date_observed)  as previous_status
+	,current_status
+	,earliest_date_observed as date_current
+from formatted_data_mat fdm
+), cte2 as (
+-- distinct on + order by keeps only the latest qualifying row per nct_id;
+-- grouping by date_current would make max() a no-op and keep every transition.
+select distinct on (nct_id)
+	nct_id 
+	,previous_status
+	,current_status 
+	,date_current as date_current_max
+from cte
+where 
+	previous_status != current_status -- also drops each trial's first snapshot, whose lagged status is null
+	and
+	current_status = 'Active, not recruiting'
+order by 
+	nct_id
+	,date_current desc
+)
+select * 
+from formatted_data_mat fdm
+	join cte2 
+		on cte2.nct_id = fdm.nct_id 
+		and cte2.date_current_max = fdm.earliest_date_observed 
+; --join back into 
+	
\ No newline at end of file

From 495955170c00dbe3c1316f403661311f89a3b08d Mon Sep 17 00:00:00 2001
From: will king <youainti@protonmail.com>
Date: Sat, 7 Sep 2024 10:20:06 -0700
Subject: [PATCH 5/5] changed data setup path for clarity

---
 .../Icd10ConditionsMatching/Icd10ConditionsMatching/__init__.py   | 0
 .../Icd10ConditionsMatching/db_interface.py                       | 0
 .../Icd10ConditionsMatching/Icd10ConditionsMatching/login.py      | 0
 .../Icd10ConditionsMatching/Icd10ConditionsMatching/model.py      | 0
 .../Icd10ConditionsMatching/templates/base.html                   | 0
 .../Icd10ConditionsMatching/templates/validation_index.html       | 0
 .../Icd10ConditionsMatching/templates/validation_of_trial.html    | 0
 .../Icd10ConditionsMatching/Icd10ConditionsMatching/validation.py | 0
 {scripts => data_mgmt_scripts}/Icd10ConditionsMatching/setup.py   | 0
 {scripts => data_mgmt_scripts}/Icd10ConditionsMatching/start.sh   | 0
 {scripts => data_mgmt_scripts}/db_connection_test.py              | 0
 {scripts => data_mgmt_scripts}/download_tests.py                  | 0
 {scripts => data_mgmt_scripts}/drugtools/__init__.py              | 0
 .../drugtools/download_and_extract_nsde.py                        | 0
 {scripts => data_mgmt_scripts}/drugtools/env_setup.py             | 0
 .../drugtools/historical_nct_downloader.py                        | 0
 .../drugtools/historical_nct_extractor.py                         | 0
 .../drugtools/historical_trial_selector.py                        | 0
 {scripts => data_mgmt_scripts}/drugtools/migrate_mysql2pgsql.py   | 0
 {scripts => data_mgmt_scripts}/drugtools/selected_trials.sql      | 0
 {scripts => data_mgmt_scripts}/import-icd10_to_cause.py           | 0
 {scripts => data_mgmt_scripts}/rm_data.sh                         | 0
 {scripts => data_mgmt_scripts}/runall.py                          | 0
 {scripts => data_mgmt_scripts}/umls_requests.py                   | 0
 24 files changed, 0 insertions(+), 0 deletions(-)
 rename {scripts => data_mgmt_scripts}/Icd10ConditionsMatching/Icd10ConditionsMatching/__init__.py (100%)
 rename {scripts => data_mgmt_scripts}/Icd10ConditionsMatching/Icd10ConditionsMatching/db_interface.py (100%)
 rename {scripts => data_mgmt_scripts}/Icd10ConditionsMatching/Icd10ConditionsMatching/login.py (100%)
 rename {scripts => data_mgmt_scripts}/Icd10ConditionsMatching/Icd10ConditionsMatching/model.py (100%)
 rename {scripts => data_mgmt_scripts}/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/base.html (100%)
 rename {scripts => data_mgmt_scripts}/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/validation_index.html (100%)
 rename {scripts => data_mgmt_scripts}/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/validation_of_trial.html (100%)
 rename {scripts => data_mgmt_scripts}/Icd10ConditionsMatching/Icd10ConditionsMatching/validation.py (100%)
 rename {scripts => data_mgmt_scripts}/Icd10ConditionsMatching/setup.py (100%)
 rename {scripts => data_mgmt_scripts}/Icd10ConditionsMatching/start.sh (100%)
 rename {scripts => data_mgmt_scripts}/db_connection_test.py (100%)
 rename {scripts => data_mgmt_scripts}/download_tests.py (100%)
 rename {scripts => data_mgmt_scripts}/drugtools/__init__.py (100%)
 rename {scripts => data_mgmt_scripts}/drugtools/download_and_extract_nsde.py (100%)
 rename {scripts => data_mgmt_scripts}/drugtools/env_setup.py (100%)
 rename {scripts => data_mgmt_scripts}/drugtools/historical_nct_downloader.py (100%)
 rename {scripts => data_mgmt_scripts}/drugtools/historical_nct_extractor.py (100%)
 rename {scripts => data_mgmt_scripts}/drugtools/historical_trial_selector.py (100%)
 rename {scripts => data_mgmt_scripts}/drugtools/migrate_mysql2pgsql.py (100%)
 rename {scripts => data_mgmt_scripts}/drugtools/selected_trials.sql (100%)
 rename {scripts => data_mgmt_scripts}/import-icd10_to_cause.py (100%)
 rename {scripts => data_mgmt_scripts}/rm_data.sh (100%)
 rename {scripts => data_mgmt_scripts}/runall.py (100%)
 rename {scripts => data_mgmt_scripts}/umls_requests.py (100%)

diff --git a/scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/__init__.py b/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/__init__.py
similarity index 100%
rename from scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/__init__.py
rename to data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/__init__.py
diff --git a/scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/db_interface.py b/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/db_interface.py
similarity index 100%
rename from scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/db_interface.py
rename to data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/db_interface.py
diff --git a/scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/login.py b/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/login.py
similarity index 100%
rename from scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/login.py
rename to data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/login.py
diff --git a/scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/model.py b/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/model.py
similarity index 100%
rename from scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/model.py
rename to data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/model.py
diff --git a/scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/base.html b/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/base.html
similarity index 100%
rename from scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/base.html
rename to data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/base.html
diff --git a/scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/validation_index.html b/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/validation_index.html
similarity index 100%
rename from scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/validation_index.html
rename to data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/validation_index.html
diff --git a/scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/validation_of_trial.html b/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/validation_of_trial.html
similarity index 100%
rename from scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/validation_of_trial.html
rename to data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/validation_of_trial.html
diff --git a/scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/validation.py b/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/validation.py
similarity index 100%
rename from scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/validation.py
rename to data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/validation.py
diff --git a/scripts/Icd10ConditionsMatching/setup.py b/data_mgmt_scripts/Icd10ConditionsMatching/setup.py
similarity index 100%
rename from scripts/Icd10ConditionsMatching/setup.py
rename to data_mgmt_scripts/Icd10ConditionsMatching/setup.py
diff --git a/scripts/Icd10ConditionsMatching/start.sh b/data_mgmt_scripts/Icd10ConditionsMatching/start.sh
similarity index 100%
rename from scripts/Icd10ConditionsMatching/start.sh
rename to data_mgmt_scripts/Icd10ConditionsMatching/start.sh
diff --git a/scripts/db_connection_test.py b/data_mgmt_scripts/db_connection_test.py
similarity index 100%
rename from scripts/db_connection_test.py
rename to data_mgmt_scripts/db_connection_test.py
diff --git a/scripts/download_tests.py b/data_mgmt_scripts/download_tests.py
similarity index 100%
rename from scripts/download_tests.py
rename to data_mgmt_scripts/download_tests.py
diff --git a/scripts/drugtools/__init__.py b/data_mgmt_scripts/drugtools/__init__.py
similarity index 100%
rename from scripts/drugtools/__init__.py
rename to data_mgmt_scripts/drugtools/__init__.py
diff --git a/scripts/drugtools/download_and_extract_nsde.py b/data_mgmt_scripts/drugtools/download_and_extract_nsde.py
similarity index 100%
rename from scripts/drugtools/download_and_extract_nsde.py
rename to data_mgmt_scripts/drugtools/download_and_extract_nsde.py
diff --git a/scripts/drugtools/env_setup.py b/data_mgmt_scripts/drugtools/env_setup.py
similarity index 100%
rename from scripts/drugtools/env_setup.py
rename to data_mgmt_scripts/drugtools/env_setup.py
diff --git a/scripts/drugtools/historical_nct_downloader.py b/data_mgmt_scripts/drugtools/historical_nct_downloader.py
similarity index 100%
rename from scripts/drugtools/historical_nct_downloader.py
rename to data_mgmt_scripts/drugtools/historical_nct_downloader.py
diff --git a/scripts/drugtools/historical_nct_extractor.py b/data_mgmt_scripts/drugtools/historical_nct_extractor.py
similarity index 100%
rename from scripts/drugtools/historical_nct_extractor.py
rename to data_mgmt_scripts/drugtools/historical_nct_extractor.py
diff --git a/scripts/drugtools/historical_trial_selector.py b/data_mgmt_scripts/drugtools/historical_trial_selector.py
similarity index 100%
rename from scripts/drugtools/historical_trial_selector.py
rename to data_mgmt_scripts/drugtools/historical_trial_selector.py
diff --git a/scripts/drugtools/migrate_mysql2pgsql.py b/data_mgmt_scripts/drugtools/migrate_mysql2pgsql.py
similarity index 100%
rename from scripts/drugtools/migrate_mysql2pgsql.py
rename to data_mgmt_scripts/drugtools/migrate_mysql2pgsql.py
diff --git a/scripts/drugtools/selected_trials.sql b/data_mgmt_scripts/drugtools/selected_trials.sql
similarity index 100%
rename from scripts/drugtools/selected_trials.sql
rename to data_mgmt_scripts/drugtools/selected_trials.sql
diff --git a/scripts/import-icd10_to_cause.py b/data_mgmt_scripts/import-icd10_to_cause.py
similarity index 100%
rename from scripts/import-icd10_to_cause.py
rename to data_mgmt_scripts/import-icd10_to_cause.py
diff --git a/scripts/rm_data.sh b/data_mgmt_scripts/rm_data.sh
similarity index 100%
rename from scripts/rm_data.sh
rename to data_mgmt_scripts/rm_data.sh
diff --git a/scripts/runall.py b/data_mgmt_scripts/runall.py
similarity index 100%
rename from scripts/runall.py
rename to data_mgmt_scripts/runall.py
diff --git a/scripts/umls_requests.py b/data_mgmt_scripts/umls_requests.py
similarity index 100%
rename from scripts/umls_requests.py
rename to data_mgmt_scripts/umls_requests.py