recording logs

Fixed the restore script.
Got it running currently
43 changed files with 12966 additions and 75 deletions
--- a/.dbeaver/.credentials-config.json.bak
+++ b/.dbeaver/.credentials-config.json.bak
--- a/.dbeaver/.data-sources.json.bak
+++ b/.dbeaver/.data-sources.json.bak
@ -1,64 +0,0 @@
-{
-	"folders": {},
-	"connections": {
-		"mariaDB-186c896820e-6ff11b5b802d8b82": {
-			"provider": "mysql",
-			"driver": "mariaDB",
-			"name": "rxnav",
-			"save-password": true,
-			"configuration": {
-				"host": "will-office",
-				"port": "3306",
-				"url": "jdbc:mariadb://will-office:3306/",
-				"configurationType": "MANUAL",
-				"type": "dev",
-				"auth-model": "native"
-			}
-		},
-		"postgres-jdbc-186c896a347-2a3d946d2dea4df7": {
-			"provider": "postgresql",
-			"driver": "postgres-jdbc",
-			"name": "aact_db",
-			"save-password": true,
-			"configuration": {
-				"host": "100.95.169.11",
-				"port": "5432",
-				"database": "aact_db",
-				"url": "jdbc:postgresql://100.95.169.11:5432/aact_db",
-				"configurationType": "MANUAL",
-				"type": "dev",
-				"provider-properties": {},
-				"auth-model": "native"
-			},
-			"custom-properties": {
-				"resultset.maxrows": "500"
-			}
-		},
-		"postgres-jdbc-186cd8f479f-6cc3c10c8adc3359": {
-			"provider": "postgresql",
-			"driver": "postgres-jdbc",
-			"name": "drugcentral",
-			"save-password": true,
-			"configuration": {
-				"host": "localhost",
-				"port": "54320",
-				"database": "postgres",
-				"url": "jdbc:postgresql://localhost:54320/postgres",
-				"configurationType": "MANUAL",
-				"type": "dev",
-				"auth-model": "native"
-			}
-		}
-	},
-	"connection-types": {
-		"dev": {
-			"name": "Development",
-			"color": "255,255,255",
-			"description": "Regular development database",
-			"auto-commit": true,
-			"confirm-execute": false,
-			"confirm-data-change": false,
-			"auto-close-transactions": false
-		}
-	}
-}
--- a/.dbeaver/.project-metadata.json.bak
+++ b/.dbeaver/.project-metadata.json.bak
@ -1 +0,0 @@
-{"resources":{"Scripts/ASSOICATING NCTIDs to NDCs and Marketing dates.sql":{"default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db","default-schema":"public"},"Scripts/Data_summaries.sql":{"default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db","default-schema":"public"},"Scripts/DevelopingLinks.sql":{"default-schema":"public","default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db"},"Scripts/DiseaseBurdens_create_table.sql":{"default-schema":"public","default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db"},"Scripts/GlobalBurdensOfDisease2019Codebook.sql":{"default-schema":"DiseaseBurden","default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db"},"Scripts/GroupingTrials.sql":{"default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db","default-schema":"public"},"Scripts/Script.sql":{"default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db","default-schema":"public"},"Scripts/TablesAndViews_Public.sql":{"default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db","default-schema":"public"},"development_sql/ASSOICATING NCTIDs to NDCs and Marketing dates.sql":{"default-schema":"public","default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db"}}}
--- a/.gitattributes
+++ b/.gitattributes
@ -5,3 +5,4 @@ other_data/USP[[:space:]]DC/usp_dc_pub_2023_release_2.0_updated_final.csv filter
 other_data/USP[[:space:]]MMG/MMG_v8.0_Alignment_File.csv filter=lfs diff=lfs merge=lfs -text
 other_data/VA[[:space:]]Formulary/PharmacyProductSystem_NationalDrugCodeExtract.csv filter=lfs diff=lfs merge=lfs -text
 containers/AACT_Reloader/backup/aact_db_backup_20250106_184236.sql.gz filter=lfs diff=lfs merge=lfs -text
+containers/AACT_Reloader/backup/aact_db_backup_20250107_133822.sql.gz filter=lfs diff=lfs merge=lfs -text
--- a/.gitignore
+++ b/.gitignore
@ -194,4 +194,4 @@ containers/drugcentral/docker-entrypoint-initdb.d/*.sql
 containers/drugcentral/docker-entrypoint-initdb.d/*.sql.gz
 containers/drugcentral/db_store/*

-
+.dbeaver/
--- a/Scripts/Backup_AACT_work.sh
+++ b/Scripts/Backup_AACT_work.sh
@ -2,8 +2,9 @@
 backup_dir="/mnt/will/large_data/Research_large_data/ClinicalTrialsDataProcessing/containers/AACT_Reloader/backup/"
 date_stamp=$(date +%Y%m%d_%H%M%S)
 filename="aact_db_backup_${date_stamp}.sql"
+container_name="${1:-aact_db}"  # first CLI arg, default aact_db; no spaces around '=' or bash runs it as a command

-podman exec aact_db pg_dump -U root aact_db > "${backup_dir}/${filename}"
+podman exec "$container_name" pg_dump -U root aact_db > "${backup_dir}/${filename}"

 # Optional: compress the backup
 gzip "${backup_dir}/${filename}"
--- a/Scripts/ConfiguringFormularies.sql
+++ b/Scripts/ConfiguringFormularies.sql
@ -43,6 +43,9 @@ CREATE TABLE "Formularies".usp_dc_2023 (
 	"API Concept" varchar(250) NULL
 );
 ```
+
+It links rxcuis to other rxcuis that share a matching USP Category and Class.
+This gives alternative RXCUIs based on category and class.
 */
 CREATE MATERIALIZED VIEW "Formularies".rxcui_category_class_links AS
 WITH base AS (
@ -72,27 +75,43 @@ I'll' break this into two steps.
 1. link formulary alternatives to compounds and brands,
 2. link nct_id to formulary alternatives
 */
+drop materialized view if exists "Formularies".match_trial_compound_to_alternate_bn_rxcuis cascade; -- cascade: the brand-count view defined later in this script depends on it
+drop materialized view if exists "Formularies".rxcui_to_brand_through_uspdc cascade;
+
 create materialized view "Formularies".rxcui_to_brand_through_uspdc AS
-select
+select distinct
 	rccl.source_rxcui
 	,rccl.linked_rxcui
 	,rccl.category
 	,rccl."class"
 	,rr.tty1
-	,rr.tty2
+	--,rr.tty2
 	,rr.rxcui2
 from "Formularies".rxcui_category_class_links rccl
 join rxnorm_migrated.rxnorm_relations rr on rr.rxcui1 = rccl.linked_rxcui
 where rr.tty2 = 'BN'
 ;

+/* So this one takes each RXCUI and its associated RXCUIs from the same 
+category and class, and filters it down to the associated RXCUIs that 
+represent brand names.
+ */

-create materialized view match_trial_compound_to_alternate_bn_rxcuis as
-select distinct mttbi.nct_id,  rtbtu.rxcui2
+create materialized view "Formularies".match_trial_compound_to_alternate_bn_rxcuis as
+select distinct mttbi.nct_id,  rtbtu.rxcui2 as brand_rxcuis
 from match_trials_to_bn_in mttbi
 join "Formularies".rxcui_to_brand_through_uspdc rtbtu
 	on mttbi.bn_or_in_cui = rtbtu.rxcui2
+;

 /*
-Now I need to create a way to link
+This takes the list of ingredients and brands associated with a trial, and 
+links it to the list of alternative brand names.
 */
+
+--renamed the view
+CREATE OR REPLACE VIEW "Formularies".nct_to_brand_counts_through_uspdc
+AS SELECT mtctabr.nct_id,
+    count(*) AS brand_name_counts
+   FROM "Formularies".match_trial_compound_to_alternate_bn_rxcuis mtctabr
+  GROUP BY mtctabr.nct_id;
--- a/Scripts/VariousDevelopmentsForAnalysis.sql
+++ b/Scripts/VariousDevelopmentsForAnalysis.sql
@ -0,0 +1,308 @@
+select * from formatted_data_with_planned_enrollment fdwpe 
+;
+
+
+select * from formatted_data_mat fdm
+;
+
+select count(distinct condition ) from formatted_data_mat fdm; -- number of distinct conditions
+
+select nct_id, fdm.current_status , count(*) 
+from formatted_data_mat fdm 
+group by nct_id  , fdm.current_status 
+order by nct_id 
+;
+
+select * from formatted_data_mat fdm ;
+
+
+-- group with trial split
+with cte as (
+select nct_id
+from formatted_data_mat fdm 
+group by nct_id
+having count(distinct current_status) > 1
+order by nct_id
+)
+select 
+    fdm.nct_id
+    , current_status
+    , earliest_date_observed 
+    , elapsed_duration 
+    , n_brands
+    , category_id
+    , h_sdi_val 
+    , h_sdi_u95
+    , h_sdi_l95
+    , hm_sdi_val 
+    , hm_sdi_u95
+    , hm_sdi_l95
+    , m_sdi_val 
+    , m_sdi_u95
+    , m_sdi_l95
+    , lm_sdi_val 
+    , lm_sdi_u95
+    , lm_sdi_l95
+    , l_sdi_val 
+    , l_sdi_u95
+    , l_sdi_l95
+from formatted_data_mat fdm
+    join cte on cte.nct_id = fdm.nct_id 
+group by 
+    fdm.nct_id
+    , current_status
+    , earliest_date_observed 
+    , elapsed_duration 
+    , n_brands
+    , category_id
+    , h_sdi_val 
+    , h_sdi_u95
+    , h_sdi_l95
+    , hm_sdi_val 
+    , hm_sdi_u95
+    , hm_sdi_l95
+    , m_sdi_val 
+    , m_sdi_u95
+    , m_sdi_l95
+    , lm_sdi_val 
+    , lm_sdi_u95
+    , lm_sdi_l95
+    , l_sdi_val 
+    , l_sdi_u95
+    , l_sdi_l95
+order by nct_id , earliest_date_observed 
+;
+
+select count(distinct category_id ) from formatted_data_mat fdm; -- table inferred from the query below; the original FROM clause was left unfinished
+
+
+select distinct category_id  from formatted_data_mat fdm 
+;
+
+
+
+
+
+-- group with trial split
+with cte as (
+select nct_id
+from formatted_data_mat fdm 
+group by nct_id
+having count(distinct current_status) > 1
+order by nct_id
+)
+select 
+    fdm.nct_id
+    , current_status
+    , earliest_date_observed 
+    , elapsed_duration 
+    , n_brands
+    , category_id
+    , h_sdi_val 
+    , h_sdi_u95
+    , h_sdi_l95
+    , hm_sdi_val 
+    , hm_sdi_u95
+    , hm_sdi_l95
+    , m_sdi_val 
+    , m_sdi_u95
+    , m_sdi_l95
+    , lm_sdi_val 
+    , lm_sdi_u95
+    , lm_sdi_l95
+    , l_sdi_val 
+    , l_sdi_u95
+    , l_sdi_l95
+from formatted_data_mat fdm
+    join cte on cte.nct_id = fdm.nct_id 
+group by 
+    fdm.nct_id
+    , current_status
+    , earliest_date_observed 
+    , elapsed_duration 
+    , n_brands
+    , category_id
+    , h_sdi_val 
+    , h_sdi_u95
+    , h_sdi_l95
+    , hm_sdi_val 
+    , hm_sdi_u95
+    , hm_sdi_l95
+    , m_sdi_val 
+    , m_sdi_u95
+    , m_sdi_l95
+    , lm_sdi_val 
+    , lm_sdi_u95
+    , lm_sdi_l95
+    , l_sdi_val 
+    , l_sdi_u95
+    , l_sdi_l95
+order by nct_id , earliest_date_observed 
+; --TODO: join to usp dc dataset
+
+
+
+
+WITH trialncts AS (
+         SELECT DISTINCT ts.nct_id
+           FROM history.trial_snapshots ts
+), nct_to_cui AS (
+         SELECT bi.nct_id,
+            bi.downcase_mesh_term,
+            rr.tty2,
+            rr.rxcui2 AS approved_drug_rxcui,
+            count(*) AS count
+           FROM ctgov.browse_interventions bi
+             LEFT JOIN rxnorm_migrated.rxnorm_props rp ON bi.downcase_mesh_term::text = rp.propvalue1::text
+             LEFT JOIN rxnorm_migrated.rxnorm_relations rr ON rr.rxcui1 = rp.rxcui
+          WHERE (bi.nct_id::text IN ( SELECT trialncts.nct_id
+                   FROM trialncts)) AND bi.mesh_type::text = 'mesh-list'::text AND rp.propname::text = 'Active_ingredient_name'::text AND (rr.tty2::text = ANY (ARRAY['BPCK'::text, 'SCD'::text, 'SBD'::text, 'GPCK'::text]))
+          GROUP BY bi.nct_id, bi.downcase_mesh_term, rr.tty2, rr.rxcui2
+        )   
+ SELECT nct_to_cui.nct_id,
+    ud."USP Category",
+    ud."USP Class"
+   FROM nct_to_cui
+     JOIN "Formularies".usp_dc ud ON ud.rxcui::bpchar = nct_to_cui.approved_drug_rxcui
+  GROUP BY nct_to_cui.nct_id, ud."USP Category", ud."USP Class"
+  ORDER BY nct_to_cui.nct_id;
+
+  
+  
+  
+CREATE MATERIALIZED VIEW "Formularies".nct_to_brands_through_uspdc
+AS
+WITH trialncts AS (
+         SELECT DISTINCT ts.nct_id
+           FROM history.trial_snapshots ts
+)
+SELECT 
+    bi.nct_id,
+    count( distinct rr2.rxcui2 ) as brand_name_count
+    FROM ctgov.browse_interventions bi
+      LEFT JOIN rxnorm_migrated.rxnorm_props rp ON bi.downcase_mesh_term::text = rp.propvalue1::text --match mesh terms to rxcui
+      LEFT JOIN rxnorm_migrated.rxnorm_relations rr ON rr.rxcui1 = rp.rxcui -- match rxcui to relations between rxcuis
+      LEFT JOIN rxnorm_migrated.rxnorm_relations rr2 ON rr.rxcui2 = rr2.rxcui1  -- match rxcui to relations between rxcuis
+WHERE 
+    (bi.nct_id::text IN (SELECT trialncts.nct_id FROM trialncts)) --check the nct_id is in our list 
+    AND 
+    bi.mesh_type::text = 'mesh-list'::text --we are only looking at mesh "list" rxcuis
+    AND rp.propname::text = 'Active_ingredient_name'::text  --and we only care about active ingredients linked to \/\/\/\/\/
+    AND (rr.tty2::text = ANY (ARRAY['BPCK'::text, 'SCD'::text, 'SBD'::text, 'GPCK'::text])) --and we are linking from active ingredients ^^^^ to branded packs
+    AND (rr2.tty2::text = 'BN') --and from branded packs back to brand names
+GROUP BY bi.nct_id --remove duplicates
+;
+
+
+
+/* 
+ * 
+ */
+
+
+select 
+    fdqpe.nct_id
+    --,fdqpe.start_date
+    --,fdqpe.current_enrollment
+    --,fdqpe.enrollment_category
+    ,fdqpe.current_status 
+    ,fdqpe.earliest_date_observed 
+    ,fdqpe.elapsed_duration
+    ,fdqpe.n_brands as identical_brands
+    ,ntbtu.brand_name_count 
+    ,fdqpe.category_id
+    ,fdqpe.final_status
+    ,fdqpe.h_sdi_val
+    --,fdqpe.h_sdi_u95
+    --,fdqpe.h_sdi_l95
+    ,fdqpe.hm_sdi_val
+    --,fdqpe.hm_sdi_u95
+    --,fdqpe.hm_sdi_l95
+    ,fdqpe.m_sdi_val
+    --,fdqpe.m_sdi_u95
+    --,fdqpe.m_sdi_l95
+    ,fdqpe.lm_sdi_val
+    --,fdqpe.lm_sdi_u95
+    --,fdqpe.lm_sdi_l95
+    ,fdqpe.l_sdi_val
+    --,fdqpe.l_sdi_u95
+    --,fdqpe.l_sdi_l95
+from formatted_data_mat fdqpe
+    join "Formularies".nct_to_brands_through_uspdc ntbtu
+        on fdqpe.nct_id = ntbtu.nct_id 
+;
+
+--example of multiple reopenings
+select * 
+from formatted_data_mat fdm 
+where nct_id = 'NCT01239797'; -- terminator added so the following WITH parses as a separate statement
+
+--attempt to automatically find transition periods
+with cte1 as (
+	-- one row per trial: earliest snapshot date with status 'Active, not recruiting'
+	select nct_id, min(earliest_date_observed) as earliest_closed_enrollment
+	from formatted_data_mat fdm 
+	where current_status = 'Active, not recruiting'
+	group by nct_id
+), cte2 as (
+	-- one row per trial: latest snapshot date with any status other than 'Active, not recruiting'
+	select nct_id, max(earliest_date_observed) as latest_open_enrollment
+	from formatted_data_mat fdm 
+	where current_status != 'Active, not recruiting'
+	group by nct_id
+)
+-- Aggregating inside the CTEs (instead of window functions) keeps the join at
+-- one row per trial rather than one row per pair of snapshots.
+select 
+	cte1.nct_id
+	,cte1.earliest_closed_enrollment
+	,cte2.latest_open_enrollment
+	,cte1.earliest_closed_enrollment - cte2.latest_open_enrollment as closed_minus_open
+from cte1
+	join cte2 on cte1.nct_id = cte2.nct_id
+;
+
+	
+
+/* So occasionally a study reopens enrollment.
+ * If that didn't happen, then I could just find the first enrollment matching X and/or last enrollment matching Y
+ * to get the transitions
+ * Instead I need to create shifts of statuses between snapshots, and then remove all of those that did not change. 
+ * 
+ * Better yet, just get the last shift to ANR.
+ * */
+
+	
+/* Take each entry and get the status from a lagged snapshot
+ * Then select each snapshot moving from previous_state to ANR
+ * and filter out everything except the last one.
+ * */
+with cte as (
+-- pair every snapshot with the status of the snapshot before it (per trial)
+select 
+	nct_id
+	,lag(current_status, 1) over (partition by nct_id order by earliest_date_observed)  as previous_status
+	,current_status
+	,earliest_date_observed as date_current
+from formatted_data_mat fdm
+), cte2 as (
+-- Keep only the most recent transition into 'Active, not recruiting' per trial.
+-- (Grouping by date_current, as before, made max(date_current) a no-op and
+-- returned every transition instead of just the last one.)
+select distinct on (nct_id)
+	nct_id 
+	,previous_status
+	,current_status 
+	,date_current as date_current_max
+from cte
+where 
+	previous_status != current_status 
+	and
+	current_status = 'Active, not recruiting'
+order by 
+	nct_id
+	,date_current desc
+)
+select * 
+from formatted_data_mat fdm
+	join cte2 
+		on cte2.nct_id = fdm.nct_id 
+		and cte2.date_current_max = fdm.earliest_date_observed 
+; --join back into 
+	
--- a/containers/AACT_Reloader/StartRestoreContainer.sh
+++ b/containers/AACT_Reloader/StartRestoreContainer.sh
@ -1,6 +1,6 @@
 #!/bin/bash

-RESTORE_DUMP_GZ=2023-09-06_aactdb_with_matches.sql.gz
+RESTORE_DUMP_GZ="${1:-aact_db_backup_20250107_133822.sql.gz}"
 POSTGRES_USER=root
 POSTGRES_PASSWORD=root
 POSTGRES_DB=aact_db
@ -25,7 +25,7 @@ sleep 10

 # Function to check if PostgreSQL is ready
 function check_postgres {
-    podman exec -i "${CONTAINER_NAME}" psql -h "${CONTAINER_NAME}" -U "${POSTGRES_USER}" -d "${POSTGRES_DB}" -c '\q' > /dev/null 2>&1    
+    podman exec -i "${CONTAINER_NAME}" psql -h localhost -U "${POSTGRES_USER}" -d "${POSTGRES_DB}" -c '\q' > /dev/null 2>&1    
 }

 # Wait for PostgreSQL to be ready
@ -37,6 +37,6 @@ done
 echo "PostgreSQL is ready. Restoring the database..."

 # Decompress the dump file and restore it to the database
-podman exec -i "${POSTGRES_DB}" sh -c "gunzip -c /backup/${RESTORE_DUMP_GZ} | psql -h localhost -U ${POSTGRES_USER} -d ${POSTGRES_DB}"
+podman exec -i "${CONTAINER_NAME}" sh -c "gunzip -c /backup/${RESTORE_DUMP_GZ} | psql -h localhost -U ${POSTGRES_USER} -d ${POSTGRES_DB}"

 echo "Database restoration complete."
--- a/containers/AACT_Reloader/backup/aact_db_backup_20250107_133822.sql.gz
+++ b/containers/AACT_Reloader/backup/aact_db_backup_20250107_133822.sql.gz
--- a/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/init.py
+++ b/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/init.py
--- a/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/db_interface.py
+++ b/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/db_interface.py
--- a/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/login.py
+++ b/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/login.py
--- a/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/model.py
+++ b/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/model.py
--- a/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/base.html
+++ b/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/base.html
--- a/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/validation_index.html
+++ b/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/validation_index.html
--- a/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/validation_of_trial.html
+++ b/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/validation_of_trial.html
--- a/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/validation.py
+++ b/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/validation.py
--- a/data_mgmt_scripts/Icd10ConditionsMatching/setup.py
+++ b/data_mgmt_scripts/Icd10ConditionsMatching/setup.py
--- a/data_mgmt_scripts/Icd10ConditionsMatching/start.sh
+++ b/data_mgmt_scripts/Icd10ConditionsMatching/start.sh
--- a/data_mgmt_scripts/db_connection_test.py
+++ b/data_mgmt_scripts/db_connection_test.py
--- a/data_mgmt_scripts/download_tests.py
+++ b/data_mgmt_scripts/download_tests.py
--- a/data_mgmt_scripts/drugtools/init.py
+++ b/data_mgmt_scripts/drugtools/init.py
--- a/data_mgmt_scripts/drugtools/download_and_extract_nsde.py
+++ b/data_mgmt_scripts/drugtools/download_and_extract_nsde.py
--- a/data_mgmt_scripts/drugtools/env_setup.py
+++ b/data_mgmt_scripts/drugtools/env_setup.py
--- a/data_mgmt_scripts/drugtools/historical_nct_downloader.py
+++ b/data_mgmt_scripts/drugtools/historical_nct_downloader.py
--- a/data_mgmt_scripts/drugtools/historical_nct_extractor.py
+++ b/data_mgmt_scripts/drugtools/historical_nct_extractor.py
--- a/data_mgmt_scripts/drugtools/historical_trial_selector.py
+++ b/data_mgmt_scripts/drugtools/historical_trial_selector.py
--- a/data_mgmt_scripts/drugtools/migrate_mysql2pgsql.py
+++ b/data_mgmt_scripts/drugtools/migrate_mysql2pgsql.py
--- a/data_mgmt_scripts/drugtools/selected_trials.sql
+++ b/data_mgmt_scripts/drugtools/selected_trials.sql
--- a/data_mgmt_scripts/import-icd10_to_cause.py
+++ b/data_mgmt_scripts/import-icd10_to_cause.py
--- a/data_mgmt_scripts/rm_data.sh
+++ b/data_mgmt_scripts/rm_data.sh
--- a/data_mgmt_scripts/runall.py
+++ b/data_mgmt_scripts/runall.py
--- a/data_mgmt_scripts/umls_requests.py
+++ b/data_mgmt_scripts/umls_requests.py
--- a/logs.org
+++ b/logs.org
@ -0,0 +1,140 @@
+* Plan/Todo [2025-01-06]
+Goal is to update the main images with more details, i.e. adding means
+etc.
+
+- get aact_db back up
+- attach it to a "research" network
+- restart rocker, attaching it to the same research network.
+
+** NOTES
+
+aact_db-restored-2024-11-27 didn't successfully restore. It is missing
+all the important stuff.
+
+Figured out why the restore was failing. My code to restore had a faulty
+check to see if the DB was up and ready. Fixed that now.
+
+Waiting for restore (manually triggered) to start. Then I should have
+access to the table as needed.
+
+It seems like I'm missing some data within a schema, specifically the
+Formularies and their associated views.
+
+My options are:
+
+- search around for documentation or other stuff
+- try to rebuild
+
+My suspicion is that I forgot to back it up. I think it is probably
+worth looking for. - So I've been looking through my copy of
+ClinicalTrialsDataProcessing, and have not found anything referencing
+it. The formularies data is required for my analysis though. If I
+remember correctly, I manually uploaded the USP datasets in DBeaver,
+then created any views etc.
+
+I think that I'll have to recreate it. This is going to be hard because
+I'm not sure what it did. At least I created mildly informative table
+names.
+
+The tables/views I've identified are: -
+=Formularies.nct_to_brands_through_uspdc=
+
+It looks like I need to - import usp-dc dataset - link those drugs to
+usp data - create a view that links those automatically - back it up. -
+double check the data I get from the request.
+
+The links will be through RXCUIs, and grouped on =USP Class= In effect,
+for a given RXCUI, I want to get the list of RXCUI's which have the same
+USP-DC class, and then be able to link back to brands.
+
+This should have the following links: - RXCUI -> USP-DC category/class
+pair - USP-DC category/class pair -> RXCUIs - RXCUIs -> competitors
+
+Do I want to combine the USP-DC and USP-MMG datasets? No, there is
+enough difference in them that I don't want to have to handle it that
+way.
+
+I've been working on this in scripts/ConfiguringFormularies.sql
+
+So what I've managed to do so far is export tables, backup the data.
+
+I've got a version that connects trials to brand names, but there may be
+more details to the connection than I thought. I'd like to check if I
+need to filter anything or check if there are other ingredients etc that
+I need to include. */I probably need to write some descriptions of all
+the tables and views to put everything together. An ai would probably be
+helpful in doing this./*
+
+
+** Code snippets
+#+begin_example
+podman run \
+ -e POSTGRES_PASSWORD="${POSTGRES_PASSWORD}" \
+ -e POSTGRES_USER="${POSTGRES_USER}" \
+ -e POSTGRES_DB="${POSTGRES_DB}" \
+ --name "${CONTAINER_NAME}" \
+ --detach \
+ --network research-network \
+ --shm-size=512mb \
+ --volume ./backup/:/backup/ \
+ -p 5432:5432\
+ postgres:14-alpine
+#+end_example
+
+#+begin_example
+function check_postgres {
+podman exec -i "${POSTGRES_DB}" psql -h localhost -U "${POSTGRES_USER}" -d "${POSTGRES_DB}" -c '\q' > /dev/null 2>&1    
+}
+#+end_example
+
+** Notes at end of day
+- was reasonably productive in getting stuff unblocked for finishing
+  JMP, which i'll need to do before I leave town next week.
+
+
+** What I've got to do tomorrow
+I've got a version that connects trials to brand names, but there may be
+more details to the connection than I thought. I'd like to check if I
+need to filter anything or check if there are other ingredients etc that
+I need to include. */I probably need to write some descriptions of all
+the tables and views to put everything together. An ai would probably be
+helpful in doing this./* At the end of it all, I should be able to get a
+count of competing drugs per trial.
+
+Once that is done, I can relink aact_db and rocker, then rerun my
+analysis. Then I can adjust the images that I need for my JMP.
+* [2025-01-07 Tue 12:01] notes
+
+  So what I've got to do is 
+
+** DONE  Investigate what compounds are showing up in my current list
+    if that is what I want, then I'll be able proceed with redoing my images
+   if not, then I'll have to work on adjusting the views etc that I have.
+
+   I've looked through it and it seems to be correct.
+
+*** [[/mnt/will/large_data/Research_large_data/ClinicalTrialsDataProcessing/Scripts/ConfiguringFormularies.sql][ConfiguringFormularies.sql:81]] [2025-01-07 Tue 13:24] 
+  I've tweaked these three views to make them clearer.
+  I also renamed the view of interest to ="Formularies".nct_to_brand_counts_through_uspdc= to better represent what it does.
+
+
+** DONE Rerun the analysis
+   CLOSED: [2025-01-07 Tue 16:39]
+    So it looks like I'll need to 
+    1. take a backup of aact_db
+    2. restore from backup, putting the container in the research network
+    3. then rerun the analysis.
+
+      Ok, I'm pushing the backup and can get started on restoring from backup.
+      Backup is restoring. As I recall, this takes 40 minutes.
+   
+      had some mild tweaks to match the new results
+   it now runs
+
+** DONE Add more details to images
+   CLOSED: [2025-01-13 Mon 10:26]
+    The details I want to add include:
+
+    - [x] sample sizes for breakdowns
+    - [x] box and whisker plot along the bottom of the large values
+   [[https://claude.ai/chat/0e6b6368-130e-4aa8-aa16-97b6c937bba4]] has details
--- a/other_data/.gitattributes
+++ b/other_data/.gitattributes
@ -0,0 +1,9 @@
+USP[[:space:]]DC/USP_DC_12_2021_RELEASE_1.0.xlsx filter=lfs diff=lfs merge=lfs -text
+USP[[:space:]]DC/usp_dc_pub_2023_release_2.0_updated_final.xlsx filter=lfs diff=lfs merge=lfs -text
+USP[[:space:]]MMG/Final_Report_and_Summary_of_Methodology_and_Approach_v1.1.pdf filter=lfs diff=lfs merge=lfs -text
+USP[[:space:]]MMG/MMG_v8.0_Alignment_File.xlsx filter=lfs diff=lfs merge=lfs -text
+USP[[:space:]]MMG/Summary_of_Changes_between_MMGv7.0_and_MMGv8.0.pdf filter=lfs diff=lfs merge=lfs -text
+USP[[:space:]]MMG/USP_Medicare_Model_Guidelines_v8.0__All_Excel_Spreadsheets_.xlsx filter=lfs diff=lfs merge=lfs -text
+USP[[:space:]]MMG/USP_Medicare_Model_Guidelines_v8.0__Categories_and_Classes_.pdf filter=lfs diff=lfs merge=lfs -text
+USP[[:space:]]MMG/USP_Medicare_Model_Guidelines_v8.0__Showing_changes_from_v7.0_.pdf filter=lfs diff=lfs merge=lfs -text
+USP[[:space:]]MMG/USP_Medicare_Model_Guidelines_v8.0__With_Example_Part_D_Drugs_.pdf filter=lfs diff=lfs merge=lfs -text
--- a/other_data/RandomSample_AACT_reasons_why_stopped/reasons_why_stopped.csv
+++ b/other_data/RandomSample_AACT_reasons_why_stopped/reasons_why_stopped.csv
--- a/other_data/RandomSample_AACT_reasons_why_stopped/reasons_why_stopped.fods
+++ b/other_data/RandomSample_AACT_reasons_why_stopped/reasons_why_stopped.fods
--- a/MMG/Final_Report_and_Summary_of_Methodology_and_Approach_v1.1.pdf
+++ b/MMG/Final_Report_and_Summary_of_Methodology_and_Approach_v1.1.pdf
--- a/MMG/Summary_of_Changes_between_MMGv7.0_and_MMGv8.0.pdf
+++ b/MMG/Summary_of_Changes_between_MMGv7.0_and_MMGv8.0.pdf
--- a/MMG/USP_Medicare_Model_Guidelines_v8.0__Categories_and_Classes_.pdf
+++ b/MMG/USP_Medicare_Model_Guidelines_v8.0__Categories_and_Classes_.pdf
--- a/MMG/USP_Medicare_Model_Guidelines_v8.0__Showing_changes_from_v7.0_.pdf
+++ b/MMG/USP_Medicare_Model_Guidelines_v8.0__Showing_changes_from_v7.0_.pdf
--- a/MMG/USP_Medicare_Model_Guidelines_v8.0__With_Example_Part_D_Drugs_.pdf
+++ b/MMG/USP_Medicare_Model_Guidelines_v8.0__With_Example_Part_D_Drugs_.pdf
Author	SHA1	Message	Date
Will King	d1d0dc87a7	recording logs	1 year ago
Will King	1782372a45	Fixed the restore script. Got it runnign currently	1 year ago
Will King	fc478517ac	Backing up current dataset	1 year ago
Will King	d912408456	Got counts of competing drugs back	1 year ago
Will King	2488cceebc	Merge branch 'main' of https://git.youainti.com/youainti/ClinicalTrialsDataProcessing	1 year ago
will king	3311159ab6	Merge remote-tracking branch 'refs/remotes/origin/main' Adjusted some git lfs stuff.	2 years ago
will king	bb374dbde9	Mergeing orign/main and home-pc/main	2 years ago
will king	635cfe42d9	added new data on reasons for terminating trials	2 years ago
will king	495955170c	changed data setup path for clarity	2 years ago
will king	de3698052b	Merging work from different computers. FF merge. Includes a lot of data and updates to various files. Merge branch 'main' of ssh://git.youainti.com:3022/youainti/ClinicalTrialsDataProcessing	2 years ago
will king	dfb041d12b	Adding sql that was sitting on home computer	2 years ago
will king	9aaf007791	removed accidental includion of .dbeaver configuration	2 years ago
				`@ -1 +0,0 @@`
				{"resources":{"Scripts/ASSOICATING NCTIDs to NDCs and Marketing dates.sql":{"default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db","default-schema":"public"},"Scripts/Data_summaries.sql":{"default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db","default-schema":"public"},"Scripts/DevelopingLinks.sql":{"default-schema":"public","default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db"},"Scripts/DiseaseBurdens_create_table.sql":{"default-schema":"public","default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db"},"Scripts/GlobalBurdensOfDisease2019Codebook.sql":{"default-schema":"DiseaseBurden","default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db"},"Scripts/GroupingTrials.sql":{"default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db","default-schema":"public"},"Scripts/Script.sql":{"default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db","default-schema":"public"},"Scripts/TablesAndViews_Public.sql":{"default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db","default-schema":"public"},"development_sql/ASSOICATING NCTIDs to NDCs and Marketing dates.sql":{"default-schema":"public","default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db"}}}