Compare commits

..

No commits in common. 'main' and 'llm-extraction' have entirely different histories.

@ -0,0 +1,64 @@
{
"folders": {},
"connections": {
"mariaDB-186c896820e-6ff11b5b802d8b82": {
"provider": "mysql",
"driver": "mariaDB",
"name": "rxnav",
"save-password": true,
"configuration": {
"host": "will-office",
"port": "3306",
"url": "jdbc:mariadb://will-office:3306/",
"configurationType": "MANUAL",
"type": "dev",
"auth-model": "native"
}
},
"postgres-jdbc-186c896a347-2a3d946d2dea4df7": {
"provider": "postgresql",
"driver": "postgres-jdbc",
"name": "aact_db",
"save-password": true,
"configuration": {
"host": "100.95.169.11",
"port": "5432",
"database": "aact_db",
"url": "jdbc:postgresql://100.95.169.11:5432/aact_db",
"configurationType": "MANUAL",
"type": "dev",
"provider-properties": {},
"auth-model": "native"
},
"custom-properties": {
"resultset.maxrows": "500"
}
},
"postgres-jdbc-186cd8f479f-6cc3c10c8adc3359": {
"provider": "postgresql",
"driver": "postgres-jdbc",
"name": "drugcentral",
"save-password": true,
"configuration": {
"host": "localhost",
"port": "54320",
"database": "postgres",
"url": "jdbc:postgresql://localhost:54320/postgres",
"configurationType": "MANUAL",
"type": "dev",
"auth-model": "native"
}
}
},
"connection-types": {
"dev": {
"name": "Development",
"color": "255,255,255",
"description": "Regular development database",
"auto-commit": true,
"confirm-execute": false,
"confirm-data-change": false,
"auto-close-transactions": false
}
}
}

@ -0,0 +1 @@
{"resources":{"Scripts/ASSOICATING NCTIDs to NDCs and Marketing dates.sql":{"default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db","default-schema":"public"},"Scripts/Data_summaries.sql":{"default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db","default-schema":"public"},"Scripts/DevelopingLinks.sql":{"default-schema":"public","default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db"},"Scripts/DiseaseBurdens_create_table.sql":{"default-schema":"public","default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db"},"Scripts/GlobalBurdensOfDisease2019Codebook.sql":{"default-schema":"DiseaseBurden","default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db"},"Scripts/GroupingTrials.sql":{"default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db","default-schema":"public"},"Scripts/Script.sql":{"default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db","default-schema":"public"},"Scripts/TablesAndViews_Public.sql":{"default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db","default-schema":"public"},"development_sql/ASSOICATING NCTIDs to NDCs and Marketing dates.sql":{"default-schema":"public","default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db"}}}

1
.gitattributes vendored

@ -5,4 +5,3 @@ other_data/USP[[:space:]]DC/usp_dc_pub_2023_release_2.0_updated_final.csv filter
other_data/USP[[:space:]]MMG/MMG_v8.0_Alignment_File.csv filter=lfs diff=lfs merge=lfs -text
other_data/VA[[:space:]]Formulary/PharmacyProductSystem_NationalDrugCodeExtract.csv filter=lfs diff=lfs merge=lfs -text
containers/AACT_Reloader/backup/aact_db_backup_20250106_184236.sql.gz filter=lfs diff=lfs merge=lfs -text
containers/AACT_Reloader/backup/aact_db_backup_20250107_133822.sql.gz filter=lfs diff=lfs merge=lfs -text

2
.gitignore vendored

@ -194,4 +194,4 @@ containers/drugcentral/docker-entrypoint-initdb.d/*.sql
containers/drugcentral/docker-entrypoint-initdb.d/*.sql.gz
containers/drugcentral/db_store/*
.dbeaver/

@ -2,9 +2,8 @@
backup_dir="/mnt/will/large_data/Research_large_data/ClinicalTrialsDataProcessing/containers/AACT_Reloader/backup/"
date_stamp=$(date +%Y%m%d_%H%M%S)
filename="aact_db_backup_${date_stamp}.sql"
# Container to dump from: first script argument, defaulting to aact_db.
# Bash assignments must not have spaces around '='; with spaces the shell
# tries to run a command called `container_name` and the variable stays unset.
container_name="${1:-aact_db}"
podman exec "$container_name" pg_dump -U root aact_db > "${backup_dir}/${filename}"
podman exec aact_db pg_dump -U root aact_db > "${backup_dir}/${filename}"
# Optional: compress the backup
gzip "${backup_dir}/${filename}"

@ -43,9 +43,6 @@ CREATE TABLE "Formularies".usp_dc_2023 (
"API Concept" varchar(250) NULL
);
```
It links rxcuis to other rxcuis that share a matching USP Category and Class.
This gives alternative RXCUIs based on category and class.
*/
CREATE MATERIALIZED VIEW "Formularies".rxcui_category_class_links AS
WITH base AS (
@ -75,43 +72,27 @@ I'll' break this into two steps.
1. link formulary alternatives to compounds and brands,
2. link nct_id to formulary alternatives
*/
-- Remove the existing materialized views so the definitions below can be
-- re-created. The trial-matching view is built on rxcui_to_brand_through_uspdc,
-- so it is dropped first; CASCADE on the second drop removes any remaining
-- dependents.
-- PostgreSQL syntax is DROP MATERIALIZED VIEW IF EXISTS <name>.
drop materialized view if exists "Formularies".match_trial_compound_to_alternate_bn_rxcuis;
drop materialized view if exists "Formularies".rxcui_to_brand_through_uspdc cascade;
create materialized view "Formularies".rxcui_to_brand_through_uspdc AS
select distinct
select
rccl.source_rxcui
,rccl.linked_rxcui
,rccl.category
,rccl."class"
,rr.tty1
--,rr.tty2
,rr.tty2
,rr.rxcui2
from "Formularies".rxcui_category_class_links rccl
join rxnorm_migrated.rxnorm_relations rr on rr.rxcui1 = rccl.linked_rxcui
where rr.tty2 = 'BN'
;
/* So this one takes each RXCUI and its associated RXCUIs from the same
category and class, and filters it down to associated RXCUIs that
represent brand names.
*/
create materialized view "Formularies".match_trial_compound_to_alternate_bn_rxcuis as
select distinct mttbi.nct_id, rtbtu.rxcui2 as brand_rxcuis
create materialized view match_trial_compound_to_alternate_bn_rxcuis as
select distinct mttbi.nct_id, rtbtu.rxcui2
from match_trials_to_bn_in mttbi
join "Formularies".rxcui_to_brand_through_uspdc rtbtu
on mttbi.bn_or_in_cui = rtbtu.rxcui2
;
/*
This takes the list of ingredients and brands associated with a trial, and
links it to the list of alternative brand names.
Now I need to create a way to link
*/
--renamed the view
CREATE OR REPLACE VIEW "Formularies".nct_to_brand_counts_through_uspdc
AS SELECT mtctabr.nct_id,
count(*) AS brand_name_counts
FROM "Formularies".match_trial_compound_to_alternate_bn_rxcuis mtctabr
GROUP BY mtctabr.nct_id;

@ -1,308 +0,0 @@
-- Exploratory look at the formatted trial data.
select * from formatted_data_with_planned_enrollment fdwpe
;
select * from formatted_data_mat fdm
;
-- Number of distinct conditions.
-- This statement was missing its terminator and ran into the next select.
select count(distinct condition) from formatted_data_mat fdm
;
-- Row count per trial and status.
select nct_id, fdm.current_status, count(*)
from formatted_data_mat fdm
group by nct_id, fdm.current_status
order by nct_id
;
select * from formatted_data_mat fdm ;
-- group with trial split
-- Trials that were observed under more than one status, listed as distinct
-- rows and ordered by trial and observation date.
with split_trials as (
    select nct_id
    from formatted_data_mat
    group by nct_id
    having count(distinct current_status) > 1
)
select distinct
    fdm.nct_id
    , fdm.current_status
    , fdm.earliest_date_observed
    , fdm.elapsed_duration
    , fdm.n_brands
    , fdm.category_id
    , fdm.h_sdi_val
    , fdm.h_sdi_u95
    , fdm.h_sdi_l95
    , fdm.hm_sdi_val
    , fdm.hm_sdi_u95
    , fdm.hm_sdi_l95
    , fdm.m_sdi_val
    , fdm.m_sdi_u95
    , fdm.m_sdi_l95
    , fdm.lm_sdi_val
    , fdm.lm_sdi_u95
    , fdm.lm_sdi_l95
    , fdm.l_sdi_val
    , fdm.l_sdi_u95
    , fdm.l_sdi_l95
from formatted_data_mat fdm
inner join split_trials st
    on st.nct_id = fdm.nct_id
order by fdm.nct_id, fdm.earliest_date_observed
;
-- Number of distinct categories in the formatted data.
-- The inner select is a derived table, so it needs parentheses and an alias.
select count(distinct category_id)
from (
    select distinct category_id
    from formatted_data_mat fdm
) as categories
;
-- group with trial split
-- Same result set as the earlier "group with trial split" query in this file:
-- distinct rows for every trial seen under more than one status.
select distinct
    fdm.nct_id
    , fdm.current_status
    , fdm.earliest_date_observed
    , fdm.elapsed_duration
    , fdm.n_brands
    , fdm.category_id
    , fdm.h_sdi_val
    , fdm.h_sdi_u95
    , fdm.h_sdi_l95
    , fdm.hm_sdi_val
    , fdm.hm_sdi_u95
    , fdm.hm_sdi_l95
    , fdm.m_sdi_val
    , fdm.m_sdi_u95
    , fdm.m_sdi_l95
    , fdm.lm_sdi_val
    , fdm.lm_sdi_u95
    , fdm.lm_sdi_l95
    , fdm.l_sdi_val
    , fdm.l_sdi_u95
    , fdm.l_sdi_l95
from formatted_data_mat fdm
where fdm.nct_id in (
    -- trials with at least two different statuses
    select multi.nct_id
    from formatted_data_mat multi
    group by multi.nct_id
    having count(distinct multi.current_status) > 1
)
order by fdm.nct_id, fdm.earliest_date_observed
; --TODO: join to usp dc dataset
-- For every trial present in history.trial_snapshots, list the distinct
-- USP Category / USP Class pairs reached through its mesh-list intervention
-- terms.
WITH trialncts AS (
    -- trials we have snapshot history for
    SELECT DISTINCT ts.nct_id
    FROM history.trial_snapshots ts
), nct_to_cui AS (
    -- mesh term -> RxNorm property -> related RXCUI.
    -- INNER JOINs are used because the WHERE clause filters on columns of
    -- both joined tables, which already discarded any unmatched rows a
    -- LEFT JOIN would have produced.
    SELECT
        bi.nct_id,
        bi.downcase_mesh_term,
        rr.tty2,
        rr.rxcui2 AS approved_drug_rxcui,
        count(*) AS count
    FROM ctgov.browse_interventions bi
    INNER JOIN rxnorm_migrated.rxnorm_props rp
        ON bi.downcase_mesh_term::text = rp.propvalue1::text
    INNER JOIN rxnorm_migrated.rxnorm_relations rr
        ON rr.rxcui1 = rp.rxcui
    WHERE bi.nct_id::text IN (SELECT trialncts.nct_id FROM trialncts)
        AND bi.mesh_type::text = 'mesh-list'::text
        AND rp.propname::text = 'Active_ingredient_name'::text
        AND rr.tty2::text = ANY (ARRAY['BPCK'::text, 'SCD'::text, 'SBD'::text, 'GPCK'::text])
    GROUP BY bi.nct_id, bi.downcase_mesh_term, rr.tty2, rr.rxcui2
)
SELECT
    nct_to_cui.nct_id,
    ud."USP Category",
    ud."USP Class"
FROM nct_to_cui
INNER JOIN "Formularies".usp_dc ud
    ON ud.rxcui::bpchar = nct_to_cui.approved_drug_rxcui
GROUP BY nct_to_cui.nct_id, ud."USP Category", ud."USP Class"
ORDER BY nct_to_cui.nct_id;
-- One row per trial with the number of distinct brand-name (BN) RXCUIs that
-- can be reached from its mesh-list intervention terms.
-- NOTE(review): despite the view name, this query does not read any USP-DC
-- table; confirm the name is still appropriate.
CREATE MATERIALIZED VIEW "Formularies".nct_to_brands_through_uspdc
AS
WITH trialncts AS (
    -- trials we have snapshot history for
    SELECT DISTINCT ts.nct_id
    FROM history.trial_snapshots ts
)
SELECT
    bi.nct_id,
    count(DISTINCT rr2.rxcui2) AS brand_name_count
FROM ctgov.browse_interventions bi
-- INNER JOINs: every joined table is filtered in the WHERE clause below, so
-- unmatched rows from a LEFT JOIN were always discarded anyway.
INNER JOIN rxnorm_migrated.rxnorm_props rp
    ON bi.downcase_mesh_term::text = rp.propvalue1::text -- match mesh terms to rxcui
INNER JOIN rxnorm_migrated.rxnorm_relations rr
    ON rr.rxcui1 = rp.rxcui -- first hop: rxcui to its related rxcuis
INNER JOIN rxnorm_migrated.rxnorm_relations rr2
    ON rr.rxcui2 = rr2.rxcui1 -- second hop: related rxcui to its own relations
WHERE
    bi.nct_id::text IN (SELECT trialncts.nct_id FROM trialncts) -- the nct_id is in our list
    AND bi.mesh_type::text = 'mesh-list'::text -- only mesh "list" terms
    AND rp.propname::text = 'Active_ingredient_name'::text -- only active-ingredient properties
    AND rr.tty2::text = ANY (ARRAY['BPCK'::text, 'SCD'::text, 'SBD'::text, 'GPCK'::text]) -- first hop lands on one of these term types
    AND rr2.tty2::text = 'BN' -- second hop lands on a brand name
GROUP BY bi.nct_id -- one row per trial
;
/*
*
*/
-- Formatted trial rows together with the brand-name count from
-- "Formularies".nct_to_brands_through_uspdc.
-- Only the *_sdi_val columns are returned; the u95/l95 bounds, start_date,
-- current_enrollment and enrollment_category are deliberately not selected.
select
    fdm.nct_id,
    fdm.current_status,
    fdm.earliest_date_observed,
    fdm.elapsed_duration,
    fdm.n_brands as identical_brands,
    brands.brand_name_count,
    fdm.category_id,
    fdm.final_status,
    fdm.h_sdi_val,
    fdm.hm_sdi_val,
    fdm.m_sdi_val,
    fdm.lm_sdi_val,
    fdm.l_sdi_val
from "Formularies".nct_to_brands_through_uspdc brands
inner join formatted_data_mat fdm
    on brands.nct_id = fdm.nct_id
;
--example of multiple reopenings
select *
from formatted_data_mat fdm
where nct_id = 'NCT01239797'
;
--attempt to automatically find transition periods
-- One row per trial: the first date it was seen as 'Active, not recruiting'
-- against the last date it was seen in any other status.
-- Plain aggregates replace the earlier window functions, which repeated the
-- same min/max on every source row and multiplied rows in the join.
with first_closed as (
    select
        nct_id
        ,min(earliest_date_observed) as earliest_closed_enrollment
    from formatted_data_mat
    where current_status = 'Active, not recruiting'
    group by nct_id
), last_open as (
    select
        nct_id
        ,max(earliest_date_observed) as latest_open_enrollment
    from formatted_data_mat
    where current_status != 'Active, not recruiting'
    group by nct_id
)
select
    first_closed.nct_id
    ,first_closed.earliest_closed_enrollment
    ,last_open.latest_open_enrollment
    ,first_closed.earliest_closed_enrollment - last_open.latest_open_enrollment as closed_minus_open
from first_closed
inner join last_open
    on first_closed.nct_id = last_open.nct_id
;
/* So occasionally a study reopens enrollment.
* If that didn't happen, then I could just find the first enrollment matching X and/or last enrollment matching Y
* to get the transitions
* Instead I need to create shifts of statuses between snapshots, and then remove all of those that did not change.
*
* Better yet, just get the last shift to ANR.
* */
/* Take each entry and get the status from a lagged snapshot.
 * Then select each snapshot moving from previous_status to
 * 'Active, not recruiting' (ANR) and keep only the last such move per trial.
 *
 * The earlier version grouped by date_current while also taking
 * max(date_current), so every transition formed its own group and nothing
 * was filtered down to the last one. Ranking the transitions per trial and
 * keeping rank 1 does the filtering.
 * */
with status_changes as (
    select
        nct_id
        ,lag(current_status, 1) over (partition by nct_id order by earliest_date_observed) as previous_status
        ,current_status
        ,earliest_date_observed as date_current
    from formatted_data_mat fdm
), ranked_shifts as (
    select
        nct_id
        ,previous_status
        ,current_status
        ,date_current as date_current_max
        -- newest transition first; previous_status only breaks ties on the same date
        ,row_number() over (
            partition by nct_id
            order by date_current desc, previous_status
        ) as shift_rank
    from status_changes
    where
        previous_status != current_status
        and
        current_status = 'Active, not recruiting'
), cte2 as (
    select
        nct_id
        ,previous_status
        ,current_status
        ,date_current_max
    from ranked_shifts
    where shift_rank = 1
)
select *
from formatted_data_mat fdm
join cte2
    on cte2.nct_id = fdm.nct_id
    and cte2.date_current_max = fdm.earliest_date_observed
; --join back into

@ -1,6 +1,6 @@
#!/bin/bash
RESTORE_DUMP_GZ="${1:-aact_db_backup_20250107_133822.sql.gz}"
RESTORE_DUMP_GZ=2023-09-06_aactdb_with_matches.sql.gz
POSTGRES_USER=root
POSTGRES_PASSWORD=root
POSTGRES_DB=aact_db
@ -25,7 +25,7 @@ sleep 10
# Function to check if PostgreSQL is ready
# Runs a no-op psql command (`\q`) inside the container named by
# CONTAINER_NAME, connecting as POSTGRES_USER to POSTGRES_DB; all output is
# discarded. Two probes are issued: one against host `localhost`, one against
# a host named after the container. The function's exit status is that of the
# last command, i.e. the second probe.
function check_postgres {
podman exec -i "${CONTAINER_NAME}" psql -h localhost -U "${POSTGRES_USER}" -d "${POSTGRES_DB}" -c '\q' > /dev/null 2>&1
podman exec -i "${CONTAINER_NAME}" psql -h "${CONTAINER_NAME}" -U "${POSTGRES_USER}" -d "${POSTGRES_DB}" -c '\q' > /dev/null 2>&1
}
# Wait for PostgreSQL to be ready
@ -37,6 +37,6 @@ done
echo "PostgreSQL is ready. Restoring the database..."
# Decompress the dump file and restore it to the database
podman exec -i "${CONTAINER_NAME}" sh -c "gunzip -c /backup/${RESTORE_DUMP_GZ} | psql -h localhost -U ${POSTGRES_USER} -d ${POSTGRES_DB}"
podman exec -i "${POSTGRES_DB}" sh -c "gunzip -c /backup/${RESTORE_DUMP_GZ} | psql -h localhost -U ${POSTGRES_USER} -d ${POSTGRES_DB}"
echo "Database restoration complete."

@ -1,140 +0,0 @@
* Plan/Todo [2025-01-06]
Goal is to update the main images with more details, i.e. adding means
etc.
- get aact_db back up
- attach it to a "research" network
- restart rocker, attaching it to the same research network.
** NOTES
aact_db-restored-2024-11-27 didn't successfully restore. It is missing
all the important stuff.
Figured out why the restore was failing. My code to restore had a faulty
check to see if the DB was up and ready. Fixed that now.
Waiting for restore (manually triggered) to start. Then I should have
access to the table as needed.
It seems like I'm missing some data within a schema, specifically the
Formularies and their associated views.
My options are:
- search around for documentation or other stuff
- try to rebuild
My suspicion is that I forgot to back it up. I think it is probably
worth looking for. - So I've been looking through my copy of
ClinicalTrialsDataProcessing, and have not found anything referencing
it. The formularies data is required for my analysis though. If I
remember correctly, I manually uploaded the USP datasets in DBeaver,
then created any views etc.
I think that I'll have to recreate it. This is going to be hard because
I'm not sure what it did. At least I created mildly informative table
names.
The tables/views I've identified are: -
=Formularies.nct_to_brands_through_uspdc=
It looks like I need to - import usp-dc dataset - link those drugs to
usp data - create a view that links those automatically - back it up. -
double check the data I get from the request.
The links will be through RXCUIs, and grouped on =USP Class= In effect,
for a given RXCUI, I want to get the list of RXCUI's which have the same
USP-DC class, and then be able to link back to brands.
This should have the following links: - RXCUI -> USP-DC category/class
pair - USP-DC category/class pair -> RXCUIs - RXCUIs -> competitors
Do I want to combine the USP-DC and USP-MMG datasets? No, there is
enough difference in them that I don't want to have to handle it that
way.
I've been working on this in scripts/ConfiguringFormularies.sql
So what I've managed to do so far is export tables, backup the data.
I've got a version that connects trials to brand names, but there may be
more details to the connection than I thought. I'd like to check if I
need to filter anything or check if there are other ingredients etc that
I need to include. */I probably need to write some descriptions of all
the tables and views to put everything together. An ai would probably be
helpful in doing this./*
** Code snippets
#+begin_example
podman run \
-e POSTGRES_PASSWORD="${POSTGRES_PASSWORD}" \
-e POSTGRES_USER="${POSTGRES_USER}" \
-e POSTGRES_DB="${POSTGRES_DB}" \
--name "${CONTAINER_NAME}" \
--detach \
--network research-network \
--shm-size=512mb \
--volume ./backup/:/backup/ \
-p 5432:5432\
postgres:14-alpine
#+end_example
#+begin_example
function check_postgres {
podman exec -i "${POSTGRES_DB}" psql -h localhost -U "${POSTGRES_USER}" -d "${POSTGRES_DB}" -c '\q' > /dev/null 2>&1
}
#+end_example
** Notes at end of day
- was reasonably productive in getting stuff unblocked for finishing
JMP, which I'll need to do before I leave town next week.
** What I've got to do tomorrow
I've got a version that connects trials to brand names, but there may be
more details to the connection than I thought. I'd like to check if I
need to filter anything or check if there are other ingredients etc that
I need to include. */I probably need to write some descriptions of all
the tables and views to put everything together. An ai would probably be
helpful in doing this./* At the end of it all, I should be able to get a
count of competing drugs per trial.
Once that is done, I can relink aact_db and rocker, then rerun my
analysis. Then I can adjust the images that I need for my JMP.
* [2025-01-07 Tue 12:01] notes
So what I've got to do is
** DONE Investigate what compounds are showing up in my current list
if that is what I want, then I'll be able to proceed with redoing my images
if not, then I'll have to work on adjusting the views etc that I have.
I've looked through it and it seems to be correct.
*** [[/mnt/will/large_data/Research_large_data/ClinicalTrialsDataProcessing/Scripts/ConfiguringFormularies.sql][ConfiguringFormularies.sql:81]] [2025-01-07 Tue 13:24]
I've tweaked these three views to make them clearer.
I also renamed the view of interest to ="Formularies".nct_to_brand_counts_through_uspdc= to better represent what it does.
** DONE Rerun the analysis
CLOSED: [2025-01-07 Tue 16:39]
So it looks like I'll need to
1. take a backup of aact_db
2. restore from backup, putting the container in the research network
3. then rerun the analysis.
Ok, I'm pushing the backup and can get started on restoring from backup.
Backup is restoring. As I recall, this takes 40 minutes.
had some mild tweaks to match the new results
it now runs
** DONE Add more details to images
CLOSED: [2025-01-13 Mon 10:26]
The details I want to add include:
- [x] sample sizes for breakdowns
- [x] box and whisker plot along the bottom of the large values
[[https://claude.ai/chat/0e6b6368-130e-4aa8-aa16-97b6c937bba4]] has details

@ -1,9 +0,0 @@
USP[[:space:]]DC/USP_DC_12_2021_RELEASE_1.0.xlsx filter=lfs diff=lfs merge=lfs -text
USP[[:space:]]DC/usp_dc_pub_2023_release_2.0_updated_final.xlsx filter=lfs diff=lfs merge=lfs -text
USP[[:space:]]MMG/Final_Report_and_Summary_of_Methodology_and_Approach_v1.1.pdf filter=lfs diff=lfs merge=lfs -text
USP[[:space:]]MMG/MMG_v8.0_Alignment_File.xlsx filter=lfs diff=lfs merge=lfs -text
USP[[:space:]]MMG/Summary_of_Changes_between_MMGv7.0_and_MMGv8.0.pdf filter=lfs diff=lfs merge=lfs -text
USP[[:space:]]MMG/USP_Medicare_Model_Guidelines_v8.0__All_Excel_Spreadsheets_.xlsx filter=lfs diff=lfs merge=lfs -text
USP[[:space:]]MMG/USP_Medicare_Model_Guidelines_v8.0__Categories_and_Classes_.pdf filter=lfs diff=lfs merge=lfs -text
USP[[:space:]]MMG/USP_Medicare_Model_Guidelines_v8.0__Showing_changes_from_v7.0_.pdf filter=lfs diff=lfs merge=lfs -text
USP[[:space:]]MMG/USP_Medicare_Model_Guidelines_v8.0__With_Example_Part_D_Drugs_.pdf filter=lfs diff=lfs merge=lfs -text
Loading…
Cancel
Save