From 142670d08a44975427b583e41758a29dac79982c Mon Sep 17 00:00:00 2001 From: Will King Date: Sat, 24 Aug 2024 13:33:48 -0700 Subject: [PATCH 1/5] added some data that had been left untracked --- .gitattributes | 4 ++ 2023-09-06_aactdb_with_matches.sql.gzip | 3 -- .../2023-09-06_aactdb_with_matches.sql.gz | 3 ++ .../AACT_Reloader/StartRestoreContainer.sh | 23 +++++++++ justfile | 47 ++++++++----------- ..._dc_pub_2023_release_2.0_updated_final.csv | 3 ++ .../USP MMG/MMG_v8.0_Alignment_File.csv | 3 ++ ...yProductSystem_NationalDrugCodeExtract.csv | 3 ++ scripts/drugtools/historical_nct_extractor.py | 16 +++---- 9 files changed, 66 insertions(+), 39 deletions(-) delete mode 100644 2023-09-06_aactdb_with_matches.sql.gzip create mode 100644 containers/AACT_Reloader/2023-09-06_aactdb_with_matches.sql.gz create mode 100755 containers/AACT_Reloader/StartRestoreContainer.sh create mode 100644 other_data/USP DC/usp_dc_pub_2023_release_2.0_updated_final.csv create mode 100644 other_data/USP MMG/MMG_v8.0_Alignment_File.csv create mode 100644 other_data/VA Formulary/PharmacyProductSystem_NationalDrugCodeExtract.csv diff --git a/.gitattributes b/.gitattributes index 20425b7..b720f8a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,6 @@ *.sql.gzip filter=lfs diff=lfs merge=lfs -text *.xlsx filter=lfs diff=lfs merge=lfs -text +containers/AACT_Reloader/2023-09-06_aactdb_with_matches.sql.gz filter=lfs diff=lfs merge=lfs -text +other_data/USP[[:space:]]DC/usp_dc_pub_2023_release_2.0_updated_final.csv filter=lfs diff=lfs merge=lfs -text +other_data/USP[[:space:]]MMG/MMG_v8.0_Alignment_File.csv filter=lfs diff=lfs merge=lfs -text +other_data/VA[[:space:]]Formulary/PharmacyProductSystem_NationalDrugCodeExtract.csv filter=lfs diff=lfs merge=lfs -text diff --git a/2023-09-06_aactdb_with_matches.sql.gzip b/2023-09-06_aactdb_with_matches.sql.gzip deleted file mode 100644 index c7282ee..0000000 --- a/2023-09-06_aactdb_with_matches.sql.gzip +++ /dev/null @@ -1,3 +0,0 @@ -version 
https://git-lfs.github.com/spec/v1 -oid sha256:9ac5e99d27f53b81380d98f546c656332fac2be05dabea0ade16fe692a1334d6 -size 1897211526 diff --git a/containers/AACT_Reloader/2023-09-06_aactdb_with_matches.sql.gz b/containers/AACT_Reloader/2023-09-06_aactdb_with_matches.sql.gz new file mode 100644 index 0000000..7ff43a7 --- /dev/null +++ b/containers/AACT_Reloader/2023-09-06_aactdb_with_matches.sql.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd5142f25ff9ef65048c02d5173b5e2a90f4a07513480d5e8c399cf9da39e678 +size 1897211561 diff --git a/containers/AACT_Reloader/StartRestoreContainer.sh b/containers/AACT_Reloader/StartRestoreContainer.sh new file mode 100755 index 0000000..d495fbf --- /dev/null +++ b/containers/AACT_Reloader/StartRestoreContainer.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +RESTORE_DUMP_GZ=2023-09-06_aactdb_with_matches.sql.gz +POSTGRES_USER=root +POSTGRES_PASSWORD=root + +#start container +podman run \ + -e POSTGRES_PASSWORD="${POSTGRES_PASSWORD}" \ + -e POSTGRES_USER="${POSTGRES_USER}" \ + -e POSTGRES_DB=aact_db \ + --name aact_db \ + --detatch \ + --shm-size=512mb \ + --volume ${RESTORE_DUMP_GZ}:/backup/${RESTORE_DUMP_GZ} \ + --ports 5432:5432\ + postgres:14-alpine + + + +#execute within container +podman exec aact_db \ + "gzip --keep --stdout --decompress /backup/2023-09-06_aactdb_with_matches.sql.gz | psql -U ${POSTGRES_USER}" diff --git a/justfile b/justfile index a3691ed..0be52ea 100644 --- a/justfile +++ b/justfile @@ -5,23 +5,19 @@ # - move postgress login credentials (allow them to be printed from just while setting up) -#paths for aact_db (postgres) -aact_download_link := "https://ctti-aact.nyc3.digitaloceanspaces.com/27grtsnhtccplxapj2o8ak9aotvv" -aact_download_file := "2022-12-23_postgres_data.zip" -aact_download_path := "./containers/AACT_downloader/aact_downloads" -aact_zipped_data_filepath := aact_download_path / aact_download_file +data_link := "https://ctti-aact.nyc3.digitaloceanspaces.com/27grtsnhtccplxapj2o8ak9aotvv" 
+data_file := "2022-12-23_postgres_data.zip" +data_path := "./containers/AACT_downloader/aact_downloads" +data_filepath := data_path / data_file #must match the 'container name: aact_db' in the docker-compose.yaml -docker_container := `docker container ls -a | grep "aact_db|rxnav_db" | cut -f 1 -d " " | tr "\n" " "` - -#paths for rxnavinabox -rxnav_path := "./containers/RxNav-In-a-box" -rxnav_version := "rxnav-in-a-box-20230103" -rxnav_data_path := rxnav_path / rxnav_version / "mysql" / "02_data.sql" +docker_container := `docker container ls -a | grep aact_db | cut -f 1 -d " " | tr "\n" " "` #Various paths for docker stuff -docker-compose_path := "./containers/docker-compose.yaml" +docker-compose_path := "./AACT_downloader/docker-compose.yaml" +#rxnorm_mappings +rxnorm_mappings_url := "https://dailymed-data.nlm.nih.gov/public-release-files/rxnorm_mappings.zip" #Number of historical trials to download. count := "100" @@ -32,23 +28,18 @@ check-status: docker --version #check if python version > 3.10. 
python --version - #python -c 'import sys; exit(sys.hexversion >= 50859504)' + python -c 'import sys; exit(sys.hexversion >= 50859504)' curl --version echo "current docker containers:{{docker_container}}" - +#Setup the AACT container setup-containers: - echo "todo" @echo "Check for downloaded data" - #aact - [ -s {{aact_download_path}}/postgres_data.dmp ] - #rxnav - [ -s {{rxnav_data_path}} ] + [ -s {{data_path}}/postgres_data.dmp ] #run docker compose - @echo "Setting up AACT_db & RxNav_db container" + @echo "Setting up AACT container" docker-compose -f {{docker-compose_path}} up -d - #Stop the appropriate docker container stop-containers: @@ -69,13 +60,10 @@ clean-docker: stop-containers #Download the AACT data download-aact-data: - #download - curl {{aact_download_link}} > {{aact_zipped_data_filepath}} - unzip {{aact_zipped_data_filepath}} -d {{aact_download_path}} - rm {{aact_zipped_data_filepath}} + curl {{data_link}} > ./AACT_downloader/aact_downloads/{{data_file}} + unzip {{data_filepath}} -d {{data_path}} + rm {{data_filepath}} -download-rxnav-data: - echo "Currently manually downloaded." #build based on previously downloaded data build: check-status setup-containers @@ -117,3 +105,8 @@ get-nsde: cd market_data && bash download_nsde.sh cd market_data && python extract_nsde.py +get-rxnorm-mappings: + #this may not be needed, all it does is match spls to rxcuis and I think I already have that. 
+ curl {{rxnorm_mappings_url}} > ./market_data/rxnorm_mappings.zip + cd ./market_data && unzip ./rxnorm_mappings.zip + rm ./market_data/rxnorm_mappings.zip diff --git a/other_data/USP DC/usp_dc_pub_2023_release_2.0_updated_final.csv b/other_data/USP DC/usp_dc_pub_2023_release_2.0_updated_final.csv new file mode 100644 index 0000000..6cd7c22 --- /dev/null +++ b/other_data/USP DC/usp_dc_pub_2023_release_2.0_updated_final.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d08d3944a859c0b1f6bbd466ca027fc46c86ef5bb0328cb005fa002b7b61e70b +size 2451625 diff --git a/other_data/USP MMG/MMG_v8.0_Alignment_File.csv b/other_data/USP MMG/MMG_v8.0_Alignment_File.csv new file mode 100644 index 0000000..f12587c --- /dev/null +++ b/other_data/USP MMG/MMG_v8.0_Alignment_File.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85859dae3971460d36e0643ee1396cb646dba158b75862d557210cb2c50707a9 +size 874058 diff --git a/other_data/VA Formulary/PharmacyProductSystem_NationalDrugCodeExtract.csv b/other_data/VA Formulary/PharmacyProductSystem_NationalDrugCodeExtract.csv new file mode 100644 index 0000000..e383df4 --- /dev/null +++ b/other_data/VA Formulary/PharmacyProductSystem_NationalDrugCodeExtract.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c877a461be9e75565f78d0552e76f597b5cce709c82a3f9ad30dcc0f26ddafc +size 32481883 diff --git a/scripts/drugtools/historical_nct_extractor.py b/scripts/drugtools/historical_nct_extractor.py index ab62bf3..326d4fd 100644 --- a/scripts/drugtools/historical_nct_extractor.py +++ b/scripts/drugtools/historical_nct_extractor.py @@ -126,17 +126,18 @@ def extract_submission_dates(soup): version_date_dict = {} - for row in reversed(table_rows): + for row in table_rows: # if it is xx then it contains what we need. 
+ version_number = None + version_date = None for td in row.findChildren("td"): if ("headers" in td.attrs): if (td.attrs["headers"][0]=="VersionNumber"): version_number = int(td.text) elif (td.attrs["headers"][0]=="VersionDate"): - version_date = td.text - version_date_dict[version_number] = datetime.strptime(version_date , "%B %d, %Y") + version_date = datetime.strptime(td.text.strip() , "%B %d, %Y") - print(version_date_dict) + version_date_dict[version_number] = version_date return version_date_dict def optional_strip(possible_string): @@ -396,14 +397,12 @@ date_MMMM_DD_YYYY = "%B %d, %Y" def get_data_from_versions(nct_id,html, version_a_int, version_b_int): soup = BeautifulSoup(html,"lxml") - print(getting_data_from_versions) version_date_dict = extract_submission_dates(soup) - print(version_date_dict) #preallocate version data - version_a = VersionData(nct_id, version_a_int, version_date_dict[version_a_int]) - version_b = VersionData(nct_id, version_b_int, version_date_dict[version_b_int]) + version_a = VersionData(nct_id, version_a_int, version_date_dict.get(version_a_int)) + version_b = VersionData(nct_id, version_b_int, version_date_dict.get(version_b_int)) #extract data from html and put it in the preallocated objects get_forms(soup, version_a, version_b) @@ -424,7 +423,6 @@ def run(): curse.execute(sql) for response in tqdm(curse.fetchall()): nct_id, version_a, version_b, html = response - print(nct_id) print(nct_id, version_a, version_b) if VERBOSE else "" From d90539a679d30b96fed21414bd6f69d1690a241a Mon Sep 17 00:00:00 2001 From: Will King Date: Sat, 24 Aug 2024 16:19:40 -0700 Subject: [PATCH 2/5] updated data-restoration script. 
Currently working --- containers/AACT_Reloader/.gitattributes | 1 + .../AACT_Reloader/StartRestoreContainer.sh | 32 ++++++++++++++----- .../2023-09-06_aactdb_with_matches.sql.gz | 0 3 files changed, 25 insertions(+), 8 deletions(-) create mode 100644 containers/AACT_Reloader/.gitattributes rename containers/AACT_Reloader/{ => backup}/2023-09-06_aactdb_with_matches.sql.gz (100%) diff --git a/containers/AACT_Reloader/.gitattributes b/containers/AACT_Reloader/.gitattributes new file mode 100644 index 0000000..a20c5a3 --- /dev/null +++ b/containers/AACT_Reloader/.gitattributes @@ -0,0 +1 @@ +backup/2023-09-06_aactdb_with_matches.sql.gz filter=lfs diff=lfs merge=lfs -text diff --git a/containers/AACT_Reloader/StartRestoreContainer.sh b/containers/AACT_Reloader/StartRestoreContainer.sh index d495fbf..e647c84 100755 --- a/containers/AACT_Reloader/StartRestoreContainer.sh +++ b/containers/AACT_Reloader/StartRestoreContainer.sh @@ -3,21 +3,37 @@ RESTORE_DUMP_GZ=2023-09-06_aactdb_with_matches.sql.gz POSTGRES_USER=root POSTGRES_PASSWORD=root +POSTGRES_DB=aact_db #start container podman run \ -e POSTGRES_PASSWORD="${POSTGRES_PASSWORD}" \ -e POSTGRES_USER="${POSTGRES_USER}" \ - -e POSTGRES_DB=aact_db \ - --name aact_db \ - --detatch \ + -e POSTGRES_DB="${POSTGRES_DB}" \ + --name "${POSTGRES_DB}" \ + --detach \ --shm-size=512mb \ - --volume ${RESTORE_DUMP_GZ}:/backup/${RESTORE_DUMP_GZ} \ - --ports 5432:5432\ + --volume ./backup/:/backup/ \ + -p 5432:5432\ postgres:14-alpine +sleep 10 -#execute within container -podman exec aact_db \ - "gzip --keep --stdout --decompress /backup/2023-09-06_aactdb_with_matches.sql.gz | psql -U ${POSTGRES_USER}" +# Function to check if PostgreSQL is ready +function check_postgres { + podman exec -i "${POSTGRES_DB}" psql -h localhost -U "${POSTGRES_USER}" -d "${POSTGRES_DB}" -c '\q' > /dev/null 2>&1 +} + +# Wait for PostgreSQL to be ready +until check_postgres; do + echo "Waiting for PostgreSQL to be ready..." 
+ sleep 4 +done + +echo "PostgreSQL is ready. Restoring the database..." + +# Decompress the dump file and restore it to the database +podman exec -i "${POSTGRES_DB}" sh -c "gunzip -c /backup/${RESTORE_DUMP_GZ} | psql -h localhost -U ${POSTGRES_USER} -d ${POSTGRES_DB}" + +echo "Database restoration complete." diff --git a/containers/AACT_Reloader/2023-09-06_aactdb_with_matches.sql.gz b/containers/AACT_Reloader/backup/2023-09-06_aactdb_with_matches.sql.gz similarity index 100% rename from containers/AACT_Reloader/2023-09-06_aactdb_with_matches.sql.gz rename to containers/AACT_Reloader/backup/2023-09-06_aactdb_with_matches.sql.gz From 9aaf0077919c5da7bc5bb20ea3549adc3e096daf Mon Sep 17 00:00:00 2001 From: will king Date: Sat, 7 Sep 2024 09:59:10 -0700 Subject: [PATCH 3/5] removed accidental includion of .dbeaver configuration --- .dbeaver/.credentials-config.json.bak | Bin 336 -> 0 bytes .dbeaver/.data-sources.json.bak | 64 -------------------------- .dbeaver/.project-metadata.json.bak | 1 - .gitignore | 2 +- 4 files changed, 1 insertion(+), 66 deletions(-) delete mode 100644 .dbeaver/.credentials-config.json.bak delete mode 100644 .dbeaver/.data-sources.json.bak delete mode 100644 .dbeaver/.project-metadata.json.bak diff --git a/.dbeaver/.credentials-config.json.bak b/.dbeaver/.credentials-config.json.bak deleted file mode 100644 index 5be47bd4b59987ff12f333b2280f8a3ba977cc42..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 336 zcmV-W0k8h~=exeU`Wsxb{O+H>=V;Jx{r!L+X$}Pf!~)(rxt%%my>5}$b|qq@;NbTcu^cFoPVYT+L!fatnvNWAQ_5W5XVmj5;0r14wO% zkq3-p0rNVA;TYwc03xJ<8>&*Ox%{4v4+&%-9T3duT#;(wwI2GWCmHxVt#y*vMXMhG diff --git a/.dbeaver/.data-sources.json.bak b/.dbeaver/.data-sources.json.bak deleted file mode 100644 index 1ed4cce..0000000 --- a/.dbeaver/.data-sources.json.bak +++ /dev/null @@ -1,64 +0,0 @@ -{ - "folders": {}, - "connections": { - "mariaDB-186c896820e-6ff11b5b802d8b82": { - "provider": "mysql", - "driver": "mariaDB", - "name": 
"rxnav", - "save-password": true, - "configuration": { - "host": "will-office", - "port": "3306", - "url": "jdbc:mariadb://will-office:3306/", - "configurationType": "MANUAL", - "type": "dev", - "auth-model": "native" - } - }, - "postgres-jdbc-186c896a347-2a3d946d2dea4df7": { - "provider": "postgresql", - "driver": "postgres-jdbc", - "name": "aact_db", - "save-password": true, - "configuration": { - "host": "100.95.169.11", - "port": "5432", - "database": "aact_db", - "url": "jdbc:postgresql://100.95.169.11:5432/aact_db", - "configurationType": "MANUAL", - "type": "dev", - "provider-properties": {}, - "auth-model": "native" - }, - "custom-properties": { - "resultset.maxrows": "500" - } - }, - "postgres-jdbc-186cd8f479f-6cc3c10c8adc3359": { - "provider": "postgresql", - "driver": "postgres-jdbc", - "name": "drugcentral", - "save-password": true, - "configuration": { - "host": "localhost", - "port": "54320", - "database": "postgres", - "url": "jdbc:postgresql://localhost:54320/postgres", - "configurationType": "MANUAL", - "type": "dev", - "auth-model": "native" - } - } - }, - "connection-types": { - "dev": { - "name": "Development", - "color": "255,255,255", - "description": "Regular development database", - "auto-commit": true, - "confirm-execute": false, - "confirm-data-change": false, - "auto-close-transactions": false - } - } -} \ No newline at end of file diff --git a/.dbeaver/.project-metadata.json.bak b/.dbeaver/.project-metadata.json.bak deleted file mode 100644 index 1334ab3..0000000 --- a/.dbeaver/.project-metadata.json.bak +++ /dev/null @@ -1 +0,0 @@ -{"resources":{"Scripts/ASSOICATING NCTIDs to NDCs and Marketing 
dates.sql":{"default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db","default-schema":"public"},"Scripts/Data_summaries.sql":{"default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db","default-schema":"public"},"Scripts/DevelopingLinks.sql":{"default-schema":"public","default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db"},"Scripts/DiseaseBurdens_create_table.sql":{"default-schema":"public","default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db"},"Scripts/GlobalBurdensOfDisease2019Codebook.sql":{"default-schema":"DiseaseBurden","default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db"},"Scripts/GroupingTrials.sql":{"default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db","default-schema":"public"},"Scripts/Script.sql":{"default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db","default-schema":"public"},"Scripts/TablesAndViews_Public.sql":{"default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db","default-schema":"public"},"development_sql/ASSOICATING NCTIDs to NDCs and Marketing dates.sql":{"default-schema":"public","default-datasource":"postgres-jdbc-186c896a347-2a3d946d2dea4df7","default-catalog":"aact_db"}}} \ No newline at end of file diff --git a/.gitignore b/.gitignore index 0e6b8e4..dd8f5aa 100644 --- a/.gitignore +++ b/.gitignore @@ -194,4 +194,4 @@ containers/drugcentral/docker-entrypoint-initdb.d/*.sql containers/drugcentral/docker-entrypoint-initdb.d/*.sql.gz containers/drugcentral/db_store/* - +.dbeaver/ From dfb041d12b7f8ad6ece1d8d16d0d8ffe0d3101c1 Mon Sep 17 00:00:00 2001 From: will king Date: Sat, 7 Sep 2024 10:00:38 -0700 Subject: [PATCH 4/5] Adding sql that was sitting on home computer --- Scripts/VariousDevelopmentsForAnalysis.sql | 308 +++++++++++++++++++++ 
1 file changed, 308 insertions(+) create mode 100644 Scripts/VariousDevelopmentsForAnalysis.sql diff --git a/Scripts/VariousDevelopmentsForAnalysis.sql b/Scripts/VariousDevelopmentsForAnalysis.sql new file mode 100644 index 0000000..9e312fc --- /dev/null +++ b/Scripts/VariousDevelopmentsForAnalysis.sql @@ -0,0 +1,308 @@ +select * from formatted_data_with_planned_enrollment fdwpe +; + + +select * from formatted_data_mat fdm +; + +select count(distinct condition ) from formatted_data_mat fdm + +select nct_id, fdm.current_status , count(*) +from formatted_data_mat fdm +group by nct_id , fdm.current_status +order by nct_id +; + +select * from formatted_data_mat fdm ; + + +-- group with trial split +with cte as ( +select nct_id +from formatted_data_mat fdm +group by nct_id +having count(distinct current_status) > 1 +order by nct_id +) +select + fdm.nct_id + , current_status + , earliest_date_observed + , elapsed_duration + , n_brands + , category_id + , h_sdi_val + , h_sdi_u95 + , h_sdi_l95 + , hm_sdi_val + , hm_sdi_u95 + , hm_sdi_l95 + , m_sdi_val + , m_sdi_u95 + , m_sdi_l95 + , lm_sdi_val + , lm_sdi_u95 + , lm_sdi_l95 + , l_sdi_val + , l_sdi_u95 + , l_sdi_l95 +from formatted_data_mat fdm + join cte on cte.nct_id = fdm.nct_id +group by + fdm.nct_id + , current_status + , earliest_date_observed + , elapsed_duration + , n_brands + , category_id + , h_sdi_val + , h_sdi_u95 + , h_sdi_l95 + , hm_sdi_val + , hm_sdi_u95 + , hm_sdi_l95 + , m_sdi_val + , m_sdi_u95 + , m_sdi_l95 + , lm_sdi_val + , lm_sdi_u95 + , lm_sdi_l95 + , l_sdi_val + , l_sdi_u95 + , l_sdi_l95 +order by nct_id , earliest_date_observed +; + +select count(distinct category_id ) from + + +select distinct category_id from formatted_data_mat fdm +; + + + + + +-- group with trial split +with cte as ( +select nct_id +from formatted_data_mat fdm +group by nct_id +having count(distinct current_status) > 1 +order by nct_id +) +select + fdm.nct_id + , current_status + , earliest_date_observed + , elapsed_duration + , 
n_brands + , category_id + , h_sdi_val + , h_sdi_u95 + , h_sdi_l95 + , hm_sdi_val + , hm_sdi_u95 + , hm_sdi_l95 + , m_sdi_val + , m_sdi_u95 + , m_sdi_l95 + , lm_sdi_val + , lm_sdi_u95 + , lm_sdi_l95 + , l_sdi_val + , l_sdi_u95 + , l_sdi_l95 +from formatted_data_mat fdm + join cte on cte.nct_id = fdm.nct_id +group by + fdm.nct_id + , current_status + , earliest_date_observed + , elapsed_duration + , n_brands + , category_id + , h_sdi_val + , h_sdi_u95 + , h_sdi_l95 + , hm_sdi_val + , hm_sdi_u95 + , hm_sdi_l95 + , m_sdi_val + , m_sdi_u95 + , m_sdi_l95 + , lm_sdi_val + , lm_sdi_u95 + , lm_sdi_l95 + , l_sdi_val + , l_sdi_u95 + , l_sdi_l95 +order by nct_id , earliest_date_observed +; --TODO: join to usp dc dataset + + + + +WITH trialncts AS ( + SELECT DISTINCT ts.nct_id + FROM history.trial_snapshots ts +), nct_to_cui AS ( + SELECT bi.nct_id, + bi.downcase_mesh_term, + rr.tty2, + rr.rxcui2 AS approved_drug_rxcui, + count(*) AS count + FROM ctgov.browse_interventions bi + LEFT JOIN rxnorm_migrated.rxnorm_props rp ON bi.downcase_mesh_term::text = rp.propvalue1::text + LEFT JOIN rxnorm_migrated.rxnorm_relations rr ON rr.rxcui1 = rp.rxcui + WHERE (bi.nct_id::text IN ( SELECT trialncts.nct_id + FROM trialncts)) AND bi.mesh_type::text = 'mesh-list'::text AND rp.propname::text = 'Active_ingredient_name'::text AND (rr.tty2::text = ANY (ARRAY['BPCK'::text, 'SCD'::text, 'SBD'::text, 'GPCK'::text])) + GROUP BY bi.nct_id, bi.downcase_mesh_term, rr.tty2, rr.rxcui2 + ) + SELECT nct_to_cui.nct_id, + ud."USP Category", + ud."USP Class" + FROM nct_to_cui + JOIN "Formularies".usp_dc ud ON ud.rxcui::bpchar = nct_to_cui.approved_drug_rxcui + GROUP BY nct_to_cui.nct_id, ud."USP Category", ud."USP Class" + ORDER BY nct_to_cui.nct_id; + + + + +CREATE MATERIALIZED VIEW "Formularies".nct_to_brands_through_uspdc +AS +WITH trialncts AS ( + SELECT DISTINCT ts.nct_id + FROM history.trial_snapshots ts +) +SELECT + bi.nct_id, + count( distinct rr2.rxcui2 ) as brand_name_count + FROM 
ctgov.browse_interventions bi + LEFT JOIN rxnorm_migrated.rxnorm_props rp ON bi.downcase_mesh_term::text = rp.propvalue1::text --match mesh terms to rxcui + LEFT JOIN rxnorm_migrated.rxnorm_relations rr ON rr.rxcui1 = rp.rxcui -- match rxcui to relations between rxcuis + LEFT JOIN rxnorm_migrated.rxnorm_relations rr2 ON rr.rxcui2 = rr2.rxcui1 -- match rxcui to relations between rxcuis +WHERE + (bi.nct_id::text IN (SELECT trialncts.nct_id FROM trialncts)) --check the nct_id is in our list + AND + bi.mesh_type::text = 'mesh-list'::text --we are only looking at mesh "list" rxcuis + AND rp.propname::text = 'Active_ingredient_name'::text --and we only care about active ingredients linked to \/\/\/\/\/ + AND (rr.tty2::text = ANY (ARRAY['BPCK'::text, 'SCD'::text, 'SBD'::text, 'GPCK'::text])) --and we are linking from active ingredients ^^^^ to branded packs + AND (rr2.tty2::text = 'BN') --and from branded packs back to brand names +GROUP BY bi.nct_id --remove duplicates +; + + + +/* + * + */ + + +select + fdqpe.nct_id + --,fdqpe.start_date + --,fdqpe.current_enrollment + --,fdqpe.enrollment_category + ,fdqpe.current_status + ,fdqpe.earliest_date_observed + ,fdqpe.elapsed_duration + ,fdqpe.n_brands as identical_brands + ,ntbtu.brand_name_count + ,fdqpe.category_id + ,fdqpe.final_status + ,fdqpe.h_sdi_val + --,fdqpe.h_sdi_u95 + --,fdqpe.h_sdi_l95 + ,fdqpe.hm_sdi_val + --,fdqpe.hm_sdi_u95 + --,fdqpe.hm_sdi_l95 + ,fdqpe.m_sdi_val + --,fdqpe.m_sdi_u95 + --,fdqpe.m_sdi_l95 + ,fdqpe.lm_sdi_val + --,fdqpe.lm_sdi_u95 + --,fdqpe.lm_sdi_l95 + ,fdqpe.l_sdi_val + --,fdqpe.l_sdi_u95 + --,fdqpe.l_sdi_l95 +from formatted_data_mat fdqpe + join "Formularies".nct_to_brands_through_uspdc ntbtu + on fdqpe.nct_id = ntbtu.nct_id +; + +--example of multiple reopenings +select * +from formatted_data_mat fdm +where nct_id = 'NCT01239797' + +--attempt to automatically find transition periods +with cte1 as ( + select nct_id, min(earliest_date_observed) over (partition by nct_id) as 
earliest_closed_enrollment + from formatted_data_mat fdm + where current_status = 'Active, not recruiting' +), cte2 as ( + select nct_id, max(earliest_date_observed) over (partition by nct_id) latest_open_enrollment + from formatted_data_mat fdm + where current_status != 'Active, not recruiting' +) +select + cte1.nct_id + ,cte1.earliest_closed_enrollment + ,cte2.latest_open_enrollment + ,cte1.earliest_closed_enrollment - cte2.latest_open_enrollment +from cte1 + join cte2 on cte1.nct_id = cte2.nct_id +/*group by + cte1.nct_id + ,cte1.earliest_closed_enrollment + ,cte2.latest_open_enrollment +*/ + + + +/* So occasionally a study reopens enrollment. + * If that didn't happen, then I could just find the first enrollment matching X and/or last enrollment matching Y + * to get the transitions + * Instead I need to create shifts of statuses between snapshots, and then remove all of those that did not change. + * + * Better yet, just get the last shift to ANR. + * */ + + +/* Take each entry and get the status from a lagged snapshot + * Then select each snapshot moving from previous_state to ANR + * and filter out everything except the last one.
+ * */ +with cte as ( +select + nct_id + ,lag(current_status, 1) over (partition by nct_id order by earliest_date_observed) as previous_status + ,current_status + ,earliest_date_observed as date_current +from formatted_data_mat fdm +), cte2 as ( +select + nct_id + ,previous_status + ,current_status + ,max(date_current) as date_current_max +from cte +where + previous_status != current_status + and + current_status = 'Active, not recruiting' +group by + nct_id + ,previous_status + ,current_status + ,date_current +) +select * +from formatted_data_mat fdm + join cte2 + on cte2.nct_id = fdm.nct_id + and cte2.date_current_max = fdm.earliest_date_observed +; --join back into + \ No newline at end of file From 495955170c00dbe3c1316f403661311f89a3b08d Mon Sep 17 00:00:00 2001 From: will king Date: Sat, 7 Sep 2024 10:20:06 -0700 Subject: [PATCH 5/5] changed data setup path for clarity --- .../Icd10ConditionsMatching/Icd10ConditionsMatching/__init__.py | 0 .../Icd10ConditionsMatching/db_interface.py | 0 .../Icd10ConditionsMatching/Icd10ConditionsMatching/login.py | 0 .../Icd10ConditionsMatching/Icd10ConditionsMatching/model.py | 0 .../Icd10ConditionsMatching/templates/base.html | 0 .../Icd10ConditionsMatching/templates/validation_index.html | 0 .../Icd10ConditionsMatching/templates/validation_of_trial.html | 0 .../Icd10ConditionsMatching/Icd10ConditionsMatching/validation.py | 0 {scripts => data_mgmt_scripts}/Icd10ConditionsMatching/setup.py | 0 {scripts => data_mgmt_scripts}/Icd10ConditionsMatching/start.sh | 0 {scripts => data_mgmt_scripts}/db_connection_test.py | 0 {scripts => data_mgmt_scripts}/download_tests.py | 0 {scripts => data_mgmt_scripts}/drugtools/__init__.py | 0 .../drugtools/download_and_extract_nsde.py | 0 {scripts => data_mgmt_scripts}/drugtools/env_setup.py | 0 .../drugtools/historical_nct_downloader.py | 0 .../drugtools/historical_nct_extractor.py | 0 .../drugtools/historical_trial_selector.py | 0 {scripts => 
data_mgmt_scripts}/drugtools/migrate_mysql2pgsql.py | 0 {scripts => data_mgmt_scripts}/drugtools/selected_trials.sql | 0 {scripts => data_mgmt_scripts}/import-icd10_to_cause.py | 0 {scripts => data_mgmt_scripts}/rm_data.sh | 0 {scripts => data_mgmt_scripts}/runall.py | 0 {scripts => data_mgmt_scripts}/umls_requests.py | 0 24 files changed, 0 insertions(+), 0 deletions(-) rename {scripts => data_mgmt_scripts}/Icd10ConditionsMatching/Icd10ConditionsMatching/__init__.py (100%) rename {scripts => data_mgmt_scripts}/Icd10ConditionsMatching/Icd10ConditionsMatching/db_interface.py (100%) rename {scripts => data_mgmt_scripts}/Icd10ConditionsMatching/Icd10ConditionsMatching/login.py (100%) rename {scripts => data_mgmt_scripts}/Icd10ConditionsMatching/Icd10ConditionsMatching/model.py (100%) rename {scripts => data_mgmt_scripts}/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/base.html (100%) rename {scripts => data_mgmt_scripts}/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/validation_index.html (100%) rename {scripts => data_mgmt_scripts}/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/validation_of_trial.html (100%) rename {scripts => data_mgmt_scripts}/Icd10ConditionsMatching/Icd10ConditionsMatching/validation.py (100%) rename {scripts => data_mgmt_scripts}/Icd10ConditionsMatching/setup.py (100%) rename {scripts => data_mgmt_scripts}/Icd10ConditionsMatching/start.sh (100%) rename {scripts => data_mgmt_scripts}/db_connection_test.py (100%) rename {scripts => data_mgmt_scripts}/download_tests.py (100%) rename {scripts => data_mgmt_scripts}/drugtools/__init__.py (100%) rename {scripts => data_mgmt_scripts}/drugtools/download_and_extract_nsde.py (100%) rename {scripts => data_mgmt_scripts}/drugtools/env_setup.py (100%) rename {scripts => data_mgmt_scripts}/drugtools/historical_nct_downloader.py (100%) rename {scripts => data_mgmt_scripts}/drugtools/historical_nct_extractor.py (100%) rename {scripts => 
data_mgmt_scripts}/drugtools/historical_trial_selector.py (100%) rename {scripts => data_mgmt_scripts}/drugtools/migrate_mysql2pgsql.py (100%) rename {scripts => data_mgmt_scripts}/drugtools/selected_trials.sql (100%) rename {scripts => data_mgmt_scripts}/import-icd10_to_cause.py (100%) rename {scripts => data_mgmt_scripts}/rm_data.sh (100%) rename {scripts => data_mgmt_scripts}/runall.py (100%) rename {scripts => data_mgmt_scripts}/umls_requests.py (100%) diff --git a/scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/__init__.py b/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/__init__.py similarity index 100% rename from scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/__init__.py rename to data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/__init__.py diff --git a/scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/db_interface.py b/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/db_interface.py similarity index 100% rename from scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/db_interface.py rename to data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/db_interface.py diff --git a/scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/login.py b/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/login.py similarity index 100% rename from scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/login.py rename to data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/login.py diff --git a/scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/model.py b/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/model.py similarity index 100% rename from scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/model.py rename to data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/model.py diff --git a/scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/base.html 
b/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/base.html similarity index 100% rename from scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/base.html rename to data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/base.html diff --git a/scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/validation_index.html b/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/validation_index.html similarity index 100% rename from scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/validation_index.html rename to data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/validation_index.html diff --git a/scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/validation_of_trial.html b/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/validation_of_trial.html similarity index 100% rename from scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/validation_of_trial.html rename to data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/templates/validation_of_trial.html diff --git a/scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/validation.py b/data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/validation.py similarity index 100% rename from scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/validation.py rename to data_mgmt_scripts/Icd10ConditionsMatching/Icd10ConditionsMatching/validation.py diff --git a/scripts/Icd10ConditionsMatching/setup.py b/data_mgmt_scripts/Icd10ConditionsMatching/setup.py similarity index 100% rename from scripts/Icd10ConditionsMatching/setup.py rename to data_mgmt_scripts/Icd10ConditionsMatching/setup.py diff --git a/scripts/Icd10ConditionsMatching/start.sh b/data_mgmt_scripts/Icd10ConditionsMatching/start.sh similarity index 100% rename from scripts/Icd10ConditionsMatching/start.sh rename to 
data_mgmt_scripts/Icd10ConditionsMatching/start.sh diff --git a/scripts/db_connection_test.py b/data_mgmt_scripts/db_connection_test.py similarity index 100% rename from scripts/db_connection_test.py rename to data_mgmt_scripts/db_connection_test.py diff --git a/scripts/download_tests.py b/data_mgmt_scripts/download_tests.py similarity index 100% rename from scripts/download_tests.py rename to data_mgmt_scripts/download_tests.py diff --git a/scripts/drugtools/__init__.py b/data_mgmt_scripts/drugtools/__init__.py similarity index 100% rename from scripts/drugtools/__init__.py rename to data_mgmt_scripts/drugtools/__init__.py diff --git a/scripts/drugtools/download_and_extract_nsde.py b/data_mgmt_scripts/drugtools/download_and_extract_nsde.py similarity index 100% rename from scripts/drugtools/download_and_extract_nsde.py rename to data_mgmt_scripts/drugtools/download_and_extract_nsde.py diff --git a/scripts/drugtools/env_setup.py b/data_mgmt_scripts/drugtools/env_setup.py similarity index 100% rename from scripts/drugtools/env_setup.py rename to data_mgmt_scripts/drugtools/env_setup.py diff --git a/scripts/drugtools/historical_nct_downloader.py b/data_mgmt_scripts/drugtools/historical_nct_downloader.py similarity index 100% rename from scripts/drugtools/historical_nct_downloader.py rename to data_mgmt_scripts/drugtools/historical_nct_downloader.py diff --git a/scripts/drugtools/historical_nct_extractor.py b/data_mgmt_scripts/drugtools/historical_nct_extractor.py similarity index 100% rename from scripts/drugtools/historical_nct_extractor.py rename to data_mgmt_scripts/drugtools/historical_nct_extractor.py diff --git a/scripts/drugtools/historical_trial_selector.py b/data_mgmt_scripts/drugtools/historical_trial_selector.py similarity index 100% rename from scripts/drugtools/historical_trial_selector.py rename to data_mgmt_scripts/drugtools/historical_trial_selector.py diff --git a/scripts/drugtools/migrate_mysql2pgsql.py 
b/data_mgmt_scripts/drugtools/migrate_mysql2pgsql.py similarity index 100% rename from scripts/drugtools/migrate_mysql2pgsql.py rename to data_mgmt_scripts/drugtools/migrate_mysql2pgsql.py diff --git a/scripts/drugtools/selected_trials.sql b/data_mgmt_scripts/drugtools/selected_trials.sql similarity index 100% rename from scripts/drugtools/selected_trials.sql rename to data_mgmt_scripts/drugtools/selected_trials.sql diff --git a/scripts/import-icd10_to_cause.py b/data_mgmt_scripts/import-icd10_to_cause.py similarity index 100% rename from scripts/import-icd10_to_cause.py rename to data_mgmt_scripts/import-icd10_to_cause.py diff --git a/scripts/rm_data.sh b/data_mgmt_scripts/rm_data.sh similarity index 100% rename from scripts/rm_data.sh rename to data_mgmt_scripts/rm_data.sh diff --git a/scripts/runall.py b/data_mgmt_scripts/runall.py similarity index 100% rename from scripts/runall.py rename to data_mgmt_scripts/runall.py diff --git a/scripts/umls_requests.py b/data_mgmt_scripts/umls_requests.py similarity index 100% rename from scripts/umls_requests.py rename to data_mgmt_scripts/umls_requests.py