diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..b720f8a --- /dev/null +++ b/.gitattributes @@ -0,0 +1,6 @@ +*.sql.gzip filter=lfs diff=lfs merge=lfs -text +*.xlsx filter=lfs diff=lfs merge=lfs -text +containers/AACT_Reloader/2023-09-06_aactdb_with_matches.sql.gz filter=lfs diff=lfs merge=lfs -text +other_data/USP[[:space:]]DC/usp_dc_pub_2023_release_2.0_updated_final.csv filter=lfs diff=lfs merge=lfs -text +other_data/USP[[:space:]]MMG/MMG_v8.0_Alignment_File.csv filter=lfs diff=lfs merge=lfs -text +other_data/VA[[:space:]]Formulary/PharmacyProductSystem_NationalDrugCodeExtract.csv filter=lfs diff=lfs merge=lfs -text diff --git a/containers/AACT_Reloader/.gitattributes b/containers/AACT_Reloader/.gitattributes new file mode 100644 index 0000000..a20c5a3 --- /dev/null +++ b/containers/AACT_Reloader/.gitattributes @@ -0,0 +1 @@ +backup/2023-09-06_aactdb_with_matches.sql.gz filter=lfs diff=lfs merge=lfs -text diff --git a/containers/AACT_Reloader/StartRestoreContainer.sh b/containers/AACT_Reloader/StartRestoreContainer.sh new file mode 100755 index 0000000..e647c84 --- /dev/null +++ b/containers/AACT_Reloader/StartRestoreContainer.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +RESTORE_DUMP_GZ=2023-09-06_aactdb_with_matches.sql.gz +POSTGRES_USER=root +POSTGRES_PASSWORD=root +POSTGRES_DB=aact_db + +#start container +podman run \ + -e POSTGRES_PASSWORD="${POSTGRES_PASSWORD}" \ + -e POSTGRES_USER="${POSTGRES_USER}" \ + -e POSTGRES_DB="${POSTGRES_DB}" \ + --name "${POSTGRES_DB}" \ + --detach \ + --shm-size=512mb \ + --volume ./backup/:/backup/ \ + -p 5432:5432\ + postgres:14-alpine + + +sleep 10 + +# Function to check if PostgreSQL is ready +function check_postgres { + podman exec -i "${POSTGRES_DB}" psql -h localhost -U "${POSTGRES_USER}" -d "${POSTGRES_DB}" -c '\q' > /dev/null 2>&1 +} + +# Wait for PostgreSQL to be ready +until check_postgres; do + echo "Waiting for PostgreSQL to be ready..." + sleep 4 +done + +echo "PostgreSQL is ready. 
Restoring the database..." + +# Decompress the dump file and restore it to the database +podman exec -i "${POSTGRES_DB}" sh -c "gunzip -c /backup/${RESTORE_DUMP_GZ} | psql -h localhost -U ${POSTGRES_USER} -d ${POSTGRES_DB}" + +echo "Database restoration complete." diff --git a/containers/AACT_Reloader/backup/2023-09-06_aactdb_with_matches.sql.gz b/containers/AACT_Reloader/backup/2023-09-06_aactdb_with_matches.sql.gz new file mode 100644 index 0000000..7ff43a7 --- /dev/null +++ b/containers/AACT_Reloader/backup/2023-09-06_aactdb_with_matches.sql.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd5142f25ff9ef65048c02d5173b5e2a90f4a07513480d5e8c399cf9da39e678 +size 1897211561 diff --git a/justfile b/justfile index a3691ed..0be52ea 100644 --- a/justfile +++ b/justfile @@ -5,23 +5,19 @@ # - move postgress login credentials (allow them to be printed from just while setting up) -#paths for aact_db (postgres) -aact_download_link := "https://ctti-aact.nyc3.digitaloceanspaces.com/27grtsnhtccplxapj2o8ak9aotvv" -aact_download_file := "2022-12-23_postgres_data.zip" -aact_download_path := "./containers/AACT_downloader/aact_downloads" -aact_zipped_data_filepath := aact_download_path / aact_download_file +data_link := "https://ctti-aact.nyc3.digitaloceanspaces.com/27grtsnhtccplxapj2o8ak9aotvv" +data_file := "2022-12-23_postgres_data.zip" +data_path := "./containers/AACT_downloader/aact_downloads" +data_filepath := data_path / data_file #must match the 'container name: aact_db' in the docker-compose.yaml -docker_container := `docker container ls -a | grep "aact_db|rxnav_db" | cut -f 1 -d " " | tr "\n" " "` - -#paths for rxnavinabox -rxnav_path := "./containers/RxNav-In-a-box" -rxnav_version := "rxnav-in-a-box-20230103" -rxnav_data_path := rxnav_path / rxnav_version / "mysql" / "02_data.sql" +docker_container := `docker container ls -a | grep aact_db | cut -f 1 -d " " | tr "\n" " "` #Various paths for docker stuff -docker-compose_path := 
"./containers/docker-compose.yaml" +docker-compose_path := "./AACT_downloader/docker-compose.yaml" +#rxnorm_mappings +rxnorm_mappings_url := "https://dailymed-data.nlm.nih.gov/public-release-files/rxnorm_mappings.zip" #Number of historical trials to download. count := "100" @@ -32,23 +28,18 @@ check-status: docker --version #check if python version > 3.10. python --version - #python -c 'import sys; exit(sys.hexversion >= 50859504)' + python -c 'import sys; exit(sys.hexversion >= 50859504)' curl --version echo "current docker containers:{{docker_container}}" - +#Setup the AACT container setup-containers: - echo "todo" @echo "Check for downloaded data" - #aact - [ -s {{aact_download_path}}/postgres_data.dmp ] - #rxnav - [ -s {{rxnav_data_path}} ] + [ -s {{data_path}}/postgres_data.dmp ] #run docker compose - @echo "Setting up AACT_db & RxNav_db container" + @echo "Setting up AACT container" docker-compose -f {{docker-compose_path}} up -d - #Stop the appropriate docker container stop-containers: @@ -69,13 +60,10 @@ clean-docker: stop-containers #Download the AACT data download-aact-data: - #download - curl {{aact_download_link}} > {{aact_zipped_data_filepath}} - unzip {{aact_zipped_data_filepath}} -d {{aact_download_path}} - rm {{aact_zipped_data_filepath}} + curl {{data_link}} > ./AACT_downloader/aact_downloads/{{data_file}} + unzip {{data_filepath}} -d {{data_path}} + rm {{data_filepath}} -download-rxnav-data: - echo "Currently manually downloaded." #build based on previously downloaded data build: check-status setup-containers @@ -117,3 +105,8 @@ get-nsde: cd market_data && bash download_nsde.sh cd market_data && python extract_nsde.py +get-rxnorm-mappings: + #this may not be needed, all it does is match spls to rxcuis and I think I already have that. 
+ curl {{rxnorm_mappings_url}} > ./market_data/rxnorm_mappings.zip + cd ./market_data && unzip ./rxnorm_mappings.zip + rm ./market_data/rxnorm_mappings.zip diff --git a/other_data/USP DC/USP_DC_12_2021_RELEASE_1.0.xlsx b/other_data/USP DC/USP_DC_12_2021_RELEASE_1.0.xlsx new file mode 100644 index 0000000..8354a7a --- /dev/null +++ b/other_data/USP DC/USP_DC_12_2021_RELEASE_1.0.xlsx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfd45c46b7fc30af8e5b6c53ba03429af9fd9b6f17c026d2739b99ccf3ff44ba +size 1790714 diff --git a/other_data/USP DC/usp_dc_pub_2023_release_2.0_updated_final.csv b/other_data/USP DC/usp_dc_pub_2023_release_2.0_updated_final.csv new file mode 100644 index 0000000..6cd7c22 --- /dev/null +++ b/other_data/USP DC/usp_dc_pub_2023_release_2.0_updated_final.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d08d3944a859c0b1f6bbd466ca027fc46c86ef5bb0328cb005fa002b7b61e70b +size 2451625 diff --git a/other_data/USP DC/usp_dc_pub_2023_release_2.0_updated_final.xlsx b/other_data/USP DC/usp_dc_pub_2023_release_2.0_updated_final.xlsx new file mode 100644 index 0000000..2b275a7 --- /dev/null +++ b/other_data/USP DC/usp_dc_pub_2023_release_2.0_updated_final.xlsx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f5864bccd3abea775523e841af3da1f70d2cff5394d2a2c9702ebec8131037c +size 891222 diff --git a/other_data/USP MMG/Final_Report_and_Summary_of_Methodology_and_Approach_v1.1.pdf b/other_data/USP MMG/Final_Report_and_Summary_of_Methodology_and_Approach_v1.1.pdf new file mode 100644 index 0000000..a17f1d3 Binary files /dev/null and b/other_data/USP MMG/Final_Report_and_Summary_of_Methodology_and_Approach_v1.1.pdf differ diff --git a/other_data/USP MMG/MMG_v8.0_Alignment_File.csv b/other_data/USP MMG/MMG_v8.0_Alignment_File.csv new file mode 100644 index 0000000..f12587c --- /dev/null +++ b/other_data/USP MMG/MMG_v8.0_Alignment_File.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:85859dae3971460d36e0643ee1396cb646dba158b75862d557210cb2c50707a9 +size 874058 diff --git a/other_data/USP MMG/MMG_v8.0_Alignment_File.xlsx b/other_data/USP MMG/MMG_v8.0_Alignment_File.xlsx new file mode 100644 index 0000000..9d1fdaa --- /dev/null +++ b/other_data/USP MMG/MMG_v8.0_Alignment_File.xlsx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9e352b380988cc76f4ebb0377719c8cf3f6e268f61c5f775648153d20350aeb +size 356637 diff --git a/other_data/USP MMG/Summary_of_Changes_between_MMGv7.0_and_MMGv8.0.pdf b/other_data/USP MMG/Summary_of_Changes_between_MMGv7.0_and_MMGv8.0.pdf new file mode 100644 index 0000000..add4b2d Binary files /dev/null and b/other_data/USP MMG/Summary_of_Changes_between_MMGv7.0_and_MMGv8.0.pdf differ diff --git a/other_data/USP MMG/USP_Medicare_Model_Guidelines_v8.0__All_Excel_Spreadsheets_.xlsx b/other_data/USP MMG/USP_Medicare_Model_Guidelines_v8.0__All_Excel_Spreadsheets_.xlsx new file mode 100644 index 0000000..e18fc7b --- /dev/null +++ b/other_data/USP MMG/USP_Medicare_Model_Guidelines_v8.0__All_Excel_Spreadsheets_.xlsx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90c9f3b08135a250b7b2f2e5c1a1efc23e46898f1dce6399eddc12a00032d36d +size 131520 diff --git a/other_data/USP MMG/USP_Medicare_Model_Guidelines_v8.0__Categories_and_Classes_.pdf b/other_data/USP MMG/USP_Medicare_Model_Guidelines_v8.0__Categories_and_Classes_.pdf new file mode 100644 index 0000000..43d5a7d Binary files /dev/null and b/other_data/USP MMG/USP_Medicare_Model_Guidelines_v8.0__Categories_and_Classes_.pdf differ diff --git a/other_data/USP MMG/USP_Medicare_Model_Guidelines_v8.0__Showing_changes_from_v7.0_.pdf b/other_data/USP MMG/USP_Medicare_Model_Guidelines_v8.0__Showing_changes_from_v7.0_.pdf new file mode 100644 index 0000000..7477fd6 Binary files /dev/null and b/other_data/USP MMG/USP_Medicare_Model_Guidelines_v8.0__Showing_changes_from_v7.0_.pdf differ diff --git a/other_data/USP 
MMG/USP_Medicare_Model_Guidelines_v8.0__With_Example_Part_D_Drugs_.pdf b/other_data/USP MMG/USP_Medicare_Model_Guidelines_v8.0__With_Example_Part_D_Drugs_.pdf new file mode 100644 index 0000000..8cb9058 Binary files /dev/null and b/other_data/USP MMG/USP_Medicare_Model_Guidelines_v8.0__With_Example_Part_D_Drugs_.pdf differ diff --git a/other_data/VA Formulary/PharmacyProductSystem_NationalDrugCodeExtract.csv b/other_data/VA Formulary/PharmacyProductSystem_NationalDrugCodeExtract.csv new file mode 100644 index 0000000..e383df4 --- /dev/null +++ b/other_data/VA Formulary/PharmacyProductSystem_NationalDrugCodeExtract.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c877a461be9e75565f78d0552e76f597b5cce709c82a3f9ad30dcc0f26ddafc +size 32481883 diff --git a/other_data/VA Formulary/PharmacyProductSystem_NationalDrugCodeExtract.xlsx b/other_data/VA Formulary/PharmacyProductSystem_NationalDrugCodeExtract.xlsx new file mode 100644 index 0000000..f8bf29e --- /dev/null +++ b/other_data/VA Formulary/PharmacyProductSystem_NationalDrugCodeExtract.xlsx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77f2d74681a8d509195e5d2bc01104b9321fe500bcca9065738bc9b523edd74a +size 13465117 diff --git a/other_data/VA Formulary/VADrugClass2132012.xls b/other_data/VA Formulary/VADrugClass2132012.xls new file mode 100644 index 0000000..d5c7e5a Binary files /dev/null and b/other_data/VA Formulary/VADrugClass2132012.xls differ diff --git a/other_data/VA Formulary/VA_National_Formulary_JUNE_2023.xlsx b/other_data/VA Formulary/VA_National_Formulary_JUNE_2023.xlsx new file mode 100644 index 0000000..59676c5 --- /dev/null +++ b/other_data/VA Formulary/VA_National_Formulary_JUNE_2023.xlsx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb97661ad316e7ab307a5aaece1b2f5b7438d1ee1c6a278192f47e63ff6388cc +size 316461 diff --git a/other_data/VA Formulary/VA_National_Formulary_by_class_JUNE_2023.xlsx b/other_data/VA 
Formulary/VA_National_Formulary_by_class_JUNE_2023.xlsx new file mode 100644 index 0000000..895d8a7 --- /dev/null +++ b/other_data/VA Formulary/VA_National_Formulary_by_class_JUNE_2023.xlsx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58131fd8577707cc94a7208a3da4fabbf0fe10d16cf69dde1cd7b6e7d5701bae +size 379551 diff --git a/scripts/drugtools/historical_nct_downloader.py b/scripts/drugtools/historical_nct_downloader.py index af3ce3e..31cf456 100644 --- a/scripts/drugtools/historical_nct_downloader.py +++ b/scripts/drugtools/historical_nct_downloader.py @@ -5,11 +5,13 @@ from multiprocess import Pool, Value import math import time from drugtools.env_setup import postgres_conn, ENV +from tqdm import tqdm ############ GLOBALS RESET_TIME = Value('I',int(ENV["TRIAL_DOWNLOAD_RESET_TIME"])) DELAY_TIME = Value("I",int(ENV["TRIAL_DOWNLOAD_DELAY_TIME"])) TRIAL_RESERVATION_LIMIT=int(ENV["TRIAL_RESERVATION_LIMIT"]) +TRIAL_RESERVATION_BATCH_SIZE=int(ENV["TRIAL_RESERVATION_BATCH_SIZE"]) ############ Functions def get_highest_version_number(response): @@ -25,7 +27,10 @@ def get_highest_version_number(response): soup = BeautifulSoup(response.text, features="lxml") #get version table rows - table_rows = soup.findChildren("fieldset")[0].table.tbody.findChildren("tr") + try: + table_rows = soup.findChildren("fieldset")[0].table.tbody.findChildren("tr") + except IndexError as ie: + raise ie for row in reversed(table_rows): # if it is xx then it contains what we need. 
@@ -44,7 +49,11 @@ def make_request(nct_id,version1,version2): url = baseurl.format(nct_id,version1,version2) #make request - response = requests.get(url) + try: + time.sleep(0.02) + response = requests.get(url) + except requests.exceptions.ConnectionError as ce: + raise ce #return the response return response @@ -85,16 +94,24 @@ def download_and_handle_errors(cursor, nct_id, version_a, version_b, delay_time, #check for if r.status_code == 200: upload_response(cursor,nct_id,version_a, version_b, r) + elif r.status_code == 404: + upload_response(cursor, nct_id, version_a, version_b, r) + write_incomplete(cursor,nct_id) + return None elif r.status_code == 503: # write http code to http.responses upload_response(cursor, nct_id, version_a, version_b, r) # write incomplete to http.download_status write_incomplete(cursor,nct_id) # tell all other processes to slow down the request speed - delay_time.value += 1 # Delay - print("Recieved 503 on {}, increasing delay count to {}".format(nct_id, delay_tiome)) - time.sleep(reset_time) + with delay_time.get_lock(): + delay_time.value += 1 + time.sleep(reset_time.value) + print("Received 503 on {}, increasing delay count to {}".format( + nct_id, + delay_time) + ) else: #TODO: this should handle errors by # write http code to http.responses @@ -104,8 +121,13 @@ # raise exception #raise Exception("Download of {} (versions {},{}) returned http code {}".format(nct_id,version_a,version_b, r.status_code)) + print("Received {} on {}, increasing delay count to {}".format( + r.status_code, + nct_id, + delay_time)) # Delay - time.sleep(reset_time) + with reset_time.get_lock(): + time.sleep(reset_time.value) return r def write_incomplete(cursor, nct_id): @@ -128,9 +150,6 @@ This doesn't reserve a trial for download, but it does release the reservation. 
""" - #for testing - print(nct_id) - # A new connection is created every time the function is called so that this # function can be run using a multiprocessing pool @@ -140,8 +159,14 @@ def download_trial_records(nct_id, delay_time, reset_time): #upload the first two versions r = download_and_handle_errors(cursor, nct_id, 1, 2, delay_time, reset_time) #extract last version - v = get_highest_version_number(r) + if r is None: + return None + + try: + v = get_highest_version_number(r) + except IndexError as ie: + raise RuntimeError(ie.__str__() + " | nct_id {}".format(nct_id)) #download and upload the remaining versions if v == 2: @@ -205,23 +230,29 @@ def reserve_trials(db_connection, limit=10): return nctids_list +def chunker(seq, size): + return [seq[pos:pos + size] for pos in range(0, len(seq), size)] def reserve_and_download_versions(limit): - #db connection - with postgres_conn() as con: - - #get list of nct_ids - nctids = reserve_trials(con, limit) - print("reserving_trials: ", nctids) - #lambda that parameterizes the downloader, allowing it to be passed to the pool. def downloader(nct): download_trial_records(nct, DELAY_TIME, RESET_TIME) - #start analyzing them - with Pool(processes=12) as process_pool: - process_pool.map(downloader, nctids) + #db connection + with postgres_conn() as con: + itt = 0 + while (nctids := reserve_trials(con,TRIAL_RESERVATION_BATCH_SIZE)) and \ + itt < TRIAL_RESERVATION_LIMIT: + print(nctids) + with Pool(processes=12) as process_pool: + l = len(nctids) + itt += l + with tqdm(total=l) as prog_bar: + for _ in process_pool.imap_unordered(downloader, nctids): + prog_bar.update() + con.commit() + def run(): @@ -231,4 +262,5 @@ if __name__ == "__main__": """ Main! 
""" - run() \ No newline at end of file + run() + #db connection diff --git a/scripts/drugtools/historical_nct_extractor.py b/scripts/drugtools/historical_nct_extractor.py index ab62bf3..326d4fd 100644 --- a/scripts/drugtools/historical_nct_extractor.py +++ b/scripts/drugtools/historical_nct_extractor.py @@ -126,17 +126,18 @@ def extract_submission_dates(soup): version_date_dict = {} - for row in reversed(table_rows): + for row in table_rows: # if it is xx then it contains what we need. + version_number = None + version_date = None for td in row.findChildren("td"): if ("headers" in td.attrs): if (td.attrs["headers"][0]=="VersionNumber"): version_number = int(td.text) elif (td.attrs["headers"][0]=="VersionDate"): - version_date = td.text - version_date_dict[version_number] = datetime.strptime(version_date , "%B %d, %Y") + version_date = datetime.strptime(td.text.strip() , "%B %d, %Y") - print(version_date_dict) + version_date_dict[version_number] = version_date return version_date_dict def optional_strip(possible_string): @@ -396,14 +397,12 @@ date_MMMM_DD_YYYY = "%B %d, %Y" def get_data_from_versions(nct_id,html, version_a_int, version_b_int): soup = BeautifulSoup(html,"lxml") - print(getting_data_from_versions) version_date_dict = extract_submission_dates(soup) - print(version_date_dict) #preallocate version data - version_a = VersionData(nct_id, version_a_int, version_date_dict[version_a_int]) - version_b = VersionData(nct_id, version_b_int, version_date_dict[version_b_int]) + version_a = VersionData(nct_id, version_a_int, version_date_dict.get(version_a_int)) + version_b = VersionData(nct_id, version_b_int, version_date_dict.get(version_b_int)) #extract data from html and put it in the preallocated objects get_forms(soup, version_a, version_b) @@ -424,7 +423,6 @@ def run(): curse.execute(sql) for response in tqdm(curse.fetchall()): nct_id, version_a, version_b, html = response - print(nct_id) print(nct_id, version_a, version_b) if VERBOSE else "" diff --git 
a/scripts/runall.py b/scripts/runall.py index 39935f0..01e481d 100644 --- a/scripts/runall.py +++ b/scripts/runall.py @@ -11,9 +11,13 @@ print(env_setup.ENV) cont = input("Are you willing to continue with the current environmnet? y/[n]") if cont == "Y" or cont == "y": - hts.run() - hnd.run() + print("SelectingTrials") + #hts.run() + print("downloading trials") + #hnd.run() + print("extracting trials") hne.run() + exit(0) daen.run() mm2p.run() else: