added some data that had been left untracked

llm-extraction
Will King 2 years ago
parent 6a931b3a49
commit 142670d08a

4
.gitattributes vendored

@ -1,2 +1,6 @@
*.sql.gzip filter=lfs diff=lfs merge=lfs -text *.sql.gzip filter=lfs diff=lfs merge=lfs -text
*.xlsx filter=lfs diff=lfs merge=lfs -text *.xlsx filter=lfs diff=lfs merge=lfs -text
containers/AACT_Reloader/2023-09-06_aactdb_with_matches.sql.gz filter=lfs diff=lfs merge=lfs -text
other_data/USP[[:space:]]DC/usp_dc_pub_2023_release_2.0_updated_final.csv filter=lfs diff=lfs merge=lfs -text
other_data/USP[[:space:]]MMG/MMG_v8.0_Alignment_File.csv filter=lfs diff=lfs merge=lfs -text
other_data/VA[[:space:]]Formulary/PharmacyProductSystem_NationalDrugCodeExtract.csv filter=lfs diff=lfs merge=lfs -text

BIN
2023-09-06_aactdb_with_matches.sql.gzip (Stored with Git LFS)

Binary file not shown.

Binary file not shown.

@ -0,0 +1,23 @@
#!/bin/bash
# Start a Postgres container named aact_db and restore the gzipped AACT SQL
# dump into it. Run this from the directory that contains the dump file.
set -euo pipefail

RESTORE_DUMP_GZ=2023-09-06_aactdb_with_matches.sql.gz
POSTGRES_USER=root
POSTGRES_PASSWORD=root
POSTGRES_DB=aact_db
CONTAINER_NAME=aact_db

# Fail early with a clear message instead of letting podman mount nothing.
if [ ! -s "${RESTORE_DUMP_GZ}" ]; then
    echo "dump file missing or empty: ${RESTORE_DUMP_GZ}" >&2
    exit 1
fi

#start container
# NOTE: the bind-mount source must be an absolute (or ./-prefixed) path;
# a bare file name is interpreted by podman as a *named volume*.
podman run \
    -e POSTGRES_PASSWORD="${POSTGRES_PASSWORD}" \
    -e POSTGRES_USER="${POSTGRES_USER}" \
    -e POSTGRES_DB="${POSTGRES_DB}" \
    --name "${CONTAINER_NAME}" \
    --detach \
    --shm-size=512mb \
    --volume "$(pwd)/${RESTORE_DUMP_GZ}:/backup/${RESTORE_DUMP_GZ}:ro" \
    --publish 5432:5432 \
    postgres:14-alpine

# The container is detached, so wait until the server accepts connections.
# Probing over TCP (-h 127.0.0.1) skips the temporary init-time server, which
# the image's entrypoint runs on the unix socket only.
ready=0
for _ in $(seq 1 60); do
    if podman exec "${CONTAINER_NAME}" \
        pg_isready -h 127.0.0.1 -U "${POSTGRES_USER}" -d "${POSTGRES_DB}" \
        >/dev/null 2>&1; then
        ready=1
        break
    fi
    sleep 1
done
if [ "${ready}" -ne 1 ]; then
    echo "postgres in container ${CONTAINER_NAME} did not become ready" >&2
    exit 1
fi

#execute within container
# The pipeline has to be interpreted by a shell inside the container; passing
# it as a single quoted argument makes podman look for an executable with
# that whole string as its name.
# -d is required because psql otherwise defaults to a database named after
# the user, and only ${POSTGRES_DB} is created by the image.
# NOTE(review): assumes the dump is meant to be loaded into ${POSTGRES_DB} - confirm.
podman exec "${CONTAINER_NAME}" \
    sh -c "gzip -dc '/backup/${RESTORE_DUMP_GZ}' | psql -U '${POSTGRES_USER}' -d '${POSTGRES_DB}'"

@ -5,23 +5,19 @@
# - move postgress login credentials (allow them to be printed from just while setting up) # - move postgress login credentials (allow them to be printed from just while setting up)
#paths for aact_db (postgres) data_link := "https://ctti-aact.nyc3.digitaloceanspaces.com/27grtsnhtccplxapj2o8ak9aotvv"
aact_download_link := "https://ctti-aact.nyc3.digitaloceanspaces.com/27grtsnhtccplxapj2o8ak9aotvv" data_file := "2022-12-23_postgres_data.zip"
aact_download_file := "2022-12-23_postgres_data.zip" data_path := "./containers/AACT_downloader/aact_downloads"
aact_download_path := "./containers/AACT_downloader/aact_downloads" data_filepath := data_path / data_file
aact_zipped_data_filepath := aact_download_path / aact_download_file
#must match the 'container name: aact_db' in the docker-compose.yaml #must match the 'container name: aact_db' in the docker-compose.yaml
docker_container := `docker container ls -a | grep "aact_db|rxnav_db" | cut -f 1 -d " " | tr "\n" " "` docker_container := `docker container ls -a | grep aact_db | cut -f 1 -d " " | tr "\n" " "`
#paths for rxnavinabox
rxnav_path := "./containers/RxNav-In-a-box"
rxnav_version := "rxnav-in-a-box-20230103"
rxnav_data_path := rxnav_path / rxnav_version / "mysql" / "02_data.sql"
#Various paths for docker stuff #Various paths for docker stuff
docker-compose_path := "./containers/docker-compose.yaml" docker-compose_path := "./AACT_downloader/docker-compose.yaml"
#rxnorm_mappings
rxnorm_mappings_url := "https://dailymed-data.nlm.nih.gov/public-release-files/rxnorm_mappings.zip"
#Number of historical trials to download. #Number of historical trials to download.
count := "100" count := "100"
@ -32,23 +28,18 @@ check-status:
docker --version docker --version
#check if python version > 3.10. #check if python version > 3.10.
python --version python --version
#python -c 'import sys; exit(sys.hexversion >= 50859504)' python -c 'import sys; exit(sys.hexversion >= 50859504)'
curl --version curl --version
echo "current docker containers:{{docker_container}}" echo "current docker containers:{{docker_container}}"
#Setup the AACT container
setup-containers: setup-containers:
echo "todo"
@echo "Check for downloaded data" @echo "Check for downloaded data"
#aact [ -s {{data_path}}/postgres_data.dmp ]
[ -s {{aact_download_path}}/postgres_data.dmp ]
#rxnav
[ -s {{rxnav_data_path}} ]
#run docker compose #run docker compose
@echo "Setting up AACT_db & RxNav_db container" @echo "Setting up AACT container"
docker-compose -f {{docker-compose_path}} up -d docker-compose -f {{docker-compose_path}} up -d
#Stop the appropriate docker container #Stop the appropriate docker container
stop-containers: stop-containers:
@ -69,13 +60,10 @@ clean-docker: stop-containers
#Download the AACT data #Download the AACT data
download-aact-data: download-aact-data:
#download curl {{data_link}} > ./AACT_downloader/aact_downloads/{{data_file}}
curl {{aact_download_link}} > {{aact_zipped_data_filepath}} unzip {{data_filepath}} -d {{data_path}}
unzip {{aact_zipped_data_filepath}} -d {{aact_download_path}} rm {{data_filepath}}
rm {{aact_zipped_data_filepath}}
download-rxnav-data:
echo "Currently manually downloaded."
#build based on previously downloaded data #build based on previously downloaded data
build: check-status setup-containers build: check-status setup-containers
@ -117,3 +105,8 @@ get-nsde:
cd market_data && bash download_nsde.sh cd market_data && bash download_nsde.sh
cd market_data && python extract_nsde.py cd market_data && python extract_nsde.py
get-rxnorm-mappings:
#this may not be needed, all it does is match spls to rxcuis and I think I already have that.
curl {{rxnorm_mappings_url}} > ./market_data/rxnorm_mappings.zip
cd ./market_data && unzip ./rxnorm_mappings.zip
rm ./market_data/rxnorm_mappings.zip

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:d08d3944a859c0b1f6bbd466ca027fc46c86ef5bb0328cb005fa002b7b61e70b
3 size 2451625

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:85859dae3971460d36e0643ee1396cb646dba158b75862d557210cb2c50707a9
3 size 874058

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:8c877a461be9e75565f78d0552e76f597b5cce709c82a3f9ad30dcc0f26ddafc
3 size 32481883

@ -126,17 +126,18 @@ def extract_submission_dates(soup):
version_date_dict = {} version_date_dict = {}
for row in reversed(table_rows): for row in table_rows:
# if it is <td headers="VersionNumber">xx</td> then it contains what we need. # if it is <td headers="VersionNumber">xx</td> then it contains what we need.
version_number = None
version_date = None
for td in row.findChildren("td"): for td in row.findChildren("td"):
if ("headers" in td.attrs): if ("headers" in td.attrs):
if (td.attrs["headers"][0]=="VersionNumber"): if (td.attrs["headers"][0]=="VersionNumber"):
version_number = int(td.text) version_number = int(td.text)
elif (td.attrs["headers"][0]=="VersionDate"): elif (td.attrs["headers"][0]=="VersionDate"):
version_date = td.text version_date = datetime.strptime(td.text.strip() , "%B %d, %Y")
version_date_dict[version_number] = datetime.strptime(version_date , "%B %d, %Y")
print(version_date_dict) version_date_dict[version_number] = version_date
return version_date_dict return version_date_dict
def optional_strip(possible_string): def optional_strip(possible_string):
@ -396,14 +397,12 @@ date_MMMM_DD_YYYY = "%B %d, %Y"
def get_data_from_versions(nct_id,html, version_a_int, version_b_int): def get_data_from_versions(nct_id,html, version_a_int, version_b_int):
soup = BeautifulSoup(html,"lxml") soup = BeautifulSoup(html,"lxml")
print(getting_data_from_versions)
version_date_dict = extract_submission_dates(soup) version_date_dict = extract_submission_dates(soup)
print(version_date_dict)
#preallocate version data #preallocate version data
version_a = VersionData(nct_id, version_a_int, version_date_dict[version_a_int]) version_a = VersionData(nct_id, version_a_int, version_date_dict.get(version_a_int))
version_b = VersionData(nct_id, version_b_int, version_date_dict[version_b_int]) version_b = VersionData(nct_id, version_b_int, version_date_dict.get(version_b_int))
#extract data from html and put it in the preallocated objects #extract data from html and put it in the preallocated objects
get_forms(soup, version_a, version_b) get_forms(soup, version_a, version_b)
@ -424,7 +423,6 @@ def run():
curse.execute(sql) curse.execute(sql)
for response in tqdm(curse.fetchall()): for response in tqdm(curse.fetchall()):
nct_id, version_a, version_b, html = response nct_id, version_a, version_b, html = response
print(nct_id)
print(nct_id, version_a, version_b) if VERBOSE else "" print(nct_id, version_a, version_b) if VERBOSE else ""

Loading…
Cancel
Save