diff --git a/README.md b/README.md index 1f8b215..e4842fb 100644 --- a/README.md +++ b/README.md @@ -1,56 +1,34 @@ # ClinicalTrialsDataProcessing -This is used to build tools which process and standardize the data. +This represents my -More data later. +## Prerequisites -# Outline +> Python >= 3.8 +> Docker >= 20.10 +> Curl >= 7 +> Just >= 1.9 -## Directory Tree -AACT_downloader +# Usage -## Key files index -# Background on Docker -Docker uses the following flow +## Basic usage -1. configuration using `docker-compose.yaml` or a `Dockerfile` -2. `docker build .` to generate an image -3. `docker run xxxxxx` to take the image and create a container. - - when the container is created, it starts, running commands as configured in the dockerfile. - - Consequently, the AACT database image when run must initialize the postgres db, then run the initalization details. - - Here is where bind mounts come into play. +Check prerequisites +```bash +just check-status +``` -## Multistage builds -https://stackoverflow.com/questions/53659993/docker-multi-stage-how-to-split-up-into-multiple-dockerfiles +Set up the underlying AACT database, including downloading historical data. +```bash +just create +just select-trials +``` -https://docs.docker.com/develop/develop-images/multistage-build/ +## Advanced Usage -Basically +## TODO +finish advanced usage +add a section describing networking -## Dockerfile vs docker-compose.yaml - -A `Dockerfile` is used to create images. - -A `docker-compose.yaml` is used to automate the deployment of containers. - -## Types of storage - -### COPY/ADD (Dockerfile) - -In a dockerfile, this adds a file permanently to the image. - -This adds files one way to or from the container when initialized. - -### Volumes (docker-compose.yaml && Dockerfile) - -Useable in both docker-compose and Dockerfile's, this creates a permanent storage. -It can be maintained by docker or stored in a particular location. - -Good for longer term storage such as databases.
- -### Bind mounts (docker-compose.yaml) - -Bind mounts are used to make a host filesystem resource -available diff --git a/history_downloader/downloader.py b/history_downloader/downloader.py index 9875525..d664d4e 100644 --- a/history_downloader/downloader.py +++ b/history_downloader/downloader.py @@ -108,8 +108,8 @@ def write_incomplete(cursor, nct_id): Flags a trial as not having been fully downloaded. """ query = """ - INSERT INTO HTTP.DOWNLOAD_STATUS (NCT_ID,STATUS) - %s, 'Incomplete'::HTTP.HISTORY_DOWNLOAD_STATUS + INSERT INTO HTTP.DOWNLOAD_STATUS (NCT_ID,STATUS) VALUES + (%s, 'Incomplete'::HTTP.HISTORY_DOWNLOAD_STATUS); """ cursor.execute(query, [nct_id] ) @@ -197,32 +197,6 @@ def step_generator(max_version): yield (old,i) old = i + 1 -def flag_trials_of_interest(db_connection): - """ - Mark the queries of interest as "of interest" - INCOMPLETE - """ - - query = """ - INSERT INTO http.download_status (nct_id, status) - SELECT nct_id, 'Of Interest'::http.history_download_status AS status - FROM ctgov.studies - WHERE - is_fda_regulated_drug=TRUE - AND - study_type = 'Interventional' - AND - phase='Phase 3' - AND - overall_status in ('Terminated', 'Completed') - AND - start_date > '2008-01-01' - AND - completion_date < '2022-01-01' - ; - """ - #TODO: actually send it to the database. - #Should probably have thsi saved as a file, such as the downloader_prep.sql file. 
def reserve_trials(db_connection, limit=10): """ @@ -254,17 +228,17 @@ if __name__ == "__main__": #instantiate a database connnection creator dbc = DBConnectionCreator( dbname="aact_db" - ,user="python_downloader" + ,user="root" ,host="will-office" ,port=5432 - ,password="download") + ,password="root") #db connection with dbc.new() as con: #get list of nct_ids - nctids = reserve_trials(con, 1500) + nctids = reserve_trials(con, 150) print(nctids) diff --git a/history_downloader/downloader_prep.sql b/history_downloader/downloader_prep.sql deleted file mode 100644 index 43323f6..0000000 --- a/history_downloader/downloader_prep.sql +++ /dev/null @@ -1,21 +0,0 @@ -DELETE FROM http.download_status; - -INSERT INTO http.download_status (nct_id, status) -SELECT nct_id, 'Of Interest'::http.history_download_status AS status -FROM ctgov.studies -WHERE -is_fda_regulated_drug=TRUE -AND -study_type = 'Interventional' -AND -phase='Phase 3' -AND -overall_status in ('Terminated', 'Completed') -AND -start_date > '2008-01-01' -AND -completion_date < '2022-01-01' -; - - -SELECT count(*) FROM http.download_status ; diff --git a/history_downloader/select_trials.py b/history_downloader/select_trials.py new file mode 100644 index 0000000..5c2713a --- /dev/null +++ b/history_downloader/select_trials.py @@ -0,0 +1,19 @@ +import downloader as dldr + + + + +if __name__ == "__main__": + + dbc = dldr.DBConnectionCreator( + dbname="aact_db" + ,user="root" + ,host="will-office" + ,port=5432 + ,password="root") + + with open('selected_trials.sql','r') as fh: + sqlfile = fh.read() + with dbc.new() as connection: + with connection.cursor() as curse: + curse.execute(sqlfile) diff --git a/history_downloader/selected_trials.sql b/history_downloader/selected_trials.sql new file mode 100644 index 0000000..64ee383 --- /dev/null +++ b/history_downloader/selected_trials.sql @@ -0,0 +1,21 @@ +DELETE FROM http.download_status; + +INSERT INTO http.download_status (nct_id, status) +SELECT nct_id, 'Of 
Interest'::http.history_download_status AS status +FROM ctgov.studies +WHERE + is_fda_regulated_drug=TRUE + AND + study_type = 'Interventional' + AND + phase='Phase 3' + AND + overall_status in ('Terminated', 'Completed') + AND + start_date > '2008-01-01' + AND + completion_date < '2022-01-01' +; + + +SELECT count(*) FROM http.download_status ; diff --git a/justfile b/justfile index fe37057..695092c 100644 --- a/justfile +++ b/justfile @@ -72,3 +72,10 @@ create: check-status download-aact-data build #remove containers, redownload data, then rebuild containers recreate: clean-docker create # removed containers, redownloaded data, then rebuilt containers + +#Register trials of interest in the database based on ./history_downloader/selected_trials.sql +select-trials: + cd history_downloader && python ./select_trials.py +#Download trial histories based on registered trials of interest. +download-trial-histories: + cd history_downloader && python ./downloader.py