diff --git a/.gitignore b/.gitignore
index e934503..8b8166c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -180,4 +180,5 @@ Manifest.toml
 
 ###### Custom #####
 
-**/host_data/*_clinical_trials/
+*_clinical_trials/
+*_clinical_trials.zip
diff --git a/DockerContainers/ClinicalTrialHistory/Dockerfile b/DockerContainers/ClinicalTrialHistory/Dockerfile
new file mode 100644
index 0000000..e8b5481
--- /dev/null
+++ b/DockerContainers/ClinicalTrialHistory/Dockerfile
@@ -0,0 +1,7 @@
+FROM youainti/aact_from_dump
+LABEL AUTHOR 'Will King (youainti@protonmail.com)'
+LABEL DESCRIPTION 'add extra processing to the aact database in preparation for downloading history.'
+
+#copy additional init scripts
+COPY ./docker-entrypoint-initdb.d/ /docker-entrypoint-initdb.d/
+#these will be run after the database is initialized
diff --git a/DockerContainers/ClinicalTrialHistory/docker-entrypoint-initdb.d/020_HttpSchema.sql b/DockerContainers/ClinicalTrialHistory/docker-entrypoint-initdb.d/020_HttpSchema.sql
new file mode 100644
index 0000000..53b5f08
--- /dev/null
+++ b/DockerContainers/ClinicalTrialHistory/docker-entrypoint-initdb.d/020_HttpSchema.sql
@@ -0,0 +1,50 @@
+CREATE SCHEMA http;
+/*
+The purpose of this schema, its tables, and the associated role is to process HTTP responses.
+I may even include a table to keep track of the XML responses
+*/
+
+/*
+Add a role to manage permissions on the http schema
+*/
+CREATE ROLE http_requestor;
+
+GRANT CONNECT ON DATABASE aact_db TO http_requestor;
+
+GRANT USAGE ON SCHEMA http TO http_requestor;
+
+
+/* Create tables related to http requests
+As not every request will have an xml doc, split them.
+*/
+
+CREATE TABLE IF NOT EXISTS http.responses (
+    id SERIAL PRIMARY KEY,
+    nct VARCHAR(15),
+    version SMALLINT,
+    url VARCHAR(255),
+    response_code SMALLINT,
+    response_date DATE
+);
+
+CREATE TABLE IF NOT EXISTS http.xml_documents (
+    --NOTE(review): id has to match an existing http.responses.id (see the foreign key),
+    --so writers should supply it explicitly instead of relying on the SERIAL default.
+    id SERIAL PRIMARY KEY,
+    xml XML,
+    CONSTRAINT http_response
+        FOREIGN KEY (id)
+        REFERENCES http.responses (id)
+        ON DELETE CASCADE --remove xml if the request is deleted
+);
+
+/*
+Table privileges are granted here, after the tables exist:
+GRANT ... ON ALL TABLES IN SCHEMA only affects tables present when it runs.
+The SERIAL id columns are backed by sequences, so inserting with the default id
+also requires USAGE on those sequences.
+*/
+GRANT INSERT, SELECT ON ALL TABLES IN SCHEMA http TO http_requestor;
+GRANT USAGE ON ALL SEQUENCES IN SCHEMA http TO http_requestor;
+
+
diff --git a/DockerContainers/ClinicalTrialHistory/docker-entrypoint-initdb.d/030_HistoricalSchema.sql b/DockerContainers/ClinicalTrialHistory/docker-entrypoint-initdb.d/030_HistoricalSchema.sql
new file mode 100644
index 0000000..2f605f8
--- /dev/null
+++ b/DockerContainers/ClinicalTrialHistory/docker-entrypoint-initdb.d/030_HistoricalSchema.sql
@@ -0,0 +1,30 @@
+-- Create a schema handling trial history.
+CREATE SCHEMA history;
+
+--Create role for anyone who needs to both select and insert on historical data
+CREATE ROLE history_writer;
+GRANT CONNECT ON DATABASE aact_db TO history_writer;
+
+GRANT USAGE ON SCHEMA history TO history_writer;
+
+--No history tables exist yet, so GRANT ... ON ALL TABLES would match nothing.
+--Default privileges cover tables (and sequences) that the role running this
+--script creates in the history schema later on.
+ALTER DEFAULT PRIVILEGES IN SCHEMA history GRANT INSERT, SELECT ON TABLES TO history_writer;
+ALTER DEFAULT PRIVILEGES IN SCHEMA history GRANT USAGE ON SEQUENCES TO history_writer;
+
+
+--Create role for anyone who only needs selection access to historical data, such as for analysis
+CREATE ROLE history_reader;
+GRANT CONNECT ON DATABASE aact_db TO history_reader;
+
+GRANT USAGE ON SCHEMA history TO history_reader;
+
+ALTER DEFAULT PRIVILEGES IN SCHEMA history GRANT SELECT ON TABLES TO history_reader;
+
+
+
+/* History Tables
+Below is where I would construct the parsed trial history tables that I need.
+*/
+
diff --git a/DockerContainers/ClinicalTrialHistory/docker-entrypoint-initdb.d/090_AnalysisViews.sql b/DockerContainers/ClinicalTrialHistory/docker-entrypoint-initdb.d/090_AnalysisViews.sql
new file mode 100644
index 0000000..271064e
--- /dev/null
+++ b/DockerContainers/ClinicalTrialHistory/docker-entrypoint-initdb.d/090_AnalysisViews.sql
@@ -0,0 +1 @@
+--Eventually this should let me add necessary views during the construction of the DB.
\ No newline at end of file diff --git a/DockerContainers/docker-compose.yaml b/DockerContainers/docker-compose.yaml new file mode 100644 index 0000000..5bebbd5 --- /dev/null +++ b/DockerContainers/docker-compose.yaml @@ -0,0 +1,37 @@ +version: '3' + +volumes: + aact_pg_database: #This is to hold the database. + +services: + + aact: + build: ./ClinicalTrialHistory #build and use the clinical trial history db. + container_name: aact_db + #restart: always #restart after crashes + environment: + POSTGRES_USER: root + POSTGRES_PASSWORD: root #NOTE(review): looks like a placeholder credential; change it before any production use + POSTGRES_DB: aact_db + ports: + - "5432:5432" #host:container + volumes: #host:container is the format. + - aact_pg_database:/var/lib/postgresql/ # this is persistent storage for the database. NOTE(review): the official postgres image has historically kept its data under /var/lib/postgresql/data; confirm this mount point really captures the data directory of the base image + - ./20220201_clinical_trials/postgres_data.dmp:/mnt/host_data/postgres_data.dmp + + pgadmin: + container_name: pgadmin4_webservice + image: dpage/pgadmin4 + #restart: always + environment: + PGADMIN_DEFAULT_EMAIL: admin@admin.com + PGADMIN_DEFAULT_PASSWORD: root + ports: + - "5050:80" + volumes: #host:container is the format. + #The volume with server login information. + - ./pgadmin4/servers.json:/pgadmin4/servers.json + +#Checklist for production +# uncomment restart: always in both services. +# add a python environment to run data collection etc \ No newline at end of file diff --git a/downloader/db_connection.py b/DockerContainers/downloader/db_connection.py similarity index 100% rename from downloader/db_connection.py rename to DockerContainers/downloader/db_connection.py diff --git a/PostgressDocker/docker-compose.yaml b/PostgressDocker/docker-compose.yaml deleted file mode 100644 index 1ebe243..0000000 --- a/PostgressDocker/docker-compose.yaml +++ /dev/null @@ -1,49 +0,0 @@ -version: '3' - -volumes: - aact_pg_database: #This is to hold the database. - aact_helpful_files: #This is to hold files that need accessed by both pgadmin and postgres. I honestly expect it to usually be empty. 
- driver: local - driver_opts: - type: 'none' - o: 'bind' - device: /home/will/research/ClinicalTrialsDataProcessing/PostgressDocker/host_data/ - #change this path to match the path you are holding the dump file in - -services: - aact: - container_name: aact_db - image: postgres - #restart: always #restart to run things well. - environment: - POSTGRES_USER: admin - POSTGRES_PASSWORD: root - POSTGRES_DB: aact_db - ports: - - "5432:5432" #host:container - volumes: #host:container is the format. - #pull in a single file - #- $HOME/research/ClinicalTrialsDataProcessing/PostgressDocker/infile:/infile - #add a storage volumes - - aact_pg_database:/var/lib/postgresql/ # this is the database that persits between uses - - aact_helpful_files:/mnt/host_data #use :ro # to give read only permisions - #entrypoint: - #none yet - #- echo "test" > /test.touched - - pgadmin: - container_name: pgadmin4_webservice - image: dpage/pgadmin4 - #restart: always - environment: - PGADMIN_DEFAULT_EMAIL: admin@admin.com - PGADMIN_DEFAULT_PASSWORD: root - ports: - - "5050:80" - volumes: #host:container is the format. - #The volume with server login information. - - ./pgadmin4/servers.json:/pgadmin4/servers.json - -#Checklist for production -# uncomment restart: always in both services. -# add a python environment to run data collection etc \ No newline at end of file diff --git a/README.md b/README.md index e72ef69..2c90cab 100644 --- a/README.md +++ b/README.md @@ -3,3 +3,59 @@ This is used to build tools which process and standardize the data. More data later. + +# proposed architecture: + +AACT Dockerfile #when built on its own, allows for the creation of the AACT database. + - Should create appropriate users. + - + +ClinicalTrialHistory Dockerfile #inherits from AACT (multistage builds?) and adds initialization steps to the database. + - adds schemas, tables, and views for both http requests and parsed history tables. 
+ - adds connection roles + - adds connection users + +ClinicalTrialDataProcessing docker-compose.yaml #deploys a ClinicalTrialHistory container, pgadmin4, and eventually the downloading program/environment. + +# Background on Docker +Docker uses the following flow: + +1. configuration using `docker-compose.yaml` or a `Dockerfile` +2. `docker build .` to generate an image +3. `docker run xxxxxx` to take the image and create a container. + - when the container is created, it starts, running commands as configured in the Dockerfile. + - Consequently, the AACT database image, when run, must initialize the postgres db and then run the initialization steps. + - Here is where bind mounts come into play. + +## Multistage builds +https://stackoverflow.com/questions/53659993/docker-multi-stage-how-to-split-up-into-multiple-dockerfiles + +https://docs.docker.com/develop/develop-images/multistage-build/ + +Basically, a multi-stage build uses several `FROM` stages in one Dockerfile, and a later stage can copy artifacts from an earlier one. + +## Dockerfile vs docker-compose.yaml + +A `Dockerfile` is used to create images. + +A `docker-compose.yaml` is used to automate the deployment of containers. + +## Types of storage + +### COPY/ADD (Dockerfile) + +In a Dockerfile, this adds a file permanently to the image. + +The copy is one way, from the build context into the image, and happens when the image is built. + +### Volumes (docker-compose.yaml && Dockerfile) + +Usable in both docker-compose files and Dockerfiles, this creates persistent storage. +It can be maintained by docker or stored in a particular location. + +Good for longer-term storage such as databases. + +### Bind mounts (docker-compose.yaml) + +Bind mounts are used to make a host filesystem resource +available inside the container. \ No newline at end of file