-- Schema objects for the NSDE records that market_data/extract_nsde.py
-- inserts into spl.nsde. The script is re-runnable: the table is dropped
-- and recreated on every run.

-- Make the script self-contained; this is a no-op when the schema already
-- exists (e.g. created by an earlier init script).
CREATE SCHEMA IF NOT EXISTS spl;

-- Dropping the table also drops the sequence once it is OWNED BY the id
-- column (see the last statement), so both are recreated below.
DROP TABLE IF EXISTS spl.nsde;

-- MAXVALUE matches the range of the integer id column that consumes it.
CREATE SEQUENCE IF NOT EXISTS spl.nsde_id_seq
    INCREMENT 1
    START 1
    MINVALUE 1
    MAXVALUE 2147483647
    CACHE 1;

ALTER SEQUENCE spl.nsde_id_seq
    OWNER TO root;

CREATE TABLE IF NOT EXISTS spl.nsde
(
    -- surrogate key, filled from spl.nsde_id_seq
    id integer NOT NULL DEFAULT nextval('spl.nsde_id_seq'::regclass),
    package_ndc11 character varying(11) COLLATE pg_catalog."default",
    application_number_or_citation character varying(25) COLLATE pg_catalog."default",
    package_ndc character varying(50) COLLATE pg_catalog."default",
    proprietary_name character varying(500) COLLATE pg_catalog."default",
    product_type character varying(90) COLLATE pg_catalog."default",
    marketing_category character varying(160) COLLATE pg_catalog."default",
    dosage_form character varying(155) COLLATE pg_catalog."default",
    billing_unit character varying(35) COLLATE pg_catalog."default",
    marketing_start_date date,
    marketing_end_date date,
    inactivation_date date,
    reactivation_date date,
    CONSTRAINT nsde_pkey PRIMARY KEY (id)
)

TABLESPACE pg_default;

ALTER TABLE IF EXISTS spl.nsde
    OWNER to root;

-- if the table is dropped, the sequence is as well
ALTER SEQUENCE spl.nsde_id_seq
    OWNED BY spl.nsde.id;
#!/bin/bash
# Download the NSDE JSON archives from download.open.fda.gov and unpack them
# into the current directory as other-nsde-000N-of-0003.json.
#
# -e: stop on the first failing command
# -u: treat unset variables as errors
# -o pipefail: a failure anywhere in a pipeline fails the pipeline
set -euo pipefail

readonly BASE_URL="https://download.open.fda.gov/other/nsde"
readonly PART_COUNT=3

for part in $(seq 1 "${PART_COUNT}"); do
    archive="./nsde_${part}.zip"
    url="$(printf '%s/other-nsde-%04d-of-%04d.json.zip' "${BASE_URL}" "${part}" "${PART_COUNT}")"

    # --fail: exit non-zero on an HTTP error instead of saving the error page
    #         as the archive; --location: follow redirects.
    curl --fail --location --output "${archive}" "${url}"

    # -o: overwrite JSON left over from a previous run instead of prompting.
    unzip -o "${archive}"
    rm "${archive}"
done
"""Load the downloaded NSDE JSON files into the ``spl.nsde`` table."""
import datetime as dt
import json
from typing import Iterator, Optional

# Record keys copied verbatim into the row.
NSDE_TEXT_COLUMNS = (
    "proprietary_name",
    "application_number_or_citation",
    "product_type",
    "package_ndc",
    "marketing_category",
    "package_ndc11",
    "dosage_form",
    "billing_unit",
)

# Record keys holding YYYYMMDD strings that are parsed before insertion.
NSDE_DATE_COLUMNS = (
    "marketing_start_date",
    "marketing_end_date",
    "inactivation_date",
    "reactivation_date",
)

# Single source of truth for column order: both build_values() and
# INSERT_QUERY are derived from it, so values cannot end up in the wrong
# column (previously the start/end dates were listed in swapped order).
NSDE_COLUMNS = NSDE_TEXT_COLUMNS + NSDE_DATE_COLUMNS

# "%s" is the placeholder psycopg2's execute_values() expands into the rows.
INSERT_QUERY = (
    "INSERT INTO spl.nsde (\n    "
    + "\n    ,".join(NSDE_COLUMNS)
    + "\n)\nVALUES %s;"
)


def file_generator(max_num: int) -> Iterator[str]:
    """Yield the NSDE JSON file names for parts 1..max_num.

    Names follow ``other-nsde-000N-of-000M.json`` with both numbers
    zero-padded to four digits.
    """
    for part in range(1, max_num + 1):
        yield "other-nsde-{:0>4}-of-{:0>4}.json".format(part, max_num)


def get_date(result: dict, key: str) -> Optional[dt.datetime]:
    """Parse ``result[key]`` as a ``YYYYMMDD`` string.

    Returns None when the key is missing or its value is empty.
    Raises ValueError when a non-empty value is not in ``YYYYMMDD`` form.
    """
    raw = result.get(key)
    if not raw:
        return None
    return dt.datetime.strptime(raw, "%Y%m%d")


def build_values(result: dict) -> tuple:
    """Convert one NSDE record into a row tuple ordered like NSDE_COLUMNS.

    Missing keys become None; date fields are parsed with get_date().
    """
    text_values = tuple(result.get(column) for column in NSDE_TEXT_COLUMNS)
    date_values = tuple(get_date(result, column) for column in NSDE_DATE_COLUMNS)
    return text_values + date_values


def main() -> None:
    """Insert every record of the three NSDE JSON files into spl.nsde."""
    # Imported here so the helpers above can be imported (and tested)
    # without a database driver installed.
    import psycopg2 as psyco
    from psycopg2.extras import execute_values

    conn = psyco.connect(dbname="aact_db", user="root", host="localhost", password="root")
    try:
        # It would be nice to replace file_generator with something that
        # retrieves and unzips the files directly.
        for filename in file_generator(3):
            print(filename)
            with open("./" + filename, "r", encoding="utf-8") as handle:
                results = json.load(handle)["results"]

            values = [build_values(record) for record in results]

            # `with conn` commits on success and rolls back on error, giving
            # one transaction per file; it does NOT close the connection.
            with conn, conn.cursor() as curse:
                execute_values(curse, INSERT_QUERY, values)
    finally:
        conn.close()


if __name__ == "__main__":
    main()