Created downloader, table, and loader for market data

llm-extraction
youainti 3 years ago
parent dfbd82de54
commit 266c1c9686

2
.gitignore vendored

@ -184,3 +184,5 @@ aact_downloads/
NCT*.html NCT*.html
/Orangebook/EOBZIP_*/ /Orangebook/EOBZIP_*/
/Orangebook/Orangebooks/ /Orangebook/Orangebooks/
*.json
*.zip

@ -0,0 +1,39 @@
-- Rebuilds spl.nsde from scratch: any existing table (and its rows) is dropped,
-- then the id sequence and the table are (re)created and handed to role "root".
-- NOTE(review): assumes the "spl" schema and the "root" role already exist.
DROP TABLE IF EXISTS spl.nsde;
-- Sequence that supplies default values for spl.nsde.id.
CREATE SEQUENCE IF NOT EXISTS spl.nsde_id_seq
INCREMENT 1
START 1
MINVALUE 1
MAXVALUE 9223372036854775807
CACHE 1;
ALTER SEQUENCE spl.nsde_id_seq
OWNER TO root;
CREATE TABLE IF NOT EXISTS spl.nsde
(
-- Surrogate key filled from spl.nsde_id_seq.
-- NOTE(review): the column is "integer" but the sequence MAXVALUE is the bigint
-- maximum; nextval() beyond the integer range would fail on insert.
id integer NOT NULL DEFAULT nextval('spl.nsde_id_seq'::regclass),
-- Text attributes of an NSDE record; all are nullable.
package_ndc11 character varying(11) COLLATE pg_catalog."default",
application_number_or_citation character varying(25) COLLATE pg_catalog."default",
package_ndc character varying(50) COLLATE pg_catalog."default",
proprietary_name character varying(500) COLLATE pg_catalog."default",
product_type character varying(90) COLLATE pg_catalog."default",
marketing_category character varying(160) COLLATE pg_catalog."default",
dosage_form character varying(155) COLLATE pg_catalog."default",
billing_unit character varying(35) COLLATE pg_catalog."default",
-- Date attributes; all are nullable.
marketing_start_date date,
marketing_end_date date,
inactivation_date date,
reactivation_date date,
CONSTRAINT nsde_pkey PRIMARY KEY (id)
)
TABLESPACE pg_default;
ALTER TABLE IF EXISTS spl.nsde
OWNER to root;
-- if the table is dropped, the sequence is as well
ALTER SEQUENCE spl.nsde_id_seq
OWNED BY spl.nsde.id;

@ -4,7 +4,7 @@
import psycopg2 as psyco import psycopg2 as psyco
conn = psyco.connect(dbname="aact_db", user="root", host="will-office", password="root") conn = psyco.connect(dbname="aact_db", user="root", host="localhost", password="root")
curse = conn.cursor() curse = conn.cursor()

@ -0,0 +1,16 @@
#!/bin/bash
# Download every part of the openFDA NSDE export and unpack it into the
# current directory, leaving other-nsde-000N-of-0003.json files behind.
set -euo pipefail

readonly BASE_URL="https://download.open.fda.gov/other/nsde"
readonly PART_COUNT=3

for ((part = 1; part <= PART_COUNT; part++)); do
    archive="./nsde_${part}.zip"
    remote_name=$(printf 'other-nsde-%04d-of-%04d.json.zip' "${part}" "${PART_COUNT}")

    # --fail makes curl exit non-zero on an HTTP error instead of saving the
    # error page as if it were the archive; set -e then aborts the script.
    curl --fail --location --output "${archive}" "${BASE_URL}/${remote_name}"

    # -o overwrites a JSON file left by an earlier run instead of prompting.
    unzip -o "${archive}"
    rm "${archive}"
done

@ -0,0 +1,85 @@
import json
import psycopg2 as psyco
from psycopg2.extras import execute_values
import datetime as dt
def file_generator(max_num):
    """Yield the NSDE JSON file name of each part, from 1 through ``max_num``.

    Both the part number and the total are left-padded with zeros to four
    characters, e.g. ``other-nsde-0001-of-0003.json``.
    """
    template = "other-nsde-{:0>4}-of-{:0>4}.json"
    yield from (
        template.format(part, max_num) for part in range(1, max_num + 1)
    )
def get_date(result, key):
    """Parse ``result[key]`` as a ``YYYYMMDD`` string.

    Returns a ``datetime.datetime`` at midnight of that day, or ``None`` when
    the key is absent or its value is falsy (``None``, empty string).
    Raises ``ValueError`` if a non-empty value does not match ``%Y%m%d``.
    """
    raw = result.get(key)
    return dt.datetime.strptime(raw, "%Y%m%d") if raw else None
def build_values(result):
    """Flatten one NSDE record (a dict) into a 12-tuple of column values.

    The first eight entries are taken from the record as-is (``None`` when the
    key is missing); the last four are parsed with ``get_date``. The position
    of each entry is fixed, so any INSERT fed from this tuple must list its
    columns in the same order as the field names below.
    """
    plain_fields = (
        "proprietary_name",
        "application_number_or_citation",
        "product_type",
        "package_ndc",
        "marketing_category",
        "package_ndc11",
        "dosage_form",
        "billing_unit",
    )
    date_fields = (
        "marketing_start_date",
        "marketing_end_date",
        "inactivation_date",
        "reactivation_date",
    )
    plain_values = tuple(result.get(field) for field in plain_fields)
    date_values = tuple(get_date(result, field) for field in date_fields)
    return plain_values + date_values
if __name__ == "__main__":
    # The column list must follow the tuple order produced by build_values():
    # marketing_start_date comes before marketing_end_date. Listing them the
    # other way round stores every start date in the end-date column.
    query = """
    INSERT INTO spl.nsde (
    proprietary_name
    ,application_number_or_citation
    ,product_type
    ,package_ndc
    ,marketing_category
    ,package_ndc11
    ,dosage_form
    ,billing_unit
    ,marketing_start_date
    ,marketing_end_date
    ,inactivation_date
    ,reactivation_date
    )
    VALUES %s;
    """
    # psycopg2's "with connection" only commits or rolls back; it does not close
    # the connection. Open it once and close it explicitly when done.
    con = psyco.connect(dbname="aact_db", user="root", host="localhost", password="root")
    try:
        for x in file_generator(3):
            #It would be nice to replace this^^ file_generator with something that retrieves and unzips the files directly.
            # One transaction per file: committed on success, rolled back on error.
            with (
                con,
                con.cursor() as curse,
                open("./" + x, "r", encoding="utf-8") as j
            ):
                print(x)
                results = json.load(j)["results"]
                values = [build_values(y) for y in results]
                execute_values(curse, query, values)
    finally:
        con.close()
Loading…
Cancel
Save