# ClinicalTrialsDataProcessing/scripts/drugtools/download_and_extract_nsde.py

import json
from psycopg2.extras import execute_values
import datetime as dt
from drugtools.env_setup import postgres_conn, ENV
import requests
import zipfile
import io

# Base URL that the NSDE zip filenames are appended to when downloading.
URL_STEM = 'https://download.open.fda.gov/other/nsde/'
# Total number of NSDE part files to fetch, read from the ENV mapping and
# coerced to int (it also appears in every generated filename).
NUMBER_OF_NSDE_FILES = int(ENV["NUMBER_OF_NSDE_FILES"])

def filename_generator(max_num):
    """Yield the NSDE part filenames ``1..max_num`` in order.

    Each name has the form ``other-nsde-XXXX-of-YYYY.json.zip`` where XXXX is
    the part index and YYYY is ``max_num``, both left-padded with zeros to at
    least four characters. Nothing is yielded when ``max_num`` is below 1.
    """
    template = "other-nsde-{:0>4}-of-{:0>4}.json.zip"
    for part_index in range(1, max_num + 1):
        yield template.format(part_index, max_num)

def get_date(result, key):
    """Return ``result[key]`` parsed as a ``%Y%m%d`` datetime, or ``None``.

    ``None`` is returned when the key is absent or its value is falsy
    (for example an empty string). A non-empty value that does not match
    the ``%Y%m%d`` format makes ``strptime`` raise ``ValueError``.
    """
    raw = result.get(key)
    return dt.datetime.strptime(raw, "%Y%m%d") if raw else None

def build_values(result):
    """Convert one NSDE result mapping into a 12-element row tuple.

    The first eight elements are looked up with ``result.get`` and passed
    through unchanged (``None`` when missing); the last four are date fields
    converted with ``get_date``. The element order is:
    proprietary_name, application_number_or_citation, product_type,
    package_ndc, marketing_category, package_ndc11, dosage_form,
    billing_unit, marketing_start_date, marketing_end_date,
    inactivation_date, reactivation_date.
    """
    # Fields copied through as-is.
    plain_keys = (
        "proprietary_name",
        "application_number_or_citation",
        "product_type",
        "package_ndc",
        "marketing_category",
        "package_ndc11",
        "dosage_form",
        "billing_unit",
    )
    # Fields whose values are adjusted to datetimes (or None).
    date_keys = (
        "marketing_start_date",
        "marketing_end_date",
        "inactivation_date",
        "reactivation_date",
    )

    plain_part = tuple(result.get(name) for name in plain_keys)
    date_part = tuple(get_date(result, name) for name in date_keys)
    return plain_part + date_part

def download_and_extract_zip(base_url, filename, timeout=60):
    """Download a zip archive and return the bytes of its first member.

    Args:
        base_url: URL prefix; ``filename`` is appended to it verbatim.
        filename: Name of the zip file to fetch.
        timeout: Value passed to ``requests.get`` as ``timeout`` so that a
            stalled server cannot hang the script indefinitely.

    Returns:
        The raw bytes of the first entry in the archive, or ``None`` when the
        archive contains no entries.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        zipfile.BadZipFile: If the response body is not a valid zip archive.
    """
    response = requests.get(base_url + filename, timeout=timeout)
    # Surface HTTP errors here instead of handing an error page to ZipFile,
    # which would otherwise fail with a misleading BadZipFile.
    response.raise_for_status()

    with zipfile.ZipFile(io.BytesIO(response.content)) as the_zip:
        members = the_zip.infolist()
        if not members:
            return None
        # Only the first member is read; any further members are ignored.
        return the_zip.read(members[0])

def run():
    """Download every NSDE part file and insert its records into spl.nsde.

    For each filename produced by ``filename_generator(NUMBER_OF_NSDE_FILES)``
    the archive is downloaded from ``URL_STEM``, the ``"results"`` list of its
    JSON content is converted to row tuples with ``build_values``, and the
    rows are inserted with ``execute_values``. Each file is inserted inside
    its own ``postgres_conn()`` context.
    """
    # Loop-invariant, so it is built once. The column order must match the
    # tuple order returned by build_values.
    query = """
INSERT INTO spl.nsde (
    proprietary_name
    ,application_number_or_citation
    ,product_type
    ,package_ndc
    ,marketing_category
    ,package_ndc11
    ,dosage_form
    ,billing_unit
    ,marketing_start_date
    ,marketing_end_date
    ,inactivation_date
    ,reactivation_date
)
VALUES %s;
"""

    for filename in filename_generator(NUMBER_OF_NSDE_FILES):
        # It would be nice to replace filename_generator with something that
        # retrieves and unzips the files directly.
        print(filename)

        # Download, parse and convert before touching the database so that
        # no connection is held open while waiting on the network.
        payload = download_and_extract_zip(URL_STEM, filename)
        results = json.loads(payload)["results"]
        values = [build_values(record) for record in results]

        # NOTE(review): if postgres_conn() returns a raw psycopg2 connection,
        # leaving this with-block ends the transaction but does not close the
        # connection - confirm whether it should be closed explicitly.
        with postgres_conn() as con, con.cursor() as curse:
            execute_values(curse, query, values)


# Only start the download-and-insert job when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    run()