diff --git a/Orangebook/CidParser.py b/Orangebook/CidParser.py
new file mode 100644
index 0000000..1b45db0
--- /dev/null
+++ b/Orangebook/CidParser.py
@@ -0,0 +1,93 @@
+# Adobe PDF Character ID (cid:\d+) parser
+# The purpose is to allow someone to create their own lookup table equivalent to the "/ToUnicode"
+# map that should be provided in every PDF using CIDs (but is often mangled).
+
+def get_digits(string):
+    """
+    Extract the leading digits from the remainder of a cid tag.
+
+    `string` is the text that follows the "(cid:" prefix, e.g. "23)rest".
+    Everything before the first ")" is parsed as the CID; if there is no ")"
+    at all, the whole string is parsed.
+
+    Returns a tuple (cid, length): the CID as an int, and the number of
+    characters that were parsed (the closing parenthesis is not counted).
+
+    Raises ValueError if the text before the first ")" is not a valid integer.
+    """
+    # partition stops at the first ")" instead of splitting the whole remainder
+    digits, _, _ = string.partition(")")
+    return int(digits), len(digits)
+
+def token_generator(string):
+    """
+    A generator that yields tokens describing a string extracted from a PDF.
+    Tokens take two forms:
+    - Integers: these represent CID codes, taken from "(cid:N)" tags.
+    - Characters: these represent the arbitrary characters often returned amidst cids.
+
+    It is a python generator because that simplifies the ordering and allows us to avoid recursion.
+
+    Text that starts with "(cid:" but does not contain a valid integer
+    (e.g. "(cid:x)") is not treated as a tag; it is yielded character by character.
+    """
+    position = 0
+    str_len = len(string)
+
+    while position < str_len:
+        # A tag is only attempted when at least 7 characters remain:
+        # "(cid:" + one digit + ")".
+        if str_len - position > 6 and string.startswith("(cid:", position):
+            # Hand get_digits only this tag's text rather than the entire rest of the line.
+            close = string.find(")", position + 5)
+            tag_end = str_len if close == -1 else close + 1
+            try:
+                num, length = get_digits(string[position + 5:tag_end])
+            except ValueError:
+                # Not a real cid tag: fall through and emit "(" as a plain character.
+                pass
+            else:
+                # 5 characters of "(cid:", the digits, and the closing ")"
+                position += length + 6
+                yield num
+                continue
+
+        yield string[position]
+        position += 1
+
+
+class UnknownSymbol:
+    """
+    Placeholder for a token that the parser's lookup table does not contain.
+
+    The original token is kept in `symbol` so it can be inspected later.
+    """
+
+    def __init__(self, symbol):
+        self.symbol = symbol
+
+    def __repr__(self):
+        # Shows the raw token and its type, so an int CID can be told apart from a character.
+        return f"UnknownSymbol: {self.symbol} of type {type(self.symbol)}"
+
+    def __str__(self):
+        # U+FFFD REPLACEMENT CHARACTER
+        return "\uFFFD"
+
+class Parser:
+    """
+    Translates from tokens to characters or strings, handling errors as it goes.
+
+    It requires a lookup table (e.g. a dictionary) during instantiation.
+    This table is indexed by token to perform lookups.
+
+    It exposes 3 methods
+    - convert attempts to convert a single token.
+    - convert_stream (also available as convert_list) lazily converts an iterable of tokens
+      into an iterable of converted values.
+    - convert_list_of_strings tokenizes each string containing cids and other symbols,
+      converts it, and prints
+        - a string, if every converted value is a string.
+        - a list, containing characters and UnknownSymbols, otherwise.
+    """
+
+    def __init__(self, lookup_table):
+        self._lookup_table = lookup_table
+
+    def convert(self, token):
+        """Return the lookup-table entry for `token`, or an UnknownSymbol wrapping it."""
+        try:
+            return self._lookup_table[token]
+        except (LookupError, TypeError):
+            # LookupError: token is not in the table; TypeError: token cannot be used as a key.
+            return UnknownSymbol(token)
+
+    def convert_stream(self, token_stream):
+        """Lazily yield the converted value of every token in `token_stream`."""
+        for token in token_stream:
+            yield self.convert(token)
+
+    # Original method name, kept so existing callers continue to work.
+    convert_list = convert_stream
+
+    def convert_list_of_strings(self, list_of_strings):
+        """
+        Tokenize, convert and print every string in `list_of_strings`.
+
+        Each string is printed as joined text when possible; if the converted values
+        cannot be joined (for instance because one is an UnknownSymbol) the list of
+        values is printed instead.
+        """
+        for raw_string in list_of_strings:
+            converted = list(self.convert_stream(token_generator(raw_string)))
+            try:
+                text = "".join(converted)
+            except TypeError:
+                # str.join only accepts str items
+                print(converted)
+            else:
+                print(text)
+
+if __name__ == "__main__":
+    # Placeholder entry point: running the module as a script only reports that
+    # command-line processing has not been implemented.
+    print("Plan was to accept and proceess a symbol table and text. Apparently it has not been implemented.")
\ No newline at end of file
diff --git a/Orangebook/Untitled.ipynb b/Orangebook/Untitled.ipynb
new file mode 100644
index 0000000..1a38498
--- /dev/null
+++ b/Orangebook/Untitled.ipynb
@@ -0,0 +1,371 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "40358f02-c376-4431-be39-cdd477f17e7a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import polars as pl"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "id": "8fb27ee2-72c1-4e80-9d00-de54f2834fe8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "polars.datatypes.Datetime"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pl.datatypes.Datetime"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "id": "2c0edd77-c2d0-4184-a094-8c01783d2f0e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "products = pl.scan_csv(file=\"./EOBZIP_2022_04/products.txt\", sep=\"~\")\n",
+ "patents = pl.scan_csv(file=\"./EOBZIP_2022_04/patent.txt\", sep=\"~\")\n",
+ "exclusivity = pl.scan_csv(file=\"./EOBZIP_2022_04/exclusivity.txt\", sep=\"~\", parse_dates=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "id": "023f211d-23aa-4a2c-843d-1b60cec91079",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def set_exclusivity_types(df):\n",
+ " return df.with_columns([\n",
+ " pl.col(\"Exclusivity_Date\").str.strptime(pl.Date, fmt=\"%b %-d, %Y\")\n",
+ " ])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "id": "a1da42c9-e47a-4437-b089-e9b91f789a0c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ "\n",
+ "\n",
+ "| \n",
+ "Appl_Type\n",
+ " | \n",
+ "\n",
+ "Appl_No\n",
+ " | \n",
+ "\n",
+ "Product_No\n",
+ " | \n",
+ "\n",
+ "Exclusivity_Code\n",
+ " | \n",
+ "\n",
+ "Exclusivity_Date\n",
+ " | \n",
+ "
\n",
+ "\n",
+ "| \n",
+ "str\n",
+ " | \n",
+ "\n",
+ "i64\n",
+ " | \n",
+ "\n",
+ "i64\n",
+ " | \n",
+ "\n",
+ "str\n",
+ " | \n",
+ "\n",
+ "date\n",
+ " | \n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ "| \n",
+ "\"N\"\n",
+ " | \n",
+ "\n",
+ "11366\n",
+ " | \n",
+ "\n",
+ "2\n",
+ " | \n",
+ "\n",
+ "\"ODE-96\"\n",
+ " | \n",
+ "\n",
+ "2022-08-07\n",
+ " | \n",
+ "
\n",
+ "\n",
+ "| \n",
+ "\"N\"\n",
+ " | \n",
+ "\n",
+ "20287\n",
+ " | \n",
+ "\n",
+ "11\n",
+ " | \n",
+ "\n",
+ "\"NPP\"\n",
+ " | \n",
+ "\n",
+ "2022-05-16\n",
+ " | \n",
+ "
\n",
+ "\n",
+ "| \n",
+ "\"N\"\n",
+ " | \n",
+ "\n",
+ "20287\n",
+ " | \n",
+ "\n",
+ "10\n",
+ " | \n",
+ "\n",
+ "\"NPP\"\n",
+ " | \n",
+ "\n",
+ "2022-05-16\n",
+ " | \n",
+ "
\n",
+ "\n",
+ "| \n",
+ "\"N\"\n",
+ " | \n",
+ "\n",
+ "20287\n",
+ " | \n",
+ "\n",
+ "9\n",
+ " | \n",
+ "\n",
+ "\"NPP\"\n",
+ " | \n",
+ "\n",
+ "2022-05-16\n",
+ " | \n",
+ "
\n",
+ "\n",
+ "| \n",
+ "\"N\"\n",
+ " | \n",
+ "\n",
+ "20287\n",
+ " | \n",
+ "\n",
+ "8\n",
+ " | \n",
+ "\n",
+ "\"NPP\"\n",
+ " | \n",
+ "\n",
+ "2022-05-16\n",
+ " | \n",
+ "
\n",
+ "\n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "shape: (5, 5)\n",
+ "┌───────────┬─────────┬────────────┬──────────────────┬──────────────────┐\n",
+ "│ Appl_Type ┆ Appl_No ┆ Product_No ┆ Exclusivity_Code ┆ Exclusivity_Date │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
+ "│ str ┆ i64 ┆ i64 ┆ str ┆ date │\n",
+ "╞═══════════╪═════════╪════════════╪══════════════════╪══════════════════╡\n",
+ "│ N ┆ 11366 ┆ 2 ┆ ODE-96 ┆ 2022-08-07 │\n",
+ "├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n",
+ "│ N ┆ 20287 ┆ 11 ┆ NPP ┆ 2022-05-16 │\n",
+ "├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n",
+ "│ N ┆ 20287 ┆ 10 ┆ NPP ┆ 2022-05-16 │\n",
+ "├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n",
+ "│ N ┆ 20287 ┆ 9 ┆ NPP ┆ 2022-05-16 │\n",
+ "├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n",
+ "│ N ┆ 20287 ┆ 8 ┆ NPP ┆ 2022-05-16 │\n",
+ "└───────────┴─────────┴────────────┴──────────────────┴──────────────────┘"
+ ]
+ },
+ "execution_count": 61,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "exclusivity.pipe(set_exclusivity_types).head(5).collect()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "id": "92fe99fa-1963-460c-99ea-7f614b4b2e25",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def set_patent_types(df):\n",
+ " return df.with_columns([\n",
+ " pl.col(\"Patent_Expire_Date_Text\").str.strptime(pl.Date, fmt=\"%b %-d, %Y\"),\n",
+ " pl.col(\"Submission_Date\").str.strptime(pl.Date, fmt=\"%b %-d, %Y\"),\n",
+ " pl.col(\"Drug_Substance_Flag\") == \"Y\",\n",
+ " pl.col(\"Drug_Product_Flag\") == \"Y\",\n",
+ " pl.col(\"Delist_Flag\") == \"Y\"\n",
+ " ])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 90,
+ "id": "13707ca6-094f-4ed7-94cb-824087e97874",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ "\n",
+ "\n",
+ "| \n",
+ "Patent_Expire_Date_Text\n",
+ " | \n",
+ "
\n",
+ "\n",
+ "| \n",
+ "date\n",
+ " | \n",
+ "
\n",
+ "\n",
+ "\n",
+ "\n",
+ "| \n",
+ "2022-01-02\n",
+ " | \n",
+ "
\n",
+ "\n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "shape: (1, 1)\n",
+ "┌─────────────────────────┐\n",
+ "│ Patent_Expire_Date_Text │\n",
+ "│ --- │\n",
+ "│ date │\n",
+ "╞═════════════════════════╡\n",
+ "│ 2022-01-02 │\n",
+ "└─────────────────────────┘"
+ ]
+ },
+ "execution_count": 90,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "patents.pipe(set_patent_types).select(\"Patent_Expire_Date_Text\").min().collect()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 81,
+ "id": "18ad8df7-45d5-4454-8955-c5f28a7d7f1e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "polars.datatypes.Null"
+ ]
+ },
+ "execution_count": 81,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pl.datatypes.Null"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "79e4b3d9-29ae-4302-bee1-4be02e0ba654",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/Orangebook/Untitled2.ipynb b/Orangebook/Untitled2.ipynb
new file mode 100644
index 0000000..7a1679d
--- /dev/null
+++ b/Orangebook/Untitled2.ipynb
@@ -0,0 +1,216 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 196,
+ "id": "2f61df31-f3c1-4b2e-ae96-96bf06089b17",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_digits(string):\n",
+ " splat = string.split(\")\")\n",
+ " num = splat[0]\n",
+ " l = len(num)\n",
+ " return int(num),l\n",
+ "\n",
+ "def token_generator(string):\n",
+ " \n",
+ " start = 0\n",
+ " str_len = len(string)\n",
+ " \n",
+ " \n",
+ " while start < str_len:\n",
+ " substring = string[start:]\n",
+ " \n",
+ " #check for cid\n",
+ " if (str_len - start > 6) and (substring[0:5] == \"(cid:\"):\n",
+ " \n",
+ " num,length = get_digits(substring[5:])\n",
+ " start += length + 6\n",
+ " yield num\n",
+ " \n",
+ " elif (str_len - start > 1):\n",
+ " start += 1\n",
+ " yield substring[0]\n",
+ " else:\n",
+ " start += 1\n",
+ " yield substring\n",
+ "\n",
+ "class UnknownSymbol():\n",
+ " def __init__(self, symbol):\n",
+ " self.symbol = symbol\n",
+ " \n",
+ " def __repr__(self):\n",
+ " return \"UnknownSymbol: {} of type {}\".format(self.symbol, type(self.symbol))\n",
+ " \n",
+ " def __str__(self):\n",
+ " return \"\\uFFFD\"\n",
+ "\n",
+ "class Parser:\n",
+ " def __init__(self, lookup_table):\n",
+ " self._lookup_table = lookup_table\n",
+ " \n",
+ " def convert(self,symbol):\n",
+ " try:\n",
+ " return self._lookup_table[symbol]\n",
+ " except:\n",
+ " return UnknownSymbol(symbol)\n",
+ " \n",
+ " def convert_stream(self,token_stream):\n",
+ " for token in token_stream:\n",
+ " yield self.convert(token)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 213,
+ "id": "e2c1e39b-0ac5-4ad7-9176-ef8ea69feeec",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ob2020 = Parser({\n",
+ " 23:\"4\"\n",
+ " ,19:\"0\"\n",
+ " ,\"7\":\"T\"\n",
+ " ,\"+\":\"H\"\n",
+ " ,3:\" \"\n",
+ " ,\"(\":\"E\"\n",
+ " ,\"’\":\"D\"\n",
+ " ,\",\":\"I\"\n",
+ " ,\"2\":\"O\"\n",
+ " ,\"1\":\"N\"\n",
+ " ,16:\"-\"\n",
+ " ,21:\"2\"\n",
+ " ,\"$\":\"A\"\n",
+ " ,\"3\":\"P\"\n",
+ " ,\"5\":\"R\"\n",
+ " ,\"9\":\"V\"\n",
+ " ,\"8\":\"U\"\n",
+ " ,\"*\":\"G\"\n",
+ " ,\"&\":\"C\"\n",
+ " ,\"/\":\"L\"\n",
+ " ,\"6\":\"S\"\n",
+ " ,22:\"3\"\n",
+ " ,20:\"1\"\n",
+ " ,11:\" \"\n",
+ " ,\"R\":\"(\"\n",
+ " ,\"I\":\"of\"\n",
+ " ,24:\"5\"\n",
+ " ,12:\")\"\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 214,
+ "id": "c02896ab-fc75-44cc-bb27-a5dcf1b6d7f0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'40TH EDITION - 2020 - APPROVED DRUG PRODUCT LIST'"
+ ]
+ },
+ "execution_count": 214,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\"\".join([x for x in ob2020.convert_stream(token_generator(b))])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 215,
+ "id": "1794e826-fa1f-4aba-8eab-6d603a06dfe0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "c = \"35(6&5,37,21(cid:3)’58*(cid:3)352’8&7(cid:3)/,67(cid:3)\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 216,
+ "id": "c8b67d79-81ad-4b3a-be8b-bace9a8d943a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'PRESCRIPTION DRUG PRODUCT LIST '"
+ ]
+ },
+ "execution_count": 216,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\"\".join([x for x in ob2020.convert_stream(token_generator(c))])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 217,
+ "id": "f76d9760-fe69-4743-ab47-41cf74866d70",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "d = \"(cid:22)(cid:16)(cid:20)(cid:11)RI(cid:3)(cid:23)(cid:24)(cid:22)(cid:12)\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 218,
+ "id": "ee925997-0701-4d8e-b713-6f39c6a50a5b",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['3', '-', '1', ' ', '(', 'of', ' ', '4', '5', '3', ')']"
+ ]
+ },
+ "execution_count": 218,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "[x for x in ob2020.convert_stream(token_generator(d))]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ae152c75-5fd6-4756-a473-fcea2de5ee30",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/Orangebook/cid_parser.ipynb b/Orangebook/cid_parser.ipynb
new file mode 100644
index 0000000..6a1266b
--- /dev/null
+++ b/Orangebook/cid_parser.ipynb
@@ -0,0 +1,1066 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "412f6c09-9f40-46e3-9642-720946c93ba8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pdfminer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "967257f2-dae8-4789-8306-539575d55f57",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "20220506\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(pdfminer.__version__)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "5f56326c-70c4-4e2c-a11b-75b32c231842",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pdfminer.converter import TextConverter\n",
+ "from pdfminer.layout import LAParams\n",
+ "from pdfminer.pdfdocument import PDFDocument\n",
+ "from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter\n",
+ "from pdfminer.pdfpage import PDFPage\n",
+ "from pdfminer.pdfparser import PDFParser\n",
+ "\n",
+ "from io import StringIO"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "c6c619b1-f897-4c1f-892f-a294b8ab0338",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "20220506\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(pdfminer.__version__)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "7b4c4555-57cc-4109-a937-f26fea9afae3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "output_string = StringIO()\n",
+ "\n",
+ "with open(\"/home/will/research/ClinicalTrialsDataProcessing/Orangebook/Orangebooks/testprint4.pdf\", \"rb\") as file_handle:\n",
+ " parser = PDFParser(file_handle)\n",
+ " doc = PDFDocument(parser)\n",
+ " \n",
+ " rsrcmgr = PDFResourceManager()\n",
+ " device = TextConverter(rsrcmgr, output_string, laparams=LAParams())\n",
+ " interpreter = PDFPageInterpreter(rsrcmgr, device)\n",
+ " \n",
+ " for page in PDFPage.create_pages(doc):\n",
+ " interpreter.process_page(page)\n",
+ "\n",
+ "pdfminer_lines = output_string.getvalue().splitlines()\n",
+ "pdftotext_lines = [ln for ln in pdfminer_lines if ln]\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "7315468d-a918-4365-9008-661fca836344",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def get_digits(string):\n",
+ " splat = string.split(\")\")\n",
+ " num = splat[0]\n",
+ " l = len(num)\n",
+ " return int(num),l\n",
+ "\n",
+ "def token_generator(string):\n",
+ " \n",
+ " start = 0\n",
+ " str_len = len(string)\n",
+ " \n",
+ " \n",
+ " while start < str_len:\n",
+ " substring = string[start:]\n",
+ " \n",
+ " #check for cid\n",
+ " if (str_len - start > 6) and (substring[0:5] == \"(cid:\"):\n",
+ " \n",
+ " num,length = get_digits(substring[5:])\n",
+ " start += length + 6\n",
+ " yield num\n",
+ " \n",
+ " elif (str_len - start > 1):\n",
+ " start += 1\n",
+ " yield substring[0]\n",
+ " else:\n",
+ " start += 1\n",
+ " yield substring\n",
+ "\n",
+ "class UnknownSymbol():\n",
+ " def __init__(self, symbol):\n",
+ " self.symbol = symbol\n",
+ " \n",
+ " def __repr__(self):\n",
+ " return \"UnknownSymbol: {} of type {}\".format(self.symbol, type(self.symbol))\n",
+ " \n",
+ " def __str__(self):\n",
+ " return \"\\uFFFD\"\n",
+ "\n",
+ "class Parser:\n",
+ " def __init__(self, lookup_table):\n",
+ " self._lookup_table = lookup_table\n",
+ " \n",
+ " def convert(self,symbol):\n",
+ " try:\n",
+ " return self._lookup_table[symbol]\n",
+ " except:\n",
+ " return UnknownSymbol(symbol)\n",
+ " \n",
+ " def convert_stream(self,token_stream):\n",
+ " for token in token_stream:\n",
+ " yield self.convert(token)\n",
+ " \n",
+ " def check_stream(self, pdftotext_lines):\n",
+ " for entry in pdftotext_lines:\n",
+ " arr = [x for x in ob2020.convert_stream(token_generator(entry))]\n",
+ " try:\n",
+ " print(\"\".join(arr))\n",
+ " except:\n",
+ " print(arr)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "826d0009-120e-4e04-b90b-6585308e69d3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ob2020 = Parser({\n",
+ " 23:\"4\"\n",
+ " ,19:\"0\"\n",
+ " ,\"7\":\"T\"\n",
+ " ,\"+\":\"H\"\n",
+ " ,3:\" \"\n",
+ " ,\"(\":\"E\"\n",
+ " ,\"’\":\"D\"\n",
+ " ,\",\":\"I\"\n",
+ " ,\"2\":\"O\"\n",
+ " ,\"1\":\"N\"\n",
+ " ,16:\"-\"\n",
+ " ,21:\"2\"\n",
+ " ,\"$\":\"A\"\n",
+ " ,\"3\":\"P\"\n",
+ " ,\"5\":\"R\"\n",
+ " ,\"9\":\"V\"\n",
+ " ,\"8\":\"U\"\n",
+ " ,\"*\":\"G\"\n",
+ " ,\"&\":\"C\"\n",
+ " ,\"/\":\"L\"\n",
+ " ,\"6\":\"S\"\n",
+ " ,22:\"3\"\n",
+ " ,20:\"1\"\n",
+ " ,11:\"(\"\n",
+ " ,\"R\":\"o\"\n",
+ " ,\"I\":\"f\"\n",
+ " ,24:\"5\"\n",
+ " ,12:\")\"\n",
+ " ,\" \":\"👨🏻🚀\"\n",
+ " ,\"%\":\"B\"\n",
+ " ,\")\":\"F\"\n",
+ " ,30:\";\"\n",
+ " ,\"0\":\"M\"\n",
+ " ,\"4\":\"Q\"\n",
+ " ,18:\"/\"\n",
+ " ,26:\"7\"\n",
+ " ,28:\"9\"\n",
+ " ,\"D\":\"a\"\n",
+ " ,\"U\":\"r\"\n",
+ " ,15:\",\"\n",
+ " ,27:\"8\"\n",
+ " ,\"H\":\"e\"\n",
+ " ,\"S\":\"p\"\n",
+ " ,25:\"6\"\n",
+ " ,\"=\":\"Z\"\n",
+ " ,14:\"+\"\n",
+ " ,4:\"!\"\n",
+ " ,\"F\":\"c\"\n",
+ " ,\";\":\"X\"\n",
+ " ,\"<\":\"Y\"\n",
+ " ,\"Y\":\"v\"\n",
+ " ,\"-\":\"J\"\n",
+ " ,\"X\":\"u\"\n",
+ " ,\"Q\":\"n\"\n",
+ " ,\"W\":\"t\"\n",
+ " ,\"J\":\"g\"\n",
+ " ,\".\":\"K\"\n",
+ " ,\":\":\"W\"\n",
+ " ,17:\".\"\n",
+ " ,\"O\":\"l\"\n",
+ " ,\"E\":\"b\"\n",
+ " ,\"\\\\\":\"y\"\n",
+ " ,8:\"%\"\n",
+ " ,\"L\":\"i\"\n",
+ " ,\"P\":\"m\"\n",
+ " ,10:\"'\"\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "58327a77-029b-4c05-b410-45d1a539ad3e",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "39TH EDITION - 2019 - APPROVED DRUG PRODUCT LIST👨🏻🚀👨🏻🚀\n",
+ "PRESCRIPTION DRUG PRODUCT LIST \n",
+ "['c', 'a', UnknownSymbol: A of type , '👨🏻\\u200d🚀', UnknownSymbol: d of type , UnknownSymbol: a of type , UnknownSymbol: t of type , UnknownSymbol: a of type , '👨🏻\\u200d🚀', UnknownSymbol: s of type , UnknownSymbol: u of type , UnknownSymbol: p of type , UnknownSymbol: p of type , UnknownSymbol: l of type , UnknownSymbol: i of type , UnknownSymbol: e of type , UnknownSymbol: d of type , '👨🏻\\u200d🚀', UnknownSymbol: b of type , UnknownSymbol: y of type , '👨🏻\\u200d🚀', 'a', UnknownSymbol: r of type , UnknownSymbol: u of type , UnknownSymbol: g of type , 'm', UnknownSymbol: a of type , UnknownSymbol: t of type , UnknownSymbol: e of type , UnknownSymbol: n of type , UnknownSymbol: t of type , 't', UnknownSymbol: a of type , UnknownSymbol: t of type , UnknownSymbol: c of type , UnknownSymbol: h of type , 'K', UnknownSymbol: c of type , UnknownSymbol: o of type , UnknownSymbol: m of type ]\n",
+ "['3', '-', '1', 'E', UnknownSymbol: o of type , UnknownSymbol: f of type , '👨🏻\\u200d🚀', 'Q', 'R', 'O', 'F', '👨🏻\\u200d🚀', '👨🏻\\u200d🚀']\n",
+ "ABACAVIR SULFATE👨🏻🚀👨🏻🚀\n",
+ "SOLUTION;ORAL👨🏻🚀\n",
+ "ABACAVIR SULFATE👨🏻🚀👨🏻🚀\n",
+ "AA \n",
+ "AA \n",
+ "AUROBINDO PHARMA \n",
+ "LTD👨🏻🚀👨🏻🚀\n",
+ "HETERO LABS LTD III 👨🏻🚀EQ 20MG BASE/ML \n",
+ "EQ 20MG BASE/ML \n",
+ "A077950 001 👨🏻🚀Mar 14, 2018👨🏻🚀\n",
+ "A201107 001 👨🏻🚀Sep 26, 2016👨🏻🚀\n",
+ "AA \n",
+ "ZIAGEN👨🏻🚀👨🏻🚀\n",
+ "+! \n",
+ "TABLET;ORAL👨🏻🚀\n",
+ "VIIV HLTHCARE \n",
+ "EQ 20MG BASE/ML \n",
+ "N020978 001 👨🏻🚀Dec 17, 1998👨🏻🚀👨🏻🚀\n",
+ "ABACAVIR SULFATE👨🏻🚀👨🏻🚀\n",
+ "APOTEX INC \n",
+ "AUROBINDO PHARMA \n",
+ "LTD👨🏻🚀👨🏻🚀\n",
+ "EQ 300MG BASE \n",
+ "CIPLA \n",
+ "HETERO LABS LTD III 👨🏻🚀EQ 300MG BASE \n",
+ "EQ 300MG BASE \n",
+ "MYLAN PHARMS INC \n",
+ "EQ 300MG BASE \n",
+ "STRIDES PHARMA \n",
+ "EQ 300MG BASE \n",
+ "EQ 300MG BASE \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "ZIAGEN👨🏻🚀👨🏻🚀\n",
+ "+! \n",
+ "AB \n",
+ "ABACAVIR SULFATE; DOLUTEGRAVIR SODIUM; LAMIVUDINE👨🏻🚀👨🏻🚀\n",
+ "VIIV HLTHCARE \n",
+ "EQ 300MG BASE \n",
+ "A201570 001 👨🏻🚀Dec 17, 2012👨🏻🚀\n",
+ "A077844 001 👨🏻🚀Dec 17, 2012👨🏻🚀\n",
+ "A078119 001 👨🏻🚀Nov 21, 2017👨🏻🚀\n",
+ "A091560 001 👨🏻🚀Sep 13, 2013👨🏻🚀\n",
+ "A091294 001 👨🏻🚀Jun 18, 2012👨🏻🚀\n",
+ "A091050 001 👨🏻🚀Oct 28, 2016👨🏻🚀\n",
+ "N020977 001 👨🏻🚀Dec 17, 1998👨🏻🚀👨🏻🚀\n",
+ "TABLET;ORAL👨🏻🚀👨🏻🚀\n",
+ "TRIUMEQ👨🏻🚀👨🏻🚀\n",
+ "+! \n",
+ "VIIV HLTHCARE \n",
+ "ABACAVIR SULFATE; LAMIVUDINE👨🏻🚀👨🏻🚀\n",
+ "TABLET;ORAL👨🏻🚀\n",
+ "EQ 600MG BASE;EQ 50MG BASE;300MG \n",
+ "N205551 001 👨🏻🚀Aug 22, 2014👨🏻🚀👨🏻🚀\n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "ABACAVIR SULFATE AND LAMIVUDINE👨🏻🚀👨🏻🚀\n",
+ "AUROBINDO PHARMA \n",
+ "LTD👨🏻🚀👨🏻🚀\n",
+ "EQ 600MG BASE;300MG \n",
+ "CIPLA \n",
+ "LUPIN LTD \n",
+ "TEVA PHARMS USA \n",
+ "ZYDUS PHARMS USA \n",
+ "INC👨🏻🚀👨🏻🚀\n",
+ "EQ 600MG BASE;300MG \n",
+ "EQ 600MG BASE;300MG \n",
+ "EQ 600MG BASE;300MG \n",
+ "EQ 600MG BASE;300MG \n",
+ "EQ 600MG BASE;300MG \n",
+ "EPZICOM👨🏻🚀👨🏻🚀\n",
+ "+! \n",
+ "AB \n",
+ "ABACAVIR SULFATE; LAMIVUDINE; ZIDOVUDINE👨🏻🚀👨🏻🚀\n",
+ "VIIV HLTHCARE \n",
+ "EQ 600MG BASE;300MG \n",
+ "A090159 001 👨🏻🚀Nov 15, 2018👨🏻🚀\n",
+ "A206151 001 👨🏻🚀Mar 28, 2017👨🏻🚀\n",
+ "A091144 001 👨🏻🚀Mar 28, 2017👨🏻🚀\n",
+ "A204990 001 👨🏻🚀Mar 28, 2017👨🏻🚀\n",
+ "A079246 001 👨🏻🚀Sep 29, 2016👨🏻🚀\n",
+ "A208990 001 👨🏻🚀Nov 15, 2018👨🏻🚀\n",
+ "N021652 001 👨🏻🚀Aug 02, 2004👨🏻🚀👨🏻🚀\n",
+ "TABLET;ORAL👨🏻🚀\n",
+ "ABACAVIR SULFATE, LAMIVUDINE AND ZIDOVUDINE👨🏻🚀\n",
+ "AB \n",
+ "LUPIN LTD \n",
+ "EQ 300MG BASE;150MG;300MG \n",
+ "A202912 001 👨🏻🚀Dec 05, 2013👨🏻🚀\n",
+ "TRIZIVIR👨🏻🚀👨🏻🚀\n",
+ "AB \n",
+ "+! \n",
+ "ABALOPARATIDE👨🏻🚀👨🏻🚀\n",
+ "VIIV HLTHCARE \n",
+ "SOLUTION;SUBCUTANEOUS👨🏻🚀👨🏻🚀\n",
+ "TYMLOS👨🏻🚀👨🏻🚀\n",
+ "+! \n",
+ "RADIUS HEALTH INC \n",
+ "ABEMACICLIB👨🏻🚀👨🏻🚀\n",
+ "TABLET;ORAL👨🏻🚀👨🏻🚀\n",
+ "VERZENIO👨🏻🚀👨🏻🚀\n",
+ "ELI LILLY AND CO \n",
+ "+ \n",
+ "+ \n",
+ "+ \n",
+ "+! \n",
+ "ABIRATERONE ACETATE👨🏻🚀👨🏻🚀\n",
+ "TABLET;ORAL👨🏻🚀\n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "ABIRATERONE ACETATE👨🏻🚀👨🏻🚀\n",
+ "AMNEAL PHARMS \n",
+ "APOTEX INC \n",
+ "HIKMA PHARMS \n",
+ "MYLAN PHARMS INC \n",
+ "TEVA PHARMS USA \n",
+ "ZYTIGA👨🏻🚀👨🏻🚀\n",
+ "+ \n",
+ "YONSA👨🏻🚀👨🏻🚀\n",
+ "+! \n",
+ "ZYTIGA👨🏻🚀👨🏻🚀\n",
+ "+! \n",
+ "JANSSEN BIOTECH \n",
+ "SUN PHARMA GLOBAL \n",
+ "JANSSEN BIOTECH \n",
+ "EQ 300MG BASE;150MG;300MG \n",
+ "N021205 001 👨🏻🚀Nov 14, 2000👨🏻🚀👨🏻🚀\n",
+ "3.12MG/1.56ML (2MG/ML) \n",
+ "N208743 001 👨🏻🚀Apr 28, 2017👨🏻🚀👨🏻🚀\n",
+ "50MG \n",
+ "100MG \n",
+ "150MG \n",
+ "200MG \n",
+ "250MG \n",
+ "250MG \n",
+ "250MG \n",
+ "250MG \n",
+ "250MG \n",
+ "250MG \n",
+ "125MG \n",
+ "500MG \n",
+ "N208716 001 👨🏻🚀Sep 28, 2017👨🏻🚀\n",
+ "N208716 002 👨🏻🚀Sep 28, 2017👨🏻🚀\n",
+ "N208716 003 👨🏻🚀Sep 28, 2017👨🏻🚀\n",
+ "N208716 004 👨🏻🚀Sep 28, 2017👨🏻🚀👨🏻🚀\n",
+ "A208327 001 👨🏻🚀Jan 07, 2019👨🏻🚀\n",
+ "A208453 001 👨🏻🚀Oct 31, 2018👨🏻🚀\n",
+ "A208339 001 👨🏻🚀Oct 31, 2018👨🏻🚀\n",
+ "A208446 001 👨🏻🚀Oct 31, 2018👨🏻🚀\n",
+ "A208432 001 👨🏻🚀Oct 31, 2018👨🏻🚀\n",
+ "N202379 001 👨🏻🚀Apr 28, 2011👨🏻🚀\n",
+ "N210308 001 👨🏻🚀May 22, 2018👨🏻🚀\n",
+ "N202379 002 👨🏻🚀Apr 14, 2017👨🏻🚀👨🏻🚀\n",
+ "39TH EDITION - 2019 - APPROVED DRUG PRODUCT LIST👨🏻🚀👨🏻🚀\n",
+ "PRESCRIPTION DRUG PRODUCT LIST \n",
+ "['c', 'a', UnknownSymbol: A of type , '👨🏻\\u200d🚀', UnknownSymbol: d of type , UnknownSymbol: a of type , UnknownSymbol: t of type , UnknownSymbol: a of type , '👨🏻\\u200d🚀', UnknownSymbol: s of type , UnknownSymbol: u of type , UnknownSymbol: p of type , UnknownSymbol: p of type , UnknownSymbol: l of type , UnknownSymbol: i of type , UnknownSymbol: e of type , UnknownSymbol: d of type , '👨🏻\\u200d🚀', UnknownSymbol: b of type , UnknownSymbol: y of type , '👨🏻\\u200d🚀', 'a', UnknownSymbol: r of type , UnknownSymbol: u of type , UnknownSymbol: g of type , 'm', UnknownSymbol: a of type , UnknownSymbol: t of type , UnknownSymbol: e of type , UnknownSymbol: n of type , UnknownSymbol: t of type , 't', UnknownSymbol: a of type , UnknownSymbol: t of type , UnknownSymbol: c of type , UnknownSymbol: h of type , 'K', UnknownSymbol: c of type , UnknownSymbol: o of type , UnknownSymbol: m of type ]\n",
+ "['3', '-', '2', 'E', UnknownSymbol: o of type , UnknownSymbol: f of type , '👨🏻\\u200d🚀', 'Q', 'R', 'O', 'F', '👨🏻\\u200d🚀', '👨🏻\\u200d🚀']\n",
+ "ACALABRUTINIB👨🏻🚀👨🏻🚀\n",
+ "CAPSULE;ORAL👨🏻🚀👨🏻🚀\n",
+ "CALQUENCE👨🏻🚀👨🏻🚀\n",
+ "+! \n",
+ "ASTRAZENECA \n",
+ "100MG \n",
+ "ACAMPROSATE CALCIUM👨🏻🚀👨🏻🚀\n",
+ "TABLET, DELAYED RELEASE;ORAL👨🏻🚀\n",
+ "ACAMPROSATE CALCIUM👨🏻🚀👨🏻🚀\n",
+ "! \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "BARR LABS DIV TEVA \n",
+ "GLENMARK GENERICS \n",
+ "MYLAN PHARMS INC \n",
+ "ZYDUS PHARMS USA \n",
+ "INC👨🏻🚀👨🏻🚀\n",
+ "333MG \n",
+ "333MG \n",
+ "333MG \n",
+ "333MG \n",
+ "ACARBOSE👨🏻🚀👨🏻🚀\n",
+ "TABLET;ORAL👨🏻🚀\n",
+ "ACARBOSE👨🏻🚀👨🏻🚀\n",
+ "EMCURE PHARMS LTD \n",
+ "IMPAX LABS \n",
+ "MYLAN \n",
+ "STRIDES PHARMA \n",
+ "VIRTUS PHARM \n",
+ "WATSON LABS \n",
+ "WEST-WARD PHARMS \n",
+ "INT👨🏻🚀👨🏻🚀\n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "PRECOSE👨🏻🚀👨🏻🚀\n",
+ "+! \n",
+ "+ \n",
+ "+ \n",
+ "BAYER HLTHCARE \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "ACEBUTOLOL HYDROCHLORIDE👨🏻🚀👨🏻🚀\n",
+ "CAPSULE;ORAL👨🏻🚀\n",
+ "ACEBUTOLOL HYDROCHLORIDE👨🏻🚀👨🏻🚀\n",
+ "AMNEAL PHARM \n",
+ "! \n",
+ "! \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "AB \n",
+ "ACETAMINOPHEN👨🏻🚀👨🏻🚀\n",
+ "MYLAN \n",
+ "25MG \n",
+ "50MG \n",
+ "100MG \n",
+ "25MG \n",
+ "50MG \n",
+ "100MG \n",
+ "25MG \n",
+ "50MG \n",
+ "100MG \n",
+ "25MG \n",
+ "50MG \n",
+ "100MG \n",
+ "25MG \n",
+ "50MG \n",
+ "100MG \n",
+ "25MG \n",
+ "50MG \n",
+ "100MG \n",
+ "25MG \n",
+ "50MG \n",
+ "100MG \n",
+ "25MG \n",
+ "50MG \n",
+ "100MG \n",
+ "EQ 200MG BASE \n",
+ "EQ 400MG BASE \n",
+ "EQ 200MG BASE \n",
+ "EQ 400MG BASE \n",
+ "N210259 001 👨🏻🚀Oct 31, 2017👨🏻🚀👨🏻🚀\n",
+ "A200143 001 👨🏻🚀Nov 18, 2013👨🏻🚀\n",
+ "A202229 001 👨🏻🚀Jul 16, 2013👨🏻🚀\n",
+ "A200142 001 👨🏻🚀Mar 11, 2014👨🏻🚀\n",
+ "A205995 001 👨🏻🚀May 26, 2017👨🏻🚀\n",
+ "A202271 001 👨🏻🚀Feb 07, 2012👨🏻🚀\n",
+ "A202271 002 👨🏻🚀Feb 07, 2012👨🏻🚀\n",
+ "A202271 003 👨🏻🚀Feb 07, 2012👨🏻🚀\n",
+ "A078441 001 👨🏻🚀May 14, 2009👨🏻🚀\n",
+ "A078441 002 👨🏻🚀May 14, 2009👨🏻🚀\n",
+ "A078441 003 👨🏻🚀May 14, 2009👨🏻🚀\n",
+ "A091053 001 👨🏻🚀Jan 06, 2011👨🏻🚀\n",
+ "A091053 002 👨🏻🚀Jan 06, 2011👨🏻🚀\n",
+ "A091053 003 👨🏻🚀Jan 06, 2011👨🏻🚀\n",
+ "A090912 001 👨🏻🚀Jul 27, 2011👨🏻🚀\n",
+ "A090912 002 👨🏻🚀Jul 27, 2011👨🏻🚀\n",
+ "A090912 003 👨🏻🚀Jul 27, 2011👨🏻🚀\n",
+ "A091343 001 👨🏻🚀Oct 17, 2013👨🏻🚀\n",
+ "A091343 002 👨🏻🚀Oct 17, 2013👨🏻🚀\n",
+ "A091343 003 👨🏻🚀Oct 17, 2013👨🏻🚀\n",
+ "A077532 001 👨🏻🚀May 07, 2008👨🏻🚀\n",
+ "A077532 002 👨🏻🚀May 07, 2008👨🏻🚀\n",
+ "A077532 003 👨🏻🚀May 07, 2008👨🏻🚀\n",
+ "A078470 001 👨🏻🚀May 07, 2008👨🏻🚀\n",
+ "A078470 002 👨🏻🚀May 07, 2008👨🏻🚀\n",
+ "A078470 003 👨🏻🚀May 07, 2008👨🏻🚀\n",
+ "N020482 004 👨🏻🚀May 29, 1997👨🏻🚀\n",
+ "N020482 001 👨🏻🚀Sep 06, 1995👨🏻🚀\n",
+ "N020482 002 👨🏻🚀Sep 06, 1995👨🏻🚀👨🏻🚀\n",
+ "A075047 001 👨🏻🚀Dec 30, 1999👨🏻🚀\n",
+ "A075047 002 👨🏻🚀Dec 30, 1999👨🏻🚀\n",
+ "A074288 001 👨🏻🚀Apr 24, 1995👨🏻🚀\n",
+ "A074288 002 👨🏻🚀Apr 24, 1995👨🏻🚀👨🏻🚀\n",
+ "SOLUTION;INTRAVENOUS👨🏻🚀\n",
+ "ACETAMINOPHEN👨🏻🚀👨🏻🚀\n",
+ "AP \n",
+ "AP \n",
+ "AP \n",
+ "CUSTOPHARM INC \n",
+ "SANDOZ INC \n",
+ "1GM/100ML (10MG/ML) \n",
+ "1GM/100ML (10MG/ML) \n",
+ "A202605 001 👨🏻🚀Jun 13, 2016👨🏻🚀\n",
+ "A204052 001 👨🏻🚀Mar 22, 2016👨🏻🚀\n",
+ "OFIRMEV👨🏻🚀👨🏻🚀\n",
+ "+! \n",
+ "MALLINCKRODT HOSP \n",
+ "ACETAMINOPHEN👨🏻🚀👨🏻🚀\n",
+ "1GM/100ML (10MG/ML) \n",
+ "N022450 001 👨🏻🚀Nov 02, 2010👨🏻🚀👨🏻🚀\n",
+ "FRESENIUS KABI USA \n",
+ "1GM/100ML (10MG/ML) \n",
+ "N204767 001 👨🏻🚀Oct 28, 2015👨🏻🚀👨🏻🚀\n",
+ "ACETAMINOPHEN; BENZHYDROCODONE HYDROCHLORIDE👨🏻🚀👨🏻🚀\n",
+ "TABLET;ORAL👨🏻🚀👨🏻🚀\n",
+ "APADAZ👨🏻🚀👨🏻🚀\n",
+ "+ \n",
+ "KEMPHARM \n",
+ "ACETAMINOPHEN; BUTALBITAL👨🏻🚀👨🏻🚀\n",
+ "CAPSULE;ORAL👨🏻🚀👨🏻🚀\n",
+ "325MG;EQ 6.12MG BASE \n",
+ "N208653 001 👨🏻🚀Feb 23, 2018👨🏻🚀👨🏻🚀\n",
+ "BUTALBITAL AND ACETAMINOPHEN👨🏻🚀👨🏻🚀\n",
+ "MAYNE PHARMA INC \n",
+ "! \n",
+ "300MG;50MG \n",
+ "TABLET;ORAL👨🏻🚀\n",
+ "BUTALBITAL AND ACETAMINOPHEN👨🏻🚀👨🏻🚀\n",
+ "CNTY LINE PHARMS \n",
+ "LARKEN LABS INC \n",
+ "300MG;50MG \n",
+ "325MG;50MG \n",
+ "325MG;50MG \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "A207313 001 👨🏻🚀Dec 27, 2017👨🏻🚀👨🏻🚀\n",
+ "A207635 001 👨🏻🚀Jun 05, 2017👨🏻🚀\n",
+ "A205120 001 👨🏻🚀Oct 30, 2015👨🏻🚀\n",
+ "A203484 002 👨🏻🚀Dec 04, 2015👨🏻🚀👨🏻🚀\n",
+ "39TH EDITION - 2019 - APPROVED DRUG PRODUCT LIST👨🏻🚀👨🏻🚀\n",
+ "PRESCRIPTION DRUG PRODUCT LIST \n",
+ "['c', 'a', UnknownSymbol: A of type , '👨🏻\\u200d🚀', UnknownSymbol: d of type , UnknownSymbol: a of type , UnknownSymbol: t of type , UnknownSymbol: a of type , '👨🏻\\u200d🚀', UnknownSymbol: s of type , UnknownSymbol: u of type , UnknownSymbol: p of type , UnknownSymbol: p of type , UnknownSymbol: l of type , UnknownSymbol: i of type , UnknownSymbol: e of type , UnknownSymbol: d of type , '👨🏻\\u200d🚀', UnknownSymbol: b of type , UnknownSymbol: y of type , '👨🏻\\u200d🚀', 'a', UnknownSymbol: r of type , UnknownSymbol: u of type , UnknownSymbol: g of type , 'm', UnknownSymbol: a of type , UnknownSymbol: t of type , UnknownSymbol: e of type , UnknownSymbol: n of type , UnknownSymbol: t of type , 't', UnknownSymbol: a of type , UnknownSymbol: t of type , UnknownSymbol: c of type , UnknownSymbol: h of type , 'K', UnknownSymbol: c of type , UnknownSymbol: o of type , UnknownSymbol: m of type ]\n",
+ "['3', '-', '3', 'E', UnknownSymbol: o of type , UnknownSymbol: f of type , '👨🏻\\u200d🚀', 'Q', 'R', 'O', 'F', '👨🏻\\u200d🚀', '👨🏻\\u200d🚀']\n",
+ "ACETAMINOPHEN; BUTALBITAL👨🏻🚀👨🏻🚀\n",
+ "TABLET;ORAL👨🏻🚀\n",
+ "BUTALBITAL AND ACETAMINOPHEN👨🏻🚀👨🏻🚀\n",
+ "AA \n",
+ "AA \n",
+ "! \n",
+ "MIKART \n",
+ "NEXGEN PHARMA \n",
+ "BUTAPAP👨🏻🚀👨🏻🚀\n",
+ "AA \n",
+ "! \n",
+ "MIKART \n",
+ "ALLZITAL👨🏻🚀👨🏻🚀\n",
+ "300MG;50MG \n",
+ "300MG;50MG \n",
+ "325MG;50MG \n",
+ "LARKEN LABS INC \n",
+ "ACETAMINOPHEN; BUTALBITAL; CAFFEINE👨🏻🚀👨🏻🚀\n",
+ "325MG;25MG \n",
+ "CAPSULE;ORAL👨🏻🚀\n",
+ "BUTALBITAL, ACETAMINOPHEN AND CAFFEINE👨🏻🚀\n",
+ "! \n",
+ "! \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AUROLIFE PHARMA LLC 👨🏻🚀325MG;50MG;40MG \n",
+ "325MG;50MG;40MG \n",
+ "MAYNE PHARMA INC \n",
+ "300MG;50MG;40MG \n",
+ "NEXGEN PHARMA \n",
+ "300MG;50MG;40MG \n",
+ "NUVO PHARMS INC \n",
+ "300MG;50MG;40MG \n",
+ "WRASER PHARMS LLC \n",
+ "SOLUTION;ORAL👨🏻🚀👨🏻🚀\n",
+ "BUTALBITAL, ACETAMINOPHEN AND CAFFEINE👨🏻🚀👨🏻🚀\n",
+ "A207386 001 👨🏻🚀Nov 15, 2016👨🏻🚀\n",
+ "A090956 001 👨🏻🚀Aug 23, 2011👨🏻🚀\n",
+ "A089987 001 👨🏻🚀Oct 26, 1992👨🏻🚀👨🏻🚀\n",
+ "A203484 001 👨🏻🚀Dec 04, 2015👨🏻🚀👨🏻🚀\n",
+ "A204733 001 👨🏻🚀Sep 26, 2018👨🏻🚀\n",
+ "A089007 001 👨🏻🚀Mar 17, 1986👨🏻🚀\n",
+ "A040885 001 👨🏻🚀Nov 16, 2009👨🏻🚀\n",
+ "A207118 001 👨🏻🚀Oct 28, 2016👨🏻🚀\n",
+ "A206615 001 👨🏻🚀Aug 04, 2017👨🏻🚀\n",
+ "! \n",
+ "MIKART \n",
+ "TABLET;ORAL👨🏻🚀\n",
+ "325MG/15ML;50MG/15ML;40MG/15ML \n",
+ "A040387 001 👨🏻🚀Jan 31, 2003👨🏻🚀👨🏻🚀\n",
+ "BUTALBITAL, ACETAMINOPHEN AND CAFFEINE👨🏻🚀\n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "ACETAMINOPHEN; BUTALBITAL; CAFFEINE; CODEINE PHOSPHATE👨🏻🚀👨🏻🚀\n",
+ "325MG;50MG;40MG \n",
+ "ABHAI LLC \n",
+ "ACTAVIS LABS UT INC 👨🏻🚀325MG;50MG;40MG \n",
+ "325MG;50MG;40MG \n",
+ "CNTY LINE PHARMS \n",
+ "325MG;50MG;40MG \n",
+ "HIKMA PHARMS \n",
+ "325MG;50MG;40MG \n",
+ "LANNETT CO INC \n",
+ "325MG;50MG;40MG \n",
+ "MIKART \n",
+ "325MG;50MG;40MG \n",
+ "NEXGEN PHARMA INC \n",
+ "325MG;50MG;40MG \n",
+ "SPECGX LLC \n",
+ "325MG;50MG;40MG \n",
+ "VINTAGE PHARMS \n",
+ "! \n",
+ "CAPSULE;ORAL👨🏻🚀\n",
+ "BUTALBITAL, ACETAMINOPHEN, CAFFEINE AND CODEINE PHOSPHATE👨🏻🚀\n",
+ "AB \n",
+ "AB \n",
+ "NEXGEN PHARMA INC \n",
+ "VINTAGE PHARMS \n",
+ "325MG;50MG;40MG;30MG \n",
+ "325MG;50MG;40MG;30MG \n",
+ "FIORICET W/ CODEINE👨🏻🚀\n",
+ "AB \n",
+ "+! \n",
+ "ACTAVIS LABS UT INC 👨🏻🚀325MG;50MG;40MG;30MG \n",
+ "BUTALBITAL, ACETAMINOPHEN, CAFFEINE AND CODEINE PHOSPHATE👨🏻🚀👨🏻🚀\n",
+ "A211106 001 👨🏻🚀Sep 26, 2018👨🏻🚀\n",
+ "A088616 001 👨🏻🚀Nov 09, 1984👨🏻🚀\n",
+ "A204984 001 👨🏻🚀Jan 10, 2017👨🏻🚀\n",
+ "A089718 001 👨🏻🚀Jun 12, 1995👨🏻🚀\n",
+ "A200243 001 👨🏻🚀Sep 13, 2012👨🏻🚀\n",
+ "A089175 001 👨🏻🚀Jan 21, 1987👨🏻🚀\n",
+ "A209587 001 👨🏻🚀Oct 31, 2018👨🏻🚀\n",
+ "A087804 001 👨🏻🚀Jan 24, 1985👨🏻🚀\n",
+ "A040511 001 👨🏻🚀Aug 27, 2003👨🏻🚀👨🏻🚀\n",
+ "A076560 001 👨🏻🚀Jun 10, 2004👨🏻🚀\n",
+ "A075929 001 👨🏻🚀Apr 22, 2002👨🏻🚀\n",
+ "N020232 001 👨🏻🚀Jul 30, 1992👨🏻🚀👨🏻🚀\n",
+ "NEXGEN PHARMA INC \n",
+ "300MG;50MG;40MG;30MG \n",
+ "A076560 002 👨🏻🚀Jul 19, 2012👨🏻🚀👨🏻🚀\n",
+ "ACETAMINOPHEN; CAFFEINE; DIHYDROCODEINE BITARTRATE👨🏻🚀👨🏻🚀\n",
+ "CAPSULE;ORAL👨🏻🚀👨🏻🚀\n",
+ "TREZIX👨🏻🚀👨🏻🚀\n",
+ "WRASER PHARMS LLC \n",
+ "320.5MG;30MG;16MG \n",
+ "A204785 001 👨🏻🚀Nov 26, 2014👨🏻🚀👨🏻🚀\n",
+ "TABLET;ORAL👨🏻🚀👨🏻🚀\n",
+ "ACETAMINOPHEN, CAFFEINE AND DIHYDROCODEINE BITARTRATE👨🏻🚀👨🏻🚀\n",
+ "LARKEN LABS INC \n",
+ "325MG;30MG;16MG \n",
+ "ACETAMINOPHEN; CODEINE PHOSPHATE👨🏻🚀👨🏻🚀\n",
+ "SOLUTION;ORAL👨🏻🚀\n",
+ "ACETAMINOPHEN AND CODEINE PHOSPHATE👨🏻🚀👨🏻🚀\n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "! \n",
+ "HI TECH PHARMA \n",
+ "LANNETT CO INC \n",
+ "MIKART \n",
+ "PHARM ASSOC \n",
+ "WOCKHARDT BIO AG \n",
+ "TABLET;ORAL👨🏻🚀\n",
+ "120MG/5ML;12MG/5ML \n",
+ "120MG/5ML;12MG/5ML \n",
+ "120MG/5ML;12MG/5ML \n",
+ "120MG/5ML;12MG/5ML \n",
+ "120MG/5ML;12MG/5ML \n",
+ "ACETAMINOPHEN AND CODEINE PHOSPHATE👨🏻🚀👨🏻🚀\n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "! \n",
+ "! \n",
+ "SPECGX LLC \n",
+ "300MG;30MG \n",
+ "AMNEAL PHARMS NY \n",
+ "AUROLIFE PHARMA LLC 👨🏻🚀300MG;15MG \n",
+ "300MG;30MG \n",
+ "300MG;60MG \n",
+ "300MG;15MG \n",
+ "300MG;30MG \n",
+ "300MG;60MG \n",
+ "300MG;30MG \n",
+ "300MG;60MG \n",
+ "300MG;15MG \n",
+ "300MG;30MG \n",
+ "300MG;60MG \n",
+ "300MG;15MG \n",
+ "SUN PHARM INDS LTD \n",
+ "VINTAGE \n",
+ "TEVA \n",
+ "A204209 001 👨🏻🚀Sep 30, 2016👨🏻🚀👨🏻🚀\n",
+ "A040119 001 👨🏻🚀Apr 26, 1996👨🏻🚀\n",
+ "A091238 001 👨🏻🚀Nov 10, 2011👨🏻🚀\n",
+ "A089450 001 👨🏻🚀Oct 27, 1992👨🏻🚀\n",
+ "A087508 001👨🏻🚀👨🏻🚀\n",
+ "A087006 001👨🏻🚀👨🏻🚀\n",
+ "A040779 001 👨🏻🚀May 29, 2008👨🏻🚀\n",
+ "A202800 001 👨🏻🚀Apr 15, 2013👨🏻🚀\n",
+ "A202800 002 👨🏻🚀Apr 15, 2013👨🏻🚀\n",
+ "A202800 003 👨🏻🚀Apr 15, 2013👨🏻🚀\n",
+ "A040419 001 👨🏻🚀May 31, 2001👨🏻🚀\n",
+ "A040419 002 👨🏻🚀May 31, 2001👨🏻🚀\n",
+ "A040419 003 👨🏻🚀May 31, 2001👨🏻🚀\n",
+ "A085868 001👨🏻🚀👨🏻🚀\n",
+ "A087083 001👨🏻🚀👨🏻🚀\n",
+ "A088627 001 👨🏻🚀Mar 06, 1985👨🏻🚀\n",
+ "A088628 001 👨🏻🚀Mar 06, 1985👨🏻🚀\n",
+ "A088629 001 👨🏻🚀Mar 06, 1985👨🏻🚀\n",
+ "A089990 001 👨🏻🚀Sep 30, 1988👨🏻🚀👨🏻🚀\n",
+ "39TH EDITION - 2019 - APPROVED DRUG PRODUCT LIST👨🏻🚀👨🏻🚀\n",
+ "PRESCRIPTION DRUG PRODUCT LIST \n",
+ "['c', 'a', UnknownSymbol: A of type , '👨🏻\\u200d🚀', UnknownSymbol: d of type , UnknownSymbol: a of type , UnknownSymbol: t of type , UnknownSymbol: a of type , '👨🏻\\u200d🚀', UnknownSymbol: s of type , UnknownSymbol: u of type , UnknownSymbol: p of type , UnknownSymbol: p of type , UnknownSymbol: l of type , UnknownSymbol: i of type , UnknownSymbol: e of type , UnknownSymbol: d of type , '👨🏻\\u200d🚀', UnknownSymbol: b of type , UnknownSymbol: y of type , '👨🏻\\u200d🚀', 'a', UnknownSymbol: r of type , UnknownSymbol: u of type , UnknownSymbol: g of type , 'm', UnknownSymbol: a of type , UnknownSymbol: t of type , UnknownSymbol: e of type , UnknownSymbol: n of type , UnknownSymbol: t of type , 't', UnknownSymbol: a of type , UnknownSymbol: t of type , UnknownSymbol: c of type , UnknownSymbol: h of type , 'K', UnknownSymbol: c of type , UnknownSymbol: o of type , UnknownSymbol: m of type ]\n",
+ "['3', '-', '4', 'E', UnknownSymbol: o of type , UnknownSymbol: f of type , '👨🏻\\u200d🚀', 'Q', 'R', 'O', 'F', '👨🏻\\u200d🚀', '👨🏻\\u200d🚀']\n",
+ "ACETAMINOPHEN; CODEINE PHOSPHATE👨🏻🚀👨🏻🚀\n",
+ "TABLET;ORAL👨🏻🚀\n",
+ "ACETAMINOPHEN AND CODEINE PHOSPHATE👨🏻🚀👨🏻🚀\n",
+ "AA \n",
+ "AA \n",
+ "VINTAGE PHARMS \n",
+ "TYLENOL W/ CODEINE NO. 3👨🏻🚀\n",
+ "300MG;30MG \n",
+ "300MG;60MG \n",
+ "AA \n",
+ "! \n",
+ "JANSSEN PHARMS \n",
+ "300MG;30MG \n",
+ "TYLENOL W/ CODEINE NO. 4👨🏻🚀\n",
+ "AA \n",
+ "ACETAMINOPHEN; HYDROCODONE BITARTRATE👨🏻🚀👨🏻🚀\n",
+ "JANSSEN PHARMS \n",
+ "300MG;60MG \n",
+ "SOLUTION;ORAL👨🏻🚀\n",
+ "HYDROCODONE BITARTRATE AND ACETAMINOPHEN👨🏻🚀👨🏻🚀\n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "GENUS LIFESCIENCES \n",
+ "MIKART \n",
+ "PHARM ASSOC \n",
+ "VISTAPHARM \n",
+ "MIKART \n",
+ "PHARM ASSOC \n",
+ "325MG/15ML;7.5MG/15ML \n",
+ "325MG/15ML;7.5MG/15ML \n",
+ "325MG/15ML;7.5MG/15ML \n",
+ "325MG/15ML;7.5MG/15ML \n",
+ "300MG/15ML;10MG/15ML \n",
+ "325MG/15ML;10MG/15ML \n",
+ "! \n",
+ "! \n",
+ "! \n",
+ "TABLET;ORAL👨🏻🚀\n",
+ "ANEXSIA 5/325👨🏻🚀\n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "AA \n",
+ "SPECGX LLC \n",
+ "ANEXSIA 7.5/325👨🏻🚀\n",
+ "325MG;5MG \n",
+ "325MG;7.5MG \n",
+ "HYDROCODONE BITARTRATE AND ACETAMINOPHEN👨🏻🚀👨🏻🚀\n",
+ "SPECGX LLC \n",
+ "ABHAI LLC \n",
+ "300MG;5MG \n",
+ "300MG;7.5MG \n",
+ "300MG;10MG \n",
+ "325MG;5MG \n",
+ "325MG;7.5MG \n",
+ "325MG;10MG \n",
+ "ACTAVIS LABS FL INC 👨🏻🚀300MG;5MG \n",
+ "ALVOGEN PINE BROOK \n",
+ "AMNEAL PHARMS \n",
+ "AMNEAL PHARMS NY \n",
+ "ASCENT PHARMS INC \n",
+ "300MG;7.5MG \n",
+ "300MG;10MG \n",
+ "300MG;5MG \n",
+ "300MG;7.5MG \n",
+ "300MG;10MG \n",
+ "325MG;2.5MG \n",
+ "325MG;5MG \n",
+ "325MG;7.5MG \n",
+ "325MG;10MG \n",
+ "300MG;10MG \n",
+ "300MG;5MG \n",
+ "325MG;5MG \n",
+ "325MG;7.5MG \n",
+ "325MG;10MG \n",
+ "325MG;2.5MG \n",
+ "325MG;5MG \n",
+ "325MG;7.5MG \n",
+ "325MG;10MG \n",
+ "AUROLIFE PHARMA LLC 👨🏻🚀300MG;5MG \n",
+ "300MG;7.5MG \n",
+ "300MG;10MG \n",
+ "325MG;5MG \n",
+ "325MG;7.5MG \n",
+ "325MG;10MG \n",
+ "325MG;2.5MG \n",
+ "325MG;5MG \n",
+ "325MG;7.5MG \n",
+ "325MG;10MG \n",
+ "325MG;5MG \n",
+ "325MG;7.5MG \n",
+ "325MG;10MG \n",
+ "300MG;5MG \n",
+ "300MG;7.5MG \n",
+ "300MG;10MG \n",
+ "325MG;5MG \n",
+ "325MG;7.5MG \n",
+ "325MG;10MG \n",
+ "300MG;5MG \n",
+ "300MG;7.5MG \n",
+ "300MG;10MG \n",
+ "325MG;2.5MG \n",
+ "ELITE LABS INC \n",
+ "EPIC PHARMA LLC \n",
+ "LANNETT CO INC \n",
+ "MIKART \n",
+ "! \n",
+ "! \n",
+ "! \n",
+ "! \n",
+ "A089805 001 👨🏻🚀Sep 30, 1988👨🏻🚀\n",
+ "A089828 001 👨🏻🚀Sep 30, 1988👨🏻🚀\n",
+ "A085055 003👨🏻🚀👨🏻🚀\n",
+ "A085055 004👨🏻🚀👨🏻🚀\n",
+ "A040894 001 👨🏻🚀Jul 19, 2011👨🏻🚀\n",
+ "A040482 001 👨🏻🚀Sep 25, 2003👨🏻🚀\n",
+ "A040838 001 👨🏻🚀May 10, 2013👨🏻🚀\n",
+ "A200343 001 👨🏻🚀Jan 25, 2012👨🏻🚀👨🏻🚀\n",
+ "A040881 001 👨🏻🚀Feb 25, 2010👨🏻🚀👨🏻🚀\n",
+ "A040834 001 👨🏻🚀Apr 18, 2008👨🏻🚀\n",
+ "A040409 001 👨🏻🚀Oct 20, 2000👨🏻🚀\n",
+ "A040405 001 👨🏻🚀Sep 08, 2000👨🏻🚀\n",
+ "A209036 001 👨🏻🚀Jun 21, 2017👨🏻🚀\n",
+ "A209036 002 👨🏻🚀Jun 21, 2017👨🏻🚀\n",
+ "A209036 003 👨🏻🚀Jun 21, 2017👨🏻🚀\n",
+ "A209037 001 👨🏻🚀Jun 21, 2017👨🏻🚀\n",
+ "A209037 002 👨🏻🚀Jun 21, 2017👨🏻🚀\n",
+ "A209037 003 👨🏻🚀Jun 21, 2017👨🏻🚀\n",
+ "A206470 001 👨🏻🚀Jun 02, 2016👨🏻🚀\n",
+ "A206470 002 👨🏻🚀Jun 02, 2016👨🏻🚀\n",
+ "A206470 003 👨🏻🚀Jun 02, 2016👨🏻🚀\n",
+ "A208540 001 👨🏻🚀Nov 08, 2018👨🏻🚀\n",
+ "A208540 002 👨🏻🚀Nov 08, 2018👨🏻🚀\n",
+ "A208540 003 👨🏻🚀Nov 08, 2018👨🏻🚀\n",
+ "A209958 001 👨🏻🚀Oct 24, 2018👨🏻🚀\n",
+ "A209958 002 👨🏻🚀Oct 24, 2018👨🏻🚀\n",
+ "A209958 003 👨🏻🚀Oct 24, 2018👨🏻🚀\n",
+ "A209958 004 👨🏻🚀Oct 24, 2018👨🏻🚀\n",
+ "A207137 001 👨🏻🚀Nov 29, 2016👨🏻🚀\n",
+ "A206869 001 👨🏻🚀Jun 23, 2017👨🏻🚀\n",
+ "A040736 001 👨🏻🚀Aug 25, 2006👨🏻🚀\n",
+ "A040746 002 👨🏻🚀May 10, 2016👨🏻🚀\n",
+ "A040746 001 👨🏻🚀Aug 25, 2006👨🏻🚀\n",
+ "A211487 001 👨🏻🚀Nov 07, 2018👨🏻🚀\n",
+ "A211487 002 👨🏻🚀Nov 07, 2018👨🏻🚀\n",
+ "A211487 003 👨🏻🚀Nov 07, 2018👨🏻🚀\n",
+ "A211487 004 👨🏻🚀Nov 07, 2018👨🏻🚀\n",
+ "A207709 001 👨🏻🚀Sep 13, 2018👨🏻🚀\n",
+ "A207709 002 👨🏻🚀Sep 13, 2018👨🏻🚀\n",
+ "A207709 003 👨🏻🚀Sep 13, 2018👨🏻🚀\n",
+ "A201013 001 👨🏻🚀Apr 11, 2012👨🏻🚀\n",
+ "A201013 002 👨🏻🚀Apr 11, 2012👨🏻🚀\n",
+ "A201013 003 👨🏻🚀Apr 11, 2012👨🏻🚀\n",
+ "A209924 001 👨🏻🚀Nov 16, 2018👨🏻🚀\n",
+ "A209924 002 👨🏻🚀Nov 16, 2018👨🏻🚀\n",
+ "A209924 003 👨🏻🚀Nov 16, 2018👨🏻🚀\n",
+ "A209924 004 👨🏻🚀Nov 16, 2018👨🏻🚀\n",
+ "A203863 001 👨🏻🚀Mar 30, 2018👨🏻🚀\n",
+ "A203863 002 👨🏻🚀Mar 30, 2018👨🏻🚀\n",
+ "A203863 003 👨🏻🚀Mar 30, 2018👨🏻🚀\n",
+ "A207171 001 👨🏻🚀Jun 20, 2017👨🏻🚀\n",
+ "A207171 002 👨🏻🚀Jun 20, 2017👨🏻🚀\n",
+ "A207171 003 👨🏻🚀Jun 20, 2017👨🏻🚀\n",
+ "A207172 001 👨🏻🚀Jun 22, 2017👨🏻🚀\n",
+ "A207172 002 👨🏻🚀Jun 22, 2017👨🏻🚀\n",
+ "A207172 003 👨🏻🚀Jun 22, 2017👨🏻🚀\n",
+ "A040658 001 👨🏻🚀Jan 19, 2006👨🏻🚀\n",
+ "A040658 002 👨🏻🚀Mar 24, 2006👨🏻🚀\n",
+ "A040658 003 👨🏻🚀Jun 23, 2004👨🏻🚀\n",
+ "A040846 001 👨🏻🚀Jun 09, 2010👨🏻🚀👨🏻🚀\n"
+ ]
+ }
+ ],
+ "source": [
+ "ob2020.check_stream(pdftotext_lines)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e899245f-b901-4108-bfab-6ab7b329e8d6",
+ "metadata": {},
+ "source": [
+ "Current thought: walk the object tree of the PDF, extracting and converting text while tracking where it is physically located."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4543148b-4f2f-47f6-a13c-13f21dfd4a7c",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/Orangebook/download.url.txt b/Orangebook/download.url.txt
new file mode 100644
index 0000000..c54344c
--- /dev/null
+++ b/Orangebook/download.url.txt
@@ -0,0 +1 @@
+https://www.fda.gov/media/76860/download
diff --git a/Orangebook/testing-pyocr.ipynb b/Orangebook/testing-pyocr.ipynb
new file mode 100644
index 0000000..a1e4b15
--- /dev/null
+++ b/Orangebook/testing-pyocr.ipynb
@@ -0,0 +1,145 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "51bf48a1-920a-4e64-ac5f-323ff3a27ebf",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Will use tool 'Tesseract (sh)'\n",
+ "Available languages: eng, osd\n",
+ "Will use language 'eng'\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Import the required libraries\n",
+ "from wand.image import Image\n",
+ "from PIL import Image as PI\n",
+ "import pyocr\n",
+ "import pyocr.builders\n",
+ "import io, sys\n",
+ "\n",
+ "\n",
+ "# Get the handle of the OCR library (in this case, tesseract)\n",
+ "tools = pyocr.get_available_tools()\n",
+ "if len(tools) == 0:\n",
+ "\tprint(\"No OCR tool found!\")\n",
+ "\tsys.exit(1)\n",
+ "tool = tools[0]\n",
+ "print(\"Will use tool '%s'\" % (tool.get_name()))\n",
+ "\n",
+ "# Get the language\n",
+ "langs = tool.get_available_languages()\n",
+ "print(\"Available languages: %s\" % \", \".join(langs)) \n",
+ "lang = langs[0] # For English\n",
+ "print(\"Will use language '%s'\" % (lang))\n",
+ "\n",
+ "# Setup two lists which will be used to hold our images and final_text\n",
+ "req_image = []\n",
+ "final_text = []\n",
+ "\n",
+ "# Open the PDF file using wand and convert it to jpeg\n",
+ "image_pdf = Image(filename=\"/home/will/research/ClinicalTrialsDataProcessing/Orangebook/Orangebooks/testprint.pdf\", resolution=300)\n",
+ "image_jpeg = image_pdf.convert('pdf')\n",
+ "\n",
+ "# wand has converted all the separate pages in the PDF into separate image\n",
+ "# blobs. We can loop over them and append them as a blob into the req_image\n",
+ "# list.\n",
+ "for img in image_jpeg.sequence:\n",
+ "\timg_page = Image(image=img)\n",
+ "\treq_image.append(img_page.make_blob('jpeg'))\n",
+ "\n",
+ "# Now we just need to run OCR over the image blobs and store all of the \n",
+ "# recognized text in final_text.\n",
+ "for img in req_image:\n",
+ "\ttxt = tool.image_to_string(\n",
+ "\t\tPI.open(io.BytesIO(img)),\n",
+ "\t\tlang=lang,\n",
+ "\t\tbuilder=pyocr.builders.TextBuilder()\n",
+ "\t)\n",
+ "\tfinal_text.append(txt)\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "f0d5f1d6-7e15-4ee6-b4ee-cbd41c5afb99",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "The final text is: \n",
+ "\n",
+ "40TH EDITION - 2020 - APPROVED DRUG PRODUCT LIST\n",
+ "\n",
+ "PRESCRIPTION DRUG PRODUCT LIST\n",
+ "\n",
+ "ABACAVIR SULFATE\n",
+ "SOLUTION; ORAL\n",
+ "ABACAVIR SULFATE\n",
+ "\n",
+ "EQ 2 5 /ML\n",
+ "\n",
+ "EQ 2 Ee /ML\n",
+ "\n",
+ "EQ 300MG BASE\n",
+ "EQ 300MG BASE\n",
+ "EQ 300MG BASE\n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(\"\\nThe final text is: \\n\")\n",
+ "print(final_text[0][0:200])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1cac17e7-079d-4e32-bdbf-ae49194b2078",
+ "metadata": {},
+ "source": [
+ "It appears that this does not have the required precision. I'll need to do this some other way."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2283e290-fab3-4cda-8ce9-55a0b3533c98",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}