diff --git a/Orangebook/CidParser.py b/Orangebook/CidParser.py
new file mode 100644
index 0000000..1b45db0
--- /dev/null
+++ b/Orangebook/CidParser.py
@@ -0,0 +1,121 @@
+# Adobe Pdf Character ID (cid:\d+) parser
+# The purpose is to allow someone to create their own table equivalent to the "/ToUnicode" map that
+# should be provided in every PDF using cid's (but is often mangled).
+
+_CID_PREFIX = "(cid:"
+
+
+def get_digits(string):
+    """
+    Extract the leading digits from a cid tag.
+
+    `string` is the text that follows "(cid:"; everything before the first ")" is the code.
+    Returns a tuple (code as int, number of characters the code occupied).
+    Raises ValueError if that text is not an integer.
+    """
+    num = string.split(")", 1)[0]
+    return int(num), len(num)
+
+
+def _match_cid(string, pos):
+    """
+    Return (code, index just past the tag) if a well-formed "(cid:N)" tag starts at pos, else None.
+    """
+    if not string.startswith(_CID_PREFIX, pos):
+        return None
+    digits_start = pos + len(_CID_PREFIX)
+    close = string.find(")", digits_start)
+    if close == -1:
+        return None
+    try:
+        code, _ = get_digits(string[digits_start:close])
+    except ValueError:
+        # e.g. "(cid:)" or "(cid:x)": not a cid tag, so the "(" is an ordinary character.
+        return None
+    return code, close + 1
+
+
+def token_generator(string):
+    """
+    An iterable that returns tokens describing a string in a pdf.
+    Tokens take two forms:
+    - Integers: these represent CID codes
+    - Characters: these represent the arbitrary characters often returned amidst cid's.
+
+    Text that looks like the start of a cid tag but is malformed (no closing ")" or a
+    non-numeric code) is emitted character by character instead of raising.
+
+    It is a python generator because that simplifies the ordering and allows us to avoid recursion.
+    """
+    pos = 0
+    str_len = len(string)
+
+    while pos < str_len:
+        match = _match_cid(string, pos)
+        if match is not None:
+            code, pos = match
+            yield code
+        else:
+            # Index directly; slicing off the remainder at every step would make tokenizing quadratic.
+            yield string[pos]
+            pos += 1
+
+
+class UnknownSymbol:
+    """
+    Represents a token that is not in the parser's dictionary.
+    """
+    def __init__(self, symbol):
+        self.symbol = symbol
+
+    def __repr__(self):
+        return "UnknownSymbol: {} of type {}".format(self.symbol, type(self.symbol))
+
+    def __str__(self):
+        return "\uFFFD"
+
+
+class Parser:
+    """
+    Translates from tokens to character arrays or strings, handling errors as it goes.
+
+    It requires a dictionary during instantiation.
+    This dictionary is what is used to perform lookups.
+
+    It exposes 3 methods
+    - convert attempts to convert a single token
+    - convert_stream will try to convert an iterable of tokens into an iterable of text.
+      (convert_list is kept as an alias of convert_stream.)
+    - convert_list_of_strings will convert a list of strings containing cids and other symbols and print
+        - strings, if there are no Unknown symbols.
+        - lists, containing characters and Unknown symbols.
+    """
+    def __init__(self, lookup_table):
+        self._lookup_table = lookup_table
+
+    def convert(self, token):
+        try:
+            return self._lookup_table[token]
+        except LookupError:
+            # Only a missing entry means "unknown"; any other error should surface to the caller.
+            return UnknownSymbol(token)
+
+    def convert_stream(self, token_stream):
+        for token in token_stream:
+            yield self.convert(token)
+
+    # Same generator under the name convert_list, so callers using either name work.
+    convert_list = convert_stream
+
+    def convert_list_of_strings(self, list_of_strings):
+        for string in list_of_strings:
+            arr = list(self.convert_stream(token_generator(string)))
+            try:
+                print("".join(arr))
+            except TypeError:
+                # join() rejects UnknownSymbol entries; print the raw list so they stay visible.
+                print(arr)
+
+
+if __name__ == "__main__":
+    print("Plan was to accept and proceess a symbol table and text. Apparently it has not been implemented.")
\ No newline at end of file
diff --git a/Orangebook/Untitled.ipynb b/Orangebook/Untitled.ipynb new file mode 100644 index 0000000..1a38498 --- /dev/null +++ b/Orangebook/Untitled.ipynb @@ -0,0 +1,371 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 18, + "id": "40358f02-c376-4431-be39-cdd477f17e7a", + "metadata": {}, + "outputs": [], + "source": [ + "import polars as pl" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "8fb27ee2-72c1-4e80-9d00-de54f2834fe8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "polars.datatypes.Datetime" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pl.datatypes.Datetime" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "2c0edd77-c2d0-4184-a094-8c01783d2f0e", + "metadata": {}, + "outputs": [], + "source": [ + "products = pl.scan_csv(file=\"./EOBZIP_2022_04/products.txt\", sep=\"~\")\n", + "patents = pl.scan_csv(file=\"./EOBZIP_2022_04/patent.txt\", sep=\"~\")\n", + "exclusivity = pl.scan_csv(file=\"./EOBZIP_2022_04/exclusivity.txt\", sep=\"~\", parse_dates=True)" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "023f211d-23aa-4a2c-843d-1b60cec91079", + "metadata": {}, + "outputs": [], + "source": [ + "def set_exclusivity_types(df):\n", + " return df.with_columns([\n", + " pl.col(\"Exclusivity_Date\").str.strptime(pl.Date, fmt=\"%b %-d, %Y\")\n", + " ])" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "a1da42c9-e47a-4437-b089-e9b91f789a0c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "Appl_Type\n", + "\n", + "Appl_No\n", + "\n", + "Product_No\n", + "\n", + "Exclusivity_Code\n", + "\n", + "Exclusivity_Date\n", + "
\n", + "str\n", + "\n", + "i64\n", + "\n", + "i64\n", + "\n", + "str\n", + "\n", + "date\n", + "
\n", + "\"N\"\n", + "\n", + "11366\n", + "\n", + "2\n", + "\n", + "\"ODE-96\"\n", + "\n", + "2022-08-07\n", + "
\n", + "\"N\"\n", + "\n", + "20287\n", + "\n", + "11\n", + "\n", + "\"NPP\"\n", + "\n", + "2022-05-16\n", + "
\n", + "\"N\"\n", + "\n", + "20287\n", + "\n", + "10\n", + "\n", + "\"NPP\"\n", + "\n", + "2022-05-16\n", + "
\n", + "\"N\"\n", + "\n", + "20287\n", + "\n", + "9\n", + "\n", + "\"NPP\"\n", + "\n", + "2022-05-16\n", + "
\n", + "\"N\"\n", + "\n", + "20287\n", + "\n", + "8\n", + "\n", + "\"NPP\"\n", + "\n", + "2022-05-16\n", + "
\n", + "
" + ], + "text/plain": [ + "shape: (5, 5)\n", + "┌───────────┬─────────┬────────────┬──────────────────┬──────────────────┐\n", + "│ Appl_Type ┆ Appl_No ┆ Product_No ┆ Exclusivity_Code ┆ Exclusivity_Date │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ i64 ┆ i64 ┆ str ┆ date │\n", + "╞═══════════╪═════════╪════════════╪══════════════════╪══════════════════╡\n", + "│ N ┆ 11366 ┆ 2 ┆ ODE-96 ┆ 2022-08-07 │\n", + "├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ N ┆ 20287 ┆ 11 ┆ NPP ┆ 2022-05-16 │\n", + "├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ N ┆ 20287 ┆ 10 ┆ NPP ┆ 2022-05-16 │\n", + "├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ N ┆ 20287 ┆ 9 ┆ NPP ┆ 2022-05-16 │\n", + "├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n", + "│ N ┆ 20287 ┆ 8 ┆ NPP ┆ 2022-05-16 │\n", + "└───────────┴─────────┴────────────┴──────────────────┴──────────────────┘" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "exclusivity.pipe(set_exclusivity_types).head(5).collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "92fe99fa-1963-460c-99ea-7f614b4b2e25", + "metadata": {}, + "outputs": [], + "source": [ + "def set_patent_types(df):\n", + " return df.with_columns([\n", + " pl.col(\"Patent_Expire_Date_Text\").str.strptime(pl.Date, fmt=\"%b %-d, %Y\"),\n", + " pl.col(\"Submission_Date\").str.strptime(pl.Date, fmt=\"%b %-d, %Y\"),\n", + " pl.col(\"Drug_Substance_Flag\") == \"Y\",\n", + " pl.col(\"Drug_Product_Flag\") == \"Y\",\n", + " pl.col(\"Delist_Flag\") == \"Y\"\n", + " ])" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "13707ca6-094f-4ed7-94cb-824087e97874", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
\n", + "Patent_Expire_Date_Text\n", + "
\n", + "date\n", + "
\n", + "2022-01-02\n", + "
\n", + "
" + ], + "text/plain": [ + "shape: (1, 1)\n", + "┌─────────────────────────┐\n", + "│ Patent_Expire_Date_Text │\n", + "│ --- │\n", + "│ date │\n", + "╞═════════════════════════╡\n", + "│ 2022-01-02 │\n", + "└─────────────────────────┘" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "patents.pipe(set_patent_types).select(\"Patent_Expire_Date_Text\").min().collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "18ad8df7-45d5-4454-8955-c5f28a7d7f1e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "polars.datatypes.Null" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pl.datatypes.Null" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79e4b3d9-29ae-4302-bee1-4be02e0ba654", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Orangebook/Untitled2.ipynb b/Orangebook/Untitled2.ipynb new file mode 100644 index 0000000..7a1679d --- /dev/null +++ b/Orangebook/Untitled2.ipynb @@ -0,0 +1,216 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 196, + "id": "2f61df31-f3c1-4b2e-ae96-96bf06089b17", + "metadata": {}, + "outputs": [], + "source": [ + "def get_digits(string):\n", + " splat = string.split(\")\")\n", + " num = splat[0]\n", + " l = len(num)\n", + " return int(num),l\n", + "\n", + "def token_generator(string):\n", + " \n", + " start = 0\n", + " str_len = len(string)\n", + " \n", + " \n", + " while start < str_len:\n", + " 
substring = string[start:]\n", + " \n", + " #check for cid\n", + " if (str_len - start > 6) and (substring[0:5] == \"(cid:\"):\n", + " \n", + " num,length = get_digits(substring[5:])\n", + " start += length + 6\n", + " yield num\n", + " \n", + " elif (str_len - start > 1):\n", + " start += 1\n", + " yield substring[0]\n", + " else:\n", + " start += 1\n", + " yield substring\n", + "\n", + "class UnknownSymbol():\n", + " def __init__(self, symbol):\n", + " self.symbol = symbol\n", + " \n", + " def __repr__(self):\n", + " return \"UnknownSymbol: {} of type {}\".format(self.symbol, type(self.symbol))\n", + " \n", + " def __str__(self):\n", + " return \"\\uFFFD\"\n", + "\n", + "class Parser:\n", + " def __init__(self, lookup_table):\n", + " self._lookup_table = lookup_table\n", + " \n", + " def convert(self,symbol):\n", + " try:\n", + " return self._lookup_table[symbol]\n", + " except:\n", + " return UnknownSymbol(symbol)\n", + " \n", + " def convert_stream(self,token_stream):\n", + " for token in token_stream:\n", + " yield self.convert(token)" + ] + }, + { + "cell_type": "code", + "execution_count": 213, + "id": "e2c1e39b-0ac5-4ad7-9176-ef8ea69feeec", + "metadata": {}, + "outputs": [], + "source": [ + "ob2020 = Parser({\n", + " 23:\"4\"\n", + " ,19:\"0\"\n", + " ,\"7\":\"T\"\n", + " ,\"+\":\"H\"\n", + " ,3:\" \"\n", + " ,\"(\":\"E\"\n", + " ,\"’\":\"D\"\n", + " ,\",\":\"I\"\n", + " ,\"2\":\"O\"\n", + " ,\"1\":\"N\"\n", + " ,16:\"-\"\n", + " ,21:\"2\"\n", + " ,\"$\":\"A\"\n", + " ,\"3\":\"P\"\n", + " ,\"5\":\"R\"\n", + " ,\"9\":\"V\"\n", + " ,\"8\":\"U\"\n", + " ,\"*\":\"G\"\n", + " ,\"&\":\"C\"\n", + " ,\"/\":\"L\"\n", + " ,\"6\":\"S\"\n", + " ,22:\"3\"\n", + " ,20:\"1\"\n", + " ,11:\" \"\n", + " ,\"R\":\"(\"\n", + " ,\"I\":\"of\"\n", + " ,24:\"5\"\n", + " ,12:\")\"\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 214, + "id": "c02896ab-fc75-44cc-bb27-a5dcf1b6d7f0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'40TH EDITION 
- 2020 - APPROVED DRUG PRODUCT LIST'" + ] + }, + "execution_count": 214, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"\".join([x for x in ob2020.convert_stream(token_generator(b))])" + ] + }, + { + "cell_type": "code", + "execution_count": 215, + "id": "1794e826-fa1f-4aba-8eab-6d603a06dfe0", + "metadata": {}, + "outputs": [], + "source": [ + "c = \"35(6&5,37,21(cid:3)’58*(cid:3)352’8&7(cid:3)/,67(cid:3)\"" + ] + }, + { + "cell_type": "code", + "execution_count": 216, + "id": "c8b67d79-81ad-4b3a-be8b-bace9a8d943a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'PRESCRIPTION DRUG PRODUCT LIST '" + ] + }, + "execution_count": 216, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"\".join([x for x in ob2020.convert_stream(token_generator(c))])" + ] + }, + { + "cell_type": "code", + "execution_count": 217, + "id": "f76d9760-fe69-4743-ab47-41cf74866d70", + "metadata": {}, + "outputs": [], + "source": [ + "d = \"(cid:22)(cid:16)(cid:20)(cid:11)RI(cid:3)(cid:23)(cid:24)(cid:22)(cid:12)\"" + ] + }, + { + "cell_type": "code", + "execution_count": 218, + "id": "ee925997-0701-4d8e-b713-6f39c6a50a5b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['3', '-', '1', ' ', '(', 'of', ' ', '4', '5', '3', ')']" + ] + }, + "execution_count": 218, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[x for x in ob2020.convert_stream(token_generator(d))]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ae152c75-5fd6-4756-a473-fcea2de5ee30", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython3", + "version": "3.8.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Orangebook/cid_parser.ipynb b/Orangebook/cid_parser.ipynb new file mode 100644 index 0000000..6a1266b --- /dev/null +++ b/Orangebook/cid_parser.ipynb @@ -0,0 +1,1066 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "412f6c09-9f40-46e3-9642-720946c93ba8", + "metadata": {}, + "outputs": [], + "source": [ + "import pdfminer" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "967257f2-dae8-4789-8306-539575d55f57", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "20220506\n" + ] + } + ], + "source": [ + "print(pdfminer.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "5f56326c-70c4-4e2c-a11b-75b32c231842", + "metadata": {}, + "outputs": [], + "source": [ + "from pdfminer.converter import TextConverter\n", + "from pdfminer.layout import LAParams\n", + "from pdfminer.pdfdocument import PDFDocument\n", + "from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter\n", + "from pdfminer.pdfpage import PDFPage\n", + "from pdfminer.pdfparser import PDFParser\n", + "\n", + "from io import StringIO" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "c6c619b1-f897-4c1f-892f-a294b8ab0338", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "20220506\n" + ] + } + ], + "source": [ + "print(pdfminer.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7b4c4555-57cc-4109-a937-f26fea9afae3", + "metadata": {}, + "outputs": [], + "source": [ + "output_string = StringIO()\n", + "\n", + "with open(\"/home/will/research/ClinicalTrialsDataProcessing/Orangebook/Orangebooks/testprint4.pdf\", \"rb\") as file_handle:\n", + " parser = PDFParser(file_handle)\n", + " doc = PDFDocument(parser)\n", + " \n", + " rsrcmgr = PDFResourceManager()\n", + " device 
= TextConverter(rsrcmgr, output_string, laparams=LAParams())\n", + " interpreter = PDFPageInterpreter(rsrcmgr, device)\n", + " \n", + " for page in PDFPage.create_pages(doc):\n", + " interpreter.process_page(page)\n", + "\n", + "pdfminer_lines = output_string.getvalue().splitlines()\n", + "pdftotext_lines = [ln for ln in pdfminer_lines if ln]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "7315468d-a918-4365-9008-661fca836344", + "metadata": {}, + "outputs": [], + "source": [ + "def get_digits(string):\n", + " splat = string.split(\")\")\n", + " num = splat[0]\n", + " l = len(num)\n", + " return int(num),l\n", + "\n", + "def token_generator(string):\n", + " \n", + " start = 0\n", + " str_len = len(string)\n", + " \n", + " \n", + " while start < str_len:\n", + " substring = string[start:]\n", + " \n", + " #check for cid\n", + " if (str_len - start > 6) and (substring[0:5] == \"(cid:\"):\n", + " \n", + " num,length = get_digits(substring[5:])\n", + " start += length + 6\n", + " yield num\n", + " \n", + " elif (str_len - start > 1):\n", + " start += 1\n", + " yield substring[0]\n", + " else:\n", + " start += 1\n", + " yield substring\n", + "\n", + "class UnknownSymbol():\n", + " def __init__(self, symbol):\n", + " self.symbol = symbol\n", + " \n", + " def __repr__(self):\n", + " return \"UnknownSymbol: {} of type {}\".format(self.symbol, type(self.symbol))\n", + " \n", + " def __str__(self):\n", + " return \"\\uFFFD\"\n", + "\n", + "class Parser:\n", + " def __init__(self, lookup_table):\n", + " self._lookup_table = lookup_table\n", + " \n", + " def convert(self,symbol):\n", + " try:\n", + " return self._lookup_table[symbol]\n", + " except:\n", + " return UnknownSymbol(symbol)\n", + " \n", + " def convert_stream(self,token_stream):\n", + " for token in token_stream:\n", + " yield self.convert(token)\n", + " \n", + " def check_stream(self, pdftotext_lines):\n", + " for entry in pdftotext_lines:\n", + " arr = [x for x in 
ob2020.convert_stream(token_generator(entry))]\n", + " try:\n", + " print(\"\".join(arr))\n", + " except:\n", + " print(arr)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "826d0009-120e-4e04-b90b-6585308e69d3", + "metadata": {}, + "outputs": [], + "source": [ + "ob2020 = Parser({\n", + " 23:\"4\"\n", + " ,19:\"0\"\n", + " ,\"7\":\"T\"\n", + " ,\"+\":\"H\"\n", + " ,3:\" \"\n", + " ,\"(\":\"E\"\n", + " ,\"’\":\"D\"\n", + " ,\",\":\"I\"\n", + " ,\"2\":\"O\"\n", + " ,\"1\":\"N\"\n", + " ,16:\"-\"\n", + " ,21:\"2\"\n", + " ,\"$\":\"A\"\n", + " ,\"3\":\"P\"\n", + " ,\"5\":\"R\"\n", + " ,\"9\":\"V\"\n", + " ,\"8\":\"U\"\n", + " ,\"*\":\"G\"\n", + " ,\"&\":\"C\"\n", + " ,\"/\":\"L\"\n", + " ,\"6\":\"S\"\n", + " ,22:\"3\"\n", + " ,20:\"1\"\n", + " ,11:\"(\"\n", + " ,\"R\":\"o\"\n", + " ,\"I\":\"f\"\n", + " ,24:\"5\"\n", + " ,12:\")\"\n", + " ,\" \":\"👨🏻‍🚀\"\n", + " ,\"%\":\"B\"\n", + " ,\")\":\"F\"\n", + " ,30:\";\"\n", + " ,\"0\":\"M\"\n", + " ,\"4\":\"Q\"\n", + " ,18:\"/\"\n", + " ,26:\"7\"\n", + " ,28:\"9\"\n", + " ,\"D\":\"a\"\n", + " ,\"U\":\"r\"\n", + " ,15:\",\"\n", + " ,27:\"8\"\n", + " ,\"H\":\"e\"\n", + " ,\"S\":\"p\"\n", + " ,25:\"6\"\n", + " ,\"=\":\"Z\"\n", + " ,14:\"+\"\n", + " ,4:\"!\"\n", + " ,\"F\":\"c\"\n", + " ,\";\":\"X\"\n", + " ,\"<\":\"Y\"\n", + " ,\"Y\":\"v\"\n", + " ,\"-\":\"J\"\n", + " ,\"X\":\"u\"\n", + " ,\"Q\":\"n\"\n", + " ,\"W\":\"t\"\n", + " ,\"J\":\"g\"\n", + " ,\".\":\"K\"\n", + " ,\":\":\"W\"\n", + " ,17:\".\"\n", + " ,\"O\":\"l\"\n", + " ,\"E\":\"b\"\n", + " ,\"\\\\\":\"y\"\n", + " ,8:\"%\"\n", + " ,\"L\":\"i\"\n", + " ,\"P\":\"m\"\n", + " ,10:\"'\"\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "58327a77-029b-4c05-b410-45d1a539ad3e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "39TH EDITION - 2019 - APPROVED DRUG PRODUCT LIST👨🏻‍🚀👨🏻‍🚀\n", + "PRESCRIPTION DRUG PRODUCT LIST \n", + "['c', 'a', UnknownSymbol: A of type , '👨🏻\\u200d🚀', 
UnknownSymbol: d of type , UnknownSymbol: a of type , UnknownSymbol: t of type , UnknownSymbol: a of type , '👨🏻\\u200d🚀', UnknownSymbol: s of type , UnknownSymbol: u of type , UnknownSymbol: p of type , UnknownSymbol: p of type , UnknownSymbol: l of type , UnknownSymbol: i of type , UnknownSymbol: e of type , UnknownSymbol: d of type , '👨🏻\\u200d🚀', UnknownSymbol: b of type , UnknownSymbol: y of type , '👨🏻\\u200d🚀', 'a', UnknownSymbol: r of type , UnknownSymbol: u of type , UnknownSymbol: g of type , 'm', UnknownSymbol: a of type , UnknownSymbol: t of type , UnknownSymbol: e of type , UnknownSymbol: n of type , UnknownSymbol: t of type , 't', UnknownSymbol: a of type , UnknownSymbol: t of type , UnknownSymbol: c of type , UnknownSymbol: h of type , 'K', UnknownSymbol: c of type , UnknownSymbol: o of type , UnknownSymbol: m of type ]\n", + "['3', '-', '1', 'E', UnknownSymbol: o of type , UnknownSymbol: f of type , '👨🏻\\u200d🚀', 'Q', 'R', 'O', 'F', '👨🏻\\u200d🚀', '👨🏻\\u200d🚀']\n", + "ABACAVIR SULFATE👨🏻‍🚀👨🏻‍🚀\n", + "SOLUTION;ORAL👨🏻‍🚀\n", + "ABACAVIR SULFATE👨🏻‍🚀👨🏻‍🚀\n", + "AA \n", + "AA \n", + "AUROBINDO PHARMA \n", + "LTD👨🏻‍🚀👨🏻‍🚀\n", + "HETERO LABS LTD III 👨🏻‍🚀EQ 20MG BASE/ML \n", + "EQ 20MG BASE/ML \n", + "A077950 001 👨🏻‍🚀Mar 14, 2018👨🏻‍🚀\n", + "A201107 001 👨🏻‍🚀Sep 26, 2016👨🏻‍🚀\n", + "AA \n", + "ZIAGEN👨🏻‍🚀👨🏻‍🚀\n", + "+! \n", + "TABLET;ORAL👨🏻‍🚀\n", + "VIIV HLTHCARE \n", + "EQ 20MG BASE/ML \n", + "N020978 001 👨🏻‍🚀Dec 17, 1998👨🏻‍🚀👨🏻‍🚀\n", + "ABACAVIR SULFATE👨🏻‍🚀👨🏻‍🚀\n", + "APOTEX INC \n", + "AUROBINDO PHARMA \n", + "LTD👨🏻‍🚀👨🏻‍🚀\n", + "EQ 300MG BASE \n", + "CIPLA \n", + "HETERO LABS LTD III 👨🏻‍🚀EQ 300MG BASE \n", + "EQ 300MG BASE \n", + "MYLAN PHARMS INC \n", + "EQ 300MG BASE \n", + "STRIDES PHARMA \n", + "EQ 300MG BASE \n", + "EQ 300MG BASE \n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "ZIAGEN👨🏻‍🚀👨🏻‍🚀\n", + "+! 
\n", + "AB \n", + "ABACAVIR SULFATE; DOLUTEGRAVIR SODIUM; LAMIVUDINE👨🏻‍🚀👨🏻‍🚀\n", + "VIIV HLTHCARE \n", + "EQ 300MG BASE \n", + "A201570 001 👨🏻‍🚀Dec 17, 2012👨🏻‍🚀\n", + "A077844 001 👨🏻‍🚀Dec 17, 2012👨🏻‍🚀\n", + "A078119 001 👨🏻‍🚀Nov 21, 2017👨🏻‍🚀\n", + "A091560 001 👨🏻‍🚀Sep 13, 2013👨🏻‍🚀\n", + "A091294 001 👨🏻‍🚀Jun 18, 2012👨🏻‍🚀\n", + "A091050 001 👨🏻‍🚀Oct 28, 2016👨🏻‍🚀\n", + "N020977 001 👨🏻‍🚀Dec 17, 1998👨🏻‍🚀👨🏻‍🚀\n", + "TABLET;ORAL👨🏻‍🚀👨🏻‍🚀\n", + "TRIUMEQ👨🏻‍🚀👨🏻‍🚀\n", + "+! \n", + "VIIV HLTHCARE \n", + "ABACAVIR SULFATE; LAMIVUDINE👨🏻‍🚀👨🏻‍🚀\n", + "TABLET;ORAL👨🏻‍🚀\n", + "EQ 600MG BASE;EQ 50MG BASE;300MG \n", + "N205551 001 👨🏻‍🚀Aug 22, 2014👨🏻‍🚀👨🏻‍🚀\n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "ABACAVIR SULFATE AND LAMIVUDINE👨🏻‍🚀👨🏻‍🚀\n", + "AUROBINDO PHARMA \n", + "LTD👨🏻‍🚀👨🏻‍🚀\n", + "EQ 600MG BASE;300MG \n", + "CIPLA \n", + "LUPIN LTD \n", + "TEVA PHARMS USA \n", + "ZYDUS PHARMS USA \n", + "INC👨🏻‍🚀👨🏻‍🚀\n", + "EQ 600MG BASE;300MG \n", + "EQ 600MG BASE;300MG \n", + "EQ 600MG BASE;300MG \n", + "EQ 600MG BASE;300MG \n", + "EQ 600MG BASE;300MG \n", + "EPZICOM👨🏻‍🚀👨🏻‍🚀\n", + "+! \n", + "AB \n", + "ABACAVIR SULFATE; LAMIVUDINE; ZIDOVUDINE👨🏻‍🚀👨🏻‍🚀\n", + "VIIV HLTHCARE \n", + "EQ 600MG BASE;300MG \n", + "A090159 001 👨🏻‍🚀Nov 15, 2018👨🏻‍🚀\n", + "A206151 001 👨🏻‍🚀Mar 28, 2017👨🏻‍🚀\n", + "A091144 001 👨🏻‍🚀Mar 28, 2017👨🏻‍🚀\n", + "A204990 001 👨🏻‍🚀Mar 28, 2017👨🏻‍🚀\n", + "A079246 001 👨🏻‍🚀Sep 29, 2016👨🏻‍🚀\n", + "A208990 001 👨🏻‍🚀Nov 15, 2018👨🏻‍🚀\n", + "N021652 001 👨🏻‍🚀Aug 02, 2004👨🏻‍🚀👨🏻‍🚀\n", + "TABLET;ORAL👨🏻‍🚀\n", + "ABACAVIR SULFATE, LAMIVUDINE AND ZIDOVUDINE👨🏻‍🚀\n", + "AB \n", + "LUPIN LTD \n", + "EQ 300MG BASE;150MG;300MG \n", + "A202912 001 👨🏻‍🚀Dec 05, 2013👨🏻‍🚀\n", + "TRIZIVIR👨🏻‍🚀👨🏻‍🚀\n", + "AB \n", + "+! \n", + "ABALOPARATIDE👨🏻‍🚀👨🏻‍🚀\n", + "VIIV HLTHCARE \n", + "SOLUTION;SUBCUTANEOUS👨🏻‍🚀👨🏻‍🚀\n", + "TYMLOS👨🏻‍🚀👨🏻‍🚀\n", + "+! 
\n", + "RADIUS HEALTH INC \n", + "ABEMACICLIB👨🏻‍🚀👨🏻‍🚀\n", + "TABLET;ORAL👨🏻‍🚀👨🏻‍🚀\n", + "VERZENIO👨🏻‍🚀👨🏻‍🚀\n", + "ELI LILLY AND CO \n", + "+ \n", + "+ \n", + "+ \n", + "+! \n", + "ABIRATERONE ACETATE👨🏻‍🚀👨🏻‍🚀\n", + "TABLET;ORAL👨🏻‍🚀\n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "ABIRATERONE ACETATE👨🏻‍🚀👨🏻‍🚀\n", + "AMNEAL PHARMS \n", + "APOTEX INC \n", + "HIKMA PHARMS \n", + "MYLAN PHARMS INC \n", + "TEVA PHARMS USA \n", + "ZYTIGA👨🏻‍🚀👨🏻‍🚀\n", + "+ \n", + "YONSA👨🏻‍🚀👨🏻‍🚀\n", + "+! \n", + "ZYTIGA👨🏻‍🚀👨🏻‍🚀\n", + "+! \n", + "JANSSEN BIOTECH \n", + "SUN PHARMA GLOBAL \n", + "JANSSEN BIOTECH \n", + "EQ 300MG BASE;150MG;300MG \n", + "N021205 001 👨🏻‍🚀Nov 14, 2000👨🏻‍🚀👨🏻‍🚀\n", + "3.12MG/1.56ML (2MG/ML) \n", + "N208743 001 👨🏻‍🚀Apr 28, 2017👨🏻‍🚀👨🏻‍🚀\n", + "50MG \n", + "100MG \n", + "150MG \n", + "200MG \n", + "250MG \n", + "250MG \n", + "250MG \n", + "250MG \n", + "250MG \n", + "250MG \n", + "125MG \n", + "500MG \n", + "N208716 001 👨🏻‍🚀Sep 28, 2017👨🏻‍🚀\n", + "N208716 002 👨🏻‍🚀Sep 28, 2017👨🏻‍🚀\n", + "N208716 003 👨🏻‍🚀Sep 28, 2017👨🏻‍🚀\n", + "N208716 004 👨🏻‍🚀Sep 28, 2017👨🏻‍🚀👨🏻‍🚀\n", + "A208327 001 👨🏻‍🚀Jan 07, 2019👨🏻‍🚀\n", + "A208453 001 👨🏻‍🚀Oct 31, 2018👨🏻‍🚀\n", + "A208339 001 👨🏻‍🚀Oct 31, 2018👨🏻‍🚀\n", + "A208446 001 👨🏻‍🚀Oct 31, 2018👨🏻‍🚀\n", + "A208432 001 👨🏻‍🚀Oct 31, 2018👨🏻‍🚀\n", + "N202379 001 👨🏻‍🚀Apr 28, 2011👨🏻‍🚀\n", + "N210308 001 👨🏻‍🚀May 22, 2018👨🏻‍🚀\n", + "N202379 002 👨🏻‍🚀Apr 14, 2017👨🏻‍🚀👨🏻‍🚀\n", + "39TH EDITION - 2019 - APPROVED DRUG PRODUCT LIST👨🏻‍🚀👨🏻‍🚀\n", + "PRESCRIPTION DRUG PRODUCT LIST \n", + "['c', 'a', UnknownSymbol: A of type , '👨🏻\\u200d🚀', UnknownSymbol: d of type , UnknownSymbol: a of type , UnknownSymbol: t of type , UnknownSymbol: a of type , '👨🏻\\u200d🚀', UnknownSymbol: s of type , UnknownSymbol: u of type , UnknownSymbol: p of type , UnknownSymbol: p of type , UnknownSymbol: l of type , UnknownSymbol: i of type , UnknownSymbol: e of type , UnknownSymbol: d of type , '👨🏻\\u200d🚀', UnknownSymbol: b of type , UnknownSymbol: y of type , 
'👨🏻\\u200d🚀', 'a', UnknownSymbol: r of type , UnknownSymbol: u of type , UnknownSymbol: g of type , 'm', UnknownSymbol: a of type , UnknownSymbol: t of type , UnknownSymbol: e of type , UnknownSymbol: n of type , UnknownSymbol: t of type , 't', UnknownSymbol: a of type , UnknownSymbol: t of type , UnknownSymbol: c of type , UnknownSymbol: h of type , 'K', UnknownSymbol: c of type , UnknownSymbol: o of type , UnknownSymbol: m of type ]\n", + "['3', '-', '2', 'E', UnknownSymbol: o of type , UnknownSymbol: f of type , '👨🏻\\u200d🚀', 'Q', 'R', 'O', 'F', '👨🏻\\u200d🚀', '👨🏻\\u200d🚀']\n", + "ACALABRUTINIB👨🏻‍🚀👨🏻‍🚀\n", + "CAPSULE;ORAL👨🏻‍🚀👨🏻‍🚀\n", + "CALQUENCE👨🏻‍🚀👨🏻‍🚀\n", + "+! \n", + "ASTRAZENECA \n", + "100MG \n", + "ACAMPROSATE CALCIUM👨🏻‍🚀👨🏻‍🚀\n", + "TABLET, DELAYED RELEASE;ORAL👨🏻‍🚀\n", + "ACAMPROSATE CALCIUM👨🏻‍🚀👨🏻‍🚀\n", + "! \n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "BARR LABS DIV TEVA \n", + "GLENMARK GENERICS \n", + "MYLAN PHARMS INC \n", + "ZYDUS PHARMS USA \n", + "INC👨🏻‍🚀👨🏻‍🚀\n", + "333MG \n", + "333MG \n", + "333MG \n", + "333MG \n", + "ACARBOSE👨🏻‍🚀👨🏻‍🚀\n", + "TABLET;ORAL👨🏻‍🚀\n", + "ACARBOSE👨🏻‍🚀👨🏻‍🚀\n", + "EMCURE PHARMS LTD \n", + "IMPAX LABS \n", + "MYLAN \n", + "STRIDES PHARMA \n", + "VIRTUS PHARM \n", + "WATSON LABS \n", + "WEST-WARD PHARMS \n", + "INT👨🏻‍🚀👨🏻‍🚀\n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "PRECOSE👨🏻‍🚀👨🏻‍🚀\n", + "+! \n", + "+ \n", + "+ \n", + "BAYER HLTHCARE \n", + "AB \n", + "AB \n", + "AB \n", + "ACEBUTOLOL HYDROCHLORIDE👨🏻‍🚀👨🏻‍🚀\n", + "CAPSULE;ORAL👨🏻‍🚀\n", + "ACEBUTOLOL HYDROCHLORIDE👨🏻‍🚀👨🏻‍🚀\n", + "AMNEAL PHARM \n", + "! \n", + "! 
\n", + "AB \n", + "AB \n", + "AB \n", + "AB \n", + "ACETAMINOPHEN👨🏻‍🚀👨🏻‍🚀\n", + "MYLAN \n", + "25MG \n", + "50MG \n", + "100MG \n", + "25MG \n", + "50MG \n", + "100MG \n", + "25MG \n", + "50MG \n", + "100MG \n", + "25MG \n", + "50MG \n", + "100MG \n", + "25MG \n", + "50MG \n", + "100MG \n", + "25MG \n", + "50MG \n", + "100MG \n", + "25MG \n", + "50MG \n", + "100MG \n", + "25MG \n", + "50MG \n", + "100MG \n", + "EQ 200MG BASE \n", + "EQ 400MG BASE \n", + "EQ 200MG BASE \n", + "EQ 400MG BASE \n", + "N210259 001 👨🏻‍🚀Oct 31, 2017👨🏻‍🚀👨🏻‍🚀\n", + "A200143 001 👨🏻‍🚀Nov 18, 2013👨🏻‍🚀\n", + "A202229 001 👨🏻‍🚀Jul 16, 2013👨🏻‍🚀\n", + "A200142 001 👨🏻‍🚀Mar 11, 2014👨🏻‍🚀\n", + "A205995 001 👨🏻‍🚀May 26, 2017👨🏻‍🚀\n", + "A202271 001 👨🏻‍🚀Feb 07, 2012👨🏻‍🚀\n", + "A202271 002 👨🏻‍🚀Feb 07, 2012👨🏻‍🚀\n", + "A202271 003 👨🏻‍🚀Feb 07, 2012👨🏻‍🚀\n", + "A078441 001 👨🏻‍🚀May 14, 2009👨🏻‍🚀\n", + "A078441 002 👨🏻‍🚀May 14, 2009👨🏻‍🚀\n", + "A078441 003 👨🏻‍🚀May 14, 2009👨🏻‍🚀\n", + "A091053 001 👨🏻‍🚀Jan 06, 2011👨🏻‍🚀\n", + "A091053 002 👨🏻‍🚀Jan 06, 2011👨🏻‍🚀\n", + "A091053 003 👨🏻‍🚀Jan 06, 2011👨🏻‍🚀\n", + "A090912 001 👨🏻‍🚀Jul 27, 2011👨🏻‍🚀\n", + "A090912 002 👨🏻‍🚀Jul 27, 2011👨🏻‍🚀\n", + "A090912 003 👨🏻‍🚀Jul 27, 2011👨🏻‍🚀\n", + "A091343 001 👨🏻‍🚀Oct 17, 2013👨🏻‍🚀\n", + "A091343 002 👨🏻‍🚀Oct 17, 2013👨🏻‍🚀\n", + "A091343 003 👨🏻‍🚀Oct 17, 2013👨🏻‍🚀\n", + "A077532 001 👨🏻‍🚀May 07, 2008👨🏻‍🚀\n", + "A077532 002 👨🏻‍🚀May 07, 2008👨🏻‍🚀\n", + "A077532 003 👨🏻‍🚀May 07, 2008👨🏻‍🚀\n", + "A078470 001 👨🏻‍🚀May 07, 2008👨🏻‍🚀\n", + "A078470 002 👨🏻‍🚀May 07, 2008👨🏻‍🚀\n", + "A078470 003 👨🏻‍🚀May 07, 2008👨🏻‍🚀\n", + "N020482 004 👨🏻‍🚀May 29, 1997👨🏻‍🚀\n", + "N020482 001 👨🏻‍🚀Sep 06, 1995👨🏻‍🚀\n", + "N020482 002 👨🏻‍🚀Sep 06, 1995👨🏻‍🚀👨🏻‍🚀\n", + "A075047 001 👨🏻‍🚀Dec 30, 1999👨🏻‍🚀\n", + "A075047 002 👨🏻‍🚀Dec 30, 1999👨🏻‍🚀\n", + "A074288 001 👨🏻‍🚀Apr 24, 1995👨🏻‍🚀\n", + "A074288 002 👨🏻‍🚀Apr 24, 1995👨🏻‍🚀👨🏻‍🚀\n", + "SOLUTION;INTRAVENOUS👨🏻‍🚀\n", + "ACETAMINOPHEN👨🏻‍🚀👨🏻‍🚀\n", + "AP \n", + "AP \n", + "AP \n", + "CUSTOPHARM INC \n", + "SANDOZ INC \n", + "1GM/100ML (10MG/ML) \n", + 
"1GM/100ML (10MG/ML) \n", + "A202605 001 👨🏻‍🚀Jun 13, 2016👨🏻‍🚀\n", + "A204052 001 👨🏻‍🚀Mar 22, 2016👨🏻‍🚀\n", + "OFIRMEV👨🏻‍🚀👨🏻‍🚀\n", + "+! \n", + "MALLINCKRODT HOSP \n", + "ACETAMINOPHEN👨🏻‍🚀👨🏻‍🚀\n", + "1GM/100ML (10MG/ML) \n", + "N022450 001 👨🏻‍🚀Nov 02, 2010👨🏻‍🚀👨🏻‍🚀\n", + "FRESENIUS KABI USA \n", + "1GM/100ML (10MG/ML) \n", + "N204767 001 👨🏻‍🚀Oct 28, 2015👨🏻‍🚀👨🏻‍🚀\n", + "ACETAMINOPHEN; BENZHYDROCODONE HYDROCHLORIDE👨🏻‍🚀👨🏻‍🚀\n", + "TABLET;ORAL👨🏻‍🚀👨🏻‍🚀\n", + "APADAZ👨🏻‍🚀👨🏻‍🚀\n", + "+ \n", + "KEMPHARM \n", + "ACETAMINOPHEN; BUTALBITAL👨🏻‍🚀👨🏻‍🚀\n", + "CAPSULE;ORAL👨🏻‍🚀👨🏻‍🚀\n", + "325MG;EQ 6.12MG BASE \n", + "N208653 001 👨🏻‍🚀Feb 23, 2018👨🏻‍🚀👨🏻‍🚀\n", + "BUTALBITAL AND ACETAMINOPHEN👨🏻‍🚀👨🏻‍🚀\n", + "MAYNE PHARMA INC \n", + "! \n", + "300MG;50MG \n", + "TABLET;ORAL👨🏻‍🚀\n", + "BUTALBITAL AND ACETAMINOPHEN👨🏻‍🚀👨🏻‍🚀\n", + "CNTY LINE PHARMS \n", + "LARKEN LABS INC \n", + "300MG;50MG \n", + "325MG;50MG \n", + "325MG;50MG \n", + "AA \n", + "AA \n", + "AA \n", + "A207313 001 👨🏻‍🚀Dec 27, 2017👨🏻‍🚀👨🏻‍🚀\n", + "A207635 001 👨🏻‍🚀Jun 05, 2017👨🏻‍🚀\n", + "A205120 001 👨🏻‍🚀Oct 30, 2015👨🏻‍🚀\n", + "A203484 002 👨🏻‍🚀Dec 04, 2015👨🏻‍🚀👨🏻‍🚀\n", + "39TH EDITION - 2019 - APPROVED DRUG PRODUCT LIST👨🏻‍🚀👨🏻‍🚀\n", + "PRESCRIPTION DRUG PRODUCT LIST \n", + "['c', 'a', UnknownSymbol: A of type , '👨🏻\\u200d🚀', UnknownSymbol: d of type , UnknownSymbol: a of type , UnknownSymbol: t of type , UnknownSymbol: a of type , '👨🏻\\u200d🚀', UnknownSymbol: s of type , UnknownSymbol: u of type , UnknownSymbol: p of type , UnknownSymbol: p of type , UnknownSymbol: l of type , UnknownSymbol: i of type , UnknownSymbol: e of type , UnknownSymbol: d of type , '👨🏻\\u200d🚀', UnknownSymbol: b of type , UnknownSymbol: y of type , '👨🏻\\u200d🚀', 'a', UnknownSymbol: r of type , UnknownSymbol: u of type , UnknownSymbol: g of type , 'm', UnknownSymbol: a of type , UnknownSymbol: t of type , UnknownSymbol: e of type , UnknownSymbol: n of type , UnknownSymbol: t of type , 't', UnknownSymbol: a of type , UnknownSymbol: t of type , UnknownSymbol: c of 
type , UnknownSymbol: h of type , 'K', UnknownSymbol: c of type , UnknownSymbol: o of type , UnknownSymbol: m of type ]\n", + "['3', '-', '3', 'E', UnknownSymbol: o of type , UnknownSymbol: f of type , '👨🏻\\u200d🚀', 'Q', 'R', 'O', 'F', '👨🏻\\u200d🚀', '👨🏻\\u200d🚀']\n", + "ACETAMINOPHEN; BUTALBITAL👨🏻‍🚀👨🏻‍🚀\n", + "TABLET;ORAL👨🏻‍🚀\n", + "BUTALBITAL AND ACETAMINOPHEN👨🏻‍🚀👨🏻‍🚀\n", + "AA \n", + "AA \n", + "! \n", + "MIKART \n", + "NEXGEN PHARMA \n", + "BUTAPAP👨🏻‍🚀👨🏻‍🚀\n", + "AA \n", + "! \n", + "MIKART \n", + "ALLZITAL👨🏻‍🚀👨🏻‍🚀\n", + "300MG;50MG \n", + "300MG;50MG \n", + "325MG;50MG \n", + "LARKEN LABS INC \n", + "ACETAMINOPHEN; BUTALBITAL; CAFFEINE👨🏻‍🚀👨🏻‍🚀\n", + "325MG;25MG \n", + "CAPSULE;ORAL👨🏻‍🚀\n", + "BUTALBITAL, ACETAMINOPHEN AND CAFFEINE👨🏻‍🚀\n", + "! \n", + "! \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AUROLIFE PHARMA LLC 👨🏻‍🚀325MG;50MG;40MG \n", + "325MG;50MG;40MG \n", + "MAYNE PHARMA INC \n", + "300MG;50MG;40MG \n", + "NEXGEN PHARMA \n", + "300MG;50MG;40MG \n", + "NUVO PHARMS INC \n", + "300MG;50MG;40MG \n", + "WRASER PHARMS LLC \n", + "SOLUTION;ORAL👨🏻‍🚀👨🏻‍🚀\n", + "BUTALBITAL, ACETAMINOPHEN AND CAFFEINE👨🏻‍🚀👨🏻‍🚀\n", + "A207386 001 👨🏻‍🚀Nov 15, 2016👨🏻‍🚀\n", + "A090956 001 👨🏻‍🚀Aug 23, 2011👨🏻‍🚀\n", + "A089987 001 👨🏻‍🚀Oct 26, 1992👨🏻‍🚀👨🏻‍🚀\n", + "A203484 001 👨🏻‍🚀Dec 04, 2015👨🏻‍🚀👨🏻‍🚀\n", + "A204733 001 👨🏻‍🚀Sep 26, 2018👨🏻‍🚀\n", + "A089007 001 👨🏻‍🚀Mar 17, 1986👨🏻‍🚀\n", + "A040885 001 👨🏻‍🚀Nov 16, 2009👨🏻‍🚀\n", + "A207118 001 👨🏻‍🚀Oct 28, 2016👨🏻‍🚀\n", + "A206615 001 👨🏻‍🚀Aug 04, 2017👨🏻‍🚀\n", + "! 
\n", + "MIKART \n", + "TABLET;ORAL👨🏻‍🚀\n", + "325MG/15ML;50MG/15ML;40MG/15ML \n", + "A040387 001 👨🏻‍🚀Jan 31, 2003👨🏻‍🚀👨🏻‍🚀\n", + "BUTALBITAL, ACETAMINOPHEN AND CAFFEINE👨🏻‍🚀\n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "ACETAMINOPHEN; BUTALBITAL; CAFFEINE; CODEINE PHOSPHATE👨🏻‍🚀👨🏻‍🚀\n", + "325MG;50MG;40MG \n", + "ABHAI LLC \n", + "ACTAVIS LABS UT INC 👨🏻‍🚀325MG;50MG;40MG \n", + "325MG;50MG;40MG \n", + "CNTY LINE PHARMS \n", + "325MG;50MG;40MG \n", + "HIKMA PHARMS \n", + "325MG;50MG;40MG \n", + "LANNETT CO INC \n", + "325MG;50MG;40MG \n", + "MIKART \n", + "325MG;50MG;40MG \n", + "NEXGEN PHARMA INC \n", + "325MG;50MG;40MG \n", + "SPECGX LLC \n", + "325MG;50MG;40MG \n", + "VINTAGE PHARMS \n", + "! \n", + "CAPSULE;ORAL👨🏻‍🚀\n", + "BUTALBITAL, ACETAMINOPHEN, CAFFEINE AND CODEINE PHOSPHATE👨🏻‍🚀\n", + "AB \n", + "AB \n", + "NEXGEN PHARMA INC \n", + "VINTAGE PHARMS \n", + "325MG;50MG;40MG;30MG \n", + "325MG;50MG;40MG;30MG \n", + "FIORICET W/ CODEINE👨🏻‍🚀\n", + "AB \n", + "+! 
\n", + "ACTAVIS LABS UT INC 👨🏻‍🚀325MG;50MG;40MG;30MG \n", + "BUTALBITAL, ACETAMINOPHEN, CAFFEINE AND CODEINE PHOSPHATE👨🏻‍🚀👨🏻‍🚀\n", + "A211106 001 👨🏻‍🚀Sep 26, 2018👨🏻‍🚀\n", + "A088616 001 👨🏻‍🚀Nov 09, 1984👨🏻‍🚀\n", + "A204984 001 👨🏻‍🚀Jan 10, 2017👨🏻‍🚀\n", + "A089718 001 👨🏻‍🚀Jun 12, 1995👨🏻‍🚀\n", + "A200243 001 👨🏻‍🚀Sep 13, 2012👨🏻‍🚀\n", + "A089175 001 👨🏻‍🚀Jan 21, 1987👨🏻‍🚀\n", + "A209587 001 👨🏻‍🚀Oct 31, 2018👨🏻‍🚀\n", + "A087804 001 👨🏻‍🚀Jan 24, 1985👨🏻‍🚀\n", + "A040511 001 👨🏻‍🚀Aug 27, 2003👨🏻‍🚀👨🏻‍🚀\n", + "A076560 001 👨🏻‍🚀Jun 10, 2004👨🏻‍🚀\n", + "A075929 001 👨🏻‍🚀Apr 22, 2002👨🏻‍🚀\n", + "N020232 001 👨🏻‍🚀Jul 30, 1992👨🏻‍🚀👨🏻‍🚀\n", + "NEXGEN PHARMA INC \n", + "300MG;50MG;40MG;30MG \n", + "A076560 002 👨🏻‍🚀Jul 19, 2012👨🏻‍🚀👨🏻‍🚀\n", + "ACETAMINOPHEN; CAFFEINE; DIHYDROCODEINE BITARTRATE👨🏻‍🚀👨🏻‍🚀\n", + "CAPSULE;ORAL👨🏻‍🚀👨🏻‍🚀\n", + "TREZIX👨🏻‍🚀👨🏻‍🚀\n", + "WRASER PHARMS LLC \n", + "320.5MG;30MG;16MG \n", + "A204785 001 👨🏻‍🚀Nov 26, 2014👨🏻‍🚀👨🏻‍🚀\n", + "TABLET;ORAL👨🏻‍🚀👨🏻‍🚀\n", + "ACETAMINOPHEN, CAFFEINE AND DIHYDROCODEINE BITARTRATE👨🏻‍🚀👨🏻‍🚀\n", + "LARKEN LABS INC \n", + "325MG;30MG;16MG \n", + "ACETAMINOPHEN; CODEINE PHOSPHATE👨🏻‍🚀👨🏻‍🚀\n", + "SOLUTION;ORAL👨🏻‍🚀\n", + "ACETAMINOPHEN AND CODEINE PHOSPHATE👨🏻‍🚀👨🏻‍🚀\n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "! \n", + "HI TECH PHARMA \n", + "LANNETT CO INC \n", + "MIKART \n", + "PHARM ASSOC \n", + "WOCKHARDT BIO AG \n", + "TABLET;ORAL👨🏻‍🚀\n", + "120MG/5ML;12MG/5ML \n", + "120MG/5ML;12MG/5ML \n", + "120MG/5ML;12MG/5ML \n", + "120MG/5ML;12MG/5ML \n", + "120MG/5ML;12MG/5ML \n", + "ACETAMINOPHEN AND CODEINE PHOSPHATE👨🏻‍🚀👨🏻‍🚀\n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "! \n", + "! 
\n", + "SPECGX LLC \n", + "300MG;30MG \n", + "AMNEAL PHARMS NY \n", + "AUROLIFE PHARMA LLC 👨🏻‍🚀300MG;15MG \n", + "300MG;30MG \n", + "300MG;60MG \n", + "300MG;15MG \n", + "300MG;30MG \n", + "300MG;60MG \n", + "300MG;30MG \n", + "300MG;60MG \n", + "300MG;15MG \n", + "300MG;30MG \n", + "300MG;60MG \n", + "300MG;15MG \n", + "SUN PHARM INDS LTD \n", + "VINTAGE \n", + "TEVA \n", + "A204209 001 👨🏻‍🚀Sep 30, 2016👨🏻‍🚀👨🏻‍🚀\n", + "A040119 001 👨🏻‍🚀Apr 26, 1996👨🏻‍🚀\n", + "A091238 001 👨🏻‍🚀Nov 10, 2011👨🏻‍🚀\n", + "A089450 001 👨🏻‍🚀Oct 27, 1992👨🏻‍🚀\n", + "A087508 001👨🏻‍🚀👨🏻‍🚀\n", + "A087006 001👨🏻‍🚀👨🏻‍🚀\n", + "A040779 001 👨🏻‍🚀May 29, 2008👨🏻‍🚀\n", + "A202800 001 👨🏻‍🚀Apr 15, 2013👨🏻‍🚀\n", + "A202800 002 👨🏻‍🚀Apr 15, 2013👨🏻‍🚀\n", + "A202800 003 👨🏻‍🚀Apr 15, 2013👨🏻‍🚀\n", + "A040419 001 👨🏻‍🚀May 31, 2001👨🏻‍🚀\n", + "A040419 002 👨🏻‍🚀May 31, 2001👨🏻‍🚀\n", + "A040419 003 👨🏻‍🚀May 31, 2001👨🏻‍🚀\n", + "A085868 001👨🏻‍🚀👨🏻‍🚀\n", + "A087083 001👨🏻‍🚀👨🏻‍🚀\n", + "A088627 001 👨🏻‍🚀Mar 06, 1985👨🏻‍🚀\n", + "A088628 001 👨🏻‍🚀Mar 06, 1985👨🏻‍🚀\n", + "A088629 001 👨🏻‍🚀Mar 06, 1985👨🏻‍🚀\n", + "A089990 001 👨🏻‍🚀Sep 30, 1988👨🏻‍🚀👨🏻‍🚀\n", + "39TH EDITION - 2019 - APPROVED DRUG PRODUCT LIST👨🏻‍🚀👨🏻‍🚀\n", + "PRESCRIPTION DRUG PRODUCT LIST \n", + "['c', 'a', UnknownSymbol: A of type , '👨🏻\\u200d🚀', UnknownSymbol: d of type , UnknownSymbol: a of type , UnknownSymbol: t of type , UnknownSymbol: a of type , '👨🏻\\u200d🚀', UnknownSymbol: s of type , UnknownSymbol: u of type , UnknownSymbol: p of type , UnknownSymbol: p of type , UnknownSymbol: l of type , UnknownSymbol: i of type , UnknownSymbol: e of type , UnknownSymbol: d of type , '👨🏻\\u200d🚀', UnknownSymbol: b of type , UnknownSymbol: y of type , '👨🏻\\u200d🚀', 'a', UnknownSymbol: r of type , UnknownSymbol: u of type , UnknownSymbol: g of type , 'm', UnknownSymbol: a of type , UnknownSymbol: t of type , UnknownSymbol: e of type , UnknownSymbol: n of type , UnknownSymbol: t of type , 't', UnknownSymbol: a of type , UnknownSymbol: t of type , UnknownSymbol: c of type , UnknownSymbol: h 
of type , 'K', UnknownSymbol: c of type , UnknownSymbol: o of type , UnknownSymbol: m of type ]\n", + "['3', '-', '4', 'E', UnknownSymbol: o of type , UnknownSymbol: f of type , '👨🏻\\u200d🚀', 'Q', 'R', 'O', 'F', '👨🏻\\u200d🚀', '👨🏻\\u200d🚀']\n", + "ACETAMINOPHEN; CODEINE PHOSPHATE👨🏻‍🚀👨🏻‍🚀\n", + "TABLET;ORAL👨🏻‍🚀\n", + "ACETAMINOPHEN AND CODEINE PHOSPHATE👨🏻‍🚀👨🏻‍🚀\n", + "AA \n", + "AA \n", + "VINTAGE PHARMS \n", + "TYLENOL W/ CODEINE NO. 3👨🏻‍🚀\n", + "300MG;30MG \n", + "300MG;60MG \n", + "AA \n", + "! \n", + "JANSSEN PHARMS \n", + "300MG;30MG \n", + "TYLENOL W/ CODEINE NO. 4👨🏻‍🚀\n", + "AA \n", + "ACETAMINOPHEN; HYDROCODONE BITARTRATE👨🏻‍🚀👨🏻‍🚀\n", + "JANSSEN PHARMS \n", + "300MG;60MG \n", + "SOLUTION;ORAL👨🏻‍🚀\n", + "HYDROCODONE BITARTRATE AND ACETAMINOPHEN👨🏻‍🚀👨🏻‍🚀\n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "GENUS LIFESCIENCES \n", + "MIKART \n", + "PHARM ASSOC \n", + "VISTAPHARM \n", + "MIKART \n", + "PHARM ASSOC \n", + "325MG/15ML;7.5MG/15ML \n", + "325MG/15ML;7.5MG/15ML \n", + "325MG/15ML;7.5MG/15ML \n", + "325MG/15ML;7.5MG/15ML \n", + "300MG/15ML;10MG/15ML \n", + "325MG/15ML;10MG/15ML \n", + "! \n", + "! \n", + "! 
\n", + "TABLET;ORAL👨🏻‍🚀\n", + "ANEXSIA 5/325👨🏻‍🚀\n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "AA \n", + "SPECGX LLC \n", + "ANEXSIA 7.5/325👨🏻‍🚀\n", + "325MG;5MG \n", + "325MG;7.5MG \n", + "HYDROCODONE BITARTRATE AND ACETAMINOPHEN👨🏻‍🚀👨🏻‍🚀\n", + "SPECGX LLC \n", + "ABHAI LLC \n", + "300MG;5MG \n", + "300MG;7.5MG \n", + "300MG;10MG \n", + "325MG;5MG \n", + "325MG;7.5MG \n", + "325MG;10MG \n", + "ACTAVIS LABS FL INC 👨🏻‍🚀300MG;5MG \n", + "ALVOGEN PINE BROOK \n", + "AMNEAL PHARMS \n", + "AMNEAL PHARMS NY \n", + "ASCENT PHARMS INC \n", + "300MG;7.5MG \n", + "300MG;10MG \n", + "300MG;5MG \n", + "300MG;7.5MG \n", + "300MG;10MG \n", + "325MG;2.5MG \n", + "325MG;5MG \n", + "325MG;7.5MG \n", + "325MG;10MG \n", + "300MG;10MG \n", + "300MG;5MG \n", + "325MG;5MG \n", + "325MG;7.5MG \n", + "325MG;10MG \n", + "325MG;2.5MG \n", + "325MG;5MG \n", + "325MG;7.5MG \n", + "325MG;10MG \n", + "AUROLIFE PHARMA LLC 👨🏻‍🚀300MG;5MG \n", + "300MG;7.5MG \n", + "300MG;10MG \n", + "325MG;5MG \n", + "325MG;7.5MG \n", + "325MG;10MG \n", + "325MG;2.5MG \n", + "325MG;5MG \n", + "325MG;7.5MG \n", + "325MG;10MG \n", + "325MG;5MG \n", + "325MG;7.5MG \n", + "325MG;10MG \n", + "300MG;5MG \n", + "300MG;7.5MG \n", + "300MG;10MG \n", + "325MG;5MG \n", + "325MG;7.5MG \n", + "325MG;10MG \n", + "300MG;5MG \n", + "300MG;7.5MG \n", + "300MG;10MG \n", + "325MG;2.5MG \n", + "ELITE LABS INC \n", + "EPIC PHARMA LLC \n", + "LANNETT CO INC \n", + "MIKART \n", + "! \n", + "! \n", + "! \n", + "! 
\n", + "A089805 001 👨🏻‍🚀Sep 30, 1988👨🏻‍🚀\n", + "A089828 001 👨🏻‍🚀Sep 30, 1988👨🏻‍🚀\n", + "A085055 003👨🏻‍🚀👨🏻‍🚀\n", + "A085055 004👨🏻‍🚀👨🏻‍🚀\n", + "A040894 001 👨🏻‍🚀Jul 19, 2011👨🏻‍🚀\n", + "A040482 001 👨🏻‍🚀Sep 25, 2003👨🏻‍🚀\n", + "A040838 001 👨🏻‍🚀May 10, 2013👨🏻‍🚀\n", + "A200343 001 👨🏻‍🚀Jan 25, 2012👨🏻‍🚀👨🏻‍🚀\n", + "A040881 001 👨🏻‍🚀Feb 25, 2010👨🏻‍🚀👨🏻‍🚀\n", + "A040834 001 👨🏻‍🚀Apr 18, 2008👨🏻‍🚀\n", + "A040409 001 👨🏻‍🚀Oct 20, 2000👨🏻‍🚀\n", + "A040405 001 👨🏻‍🚀Sep 08, 2000👨🏻‍🚀\n", + "A209036 001 👨🏻‍🚀Jun 21, 2017👨🏻‍🚀\n", + "A209036 002 👨🏻‍🚀Jun 21, 2017👨🏻‍🚀\n", + "A209036 003 👨🏻‍🚀Jun 21, 2017👨🏻‍🚀\n", + "A209037 001 👨🏻‍🚀Jun 21, 2017👨🏻‍🚀\n", + "A209037 002 👨🏻‍🚀Jun 21, 2017👨🏻‍🚀\n", + "A209037 003 👨🏻‍🚀Jun 21, 2017👨🏻‍🚀\n", + "A206470 001 👨🏻‍🚀Jun 02, 2016👨🏻‍🚀\n", + "A206470 002 👨🏻‍🚀Jun 02, 2016👨🏻‍🚀\n", + "A206470 003 👨🏻‍🚀Jun 02, 2016👨🏻‍🚀\n", + "A208540 001 👨🏻‍🚀Nov 08, 2018👨🏻‍🚀\n", + "A208540 002 👨🏻‍🚀Nov 08, 2018👨🏻‍🚀\n", + "A208540 003 👨🏻‍🚀Nov 08, 2018👨🏻‍🚀\n", + "A209958 001 👨🏻‍🚀Oct 24, 2018👨🏻‍🚀\n", + "A209958 002 👨🏻‍🚀Oct 24, 2018👨🏻‍🚀\n", + "A209958 003 👨🏻‍🚀Oct 24, 2018👨🏻‍🚀\n", + "A209958 004 👨🏻‍🚀Oct 24, 2018👨🏻‍🚀\n", + "A207137 001 👨🏻‍🚀Nov 29, 2016👨🏻‍🚀\n", + "A206869 001 👨🏻‍🚀Jun 23, 2017👨🏻‍🚀\n", + "A040736 001 👨🏻‍🚀Aug 25, 2006👨🏻‍🚀\n", + "A040746 002 👨🏻‍🚀May 10, 2016👨🏻‍🚀\n", + "A040746 001 👨🏻‍🚀Aug 25, 2006👨🏻‍🚀\n", + "A211487 001 👨🏻‍🚀Nov 07, 2018👨🏻‍🚀\n", + "A211487 002 👨🏻‍🚀Nov 07, 2018👨🏻‍🚀\n", + "A211487 003 👨🏻‍🚀Nov 07, 2018👨🏻‍🚀\n", + "A211487 004 👨🏻‍🚀Nov 07, 2018👨🏻‍🚀\n", + "A207709 001 👨🏻‍🚀Sep 13, 2018👨🏻‍🚀\n", + "A207709 002 👨🏻‍🚀Sep 13, 2018👨🏻‍🚀\n", + "A207709 003 👨🏻‍🚀Sep 13, 2018👨🏻‍🚀\n", + "A201013 001 👨🏻‍🚀Apr 11, 2012👨🏻‍🚀\n", + "A201013 002 👨🏻‍🚀Apr 11, 2012👨🏻‍🚀\n", + "A201013 003 👨🏻‍🚀Apr 11, 2012👨🏻‍🚀\n", + "A209924 001 👨🏻‍🚀Nov 16, 2018👨🏻‍🚀\n", + "A209924 002 👨🏻‍🚀Nov 16, 2018👨🏻‍🚀\n", + "A209924 003 👨🏻‍🚀Nov 16, 2018👨🏻‍🚀\n", + "A209924 004 👨🏻‍🚀Nov 16, 2018👨🏻‍🚀\n", + "A203863 001 👨🏻‍🚀Mar 30, 2018👨🏻‍🚀\n", + "A203863 002 👨🏻‍🚀Mar 30, 2018👨🏻‍🚀\n", + "A203863 003 👨🏻‍🚀Mar 30, 2018👨🏻‍🚀\n", + "A207171 
001 👨🏻‍🚀Jun 20, 2017👨🏻‍🚀\n", + "A207171 002 👨🏻‍🚀Jun 20, 2017👨🏻‍🚀\n", + "A207171 003 👨🏻‍🚀Jun 20, 2017👨🏻‍🚀\n", + "A207172 001 👨🏻‍🚀Jun 22, 2017👨🏻‍🚀\n", + "A207172 002 👨🏻‍🚀Jun 22, 2017👨🏻‍🚀\n", + "A207172 003 👨🏻‍🚀Jun 22, 2017👨🏻‍🚀\n", + "A040658 001 👨🏻‍🚀Jan 19, 2006👨🏻‍🚀\n", + "A040658 002 👨🏻‍🚀Mar 24, 2006👨🏻‍🚀\n", + "A040658 003 👨🏻‍🚀Jun 23, 2004👨🏻‍🚀\n", + "A040846 001 👨🏻‍🚀Jun 09, 2010👨🏻‍🚀👨🏻‍🚀\n" + ] + } + ], + "source": [ + "ob2020.check_stream(pdftotext_lines)" + ] + }, + { + "cell_type": "markdown", + "id": "e899245f-b901-4108-bfab-6ab7b329e8d6", + "metadata": {}, + "source": [ + "current thought: Walk the object tree of the pdf, extracting and converting text while tracking where it is physically." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4543148b-4f2f-47f6-a13c-13f21dfd4a7c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Orangebook/download.url.txt b/Orangebook/download.url.txt new file mode 100644 index 0000000..c54344c --- /dev/null +++ b/Orangebook/download.url.txt @@ -0,0 +1 @@ +https://www.fda.gov/media/76860/download diff --git a/Orangebook/testing-pyocr.ipynb b/Orangebook/testing-pyocr.ipynb new file mode 100644 index 0000000..a1e4b15 --- /dev/null +++ b/Orangebook/testing-pyocr.ipynb @@ -0,0 +1,145 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 4, + "id": "51bf48a1-920a-4e64-ac5f-323ff3a27ebf", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Will use tool 'Tesseract (sh)'\n", + "Available languages: eng, osd\n", + 
"Will use language 'eng'\n" + ] + } + ], + "source": [ + "# Import the required libraries\n", + "from wand.image import Image\n", + "from PIL import Image as PI\n", + "import pyocr\n", + "import pyocr.builders\n", + "import io, sys\n", + "\n", + "\n", + "# Get the handle of the OCR library (in this case, tesseract)\n", + "tools = pyocr.get_available_tools()\n", + "if len(tools) == 0:\n", + "\tprint(\"No OCR tool found!\")\n", + "\tsys.exit(1)\n", + "tool = tools[0]\n", + "print(\"Will use tool '%s'\" % (tool.get_name()))\n", + "\n", + "# Get the language\n", + "langs = tool.get_available_languages()\n", + "print(\"Available languages: %s\" % \", \".join(langs)) \n", + "lang = langs[0] # For English\n", + "print(\"Will use language '%s'\" % (lang))\n", + "\n", + "# Setup two lists which will be used to hold our images and final_text\n", + "req_image = []\n", + "final_text = []\n", + "\n", + "# Open the PDF file using wand and convert it to jpeg\n", + "image_pdf = Image(filename=\"/home/will/research/ClinicalTrialsDataProcessing/Orangebook/Orangebooks/testprint.pdf\", resolution=300)\n", + "image_jpeg = image_pdf.convert('pdf')\n", + "\n", + "# wand has converted all the separate pages in the PDF into separate image\n", + "# blobs. 
We can loop over them and append them as a blob into the req_image\n", + "# list.\n", + "for img in image_jpeg.sequence:\n", + "\timg_page = Image(image=img)\n", + "\treq_image.append(img_page.make_blob('jpeg'))\n", + "\n", + "# Now we just need to run OCR over the image blobs and store all of the \n", + "# recognized text in final_text.\n", + "for img in req_image:\n", + "\ttxt = tool.image_to_string(\n", + "\t\tPI.open(io.BytesIO(img)),\n", + "\t\tlang=lang,\n", + "\t\tbuilder=pyocr.builders.TextBuilder()\n", + "\t)\n", + "\tfinal_text.append(txt)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f0d5f1d6-7e15-4ee6-b4ee-cbd41c5afb99", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "The final text is: \n", + "\n", + "40TH EDITION - 2020 - APPROVED DRUG PRODUCT LIST\n", + "\n", + "PRESCRIPTION DRUG PRODUCT LIST\n", + "\n", + "ABACAVIR SULFATE\n", + "SOLUTION; ORAL\n", + "ABACAVIR SULFATE\n", + "\n", + "EQ 2 5 /ML\n", + "\n", + "EQ 2 Ee /ML\n", + "\n", + "EQ 300MG BASE\n", + "EQ 300MG BASE\n", + "EQ 300MG BASE\n", + "\n", + "\n" + ] + } + ], + "source": [ + "print(\"\\nThe final text is: \\n\")\n", + "print(final_text[0][0:200])" + ] + }, + { + "cell_type": "markdown", + "id": "1cac17e7-079d-4e32-bdbf-ae49194b2078", + "metadata": {}, + "source": [ + "It appears that this does not have the required precision. I'll need to do this some other way."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2283e290-fab3-4cda-8ce9-55a0b3533c98", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}