Adding attempts at Orange Book processing. The CID parser is included in case I need to parse the PDFs.

4 years ago · 5a1f744449
parent e97434f70a
commit 5a1f744449
6 changed files with 1892 additions and 0 deletions
--- a/Orangebook/CidParser.py
+++ b/Orangebook/CidParser.py
@ -0,0 +1,93 @@
+# Adobe PDF Character ID (cid:\d+) parser
+# The purpose is to allow someone to create their own table equivalent to the "/ToUnicode" mapping that
+# should be provided in every PDF using cid's (but is often mangled).
+
+def get_digits(string):
+    """
+    Extract leading the digits from a cid tag.
+    """
+    splat = string.split(")")
+    num = splat[0]
+    l = len(num)
+    return int(num),l
+
+def token_generator(string):
+    """
+    An iterable that returns tokens describing a string in a pdf.
+    Tokens take two forms:
+        - Integers: these represend CID codes
+        - Characters: these represent the arbitrary characters often returned amidst cid's.
+    
+    It is a python generator becasue that simplifies the ordering and allows us to avoid recursion.
+    """
+    start = 0
+    str_len = len(string)
+    
+    while start < str_len:
+        substring = string[start:]
+        
+        #check for cid
+        if (str_len - start > 6) and (substring[0:5] == "(cid:"):
+            
+            num,length = get_digits(substring[5:])
+            start += length + 6
+            yield num
+            
+        elif (str_len - start > 1):
+            start += 1
+            yield substring[0]
+        else:
+            start += 1
+            yield substring
+
+
+class UnknownSymbol():
+    """
+    Represents a token that is not in the parser's dictionary.
+    """
+    def __init__(self, symbol):
+        self.symbol = symbol
+        
+    def __repr__(self):
+        return "UnknownSymbol: {} of type {}".format(self.symbol, type(self.symbol))
+    
+    def __str__(self):
+        return "\uFFFD"
+
+class Parser:
+    """
+    Translates from tokens to character arrays or strings, handling errors as it goes.
+    
+    It requires a dictionary during instantiation. 
+    This dictionary is what is used to perform lookups.
+    
+    It exposes 3 methods
+        - convert attempts to convert a single token
+        - convert_stream will try to convert an iterable of tokens into an iterable of text.
+        - check_list_of_strings will try to convert a list of strings containing cids and other symbols into 
+            - strings, if there are no Unknown symbols.
+            - lists, containing characters and Unknown symbols.
+    """
+    def __init__(self, lookup_table):
+        self._lookup_table = lookup_table
+        
+    def convert(self,token):
+        try:
+            return self._lookup_table[token]
+        except:
+            return  UnknownSymbol(token)
+    
+    def convert_list(self,token_stream):
+        for token in token_stream:
+            yield self.convert(token)
+    
+    def convert_list_of_strings(self, list_of_strings):
+        for token_stream in list_of_stings:
+            arr = [x for x in ob2020.convert_stream(token_generator(token_stream))]
+            try:
+                print("".join(arr))
+            except:
+                print(arr)
+                
+if __name__ == "__main__":
+    print("Plan was to accept and proceess a symbol table and text. Apparently it has not been implemented."
--- a/Orangebook/Untitled.ipynb
+++ b/Orangebook/Untitled.ipynb
@ -0,0 +1,371 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "40358f02-c376-4431-be39-cdd477f17e7a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import polars as pl"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "8fb27ee2-72c1-4e80-9d00-de54f2834fe8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "polars.datatypes.Datetime"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pl.datatypes.Datetime"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "id": "2c0edd77-c2d0-4184-a094-8c01783d2f0e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "products = pl.scan_csv(file=\"./EOBZIP_2022_04/products.txt\", sep=\"~\")\n",
+    "patents = pl.scan_csv(file=\"./EOBZIP_2022_04/patent.txt\", sep=\"~\")\n",
+    "exclusivity = pl.scan_csv(file=\"./EOBZIP_2022_04/exclusivity.txt\", sep=\"~\", parse_dates=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "id": "023f211d-23aa-4a2c-843d-1b60cec91079",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def set_exclusivity_types(df):\n",
+    "    return df.with_columns([\n",
+    "        pl.col(\"Exclusivity_Date\").str.strptime(pl.Date, fmt=\"%b %-d, %Y\")\n",
+    "    ])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "id": "a1da42c9-e47a-4437-b089-e9b91f789a0c",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1 \"class=\"dataframe \">\n",
+       "<thead>\n",
+       "<tr>\n",
+       "<th>\n",
+       "Appl_Type\n",
+       "</th>\n",
+       "<th>\n",
+       "Appl_No\n",
+       "</th>\n",
+       "<th>\n",
+       "Product_No\n",
+       "</th>\n",
+       "<th>\n",
+       "Exclusivity_Code\n",
+       "</th>\n",
+       "<th>\n",
+       "Exclusivity_Date\n",
+       "</th>\n",
+       "</tr>\n",
+       "<tr>\n",
+       "<td>\n",
+       "str\n",
+       "</td>\n",
+       "<td>\n",
+       "i64\n",
+       "</td>\n",
+       "<td>\n",
+       "i64\n",
+       "</td>\n",
+       "<td>\n",
+       "str\n",
+       "</td>\n",
+       "<td>\n",
+       "date\n",
+       "</td>\n",
+       "</tr>\n",
+       "</thead>\n",
+       "<tbody>\n",
+       "<tr>\n",
+       "<td>\n",
+       "\"N\"\n",
+       "</td>\n",
+       "<td>\n",
+       "11366\n",
+       "</td>\n",
+       "<td>\n",
+       "2\n",
+       "</td>\n",
+       "<td>\n",
+       "\"ODE-96\"\n",
+       "</td>\n",
+       "<td>\n",
+       "2022-08-07\n",
+       "</td>\n",
+       "</tr>\n",
+       "<tr>\n",
+       "<td>\n",
+       "\"N\"\n",
+       "</td>\n",
+       "<td>\n",
+       "20287\n",
+       "</td>\n",
+       "<td>\n",
+       "11\n",
+       "</td>\n",
+       "<td>\n",
+       "\"NPP\"\n",
+       "</td>\n",
+       "<td>\n",
+       "2022-05-16\n",
+       "</td>\n",
+       "</tr>\n",
+       "<tr>\n",
+       "<td>\n",
+       "\"N\"\n",
+       "</td>\n",
+       "<td>\n",
+       "20287\n",
+       "</td>\n",
+       "<td>\n",
+       "10\n",
+       "</td>\n",
+       "<td>\n",
+       "\"NPP\"\n",
+       "</td>\n",
+       "<td>\n",
+       "2022-05-16\n",
+       "</td>\n",
+       "</tr>\n",
+       "<tr>\n",
+       "<td>\n",
+       "\"N\"\n",
+       "</td>\n",
+       "<td>\n",
+       "20287\n",
+       "</td>\n",
+       "<td>\n",
+       "9\n",
+       "</td>\n",
+       "<td>\n",
+       "\"NPP\"\n",
+       "</td>\n",
+       "<td>\n",
+       "2022-05-16\n",
+       "</td>\n",
+       "</tr>\n",
+       "<tr>\n",
+       "<td>\n",
+       "\"N\"\n",
+       "</td>\n",
+       "<td>\n",
+       "20287\n",
+       "</td>\n",
+       "<td>\n",
+       "8\n",
+       "</td>\n",
+       "<td>\n",
+       "\"NPP\"\n",
+       "</td>\n",
+       "<td>\n",
+       "2022-05-16\n",
+       "</td>\n",
+       "</tr>\n",
+       "</tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "shape: (5, 5)\n",
+       "┌───────────┬─────────┬────────────┬──────────────────┬──────────────────┐\n",
+       "│ Appl_Type ┆ Appl_No ┆ Product_No ┆ Exclusivity_Code ┆ Exclusivity_Date │\n",
+       "│ ---       ┆ ---     ┆ ---        ┆ ---              ┆ ---              │\n",
+       "│ str       ┆ i64     ┆ i64        ┆ str              ┆ date             │\n",
+       "╞═══════════╪═════════╪════════════╪══════════════════╪══════════════════╡\n",
+       "│ N         ┆ 11366   ┆ 2          ┆ ODE-96           ┆ 2022-08-07       │\n",
+       "├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n",
+       "│ N         ┆ 20287   ┆ 11         ┆ NPP              ┆ 2022-05-16       │\n",
+       "├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n",
+       "│ N         ┆ 20287   ┆ 10         ┆ NPP              ┆ 2022-05-16       │\n",
+       "├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n",
+       "│ N         ┆ 20287   ┆ 9          ┆ NPP              ┆ 2022-05-16       │\n",
+       "├╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤\n",
+       "│ N         ┆ 20287   ┆ 8          ┆ NPP              ┆ 2022-05-16       │\n",
+       "└───────────┴─────────┴────────────┴──────────────────┴──────────────────┘"
+      ]
+     },
+     "execution_count": 61,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "exclusivity.pipe(set_exclusivity_types).head(5).collect()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 85,
+   "id": "92fe99fa-1963-460c-99ea-7f614b4b2e25",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def set_patent_types(df):\n",
+    "    return df.with_columns([\n",
+    "        pl.col(\"Patent_Expire_Date_Text\").str.strptime(pl.Date, fmt=\"%b %-d, %Y\"),\n",
+    "        pl.col(\"Submission_Date\").str.strptime(pl.Date, fmt=\"%b %-d, %Y\"),\n",
+    "        pl.col(\"Drug_Substance_Flag\") == \"Y\",\n",
+    "        pl.col(\"Drug_Product_Flag\") == \"Y\",\n",
+    "        pl.col(\"Delist_Flag\") == \"Y\"\n",
+    "    ])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "id": "13707ca6-094f-4ed7-94cb-824087e97874",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1 \"class=\"dataframe \">\n",
+       "<thead>\n",
+       "<tr>\n",
+       "<th>\n",
+       "Patent_Expire_Date_Text\n",
+       "</th>\n",
+       "</tr>\n",
+       "<tr>\n",
+       "<td>\n",
+       "date\n",
+       "</td>\n",
+       "</tr>\n",
+       "</thead>\n",
+       "<tbody>\n",
+       "<tr>\n",
+       "<td>\n",
+       "2022-01-02\n",
+       "</td>\n",
+       "</tr>\n",
+       "</tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "shape: (1, 1)\n",
+       "┌─────────────────────────┐\n",
+       "│ Patent_Expire_Date_Text │\n",
+       "│ ---                     │\n",
+       "│ date                    │\n",
+       "╞═════════════════════════╡\n",
+       "│ 2022-01-02              │\n",
+       "└─────────────────────────┘"
+      ]
+     },
+     "execution_count": 90,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "patents.pipe(set_patent_types).select(\"Patent_Expire_Date_Text\").min().collect()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 81,
+   "id": "18ad8df7-45d5-4454-8955-c5f28a7d7f1e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "polars.datatypes.Null"
+      ]
+     },
+     "execution_count": 81,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pl.datatypes.Null"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "79e4b3d9-29ae-4302-bee1-4be02e0ba654",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/Orangebook/Untitled2.ipynb
+++ b/Orangebook/Untitled2.ipynb
@ -0,0 +1,216 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 196,
+   "id": "2f61df31-f3c1-4b2e-ae96-96bf06089b17",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_digits(string):\n",
+    "    splat = string.split(\")\")\n",
+    "    num = splat[0]\n",
+    "    l = len(num)\n",
+    "    return int(num),l\n",
+    "\n",
+    "def token_generator(string):\n",
+    "    \n",
+    "    start = 0\n",
+    "    str_len = len(string)\n",
+    "    \n",
+    "    \n",
+    "    while start < str_len:\n",
+    "        substring = string[start:]\n",
+    "        \n",
+    "        #check for cid\n",
+    "        if (str_len - start > 6) and (substring[0:5] == \"(cid:\"):\n",
+    "            \n",
+    "            num,length = get_digits(substring[5:])\n",
+    "            start += length + 6\n",
+    "            yield num\n",
+    "            \n",
+    "        elif (str_len - start > 1):\n",
+    "            start += 1\n",
+    "            yield substring[0]\n",
+    "        else:\n",
+    "            start += 1\n",
+    "            yield substring\n",
+    "\n",
+    "class UnknownSymbol():\n",
+    "    def __init__(self, symbol):\n",
+    "        self.symbol = symbol\n",
+    "        \n",
+    "    def __repr__(self):\n",
+    "        return \"UnknownSymbol: {} of type {}\".format(self.symbol, type(self.symbol))\n",
+    "    \n",
+    "    def __str__(self):\n",
+    "        return \"\\uFFFD\"\n",
+    "\n",
+    "class Parser:\n",
+    "    def __init__(self, lookup_table):\n",
+    "        self._lookup_table = lookup_table\n",
+    "        \n",
+    "    def convert(self,symbol):\n",
+    "        try:\n",
+    "            return self._lookup_table[symbol]\n",
+    "        except:\n",
+    "            return  UnknownSymbol(symbol)\n",
+    "    \n",
+    "    def convert_stream(self,token_stream):\n",
+    "        for token in token_stream:\n",
+    "            yield self.convert(token)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 213,
+   "id": "e2c1e39b-0ac5-4ad7-9176-ef8ea69feeec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ob2020 = Parser({\n",
+    "    23:\"4\"\n",
+    "    ,19:\"0\"\n",
+    "    ,\"7\":\"T\"\n",
+    "    ,\"+\":\"H\"\n",
+    "    ,3:\" \"\n",
+    "    ,\"(\":\"E\"\n",
+    "    ,\"’\":\"D\"\n",
+    "    ,\",\":\"I\"\n",
+    "    ,\"2\":\"O\"\n",
+    "    ,\"1\":\"N\"\n",
+    "    ,16:\"-\"\n",
+    "    ,21:\"2\"\n",
+    "    ,\"$\":\"A\"\n",
+    "    ,\"3\":\"P\"\n",
+    "    ,\"5\":\"R\"\n",
+    "    ,\"9\":\"V\"\n",
+    "    ,\"8\":\"U\"\n",
+    "    ,\"*\":\"G\"\n",
+    "    ,\"&\":\"C\"\n",
+    "    ,\"/\":\"L\"\n",
+    "    ,\"6\":\"S\"\n",
+    "    ,22:\"3\"\n",
+    "    ,20:\"1\"\n",
+    "    ,11:\" \"\n",
+    "    ,\"R\":\"(\"\n",
+    "    ,\"I\":\"of\"\n",
+    "    ,24:\"5\"\n",
+    "    ,12:\")\"\n",
+    "})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 214,
+   "id": "c02896ab-fc75-44cc-bb27-a5dcf1b6d7f0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'40TH EDITION - 2020 - APPROVED DRUG PRODUCT LIST'"
+      ]
+     },
+     "execution_count": 214,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "\"\".join([x for x in ob2020.convert_stream(token_generator(b))])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 215,
+   "id": "1794e826-fa1f-4aba-8eab-6d603a06dfe0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "c = \"35(6&5,37,21(cid:3)’58*(cid:3)352’8&7(cid:3)/,67(cid:3)\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 216,
+   "id": "c8b67d79-81ad-4b3a-be8b-bace9a8d943a",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'PRESCRIPTION DRUG PRODUCT LIST '"
+      ]
+     },
+     "execution_count": 216,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "\"\".join([x for x in ob2020.convert_stream(token_generator(c))])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 217,
+   "id": "f76d9760-fe69-4743-ab47-41cf74866d70",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "d = \"(cid:22)(cid:16)(cid:20)(cid:11)RI(cid:3)(cid:23)(cid:24)(cid:22)(cid:12)\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 218,
+   "id": "ee925997-0701-4d8e-b713-6f39c6a50a5b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['3', '-', '1', ' ', '(', 'of', ' ', '4', '5', '3', ')']"
+      ]
+     },
+     "execution_count": 218,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "[x for x in ob2020.convert_stream(token_generator(d))]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ae152c75-5fd6-4756-a473-fcea2de5ee30",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/Orangebook/cid_parser.ipynb
+++ b/Orangebook/cid_parser.ipynb
--- a/Orangebook/download.url.txt
+++ b/Orangebook/download.url.txt
@ -0,0 +1 @@
+https://www.fda.gov/media/76860/download
--- a/Orangebook/testing-pyocr.ipynb
+++ b/Orangebook/testing-pyocr.ipynb
@ -0,0 +1,145 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "51bf48a1-920a-4e64-ac5f-323ff3a27ebf",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Will use tool 'Tesseract (sh)'\n",
+      "Available languages: eng, osd\n",
+      "Will use language 'eng'\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Import the required libraries\n",
+    "from wand.image import Image\n",
+    "from PIL import Image as PI\n",
+    "import pyocr\n",
+    "import pyocr.builders\n",
+    "import io, sys\n",
+    "\n",
+    "\n",
+    "# Get the handle of the OCR library (in this case, tesseract)\n",
+    "tools = pyocr.get_available_tools()\n",
+    "if len(tools) == 0:\n",
+    "\tprint(\"No OCR tool found!\")\n",
+    "\tsys.exit(1)\n",
+    "tool = tools[0]\n",
+    "print(\"Will use tool '%s'\" % (tool.get_name()))\n",
+    "\n",
+    "# Get the language\n",
+    "langs = tool.get_available_languages()\n",
+    "print(\"Available languages: %s\" % \", \".join(langs)) \n",
+    "lang = langs[0] # For English\n",
+    "print(\"Will use language '%s'\" % (lang))\n",
+    "\n",
+    "# Setup two lists which will be used to hold our images and final_text\n",
+    "req_image = []\n",
+    "final_text = []\n",
+    "\n",
+    "# Open the PDF file using wand and convert it to jpeg\n",
+    "image_pdf = Image(filename=\"/home/will/research/ClinicalTrialsDataProcessing/Orangebook/Orangebooks/testprint.pdf\", resolution=300)\n",
+    "image_jpeg = image_pdf.convert('pdf')\n",
+    "\n",
+    "# wand has converted all the separate pages in the PDF into separate image\n",
+    "# blobs. We can loop over them and append them as a blob into the req_image\n",
+    "# list.\n",
+    "for img in image_jpeg.sequence:\n",
+    "\timg_page = Image(image=img)\n",
+    "\treq_image.append(img_page.make_blob('jpeg'))\n",
+    "\n",
+    "# Now we just need to run OCR over the image blobs and store all of the \n",
+    "# recognized text in final_text.\n",
+    "for img in req_image:\n",
+    "\ttxt = tool.image_to_string(\n",
+    "\t\tPI.open(io.BytesIO(img)),\n",
+    "\t\tlang=lang,\n",
+    "\t\tbuilder=pyocr.builders.TextBuilder()\n",
+    "\t)\n",
+    "\tfinal_text.append(txt)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "f0d5f1d6-7e15-4ee6-b4ee-cbd41c5afb99",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "The final text is: \n",
+      "\n",
+      "40TH EDITION - 2020 - APPROVED DRUG PRODUCT LIST\n",
+      "\n",
+      "PRESCRIPTION DRUG PRODUCT LIST\n",
+      "\n",
+      "ABACAVIR SULFATE\n",
+      "SOLUTION; ORAL\n",
+      "ABACAVIR SULFATE\n",
+      "\n",
+      "EQ 2 5 /ML\n",
+      "\n",
+      "EQ 2 Ee /ML\n",
+      "\n",
+      "EQ 300MG BASE\n",
+      "EQ 300MG BASE\n",
+      "EQ 300MG BASE\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"\\nThe final text is: \\n\")\n",
+    "print(final_text[0][0:200])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1cac17e7-079d-4e32-bdbf-ae49194b2078",
+   "metadata": {},
+   "source": [
+    "It appears that this does not have the required precision. I'll need to do this some other way."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2283e290-fab3-4cda-8ce9-55a0b3533c98",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}