{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "51bf48a1-920a-4e64-ac5f-323ff3a27ebf",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Will use tool 'Tesseract (sh)'\n",
      "Available languages: eng, osd\n",
      "Will use language 'eng'\n"
     ]
    }
   ],
   "source": [
    "# Import the required libraries\n",
    "from wand.image import Image\n",
    "from PIL import Image as PI\n",
    "import pyocr\n",
    "import pyocr.builders\n",
    "import io, sys\n",
    "\n",
    "# Configuration: the PDF to run OCR on and the resolution (DPI) at which\n",
    "# wand rasterises its pages. Edit these two values to process another file.\n",
    "PDF_PATH = \"/home/will/research/ClinicalTrialsDataProcessing/Orangebook/Orangebooks/testprint.pdf\"\n",
    "RESOLUTION = 300\n",
    "\n",
    "# Get the handle of the OCR library (in this case, tesseract)\n",
    "tools = pyocr.get_available_tools()\n",
    "if not tools:\n",
    "    print(\"No OCR tool found!\")\n",
    "    sys.exit(1)\n",
    "tool = tools[0]\n",
    "print(\"Will use tool '%s'\" % (tool.get_name()))\n",
    "\n",
    "# Get the language. Ask for English by name when it is installed instead of\n",
    "# relying on the order in which the tool lists its languages; otherwise fall\n",
    "# back to the first language reported.\n",
    "langs = tool.get_available_languages()\n",
    "print(\"Available languages: %s\" % \", \".join(langs))\n",
    "lang = \"eng\" if \"eng\" in langs else langs[0]\n",
    "print(\"Will use language '%s'\" % (lang))\n",
    "\n",
    "# Setup two lists which will be used to hold our images and final_text\n",
    "req_image = []\n",
    "final_text = []\n",
    "\n",
    "# Open the PDF file using wand and convert it to jpeg. Every wand image is\n",
    "# opened in a `with` block so it is closed as soon as the page blobs have\n",
    "# been extracted.\n",
    "with Image(filename=PDF_PATH, resolution=RESOLUTION) as image_pdf:\n",
    "    with image_pdf.convert('jpeg') as image_jpeg:\n",
    "        # wand exposes each page of the PDF as a separate entry of\n",
    "        # `sequence`. Loop over the pages and append each one, encoded as a\n",
    "        # JPEG blob, to the req_image list.\n",
    "        for page in image_jpeg.sequence:\n",
    "            with Image(image=page) as img_page:\n",
    "                req_image.append(img_page.make_blob('jpeg'))\n",
    "\n",
    "# Now we just need to run OCR over the image blobs and store all of the\n",
    "# recognized text in final_text (one string per page, in page order).\n",
    "for page_blob in req_image:\n",
    "    txt = tool.image_to_string(\n",
    "        PI.open(io.BytesIO(page_blob)),\n",
    "        lang=lang,\n",
    "        builder=pyocr.builders.TextBuilder()\n",
    "    )\n",
    "    final_text.append(txt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "f0d5f1d6-7e15-4ee6-b4ee-cbd41c5afb99",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "The final text is: \n",
      "\n",
      "40TH EDITION - 2020 - APPROVED DRUG PRODUCT LIST\n",
      "\n",
      "PRESCRIPTION DRUG PRODUCT LIST\n",
      "\n",
      "ABACAVIR SULFATE\n",
      "SOLUTION; ORAL\n",
      "ABACAVIR SULFATE\n",
      "\n",
      "EQ 2 5 /ML\n",
      "\n",
      "EQ 2 Ee /ML\n",
      "\n",
      "EQ 300MG BASE\n",
      "EQ 300MG BASE\n",
      "EQ 300MG BASE\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(\"\\nThe final text is: \\n\")\n",
    "print(final_text[0][0:200])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1cac17e7-079d-4e32-bdbf-ae49194b2078",
   "metadata": {},
   "source": [
    "It appears that this does not have the required precision. I'll need to do this some other way."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2283e290-fab3-4cda-8ce9-55a0b3533c98",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}