ClinicalTrialsDataProcessing/Orangebook/testing-pyocr.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "51bf48a1-920a-4e64-ac5f-323ff3a27ebf",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Will use tool 'Tesseract (sh)'\n",
      "Available languages: eng, osd\n",
      "Will use language 'eng'\n"
     ]
    }
   ],
   "source": [
    "# Import the required libraries\n",
    "from wand.image import Image\n",
    "from PIL import Image as PI\n",
    "import pyocr\n",
    "import pyocr.builders\n",
    "import io, sys\n",
    "\n",
    "\n",
    "# Get the handle of the OCR library (in this case, tesseract)\n",
    "tools = pyocr.get_available_tools()\n",
    "if len(tools) == 0:\n",
    "\tprint(\"No OCR tool found!\")\n",
    "\tsys.exit(1)\n",
    "tool = tools[0]\n",
    "print(\"Will use tool '%s'\" % (tool.get_name()))\n",
    "\n",
    "# Get the language\n",
    "langs = tool.get_available_languages()\n",
    "print(\"Available languages: %s\" % \", \".join(langs)) \n",
    "lang = langs[0] # For English\n",
    "print(\"Will use language '%s'\" % (lang))\n",
    "\n",
    "# Setup two lists which will be used to hold our images and final_text\n",
    "req_image = []\n",
    "final_text = []\n",
    "\n",
    "# Open the PDF file using wand and convert it to jpeg\n",
    "image_pdf = Image(filename=\"/home/will/research/ClinicalTrialsDataProcessing/Orangebook/Orangebooks/testprint.pdf\", resolution=300)\n",
    "image_jpeg = image_pdf.convert('pdf')\n",
    "\n",
    "# wand has converted all the separate pages in the PDF into separate image\n",
    "# blobs. We can loop over them and append them as a blob into the req_image\n",
    "# list.\n",
    "for img in image_jpeg.sequence:\n",
    "\timg_page = Image(image=img)\n",
    "\treq_image.append(img_page.make_blob('jpeg'))\n",
    "\n",
    "# Now we just need to run OCR over the image blobs and store all of the \n",
    "# recognized text in final_text.\n",
    "for img in req_image:\n",
    "\ttxt = tool.image_to_string(\n",
    "\t\tPI.open(io.BytesIO(img)),\n",
    "\t\tlang=lang,\n",
    "\t\tbuilder=pyocr.builders.TextBuilder()\n",
    "\t)\n",
    "\tfinal_text.append(txt)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "f0d5f1d6-7e15-4ee6-b4ee-cbd41c5afb99",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "The final text is: \n",
      "\n",
      "40TH EDITION - 2020 - APPROVED DRUG PRODUCT LIST\n",
      "\n",
      "PRESCRIPTION DRUG PRODUCT LIST\n",
      "\n",
      "ABACAVIR SULFATE\n",
      "SOLUTION; ORAL\n",
      "ABACAVIR SULFATE\n",
      "\n",
      "EQ 2 5 /ML\n",
      "\n",
      "EQ 2 Ee /ML\n",
      "\n",
      "EQ 300MG BASE\n",
      "EQ 300MG BASE\n",
      "EQ 300MG BASE\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(\"\\nThe final text is: \\n\")\n",
    "print(final_text[0][0:200])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1cac17e7-079d-4e32-bdbf-ae49194b2078",
   "metadata": {},
   "source": [
    "it appears taht this does not have the required precision. I'll need to do this some other way."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2283e290-fab3-4cda-8ce9-55a0b3533c98",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}