You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ClinicalTrialsDataProcessing/Orangebook/testing-pyocr.ipynb

146 lines
3.8 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"id": "51bf48a1-920a-4e64-ac5f-323ff3a27ebf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Will use tool 'Tesseract (sh)'\n",
"Available languages: eng, osd\n",
"Will use language 'eng'\n"
]
}
],
"source": [
"# Import the required libraries\n",
"from wand.image import Image\n",
"from PIL import Image as PI\n",
"import pyocr\n",
"import pyocr.builders\n",
"import io, sys\n",
"\n",
"\n",
"# --- Configuration ---------------------------------------------------------\n",
"# NOTE(review): absolute, machine-specific path kept unchanged so the cell\n",
"# behaves as before; consider making it relative to a configurable data dir.\n",
"PDF_PATH = \"/home/will/research/ClinicalTrialsDataProcessing/Orangebook/Orangebooks/testprint.pdf\"\n",
"RENDER_DPI = 300        # resolution used when wand rasterizes the PDF pages\n",
"PREFERRED_LANG = \"eng\"  # OCR language to use when the tool offers it\n",
"\n",
"# Get the handle of the OCR library (in this case, tesseract)\n",
"tools = pyocr.get_available_tools()\n",
"if len(tools) == 0:\n",
"    # Raise a regular exception instead of calling sys.exit(): SystemExit is\n",
"    # awkward inside a notebook kernel, while an error cleanly stops the cell.\n",
"    raise RuntimeError(\"No OCR tool found!\")\n",
"tool = tools[0]\n",
"print(\"Will use tool '%s'\" % (tool.get_name()))\n",
"\n",
"# Get the language\n",
"langs = tool.get_available_languages()\n",
"print(\"Available languages: %s\" % \", \".join(langs))\n",
"if not langs:\n",
"    raise RuntimeError(\"The OCR tool reports no available languages!\")\n",
"# Ask for English explicitly rather than trusting the order of the list;\n",
"# fall back to the first reported language when English is not installed.\n",
"lang = PREFERRED_LANG if PREFERRED_LANG in langs else langs[0]\n",
"print(\"Will use language '%s'\" % (lang))\n",
"\n",
"# Setup two lists which will be used to hold our images and final_text\n",
"req_image = []   # one JPEG blob (bytes) per PDF page\n",
"final_text = []  # one recognized-text string per PDF page\n",
"\n",
"# Open the PDF file using wand and convert it to jpeg. The context managers\n",
"# release the underlying image resources once the blobs have been extracted.\n",
"with Image(filename=PDF_PATH, resolution=RENDER_DPI) as image_pdf:\n",
"    with image_pdf.convert('jpeg') as image_jpeg:\n",
"        # wand exposes every page of the PDF as an entry of `.sequence`;\n",
"        # turn each page into a standalone JPEG blob.\n",
"        for img in image_jpeg.sequence:\n",
"            with Image(image=img) as img_page:\n",
"                req_image.append(img_page.make_blob('jpeg'))\n",
"\n",
"# Now we just need to run OCR over the image blobs and store all of the\n",
"# recognized text in final_text.\n",
"for img in req_image:\n",
"    txt = tool.image_to_string(\n",
"        PI.open(io.BytesIO(img)),\n",
"        lang=lang,\n",
"        builder=pyocr.builders.TextBuilder()\n",
"    )\n",
"    final_text.append(txt)\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "f0d5f1d6-7e15-4ee6-b4ee-cbd41c5afb99",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"The final text is: \n",
"\n",
"40TH EDITION - 2020 - APPROVED DRUG PRODUCT LIST\n",
"\n",
"PRESCRIPTION DRUG PRODUCT LIST\n",
"\n",
"ABACAVIR SULFATE\n",
"SOLUTION; ORAL\n",
"ABACAVIR SULFATE\n",
"\n",
"EQ 2 5 /ML\n",
"\n",
"EQ 2 Ee /ML\n",
"\n",
"EQ 300MG BASE\n",
"EQ 300MG BASE\n",
"EQ 300MG BASE\n",
"\n",
"\n"
]
}
],
"source": [
"# Announce the result first, then show only the first 200 characters of the\n",
"# text recognized on the first page so the output stays short.\n",
"print(\"\\nThe final text is: \\n\")\n",
"first_page_preview = final_text[0][:200]\n",
"print(first_page_preview)"
]
},
{
"cell_type": "markdown",
"id": "1cac17e7-079d-4e32-bdbf-ae49194b2078",
"metadata": {},
"source": [
"It appears that this does not have the required precision. I'll need to do this some other way."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2283e290-fab3-4cda-8ce9-55a0b3533c98",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}