You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
146 lines
3.8 KiB
Plaintext
146 lines
3.8 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "51bf48a1-920a-4e64-ac5f-323ff3a27ebf",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Will use tool 'Tesseract (sh)'\n",
|
|
"Available languages: eng, osd\n",
|
|
"Will use language 'eng'\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"# Import the required libraries\n",
|
|
"from wand.image import Image\n",
|
|
"from PIL import Image as PI\n",
|
|
"import pyocr\n",
|
|
"import pyocr.builders\n",
|
|
"import io, sys\n",
|
|
"\n",
|
|
"\n",
|
|
"# Get the handle of the OCR library (in this case, tesseract)\n",
|
|
"tools = pyocr.get_available_tools()\n",
|
|
"if len(tools) == 0:\n",
|
|
"\tprint(\"No OCR tool found!\")\n",
|
|
"\tsys.exit(1)\n",
|
|
"tool = tools[0]\n",
|
|
"print(\"Will use tool '%s'\" % (tool.get_name()))\n",
|
|
"\n",
|
|
"# Get the language\n",
|
|
"langs = tool.get_available_languages()\n",
|
|
"print(\"Available languages: %s\" % \", \".join(langs)) \n",
|
|
"lang = langs[0] # For English\n",
|
|
"print(\"Will use language '%s'\" % (lang))\n",
|
|
"\n",
|
|
"# Setup two lists which will be used to hold our images and final_text\n",
|
|
"req_image = []\n",
|
|
"final_text = []\n",
|
|
"\n",
|
|
"# Open the PDF file using wand and convert it to jpeg\n",
|
|
"image_pdf = Image(filename=\"/home/will/research/ClinicalTrialsDataProcessing/Orangebook/Orangebooks/testprint.pdf\", resolution=300)\n",
|
|
"image_jpeg = image_pdf.convert('pdf')\n",
|
|
"\n",
|
|
"# wand has converted all the separate pages in the PDF into separate image\n",
|
|
"# blobs. We can loop over them and append them as a blob into the req_image\n",
|
|
"# list.\n",
|
|
"for img in image_jpeg.sequence:\n",
|
|
"\timg_page = Image(image=img)\n",
|
|
"\treq_image.append(img_page.make_blob('jpeg'))\n",
|
|
"\n",
|
|
"# Now we just need to run OCR over the image blobs and store all of the \n",
|
|
"# recognized text in final_text.\n",
|
|
"for img in req_image:\n",
|
|
"\ttxt = tool.image_to_string(\n",
|
|
"\t\tPI.open(io.BytesIO(img)),\n",
|
|
"\t\tlang=lang,\n",
|
|
"\t\tbuilder=pyocr.builders.TextBuilder()\n",
|
|
"\t)\n",
|
|
"\tfinal_text.append(txt)\n",
|
|
"\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"id": "f0d5f1d6-7e15-4ee6-b4ee-cbd41c5afb99",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"\n",
|
|
"The final text is: \n",
|
|
"\n",
|
|
"40TH EDITION - 2020 - APPROVED DRUG PRODUCT LIST\n",
|
|
"\n",
|
|
"PRESCRIPTION DRUG PRODUCT LIST\n",
|
|
"\n",
|
|
"ABACAVIR SULFATE\n",
|
|
"SOLUTION; ORAL\n",
|
|
"ABACAVIR SULFATE\n",
|
|
"\n",
|
|
"EQ 2 5 /ML\n",
|
|
"\n",
|
|
"EQ 2 Ee /ML\n",
|
|
"\n",
|
|
"EQ 300MG BASE\n",
|
|
"EQ 300MG BASE\n",
|
|
"EQ 300MG BASE\n",
|
|
"\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"print(\"\\nThe final text is: \\n\")\n",
|
|
"print(final_text[0][0:200])"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "1cac17e7-079d-4e32-bdbf-ae49194b2078",
|
|
"metadata": {},
|
|
"source": [
|
|
"It appears that this OCR approach does not have the required precision. I'll need to do this some other way."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "2283e290-fab3-4cda-8ce9-55a0b3533c98",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.13"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|