# %%
from typing import Any, Iterable, Iterator, Tuple, Union

# Marker that opens a "(cid:N)" glyph reference in the extracted text.
_CID_PREFIX = "(cid:"


def get_digits(string: str) -> Tuple[int, int]:
    """Parse the integer that precedes the first ")" in ``string``.

    Args:
        string: Text that starts right after a "(cid:" marker, e.g. "22)rest".

    Returns:
        ``(value, length)`` where ``value`` is the parsed integer and
        ``length`` is the number of characters that were consumed to build it
        (the closing ")" is not counted).

    Raises:
        ValueError: If the text before the first ")" is not a valid integer.
    """
    # partition stops at the first ")" instead of splitting the whole string.
    num = string.partition(")")[0]
    return int(num), len(num)


def token_generator(string: str) -> Iterator[Union[int, str]]:
    """Split ``string`` into tokens.

    A well-formed "(cid:N)" sequence is yielded as the integer ``N``; every
    other character is yielded on its own as a one-character string.
    A "(cid:" that is not followed by an integer and a closing ")" is not
    treated as a glyph reference: its characters are yielded literally.

    Args:
        string: The raw text to tokenize.

    Yields:
        ``int`` for each "(cid:N)" reference, ``str`` for each other character.
    """
    pos = 0
    end = len(string)
    prefix_len = len(_CID_PREFIX)

    while pos < end:
        if string.startswith(_CID_PREFIX, pos):
            close = string.find(")", pos + prefix_len)
            if close != -1:
                try:
                    # Hand over only this token's payload (plus its ")") so no
                    # copy of the remaining input is made per token.
                    num, length = get_digits(string[pos + prefix_len:close + 1])
                except ValueError:
                    # Not an integer payload: fall through and emit "(" as a
                    # plain character.
                    pass
                else:
                    yield num
                    # Skip the prefix, the digits, and the closing ")".
                    pos += prefix_len + length + 1
                    continue

        yield string[pos]
        pos += 1


class UnknownSymbol:
    """Placeholder returned for a token that has no entry in the lookup table.

    ``repr()`` shows the offending token and its type for debugging;
    ``str()`` renders as the Unicode replacement character (U+FFFD).
    """

    def __init__(self, symbol: Any) -> None:
        self.symbol = symbol

    def __repr__(self) -> str:
        return "UnknownSymbol: {} of type {}".format(self.symbol, type(self.symbol))

    def __str__(self) -> str:
        return "\uFFFD"


class Parser:
    """Translate tokens through a lookup table.

    Args:
        lookup_table: Any object supporting ``lookup_table[token]``; the
            notebook uses a dict keyed by cid integers and literal characters.
    """

    def __init__(self, lookup_table) -> None:
        self._lookup_table = lookup_table

    def convert(self, symbol: Any) -> Any:
        """Return the table entry for ``symbol``, or an ``UnknownSymbol``.

        Only lookup failures are converted (missing key/index, or a token the
        table cannot use as a key); anything else, such as KeyboardInterrupt,
        propagates.
        """
        try:
            return self._lookup_table[symbol]
        except (LookupError, TypeError):
            return UnknownSymbol(symbol)

    def convert_stream(self, token_stream: Iterable[Any]) -> Iterator[Any]:
        """Lazily convert every token of ``token_stream`` with ``convert``."""
        for token in token_stream:
            yield self.convert(token)

    def decode(self, string: str) -> str:
        """Tokenize ``string``, convert each token, and join the result.

        Each converted item goes through ``str()``, so unknown tokens show up
        as U+FFFD instead of making ``str.join`` raise ``TypeError``.
        """
        return "".join(
            str(item) for item in self.convert_stream(token_generator(string))
        )


# %%
# Integer keys are the N of "(cid:N)" tokens; string keys are literal characters.
ob2020 = Parser({
    # (cid:N) references
    3: " ",
    11: " ",
    12: ")",
    16: "-",
    19: "0",
    20: "1",
    21: "2",
    22: "3",
    23: "4",
    24: "5",
    # literal characters
    "7": "T",
    "+": "H",
    "(": "E",
    "’": "D",
    ",": "I",
    "2": "O",
    "1": "N",
    "$": "A",
    "3": "P",
    "5": "R",
    "9": "V",
    "8": "U",
    "*": "G",
    "&": "C",
    "/": "L",
    "6": "S",
    "R": "(",
    "I": "of",
})

# %%
# NOTE(review): `b` was never defined in the saved notebook (it only existed in
# kernel state). This value is reconstructed from the recorded output by
# inverting the table above, assuming spaces were (cid:3) - confirm against the
# original extracted text.
b = (
    "(cid:23)(cid:19)7+(cid:3)(’,7,21(cid:3)(cid:16)(cid:3)"
    "(cid:21)(cid:19)(cid:21)(cid:19)(cid:3)(cid:16)(cid:3)"
    "$33529(’(cid:3)’58*(cid:3)352’8&7(cid:3)/,67"
)

# %%
ob2020.decode(b)

# %%
c = "35(6&5,37,21(cid:3)’58*(cid:3)352’8&7(cid:3)/,67(cid:3)"

# %%
ob2020.decode(c)

# %%
d = "(cid:22)(cid:16)(cid:20)(cid:11)RI(cid:3)(cid:23)(cid:24)(cid:22)(cid:12)"

# %%
list(ob2020.convert_stream(token_generator(d)))
"python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.13" } }, "nbformat": 4, "nbformat_minor": 5 }