You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
ClinicalTrialsDataProcessing/Orangebook/Untitled2.ipynb

217 lines
5.1 KiB
Plaintext

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"cells": [
{
"cell_type": "code",
"execution_count": 196,
"id": "2f61df31-f3c1-4b2e-ae96-96bf06089b17",
"metadata": {},
"outputs": [],
"source": [
"def get_digits(string):\n",
"    \"\"\"Parse the integer that precedes the first ``)`` in ``string``.\n",
"\n",
"    Returns a ``(value, width)`` tuple: the parsed integer and the number of\n",
"    characters it occupied. When ``string`` contains no ``)``, the whole\n",
"    string is parsed. ``int`` raises ``ValueError`` if that text is not an\n",
"    integer.\n",
"    \"\"\"\n",
"    digits, _, _ = string.partition(\")\")\n",
"    return int(digits), len(digits)\n",
"\n",
"def token_generator(string):\n",
"    \"\"\"Yield the tokens of ``string`` one at a time, left to right.\n",
"\n",
"    A well-formed ``(cid:N)`` sequence is yielded as the integer ``N``;\n",
"    every other character is yielded as a one-character string.\n",
"\n",
"    A ``(cid:`` prefix whose payload is not an integer (for example\n",
"    ``(cid:x)``) is not treated as a cid token: its characters are yielded\n",
"    one by one instead of raising ``ValueError`` part-way through the stream.\n",
"    \"\"\"\n",
"    prefix = \"(cid:\"\n",
"    str_len = len(string)\n",
"    start = 0\n",
"\n",
"    while start < str_len:\n",
"        # Test the prefix in place: re-slicing the remaining text on every\n",
"        # iteration would make the scan quadratic in the input length.\n",
"        # A cid token needs at least 7 characters, e.g. (cid:3).\n",
"        if str_len - start > 6 and string.startswith(prefix, start):\n",
"            payload_start = start + len(prefix)\n",
"            close = string.find(\")\", payload_start)\n",
"            # Without a closing parenthesis the payload runs to the end.\n",
"            payload_end = str_len if close == -1 else close\n",
"            try:\n",
"                num = int(string[payload_start:payload_end])\n",
"            except ValueError:\n",
"                num = None\n",
"            if num is not None:\n",
"                start = payload_end + 1  # step over the closing parenthesis too\n",
"                yield num\n",
"                continue\n",
"\n",
"        yield string[start]\n",
"        start += 1\n",
"\n",
"class UnknownSymbol:\n",
"    \"\"\"Placeholder for a symbol that could not be converted.\n",
"\n",
"    ``repr()`` shows the raw symbol and its type; ``str()`` gives the\n",
"    Unicode replacement character (U+FFFD).\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, symbol):\n",
"        self.symbol = symbol\n",
"\n",
"    def __str__(self):\n",
"        return \"\\uFFFD\"\n",
"\n",
"    def __repr__(self):\n",
"        template = \"UnknownSymbol: {} of type {}\"\n",
"        symbol = self.symbol\n",
"        return template.format(symbol, type(symbol))\n",
"\n",
"class Parser:\n",
"    \"\"\"Convert tokens through a lookup table.\n",
"\n",
"    ``lookup_table`` is indexed with each token (``lookup_table[token]``);\n",
"    tokens it cannot resolve are returned as ``UnknownSymbol`` instances.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, lookup_table):\n",
"        self._lookup_table = lookup_table\n",
"\n",
"    def convert(self, symbol):\n",
"        \"\"\"Return the table entry for ``symbol``, or ``UnknownSymbol(symbol)``.\"\"\"\n",
"        try:\n",
"            return self._lookup_table[symbol]\n",
"        except (LookupError, TypeError):\n",
"            # LookupError: the symbol is absent from the table.\n",
"            # TypeError: the symbol cannot be used as a key (e.g. unhashable).\n",
"            # Anything else (KeyboardInterrupt, bugs in a custom table)\n",
"            # propagates rather than being masked as an unknown symbol.\n",
"            return UnknownSymbol(symbol)\n",
"\n",
"    def convert_stream(self, token_stream):\n",
"        \"\"\"Lazily yield the converted form of each token in ``token_stream``.\"\"\"\n",
"        for token in token_stream:\n",
"            yield self.convert(token)"
]
},
{
"cell_type": "code",
"execution_count": 213,
"id": "e2c1e39b-0ac5-4ad7-9176-ef8ea69feeec",
"metadata": {},
"outputs": [],
"source": [
"# Integer keys are the N of (cid:N) tokens; string keys are glyph characters.\n",
"ob2020 = Parser({\n",
"    # cid codes\n",
"    3: \" \",\n",
"    11: \" \",\n",
"    12: \")\",\n",
"    16: \"-\",\n",
"    19: \"0\",\n",
"    20: \"1\",\n",
"    21: \"2\",\n",
"    22: \"3\",\n",
"    23: \"4\",\n",
"    24: \"5\",\n",
"    # glyph characters\n",
"    # NOTE(review): the empty-string key below can never equal a one-character\n",
"    # token - confirm which character is meant to map to D.\n",
"    \"\": \"D\",\n",
"    \"$\": \"A\",\n",
"    \"&\": \"C\",\n",
"    \"(\": \"E\",\n",
"    \"*\": \"G\",\n",
"    \"+\": \"H\",\n",
"    \",\": \"I\",\n",
"    \"/\": \"L\",\n",
"    \"1\": \"N\",\n",
"    \"2\": \"O\",\n",
"    \"3\": \"P\",\n",
"    \"5\": \"R\",\n",
"    \"6\": \"S\",\n",
"    \"7\": \"T\",\n",
"    \"8\": \"U\",\n",
"    \"9\": \"V\",\n",
"    \"I\": \"of\",\n",
"    \"R\": \"(\",\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": 214,
"id": "c02896ab-fc75-44cc-bb27-a5dcf1b6d7f0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'40TH EDITION - 2020 - APPROVED DRUG PRODUCT LIST'"
]
},
"execution_count": 214,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# NOTE(review): `b` is not assigned in any visible cell - define it before running this cell.\n",
"# str() lets unmapped tokens (UnknownSymbol) render as U+FFFD instead of making join() raise TypeError.\n",
"\"\".join(str(x) for x in ob2020.convert_stream(token_generator(b)))"
]
},
{
"cell_type": "code",
"execution_count": 215,
"id": "1794e826-fa1f-4aba-8eab-6d603a06dfe0",
"metadata": {},
"outputs": [],
"source": [
"# Encoded sample text: glyph characters mixed with (cid:N) sequences; passed to token_generator in the next cell.\n",
"c = \"35(6&5,37,21(cid:3)58*(cid:3)3528&7(cid:3)/,67(cid:3)\""
]
},
{
"cell_type": "code",
"execution_count": 216,
"id": "c8b67d79-81ad-4b3a-be8b-bace9a8d943a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'PRESCRIPTION DRUG PRODUCT LIST '"
]
},
"execution_count": 216,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# str() lets unmapped tokens (UnknownSymbol) render as U+FFFD instead of making join() raise TypeError.\n",
"\"\".join(str(x) for x in ob2020.convert_stream(token_generator(c)))"
]
},
{
"cell_type": "code",
"execution_count": 217,
"id": "f76d9760-fe69-4743-ab47-41cf74866d70",
"metadata": {},
"outputs": [],
"source": [
"# Encoded sample made mostly of (cid:N) sequences; passed to token_generator in the next cell.\n",
"d = \"(cid:22)(cid:16)(cid:20)(cid:11)RI(cid:3)(cid:23)(cid:24)(cid:22)(cid:12)\""
]
},
{
"cell_type": "code",
"execution_count": 218,
"id": "ee925997-0701-4d8e-b713-6f39c6a50a5b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['3', '-', '1', ' ', '(', 'of', ' ', '4', '5', '3', ')']"
]
},
"execution_count": 218,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Materialise the converted tokens so each one can be inspected individually.\n",
"list(ob2020.convert_stream(token_generator(d)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ae152c75-5fd6-4756-a473-fcea2de5ee30",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}