You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
93 lines
3.0 KiB
Python
93 lines
3.0 KiB
Python
# Adobe PDF Character ID (cid:\d+) parser
# The purpose is to allow someone to create their own table equivalent to the "/ToUnicode" map that
# should be provided in every PDF using CIDs (but is often mangled).
|
|
|
|
def get_digits(string):
    """
    Parse the number at the start of a cid tag body.

    Everything before the first ")" in ``string`` (or the whole string when
    it contains no ")") is converted with ``int``.

    Returns a ``(value, width)`` tuple: the parsed integer and the number of
    characters that were read to produce it.  ``int`` raises ``ValueError``
    when that text is not a valid integer literal.
    """
    digits, _, _ = string.partition(")")
    return int(digits), len(digits)
|
|
|
|
def token_generator(string):
    """
    Yield the tokens that make up a string extracted from a PDF.

    Tokens take two forms:
    - Integers: these represent CID codes, parsed from "(cid:<number>)" tags.
    - Characters: one-character strings for everything that is not part of
      a well-formed cid tag.

    A "(cid:" prefix whose payload cannot be parsed as an integer is not an
    error: it is emitted character by character like any other text.  A tag
    that runs to the end of the string without a closing ")" is still
    accepted when its payload is an integer.

    It is a generator because that keeps the tokens in order and avoids
    recursion.  The input is scanned by index, so no copies of the remaining
    text are made while iterating.
    """
    prefix = "(cid:"
    # A tag is only considered when more than this many characters remain;
    # the shortest complete tag, "(cid:0)", is 7 characters long.
    min_remaining = 6

    pos = 0
    str_len = len(string)

    while pos < str_len:
        if str_len - pos > min_remaining and string.startswith(prefix, pos):
            digits_start = pos + len(prefix)
            close = string.find(")", digits_start)
            if close == -1:
                # No closing parenthesis: the payload is the rest of the string.
                close = str_len
            try:
                cid = int(string[digits_start:close])
            except ValueError:
                # Malformed tag: fall through and emit "(" as a plain character.
                cid = None
            if cid is not None:
                yield cid
                pos = close + 1  # resume just past the ")"
                continue

        yield string[pos]
        pos += 1
|
|
|
|
|
|
class UnknownSymbol:
    """
    Placeholder for a token that the parser's dictionary does not contain.

    The original token is kept on ``symbol`` so it can be inspected later.
    """

    def __init__(self, symbol):
        self.symbol = symbol

    def __str__(self):
        # U+FFFD REPLACEMENT CHARACTER stands in for the unknown token.
        return "\uFFFD"

    def __repr__(self):
        wrapped = self.symbol
        return "UnknownSymbol: {} of type {}".format(wrapped, type(wrapped))
|
|
|
|
class Parser:
    """
    Translate tokens into characters or strings, handling unknown tokens as it goes.

    A lookup table is required at instantiation; every conversion is a
    subscript lookup (``lookup_table[token]``) into it.

    It exposes these methods:
    - convert: convert a single token.
    - convert_stream (also available under its earlier name, convert_list):
      lazily convert an iterable of tokens into an iterable of results.
    - convert_list_of_strings: tokenize each string (cids mixed with other
      symbols), convert it, and print
        - a string, if every converted item is a string;
        - a list containing characters and UnknownSymbol objects otherwise.
    """

    def __init__(self, lookup_table):
        self._lookup_table = lookup_table

    def convert(self, token):
        """Return the table entry for ``token``, or ``UnknownSymbol(token)`` when the lookup fails."""
        try:
            return self._lookup_table[token]
        except (LookupError, TypeError):
            # LookupError covers a missing key or index; TypeError covers a
            # token the table cannot be subscripted with (e.g. unhashable).
            return UnknownSymbol(token)

    def convert_stream(self, token_stream):
        """Yield ``convert(token)`` for each token in ``token_stream``, in order."""
        for token in token_stream:
            yield self.convert(token)

    # The method was first defined as convert_list; keep that name working.
    convert_list = convert_stream

    def convert_list_of_strings(self, list_of_strings):
        """
        Tokenize, convert and print every string in ``list_of_strings``.

        Each string is printed joined into one string when possible; if any
        converted item is not a string (such as an UnknownSymbol), the list
        of converted items is printed instead.
        """
        for text in list_of_strings:
            converted = list(self.convert_stream(token_generator(text)))
            try:
                print("".join(converted))
            except TypeError:
                # str.join rejects non-string items; show the raw list instead.
                print(converted)
|
|
|
|
if __name__ == "__main__":
    # Placeholder entry point: running the module as a script only reports
    # that command-line processing does not exist yet.
    print("Plan was to accept and proceess a symbol table and text. Apparently it has not been implemented.")