# Adobe PDF Character ID "(cid:<digits>)" parser.
#
# The purpose is to allow someone to create their own table equivalent to the
# "/ToUnicode" mapping that should be provided in every PDF using CIDs (but is
# often mangled).

_CID_PREFIX = "(cid:"


def get_digits(string):
    """
    Extract the leading digits from a cid tag.

    `string` is the text that follows "(cid:", e.g. "123)rest". Everything up
    to the first ")" (or the whole string when there is no ")") is parsed.

    Returns a tuple `(value, length)`: the parsed integer and the number of
    characters that were consumed to produce it.

    Raises ValueError if that text is not a valid integer.
    """
    digits = string.partition(")")[0]
    return int(digits), len(digits)


def token_generator(string):
    """
    An iterable that returns tokens describing a string in a pdf.

    Tokens take two forms:
    - Integers: these represent CID codes.
    - Characters: these represent the arbitrary characters often returned
      amidst cids.

    A "(cid:...)" tag whose content is not an integer is not treated as a tag;
    its characters are emitted one at a time like any other text.

    It is a python generator because that simplifies the ordering and allows
    us to avoid recursion.
    """
    start = 0
    str_len = len(string)
    prefix_len = len(_CID_PREFIX)
    while start < str_len:
        # The shortest possible tag, "(cid:N)", is 7 characters long.
        if str_len - start > 6 and string.startswith(_CID_PREFIX, start):
            close = string.find(")", start + prefix_len)
            tag_end = str_len if close == -1 else close
            try:
                num, length = get_digits(string[start + prefix_len:tag_end])
            except ValueError:
                # Malformed tag: fall through and emit it as plain characters.
                pass
            else:
                # Skip the prefix, the digits and the closing parenthesis.
                start += length + prefix_len + 1
                yield num
                continue
        # Index directly instead of slicing the remainder on every step.
        yield string[start]
        start += 1


class UnknownSymbol():
    """
    Represents a token that is not in the parser's dictionary.

    `str()` of an instance is the Unicode replacement character, while
    `repr()` shows the offending token and its type.
    """

    def __init__(self, symbol):
        self.symbol = symbol

    def __repr__(self):
        return "UnknownSymbol: {} of type {}".format(
            self.symbol, type(self.symbol))

    def __str__(self):
        return "\uFFFD"


class Parser:
    """
    Translates from tokens to character arrays or strings, handling errors
    as it goes.

    It requires a dictionary during instantiation. This dictionary is what is
    used to perform lookups.

    It exposes these methods:
    - convert attempts to convert a single token.
    - convert_stream (also available as convert_list) will try to convert an
      iterable of tokens into an iterable of text.
    - convert_list_of_strings will convert a list of strings containing cids
      and other symbols and print, for each one, either
        - a string, if every converted item is a string, or
        - a list containing characters and Unknown symbols.
    """

    def __init__(self, lookup_table):
        self._lookup_table = lookup_table

    def convert(self, token):
        """
        Return the lookup-table entry for `token`, or an UnknownSymbol
        wrapping the token when the table has no usable entry for it.
        """
        try:
            return self._lookup_table[token]
        except (LookupError, TypeError):
            # LookupError: token absent; TypeError: token unusable as a key.
            return UnknownSymbol(token)

    def convert_stream(self, token_stream):
        """Lazily yield the conversion of every token in `token_stream`."""
        for token in token_stream:
            yield self.convert(token)

    # Original name of convert_stream, kept so existing callers still work.
    convert_list = convert_stream

    def convert_list_of_strings(self, list_of_strings):
        """
        Tokenize and convert each string in `list_of_strings`, printing one
        line per input string.

        The joined text is printed when every converted item is a string;
        otherwise the list of converted items is printed instead.
        """
        for text in list_of_strings:
            arr = list(self.convert_stream(token_generator(text)))
            try:
                joined = "".join(arr)
            except TypeError:
                # At least one item (e.g. an UnknownSymbol) is not a string.
                print(arr)
            else:
                print(joined)


if __name__ == "__main__":
    print("Plan was to accept and proceess a symbol table and text. Apparently it has not been implemented.")