# ClinicalTrialsDataProcessing/Orangebook/CidParser.py

# Adobe PDF character ID (cid:\d+) parser.
# The purpose is to let someone build their own lookup table equivalent to the "/ToUnicode"
# mapping that should be provided in every PDF that uses CIDs (but is often mangled).

def get_digits(string):
    """
    Parse the number at the front of a cid tag body.

    Everything before the first ")" in `string` (or all of `string` when it
    contains no ")") is converted with int().

    Returns a tuple (value, width): the parsed integer and the number of
    characters that were converted.
    Raises ValueError if that text is not a valid integer literal.
    """
    digits, _, _ = string.partition(")")
    return int(digits), len(digits)

def token_generator(string):
    """
    A generator that yields tokens describing a string extracted from a pdf.
    Tokens take two forms:
        - Integers: these represent CID codes, written "(cid:<number>)" in the string.
        - Characters: single-character strings for everything else, such as the
          arbitrary characters often returned amidst cids.

    A "(cid:" that is not followed by a valid number (e.g. "(cid:)" or "(cid:xy)")
    is not a cid tag; its characters are yielded one at a time instead of
    raising ValueError.

    It is a python generator because that simplifies the ordering and allows us to avoid recursion.
    """
    prefix = "(cid:"
    prefix_len = len(prefix)
    str_len = len(string)
    start = 0

    while start < str_len:
        # The shortest possible tag, "(cid:N)", is 7 characters long, so a tag
        # is only considered when more than 6 characters remain.
        if str_len - start > 6 and string.startswith(prefix, start):
            number_start = start + prefix_len
            close = string.find(")", number_start)
            if close == -1:
                # No closing parenthesis: the rest of the string is treated as
                # the number, so an unterminated tag at the very end is accepted.
                close = str_len
            try:
                num = int(string[number_start:close])
            except ValueError:
                # Not a number after all; fall through and emit "(" as a character.
                pass
            else:
                start = close + 1
                yield num
                continue

        # Scanning by index avoids copying the remainder of the string on each step.
        yield string[start]
        start += 1


class UnknownSymbol:
    """
    Placeholder for a token that the parser's lookup table could not translate.

    The original token is kept in the `symbol` attribute.
    """
    def __init__(self, symbol):
        self.symbol = symbol

    def __repr__(self):
        """Show the untranslated token together with its type."""
        symbol = self.symbol
        return "UnknownSymbol: {} of type {}".format(symbol, type(symbol))

    def __str__(self):
        """Render as U+FFFD, the Unicode replacement character."""
        return "\uFFFD"

class Parser:
    """
    Translates tokens into characters or strings, handling unknown tokens as it goes.

    It requires a lookup table (e.g. a dictionary) during instantiation.
    This table is what is used to perform lookups.

    It exposes 3 methods
        - convert attempts to convert a single token
        - convert_list lazily converts an iterable of tokens into an iterable of text.
        - convert_list_of_strings tokenizes and converts each string in a list and prints
            - a string, if every converted item is a string.
            - a list, containing characters and UnknownSymbol objects, otherwise.
    """
    def __init__(self, lookup_table):
        self._lookup_table = lookup_table

    def convert(self, token):
        """
        Look up a single token.

        Returns the lookup table's entry for `token`, or an UnknownSymbol
        wrapping the token when the table has no entry for it.
        """
        try:
            return self._lookup_table[token]
        except (LookupError, TypeError):
            # LookupError: token missing from the table (KeyError / IndexError).
            # TypeError: token cannot be used as a key at all (e.g. unhashable).
            return UnknownSymbol(token)

    def convert_list(self, token_stream):
        """
        Lazily convert every token of an iterable, yielding the results in order.
        """
        for token in token_stream:
            yield self.convert(token)

    def convert_list_of_strings(self, list_of_strings):
        """
        Tokenize, convert and print each string of `list_of_strings`.

        Each string is printed as joined text when all of its converted items
        are strings; otherwise the list of converted items is printed so the
        unknown symbols remain visible.
        """
        for raw_string in list_of_strings:
            converted = list(self.convert_list(token_generator(raw_string)))
            try:
                print("".join(converted))
            except TypeError:
                # join() rejects non-string items such as UnknownSymbol.
                print(converted)

if __name__ == "__main__":
    # Placeholder entry point: the command-line interface (accepting a symbol
    # table and text to process) has not been written yet.
    print("Plan was to accept and proceess a symbol table and text. Apparently it has not been implemented.")