aggregator.py

#!/usr/bin/env python
# __author__ = "Ronie Martinez"
# __copyright__ = "Copyright 2016-2019, Ronie Martinez"
# __credits__ = ["Ronie Martinez"]
# __license__ = "MIT"
# __maintainer__ = "Ronie Martinez"
# __email__ = "ronmarti18@gmail.com"
from .commands import MATRICES
from .exceptions import EmptyGroupError, NumeratorNotFoundError, DenominatorNotFoundError
from .tokenizer import tokenize


def group(tokens, opening='{', closing='}'):
    # Collect tokens until the matching closing delimiter, recursing into
    # nested groups, then aggregate the collected tokens into a nested list.
    g = [] if opening == '{' else [r'\left' + opening]
    while True:
        token = next(tokens)
        if token == closing:
            if len(g):
                break
            else:
                raise EmptyGroupError
        elif token == opening:
            try:
                g.append(group(tokens))
            except EmptyGroupError:
                g += [opening, closing]
        else:
            g.append(token)
    if closing != '}':
        g.append(r'\right' + closing)
    return _aggregate(iter(g))
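
# Usage sketch (not part of the original module): group() assumes the opening
# delimiter has already been consumed, so iteration starts inside the group
# and stops at the matching closing token.
#
#     >>> group(iter(['a', '+', 'b', '}']))
#     ['a', '+', 'b']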


def process_row(tokens):
    # Split an already-flattened token list into rows: '&' tokens are dropped,
    # a row-break token closes the current row, and a single nested row is
    # unwrapped before returning.
    row = []
    content = []
    for token in tokens:
        if token == '&':
            pass
        elif token == '\\\\':
            if len(row):
                content.append(row)
                row = []
        else:
            row.append(token)
    if len(row):
        content.append(row)
    while len(content) == 1 and isinstance(content[0], list):
        content = content.pop()
    return content
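
# Usage sketch (not part of the original module): '&' tokens are dropped and
# the row-break token (two literal backslashes) closes the current row.
#
#     >>> process_row(['1', '&', '2', '\\\\', '3', '&', '4'])
#     [['1', '2'], ['3', '4']]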


def environment(begin, tokens):
    # Aggregate the body of an environment (\begin{...}...\end{...} or a
    # shorthand matrix command) into rows, capturing an optional alignment
    # specification along the way.
    if begin.startswith(r'\begin'):
        env = begin[7:-1]
    else:
        env = begin[1:]
    alignment = None
    content = []
    row = []
    while True:
        try:
            token = next_item_or_group(tokens)
            if isinstance(token, list):
                if env == 'array' and any(x in token for x in ['l', 'c', 'r', '|', ['l'], ['c'], ['r'], ['|']]):
                    alignment = token
                else:
                    row.append(process_row(token))
            elif token == r'\end{{{}}}'.format(env):
                break
            elif token == '&':
                pass
            elif token == '\\\\':
                content.append(row)
                row = []
            elif token == '[' and not len(content):
                try:
                    alignment = group(tokens, '[', ']')
                except EmptyGroupError:
                    pass
            elif token == '--':
                try:
                    next_token = next(tokens)
                    row.append([token, next_token])
                except StopIteration:
                    row.append(token)
            elif token in '_^':
                process_sub_sup(row, token, tokens)
            else:
                row.append(token)
        except EmptyGroupError:
            row += ['{', '}']
            continue
        except StopIteration:
            break
    if len(row):
        content.append(row)
    while len(content) == 1 and isinstance(content[0], list):
        content = content.pop()
    if alignment:
        # return r'\{}'.format(env), ''.join(alignment), content
        return content
    else:
        # TODO mark
        # return r'\{}'.format(env), content
        return content


def next_item_or_group(tokens):
    token = next(tokens)
    if token == '{':
        return group(tokens)
    return token


def _aggregate(tokens):
    # Walk the token stream and build a nested list structure: groups become
    # sublists, '\over' is rewritten as '\frac', '\sqrt[n]' becomes '\root',
    # and environments are expanded via environment().
    aggregated = []
    while True:
        try:
            token = next_item_or_group(tokens)
            if isinstance(token, list):
                aggregated.append(token)
            elif token == '[':
                try:
                    g = group(tokens, '[', ']')
                    if len(aggregated):
                        previous = aggregated[-1]
                        if previous == r'\sqrt':
                            root = next(tokens)
                            if root == '{':
                                try:
                                    root = group(tokens)
                                except EmptyGroupError:
                                    root = ''
                            aggregated[-1] = r'\root'
                            aggregated.append(root)
                        else:
                            pass  # FIXME: possible issues
                    aggregated.append(g)
                except EmptyGroupError:
                    aggregated += ['[', ']']
            elif token in '_^':
                process_sub_sup(aggregated, token, tokens)
            elif token.startswith(r'\begin') or token in MATRICES:
                aggregated += environment(token, tokens)
            elif token == r'\over':
                try:
                    numerator = aggregated.pop()
                    aggregated.append(r'\frac')
                    aggregated.append([numerator])
                    denominator = next_item_or_group(tokens)
                    aggregated.append([denominator])
                except IndexError:
                    raise NumeratorNotFoundError
                except (StopIteration, EmptyGroupError):
                    raise DenominatorNotFoundError
            else:
                aggregated.append(token)
        except EmptyGroupError:
            aggregated += ['{', '}']
            continue
        except StopIteration:
            break
    return aggregated


def aggregate(data):
    tokens = tokenize(data)
    return _aggregate(tokens)
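
# Usage sketch (not part of the original module); the expected results below
# assume the tokenizer splits the input into the usual single commands and
# characters, e.g. '\\frac', '{', '1', '}', ...
#
#     >>> aggregate(r'\frac{1}{2}')
#     ['\\frac', ['1'], ['2']]
#     >>> aggregate('x_{i}')
#     ['_', 'x', ['i']]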


def process_sub_sup(aggregated, token, tokens):
    # Attach a subscript/superscript to the previous item; when both '_' and
    # '^' apply to the same base they are merged into a single '_^' marker.
    try:
        previous = aggregated.pop()
        if isinstance(previous, str) and previous in '+-*/=[]_^{}':
            aggregated += [previous, token]
            return
        try:
            next_token = next_item_or_group(tokens)
            if len(aggregated) >= 2:
                if aggregated[-2] == '_' and token == '^':
                    aggregated[-2] = '_^'
                    aggregated += [previous, next_token]
                elif aggregated[-2] == '^' and token == '_':
                    aggregated[-2] = '_^'
                    aggregated += [next_token, previous]
                else:
                    aggregated += [token, previous, next_token]
            else:
                aggregated += [token, previous, next_token]
        except EmptyGroupError:
            aggregated += [previous, token, '{', '}']
        except StopIteration:
            return
    except IndexError:
        aggregated.append(token)
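
# Sketch of the '_^' merge (not part of the original module), again assuming
# the usual single-character tokens from tokenize():
#
#     >>> aggregate('a_b^c')
#     ['_^', 'a', 'b', 'c']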