[matplotlib-devel] mathtext SoC update

Edin Salković Wed, 02 Aug 2006 07:08:22 -0700

I'm still alive ;)

Although I still haven't received any payments from Google (they are
doing their best to solve this), I've begun working on implementing
Knuth's layout algorithms.


I have studied the TeXbook a bit, as well as the existing mathtext parsing
code, and I have decided to rewrite the parsing almost from scratch.
Although I don't know too much about parsing, I think it won't be that
much of a problem.

My idea is to first transform a TeX string to a Python list (tree-like
structure), which can be composed of strings, commands, and/or other
lists, and so on.

Then, I plan to write some classes to transform this list/tree into the
actual boxes needed for displaying.

The first part is done (although bugs are possible). Now I'm
concentrating on the remaining part.

The current module is attached. It doesn't need any third-party
libraries currently.
The following works:
Going from:
r"asdf { \horse{}\ \zztop{} \ Hello\^^a^{b_c}}"

to:
['asdf', ' ', [' ', '\\horse', [], '\\space', '\\zztop', [], ' ',
'\\space', 'Hello', '\\circumflex', '\\superscript', 'a',
'\\superscript', ['b', '\\subscript', 'c']]]


Please John, do comment (others with spare time are welcome too :).

#~ from matplotlib.pyparsing import Literal, Word, OneOrMore, ZeroOrMore, \
     #~ Combine, Group, Optional, Forward, NotAny, alphas, nums, alphanums, \
     #~ StringStart, StringEnd, ParseException, FollowedBy, Regex

# Character that introduces a TeX command or an escaped character
esc_char = '\\'
# Grouping delimiters: the characters that open and close a group
begin_group_char, end_group_char = '{', '}'
# Decimal separator accepted inside a run of digits
dec_delim = '.'
# The single separator every run of whitespace is normalized to
word_delim = ' '

# NOTE(review): not referenced by any code visible in this module -- confirm
# its intended use before relying on it.
enviroment = dict(rm='rm')

# Maximum number of nestings (groups within groups)
max_depth = 10

class TexParseError(Exception):
    """Error raised when a TeX string is malformed and cannot be parsed."""


# Helper functions, mainly used by the parser
def debug_tok(tok):
    """Print *tok* to standard output (debugging aid).

    The parenthesized single-argument form behaves identically as a
    Python 2 print statement and as a Python 3 function call.
    """
    print(tok)

def is_command(tok):
    """Placeholder, not implemented yet: ignores *tok* and returns None."""
    return None

def remove_comments(texstring):
    """Return *texstring* unchanged.

    Comment stripping is still TO-DO; for now this is a pass-through.
    """
    unchanged = texstring
    return unchanged

def group_split(texstring):
    """Split *texstring* around its first outermost group.

    Returns a 3-tuple ``(before, grouping, after)``:

    * ``before``   -- the text preceding the first ``begin_group_char``;
    * ``grouping`` -- the text between that delimiter and its matching
      ``end_group_char`` (delimiters excluded), or ``[]`` when the group
      is empty;
    * ``after``    -- everything following the matching ``end_group_char``.

    When the string contains no ``begin_group_char`` the result is
    ``(texstring, '', '')``; note that ``grouping`` is then ``''`` and not
    ``[]``, which lets callers tell "no group" from "empty group".

    Raises TexParseError if the group is never closed, or if groups are
    nested more than ``max_depth`` levels deep inside the first group.
    """
    if texstring == begin_group_char + end_group_char:
        return '', [], ''
    pos_begin = texstring.find(begin_group_char)
    if pos_begin == -1:
        return texstring, '', ''

    length = len(texstring)
    i = pos_begin
    # Number of groups currently open; 1 is the outermost group itself.
    depth = 1
    while depth != 0:
        i += 1
        if i == length:
            raise TexParseError("Group not closed properly")
        char = texstring[i]
        if char == end_group_char:
            depth -= 1
        elif char == begin_group_char:
            depth += 1
            # Limit the nesting *depth* (groups within groups), not the
            # total number of groups seen: sibling groups do not nest.
            if depth - 1 > max_depth:
                message = "Maximum number of nestings reached. Too many groups"
                raise TexParseError(message)

    before = texstring[:pos_begin]
    if pos_begin + 1 == i:
        # Opening delimiter immediately followed by the closing one.
        grouping = []
    else:
        grouping = texstring[pos_begin + 1:i]
    after = texstring[i + 1:]
    return before, grouping, after

def break_up_commands(texstring):
    """Tokenize a group-free string into commands and pure text.

    The string is cut at every ``esc_char``.  Each piece after the first
    starts with a command, which is emitted with ``esc_char`` prepended;
    a single ``word_delim`` directly following a command is dropped.
    A non-empty first piece is plain text and is emitted as-is.

    Returns a flat list of string tokens (empty for an empty input).
    """
    tokens = []
    if not texstring:
        return tokens

    pieces = texstring.split(esc_char)
    leading_text = bool(pieces[0])
    for position, piece in enumerate(pieces):
        head, tail = split_command(piece)

        if position == 0 and leading_text:
            # The string does not start with a command: keep everything.
            tokens.append(head)
            tokens.extend(tail)
        else:
            if head:
                tokens.append(esc_char + head)
            # The delimiter right after a command only terminates it.
            if tail and tail[0] == word_delim:
                tail = tail[1:]
            tokens.extend(tail)
    return tokens

def split_command(texstring):
    """Split *texstring* into ``(command, puretext)``.

    ``command`` is the leading word as found by get_first_word(), or the
    first character when the string does not start with a word.
    ``puretext`` is a list holding the rest of the string broken into
    words and single non-word characters, in order.

    An empty input yields ``("", [])``.
    """
    if not texstring:
        return "", []

    command, remainder = get_first_word(texstring)
    if not command:
        # No leading word: the command is the single leading character.
        command, remainder = texstring[0], texstring[1:]

    pieces = []
    while remainder:
        word, remainder = get_first_word(remainder)
        if word:
            pieces.append(word)
        if remainder:
            # The character that stopped the word is a token of its own.
            pieces.append(remainder[0])
            remainder = remainder[1:]
    return command, pieces

def get_first_word(texstring):
    """Split off the leading word of *texstring*.

    A word is either a run of alphabetic characters, or -- when the string
    starts with a digit -- a run of digits and ``dec_delim`` characters.

    Returns ``(word, rest)``.  ``word`` is ``""`` when the string starts
    with neither a letter nor a digit, in which case ``rest`` is the whole
    string.  An empty input yields ``("", "")``.
    """
    if not texstring:
        return "", ""

    def _in_text(char):
        return char.isalpha()

    def _in_number(char):
        return char.isdigit() or char == dec_delim

    first = texstring[0]
    if first.isalpha():
        belongs = _in_text
    elif first.isdigit():
        belongs = _in_number
    else:
        return "", texstring

    # The first character is already known to belong to the word.
    end = 1
    size = len(texstring)
    while end < size and belongs(texstring[end]):
        end += 1
    return texstring[:end], texstring[end:]

def parse_mathtex(texstring):
    """Parse a normalized TeX string into a nested list of tokens.

    The string is split around its first group with group_split(): the
    text before the group is tokenized with break_up_commands(), the
    group's content is parsed recursively and appended as a nested list,
    and the text after the group is parsed recursively and appended flat.

    Returns a list of strings and (possibly empty) sub-lists; an empty
    input yields ``[]``.  The returned list can then be evaluated by a
    TeX evaluator.

    Raises TexParseError (from group_split) on malformed groups.
    """
    result = []
    if not texstring:
        return result
    # Checking for groupings: begin_group_char...end_group_char
    before, grouping, after = group_split(texstring)

    if before:
        result.extend(break_up_commands(before))
    # group_split returns '' when there is no group at all and [] for an
    # empty group; an empty group must still produce a nested [].
    if grouping or grouping == []:
        result.append(parse_mathtex(grouping))
    if after:
        result.extend(parse_mathtex(after))

    return result

def normalize_tex(texstring):
    """Normalize a whole TeX expression, i.e. prepare it for parsing.

    Steps, in order:

    1. comments are removed with remove_comments();
    2. the escaped escape character and the escaped grouping characters
       become the named commands ``backslash``, ``lbrace`` and ``rbrace``;
    3. every run of whitespace collapses to one ``word_delim`` and an
       escaped space becomes the ``space`` command;
    4. escaped ``_`` / ``^`` become ``underscore`` / ``circumflex`` and
       bare ``_`` / ``^`` become ``subscript`` / ``superscript``;
    5. whitespace is collapsed once more.

    Every inserted command name is followed by ``word_delim`` so that it
    cannot fuse with letters that come right after it.

    Returns the normalized string.

    Raises TexParseError when a bare ``_`` or ``^`` is directly followed
    by a space.
    """
    texstring = remove_comments(texstring)

    # Replacing the escaped escape character first, so that its second
    # half is not mistaken for the start of another escape below, and then
    # the escaped scope/grouping characters.
    replacements = (
        (esc_char, 'backslash'),
        (begin_group_char, 'lbrace'),
        (end_group_char, 'rbrace'),
    )
    for escaped, name in replacements:
        texstring = texstring.replace(esc_char + escaped,
                                      esc_char + name + word_delim)

    # Now we should have a clean expression, so we check if all the grouping
    # are OK (every begin_group_char should have a matching end_group_char)
    # TO-DO

    # Unescaped space in TeX is not important: replace all space-like
    # characters with a single word_delim, then turn the escaped space
    # into a command.
    texstring = word_delim.join(texstring.split())
    texstring = texstring.replace(esc_char + word_delim,
                                  esc_char + 'space' + word_delim)

    # Dealing with "syntactic sugar" goes here (i.e. '_', '^' etc.)
    texstring = texstring.replace(esc_char + '_',
                                  esc_char + 'underscore' + word_delim)
    if '_' + word_delim in texstring:
        raise TexParseError('Subscripting with space not allowed')
    texstring = texstring.replace('_', esc_char + 'subscript' + word_delim)

    texstring = texstring.replace(esc_char + '^',
                                  esc_char + 'circumflex' + word_delim)
    if '^' + word_delim in texstring:
        raise TexParseError('Superscripting with space not allowed')
    texstring = texstring.replace('^', esc_char + 'superscript' + word_delim)

    # Removing unnecessary white space
    texstring = word_delim.join(texstring.split())

    return texstring

if __name__ == '__main__':
    # Small manual demo: show the raw input, its normalized form and the
    # parsed token tree.  Alternative sample inputs are kept commented out.
    #texstring = r"\\{ \horse\   Hello\^ ^ a^b_c}"
    texstring = r"  asdf { \horse{}\ \zztop{} \ Hello\^^a^{b_c}}"
    #texstring = r"{}{} { }"
    #texstring = r"{{{_ }}}"
    #texstring = r"\horse{}"
    #texstring = r"\horse;,.?)_)(*(*^*%&$$%{} Haha! Kako je frajeru?"
    #texstring = r"a_2\trav 32"
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(texstring)
    texstring = normalize_tex(texstring)
    print(texstring)
    _parsed = parse_mathtex(texstring)
    print(_parsed)

-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys -- and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV

_______________________________________________
Matplotlib-devel mailing list
Matplotlib-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/matplotlib-devel

[matplotlib-devel] mathtext SoC update

Reply via email to