I'm still alive ;)
Although I still haven't received any payments from Google (they are
doing their best to solve this), I've begun working on implementing
Knuth's layout algorithms.
I have studied the TeXbook and the existing mathtext parsing code a bit,
and I have decided to rewrite the parsing almost from scratch.
Although I don't know too much about parsing, I think it won't be that
much of a problem.
My idea is to first transform a TeX string to a Python list (tree-like
structure), which can be composed of strings, commands, and/or other
lists, and so on.
Then, I plan to write some classes to transform this list/tree to the
actual boxes needed for displaying.
The first part is done (although bugs are possible). Now I'm
concentrating on the remaining part.
The current module is attached. It doesn't need any third-party
libraries currently.
The following works:
Going from:
r"asdf { \horse{}\ \zztop{} \ Hello\^^a^{b_c}}"
to:
['asdf', ' ', [' ', '\\horse', [], '\\space', '\\zztop', [], ' ',
'\\space', 'Hello', '\\circumflex', '\\superscript', 'a',
'\\superscript', ['b', '\\subscript', 'c']]]
Please John, do comment (others with spare time are welcome too :).
#~ from matplotlib.pyparsing import Literal, Word, OneOrMore, ZeroOrMore, \
#~ Combine, Group, Optional, Forward, NotAny, alphas, nums, alphanums, \
#~ StringStart, StringEnd, ParseException, FollowedBy, Regex
# Character that introduces a command (a single backslash).
esc_char = '\\'

# Grouping delimiters
begin_group_char = '{'
end_group_char = '}'

# Decimal separator accepted inside a run of digits.
dec_delim = '.'

# Canonical separator placed between words (a single space).
word_delim = ' '

# NOTE(review): not referenced by any code visible in this module;
# presumably a placeholder for font/environment settings -- confirm.
enviroment = {'rm': 'rm'}

# Maximum number of nestings (groups within groups)
max_depth = 10
class TexParseError(Exception):
    """Raised when a TeX string cannot be normalized or parsed."""
# Helper functions, mainly used by the parser
def debug_tok(tok):
    """Print ``tok`` to standard output (debugging aid).

    Replace the body with ``pass`` to silence the output.
    """
    # Call syntax with a single argument behaves the same on Python 2
    # and Python 3, unlike the ``print tok`` statement.
    print(tok)
def is_command(tok):
    """Placeholder that is not implemented yet; always returns ``None``."""
    return None
def remove_comments(texstring):
    """Return ``texstring`` unchanged.

    Comment stripping is still to be written; for now this is an
    identity function.
    """
    # TO-DO
    return texstring
def group_split(texstring):
    """Split ``texstring`` around its first top-level group.

    Returns a 3-tuple ``(before, grouping, after)``:

    * ``before``   -- the text preceding the first ``begin_group_char``;
    * ``grouping`` -- the text between that delimiter and its matching
      ``end_group_char`` (delimiters excluded), or ``[]`` when the group
      is empty;
    * ``after``    -- the text following the matching ``end_group_char``.

    When the string contains no ``begin_group_char`` the result is
    ``(texstring, '', '')``.

    Raises ``TexParseError`` when the group is never closed, or when
    groups are nested inside it more than ``max_depth`` levels deep.
    """
    start = texstring.find(begin_group_char)
    if start == -1:
        return texstring, '', ''

    # ``depth`` is the number of currently open groups; the outermost
    # group (opened at ``start``) counts as 1.
    depth = 1
    end = None
    for pos in range(start + 1, len(texstring)):
        char = texstring[pos]
        if char == end_group_char:
            depth -= 1
            if depth == 0:
                end = pos
                break
        elif char == begin_group_char:
            depth += 1
            # Compare the actual nesting level, not the total number of
            # inner groups, so many shallow sibling groups are accepted.
            if depth - 1 > max_depth:
                message = "Maximum number of nestings reached. Too many groups"
                raise TexParseError(message)

    if end is None:
        message = "Group not closed properly"
        raise TexParseError(message)

    before = texstring[:start]
    # An empty group is reported as [] so callers can tell it apart from
    # "no group at all" (which yields '').
    grouping = texstring[start + 1:end] or []
    after = texstring[end + 1:]
    return before, grouping, after
def break_up_commands(texstring):
    """Turn a group-free string into a flat list of tokens.

    The string is cut at every ``esc_char``.  Text in front of the first
    ``esc_char`` is emitted as plain tokens; every later piece yields a
    command token (prefixed with ``esc_char``) followed by its trailing
    text tokens, with one ``word_delim`` directly after the command
    dropped.
    """
    tokens = []
    if not texstring:
        return tokens

    pieces = texstring.split(esc_char)

    # Leading piece: plain text, present only when the string does not
    # start with the escape character.
    leading = pieces[0]
    if leading:
        first_word, trailing = split_command(leading)
        tokens.append(first_word)
        tokens.extend(trailing)

    # Every remaining piece followed an escape character.
    for piece in pieces[1:]:
        name, trailing = split_command(piece)
        if name:
            tokens.append(esc_char + name)
        if trailing:
            # The delimiter right after a command only terminates the name.
            if trailing[0] == word_delim:
                trailing = trailing[1:]
            tokens.extend(trailing)
    return tokens
def split_command(texstring):
    """Split ``texstring`` into a command name and a list of text tokens.

    The command is the leading word as returned by ``get_first_word``;
    when the string does not start with a word, the command is its first
    character.  The remainder is tokenized into words (runs of letters,
    or runs of digits and decimal delimiters) and single characters for
    everything else.

    Returns ``(command, puretext)``; ``("", [])`` for an empty string.
    """
    if not texstring:
        return "", []

    command, rest = get_first_word(texstring)
    if not command:
        command, rest = texstring[0], texstring[1:]

    puretext = []
    while rest:
        word, rest = get_first_word(rest)
        if word:
            puretext.append(word)
        else:
            # Not the start of a word: emit exactly one character.  Only
            # doing this when no word was found keeps adjacent words of
            # different kinds (e.g. 'ab12') intact.
            puretext.append(rest[0])
            rest = rest[1:]
    return command, puretext
def get_first_word(texstring):
    """Split off the word at the start of ``texstring``.

    A word is either a run of alphabetic characters, or a run of digits
    and ``dec_delim`` characters that starts with a digit.

    Returns ``(word, rest)``; ``word`` is ``""`` when the string is empty
    or starts with any other character.
    """
    if not texstring:
        return "", ""

    first = texstring[0]
    if first.isalpha():
        def in_word(char):
            return char.isalpha()
    elif first.isdigit():
        def in_word(char):
            return char.isdigit() or char == dec_delim
    else:
        return "", texstring

    # Advance to the first character that no longer belongs to the word.
    end = 0
    total = len(texstring)
    while end < total and in_word(texstring[end]):
        end += 1
    return texstring[:end], texstring[end:]
def parse_mathtex(texstring):
    """Parse a normalized TeX string into a nested list.

    Used recursively: the text before the first group is broken up into
    command/text tokens, the group itself becomes a nested list, and the
    text after the group is parsed the same way and appended.

    Returns an empty list for empty input.  The returned list can then
    be evaluated by a TeX evaluator.
    """
    result = []
    if not texstring:
        return result
    # Checking for groupings: begin_group_char...end_group_char
    before, grouping, after = group_split(texstring)
    # Debug trace of the three parts; written as a single formatted
    # string so it runs on both Python 2 and Python 3.
    print("%s \n%s \n%s" % (before, grouping, after))
    if before:
        result.extend(break_up_commands(before))
    # An empty group comes back as [] and must still produce a nested
    # (empty) list in the result.
    if grouping or grouping == []:
        result.append(parse_mathtex(grouping))
    if after:
        result.extend(parse_mathtex(after))
    return result
def normalize_tex(texstring):
    """Normalize the whole TeX expression (that is: prepare it for
    parsing).

    Escaped special characters and the ``_`` / ``^`` shorthands are
    rewritten as named commands, and every run of whitespace is
    collapsed to a single ``word_delim``.

    Raises ``TexParseError`` when an unescaped ``_`` or ``^`` is directly
    followed by a space.
    """
    texstring = remove_comments(texstring)

    # Replacing the escaped escape character and the escaped grouping
    # characters with named commands.  The escape character must be
    # handled first.  Each name is followed by word_delim so that letters
    # coming right after it are not absorbed into the command name
    # (r'\{a' must not become r'\lbracea').
    escaped_specials = (
        (esc_char, 'backslash'),
        (begin_group_char, 'lbrace'),
        (end_group_char, 'rbrace'),
    )
    for char, name in escaped_specials:
        texstring = texstring.replace(esc_char + char,
                                      esc_char + name + word_delim)

    # Now we should have a clean expression, so we check if all the grouping
    # are OK (every begin_group_char should have a matching end_group_char)
    # TO-DO

    # Replacing all space-like characters with a single space word_delim
    texstring = word_delim.join(texstring.split())
    # Escaped space; an unescaped space in TeX is not important
    texstring = texstring.replace(esc_char + word_delim,
                                  esc_char + 'space' + word_delim)

    # Dealing with "syntactic sugar" goes here (i.e. '_', '^' etc.)
    texstring = texstring.replace(esc_char + '_',
                                  esc_char + 'underscore' + word_delim)
    if '_' + word_delim in texstring:
        raise TexParseError('Subscripting with space not allowed')
    texstring = texstring.replace('_', esc_char + 'subscript' + word_delim)

    texstring = texstring.replace(esc_char + '^',
                                  esc_char + 'circumflex' + word_delim)
    if '^' + word_delim in texstring:
        raise TexParseError('Superscripting with space not allowed')
    texstring = texstring.replace('^', esc_char + 'superscript' + word_delim)

    # Removing unnecessary white space
    texstring = word_delim.join(texstring.split())
    return texstring
if __name__ == '__main__':
    # Small manual driver: print the raw string, its normalized form and
    # the parsed list.  The commented-out lines are alternative inputs.
    #texstring = r"\\{ \horse\ Hello\^ ^ a^b_c}"
    texstring = r" asdf { \horse{}\ \zztop{} \ Hello\^^a^{b_c}}"
    #texstring = r"{}{} { }"
    #texstring = r"{{{_ }}}"
    #texstring = r"\horse{}"
    #texstring = r"\horse;,.?)_)(*(*^*%&$$%{} Haha! Kako je frajeru?"
    #texstring = r"a_2\trav 32"
    # Single-argument print calls behave the same on Python 2 and 3.
    print(texstring)
    texstring = normalize_tex(texstring)
    print(texstring)
    _parsed = parse_mathtex(texstring)
    print(_parsed)
-------------------------------------------------------------------------
Take Surveys. Earn Cash. Influence the Future of IT
Join SourceForge.net's Techsay panel and you'll get the chance to share your
opinions on IT & business topics through brief surveys -- and earn cash
http://www.techsay.com/default.php?page=join.php&p=sourceforge&CID=DEVDEV
_______________________________________________
Matplotlib-devel mailing list
Matplotlib-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/matplotlib-devel