José, or any other Python expert — I need your help. The attached patch is the lyx2lyx part of the "getting rid of InsetLaTeXAccent" patch. convert_accent works well, but revert_accent does not. I put the error messages in the file. Can anybody tell me why
document.body[i] = unicodedata.normalize("NFKD", document.body[i]) does not work? If you want to try this, use the attached test file (don't save it with LyX if you want to test convert_accent, it is hand crafted). Georg
latexaccent-all.lyx
Description: application/lyx
Index: lib/lyx2lyx/LyX.py =================================================================== --- lib/lyx2lyx/LyX.py (Revision 16629) +++ lib/lyx2lyx/LyX.py (Arbeitskopie) @@ -73,7 +73,7 @@ format_relation = [("0_06", [200], ge ("1_2", [220], generate_minor_versions("1.2" , 4)), ("1_3", [221], generate_minor_versions("1.3" , 7)), ("1_4", range(222,246), generate_minor_versions("1.4" , 3)), - ("1_5", range(246,257), generate_minor_versions("1.5" , 0))] + ("1_5", range(246,258), generate_minor_versions("1.5" , 0))] def formats_list(): Index: lib/lyx2lyx/lyx_1_5.py =================================================================== --- lib/lyx2lyx/lyx_1_5.py (Revision 16629) +++ lib/lyx2lyx/lyx_1_5.py (Arbeitskopie) @@ -20,7 +20,9 @@ """ Convert files to the file format generated by lyx 1.5""" import re -from parser_tools import find_token, find_token_exact, find_tokens, find_end_of, get_value +import unicodedata + +from parser_tools import find_re, find_token, find_token_exact, find_tokens, find_end_of, get_value from LyX import get_encoding @@ -719,6 +721,167 @@ def revert_encodings(document): document.inputencoding = get_value(document.header, "\\inputencoding", 0) +accent_map = { + "`" : 0x0309, # grave + "'" : 0x0301, # acute + "^" : 0x0302, # circumflex + "~" : 0x0303, # tilde + "=" : 0x0304, # macron + "u" : 0x0306, # breve + "." 
: 0x0307, # dot above + "\"": 0x0308, # diaresis + "r" : 0x030a, # ring above + "H" : 0x030b, # double acute + "v" : 0x030c, # caron + "b" : 0x0320, # minus sign below + "d" : 0x0323, # dot below + "c" : 0x0327, # cedilla + "k" : 0x0328, # ogonek + "t" : 0x0361 # tie +} + + +special_accent_map = { + 'i' : 0x0131, # dotless i + 'j' : 0x0237, # dotless j + 'l' : 0x0142, # l with stroke + 'L' : 0x0141, # L with stroke +} + +def _convert_accent(type, char): + if char == '': + if type in special_accent_map: + return unichr(special_accent_map[type]) + # a missing char is treated as space by LyX + char = ' ' + if (len(char) > 1): + # We can only convert accents on a single char + return '' + # \i and \j + if char[0] == "\\": + char = char[1:] + a = accent_map.get(type) + if a: + return unicodedata.normalize("NFKC", "%s%s" % (char, unichr(a))) + return '' + + +def convert_ertbackslash(body, i, ert, default_layout): + r""" ------------------------------------------------------------------------------------------- + Convert backslashes and '\n' into valid ERT code, append the converted + text to body[i] and return the (maybe incremented) line index i""" + + for c in ert: + if c == '\\': + body[i] = body[i] + '\\backslash ' + i = i + 1 + body.insert(i, '') + elif c == '\n': + body[i+1:i+1] = ['\\end_layout', '', '\\begin_layout %s' % default_layout, ''] + i = i + 4 + else: + body[i] = body[i] + c + return i + + +def convert_accent(document): + # The following forms are supported by LyX: + # '\i \"{a}' (standard form, as written by LyX) + # '\i \"{ }' (standard form, as written by LyX if the accented char is a space) + # '\i \"{}' (also accepted if the accented char is a space) + # '\i \" a' (also accepted) + # '\i \"' (also accepted) + re_wholeinset = re.compile(r'^(.*)(\\i\s+)(.*)$') + re_contents = re.compile(r'^([^\s{]+)(.*)$') + re_accentedcontents = re.compile(r'^\s*{?([^{}]*)}?\s*$') + i = 0 + while 1: + i = find_re(document.body, re_wholeinset, i) + if i == -1: + 
return + match = re_wholeinset.match(document.body[i]) + prefix = match.group(1) + contents = match.group(3).strip() + match = re_contents.match(contents) + if match: + # Strip first char (always \) + accent = match.group(1)[1:] + accented_contents = match.group(2).strip() + match = re_accentedcontents.match(accented_contents) + accented_char = match.group(1).strip() + converted = _convert_accent(accent, accented_char) + if converted == '': + contents = '%s{%s}' % (accent, accented_char), + else: + document.body[i] = '%s%s' % (prefix, converted) + i += 1 + continue + document.warning("Converting unknown InsetLaTeXAccent `\\i %s' to ERT." % contents) + document.body[i] = prefix + document.body[i+1:i+1] = ['\\begin_inset ERT', + 'status collapsed', + '', + '\\begin_layout %s' % document.default_layout, + '', + '', + ''] + i = convert_ertbackslash(document.body, i + 7, + '\\i %s' % contents, + document.default_layout) + document.body[i+1:i+1] = ['\\end_layout', + '', + '\\end_inset'] + i += 3 + + +def revert_accent(document): + numberoflines = len(document.body) + # Since LyX may insert a line break within a word we must combine all + # words before unicode normalization + for i in range(numberoflines-1): + if (document.body[i] == ''): + continue + while (document.body[i][-1] != ' ' and len(document.body[i+1]) > 0 and + document.body[i+1][0] != ' '): + document.body[i] += document.body[i+1][0] + document.body[i+1] = document.body[i+1][1:] + inverse_accent_map = {} + for k in accent_map.keys(): + inverse_accent_map[accent_map[k]] = k + inverse_special_accent_map = {} + for k in special_accent_map.keys(): + inverse_special_accent_map[special_accent_map[k]] = k + # Normalize to "Normal form D" (NFD, also known as canonical decomposition) + for i in range(numberoflines): + # This does not work: + # TypeError: normalize() argument 2 must be unicode, not str + document.body[i] = unicodedata.normalize("NFKD", document.body[i]) + # This does not work either: + # TypeError: 
decoding Unicode is not supported + document.body[i] = unicodedata.normalize("NFKD", unicode(document.body[i], 'utf-8')) + # Replace accented characters with InsetLaTeXAccent + # FIXME: Do not convert characters that can be represented in the chosen + # encoding + for i in range(len(document.body)): + for j in range(len(document.body[i])): + if document.body[i][j] in inverse_accent_map: + accent = document.body[i][j] + # Insert the rest of the line as new line + if j < len(document.body[i]) - 1: + document.body[i+1:i+1] = document.body[i][j+1:] + document.body[i][j:] = "\\i %s{}" % inverse_special_accent_map[accent] + elif j > 0 and document.body[i][j] in inverse_accent_map: + accented_char = document.body[i][j-1] + accent = document.body[i][j] + # Insert the rest of the line as new line + if j < len(document.body[i]) - 1: + document.body[i+1:i+1] = document.body[i][j+1:] + document.body[i][j-1:] = "\\i %s{%s}" % (inverse_accent_map[accent], accented_char) + # Normalize to "Normal form C" (NFC, pre-composed characters) again + for i in range(numberoflines): + document.body[i] = unicodedata.normalize("NFKC", document.body[i]) + + ## # Conversion hub # @@ -734,16 +897,18 @@ convert = [[246, []], [253, []], [254, [convert_esint]], [255, []], - [256, []]] + [256, []], + [257, [convert_accent]]] -revert = [[255, [revert_encodings]], +revert = [[256, []], + [255, [revert_encodings]], [254, [revert_clearpage, revert_cleardoublepage]], [253, [revert_esint]], [252, [revert_nomenclature, revert_printnomenclature]], [251, [revert_commandparams]], [250, [revert_cs_label]], [249, []], - [248, [revert_utf8]], + [248, [revert_accent, revert_utf8]], [247, [revert_booktabs]], [246, [revert_font_settings]], [245, [revert_framed]]]