José, or any other Python expert — I need your help. The attached patch is the lyx2lyx part of the "getting rid of InsetLaTeXAccent" patch. convert_accent works well, but revert_accent does not. I put the error messages in the file. Can anybody tell me why
document.body[i] = unicodedata.normalize("NFKD", document.body[i]) does not work? If you want to try this, use the attached test file (don't save it with LyX if you want to test convert_accent, it is hand crafted). Georg
latexaccent-all.lyx
Description: application/lyx
Index: lib/lyx2lyx/LyX.py =================================================================== --- lib/lyx2lyx/LyX.py (Revision 16629) +++ lib/lyx2lyx/LyX.py (Arbeitskopie) @@ -73,7 +73,7 @@ format_relation = [("0_06", [200], ge ("1_2", [220], generate_minor_versions("1.2" , 4)), ("1_3", [221], generate_minor_versions("1.3" , 7)), ("1_4", range(222,246), generate_minor_versions("1.4" , 3)), - ("1_5", range(246,257), generate_minor_versions("1.5" , 0))] + ("1_5", range(246,258), generate_minor_versions("1.5" , 0))] def formats_list(): Index: lib/lyx2lyx/lyx_1_5.py =================================================================== --- lib/lyx2lyx/lyx_1_5.py (Revision 16629) +++ lib/lyx2lyx/lyx_1_5.py (Arbeitskopie) @@ -20,7 +20,9 @@ """ Convert files to the file format generated by lyx 1.5""" import re -from parser_tools import find_token, find_token_exact, find_tokens, find_end_of, get_value +import unicodedata + +from parser_tools import find_re, find_token, find_token_exact, find_tokens, find_end_of, get_value from LyX import get_encoding @@ -719,6 +721,167 @@ def revert_encodings(document): document.inputencoding = get_value(document.header, "\\inputencoding", 0) +accent_map = { + "`" : 0x0309, # grave + "'" : 0x0301, # acute + "^" : 0x0302, # circumflex + "~" : 0x0303, # tilde + "=" : 0x0304, # macron + "u" : 0x0306, # breve + "." 
: 0x0307, # dot above + "\"": 0x0308, # diaresis + "r" : 0x030a, # ring above + "H" : 0x030b, # double acute + "v" : 0x030c, # caron + "b" : 0x0320, # minus sign below + "d" : 0x0323, # dot below + "c" : 0x0327, # cedilla + "k" : 0x0328, # ogonek + "t" : 0x0361 # tie +} + + +special_accent_map = { + 'i' : 0x0131, # dotless i + 'j' : 0x0237, # dotless j + 'l' : 0x0142, # l with stroke + 'L' : 0x0141, # L with stroke +} + +def _convert_accent(type, char): + if char == '': + if type in special_accent_map: + return unichr(special_accent_map[type]) + # a missing char is treated as space by LyX + char = ' ' + if (len(char) > 1): + # We can only convert accents on a single char + return '' + # \i and \j + if char[0] == "\\": + char = char[1:] + a = accent_map.get(type) + if a: + return unicodedata.normalize("NFKC", "%s%s" % (char, unichr(a))) + return '' + + +def convert_ertbackslash(body, i, ert, default_layout): + r""" ------------------------------------------------------------------------------------------- + Convert backslashes and '\n' into valid ERT code, append the converted + text to body[i] and return the (maybe incremented) line index i""" + + for c in ert: + if c == '\\': + body[i] = body[i] + '\\backslash ' + i = i + 1 + body.insert(i, '') + elif c == '\n': + body[i+1:i+1] = ['\\end_layout', '', '\\begin_layout %s' % default_layout, ''] + i = i + 4 + else: + body[i] = body[i] + c + return i + + +def convert_accent(document): + # The following forms are supported by LyX: + # '\i \"{a}' (standard form, as written by LyX) + # '\i \"{ }' (standard form, as written by LyX if the accented char is a space) + # '\i \"{}' (also accepted if the accented char is a space) + # '\i \" a' (also accepted) + # '\i \"' (also accepted) + re_wholeinset = re.compile(r'^(.*)(\\i\s+)(.*)$') + re_contents = re.compile(r'^([^\s{]+)(.*)$') + re_accentedcontents = re.compile(r'^\s*{?([^{}]*)}?\s*$') + i = 0 + while 1: + i = find_re(document.body, re_wholeinset, i) + if i == -1: + 
return + match = re_wholeinset.match(document.body[i]) + prefix = match.group(1) + contents = match.group(3).strip() + match = re_contents.match(contents) + if match: + # Strip first char (always \) + accent = match.group(1)[1:] + accented_contents = match.group(2).strip() + match = re_accentedcontents.match(accented_contents) + accented_char = match.group(1).strip() + converted = _convert_accent(accent, accented_char) + if converted == '': + contents = '%s{%s}' % (accent, accented_char), + else: + document.body[i] = '%s%s' % (prefix, converted) + i += 1 + continue + document.warning("Converting unknown InsetLaTeXAccent `\\i %s' to ERT." % contents) + document.body[i] = prefix + document.body[i+1:i+1] = ['\\begin_inset ERT', + 'status collapsed', + '', + '\\begin_layout %s' % document.default_layout, + '', + '', + ''] + i = convert_ertbackslash(document.body, i + 7, + '\\i %s' % contents, + document.default_layout) + document.body[i+1:i+1] = ['\\end_layout', + '', + '\\end_inset'] + i += 3 + + +def revert_accent(document): + numberoflines = len(document.body) + # Since LyX may insert a line break within a word we must combine all + # words before unicode normalization + for i in range(numberoflines-1): + if (document.body[i] == ''): + continue + while (document.body[i][-1] != ' ' and len(document.body[i+1]) > 0 and + document.body[i+1][0] != ' '): + document.body[i] += document.body[i+1][0] + document.body[i+1] = document.body[i+1][1:] + inverse_accent_map = {} + for k in accent_map.keys(): + inverse_accent_map[accent_map[k]] = k + inverse_special_accent_map = {} + for k in special_accent_map.keys(): + inverse_special_accent_map[special_accent_map[k]] = k + # Normalize to "Normal form D" (NFD, also known as canonical decomposition) + for i in range(numberoflines): + # This does not work: + # TypeError: normalize() argument 2 must be unicode, not str + document.body[i] = unicodedata.normalize("NFKD", document.body[i]) + # This does not work either: + # TypeError: 
decoding Unicode is not supported + document.body[i] = unicodedata.normalize("NFKD", unicode(document.body[i], 'utf-8')) + # Replace accented characters with InsetLaTeXAccent + # FIXME: Do not convert characters that can be represented in the chosen + # encoding + for i in range(len(document.body)): + for j in range(len(document.body[i])): + if document.body[i][j] in inverse_accent_map: + accent = document.body[i][j] + # Insert the rest of the line as new line + if j < len(document.body[i]) - 1: + document.body[i+1:i+1] = document.body[i][j+1:] + document.body[i][j:] = "\\i %s{}" % inverse_special_accent_map[accent] + elif j > 0 and document.body[i][j] in inverse_accent_map: + accented_char = document.body[i][j-1] + accent = document.body[i][j] + # Insert the rest of the line as new line + if j < len(document.body[i]) - 1: + document.body[i+1:i+1] = document.body[i][j+1:] + document.body[i][j-1:] = "\\i %s{%s}" % (inverse_accent_map[accent], accented_char) + # Normalize to "Normal form C" (NFC, pre-composed characters) again + for i in range(numberoflines): + document.body[i] = unicodedata.normalize("NFKC", document.body[i]) + + ## # Conversion hub # @@ -734,16 +897,18 @@ convert = [[246, []], [253, []], [254, [convert_esint]], [255, []], - [256, []]] + [256, []], + [257, [convert_accent]]] -revert = [[255, [revert_encodings]], +revert = [[256, []], + [255, [revert_encodings]], [254, [revert_clearpage, revert_cleardoublepage]], [253, [revert_esint]], [252, [revert_nomenclature, revert_printnomenclature]], [251, [revert_commandparams]], [250, [revert_cs_label]], [249, []], - [248, [revert_utf8]], + [248, [revert_accent, revert_utf8]], [247, [revert_booktabs]], [246, [revert_font_settings]], [245, [revert_framed]]]