When handling an epub manifest with UTF-8 characters, a2x would crash with a UnicodeEncodeError. This is because a2x tried to read the manifest using the default encoding (usually ASCII) and failed on non-ASCII characters.
This patch changes the behaviour so that reading and writing are done with an explicit encoding, and files are produced UTF-8 encoded by default. When handling HTML, we first look at the encoding specified in the file and decode the contents accordingly, so that HTMLParser is always passed unicode. For a reproducer see: https://bugzilla.redhat.com/show_bug.cgi?id=968308 -- You received this message because you are subscribed to the Google Groups "asciidoc" group. To unsubscribe from this group and stop receiving emails from it, send an email to [email protected]. To post to this group, send email to [email protected]. Visit this group at http://groups.google.com/group/asciidoc?hl=en. For more options, visit https://groups.google.com/groups/opt_out.
# HG changeset patch # User Stanislav Ochotnicky <[email protected]> # Date 1370342018 -7200 # Node ID 300eecfedc285dfed271442c693412764e20a206 # Parent 2763a041e498eb5d2cf47b275e618a61b9bf0198 [a2x] Handle encoding of resources and use unicode internally When handling epub manifest with UTF-8 characters, a2x would crash with UnicodeEncodeError. This is because a2x would try to read the manifest with default encoding (usually ASCII) and fail on unicode characters. This patch changes behaviour so that during reading/writing we work with encodings and produce UTF-8 encoded files by default. When handling HTML we first look at encoding specified and decode contents before always passing unicode to HTMLParser. For reproducer see: https://bugzilla.redhat.com/show_bug.cgi?id=968308 diff --git a/a2x.py b/a2x.py --- a/a2x.py +++ b/a2x.py @@ -10,6 +10,7 @@ ''' import os +import codecs import fnmatch import HTMLParser import re @@ -144,20 +145,31 @@ result = _find_executable(file_name) return result -def write_file(filename, data, mode='w'): - f = open(filename, mode) +def write_file(filename, data, mode='w', encoding='utf-8'): + f = codecs.open(filename, mode, encoding) try: f.write(data) finally: f.close() -def read_file(filename, mode='r'): - f = open(filename, mode) +def read_file(filename, mode='r', encoding='utf-8'): + f = codecs.open(filename, mode, encoding) try: return f.read() finally: f.close() +def get_xml_encoding(filename): + try: + f = open(filename, 'r') + mo = re.search(r'\A<\?xml.* encoding="(.*?)"', f.read()) + if mo: + return mo.group(1) + else: + return 'utf-8' + finally: + f.close() + def shell_cd(path): verbose('chdir %s' % path) if not OPTIONS.dry_run: @@ -256,15 +268,9 @@ if OPTIONS.dry_run: continue parser = FindResources() - # HTMLParser has problems with non-ASCII strings. 
- # See http://bugs.python.org/issue3932 - contents = read_file(filename) - mo = re.search(r'\A<\?xml.* encoding="(.*?)"', contents) - if mo: - encoding = mo.group(1) - parser.feed(contents.decode(encoding)) - else: - parser.feed(contents) + encoding = get_xml_encoding(filename) + contents = read_file(filename, 'r', encoding) + parser.feed(contents) parser.close() result = list(set(result)) # Drop duplicate values. result.sort()
