Author: Amaury Forgeot d'Arc <amaur...@gmail.com> Branch: stdlib-3.2.5 Changeset: r70392:73ca2fbe4077 Date: 2014-04-02 02:21 +0200 http://bitbucket.org/pypy/pypy/changeset/73ca2fbe4077/
Log: Expat parser now correctly works with unicode input, even when the XML internal encoding is not UTF8 (CPython issue 17089) diff --git a/pypy/module/pyexpat/interp_pyexpat.py b/pypy/module/pyexpat/interp_pyexpat.py --- a/pypy/module/pyexpat/interp_pyexpat.py +++ b/pypy/module/pyexpat/interp_pyexpat.py @@ -2,6 +2,7 @@ from pypy.interpreter.typedef import TypeDef, GetSetProperty from pypy.interpreter.gateway import interp2app, unwrap_spec, WrappedDefault from pypy.interpreter.error import OperationError, oefmt +from pypy.interpreter.unicodehelper import encode_utf8 from rpython.rlib import rgc, jit from rpython.rtyper.lltypesystem import rffi, lltype from rpython.rtyper.tool import rffi_platform @@ -348,6 +349,8 @@ XML_SetUnknownEncodingHandler = expat_external( 'XML_SetUnknownEncodingHandler', [XML_Parser, callback_type, rffi.VOIDP], lltype.Void) +XML_SetEncoding = expat_external( + 'XML_SetEncoding', [XML_Parser, rffi.CCHARP], rffi.INT) # Declarations of external functions @@ -622,10 +625,17 @@ # Parse methods - @unwrap_spec(data='bufferstr_or_u', isfinal=bool) - def Parse(self, space, data, isfinal=False): + @unwrap_spec(isfinal=bool) + def Parse(self, space, w_data, isfinal=False): """Parse(data[, isfinal]) Parse XML data. `isfinal' should be true at end of input.""" + if space.isinstance_w(w_data, space.w_unicode): + u = w_data.unicode_w(space) + data = encode_utf8(space, w_data.unicode_w(space)) + # Explicitly set UTF-8 encoding. Return code ignored. + XML_SetEncoding(self.itself, "utf-8") + else: + data = space.bufferstr_w(w_data) res = XML_Parse(self.itself, data, len(data), isfinal) if self._exc_info: e = self._exc_info @@ -643,9 +653,8 @@ eof = False while not eof: w_data = space.call_method(w_file, 'read', space.wrap(2048)) - data = space.bytes_w(w_data) - eof = len(data) == 0 - w_res = self.Parse(space, data, isfinal=eof) + eof = space.len_w(w_data) == 0 + w_res = self.Parse(space, w_data, isfinal=eof) return w_res @unwrap_spec(base=str) diff --git a/pypy/module/pyexpat/test/test_parser.py b/pypy/module/pyexpat/test/test_parser.py --- a/pypy/module/pyexpat/test/test_parser.py +++ b/pypy/module/pyexpat/test/test_parser.py @@ -100,7 +100,7 @@ p.Parse(xml) def test_python_encoding(self): - # This name is not knonwn by expat + # This name is not known by expat xml = b"<?xml version='1.0' encoding='latin1'?><s>caf\xe9</s>" import pyexpat p = pyexpat.ParserCreate() @@ -110,12 +110,21 @@ p.Parse(xml) def test_mbcs(self): - xml = "<?xml version='1.0' encoding='gbk'?><p/>" + xml = b"<?xml version='1.0' encoding='gbk'?><p/>" import pyexpat p = pyexpat.ParserCreate() exc = raises(ValueError, p.Parse, xml) assert str(exc.value) == "multi-byte encodings are not supported" + def test_parse_str(self): + xml = "<?xml version='1.0' encoding='latin1'?><s>caf\xe9</s>" + import pyexpat + p = pyexpat.ParserCreate() + def gotText(text): + assert text == "caf\xe9" + p.CharacterDataHandler = gotText + p.Parse(xml) + def test_decode_error(self): xml = b'<fran\xe7ais>Comment \xe7a va ? Tr\xe8s bien ?</fran\xe7ais>' import pyexpat _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit