Author: Tyler Wade <[email protected]>
Branch: utf8-unicode2
Changeset: r72468:e70f582fd5dc
Date: 2014-07-17 01:43 -0500
http://bitbucket.org/pypy/pypy/changeset/e70f582fd5dc/
Log: Fix _multibytecodec
diff --git a/pypy/interpreter/utf8.py b/pypy/interpreter/utf8.py
--- a/pypy/interpreter/utf8.py
+++ b/pypy/interpreter/utf8.py
@@ -2,9 +2,8 @@
from rpython.rlib.objectmodel import specialize
from rpython.rlib.runicode import utf8_code_length
from rpython.rlib.unicodedata import unicodedb_5_2_0 as unicodedb
-from rpython.rlib.rarithmetic import r_uint
-from rpython.rtyper.lltypesystem import rffi
-from rpython.rtyper.lltypesystem import lltype
+from rpython.rlib.rarithmetic import r_uint, intmask
+from rpython.rtyper.lltypesystem import rffi, lltype
wchar_rint = rffi.r_uint
WCHAR_INTP = rffi.UINTP
@@ -464,7 +463,7 @@
if rffi.sizeof(rffi.WCHAR_T) == 2:
if 0xD800 <= c <= 0xDBFF:
i += 1
- c2 = int(array[i])
+ c2 = intmask(array[i])
if c2 == 0:
builder.append(c)
break
@@ -485,7 +484,7 @@
builder = Utf8Builder()
i = 0;
while i < size:
- c = int(array[i])
+ c = intmask(array[i])
if c == 0:
break
@@ -513,7 +512,7 @@
builder = Utf8Builder()
i = 0;
while i < size:
- c = int(array[i])
+ c = intmask(array[i])
if rffi.sizeof(rffi.WCHAR_T) == 2:
if i != size - 1 and 0xD800 <= c <= 0xDBFF:
diff --git a/pypy/module/_multibytecodec/c_codecs.py b/pypy/module/_multibytecodec/c_codecs.py
--- a/pypy/module/_multibytecodec/c_codecs.py
+++ b/pypy/module/_multibytecodec/c_codecs.py
@@ -1,8 +1,9 @@
import py
from rpython.rtyper.lltypesystem import lltype, rffi
from rpython.translator.tool.cbuild import ExternalCompilationInfo
+from pypy.interpreter.utf8 import Utf8Str
-UNICODE_REPLACEMENT_CHARACTER = u'\uFFFD'
+UNICODE_REPLACEMENT_CHARACTER = Utf8Str.from_unicode(u'\uFFFD')
class EncodeDecodeError(Exception):
@@ -139,7 +140,7 @@
errorcb, namecb, stringdata)
src = pypy_cjk_dec_outbuf(decodebuf)
length = pypy_cjk_dec_outlen(decodebuf)
- return rffi.wcharpsize2unicode(src, length)
+ return Utf8Str.from_wcharpsize(src, length)
#
finally:
rffi.free_nonmovingbuffer(stringdata, inbuf)
@@ -164,18 +165,18 @@
if errors == "strict":
raise EncodeDecodeError(start, end, reason)
elif errors == "ignore":
- replace = u""
+ replace = Utf8Str("")
elif errors == "replace":
replace = UNICODE_REPLACEMENT_CHARACTER
else:
assert errorcb
replace, end = errorcb(errors, namecb, reason,
stringdata, start, end)
- inbuf = rffi.get_nonmoving_unicodebuffer(replace)
+ inbuf = replace.copy_to_wcharp()
try:
r = pypy_cjk_dec_replace_on_error(decodebuf, inbuf, len(replace), end)
finally:
- rffi.free_nonmoving_unicodebuffer(replace, inbuf)
+ rffi.free_wcharp(inbuf)
if r == MBERR_NOMEMORY:
raise MemoryError
@@ -222,7 +223,7 @@
def encodeex(encodebuf, unicodedata, errors="strict", errorcb=None,
namecb=None, ignore_error=0):
inleft = len(unicodedata)
- inbuf = rffi.get_nonmoving_unicodebuffer(unicodedata)
+ inbuf = unicodedata.copy_to_wcharp()
try:
if pypy_cjk_enc_init(encodebuf, inbuf, inleft) < 0:
raise MemoryError
@@ -247,7 +248,7 @@
return rffi.charpsize2str(src, length)
#
finally:
- rffi.free_nonmoving_unicodebuffer(unicodedata, inbuf)
+ rffi.free_wcharp(inbuf)
def multibytecodec_encerror(encodebuf, e, errors,
errorcb, namecb, unicodedata):
@@ -273,7 +274,7 @@
elif errors == "replace":
codec = pypy_cjk_enc_getcodec(encodebuf)
try:
- replace = encode(codec, u"?")
+ replace = encode(codec, Utf8Str("?"))
except EncodeDecodeError:
replace = "?"
else:
diff --git a/pypy/module/_multibytecodec/test/test_c_codecs.py b/pypy/module/_multibytecodec/test/test_c_codecs.py
--- a/pypy/module/_multibytecodec/test/test_c_codecs.py
+++ b/pypy/module/_multibytecodec/test/test_c_codecs.py
@@ -1,4 +1,5 @@
import py
+from pypy.interpreter.utf8 import Utf8Str
from pypy.module._multibytecodec.c_codecs import getcodec, codecs
from pypy.module._multibytecodec.c_codecs import decode, encode
from pypy.module._multibytecodec.c_codecs import EncodeDecodeError
@@ -95,37 +96,38 @@
def test_encode_hz():
c = getcodec("hz")
- s = encode(c, u'foobar')
+ s = encode(c, Utf8Str('foobar'))
assert s == 'foobar' and type(s) is str
- s = encode(c, u'\u5f95\u6cef')
+ s = encode(c, Utf8Str.from_unicode(u'\u5f95\u6cef'))
assert s == '~{abc}~}'
def test_encode_hz_error():
# error
c = getcodec("hz")
- e = py.test.raises(EncodeDecodeError, encode, c, u'abc\u1234def').value
+ e = py.test.raises(EncodeDecodeError, encode, c,
+ Utf8Str.from_unicode(u'abc\u1234def')).value
assert e.start == 3
assert e.end == 4
assert e.reason == "illegal multibyte sequence"
def test_encode_hz_ignore():
c = getcodec("hz")
- s = encode(c, u'abc\u1234def', 'ignore')
+ s = encode(c, Utf8Str.from_unicode(u'abc\u1234def'), 'ignore')
assert s == 'abcdef'
def test_encode_hz_replace():
c = getcodec("hz")
- s = encode(c, u'abc\u1234def', 'replace')
+ s = encode(c, Utf8Str.from_unicode(u'abc\u1234def'), 'replace')
assert s == 'abc?def'
def test_encode_jisx0208():
c = getcodec('iso2022_jp')
- s = encode(c, u'\u83ca\u5730\u6642\u592b')
+ s = encode(c, Utf8Str.from_unicode(u'\u83ca\u5730\u6642\u592b'))
assert s == '\x1b$B5FCO;~IW\x1b(B' and type(s) is str
def test_encode_custom_error_handler_bytes():
c = getcodec("hz")
def errorhandler(errors, enc, msg, t, startingpos, endingpos):
return None, '\xc3', endingpos
- s = encode(c, u'abc\u1234def', 'foo', errorhandler)
+ s = encode(c, Utf8Str.from_unicode(u'abc\u1234def'), 'foo', errorhandler)
assert '\xc3' in s
diff --git a/pypy/module/sys/vm.py b/pypy/module/sys/vm.py
--- a/pypy/module/sys/vm.py
+++ b/pypy/module/sys/vm.py
@@ -3,11 +3,11 @@
"""
from rpython.rlib import jit
-from rpython.rlib.runicode import MAXUNICODE
from pypy.interpreter import gateway
from pypy.interpreter.error import OperationError
from pypy.interpreter.gateway import unwrap_spec
+from pypy.interpreter.utf8_codecs import MAXUNICODE
# ____________________________________________________________
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit