Author: Antonio Cuni <[email protected]>
Branch: rpython-utf8
Changeset: r56943:18ef63ba4fb6
Date: 2012-08-30 16:24 +0200
http://bitbucket.org/pypy/pypy/changeset/18ef63ba4fb6/
Log: add the possibility of doing x.decode('utf-8') in rpython
diff --git a/pypy/annotation/unaryop.py b/pypy/annotation/unaryop.py
--- a/pypy/annotation/unaryop.py
+++ b/pypy/annotation/unaryop.py
@@ -553,7 +553,7 @@
if not s_enc.is_constant():
raise TypeError("Non-constant encoding not supported")
enc = s_enc.const
- if enc not in ('ascii', 'latin-1'):
+ if enc not in ('ascii', 'latin-1', 'utf-8'):
raise TypeError("Encoding %s not supported for strings" % (enc,))
return SomeUnicodeString()
method_decode.can_only_throw = [UnicodeDecodeError]
diff --git a/pypy/rlib/runicode.py b/pypy/rlib/runicode.py
--- a/pypy/rlib/runicode.py
+++ b/pypy/rlib/runicode.py
@@ -47,12 +47,10 @@
def raise_unicode_exception_decode(errors, encoding, msg, s,
startingpos, endingpos):
- assert isinstance(s, str)
raise UnicodeDecodeError(encoding, s, startingpos, endingpos, msg)
def raise_unicode_exception_encode(errors, encoding, msg, u,
startingpos, endingpos):
- assert isinstance(u, unicode)
raise UnicodeEncodeError(encoding, u, startingpos, endingpos, msg)
# ____________________________________________________________
diff --git a/pypy/rpython/lltypesystem/rstr.py b/pypy/rpython/lltypesystem/rstr.py
--- a/pypy/rpython/lltypesystem/rstr.py
+++ b/pypy/rpython/lltypesystem/rstr.py
@@ -143,6 +143,13 @@
s.chars[i] = cast_primitive(UniChar, value.chars[i])
return s
+ def ll_decode_utf8(self, llvalue):
+ from pypy.rpython.annlowlevel import hlstr, llunicode
+ from pypy.rlib.runicode import str_decode_utf_8
+ value = hlstr(llvalue)
+ univalue, _ = str_decode_utf_8(value, len(value), 'strict')
+ return llunicode(univalue)
+
class UnicodeRepr(BaseLLStringRepr, AbstractUnicodeRepr):
lowleveltype = Ptr(UNICODE)
basetype = basestring
diff --git a/pypy/rpython/ootypesystem/rstr.py b/pypy/rpython/ootypesystem/rstr.py
--- a/pypy/rpython/ootypesystem/rstr.py
+++ b/pypy/rpython/ootypesystem/rstr.py
@@ -60,6 +60,13 @@
sb.ll_append_char(cast_primitive(UniChar, c))
return sb.ll_build()
+ def ll_decode_utf8(self, llvalue):
+ from pypy.rpython.annlowlevel import hlstr, oounicode
+ from pypy.rlib.runicode import str_decode_utf_8
+ value = hlstr(llvalue)
+ univalue, _ = str_decode_utf_8(value, len(value), 'strict')
+ return oounicode(univalue)
+
class UnicodeRepr(BaseOOStringRepr, AbstractUnicodeRepr):
lowleveltype = ootype.Unicode
diff --git a/pypy/rpython/rstr.py b/pypy/rpython/rstr.py
--- a/pypy/rpython/rstr.py
+++ b/pypy/rpython/rstr.py
@@ -309,6 +309,8 @@
return hop.gendirectcall(self.ll.ll_str2unicode, v_self)
elif encoding == 'latin-1':
return hop.gendirectcall(self.ll_decode_latin1, v_self)
+ elif encoding == 'utf-8':
+ return hop.gendirectcall(self.ll_decode_utf8, v_self)
else:
raise TyperError("encoding %s not implemented" % (encoding, ))
diff --git a/pypy/rpython/test/test_runicode.py b/pypy/rpython/test/test_runicode.py
--- a/pypy/rpython/test/test_runicode.py
+++ b/pypy/rpython/test/test_runicode.py
@@ -130,11 +130,14 @@
assert self.interpret(f, [300, False]) == f(300, False)
def test_unicode_decode(self):
- def f(x):
- y = 'xxx'
- return (y + chr(x)).decode('ascii') + chr(x).decode("latin-1")
+ strings = ['xxx', u'àèì'.encode('latin-1'), u'美'.encode('utf-8')]
+ def f(n):
+ x = strings[n]
+ y = strings[n+1]
+ z = strings[n+2]
+ return x.decode('ascii') + y.decode('latin-1') + z.decode('utf-8')
- assert self.ll_to_string(self.interpret(f, [38])) == f(38)
+ assert self.ll_to_string(self.interpret(f, [0])) == f(0)
def test_unicode_decode_error(self):
def f(x):
_______________________________________________
pypy-commit mailing list
[email protected]
http://mail.python.org/mailman/listinfo/pypy-commit