Author: Antonio Cuni <[email protected]>
Branch: rpython-utf8
Changeset: r56943:18ef63ba4fb6
Date: 2012-08-30 16:24 +0200
http://bitbucket.org/pypy/pypy/changeset/18ef63ba4fb6/

Log:    add the possibility of doing x.decode('utf-8') in rpython

diff --git a/pypy/annotation/unaryop.py b/pypy/annotation/unaryop.py
--- a/pypy/annotation/unaryop.py
+++ b/pypy/annotation/unaryop.py
@@ -553,7 +553,7 @@
         if not s_enc.is_constant():
             raise TypeError("Non-constant encoding not supported")
         enc = s_enc.const
-        if enc not in ('ascii', 'latin-1'):
+        if enc not in ('ascii', 'latin-1', 'utf-8'):
             raise TypeError("Encoding %s not supported for strings" % (enc,))
         return SomeUnicodeString()
     method_decode.can_only_throw = [UnicodeDecodeError]
diff --git a/pypy/rlib/runicode.py b/pypy/rlib/runicode.py
--- a/pypy/rlib/runicode.py
+++ b/pypy/rlib/runicode.py
@@ -47,12 +47,10 @@
 
 def raise_unicode_exception_decode(errors, encoding, msg, s,
                                    startingpos, endingpos):
-    assert isinstance(s, str)
     raise UnicodeDecodeError(encoding, s, startingpos, endingpos, msg)
 
 def raise_unicode_exception_encode(errors, encoding, msg, u,
                                    startingpos, endingpos):
-    assert isinstance(u, unicode)
     raise UnicodeEncodeError(encoding, u, startingpos, endingpos, msg)
 
 # ____________________________________________________________
diff --git a/pypy/rpython/lltypesystem/rstr.py 
b/pypy/rpython/lltypesystem/rstr.py
--- a/pypy/rpython/lltypesystem/rstr.py
+++ b/pypy/rpython/lltypesystem/rstr.py
@@ -143,6 +143,13 @@
             s.chars[i] = cast_primitive(UniChar, value.chars[i])
         return s
 
+    def ll_decode_utf8(self, llvalue):
+        from pypy.rpython.annlowlevel import hlstr, llunicode
+        from pypy.rlib.runicode import str_decode_utf_8
+        value = hlstr(llvalue)
+        univalue, _ = str_decode_utf_8(value, len(value), 'strict')
+        return llunicode(univalue)
+
 class UnicodeRepr(BaseLLStringRepr, AbstractUnicodeRepr):
     lowleveltype = Ptr(UNICODE)
     basetype = basestring
diff --git a/pypy/rpython/ootypesystem/rstr.py 
b/pypy/rpython/ootypesystem/rstr.py
--- a/pypy/rpython/ootypesystem/rstr.py
+++ b/pypy/rpython/ootypesystem/rstr.py
@@ -60,6 +60,13 @@
             sb.ll_append_char(cast_primitive(UniChar, c))
         return sb.ll_build()
 
+    def ll_decode_utf8(self, llvalue):
+        from pypy.rpython.annlowlevel import hlstr, oounicode
+        from pypy.rlib.runicode import str_decode_utf_8
+        value = hlstr(llvalue)
+        univalue, _ = str_decode_utf_8(value, len(value), 'strict')
+        return oounicode(univalue)
+
 
 class UnicodeRepr(BaseOOStringRepr, AbstractUnicodeRepr):
     lowleveltype = ootype.Unicode
diff --git a/pypy/rpython/rstr.py b/pypy/rpython/rstr.py
--- a/pypy/rpython/rstr.py
+++ b/pypy/rpython/rstr.py
@@ -309,6 +309,8 @@
             return hop.gendirectcall(self.ll.ll_str2unicode, v_self)
         elif encoding == 'latin-1':
             return hop.gendirectcall(self.ll_decode_latin1, v_self)
+        elif encoding == 'utf-8':
+            return hop.gendirectcall(self.ll_decode_utf8, v_self)
         else:
             raise TyperError("encoding %s not implemented" % (encoding, ))
 
diff --git a/pypy/rpython/test/test_runicode.py 
b/pypy/rpython/test/test_runicode.py
--- a/pypy/rpython/test/test_runicode.py
+++ b/pypy/rpython/test/test_runicode.py
@@ -130,11 +130,14 @@
         assert self.interpret(f, [300, False]) == f(300, False)
 
     def test_unicode_decode(self):
-        def f(x):
-            y = 'xxx'
-            return (y + chr(x)).decode('ascii') + chr(x).decode("latin-1") 
+        strings = ['xxx', u'&#224;&#232;&#236;'.encode('latin-1'), 
u'&#32654;'.encode('utf-8')]
+        def f(n):
+            x = strings[n]
+            y = strings[n+1]
+            z = strings[n+2]
+            return x.decode('ascii') + y.decode('latin-1') + z.decode('utf-8')
 
-        assert self.ll_to_string(self.interpret(f, [38])) == f(38)
+        assert self.ll_to_string(self.interpret(f, [0])) == f(0)
 
     def test_unicode_decode_error(self):
         def f(x):
_______________________________________________
pypy-commit mailing list
[email protected]
http://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to