Author: Brian Kearns <[email protected]>
Branch: stdlib-2.7.6
Changeset: r69599:62fa89efe2e7
Date: 2014-03-02 02:02 -0500
http://bitbucket.org/pypy/pypy/changeset/62fa89efe2e7/

Log:    fix incremental utf-16 decoder (cpython issue11461)

diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -134,11 +134,15 @@
                 u"\x00\xff\u07ff\u0800",
                 u"\x00\xff\u07ff\u0800",
                 u"\x00\xff\u07ff\u0800\uffff",
+                u"\x00\xff\u07ff\u0800\uffff",
+                u"\x00\xff\u07ff\u0800\uffff",
+                u"\x00\xff\u07ff\u0800\uffff",
+                u"\x00\xff\u07ff\u0800\uffff\U00010000",
             ]
 
         buffer = ''
         result = u""
-        for (c, partialresult) in zip(u"\x00\xff\u07ff\u0800\uffff".encode(encoding), check_partial):
+        for (c, partialresult) in zip(u"\x00\xff\u07ff\u0800\uffff\U00010000".encode(encoding), check_partial):
             buffer += c
             res = _codecs.utf_8_decode(buffer,'strict',False)
             if res[1] >0 :
@@ -160,10 +164,14 @@
                     u"\x00\xff\u0100",
                     u"\x00\xff\u0100",
                     u"\x00\xff\u0100\uffff",
+                    u"\x00\xff\u0100\uffff",
+                    u"\x00\xff\u0100\uffff",
+                    u"\x00\xff\u0100\uffff",
+                    u"\x00\xff\u0100\uffff\U00010000",
                 ]
         buffer = ''
         result = u""
-        for (c, partialresult) in zip(u"\x00\xff\u0100\uffff".encode(encoding), check_partial):
+        for (c, partialresult) in zip(u"\x00\xff\u0100\uffff\U00010000".encode(encoding), check_partial):
             buffer += c
             res = _codecs.utf_16_decode(buffer,'strict',False)
             if res[1] >0 :
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -1,8 +1,8 @@
 import py
 import sys
 
+
 class TestUnicodeObject:
-
     def test_comparison_warning(self):
         warnings = []
         def my_warn(msg, warningscls):
@@ -32,6 +32,7 @@
                 space.w_unicode, "__new__", space.w_unicode, w_uni)
         assert w_new is w_uni
 
+
 class AppTestUnicodeStringStdOnly:
     def test_compares(self):
         assert u'a' == 'a'
@@ -314,7 +315,6 @@
         assert u'xyzzyhelloxyzzy'.lstrip('xyz') == u'helloxyzzy'
         assert u'xyzzyhelloxyzzy'.rstrip(u'xyz') == u'xyzzyhello'
 
-
     def test_long_from_unicode(self):
         assert long(u'12345678901234567890') == 12345678901234567890
         assert int(u'12345678901234567890') == 12345678901234567890
@@ -336,7 +336,7 @@
                      u'a', u'"', u'\'', u'\"', u'\t', u'\\', u"'''\"",
                      unichr(19), unichr(2), u'\u1234', u'\U00101234']:
             assert eval(repr(ustr)) == ustr
-            
+
     def test_getnewargs(self):
         class X(unicode):
             pass
@@ -400,7 +400,7 @@
         assert not 'hello'.endswith((u'he\u1111', u'he'))
         assert 'hello'.endswith((u'\u1111lo', u'llo'))
         assert 'hello'.endswith((u'\u1111hellox', u'hello'))
-    
+
     def test_endswith(self):
         assert u'ab'.endswith(u'ab') is True
         assert u'ab'.endswith(u'b') is True
@@ -441,13 +441,13 @@
 
         s = u'xy\t'
         assert s.expandtabs() =='xy      '
-        
+
         s = u'\txy\t'
         assert s.expandtabs() =='        xy      '
         assert s.expandtabs(1) ==' xy '
         assert s.expandtabs(2) =='  xy  '
         assert s.expandtabs(3) =='   xy '
-        
+
         assert u'xy'.expandtabs() =='xy'
         assert u''.expandtabs() ==''
 
@@ -456,7 +456,7 @@
         if sys.maxint > (1 << 32):
             skip("Wrong platform")
         raises((OverflowError, MemoryError), u't\tt\t'.expandtabs, sys.maxint)
-        
+
     def test_translate(self):
         assert u'bbbc' == u'abababc'.translate({ord('a'):None})
         assert u'iiic' == u'abababc'.translate({ord('a'):None, ord('b'):ord('i')})
@@ -473,7 +473,7 @@
     def test_unicode_form_encoded_object(self):
         assert unicode('x', 'utf-8') == u'x'
         assert unicode('x', 'utf-8', 'strict') == u'x'
-        
+
     def test_unicode_startswith_tuple(self):
         assert u'xxx'.startswith(('x', 'y', 'z'), 0)
         assert u'xxx'.endswith(('x', 'y', 'z'), 0)
@@ -572,7 +572,6 @@
 
 
     def test_partition(self):
-
         assert (u'this is the par', u'ti', u'tion method') == \
             u'this is the partition method'.partition(u'ti')
 
@@ -587,7 +586,6 @@
         raises(TypeError, S.partition, None)
 
     def test_rpartition(self):
-
         assert (u'this is the rparti', u'ti', u'on method') == \
             u'this is the rpartition method'.rpartition(u'ti')
 
@@ -601,7 +599,6 @@
         raises(ValueError, S.rpartition, u'')
         raises(TypeError, S.rpartition, None)
 
-
     def test_mul(self):
         zero = 0
         assert type(u'' * zero) == type(zero * u'') == unicode
@@ -730,7 +727,7 @@
                 return X("stuff")
 
         assert unicode(Y()).__class__ is X
-    
+
     def test_getslice(self):
         assert u'123456'.__getslice__(1, 5) == u'2345'
         s = u"abc"
@@ -827,7 +824,7 @@
 
             def __unicode__(self):
                 return u'bar'
-    
+
         a = A()
         b = B()
         s = '%s %s' % (a, b)
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -453,10 +453,11 @@
             continue
         # UTF-16 code pair:
         if len(s) - pos < 2:
+            pos -= 2
             if not final:
                 break
             errmsg = "unexpected end of data"
-            r, pos = errorhandler(errors, 'utf16', errmsg, s, pos - 2, len(s))
+            r, pos = errorhandler(errors, 'utf16', errmsg, s, pos, len(s))
             result.append(r)
             if len(s) - pos < 2:
                 break
diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py
--- a/rpython/rlib/test/test_runicode.py
+++ b/rpython/rlib/test/test_runicode.py
@@ -4,6 +4,7 @@
 import sys, random
 from rpython.rlib import runicode
 
+
 def test_unichr():
     assert runicode.UNICHR(0xffff) == u'\uffff'
     if runicode.MAXUNICODE > 0xffff:
@@ -15,6 +16,7 @@
         py.test.raises(ValueError, runicode.UNICHR, 0x10000)
     py.test.raises(TypeError, runicode.UNICHR, 'abc')
 
+
 def test_ord():
     assert runicode.ORD('a') == 97
     assert runicode.ORD(u'a') == 97
@@ -118,7 +120,6 @@
 
 
 class TestDecoding(UnicodeTests):
-
     # XXX test bom recognition in utf-16
     # XXX test proper error handling
 
@@ -552,7 +553,6 @@
             self.checkdecodeerror(s, "utf-8", 0, 3, addstuff=True,
                                   msg='invalid continuation byte')
 
-
     def test_issue8271(self):
         # From CPython
         # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
@@ -648,6 +648,7 @@
             assert decoder(seq, len(seq), 'ignore', final=True
                            ) == (res, len(seq))
 
+
 class TestEncoding(UnicodeTests):
     def test_all_ascii(self):
         for i in range(128):
@@ -759,6 +760,7 @@
         py.test.raises(UnicodeEncodeError, encoder, u' 12, \u1234 ', 7, None)
         assert encoder(u'u\u1234', 2, 'replace') == 'u?'
 
+
 class TestTranslation(object):
     def setup_class(cls):
         if runicode.MAXUNICODE != sys.maxunicode:
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

Reply via email to