[pypy-commit] pypy stdlib-2.7.11: I have conquered the latest round of utf7 bugs!

alex_gaynor Sun, 20 Mar 2016 07:48:10 -0700

Author: Alex Gaynor <[email protected]>
Branch: stdlib-2.7.11
Changeset: r83191:f3d2f640ffca
Date: 2016-03-20 10:46 -0400
http://bitbucket.org/pypy/pypy/changeset/f3d2f640ffca/


Log:    I have conquered the latest round of utf7 bugs!

diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -28,8 +28,8 @@
         raises( UnicodeDecodeError, unicode,'\\NSPACE}','unicode-escape')
         raises( UnicodeDecodeError, unicode,'\\NSPACE','unicode-escape')
         raises( UnicodeDecodeError, unicode,'\\N','unicode-escape')
-        assert  unicode('\\N{SPACE}\\N{SPACE}','unicode-escape') == u"  " 
-        assert  unicode('\\N{SPACE}a\\N{SPACE}','unicode-escape') == u" a " 
+        assert  unicode('\\N{SPACE}\\N{SPACE}','unicode-escape') == u"  "
+        assert  unicode('\\N{SPACE}a\\N{SPACE}','unicode-escape') == u" a "
         assert "\\N{foo}xx".decode("unicode-escape", "ignore") == u"xx"
         assert 1 <= len(u"\N{CJK UNIFIED IDEOGRAPH-20000}") <= 2
 
@@ -676,6 +676,9 @@
             (b'a+//,+IKw-b', u'a\ufffd\u20acb'),
             (b'a+///,+IKw-b', u'a\uffff\ufffd\u20acb'),
             (b'a+////,+IKw-b', u'a\uffff\ufffd\u20acb'),
+            (b'a+2AE\xe1b', u'a\ufffdb'),
+            (b'a+2AEA-b', u'a\ufffdb'),
+            (b'a+2AH-b', u'a\ufffdb'),
         ]
         for raw, expected in tests:
             raises(UnicodeDecodeError, codecs.utf_7_decode, raw, 'strict', True)
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -812,10 +812,9 @@
     startinpos = 0
     while pos < size:
         ch = s[pos]
-        oc = ord(ch)
 
         if inShift: # in a base-64 section
-            if _utf7_IS_BASE64(oc): #consume a base-64 character
+            if _utf7_IS_BASE64(ord(ch)): #consume a base-64 character
                 base64buffer = (base64buffer << 6) | _utf7_FROM_BASE64(ch)
                 base64bits += 6
                 pos += 1
@@ -828,7 +827,7 @@
                     assert outCh <= 0xffff
                     if surrogate:
                         # expecting a second surrogate
-                        if outCh >= 0xDC00 and outCh <= 0xDFFFF:
+                        if outCh >= 0xDC00 and outCh <= 0xDFFF:
                             if MAXUNICODE < 65536:
                                 result.append(unichr(surrogate))
                                 result.append(unichr(outCh))
@@ -851,15 +850,11 @@
             else:
                 # now leaving a base-64 section
                 inShift = False
-                pos += 1
-
-                if surrogate:
-                    result.append(unichr(surrogate))
-                    surrogate = 0
 
                 if base64bits > 0: # left-over bits
                     if base64bits >= 6:
                         # We've seen at least one base-64 character
+                        pos += 1
                         msg = "partial character in shift sequence"
                         res, pos = errorhandler(errors, 'utf7',
                                                 msg, s, pos-1, pos)
@@ -868,20 +863,21 @@
                     else:
                         # Some bits remain; they should be zero
                         if base64buffer != 0:
+                            pos += 1
                             msg = "non-zero padding bits in shift sequence"
                             res, pos = errorhandler(errors, 'utf7',
                                                     msg, s, pos-1, pos)
                             result.append(res)
                             continue
 
+                if surrogate and _utf7_DECODE_DIRECT(ord(ch)):
+                    result.append(unichr(surrogate))
+                surrogate = 0
+
                 if ch == '-':
                     # '-' is absorbed; other terminating characters are
                     # preserved
-                    base64bits = 0
-                    base64buffer = 0
-                    surrogate = 0
-                else:
-                    result.append(unichr(ord(ch)))
+                    pos += 1
 
         elif ch == '+':
             startinpos = pos
@@ -891,12 +887,13 @@
                 result.append(u'+')
             else: # begin base64-encoded section
                 inShift = 1
+                surrogate = 0
                 shiftOutStartPos = result.getlength()
                 base64bits = 0
                 base64buffer = 0
 
-        elif _utf7_DECODE_DIRECT(oc): # character decodes at itself
-            result.append(unichr(oc))
+        elif _utf7_DECODE_DIRECT(ord(ch)): # character decodes at itself
+            result.append(unichr(ord(ch)))
             pos += 1
         else:
             startinpos = pos
@@ -909,6 +906,7 @@
     final_length = result.getlength()
     if inShift and final: # in shift sequence, no more to follow
         # if we're in an inconsistent state, that's an error
+        inShift = 0
         if (surrogate or
             base64bits >= 6 or
             (base64bits > 0 and base64buffer != 0)):
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit

[pypy-commit] pypy stdlib-2.7.11: I have conquered the latest round of utf7 bugs!

Reply via email to