Author: Alex Gaynor <[email protected]>
Branch: stdlib-2.7.11
Changeset: r83191:f3d2f640ffca
Date: 2016-03-20 10:46 -0400
http://bitbucket.org/pypy/pypy/changeset/f3d2f640ffca/
Log: I have conquered the latest round of utf7 bugs!
diff --git a/pypy/module/_codecs/test/test_codecs.py b/pypy/module/_codecs/test/test_codecs.py
--- a/pypy/module/_codecs/test/test_codecs.py
+++ b/pypy/module/_codecs/test/test_codecs.py
@@ -28,8 +28,8 @@
raises( UnicodeDecodeError, unicode,'\\NSPACE}','unicode-escape')
raises( UnicodeDecodeError, unicode,'\\NSPACE','unicode-escape')
raises( UnicodeDecodeError, unicode,'\\N','unicode-escape')
- assert unicode('\\N{SPACE}\\N{SPACE}','unicode-escape') == u" "
- assert unicode('\\N{SPACE}a\\N{SPACE}','unicode-escape') == u" a "
+ assert unicode('\\N{SPACE}\\N{SPACE}','unicode-escape') == u" "
+ assert unicode('\\N{SPACE}a\\N{SPACE}','unicode-escape') == u" a "
assert "\\N{foo}xx".decode("unicode-escape", "ignore") == u"xx"
assert 1 <= len(u"\N{CJK UNIFIED IDEOGRAPH-20000}") <= 2
@@ -676,6 +676,9 @@
(b'a+//,+IKw-b', u'a\ufffd\u20acb'),
(b'a+///,+IKw-b', u'a\uffff\ufffd\u20acb'),
(b'a+////,+IKw-b', u'a\uffff\ufffd\u20acb'),
+ (b'a+2AE\xe1b', u'a\ufffdb'),
+ (b'a+2AEA-b', u'a\ufffdb'),
+ (b'a+2AH-b', u'a\ufffdb'),
]
for raw, expected in tests:
raises(UnicodeDecodeError, codecs.utf_7_decode, raw, 'strict',
True)
diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -812,10 +812,9 @@
startinpos = 0
while pos < size:
ch = s[pos]
- oc = ord(ch)
if inShift: # in a base-64 section
- if _utf7_IS_BASE64(oc): #consume a base-64 character
+ if _utf7_IS_BASE64(ord(ch)): #consume a base-64 character
base64buffer = (base64buffer << 6) | _utf7_FROM_BASE64(ch)
base64bits += 6
pos += 1
@@ -828,7 +827,7 @@
assert outCh <= 0xffff
if surrogate:
# expecting a second surrogate
- if outCh >= 0xDC00 and outCh <= 0xDFFFF:
+ if outCh >= 0xDC00 and outCh <= 0xDFFF:
if MAXUNICODE < 65536:
result.append(unichr(surrogate))
result.append(unichr(outCh))
@@ -851,15 +850,11 @@
else:
# now leaving a base-64 section
inShift = False
- pos += 1
-
- if surrogate:
- result.append(unichr(surrogate))
- surrogate = 0
if base64bits > 0: # left-over bits
if base64bits >= 6:
# We've seen at least one base-64 character
+ pos += 1
msg = "partial character in shift sequence"
res, pos = errorhandler(errors, 'utf7',
msg, s, pos-1, pos)
@@ -868,20 +863,21 @@
else:
# Some bits remain; they should be zero
if base64buffer != 0:
+ pos += 1
msg = "non-zero padding bits in shift sequence"
res, pos = errorhandler(errors, 'utf7',
msg, s, pos-1, pos)
result.append(res)
continue
+ if surrogate and _utf7_DECODE_DIRECT(ord(ch)):
+ result.append(unichr(surrogate))
+ surrogate = 0
+
if ch == '-':
# '-' is absorbed; other terminating characters are
# preserved
- base64bits = 0
- base64buffer = 0
- surrogate = 0
- else:
- result.append(unichr(ord(ch)))
+ pos += 1
elif ch == '+':
startinpos = pos
@@ -891,12 +887,13 @@
result.append(u'+')
else: # begin base64-encoded section
inShift = 1
+ surrogate = 0
shiftOutStartPos = result.getlength()
base64bits = 0
base64buffer = 0
- elif _utf7_DECODE_DIRECT(oc): # character decodes at itself
- result.append(unichr(oc))
+ elif _utf7_DECODE_DIRECT(ord(ch)): # character decodes at itself
+ result.append(unichr(ord(ch)))
pos += 1
else:
startinpos = pos
@@ -909,6 +906,7 @@
final_length = result.getlength()
if inShift and final: # in shift sequence, no more to follow
# if we're in an inconsistent state, that's an error
+ inShift = 0
if (surrogate or
base64bits >= 6 or
(base64bits > 0 and base64buffer != 0)):
_______________________________________________
pypy-commit mailing list
[email protected]
https://mail.python.org/mailman/listinfo/pypy-commit