Author: Armin Rigo <ar...@tunes.org>
Branch:
Changeset: r86816:e9dd5882eed6
Date: 2016-09-01 12:23 +0200
http://bitbucket.org/pypy/pypy/changeset/e9dd5882eed6/
Log: Issue #2389: the custom error handler may return a 'pos' that is
     smaller than 'size', in which case we need to continue looping

diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py
--- a/rpython/rlib/runicode.py
+++ b/rpython/rlib/runicode.py
@@ -157,22 +157,26 @@
         if pos + n > size:
             if not final:
                 break
+            # argh, this obscure block of code is mostly a copy of
+            # what follows :-(
             charsleft = size - pos - 1 # either 0, 1, 2
-            # note: when we get the 'unexpected end of data' we don't care
-            # about the pos anymore and we just ignore the value
+            # note: when we get the 'unexpected end of data' we need
+            # to care about the pos returned; it can be lower than size,
+            # in case we need to continue running this loop
             if not charsleft:
                 # there's only the start byte and nothing else
                 r, pos = errorhandler(errors, 'utf8',
                                       'unexpected end of data',
                                       s, pos, pos+1)
                 result.append(r)
-                break
+                continue
             ordch2 = ord(s[pos+1])
             if n == 3:
                 # 3-bytes seq with only a continuation byte
                 if (ordch2>>6 != 0x2 or # 0b10
-                    (ordch1 == 0xe0 and ordch2 < 0xa0)):
-                    # or (ordch1 == 0xed and ordch2 > 0x9f)
+                    (ordch1 == 0xe0 and ordch2 < 0xa0)
+                    or (not allow_surrogates and ordch1 == 0xed and ordch2 > 0x9f)
+                    ):
                     # second byte invalid, take the first and continue
                     r, pos = errorhandler(errors, 'utf8',
                                           'invalid continuation byte',
@@ -185,7 +189,7 @@
                                           'unexpected end of data',
                                           s, pos, pos+2)
                     result.append(r)
-                    break
+                    continue
             elif n == 4:
                 # 4-bytes seq with 1 or 2 continuation bytes
                 if (ordch2>>6 != 0x2 or # 0b10
@@ -210,7 +214,8 @@
                                           'unexpected end of data',
                                           s, pos, pos+charsleft+1)
                     result.append(r)
-                    break
+                    continue
+            raise AssertionError("unreachable")
 
         if n == 0:
             r, pos = errorhandler(errors, 'utf8',
diff --git a/rpython/rlib/test/test_runicode.py b/rpython/rlib/test/test_runicode.py
--- a/rpython/rlib/test/test_runicode.py
+++ b/rpython/rlib/test/test_runicode.py
@@ -289,6 +289,12 @@
     def setup_method(self, meth):
         self.decoder = self.getdecoder('utf-8')
 
+    def custom_replace(self, errors, encoding, msg, s, startingpos, endingpos):
+        assert errors == 'custom'
+        # returns FOO, but consumes only one character (not up to endingpos)
+        FOO = u'\u1234'
+        return FOO, startingpos + 1
+
     def to_bytestring(self, bytes):
         return ''.join(chr(int(c, 16)) for c in bytes.split())
 
@@ -309,6 +315,7 @@
         E.g. <80> is a continuation byte and can appear only after a start byte.
         """
         FFFD = u'\ufffd'
+        FOO = u'\u1234'
         for byte in '\x80\xA0\x9F\xBF\xC0\xC1\xF5\xFF':
             py.test.raises(UnicodeDecodeError, self.decoder, byte, 1, None, final=True)
             self.checkdecodeerror(byte, 'utf-8', 0, 1, addstuff=False,
@@ -320,6 +327,11 @@
             assert self.decoder(byte, 1, 'ignore', final=True) == (u'', 1)
             assert (self.decoder('aaaa' + byte + 'bbbb', 9, 'ignore',
                         final=True) == (u'aaaabbbb', 9))
+            assert self.decoder(byte, 1, 'custom', final=True,
+                        errorhandler=self.custom_replace) == (FOO, 1)
+            assert (self.decoder('aaaa' + byte + 'bbbb', 9, 'custom',
+                        final=True, errorhandler=self.custom_replace) ==
+                        (u'aaaa'+ FOO + u'bbbb', 9))
 
     def test_unexpected_end_of_data(self):
         """
@@ -343,6 +355,7 @@
             'F4 80', 'F4 8F', 'F4 80 80', 'F4 80 BF', 'F4 8F 80', 'F4 8F BF'
         ]
         FFFD = u'\ufffd'
+        FOO = u'\u1234'
         for seq in sequences:
             seq = self.to_bytestring(seq)
             py.test.raises(UnicodeDecodeError, self.decoder, seq, len(seq),
@@ -358,6 +371,12 @@
                                 ) == (u'', len(seq))
             assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8,
                                  'ignore', final=True) == (u'aaaabbbb', len(seq) + 8))
+            assert (self.decoder(seq, len(seq), 'custom', final=True,
+                        errorhandler=self.custom_replace) ==
+                        (FOO * len(seq), len(seq)))
+            assert (self.decoder('aaaa' + seq + 'bbbb', len(seq) + 8, 'custom',
+                        final=True, errorhandler=self.custom_replace) ==
+                        (u'aaaa'+ FOO * len(seq) + u'bbbb', len(seq) + 8))
 
     def test_invalid_cb_for_2bytes_seq(self):
         """
_______________________________________________
pypy-commit mailing list
pypy-commit@python.org
https://mail.python.org/mailman/listinfo/pypy-commit