Author: Armin Rigo <ar...@tunes.org>
Branch:
Changeset: r86817:ee3a2fbec01a
Date: 2016-09-01 12:36 +0200
http://bitbucket.org/pypy/pypy/changeset/ee3a2fbec01a/
Log: Move the bit checking inside helpers, share it from the two places diff --git a/rpython/rlib/runicode.py b/rpython/rlib/runicode.py --- a/rpython/rlib/runicode.py +++ b/rpython/rlib/runicode.py @@ -137,6 +137,25 @@ result=result) return result.build(), pos +def _invalid_cont_byte(ordch): + return ordch>>6 != 0x2 # 0b10 + +_invalid_byte_2_of_2 = _invalid_cont_byte +_invalid_byte_3_of_3 = _invalid_cont_byte +_invalid_byte_3_of_4 = _invalid_cont_byte +_invalid_byte_4_of_4 = _invalid_cont_byte + +def _invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates): + return (ordch2>>6 != 0x2 or # 0b10 + (ordch1 == 0xe0 and ordch2 < 0xa0) + # surrogates shouldn't be valid UTF-8! + or (not allow_surrogates and ordch1 == 0xed and ordch2 > 0x9f)) + +def _invalid_byte_2_of_4(ordch1, ordch2): + return (ordch2>>6 != 0x2 or # 0b10 + (ordch1 == 0xf0 and ordch2 < 0x90) or + (ordch1 == 0xf4 and ordch2 > 0x8f)) + @specialize.argtype(6) def str_decode_utf_8_impl(s, size, errors, final, errorhandler, allow_surrogates, result): @@ -173,10 +192,7 @@ ordch2 = ord(s[pos+1]) if n == 3: # 3-bytes seq with only a continuation byte - if (ordch2>>6 != 0x2 or # 0b10 - (ordch1 == 0xe0 and ordch2 < 0xa0) - or (not allow_surrogates and ordch1 == 0xed and ordch2 > 0x9f) - ): + if _invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates): # second byte invalid, take the first and continue r, pos = errorhandler(errors, 'utf8', 'invalid continuation byte', @@ -192,16 +208,14 @@ continue elif n == 4: # 4-bytes seq with 1 or 2 continuation bytes - if (ordch2>>6 != 0x2 or # 0b10 - (ordch1 == 0xf0 and ordch2 < 0x90) or - (ordch1 == 0xf4 and ordch2 > 0x8f)): + if _invalid_byte_2_of_4(ordch1, ordch2): # second byte invalid, take the first and continue r, pos = errorhandler(errors, 'utf8', 'invalid continuation byte', s, pos, pos+1) result.append(r) continue - elif charsleft == 2 and ord(s[pos+2])>>6 != 0x2: # 0b10 + elif charsleft == 2 and _invalid_byte_3_of_4(ord(s[pos+2])): # third byte invalid, take the first 
two and continue r, pos = errorhandler(errors, 'utf8', 'invalid continuation byte', @@ -228,7 +242,7 @@ elif n == 2: ordch2 = ord(s[pos+1]) - if ordch2>>6 != 0x2: # 0b10 + if _invalid_byte_2_of_2(ordch2): r, pos = errorhandler(errors, 'utf8', 'invalid continuation byte', s, pos, pos+1) @@ -242,17 +256,13 @@ elif n == 3: ordch2 = ord(s[pos+1]) ordch3 = ord(s[pos+2]) - if (ordch2>>6 != 0x2 or # 0b10 - (ordch1 == 0xe0 and ordch2 < 0xa0) - # surrogates shouldn't be valid UTF-8! - or (not allow_surrogates and ordch1 == 0xed and ordch2 > 0x9f) - ): + if _invalid_byte_2_of_3(ordch1, ordch2, allow_surrogates): r, pos = errorhandler(errors, 'utf8', 'invalid continuation byte', s, pos, pos+1) result.append(r) continue - elif ordch3>>6 != 0x2: # 0b10 + elif _invalid_byte_3_of_3(ordch3): r, pos = errorhandler(errors, 'utf8', 'invalid continuation byte', s, pos, pos+2) @@ -268,21 +278,19 @@ ordch2 = ord(s[pos+1]) ordch3 = ord(s[pos+2]) ordch4 = ord(s[pos+3]) - if (ordch2>>6 != 0x2 or # 0b10 - (ordch1 == 0xf0 and ordch2 < 0x90) or - (ordch1 == 0xf4 and ordch2 > 0x8f)): + if _invalid_byte_2_of_4(ordch1, ordch2): r, pos = errorhandler(errors, 'utf8', 'invalid continuation byte', s, pos, pos+1) result.append(r) continue - elif ordch3>>6 != 0x2: # 0b10 + elif _invalid_byte_3_of_4(ordch3): r, pos = errorhandler(errors, 'utf8', 'invalid continuation byte', s, pos, pos+2) result.append(r) continue - elif ordch4>>6 != 0x2: # 0b10 + elif _invalid_byte_4_of_4(ordch4): r, pos = errorhandler(errors, 'utf8', 'invalid continuation byte', s, pos, pos+3) _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit