Author: Armin Rigo <ar...@tunes.org> Branch: py3.5 Changeset: r87812:bff060a4f720 Date: 2016-10-15 13:55 +0200 http://bitbucket.org/pypy/pypy/changeset/bff060a4f720/
Log: Trying to fix some re failures in 3.5 (will graft back to default) diff --git a/rpython/rlib/rlocale.py b/rpython/rlib/rlocale.py --- a/rpython/rlib/rlocale.py +++ b/rpython/rlib/rlocale.py @@ -195,6 +195,7 @@ isalpha = external('isalpha', [rffi.INT], rffi.INT) isupper = external('isupper', [rffi.INT], rffi.INT) +toupper = external('toupper', [rffi.INT], rffi.INT) islower = external('islower', [rffi.INT], rffi.INT) tolower = external('tolower', [rffi.INT], rffi.INT) isalnum = external('isalnum', [rffi.INT], rffi.INT) diff --git a/rpython/rlib/rsre/rpy/sre_constants.py b/rpython/rlib/rsre/rpy/sre_constants.py --- a/rpython/rlib/rsre/rpy/sre_constants.py +++ b/rpython/rlib/rsre/rpy/sre_constants.py @@ -58,6 +58,7 @@ REPEAT_ONE = "repeat_one" SUBPATTERN = "subpattern" MIN_REPEAT_ONE = "min_repeat_one" +RANGE_IGNORE = "range_ignore" # positions AT_BEGINNING = "at_beginning" @@ -119,8 +120,8 @@ REPEAT, REPEAT_ONE, SUBPATTERN, - MIN_REPEAT_ONE - + MIN_REPEAT_ONE, + RANGE_IGNORE, ] ATCODES = [ diff --git a/rpython/rlib/rsre/rsre_char.py b/rpython/rlib/rsre/rsre_char.py --- a/rpython/rlib/rsre/rsre_char.py +++ b/rpython/rlib/rsre/rsre_char.py @@ -2,7 +2,7 @@ Character categories and charsets. """ import sys -from rpython.rlib.rlocale import tolower, isalnum +from rpython.rlib.rlocale import tolower, toupper, isalnum from rpython.rlib.unroll import unrolling_iterable from rpython.rlib import jit from rpython.rlib.rarithmetic import int_between @@ -67,6 +67,19 @@ char_ord += ord('a') - ord('A') return char_ord +def getupper(char_ord, flags): + if flags & SRE_FLAG_LOCALE: + if char_ord < 256: # cheating! Well, CPython does too. + char_ord = toupper(char_ord) + return char_ord + elif flags & SRE_FLAG_UNICODE: + assert unicodedb is not None + char_ord = unicodedb.toupper(char_ord) + else: + if int_between(ord('a'), char_ord, ord('z') + 1): # ASCII upper + char_ord += ord('A') - ord('a') + return char_ord + #### Category helpers is_a_word = [(chr(i).isalnum() or chr(i) == '_') for i in range(256)] @@ -139,16 +152,17 @@ ##### Charset evaluation @jit.unroll_safe -def check_charset(pattern, ppos, char_code): +def check_charset(ctx, ppos, char_code): """Checks whether a character matches set of arbitrary length. The set starts at pattern[ppos].""" negated = False result = False + pattern = ctx.pattern while True: opcode = pattern[ppos] for i, function in set_dispatch_unroll: if opcode == i: - newresult, ppos = function(pattern, ppos, char_code) + newresult, ppos = function(ctx, ppos, char_code) result |= newresult break else: @@ -163,18 +177,21 @@ return not result return result -def set_literal(pat, index, char_code): +def set_literal(ctx, index, char_code): # <LITERAL> <code> + pat = ctx.pattern match = pat[index+1] == char_code return match, index + 2 -def set_category(pat, index, char_code): +def set_category(ctx, index, char_code): # <CATEGORY> <code> + pat = ctx.pattern match = category_dispatch(pat[index+1], char_code) return match, index + 2 -def set_charset(pat, index, char_code): +def set_charset(ctx, index, char_code): # <CHARSET> <bitmap> (16 bits per code word) + pat = ctx.pattern if CODESIZE == 2: match = char_code < 256 and \ (pat[index+1+(char_code >> 4)] & (1 << (char_code & 15))) @@ -184,13 +201,25 @@ (pat[index+1+(char_code >> 5)] & (1 << (char_code & 31))) return match, index + 9 # skip bitmap -def set_range(pat, index, char_code): +def set_range(ctx, index, char_code): # <RANGE> <lower> <upper> + pat = ctx.pattern match = int_between(pat[index+1], char_code, pat[index+2] + 1) return match, index + 3 -def set_bigcharset(pat, index, char_code): +def set_range_ignore(ctx, index, char_code): + # <RANGE_IGNORE> <lower> <upper> + # the char_code is already lower cased + pat = ctx.pattern + lower = pat[index + 1] + upper = pat[index + 2] + match1 = int_between(lower, char_code, upper + 1) + match2 = int_between(lower, getupper(char_code, ctx.flags), upper + 1) + return match1 | match2, index + 3 + +def set_bigcharset(ctx, index, char_code): # <BIGCHARSET> <blockcount> <256 blockindices> <blocks> + pat = ctx.pattern count = pat[index+1] index += 2 @@ -224,7 +253,7 @@ index += count * (32 / CODESIZE) # skip blocks return match, index -def set_unicode_general_category(pat, index, char_code): +def set_unicode_general_category(ctx, index, char_code): # Unicode "General category property code" (not used by Python). # A general category is two letters. 'pat[index+1]' contains both # the first character, and the second character shifted by 8. @@ -233,6 +262,7 @@ # Negative matches are triggered by bit number 7. assert unicodedb is not None cat = unicodedb.category(char_code) + pat = ctx.pattern category_code = pat[index + 1] first_character = category_code & 0x7F second_character = (category_code >> 8) & 0x7F @@ -260,6 +290,7 @@ 11: set_bigcharset, 19: set_literal, 27: set_range, + 32: set_range_ignore, 70: set_unicode_general_category, } set_dispatch_unroll = unrolling_iterable(sorted(set_dispatch_table.items())) diff --git a/rpython/rlib/rsre/rsre_core.py b/rpython/rlib/rsre/rsre_core.py --- a/rpython/rlib/rsre/rsre_core.py +++ b/rpython/rlib/rsre/rsre_core.py @@ -40,6 +40,7 @@ OPCODE_REPEAT_ONE = 29 #OPCODE_SUBPATTERN = 30 OPCODE_MIN_REPEAT_ONE = 31 +OPCODE_RANGE_IGNORE = 32 # not used by Python itself OPCODE_UNICODE_GENERAL_CATEGORY = 70 @@ -640,8 +641,7 @@ elif op == OPCODE_IN: # match set member (or non_member) # <IN> <skip> <set> - if ptr >= ctx.end or not rsre_char.check_charset(ctx.pattern, - ppos+1, + if ptr >= ctx.end or not rsre_char.check_charset(ctx, ppos+1, ctx.str(ptr)): return ppos += ctx.pat(ppos) @@ -650,8 +650,7 @@ elif op == OPCODE_IN_IGNORE: # match set member (or non_member), ignoring case # <IN> <skip> <set> - if ptr >= ctx.end or not rsre_char.check_charset(ctx.pattern, - ppos+1, + if ptr >= ctx.end or not rsre_char.check_charset(ctx, ppos+1, ctx.lowstr(ptr)): return ppos += ctx.pat(ppos) @@ -871,10 +870,10 @@ return True # match anything (including a newline) @specializectx def match_IN(ctx, ptr, ppos): - return rsre_char.check_charset(ctx.pattern, ppos+2, ctx.str(ptr)) + return rsre_char.check_charset(ctx, ppos+2, ctx.str(ptr)) @specializectx def match_IN_IGNORE(ctx, ptr, ppos): - return rsre_char.check_charset(ctx.pattern, ppos+2, ctx.lowstr(ptr)) + return rsre_char.check_charset(ctx, ppos+2, ctx.lowstr(ptr)) @specializectx def match_LITERAL(ctx, ptr, ppos): return ctx.str(ptr) == ctx.pat(ppos+1) @@ -1134,7 +1133,7 @@ while start < ctx.end: ctx.jitdriver_CharsetSearch.jit_merge_point(ctx=ctx, start=start, base=base) - if rsre_char.check_charset(ctx.pattern, 5, ctx.str(start)): + if rsre_char.check_charset(ctx, 5, ctx.str(start)): if sre_match(ctx, base, start, None) is not None: ctx.match_start = start return True diff --git a/rpython/rlib/rsre/test/test_char.py b/rpython/rlib/rsre/test/test_char.py --- a/rpython/rlib/rsre/test/test_char.py +++ b/rpython/rlib/rsre/test/test_char.py @@ -34,6 +34,22 @@ assert rsre_char.getlower(UPPER_PI, SRE_FLAG_LOCALE | SRE_FLAG_UNICODE) \ == UPPER_PI +def test_getupper(): + assert rsre_char.getupper(ord('A'), 0) == ord('A') + assert rsre_char.getupper(ord('b'), 0) == ord('B') + assert rsre_char.getupper(10, 0) == 10 + assert rsre_char.getupper(LOWER_PI, 0) == LOWER_PI + # + assert rsre_char.getupper(ord('a'), SRE_FLAG_UNICODE) == ord('A') + assert rsre_char.getupper(ord('2'), SRE_FLAG_UNICODE) == ord('2') + assert rsre_char.getupper(10, SRE_FLAG_UNICODE) == 10 + assert rsre_char.getupper(LOWER_PI, SRE_FLAG_UNICODE) == UPPER_PI + # + assert rsre_char.getupper(LOWER_PI, SRE_FLAG_LOCALE) == LOWER_PI + assert rsre_char.getupper(LOWER_PI, SRE_FLAG_LOCALE | SRE_FLAG_UNICODE) \ + == LOWER_PI + + def test_is_word(): assert rsre_char.is_word(ord('A')) assert rsre_char.is_word(ord('_')) @@ -128,6 +144,10 @@ assert cat(CHCODES["category_uni_not_digit"], DINGBAT_CIRCLED) +class Ctx: + def __init__(self, pattern): + self.pattern = pattern + def test_general_category(): from rpython.rlib.unicodedata import unicodedb @@ -137,12 +157,12 @@ pat_neg = [70, ord(cat) | 0x80, 0] for c in positive: assert unicodedb.category(ord(c)).startswith(cat) - assert rsre_char.check_charset(pat_pos, 0, ord(c)) - assert not rsre_char.check_charset(pat_neg, 0, ord(c)) + assert rsre_char.check_charset(Ctx(pat_pos), 0, ord(c)) + assert not rsre_char.check_charset(Ctx(pat_neg), 0, ord(c)) for c in negative: assert not unicodedb.category(ord(c)).startswith(cat) - assert not rsre_char.check_charset(pat_pos, 0, ord(c)) - assert rsre_char.check_charset(pat_neg, 0, ord(c)) + assert not rsre_char.check_charset(Ctx(pat_pos), 0, ord(c)) + assert rsre_char.check_charset(Ctx(pat_neg), 0, ord(c)) def cat2num(cat): return ord(cat[0]) | (ord(cat[1]) << 8) @@ -153,17 +173,17 @@ pat_neg = [70, cat2num(cat) | 0x80, 0] for c in positive: assert unicodedb.category(ord(c)) == cat - assert rsre_char.check_charset(pat_pos, 0, ord(c)) - assert not rsre_char.check_charset(pat_neg, 0, ord(c)) + assert rsre_char.check_charset(Ctx(pat_pos), 0, ord(c)) + assert not rsre_char.check_charset(Ctx(pat_neg), 0, ord(c)) for c in negative: assert unicodedb.category(ord(c)) != cat - assert not rsre_char.check_charset(pat_pos, 0, ord(c)) - assert rsre_char.check_charset(pat_neg, 0, ord(c)) + assert not rsre_char.check_charset(Ctx(pat_pos), 0, ord(c)) + assert rsre_char.check_charset(Ctx(pat_neg), 0, ord(c)) # test for how the common 'L&' pattern might be compiled pat = [70, cat2num('Lu'), 70, cat2num('Ll'), 70, cat2num('Lt'), 0] - assert rsre_char.check_charset(pat, 0, 65) # Lu - assert rsre_char.check_charset(pat, 0, 99) # Ll - assert rsre_char.check_charset(pat, 0, 453) # Lt - assert not rsre_char.check_charset(pat, 0, 688) # Lm - assert not rsre_char.check_charset(pat, 0, 5870) # Nl + assert rsre_char.check_charset(Ctx(pat), 0, 65) # Lu + assert rsre_char.check_charset(Ctx(pat), 0, 99) # Ll + assert rsre_char.check_charset(Ctx(pat), 0, 453) # Lt + assert not rsre_char.check_charset(Ctx(pat), 0, 688) # Lm + assert not rsre_char.check_charset(Ctx(pat), 0, 5870) # Nl diff --git a/rpython/rlib/rsre/test/test_match.py b/rpython/rlib/rsre/test/test_match.py --- a/rpython/rlib/rsre/test/test_match.py +++ b/rpython/rlib/rsre/test/test_match.py @@ -1,5 +1,5 @@ import re, random, py -from rpython.rlib.rsre import rsre_core +from rpython.rlib.rsre import rsre_core, rsre_char from rpython.rlib.rsre.rpy import get_code, VERSION @@ -299,3 +299,12 @@ assert rsre_core.fullmatch(r, "ab") r = get_code(r"(?!a)..") assert not rsre_core.fullmatch(r, "ab") + + def test_range_ignore(self): + from rpython.rlib.unicodedata import unicodedb + rsre_char.set_unicode_db(unicodedb) + # + r = get_code(u"[\U00010428-\U0001044f]", re.I) + assert r.count(27) == 1 # OPCODE_RANGE + r[r.index(27)] = 32 # => OPCODE_RANGE_IGNORE + assert rsre_core.match(r, u"\U00010428") _______________________________________________ pypy-commit mailing list pypy-commit@python.org https://mail.python.org/mailman/listinfo/pypy-commit