In perl.git, the branch blead has been updated <https://perl5.git.perl.org/perl.git/commitdiff/dcafa39d55218b798360ec3fa393a42a1b6ed3c5?hp=4a69216a74159df74779841fa79d731bcc5c6a9d>
- Log ----------------------------------------------------------------- commit dcafa39d55218b798360ec3fa393a42a1b6ed3c5 Author: Karl Williamson <[email protected]> Date: Thu Sep 12 20:56:59 2019 -0600 t/re/anyof.t: Fix test names This previously duplicated some boiler plate in the test name commit 0cc31f4df460bf4216abdcaadcc1b0dca0d4ab88 Author: Karl Williamson <[email protected]> Date: Sun May 26 12:22:26 2019 -0600 regcomp.c: Fix -Dr bug If dumping the program and a single range crosses the border between being in the bitmap and not, the range must be split at the border because the output has separate text for things in the bitmap vs. those not. I'm not sure that there is a situation where this currently occurs, but it will so with a future commit commit cfba2ecca3ba2c255c4533aa7cc149abdeea3ec0 Author: Karl Williamson <[email protected]> Date: Sun Mar 31 14:13:58 2019 -0600 regcomp.c: Collapse some code These case statements are all repeated in the code for bracketed character classes, and mean the same thing. That code knows a bunch of things for optimizing. No need to duplicate that. Instead, pretend these are being called within brackets, and call the code to handle that case, which will generate the proper ops. This now follows the example of Unicode properties which have long been processed by pretending they are surrounded by [...] commit 4758c20d21341aad8eb03b0831dc6e1a38046a0e Author: Karl Williamson <[email protected]> Date: Wed Jun 26 12:23:47 2019 -0600 Generalize inRANGE() I figured out a way to make this work generally. I've also tested this vs what some modern compilers do under -O2. It seems this macro is slightly better. commit 833b0f46f2b673765c7e3d42e8530db0ad65ceeb Author: Karl Williamson <[email protected]> Date: Wed Jun 26 12:01:05 2019 -0600 Add withinCOUNT() macro and change inRANGE to use it This uses just one conditional to see if a value is between low and (low + n). commit a15223fd18aff8e134dce76a2e5428202c0f2df1 Author: Karl Williamson <[email protected]> Date: Sat Sep 14 22:18:13 2019 -0600 perl.h: Fix typo in comment ----------------------------------------------------------------------- Summary of changes: handy.h | 30 ++++++--------- perl.h | 2 +- regcomp.c | 117 +++++++++++++++++------------------------------------------ t/re/anyof.t | 3 +- 4 files changed, 47 insertions(+), 105 deletions(-) diff --git a/handy.h b/handy.h index dc08ef3a7d..d9cd92d567 100644 --- a/handy.h +++ b/handy.h @@ -1317,27 +1317,21 @@ or casts #define FITS_IN_8_BITS(c) (1) #endif +/* Returns true if l <= c <= l + n, where 'l' and 'n' are non-negative + * Written this way so that after optimization, only one conditional test is + * needed. */ +#define withinCOUNT(c, l, n) (__ASSERT_((l) >= 0) __ASSERT_((n) >= (0)) \ + (((WIDEST_UTYPE) (((c) | 0) - ((l) | 0))) <= (((WIDEST_UTYPE) ((n) | 0))))) + /* Returns true if c is in the range l..u, where 'l' is non-negative * Written this way so that after optimization, only one conditional test is - * needed. - * - * This isn't fully general, except for the special cased 'signed char' (which - * should be resolved at compile time): It won't work if 'c' is negative, and - * 'l' is larger than the max for that signed type. Thus if 'c' is a negative - * int, and 'l' is larger than INT_MAX, it will fail. To protect agains this - * happening, there is an assert that will generate a warning if c is larger - * than e.g. INT_MAX if it is an 'unsigned int'. This could be a false - * positive, but khw couldn't figure out a way to make it better. It's good - * enough so far */ + * needed. */ #define inRANGE(c, l, u) (__ASSERT_((l) >= 0) __ASSERT_((u) >= (l)) \ - ((sizeof(c) == 1) \ - ? (((WIDEST_UTYPE) ((((U8) (c))|0) - (l))) <= ((WIDEST_UTYPE) ((u) - (l)))) \ - : (__ASSERT_( (((WIDEST_UTYPE) 1) << (CHARBITS * sizeof(c) - 1) & (c)) \ - /* sign bit of c is 0 */ == 0 \ - || (((~ ((WIDEST_UTYPE) 1) << ((CHARBITS * sizeof(c) - 1) - 1))\ - /* l not larger than largest value in c's signed type */ \ - & ~ ((WIDEST_UTYPE) 0)) & (l)) == 0) \ - ((WIDEST_UTYPE) (((c) - (l)) | 0) <= ((WIDEST_UTYPE) ((u) - (l))))))) + ( (sizeof(c) == sizeof(U8)) ? withinCOUNT(((U8) (c)), (l), ((u) - (l))) \ + : (sizeof(c) == sizeof(U16)) ? withinCOUNT(((U16) (c)), (l), ((u) - (l))) \ + : (sizeof(c) == sizeof(U32)) ? withinCOUNT(((U32) (c)), (l), ((u) - (l))) \ + : (__ASSERT_(sizeof(c) == sizeof(WIDEST_UTYPE)) \ + withinCOUNT(( (c)), (l), ((u) - (l)))))) #ifdef EBCDIC # ifndef _ALL_SOURCE diff --git a/perl.h b/perl.h index 89d3c828ce..fb4eb77190 100644 --- a/perl.h +++ b/perl.h @@ -6245,7 +6245,7 @@ the plain locale pragma without a parameter (S<C<use locale>>) is in effect. # if defined (PERL_CORE) || defined (PERL_IN_XSUB_RE) /* This internal macro should be called from places that operate under - * locale rules. It there is a problem with the current locale that + * locale rules. If there is a problem with the current locale that * hasn't been raised yet, it will output a warning this time. Because * this will so rarely be true, there is no point to optimize for time; * instead it makes sense to minimize space used and do all the work in diff --git a/regcomp.c b/regcomp.c index edd97a80f5..b389f9ec7f 100644 --- a/regcomp.c +++ b/regcomp.c @@ -13364,7 +13364,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) char *parse_start; U8 op; int invert = 0; - U8 arg; GET_RE_DEBUG_FLAGS_DECL; @@ -13525,13 +13524,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) *flagp |= HASWIDTH; goto finish_meta_pat; - case 'W': - invert = 1; - /* FALLTHROUGH */ - case 'w': - arg = ANYOF_WORDCHAR; - goto join_posix; - case 'B': invert = 1; /* FALLTHROUGH */ @@ -13650,85 +13642,26 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) goto finish_meta_pat; } - case 'D': - invert = 1; - /* FALLTHROUGH */ - case 'd': - arg = ANYOF_DIGIT; - if (! DEPENDS_SEMANTICS) { - goto join_posix; - } - - /* \d doesn't have any matches in the upper Latin1 range, hence /d - * is equivalent to /u. Changing to /u saves some branches at - * runtime */ - op = POSIXU; - goto join_posix_op_known; - case 'R': ret = reg_node(pRExC_state, LNBREAK); *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; - case 'H': - invert = 1; - /* FALLTHROUGH */ + case 'd': + case 'D': case 'h': - arg = ANYOF_BLANK; - op = POSIXU; - goto join_posix_op_known; - - case 'V': - invert = 1; - /* FALLTHROUGH */ - case 'v': - arg = ANYOF_VERTWS; - op = POSIXU; - goto join_posix_op_known; - - case 'S': - invert = 1; - /* FALLTHROUGH */ - case 's': - arg = ANYOF_SPACE; - - join_posix: - - op = POSIXD + get_regex_charset(RExC_flags); - if (op > POSIXA) { /* /aa is same as /a */ - op = POSIXA; - } - else if (op == POSIXL) { - RExC_contains_locale = 1; - } - else if (op == POSIXD) { - RExC_seen_d_op = TRUE; - } - - join_posix_op_known: - - if (invert) { - op += NPOSIXD - POSIXD; - } - - ret = reg_node(pRExC_state, op); - FLAGS(REGNODE_p(ret)) = namedclass_to_classnum(arg); - - *flagp |= HASWIDTH|SIMPLE; - /* FALLTHROUGH */ - - finish_meta_pat: - if ( UCHARAT(RExC_parse + 1) == '{' - && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end))) - { - RExC_parse += 2; - vFAIL("Unescaped left brace in regex is illegal here"); - } - nextchar(pRExC_state); - Set_Node_Length(REGNODE_p(ret), 2); /* MJD */ - break; + case 'H': case 'p': case 'P': + case 's': + case 'S': + case 'v': + case 'V': + case 'w': + case 'W': + /* These all have the same meaning inside [brackets], and it knows + * how to do the best optimizations for them. So, pretend we found + * these within brackets, and let it do the work */ RExC_parse--; ret = regclass(pRExC_state, flagp, depth+1, @@ -13747,10 +13680,21 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf, (UV) *flagp); - RExC_parse--; + RExC_parse--; /* regclass() leaves this one too far ahead */ + finish_meta_pat: + /* The escapes above that don't take a parameter can't be + * followed by a '{'. But 'pX', 'p{foo}' and + * correspondingly 'P' can be */ + if ( RExC_parse - parse_start == 1 + && UCHARAT(RExC_parse + 1) == '{' + && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end))) + { + RExC_parse += 2; + vFAIL("Unescaped left brace in regex is illegal here"); + } Set_Node_Offset(REGNODE_p(ret), parse_start); - Set_Node_Cur_Length(REGNODE_p(ret), parse_start - 2); + Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1); /* MJD */ nextchar(pRExC_state); break; case 'N': @@ -21576,9 +21520,14 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) /* As a final resort, output the range or subrange as hex. */ - this_end = (end < NUM_ANYOF_CODE_POINTS) - ? end - : NUM_ANYOF_CODE_POINTS - 1; + if (start >= NUM_ANYOF_CODE_POINTS) { + this_end = end; + } + else { + this_end = (end < NUM_ANYOF_CODE_POINTS) + ? end + : NUM_ANYOF_CODE_POINTS - 1; + } #if NUM_ANYOF_CODE_POINTS > 256 format = (this_end < 256) ? "\\x%02" UVXf "-\\x%02" UVXf diff --git a/t/re/anyof.t b/t/re/anyof.t index b7656d68a5..eee7467ef0 100644 --- a/t/re/anyof.t +++ b/t/re/anyof.t @@ -877,8 +877,7 @@ while (defined (my $test = shift @tests)) { [[:xdigit:]]{2} )? /x, $test_name); } else { - is($result, $expected, - "Verify compilation of $test displays as $test_name"); + is($result, $expected, $test_name); } } } -- Perl5 Master Repository
