[perl.git] branch blead updated. v5.31.3-152-gdcafa39d55

Karl Williamson Sat, 14 Sep 2019 21:27:23 -0700

In perl.git, the branch blead has been updated

<https://perl5.git.perl.org/perl.git/commitdiff/dcafa39d55218b798360ec3fa393a42a1b6ed3c5?hp=4a69216a74159df74779841fa79d731bcc5c6a9d>


- Log -----------------------------------------------------------------
commit dcafa39d55218b798360ec3fa393a42a1b6ed3c5
Author: Karl Williamson <[email protected]>
Date:   Thu Sep 12 20:56:59 2019 -0600

    t/re/anyof.t: Fix test names
    
    This previously duplicated some boiler plate in the test name

commit 0cc31f4df460bf4216abdcaadcc1b0dca0d4ab88
Author: Karl Williamson <[email protected]>
Date:   Sun May 26 12:22:26 2019 -0600

    regcomp.c: Fix -Dr bug
    
    If dumping the program and a single range crosses the border between
    being in the bitmap and not, the range must be split at the border
    because the output has separate text for things in the bitmap vs. those
    not.
    
    I'm not sure that there is a situation where this currently occurs, but
    it will so with a future commit

commit cfba2ecca3ba2c255c4533aa7cc149abdeea3ec0
Author: Karl Williamson <[email protected]>
Date:   Sun Mar 31 14:13:58 2019 -0600

    regcomp.c: Collapse some code
    
    These case statements are all repeated in the code for bracketed
    character classes, and mean the same thing.  That code knows a bunch of
    things for optimizing.  No need to duplicate that.  Instead, pretend
    these are being called within brackets, and call the code to handle
    that case, which will generate the proper ops.  This now follows the
    example of Unicode properties which have long been processed by
    pretending they are surrounded by [...]

commit 4758c20d21341aad8eb03b0831dc6e1a38046a0e
Author: Karl Williamson <[email protected]>
Date:   Wed Jun 26 12:23:47 2019 -0600

    Generalize inRANGE()
    
    I figured out a way to make this work generally.  I've also tested this
    vs what some modern compilers do under -O2.  It seems this macro is
    slightly better.

commit 833b0f46f2b673765c7e3d42e8530db0ad65ceeb
Author: Karl Williamson <[email protected]>
Date:   Wed Jun 26 12:01:05 2019 -0600

    Add withinCOUNT() macro and change inRANGE to use it
    
    This uses just one conditional to see if a value is between low and
    (low + n).

commit a15223fd18aff8e134dce76a2e5428202c0f2df1
Author: Karl Williamson <[email protected]>
Date:   Sat Sep 14 22:18:13 2019 -0600

    perl.h: Fix typo in comment

-----------------------------------------------------------------------

Summary of changes:
 handy.h      |  30 ++++++---------
 perl.h       |   2 +-
 regcomp.c    | 117 +++++++++++++++++------------------------------------------
 t/re/anyof.t |   3 +-
 4 files changed, 47 insertions(+), 105 deletions(-)

diff --git a/handy.h b/handy.h
index dc08ef3a7d..d9cd92d567 100644
--- a/handy.h
+++ b/handy.h
@@ -1317,27 +1317,21 @@ or casts
 #define FITS_IN_8_BITS(c) (1)
 #endif
 
+/* Returns true if l <= c <= l + n, where 'l' and 'n' are non-negative
+ * Written this way so that after optimization, only one conditional test is
+ * needed. */
+#define withinCOUNT(c, l, n) (__ASSERT_((l) >= 0) __ASSERT_((n) >= (0))        
\
+   (((WIDEST_UTYPE) (((c) | 0) - ((l) | 0))) <= (((WIDEST_UTYPE) ((n) | 0)))))
+
 /* Returns true if c is in the range l..u, where 'l' is non-negative
  * Written this way so that after optimization, only one conditional test is
- * needed.
- *
- * This isn't fully general, except for the special cased 'signed char' (which
- * should be resolved at compile time):  It won't work if 'c' is negative, and
- * 'l' is larger than the max for that signed type.  Thus if 'c' is a negative
- * int, and 'l' is larger than INT_MAX, it will fail.  To protect agains this
- * happening, there is an assert that will generate a warning if c is larger
- * than e.g.  INT_MAX if it is an 'unsigned int'.  This could be a false
- * positive, but khw couldn't figure out a way to make it better.  It's good
- * enough so far */
+ * needed. */
 #define inRANGE(c, l, u) (__ASSERT_((l) >= 0) __ASSERT_((u) >= (l))            
\
-  ((sizeof(c) == 1)                                                            
\
-   ? (((WIDEST_UTYPE) ((((U8) (c))|0) - (l))) <= ((WIDEST_UTYPE) ((u) - (l)))) 
\
-   : (__ASSERT_(   (((WIDEST_UTYPE) 1) <<  (CHARBITS * sizeof(c) - 1) & (c))   
\
-                     /* sign bit of c is 0 */                             == 0 
\
-                || (((~ ((WIDEST_UTYPE) 1) << ((CHARBITS * sizeof(c) - 1) - 
1))\
-                   /* l not larger than largest value in c's signed type */    
\
-                                          & ~ ((WIDEST_UTYPE) 0)) & (l)) == 0) 
\
-      ((WIDEST_UTYPE) (((c) - (l)) | 0) <= ((WIDEST_UTYPE) ((u) - (l)))))))
+   (  (sizeof(c) == sizeof(U8))  ? withinCOUNT(((U8)  (c)), (l), ((u) - (l)))  
\
+    : (sizeof(c) == sizeof(U16)) ? withinCOUNT(((U16) (c)), (l), ((u) - (l)))  
\
+    : (sizeof(c) == sizeof(U32)) ? withinCOUNT(((U32) (c)), (l), ((u) - (l)))  
\
+    : (__ASSERT_(sizeof(c) == sizeof(WIDEST_UTYPE))                            
\
+                                   withinCOUNT((      (c)), (l), ((u) - 
(l))))))
 
 #ifdef EBCDIC
 #   ifndef _ALL_SOURCE
diff --git a/perl.h b/perl.h
index 89d3c828ce..fb4eb77190 100644
--- a/perl.h
+++ b/perl.h
@@ -6245,7 +6245,7 @@ the plain locale pragma without a parameter (S<C<use 
locale>>) is in effect.
 #  if defined (PERL_CORE) || defined (PERL_IN_XSUB_RE)
 
      /* This internal macro should be called from places that operate under
-      * locale rules.  It there is a problem with the current locale that
+      * locale rules.  If there is a problem with the current locale that
       * hasn't been raised yet, it will output a warning this time.  Because
       * this will so rarely  be true, there is no point to optimize for time;
       * instead it makes sense to minimize space used and do all the work in
diff --git a/regcomp.c b/regcomp.c
index edd97a80f5..b389f9ec7f 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -13364,7 +13364,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth)
     char *parse_start;
     U8 op;
     int invert = 0;
-    U8 arg;
 
     GET_RE_DEBUG_FLAGS_DECL;
 
@@ -13525,13 +13524,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth)
            *flagp |= HASWIDTH;
            goto finish_meta_pat;
 
-       case 'W':
-            invert = 1;
-            /* FALLTHROUGH */
-       case 'w':
-            arg = ANYOF_WORDCHAR;
-            goto join_posix;
-
        case 'B':
             invert = 1;
             /* FALLTHROUGH */
@@ -13650,85 +13642,26 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 
*flagp, U32 depth)
            goto finish_meta_pat;
           }
 
-       case 'D':
-            invert = 1;
-            /* FALLTHROUGH */
-       case 'd':
-            arg = ANYOF_DIGIT;
-            if (! DEPENDS_SEMANTICS) {
-                goto join_posix;
-            }
-
-            /* \d doesn't have any matches in the upper Latin1 range, hence /d
-             * is equivalent to /u.  Changing to /u saves some branches at
-             * runtime */
-            op = POSIXU;
-            goto join_posix_op_known;
-
        case 'R':
            ret = reg_node(pRExC_state, LNBREAK);
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
 
-       case 'H':
-            invert = 1;
-            /* FALLTHROUGH */
+       case 'd':
+       case 'D':
        case 'h':
-           arg = ANYOF_BLANK;
-            op = POSIXU;
-            goto join_posix_op_known;
-
-       case 'V':
-            invert = 1;
-            /* FALLTHROUGH */
-       case 'v':
-           arg = ANYOF_VERTWS;
-            op = POSIXU;
-            goto join_posix_op_known;
-
-       case 'S':
-            invert = 1;
-            /* FALLTHROUGH */
-       case 's':
-            arg = ANYOF_SPACE;
-
-          join_posix:
-
-           op = POSIXD + get_regex_charset(RExC_flags);
-            if (op > POSIXA) {  /* /aa is same as /a */
-                op = POSIXA;
-            }
-            else if (op == POSIXL) {
-                RExC_contains_locale = 1;
-            }
-            else if (op == POSIXD) {
-                RExC_seen_d_op = TRUE;
-            }
-
-          join_posix_op_known:
-
-            if (invert) {
-                op += NPOSIXD - POSIXD;
-            }
-
-           ret = reg_node(pRExC_state, op);
-            FLAGS(REGNODE_p(ret)) = namedclass_to_classnum(arg);
-
-           *flagp |= HASWIDTH|SIMPLE;
-            /* FALLTHROUGH */
-
-          finish_meta_pat:
-            if (   UCHARAT(RExC_parse + 1) == '{'
-                && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
-            {
-                RExC_parse += 2;
-                vFAIL("Unescaped left brace in regex is illegal here");
-            }
-           nextchar(pRExC_state);
-            Set_Node_Length(REGNODE_p(ret), 2); /* MJD */
-           break;
+       case 'H':
        case 'p':
        case 'P':
+       case 's':
+       case 'S':
+       case 'v':
+       case 'V':
+       case 'w':
+       case 'W':
+            /* These all have the same meaning inside [brackets], and it knows
+             * how to do the best optimizations for them.  So, pretend we found
+             * these within brackets, and let it do the work */
             RExC_parse--;
 
             ret = regclass(pRExC_state, flagp, depth+1,
@@ -13747,10 +13680,21 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 
*flagp, U32 depth)
                 FAIL2("panic: regclass returned failure to regatom, flags=%#" 
UVxf,
                       (UV) *flagp);
 
-            RExC_parse--;
+            RExC_parse--;   /* regclass() leaves this one too far ahead */
 
+          finish_meta_pat:
+                   /* The escapes above that don't take a parameter can't be
+                    * followed by a '{'.  But 'pX', 'p{foo}' and
+                    * correspondingly 'P' can be */
+            if (   RExC_parse - parse_start == 1
+                && UCHARAT(RExC_parse + 1) == '{'
+                && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
+            {
+                RExC_parse += 2;
+                vFAIL("Unescaped left brace in regex is illegal here");
+            }
             Set_Node_Offset(REGNODE_p(ret), parse_start);
-            Set_Node_Cur_Length(REGNODE_p(ret), parse_start - 2);
+            Set_Node_Length(REGNODE_p(ret), RExC_parse - parse_start + 1); /* 
MJD */
             nextchar(pRExC_state);
            break;
         case 'N':
@@ -21576,9 +21520,14 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, 
const bool allow_literals)
 
         /* As a final resort, output the range or subrange as hex. */
 
-        this_end = (end < NUM_ANYOF_CODE_POINTS)
-                    ? end
-                    : NUM_ANYOF_CODE_POINTS - 1;
+        if (start >= NUM_ANYOF_CODE_POINTS) {
+            this_end = end;
+        }
+        else {
+            this_end = (end < NUM_ANYOF_CODE_POINTS)
+                        ? end
+                        : NUM_ANYOF_CODE_POINTS - 1;
+        }
 #if NUM_ANYOF_CODE_POINTS > 256
         format = (this_end < 256)
                  ? "\\x%02" UVXf "-\\x%02" UVXf
diff --git a/t/re/anyof.t b/t/re/anyof.t
index b7656d68a5..eee7467ef0 100644
--- a/t/re/anyof.t
+++ b/t/re/anyof.t
@@ -877,8 +877,7 @@ while (defined (my $test = shift @tests)) {
                               [[:xdigit:]]{2} )? /x, $test_name);
         }
         else {
-            is($result, $expected,
-               "Verify compilation of $test displays as $test_name");
+            is($result, $expected, $test_name);
         }
     }
 }

-- 
Perl5 Master Repository

[perl.git] branch blead updated. v5.31.3-152-gdcafa39d55

Reply via email to