In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/8a776697be4d3151d33fc5fefefbc88f290a7c29?hp=1059e5b21f7e16d18b3df847d2e5d3f343f44f1c>

- Log -----------------------------------------------------------------
commit 8a776697be4d3151d33fc5fefefbc88f290a7c29
Author: Karl Williamson <[email protected]>
Date:   Sat Dec 19 11:22:04 2015 -0700

    regcomp.c: Skip some work
    
    We can optimize ANYOF nodes that are equivalent to POSIX character
    classes.  Discovering if they are equivalent takes work, which can be
    skipped with a simple test that will rule out many run-of-the-mill
    character classes.

M       regcomp.c

commit d113e221247bf39615d19b490f672c4d47380294
Author: Karl Williamson <[email protected]>
Date:   Sat Dec 19 11:19:35 2015 -0700

    regcomp.c: White space only
    
    Indent a section of code in preparation for the next commit which will
    make it into a block.

M       regcomp.c

commit 0e7784f31c0885f80e7369182fdad8861ad9508a
Author: Karl Williamson <[email protected]>
Date:   Sat Dec 19 11:14:07 2015 -0700

    regcomp.c: Add comments

M       regcomp.c

commit 13f7995862ecd2f387b4b447464429871c52b1a3
Author: Karl Williamson <[email protected]>
Date:   Sat Dec 19 09:49:00 2015 -0700

    mktables: Add "$0:" to its first output
    
    So in a make, it is abundantly clear where the messages are coming from

M       charclass_invlists.h
M       lib/unicore/mktables
M       regcharclass.h
-----------------------------------------------------------------------

Summary of changes:
 charclass_invlists.h |  2 +-
 lib/unicore/mktables |  4 +--
 regcharclass.h       |  2 +-
 regcomp.c            | 78 ++++++++++++++++++++++++++++++++++++----------------
 4 files changed, 59 insertions(+), 27 deletions(-)

diff --git a/charclass_invlists.h b/charclass_invlists.h
index 91ef063..8a37ab6 100644
--- a/charclass_invlists.h
+++ b/charclass_invlists.h
@@ -99537,7 +99537,7 @@ static const UV XPosixXDigit_invlist[] = { /* for 
EBCDIC POSIX-BC */
  * 1a0687fb9c6c4567e853913549df0944fe40821279a3e9cdaa6ab8679bc286fd 
lib/unicore/extracted/DLineBreak.txt
  * 40bcfed3ca727c19e1331f6c33806231d5f7eeeabd2e6a9e06a3740c85d0c250 
lib/unicore/extracted/DNumType.txt
  * a18d502bad39d527ac5586d7bc93e29f565859e3bcc24ada627eff606d6f5fed 
lib/unicore/extracted/DNumValues.txt
- * 4d44b51567e796f3021824c071a3d73fea1a664b59a064956be17850d976631e 
lib/unicore/mktables
+ * 8e23f7adafce8ef1aadbbb3f1e942c14f5d5c8318599cae7ed0ad555e60d4639 
lib/unicore/mktables
  * 462c9aaa608fb2014cd9649af1c5c009485c60b9c8b15b89401fdc10cf6161c6 
lib/unicore/version
  * 996abda3c0fbc2bfd575092af09e3b9b0331e624eb2e969a268457f8fd31ecbb 
regen/charset_translations.pl
  * 8a097f8f726bb1619af2f27f149ab87e60a1602f790147e3a561358be16abd27 
regen/mk_invlists.pl
diff --git a/lib/unicore/mktables b/lib/unicore/mktables
index 57e05f3..be66780 100644
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -19332,10 +19332,10 @@ if (! $rebuild) {
     }
 }
 if (! $rebuild) {
-    print "Files seem to be ok, not bothering to rebuild.  Add '-w' option to force build\n";
+    print "$0: Files seem to be ok, not bothering to rebuild.  Add '-w' option to force build\n";
     exit(0);
 }
-print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
+print "$0: Must rebuild tables.\n" if $verbosity >= $VERBOSE;
 
 # Ready to do the major processing.  First create the perl pseudo-property.
 $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
diff --git a/regcharclass.h b/regcharclass.h
index 66064d1..30e9133 100644
--- a/regcharclass.h
+++ b/regcharclass.h
@@ -2514,7 +2514,7 @@
  * 1a0687fb9c6c4567e853913549df0944fe40821279a3e9cdaa6ab8679bc286fd 
lib/unicore/extracted/DLineBreak.txt
  * 40bcfed3ca727c19e1331f6c33806231d5f7eeeabd2e6a9e06a3740c85d0c250 
lib/unicore/extracted/DNumType.txt
  * a18d502bad39d527ac5586d7bc93e29f565859e3bcc24ada627eff606d6f5fed 
lib/unicore/extracted/DNumValues.txt
- * 4d44b51567e796f3021824c071a3d73fea1a664b59a064956be17850d976631e 
lib/unicore/mktables
+ * 8e23f7adafce8ef1aadbbb3f1e942c14f5d5c8318599cae7ed0ad555e60d4639 
lib/unicore/mktables
  * 462c9aaa608fb2014cd9649af1c5c009485c60b9c8b15b89401fdc10cf6161c6 
lib/unicore/version
  * 996abda3c0fbc2bfd575092af09e3b9b0331e624eb2e969a268457f8fd31ecbb 
regen/charset_translations.pl
  * d9c04ac46bdd81bb3e26519f2b8eb6242cb12337205add3f7cf092b0c58dccc4 
regen/regcharclass.pl
diff --git a/regcomp.c b/regcomp.c
index 7a028fd..f11fda1 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -3367,6 +3367,14 @@ S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t 
*pRExC_state, regnode *sour
  * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
  * these get optimized out
  *
+ * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
+ * as possible, even if that means splitting an existing node so that its first
+ * part is moved to the preceding node.  This would maximise the efficiency of
+ * memEQ during matching.  Elsewhere in this file, khw proposes splitting
+ * EXACTFish nodes into portions that don't change under folding vs those that
+ * do.  Those portions that don't change may be the only things in the pattern that
+ * could be used to find fixed and floating strings.
+ *
  * If a node is to match under /i (folded), the number of characters it matches
  * can be different than its character length if it contains a multi-character
  * fold.  *min_subtract is set to the total delta number of characters of the
@@ -12360,7 +12368,12 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, 
U32 depth)
          reparse:
 
             /* We look for the EXACTFish to EXACT node optimization only if
-             * folding.  (And we don't need to figure this out until pass 2) */
+             * folding.  (And we don't need to figure this out until pass 2).
+             * XXX It might actually make sense to split the node into portions
+             * that are exact and ones that aren't, so that we could later use
+             * the exact ones to find the longest fixed and floating strings.
+             * One would want to join them back into a larger node.  One could
+             * use a pseudo regnode like 'EXACT_ORIG_FOLD' */
             maybe_exact = FOLD && PASS2;
 
            /* XXX The node can hold up to 255 bytes, yet this only goes to
@@ -16127,7 +16140,14 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 
*flagp, U32 depth,
      * adjacent such nodes.  And if the class is equivalent to things like /./,
      * expensive run-time swashes can be avoided.  Now that we have more
      * complete information, we can find things necessarily missed by the
-     * earlier code. */
+     * earlier code.  Another possible "optimization" that isn't done is that
+     * something like [Ee] could be changed into an EXACTFU.  khw tried this
+     * and found that the ANYOF is faster, including for code points not in the
+     * bitmap.  This still might make sense to do, provided it got joined with
+     * an adjacent node(s) to create a longer EXACTFU one.  This could be
+     * accomplished by creating a pseudo ANYOF_EXACTFU node type that the join
+     * routine would know is joinable.  If that didn't happen, the node type
+     * could then be made a straight ANYOF */
 
     if (optimizable && cp_list && ! invert) {
         UV start, end;
@@ -16216,36 +16236,48 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 
*flagp, U32 depth,
         invlist_iterfinish(cp_list);
 
         if (op == END) {
+            const UV cp_list_len = _invlist_len(cp_list);
+            const UV* cp_list_array = invlist_array(cp_list);
 
             /* Here, didn't find an optimization.  See if this matches any of
              * the POSIX classes.  These run slightly faster for above-Unicode
              * code points, so don't bother with POSIXA ones nor the 2 that
-             * have no above-Unicode matches */
-            for (posix_class = 0;
-                 posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
-                 posix_class++)
-            {
-                int try_inverted;
-                if (posix_class == _CC_ASCII || posix_class == _CC_CNTRL) {
-                    continue;
-                }
-                for (try_inverted = 0; try_inverted < 2; try_inverted++) {
+             * have no above-Unicode matches.  We can avoid these checks unless
+             * the ANYOF matches at least as high as the lowest POSIX one
+             * (which was manually found to be \v.  The actual code point may
+             * increase in later Unicode releases, if a higher code point is
+             * assigned to be \v, but this code will never break.  It would
+             * just mean we could execute the checks for posix optimizations
+             * unnecessarily) */
+
+            if (cp_list_array[cp_list_len-1] > 0x2029) {
+                for (posix_class = 0;
+                     posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
+                     posix_class++)
+                {
+                    int try_inverted;
+                    if (posix_class == _CC_ASCII || posix_class == _CC_CNTRL) {
+                        continue;
+                    }
+                    for (try_inverted = 0; try_inverted < 2; try_inverted++) {
 
-                    /* Check if matches normal or inverted */
-                    if (_invlistEQ(cp_list,
-                                   PL_XPosix_ptrs[posix_class],
-                                   try_inverted))
-                    {
-                        op = (try_inverted)
-                             ? NPOSIXU
-                             : POSIXU;
-                        *flagp |= HASWIDTH|SIMPLE;
-                        goto found_posix;
+                        /* Check if matches normal or inverted */
+                        if (_invlistEQ(cp_list,
+                                       PL_XPosix_ptrs[posix_class],
+                                       try_inverted))
+                        {
+                            op = (try_inverted)
+                                 ? NPOSIXU
+                                 : POSIXU;
+                            *flagp |= HASWIDTH|SIMPLE;
+                            goto found_posix;
+                        }
                     }
                 }
+              found_posix: ;
             }
-          found_posix: ;
         }
+
         if (op != END) {
             RExC_parse = (char *)orig_parse;
             RExC_emit = (regnode *)orig_emit;

--
Perl5 Master Repository

Reply via email to