[perl.git] branch blead, updated. v5.17.4-246-g3465e1f

Karl Williamson Thu, 11 Oct 2012 19:38:30 -0700

In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/3465e1f03c6c748e8f8a6bf8bfdfaf1fc58a4810?hp=def6ed225c88257d6c50813a8212783f0d267e0e>


- Log -----------------------------------------------------------------
commit 3465e1f03c6c748e8f8a6bf8bfdfaf1fc58a4810
Author: Karl Williamson <[email protected]>
Date:   Thu Oct 11 12:15:53 2012 -0600

    regcomp.c: Optimize EXACTFish nodes without folds to EXACT
    
    Often, case folding will be applied to the entire regular expression
    (such as by using "/i"), but there will be components in it that are the
    same, folded or not.  These components could be represented as EXACT
    nodes with no loss of information.  The regex optimizer is then able to
    apply more optimizations to them than it could otherwise, and pattern
    matching will execute faster.
    
    This commit turns any EXACTFish node (except those under locale rules,
    whose folding rules are not known until runtime) that contains entirely
    unfoldable characters into the equivalent EXACT node.
    
    This optimization brings up the idea of possibly splitting an EXACTFish
    node that contains a sufficiently long contiguous string of non-folding
    characters into the portions that have folding and the portions that
    don't.  That might or might not be beneficial; I'm not undertaking the
    experiments to check that out.

M       regcomp.c

commit 5e4a1da18f8fd71f2e5f0b98b0d41e3da257281a
Author: Karl Williamson <[email protected]>
Date:   Thu Oct 11 14:56:27 2012 -0600

    regexec.c: Fix EXACT node handling in regrepeat()
    
    Commit b40a2c17551b484a78122be98db5dc06bb4614d5 introduced a bug in
    handling EXACT nodes when the pattern is in UTF-8.  This cleans that up.

M       regexec.c
M       t/re/pat.t
-----------------------------------------------------------------------

Summary of changes:
 regcomp.c  |   62 +++++++++++++++++++++++++++++++++++++++++++++++++++++------
 regexec.c  |   52 +++++++++++++++++++++++++++++++++----------------
 t/re/pat.t |   17 +++++++++++++++-
 3 files changed, 106 insertions(+), 25 deletions(-)

diff --git a/regcomp.c b/regcomp.c
index 128bbbb..55aa218 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -10495,6 +10495,11 @@ tryagain:
             bool next_is_quantifier;
             char * oldp = NULL;
 
+            /* If a folding node contains only code points that don't
+             * participate in folds, it can be changed into an EXACT node,
+             * which allows the optimizer more things to look for */
+            bool maybe_exact;
+
            ender = 0;
             node_type = compute_EXACTish(pRExC_state);
            ret = reg_node(pRExC_state, node_type);
@@ -10507,6 +10512,11 @@ tryagain:
 
        reparse:
 
+            /* We do the EXACTFish to EXACT node only if folding, and not if in
+             * locale, as whether a character folds or not isn't known until
+             * runtime */
+            maybe_exact = FOLD && ! LOC;
+
            /* XXX The node can hold up to 255 bytes, yet this only goes to
              * 127.  I (khw) do not know why.  Keeping it somewhat less than
              * 255 allows us to not have to worry about overflow due to
@@ -10788,13 +10798,44 @@ tryagain:
                             }
                         }
                         else {
-                            ender = _to_uni_fold_flags(ender, (U8 *) s, &foldlen,
-                                    FOLD_FLAGS_FULL
-                                     | ((LOC) ?  FOLD_FLAGS_LOCALE
-                                              : (ASCII_FOLD_RESTRICTED)
-                                                ? FOLD_FLAGS_NOMIX_ASCII
-                                                : 0)
-                                );
+                            UV folded = _to_uni_fold_flags(
+                                           ender,
+                                           (U8 *) s,
+                                           &foldlen,
+                                           FOLD_FLAGS_FULL
+                                           | ((LOC) ?  FOLD_FLAGS_LOCALE
+                                                    : (ASCII_FOLD_RESTRICTED)
+                                                      ? FOLD_FLAGS_NOMIX_ASCII
+                                                      : 0)
+                                            );
+
+                            /* If this node only contains non-folding code
+                             * points so far, see if this new one is also
+                             * non-folding */
+                            if (maybe_exact) {
+                                if (folded != ender) {
+                                    maybe_exact = FALSE;
+                                }
+                                else {
+                                    /* Here the fold is the original; we have
+                                     * to check further to see if anything
+                                     * folds to it */
+                                    if (! PL_utf8_foldable) {
+                                        SV* swash = swash_init("utf8",
+                                                           "_Perl_Any_Folds",
+                                                           &PL_sv_undef, 1, 0);
+                                        PL_utf8_foldable =
+                                                    _get_swash_invlist(swash);
+                                        SvREFCNT_dec(swash);
+                                    }
+                                    if (_invlist_contains_cp(PL_utf8_foldable,
+                                                             ender))
+                                    {
+                                        maybe_exact = FALSE;
+                                    }
+                                }
+                            }
+                            ender = folded;
                         }
                        s += foldlen;
 
@@ -10808,6 +10849,7 @@ tryagain:
                     }
                     else {
                         *(s++) = ender;
+                        maybe_exact &= ! isALPHA_L1(ender);
                     }
                }
                else if (UTF) {
@@ -10997,6 +11039,12 @@ tryagain:
        loopdone:   /* Jumped to when encounters something that shouldn't be in
                       the node */
 
+            /* If 'maybe_exact' is still set here, means there are no
+             * code points in the node that participate in folds */
+            if (FOLD && maybe_exact) {
+                OP(ret) = EXACT;
+            }
+
             /* I (khw) don't know if you can get here with zero length, but the
              * old code handled this situation by creating a zero-length EXACT
              * node.  Might as well be NOTHING instead */
diff --git a/regexec.c b/regexec.c
index bad11f2..febc222 100644
--- a/regexec.c
+++ b/regexec.c
@@ -6488,31 +6488,48 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
     case EXACT:
        c = (U8)*STRING(p);
 
-       if (! utf8_target || UNI_IS_INVARIANT(c)) {
+        /* Can use a simple loop if the pattern char to match on is invariant
+         * under UTF-8, or both target and pattern aren't UTF-8.  Note that we
+         * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's
+         * true iff it doesn't matter if the argument is in UTF-8 or not */
+        if (UTF8_IS_INVARIANT(c) || (! utf8_target && ! UTF_PATTERN)) {
            while (scan < loceol && UCHARAT(scan) == c) {
                scan++;
            }
        }
        else if (UTF_PATTERN) {
-            STRLEN scan_char_len;
-
-           loceol = PL_regeol;
+            if (utf8_target) {
+                STRLEN scan_char_len;
+                loceol = PL_regeol;
+
+                /* When both target and pattern are UTF-8, we have to do a
+                 * string EQ */
+                while (hardcount < max
+                       && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol
+                       && scan_char_len <= STR_LEN(p)
+                       && memEQ(scan, STRING(p), scan_char_len))
+                {
+                    scan += scan_char_len;
+                    hardcount++;
+                }
+            }
+            else if (! UTF8_IS_ABOVE_LATIN1(c)) {
 
-           while (hardcount < max
-                   && scan + (scan_char_len = UTF8SKIP(scan)) < loceol
-                   && scan_char_len <= STR_LEN(p)
-                   && memEQ(scan, STRING(p), scan_char_len))
-            {
-               scan += scan_char_len;
-               hardcount++;
-           }
+                /* Target isn't utf8; convert the character in the UTF-8
+                 * pattern to non-UTF8, and do a simple loop */
+                c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1));
+                while (scan < loceol && UCHARAT(scan) == c) {
+                    scan++;
+                }
+            } /* else pattern char is above Latin1, can't possibly match the
+                 non-UTF-8 target */
         }
-       else {
+        else {
 
-           /* Here, the string is utf8, the pattern isn't, but <c> is different
-            * in utf8 than not, so can't compare them directly.  Outside the
-            * loop, find the two utf8 bytes that represent c, and then
-            * look for those in sequence in the utf8 string */
+            /* Here, the string must be utf8; pattern isn't, and <c> is
+             * different in utf8 than not, so can't compare them directly.
+             * Outside the loop, find the two utf8 bytes that represent c, and
+             * then look for those in sequence in the utf8 string */
            U8 high = UTF8_TWO_BYTE_HI(c);
            U8 low = UTF8_TWO_BYTE_LO(c);
            loceol = PL_regeol;
@@ -6527,6 +6544,7 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma
            }
        }
        break;
+
     case EXACTFA:
        utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII;
        goto do_exactf;
diff --git a/t/re/pat.t b/t/re/pat.t
index 08f784d..619b2ea 100644
--- a/t/re/pat.t
+++ b/t/re/pat.t
@@ -19,7 +19,7 @@ BEGIN {
     require './test.pl';
 }
 
-plan tests => 453;  # Update this when adding/deleting tests.
+plan tests => 465;  # Update this when adding/deleting tests.
 
 run_tests() unless caller;
 
@@ -1284,6 +1284,21 @@ EOP
         ok("\x{017F}\x{017F}" =~ qr/^[\x{00DF}]?$/i, "[] to EXACTish optimization");
     }
 
+    {
+        for my $char (":", "\x{f7}", "\x{2010}") {
+            my $utf8_char = $char;
+            utf8::upgrade($utf8_char);
+            my $display = $char;
+            $display = display($display);
+            my $utf8_display = "utf8::upgrade(\"$display\")";
+
+            like($char, qr/^$char?$/, "\"$display\" =~ /^$display?\$/");
+            like($char, qr/^$utf8_char?$/, "my \$p = \"$display\"; utf8::upgrade(\$p); \"$display\" =~ /^\$p?\$/");
+            like($utf8_char, qr/^$char?$/, "my \$c = \"$display\"; utf8::upgrade(\$c); \"\$c\" =~ /^$display?\$/");
+            like($utf8_char, qr/^$utf8_char?$/, "my \$c = \"$display\"; utf8::upgrade(\$c); my \$p = \"$display\"; utf8::upgrade(\$p); \"\$c\" =~ /^\$p?\$/");
+        }
+    }
+
 } # End of sub run_tests
 
 1;

--
Perl5 Master Repository

[perl.git] branch blead, updated. v5.17.4-246-g3465e1f

Reply via email to