In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/2786be71c69a3e244009b94145ca66f2326aadb9?hp=234242ca5bbd2e073a9d2c4dc7103a0dcea61a54>

- Log -----------------------------------------------------------------
commit 2786be71c69a3e244009b94145ca66f2326aadb9
Author: Karl Williamson <[email protected]>
Date:   Tue Dec 7 22:51:34 2010 -0700

    regcomp.c: Clean up optimization for 1-char []
    
    A single character character class can be optimized into an EXACT node.
    The changes elsewhere allow this to no longer be constrained to
    ASCII-only when the pattern isn't UTF-8.  Also, the optimization
    shouldn't have happened for FOLDED characters, as explained in the
    comments, when they participate in multi-char folds; so that is removed.
    
    Also, a locale node with folded characters can be optimized.

M       regcomp.c

commit 3a15e693385b1ab6186ad77f2fc208db1d0e05ea
Author: Karl Williamson <[email protected]>
Date:   Tue Dec 7 17:04:02 2010 -0700

    regcomp: Allow freeing up bit in ANYOF flags
    
    The flags field is fully used, and until the ANYOF node is split later
    in development, the CLASS bit will need to be freed up to give the space
    for other uses.  This patch allows for this to easily be toggled.

M       regcomp.c
M       regcomp.h
M       regexec.c

commit 40c7855679f2fac8010393d53040d01a1eecb050
Author: Karl Williamson <[email protected]>
Date:   Tue Dec 7 16:23:07 2010 -0700

    regcomp.c: Move [] inversion optimization
    
    The optimization to do inversion a compile time is moved to earlier.
    This doesn't help today, but it may someday when we start keeping better
    track of Unicode characters, and is the more logical place for it.

M       regcomp.c

commit fed836b3bfd7f84c2054616037d002570c7f407c
Author: Karl Williamson <[email protected]>
Date:   Tue Dec 7 15:21:55 2010 -0700

    regcomp.c: When inverting a [], adjust bit count
    
    When one complements every bit, the count of those that are set should
    be complemented as well.

M       regcomp.c

commit f1d6d3cd22e129a9bf34f86ebe89183aad01d216
Author: Karl Williamson <[email protected]>
Date:   Tue Dec 7 15:16:07 2010 -0700

    Subject: [PATCH] regcomp.c: adjust flag
    
    When something matches above Latin1, it should have the ANYOF_UTF8 bit
    set.

M       regcomp.c

commit 30e9bc90ff255ffc2e220a3b10772597a8d8423a
Author: Karl Williamson <[email protected]>
Date:   Tue Dec 7 15:13:29 2010 -0700

    regcomp.c: Change constants for clarity.
    
    Oddly, it is clearer to use 0xFF as an exclusive-or target instead of an
    unrelated #define that happens to have that value.

M       regcomp.c

commit a1f3213ba6ba505766fbd1a7636a29ffe462cfe9
Author: Karl Williamson <[email protected]>
Date:   Tue Dec 7 15:06:35 2010 -0700

    regcomp.c: fix indent

M       regcomp.c

commit b3a66ba06366992e90140792ae0fb1480093d249
Author: Karl Williamson <[email protected]>
Date:   Tue Dec 7 15:04:20 2010 -0700

    regcomp.c: remove no longer needed test
    
    optimize_invert is no longer needed given the changes already made, as
    now if there is something not in the bitmap, a flag will be set, and the
    optimization doesn't take place unless the only flag is inversion.  And,
    the bitmap is setup completely now for anything that doesn't have to be
    deferred to runtime, and such deferrals are marked with other flags.

M       regcomp.c

commit ce1c68b24a719356e8e7724a2c8a963159f3c18d
Author: Karl Williamson <[email protected]>
Date:   Tue Dec 7 14:50:13 2010 -0700

    regcomp.c: isASCII doesn't match outside ANYOF bitmap
    
    So there is no need to tell regexec that it does, and then can combine
    two other statements

M       regcomp.c

commit 41b0f1c13ae23011b37421cd90820bc780b74f30
Author: Karl Williamson <[email protected]>
Date:   Tue Dec 7 14:36:30 2010 -0700

    regcomp.c: Fix compiler warning
    
    One smoke is warning about truncated results.  This should fix that.  It
    may be that other compilers will now complain, and we'll need to add
    casts, but I'm waiting to see.

M       regcomp.c

commit b8e7c664739512d97c3d65f44d80a53754521f15
Author: Karl Williamson <[email protected]>
Date:   Mon Dec 6 15:11:59 2010 -0700

    regcomp.c: Remove no longer necessary loop
    
    Recent changes to this cause the bitmap to be populated where possible
    with the all folding taken into consideration.  Therefore, the FOLD flag
    isn't necessary except when that wasn't possible, and the loop that went
    through looking for folds is also not necessary, as the bitmap is
    now completely populated before it gets to where the loop was.

M       regcomp.c

commit ffc130aa464ab42cfe8e8a19205f5b398675d98f
Author: Karl Williamson <[email protected]>
Date:   Sat Dec 11 15:02:49 2010 -0700

    regcomp.c: remove unncessary counting
    
    stored now contains the number of 1 bits in the ANYOF node, and is no
    longer needed to be arbitrarily set.  Part of this is because there is
    now a flag if there is any match outside the bitmap, which prohibits
    optimization if so.

M       regcomp.c
-----------------------------------------------------------------------

Summary of changes:
 regcomp.c |  147 +++++++++++++++++++++++++++++++++++++------------------------
 regcomp.h |   30 ++++++++++---
 regexec.c |    4 +-
 3 files changed, 114 insertions(+), 67 deletions(-)

diff --git a/regcomp.c b/regcomp.c
index 34c4e95..66cadcf 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -3609,8 +3609,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                        goto do_default;
                    if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
                        value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
-                                || ((data->start_class->flags & ANYOF_CLASS)
-                                     && 
ANYOF_CLASS_TEST_ANY_SET(data->start_class)));
+                                || 
ANYOF_CLASS_TEST_ANY_SET(data->start_class));
                        cl_anything(pRExC_state, data->start_class);
                    }
                    if (flags & SCF_DO_STCLASS_AND || !value)
@@ -8300,7 +8299,7 @@ S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, 
regnode* node, const U8
 
 
 PERL_STATIC_INLINE U8
-S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U32 
value)
+S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 
value)
 {
     /* This inline function sets a bit in the bitmap if not already set, and if
      * appropriate, its fold, returning the number of bits that actually
@@ -8344,12 +8343,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
     bool need_class = 0;
     SV *listsv = NULL;
     UV n;
-    bool optimize_invert   = TRUE;
     AV* unicode_alternate  = NULL;
 #ifdef EBCDIC
     UV literal_endpoint = 0;
 #endif
-    UV stored = 0;  /* 0, 1, or more than 1 chars stored in the class */
+    UV stored = 0;  /* how many chars stored in the bitmap */
 
     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
         case we need to change the emitted regop to an EXACT. */
@@ -8378,14 +8376,21 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
 
     if (SIZE_ONLY) {
        RExC_size += ANYOF_SKIP;
+#ifdef ANYOF_ADD_LOC_SKIP
+       if (LOC) {
+           RExC_size += ANYOF_ADD_LOC_SKIP;
+       }
+#endif
        listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
     }
     else {
        RExC_emit += ANYOF_SKIP;
-       if (FOLD)
-           ANYOF_FLAGS(ret) |= ANYOF_FOLD;
-       if (LOC)
+       if (LOC) {
            ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
+#ifdef ANYOF_ADD_LOC_SKIP
+           RExC_emit += ANYOF_ADD_LOC_SKIP;
+#endif
+       }
        ANYOF_BITMAP_ZERO(ret);
        listsv = newSVpvs("# comment\n");
     }
@@ -8599,10 +8604,14 @@ parseit:
            if (LOC && namedclass < ANYOF_MAX && ! need_class) {
                need_class = 1;
                if (SIZE_ONLY) {
+#ifdef ANYOF_CLASS_ADD_SKIP
                    RExC_size += ANYOF_CLASS_ADD_SKIP;
+#endif
                }
                else {
+#ifdef ANYOF_CLASS_ADD_SKIP
                    RExC_emit += ANYOF_CLASS_ADD_SKIP;
+#endif
                    ANYOF_CLASS_ZERO(ret);
                }
                ANYOF_FLAGS(ret) |= ANYOF_CLASS;
@@ -8627,7 +8636,7 @@ parseit:
                    else {
                        ANYOF_FLAGS(ret) |= ANYOF_UTF8;
                        Perl_sv_catpvf(aTHX_ listsv,
-                                      "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, 
(UV) '-');
+                          "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
                    }
                }
 
@@ -8640,8 +8649,6 @@ parseit:
                const char *what = NULL;
                char yesno = 0;
 
-               if (namedclass > OOB_NAMEDCLASS)
-                   optimize_invert = FALSE;
                /* Possible truncation here but in some 64-bit environments
                 * the compiler gets heartburn about switch on 64-bit values.
                 * A similar issue a little earlier when switching on value.
@@ -8679,7 +8686,8 @@ parseit:
                               S_set_regclass_bit(aTHX_ pRExC_state, ret, 
ASCII_TO_NATIVE(value));
                    }
                    yesno = '+';
-                   what = "ASCII";
+                   what = NULL;        /* Doesn't match outside ascii, so
+                                          don't want to add +utf8:: */
                    break;
                case ANYOF_NASCII:
                    if (LOC)
@@ -8729,13 +8737,9 @@ parseit:
                if (what) {
                    /* Strings such as "+utf8::isWord\n" */
                    Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
-               }
-               stored+=2; /* can't optimize this class */
-
-               /* All but ASCII can match characters storable only in utf8 */
-               if (namedclass != ANYOF_ASCII) {
                    ANYOF_FLAGS(ret) |= ANYOF_UTF8;
                }
+
                continue;
            }
        } /* end of namedclass \blah */
@@ -8773,7 +8777,6 @@ parseit:
        }
 
        /* now is the next time */
-        /*stored += (value - prevvalue + 1);*/
        if (!SIZE_ONLY) {
            if (prevvalue < 256) {
                const IV ceilvalue = value < 256 ? value : 255;
@@ -8808,7 +8811,6 @@ parseit:
          if (value > 255 || UTF) {
                const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
                const UV natvalue      = NATIVE_TO_UNI(value);
-                stored+=2; /* can't optimize this class */
 
                /* If the code point requires utf8 to represent, and we are not
                 * folding, it can't match unless the target is in utf8.  Only
@@ -8906,52 +8908,81 @@ parseit:
         return ret;
     /****** !SIZE_ONLY AFTER HERE *********/
 
-    if( stored == 1 && (value < 128 || (value < 256 && !UTF))
-        && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
-    ) {
-       /* optimize single char class to an EXACT node but *only* when its not
-        * a UTF/high char.  Note that the information needed to decide to do
-        * this optimization is not currently available until the 2nd pass, and
-        * that the actually used EXACT node takes less space than the
-        * calculated ANYOF node, and hence the amount of space calculated in
-         * the first pass is larger than actually used.  Currently we don't
-         * keep track of enough information to do this for nodes which contain
-         * matches outside the bitmap */
+    /* Folding in the bitmap is taken care of above, but not for locale, for
+     * which we have to wait to see what folding is in effect at runtime, and
+     * for things not in the bitmap */
+    if (FOLD && (LOC || ANYOF_FLAGS(ret) & ANYOF_NONBITMAP)) {
+        ANYOF_FLAGS(ret) |= ANYOF_FOLD;
+    }
+
+    /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that this doesn't
+     * optimize locale.  Doing so perhaps could be done as long as there is
+     * nothing like \w in it; some thought also would have to be given to the
+     * interaction with above 0x100 chars */
+    if ((ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
+       for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
+           ANYOF_BITMAP(ret)[value] ^= 0xFF;
+       stored = 256 - stored;
+
+       /* The inversion means that everything above 255 is matched */
+       ANYOF_FLAGS(ret) = ANYOF_UTF8|ANYOF_UNICODE_ALL;
+    }
+
+    /* A single character class can be "optimized" into an EXACTish node.
+     * Note that since we don't currently count how many characters there are
+     * outside the bitmap, we are XXX missing optimization possibilities for
+     * them.  This optimization can't happen unless this is a truly single
+     * character class, which means that it can't be an inversion into a
+     * many-character class, and there must be no possibility of there being
+     * things outside the bitmap.  'stored' (only) for locales doesn't include
+     * \w, etc, so have to make a special test that they aren't present */
+    if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL))
+        && ((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
+                              || (! ANYOF_CLASS_TEST_ANY_SET(ret))))))
+    {
+        /* Note that the information needed to decide to do this optimization
+         * is not currently available until the 2nd pass, and that the actually
+         * used EXACT node takes less space than the calculated ANYOF node, and
+         * hence the amount of space calculated in the first pass is larger
+         * than actually used, so this optimization doesn't gain us any space.
+        * But an EXACT node is faster than an ANYOF node, and can be combined
+        * with any adjacent EXACT nodes later by the optimizer for further
+        * gains. */
+
         const char * cur_parse= RExC_parse;
         RExC_emit = (regnode *)orig_emit;
         RExC_parse = (char *)orig_parse;
-        ret = reg_node(pRExC_state,
-                       (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
+
+       /* (A locale node can have 1 point and be folded; all the other folds
+        * will include the fold, hence will have 2 points, so we won't get
+        * here with FOLD set unless it is also locale) */
+       ret = reg_node(pRExC_state, (U8) (! FOLD)
+                                        ? EXACT
+                                        : EXACTFL
+                   );
         RExC_parse = (char *)cur_parse;
-        *STRING(ret)= (char)value;
-        STR_LEN(ret)= 1;
-        RExC_emit += STR_SZ(1);
+       if (UTF && ! NATIVE_IS_INVARIANT(value)) {
+           *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
+           *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
+           STR_LEN(ret)= 2;
+           RExC_emit += STR_SZ(2);
+       }
+       else {
+           *STRING(ret)= (char)value;
+           STR_LEN(ret)= 1;
+           RExC_emit += STR_SZ(1);
+       }
        SvREFCNT_dec(listsv);
         return ret;
-    }
-    /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
-    if ( /* If the only flag is folding (plus possibly inversion). */
-       ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
-       ) {
-       for (value = 0; value < 256; ++value) {
-           if (ANYOF_BITMAP_TEST(ret, value)) {
-               UV fold = PL_fold[value];
 
-               if (fold != value)
-                   ANYOF_BITMAP_SET(ret, fold);
-           }
-       }
-       ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
+       /* (A 2-character class of the very special form like [aA] could be
+        * optimized into an EXACTFish node, but only for non-locales, and for
+        * characters which only have the two folds; so things like 'fF' and
+        * 'Ii' wouldn't work because of the fold of 'LATIN SMALL LIGATURE FI'.
+        * Since we don't have that information currently conveniently
+        * available, skip the optimization) */
     }
 
-    /* optimize inverted simple patterns (e.g. [^a-z]) */
-    if (optimize_invert &&
-       /* If the only flag is inversion. */
-       (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
-       for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
-           ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
-       ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
-    }
     {
        AV * const av = newAV();
        SV *rv;
@@ -9712,8 +9743,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const 
regnode *o)
        }
         
         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
-        /* output any special charclass tests (used mostly under use locale) */
-       if (o->flags & ANYOF_CLASS && ANYOF_CLASS_TEST_ANY_SET(o))
+        /* output any special charclass tests (used entirely under use locale) 
*/
+       if (ANYOF_CLASS_TEST_ANY_SET(o))
            for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
                if (ANYOF_CLASS_TEST(o,i)) {
                    sv_catpv(sv, anyofs[i]);
diff --git a/regcomp.h b/regcomp.h
index 6dc05f5..c140089 100644
--- a/regcomp.h
+++ b/regcomp.h
@@ -406,12 +406,6 @@ struct regnode_charclass_class {
 #define ANYOF_CLASS_CLEAR(p, c)        (ANYOF_CLASS_BYTE(p, c) &= 
~ANYOF_BIT(c))
 #define ANYOF_CLASS_TEST(p, c) (ANYOF_CLASS_BYTE(p, c) &   ANYOF_BIT(c))
 
-/* Quicker way to see if there are actually any tests.  This is because
- * currently the set of tests can be empty even when the class bitmap is
- * allocated */
-#define ANYOF_CLASS_TEST_ANY_SET(p)    /* assumes sizeof(p) = 4 */       \
-       memNE (((struct regnode_charclass_class*)(p))->classflags, "0000", 
ANYOF_CLASS_SIZE)
-
 #define ANYOF_CLASS_ZERO(ret)  Zero(((struct 
regnode_charclass_class*)(ret))->classflags, ANYOF_CLASSBITMAP_SIZE, char)
 #define ANYOF_BITMAP_ZERO(ret) Zero(((struct 
regnode_charclass*)(ret))->bitmap, ANYOF_BITMAP_SIZE, char)
 
@@ -431,7 +425,29 @@ struct regnode_charclass_class {
 
 #define ANYOF_SKIP             ((ANYOF_SIZE - 1)/sizeof(regnode))
 #define ANYOF_CLASS_SKIP       ((ANYOF_CLASS_SIZE - 1)/sizeof(regnode))
-#define ANYOF_CLASS_ADD_SKIP   (ANYOF_CLASS_SKIP - ANYOF_SKIP)
+
+/* The class bit can be set to the locale one if necessary to save bits at the
+ * expense of having locale ANYOF nodes always have a class bit map, and hence
+ * take up extra space.  This allows convenient changing it as development
+ * proceeds on this */
+#if ANYOF_CLASS == ANYOF_LOCALE
+#   undef ANYOF_CLASS_ADD_SKIP
+#   define ANYOF_ADD_LOC_SKIP  (ANYOF_CLASS_SKIP - ANYOF_SKIP)
+
+    /* Quicker way to see if there are actually any tests.  This is because
+     * currently the set of tests can be empty even when the class bitmap is
+     * allocated */
+#   if ANYOF_CLASSBITMAP_SIZE != 4
+#      error ANYOF_CLASSBITMAP_SIZE is expected to be 4
+#   endif
+#   define ANYOF_CLASS_TEST_ANY_SET(p) /* assumes sizeof(p) = 4 */       \
+       memNE (((struct regnode_charclass_class*)(p))->classflags,        \
+               "\0\0\0\0", ANYOF_CLASSBITMAP_SIZE)
+#else
+#   define ANYOF_CLASS_ADD_SKIP        (ANYOF_CLASS_SKIP - ANYOF_SKIP)
+#   undef ANYOF_ADD_LOC_SKIP
+#   define ANYOF_CLASS_TEST_ANY_SET(p) (ANYOF_FLAGS(p) & ANYOF_CLASS)
+#endif
 
 
 /*
diff --git a/regexec.c b/regexec.c
index c1f1ae2..f2723e4 100644
--- a/regexec.c
+++ b/regexec.c
@@ -6356,8 +6356,8 @@ S_reginclass(pTHX_ const regexp * const prog, register 
const regnode * const n,
                match = TRUE;
        }
        
-       if (!match && (flags & ANYOF_CLASS) && ANYOF_CLASS_TEST_ANY_SET(n)) {
-           PL_reg_flags |= RF_tainted;
+       if (!match && ANYOF_CLASS_TEST_ANY_SET(n)) {
+           PL_reg_flags |= RF_tainted;     /* CLASS implies LOCALE */
            if (
                (ANYOF_CLASS_TEST(n, ANYOF_ALNUM)   &&  isALNUM_LC(c))  ||
                (ANYOF_CLASS_TEST(n, ANYOF_NALNUM)  && !isALNUM_LC(c))  ||

--
Perl5 Master Repository

Reply via email to