In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/fab2782b37b5570d7f8f8065fd7d18621117ed49?hp=a5801849a8b80a14c18c29987d91f489f17894c8>

- Log -----------------------------------------------------------------
commit fab2782b37b5570d7f8f8065fd7d18621117ed49
Author: Yves Orton <[email protected]>
Date:   Sun Feb 19 21:32:05 2012 +0100

    rework how the trie logic handles the newer EXACT nodetypes
    
    This cleans up and simplifies and extends how the trie
    logic interacts with the new node types. This change ultimately
    makes the EXACTFU, EXACTFU_SS, EXACTFU_NO_TRIE (renamed to
    EXACTFU_TRICKYFOLD) work properly with the trie engine regardless
    of whether the string is utf8 or latin1.
    
    This patch depends on the following:
    
        EXACT              => utf8 or "binary" text
    
        EXACTFU            => either pre-folded utf8, or latin1 that has to be 
folded as though it was utf8
        EXACTFU_SS         => special case of EXACTFU to handle \xDF/ss 
(affects latin1 treatment)
        EXACTFU_TRICKYFOLD => special case of EXACTFU to handle tricky 
non-latin1 fold rules
    
        EXACTF             => "old style fold logic" untriable nodetype
        EXACTFA            => (currently) untriable nodetype
        EXACTFL            => (currently) untriable nodetype
    
    See the comments in regcomp.sym for these fold types.
    
    This patch involves a number of distinct, but related parts. Starting
    from compilation:
    
    * Simplify how we detect a triable sequence given the new nodetypes,
      this also probably fixed some "bugs" in how we detected certain
      sequences, like /||foo|bar/.
    
    * Simplify how we read EXACTFU nodes under utf8 by removing the now
      redundant folding logic (EXACTFU nodes under utf8 are prefolded).
      Also extend this logic to handle latin1 patterns properly (in
      conjunction with  other changes)
    
    * Part of the problems associated with EXACTFU_SS and EXACTFU_TRICKYFOLD
      have to do with how the trie logic interacts with the minlen logic.
      This change handles both by pessimising the minlen when encounting
      these nodetypes. One observation is that the minlen logic is basically
      broken, and works only because it conflates bytes and codepoints in
      such a way that we more or less always get a value small enough that 
things work out
      anyway. Fixing that is properly is the job of another patch.
    
    * Part of the problem of doing folding under unicode rules is that
      there are a lot of foldings possible, some with strange rules. This
      means that the bitmap logic does not work correctly in all cases,
      as we currently do not have any way to populate it properly.
      So this patch disables the bitmap entirely when folding is involved
      until that is fixed.
    
    The end result of this is: we can TRIE/AHOCORASICK any sequence of
    EXACT, or EXACTFU (ish) nodes, regardless of utf8 or not, but we disable
    the bitmap when folding.
    
    A note for follow up relating to this patch is that the way EXACTFU_XXX
    nodes are currently dealt with we wont build the "maximal" trie because
    of their presence, instead creating a "jumptrie" consisting of either a
    leading EXACTFU node followed by a EXACTFU_XXX node, or vice versa. We
    should eventually address that.

M       embed.fnc
M       embed.h
M       proto.h
M       regcomp.c
M       regcomp.sym
M       regexec.c
M       regnodes.h
M       t/re/fold_grind.t

commit 2f137bbd018b7f86a6a557d3552cbb7a760bb43a
Author: Yves Orton <[email protected]>
Date:   Sun Feb 19 21:04:44 2012 +0100

    make test.pl show test number and name in failure diagnostics output
    
    The old output would show only the line number as diagnostics
    but not the test number, nor the test name, which often contains
    very useful information. This patch makes sure this is visible in
    the diagnostics output of test failures.

M       t/test.pl
-----------------------------------------------------------------------

Summary of changes:
 embed.fnc         |    4 +-
 embed.h           |    4 +-
 proto.h           |   14 ++-
 regcomp.c         |  269 +++++++++++++++++++++++++++++++++--------------------
 regcomp.sym       |    4 +-
 regexec.c         |  145 ++++++++++++++--------------
 regnodes.h        |   10 +-
 t/re/fold_grind.t |    2 +-
 t/test.pl         |    2 +-
 9 files changed, 263 insertions(+), 191 deletions(-)

diff --git a/embed.fnc b/embed.fnc
index 5c380ff..c549dc9 100644
--- a/embed.fnc
+++ b/embed.fnc
@@ -604,7 +604,9 @@ Ap  |UV     |to_uni_upper   |UV c|NN U8 *p|NN STRLEN *lenp
 Ap     |UV     |to_uni_title   |UV c|NN U8 *p|NN STRLEN *lenp
 #ifdef PERL_IN_UTF8_C
 sR     |U8     |to_lower_latin1|const U8 c|NULLOK U8 *p|NULLOK STRLEN *lenp
-p      |UV     |_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN *lenp|const bool 
flags
+#endif
+#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || 
defined(PERL_IN_REGEXEC_C)
+EXp        |UV        |_to_fold_latin1|const U8 c|NN U8 *p|NN STRLEN 
*lenp|const bool flags
 #endif
 #if defined(PERL_IN_UTF8_C) || defined(PERL_IN_PP_C)
 p      |UV     |_to_upper_title_latin1|const U8 c|NN U8 *p|NN STRLEN 
*lenp|const char S_or_s
diff --git a/embed.h b/embed.h
index 9fdf91b..1d1e598 100644
--- a/embed.h
+++ b/embed.h
@@ -983,6 +983,9 @@
 #define reghop4                        S_reghop4
 #    endif
 #  endif
+#  if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || 
defined(PERL_IN_REGEXEC_C)
+#define _to_fold_latin1(a,b,c,d)       Perl__to_fold_latin1(aTHX_ a,b,c,d)
+#  endif
 #  if defined(PERL_OLD_COPY_ON_WRITE)
 #define sv_setsv_cow(a,b)      Perl_sv_setsv_cow(aTHX_ a,b)
 #  endif
@@ -1600,7 +1603,6 @@
 #define isa_lookup(a,b,c,d)    S_isa_lookup(aTHX_ a,b,c,d)
 #  endif
 #  if defined(PERL_IN_UTF8_C)
-#define _to_fold_latin1(a,b,c,d)       Perl__to_fold_latin1(aTHX_ a,b,c,d)
 #define check_locale_boundary_crossing(a,b,c,d)        
S_check_locale_boundary_crossing(aTHX_ a,b,c,d)
 #define is_utf8_char_slow      S_is_utf8_char_slow
 #define is_utf8_common(a,b,c)  S_is_utf8_common(aTHX_ a,b,c)
diff --git a/proto.h b/proto.h
index dd3fd58..b811e6b 100644
--- a/proto.h
+++ b/proto.h
@@ -7114,12 +7114,6 @@ STATIC bool      S_isa_lookup(pTHX_ HV *stash, const 
char * const name, STRLEN len, U
 
 #endif
 #if defined(PERL_IN_UTF8_C)
-PERL_CALLCONV UV       Perl__to_fold_latin1(pTHX_ const U8 c, U8 *p, STRLEN 
*lenp, const bool flags)
-                       __attribute__nonnull__(pTHX_2)
-                       __attribute__nonnull__(pTHX_3);
-#define PERL_ARGS_ASSERT__TO_FOLD_LATIN1       \
-       assert(p); assert(lenp)
-
 STATIC UV      S_check_locale_boundary_crossing(pTHX_ const U8* const p, const 
UV result, U8* const ustrp, STRLEN *lenp)
                        __attribute__warn_unused_result__
                        __attribute__nonnull__(pTHX_1)
@@ -7166,6 +7160,14 @@ PERL_CALLCONV UV Perl__to_upper_title_latin1(pTHX_ const 
U8 c, U8 *p, STRLEN *le
        assert(p); assert(lenp)
 
 #endif
+#if defined(PERL_IN_UTF8_C) || defined(PERL_IN_REGCOMP_C) || 
defined(PERL_IN_REGEXEC_C)
+PERL_CALLCONV UV       Perl__to_fold_latin1(pTHX_ const U8 c, U8 *p, STRLEN 
*lenp, const bool flags)
+                       __attribute__nonnull__(pTHX_2)
+                       __attribute__nonnull__(pTHX_3);
+#define PERL_ARGS_ASSERT__TO_FOLD_LATIN1       \
+       assert(p); assert(lenp)
+
+#endif
 #if defined(PERL_IN_UTIL_C)
 STATIC bool    S_ckwarn_common(pTHX_ U32 w);
 STATIC const COP*      S_closest_cop(pTHX_ const COP *cop, const OP *o)
diff --git a/regcomp.c b/regcomp.c
index 7a3433a..e3da6e9 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -1366,44 +1366,47 @@ is the recommended Unicode-aware way of saying
     *(d++) = uv;
 */
 
-#define TRIE_STORE_REVCHAR                                                 \
+#define TRIE_STORE_REVCHAR(val)                                            \
     STMT_START {                                                           \
        if (UTF) {                                                         \
-           SV *zlopp = newSV(2);                                          \
+            SV *zlopp = newSV(7); /* XXX: optimize me */                   \
            unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
-           unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 
0xFF); \
+            unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
            SvCUR_set(zlopp, kapow - flrbbbbb);                            \
            SvPOK_on(zlopp);                                               \
            SvUTF8_on(zlopp);                                              \
            av_push(revcharmap, zlopp);                                    \
        } else {                                                           \
-           char ooooff = (char)uvc;                                            
   \
+            char ooooff = (char)val;                                           
\
            av_push(revcharmap, newSVpvn(&ooooff, 1));                     \
        }                                                                  \
         } STMT_END
 
-#define TRIE_READ_CHAR STMT_START {                                           \
-    wordlen++;                                                                \
-    if ( UTF ) {                                                              \
-       if ( folder ) {                                                       \
-           if ( foldlen > 0 ) {                                              \
-              uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags );     \
-              foldlen -= len;                                                \
-              scan += len;                                                   \
-              len = 0;                                                       \
-           } else {                                                          \
-               len = UTF8SKIP(uc);\
-               uvc = to_utf8_fold( uc, foldbuf, &foldlen);                   \
-               foldlen -= UNISKIP( uvc );                                    \
-               scan = foldbuf + UNISKIP( uvc );                              \
-           }                                                                 \
-       } else {                                                              \
-           uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
-       }                                                                     \
-    } else {                                                                  \
-       uvc = (U32)*uc;                                                       \
-       len = 1;                                                              \
-    }                                                                         \
+#define TRIE_READ_CHAR STMT_START {                                            
         \
+    wordlen++;                                                                 
         \
+    if ( UTF ) {                                                               
         \
+        /* if it is UTF then it is either already folded, or does not need 
folding */   \
+        uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);    
         \
+    }                                                                          
         \
+    else if (folder == PL_fold_latin1) {                                       
         \
+        /* if we use this folder we have to obey unicode rules on latin-1 data 
*/       \
+        if ( foldlen > 0 ) {                                                   
         \
+           uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags 
);       \
+           foldlen -= len;                                                     
         \
+           scan += len;                                                        
         \
+           len = 0;                                                            
         \
+        } else {                                                               
         \
+            len = 1;                                                           
         \
+            uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);            
         \
+            skiplen = UNISKIP(uvc);                                            
         \
+            foldlen -= skiplen;                                                
         \
+            scan = foldbuf + skiplen;                                          
         \
+        }                                                                      
         \
+    } else {                                                                   
         \
+        /* raw data, will be folded later if needed */                         
         \
+        uvc = (U32)*uc;                                                        
         \
+        len = 1;                                                               
         \
+    }                                                                          
         \
 } STMT_END
 
 
@@ -1522,10 +1525,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode 
*startbranch, regnode *firs
     switch (flags) {
        case EXACT: break;
        case EXACTFA:
+        case EXACTFU_SS:
+        case EXACTFU_TRICKYFOLD:
        case EXACTFU: folder = PL_fold_latin1; break;
        case EXACTF:  folder = PL_fold; break;
        case EXACTFL: folder = PL_fold_locale; break;
-        default: Perl_croak( aTHX_ "panic! In trie construction, unknown node 
type %u", (unsigned) flags );
+        default: Perl_croak( aTHX_ "panic! In trie construction, unknown node 
type %u %s", (unsigned) flags, PL_reg_name[flags] );
     }
 
     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
@@ -1534,7 +1539,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode 
*startbranch, regnode *firs
     trie->wordcount = word_count;
     RExC_rxi->data->data[ data_slot ] = (void*)trie;
     trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
-    if (!(UTF && folder))
+    if (flags == EXACT)
        trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
     trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
                        trie->wordcount+1, sizeof(reg_trie_wordinfo));
@@ -1594,6 +1599,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode 
*startbranch, regnode *firs
         const U8 * const e  = uc + STR_LEN( noper );
         STRLEN foldlen = 0;
         U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+        STRLEN skiplen = 0;
         const U8 *scan = (U8*)NULL;
         U32 wordlen      = 0;         /* required init */
         STRLEN chars = 0;
@@ -1603,28 +1609,37 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode 
*startbranch, regnode *firs
             trie->minlen= 0;
             continue;
         }
-        if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
+        if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
             TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
                                           regardless of encoding */
-
+            if (OP( noper ) == EXACTFU_SS) {
+                /* false positives are ok, so just set this */
+                TRIE_BITMAP_SET(trie,0xDF);
+            }
+        }
         for ( ; uc < e ; uc += len ) {
             TRIE_CHARCOUNT(trie)++;
             TRIE_READ_CHAR;
             chars++;
             if ( uvc < 256 ) {
+                if ( folder ) {
+                    U8 folded= folder[ (U8) uvc ];
+                    if ( !trie->charmap[ folded ] ) {
+                        trie->charmap[ folded ]=( ++trie->uniquecharcount );
+                        TRIE_STORE_REVCHAR( folded );
+                    }
+                }
                 if ( !trie->charmap[ uvc ] ) {
                     trie->charmap[ uvc ]=( ++trie->uniquecharcount );
-                    if ( folder )
-                        trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
-                    TRIE_STORE_REVCHAR;
+                    TRIE_STORE_REVCHAR( uvc );
                 }
                 if ( set_bit ) {
                    /* store the codepoint in the bitmap, and its folded
                     * equivalent. */
-                    TRIE_BITMAP_SET(trie,uvc);
+                    TRIE_BITMAP_SET(trie, uvc);
 
                    /* store the folded codepoint */
-                   if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
+                    if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
 
                    if ( !UTF ) {
                        /* store first byte of utf8 representation of
@@ -1647,17 +1662,28 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode 
*startbranch, regnode *firs
 
                 if ( !SvTRUE( *svpp ) ) {
                     sv_setiv( *svpp, ++trie->uniquecharcount );
-                    TRIE_STORE_REVCHAR;
+                    TRIE_STORE_REVCHAR(uvc);
                 }
             }
         }
         if( cur == first ) {
-            trie->minlen=chars;
-            trie->maxlen=chars;
+            trie->minlen = chars;
+            trie->maxlen = chars;
         } else if (chars < trie->minlen) {
-            trie->minlen=chars;
+            trie->minlen = chars;
         } else if (chars > trie->maxlen) {
-            trie->maxlen=chars;
+            trie->maxlen = chars;
+        }
+        if (OP( noper ) == EXACTFU_SS) {
+            /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 
1 and not 2*/
+           if (trie->minlen > 1)
+                trie->minlen= 1;
+        }
+       if (OP( noper ) == EXACTFU_TRICKYFOLD) {
+           /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can 
match "\x{0390}" 
+            *                - We assume that any such sequence might match a 
2 byte string */
+            if (trie->minlen > 2 )
+                trie->minlen= 2;
         }
 
     } /* end first pass */
@@ -1730,6 +1756,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode 
*startbranch, regnode *firs
            STRLEN foldlen   = 0;         /* required init */
             U32 wordlen      = 0;         /* required init */
            U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+            STRLEN skiplen   = 0;
 
             if (OP(noper) != NOTHING) {
                 for ( ; uc < e ; uc += len ) {
@@ -1930,8 +1957,10 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode 
*startbranch, regnode *firs
 
             STRLEN foldlen   = 0;         /* required init */
             U32 wordlen      = 0;         /* required init */
+            STRLEN skiplen   = 0;
             U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
 
+
             if ( OP(noper) != NOTHING ) {
                 for ( ; uc < e ; uc += len ) {
 
@@ -2568,7 +2597,7 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, 
regnode *source,  regnode
  *      optimizer doesn't reject these possibilities based on size constraints.
  * 2)   These sequences are not currently correctly handled by the trie code
  *      either, so it changes the joined node type to ops that are not handled
- *      by trie's, those new ops being EXACTFU_SS and EXACTFU_NO_TRIE.
+ *      by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
  * 3)   This is sufficient for the two Greek sequences (described below), but
  *      the one involving the Sharp s (\xDF) needs more.  The node type
  *      EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
@@ -2823,7 +2852,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode 
*scan, UV *min_subtract, b
                         * is.  (I (khw) think it doesn't matter in regexec.c
                         * for UTF patterns, but no need to change it */
                        if (OP(scan) == EXACTFU) {
-                           OP(scan) = EXACTFU_NO_TRIE;
+                           OP(scan) = EXACTFU_TRICKYFOLD;
                        }
                        s += 6; /* We already know what this sequence is.  Skip
                                   the rest of it */
@@ -3185,7 +3214,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                         regnode *first = (regnode *)NULL;
                         regnode *last = (regnode *)NULL;
                         regnode *tail = scan;
-                        U8 optype = 0;
+                        U8 trietype = 0;
                         U32 count=0;
 
 #ifdef DEBUGGING
@@ -3216,32 +3245,59 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                         
                         /*
 
-                           step through the branches, cur represents each
-                           branch, noper is the first thing to be matched
-                           as part of that branch and noper_next is the
-                           regnext() of that node. if noper is an EXACT
-                           and noper_next is the same as scan (our current
-                           position in the regex) then the EXACT branch is
-                           a possible optimization target. Once we have
-                           two or more consecutive such branches we can
-                           create a trie of the EXACT's contents and stich
-                           it in place. If the sequence represents all of
-                           the branches we eliminate the whole thing and
-                           replace it with a single TRIE. If it is a
-                           subsequence then we need to stitch it in. This
-                           means the first branch has to remain, and needs
-                           to be repointed at the item on the branch chain
-                           following the last branch optimized. This could
-                           be either a BRANCH, in which case the
-                           subsequence is internal, or it could be the
-                           item following the branch sequence in which
-                           case the subsequence is at the end.
+                            Step through the branches
+                                cur represents each branch,
+                                noper is the first thing to be matched as part 
of that branch
+                                noper_next is the regnext() of that node.
+
+                            We normally handle a case like this 
/FOO[xyz]|BAR[pqr]/
+                            via a "jump trie" but we also support building 
with NOJUMPTRIE,
+                            which restricts the trie logic to structures like 
/FOO|BAR/.
+
+                            If noper is a trieable nodetype then the branch is 
a possible optimization
+                            target. If we are building under NOJUMPTRIE then 
we require that noper_next
+                            is the same as scan (our current position in the 
regex program).
+
+                            Once we have two or more consecutive such branches 
we can create a
+                            trie of the EXACT's contents and stitch it in 
place into the program.
+
+                            If the sequence represents all of the branches in 
the alternation we
+                            replace the entire thing with a single TRIE node.
+
+                            Otherwise when it is a subsequence we need to 
stitch it in place and
+                            replace only the relevant branches. This means the 
first branch has
+                            to remain as it is used by the alternation logic, 
and its next pointer,
+                            and needs to be repointed at the item on the 
branch chain following
+                            the last branch we have optimized away.
+
+                            This could be either a BRANCH, in which case the 
subsequence is internal,
+                            or it could be the item following the branch 
sequence in which case the
+                            subsequence is at the end (which does not 
necessarily mean the first node
+                            is the start of the alternation).
+
+                            TRIE_TYPE(X) is a define which maps the optype to 
a trietype.
+
+                                optype          |  trietype
+                                ----------------+-----------
+                                NOTHING         | NOTHING
+                                EXACT           | EXACT
+                                EXACTFU         | EXACTFU
+                                EXACTFU_SS      | EXACTFU
+                                EXACTFU_TRICKYFOLD | EXACTFU
+                                EXACTFA         | 0
+
 
                         */
+#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
+                       ( EXACT == (X) )   ? EXACT :        \
+                       ( EXACTFU == (X) || EXACTFU_SS == (X) || 
EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
+                       0 )
 
                         /* dont use tail as the end marker for this traverse */
                         for ( cur = startbranch ; cur != scan ; cur = regnext( 
cur ) ) {
                             regnode * const noper = NEXTOPER( cur );
+                            U8 noper_type = OP( noper );
+                            U8 noper_trietype = TRIE_TYPE( noper_type );
 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
                             regnode * const noper_next = regnext( noper );
 #endif
@@ -3263,59 +3319,68 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode 
**scanp,
                                 PerlIO_printf( Perl_debug_log, 
"(First==%d,Last==%d,Cur==%d)\n",
                                    REG_NODE_NUM(first), REG_NODE_NUM(last), 
REG_NODE_NUM(cur) );
                             });
-                            if ( (((first && optype!=NOTHING) ? OP( noper ) == 
optype
-                                         : PL_regkind[ OP( noper ) ] == EXACT )
-                                  || OP(noper) == NOTHING )
+
+                            /* Is noper a trieable nodetype that can be merged 
with the
+                             * current trie (if there is one)? */
+                            if ( noper_trietype
+                                  &&
+                                  (
+                                        ( noper_trietype == NOTHING )
+                                        ||
+                                        ( trietype == NOTHING )
+                                        ||
+                                        ( trietype == noper_trietype )
+                                  )
 #ifdef NOJUMPTRIE
                                   && noper_next == tail
 #endif
                                   && count < U16_MAX)
                             {
+                                /* Handle mergable triable node
+                                 * Either we are the first node in a new 
trieable sequence,
+                                 * in which case we do some bookkeeping, 
otherwise we update
+                                 * the end pointer. */
                                 count++;
-                                if ( !first || optype == NOTHING ) {
-                                    if (!first) first = cur;
-                                    optype = OP( noper );
+                                if ( !first ) {
+                                    first = cur;
+                                    trietype = noper_trietype;
                                 } else {
+                                    if ( trietype == NOTHING )
+                                        trietype = noper_trietype;
                                     last = cur;
                                 }
-                            } else {
-/* 
-    Currently the trie logic handles case insensitive matching properly only
-    when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode
-    semantics).
-
-    If/when this is fixed the following define can be swapped
-    in below to fully enable trie logic.
-
-#define TRIE_TYPE_IS_SAFE 1
-
-Note that join_exact() assumes that the other types of EXACTFish nodes are not
-used in tries, so that would have to be updated if this changed
-
-*/
-#define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT)
-
-                                if ( last && TRIE_TYPE_IS_SAFE ) {
+                            } /* end handle mergable triable node */
+                            else {
+                                /* handle unmergable node -
+                                 * noper may either be a triable node which 
can not be tried
+                                 * together with the current trie, or a non 
triable node */
+                                if ( last && trietype != NOTHING ) {
+                                    /* if last is set then we have found at 
least two triable branch
+                                     * sequences in a row of a similar 
trietype so we can turn them
+                                     * into a trie */
                                     make_trie( pRExC_state, 
                                             startbranch, first, cur, tail, 
count, 
-                                            optype, depth+1 );
+                                            trietype, depth+1 );
+                                    last = NULL; /* note: we clear/update 
first, trietype etc below, so we dont do it here */
                                 }
-                                if ( PL_regkind[ OP( noper ) ] == EXACT
+                                if ( noper_trietype
 #ifdef NOJUMPTRIE
                                      && noper_next == tail
 #endif
                                 ){
+                                    /* noper is triable, so we can start a new 
trie sequence */
                                     count = 1;
                                     first = cur;
-                                    optype = OP( noper );
-                                } else {
+                                    trietype = noper_trietype;
+                                } else if (first) {
+                                    /* if we already saw a first but the 
current node is not triable then we have
+                                     * to reset the first information. */
                                     count = 0;
                                     first = NULL;
-                                    optype = 0;
+                                    trietype = 0;
                                 }
-                                last = NULL;
-                            }
-                        }
+                            } /* end handle unmergable node */
+                        } /* loop over branches */
                         DEBUG_OPTIMISE_r({
                             regprop(RExC_rx, mysv, cur);
                             PerlIO_printf( Perl_debug_log,
@@ -3323,9 +3388,11 @@ used in tries, so that would have to be updated if this 
changed
                               "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
 
                         });
-                        
-                        if ( last && TRIE_TYPE_IS_SAFE ) {
-                            made= make_trie( pRExC_state, startbranch, first, 
scan, tail, count, optype, depth+1 );
+                        if ( last && trietype != NOTHING ) {
+                            /* the last branch of the sequence was part of a 
trie,
+                             * so we have to construct it here outside of the 
loop
+                             */
+                            made= make_trie( pRExC_state, startbranch, first, 
scan, tail, count, trietype, depth+1 );
 #ifdef TRIE_STUDY_OPT
                             if ( ((made == MADE_EXACT_TRIE && 
                                  startbranch == first) 
@@ -3339,8 +3406,8 @@ used in tries, so that would have to be updated if this 
changed
                                 }
                             }
 #endif
-                        }
-                    }
+                        } /* end if ( last) */
+                    } /* TRIE_MAXBUF is non zero */
                     
                 } /* do trie */
                 
@@ -12034,7 +12101,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, 
regnode *p, const regnode *val,
                 case EXACTFA:
                 case EXACTFU:
                 case EXACTFU_SS:
-                case EXACTFU_NO_TRIE:
+                case EXACTFU_TRICKYFOLD:
                 case EXACTFL:
                         if( exact == PSEUDO )
                             exact= OP(scan);
diff --git a/regcomp.sym b/regcomp.sym
index f0ee701..e6be6ac 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -92,14 +92,14 @@ BRANCH      BRANCH,     node 0 V  ; Match this alternative, 
or the next...
 # not used
 BACK        BACK,       no 0 V    ; Match "", "next" ptr points backward.
 
-#*Literals
+#*Literals - NOTE the relative ordering of these types is important do not 
change it
 
 EXACT       EXACT,      str       ; Match this string (preceded by length).
 EXACTF      EXACT,      str       ; Match this non-UTF-8 string (not 
guaranteed to be folded) using /id rules (w/len).
 EXACTFL     EXACT,      str       ; Match this string (not guaranteed to be 
folded) using /il rules (w/len).
 EXACTFU     EXACT,      str      ; Match this string (folded iff in UTF-8, 
length in folding doesn't change if not in UTF-8) using /iu rules (w/len).
 EXACTFU_SS  EXACT,      str      ; Match this string (folded iff in UTF-8, 
length in folding may change even if not in UTF-8) using /iu rules (w/len).
-EXACTFU_NO_TRIE EXACT,  str      ; Match this folded UTF-8 string using /iu 
rules, but don't generate a trie for it
+EXACTFU_TRICKYFOLD EXACT,  str   ; Match this folded UTF-8 string using /iu 
rules, but don't generate a trie for it
 EXACTFA     EXACT,      str      ; Match this string (not guaranteed to be 
folded) using /iaa rules (w/len).
 
 #*Do nothing types
diff --git a/regexec.c b/regexec.c
index 76eb7f9..8ccb6f7 100644
--- a/regexec.c
+++ b/regexec.c
@@ -303,13 +303,13 @@
 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
    we don't need this definition. */
 #define IS_TEXT(rn)   ( OP(rn)==EXACT   || OP(rn)==REF   || OP(rn)==NREF   )
-#define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || 
OP(rn)==EXACTFU_NO_TRIE || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF  
|| OP(rn)==NREFF )
+#define IS_TEXTF(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || 
OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF 
 || OP(rn)==NREFF )
 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
 
 #else
 /* ... so we use this as its faster. */
 #define IS_TEXT(rn)   ( OP(rn)==EXACT   )
-#define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || 
OP(rn)==EXACTFU_NO_TRIE || OP(rn) == EXACTFA)
+#define IS_TEXTFU(rn)  ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || 
OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA)
 #define IS_TEXTF(rn)  ( OP(rn)==EXACTF  )
 #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
 
@@ -1187,58 +1187,61 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, 
char *strpos,
 
 #define DECL_TRIE_TYPE(scan) \
     const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } 
\
-                   trie_type = (scan->flags != EXACT) \
-                             ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? 
trie_latin_utf8_fold : trie_plain)) \
-                              : (utf8_target ? trie_utf8 : trie_plain)
-
-#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,  \
-uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                       \
-    switch (trie_type) {                                                    \
-    case trie_utf8_fold:                                                    \
-       if ( foldlen>0 ) {                                                  \
-           uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \
-           foldlen -= len;                                                 \
-           uscan += len;                                                   \
-           len=0;                                                          \
-       } else {                                                            \
-           uvc = to_utf8_fold( (U8 *) uc, foldbuf, &foldlen );             \
-           len = UTF8SKIP(uc); \
-           foldlen -= UNISKIP( uvc );                                      \
-           uscan = foldbuf + UNISKIP( uvc );                               \
-       }                                                                   \
-       break;                                                              \
-    case trie_latin_utf8_fold:                                              \
-       if ( foldlen>0 ) {                                                  \
-           uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags );     \
-           foldlen -= len;                                                 \
-           uscan += len;                                                   \
-           len=0;                                                          \
-       } else {                                                            \
-           len = 1;                                                        \
-           uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen );               \
-           foldlen -= UNISKIP( uvc );                                      \
-           uscan = foldbuf + UNISKIP( uvc );                               \
-       }                                                                   \
-       break;                                                              \
-    case trie_utf8:                                                         \
-       uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags );       \
-       break;                                                              \
-    case trie_plain:                                                        \
-       uvc = (UV)*uc;                                                      \
-       len = 1;                                                            \
-    }                                                                       \
-    if (uvc < 256) {                                                        \
-       charid = trie->charmap[ uvc ];                                      \
-    }                                                                       \
-    else {                                                                  \
-       charid = 0;                                                         \
-       if (widecharmap) {                                                  \
-           SV** const svpp = hv_fetch(widecharmap,                         \
-                       (char*)&uvc, sizeof(UV), 0);                        \
-           if (svpp)                                                       \
-               charid = (U16)SvIV(*svpp);                                  \
-       }                                                                   \
-    }                                                                       \
+                    trie_type = ((scan->flags == EXACT) \
+                              ? (utf8_target ? trie_utf8 : trie_plain) \
+                              : (utf8_target ? trie_utf8_fold : 
trie_latin_utf8_fold))
+
+#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len,     
     \
+uvc, charid, foldlen, foldbuf, uniflags) STMT_START {                          
     \
+    STRLEN skiplen;                                                            
     \
+    switch (trie_type) {                                                       
     \
+    case trie_utf8_fold:                                                       
     \
+        if ( foldlen>0 ) {                                                     
     \
+            uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, 
uniflags ); \
+            foldlen -= len;                                                    
     \
+            uscan += len;                                                      
     \
+            len=0;                                                             
     \
+        } else {                                                               
     \
+            uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen );           
     \
+            len = UTF8SKIP(uc);                                                
     \
+            skiplen = UNISKIP( uvc );                                          
     \
+            foldlen -= skiplen;                                                
     \
+            uscan = foldbuf + skiplen;                                         
     \
+        }                                                                      
     \
+        break;                                                                 
     \
+    case trie_latin_utf8_fold:                                                 
     \
+        if ( foldlen>0 ) {                                                     
     \
+            uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, 
uniflags ); \
+            foldlen -= len;                                                    
     \
+            uscan += len;                                                      
     \
+            len=0;                                                             
     \
+        } else {                                                               
     \
+            len = 1;                                                           
     \
+            uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1);            
     \
+            skiplen = UNISKIP( uvc );                                          
     \
+            foldlen -= skiplen;                                                
     \
+            uscan = foldbuf + skiplen;                                         
     \
+        }                                                                      
     \
+        break;                                                                 
     \
+    case trie_utf8:                                                            
     \
+        uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags );   
     \
+        break;                                                                 
     \
+    case trie_plain:                                                           
     \
+        uvc = (UV)*uc;                                                         
     \
+        len = 1;                                                               
     \
+    }                                                                          
     \
+    if (uvc < 256) {                                                           
     \
+        charid = trie->charmap[ uvc ];                                         
     \
+    }                                                                          
     \
+    else {                                                                     
     \
+        charid = 0;                                                            
     \
+        if (widecharmap) {                                                     
     \
+            SV** const svpp = hv_fetch(widecharmap,                            
     \
+                        (char*)&uvc, sizeof(UV), 0);                           
     \
+            if (svpp)                                                          
     \
+                charid = (U16)SvIV(*svpp);                                     
     \
+        }                                                                      
     \
+    }                                                                          
     \
 } STMT_END
 
 #define REXEC_FBC_EXACTISH_SCAN(CoNd)                     \
@@ -1490,7 +1493,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, 
char *s,
            }
            goto do_exactf_utf8;
 
-       case EXACTFU_NO_TRIE:
+       case EXACTFU_TRICKYFOLD:
        case EXACTFU:
            if (UTF_PATTERN || utf8_target) {
                utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
@@ -3267,16 +3270,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
             /* In this case the charclass data is available inline so
                we can fail fast without a lot of extra overhead. 
              */
-            if (scan->flags == EXACT || !utf8_target) {
-                if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
-                    DEBUG_EXECUTE_r(
-                        PerlIO_printf(Perl_debug_log,
-                                 "%*s  %sfailed to match trie start 
class...%s\n",
-                                 REPORT_CODE_OFF+depth*2, "", PL_colors[4], 
PL_colors[5])
-                    );
-                    sayNO_SILENT;
-                    /* NOTREACHED */
-                }                      
+            if(!ANYOF_BITMAP_TEST(scan, *locinput)) {
+                DEBUG_EXECUTE_r(
+                    PerlIO_printf(Perl_debug_log,
+                              "%*s  %sfailed to match trie start class...%s\n",
+                              REPORT_CODE_OFF+depth*2, "", PL_colors[4], 
PL_colors[5])
+                );
+                sayNO_SILENT;
+                /* NOTREACHED */
             }
             /* FALL THROUGH */
        case TRIE:
@@ -3334,9 +3335,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
                HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 
]);
                 U32 state = trie->startstate;
 
-               if (trie->bitmap && trie_type != trie_utf8_fold &&
-                   !TRIE_BITMAP_TEST(trie,*locinput)
-               ) {
+                if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) {
                    if (trie->states[ state ].wordnum) {
                         DEBUG_EXECUTE_r(
                             PerlIO_printf(Perl_debug_log,
@@ -3671,7 +3670,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog)
            goto do_exactf;
 
        case EXACTFU_SS:
-       case EXACTFU_NO_TRIE:
+       case EXACTFU_TRICKYFOLD:
        case EXACTFU:
            folder = foldEQ_latin1;
            fold_array = PL_fold_latin1;
@@ -5085,7 +5084,7 @@ NULL
                            case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
                            case EXACTFA:
                            case EXACTFU_SS:
-                           case EXACTFU_NO_TRIE:
+                           case EXACTFU_TRICKYFOLD:
                            case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
                            case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
                            default: ST.c2 = ST.c1;
@@ -5241,7 +5240,7 @@ NULL
                            case EXACTF: ST.c2 = PL_fold[ST.c1]; break;
                            case EXACTFA:
                            case EXACTFU_SS:
-                           case EXACTFU_NO_TRIE:
+                           case EXACTFU_TRICKYFOLD:
                            case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break;
                            case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break;
                            default: ST.c2 = ST.c1; break;
@@ -6035,7 +6034,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, 
I32 max, int depth)
            goto do_exactf;
 
     case EXACTFU_SS:
-    case EXACTFU_NO_TRIE:
+    case EXACTFU_TRICKYFOLD:
     case EXACTFU:
        utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0;
 
@@ -6077,7 +6076,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, 
I32 max, int depth)
            switch (OP(p)) {
                case EXACTF: folded = PL_fold[c]; break;
                case EXACTFA:
-               case EXACTFU_NO_TRIE:
+               case EXACTFU_TRICKYFOLD:
                case EXACTFU: folded = PL_fold_latin1[c]; break;
                case EXACTFL: folded = PL_fold_locale[c]; break;
                default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p));
diff --git a/regnodes.h b/regnodes.h
index 9941593..5561243 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -62,7 +62,7 @@
 #define        EXACTFL                 50      /* 0x32 Match this string (not 
guaranteed to be folded) using /il rules (w/len). */
 #define        EXACTFU                 51      /* 0x33 Match this string 
(folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using 
/iu rules (w/len). */
 #define        EXACTFU_SS              52      /* 0x34 Match this string 
(folded iff in UTF-8, length in folding may change even if not in UTF-8) using 
/iu rules (w/len). */
-#define        EXACTFU_NO_TRIE         53      /* 0x35 Match this folded UTF-8 
string using /iu rules, but don't generate a trie for it */
+#define        EXACTFU_TRICKYFOLD      53      /* 0x35 Match this folded UTF-8 
string using /iu rules, but don't generate a trie for it */
 #define        EXACTFA                 54      /* 0x36 Match this string (not 
guaranteed to be folded) using /iaa rules (w/len). */
 #define        NOTHING                 55      /* 0x37 Match empty string. */
 #define        TAIL                    56      /* 0x38 Match empty string. Can 
jump here from outside. */
@@ -223,7 +223,7 @@ EXTCONST U8 PL_regkind[] = {
        EXACT,          /* EXACTFL                */
        EXACT,          /* EXACTFU                */
        EXACT,          /* EXACTFU_SS             */
-       EXACT,          /* EXACTFU_NO_TRIE        */
+       EXACT,          /* EXACTFU_TRICKYFOLD     */
        EXACT,          /* EXACTFA                */
        NOTHING,        /* NOTHING                */
        NOTHING,        /* TAIL                   */
@@ -384,7 +384,7 @@ static const U8 regarglen[] = {
        0,                                      /* EXACTFL      */
        0,                                      /* EXACTFU      */
        0,                                      /* EXACTFU_SS   */
-       0,                                      /* EXACTFU_NO_TRIE */
+       0,                                      /* EXACTFU_TRICKYFOLD */
        0,                                      /* EXACTFA      */
        0,                                      /* NOTHING      */
        0,                                      /* TAIL         */
@@ -502,7 +502,7 @@ static const char reg_off_by_arg[] = {
        0,      /* EXACTFL      */
        0,      /* EXACTFU      */
        0,      /* EXACTFU_SS   */
-       0,      /* EXACTFU_NO_TRIE */
+       0,      /* EXACTFU_TRICKYFOLD */
        0,      /* EXACTFA      */
        0,      /* NOTHING      */
        0,      /* TAIL         */
@@ -625,7 +625,7 @@ EXTCONST char * const PL_reg_name[] = {
        "EXACTFL",                      /* 0x32 */
        "EXACTFU",                      /* 0x33 */
        "EXACTFU_SS",                   /* 0x34 */
-       "EXACTFU_NO_TRIE",              /* 0x35 */
+       "EXACTFU_TRICKYFOLD",           /* 0x35 */
        "EXACTFA",                      /* 0x36 */
        "NOTHING",                      /* 0x37 */
        "TAIL",                         /* 0x38 */
diff --git a/t/re/fold_grind.t b/t/re/fold_grind.t
index 98fc396..e2153e3 100644
--- a/t/re/fold_grind.t
+++ b/t/re/fold_grind.t
@@ -735,7 +735,7 @@ foreach my $test (sort { numerically } keys %tests) {
                           if (!$res || $ENV{PERL_DEBUG_FULL_TEST}) {
                             # Failed or debug; output the result
                             $count++;
-                            ok($res, $desc);
+                            ok($res, "test $count - $desc");
                           } else {
                             # Just count the test as passed
                             $okays++;
diff --git a/t/test.pl b/t/test.pl
index ad59ca9..604cdda 100644
--- a/t/test.pl
+++ b/t/test.pl
@@ -210,7 +210,7 @@ sub _ok {
        note @mess; # Ensure that the message is properly escaped.
     }
     else {
-       _diag "# Failed $where\n";
+        _diag "# Failed test $test - $name $where\n";
        _diag @mess;
     }
 

--
Perl5 Master Repository

Reply via email to