In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/6bbba9040c7840209170b2ff9a1d7b03ae1cbdc1?hp=ec192197b904194f8a514368e774522e9b83add8>

- Log -----------------------------------------------------------------
commit 6bbba9040c7840209170b2ff9a1d7b03ae1cbdc1
Author: Karl Williamson <[email protected]>
Date:   Mon Dec 6 12:16:24 2010 -0700

    regexec.c: Fix locale and \s
    
    The handlers for locale \s and \S both assume that the character at
    0x20 on ASCII platforms is a space.  This is not necessarily so.
    
    I'm guessing that the code was originally just copied and pasted from
    the non-locale space handling code without thinking.  That code hard-coded
    in the space character, probably to avoid an expensive swash fetch for a
    common situation.

M       regexec.c

commit b77393f6288f64bf00f41fef15da0fac4085bfd2
Author: Karl Williamson <[email protected]>
Date:   Mon Dec 6 12:01:22 2010 -0700

    regexec.c: Add missing handlers for locale \d
    
    regexec.c had some code to handle \d under locales, but not everywhere.

M       regexec.c

commit 28b5d7bf98b62fd30fb98fcdb5c701b1b2acdd8f
Author: Karl Williamson <[email protected]>
Date:   Mon Dec 6 11:56:49 2010 -0700

    regcomp.sym: Correct DIGITL, NDIGITL entries
    
    These entries failed to specify that the nodes are simple (matching
    exactly 1 character) and have 0 regnode arguments.

M       regcomp.sym
M       regnodes.h

commit 6ab9ea91fb04390bf9c50134beadab7cf6fd0c25
Author: Karl Williamson <[email protected]>
Date:   Mon Dec 6 08:35:17 2010 -0700

    regcomp.c: Add locale for \d
    
    The DIGITL and NDIGITL regnodes were not being generated; instead
    regular DIGIT and NDIGIT regnodes were generated even under locale.
    
    This means that probably no one has ever used Perl on a locale that
    changed the digits.

M       regcomp.c

commit 81c14aa2230ca380c2f424e69ac8f9dc0bb4ae23
Author: Karl Williamson <[email protected]>
Date:   Sun Dec 5 13:10:13 2010 -0700

    toke.c: Fix EBCDIC problem
    
    Commit 356979f4a7d780fd67a92a9ca6c8659bd12e7168 failed to include two
    instances in toke.c that needed the same treatment, i.e., converting
    properly from I8 to native.

M       toke.c

commit 7538f7248145d82e23e430518cf41f4da91e8fdd
Author: Karl Williamson <[email protected]>
Date:   Sun Dec 5 13:09:43 2010 -0700

    toke.c: highlight problematic-mentioning comment

M       toke.c

commit 4d252e65989d9d34bc759c890186b40e14559420
Author: Karl Williamson <[email protected]>
Date:   Sun Dec 5 13:08:33 2010 -0700

    uni/fold.t: Prevent [] from being optimized out
    
    This test hasn't been testing bracketed char classes because a
    single-character class gets optimized out.

M       t/uni/fold.t

commit 9d6ecd7a330475b012bff918b30af2834aec6ea4
Author: Karl Williamson <[email protected]>
Date:   Sun Dec 5 13:07:13 2010 -0700

    regcomp.c: Revert to using regcomp.sym order
    
    Now that the new nodes are grouped properly, we can use the fact that
    the named backreferences all come after all the numbered backreferences,
    as had been there before.

M       regcomp.c

commit 01f98ec2b0828a07c073b6eef8c4942f61e69e13
Author: Karl Williamson <[email protected]>
Date:   Sun Dec 5 12:28:21 2010 -0700

    regcomp.sym: Re-order for better grouping
    
    The recently added regnodes are moved to their respective equivalence
    classes, and the named backreferences are moved to just after the
    numbered backreferences.

M       regcomp.sym
M       regnodes.h

commit 381b57db967e0e071e69852c5b3297178de329ae
Author: Karl Williamson <[email protected]>
Date:   Sat Dec 4 20:48:24 2010 -0700

    regcomp.sym: Remove misleading comments
    
    Yves informed me that in spite of the comments giving precise node
    numbers, those numbers can change, so new nodes can be mixed in with
    their kin.  Remove those comments.

M       regcomp.sym
-----------------------------------------------------------------------

Summary of changes:
 regcomp.c    |   14 +++-
 regcomp.sym  |   61 ++++++-------
 regexec.c    |   36 +++++++-
 regnodes.h   |  272 +++++++++++++++++++++++++++++-----------------------------
 t/uni/fold.t |    9 +-
 toke.c       |   10 +-
 6 files changed, 218 insertions(+), 184 deletions(-)

diff --git a/regcomp.c b/regcomp.c
index 60fef55..4fb8c37 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -7445,11 +7445,19 @@ tryagain:
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'd':
-           ret = reg_node(pRExC_state, DIGIT);
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(DIGITL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(DIGIT));
+            }
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'D':
-           ret = reg_node(pRExC_state, NDIGIT);
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(NDIGITL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(NDIGIT));
+            }
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'R':
@@ -9607,7 +9615,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const 
regnode *o)
     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || 
OP(o)==ACCEPT) {
        Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
        if ( RXp_PAREN_NAMES(prog) ) {
-            if ( k != REF || (OP(o) != NREF && OP(o) != NREFF && OP(o) != 
NREFFL && OP(o) != NREFFU)) {
+            if ( k != REF || (OP(o) < NREF)) {
                AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
                SV **name= av_fetch(list, ARG(o), 0 );
                if (name)
diff --git a/regcomp.sym b/regcomp.sym
index 4e787a7..707da08 100644
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -16,12 +16,12 @@
 
 
 
-#* Exit points (0,1)
+#* Exit points
 
 END         END,        no        ; End of program.
 SUCCEED     END,        no        ; Return from a subroutine, basically.
 
-#* Anchors: (2..13)
+#* Anchors:
 
 BOL         BOL,        no        ; Match "" at beginning of line.
 MBOL        BOL,        no        ; Same, assuming multiline.
@@ -36,7 +36,7 @@ NBOUND      NBOUND,     no        ; Match "" at any word 
non-boundary
 NBOUNDL     NBOUND,     no        ; Match "" at any word non-boundary
 GPOS        GPOS,       no        ; Matches where last m//g left off.
 
-#* [Special] alternatives: (14..30)
+#* [Special] alternatives:
 
 REG_ANY     REG_ANY,    no 0 S    ; Match any one character (except newline).
 SANY        REG_ANY,    no 0 S    ; Match any one character.
@@ -51,12 +51,12 @@ SPACEL      SPACE,      no 0 S    ; Match any whitespace 
char in locale
 NSPACE      NSPACE,     no 0 S    ; Match any non-whitespace character
 NSPACEL     NSPACE,     no 0 S    ; Match any non-whitespace char in locale
 DIGIT       DIGIT,      no 0 S    ; Match any numeric character
-DIGITL      DIGIT,      no        ; Match any numeric character in locale
+DIGITL      DIGIT,      no 0 S    ; Match any numeric character in locale
 NDIGIT      NDIGIT,     no 0 S    ; Match any non-numeric character
-NDIGITL     NDIGIT,     no        ; Match any non-numeric character in locale
+NDIGITL     NDIGIT,     no 0 S    ; Match any non-numeric character in locale
 CLUMP       CLUMP,      no 0 V    ; Match any extended grapheme cluster 
sequence
 
-#* Alternation (31)
+#* Alternation
 
 # BRANCH        The set of branches constituting a single choice are hooked
 #               together with their "next" pointers, since precedence prevents
@@ -68,26 +68,27 @@ CLUMP       CLUMP,      no 0 V    ; Match any extended 
grapheme cluster sequence
 #
 BRANCH      BRANCH,     node 0 V  ; Match this alternative, or the next...
 
-#*Back pointer (32)
+#*Back pointer
 
 # BACK          Normal "next" pointers all implicitly point forward; BACK
 #               exists to make loop structures possible.
 # not used
 BACK        BACK,       no 0 V    ; Match "", "next" ptr points backward.
 
-#*Literals (33..35)
+#*Literals
 
 EXACT       EXACT,      str       ; Match this string (preceded by length).
 EXACTF      EXACT,      str       ; Match this string, folded, native charset 
semantics for non-utf8 (prec. by length).
 EXACTFL     EXACT,      str       ; Match this string, folded in locale 
(w/len).
+EXACTFU     EXACT,      str      ; Match this string, folded, Unicode 
semantics for non-utf8 (prec. by length).
 
-#*Do nothing types (36..37)
+#*Do nothing types
 
 NOTHING     NOTHING,    no        ; Match empty string.
 # A variant of above which delimits a group, thus stops optimizations
 TAIL        NOTHING,    no        ; Match empty string. Can jump here from 
outside.
 
-#*Loops (38..44)
+#*Loops
 
 # STAR,PLUS    '?', and complex '*' and '+', are implemented as circular
 #               BRANCH structures using BACK.  Simple cases (one character
@@ -105,7 +106,7 @@ CURLYX      CURLY,      sv 2 V    ; Match this complex 
thing {n,m} times.
 # This terminator creates a loop structure for CURLYX
 WHILEM      WHILEM,     no 0 V    ; Do curly processing and see if rest 
matches.
 
-#*Buffer related (45..49)
+#*Buffer related
 
 # OPEN,CLOSE,GROUPP     ...are numbered at compile time.
 OPEN        OPEN,       num 1     ; Mark this point in input as start of #n.
@@ -114,7 +115,16 @@ CLOSE       CLOSE,      num 1     ; Analogous to OPEN.
 REF         REF,        num 1 V   ; Match some already matched string
 REFF        REF,        num 1 V   ; Match already matched string, folded using 
native charset semantics for non-utf8
 REFFL       REF,        num 1 V   ; Match already matched string, folded in 
loc.
+# REFFU and NREFFU could have been implemented using the FLAGS field of the
+# regnode, but by having a separate node type, we can use the existing switch
+# statement to avoid some tests
+REFFU       REF,        num 1 V   ; Match already matched string, folded using 
unicode semantics for non-utf8
 
+#*Named references.  Code in regcomp.c assumes that these all are after the 
numbered references
+NREF        REF,        no-sv 1 V ; Match some already matched string
+NREFF       REF,        no-sv 1 V ; Match already matched string, folded using 
native charset semantics for non-utf8
+NREFFL      REF,        no-sv 1 V ; Match already matched string, folded in 
loc.
+NREFFU      REF,        num   1 V ; Match already matched string, folded using 
unicode semantics for non-utf8
 
 IFMATCH     BRANCHJ,    off 1 . 2 ; Succeeds if the following matches.
 UNLESSM     BRANCHJ,    off 1 . 2 ; Fails if the following matches.
@@ -122,24 +132,24 @@ SUSPEND     BRANCHJ,    off 1 V 1 ; "Independent" sub-RE.
 IFTHEN      BRANCHJ,    off 1 V 1 ; Switch, should be preceeded by switcher .
 GROUPP      GROUPP,     num 1     ; Whether the group matched.
 
-#*Support for long RE (55..56)
+#*Support for long RE
 
 LONGJMP     LONGJMP,    off 1 . 1 ; Jump far away.
 BRANCHJ     BRANCHJ,    off 1 V 1 ; BRANCH with long offset.
 
-#*The heavy worker (57)
+#*The heavy worker
 
 EVAL        EVAL,       evl 1     ; Execute some Perl code.
 
-#*Modifiers (58..59)
+#*Modifiers
 
 MINMOD      MINMOD,     no        ; Next operator is not greedy.
 LOGICAL     LOGICAL,    no        ; Next opcode should set the flag only.
 
-# This is not used yet (60)
+# This is not used yet
 RENUM       BRANCHJ,    off 1 . 1 ; Group with independently numbered parens.
 
-#*Trie Related (61..62)
+#*Trie Related
 
 # Behave the same as A|LIST|OF|WORDS would. The '..C' variants have  
 # inline charclass data (ascii only), the 'C' store it in the structure.
@@ -152,17 +162,11 @@ TRIEC       TRIE,trie charclass   ; Same as TRIE, but 
with embedded charclass da
 AHOCORASICK     TRIE,   trie 1    ; Aho Corasick stclass. flags==type
 AHOCORASICKC    TRIE,trie charclass   ; Same as AHOCORASICK, but with embedded 
charclass data
 
-#*Regex Subroutines (65..66) 
+#*Regex Subroutines
 GOSUB       GOSUB,      num/ofs 2L    ; recurse to paren arg1 at (signed) ofs 
arg2
 GOSTART     GOSTART,    no        ; recurse to start of pattern
 
-#*Named references (67..69)
-NREF        REF,        no-sv 1 V ; Match some already matched string
-NREFF       REF,        no-sv 1 V ; Match already matched string, folded using 
native charset semantics for non-utf8
-NREFFL      REF,        no-sv 1 V ; Match already matched string, folded in 
loc.
-
-
-#*Special conditionals  (70..72)
+#*Special conditionals
 NGROUPP     NGROUPP,    no-sv 1   ; Whether the group matched.            
 INSUBP      INSUBP,     num 1     ; Whether we are in a specific recurse.  
 DEFINEP     DEFINEP,    none 1    ; Never execute directly.               
@@ -192,16 +196,9 @@ HORIZWS     HORIZWS,    none 0 S  ; horizontal whitespace  
     (Perl 6)
 NHORIZWS    NHORIZWS,   none 0 S  ; not horizontal whitespace   (Perl 6)
 
 FOLDCHAR    FOLDCHAR,   codepoint 1 ; codepoint with tricky case folding 
properties.
-EXACTFU     EXACT,      str        ; Match this string, folded, Unicode 
semantics for non-utf8 (prec. by length).
-
-# These could have been implemented using the FLAGS field of the regnode, but
-# by having a separate node type, we can use the existing switch statement to
-# avoid some tests
-REFFU       REF,        num 1 V   ; Match already matched string, folded using 
unicode semantics for non-utf8
-NREFFU       REF,        num 1 V   ; Match already matched string, folded 
using unicode semantics for non-utf8
 
 
-# NEW STUFF ABOVE THIS LINE  
+# NEW STUFF SOMEWHERE ABOVE THIS LINE
 
 
################################################################################
 
diff --git a/regexec.c b/regexec.c
index 112722e..c1f1ae2 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1645,7 +1645,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, 
char *s,
            );
        case SPACEL:
            REXEC_FBC_CSCAN_TAINT(
-               *s == ' ' || isSPACE_LC_utf8((U8*)s),
+               isSPACE_LC_utf8((U8*)s),
                isSPACE_LC(*s)
            );
        case NSPACE:
@@ -1656,7 +1656,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, 
char *s,
            );
        case NSPACEL:
            REXEC_FBC_CSCAN_TAINT(
-               !(*s == ' ' || isSPACE_LC_utf8((U8*)s)),
+               !isSPACE_LC_utf8((U8*)s),
                !isSPACE_LC(*s)
            );
        case DIGIT:
@@ -6036,7 +6036,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, 
I32 max, int depth)
        if (utf8_target) {
            loceol = PL_regeol;
            while (hardcount < max && scan < loceol &&
-                  (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
+                  isSPACE_LC_utf8((U8*)scan)) {
                scan += UTF8SKIP(scan);
                hardcount++;
            }
@@ -6071,7 +6071,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, 
I32 max, int depth)
        if (utf8_target) {
            loceol = PL_regeol;
            while (hardcount < max && scan < loceol &&
-                  !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) {
+                  !isSPACE_LC_utf8((U8*)scan)) {
                scan += UTF8SKIP(scan);
                hardcount++;
            }
@@ -6094,6 +6094,20 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, 
I32 max, int depth)
                scan++;
        }
        break;
+    case DIGITL:
+       PL_reg_flags |= RF_tainted;
+       if (utf8_target) {
+           loceol = PL_regeol;
+           while (hardcount < max && scan < loceol &&
+                  isDIGIT_LC_utf8((U8*)scan)) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && isDIGIT_LC(*scan))
+               scan++;
+       }
+       break;
     case NDIGIT:
        if (utf8_target) {
            loceol = PL_regeol;
@@ -6107,6 +6121,20 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, 
I32 max, int depth)
            while (scan < loceol && !isDIGIT(*scan))
                scan++;
        }
+    case NDIGITL:
+       PL_reg_flags |= RF_tainted;
+       if (utf8_target) {
+           loceol = PL_regeol;
+           while (hardcount < max && scan < loceol &&
+                  !isDIGIT_LC_utf8((U8*)scan)) {
+               scan += UTF8SKIP(scan);
+               hardcount++;
+           }
+       } else {
+           while (scan < loceol && !isDIGIT_LC(*scan))
+               scan++;
+       }
+       break;
     case LNBREAK:
         if (utf8_target) {
            loceol = PL_regeol;
diff --git a/regnodes.h b/regnodes.h
index 09ab661..35a4cc1 100644
--- a/regnodes.h
+++ b/regnodes.h
@@ -45,62 +45,62 @@
 #define        EXACT                   33      /* 0x21 Match this string 
(preceded by length). */
 #define        EXACTF                  34      /* 0x22 Match this string, 
folded, native charset semantics for non-utf8 (prec. by length). */
 #define        EXACTFL                 35      /* 0x23 Match this string, 
folded in locale (w/len). */
-#define        NOTHING                 36      /* 0x24 Match empty string. */
-#define        TAIL                    37      /* 0x25 Match empty string. Can 
jump here from outside. */
-#define        STAR                    38      /* 0x26 Match this (simple) 
thing 0 or more times. */
-#define        PLUS                    39      /* 0x27 Match this (simple) 
thing 1 or more times. */
-#define        CURLY                   40      /* 0x28 Match this simple thing 
{n,m} times. */
-#define        CURLYN                  41      /* 0x29 Capture next-after-this 
simple thing */
-#define        CURLYM                  42      /* 0x2a Capture this 
medium-complex thing {n,m} times. */
-#define        CURLYX                  43      /* 0x2b Match this complex 
thing {n,m} times. */
-#define        WHILEM                  44      /* 0x2c Do curly processing and 
see if rest matches. */
-#define        OPEN                    45      /* 0x2d Mark this point in 
input as start of */
-#define        CLOSE                   46      /* 0x2e Analogous to OPEN. */
-#define        REF                     47      /* 0x2f Match some already 
matched string */
-#define        REFF                    48      /* 0x30 Match already matched 
string, folded using native charset semantics for non-utf8 */
-#define        REFFL                   49      /* 0x31 Match already matched 
string, folded in loc. */
-#define        IFMATCH                 50      /* 0x32 Succeeds if the 
following matches. */
-#define        UNLESSM                 51      /* 0x33 Fails if the following 
matches. */
-#define        SUSPEND                 52      /* 0x34 "Independent" sub-RE. */
-#define        IFTHEN                  53      /* 0x35 Switch, should be 
preceeded by switcher . */
-#define        GROUPP                  54      /* 0x36 Whether the group 
matched. */
-#define        LONGJMP                 55      /* 0x37 Jump far away. */
-#define        BRANCHJ                 56      /* 0x38 BRANCH with long 
offset. */
-#define        EVAL                    57      /* 0x39 Execute some Perl code. 
*/
-#define        MINMOD                  58      /* 0x3a Next operator is not 
greedy. */
-#define        LOGICAL                 59      /* 0x3b Next opcode should set 
the flag only. */
-#define        RENUM                   60      /* 0x3c Group with 
independently numbered parens. */
-#define        TRIE                    61      /* 0x3d Match many 
EXACT(F[LU]?)? at once. flags==type */
-#define        TRIEC                   62      /* 0x3e Same as TRIE, but with 
embedded charclass data */
-#define        AHOCORASICK             63      /* 0x3f Aho Corasick stclass. 
flags==type */
-#define        AHOCORASICKC            64      /* 0x40 Same as AHOCORASICK, 
but with embedded charclass data */
-#define        GOSUB                   65      /* 0x41 recurse to paren arg1 
at (signed) ofs arg2 */
-#define        GOSTART                 66      /* 0x42 recurse to start of 
pattern */
-#define        NREF                    67      /* 0x43 Match some already 
matched string */
-#define        NREFF                   68      /* 0x44 Match already matched 
string, folded using native charset semantics for non-utf8 */
-#define        NREFFL                  69      /* 0x45 Match already matched 
string, folded in loc. */
-#define        NGROUPP                 70      /* 0x46 Whether the group 
matched. */
-#define        INSUBP                  71      /* 0x47 Whether we are in a 
specific recurse. */
-#define        DEFINEP                 72      /* 0x48 Never execute directly. 
*/
-#define        ENDLIKE                 73      /* 0x49 Used only for the type 
field of verbs */
-#define        OPFAIL                  74      /* 0x4a Same as (?!) */
-#define        ACCEPT                  75      /* 0x4b Accepts the current 
matched string. */
-#define        VERB                    76      /* 0x4c Used only for the type 
field of verbs */
-#define        PRUNE                   77      /* 0x4d Pattern fails at this 
startpoint if no-backtracking through this */
-#define        MARKPOINT               78      /* 0x4e Push the current 
location for rollback by cut. */
-#define        SKIP                    79      /* 0x4f On failure skip forward 
(to the mark) before retrying */
-#define        COMMIT                  80      /* 0x50 Pattern fails outright 
if backtracking through this */
-#define        CUTGROUP                81      /* 0x51 On failure go to the 
next alternation in the group */
-#define        KEEPS                   82      /* 0x52 $& begins here. */
-#define        LNBREAK                 83      /* 0x53 generic newline pattern 
*/
-#define        VERTWS                  84      /* 0x54 vertical whitespace     
    (Perl 6) */
-#define        NVERTWS                 85      /* 0x55 not vertical whitespace 
    (Perl 6) */
-#define        HORIZWS                 86      /* 0x56 horizontal whitespace   
    (Perl 6) */
-#define        NHORIZWS                87      /* 0x57 not horizontal 
whitespace   (Perl 6) */
-#define        FOLDCHAR                88      /* 0x58 codepoint with tricky 
case folding properties. */
-#define        EXACTFU                 89      /* 0x59 Match this string, 
folded, Unicode semantics for non-utf8 (prec. by length). */
-#define        REFFU                   90      /* 0x5a Match already matched 
string, folded using unicode semantics for non-utf8 */
-#define        NREFFU                  91      /* 0x5b Match already matched 
string, folded using unicode semantics for non-utf8 */
+#define        EXACTFU                 36      /* 0x24 Match this string, 
folded, Unicode semantics for non-utf8 (prec. by length). */
+#define        NOTHING                 37      /* 0x25 Match empty string. */
+#define        TAIL                    38      /* 0x26 Match empty string. Can 
jump here from outside. */
+#define        STAR                    39      /* 0x27 Match this (simple) 
thing 0 or more times. */
+#define        PLUS                    40      /* 0x28 Match this (simple) 
thing 1 or more times. */
+#define        CURLY                   41      /* 0x29 Match this simple thing 
{n,m} times. */
+#define        CURLYN                  42      /* 0x2a Capture next-after-this 
simple thing */
+#define        CURLYM                  43      /* 0x2b Capture this 
medium-complex thing {n,m} times. */
+#define        CURLYX                  44      /* 0x2c Match this complex 
thing {n,m} times. */
+#define        WHILEM                  45      /* 0x2d Do curly processing and 
see if rest matches. */
+#define        OPEN                    46      /* 0x2e Mark this point in 
input as start of */
+#define        CLOSE                   47      /* 0x2f Analogous to OPEN. */
+#define        REF                     48      /* 0x30 Match some already 
matched string */
+#define        REFF                    49      /* 0x31 Match already matched 
string, folded using native charset semantics for non-utf8 */
+#define        REFFL                   50      /* 0x32 Match already matched 
string, folded in loc. */
+#define        REFFU                   51      /* 0x33 Match already matched 
string, folded using unicode semantics for non-utf8 */
+#define        NREF                    52      /* 0x34 Match some already 
matched string */
+#define        NREFF                   53      /* 0x35 Match already matched 
string, folded using native charset semantics for non-utf8 */
+#define        NREFFL                  54      /* 0x36 Match already matched 
string, folded in loc. */
+#define        NREFFU                  55      /* 0x37 Match already matched 
string, folded using unicode semantics for non-utf8 */
+#define        IFMATCH                 56      /* 0x38 Succeeds if the 
following matches. */
+#define        UNLESSM                 57      /* 0x39 Fails if the following 
matches. */
+#define        SUSPEND                 58      /* 0x3a "Independent" sub-RE. */
+#define        IFTHEN                  59      /* 0x3b Switch, should be 
preceeded by switcher . */
+#define        GROUPP                  60      /* 0x3c Whether the group 
matched. */
+#define        LONGJMP                 61      /* 0x3d Jump far away. */
+#define        BRANCHJ                 62      /* 0x3e BRANCH with long 
offset. */
+#define        EVAL                    63      /* 0x3f Execute some Perl code. 
*/
+#define        MINMOD                  64      /* 0x40 Next operator is not 
greedy. */
+#define        LOGICAL                 65      /* 0x41 Next opcode should set 
the flag only. */
+#define        RENUM                   66      /* 0x42 Group with 
independently numbered parens. */
+#define        TRIE                    67      /* 0x43 Match many 
EXACT(F[LU]?)? at once. flags==type */
+#define        TRIEC                   68      /* 0x44 Same as TRIE, but with 
embedded charclass data */
+#define        AHOCORASICK             69      /* 0x45 Aho Corasick stclass. 
flags==type */
+#define        AHOCORASICKC            70      /* 0x46 Same as AHOCORASICK, 
but with embedded charclass data */
+#define        GOSUB                   71      /* 0x47 recurse to paren arg1 
at (signed) ofs arg2 */
+#define        GOSTART                 72      /* 0x48 recurse to start of 
pattern */
+#define        NGROUPP                 73      /* 0x49 Whether the group 
matched. */
+#define        INSUBP                  74      /* 0x4a Whether we are in a 
specific recurse. */
+#define        DEFINEP                 75      /* 0x4b Never execute directly. 
*/
+#define        ENDLIKE                 76      /* 0x4c Used only for the type 
field of verbs */
+#define        OPFAIL                  77      /* 0x4d Same as (?!) */
+#define        ACCEPT                  78      /* 0x4e Accepts the current 
matched string. */
+#define        VERB                    79      /* 0x4f Used only for the type 
field of verbs */
+#define        PRUNE                   80      /* 0x50 Pattern fails at this 
startpoint if no-backtracking through this */
+#define        MARKPOINT               81      /* 0x51 Push the current 
location for rollback by cut. */
+#define        SKIP                    82      /* 0x52 On failure skip forward 
(to the mark) before retrying */
+#define        COMMIT                  83      /* 0x53 Pattern fails outright 
if backtracking through this */
+#define        CUTGROUP                84      /* 0x54 On failure go to the 
next alternation in the group */
+#define        KEEPS                   85      /* 0x55 $& begins here. */
+#define        LNBREAK                 86      /* 0x56 generic newline pattern 
*/
+#define        VERTWS                  87      /* 0x57 vertical whitespace     
    (Perl 6) */
+#define        NVERTWS                 88      /* 0x58 not vertical whitespace 
    (Perl 6) */
+#define        HORIZWS                 89      /* 0x59 horizontal whitespace   
    (Perl 6) */
+#define        NHORIZWS                90      /* 0x5a not horizontal 
whitespace   (Perl 6) */
+#define        FOLDCHAR                91      /* 0x5b codepoint with tricky 
case folding properties. */
 #define        OPTIMIZED               92      /* 0x5c Placeholder for dump. */
 #define        PSEUDO                  93      /* 0x5d Pseudo opcode for 
internal use. */
        /* ------------ States ------------- */
@@ -187,6 +187,7 @@ EXTCONST U8 PL_regkind[] = {
        EXACT,          /* EXACT                  */
        EXACT,          /* EXACTF                 */
        EXACT,          /* EXACTFL                */
+       EXACT,          /* EXACTFU                */
        NOTHING,        /* NOTHING                */
        NOTHING,        /* TAIL                   */
        STAR,           /* STAR                   */
@@ -201,6 +202,11 @@ EXTCONST U8 PL_regkind[] = {
        REF,            /* REF                    */
        REF,            /* REFF                   */
        REF,            /* REFFL                  */
+       REF,            /* REFFU                  */
+       REF,            /* NREF                   */
+       REF,            /* NREFF                  */
+       REF,            /* NREFFL                 */
+       REF,            /* NREFFU                 */
        BRANCHJ,        /* IFMATCH                */
        BRANCHJ,        /* UNLESSM                */
        BRANCHJ,        /* SUSPEND                */
@@ -218,9 +224,6 @@ EXTCONST U8 PL_regkind[] = {
        TRIE,           /* AHOCORASICKC           */
        GOSUB,          /* GOSUB                  */
        GOSTART,        /* GOSTART                */
-       REF,            /* NREF                   */
-       REF,            /* NREFF                  */
-       REF,            /* NREFFL                 */
        NGROUPP,        /* NGROUPP                */
        INSUBP,         /* INSUBP                 */
        DEFINEP,        /* DEFINEP                */
@@ -240,9 +243,6 @@ EXTCONST U8 PL_regkind[] = {
        HORIZWS,        /* HORIZWS                */
        NHORIZWS,       /* NHORIZWS               */
        FOLDCHAR,       /* FOLDCHAR               */
-       EXACT,          /* EXACTFU                */
-       REF,            /* REFFU                  */
-       REF,            /* NREFFU                 */
        NOTHING,        /* OPTIMIZED              */
        PSEUDO,         /* PSEUDO                 */
        /* ------------ States ------------- */
@@ -329,6 +329,7 @@ static const U8 regarglen[] = {
        0,                                      /* EXACT        */
        0,                                      /* EXACTF       */
        0,                                      /* EXACTFL      */
+       0,                                      /* EXACTFU      */
        0,                                      /* NOTHING      */
        0,                                      /* TAIL         */
        0,                                      /* STAR         */
@@ -343,6 +344,11 @@ static const U8 regarglen[] = {
        EXTRA_SIZE(struct regnode_1),           /* REF          */
        EXTRA_SIZE(struct regnode_1),           /* REFF         */
        EXTRA_SIZE(struct regnode_1),           /* REFFL        */
+       EXTRA_SIZE(struct regnode_1),           /* REFFU        */
+       EXTRA_SIZE(struct regnode_1),           /* NREF         */
+       EXTRA_SIZE(struct regnode_1),           /* NREFF        */
+       EXTRA_SIZE(struct regnode_1),           /* NREFFL       */
+       EXTRA_SIZE(struct regnode_1),           /* NREFFU       */
        EXTRA_SIZE(struct regnode_1),           /* IFMATCH      */
        EXTRA_SIZE(struct regnode_1),           /* UNLESSM      */
        EXTRA_SIZE(struct regnode_1),           /* SUSPEND      */
@@ -360,9 +366,6 @@ static const U8 regarglen[] = {
        EXTRA_SIZE(struct regnode_charclass),   /* AHOCORASICKC */
        EXTRA_SIZE(struct regnode_2L),          /* GOSUB        */
        0,                                      /* GOSTART      */
-       EXTRA_SIZE(struct regnode_1),           /* NREF         */
-       EXTRA_SIZE(struct regnode_1),           /* NREFF        */
-       EXTRA_SIZE(struct regnode_1),           /* NREFFL       */
        EXTRA_SIZE(struct regnode_1),           /* NGROUPP      */
        EXTRA_SIZE(struct regnode_1),           /* INSUBP       */
        EXTRA_SIZE(struct regnode_1),           /* DEFINEP      */
@@ -382,9 +385,6 @@ static const U8 regarglen[] = {
        0,                                      /* HORIZWS      */
        0,                                      /* NHORIZWS     */
        EXTRA_SIZE(struct regnode_1),           /* FOLDCHAR     */
-       0,                                      /* EXACTFU      */
-       EXTRA_SIZE(struct regnode_1),           /* REFFU        */
-       EXTRA_SIZE(struct regnode_1),           /* NREFFU       */
        0,                                      /* OPTIMIZED    */
        0,                                      /* PSEUDO       */
 };
@@ -428,6 +428,7 @@ static const char reg_off_by_arg[] = {
        0,      /* EXACT        */
        0,      /* EXACTF       */
        0,      /* EXACTFL      */
+       0,      /* EXACTFU      */
        0,      /* NOTHING      */
        0,      /* TAIL         */
        0,      /* STAR         */
@@ -442,6 +443,11 @@ static const char reg_off_by_arg[] = {
        0,      /* REF          */
        0,      /* REFF         */
        0,      /* REFFL        */
+       0,      /* REFFU        */
+       0,      /* NREF         */
+       0,      /* NREFF        */
+       0,      /* NREFFL       */
+       0,      /* NREFFU       */
        2,      /* IFMATCH      */
        2,      /* UNLESSM      */
        1,      /* SUSPEND      */
@@ -459,9 +465,6 @@ static const char reg_off_by_arg[] = {
        0,      /* AHOCORASICKC */
        0,      /* GOSUB        */
        0,      /* GOSTART      */
-       0,      /* NREF         */
-       0,      /* NREFF        */
-       0,      /* NREFFL       */
        0,      /* NGROUPP      */
        0,      /* INSUBP       */
        0,      /* DEFINEP      */
@@ -481,9 +484,6 @@ static const char reg_off_by_arg[] = {
        0,      /* HORIZWS      */
        0,      /* NHORIZWS     */
        0,      /* FOLDCHAR     */
-       0,      /* EXACTFU      */
-       0,      /* REFFU        */
-       0,      /* NREFFU       */
        0,      /* OPTIMIZED    */
        0,      /* PSEUDO       */
 };
@@ -532,62 +532,62 @@ EXTCONST char * const PL_reg_name[] = {
        "EXACT",                        /* 0x21 */
        "EXACTF",                       /* 0x22 */
        "EXACTFL",                      /* 0x23 */
-       "NOTHING",                      /* 0x24 */
-       "TAIL",                         /* 0x25 */
-       "STAR",                         /* 0x26 */
-       "PLUS",                         /* 0x27 */
-       "CURLY",                        /* 0x28 */
-       "CURLYN",                       /* 0x29 */
-       "CURLYM",                       /* 0x2a */
-       "CURLYX",                       /* 0x2b */
-       "WHILEM",                       /* 0x2c */
-       "OPEN",                         /* 0x2d */
-       "CLOSE",                        /* 0x2e */
-       "REF",                          /* 0x2f */
-       "REFF",                         /* 0x30 */
-       "REFFL",                        /* 0x31 */
-       "IFMATCH",                      /* 0x32 */
-       "UNLESSM",                      /* 0x33 */
-       "SUSPEND",                      /* 0x34 */
-       "IFTHEN",                       /* 0x35 */
-       "GROUPP",                       /* 0x36 */
-       "LONGJMP",                      /* 0x37 */
-       "BRANCHJ",                      /* 0x38 */
-       "EVAL",                         /* 0x39 */
-       "MINMOD",                       /* 0x3a */
-       "LOGICAL",                      /* 0x3b */
-       "RENUM",                        /* 0x3c */
-       "TRIE",                         /* 0x3d */
-       "TRIEC",                        /* 0x3e */
-       "AHOCORASICK",                  /* 0x3f */
-       "AHOCORASICKC",                 /* 0x40 */
-       "GOSUB",                        /* 0x41 */
-       "GOSTART",                      /* 0x42 */
-       "NREF",                         /* 0x43 */
-       "NREFF",                        /* 0x44 */
-       "NREFFL",                       /* 0x45 */
-       "NGROUPP",                      /* 0x46 */
-       "INSUBP",                       /* 0x47 */
-       "DEFINEP",                      /* 0x48 */
-       "ENDLIKE",                      /* 0x49 */
-       "OPFAIL",                       /* 0x4a */
-       "ACCEPT",                       /* 0x4b */
-       "VERB",                         /* 0x4c */
-       "PRUNE",                        /* 0x4d */
-       "MARKPOINT",                    /* 0x4e */
-       "SKIP",                         /* 0x4f */
-       "COMMIT",                       /* 0x50 */
-       "CUTGROUP",                     /* 0x51 */
-       "KEEPS",                        /* 0x52 */
-       "LNBREAK",                      /* 0x53 */
-       "VERTWS",                       /* 0x54 */
-       "NVERTWS",                      /* 0x55 */
-       "HORIZWS",                      /* 0x56 */
-       "NHORIZWS",                     /* 0x57 */
-       "FOLDCHAR",                     /* 0x58 */
-       "EXACTFU",                      /* 0x59 */
-       "REFFU",                        /* 0x5a */
-       "NREFFU",                       /* 0x5b */
+       "EXACTFU",                      /* 0x24 */
+       "NOTHING",                      /* 0x25 */
+       "TAIL",                         /* 0x26 */
+       "STAR",                         /* 0x27 */
+       "PLUS",                         /* 0x28 */
+       "CURLY",                        /* 0x29 */
+       "CURLYN",                       /* 0x2a */
+       "CURLYM",                       /* 0x2b */
+       "CURLYX",                       /* 0x2c */
+       "WHILEM",                       /* 0x2d */
+       "OPEN",                         /* 0x2e */
+       "CLOSE",                        /* 0x2f */
+       "REF",                          /* 0x30 */
+       "REFF",                         /* 0x31 */
+       "REFFL",                        /* 0x32 */
+       "REFFU",                        /* 0x33 */
+       "NREF",                         /* 0x34 */
+       "NREFF",                        /* 0x35 */
+       "NREFFL",                       /* 0x36 */
+       "NREFFU",                       /* 0x37 */
+       "IFMATCH",                      /* 0x38 */
+       "UNLESSM",                      /* 0x39 */
+       "SUSPEND",                      /* 0x3a */
+       "IFTHEN",                       /* 0x3b */
+       "GROUPP",                       /* 0x3c */
+       "LONGJMP",                      /* 0x3d */
+       "BRANCHJ",                      /* 0x3e */
+       "EVAL",                         /* 0x3f */
+       "MINMOD",                       /* 0x40 */
+       "LOGICAL",                      /* 0x41 */
+       "RENUM",                        /* 0x42 */
+       "TRIE",                         /* 0x43 */
+       "TRIEC",                        /* 0x44 */
+       "AHOCORASICK",                  /* 0x45 */
+       "AHOCORASICKC",                 /* 0x46 */
+       "GOSUB",                        /* 0x47 */
+       "GOSTART",                      /* 0x48 */
+       "NGROUPP",                      /* 0x49 */
+       "INSUBP",                       /* 0x4a */
+       "DEFINEP",                      /* 0x4b */
+       "ENDLIKE",                      /* 0x4c */
+       "OPFAIL",                       /* 0x4d */
+       "ACCEPT",                       /* 0x4e */
+       "VERB",                         /* 0x4f */
+       "PRUNE",                        /* 0x50 */
+       "MARKPOINT",                    /* 0x51 */
+       "SKIP",                         /* 0x52 */
+       "COMMIT",                       /* 0x53 */
+       "CUTGROUP",                     /* 0x54 */
+       "KEEPS",                        /* 0x55 */
+       "LNBREAK",                      /* 0x56 */
+       "VERTWS",                       /* 0x57 */
+       "NVERTWS",                      /* 0x58 */
+       "HORIZWS",                      /* 0x59 */
+       "NHORIZWS",                     /* 0x5a */
+       "FOLDCHAR",                     /* 0x5b */
        "OPTIMIZED",                    /* 0x5c */
        "PSEUDO",                       /* 0x5d */
        /* ------------ States ------------- */
@@ -684,8 +684,8 @@ EXTCONST U8 PL_varies[] __attribute__deprecated__;
 #else
 EXTCONST U8 PL_varies[] __attribute__deprecated__ = {
     CLUMP, BRANCH, BACK, STAR, PLUS, CURLY, CURLYN, CURLYM, CURLYX, WHILEM,
-    REF, REFF, REFFL, SUSPEND, IFTHEN, BRANCHJ, NREF, NREFF, NREFFL, REFFU,
-    NREFFU,
+    REF, REFF, REFFL, REFFU, NREF, NREFF, NREFFL, NREFFU, SUSPEND, IFTHEN,
+    BRANCHJ,
     0
 };
 #endif /* DOINIT */
@@ -694,7 +694,7 @@ EXTCONST U8 PL_varies[] __attribute__deprecated__ = {
 EXTCONST U8 PL_varies_bitmask[];
 #else
 EXTCONST U8 PL_varies_bitmask[] = {
-    0x00, 0x00, 0x00, 0xC0, 0xC1, 0x9F, 0x33, 0x01, 0x38, 0x00, 0x00, 0x0C
+    0x00, 0x00, 0x00, 0xC0, 0x81, 0x3F, 0xFF, 0x4C, 0x00, 0x00, 0x00, 0x00
 };
 #endif /* DOINIT */
 
@@ -707,8 +707,8 @@ EXTCONST U8 PL_simple[] __attribute__deprecated__;
 #else
 EXTCONST U8 PL_simple[] __attribute__deprecated__ = {
     REG_ANY, SANY, CANY, ANYOF, ALNUM, ALNUML, NALNUM, NALNUML, SPACE,
-    SPACEL, NSPACE, NSPACEL, DIGIT, NDIGIT, VERTWS, NVERTWS, HORIZWS,
-    NHORIZWS,
+    SPACEL, NSPACE, NSPACEL, DIGIT, DIGITL, NDIGIT, NDIGITL, VERTWS,
+    NVERTWS, HORIZWS, NHORIZWS,
     0
 };
 #endif /* DOINIT */
@@ -717,7 +717,7 @@ EXTCONST U8 PL_simple[] __attribute__deprecated__ = {
 EXTCONST U8 PL_simple_bitmask[];
 #else
 EXTCONST U8 PL_simple_bitmask[] = {
-    0x00, 0xC0, 0xFF, 0x17, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x00
+    0x00, 0xC0, 0xFF, 0x3F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x07
 };
 #endif /* DOINIT */
 
diff --git a/t/uni/fold.t b/t/uni/fold.t
index f6f467c..0f71c80 100644
--- a/t/uni/fold.t
+++ b/t/uni/fold.t
@@ -38,12 +38,13 @@ if (open(CF, $CF)) {
        my $b = pack("U0U*", map { hex } split " ", $mapping);
        my $t0 = ":$a:" =~ /:$a:/    ? 1 : 0;
        my $t1 = ":$a:" =~ /:$a:/i   ? 1 : 0;
-       my $t2 = ":$a:" =~ /:[$a]:/  ? 1 : 0;
-       my $t3 = ":$a:" =~ /:[$a]:/i ? 1 : 0;
+       my $t2 = ":$a:" =~ /:[_$a]:/  ? 1 : 0; # Two chars in [] so doesn't get
+                                               # optimized to a non-charclass
+       my $t3 = ":$a:" =~ /:[_$a]:/i ? 1 : 0;
        my $t4 = ":$a:" =~ /:$b:/i   ? 1 : 0;
-       my $t5 = ":$a:" =~ /:[$b]:/i ? 1 : 0;
+       my $t5 = ":$a:" =~ /:[_$b]:/i ? 1 : 0;
        my $t6 = ":$b:" =~ /:$a:/i   ? 1 : 0;
-       my $t7 = ":$b:" =~ /:[$a]:/i ? 1 : 0;
+       my $t7 = ":$b:" =~ /:[_$a]:/i ? 1 : 0;
        print $t0 && $t1 && $t2 && $t3 && $t4 && $t5 && $t6 && $t7 ?
            "ok $i \# - $code - $name - $mapping - $status\n" :
            "not ok $i \# - $code - $name - $mapping - $status - $t0 $t1 $t2 $t3 $t4 $t5 $t6 $t7\n";
diff --git a/toke.c b/toke.c
index aa1f57c..12359e0 100644
--- a/toke.c
+++ b/toke.c
@@ -3024,9 +3024,9 @@ S_scan_const(pTHX_ char *start)
                 * no-op except on utfebcdic variant characters.  Every
                 * character generated by this that would normally need to be
                 * enclosed by this macro is invariant, so the macro is not
-                * needed, and would complicate use of copy(). There are other
-                * parts of this file where the macro is used inconsistently,
-                * but are saved by it being a no-op */
+                * needed, and would complicate use of copy().  XXX There are
+                * other parts of this file where the macro is used
+                * inconsistently, but are saved by it being a no-op */
 
                /* The structure of this section of code (besides checking for
                 * errors and upgrading to utf8) is:
@@ -3298,7 +3298,7 @@ S_scan_const(pTHX_ char *start)
                            if (UTF8_IS_INVARIANT(*i)) {
                                if (! isALPHAU(*i)) problematic = TRUE;
                            } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) {
-                               if (! isALPHAU(UNI_TO_NATIVE(UTF8_ACCUMULATE(*i,
+                               if (! isALPHAU(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*i,
                                                                            *(i+1)))))
                                {
                                    problematic = TRUE;
@@ -3314,7 +3314,7 @@ S_scan_const(pTHX_ char *start)
                                    continue;
                                } else if (isCHARNAME_CONT(
                                            UNI_TO_NATIVE(
-                                           UTF8_ACCUMULATE(*i, *(i+1)))))
+                                           TWO_BYTE_UTF8_TO_UNI(*i, *(i+1)))))
                                {
                                    continue;
                                }

--
Perl5 Master Repository

Reply via email to