In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/6bbba9040c7840209170b2ff9a1d7b03ae1cbdc1?hp=ec192197b904194f8a514368e774522e9b83add8>
- Log ----------------------------------------------------------------- commit 6bbba9040c7840209170b2ff9a1d7b03ae1cbdc1 Author: Karl Williamson <[email protected]> Date: Mon Dec 6 12:16:24 2010 -0700 regexec.c: Fix locale and \s The handling for locale \s and \S both assume that the character in ASCII platforms at 0x20 is a space. This is not necessarily so. I'm guessing that the code was originally just copied and pasted from the non-locale space handling code without thinking. That code hard-coded in the space character, probably to avoid an expensive swash fetch for a common situation. M regexec.c commit b77393f6288f64bf00f41fef15da0fac4085bfd2 Author: Karl Williamson <[email protected]> Date: Mon Dec 6 12:01:22 2010 -0700 regexec.c: Add missing handlers for locale \d regexec.c had some code to handle \d under locales, but not everywhere. M regexec.c commit 28b5d7bf98b62fd30fb98fcdb5c701b1b2acdd8f Author: Karl Williamson <[email protected]> Date: Mon Dec 6 11:56:49 2010 -0700 regcomp.sym: Correct DIGITL, NDIGITL entries These were missing that they were simple (matching exactly 1 character) and have 0 regnode arguments M regcomp.sym M regnodes.h commit 6ab9ea91fb04390bf9c50134beadab7cf6fd0c25 Author: Karl Williamson <[email protected]> Date: Mon Dec 6 08:35:17 2010 -0700 regcomp.c: Add locale for \d The DIGITL and NDIGITL regnodes were not being generated; instead regular DIGIT and NDIGIT regnodes were generated even under locale. This means no one has probably ever used Perl on a locale that changed the digits. M regcomp.c commit 81c14aa2230ca380c2f424e69ac8f9dc0bb4ae23 Author: Karl Williamson <[email protected]> Date: Sun Dec 5 13:10:13 2010 -0700 toke.c: Fix EBCDIC problem Commit 356979f4a7d780fd67a92a9ca6c8659bd12e7168 failed to include two instances in toke.c that needed the same treatment, i.e., converting properly from I8 to native. 
M toke.c commit 7538f7248145d82e23e430518cf41f4da91e8fdd Author: Karl Williamson <[email protected]> Date: Sun Dec 5 13:09:43 2010 -0700 toke.c: highlight problematic-mentioning comment M toke.c commit 4d252e65989d9d34bc759c890186b40e14559420 Author: Karl Williamson <[email protected]> Date: Sun Dec 5 13:08:33 2010 -0700 uni/fold.t: Prevent [] from being optimized out This test hasn't been testing bracketed char classes because a single character class gets optimized out. M t/uni/fold.t commit 9d6ecd7a330475b012bff918b30af2834aec6ea4 Author: Karl Williamson <[email protected]> Date: Sun Dec 5 13:07:13 2010 -0700 regcomp.c: Revert to using regcomp.sym order Now that the new nodes are grouped properly, we can use the fact that the named backreferences all come after all the numbered backreferences, as had been there before. M regcomp.c commit 01f98ec2b0828a07c073b6eef8c4942f61e69e13 Author: Karl Williamson <[email protected]> Date: Sun Dec 5 12:28:21 2010 -0700 regcomp.sym: Re-order for better grouping The recently added regnodes are moved to their respective equivalence classes, and the named backreferences are moved to just after the numbered backreferences M regcomp.sym M regnodes.h commit 381b57db967e0e071e69852c5b3297178de329ae Author: Karl Williamson <[email protected]> Date: Sat Dec 4 20:48:24 2010 -0700 regcomp.sym: Remove misleading comments Yves informed me that in spite of the comments giving precise node numbers, those numbers can change, so new nodes can be mixed in with their kin. 
Remove those comments M regcomp.sym ----------------------------------------------------------------------- Summary of changes: regcomp.c | 14 +++- regcomp.sym | 61 ++++++------- regexec.c | 36 +++++++- regnodes.h | 272 +++++++++++++++++++++++++++++----------------------------- t/uni/fold.t | 9 +- toke.c | 10 +- 6 files changed, 218 insertions(+), 184 deletions(-) diff --git a/regcomp.c b/regcomp.c index 60fef55..4fb8c37 100644 --- a/regcomp.c +++ b/regcomp.c @@ -7445,11 +7445,19 @@ tryagain: *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'd': - ret = reg_node(pRExC_state, DIGIT); + if (LOC) { + ret = reg_node(pRExC_state, (U8)(DIGITL)); + } else { + ret = reg_node(pRExC_state, (U8)(DIGIT)); + } *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'D': - ret = reg_node(pRExC_state, NDIGIT); + if (LOC) { + ret = reg_node(pRExC_state, (U8)(NDIGITL)); + } else { + ret = reg_node(pRExC_state, (U8)(NDIGIT)); + } *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'R': @@ -9607,7 +9615,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) { Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ if ( RXp_PAREN_NAMES(prog) ) { - if ( k != REF || (OP(o) != NREF && OP(o) != NREFF && OP(o) != NREFFL && OP(o) != NREFFU)) { + if ( k != REF || (OP(o) < NREF)) { AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); SV **name= av_fetch(list, ARG(o), 0 ); if (name) diff --git a/regcomp.sym b/regcomp.sym index 4e787a7..707da08 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -16,12 +16,12 @@ -#* Exit points (0,1) +#* Exit points END END, no ; End of program. SUCCEED END, no ; Return from a subroutine, basically. -#* Anchors: (2..13) +#* Anchors: BOL BOL, no ; Match "" at beginning of line. MBOL BOL, no ; Same, assuming multiline. 
@@ -36,7 +36,7 @@ NBOUND NBOUND, no ; Match "" at any word non-boundary NBOUNDL NBOUND, no ; Match "" at any word non-boundary GPOS GPOS, no ; Matches where last m//g left off. -#* [Special] alternatives: (14..30) +#* [Special] alternatives: REG_ANY REG_ANY, no 0 S ; Match any one character (except newline). SANY REG_ANY, no 0 S ; Match any one character. @@ -51,12 +51,12 @@ SPACEL SPACE, no 0 S ; Match any whitespace char in locale NSPACE NSPACE, no 0 S ; Match any non-whitespace character NSPACEL NSPACE, no 0 S ; Match any non-whitespace char in locale DIGIT DIGIT, no 0 S ; Match any numeric character -DIGITL DIGIT, no ; Match any numeric character in locale +DIGITL DIGIT, no 0 S ; Match any numeric character in locale NDIGIT NDIGIT, no 0 S ; Match any non-numeric character -NDIGITL NDIGIT, no ; Match any non-numeric character in locale +NDIGITL NDIGIT, no 0 S ; Match any non-numeric character in locale CLUMP CLUMP, no 0 V ; Match any extended grapheme cluster sequence -#* Alternation (31) +#* Alternation # BRANCH The set of branches constituting a single choice are hooked # together with their "next" pointers, since precedence prevents @@ -68,26 +68,27 @@ CLUMP CLUMP, no 0 V ; Match any extended grapheme cluster sequence # BRANCH BRANCH, node 0 V ; Match this alternative, or the next... -#*Back pointer (32) +#*Back pointer # BACK Normal "next" pointers all implicitly point forward; BACK # exists to make loop structures possible. # not used BACK BACK, no 0 V ; Match "", "next" ptr points backward. -#*Literals (33..35) +#*Literals EXACT EXACT, str ; Match this string (preceded by length). EXACTF EXACT, str ; Match this string, folded, native charset semantics for non-utf8 (prec. by length). EXACTFL EXACT, str ; Match this string, folded in locale (w/len). +EXACTFU EXACT, str ; Match this string, folded, Unicode semantics for non-utf8 (prec. by length). -#*Do nothing types (36..37) +#*Do nothing types NOTHING NOTHING, no ; Match empty string. 
# A variant of above which delimits a group, thus stops optimizations TAIL NOTHING, no ; Match empty string. Can jump here from outside. -#*Loops (38..44) +#*Loops # STAR,PLUS '?', and complex '*' and '+', are implemented as circular # BRANCH structures using BACK. Simple cases (one character @@ -105,7 +106,7 @@ CURLYX CURLY, sv 2 V ; Match this complex thing {n,m} times. # This terminator creates a loop structure for CURLYX WHILEM WHILEM, no 0 V ; Do curly processing and see if rest matches. -#*Buffer related (45..49) +#*Buffer related # OPEN,CLOSE,GROUPP ...are numbered at compile time. OPEN OPEN, num 1 ; Mark this point in input as start of #n. @@ -114,7 +115,16 @@ CLOSE CLOSE, num 1 ; Analogous to OPEN. REF REF, num 1 V ; Match some already matched string REFF REF, num 1 V ; Match already matched string, folded using native charset semantics for non-utf8 REFFL REF, num 1 V ; Match already matched string, folded in loc. +# REFFU and NREFFU could have been implemented using the FLAGS field of the +# regnode, but by having a separate node type, we can use the existing switch +# statement to avoid some tests +REFFU REF, num 1 V ; Match already matched string, folded using unicode semantics for non-utf8 +#*Named references. Code in regcomp.c assumes that these all are after the numbered references +NREF REF, no-sv 1 V ; Match some already matched string +NREFF REF, no-sv 1 V ; Match already matched string, folded using native charset semantics for non-utf8 +NREFFL REF, no-sv 1 V ; Match already matched string, folded in loc. +NREFFU REF, num 1 V ; Match already matched string, folded using unicode semantics for non-utf8 IFMATCH BRANCHJ, off 1 . 2 ; Succeeds if the following matches. UNLESSM BRANCHJ, off 1 . 2 ; Fails if the following matches. @@ -122,24 +132,24 @@ SUSPEND BRANCHJ, off 1 V 1 ; "Independent" sub-RE. IFTHEN BRANCHJ, off 1 V 1 ; Switch, should be preceeded by switcher . GROUPP GROUPP, num 1 ; Whether the group matched. 
-#*Support for long RE (55..56) +#*Support for long RE LONGJMP LONGJMP, off 1 . 1 ; Jump far away. BRANCHJ BRANCHJ, off 1 V 1 ; BRANCH with long offset. -#*The heavy worker (57) +#*The heavy worker EVAL EVAL, evl 1 ; Execute some Perl code. -#*Modifiers (58..59) +#*Modifiers MINMOD MINMOD, no ; Next operator is not greedy. LOGICAL LOGICAL, no ; Next opcode should set the flag only. -# This is not used yet (60) +# This is not used yet RENUM BRANCHJ, off 1 . 1 ; Group with independently numbered parens. -#*Trie Related (61..62) +#*Trie Related # Behave the same as A|LIST|OF|WORDS would. The '..C' variants have # inline charclass data (ascii only), the 'C' store it in the structure. @@ -152,17 +162,11 @@ TRIEC TRIE,trie charclass ; Same as TRIE, but with embedded charclass da AHOCORASICK TRIE, trie 1 ; Aho Corasick stclass. flags==type AHOCORASICKC TRIE,trie charclass ; Same as AHOCORASICK, but with embedded charclass data -#*Regex Subroutines (65..66) +#*Regex Subroutines GOSUB GOSUB, num/ofs 2L ; recurse to paren arg1 at (signed) ofs arg2 GOSTART GOSTART, no ; recurse to start of pattern -#*Named references (67..69) -NREF REF, no-sv 1 V ; Match some already matched string -NREFF REF, no-sv 1 V ; Match already matched string, folded using native charset semantics for non-utf8 -NREFFL REF, no-sv 1 V ; Match already matched string, folded in loc. - - -#*Special conditionals (70..72) +#*Special conditionals NGROUPP NGROUPP, no-sv 1 ; Whether the group matched. INSUBP INSUBP, num 1 ; Whether we are in a specific recurse. DEFINEP DEFINEP, none 1 ; Never execute directly. @@ -192,16 +196,9 @@ HORIZWS HORIZWS, none 0 S ; horizontal whitespace (Perl 6) NHORIZWS NHORIZWS, none 0 S ; not horizontal whitespace (Perl 6) FOLDCHAR FOLDCHAR, codepoint 1 ; codepoint with tricky case folding properties. -EXACTFU EXACT, str ; Match this string, folded, Unicode semantics for non-utf8 (prec. by length). 
- -# These could have been implemented using the FLAGS field of the regnode, but -# by having a separate node type, we can use the existing switch statement to -# avoid some tests -REFFU REF, num 1 V ; Match already matched string, folded using unicode semantics for non-utf8 -NREFFU REF, num 1 V ; Match already matched string, folded using unicode semantics for non-utf8 -# NEW STUFF ABOVE THIS LINE +# NEW STUFF SOMEWHERE ABOVE THIS LINE ################################################################################ diff --git a/regexec.c b/regexec.c index 112722e..c1f1ae2 100644 --- a/regexec.c +++ b/regexec.c @@ -1645,7 +1645,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, ); case SPACEL: REXEC_FBC_CSCAN_TAINT( - *s == ' ' || isSPACE_LC_utf8((U8*)s), + isSPACE_LC_utf8((U8*)s), isSPACE_LC(*s) ); case NSPACE: @@ -1656,7 +1656,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, ); case NSPACEL: REXEC_FBC_CSCAN_TAINT( - !(*s == ' ' || isSPACE_LC_utf8((U8*)s)), + !isSPACE_LC_utf8((U8*)s), !isSPACE_LC(*s) ); case DIGIT: @@ -6036,7 +6036,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) if (utf8_target) { loceol = PL_regeol; while (hardcount < max && scan < loceol && - (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) { + isSPACE_LC_utf8((U8*)scan)) { scan += UTF8SKIP(scan); hardcount++; } @@ -6071,7 +6071,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) if (utf8_target) { loceol = PL_regeol; while (hardcount < max && scan < loceol && - !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) { + !isSPACE_LC_utf8((U8*)scan)) { scan += UTF8SKIP(scan); hardcount++; } @@ -6094,6 +6094,20 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) scan++; } break; + case DIGITL: + PL_reg_flags |= RF_tainted; + if (utf8_target) { + loceol = PL_regeol; + while (hardcount < max && scan < loceol && + isDIGIT_LC_utf8((U8*)scan)) { + scan += UTF8SKIP(scan); + hardcount++; + } + 
} else { + while (scan < loceol && isDIGIT_LC(*scan)) + scan++; + } + break; case NDIGIT: if (utf8_target) { loceol = PL_regeol; @@ -6107,6 +6121,20 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) while (scan < loceol && !isDIGIT(*scan)) scan++; } + case NDIGITL: + PL_reg_flags |= RF_tainted; + if (utf8_target) { + loceol = PL_regeol; + while (hardcount < max && scan < loceol && + !isDIGIT_LC_utf8((U8*)scan)) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && !isDIGIT_LC(*scan)) + scan++; + } + break; case LNBREAK: if (utf8_target) { loceol = PL_regeol; diff --git a/regnodes.h b/regnodes.h index 09ab661..35a4cc1 100644 --- a/regnodes.h +++ b/regnodes.h @@ -45,62 +45,62 @@ #define EXACT 33 /* 0x21 Match this string (preceded by length). */ #define EXACTF 34 /* 0x22 Match this string, folded, native charset semantics for non-utf8 (prec. by length). */ #define EXACTFL 35 /* 0x23 Match this string, folded in locale (w/len). */ -#define NOTHING 36 /* 0x24 Match empty string. */ -#define TAIL 37 /* 0x25 Match empty string. Can jump here from outside. */ -#define STAR 38 /* 0x26 Match this (simple) thing 0 or more times. */ -#define PLUS 39 /* 0x27 Match this (simple) thing 1 or more times. */ -#define CURLY 40 /* 0x28 Match this simple thing {n,m} times. */ -#define CURLYN 41 /* 0x29 Capture next-after-this simple thing */ -#define CURLYM 42 /* 0x2a Capture this medium-complex thing {n,m} times. */ -#define CURLYX 43 /* 0x2b Match this complex thing {n,m} times. */ -#define WHILEM 44 /* 0x2c Do curly processing and see if rest matches. */ -#define OPEN 45 /* 0x2d Mark this point in input as start of */ -#define CLOSE 46 /* 0x2e Analogous to OPEN. */ -#define REF 47 /* 0x2f Match some already matched string */ -#define REFF 48 /* 0x30 Match already matched string, folded using native charset semantics for non-utf8 */ -#define REFFL 49 /* 0x31 Match already matched string, folded in loc. 
*/ -#define IFMATCH 50 /* 0x32 Succeeds if the following matches. */ -#define UNLESSM 51 /* 0x33 Fails if the following matches. */ -#define SUSPEND 52 /* 0x34 "Independent" sub-RE. */ -#define IFTHEN 53 /* 0x35 Switch, should be preceeded by switcher . */ -#define GROUPP 54 /* 0x36 Whether the group matched. */ -#define LONGJMP 55 /* 0x37 Jump far away. */ -#define BRANCHJ 56 /* 0x38 BRANCH with long offset. */ -#define EVAL 57 /* 0x39 Execute some Perl code. */ -#define MINMOD 58 /* 0x3a Next operator is not greedy. */ -#define LOGICAL 59 /* 0x3b Next opcode should set the flag only. */ -#define RENUM 60 /* 0x3c Group with independently numbered parens. */ -#define TRIE 61 /* 0x3d Match many EXACT(F[LU]?)? at once. flags==type */ -#define TRIEC 62 /* 0x3e Same as TRIE, but with embedded charclass data */ -#define AHOCORASICK 63 /* 0x3f Aho Corasick stclass. flags==type */ -#define AHOCORASICKC 64 /* 0x40 Same as AHOCORASICK, but with embedded charclass data */ -#define GOSUB 65 /* 0x41 recurse to paren arg1 at (signed) ofs arg2 */ -#define GOSTART 66 /* 0x42 recurse to start of pattern */ -#define NREF 67 /* 0x43 Match some already matched string */ -#define NREFF 68 /* 0x44 Match already matched string, folded using native charset semantics for non-utf8 */ -#define NREFFL 69 /* 0x45 Match already matched string, folded in loc. */ -#define NGROUPP 70 /* 0x46 Whether the group matched. */ -#define INSUBP 71 /* 0x47 Whether we are in a specific recurse. */ -#define DEFINEP 72 /* 0x48 Never execute directly. */ -#define ENDLIKE 73 /* 0x49 Used only for the type field of verbs */ -#define OPFAIL 74 /* 0x4a Same as (?!) */ -#define ACCEPT 75 /* 0x4b Accepts the current matched string. */ -#define VERB 76 /* 0x4c Used only for the type field of verbs */ -#define PRUNE 77 /* 0x4d Pattern fails at this startpoint if no-backtracking through this */ -#define MARKPOINT 78 /* 0x4e Push the current location for rollback by cut. 
*/ -#define SKIP 79 /* 0x4f On failure skip forward (to the mark) before retrying */ -#define COMMIT 80 /* 0x50 Pattern fails outright if backtracking through this */ -#define CUTGROUP 81 /* 0x51 On failure go to the next alternation in the group */ -#define KEEPS 82 /* 0x52 $& begins here. */ -#define LNBREAK 83 /* 0x53 generic newline pattern */ -#define VERTWS 84 /* 0x54 vertical whitespace (Perl 6) */ -#define NVERTWS 85 /* 0x55 not vertical whitespace (Perl 6) */ -#define HORIZWS 86 /* 0x56 horizontal whitespace (Perl 6) */ -#define NHORIZWS 87 /* 0x57 not horizontal whitespace (Perl 6) */ -#define FOLDCHAR 88 /* 0x58 codepoint with tricky case folding properties. */ -#define EXACTFU 89 /* 0x59 Match this string, folded, Unicode semantics for non-utf8 (prec. by length). */ -#define REFFU 90 /* 0x5a Match already matched string, folded using unicode semantics for non-utf8 */ -#define NREFFU 91 /* 0x5b Match already matched string, folded using unicode semantics for non-utf8 */ +#define EXACTFU 36 /* 0x24 Match this string, folded, Unicode semantics for non-utf8 (prec. by length). */ +#define NOTHING 37 /* 0x25 Match empty string. */ +#define TAIL 38 /* 0x26 Match empty string. Can jump here from outside. */ +#define STAR 39 /* 0x27 Match this (simple) thing 0 or more times. */ +#define PLUS 40 /* 0x28 Match this (simple) thing 1 or more times. */ +#define CURLY 41 /* 0x29 Match this simple thing {n,m} times. */ +#define CURLYN 42 /* 0x2a Capture next-after-this simple thing */ +#define CURLYM 43 /* 0x2b Capture this medium-complex thing {n,m} times. */ +#define CURLYX 44 /* 0x2c Match this complex thing {n,m} times. */ +#define WHILEM 45 /* 0x2d Do curly processing and see if rest matches. */ +#define OPEN 46 /* 0x2e Mark this point in input as start of */ +#define CLOSE 47 /* 0x2f Analogous to OPEN. 
*/ +#define REF 48 /* 0x30 Match some already matched string */ +#define REFF 49 /* 0x31 Match already matched string, folded using native charset semantics for non-utf8 */ +#define REFFL 50 /* 0x32 Match already matched string, folded in loc. */ +#define REFFU 51 /* 0x33 Match already matched string, folded using unicode semantics for non-utf8 */ +#define NREF 52 /* 0x34 Match some already matched string */ +#define NREFF 53 /* 0x35 Match already matched string, folded using native charset semantics for non-utf8 */ +#define NREFFL 54 /* 0x36 Match already matched string, folded in loc. */ +#define NREFFU 55 /* 0x37 Match already matched string, folded using unicode semantics for non-utf8 */ +#define IFMATCH 56 /* 0x38 Succeeds if the following matches. */ +#define UNLESSM 57 /* 0x39 Fails if the following matches. */ +#define SUSPEND 58 /* 0x3a "Independent" sub-RE. */ +#define IFTHEN 59 /* 0x3b Switch, should be preceeded by switcher . */ +#define GROUPP 60 /* 0x3c Whether the group matched. */ +#define LONGJMP 61 /* 0x3d Jump far away. */ +#define BRANCHJ 62 /* 0x3e BRANCH with long offset. */ +#define EVAL 63 /* 0x3f Execute some Perl code. */ +#define MINMOD 64 /* 0x40 Next operator is not greedy. */ +#define LOGICAL 65 /* 0x41 Next opcode should set the flag only. */ +#define RENUM 66 /* 0x42 Group with independently numbered parens. */ +#define TRIE 67 /* 0x43 Match many EXACT(F[LU]?)? at once. flags==type */ +#define TRIEC 68 /* 0x44 Same as TRIE, but with embedded charclass data */ +#define AHOCORASICK 69 /* 0x45 Aho Corasick stclass. flags==type */ +#define AHOCORASICKC 70 /* 0x46 Same as AHOCORASICK, but with embedded charclass data */ +#define GOSUB 71 /* 0x47 recurse to paren arg1 at (signed) ofs arg2 */ +#define GOSTART 72 /* 0x48 recurse to start of pattern */ +#define NGROUPP 73 /* 0x49 Whether the group matched. */ +#define INSUBP 74 /* 0x4a Whether we are in a specific recurse. */ +#define DEFINEP 75 /* 0x4b Never execute directly. 
*/ +#define ENDLIKE 76 /* 0x4c Used only for the type field of verbs */ +#define OPFAIL 77 /* 0x4d Same as (?!) */ +#define ACCEPT 78 /* 0x4e Accepts the current matched string. */ +#define VERB 79 /* 0x4f Used only for the type field of verbs */ +#define PRUNE 80 /* 0x50 Pattern fails at this startpoint if no-backtracking through this */ +#define MARKPOINT 81 /* 0x51 Push the current location for rollback by cut. */ +#define SKIP 82 /* 0x52 On failure skip forward (to the mark) before retrying */ +#define COMMIT 83 /* 0x53 Pattern fails outright if backtracking through this */ +#define CUTGROUP 84 /* 0x54 On failure go to the next alternation in the group */ +#define KEEPS 85 /* 0x55 $& begins here. */ +#define LNBREAK 86 /* 0x56 generic newline pattern */ +#define VERTWS 87 /* 0x57 vertical whitespace (Perl 6) */ +#define NVERTWS 88 /* 0x58 not vertical whitespace (Perl 6) */ +#define HORIZWS 89 /* 0x59 horizontal whitespace (Perl 6) */ +#define NHORIZWS 90 /* 0x5a not horizontal whitespace (Perl 6) */ +#define FOLDCHAR 91 /* 0x5b codepoint with tricky case folding properties. */ #define OPTIMIZED 92 /* 0x5c Placeholder for dump. */ #define PSEUDO 93 /* 0x5d Pseudo opcode for internal use. 
*/ /* ------------ States ------------- */ @@ -187,6 +187,7 @@ EXTCONST U8 PL_regkind[] = { EXACT, /* EXACT */ EXACT, /* EXACTF */ EXACT, /* EXACTFL */ + EXACT, /* EXACTFU */ NOTHING, /* NOTHING */ NOTHING, /* TAIL */ STAR, /* STAR */ @@ -201,6 +202,11 @@ EXTCONST U8 PL_regkind[] = { REF, /* REF */ REF, /* REFF */ REF, /* REFFL */ + REF, /* REFFU */ + REF, /* NREF */ + REF, /* NREFF */ + REF, /* NREFFL */ + REF, /* NREFFU */ BRANCHJ, /* IFMATCH */ BRANCHJ, /* UNLESSM */ BRANCHJ, /* SUSPEND */ @@ -218,9 +224,6 @@ EXTCONST U8 PL_regkind[] = { TRIE, /* AHOCORASICKC */ GOSUB, /* GOSUB */ GOSTART, /* GOSTART */ - REF, /* NREF */ - REF, /* NREFF */ - REF, /* NREFFL */ NGROUPP, /* NGROUPP */ INSUBP, /* INSUBP */ DEFINEP, /* DEFINEP */ @@ -240,9 +243,6 @@ EXTCONST U8 PL_regkind[] = { HORIZWS, /* HORIZWS */ NHORIZWS, /* NHORIZWS */ FOLDCHAR, /* FOLDCHAR */ - EXACT, /* EXACTFU */ - REF, /* REFFU */ - REF, /* NREFFU */ NOTHING, /* OPTIMIZED */ PSEUDO, /* PSEUDO */ /* ------------ States ------------- */ @@ -329,6 +329,7 @@ static const U8 regarglen[] = { 0, /* EXACT */ 0, /* EXACTF */ 0, /* EXACTFL */ + 0, /* EXACTFU */ 0, /* NOTHING */ 0, /* TAIL */ 0, /* STAR */ @@ -343,6 +344,11 @@ static const U8 regarglen[] = { EXTRA_SIZE(struct regnode_1), /* REF */ EXTRA_SIZE(struct regnode_1), /* REFF */ EXTRA_SIZE(struct regnode_1), /* REFFL */ + EXTRA_SIZE(struct regnode_1), /* REFFU */ + EXTRA_SIZE(struct regnode_1), /* NREF */ + EXTRA_SIZE(struct regnode_1), /* NREFF */ + EXTRA_SIZE(struct regnode_1), /* NREFFL */ + EXTRA_SIZE(struct regnode_1), /* NREFFU */ EXTRA_SIZE(struct regnode_1), /* IFMATCH */ EXTRA_SIZE(struct regnode_1), /* UNLESSM */ EXTRA_SIZE(struct regnode_1), /* SUSPEND */ @@ -360,9 +366,6 @@ static const U8 regarglen[] = { EXTRA_SIZE(struct regnode_charclass), /* AHOCORASICKC */ EXTRA_SIZE(struct regnode_2L), /* GOSUB */ 0, /* GOSTART */ - EXTRA_SIZE(struct regnode_1), /* NREF */ - EXTRA_SIZE(struct regnode_1), /* NREFF */ - EXTRA_SIZE(struct regnode_1), /* NREFFL 
*/ EXTRA_SIZE(struct regnode_1), /* NGROUPP */ EXTRA_SIZE(struct regnode_1), /* INSUBP */ EXTRA_SIZE(struct regnode_1), /* DEFINEP */ @@ -382,9 +385,6 @@ static const U8 regarglen[] = { 0, /* HORIZWS */ 0, /* NHORIZWS */ EXTRA_SIZE(struct regnode_1), /* FOLDCHAR */ - 0, /* EXACTFU */ - EXTRA_SIZE(struct regnode_1), /* REFFU */ - EXTRA_SIZE(struct regnode_1), /* NREFFU */ 0, /* OPTIMIZED */ 0, /* PSEUDO */ }; @@ -428,6 +428,7 @@ static const char reg_off_by_arg[] = { 0, /* EXACT */ 0, /* EXACTF */ 0, /* EXACTFL */ + 0, /* EXACTFU */ 0, /* NOTHING */ 0, /* TAIL */ 0, /* STAR */ @@ -442,6 +443,11 @@ static const char reg_off_by_arg[] = { 0, /* REF */ 0, /* REFF */ 0, /* REFFL */ + 0, /* REFFU */ + 0, /* NREF */ + 0, /* NREFF */ + 0, /* NREFFL */ + 0, /* NREFFU */ 2, /* IFMATCH */ 2, /* UNLESSM */ 1, /* SUSPEND */ @@ -459,9 +465,6 @@ static const char reg_off_by_arg[] = { 0, /* AHOCORASICKC */ 0, /* GOSUB */ 0, /* GOSTART */ - 0, /* NREF */ - 0, /* NREFF */ - 0, /* NREFFL */ 0, /* NGROUPP */ 0, /* INSUBP */ 0, /* DEFINEP */ @@ -481,9 +484,6 @@ static const char reg_off_by_arg[] = { 0, /* HORIZWS */ 0, /* NHORIZWS */ 0, /* FOLDCHAR */ - 0, /* EXACTFU */ - 0, /* REFFU */ - 0, /* NREFFU */ 0, /* OPTIMIZED */ 0, /* PSEUDO */ }; @@ -532,62 +532,62 @@ EXTCONST char * const PL_reg_name[] = { "EXACT", /* 0x21 */ "EXACTF", /* 0x22 */ "EXACTFL", /* 0x23 */ - "NOTHING", /* 0x24 */ - "TAIL", /* 0x25 */ - "STAR", /* 0x26 */ - "PLUS", /* 0x27 */ - "CURLY", /* 0x28 */ - "CURLYN", /* 0x29 */ - "CURLYM", /* 0x2a */ - "CURLYX", /* 0x2b */ - "WHILEM", /* 0x2c */ - "OPEN", /* 0x2d */ - "CLOSE", /* 0x2e */ - "REF", /* 0x2f */ - "REFF", /* 0x30 */ - "REFFL", /* 0x31 */ - "IFMATCH", /* 0x32 */ - "UNLESSM", /* 0x33 */ - "SUSPEND", /* 0x34 */ - "IFTHEN", /* 0x35 */ - "GROUPP", /* 0x36 */ - "LONGJMP", /* 0x37 */ - "BRANCHJ", /* 0x38 */ - "EVAL", /* 0x39 */ - "MINMOD", /* 0x3a */ - "LOGICAL", /* 0x3b */ - "RENUM", /* 0x3c */ - "TRIE", /* 0x3d */ - "TRIEC", /* 0x3e */ - "AHOCORASICK", /* 0x3f */ 
- "AHOCORASICKC", /* 0x40 */ - "GOSUB", /* 0x41 */ - "GOSTART", /* 0x42 */ - "NREF", /* 0x43 */ - "NREFF", /* 0x44 */ - "NREFFL", /* 0x45 */ - "NGROUPP", /* 0x46 */ - "INSUBP", /* 0x47 */ - "DEFINEP", /* 0x48 */ - "ENDLIKE", /* 0x49 */ - "OPFAIL", /* 0x4a */ - "ACCEPT", /* 0x4b */ - "VERB", /* 0x4c */ - "PRUNE", /* 0x4d */ - "MARKPOINT", /* 0x4e */ - "SKIP", /* 0x4f */ - "COMMIT", /* 0x50 */ - "CUTGROUP", /* 0x51 */ - "KEEPS", /* 0x52 */ - "LNBREAK", /* 0x53 */ - "VERTWS", /* 0x54 */ - "NVERTWS", /* 0x55 */ - "HORIZWS", /* 0x56 */ - "NHORIZWS", /* 0x57 */ - "FOLDCHAR", /* 0x58 */ - "EXACTFU", /* 0x59 */ - "REFFU", /* 0x5a */ - "NREFFU", /* 0x5b */ + "EXACTFU", /* 0x24 */ + "NOTHING", /* 0x25 */ + "TAIL", /* 0x26 */ + "STAR", /* 0x27 */ + "PLUS", /* 0x28 */ + "CURLY", /* 0x29 */ + "CURLYN", /* 0x2a */ + "CURLYM", /* 0x2b */ + "CURLYX", /* 0x2c */ + "WHILEM", /* 0x2d */ + "OPEN", /* 0x2e */ + "CLOSE", /* 0x2f */ + "REF", /* 0x30 */ + "REFF", /* 0x31 */ + "REFFL", /* 0x32 */ + "REFFU", /* 0x33 */ + "NREF", /* 0x34 */ + "NREFF", /* 0x35 */ + "NREFFL", /* 0x36 */ + "NREFFU", /* 0x37 */ + "IFMATCH", /* 0x38 */ + "UNLESSM", /* 0x39 */ + "SUSPEND", /* 0x3a */ + "IFTHEN", /* 0x3b */ + "GROUPP", /* 0x3c */ + "LONGJMP", /* 0x3d */ + "BRANCHJ", /* 0x3e */ + "EVAL", /* 0x3f */ + "MINMOD", /* 0x40 */ + "LOGICAL", /* 0x41 */ + "RENUM", /* 0x42 */ + "TRIE", /* 0x43 */ + "TRIEC", /* 0x44 */ + "AHOCORASICK", /* 0x45 */ + "AHOCORASICKC", /* 0x46 */ + "GOSUB", /* 0x47 */ + "GOSTART", /* 0x48 */ + "NGROUPP", /* 0x49 */ + "INSUBP", /* 0x4a */ + "DEFINEP", /* 0x4b */ + "ENDLIKE", /* 0x4c */ + "OPFAIL", /* 0x4d */ + "ACCEPT", /* 0x4e */ + "VERB", /* 0x4f */ + "PRUNE", /* 0x50 */ + "MARKPOINT", /* 0x51 */ + "SKIP", /* 0x52 */ + "COMMIT", /* 0x53 */ + "CUTGROUP", /* 0x54 */ + "KEEPS", /* 0x55 */ + "LNBREAK", /* 0x56 */ + "VERTWS", /* 0x57 */ + "NVERTWS", /* 0x58 */ + "HORIZWS", /* 0x59 */ + "NHORIZWS", /* 0x5a */ + "FOLDCHAR", /* 0x5b */ "OPTIMIZED", /* 0x5c */ "PSEUDO", /* 0x5d */ /* 
------------ States ------------- */ @@ -684,8 +684,8 @@ EXTCONST U8 PL_varies[] __attribute__deprecated__; #else EXTCONST U8 PL_varies[] __attribute__deprecated__ = { CLUMP, BRANCH, BACK, STAR, PLUS, CURLY, CURLYN, CURLYM, CURLYX, WHILEM, - REF, REFF, REFFL, SUSPEND, IFTHEN, BRANCHJ, NREF, NREFF, NREFFL, REFFU, - NREFFU, + REF, REFF, REFFL, REFFU, NREF, NREFF, NREFFL, NREFFU, SUSPEND, IFTHEN, + BRANCHJ, 0 }; #endif /* DOINIT */ @@ -694,7 +694,7 @@ EXTCONST U8 PL_varies[] __attribute__deprecated__ = { EXTCONST U8 PL_varies_bitmask[]; #else EXTCONST U8 PL_varies_bitmask[] = { - 0x00, 0x00, 0x00, 0xC0, 0xC1, 0x9F, 0x33, 0x01, 0x38, 0x00, 0x00, 0x0C + 0x00, 0x00, 0x00, 0xC0, 0x81, 0x3F, 0xFF, 0x4C, 0x00, 0x00, 0x00, 0x00 }; #endif /* DOINIT */ @@ -707,8 +707,8 @@ EXTCONST U8 PL_simple[] __attribute__deprecated__; #else EXTCONST U8 PL_simple[] __attribute__deprecated__ = { REG_ANY, SANY, CANY, ANYOF, ALNUM, ALNUML, NALNUM, NALNUML, SPACE, - SPACEL, NSPACE, NSPACEL, DIGIT, NDIGIT, VERTWS, NVERTWS, HORIZWS, - NHORIZWS, + SPACEL, NSPACE, NSPACEL, DIGIT, DIGITL, NDIGIT, NDIGITL, VERTWS, + NVERTWS, HORIZWS, NHORIZWS, 0 }; #endif /* DOINIT */ @@ -717,7 +717,7 @@ EXTCONST U8 PL_simple[] __attribute__deprecated__ = { EXTCONST U8 PL_simple_bitmask[]; #else EXTCONST U8 PL_simple_bitmask[] = { - 0x00, 0xC0, 0xFF, 0x17, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xF0, 0x00 + 0x00, 0xC0, 0xFF, 0x3F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x07 }; #endif /* DOINIT */ diff --git a/t/uni/fold.t b/t/uni/fold.t index f6f467c..0f71c80 100644 --- a/t/uni/fold.t +++ b/t/uni/fold.t @@ -38,12 +38,13 @@ if (open(CF, $CF)) { my $b = pack("U0U*", map { hex } split " ", $mapping); my $t0 = ":$a:" =~ /:$a:/ ? 1 : 0; my $t1 = ":$a:" =~ /:$a:/i ? 1 : 0; - my $t2 = ":$a:" =~ /:[$a]:/ ? 1 : 0; - my $t3 = ":$a:" =~ /:[$a]:/i ? 1 : 0; + my $t2 = ":$a:" =~ /:[_$a]:/ ? 1 : 0; # Two chars in [] so doesn't get + # optimized to a non-charclass + my $t3 = ":$a:" =~ /:[_$a]:/i ? 1 : 0; my $t4 = ":$a:" =~ /:$b:/i ? 
1 : 0; - my $t5 = ":$a:" =~ /:[$b]:/i ? 1 : 0; + my $t5 = ":$a:" =~ /:[_$b]:/i ? 1 : 0; my $t6 = ":$b:" =~ /:$a:/i ? 1 : 0; - my $t7 = ":$b:" =~ /:[$a]:/i ? 1 : 0; + my $t7 = ":$b:" =~ /:[_$a]:/i ? 1 : 0; print $t0 && $t1 && $t2 && $t3 && $t4 && $t5 && $t6 && $t7 ? "ok $i \# - $code - $name - $mapping - $status\n" : "not ok $i \# - $code - $name - $mapping - $status - $t0 $t1 $t2 $t3 $t4 $t5 $t6 $t7\n"; diff --git a/toke.c b/toke.c index aa1f57c..12359e0 100644 --- a/toke.c +++ b/toke.c @@ -3024,9 +3024,9 @@ S_scan_const(pTHX_ char *start) * no-op except on utfebcdic variant characters. Every * character generated by this that would normally need to be * enclosed by this macro is invariant, so the macro is not - * needed, and would complicate use of copy(). There are other - * parts of this file where the macro is used inconsistently, - * but are saved by it being a no-op */ + * needed, and would complicate use of copy(). XXX There are + * other parts of this file where the macro is used + * inconsistently, but are saved by it being a no-op */ /* The structure of this section of code (besides checking for * errors and upgrading to utf8) is: @@ -3298,7 +3298,7 @@ S_scan_const(pTHX_ char *start) if (UTF8_IS_INVARIANT(*i)) { if (! isALPHAU(*i)) problematic = TRUE; } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) { - if (! isALPHAU(UNI_TO_NATIVE(UTF8_ACCUMULATE(*i, + if (! isALPHAU(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*i, *(i+1))))) { problematic = TRUE; @@ -3314,7 +3314,7 @@ S_scan_const(pTHX_ char *start) continue; } else if (isCHARNAME_CONT( UNI_TO_NATIVE( - UTF8_ACCUMULATE(*i, *(i+1))))) + TWO_BYTE_UTF8_TO_UNI(*i, *(i+1))))) { continue; } -- Perl5 Master Repository
