In perl.git, the branch blead has been updated <https://perl5.git.perl.org/perl.git/commitdiff/2813d4adc971fbaa124b5322d4bccaa73e9df8e2?hp=9c13cd3cdfa6ab6920882a355869287a277989c3>
- Log ----------------------------------------------------------------- commit 2813d4adc971fbaa124b5322d4bccaa73e9df8e2 Author: Karl Williamson <[email protected]> Date: Mon Jan 29 20:47:56 2018 -0700 Add ANYOFM regnode This is a specialized ANYOF node for use when the code points in it have characteristics that allow them to be matched with a mask instead of a bit map. When this happens, the speed up is pretty spectacular: Key: Ir Instruction read Dr Data read Dw Data write COND conditional branches IND indirect branches The numbers represent raw counts per loop iteration. Results of ('b' x 10000) . 'a' =~ /[Aa]/ blead mask Ratio % -------- ------- ------- Ir 153132.0 25636.0 597.3 Dr 40909.0 2155.0 1898.3 Dw 20593.0 593.0 3472.7 COND 20529.0 3028.0 678.0 IND 22.0 22.0 100.0 See the comments in regcomp.c or http://nntp.perl.org/group/perl.perl5.porters/249001 for a description of the cases that this new technique can handle. But several common ones include the C0 controls (on ASCII platforms), [01], [0-7], [Aa] and any other ASCII case pair. The set of ASCII characters also could be done with this node instead of having the special ASCII regnode, reducing code size and complexity. I haven't investigated the speed loss of doing so. A NANYOFM node could be created for matching the complement of what this one matches. A pattern like /A/i is not affected by this commit, but the regex optimizer could be changed to take advantage of this commit. What would need to be done is for it to look at the first byte of an EXACTFish node and if it's one of the case pairs this handles, to generate a synthetic start class for it. This would automatically invoke the sped up code. commit 67a1b5f935fc7a39d75e1cafb06a0cea10871612 Author: Karl Williamson <[email protected]> Date: Mon Jan 22 13:55:03 2018 -0700 regcomp.sym: Add ANYOFM regnode This uses a mask instead of a bitmap, and is restricted to representing invariant characters under UTF-8 that meet particular bit patterns. 
commit 2b7ee0568e8c163f9205a7bcb178d69ef88571ce Author: Karl Williamson <[email protected]> Date: Thu Jan 25 13:35:09 2018 -0700 regcomp.c: White-space only Indent code that the previous commit created a block around commit 06a83acd6aa23a9d19f925cc3232ff18fe7deee2 Author: Karl Williamson <[email protected]> Date: Thu Jan 25 13:26:16 2018 -0700 regcomp.c: Allow a fcn param to be NULL In which case handling is skipped. This is in preparation for a future commit which will use this function in a slightly different manner commit 070e8b2ef4f827a7e0d3199f7b37883a09545802 Author: Karl Williamson <[email protected]> Date: Fri Dec 29 15:45:38 2017 -0700 regexec.c: Use word-at-a-time to repeat /i single byte pattern For most of the case folding pairs, like [Aa], it is possible to use a mask to match them word-at-a-time in regrepeat(), so that long sequences of them are handled with significantly better performance. commit ab1efbdc1f74b2f4db076efa0b4d54f387d74efe Author: Karl Williamson <[email protected]> Date: Fri Dec 29 15:17:41 2017 -0700 regexec.c: Use word-at-a-time to repeat a single byte pattern There is special code in the function regrepeat() to handle instances where the pattern to repeat is a single byte. These all can be done word-at-a-time to significantly increase the performance of long repeats. commit 6a40c2e4e2dc26eb6ad39caf87cebef0743b90e7 Author: Karl Williamson <[email protected]> Date: Tue Dec 26 18:25:26 2017 -0700 regexec.c: Replace loop by memchr() This can be called on a potentially long string. commit 56dd984bdb8056d778b964ab6a46cb7dfaef915c Author: Karl Williamson <[email protected]> Date: Mon Jan 29 20:33:14 2018 -0700 Compile variant_byte_number() for EBCDIC Future commits will use this without regard to platform. 
commit 597ee3f45b478da1456092f63d3ac698ee812786 Author: Karl Williamson <[email protected]> Date: Mon Jan 29 20:07:51 2018 -0700 Use different scheme to handle MSVC6 Recent commit 0b08cab0fc46a5f381ca18a451f55cf12c81d966 caused a function to not be compiled when running on MSVC6, and hence its callers needed to use an alternative mechanism there. This is easy enough, it turns out, but it also turns out that there are more opportunities to call this function. Rather than having each caller have to know about the MSVC6 problem, this current commit reimplements the function on that platform to use a slow, dumb method, so knowing about the issue is confined to just this one function. ----------------------------------------------------------------------- Summary of changes: embed.fnc | 11 +- embed.h | 8 +- inline.h | 28 +++-- pod/perldebguts.pod | 2 + proto.h | 34 ++++-- regcomp.c | 203 +++++++++++++++++++++++++++++---- regcomp.sym | 1 + regexec.c | 251 ++++++++++++++++++++++++++++++++++++---- regnodes.h | 321 ++++++++++++++++++++++++++-------------------------- t/re/anyof.t | 2 +- 10 files changed, 636 insertions(+), 225 deletions(-) diff --git a/embed.fnc b/embed.fnc index 35202e8d7c..02546ffb3f 100644 --- a/embed.fnc +++ b/embed.fnc @@ -806,9 +806,7 @@ AndmoR |bool |is_utf8_invariant_string|NN const U8* const s \ AnidR |bool |is_utf8_invariant_string_loc|NN const U8* const s \ |STRLEN len \ |NULLOK const U8 ** ep -#if ! defined(EBCDIC) && ! 
defined USING_MSVC6 AniR |unsigned int|_variant_byte_number|PERL_UINTMAX_T word -#endif #if defined(PERL_CORE) || defined(PERL_EXT) EinR |Size_t |variant_under_utf8_count|NN const U8* const s \ |NN const U8* const e @@ -2459,6 +2457,7 @@ Es |SSize_t|study_chunk |NN RExC_state_t *pRExC_state \ |I32 stopparen|U32 recursed_depth \ |NULLOK regnode_ssc *and_withp \ |U32 flags|U32 depth +EsR |SV * |get_ANYOFM_contents|NN const regnode * n EsRn |U32 |add_data |NN RExC_state_t* const pRExC_state \ |NN const char* const s|const U32 n rs |void |re_croak2 |bool utf8|NN const char* pat1|NN const char* pat2|... @@ -2491,7 +2490,7 @@ Es |const regnode*|dumpuntil|NN const regexp *r|NN const regnode *start \ |NN SV* sv|I32 indent|U32 depth Es |void |put_code_point |NN SV* sv|UV c Es |bool |put_charclass_bitmap_innards|NN SV* sv \ - |NN char* bitmap \ + |NULLOK char* bitmap \ |NULLOK SV* nonbitmap_invlist \ |NULLOK SV* only_utf8_locale_invlist\ |NULLOK const regnode * const node \ @@ -2534,6 +2533,12 @@ ERp |bool |_is_grapheme |NN const U8 * strbeg|NN const U8 * s|NN const U8 *stren ERs |bool |isFOO_utf8_lc |const U8 classnum|NN const U8* character ERns |char *|find_next_ascii|NN char* s|NN const char * send|const bool is_utf8 ERns |char *|find_next_non_ascii|NN char* s|NN const char * send|const bool is_utf8 +ERns |char * |find_next_masked|NN char * s \ + |NN const char * send \ + |const U8 byte|const U8 mask +ERns |char *|find_span_end |NN char* s|NN const char * send|const char span_byte +ERns |U8 *|find_span_end_mask|NN U8 * s|NN const U8 * send \ + |const U8 span_byte|const U8 mask ERs |SSize_t|regmatch |NN regmatch_info *reginfo|NN char *startpos|NN regnode *prog WERs |I32 |regrepeat |NN regexp *prog|NN char **startposp \ |NN const regnode *p \ diff --git a/embed.h b/embed.h index 334c6063fb..d53dff9123 100644 --- a/embed.h +++ b/embed.h @@ -46,6 +46,7 @@ #define _to_utf8_lower_flags(a,b,c,d,e,f,g) Perl__to_utf8_lower_flags(aTHX_ a,b,c,d,e,f,g) #define 
_to_utf8_title_flags(a,b,c,d,e,f,g) Perl__to_utf8_title_flags(aTHX_ a,b,c,d,e,f,g) #define _to_utf8_upper_flags(a,b,c,d,e,f,g) Perl__to_utf8_upper_flags(aTHX_ a,b,c,d,e,f,g) +#define _variant_byte_number S__variant_byte_number #define amagic_call(a,b,c,d) Perl_amagic_call(aTHX_ a,b,c,d) #define amagic_deref_call(a,b) Perl_amagic_deref_call(aTHX_ a,b) #define apply_attrs_string(a,b,c,d) Perl_apply_attrs_string(aTHX_ a,b,c,d) @@ -768,9 +769,6 @@ #define whichsig_sv(a) Perl_whichsig_sv(aTHX_ a) #define wrap_keyword_plugin(a,b) Perl_wrap_keyword_plugin(aTHX_ a,b) #define wrap_op_checker(a,b,c) Perl_wrap_op_checker(aTHX_ a,b,c) -#if ! defined(EBCDIC) && ! defined USING_MSVC6 -#define _variant_byte_number S__variant_byte_number -#endif #if !(defined(HAS_MEMMEM)) #define ninstr Perl_ninstr #endif @@ -1012,6 +1010,7 @@ #define compute_EXACTish S_compute_EXACTish #define construct_ahocorasick_from_trie(a,b,c) S_construct_ahocorasick_from_trie(aTHX_ a,b,c) #define edit_distance S_edit_distance +#define get_ANYOFM_contents(a) S_get_ANYOFM_contents(aTHX_ a) #define get_ANYOF_cp_list_for_ssc(a,b) S_get_ANYOF_cp_list_for_ssc(aTHX_ a,b) #define get_invlist_iter_addr S_get_invlist_iter_addr #define grok_bslash_N(a,b,c,d,e,f,g) S_grok_bslash_N(aTHX_ a,b,c,d,e,f,g) @@ -1119,7 +1118,10 @@ #define backup_one_WB(a,b,c,d) S_backup_one_WB(aTHX_ a,b,c,d) #define find_byclass(a,b,c,d,e) S_find_byclass(aTHX_ a,b,c,d,e) #define find_next_ascii S_find_next_ascii +#define find_next_masked S_find_next_masked #define find_next_non_ascii S_find_next_non_ascii +#define find_span_end S_find_span_end +#define find_span_end_mask S_find_span_end_mask #define isFOO_utf8_lc(a,b) S_isFOO_utf8_lc(aTHX_ a,b) #define isGCB(a,b,c,d,e) S_isGCB(aTHX_ a,b,c,d,e) #define isLB(a,b,c,d,e,f) S_isLB(aTHX_ a,b,c,d,e,f) diff --git a/inline.h b/inline.h index 769e0532ac..3cd90e5712 100644 --- a/inline.h +++ b/inline.h @@ -401,8 +401,6 @@ S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep) | ( 
( (PTR2nat(x) \ & PERL_WORD_BOUNDARY_MASK) >> 2)))) -# ifndef USING_MSVC6 - /* Do the word-at-a-time iff there is at least one usable full word. That * means that after advancing to a word boundary, there still is at least a * full word left. The number of bytes needed to advance is 'wordsize - @@ -460,7 +458,6 @@ S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep) } while (x + PERL_WORDSIZE <= send); } -# endif /* End of ! MSVC6 */ #endif /* End of ! EBCDIC */ /* Process per-byte */ @@ -479,11 +476,6 @@ S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep) return TRUE; } -#if ! defined(EBCDIC) && ! defined(USING_MSVC6) - -/* Apparent compiler error with MSVC6, so can't use this function. All callers - * to it must be compiled to use the EBCDIC fallback on MSVC6 */ - PERL_STATIC_INLINE unsigned int S__variant_byte_number(PERL_UINTMAX_T word) { @@ -496,7 +488,24 @@ S__variant_byte_number(PERL_UINTMAX_T word) /* Get just the msb bits of each byte */ word &= PERL_VARIANTS_WORD_MASK; -# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 +# ifdef USING_MSVC6 /* VC6 has some issues with the normal code, and the + easiest thing is to hide that from the callers */ + { + unsigned int i; + const U8 * s = (U8 *) &word; + dTHX; + + for (i = 0; i < sizeof(word); i++ ) { + if (s[i]) { + return i; + } + } + + Perl_croak(aTHX_ "panic: %s: %d: unexpected zero word\n", + __FILE__, __LINE__); + } + +# elif BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 /* Bytes are stored like * Byte8 ... Byte2 Byte1 @@ -574,7 +583,6 @@ S__variant_byte_number(PERL_UINTMAX_T word) return (unsigned int) word; } -#endif /* ! EBCDIC */ #if defined(PERL_CORE) || defined(PERL_EXT) /* diff --git a/pod/perldebguts.pod b/pod/perldebguts.pod index 3a66f24a20..b1c01ca2d6 100644 --- a/pod/perldebguts.pod +++ b/pod/perldebguts.pod @@ -605,6 +605,8 @@ will be lost. 
single char match only ANYOFD sv 1 Like ANYOF, but /d is in effect ANYOFL sv 1 Like ANYOF, but /l is in effect + ANYOFM byte 1 Like ANYOF, but matches an invariant byte as + determined by the mask and arg # POSIX Character Classes: POSIXD none Some [[:class:]] under /d; the FLAGS field diff --git a/proto.h b/proto.h index eadfc976db..0755630a94 100644 --- a/proto.h +++ b/proto.h @@ -137,6 +137,11 @@ PERL_CALLCONV UV Perl__to_utf8_title_flags(pTHX_ const U8 *p, const U8* e, U8* u PERL_CALLCONV UV Perl__to_utf8_upper_flags(pTHX_ const U8 *p, const U8 *e, U8* ustrp, STRLEN *lenp, bool flags, const char * const file, const int line); #define PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS \ assert(p); assert(ustrp); assert(file) +#ifndef PERL_NO_INLINE_FUNCTIONS +PERL_STATIC_INLINE unsigned int S__variant_byte_number(PERL_UINTMAX_T word) + __attribute__warn_unused_result__; +#endif + PERL_CALLCONV void Perl__warn_problematic_locale(void); PERL_CALLCONV_NO_RET void Perl_abort_execution(pTHX_ const char * const msg, const char * const name) __attribute__noreturn__; @@ -3806,13 +3811,6 @@ PERL_CALLCONV int Perl_yylex(pTHX); PERL_CALLCONV int Perl_yyparse(pTHX_ int gramtype); PERL_CALLCONV void Perl_yyquit(pTHX); PERL_CALLCONV void Perl_yyunlex(pTHX); -#if ! defined(EBCDIC) && ! defined USING_MSVC6 -#ifndef PERL_NO_INLINE_FUNCTIONS -PERL_STATIC_INLINE unsigned int S__variant_byte_number(PERL_UINTMAX_T word) - __attribute__warn_unused_result__; -#endif - -#endif #if ! 
defined(HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT)) #ifndef PERL_NO_INLINE_FUNCTIONS PERL_STATIC_INLINE void * S_my_memrchr(const char * s, const char c, const STRLEN len); @@ -4190,7 +4188,7 @@ STATIC const regnode* S_dumpuntil(pTHX_ const regexp *r, const regnode *start, c assert(r); assert(start); assert(node); assert(sv) STATIC bool S_put_charclass_bitmap_innards(pTHX_ SV* sv, char* bitmap, SV* nonbitmap_invlist, SV* only_utf8_locale_invlist, const regnode * const node, const bool force_as_is_display); #define PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS \ - assert(sv); assert(bitmap) + assert(sv) STATIC SV* S_put_charclass_bitmap_innards_common(pTHX_ SV* invlist, SV* posixes, SV* only_utf8, SV* not_utf8, SV* only_utf8_locale, const bool invert); #define PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON \ assert(invlist) @@ -5181,6 +5179,11 @@ STATIC int S_edit_distance(const UV *src, const UV *tgt, const STRLEN x, const S #define PERL_ARGS_ASSERT_EDIT_DISTANCE \ assert(src); assert(tgt) +STATIC SV * S_get_ANYOFM_contents(pTHX_ const regnode * n) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS \ + assert(n) + STATIC SV* S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, const regnode_charclass* const node); #define PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC \ assert(pRExC_state); assert(node) @@ -5574,11 +5577,26 @@ STATIC char * S_find_next_ascii(char* s, const char * send, const bool is_utf8) #define PERL_ARGS_ASSERT_FIND_NEXT_ASCII \ assert(s); assert(send) +STATIC char * S_find_next_masked(char * s, const char * send, const U8 byte, const U8 mask) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_FIND_NEXT_MASKED \ + assert(s); assert(send) + STATIC char * S_find_next_non_ascii(char* s, const char * send, const bool is_utf8) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII \ assert(s); assert(send) +STATIC char * S_find_span_end(char* s, const 
char * send, const char span_byte) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_FIND_SPAN_END \ + assert(s); assert(send) + +STATIC U8 * S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask) + __attribute__warn_unused_result__; +#define PERL_ARGS_ASSERT_FIND_SPAN_END_MASK \ + assert(s); assert(send) + STATIC bool S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) __attribute__warn_unused_result__; #define PERL_ARGS_ASSERT_ISFOO_UTF8_LC \ diff --git a/regcomp.c b/regcomp.c index 198f291f06..1cd5329f10 100644 --- a/regcomp.c +++ b/regcomp.c @@ -5516,6 +5516,27 @@ Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n", (regnode_charclass *) scan); break; + case ANYOFM: + { + SV* cp_list = get_ANYOFM_contents(scan); + + if (flags & SCF_DO_STCLASS_OR) { + ssc_union(data->start_class, + cp_list, + FALSE /* don't invert */ + ); + } + else if (flags & SCF_DO_STCLASS_AND) { + ssc_intersection(data->start_class, + cp_list, + FALSE /* don't invert */ + ); + } + + SvREFCNT_dec_NN(cp_list); + break; + } + case NPOSIXL: invert = 1; /* FALLTHROUGH */ @@ -17999,25 +18020,20 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * certain common classes that are easy to test. Getting to this point in * the code means that the class didn't get optimized there. Since this * code is only executed in Pass 2, it is too late to save space--it has - * been allocated in Pass 1, and currently isn't given back. But turning - * things into an EXACTish node can allow the optimizer to join it to any - * adjacent such nodes. And if the class is equivalent to things like /./, - * expensive run-time swashes can be avoided. Now that we have more - * complete information, we can find things necessarily missed by the - * earlier code. Another possible "optimization" that isn't done is that - * something like [Ee] could be changed into an EXACTFU. 
khw tried this - * and found that the ANYOF is faster, including for code points not in the - * bitmap. This still might make sense to do, provided it got joined with - * an adjacent node(s) to create a longer EXACTFU one. This could be - * accomplished by creating a pseudo ANYOF_EXACTFU node type that the join - * routine would know is joinable. If that didn't happen, the node type - * could then be made a straight ANYOF */ + * been allocated in Pass 1, and currently isn't given back. XXX Why not? + * But turning things into an EXACTish node can allow the optimizer to join + * it to any adjacent such nodes. And if the class is equivalent to things + * like /./, expensive run-time swashes can be avoided. Now that we have + * more complete information, we can find things necessarily missed by the + * earlier code. */ if (optimizable && cp_list && ! invert) { UV start, end; U8 op = END; /* The optimzation node-type */ int posix_class = -1; /* Illegal value */ const char * cur_parse= RExC_parse; + U8 ANYOFM_mask; + U32 anode_arg = 0; invlist_iterinit(cp_list); if (! invlist_iternext(cp_list, &start, &end)) { @@ -18156,6 +18172,106 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } found_posix: ; } + + /* If it didn't match a POSIX class, it might be able to be turned + * into an ANYOFM node. Compare two different bytes, bit-by-bit. + * In some positions, the bits in each will be 1; and in other + * positions both will be 0; and in some positions the bit will be + * 1 in one byte, and 0 in the other. Let 'n' be the number of + * positions where the bits differ. We create a mask which has + * exactly 'n' 0 bits, each in a position where the two bytes + * differ. Now take the set of all bytes that when ANDed with the + * mask yield the same result. That set has 2**n elements, and is + * representable by just two 8 bit numbers: the result and the + * mask. 
Importantly, matching the set can be vectorized by + * creating a word full of the result bytes, and a word full of the + * mask bytes, yielding a significant speed up. Here, see if this + * node matches such a set. As a concrete example consider [01], + * and the byte representing '0' which is 0x30 on ASCII machines. + * It has the bits 0011 0000. Take the mask 1111 1110. If we AND + * 0x31 and 0x30 with that mask we get 0x30. Any other bytes ANDed + * yield something else. So [01], which is a common usage, is + * optimizable into ANYOFM, and can benefit from the speed up. We + * can only do this on UTF-8 invariant bytes, because the variance + * would throw this off. */ + if ( op == END + && invlist_highest(cp_list) <= +#ifdef EBCDIC + 0xFF +#else + 0x7F +#endif + ) { + Size_t cp_count = 0; + bool first_time = TRUE; + unsigned int lowest_cp; + U8 bits_differing = 0; + + /* Only needed on EBCDIC, as there, variants and non- are mixed + * together. Could #ifdef it out on ASCII, but probably the + * compiler will optimize it out */ + bool has_variant = FALSE; + + /* Go through the bytes and find the bit positions that differ */ + invlist_iterinit(cp_list); + while (invlist_iternext(cp_list, &start, &end)) { + unsigned int i = start; + + cp_count += end - start + 1; + + if (first_time) { + if (! UVCHR_IS_INVARIANT(i)) { + has_variant = TRUE; + continue; + } + + first_time = FALSE; + lowest_cp = start; + + i++; + } + + /* Find the bit positions that differ from the lowest code + * point in the node. Keep track of all such positions by + * OR'ing */ + for (; i <= end; i++) { + if (! UVCHR_IS_INVARIANT(i)) { + has_variant = TRUE; + continue; + } + + bits_differing |= i ^ lowest_cp; + } + } + invlist_iterfinish(cp_list); + + /* At the end of the loop, we count how many bits differ from + * the bits in lowest code point, call the count 'd'. If the + * set we found contains 2**d elements, it is the closure of + * all code points that differ only in those bit positions. 
To + * convince yourself of that, first note that the number in the + * closure must be a power of 2, which we test for. The only + * way we could have that count and it be some differing set, + * is if we got some code points that don't differ from the + * lowest code point in any position, but do differ from each + * other in some other position. That means one code point has + * a 1 in that position, and another has a 0. But that would + * mean that one of them differs from the lowest code point in + * that position, which possibility we've already excluded. */ + if ( ! has_variant + && cp_count == 1U << PL_bitcount[bits_differing]) + { + assert(cp_count > 1); + op = ANYOFM; + + /* We need to make the bits that differ be 0's */ + ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */ + + /* The argument is the lowest code point */ + anode_arg = lowest_cp; + *flagp |= HASWIDTH|SIMPLE; + } + } } if (op != END) { @@ -18163,7 +18279,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, RExC_emit = (regnode *)orig_emit; if (regarglen[op]) { - ret = reganode(pRExC_state, op, 0); + ret = reganode(pRExC_state, op, anode_arg); } else { ret = reg_node(pRExC_state, op); } @@ -18178,6 +18294,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, else if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) { FLAGS(ret) = posix_class; } + else if (PL_regkind[op] == ANYOFM) { + FLAGS(ret) = ANYOFM_mask; + } SvREFCNT_dec_NN(cp_list); return ret; @@ -19030,6 +19149,36 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, } #endif +STATIC SV* +S_get_ANYOFM_contents(pTHX_ const regnode * n) { + + /* Returns an inversion list of all the code points matched by the ANYOFM + * node 'n' */ + + SV * cp_list = _new_invlist(-1); + const U8 lowest = ARG(n); + unsigned int i; + U8 count = 0; + U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)]; + + PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS; + + /* Starting with the lowest code point, any code 
point that ANDed with the + * mask yields the lowest code point is in the set */ + for (i = lowest; i <= 0xFF; i++) { + if ((i & FLAGS(n)) == ARG(n)) { + cp_list = add_cp_to_invlist(cp_list, i); + count++; + + /* We know how many code points (a power of two) that are in the + * set. No use looking once we've got that number */ + if (count >= needed) break; + } + } + + return cp_list; +} + /* - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form */ @@ -19556,6 +19705,15 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ SvREFCNT_dec(unresolved); } + else if (k == ANYOFM) { + SV * cp_list = get_ANYOFM_contents(o); + + Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); + put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, TRUE); + Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); + + SvREFCNT_dec(cp_list); + } else if (k == POSIXD || k == NPOSIXD) { U8 index = FLAGS(o) * 2; if (index < C_ARRAY_LENGTH(anyofs)) { @@ -20595,7 +20753,7 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, { /* Appends to 'sv' a displayable version of the innards of the bracketed * character class defined by the other arguments: - * 'bitmap' points to the bitmap. + * 'bitmap' points to the bitmap, or NULL if to ignore that. * 'nonbitmap_invlist' is an inversion list of the code points that are in * the bitmap range, but for some reason aren't in the bitmap; NULL if * none. 
The reasons for this could be that they require some @@ -20706,13 +20864,16 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, } /* Accumulate the bit map into the unconditional match list */ - for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { - if (BITMAP_TEST(bitmap, i)) { - int start = i++; - for (; i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i); i++) { - /* empty */ + if (bitmap) { + for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { + if (BITMAP_TEST(bitmap, i)) { + int start = i++; + for (; + i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i); + i++) + { /* empty */ } + invlist = _add_range_to_invlist(invlist, start, i-1); } - invlist = _add_range_to_invlist(invlist, start, i-1); } } diff --git a/regcomp.sym b/regcomp.sym index cddf84c24d..14840b5845 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -59,6 +59,7 @@ SANY REG_ANY, no 0 S ; Match any one character. ANYOF ANYOF, sv 1 S ; Match character in (or not in) this class, single char match only ANYOFD ANYOF, sv 1 S ; Like ANYOF, but /d is in effect ANYOFL ANYOF, sv 1 S ; Like ANYOF, but /l is in effect +ANYOFM ANYOFM byte 1 S ; Like ANYOF, but matches an invariant byte as determined by the mask and arg #* POSIX Character Classes: # Order of the below is important. See ordering comment above. diff --git a/regexec.c b/regexec.c index 530f1d6250..31a133f20b 100644 --- a/regexec.c +++ b/regexec.c @@ -560,7 +560,7 @@ S_find_next_ascii(char * s, const char * send, const bool utf8_target) PERL_ARGS_ASSERT_FIND_NEXT_ASCII; -#if ! defined(EBCDIC) && ! defined(USING_MSVC6) +#ifndef EBCDIC if ((STRLEN) (send - s) >= PERL_WORDSIZE @@ -676,6 +676,200 @@ S_find_next_non_ascii(char * s, const char * send, const bool utf8_target) } +STATIC char * +S_find_span_end(char * s, const char * send, const char span_byte) +{ + /* Returns the position of the first byte in the sequence between 's' and + * 'send-1' inclusive that isn't 'span_byte'; returns 'send' if none found. 
+ * */ + + PERL_ARGS_ASSERT_FIND_SPAN_END; + + assert(send >= s); + + if ((STRLEN) (send - s) >= PERL_WORDSIZE + + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s) + - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK)) + { + PERL_UINTMAX_T span_word; + + /* Process per-byte until reach word boundary. XXX This loop could be + * eliminated if we knew that this platform had fast unaligned reads */ + while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) { + if (*s != span_byte) { + return s; + } + s++; + } + + /* Create a word filled with the bytes we are spanning */ + span_word = PERL_COUNT_MULTIPLIER * span_byte; + + /* Process per-word as long as we have at least a full word left */ + do { + + /* Keep going if the whole word is composed of 'span_byte's */ + if ((* (PERL_UINTMAX_T *) s) == span_word) { + s += PERL_WORDSIZE; + continue; + } + + /* Here, at least one byte in the word isn't 'span_byte'. This xor + * leaves 1 bits only in those non-matching bytes */ + span_word ^= * (PERL_UINTMAX_T *) s; + + /* Make sure the upper bit of each non-matching byte is set. This + * makes each such byte look like an ASCII platform variant byte */ + span_word |= span_word << 1; + span_word |= span_word << 2; + span_word |= span_word << 4; + + /* That reduces the problem to what this function solves */ + return s + _variant_byte_number(span_word); + + } while (s + PERL_WORDSIZE <= send); + } + + /* Process the straggler bytes beyond the final word boundary */ + while (s < send) { + if (*s != span_byte) { + return s; + } + s++; + } + + return s; +} + +STATIC char * +S_find_next_masked(char * s, const char * send, const U8 byte, const U8 mask) +{ + /* Returns the position of the first byte in the sequence between 's' + * and 'send-1' inclusive that when ANDed with 'mask' yields 'byte'; + * returns 'send' if none found. 
It uses word-level operations instead of + * byte to speed up the process */ + + PERL_ARGS_ASSERT_FIND_NEXT_MASKED; + + assert(send >= s); + assert((byte & mask) == byte); + + if ((STRLEN) (send - s) >= PERL_WORDSIZE + + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s) + - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK)) + { + PERL_UINTMAX_T word_complemented, mask_word; + + while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) { + if (((* (U8 *) s) & mask) == byte) { + return s; + } + s++; + } + + word_complemented = ~ (PERL_COUNT_MULTIPLIER * byte); + mask_word = PERL_COUNT_MULTIPLIER * mask; + + do { + PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word; + + /* If 'masked' contains 'byte' within it, anding with the + * complement will leave those 8 bits 0 */ + masked &= word_complemented; + + /* This causes the most significant bit to be set to 1 for any + * bytes in the word that aren't completely 0 */ + masked |= masked << 1; + masked |= masked << 2; + masked |= masked << 4; + + /* The msbits are the same as what marks a byte as variant, so we + * can use this mask. If all msbits are 1, the word doesn't + * contain 'byte' */ + if ((masked & PERL_VARIANTS_WORD_MASK) == PERL_VARIANTS_WORD_MASK) { + s += PERL_WORDSIZE; + continue; + } + + /* Here, the msbit of bytes in the word that aren't 'byte' are 1, + * and any that are, are 0. Complement and re-AND to swap that */ + masked = ~ masked; + masked &= PERL_VARIANTS_WORD_MASK; + + /* This reduces the problem to that solved by this function */ + s += _variant_byte_number(masked); + return s; + + } while (s + PERL_WORDSIZE <= send); + } + + while (s < send) { + if (((* (U8 *) s) & mask) == byte) { + return s; + } + s++; + } + + return s; +} + +STATIC U8 * +S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask) +{ + /* Returns the position of the first byte in the sequence between 's' and + * 'send-1' inclusive that when ANDed with 'mask' isn't 'span_byte'. 
+ * 'span_byte' should have been ANDed with 'mask' in the call of this + * function. Returns 'send' if none found. Works like find_span_end(), + * except for the AND */ + + PERL_ARGS_ASSERT_FIND_SPAN_END_MASK; + + assert(send >= s); + assert((span_byte & mask) == span_byte); + + if ((STRLEN) (send - s) >= PERL_WORDSIZE + + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s) + - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK)) + { + PERL_UINTMAX_T span_word, mask_word; + + while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) { + if (((* (U8 *) s) & mask) != span_byte) { + return s; + } + s++; + } + + span_word = PERL_COUNT_MULTIPLIER * span_byte; + mask_word = PERL_COUNT_MULTIPLIER * mask; + + do { + PERL_UINTMAX_T masked = (* (PERL_UINTMAX_T *) s) & mask_word; + + if (masked == span_word) { + s += PERL_WORDSIZE; + continue; + } + + masked ^= span_word; + masked |= masked << 1; + masked |= masked << 2; + masked |= masked << 4; + return s + _variant_byte_number(masked); + + } while (s + PERL_WORDSIZE <= send); + } + + while (s < send) { + if (((* (U8 *) s) & mask) != span_byte) { + return s; + } + s++; + } + + return s; +} + /* * pregexec and friends */ @@ -2062,6 +2256,12 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } break; + case ANYOFM: /* ARG() is the base byte; FLAGS() the mask byte */ + /* UTF-8ness doesn't matter, so use 0 */ + REXEC_FBC_FIND_NEXT_SCAN(0, + find_next_masked(s, strend, ARG(c), FLAGS(c))); + break; + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ assert(! is_utf8_pat); /* FALLTHROUGH */ @@ -6537,6 +6737,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } break; + case ANYOFM: + if (NEXTCHR_IS_EOS || (UCHARAT(locinput) & FLAGS(scan)) != ARG(scan)) { + sayNO; + } + locinput++; + break; + case ASCII: if (NEXTCHR_IS_EOS || ! 
isASCII(UCHARAT(locinput))) { sayNO; @@ -9003,8 +9210,10 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, hardcount++; } } else { - while (scan < loceol && *scan != '\n') - scan++; + scan = (char *) memchr(scan, '\n', loceol - scan); + if (! scan) { + scan = loceol; + } } break; case SANY: @@ -9028,7 +9237,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, c = (U8)*STRING(p); - /* Can use a simple loop if the pattern char to match on is invariant + /* Can use a simple find if the pattern char to match on is invariant * under UTF-8, or both target and pattern aren't UTF-8. Note that we * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's * true iff it doesn't matter if the argument is in UTF-8 or not */ @@ -9038,9 +9247,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, * since here, to match at all, 1 char == 1 byte */ loceol = scan + max; } - while (scan < loceol && UCHARAT(scan) == c) { - scan++; - } + scan = find_span_end(scan, loceol, (U8) c); } else if (reginfo->is_utf8_pat) { if (utf8_target) { @@ -9060,11 +9267,9 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, else if (! UTF8_IS_ABOVE_LATIN1(c)) { /* Target isn't utf8; convert the character in the UTF-8 - * pattern to non-UTF8, and do a simple loop */ + * pattern to non-UTF8, and do a simple find */ c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1)); - while (scan < loceol && UCHARAT(scan) == c) { - scan++; - } + scan = find_span_end(scan, loceol, (U8) c); } /* else pattern char is above Latin1, can't possibly match the non-UTF-8 target */ } @@ -9162,9 +9367,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } } else if (c1 == c2) { - while (scan < loceol && UCHARAT(scan) == c1) { - scan++; - } + scan = find_span_end(scan, loceol, c1); } else { /* See comments in regmatch() CURLY_B_min_known_fail. 
We avoid @@ -9173,14 +9376,12 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, U8 c1_c2_bits_differing = c1 ^ c2; if (isPOWER_OF_2(c1_c2_bits_differing)) { - U8 c1_masked = c1 & ~ c1_c2_bits_differing; U8 c1_c2_mask = ~ c1_c2_bits_differing; - while ( scan < loceol - && (UCHARAT(scan) & c1_c2_mask) == c1_masked) - { - scan++; - } + scan = (char *) find_span_end_mask((U8 *) scan, + (U8 *) loceol, + c1 & c1_c2_mask, + c1_c2_mask); } else { while ( scan < loceol @@ -9222,7 +9423,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } break; - case ASCII: + case ANYOFM: if (utf8_target && loceol - scan > max) { /* We didn't adjust <loceol> at the beginning of this routine @@ -9231,6 +9432,14 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, loceol = scan + max; } + scan = (char *) find_span_end_mask((U8 *) scan, (U8 *) loceol, (U8) ARG(p), FLAGS(p)); + break; + + case ASCII: + if (utf8_target && loceol - scan > max) { + loceol = scan + max; + } + scan = find_next_non_ascii(scan, loceol, utf8_target); break; diff --git a/regnodes.h b/regnodes.h index f76aab4cc0..855a215650 100644 --- a/regnodes.h +++ b/regnodes.h @@ -6,8 +6,8 @@ /* Regops and State definitions */ -#define REGNODE_MAX 96 -#define REGMATCH_STATE_MAX 138 +#define REGNODE_MAX 97 +#define REGMATCH_STATE_MAX 139 #define END 0 /* 0000 End of program. */ #define SUCCEED 1 /* 0x01 Return from a subroutine, basically. 
*/ @@ -32,82 +32,83 @@ #define ANYOF 18 /* 0x12 Match character in (or not in) this class, single char match only */ #define ANYOFD 19 /* 0x13 Like ANYOF, but /d is in effect */ #define ANYOFL 20 /* 0x14 Like ANYOF, but /l is in effect */ -#define POSIXD 21 /* 0x15 Some [[:class:]] under /d; the FLAGS field gives which one */ -#define POSIXL 22 /* 0x16 Some [[:class:]] under /l; the FLAGS field gives which one */ -#define POSIXU 23 /* 0x17 Some [[:class:]] under /u; the FLAGS field gives which one */ -#define POSIXA 24 /* 0x18 Some [[:class:]] under /a; the FLAGS field gives which one */ -#define NPOSIXD 25 /* 0x19 complement of POSIXD, [[:^class:]] */ -#define NPOSIXL 26 /* 0x1a complement of POSIXL, [[:^class:]] */ -#define NPOSIXU 27 /* 0x1b complement of POSIXU, [[:^class:]] */ -#define NPOSIXA 28 /* 0x1c complement of POSIXA, [[:^class:]] */ -#define ASCII 29 /* 0x1d [[:ascii:]] */ -#define NASCII 30 /* 0x1e [[:^ascii:]] */ -#define CLUMP 31 /* 0x1f Match any extended grapheme cluster sequence */ -#define BRANCH 32 /* 0x20 Match this alternative, or the next... */ -#define EXACT 33 /* 0x21 Match this string (preceded by length). */ -#define EXACTL 34 /* 0x22 Like EXACT, but /l is in effect (used so locale-related warnings can be checked for). */ -#define EXACTF 35 /* 0x23 Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len). */ -#define EXACTFL 36 /* 0x24 Match this string (not guaranteed to be folded) using /il rules (w/len). */ -#define EXACTFU 37 /* 0x25 Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). */ -#define EXACTFA 38 /* 0x26 Match this string (not guaranteed to be folded) using /iaa rules (w/len). */ -#define EXACTFU_SS 39 /* 0x27 Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). 
*/ -#define EXACTFLU8 40 /* 0x28 Rare cirucmstances: like EXACTFU, but is under /l, UTF-8, folded, and everything in it is above 255. */ -#define EXACTFA_NO_TRIE 41 /* 0x29 Match this string (which is not trie-able; not guaranteed to be folded) using /iaa rules (w/len). */ -#define NOTHING 42 /* 0x2a Match empty string. */ -#define TAIL 43 /* 0x2b Match empty string. Can jump here from outside. */ -#define STAR 44 /* 0x2c Match this (simple) thing 0 or more times. */ -#define PLUS 45 /* 0x2d Match this (simple) thing 1 or more times. */ -#define CURLY 46 /* 0x2e Match this simple thing {n,m} times. */ -#define CURLYN 47 /* 0x2f Capture next-after-this simple thing */ -#define CURLYM 48 /* 0x30 Capture this medium-complex thing {n,m} times. */ -#define CURLYX 49 /* 0x31 Match this complex thing {n,m} times. */ -#define WHILEM 50 /* 0x32 Do curly processing and see if rest matches. */ -#define OPEN 51 /* 0x33 Mark this point in input as start of #n. */ -#define CLOSE 52 /* 0x34 Close corresponding OPEN of #n. */ -#define SROPEN 53 /* 0x35 Same as OPEN, but for script run */ -#define SRCLOSE 54 /* 0x36 Close preceding SROPEN */ -#define REF 55 /* 0x37 Match some already matched string */ -#define REFF 56 /* 0x38 Match already matched string, folded using native charset rules for non-utf8 */ -#define REFFL 57 /* 0x39 Match already matched string, folded in loc. */ -#define REFFU 58 /* 0x3a Match already matched string, folded using unicode rules for non-utf8 */ -#define REFFA 59 /* 0x3b Match already matched string, folded using unicode rules for non-utf8, no mixing ASCII, non-ASCII */ -#define NREF 60 /* 0x3c Match some already matched string */ -#define NREFF 61 /* 0x3d Match already matched string, folded using native charset rules for non-utf8 */ -#define NREFFL 62 /* 0x3e Match already matched string, folded in loc. 
*/ -#define NREFFU 63 /* 0x3f Match already matched string, folded using unicode rules for non-utf8 */ -#define NREFFA 64 /* 0x40 Match already matched string, folded using unicode rules for non-utf8, no mixing ASCII, non-ASCII */ -#define LONGJMP 65 /* 0x41 Jump far away. */ -#define BRANCHJ 66 /* 0x42 BRANCH with long offset. */ -#define IFMATCH 67 /* 0x43 Succeeds if the following matches. */ -#define UNLESSM 68 /* 0x44 Fails if the following matches. */ -#define SUSPEND 69 /* 0x45 "Independent" sub-RE. */ -#define IFTHEN 70 /* 0x46 Switch, should be preceded by switcher. */ -#define GROUPP 71 /* 0x47 Whether the group matched. */ -#define EVAL 72 /* 0x48 Execute some Perl code. */ -#define MINMOD 73 /* 0x49 Next operator is not greedy. */ -#define LOGICAL 74 /* 0x4a Next opcode should set the flag only. */ -#define RENUM 75 /* 0x4b Group with independently numbered parens. */ -#define TRIE 76 /* 0x4c Match many EXACT(F[ALU]?)? at once. flags==type */ -#define TRIEC 77 /* 0x4d Same as TRIE, but with embedded charclass data */ -#define AHOCORASICK 78 /* 0x4e Aho Corasick stclass. flags==type */ -#define AHOCORASICKC 79 /* 0x4f Same as AHOCORASICK, but with embedded charclass data */ -#define GOSUB 80 /* 0x50 recurse to paren arg1 at (signed) ofs arg2 */ -#define NGROUPP 81 /* 0x51 Whether the group matched. */ -#define INSUBP 82 /* 0x52 Whether we are in a specific recurse. */ -#define DEFINEP 83 /* 0x53 Never execute directly. */ -#define ENDLIKE 84 /* 0x54 Used only for the type field of verbs */ -#define OPFAIL 85 /* 0x55 Same as (?!), but with verb arg */ -#define ACCEPT 86 /* 0x56 Accepts the current matched string, with verbar */ -#define VERB 87 /* 0x57 Used only for the type field of verbs */ -#define PRUNE 88 /* 0x58 Pattern fails at this startpoint if no-backtracking through this */ -#define MARKPOINT 89 /* 0x59 Push the current location for rollback by cut. 
*/ -#define SKIP 90 /* 0x5a On failure skip forward (to the mark) before retrying */ -#define COMMIT 91 /* 0x5b Pattern fails outright if backtracking through this */ -#define CUTGROUP 92 /* 0x5c On failure go to the next alternation in the group */ -#define KEEPS 93 /* 0x5d $& begins here. */ -#define LNBREAK 94 /* 0x5e generic newline pattern */ -#define OPTIMIZED 95 /* 0x5f Placeholder for dump. */ -#define PSEUDO 96 /* 0x60 Pseudo opcode for internal use. */ +#define ANYOFM 21 /* 0x15 Like ANYOF, but matches an invariant byte as determined by the mask and arg */ +#define POSIXD 22 /* 0x16 Some [[:class:]] under /d; the FLAGS field gives which one */ +#define POSIXL 23 /* 0x17 Some [[:class:]] under /l; the FLAGS field gives which one */ +#define POSIXU 24 /* 0x18 Some [[:class:]] under /u; the FLAGS field gives which one */ +#define POSIXA 25 /* 0x19 Some [[:class:]] under /a; the FLAGS field gives which one */ +#define NPOSIXD 26 /* 0x1a complement of POSIXD, [[:^class:]] */ +#define NPOSIXL 27 /* 0x1b complement of POSIXL, [[:^class:]] */ +#define NPOSIXU 28 /* 0x1c complement of POSIXU, [[:^class:]] */ +#define NPOSIXA 29 /* 0x1d complement of POSIXA, [[:^class:]] */ +#define ASCII 30 /* 0x1e [[:ascii:]] */ +#define NASCII 31 /* 0x1f [[:^ascii:]] */ +#define CLUMP 32 /* 0x20 Match any extended grapheme cluster sequence */ +#define BRANCH 33 /* 0x21 Match this alternative, or the next... */ +#define EXACT 34 /* 0x22 Match this string (preceded by length). */ +#define EXACTL 35 /* 0x23 Like EXACT, but /l is in effect (used so locale-related warnings can be checked for). */ +#define EXACTF 36 /* 0x24 Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len). */ +#define EXACTFL 37 /* 0x25 Match this string (not guaranteed to be folded) using /il rules (w/len). */ +#define EXACTFU 38 /* 0x26 Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). 
*/ +#define EXACTFA 39 /* 0x27 Match this string (not guaranteed to be folded) using /iaa rules (w/len). */ +#define EXACTFU_SS 40 /* 0x28 Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). */ +#define EXACTFLU8 41 /* 0x29 Rare cirucmstances: like EXACTFU, but is under /l, UTF-8, folded, and everything in it is above 255. */ +#define EXACTFA_NO_TRIE 42 /* 0x2a Match this string (which is not trie-able; not guaranteed to be folded) using /iaa rules (w/len). */ +#define NOTHING 43 /* 0x2b Match empty string. */ +#define TAIL 44 /* 0x2c Match empty string. Can jump here from outside. */ +#define STAR 45 /* 0x2d Match this (simple) thing 0 or more times. */ +#define PLUS 46 /* 0x2e Match this (simple) thing 1 or more times. */ +#define CURLY 47 /* 0x2f Match this simple thing {n,m} times. */ +#define CURLYN 48 /* 0x30 Capture next-after-this simple thing */ +#define CURLYM 49 /* 0x31 Capture this medium-complex thing {n,m} times. */ +#define CURLYX 50 /* 0x32 Match this complex thing {n,m} times. */ +#define WHILEM 51 /* 0x33 Do curly processing and see if rest matches. */ +#define OPEN 52 /* 0x34 Mark this point in input as start of #n. */ +#define CLOSE 53 /* 0x35 Close corresponding OPEN of #n. */ +#define SROPEN 54 /* 0x36 Same as OPEN, but for script run */ +#define SRCLOSE 55 /* 0x37 Close preceding SROPEN */ +#define REF 56 /* 0x38 Match some already matched string */ +#define REFF 57 /* 0x39 Match already matched string, folded using native charset rules for non-utf8 */ +#define REFFL 58 /* 0x3a Match already matched string, folded in loc. 
*/ +#define REFFU 59 /* 0x3b Match already matched string, folded using unicode rules for non-utf8 */ +#define REFFA 60 /* 0x3c Match already matched string, folded using unicode rules for non-utf8, no mixing ASCII, non-ASCII */ +#define NREF 61 /* 0x3d Match some already matched string */ +#define NREFF 62 /* 0x3e Match already matched string, folded using native charset rules for non-utf8 */ +#define NREFFL 63 /* 0x3f Match already matched string, folded in loc. */ +#define NREFFU 64 /* 0x40 Match already matched string, folded using unicode rules for non-utf8 */ +#define NREFFA 65 /* 0x41 Match already matched string, folded using unicode rules for non-utf8, no mixing ASCII, non-ASCII */ +#define LONGJMP 66 /* 0x42 Jump far away. */ +#define BRANCHJ 67 /* 0x43 BRANCH with long offset. */ +#define IFMATCH 68 /* 0x44 Succeeds if the following matches. */ +#define UNLESSM 69 /* 0x45 Fails if the following matches. */ +#define SUSPEND 70 /* 0x46 "Independent" sub-RE. */ +#define IFTHEN 71 /* 0x47 Switch, should be preceded by switcher. */ +#define GROUPP 72 /* 0x48 Whether the group matched. */ +#define EVAL 73 /* 0x49 Execute some Perl code. */ +#define MINMOD 74 /* 0x4a Next operator is not greedy. */ +#define LOGICAL 75 /* 0x4b Next opcode should set the flag only. */ +#define RENUM 76 /* 0x4c Group with independently numbered parens. */ +#define TRIE 77 /* 0x4d Match many EXACT(F[ALU]?)? at once. flags==type */ +#define TRIEC 78 /* 0x4e Same as TRIE, but with embedded charclass data */ +#define AHOCORASICK 79 /* 0x4f Aho Corasick stclass. flags==type */ +#define AHOCORASICKC 80 /* 0x50 Same as AHOCORASICK, but with embedded charclass data */ +#define GOSUB 81 /* 0x51 recurse to paren arg1 at (signed) ofs arg2 */ +#define NGROUPP 82 /* 0x52 Whether the group matched. */ +#define INSUBP 83 /* 0x53 Whether we are in a specific recurse. */ +#define DEFINEP 84 /* 0x54 Never execute directly. 
*/ +#define ENDLIKE 85 /* 0x55 Used only for the type field of verbs */ +#define OPFAIL 86 /* 0x56 Same as (?!), but with verb arg */ +#define ACCEPT 87 /* 0x57 Accepts the current matched string, with verbar */ +#define VERB 88 /* 0x58 Used only for the type field of verbs */ +#define PRUNE 89 /* 0x59 Pattern fails at this startpoint if no-backtracking through this */ +#define MARKPOINT 90 /* 0x5a Push the current location for rollback by cut. */ +#define SKIP 91 /* 0x5b On failure skip forward (to the mark) before retrying */ +#define COMMIT 92 /* 0x5c Pattern fails outright if backtracking through this */ +#define CUTGROUP 93 /* 0x5d On failure go to the next alternation in the group */ +#define KEEPS 94 /* 0x5e $& begins here. */ +#define LNBREAK 95 /* 0x5f generic newline pattern */ +#define OPTIMIZED 96 /* 0x60 Placeholder for dump. */ +#define PSEUDO 97 /* 0x61 Pseudo opcode for internal use. */ /* ------------ States ------------- */ #define TRIE_next (REGNODE_MAX + 1) /* state for TRIE */ #define TRIE_next_fail (REGNODE_MAX + 2) /* state for TRIE */ @@ -179,6 +180,7 @@ EXTCONST U8 PL_regkind[] = { ANYOF, /* ANYOF */ ANYOF, /* ANYOFD */ ANYOF, /* ANYOFL */ + ANYOFM, /* ANYOFM */ POSIXD, /* POSIXD */ POSIXD, /* POSIXL */ POSIXD, /* POSIXU */ @@ -327,6 +329,7 @@ static const U8 regarglen[] = { EXTRA_SIZE(struct regnode_1), /* ANYOF */ EXTRA_SIZE(struct regnode_1), /* ANYOFD */ EXTRA_SIZE(struct regnode_1), /* ANYOFL */ + EXTRA_SIZE(struct regnode_1), /* ANYOFM */ 0, /* POSIXD */ 0, /* POSIXL */ 0, /* POSIXU */ @@ -429,6 +432,7 @@ static const char reg_off_by_arg[] = { 0, /* ANYOF */ 0, /* ANYOFD */ 0, /* ANYOFL */ + 0, /* ANYOFM */ 0, /* POSIXD */ 0, /* POSIXL */ 0, /* POSIXU */ @@ -537,82 +541,83 @@ EXTCONST char * const PL_reg_name[] = { "ANYOF", /* 0x12 */ "ANYOFD", /* 0x13 */ "ANYOFL", /* 0x14 */ - "POSIXD", /* 0x15 */ - "POSIXL", /* 0x16 */ - "POSIXU", /* 0x17 */ - "POSIXA", /* 0x18 */ - "NPOSIXD", /* 0x19 */ - "NPOSIXL", /* 0x1a */ - "NPOSIXU", /* 0x1b 
*/ - "NPOSIXA", /* 0x1c */ - "ASCII", /* 0x1d */ - "NASCII", /* 0x1e */ - "CLUMP", /* 0x1f */ - "BRANCH", /* 0x20 */ - "EXACT", /* 0x21 */ - "EXACTL", /* 0x22 */ - "EXACTF", /* 0x23 */ - "EXACTFL", /* 0x24 */ - "EXACTFU", /* 0x25 */ - "EXACTFA", /* 0x26 */ - "EXACTFU_SS", /* 0x27 */ - "EXACTFLU8", /* 0x28 */ - "EXACTFA_NO_TRIE", /* 0x29 */ - "NOTHING", /* 0x2a */ - "TAIL", /* 0x2b */ - "STAR", /* 0x2c */ - "PLUS", /* 0x2d */ - "CURLY", /* 0x2e */ - "CURLYN", /* 0x2f */ - "CURLYM", /* 0x30 */ - "CURLYX", /* 0x31 */ - "WHILEM", /* 0x32 */ - "OPEN", /* 0x33 */ - "CLOSE", /* 0x34 */ - "SROPEN", /* 0x35 */ - "SRCLOSE", /* 0x36 */ - "REF", /* 0x37 */ - "REFF", /* 0x38 */ - "REFFL", /* 0x39 */ - "REFFU", /* 0x3a */ - "REFFA", /* 0x3b */ - "NREF", /* 0x3c */ - "NREFF", /* 0x3d */ - "NREFFL", /* 0x3e */ - "NREFFU", /* 0x3f */ - "NREFFA", /* 0x40 */ - "LONGJMP", /* 0x41 */ - "BRANCHJ", /* 0x42 */ - "IFMATCH", /* 0x43 */ - "UNLESSM", /* 0x44 */ - "SUSPEND", /* 0x45 */ - "IFTHEN", /* 0x46 */ - "GROUPP", /* 0x47 */ - "EVAL", /* 0x48 */ - "MINMOD", /* 0x49 */ - "LOGICAL", /* 0x4a */ - "RENUM", /* 0x4b */ - "TRIE", /* 0x4c */ - "TRIEC", /* 0x4d */ - "AHOCORASICK", /* 0x4e */ - "AHOCORASICKC", /* 0x4f */ - "GOSUB", /* 0x50 */ - "NGROUPP", /* 0x51 */ - "INSUBP", /* 0x52 */ - "DEFINEP", /* 0x53 */ - "ENDLIKE", /* 0x54 */ - "OPFAIL", /* 0x55 */ - "ACCEPT", /* 0x56 */ - "VERB", /* 0x57 */ - "PRUNE", /* 0x58 */ - "MARKPOINT", /* 0x59 */ - "SKIP", /* 0x5a */ - "COMMIT", /* 0x5b */ - "CUTGROUP", /* 0x5c */ - "KEEPS", /* 0x5d */ - "LNBREAK", /* 0x5e */ - "OPTIMIZED", /* 0x5f */ - "PSEUDO", /* 0x60 */ + "ANYOFM", /* 0x15 */ + "POSIXD", /* 0x16 */ + "POSIXL", /* 0x17 */ + "POSIXU", /* 0x18 */ + "POSIXA", /* 0x19 */ + "NPOSIXD", /* 0x1a */ + "NPOSIXL", /* 0x1b */ + "NPOSIXU", /* 0x1c */ + "NPOSIXA", /* 0x1d */ + "ASCII", /* 0x1e */ + "NASCII", /* 0x1f */ + "CLUMP", /* 0x20 */ + "BRANCH", /* 0x21 */ + "EXACT", /* 0x22 */ + "EXACTL", /* 0x23 */ + "EXACTF", /* 0x24 */ + "EXACTFL", /* 0x25 */ + 
"EXACTFU", /* 0x26 */ + "EXACTFA", /* 0x27 */ + "EXACTFU_SS", /* 0x28 */ + "EXACTFLU8", /* 0x29 */ + "EXACTFA_NO_TRIE", /* 0x2a */ + "NOTHING", /* 0x2b */ + "TAIL", /* 0x2c */ + "STAR", /* 0x2d */ + "PLUS", /* 0x2e */ + "CURLY", /* 0x2f */ + "CURLYN", /* 0x30 */ + "CURLYM", /* 0x31 */ + "CURLYX", /* 0x32 */ + "WHILEM", /* 0x33 */ + "OPEN", /* 0x34 */ + "CLOSE", /* 0x35 */ + "SROPEN", /* 0x36 */ + "SRCLOSE", /* 0x37 */ + "REF", /* 0x38 */ + "REFF", /* 0x39 */ + "REFFL", /* 0x3a */ + "REFFU", /* 0x3b */ + "REFFA", /* 0x3c */ + "NREF", /* 0x3d */ + "NREFF", /* 0x3e */ + "NREFFL", /* 0x3f */ + "NREFFU", /* 0x40 */ + "NREFFA", /* 0x41 */ + "LONGJMP", /* 0x42 */ + "BRANCHJ", /* 0x43 */ + "IFMATCH", /* 0x44 */ + "UNLESSM", /* 0x45 */ + "SUSPEND", /* 0x46 */ + "IFTHEN", /* 0x47 */ + "GROUPP", /* 0x48 */ + "EVAL", /* 0x49 */ + "MINMOD", /* 0x4a */ + "LOGICAL", /* 0x4b */ + "RENUM", /* 0x4c */ + "TRIE", /* 0x4d */ + "TRIEC", /* 0x4e */ + "AHOCORASICK", /* 0x4f */ + "AHOCORASICKC", /* 0x50 */ + "GOSUB", /* 0x51 */ + "NGROUPP", /* 0x52 */ + "INSUBP", /* 0x53 */ + "DEFINEP", /* 0x54 */ + "ENDLIKE", /* 0x55 */ + "OPFAIL", /* 0x56 */ + "ACCEPT", /* 0x57 */ + "VERB", /* 0x58 */ + "PRUNE", /* 0x59 */ + "MARKPOINT", /* 0x5a */ + "SKIP", /* 0x5b */ + "COMMIT", /* 0x5c */ + "CUTGROUP", /* 0x5d */ + "KEEPS", /* 0x5e */ + "LNBREAK", /* 0x5f */ + "OPTIMIZED", /* 0x60 */ + "PSEUDO", /* 0x61 */ /* ------------ States ------------- */ "TRIE_next", /* REGNODE_MAX +0x01 */ "TRIE_next_fail", /* REGNODE_MAX +0x02 */ @@ -749,7 +754,7 @@ EXTCONST U8 PL_varies[] __attribute__deprecated__ = { EXTCONST U8 PL_varies_bitmask[]; #else EXTCONST U8 PL_varies_bitmask[] = { - 0x00, 0x00, 0x00, 0x80, 0x01, 0xF0, 0x87, 0xFF, 0x65, 0x00, 0x00, 0x00, 0x00 + 0x00, 0x00, 0x00, 0x00, 0x03, 0xE0, 0x0F, 0xFF, 0xCB, 0x00, 0x00, 0x00, 0x00 }; #endif /* DOINIT */ @@ -761,8 +766,8 @@ EXTCONST U8 PL_varies_bitmask[] = { EXTCONST U8 PL_simple[] __attribute__deprecated__; #else EXTCONST U8 PL_simple[] 
__attribute__deprecated__ = { - REG_ANY, SANY, ANYOF, ANYOFD, ANYOFL, POSIXD, POSIXL, POSIXU, POSIXA, - NPOSIXD, NPOSIXL, NPOSIXU, NPOSIXA, ASCII, NASCII, + REG_ANY, SANY, ANYOF, ANYOFD, ANYOFL, ANYOFM, POSIXD, POSIXL, POSIXU, + POSIXA, NPOSIXD, NPOSIXL, NPOSIXU, NPOSIXA, ASCII, NASCII, 0 }; #endif /* DOINIT */ @@ -771,7 +776,7 @@ EXTCONST U8 PL_simple[] __attribute__deprecated__ = { EXTCONST U8 PL_simple_bitmask[]; #else EXTCONST U8 PL_simple_bitmask[] = { - 0x00, 0x00, 0xFF, 0x7F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + 0x00, 0x00, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; #endif /* DOINIT */ diff --git a/t/re/anyof.t b/t/re/anyof.t index d24e4a71a8..12fb9b3a8c 100644 --- a/t/re/anyof.t +++ b/t/re/anyof.t @@ -31,7 +31,7 @@ BEGIN { # skipped and not skipped. my @tests = ( - '[[{]' => 'ANYOF[\[\{]', + '[[{]' => 'ANYOFM[\[\{]', '[^\S ]' => 'ANYOFD[\t\n\x0B\f\r{utf8}\x85\xA0][1680 2000-200A 2028-2029 202F 205F 3000]', '[^\n\r]' => 'ANYOF[^\n\r][0100-INFINITY]', '[^\/\|,\$\%%\@\ \%"\<\>\:\#\&\*\{\}\[\]\(\)]' => 'ANYOF[^ "#$%&()*,/:<>@\[\]\{|\}][0100-INFINITY]', -- Perl5 Master Repository
