In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/a0bd1a30d379f2625c307657d63fc50173d7a56d?hp=e0be3f061825dcf8ffbb0e0581a77e09e74ab8fc>
- Log ----------------------------------------------------------------- commit a0bd1a30d379f2625c307657d63fc50173d7a56d Author: Karl Williamson <[email protected]> Date: Sun Aug 23 10:30:02 2015 -0600 Make qr/(?[ ])/ work in UTF-8 locales Previously use of this under /l regex rules was a compile time error. Now it works like \b{wb} and \b{sb}, which compile under locale rules and always work like Unicode says they should. A UTF-8 locale implies Unicode rules, and the goal is for it to work seamlessly with the rest of perl. This construct was the only one I am aware of that didn't work seamlessly (not counting OS interfaces) under UTF-8 LC_CTYPE locales. For all three of these constructs, use with a non-UTF-8 runtime locale raises a warning, and Unicode rules are used anyway. UTF-8 locale collation still has problems, but this is low priority to fix, as it's a lot of work, and if one really cares, one should be using Unicode::Collate. M pod/perldelta.pod M pod/perldiag.pod M pod/perlrecharclass.pod M regcomp.c M regcomp.h M regexec.c M t/lib/warnings/regexec M t/re/reg_mesg.t M t/re/regex_sets.t commit 2d3d6e6e7c2d50b1cc47032cf089151823fb20a6 Author: Karl Williamson <[email protected]> Date: Sun Aug 23 10:25:16 2015 -0600 regcomp.c: Add a parameter to static function This will be used by the next commit M embed.fnc M embed.h M proto.h M regcomp.c commit f240c685c914970dc8ffec926f02d6048831bc09 Author: Karl Williamson <[email protected]> Date: Fri Aug 21 22:21:57 2015 -0600 regcomp.h: Fold 2 ANYOF flags into a single one The ANYOF_FLAGS bits are all used up, but a future commit wants one. This commit frees up a bit by sharing two of the existing comparatively-rarely-used ones. One bit is used only under /d matching rules, while the other is used only when not under /d. Only the latter bit is used in synthetic start classes. The previous commit introduced an ANYOFD node type corresponding to /d. An SSC never is this type. Thus, the bits have mutually exclusive meanings, and we can use the node type to distinguish between the two meanings of the combined bit. An alternative implementation would have been to use the ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES non-/d bit instead of the one chosen. But this is used more frequently, so the disambiguation would have been exercised more frequently, slowing execution down ever so slightly; more importantly, this one required fewer code changes, by a slight amount. M regcomp.c M regcomp.h M regexec.c commit ac44c12e0b8cc8431bb165c203dcf56d2659302c Author: Karl Williamson <[email protected]> Date: Fri Aug 21 13:06:53 2015 -0600 Add ANYOFD regex node This is like an ANYOF node, but just for when /d is in effect. It will be used in future commits M pod/perldebguts.pod M regcomp.c M regcomp.sym M regexec.c M regnodes.h commit b24abbc803191b400f0d0ab41db2f184860e7534 Author: Karl Williamson <[email protected]> Date: Fri Aug 21 09:54:05 2015 -0600 perldebguts: Add clarification M pod/perldebguts.pod M regcomp.sym M regnodes.h ----------------------------------------------------------------------- Summary of changes: embed.fnc | 1 + embed.h | 2 +- pod/perldebguts.pod | 4 +- pod/perldelta.pod | 9 ++ pod/perldiag.pod | 11 +- pod/perlrecharclass.pod | 8 +- proto.h | 2 +- regcomp.c | 105 +++++++++++++--- regcomp.h | 23 ++-- regcomp.sym | 3 +- regexec.c | 30 ++++- regnodes.h | 313 ++++++++++++++++++++++++------------------------ t/lib/warnings/regexec | 47 ++++++++ t/re/reg_mesg.t | 2 - t/re/regex_sets.t | 41 ++++++- 15 files changed, 406 insertions(+), 195 deletions(-) diff --git a/embed.fnc b/embed.fnc index 12c0551..1be276f 100644 --- a/embed.fnc +++ b/embed.fnc @@ -2130,6 +2130,7 @@ Es |regnode*|regclass |NN RExC_state_t *pRExC_state \ |bool allow_multi_fold \ |const bool silence_non_portable \ |const bool strict \ + |const bool optimizable \ |NULLOK SV** ret_invlist Es |void|add_above_Latin1_folds|NN RExC_state_t *pRExC_state|const U8 cp \ |NN SV** invlist diff --git a/embed.h b/embed.h index 0611ea9..faa4112 100644 --- a/embed.h +++ b/embed.h @@ -996,7 +996,7 @@ #define reganode(a,b,c) S_reganode(aTHX_ a,b,c) #define regatom(a,b,c) S_regatom(aTHX_ a,b,c) #define regbranch(a,b,c,d) S_regbranch(aTHX_ a,b,c,d) -#define regclass(a,b,c,d,e,f,g,h) S_regclass(aTHX_ a,b,c,d,e,f,g,h) +#define regclass(a,b,c,d,e,f,g,h,i) S_regclass(aTHX_ a,b,c,d,e,f,g,h,i) #define regex_set_precedence S_regex_set_precedence #define reginsert(a,b,c,d) S_reginsert(aTHX_ a,b,c,d) #define regnode_guts(a,b,c,d) S_regnode_guts(aTHX_ a,b,c,d) diff --git a/pod/perldebguts.pod b/pod/perldebguts.pod index 064af64..eb0a6ca 100644 --- a/pod/perldebguts.pod +++ b/pod/perldebguts.pod @@ -594,6 +594,7 @@ will be lost. SANY no Match any one character. ANYOF sv 1 Match character in (or not in) this class, single char match only + ANYOFD sv 1 Like ANYOF, but /d is in effect ANYOFL sv 1 Like ANYOF, but /l is in effect # POSIX Character Classes: @@ -628,7 +629,8 @@ will be lost. # Literals EXACT str Match this string (preceded by length). - EXACTL str Like EXACT, but /l is in effect. + EXACTL str Like EXACT, but /l is in effect (used so + locale-related warnings can be checked for). EXACTF str Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len). EXACTFL str Match this string (not guaranteed to be diff --git a/pod/perldelta.pod b/pod/perldelta.pod index d08581a..aafbd1c 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -27,6 +27,15 @@ here, but most should go in the L</Performance Enhancements> section. [ List each enhancement as a =head2 entry ] +=head2 C<qr/(?[ ])/> now works in UTF-8 locales + +L<Extended Bracketed Character Classes|perlrecharclass/Extended Bracketed Character Classes> +now will successfully compile when S<C<use locale>> is in effect. The compiled +pattern will use standard Unicode rules. If the runtime locale is not a +UTF-8 one, a warning is raised and standard Unicode rules are used +anyway. No tainting is done since the outcome does not actually depend +on the locale. + =head1 Security XXX Any security-related notices go here. In particular, any security diff --git a/pod/perldiag.pod b/pod/perldiag.pod index 2effeeb..918d35c 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -6610,14 +6610,13 @@ is deprecated. See L<perlvar/"$[">. form if you wish to use an empty line as the terminator of the here-document. -=item Use of \b{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale +=item Use of %s for non-UTF-8 locale is wrong. Assuming a UTF-8 locale (W locale) You are matching a regular expression using locale rules, -and a Unicode boundary is being matched, but the locale is not a Unicode -one. This doesn't make sense. Perl will continue, assuming a Unicode -(UTF-8) locale, but the results could well be wrong except if the locale -happens to be ISO-8859-1 (Latin1) where this message is spurious and can -be ignored. +and the specified construct was encountered. This construct is only +valid for UTF-8 locales, which the current locale isn't. This doesn't +make sense. Perl will continue, assuming a Unicode (UTF-8) locale, but +the results are likely to be wrong. =item Use of /c modifier is meaningless in s/// diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod index ce28771..f46de4c 100644 --- a/pod/perlrecharclass.pod +++ b/pod/perlrecharclass.pod @@ -1106,8 +1106,12 @@ just three limitations: =item 1 -This construct cannot be used within the scope of -C<use locale> (or the C<E<sol>l> regex modifier). +When compiled within the scope of C<use locale> (or the C<E<sol>l> regex +modifier), this construct assumes that the execution-time locale will be +a UTF-8 one, and the generated pattern always uses Unicode rules. What +gets matched or not thus isn't dependent on the actual runtime locale, so +tainting is not enabled. But a C<locale> category warning is raised +if the runtime locale turns out to not be UTF-8. =item 2 diff --git a/proto.h b/proto.h index a3bd488..1ddabd9 100644 --- a/proto.h +++ b/proto.h @@ -4768,7 +4768,7 @@ STATIC regnode* S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth STATIC regnode* S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth); #define PERL_ARGS_ASSERT_REGBRANCH \ assert(pRExC_state); assert(flagp) -STATIC regnode* S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, const bool stop_at_1, bool allow_multi_fold, const bool silence_non_portable, const bool strict, SV** ret_invlist); +STATIC regnode* S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, const bool stop_at_1, bool allow_multi_fold, const bool silence_non_portable, const bool strict, const bool optimiza ... [23 chars truncated] #define PERL_ARGS_ASSERT_REGCLASS \ assert(pRExC_state); assert(flagp) STATIC unsigned int S_regex_set_precedence(const U8 my_operator) diff --git a/regcomp.c b/regcomp.c index 4264274..91e1603 100644 --- a/regcomp.c +++ b/regcomp.c @@ -1177,7 +1177,9 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, /* If this can match all upper Latin1 code points, have to add them * as well */ - if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) { + if (OP(node) == ANYOFD + && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)) + { _invlist_union(invlist, PL_UpperLatin1, &invlist); } @@ -1255,12 +1257,19 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, * that should be; while the consequences for having /l bugs is * incorrect matches */ if (ssc_is_anything((regnode_ssc *)and_with)) { - anded_flags |= ANYOF_WARN_SUPER; + anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; } } else { anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with); - anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS; + if (OP(and_with) == ANYOFD) { + anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS; + } + else { + anded_flags = ANYOF_FLAGS(and_with) + &( ANYOF_COMMON_FLAGS + |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER); + } } ANYOF_FLAGS(ssc) &= anded_flags; @@ -1411,6 +1420,11 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, else { ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with); ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS; + if (OP(or_with) != ANYOFD) { + ored_flags + |= ANYOF_FLAGS(or_with) + & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; + } } ANYOF_FLAGS(ssc) |= ored_flags; @@ -1609,7 +1623,9 @@ S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) /* The code in this file assumes that all but these flags aren't relevant * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared * by the time we reach here */ - assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS)); + assert(! (ANYOF_FLAGS(ssc) + & ~( ANYOF_COMMON_FLAGS + |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))); populate_ANYOF_from_invlist( (regnode *) ssc, &invlist); @@ -5097,6 +5113,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n", } break; + case ANYOFD: case ANYOFL: case ANYOF: if (flags & SCF_DO_STCLASS_AND) @@ -11713,6 +11730,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) TRUE, /* allow multi-char folds */ FALSE, /* don't silence non-portable warnings. */ (bool) RExC_strict, + TRUE, /* Allow an optimized regnode result */ NULL); if (*RExC_parse != ']') { RExC_parse = oregcomp_parse; @@ -12015,6 +12033,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) It would be a bug if these returned non-portables */ (bool) RExC_strict, + TRUE, /* Allow an optimized regnode result */ NULL); /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed. */ @@ -13330,14 +13349,16 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, this function */ const bool save_fold = FOLD; /* Temporary */ char *save_end, *save_parse; /* Temporaries */ + const bool in_locale = LOC; /* we turn off /l during processing */ GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_HANDLE_REGEX_SETS; - if (LOC) { /* XXX could make valid in UTF-8 locales */ - vFAIL("(?[...]) not valid in locale"); + if (in_locale) { + set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); } + RExC_uni_semantics = 1; /* The use of this operator implies /u. This is required so that the compile time values are valid in all runtime cases */ @@ -13393,6 +13414,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, FALSE, /* don't allow multi-char folds */ TRUE, /* silence non-portable warnings. */ TRUE, /* strict */ + FALSE, /* Require return to be an ANYOF */ ¤t )) FAIL2("panic: regclass returned NULL to handle_sets, " @@ -13419,6 +13441,10 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, nextchar(pRExC_state); Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */ + if (in_locale) { + set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET); + } + return node; } goto no_close; @@ -13646,6 +13672,7 @@ redo_curchar: FALSE, /* don't allow multi-char folds */ FALSE, /* don't silence non-portable warnings. */ TRUE, /* strict */ + FALSE, /* Require return to be an ANYOF */ ¤t)) { FAIL2("panic: regclass returned NULL to handle_sets, " @@ -13673,6 +13700,7 @@ redo_curchar: FALSE, /* don't allow multi-char folds */ FALSE, /* don't silence non-portable warnings. */ TRUE, /* strict */ + FALSE, /* Require return to be an ANYOF */ ¤t )) { @@ -13973,14 +14001,42 @@ redo_curchar: well have generated non-portable code points, but they're valid on this machine */ FALSE, /* similarly, no need for strict */ + FALSE, /* Require return to be an ANYOF */ NULL ); if (!node) FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf, PTR2UV(flagp)); + + /* Fix up the node type if we are in locale. (We have pretended we are + * under /u for the purposes of regclass(), as this construct will only + * work under UTF-8 locales. But now we change the opcode to be ANYOFL (so + * as to cause any warnings about bad locales to be output in regexec.c), + * and add the flag that indicates to check if not in a UTF-8 locale. The + * reason we above forbid optimization into something other than an ANYOF + * node is simply to minimize the number of code changes in regexec.c. + * Otherwise we would have to create new EXACTish node types and deal with + * them. This decision could be revisited should this construct become + * popular. + * + * (One might think we could look at the resulting ANYOF node and suppress + * the flag if everything is above 255, as those would be UTF-8 only, + * but this isn't true, as the components that led to that result could + * have been locale-affected, and just happen to cancel each other out + * under UTF-8 locales.) */ + if (in_locale) { + set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET); + + assert(OP(node) == ANYOF); + + OP(node) = ANYOFL; + ANYOF_FLAGS(node) |= ANYOF_LOC_REQ_UTF8; + } + if (save_fold) { RExC_flags |= RXf_PMf_FOLD; } + RExC_parse = save_parse + 1; RExC_end = save_end; SvREFCNT_dec_NN(final); @@ -14132,6 +14188,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, about too large characters */ const bool strict, + const bool optimizable, /* ? Allow a non-ANYOF return + node */ SV** ret_invlist /* Return an inversion list, not a node */ ) { @@ -14262,7 +14320,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, ret = reganode(pRExC_state, (LOC) ? ANYOFL - : ANYOF, + : (DEPENDS_SEMANTICS) + ? ANYOFD + : ANYOF, 0); if (SIZE_ONLY) { @@ -15319,8 +15379,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * 2) if the character class contains only a single element (including a * single range), we see if there is an equivalent node for it. * Other checks are possible */ - if (! ret_invlist /* Can't optimize if returning the constructed - inversion list */ + if ( optimizable + && ! ret_invlist /* Can't optimize if returning the constructed + inversion list */ && (UNLIKELY(posixl_matches_all) || element_count == 1)) { U8 op = END; @@ -15681,7 +15742,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, if (DEPENDS_SEMANTICS) { /* Under /d, everything in the upper half of the Latin1 range * matches these complements */ - ANYOF_FLAGS(ret) |= ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII; + ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; } else if (AT_LEAST_ASCII_RESTRICTED) { /* Under /a and /aa, everything above ASCII matches these @@ -15768,7 +15829,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } if (warn_super) { - ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER; + ANYOF_FLAGS(ret) + |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; } } @@ -15854,7 +15916,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * be easy, but perhaps too slow, to check any candidates against all the * node types they could possibly match using _invlistEQ(). */ - if (cp_list + if ( optimizable + && cp_list && ! invert && ! depends_list && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS)) @@ -15863,7 +15926,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* We don't optimize if we are supposed to make sure all non-Unicode * code points raise a warning, as only ANYOF nodes have this check. * */ - && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER)) + && ! ((ANYOF_FLAGS(ret) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER) + && OP(ret) != ANYOFD + && ALWAYS_WARN_SUPER)) { UV start, end; U8 op = END; /* The optimzation node-type */ @@ -17012,8 +17077,14 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ SV* bitmap_invlist; /* Will hold what the bit map contains */ - if (OP(o) == ANYOFL) - sv_catpvs(sv, "{loc}"); + if (OP(o) == ANYOFL) { + if (flags & ANYOF_LOC_REQ_UTF8) { + sv_catpvs(sv, "{utf8-loc}"); + } + else { + sv_catpvs(sv, "{loc}"); + } + } if (flags & ANYOF_LOC_FOLD) sv_catpvs(sv, "{i}"); Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); @@ -17049,7 +17120,9 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ sv_catpvs(sv, "^"); } - if (flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) { + if (OP(o) == ANYOFD + && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)) + { sv_catpvs(sv, "{non-utf8-latin1-all}"); } diff --git a/regcomp.h b/regcomp.h index 897d35b..0f2617b 100644 --- a/regcomp.h +++ b/regcomp.h @@ -426,9 +426,9 @@ struct regnode_ssc { * at compile-time */ #define ANYOF_MATCHES_POSIXL 0x08 -/* Should we raise a warning if matching against an above-Unicode code point? - * */ -#define ANYOF_WARN_SUPER 0x10 +/* Only under /l. If set, none of INVERT, LOC_FOLD, POSIXL, + * HAS_NONBITMAP_NON_UTF8_MATCHES can be set */ +#define ANYOF_LOC_REQ_UTF8 0x10 /* Can match something outside the bitmap that isn't in utf8 */ #define ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES 0x20 @@ -436,9 +436,17 @@ struct regnode_ssc { /* Matches every code point NUM_ANYOF_CODE_POINTS and above*/ #define ANYOF_MATCHES_ALL_ABOVE_BITMAP 0x40 -/* Match all Latin1 characters that aren't ASCII when the target string is not - * in utf8. */ -#define ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII 0x80 + +/* Shared bit: + * Under /d it means the ANYOF node matches all non-ASCII Latin1 + * characters when the target string is not in utf8. + * When not under /d, it means the ANYOF node should raise a warning if + * matching against an above-Unicode code point. + * (These uses are mutually exclusive because the warning requires a \p{}, and + * \p{} implies /u which deselects /d). An SSC node only has this bit set if + * what is meant is the warning. The long macro name is to make sure that you + * are cautioned about its shared nature */ +#define ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER 0x80 #define ANYOF_FLAGS_ALL (0xff) @@ -447,7 +455,8 @@ struct regnode_ssc { /* These are the flags that apply to both regular ANYOF nodes and synthetic * start class nodes during construction of the SSC. During finalization of * the SSC, other of the flags could be added to it */ -#define ANYOF_COMMON_FLAGS (ANYOF_WARN_SUPER|ANYOF_HAS_UTF8_NONBITMAP_MATCHES) +#define ANYOF_COMMON_FLAGS ( ANYOF_HAS_UTF8_NONBITMAP_MATCHES \ + |ANYOF_LOC_REQ_UTF8) /* Character classes for node->classflags of ANYOF */ /* Should be synchronized with a table in regprop() */ diff --git a/regcomp.sym b/regcomp.sym index ffcb53b..201c65e 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -57,6 +57,7 @@ NBOUNDA NBOUND, no ; Match "" betweeen any \w\w or \W\W, where \w REG_ANY REG_ANY, no 0 S ; Match any one character (except newline). SANY REG_ANY, no 0 S ; Match any one character. ANYOF ANYOF, sv 1 S ; Match character in (or not in) this class, single char match only +ANYOFD ANYOF, sv 1 S ; Like ANYOF, but /d is in effect ANYOFL ANYOF, sv 1 S ; Like ANYOF, but /l is in effect #* POSIX Character Classes: @@ -90,7 +91,7 @@ BRANCH BRANCH, node 0 V ; Match this alternative, or the next... # NOTE: the relative ordering of these types is important do not change it EXACT EXACT, str ; Match this string (preceded by length). -EXACTL EXACT, str ; Like EXACT, but /l is in effect. +EXACTL EXACT, str ; Like EXACT, but /l is in effect (used so locale-related warnings can be checked for). EXACTF EXACT, str ; Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len). EXACTFL EXACT, str ; Match this string (not guaranteed to be folded) using /il rules (w/len). EXACTFU EXACT, str ; Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). diff --git a/regexec.c b/regexec.c index f2517e5..781bc6b 100644 --- a/regexec.c +++ b/regexec.c @@ -86,6 +86,9 @@ #include "invlist_inline.h" #include "unicode_constants.h" +static const char utf8_locale_required[] = + "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"; + #ifdef DEBUGGING /* At least one required character in the target string is expressible only in * UTF-8. */ @@ -1822,7 +1825,13 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, switch (OP(c)) { case ANYOFL: _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + + if ((FLAGS(c) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE) { + Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required); + } + /* FALLTHROUGH */ + case ANYOFD: case ANYOF: if (utf8_target) { REXEC_FBC_UTF8_CLASS_SCAN( @@ -5729,7 +5738,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case ANYOFL: /* /[abc]/l */ _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + + if ((FLAGS(scan) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE) + { + Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required); + } /* FALLTHROUGH */ + case ANYOFD: /* /[abc]/d */ case ANYOF: /* /[abc]/ */ if (NEXTCHR_IS_EOS) sayNO; @@ -8243,7 +8258,12 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } case ANYOFL: _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + + if ((FLAGS(p) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE) { + Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required); + } /* FALLTHROUGH */ + case ANYOFD: case ANYOF: if (utf8_target) { while (hardcount < max @@ -8586,7 +8606,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const * UTF8_ALLOW_FFFF */ if (c_len == (STRLEN)-1) Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); - if (c > 255 && OP(n) == ANYOFL && ! is_ANYOF_SYNTHETIC(n)) { + if (c > 255 && OP(n) == ANYOFL && ! (flags & ANYOF_LOC_REQ_UTF8)) { _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c); } } @@ -8595,7 +8615,9 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const if (c < NUM_ANYOF_CODE_POINTS) { if (ANYOF_BITMAP_TEST(n, c)) match = TRUE; - else if ((flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) + else if ((flags + & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER) + && OP(n) == ANYOFD && ! utf8_target && ! isASCII(c)) { @@ -8698,7 +8720,9 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const } if (UNICODE_IS_SUPER(c) - && (flags & ANYOF_WARN_SUPER) + && (flags + & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER) + && OP(n) != ANYOFD && ckWARN_d(WARN_NON_UNICODE)) { Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), diff --git a/regnodes.h b/regnodes.h index db32920..cc3da9d 100644 --- a/regnodes.h +++ b/regnodes.h @@ -6,8 +6,8 @@ /* Regops and State definitions */ -#define REGNODE_MAX 92 -#define REGMATCH_STATE_MAX 132 +#define REGNODE_MAX 93 +#define REGMATCH_STATE_MAX 133 #define END 0 /* 0000 End of program. */ #define SUCCEED 1 /* 0x01 Return from a subroutine, basically. */ @@ -30,80 +30,81 @@ #define REG_ANY 16 /* 0x10 Match any one character (except newline). */ #define SANY 17 /* 0x11 Match any one character. */ #define ANYOF 18 /* 0x12 Match character in (or not in) this class, single char match only */ -#define ANYOFL 19 /* 0x13 Like ANYOF, but /l is in effect */ -#define POSIXD 20 /* 0x14 Some [[:class:]] under /d; the FLAGS field gives which one */ -#define POSIXL 21 /* 0x15 Some [[:class:]] under /l; the FLAGS field gives which one */ -#define POSIXU 22 /* 0x16 Some [[:class:]] under /u; the FLAGS field gives which one */ -#define POSIXA 23 /* 0x17 Some [[:class:]] under /a; the FLAGS field gives which one */ -#define NPOSIXD 24 /* 0x18 complement of POSIXD, [[:^class:]] */ -#define NPOSIXL 25 /* 0x19 complement of POSIXL, [[:^class:]] */ -#define NPOSIXU 26 /* 0x1a complement of POSIXU, [[:^class:]] */ -#define NPOSIXA 27 /* 0x1b complement of POSIXA, [[:^class:]] */ -#define CLUMP 28 /* 0x1c Match any extended grapheme cluster sequence */ -#define BRANCH 29 /* 0x1d Match this alternative, or the next... */ -#define EXACT 30 /* 0x1e Match this string (preceded by length). */ -#define EXACTL 31 /* 0x1f Like EXACT, but /l is in effect. */ -#define EXACTF 32 /* 0x20 Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len). */ -#define EXACTFL 33 /* 0x21 Match this string (not guaranteed to be folded) using /il rules (w/len). */ -#define EXACTFU 34 /* 0x22 Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). */ -#define EXACTFA 35 /* 0x23 Match this string (not guaranteed to be folded) using /iaa rules (w/len). */ -#define EXACTFU_SS 36 /* 0x24 Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). */ -#define EXACTFLU8 37 /* 0x25 Rare cirucmstances: like EXACTFU, but is under /l, UTF-8, folded, and everything in it is above 255. */ -#define EXACTFA_NO_TRIE 38 /* 0x26 Match this string (which is not trie-able; not guaranteed to be folded) using /iaa rules (w/len). */ -#define NOTHING 39 /* 0x27 Match empty string. */ -#define TAIL 40 /* 0x28 Match empty string. Can jump here from outside. */ -#define STAR 41 /* 0x29 Match this (simple) thing 0 or more times. */ -#define PLUS 42 /* 0x2a Match this (simple) thing 1 or more times. */ -#define CURLY 43 /* 0x2b Match this simple thing {n,m} times. */ -#define CURLYN 44 /* 0x2c Capture next-after-this simple thing */ -#define CURLYM 45 /* 0x2d Capture this medium-complex thing {n,m} times. */ -#define CURLYX 46 /* 0x2e Match this complex thing {n,m} times. */ -#define WHILEM 47 /* 0x2f Do curly processing and see if rest matches. */ -#define OPEN 48 /* 0x30 Mark this point in input as start of #n. */ -#define CLOSE 49 /* 0x31 Analogous to OPEN. */ -#define REF 50 /* 0x32 Match some already matched string */ -#define REFF 51 /* 0x33 Match already matched string, folded using native charset rules for non-utf8 */ -#define REFFL 52 /* 0x34 Match already matched string, folded in loc. */ -#define REFFU 53 /* 0x35 Match already matched string, folded using unicode rules for non-utf8 */ -#define REFFA 54 /* 0x36 Match already matched string, folded using unicode rules for non-utf8, no mixing ASCII, non-ASCII */ -#define NREF 55 /* 0x37 Match some already matched string */ -#define NREFF 56 /* 0x38 Match already matched string, folded using native charset rules for non-utf8 */ -#define NREFFL 57 /* 0x39 Match already matched string, folded in loc. */ -#define NREFFU 58 /* 0x3a Match already matched string, folded using unicode rules for non-utf8 */ -#define NREFFA 59 /* 0x3b Match already matched string, folded using unicode rules for non-utf8, no mixing ASCII, non-ASCII */ -#define LONGJMP 60 /* 0x3c Jump far away. */ -#define BRANCHJ 61 /* 0x3d BRANCH with long offset. */ -#define IFMATCH 62 /* 0x3e Succeeds if the following matches. */ -#define UNLESSM 63 /* 0x3f Fails if the following matches. */ -#define SUSPEND 64 /* 0x40 "Independent" sub-RE. */ -#define IFTHEN 65 /* 0x41 Switch, should be preceded by switcher. */ -#define GROUPP 66 /* 0x42 Whether the group matched. */ -#define EVAL 67 /* 0x43 Execute some Perl code. */ -#define MINMOD 68 /* 0x44 Next operator is not greedy. */ -#define LOGICAL 69 /* 0x45 Next opcode should set the flag only. */ -#define RENUM 70 /* 0x46 Group with independently numbered parens. */ -#define TRIE 71 /* 0x47 Match many EXACT(F[ALU]?)? at once. flags==type */ -#define TRIEC 72 /* 0x48 Same as TRIE, but with embedded charclass data */ -#define AHOCORASICK 73 /* 0x49 Aho Corasick stclass. flags==type */ -#define AHOCORASICKC 74 /* 0x4a Same as AHOCORASICK, but with embedded charclass data */ -#define GOSUB 75 /* 0x4b recurse to paren arg1 at (signed) ofs arg2 */ -#define GOSTART 76 /* 0x4c recurse to start of pattern */ -#define NGROUPP 77 /* 0x4d Whether the group matched. */ -#define INSUBP 78 /* 0x4e Whether we are in a specific recurse. */ -#define DEFINEP 79 /* 0x4f Never execute directly. */ -#define ENDLIKE 80 /* 0x50 Used only for the type field of verbs */ -#define OPFAIL 81 /* 0x51 Same as (?!) */ -#define ACCEPT 82 /* 0x52 Accepts the current matched string. */ -#define VERB 83 /* 0x53 Used only for the type field of verbs */ -#define PRUNE 84 /* 0x54 Pattern fails at this startpoint if no-backtracking through this */ -#define MARKPOINT 85 /* 0x55 Push the current location for rollback by cut. */ -#define SKIP 86 /* 0x56 On failure skip forward (to the mark) before retrying */ -#define COMMIT 87 /* 0x57 Pattern fails outright if backtracking through this */ -#define CUTGROUP 88 /* 0x58 On failure go to the next alternation in the group */ -#define KEEPS 89 /* 0x59 $& begins here. */ -#define LNBREAK 90 /* 0x5a generic newline pattern */ -#define OPTIMIZED 91 /* 0x5b Placeholder for dump. */ -#define PSEUDO 92 /* 0x5c Pseudo opcode for internal use. */ +#define ANYOFD 19 /* 0x13 Like ANYOF, but /d is in effect */ +#define ANYOFL 20 /* 0x14 Like ANYOF, but /l is in effect */ +#define POSIXD 21 /* 0x15 Some [[:class:]] under /d; the FLAGS field gives which one */ +#define POSIXL 22 /* 0x16 Some [[:class:]] under /l; the FLAGS field gives which one */ +#define POSIXU 23 /* 0x17 Some [[:class:]] under /u; the FLAGS field gives which one */ +#define POSIXA 24 /* 0x18 Some [[:class:]] under /a; the FLAGS field gives which one */ +#define NPOSIXD 25 /* 0x19 complement of POSIXD, [[:^class:]] */ +#define NPOSIXL 26 /* 0x1a complement of POSIXL, [[:^class:]] */ +#define NPOSIXU 27 /* 0x1b complement of POSIXU, [[:^class:]] */ +#define NPOSIXA 28 /* 0x1c complement of POSIXA, [[:^class:]] */ +#define CLUMP 29 /* 0x1d Match any extended grapheme cluster sequence */ +#define BRANCH 30 /* 0x1e Match this alternative, or the next... */ +#define EXACT 31 /* 0x1f Match this string (preceded by length). */ +#define EXACTL 32 /* 0x20 Like EXACT, but /l is in effect (used so locale-related warnings can be checked for). */ +#define EXACTF 33 /* 0x21 Match this non-UTF-8 string (not guaranteed to be folded) using /id rules (w/len). */ +#define EXACTFL 34 /* 0x22 Match this string (not guaranteed to be folded) using /il rules (w/len). */ +#define EXACTFU 35 /* 0x23 Match this string (folded iff in UTF-8, length in folding doesn't change if not in UTF-8) using /iu rules (w/len). */ +#define EXACTFA 36 /* 0x24 Match this string (not guaranteed to be folded) using /iaa rules (w/len). */ +#define EXACTFU_SS 37 /* 0x25 Match this string (folded iff in UTF-8, length in folding may change even if not in UTF-8) using /iu rules (w/len). */ +#define EXACTFLU8 38 /* 0x26 Rare cirucmstances: like EXACTFU, but is under /l, UTF-8, folded, and everything in it is above 255. */ +#define EXACTFA_NO_TRIE 39 /* 0x27 Match this string (which is not trie-able; not guaranteed to be folded) using /iaa rules (w/len). */ +#define NOTHING 40 /* 0x28 Match empty string. */ +#define TAIL 41 /* 0x29 Match empty string. Can jump here from outside. */ +#define STAR 42 /* 0x2a Match this (simple) thing 0 or more times. */ +#define PLUS 43 /* 0x2b Match this (simple) thing 1 or more times. */ +#define CURLY 44 /* 0x2c Match this simple thing {n,m} times. */ +#define CURLYN 45 /* 0x2d Capture next-after-this simple thing */ +#define CURLYM 46 /* 0x2e Capture this medium-complex thing {n,m} times. */ +#define CURLYX 47 /* 0x2f Match this complex thing {n,m} times. */ +#define WHILEM 48 /* 0x30 Do curly processing and see if rest matches. */ +#define OPEN 49 /* 0x31 Mark this point in input as start of #n. */ +#define CLOSE 50 /* 0x32 Analogous to OPEN. */ +#define REF 51 /* 0x33 Match some already matched string */ +#define REFF 52 /* 0x34 Match already matched string, folded using native charset rules for non-utf8 */ +#define REFFL 53 /* 0x35 Match already matched string, folded in loc. */ +#define REFFU 54 /* 0x36 Match already matched string, folded using unicode rules for non-utf8 */ +#define REFFA 55 /* 0x37 Match already matched string, folded using unicode rules for non-utf8, no mixing ASCII, non-ASCII */ +#define NREF 56 /* 0x38 Match some already matched string */ +#define NREFF 57 /* 0x39 Match already matched string, folded using native charset rules for non-utf8 */ +#define NREFFL 58 /* 0x3a Match already matched string, folded in loc. */ +#define NREFFU 59 /* 0x3b Match already matched string, folded using unicode rules for non-utf8 */ +#define NREFFA 60 /* 0x3c Match already matched string, folded using unicode rules for non-utf8, no mixing ASCII, non-ASCII */ +#define LONGJMP 61 /* 0x3d Jump far away. */ +#define BRANCHJ 62 /* 0x3e BRANCH with long offset. */ +#define IFMATCH 63 /* 0x3f Succeeds if the following matches. */ +#define UNLESSM 64 /* 0x40 Fails if the following matches. */ +#define SUSPEND 65 /* 0x41 "Independent" sub-RE. */ +#define IFTHEN 66 /* 0x42 Switch, should be preceded by switcher. */ +#define GROUPP 67 /* 0x43 Whether the group matched. */ +#define EVAL 68 /* 0x44 Execute some Perl code. */ +#define MINMOD 69 /* 0x45 Next operator is not greedy. */ +#define LOGICAL 70 /* 0x46 Next opcode should set the flag only. */ +#define RENUM 71 /* 0x47 Group with independently numbered parens. */ +#define TRIE 72 /* 0x48 Match many EXACT(F[ALU]?)? at once. flags==type */ +#define TRIEC 73 /* 0x49 Same as TRIE, but with embedded charclass data */ +#define AHOCORASICK 74 /* 0x4a Aho Corasick stclass. flags==type */ +#define AHOCORASICKC 75 /* 0x4b Same as AHOCORASICK, but with embedded charclass data */ +#define GOSUB 76 /* 0x4c recurse to paren arg1 at (signed) ofs arg2 */ +#define GOSTART 77 /* 0x4d recurse to start of pattern */ +#define NGROUPP 78 /* 0x4e Whether the group matched. */ +#define INSUBP 79 /* 0x4f Whether we are in a specific recurse. */ +#define DEFINEP 80 /* 0x50 Never execute directly. */ +#define ENDLIKE 81 /* 0x51 Used only for the type field of verbs */ +#define OPFAIL 82 /* 0x52 Same as (?!) */ +#define ACCEPT 83 /* 0x53 Accepts the current matched string. */ +#define VERB 84 /* 0x54 Used only for the type field of verbs */ +#define PRUNE 85 /* 0x55 Pattern fails at this startpoint if no-backtracking through this */ +#define MARKPOINT 86 /* 0x56 Push the current location for rollback by cut. */ +#define SKIP 87 /* 0x57 On failure skip forward (to the mark) before retrying */ +#define COMMIT 88 /* 0x58 Pattern fails outright if backtracking through this */ +#define CUTGROUP 89 /* 0x59 On failure go to the next alternation in the group */ +#define KEEPS 90 /* 0x5a $& begins here. */ +#define LNBREAK 91 /* 0x5b generic newline pattern */ +#define OPTIMIZED 92 /* 0x5c Placeholder for dump. */ +#define PSEUDO 93 /* 0x5d Pseudo opcode for internal use. */ /* ------------ States ------------- */ #define TRIE_next (REGNODE_MAX + 1) /* state for TRIE */ #define TRIE_next_fail (REGNODE_MAX + 2) /* state for TRIE */ @@ -171,6 +172,7 @@ EXTCONST U8 PL_regkind[] = { REG_ANY, /* REG_ANY */ REG_ANY, /* SANY */ ANYOF, /* ANYOF */ + ANYOF, /* ANYOFD */ ANYOF, /* ANYOFL */ POSIXD, /* POSIXD */ POSIXD, /* POSIXL */ @@ -312,6 +314,7 @@ static const U8 regarglen[] = { 0, /* REG_ANY */ 0, /* SANY */ EXTRA_SIZE(struct regnode_1), /* ANYOF */ + EXTRA_SIZE(struct regnode_1), /* ANYOFD */ EXTRA_SIZE(struct regnode_1), /* ANYOFL */ 0, /* POSIXD */ 0, /* POSIXL */ @@ -410,6 +413,7 @@ static const char reg_off_by_arg[] = { 0, /* REG_ANY */ 0, /* SANY */ 0, /* ANYOF */ + 0, /* ANYOFD */ 0, /* ANYOFL */ 0, /* POSIXD */ 0, /* POSIXL */ @@ -513,80 +517,81 @@ EXTCONST char * const PL_reg_name[] = { "REG_ANY", /* 0x10 */ "SANY", /* 0x11 */ "ANYOF", /* 0x12 */ - "ANYOFL", /* 0x13 */ - "POSIXD", /* 0x14 */ - "POSIXL", /* 0x15 */ - "POSIXU", /* 0x16 */ - "POSIXA", /* 0x17 */ - "NPOSIXD", /* 0x18 */ - "NPOSIXL", /* 0x19 */ - "NPOSIXU", /* 0x1a */ - "NPOSIXA", /* 0x1b */ - "CLUMP", /* 0x1c */ - "BRANCH", /* 0x1d */ - "EXACT", /* 0x1e */ - "EXACTL", /* 0x1f */ - "EXACTF", /* 0x20 */ - "EXACTFL", /* 0x21 */ - "EXACTFU", /* 0x22 */ - "EXACTFA", /* 0x23 */ - "EXACTFU_SS", /* 0x24 */ - "EXACTFLU8", /* 0x25 */ - "EXACTFA_NO_TRIE", /* 0x26 */ - "NOTHING", /* 0x27 */ - "TAIL", /* 0x28 */ - "STAR", /* 0x29 */ - "PLUS", /* 0x2a */ - "CURLY", /* 0x2b */ - "CURLYN", /* 0x2c */ - "CURLYM", /* 0x2d */ - "CURLYX", /* 0x2e */ - "WHILEM", /* 0x2f */ - "OPEN", /* 0x30 */ - "CLOSE", /* 0x31 */ - "REF", /* 0x32 */ - "REFF", /* 0x33 */ - "REFFL", /* 0x34 */ - "REFFU", /* 0x35 */ - "REFFA", /* 0x36 */ - "NREF", /* 0x37 */ - "NREFF", /* 0x38 */ - "NREFFL", /* 0x39 */ - "NREFFU", /* 0x3a */ - "NREFFA", /* 0x3b */ - "LONGJMP", /* 0x3c */ - "BRANCHJ", /* 0x3d */ - "IFMATCH", /* 0x3e */ - "UNLESSM", /* 0x3f */ - "SUSPEND", /* 0x40 */ - "IFTHEN", /* 0x41 */ - "GROUPP", /* 0x42 */ - "EVAL", /* 0x43 */ - "MINMOD", /* 0x44 */ - "LOGICAL", /* 0x45 */ - "RENUM", /* 0x46 */ - "TRIE", /* 0x47 */ - "TRIEC", /* 0x48 */ - "AHOCORASICK", /* 0x49 */ - "AHOCORASICKC", /* 0x4a */ - "GOSUB", /* 0x4b */ - "GOSTART", /* 0x4c */ - "NGROUPP", /* 0x4d */ - "INSUBP", /* 0x4e */ - "DEFINEP", /* 0x4f */ - "ENDLIKE", /* 0x50 */ - "OPFAIL", /* 0x51 */ - "ACCEPT", /* 0x52 */ - "VERB", /* 0x53 */ - "PRUNE", /* 0x54 */ - "MARKPOINT", /* 0x55 */ - "SKIP", /* 0x56 */ - "COMMIT", /* 0x57 */ - "CUTGROUP", /* 0x58 */ - "KEEPS", /* 0x59 */ - "LNBREAK", /* 0x5a */ - "OPTIMIZED", /* 0x5b */ - "PSEUDO", /* 0x5c */ + "ANYOFD", /* 0x13 */ + "ANYOFL", /* 0x14 */ + "POSIXD", /* 0x15 */ + "POSIXL", /* 0x16 */ + "POSIXU", /* 0x17 */ + "POSIXA", /* 0x18 */ + "NPOSIXD", /* 0x19 */ + "NPOSIXL", /* 0x1a */ + "NPOSIXU", /* 0x1b */ + "NPOSIXA", /* 0x1c */ + "CLUMP", /* 0x1d */ + "BRANCH", /* 0x1e */ + "EXACT", /* 0x1f */ + "EXACTL", /* 0x20 */ + "EXACTF", /* 0x21 */ + "EXACTFL", /* 0x22 */ + "EXACTFU", /* 0x23 */ + "EXACTFA", /* 0x24 */ + "EXACTFU_SS", /* 0x25 */ + "EXACTFLU8", /* 0x26 */ + "EXACTFA_NO_TRIE", /* 0x27 */ + "NOTHING", /* 0x28 */ + "TAIL", /* 0x29 */ + "STAR", /* 0x2a */ + "PLUS", /* 0x2b */ + "CURLY", /* 0x2c */ + "CURLYN", /* 0x2d */ + "CURLYM", /* 0x2e */ + "CURLYX", /* 0x2f */ + "WHILEM", /* 0x30 */ + "OPEN", /* 0x31 */ + "CLOSE", /* 0x32 */ + "REF", /* 0x33 */ + "REFF", /* 0x34 */ + "REFFL", /* 0x35 */ + "REFFU", /* 0x36 */ + "REFFA", /* 0x37 */ + "NREF", /* 0x38 */ + "NREFF", /* 0x39 */ + "NREFFL", /* 0x3a */ + "NREFFU", /* 0x3b */ + "NREFFA", /* 0x3c */ + "LONGJMP", /* 0x3d */ + "BRANCHJ", /* 0x3e */ + "IFMATCH", /* 0x3f */ + "UNLESSM", /* 0x40 */ + "SUSPEND", /* 0x41 */ + "IFTHEN", /* 0x42 */ + "GROUPP", /* 0x43 */ + "EVAL", /* 0x44 */ + "MINMOD", /* 0x45 */ + "LOGICAL", /* 0x46 */ + "RENUM", /* 0x47 */ + "TRIE", /* 0x48 */ + "TRIEC", /* 0x49 */ + "AHOCORASICK", /* 0x4a */ + "AHOCORASICKC", /* 0x4b */ + "GOSUB", /* 0x4c */ + "GOSTART", /* 0x4d */ + "NGROUPP", /* 0x4e */ + "INSUBP", /* 0x4f */ + "DEFINEP", /* 0x50 */ + "ENDLIKE", /* 0x51 */ + "OPFAIL", /* 0x52 */ + "ACCEPT", /* 0x53 */ + "VERB", /* 0x54 */ + "PRUNE", /* 0x55 */ + "MARKPOINT", /* 0x56 */ + "SKIP", /* 0x57 */ + "COMMIT", /* 0x58 */ + "CUTGROUP", /* 0x59 */ + "KEEPS", /* 0x5a */ + "LNBREAK", /* 0x5b */ + "OPTIMIZED", /* 0x5c */ + "PSEUDO", /* 0x5d */ /* ------------ States ------------- */ "TRIE_next", /* REGNODE_MAX +0x01 */ "TRIE_next_fail", /* REGNODE_MAX +0x02 */ @@ -720,7 +725,7 @@ EXTCONST U8 PL_varies[] __attribute__deprecated__ = { EXTCONST U8 PL_varies_bitmask[]; #else EXTCONST U8 PL_varies_bitmask[] = { - 0x00, 0x00, 0x00, 0x30, 0x00, 0xFE, 0xFC, 0x2F, 0x03, 0x00, 0x00, 0x00 + 0x00, 0x00, 0x00, 0x60, 0x00, 0xFC, 0xF9, 0x5F, 0x06, 0x00, 0x00, 0x00 }; #endif /* DOINIT */ @@ -732,8 +737,8 @@ EXTCONST U8 PL_varies_bitmask[] = { EXTCONST U8 PL_simple[] __attribute__deprecated__; #else EXTCONST U8 PL_simple[] __attribute__deprecated__ = { - REG_ANY, SANY, ANYOF, ANYOFL, POSIXD, POSIXL, POSIXU, POSIXA, NPOSIXD, - NPOSIXL, NPOSIXU, NPOSIXA, + REG_ANY, SANY, ANYOF, ANYOFD, ANYOFL, POSIXD, POSIXL, POSIXU, POSIXA, + NPOSIXD, NPOSIXL, NPOSIXU, NPOSIXA, 0 }; #endif /* DOINIT */ @@ -742,7 +747,7 @@ EXTCONST U8 PL_simple[] __attribute__deprecated__ = { EXTCONST U8 PL_simple_bitmask[]; #else EXTCONST U8 PL_simple_bitmask[] = { - 0x00, 0x00, 0xFF, 0x0F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + 0x00, 0x00, 0xFF, 0x1F, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; #endif /* DOINIT */ diff --git a/t/lib/warnings/regexec b/t/lib/warnings/regexec index 750880e..1f3b65b 100644 --- a/t/lib/warnings/regexec +++ b/t/lib/warnings/regexec @@ -212,3 +212,50 @@ Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 16. Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 17. Use of \b{} or \B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 17. +######## +# NAME (?[ ]) in non-UTF-8 locale +eval { require POSIX; POSIX->import("locale_h") }; +if ($@) { + print("SKIPPED\n# no POSIX\n"),exit; +} +no warnings 'experimental::regex_sets'; +use warnings 'locale'; +use locale; +setlocale(&POSIX::LC_CTYPE, "C"); +"\N{KELVIN SIGN}" =~ /(?[ \N{KELVIN SIGN} ])/i; +"K" =~ /(?[ \N{KELVIN SIGN} ])/i; +"k" =~ /(?[ \N{KELVIN SIGN} ])/i; +":" =~ /(?[ \: ])/; +no warnings 'locale'; +EXPECT +Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 9. +Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 9. +Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 10. +Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 10. +Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 11. +Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 11. +Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 12. +Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale at - line 12. +######## +# NAME (?[ ]) in UTF-8 locale +require '../loc_tools.pl'; +unless (locales_enabled()) { + print("SKIPPED\n# locales not available\n"),exit; +} +eval { require POSIX; POSIX->import("locale_h") }; +if ($@) { + print("SKIPPED\n# no POSIX\n"),exit; +} +my $utf8_locale = find_utf8_ctype_locale(); +unless ($utf8_locale) { + print("SKIPPED\n# No UTF-8 locale available\n"),exit; +} +no warnings 'experimental::regex_sets'; +use warnings 'locale'; +use locale; +setlocale(&POSIX::LC_CTYPE, $utf8_locale); +"\N{KELVIN SIGN}" =~ /(?[ \N{KELVIN SIGN} ])/i; +"K" =~ /(?[ \N{KELVIN SIGN} ])/i; +"k" =~ /(?[ \N{KELVIN SIGN} ])/i; +":" =~ /(?[ \: ])/; +EXPECT diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t index a058824..d9d9d74 100644 --- a/t/re/reg_mesg.t +++ b/t/re/reg_mesg.t @@ -205,7 +205,6 @@ my @death = '/(?[[[:w:]]])/' => "POSIX class [:w:] unknown {#} m/(?[[[:w:]{#}]])/", '/(?[[:w:]])/' => "POSIX class [:w:] unknown {#} m/(?[[:w:]{#}])/", '/(?[a])/' => 'Unexpected character {#} m/(?[a{#}])/', - '/(?[\t])/l' => '(?[...]) not valid in locale {#} m/(?[{#}\t])/', '/(?[ + \t ])/' => 'Unexpected binary operator \'+\' with no preceding operand {#} m/(?[ +{#} \t ])/', '/(?[ \cK - ( + \t ) ])/' => 'Unexpected binary operator \'+\' with no preceding operand {#} m/(?[ \cK - ( +{#} \t ) ])/', '/(?[ \cK ( \t ) ])/' => 'Unexpected \'(\' with no preceding operator {#} m/(?[ \cK ({#} \t ) ])/', @@ -410,7 +409,6 @@ my @death_utf8 = mark_as_utf8( '/ã(?[[[:ã:]]])ã/' => "POSIX class [:ã:] unknown {#} m/ã(?[[[:ã:]{#}]])ã/", '/ã(?[[:ã:]])ã/' => "POSIX class [:ã:] unknown {#} m/ã(?[[:ã:]{#}])ã/", '/ã(?[ã])ã/' => 'Unexpected character {#} m/ã(?[ã{#}])ã/', - '/ã(?[ã])/l' => '(?[...]) not valid in locale {#} m/ã(?[{#}ã])/', '/ã(?[ + [ã] ])/' => 'Unexpected binary operator \'+\' with no preceding operand {#} m/ã(?[ +{#} [ã] ])/', '/ã(?[ \cK - ( + [ã] ) ])/' => 'Unexpected binary operator \'+\' with no preceding operand {#} m/ã(?[ \cK - ( +{#} [ã] ) ])/', '/ã(?[ \cK ( [ã] ) ])/' => 'Unexpected \'(\' with no preceding operator {#} m/ã(?[ \cK ({#} [ã] ) ])/', diff --git a/t/re/regex_sets.t b/t/re/regex_sets.t index 48a4f00..c85fde6 100644 --- a/t/re/regex_sets.t +++ b/t/re/regex_sets.t @@ -9,7 +9,8 @@ BEGIN { chdir 't' if -d 't'; @INC = ('../lib','.','../ext/re'); require './test.pl'; - require './test.pl'; require './charset_tools.pl'; + require './charset_tools.pl'; + require './loc_tools.pl'; skip_all_without_unicode_tables(); } @@ -96,6 +97,44 @@ like("k", $still_fold, "/i on interpolated (?[ ]) is retained in outer without / eval 'my $x = qr/(?[ [a] ])/; qr/(?[ $x ])/'; is($@, "", 'qr/(?[ [a] ])/ can be interpolated'); +if (! is_miniperl() && locales_enabled('LC_CTYPE')) { + my $utf8_locale = find_utf8_ctype_locale; + SKIP: { + skip("No utf8 locale available on this platform", 8) unless $utf8_locale; + + setlocale(&POSIX::LC_ALL, "C"); + use locale; + + $kelvin_fold = qr/(?[ \N{KELVIN SIGN} ])/i; + my $single_char_class = qr/(?[ \: ])/; + + setlocale(&POSIX::LC_ALL, $utf8_locale); + + like("\N{KELVIN SIGN}", $kelvin_fold, + '(?[ \N{KELVIN SIGN} ]) matches itself under /i in UTF8-locale'); + like("K", $kelvin_fold, + '(?[ \N{KELVIN SIGN} ]) matches "K" under /i in UTF8-locale'); + like("k", $kelvin_fold, + '(?[ \N{KELVIN SIGN} ]) matches "k" under /i in UTF8-locale'); + like(":", $single_char_class, + '(?[ : ]) matches itself in UTF8-locale (a single character class)'); + + setlocale(&POSIX::LC_ALL, "C"); + + # These should generate warnings (the above 4 shouldn't), but like() + # suppresses them, so the warnings tests are in t/lib/warnings/regexec + like("\N{KELVIN SIGN}", $kelvin_fold, + '(?[ \N{KELVIN SIGN} ]) matches itself under /i in C locale'); + like("K", $kelvin_fold, + '(?[ \N{KELVIN SIGN} ]) matches "K" under /i in C locale'); + like("k", $kelvin_fold, + '(?[ \N{KELVIN SIGN} ]) matches "k" under /i in C locale'); + like(":", $single_char_class, + '(?[ : ]) matches itself in C locale (a single character class)'); + } +} + + done_testing(); 1; -- Perl5 Master Repository
