In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/3a3d108484629fe5b421976b8d6fd6f280a1f97a?hp=52ffb1b5f4ae92a18ec49ded8308d386abc08f03>
- Log ----------------------------------------------------------------- commit 3a3d108484629fe5b421976b8d6fd6f280a1f97a Author: Karl Williamson <[email protected]> Date: Mon Feb 15 11:02:07 2016 -0700 Cast correctly to U8, not char U8 is what the function being called is expecting M regcomp.c commit 768f8d987cfefb34c0b9c1a2e5c65b4448b4b937 Author: Karl Williamson <[email protected]> Date: Mon Feb 15 20:59:10 2016 -0700 perlapi: Hide the swash functions These should be internal only, and we may want to get rid of them someday. Hide their existence so that people who don't already know about them won't be tempted to try to use them. M embed.fnc commit b271ede139c499b0a38fbeff3a5a032adcb54955 Author: Karl Williamson <[email protected]> Date: Mon Feb 15 20:32:32 2016 -0700 regcomp.h: Not all ANYOF flags are in use. So, it's better to not have a mask to include the unused ones. M regcomp.h commit 346aefe94ef87336bae3ae96d8047475e3361709 Author: Karl Williamson <[email protected]> Date: Sat Feb 13 17:21:28 2016 -0700 regcomp.c: Simplify a few lines of code This code had been written before the isMNEMONIC_CNTRL() macro was created. Using the macro simplifies things a little. M regcomp.c commit 7240094943c6fdbc40220518824ce9120ca0fab5 Author: Karl Williamson <[email protected]> Date: Sat Feb 13 15:51:50 2016 -0700 regcomp.c: Clean up logic in function This function uses some crude heuristics to decide whether to make a synthetic start class or not. This commit removes some redundancies. M regcomp.c commit cab181dd2f1897a30514c8f9efb56be269495619 Author: Karl Williamson <[email protected]> Date: Thu Feb 11 10:25:04 2016 -0700 regcomp.c: -Dr \xZZ instead of \x{ZZ} The brackets are unnecessary and clutter the output. M regcomp.c commit c0e8a4e1861b0a6637457cb7da1ff9b89f90e22f Author: Karl Williamson <[email protected]> Date: Thu Feb 11 10:12:57 2016 -0700 regcomp.c: Fix -Dr bug It was using a wrong length calculation, which under some circumstances caused the output to include extra bytes. Also I added comments, and changed a variable name, so I don't have to figure this out again from scratch. M regcomp.c commit c3caadb53923cd6d6dc69e426f00325a943ea81f Author: Karl Williamson <[email protected]> Date: Mon Feb 15 11:04:36 2016 -0700 regcomp.c: Use macro to hide complexity There is an existing macro that does these three lines in one source line. M regcomp.c commit fe0a36465f2c9353aeec20e4a5d22f6681bb28bb Author: Karl Williamson <[email protected]> Date: Sat Feb 13 13:49:00 2016 -0700 Don't allow /\N{}/ under 're strict' This is the one remaining empty {} that was accepted under the experimental 'use re "strict"'. M embed.fnc M embed.h M pod/perldelta.pod M pod/perldiag.pod M proto.h M regcomp.c M t/re/reg_mesg.t commit a60b792277ab2836704066899135e8d4c83b0f0c Author: Karl Williamson <[email protected]> Date: Sat Feb 13 15:20:49 2016 -0700 perlrecharclass: Add some missing info M pod/perlrecharclass.pod ----------------------------------------------------------------------- Summary of changes: embed.fnc | 7 +++-- embed.h | 2 +- pod/perldelta.pod | 6 ++++ pod/perldiag.pod | 7 +++-- pod/perlrecharclass.pod | 3 ++ proto.h | 2 +- regcomp.c | 82 ++++++++++++++++++++++++------------------------- regcomp.h | 2 +- t/re/reg_mesg.t | 4 ++- 9 files changed, 64 insertions(+), 51 deletions(-) diff --git a/embed.fnc b/embed.fnc index 23e1e52..f5ace28 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1164,7 +1164,7 @@ Amb |OP* |ref |NULLOK OP* o|I32 type s |OP* |refkids |NULLOK OP* o|I32 type #endif Ap |void |regdump |NN const regexp* r -Ap |SV* |regclass_swash |NULLOK const regexp *prog \ +ApM |SV* |regclass_swash |NULLOK const regexp *prog \ |NN const struct regnode *node|bool doinit \ |NULLOK SV **listsvp|NULLOK SV **altsvp #if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_PERL_C) || defined(PERL_IN_UTF8_C) @@ -1517,8 +1517,8 @@ Apd |void |sv_vsetpvfn |NN SV *const sv|NN const char *const pat|const STRLEN pa |NULLOK va_list *const args|NULLOK SV **const svargs \ |const I32 svmax|NULLOK bool *const maybe_tainted ApR |NV |str_to_version |NN SV *sv -ApR |SV* |swash_init |NN const char* pkg|NN const char* name|NN SV* listsv|I32 minbits|I32 none -Ap |UV |swash_fetch |NN SV *swash|NN const U8 *ptr|bool do_utf8 +ApRM |SV* |swash_init |NN const char* pkg|NN const char* name|NN SV* listsv|I32 minbits|I32 none +ApM |UV |swash_fetch |NN SV *swash|NN const U8 *ptr|bool do_utf8 #ifdef PERL_IN_REGCOMP_C EiMR |SV* |add_cp_to_invlist |NULLOK SV* invlist|const UV cp EsM |void |_append_range_to_invlist |NN SV* const invlist|const UV start|const UV end @@ -2174,6 +2174,7 @@ Es |bool |grok_bslash_N |NN RExC_state_t *pRExC_state \ |NULLOK UV *code_point_p \ |NULLOK int* cp_count \ |NN I32 *flagp \ + |const bool strict \ |const U32 depth Es |void |reginsert |NN RExC_state_t *pRExC_state \ |U8 op|NN regnode *opnd|U32 depth diff --git a/embed.h b/embed.h index a1368ea..a12a3e6 100644 --- a/embed.h +++ b/embed.h @@ -999,7 +999,7 @@ #define edit_distance S_edit_distance #define get_ANYOF_cp_list_for_ssc(a,b) S_get_ANYOF_cp_list_for_ssc(aTHX_ a,b) #define get_invlist_iter_addr S_get_invlist_iter_addr -#define grok_bslash_N(a,b,c,d,e,f) S_grok_bslash_N(aTHX_ a,b,c,d,e,f) +#define grok_bslash_N(a,b,c,d,e,f,g) S_grok_bslash_N(aTHX_ a,b,c,d,e,f,g) #define handle_named_backref(a,b,c,d) S_handle_named_backref(aTHX_ a,b,c,d) #define handle_possible_posix(a,b,c,d) S_handle_possible_posix(aTHX_ a,b,c,d) #define handle_regex_sets(a,b,c,d,e) S_handle_regex_sets(aTHX_ a,b,c,d,e) diff --git a/pod/perldelta.pod b/pod/perldelta.pod index 5607b2e..55db093 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -52,6 +52,12 @@ XXX For a release on a stable branch, this section aspires to be: [ List each incompatible change as a =head2 entry ] +=head2 C<qr/\N{}/> now disallowed under C<use re "strict"> + +An empty C<\N{}> makes no sense, but for backwards compatibility is +silently accepted as doing nothing. But now this is a fatal error under +the experimental feature L<re/'strict' mode>. + =head1 Deprecations XXX Any deprecated features, syntax, modules etc. should be listed here. diff --git a/pod/perldiag.pod b/pod/perldiag.pod index cc27016..1e4760d 100644 --- a/pod/perldiag.pod +++ b/pod/perldiag.pod @@ -7203,9 +7203,10 @@ Something Very Wrong. (F) Named Unicode character escapes (C<\N{...}>) may return a zero-length sequence. Such an escape was used in an extended character class, i.e. -C<(?[...])>, which is not permitted. Check that the correct escape has -been used, and the correct charnames handler is in scope. The S<<-- HERE> -shows whereabouts in the regular expression the problem was discovered. +C<(?[...])>, or under C<use re 'strict'>, which is not permitted. Check +that the correct escape has been used, and the correct charnames handler +is in scope. The S<<-- HERE> shows whereabouts in the regular +expression the problem was discovered. =back diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod index ef8048f..7f5a4ef 100644 --- a/pod/perlrecharclass.pod +++ b/pod/perlrecharclass.pod @@ -999,6 +999,9 @@ use it will raise a warning, unless disabled via Comments on this feature are welcome; send email to C<[email protected]>. +The rules used by L<C<use re 'strict>|re/'strict' mode> apply to this +construct. + We can extend the example above: /(?[ ( \p{Thai} + \p{Lao} ) & \p{Digit} ])/ diff --git a/proto.h b/proto.h index 4f2d687..c3adf2d 100644 --- a/proto.h +++ b/proto.h @@ -4747,7 +4747,7 @@ PERL_STATIC_INLINE STRLEN* S_get_invlist_iter_addr(SV* invlist) #define PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR \ assert(invlist) -STATIC bool S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** nodep, UV *code_point_p, int* cp_count, I32 *flagp, const U32 depth); +STATIC bool S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** nodep, UV *code_point_p, int* cp_count, I32 *flagp, const bool strict, const U32 depth); #define PERL_ARGS_ASSERT_GROK_BSLASH_N \ assert(pRExC_state); assert(flagp) PERL_STATIC_INLINE regnode* S_handle_named_backref(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, char * parse_start, char ch); diff --git a/regcomp.c b/regcomp.c index f3b185c..5dbccfb 100644 --- a/regcomp.c +++ b/regcomp.c @@ -1822,35 +1822,28 @@ S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc) * (unassigned, private use, surrogates, controls and formats). This * is a much large number. */ - const U32 max_match = (LOC) - ? 127 - : (! UNI_SEMANTICS) - ? 63 - : (invlist_highest(ssc->invlist) < 256) - ? 127 - : ((NON_OTHER_COUNT + 1) / 2) - 1; U32 count = 0; /* Running total of number of code points matched by 'ssc' */ UV start, end; /* Start and end points of current range in inversion list */ + const U32 max_code_points = (LOC) + ? 256 + : (( ! UNI_SEMANTICS + || invlist_highest(ssc->invlist) < 256) + ? 128 + : NON_OTHER_COUNT); + const U32 max_match = max_code_points / 2; PERL_ARGS_ASSERT_IS_SSC_WORTH_IT; invlist_iterinit(ssc->invlist); while (invlist_iternext(ssc->invlist, &start, &end)) { - - /* /u is the only thing that we expect to match above 255; so if not /u - * and even if there are matches above 255, ignore them. This catches - * things like \d under /d which does match the digits above 255, but - * since the pattern is /d, it is not likely to be expecting them */ - if (! UNI_SEMANTICS) { - if (start > 255) { - break; - } - end = MIN(end, 255); + if (start >= max_code_points) { + break; } + end = MIN(end, max_code_points - 1); count += end - start + 1; - if (count > max_match) { + if (count >= max_match) { invlist_iterfinish(ssc->invlist); return FALSE; } @@ -11394,6 +11387,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, UV * code_point_p, int * cp_count, I32 * flagp, + const bool strict, const U32 depth ) { @@ -11543,6 +11537,10 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, semantics */ if (endbrace == RExC_parse) { /* empty: \N{} */ + if (strict) { + RExC_parse++; /* Position after the "}" */ + vFAIL("Zero length \\N{}"); + } if (cp_count) { *cp_count = 0; } @@ -12422,6 +12420,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) NULL, /* Don't need a count of how many code points */ flagp, + RExC_strict, depth) ) { break; @@ -12748,6 +12747,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) NULL, /* Don't need a count of how many code points */ flagp, + RExC_strict, depth) ) { if (*flagp & NEED_UTF8) @@ -15529,6 +15529,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, &value, /* Yes single value */ &cp_count, /* Multiple code pt count */ flagp, + strict, depth) ) { @@ -15541,11 +15542,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, vFAIL("\\N in a character class must be a named character: \\N{...}"); } else if (cp_count == 0) { - if (strict) { - RExC_parse++; /* Position after the "}" */ - vFAIL("Zero length \\N{}"); - } - else if (PASS2) { + if (PASS2) { ckWARNreg(RExC_parse, "Ignoring zero length \\N{} in character class"); } @@ -15666,9 +15663,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* Look up the property name, and get its swash and * inversion list, if the property is found */ - if (swash) { /* Return any left-overs */ - SvREFCNT_dec_NN(swash); - } + SvREFCNT_dec(swash); /* Free any left-overs */ swash = _core_swash_init("utf8", name, &PL_sv_undef, 1, /* binary */ 0, /* not tr/// */ @@ -16344,7 +16339,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, "\"%.*s\" is more clearly written simply as \"%s\"", (int) (RExC_parse - rangebegin), rangebegin, - cntrl_to_mnemonic((char) value) + cntrl_to_mnemonic((U8) value) ); } } @@ -18427,11 +18422,16 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ bitmap_invlist); if (lv && lv != &PL_sv_undef) { char *s = savesvpv(lv); - char * const origs = s; + const char * const orig_s = s; /* Save the beginning of + 's', so can be freed */ + /* Ignore anything before the first \n */ while (*s && *s != '\n') s++; + /* The data are one range per line. A range is a single + * entity; or two, separated by \t. So can just convert \n + * to space and \t to '-' */ if (*s == '\n') { const char * const t = ++s; @@ -18452,10 +18452,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ if (*s == '\n') { /* Truncate very long output */ - if (s - origs > 256) { + if ((UV) (s - t) > 256) { Perl_sv_catpvf(aTHX_ sv, "%.*s...", - (int) (s - origs - 1), + (int) (s - t), t); goto out_dump; } @@ -18466,15 +18466,18 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ } s++; } + + /* Here, it fits in the allocated space. Replace a + * final blank with a NUL */ if (s[-1] == ' ') - s[-1] = 0; + s[-1] = '\0'; sv_catpv(sv, t); } out_dump: - Safefree(origs); + Safefree(orig_s); SvREFCNT_dec_NN(lv); } @@ -19173,14 +19176,11 @@ S_put_code_point(pTHX_ SV *sv, UV c) sv_catpvs(sv, "\\"); sv_catpvn(sv, &string, 1); } + else if (isMNEMONIC_CNTRL(c)) { + Perl_sv_catpvf(aTHX_ sv, "%s", cntrl_to_mnemonic((U8) c)); + } else { - const char * const mnemonic = cntrl_to_mnemonic((char) c); - if (mnemonic) { - Perl_sv_catpvf(aTHX_ sv, "%s", mnemonic); - } - else { - Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", (U8) c); - } + Perl_sv_catpvf(aTHX_ sv, "\\x%02X", (U8) c); } } @@ -19335,10 +19335,10 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) : NUM_ANYOF_CODE_POINTS - 1; #if NUM_ANYOF_CODE_POINTS > 256 format = (this_end < 256) - ? "\\x{%02"UVXf"}-\\x{%02"UVXf"}" + ? "\\x%02"UVXf"-\\x%02"UVXf"" : "\\x{%04"UVXf"}-\\x{%04"UVXf"}"; #else - format = "\\x{%02"UVXf"}-\\x{%02"UVXf"}"; + format = "\\x%02"UVXf"-\\x%02"UVXf""; #endif GCC_DIAG_IGNORE(-Wformat-nonliteral); Perl_sv_catpvf(aTHX_ sv, format, start, this_end); diff --git a/regcomp.h b/regcomp.h index 44c2c1c..07e098a 100644 --- a/regcomp.h +++ b/regcomp.h @@ -553,7 +553,7 @@ struct regnode_ssc { * are cautioned about its shared nature */ #define ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER 0x80 -#define ANYOF_FLAGS_ALL (0xff) +#define ANYOF_FLAGS_ALL (0xff & ~0x10) #define ANYOF_LOCALE_FLAGS (ANYOFL_FOLD | ANYOF_MATCHES_POSIXL) diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t index 6ec5d94..d05922e 100644 --- a/t/re/reg_mesg.t +++ b/t/re/reg_mesg.t @@ -295,7 +295,9 @@ my @death_only_under_strict = ( 'm/[\x{ABCDEFG}]/' => 'Illegal hexadecimal digit \'G\' ignored', => 'Non-hex character {#} m/[\x{ABCDEFG{#}}]/', 'm/[\N{}]/' => 'Ignoring zero length \\N{} in character class {#} m/[\\N{}{#}]/', - => 'Zero length \\N{} {#} m/[\\N{}]{#}/', + => 'Zero length \\N{} {#} m/[\\N{}{#}]/', + 'm/\N{}/' => "", + => 'Zero length \\N{} {#} m/\\N{}{#}/', "m'[\\y]\\x{100}'" => 'Unrecognized escape \y in character class passed through {#} m/[\y{#}]\x{100}/', => 'Unrecognized escape \y in character class {#} m/[\y{#}]\x{100}/', 'm/[a-\d]\x{100}/' => 'False [] range "a-\d" {#} m/[a-\d{#}]\x{100}/', -- Perl5 Master Repository
