In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/902994e45aafa5c63ac8bf2219075daf29139b3c?hp=7e62f75cb6326421c6dade1c6ca08206084d7348>
- Log ----------------------------------------------------------------- commit 902994e45aafa5c63ac8bf2219075daf29139b3c Author: Karl Williamson <[email protected]> Date: Fri Jan 18 21:58:10 2013 -0700 Extend strictness for qr/(?[ \N{} ])/ This recently added regex syntax imposes stricter rules on parsing than normal. However, this did not include parsing \N{} constructs that occur within it. This commit does that, making fatal the warnings that come from \N{} I will add to perldiag the newly added messages along with the others for (?[ ]) before 5.18 ships ----------------------------------------------------------------------- Summary of changes: embed.fnc | 7 ++++--- embed.h | 2 +- proto.h | 2 +- regcomp.c | 32 ++++++++++++++++++++++++-------- t/porting/diag.t | 2 ++ t/re/pat_advanced.t | 3 +++ t/re/reg_mesg.t | 2 ++ 7 files changed, 37 insertions(+), 13 deletions(-) diff --git a/embed.fnc b/embed.fnc index 1fd1f4e..53c582d 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1983,9 +1983,10 @@ Es |regnode*|reg_node |NN struct RExC_state_t *pRExC_state|U8 op Es |UV |reg_recode |const char value|NN SV **encp Es |regnode*|regpiece |NN struct RExC_state_t *pRExC_state \ |NN I32 *flagp|U32 depth -Es |bool |grok_bslash_N |NN struct RExC_state_t *pRExC_state \ - |NULLOK regnode** nodep|NULLOK UV *valuep \ - |NN I32 *flagp|U32 depth|bool in_char_class +Es |bool |grok_bslash_N |NN struct RExC_state_t *pRExC_state \ + |NULLOK regnode** nodep|NULLOK UV *valuep \ + |NN I32 *flagp|U32 depth|bool in_char_class \ + |const bool strict Es |void |reginsert |NN struct RExC_state_t *pRExC_state \ |U8 op|NN regnode *opnd|U32 depth Es |void |regtail |NN struct RExC_state_t *pRExC_state \ diff --git a/embed.h b/embed.h index 470805d..1df6ab4 100644 --- a/embed.h +++ b/embed.h @@ -910,7 +910,7 @@ #define get_invlist_previous_index_addr(a) S_get_invlist_previous_index_addr(aTHX_ a) #define get_invlist_version_id_addr(a) S_get_invlist_version_id_addr(aTHX_ a) #define get_invlist_zero_addr(a) S_get_invlist_zero_addr(aTHX_ a) -#define grok_bslash_N(a,b,c,d,e,f) S_grok_bslash_N(aTHX_ a,b,c,d,e,f) +#define grok_bslash_N(a,b,c,d,e,f,g) S_grok_bslash_N(aTHX_ a,b,c,d,e,f,g) #define handle_sets(a,b,c,d) S_handle_sets(aTHX_ a,b,c,d) #define invlist_array(a) S_invlist_array(aTHX_ a) #define invlist_clone(a) S_invlist_clone(aTHX_ a) diff --git a/proto.h b/proto.h index e0c3279..0d0078d 100644 --- a/proto.h +++ b/proto.h @@ -6503,7 +6503,7 @@ PERL_STATIC_INLINE UV* S_get_invlist_zero_addr(pTHX_ SV* invlist) #define PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR \ assert(invlist) -STATIC bool S_grok_bslash_N(pTHX_ struct RExC_state_t *pRExC_state, regnode** nodep, UV *valuep, I32 *flagp, U32 depth, bool in_char_class) +STATIC bool S_grok_bslash_N(pTHX_ struct RExC_state_t *pRExC_state, regnode** nodep, UV *valuep, I32 *flagp, U32 depth, bool in_char_class, const bool strict) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_4); #define PERL_ARGS_ASSERT_GROK_BSLASH_N \ diff --git a/regcomp.c b/regcomp.c index 05e9fe5..a22f8ff 100644 --- a/regcomp.c +++ b/regcomp.c @@ -9634,7 +9634,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } STATIC bool -S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class) +S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class, + const bool strict /* Apply stricter parsing rules? */ + ) { /* This is expected to be called by a parser routine that has recognized '\N' @@ -9749,9 +9751,14 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I } else if (in_char_class) { if (SIZE_ONLY && in_char_class) { - ckWARNreg(RExC_parse, - "Ignoring zero length \\N{} in character class" - ); + if (strict) { + RExC_parse++; /* Position after the "}" */ + vFAIL("Zero length \\N{}"); + } + else { + ckWARNreg(RExC_parse, + "Ignoring zero length \\N{} in character class"); + } } ret = FALSE; } @@ -9803,7 +9810,13 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I } if (in_char_class && has_multiple_chars) { - ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class"); + if (strict) { + RExC_parse = endbrace; + vFAIL("\\N{} in character class restricted to one character"); + } + else { + ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class"); + } } RExC_parse = endbrace + 1; @@ -10339,7 +10352,8 @@ tryagain: * special treatment for quantifiers is not needed for such single * character sequences */ ++RExC_parse; - if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) { + if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE, + FALSE /* not strict */ )) { RExC_parse--; goto defchar; } @@ -10603,7 +10617,8 @@ tryagain: * */ RExC_parse = p + 1; if (! grok_bslash_N(pRExC_state, NULL, &ender, - flagp, depth, FALSE)) + flagp, depth, FALSE, + FALSE /* not strict */ )) { RExC_parse = p = oldp; goto loopdone; @@ -11988,7 +12003,8 @@ parseit: from earlier versions, OTOH that behaviour was broken as well. */ if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth, - TRUE /* => charclass */)) + TRUE, /* => charclass */ + strict)) { goto parseit; } diff --git a/t/porting/diag.t b/t/porting/diag.t index 13f1811..7355151 100644 --- a/t/porting/diag.t +++ b/t/porting/diag.t @@ -658,3 +658,5 @@ Operation "%s" returns its argument for UTF-16 surrogate U+%X Unicode surrogate U+%X is illegal in UTF-8 UTF-16 surrogate U+%X False [] range "%s" in regex; marked by <-- HERE in m/%s/ +\N{} in character class restricted to one character in regex; marked by <-- HERE in m/%s/ +Zero length \N{} in regex; marked by <-- HERE in m/%s/ diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t index a411220..60ae9d6 100644 --- a/t/re/pat_advanced.t +++ b/t/re/pat_advanced.t @@ -1014,6 +1014,9 @@ sub run_tests { ok "\N{LONG-STR}" =~ /^\N{LONG-STR}$/, 'Verify that long string works'; ok "\N{LONG-STR}" =~ /^\N{LONG-STR}$/i, 'Verify under folding that long string works'; + eval '/(?[[\N{EMPTY-STR}]])/'; + ok $@ && $@ =~ /Zero length \\N\{}/; + undef $w; eval q [is("\N{TOO MANY SPACES}", "TOO MANY SPACES", "Multiple spaces in character name works")]; like ($w, qr/A sequence of multiple spaces in a charnames alias definition is deprecated/, "... but returns a deprecation warning"); diff --git a/t/re/reg_mesg.t b/t/re/reg_mesg.t index 14e9ace..30bc2d6 100644 --- a/t/re/reg_mesg.t +++ b/t/re/reg_mesg.t @@ -147,7 +147,9 @@ my @death = 'm/(?[[\w-x]])/' => 'False [] range "\w-" in regex; marked by {#} in m/(?[[\w-{#}x]])/', 'm/(?[[a-\pM]])/' => 'False [] range "a-\pM" in regex; marked by {#} in m/(?[[a-\pM{#}]])/', 'm/(?[[\pM-x]])/' => 'False [] range "\pM-" in regex; marked by {#} in m/(?[[\pM-{#}x]])/', + 'm/(?[[\N{LATIN CAPITAL LETTER A WITH MACRON AND GRAVE}]])/' => '\N{} in character class restricted to one character in regex; marked by {#} in m/(?[[\N{U+100.300{#}}]])/', ); +# Tests involving a user-defined charnames translator are in pat_advanced.t ## ## Key-value pairs of code/error of code that should have non-fatal warnings. -- Perl5 Master Repository
