In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/e061ec81dcb60a85554e31a39475b89528de42f0?hp=5ad5b34cb2af84d4f37219a5dee752fca0459151>
- Log ----------------------------------------------------------------- commit e061ec81dcb60a85554e31a39475b89528de42f0 Merge: 5ad5b34... 71e9c53... Author: Rafael Garcia-Suarez <[email protected]> Date: Thu Jun 18 11:20:53 2009 +0200 Merge branch 'backslash-N' into blead commit 71e9c5323d288a57ab4e6570f3aee42167c6d5e7 Author: Rafael Garcia-Suarez <[email protected]> Date: Wed Jun 17 18:54:41 2009 +0200 Add perldelta entry for \N M pod/perl5110delta.pod commit cbf4a1297456c507afef540d31b7fadad2d72425 Author: Rafael Garcia-Suarez <[email protected]> Date: Wed Jun 17 18:50:00 2009 +0200 Some more tests for \N M t/op/re_tests commit c741660aac46f1784b5ef16aeab5e4958a91df78 Author: Rafael Garcia-Suarez <[email protected]> Date: Tue Jun 16 08:40:50 2009 +0200 Basic docs for \N M pod/perlre.pod M pod/perlrebackslash.pod M pod/perlrecharclass.pod M pod/perlreref.pod M pod/perltodo.pod commit afefe6bfcf9956c77e5f9eee351e3d13be12ea3b Author: Rafael Garcia-Suarez <[email protected]> Date: Tue Jun 16 08:27:23 2009 +0200 Implement new regex escape \N \N, like in Perl 6, is equivalent to . but not influenced by /s. It matches any character except \n. Note that followed by { and a non-number, \N is still a named character. M embed.fnc M embed.h M proto.h M regcomp.c M t/op/re_tests ----------------------------------------------------------------------- Summary of changes: embed.fnc | 2 +- embed.h | 2 +- pod/perl5110delta.pod | 8 ++++++++ pod/perlre.pod | 1 + pod/perlrebackslash.pod | 1 + pod/perlrecharclass.pod | 10 ++++++++-- pod/perlreref.pod | 1 + pod/perltodo.pod | 6 ------ proto.h | 2 +- regcomp.c | 32 +++++++++++++++++++++++--------- t/op/re_tests | 22 +++++++++++++++++++++- 11 files changed, 66 insertions(+), 21 deletions(-) diff --git a/embed.fnc b/embed.fnc index 68f3817..439203c 100644 --- a/embed.fnc +++ b/embed.fnc @@ -1593,7 +1593,7 @@ Es |UV |reg_recode |const char value|NN SV **encp Es |regnode*|regpiece |NN struct RExC_state_t *pRExC_state \ |NN I32 *flagp|U32 depth Es |regnode*|reg_namedseq |NN struct RExC_state_t *pRExC_state \ - |NULLOK UV *valuep + |NULLOK UV *valuep|NULLOK I32 *flagp Es |void |reginsert |NN struct RExC_state_t *pRExC_state \ |U8 op|NN regnode *opnd|U32 depth Es |void |regtail |NN struct RExC_state_t *pRExC_state \ diff --git a/embed.h b/embed.h index e320dc5..9af17f6 100644 --- a/embed.h +++ b/embed.h @@ -3756,7 +3756,7 @@ #define reg_node(a,b) S_reg_node(aTHX_ a,b) #define reg_recode(a,b) S_reg_recode(aTHX_ a,b) #define regpiece(a,b,c) S_regpiece(aTHX_ a,b,c) -#define reg_namedseq(a,b) S_reg_namedseq(aTHX_ a,b) +#define reg_namedseq(a,b,c) S_reg_namedseq(aTHX_ a,b,c) #define reginsert(a,b,c,d) S_reginsert(aTHX_ a,b,c,d) #define regtail(a,b,c,d) S_regtail(aTHX_ a,b,c,d) #define reg_scan_name(a,b) S_reg_scan_name(aTHX_ a,b) diff --git a/pod/perl5110delta.pod b/pod/perl5110delta.pod index afd1b35..720b5d2 100644 --- a/pod/perl5110delta.pod +++ b/pod/perl5110delta.pod @@ -109,6 +109,14 @@ to avoid relying on the object's underlying structure). This pragma allows you to lexically disable or enable overloading for some or all operations. (Yuval Kogman) +=head2 C<\N> regex escape + +A new regex escape has been added, C<\N>. It will match any character that +is not a newline, independently from the presence or absence of the single +line match modifier C</s>. (If C<\N> is followed by an opening brace and +by a letter, perl will still assume that a Unicode character name is +coming, so compatibility is preserved.) (Rafael Garcia-Suarez) + =head1 Modules and Pragmata =head2 Pragmata Changes diff --git a/pod/perlre.pod b/pod/perlre.pod index a076d3a..ee1c2cb 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -271,6 +271,7 @@ X<word> X<whitespace> X<character class> X<backreference> \g{name} Named backreference \k<name> Named backreference \K Keep the stuff left of the \K, don't include it in $& + \N Any character but \n \v Vertical whitespace \V Not vertical whitespace \h Horizontal whitespace diff --git a/pod/perlrebackslash.pod b/pod/perlrebackslash.pod index ddd7abe..40f73fc 100644 --- a/pod/perlrebackslash.pod +++ b/pod/perlrebackslash.pod @@ -83,6 +83,7 @@ quoted constructs>. \l Lowercase next character. \L Lowercase till \E. \n (Logical) newline character. + \N Any character but newline. \N{} Named (Unicode) character. \p{}, \pP Character with a Unicode property. \P{}, \PP Character without a Unicode property. diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod index a626dd9..930c0fc 100644 --- a/pod/perlrecharclass.pod +++ b/pod/perlrecharclass.pod @@ -38,7 +38,6 @@ Here are some examples: "\n" =~ /(?s:.)/ # Match (local 'single line' modifier) "ab" =~ /^.$/ # No match (dot matches one character) - =head2 Backslashed sequences Perl regular expressions contain many backslashed sequences that @@ -59,6 +58,7 @@ more detail below. \S Match a non-white space character. \h Match a horizontal white space character. \H Match a character that isn't horizontal white space. + \N Match a character that isn't newline. \v Match a vertical white space character. \V Match a character that isn't vertical white space. \pP, \p{Prop} Match a character matching a Unicode property. @@ -94,7 +94,7 @@ Any character that isn't matched by C<\w> will be matched by C<\W>. =head3 White space -C<\s> matches any single character that is consider white space. In the +C<\s> matches any single character that is considered white space. In the ASCII range, C<\s> matches the horizontal tab (C<\t>), the new line (C<\n>), the form feed (C<\f>), the carriage return (C<\r>), and the space (the vertical tab, C<\cK> is not matched by C<\s>). The exact set @@ -113,6 +113,12 @@ C<\h> will match any character that is considered horizontal white space; this includes the space and the tab characters. C<\H> will match any character that is not considered horizontal white space. +C<\N>, like the dot, will match any character that is not a newline. The +difference is that C<\N> will not be influenced by the single line C</s> +regular expression modifier. (Note that, since C<\N{}> is also used for +Unicode named characters, if C<\N> is followed by an opening brace and +by a letter, perl will assume that a Unicode character name is coming.) + C<\v> will match any character that is considered vertical white space; this includes the carriage return and line feed characters (newline). C<\V> will match any character that is not considered vertical white space. diff --git a/pod/perlreref.pod b/pod/perlreref.pod index b9fb3b0..87baab2 100644 --- a/pod/perlreref.pod +++ b/pod/perlreref.pod @@ -125,6 +125,7 @@ and L<perlunicode> for details. \S A non-whitespace character \h An horizontal white space \H A non horizontal white space + \N A non newline (like . without /s) \v A vertical white space \V A non vertical white space \R A generic newline (?>\v|\x0D\x0A) diff --git a/pod/perltodo.pod b/pod/perltodo.pod index cf0304a..c84b51e 100644 --- a/pod/perltodo.pod +++ b/pod/perltodo.pod @@ -1101,12 +1101,6 @@ combines the code in pp_entersub, pp_leavesub. This should probably be done 1st in XS, and using B::Generate to patch the new OP into the optrees. -=head2 C<\N> - -It should be possible to add a C<\N> regex assertion, meaning "every -character except C<\n>° independently of the context. That would -of course imply that C<\N> couldn't be followed by an opening C<{>. - =head1 Big projects Tasks that will get your name mentioned in the description of the "Highlights diff --git a/proto.h b/proto.h index 78f17dd..285e05f 100644 --- a/proto.h +++ b/proto.h @@ -5178,7 +5178,7 @@ STATIC regnode* S_regpiece(pTHX_ struct RExC_state_t *pRExC_state, I32 *flagp, U #define PERL_ARGS_ASSERT_REGPIECE \ assert(pRExC_state); assert(flagp) -STATIC regnode* S_reg_namedseq(pTHX_ struct RExC_state_t *pRExC_state, UV *valuep) +STATIC regnode* S_reg_namedseq(pTHX_ struct RExC_state_t *pRExC_state, UV *valuep, I32 *flagp) __attribute__nonnull__(pTHX_1); #define PERL_ARGS_ASSERT_REG_NAMEDSEQ \ assert(pRExC_state) diff --git a/regcomp.c b/regcomp.c index e061528..bc7086f 100644 --- a/regcomp.c +++ b/regcomp.c @@ -6553,7 +6553,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* reg_namedseq(pRExC_state,UVp) This is expected to be called by a parser routine that has - recognized'\N' and needs to handle the rest. RExC_parse is + recognized '\N' and needs to handle the rest. RExC_parse is expected to point at the first char following the N at the time of the call. @@ -6567,11 +6567,11 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) be returned to indicate failure. (This will NOT be a valid pointer to a regnode.) - If value is null then it is assumed that we are parsing normal text + If valuep is null then it is assumed that we are parsing normal text and inserts a new EXACT node into the program containing the resolved string and returns a pointer to the new node. If the string is zerolength a NOTHING node is emitted. - + On success RExC_parse is set to the char following the endbrace. Parsing failures will generate a fatal errorvia vFAIL(...) @@ -6585,7 +6585,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) */ STATIC regnode * -S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) +S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp) { char * name; /* start of the content of the name */ char * endbrace; /* endbrace following the name */ @@ -6597,8 +6597,22 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) PERL_ARGS_ASSERT_REG_NAMEDSEQ; - if (*RExC_parse != '{') { - vFAIL("Missing braces on \\N{}"); + if (*RExC_parse != '{' || + (*RExC_parse == '{' && RExC_parse[1] + && strchr("0123456789", RExC_parse[1]))) + { + GET_RE_DEBUG_FLAGS_DECL; + if (valuep) + /* no bare \N in a charclass */ + vFAIL("Missing braces on \\N{}"); + GET_RE_DEBUG_FLAGS; + nextchar(pRExC_state); + ret = reg_node(pRExC_state, REG_ANY); + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + RExC_parse--; + Set_Node_Length(ret, 1); /* MJD */ + return ret; } name = RExC_parse+1; endbrace = strchr(RExC_parse, '}'); @@ -7159,12 +7173,12 @@ tryagain: } break; case 'N': - /* Handle \N{NAME} here and not below because it can be + /* Handle \N and \N{NAME} here and not below because it can be multicharacter. join_exact() will join them up later on. Also this makes sure that things like /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq*/ ++RExC_parse; - ret= reg_namedseq(pRExC_state, NULL); + ret= reg_namedseq(pRExC_state, NULL, flagp); break; case 'k': /* Handle \k<NAME> and \k'NAME' */ parse_named_seq: @@ -7964,7 +7978,7 @@ parseit: from earlier versions, OTOH that behaviour was broken as well. */ UV v; /* value is register so we cant & it /grrr */ - if (reg_namedseq(pRExC_state, &v)) { + if (reg_namedseq(pRExC_state, &v, NULL)) { goto parseit; } value= v; diff --git a/t/op/re_tests b/t/op/re_tests index f9b070d..0c04840 100644 --- a/t/op/re_tests +++ b/t/op/re_tests @@ -31,6 +31,12 @@ ab*bc abbbbc y $+[0] 6 .{3,4} abbbbc y $& abbb .{3,4} abbbbc y $-[0] 0 .{3,4} abbbbc y $+[0] 4 +\N{1} abbbbc y $& a +\N{1} abbbbc y $-[0] 0 +\N{1} abbbbc y $+[0] 1 +\N{3,4} abbbbc y $& abbb +\N{3,4} abbbbc y $-[0] 0 +\N{3,4} abbbbc y $+[0] 4 ab{0,}bc abbbbc y $& abbbbc ab{0,}bc abbbbc y $-[0] 0 ab{0,}bc abbbbc y $+[0] 6 @@ -69,8 +75,11 @@ abc$ aabcd n - - $ abc y $& a.c abc y $& abc a.c axc y $& axc +a\Nc abc y $& abc a.*c axyzc y $& axyzc +a\N*c axyzc y $& axyzc a.*c axyzd n - - +a\N*c axyzd n - - a[bc]d abc n - - a[bc]d abd y $& abd a[b]d abd y $& abd @@ -78,6 +87,7 @@ a[b]d abd y $& abd .[b]. abd y $& abd .[b]. aBd n - - (?i:.[b].) abd y $& abd +(?i:\N[b]\N) abd y $& abd a[b-d]e abd n - - a[b-d]e ace y $& ace a[b-d] aac y $& ac @@ -315,6 +325,7 @@ a[-]?c ac y $& ac '$'i ABC y $& 'a.c'i ABC y $& ABC 'a.c'i AXC y $& AXC +'a\Nc'i ABC y $& ABC 'a.*?c'i AXYZC y $& AXYZC 'a.*c'i AXYZD n - - 'a[bc]d'i ABC n - - @@ -497,8 +508,11 @@ a(?:b|(c|e){1,2}?|d)+?(.) ace y $1$2 ce '(?-i:a)b'i AB n - - '((?-i:a))b'i AB n - - '((?-i:a.))b'i a\nB n - - +'((?-i:a\N))b'i a\nB n - - '((?s-i:a.))b'i a\nB y $1 a\n +'((?s-i:a\N))b'i a\nB n - - '((?s-i:a.))b'i B\nB n - - +'((?s-i:a\N))b'i B\nB n - - (?:c|d)(?:)(?:a(?:)(?:b)(?:b(?:))(?:b(?:)(?:b))) cabbbb y $& cabbbb (?:c|d)(?:)(?:aaaaaaaa(?:)(?:bbbbbbbb)(?:bbbbbbbb(?:))(?:bbbbbbbb(?:)(?:bbbbbbbb))) caaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb y $& caaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb '(ab)\d\1'i Ab4ab y $1 Ab @@ -531,6 +545,8 @@ x(~~)*(?:(?:F)?)? x~~ y - - ((?s).)c(?!.) a\nb\nc\n y $1:$& \n:\nc ((?s)b.)c(?!.) a\nb\nc\n y $1 b\n ((?s)b.)c(?!.) a\nb\nc\n y $1:$& b\n:b\nc +((?s)b.)c(?!\N) a\nb\nc\n y $1:$& b\n:b\nc +'(b.)c(?!\N)'s a\nb\nc\n y $1:$& b\n:b\nc ^b a\nb\nc\n n - - ()^b a\nb\nc\n n - - ((?m)^b) a\nb\nc\n y $1 b @@ -1294,6 +1310,7 @@ X(\w+)(?=\s)|X(\w+) Xab y [$1-$2] [-ab] (?|(?|(a)|(b))|(?|(c)|(d))) c y $1 c (?|(?|(a)|(b))|(?|(c)|(d))) d y $1 d (.)(?|(.)(.)x|(.)d)(.) abcde y $1-$2-$3-$4-$5- b-c--e-- +(\N)(?|(\N)(\N)x|(\N)d)(\N) abcde y $1-$2-$3-$4-$5- b-c--e-- #Bug #41492 (?(DEFINE)(?<A>(?&B)+)(?<B>a))(?&A) a y $& a (?(DEFINE)(?<A>(?&B)+)(?<B>a))(?&A) aa y $& aa @@ -1340,7 +1357,10 @@ foo(\h)bar foo\tbar y $1 \t (\H)(\h) foo\tbar y $1-$2 o-\t (\h)(\H) foo\tbar y $1-$2 \t-b -.*\z foo\n y - - +.*\z foo\n y -$&- -- +\N*\z foo\n y -$&- -- +.*\Z foo\n y -$&- -foo- +\N*\Z foo\n y -$&- -foo- ^(?:(\d)x)?\d$ 1 y ${\(defined($1)?1:0)} 0 .*?(?:(\w)|(\w))x abx y $1-$2 b- -- Perl5 Master Repository
