In perl.git, the branch smoke-me/remove-regcomp-setjmp has been updated <http://perl5.git.perl.org/perl.git/commitdiff/1f605877dc7b1d1b806d6f0e30dc1d5adbfe6ef3?hp=281ebc7936c3263bdbd589f5d2fe217e39676b43>
- Log ----------------------------------------------------------------- commit 1f605877dc7b1d1b806d6f0e30dc1d5adbfe6ef3 Author: Nicholas Clark <[email protected]> Date: Wed Jan 16 21:58:02 2013 +0100 Test that UTF-8 in the look-ahead of (?(?=...)...) restarts the sizing parse. S_reg() recurses to itself to parse various constructions used as the conditionals in conditional matching. Look-aheads and look-behinds can turn out to need to be sized as UTF-8, which can cause the inner S_reg() to use the macro REQUIRE_UTF8 to restart the parse. Test that this is handled correctly. M t/re/re_tests commit c58f43dacc644d292dd42318762b4f5f975a6206 Author: Nicholas Clark <[email protected]> Date: Wed Jan 16 17:08:03 2013 +0100 Test that S_grok_bslash_N() copes if S_reg() restarts the sizing parse. S_reg() can discover midway through parsing the pattern to determine its size, that the pattern will actually need to be encoded as UTF-8. If calculations so far have been done in terms of bytes, then the macro REQUIRE_UTF8 is used to restart the parse, so that sizes can be calculated correctly for UTF-8. It is possible to trigger this restart when processing multi-character charnames interpolated into the pattern using \N{}. Test that this is handled correctly. M t/re/pat_advanced.t ----------------------------------------------------------------------- Summary of changes: t/re/pat_advanced.t | 8 ++++++++ t/re/re_tests | 4 ++++ 2 files changed, 12 insertions(+), 0 deletions(-) diff --git a/t/re/pat_advanced.t b/t/re/pat_advanced.t index a411220..af1bb93 100644 --- a/t/re/pat_advanced.t +++ b/t/re/pat_advanced.t @@ -1079,6 +1079,14 @@ sub run_tests { eval "q(W) =~ /\\N{$name}/"; ok ! $w, 'Verify that latin1 letter in name doesnt give warning'; + # This tests the code path that restarts the parse when the recursive + # call to S_reg() from within S_grok_bslash_N() discovers that the + # pattern needs to be recalculated as UTF-8. 
use eval to avoid + # needing literal Unicode in this source file: + my $r = eval "qr/\\N{\x{100}\x{100}}/"; + isnt $r, undef, "Generated regex for multi-char UTF-8 charname" + or diag($@); + ok "\x{100}\x{100}" =~ $r, "which matches"; } { diff --git a/t/re/re_tests b/t/re/re_tests index e2a7e89..17ed9eb 100644 --- a/t/re/re_tests +++ b/t/re/re_tests @@ -592,6 +592,10 @@ x(~~)*(?:(?:F)?)? x~~ y - - (?(?!a)b|a) a y $& a (?(?=a)b|a) a n - - (?(?=a)a|b) a y $& a +(?(?!\x{100})\x{100}|b) \x{100} n - - +(?(?!\x{100})b|\x{100}) \x{100} y $& \x{100} +(?(?=\x{100})b|\x{100}) \x{100} n - - +(?(?=\x{100})\x{100}|b) \x{100} y $& \x{100} (?=(a+?))(\1ab) aaab y $2 aab ^(?=(a+?))\1ab aaab n - - (\w+:)+ one: y $1 one: -- Perl5 Master Repository
