In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/35146e33497dadaa6d9dff57c12764af3e85b229?hp=60275626fd188140155d5af48278ea124efa392f>
- Log ----------------------------------------------------------------- commit 35146e33497dadaa6d9dff57c12764af3e85b229 Author: Karl Williamson <[email protected]> Date: Wed Dec 1 16:36:44 2010 -0700 Nit in perluniintro.pod M pod/perluniintro.pod commit cbf9b121af7809c4ce7bec337045f2eee9dc3d5b Author: Karl Williamson <[email protected]> Date: Wed Dec 1 16:34:58 2010 -0700 Nit in perlunicode.pod M pod/perlunicode.pod commit 315f11aa5a6d12fadc6dc03ca3bc26c69af24cb0 Author: Karl Williamson <[email protected]> Date: Wed Dec 1 16:34:25 2010 -0700 Nit in perlre.pod M pod/perlre.pod commit 20db750130061015fab1ffed94ff374c2bd38af3 Author: Karl Williamson <[email protected]> Date: Wed Dec 1 16:33:54 2010 -0700 Document Unicode doc fix M lib/feature.pm M pod/perldelta.pod M pod/perlre.pod M pod/perlunicode.pod M pod/perlunifaq.pod commit 4ee7c0eabacb52cfaad975a33feeb842bbf347b3 Author: Karl Williamson <[email protected]> Date: Wed Dec 1 16:15:18 2010 -0700 Nit in perlunicode.pod M pod/perlunicode.pod commit 371a505ea6dc25429b55acd7ad25ddbd1bf3a38f Author: Karl Williamson <[email protected]> Date: Tue Nov 30 22:58:37 2010 -0700 re/fold_grind.t: Add tests for NREFFU, REFFU This adds simple tests for these. Inspection of the code indicated to me that more complex tests were not warranted. M t/re/fold_grind.t commit 4444fd9fcde4c2a822a058883c6dc4fd29359931 Author: Karl Williamson <[email protected]> Date: Tue Nov 30 22:35:13 2010 -0700 regcomp.c: Generate REFFU and NREFFU This causes the new nodes that denote Unicode semantics in backreferences to be generated when appropriate. Because the addition of these nodes was at the end of the node list, the arithmetic relation that previously was valid no longer is. M regcomp.c commit d7ef4b7364482c4749537ca35a54bca0956e5709 Author: Karl Williamson <[email protected]> Date: Tue Nov 30 22:05:25 2010 -0700 regexec.c: Handle REFFU and NREFFU; refactor This adds handling of the Unicode folding semantics capture buffer backreferences. I've refactored the code so that the case statements set up the type of folding, to avoid having to test for which in the common code. Also, the previous code was confusing fold case and lowercase. There is already a routine to handle the fold case, so that simplified things. M regexec.c commit d08723ac389e1bb953f17d6b0bf72f567509de20 Author: Karl Williamson <[email protected]> Date: Tue Nov 30 21:49:20 2010 -0700 re/fold_grind.t: Refactor to test utf8 patterns. The previous version wasn't really testing utf8 patterns. M t/re/fold_grind.t commit 7fcd3a28e7ffbe190def12c7e16a921175e2339c Author: Karl Williamson <[email protected]> Date: Tue Nov 30 21:39:16 2010 -0700 regcomp.sym: Add REFFU and NREFFU nodes These will be used for matching capture buffers case-insensitively using Unicode semantics. make regen will regenerate the delivered regnodes.h M regcomp.sym M regnodes.h commit 60c7e6729abcbf29933292741d6e80291f00a7c0 Author: Karl Williamson <[email protected]> Date: Tue Nov 30 21:38:09 2010 -0700 regcomp.sym: update comment M regcomp.sym commit 54251c2ea6cf7f216b5de51bbed4a87b4bf578a4 Author: Karl Williamson <[email protected]> Date: Tue Nov 30 19:00:00 2010 -0700 regcomp.c: Use latin1 folding in synthetic start class This is because the pattern may not specify unicode semantics, but if the target matching string is in utf8, then unicode semantics may be needed nonetheless. So to avoid the regexec optimizer rejecting the match, we need to allow for a possible false positive. M regcomp.c commit 7b98bc43488ec15a4fe9ecdcfe8fc67135640c03 Author: Karl Williamson <[email protected]> Date: Tue Nov 30 18:10:37 2010 -0700 regcomp.c: utf8 pattern defaults to Unicode semantics A utf8 pattern should force unicode semantics unless otherwise overridden. This means that the 'd' regex modifier means Unicode semantics as well. M regcomp.c M t/re/reg_fold.t commit 1e696034880c724355310894883f86e27e0cb264 Author: Karl Williamson <[email protected]> Date: Tue Nov 30 17:48:22 2010 -0700 regcomp.c: teach tries about EXACTFU M regcomp.c commit 4b714af6fa31f14f6cc58acda554a5dbef2f5248 Author: Karl Williamson <[email protected]> Date: Tue Nov 30 17:45:54 2010 -0700 regcomp.c: typo in comment M regcomp.c commit b2a1b324431ad3984f2da6f52e6d2bff1f36b802 Author: Karl Williamson <[email protected]> Date: Tue Nov 30 17:43:35 2010 -0700 re/reg_fold.t: use array size for test counts M t/re/reg_fold.t commit 164739d1f34e0a80ea5ed1e9bf03a54e4f30c740 Author: Karl Williamson <[email protected]> Date: Tue Nov 30 14:02:08 2010 -0700 regcomp.c: Remove duplicate statement The flags this statement cleared are cleared as part of the macro called just before it. M regcomp.c ----------------------------------------------------------------------- Summary of changes: lib/feature.pm | 21 +++++++-- pod/perldelta.pod | 33 +++++++++++--- pod/perlre.pod | 46 +++++++++++--------- pod/perlunicode.pod | 65 +++++++++------------------- pod/perlunifaq.pod | 42 +++++++++--------- pod/perluniintro.pod | 2 +- regcomp.c | 107 ++++++++++++++++++++++++++++++++-------------- regcomp.sym | 9 ++++- regexec.c | 115 +++++++++++++++++++++++++++++++------------------- regnodes.h | 29 +++++++++---- t/re/fold_grind.t | 47 +++++++++++++++----- t/re/reg_fold.t | 13 ++---- 12 files changed, 325 insertions(+), 204 deletions(-) diff --git a/lib/feature.pm b/lib/feature.pm index f8a9078..c70010d 100644 --- a/lib/feature.pm +++ b/lib/feature.pm @@ -105,11 +105,22 @@ See L<perlsub/"Persistent Private Variables"> for details. =head2 the 'unicode_strings' feature -C<use feature 'unicode_strings'> tells the compiler to treat -all strings outside of C<use locale> and C<use bytes> as Unicode. It is -available starting with Perl 5.11.3, but is not fully implemented. - -See L<perlunicode/The "Unicode Bug"> for details. +C<use feature 'unicode_strings'> tells the compiler to use Unicode semantics +in all string operations executed within its scope (unless they are also +within the scope of either C<use locale> or C<use bytes>). The same applies +to all regular expressions compiled within the scope, even if executed outside +it. + +C<no feature 'unicode_strings'> tells the compiler to use the traditional +Perl semantics wherein the native character set semantics is used unless it is +clear to Perl that Unicode is desired. This can lead to some surprises +when the behavior suddenly changes. (See +L<perlunicode/The "Unicode Bug"> for details.) For this reason, if you are +potentially using Unicode in your program, the +C<use feature 'unicode_strings'> subpragma is B<strongly> recommended. + +This subpragma is available starting with Perl 5.11.3, but was not fully +implemented until 5.13.8. =head1 FEATURE BUNDLES diff --git a/pod/perldelta.pod b/pod/perldelta.pod index cfeff1f..b7d710b 100644 --- a/pod/perldelta.pod +++ b/pod/perldelta.pod @@ -2,7 +2,6 @@ =for comment This has been completed up to 779bcb7d, except for: -1b9f127-fad448f (Karl Williamson says he will do this) ad9e76a8629ed1ac483f0a7ed0e4da40ac5a1a00 d9a4b459f94297889956ac3adc42707365f274c2 @@ -81,6 +80,18 @@ method support still works as expected: open my $fh, ">", $file; $fh->autoflush(1); # IO::File not loaded +=head2 Full functionality for C<use feature 'unicode_strings'> + +This release provides full functionality for C<use feature +'unicode_strings'>. Under its scope, all string operations executed and +regular expressions compiled (even if executed outside its scope) have +Unicode semantics. See L<feature>. + +This feature avoids the "Unicode Bug" (See +L<perlunicode/The "Unicode Bug"> for details.) If their is a +possibility that your code will process Unicode strings, you are +B<strongly> encouraged to use this subpragma to avoid nasty surprises. + =head1 Security XXX Any security-related notices go here. In particular, any security @@ -492,12 +503,6 @@ L<[perl #79178]|http://rt.perl.org/rt3/Public/Bug/Display.html?id=79178>. =item * -A number of bugs with regular expression bracketed character classes -have been fixed, mostly having to do with matching characters in the -non-ASCII Latin-1 range. - -=item * - A closure containing an C<if> statement followed by a constant or variable is no longer treated as a constant L<[perl #63540]|http://rt.perl.org/rt3/Public/Bug/Display.html?id=63540>. @@ -514,6 +519,20 @@ A regular expression optimisation would sometimes cause a match with a C<{n,m}> quantifier to fail when it should match L<[perl #79152]|http://rt.perl.org/rt3/Public/Bug/Display.html?id=79152>. +=item * + +What has become known as the "Unicode Bug" is resolved in this release. +Under C<use feature 'unicode_strings'>, the internal storage format of a +string no longer affects the external semantics. There are two known +exceptions. User-defined case changing functions, which are planned to +be deprecated in 5.14, require utf8-encoded strings to function; and the +character C<LATIN SMALL LETTER SHARP S> in regular expression +case-insensitive matching has a somewhat different set of bugs depending +on the internal storage format. Case-insensitive matching of all +characters that have multi-character matches, as this one does, is +problematical in Perl. +L<[perl #58182]|http://rt.perl.org/rt3/Public/Bug/Display.html?id=58182>. + =back =head1 Known Problems diff --git a/pod/perlre.pod b/pod/perlre.pod index acc1ad5..b74618f 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -646,31 +646,37 @@ locale, and can differ from one match to another if there is an intervening call of the L<setlocale() function|perllocale/The setlocale function>. This modifier is automatically set if the regular expression is compiled -within the scope of a C<"use locale"> pragma. +within the scope of a C<"use locale"> pragma. Results are not +well-defined when using this and matching against a utf8-encoded string. C<"u"> means to use Unicode semantics when pattern matching. It is -automatically set if the regular expression is compiled within the scope -of a L<C<"use feature 'unicode_strings">|feature> pragma (and isn't -also in the scope of L<C<"use locale">|locale> nor -L<C<"use bytes">|bytes> pragmas. It is not fully implemented at the -time of this writing, but work is being done to complete the job. On -EBCDIC platforms this currently has no effect, but on ASCII platforms, -it effectively turns them into Latin-1 platforms. That is, the ASCII -characters remain as ASCII characters (since ASCII is a subset of -Latin-1), but the non-ASCII code points are treated as Latin-1 -characters. Right now, this only applies to the C<"\b">, C<"\s">, and -C<"\w"> pattern matching operators, plus their complements. For -example, when this option is not on, C<"\w"> matches precisely -C<[A-Za-z0-9_]> (on a non-utf8 string). When the option is on, it -matches not just those, but all the Latin-1 word characters (such as an -"n" with a tilde). It thus matches exactly the same set of code points -from 0 to 255 as it would if the string were encoded in utf8. +automatically set if the regular expression is encoded in utf8, or is +compiled within the scope of a +L<C<"use feature 'unicode_strings">|feature> pragma (and isn't also in +the scope of L<C<"use locale">|locale> nor L<C<"use bytes">|bytes> +pragmas. On ASCII platforms, the code points between 128 and 255 take on their +Latin-1 (ISO-8859-1) meanings (which are the same as Unicode's), whereas +in strict ASCII their meanings are undefined. Thus the platform +effectively becomes a Unicode platform. The ASCII characters remain as +ASCII characters (since ASCII is a subset of Latin-1 and Unicode). For +example, when this option is not on, on a non-utf8 string, C<"\w"> +matches precisely C<[A-Za-z0-9_]>. When the option is on, it matches +not just those, but all the Latin-1 word characters (such as an "n" with +a tilde). On EBCDIC platforms, which already are equivalent to Latin-1, +this modifier changes behavior only when the C<"/i"> modifier is also +specified, and affects only two characters, giving them full Unicode +semantics: the C<MICRO SIGN> will match the Greek capital and +small letters C<MU>; otherwise not; and the C<LATIN CAPITAL LETTER SHARP +S> will match any of C<SS>, C<Ss>, C<sS>, and C<ss>, otherwise not. +(This last case is buggy, however.) C<"d"> means to use the traditional Perl pattern matching behavior. This is dualistic (hence the name C<"d">, which also could stand for -"default"). When this is in effect, Perl matches utf8-encoded strings +"depends"). When this is in effect, Perl matches utf8-encoded strings using Unicode rules, and matches non-utf8-encoded strings using the -platform's native character set rules. +platform's native character set rules. (If the regular expression +itself is encoded in utf8, Unicode rules are used regardless of the +target string's encoding.) See L<perlunicode/The "Unicode Bug">. It is automatically selected by default if the regular expression is compiled neither within the scope of a C<"use locale"> pragma nor a <C<"use feature 'unicode_strings"> @@ -680,7 +686,7 @@ Note that the C<d>, C<l>, C<p>, and C<u> modifiers are special in that they can only be enabled, not disabled, and the C<d>, C<l>, and C<u> modifiers are mutually exclusive: specifying one de-specifies the others, and a maximum of one may appear in the construct. Thus, for -example, C<(?-p)>, C<(?-d:...)>, and C<(?-dl:...)> will warn when +example, C<(?-p)>, C<(?-d:...)>, and C<(?dl:...)> will warn when compiled under C<use warnings>. Note also that the C<p> modifier is special in that its presence diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index b950f7b..242238f 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -23,7 +23,7 @@ Read L<Unicode Security Considerations|http://www.unicode.org/reports/tr36>. Perl knows when a filehandle uses Perl's internal Unicode encodings (UTF-8, or UTF-EBCDIC if in EBCDIC) if the filehandle is opened with -the ":utf8" layer. Other encodings can be converted to Perl's +the ":encoding(utf8)" layer. Other encodings can be converted to Perl's encoding on input or from Perl's encoding on output by use of the ":encoding(...)" layer. See L<open>. @@ -101,9 +101,9 @@ or from literals and constants in the source text. The C<bytes> pragma will always, regardless of platform, force byte semantics in a particular lexical scope. See L<bytes>. -The C<use feature 'unicode_strings'> pragma is intended to always, regardless -of platform, force character (Unicode) semantics in a particular lexical scope. -In release 5.12, it is partially implemented, applying only to case changes. +The C<use feature 'unicode_strings'> pragma is intended always, +regardless of platform, to force character (Unicode) semantics in a +particular lexical scope. See L</The "Unicode Bug"> below. The C<utf8> pragma is primarily a compatibility device that enables @@ -1450,7 +1450,8 @@ The term, the "Unicode bug" has been applied to an inconsistency with the Unicode characters whose ordinals are in the Latin-1 Supplement block, that is, between 128 and 255. Without a locale specified, unlike all other characters or code points, these characters have very different semantics in -byte semantics versus character semantics. +byte semantics versus character semantics, unless +C<use feature 'unicode_strings'> is specified. In character semantics they are interpreted as Unicode code points, which means they have the same semantics as Latin-1 (ISO-8859-1). @@ -1514,45 +1515,21 @@ ASCII range (except in a locale), along with Perl's desire to add Unicode support seamlessly. The result wasn't seamless: these characters were orphaned. -Work is being done to correct this, but only some of it is complete. -What has been finished is: - -=over - -=item * - -the matching of C<\b>, C<\s>, C<\w> and the Posix -character classes and their complements in regular expressions - -=item * - -case changing (but not user-defined casing) - -=item * - -case-insensitive (C</i>) regular expression matching for [bracketed -character classes] only, except for some bugs with C<LATIN SMALL -LETTER SHARP S> (which is supposed to match the two character sequence -"ss" (or "Ss" or "sS" or "SS"), but Perl has a number of bugs for all -such multi-character case insensitive characters, of which this is just -one example. - -=back - -Due to concerns, and some evidence, that older code might -have come to rely on the existing behavior, the new behavior must be explicitly -enabled by the feature C<unicode_strings> in the L<feature> pragma, even though -no new syntax is involved. - -See L<perlfunc/lc> for details on how this pragma works in combination with -various others for casing. - -Even though the implementation is incomplete, it is planned to have this -pragma affect all the problematic behaviors in later releases: you can't -have one without them all. - -In the meantime, a workaround is to always call utf8::upgrade($string), or to -use the standard module L<Encode>. Also, a scalar that has any characters +Starting in Perl 5.14, C<use feature 'unicode_strings'> can be used to +cause Perl to use Unicode semantics on all string operations within the +scope of the feature subpragma. Regular expressions compiled in its +scope retain that behavior even when executed or compiled into larger +regular expressions outside the scope. (The pragma does not, however, +affect user-defined case changing operations. These still require a +UTF-8 encoded string to operate.) + +In Perl 5.12, the subpragma affected casing changes, but not regular +expressions. See L<perlfunc/lc> for details on how this pragma works in +combination with various others for casing. + +For earlier Perls, or when a string is passed to a function outside the +subpragma's scope, a workaround is to always call C<utf8::upgrade($string)>, +or to use the standard module L<Encode>. Also, a scalar that has any characters whose ordinal is above 0x100, or which were specified using either of the C<\N{...}> notations will automatically have character semantics. diff --git a/pod/perlunifaq.pod b/pod/perlunifaq.pod index 877e4d1..9fd2b38 100644 --- a/pod/perlunifaq.pod +++ b/pod/perlunifaq.pod @@ -138,27 +138,27 @@ concern, and you can just C<eval> dumped data as always. =head2 Why do some characters not uppercase or lowercase correctly? -It seemed like a good idea at the time, to keep the semantics the same for -standard strings, when Perl got Unicode support. The plan is to fix this -in the future, and the casing component has in fact mostly been fixed, but we -have to deal with the fact that Perl treats equal strings differently, -depending on the internal state. - -First the casing. Just put a C<use feature 'unicode_strings'> near the -beginning of your program. Within its lexical scope, C<uc>, C<lc>, C<ucfirst>, -C<lcfirst>, and the regular expression escapes C<\U>, C<\L>, C<\u>, C<\l> use -Unicode semantics for changing case regardless of whether the UTF8 flag is on -or not. However, if you pass strings to subroutines in modules outside the -pragma's scope, they currently likely won't behave this way, and you have to -try one of the solutions below. There is another exception as well: if you -have furnished your own casing functions to override the default, these will -not be called unless the UTF8 flag is on) - -This remains a problem for the regular expression constructs -C</.../i>, C<(?i:...)>, and C</[[:posix:]]/>. - -To force Unicode semantics, you can upgrade the internal representation to -by doing C<utf8::upgrade($string)>. This can be used +Starting in Perl 5.14 (and partially in Perl 5.12), just put a +C<use feature 'unicode_strings'> near the beginning of your program. +Within its lexical scope you shouldn't have this problem. It also is +automatically enabled under C<use feature ':5.12'> or using C<-E> on the +command line for Perl 5.12 or higher. + +The rationale for requiring this is to not break older programs that +rely on the way things worked before Unicode came along. Those older +programs knew only about the ASCII character set, and so may not work +properly for additional characters. When a string is encoded in UTF-8, +Perl assumes that the program is prepared to deal with Unicode, but when +the string isn't, Perl assumes that only ASCII (unless it is an EBCDIC +platform) is wanted, and so those characters that are not ASCII +characters aren't recognized as to what they would be in Unicode. +C<use feature 'unicode_strings'> tells Perl to treat all characters as +Unicode, whether the string is encoded in UTF-8 or not, thus avoiding +the problem. + +However, on earlier Perls, or if you pass strings to subroutines outside +the feature's scope, you can force Unicode semantics by changing the +encoding to UTF-8 by doing C<utf8::upgrade($string)>. This can be used safely on any string, as it checks and does not change strings that have already been upgraded. diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod index f0b2be5..6a8c07d 100644 --- a/pod/perluniintro.pod +++ b/pod/perluniintro.pod @@ -83,7 +83,7 @@ Because of backward compatibility with legacy encodings, the "a unique number for every character" idea breaks down a bit: instead, there is "at least one number for every character". The same character could be represented differently in several legacy encodings. The -converse is also not true: some code points do not have an assigned +converse is not also true: some code points do not have an assigned character. Firstly, there are unallocated code points within otherwise used blocks. Secondly, there are special Unicode control characters that do not represent true characters. diff --git a/regcomp.c b/regcomp.c index 4b69bf7..2df0a6e 100644 --- a/regcomp.c +++ b/regcomp.c @@ -1156,7 +1156,7 @@ the silent ignoring of duplicate alternations which are of the form: / (DUPE|DUPE) X? (?{ ... }) Y /x -Thus EVAL blocks follwing a trie may be called a different number of times with +Thus EVAL blocks following a trie may be called a different number of times with and without the optimisation. With the optimisations dupes will be silently ignored. This inconsistant behaviour of EVAL type nodes is well established as the following demonstrates: @@ -1358,13 +1358,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs regnode *convert = NULL; U32 *prev_states; /* temp array mapping each state to previous one */ /* we just use folder as a flag in utf8 */ - const U8 * const folder = ( flags == EXACTF - ? PL_fold - : ( flags == EXACTFL - ? PL_fold_locale - : NULL - ) - ); + const U8 * folder = NULL; #ifdef DEBUGGING const U32 data_slot = add_data( pRExC_state, 4, "tuuu" ); @@ -1384,6 +1378,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs PERL_UNUSED_ARG(depth); #endif + switch (flags) { + case EXACTFU: folder = PL_fold_latin1; break; + case EXACTF: folder = PL_fold; break; + case EXACTFL: folder = PL_fold_locale; break; + } + trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) ); trie->refcount = 1; trie->startstate = 1; @@ -3073,11 +3073,18 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* Check whether it is compatible with what we know already! */ int compat = 1; + + /* If compatibile, we or it in below. It is compatible if is + * in the bitmp and either 1) its bit or its fold is set, or 2) + * it's for a locale. Even if there isn't unicode semantics + * here, at runtime there may be because of matching against a + * utf8 string, so accept a possible false positive for + * latin1-range folds */ if (uc >= 0x100 || (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) && !ANYOF_BITMAP_TEST(data->start_class, uc) && (!(data->start_class->flags & ANYOF_FOLD) - || !ANYOF_BITMAP_TEST(data->start_class, (UNI_SEMANTICS) ? PL_fold_latin1[uc] : PL_fold[uc]))) + || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc]))) ) compat = 0; ANYOF_CLASS_ZERO(data->start_class); @@ -3119,12 +3126,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (flags & SCF_DO_STCLASS_AND) { /* Check whether it is compatible with what we know already! */ int compat = 1; - if (uc >= 0x100 || - (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) - && !ANYOF_BITMAP_TEST(data->start_class, uc) - && !ANYOF_BITMAP_TEST(data->start_class, (UNI_SEMANTICS) ? PL_fold_latin1[uc] : PL_fold[uc]))) + (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) + && !ANYOF_BITMAP_TEST(data->start_class, uc) + && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc]))) + { compat = 0; + } ANYOF_CLASS_ZERO(data->start_class); ANYOF_BITMAP_ZERO(data->start_class); if (compat) { @@ -3136,13 +3144,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } else { - /* Also set the other member of the fold pair. Can't - * do this for locale, because not known until runtime - */ - ANYOF_BITMAP_SET(data->start_class, - (OP(scan) == EXACTFU) - ? PL_fold_latin1[uc] - : PL_fold[uc]); + /* Also set the other member of the fold pair. In case + * that unicode semantics is called for at runtime, use + * the full latin1 fold. (Can't do this for locale, + * because not known until runtime */ + ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]); } } } @@ -3158,9 +3164,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, * can't do that in locale because not known until * run-time */ ANYOF_BITMAP_SET(data->start_class, - (OP(scan) == EXACTFU) - ? PL_fold_latin1[uc] - : PL_fold[uc]); + PL_fold_latin1[uc]); } } data->start_class->flags &= ~ANYOF_EOS; @@ -4461,6 +4465,12 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags) restudied = 0; #endif + /* Set to use unicode semantics if the pattern is in utf8 and has the + * 'dual' charset specified, as it means unicode when utf8 */ + if (RExC_utf8 && ! (pm_flags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE))) { + pm_flags |= RXf_PMf_UNICODE; + } + RExC_precomp = exp; RExC_flags = pm_flags; RExC_sawback = 0; @@ -5839,9 +5849,15 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SvREFCNT_inc_simple_void(sv_dat); } RExC_sawback = 1; - ret = reganode(pRExC_state, - (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF), - num); + ret = reganode(pRExC_state, + ((! FOLD) + ? NREF + : (UNI_SEMANTICS) + ? NREFFU + : (LOC) + ? NREFFL + : NREFF), + num); *flagp |= HASWIDTH; Set_Node_Offset(ret, parse_start+1); @@ -6268,7 +6284,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) that follow */ has_use_defaults = TRUE; STD_PMMOD_FLAGS_CLEAR(&RExC_flags); - RExC_flags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE); + if (RExC_utf8) { /* But the default for a utf8 pattern is + unicode semantics */ + RExC_flags |= RXf_PMf_UNICODE; + } goto parse_flags; default: --RExC_parse; @@ -6307,7 +6326,17 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) { goto fail_modifiers; } - negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE); + + /* The dual charset means unicode semantics if the + * pattern (or target, not known until runtime) are + * utf8 */ + if (RExC_utf8) { + posflags |= RXf_PMf_UNICODE; + negflags |= RXf_PMf_LOCALE; + } + else { + negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE); + } has_charset_modifier = 1; break; case ONCE_PAT_MOD: /* 'o' */ @@ -7508,8 +7537,14 @@ tryagain: RExC_sawback = 1; ret = reganode(pRExC_state, - (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF), - num); + ((! FOLD) + ? NREF + : (UNI_SEMANTICS) + ? NREFFU + : (LOC) + ? NREFFL + : NREFF), + num); *flagp |= HASWIDTH; /* override incorrect value set in reganode MJD */ @@ -7570,8 +7605,14 @@ tryagain: } RExC_sawback = 1; ret = reganode(pRExC_state, - (U8)(FOLD ? (LOC ? REFFL : REFF) : REF), - num); + ((! FOLD) + ? REF + : (UNI_SEMANTICS) + ? REFFU + : (LOC) + ? REFFL + : REFF), + num); *flagp |= HASWIDTH; /* override incorrect value set in reganode MJD */ @@ -9571,7 +9612,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) { Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ if ( RXp_PAREN_NAMES(prog) ) { - if ( k != REF || OP(o) < NREF) { + if ( k != REF || (OP(o) != NREF && OP(o) != NREFF && OP(o) != NREFFL && OP(o) != NREFFU)) { AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); SV **name= av_fetch(list, ARG(o), 0 ); if (name) diff --git a/regcomp.sym b/regcomp.sym index a85d33f..4e787a7 100644 --- a/regcomp.sym +++ b/regcomp.sym @@ -145,7 +145,7 @@ RENUM BRANCHJ, off 1 . 1 ; Group with independently numbered parens. # inline charclass data (ascii only), the 'C' store it in the structure. # NOTE: the relative order of the TRIE-like regops is signifigant -TRIE TRIE, trie 1 ; Match many EXACT(FL?)? at once. flags==type +TRIE TRIE, trie 1 ; Match many EXACT(F[LU]?)? at once. flags==type TRIEC TRIE,trie charclass ; Same as TRIE, but with embedded charclass data # For start classes, contains an added fail table. @@ -194,6 +194,13 @@ NHORIZWS NHORIZWS, none 0 S ; not horizontal whitespace (Perl 6) FOLDCHAR FOLDCHAR, codepoint 1 ; codepoint with tricky case folding properties. EXACTFU EXACT, str ; Match this string, folded, Unicode semantics for non-utf8 (prec. by length). +# These could have been implemented using the FLAGS field of the regnode, but +# by having a separate node type, we can use the existing switch statement to +# avoid some tests +REFFU REF, num 1 V ; Match already matched string, folded using unicode semantics for non-utf8 +NREFFU REF, num 1 V ; Match already matched string, folded using unicode semantics for non-utf8 + + # NEW STUFF ABOVE THIS LINE ################################################################################ diff --git a/regexec.c b/regexec.c index ff76c84..ffa2da4 100644 --- a/regexec.c +++ b/regexec.c @@ -3927,31 +3927,69 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) break; case NREFFL: - { + { /* The capture buffer cases. The ones beginning with N for the + named buffers just convert to the equivalent numbered and + pretend they were called as the corresponding numbered buffer + op. */ char *s; char type; + I32 (*folder)() = NULL; /* NULL assumes will be NREF, REF: no + folding */ + const U8 * fold_array = NULL; + PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case NREF: + folder = foldEQ_locale; + fold_array = PL_fold_locale; + type = REFFL; + goto do_nref; + + case NREFFU: + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + type = REFFU; + goto do_nref; + case NREFF: - type = OP(scan); + folder = foldEQ; + fold_array = PL_fold; + type = REFF; + goto do_nref; + + case NREF: + type = REF; + do_nref: + + /* For the named back references, find the corresponding buffer + * number */ n = reg_check_named_buff_matched(rex,scan); - if ( n ) { - type = REF + ( type - NREF ); - goto do_ref; - } else { + if ( ! n ) { sayNO; - } - /* unreached */ + } + goto do_nref_ref_common; + case REFFL: PL_reg_flags |= RF_tainted; + folder = foldEQ_locale; + fold_array = PL_fold_locale; + goto do_ref; + + case REFFU: + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + goto do_ref; + + case REFF: + folder = foldEQ; + fold_array = PL_fold; /* FALL THROUGH */ + case REF: - case REFF: - n = ARG(scan); /* which paren pair */ + do_ref: type = OP(scan); - do_ref: + n = ARG(scan); /* which paren pair */ + + do_nref_ref_common: ln = PL_regoffs[n].start; PL_reg_leftiter = PL_reg_maxiter; /* Void cache */ if (*PL_reglastparen < n || ln == -1) @@ -3960,49 +3998,40 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) break; s = PL_bostr + ln; - if (utf8_target && type != REF) { /* REF can do byte comparison */ - char *l = locinput; - const char *e = PL_bostr + PL_regoffs[n].end; - /* - * Note that we can't do the "other character" lookup trick as - * in the 8-bit case (no pun intended) because in Unicode we - * have to map both upper and title case to lower case. - */ - if (type == REFF) { - while (s < e) { - STRLEN ulen1, ulen2; - U8 tmpbuf1[UTF8_MAXBYTES_CASE+1]; - U8 tmpbuf2[UTF8_MAXBYTES_CASE+1]; - - if (l >= PL_regeol) - sayNO; - toLOWER_utf8((U8*)s, tmpbuf1, &ulen1); - toLOWER_utf8((U8*)l, tmpbuf2, &ulen2); - if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1)) - sayNO; - s += ulen1; - l += ulen2; - } + if (type != REF /* REF can do byte comparison */ + && (utf8_target + || (type == REFFU + && (*s == (char) LATIN_SMALL_LETTER_SHARP_S + || *locinput == (char) LATIN_SMALL_LETTER_SHARP_S)))) + { /* XXX handle REFFL better */ + char * limit = PL_regeol; + + /* This call case insensitively compares the entire buffer + * at s, with the current input starting at locinput, but + * not going off the end given by PL_regeol, and returns in + * limit upon success, how much of the current input was + * matched */ + if (! foldEQ_utf8(s, NULL, PL_regoffs[n].end - ln, utf8_target, + locinput, &limit, 0, utf8_target)) + { + sayNO; } - locinput = l; + locinput = limit; nextchr = UCHARAT(locinput); break; } - /* Inline the first character, for speed. */ + /* Not utf8: Inline the first character, for speed. */ if (UCHARAT(s) != nextchr && (type == REF || - (UCHARAT(s) != (type == REFF - ? PL_fold : PL_fold_locale)[nextchr]))) + UCHARAT(s) != fold_array[nextchr])) sayNO; ln = PL_regoffs[n].end - ln; if (locinput + ln > PL_regeol) sayNO; if (ln > 1 && (type == REF ? memNE(s, locinput, ln) - : (type == REFF - ? ! foldEQ(s, locinput, ln) - : ! foldEQ_locale(s, locinput, ln)))) + : ! folder(s, locinput, ln))) sayNO; locinput += ln; nextchr = UCHARAT(locinput); diff --git a/regnodes.h b/regnodes.h index 97ac607..09ab661 100644 --- a/regnodes.h +++ b/regnodes.h @@ -6,8 +6,8 @@ /* Regops and State definitions */ -#define REGNODE_MAX 91 -#define REGMATCH_STATE_MAX 131 +#define REGNODE_MAX 93 +#define REGMATCH_STATE_MAX 133 #define END 0 /* 0000 End of program. */ #define SUCCEED 1 /* 0x01 Return from a subroutine, basically. */ @@ -70,7 +70,7 @@ #define MINMOD 58 /* 0x3a Next operator is not greedy. */ #define LOGICAL 59 /* 0x3b Next opcode should set the flag only. */ #define RENUM 60 /* 0x3c Group with independently numbered parens. */ -#define TRIE 61 /* 0x3d Match many EXACT(FL?)? at once. flags==type */ +#define TRIE 61 /* 0x3d Match many EXACT(F[LU]?)? at once. flags==type */ #define TRIEC 62 /* 0x3e Same as TRIE, but with embedded charclass data */ #define AHOCORASICK 63 /* 0x3f Aho Corasick stclass. flags==type */ #define AHOCORASICKC 64 /* 0x40 Same as AHOCORASICK, but with embedded charclass data */ @@ -99,8 +99,10 @@ #define NHORIZWS 87 /* 0x57 not horizontal whitespace (Perl 6) */ #define FOLDCHAR 88 /* 0x58 codepoint with tricky case folding properties. */ #define EXACTFU 89 /* 0x59 Match this string, folded, Unicode semantics for non-utf8 (prec. by length). */ -#define OPTIMIZED 90 /* 0x5a Placeholder for dump. */ -#define PSEUDO 91 /* 0x5b Pseudo opcode for internal use. */ +#define REFFU 90 /* 0x5a Match already matched string, folded using unicode semantics for non-utf8 */ +#define NREFFU 91 /* 0x5b Match already matched string, folded using unicode semantics for non-utf8 */ +#define OPTIMIZED 92 /* 0x5c Placeholder for dump. */ +#define PSEUDO 93 /* 0x5d Pseudo opcode for internal use. */ /* ------------ States ------------- */ #define TRIE_next (REGNODE_MAX + 1) /* state for TRIE */ #define TRIE_next_fail (REGNODE_MAX + 2) /* state for TRIE */ @@ -239,6 +241,8 @@ EXTCONST U8 PL_regkind[] = { NHORIZWS, /* NHORIZWS */ FOLDCHAR, /* FOLDCHAR */ EXACT, /* EXACTFU */ + REF, /* REFFU */ + REF, /* NREFFU */ NOTHING, /* OPTIMIZED */ PSEUDO, /* PSEUDO */ /* ------------ States ------------- */ @@ -379,6 +383,8 @@ static const U8 regarglen[] = { 0, /* NHORIZWS */ EXTRA_SIZE(struct regnode_1), /* FOLDCHAR */ 0, /* EXACTFU */ + EXTRA_SIZE(struct regnode_1), /* REFFU */ + EXTRA_SIZE(struct regnode_1), /* NREFFU */ 0, /* OPTIMIZED */ 0, /* PSEUDO */ }; @@ -476,6 +482,8 @@ static const char reg_off_by_arg[] = { 0, /* NHORIZWS */ 0, /* FOLDCHAR */ 0, /* EXACTFU */ + 0, /* REFFU */ + 0, /* NREFFU */ 0, /* OPTIMIZED */ 0, /* PSEUDO */ }; @@ -578,8 +586,10 @@ EXTCONST char * const PL_reg_name[] = { "NHORIZWS", /* 0x57 */ "FOLDCHAR", /* 0x58 */ "EXACTFU", /* 0x59 */ - "OPTIMIZED", /* 0x5a */ - "PSEUDO", /* 0x5b */ + "REFFU", /* 0x5a */ + "NREFFU", /* 0x5b */ + "OPTIMIZED", /* 0x5c */ + "PSEUDO", /* 0x5d */ /* ------------ States ------------- */ "TRIE_next", /* REGNODE_MAX +0x01 */ "TRIE_next_fail", /* REGNODE_MAX +0x02 */ @@ -674,7 +684,8 @@ EXTCONST U8 PL_varies[] __attribute__deprecated__; #else EXTCONST U8 PL_varies[] __attribute__deprecated__ = { CLUMP, BRANCH, BACK, STAR, PLUS, CURLY, CURLYN, CURLYM, CURLYX, WHILEM, - REF, REFF, REFFL, SUSPEND, IFTHEN, BRANCHJ, NREF, NREFF, NREFFL, + REF, REFF, REFFL, SUSPEND, IFTHEN, BRANCHJ, NREF, NREFF, NREFFL, REFFU, + NREFFU, 0 }; #endif /* DOINIT */ @@ -683,7 +694,7 @@ EXTCONST U8 PL_varies[] __attribute__deprecated__ = { EXTCONST U8 PL_varies_bitmask[]; #else EXTCONST U8 PL_varies_bitmask[] = { - 0x00, 0x00, 0x00, 0xC0, 0xC1, 0x9F, 0x33, 0x01, 0x38, 0x00, 0x00, 0x00 + 0x00, 0x00, 0x00, 0xC0, 0xC1, 0x9F, 0x33, 0x01, 0x38, 0x00, 0x00, 0x0C }; #endif /* DOINIT */ diff --git a/t/re/fold_grind.t b/t/re/fold_grind.t index 13fdd3c..55241e3 100644 --- a/t/re/fold_grind.t +++ b/t/re/fold_grind.t @@ -1,5 +1,4 @@ -# Grind out a lot of combinatoric tests for folding. Still missing are -# testing backreferences and tries. +# Grind out a lot of combinatoric tests for folding. use charnames ":full"; @@ -13,6 +12,7 @@ BEGIN { use strict; use warnings; +use Encode; # Tests both unicode and not, so make sure not implicitly testing unicode no feature 'unicode_strings'; @@ -238,7 +238,8 @@ foreach my $test (sort { numerically } keys %tests) { #diag $progress; # Now grind out tests, using various combinations. - foreach my $uni_semantics ("", 'u') { # Both non- and uni semantics + # XXX foreach my $charset ('d', 'u', 'l') { + foreach my $charset ('d', 'u') { foreach my $utf8_target (0, 1) { # Both utf8 and not, for # code points < 256 my $upgrade_target = ""; @@ -247,16 +248,37 @@ foreach my $test (sort { numerically } keys %tests) { # something above latin1. So impossible to test if to not to be in # utf8; and otherwise, no upgrade is needed. next if $target_above_latin1 && ! $utf8_target; - $upgrade_target = '; utf8::upgrade($c)' if ! $target_above_latin1 && $utf8_target; + $upgrade_target = ' utf8::upgrade($c);' if ! $target_above_latin1 && $utf8_target; - foreach my $uni_pattern (0, 1) { - next if $pattern_above_latin1 && ! $uni_pattern; + foreach my $utf8_pattern (0, 1) { + next if $pattern_above_latin1 && ! $utf8_pattern; + my $uni_semantics = $utf8_target || $charset eq 'u' || ($charset eq 'd' && $utf8_pattern); my $upgrade_pattern = ""; - $upgrade_pattern = '; use re "/u"' if ! $pattern_above_latin1 && $uni_pattern; + $upgrade_pattern = ' utf8::upgrade($p);' if ! $pattern_above_latin1 && $utf8_pattern; my $lhs = join "", @x_target; my @rhs = @x_pattern; - #print "$lhs: ", "/@rhs/\n"; + my $rhs = join "", @rhs; + my $should_fail = ! $uni_semantics && $ord >= 128 && $ord < 256 && ! $is_self; + + # Do simple tests of referencing capture buffers, named and + # numbered. + my $op = '=~'; + $op = '!~' if $should_fail; + my $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; + push @eval_tests, qq[ok(eval '$eval', '$eval')]; + $eval = "my \$c = \"$lhs$rhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; + push @eval_tests, qq[ok(eval '$eval', '$eval')]; + $count += 2; + if ($lhs ne $rhs) { + $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^($rhs)\\1\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; + push @eval_tests, qq[ok(eval '$eval', '$eval')]; + $eval = "my \$c = \"$rhs$lhs\"; my \$p = qr/(?$charset:^(?<grind>$rhs)\\k<grind>\$)/i;$upgrade_target$upgrade_pattern \$c $op \$p"; + push @eval_tests, qq[ok(eval '$eval', '$eval')]; + $count += 2; + } + #diag $eval_tests[-1]; + #next; foreach my $bracketed (0, 1) { # Put rhs in [...], or not foreach my $inverted (0,1) { @@ -314,9 +336,9 @@ foreach my $test (sort { numerically } keys %tests) { # something on one or both sides that force it to. my $must_match = ! $can_match_null || ($l_anchor && $r_anchor) || ($l_anchor && $append) || ($r_anchor && $prepend) || ($prepend && $append); #next unless $must_match; - my $quantified = "(?$uni_semantics:$l_anchor$prepend$interior${quantifier}$append$r_anchor)"; + my $quantified = "(?$charset:$l_anchor$prepend$interior${quantifier}$append$r_anchor)"; my $op; - if ($must_match && ! $utf8_target && ! $uni_pattern && ! $uni_semantics && $ord >= 128 && $ord < 256 && ! $is_self) { + if ($must_match && $should_fail) { $op = 0; } else { $op = 1; @@ -324,8 +346,9 @@ foreach my $test (sort { numerically } keys %tests) { $op = ! $op if $must_match && $inverted; $op = ($op) ? '=~' : '!~'; - my $stuff .= " utf8_target=$utf8_target, uni_semantics=$uni_semantics, uni_pattern=$uni_pattern, bracketed=$bracketed, prepend=$prepend, append=$append, parend=$parend, quan ... [60 chars truncated] - my $eval = "my \$c = \"$prepend$lhs$append\"$upgrade_target; $upgrade_pattern; \$c $op /$quantified/i;"; + my $stuff .= " uni_semantics=$uni_semantics, should_fail=$should_fail, bracketed=$bracketed, prepend=$prepend, append=$append, parend=$parend, quantifier=$quantifier, l_anch ... [34 chars truncated] + $stuff .= "; pattern_above_latin1=$pattern_above_latin1; utf8_pattern=$utf8_pattern"; + my $eval = "my \$c = \"$prepend$lhs$append\"; my \$p = qr/$quantified/i;$upgrade_target$upgrade_pattern \$c $op \$p;"; # XXX Doesn't currently test multi-char folds next if @pattern != 1; diff --git a/t/re/reg_fold.t b/t/re/reg_fold.t index ce84960..59dbfc7 100644 --- a/t/re/reg_fold.t +++ b/t/re/reg_fold.t @@ -8,7 +8,6 @@ BEGIN { use strict; use warnings; -my $count=1; my @tests; my %todo_pass = map { $_ => 1 } @@ -68,7 +67,6 @@ while (<$fh>) { # There are a few of these that pass; most fail. $tests[-1]="TODO: { local \$::TODO='Some multi-char, f8 folded inside character class [ ] doesnt work';\n$tests[-1] }" } - $count++; } } } @@ -116,19 +114,18 @@ for my $i (0 .. 255) { my $hex_fold_ascii = sprintf "0x%02X", $fold_ascii[$i]; my $hex_fold_latin1 = sprintf "0x%02X", $fold_latin1[$i]; push @tests, qq[like chr($hex_fold_ascii), qr/(?d:$chr)/i, 'chr($hex_fold_ascii) =~ qr/(?d:$chr)/i']; - $count++; push @tests, qq[like chr($hex_fold_latin1), qr/(?u:$chr)/i, 'chr($hex_fold_latin1) =~ qr/(?u:$chr)/i']; - $count++; } push @tests, qq[like chr(0x0430), qr/[=\x{0410}-\x{0411}]/i, 'Bug #71752 Unicode /i char in a range']; -$count++; push @tests, qq[like 'a', qr/\\p{Upper}/i, "'a' =~ /\\\\p{Upper}/i"]; -$count++; push @tests, q[my $c = "\x{212A}"; my $p = qr/(?:^[\x{004B}_]+$)/i; utf8::upgrade($p); like $c, $p, 'Bug #78994: my $c = "\x{212A}"; my $p = qr/(?:^[\x{004B}_]+$)/i; utf8::upgrade($p); $c =~ $p']; -$count++; -eval join ";\n","plan tests=>".($count-1),@tests,"1" +use charnames ":full"; +push @tests, q[my $re1 = "\N{WHITE SMILING FACE}";like "\xE8", qr/[\w$re1]/, 'my $re = "\N{WHITE SMILING FACE}"; "\xE8" =~ qr/[\w$re]/']; +push @tests, q[my $re2 = "\N{WHITE SMILING FACE}";like "\xE8", qr/\w|$re2/, 'my $re = "\N{WHITE SMILING FACE}"; "\xE8" =~ qr/\w|$re/']; + +eval join ";\n","plan tests=>". (scalar @tests), @tests, "1" or die $@; __DATA__ -- Perl5 Master Repository
