In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/bfdf22ecfe7cf92546a7f989d23ce76679360abd?hp=ba207afd4f7e36d6017bca62c51c750ddc6beb7a>
- Log -----------------------------------------------------------------
commit bfdf22ecfe7cf92546a7f989d23ce76679360abd
Author: Karl Williamson <[email protected]>
Date:   Wed Jun 6 15:56:09 2012 -0600

    utf8.c: White-space only

M utf8.c

commit cbe07460d443564aea40e4397ab55080aab2d0b9
Author: Karl Williamson <[email protected]>
Date:   Wed Jun 6 15:50:53 2012 -0600

    utf8.c: Refactor a portion of to_utf8_case()

    This routine can never return 0, as if there is no case mapping, the
    input is used instead. The code point for that input has already been
    derived earlier in the function, so it doesn't have to be recalculated.
    And, rearrange the order of things slightly.

M utf8.c

commit dd9267d7ed4287d573abd56d1aadf2152c628baf
Author: Karl Williamson <[email protected]>
Date:   Wed Jun 6 15:40:38 2012 -0600

    utf8.c: Avoid some extra work

    In the case changed, the output is the input, so can just Copy it
    instead of re-deriving it.

M utf8.c

commit 2114036ce68685bad99e2d43873afa0ff512b0a6
Author: Karl Williamson <[email protected]>
Date:   Wed Jun 6 15:40:02 2012 -0600

    utf8.c: Add, revise comments

M utf8.c

commit 2269d15c887e7326906ea6195d5970ac188c3411
Author: Karl Williamson <[email protected]>
Date:   Wed Jun 6 11:12:24 2012 -0600

    docs patch: 'unicode_strings' doesn't change utf8ness

    We had a field report of lack of clarity around this, so add some text.

M lib/feature.pm M pod/perlunicode.pod M regen/feature.pl commit 3c8317961e30b9ee408493f68b38d723f6748319 Author: Karl Williamson <[email protected]> Date: Wed Jun 6 11:11:34 2012 -0600 perlfunc: Add comma M pod/perlfunc.pod ----------------------------------------------------------------------- Summary of changes: lib/feature.pm | 5 +++-- pod/perlfunc.pod | 2 +- pod/perlunicode.pod | 6 ++++-- regen/feature.pl | 5 +++-- utf8.c | 21 +++++++++++++++------ 5 files changed, 26 insertions(+), 13 deletions(-) diff --git a/lib/feature.pm b/lib/feature.pm index 37e571f..cc89bde 100644 --- a/lib/feature.pm +++ b/lib/feature.pm @@ -5,7 +5,7 @@ package feature; -our $VERSION = '1.28'; +our $VERSION = '1.29'; our %feature = ( fc => 'feature_fc', @@ -139,7 +139,8 @@ C<use feature 'unicode_strings'> tells the compiler to use Unicode semantics in all string operations executed within its scope (unless they are also within the scope of either C<use locale> or C<use bytes>). The same applies to all regular expressions compiled within the scope, even if executed outside -it. +it. It does not change the internal representation of strings, but only how +they are interpreted. C<no feature 'unicode_strings'> tells the compiler to use the traditional Perl semantics wherein the native character set semantics is used unless it is diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod index 3482f36..99cdeec 100644 --- a/pod/perlfunc.pod +++ b/pod/perlfunc.pod @@ -7252,7 +7252,7 @@ Perl versions, it should call C<srand>; otherwise most programs won't call C<srand()> at all. But there are a few situations in recent Perls where programs are likely to -want to call C<srand>. One is for generating predictable results generally for +want to call C<srand>. One is for generating predictable results, generally for testing or debugging. There, you use C<srand($seed)>, with the same C<$seed> each time. 
Another case is that you may want to call C<srand()> after a C<fork()> to avoid child processes sharing the same seed value as the diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod index 77daca3..e893571 100644 --- a/pod/perlunicode.pod +++ b/pod/perlunicode.pod @@ -28,8 +28,10 @@ C<use feature 'unicode_strings'> is specified. (This is automatically selected if you use C<use 5.012> or higher.) Failure to do this can trigger unexpected surprises. See L</The "Unicode Bug"> below. -This pragma doesn't affect I/O, and there are still several places -where Unicode isn't fully supported, such as in filenames. +This pragma doesn't affect I/O. Nor does it change the internal +representation of strings, only their interpretation. There are still +several places where Unicode isn't fully supported, such as in +filenames. =item Input and Output Layers diff --git a/regen/feature.pl b/regen/feature.pl index 15315c7..1a85d0a 100755 --- a/regen/feature.pl +++ b/regen/feature.pl @@ -338,7 +338,7 @@ read_only_bottom_close_and_rename($h); __END__ package feature; -our $VERSION = '1.28'; +our $VERSION = '1.29'; FEATURES @@ -437,7 +437,8 @@ C<use feature 'unicode_strings'> tells the compiler to use Unicode semantics in all string operations executed within its scope (unless they are also within the scope of either C<use locale> or C<use bytes>). The same applies to all regular expressions compiled within the scope, even if executed outside -it. +it. It does not change the internal representation of strings, but only how +they are interpreted. 
C<no feature 'unicode_strings'> tells the compiler to use the traditional Perl semantics wherein the native character set semantics is used unless it is diff --git a/utf8.c b/utf8.c index 018c85a..0a6f9ed 100644 --- a/utf8.c +++ b/utf8.c @@ -998,7 +998,7 @@ Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) } /* Like L</utf8_to_uvuni_buf>(), but should only be called when it is known that - * there are no malformations in the input UTF-8 string C<s>. surrogates, + * there are no malformations in the input UTF-8 string C<s>. Surrogates, * non-character code points, and non-Unicode code points are allowed */ UV @@ -2410,7 +2410,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, } if (!len && *swashp) { - const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE); + const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE /* => is utf8 */); if (uv2) { /* It was "normal" (a single character mapping). */ @@ -2419,14 +2419,23 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, } } - if (!len) /* Neither: just copy. In other words, there was no mapping - defined, which means that the code point maps to itself */ - len = uvchr_to_utf8(ustrp, uv0) - ustrp; + if (len) { + if (lenp) { + *lenp = len; + } + return valid_utf8_to_uvchr(ustrp, 0); + } + + /* Here, there was no mapping defined, which means that the code point maps + * to itself. Return the inputs */ + len = UTF8SKIP(p); + Copy(p, ustrp, len, U8); if (lenp) *lenp = len; - return len ? valid_utf8_to_uvchr(ustrp, 0) : 0; + return uv0; + } STATIC UV -- Perl5 Master Repository
