In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/f11f9c4c96c7b47e180befa9fb272343566a7167?hp=0ed2b00b2bd6a650391433ad8733983692df43d5>
- Log ----------------------------------------------------------------- commit f11f9c4c96c7b47e180befa9fb272343566a7167 Author: Karl Williamson <[email protected]> Date: Sat May 31 17:11:10 2014 -0600 perlebcdic: Clean-up There are much simpler ways to do some things than were given. This also makes some clarifications, and removes obsolete text, shortens some too long verbatim lines. M pod/perlebcdic.pod M t/porting/known_pod_issues.dat commit e30b2da5d01e80840b115621b2c798c01ff426a2 Author: Karl Williamson <[email protected]> Date: Sat May 31 17:09:53 2014 -0600 utfebcdic.h: Add comments M utfebcdic.h commit b985ae615210d15bfa67bddb2118de1c02c21935 Author: Karl Williamson <[email protected]> Date: Fri Nov 15 13:40:03 2013 -0700 PATCH: [perl #120386]: av_len documentation I think I have incorporated everybody's concerns in this patch. M av.c commit ca3d51ba62f0e2b46d3714c26711c8973a3724bb Author: Karl Williamson <[email protected]> Date: Mon Oct 14 22:58:30 2013 -0600 lib/utf8.pm: Document unicode_to_native() and inverse M lib/utf8.pm commit 308a4ae118d0c5ca43889a96b89dd0b5be487b5c Author: Karl Williamson <[email protected]> Date: Mon Oct 14 22:38:24 2013 -0600 lib/utf8.pm: wrap code examples in pod with C<> M lib/utf8.pm commit b6fb4de72917282df5b2b45712d7d3f9f1609943 Author: Karl Williamson <[email protected]> Date: Sat May 31 14:27:54 2014 -0600 lib/locale.t: Add tests, fix test names It is unfortunate that tests are in two places in this file. The reason is to avoid doing eval's (to avoid any perturbations that might cause) but be under the scope of two different locale forms. This adds text to remind maintainers that they should create copies when adding tests, and includes the new tests that have previously been added to one place but not the other, and includes the new improved test names. M lib/locale.t commit 40205cabcf46648032cafa4d149c2cbf7787179d Author: Karl Williamson <[email protected]> Date: Sat May 31 14:11:35 2014 -0600 lib/locale.t: Add some tests M lib/locale.t commit 2575405ef7be50228c2f0ecad4d802ee784f6be3 Author: Karl Williamson <[email protected]> Date: Sat May 31 14:10:00 2014 -0600 lib/locale.t: Improve test names; add some names M lib/locale.t commit 7c641603528c1bd9ae833c58a4f65b49ae8d4726 Author: Karl Williamson <[email protected]> Date: Sat May 31 12:45:41 2014 -0600 sv.c: Add comment M sv.c ----------------------------------------------------------------------- Summary of changes: av.c | 11 +- lib/locale.t | 525 +++++++++++++++++++++++++++-------------- lib/utf8.pm | 59 ++++- pod/perlebcdic.pod | 122 +++------- sv.c | 3 + t/porting/known_pod_issues.dat | 1 - utfebcdic.h | 2 + 7 files changed, 445 insertions(+), 278 deletions(-) diff --git a/av.c b/av.c index c08d2c2..0602525 100644 --- a/av.c +++ b/av.c @@ -760,11 +760,16 @@ The Perl equivalent for this is C<$#myarray>. (A slightly shorter form is C<av_tindex>.) +=for apidoc av_tindex + +Same as L</av_top_index>. + =for apidoc av_len -Same as L</av_top_index>. Returns the highest index in the array. Note that the -return value is +1 what its name implies it returns; and hence differs in -meaning from what the similarly named L</sv_len> returns. +Same as L</av_top_index>. Note that, unlike what the name implies, it returns +the highest index in the array, so to get the size of the array you need to use +S<C<av_len(av) + 1>>. This is unlike L</sv_len>, which returns what you would +expect. =cut */ diff --git a/lib/locale.t b/lib/locale.t index eea92e0..5e839bc 100644 --- a/lib/locale.t +++ b/lib/locale.t @@ -67,7 +67,7 @@ sub debugf { printf @_ if $debug; } -$a = 'abc %'; +$a = 'abc %9'; my $test_num = 0; @@ -94,13 +94,15 @@ sub is_tainted { # hello, camel two. sub check_taint ($;$) { my $message_tail = $_[1] // ""; - $message_tail = ": $message_tail" if $message_tail; + + # Extra blanks are so aligns with taint_not output + $message_tail = ": $message_tail" if $message_tail; ok is_tainted($_[0]), "verify that is tainted$message_tail"; } sub check_taint_not ($;$) { my $message_tail = $_[1] // ""; - $message_tail = ": $message_tail" if $message_tail; + $message_tail = ": $message_tail" if $message_tail; ok((not is_tainted($_[0])), "verify that isn't tainted$message_tail"); } @@ -110,33 +112,36 @@ check_taint_not $&, "not tainted outside 'use locale'"; use locale; # engage locale and therefore locale taint. -check_taint_not $a, "\t\$a"; - -check_taint uc($a); -check_taint "\U$a"; -check_taint ucfirst($a); -check_taint "\u$a"; -check_taint lc($a); -check_taint fc($a); -check_taint "\L$a"; -check_taint "\F$a"; -check_taint lcfirst($a); -check_taint "\l$a"; - -check_taint_not sprintf('%e', 123.456); -check_taint_not sprintf('%f', 123.456); -check_taint_not sprintf('%g', 123.456); -check_taint_not sprintf('%d', 123.456); -check_taint_not sprintf('%x', 123.456); +# BE SURE TO COPY ANYTHING YOU ADD to these tests to the block below for +# ":notcharacters" + +check_taint_not $a, '$a'; + +check_taint uc($a), 'uc($a)'; +check_taint "\U$a", '"\U$a"'; +check_taint ucfirst($a), 'ucfirst($a)'; +check_taint "\u$a", '"\u$a"'; +check_taint lc($a), 'lc($a)'; +check_taint fc($a), 'fc($a)'; +check_taint "\L$a", '"\L$a"'; +check_taint "\F$a", '"\F$a"'; +check_taint lcfirst($a), 'lcfirst($a)'; +check_taint "\l$a", '"\l$a"'; + +check_taint_not sprintf('%e', 123.456), "sprintf('%e', 123.456)"; +check_taint_not sprintf('%f', 123.456), "sprintf('%f', 123.456)"; +check_taint_not sprintf('%g', 123.456), "sprintf('%g', 123.456)"; +check_taint_not sprintf('%d', 123.456), "sprintf('%d', 123.456)"; +check_taint_not sprintf('%x', 123.456), "sprintf('%x', 123.456)"; $_ = $a; # untaint $_ $_ = uc($a); # taint $_ -check_taint $_, "\t\$_"; +check_taint $_, '$_ = uc($a)'; /(\w)/; # taint $&, $`, $', $+, $1. -check_taint $&, "\t/(\\w)/ \$&"; +check_taint $&, "\$& from /(\\w)/"; check_taint $`, "\t\$`"; check_taint $', "\t\$'"; check_taint $+, "\t\$+"; @@ -144,7 +149,7 @@ check_taint $1, "\t\$1"; check_taint_not $2, "\t\$2"; /(.)/; # untaint $&, $`, $', $+, $1. -check_taint_not $&, "\t/(.)/ \$&"; +check_taint_not $&, "\$& from /(.)/"; check_taint_not $`, "\t\$`"; check_taint_not $', "\t\$'"; check_taint_not $+, "\t\$+"; @@ -152,7 +157,7 @@ check_taint_not $1, "\t\$1"; check_taint_not $2, "\t\$2"; /(\W)/; # taint $&, $`, $', $+, $1. -check_taint $&, "\t/(\\W)/ \$&"; +check_taint $&, "\$& from /(\\W)/"; check_taint $`, "\t\$`"; check_taint $', "\t\$'"; check_taint $+, "\t\$+"; @@ -160,7 +165,7 @@ check_taint $1, "\t\$1"; check_taint_not $2, "\t\$2"; /(.)/; # untaint $&, $`, $', $+, $1. -check_taint_not $&, "\t/(.)/ \$&"; +check_taint_not $&, "\$& from /(.)/"; check_taint_not $`, "\t\$`"; check_taint_not $', "\t\$'"; check_taint_not $+, "\t\$+"; @@ -168,7 +173,7 @@ check_taint_not $1, "\t\$1"; check_taint_not $2, "\t\$2"; /(\s)/; # taint $&, $`, $', $+, $1. -check_taint $&, "\t/(\\s)/ \$&"; +check_taint $&, "\$& from /(\\s)/"; check_taint $`, "\t\$`"; check_taint $', "\t\$'"; check_taint $+, "\t\$+"; @@ -176,10 +181,43 @@ check_taint $1, "\t\$1"; check_taint_not $2, "\t\$2"; /(.)/; # untaint $&, $`, $', $+, $1. -check_taint_not $&, "\t/(.)/ \$&"; +check_taint_not $&, "\$& from /(.)/"; /(\S)/; # taint $&, $`, $', $+, $1. -check_taint $&, "\t/(\\S)/ \$&"; +check_taint $&, "\$& from /(\\S)/"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; + +/(.)/; # untaint $&, $`, $', $+, $1. +check_taint_not $&, "\$& from /(.)/"; + +"0" =~ /(\d)/; # taint $&, $`, $', $+, $1. +check_taint $&, "\$& from /(\\d)/"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; + +/(.)/; # untaint $&, $`, $', $+, $1. +check_taint_not $&, "\$& from /(.)/"; + +/(\D)/; # taint $&, $`, $', $+, $1. +check_taint $&, "\$& from /(\\D)/"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; + +/(.)/; # untaint $&, $`, $', $+, $1. +check_taint_not $&, "\$& from /(.)/"; + +/([[:alnum:]])/; # taint $&, $`, $', $+, $1. +check_taint $&, "\$& from /([[:alnum:]])/"; check_taint $`, "\t\$`"; check_taint $', "\t\$'"; check_taint $+, "\t\$+"; @@ -187,39 +225,55 @@ check_taint $1, "\t\$1"; check_taint_not $2, "\t\$2"; /(.)/; # untaint $&, $`, $', $+, $1. -check_taint_not $&, "\t/(.)/ \$&"; +check_taint_not $&, "\$& from /(.)/"; + +/([[:^alnum:]])/; # taint $&, $`, $', $+, $1. +check_taint $&, "\$& from /([[:^alnum:]])/"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; "a" =~ /(a)|(\w)/; # taint $&, $`, $', $+, $1. -check_taint $&, "\t/(a)|(\\w)/ \$&"; +check_taint $&, "\$& from /(a)|(\\w)/"; check_taint $`, "\t\$`"; check_taint $', "\t\$'"; check_taint $+, "\t\$+"; check_taint $1, "\t\$1"; -ok($1 eq 'a', ("\t" x 4) . "\$1 is 'a'"); -ok(! defined $2, ("\t" x 4) . "\$2 is undefined"); +ok($1 eq 'a', ("\t" x 5) . "\$1 is 'a'"); +ok(! defined $2, ("\t" x 5) . "\$2 is undefined"); check_taint_not $2, "\t\$2"; check_taint_not $3, "\t\$3"; /(.)/; # untaint $&, $`, $', $+, $1. -check_taint_not $&, "\t/(.)/ \$&"; +check_taint_not $&, "\$& from /(.)/"; "\N{CYRILLIC SMALL LETTER A}" =~ /(\N{CYRILLIC CAPITAL LETTER A})/i; # no tainting because no locale dependence -check_taint_not $&, "\t/(\\N{CYRILLIC CAPITAL LETTER A})/i \$&"; +check_taint_not $&, "\$& from /(\\N{CYRILLIC CAPITAL LETTER A})/i"; check_taint_not $`, "\t\$`"; check_taint_not $', "\t\$'"; check_taint_not $+, "\t\$+"; check_taint_not $1, "\t\$1"; -ok($1 eq "\N{CYRILLIC SMALL LETTER A}", ("\t" x 4) . "\$1 is 'small cyrillic a'"); +ok($1 eq "\N{CYRILLIC SMALL LETTER A}", ("\t" x 4) . "\t\$1 is 'small cyrillic a'"); check_taint_not $2, "\t\$2"; /(.)/; # untaint $&, $`, $', $+, $1. -check_taint_not $&, "\t/./ \$&"; +check_taint_not $&, "\$& from /./"; + +"(\N{KELVIN SIGN})" =~ /(\N{KELVIN SIGN})/i; # taints because depends on locale +check_taint $&, "\$& from /(\\N{KELVIN SIGN})/i"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; /(.)/; # untaint $&, $`, $', $+, $1. -check_taint_not $&, "\t/(.)/ \$&"; +check_taint_not $&, "\$& from /(.)/"; "a:" =~ /(.)\b(.)/; # taint $&, $`, $', $+, $1. -check_taint $&, "\t/(.)\\b(.)/ \$&"; +check_taint $&, "\$& from /(.)\\b(.)/"; check_taint $`, "\t\$`"; check_taint $', "\t\$'"; check_taint $+, "\t\$+"; @@ -228,10 +282,10 @@ check_taint $2, "\t\$2"; check_taint_not $3, "\t\$3"; /(.)/; # untaint $&, $`, $', $+, $1. -check_taint_not $&, "\t/./ \$&"; +check_taint_not $&, "\$& from /./"; "aa" =~ /(.)\B(.)/; # taint $&, $`, $', $+, $1. -check_taint $&, "\t/(.)\\B(.)/ \$&"; +check_taint $&, "\$& from /(.)\\B(.)/"; check_taint $`, "\t\$`"; check_taint $', "\t\$'"; check_taint $+, "\t\$+"; @@ -240,26 +294,26 @@ check_taint $2, "\t\$2"; check_taint_not $3, "\t\$3"; /(.)/; # untaint $&, $`, $', $+, $1. -check_taint_not $&, "\t/./ \$&"; +check_taint_not $&, "\$& from /./"; "aaa" =~ /(.).(\1)/i; # notaint because not locale dependent -check_taint_not $&, "\t/(.).(\\1)/ \$&"; +check_taint_not $&, "\$ & from /(.).(\\1)/"; check_taint_not $`, "\t\$`"; check_taint_not $', "\t\$'"; check_taint_not $+, "\t\$+"; check_taint_not $1, "\t\$1"; check_taint_not $2, "\t\$2"; -check_taint_not $3, "\t\$3"; +check_taint_not $3, "\t\$3"; /(.)/; # untaint $&, $`, $', $+, $1. -check_taint_not $&, "\t/./ \$&"; +check_taint_not $&, "\$ & from /./"; $_ = $a; # untaint $_ -check_taint_not $_, "\t\$_"; +check_taint_not $_, 'untainting $_ works'; /(b)/; # this must not taint -check_taint_not $&, "\t/(b)/ \$&"; +check_taint_not $&, "\$ & from /(b)/"; check_taint_not $`, "\t\$`"; check_taint_not $', "\t\$'"; check_taint_not $+, "\t\$+"; @@ -268,12 +322,12 @@ check_taint_not $2, "\t\$2"; $_ = $a; # untaint $_ -check_taint_not $_, "\t\$_"; +check_taint_not $_, 'untainting $_ works'; $b = uc($a); # taint $b s/(.+)/$b/; # this must taint only the $_ -check_taint $_, "\t\$_"; +check_taint $_, '$_ (wasn\'t tainted) from s/(.+)/$b/ where $b is tainted'; check_taint_not $&, "\t\$&"; check_taint_not $`, "\t\$`"; check_taint_not $', "\t\$'"; @@ -284,7 +338,7 @@ check_taint_not $2, "\t\$2"; $_ = $a; # untaint $_ s/(.+)/b/; # this must not taint -check_taint_not $_, "\t\$_"; +check_taint_not $_, '$_ (wasn\'t tainted) from s/(.+)/b/'; check_taint_not $&, "\t\$&"; check_taint_not $`, "\t\$`"; check_taint_not $', "\t\$'"; @@ -295,13 +349,13 @@ check_taint_not $2, "\t\$2"; $b = $a; # untaint $b ($b = $a) =~ s/\w/$&/; -check_taint $b, "\t\$b"; # $b should be tainted. -check_taint_not $a, "\t\$a"; # $a should be not. +check_taint $b, '$b from ($b = $a) =~ s/\w/$&/'; # $b should be tainted. +check_taint_not $a, '$a from ($b = $a) =~ s/\w/$&/'; # $a should be not. $_ = $a; # untaint $_ s/(\w)/\l$1/; # this must taint -check_taint $_, "\t\$_"; +check_taint $_, '$_ (wasn\'t tainted) from s/(\w)/\l$1/,'; # this must taint check_taint $&, "\t\$&"; check_taint $`, "\t\$`"; check_taint $', "\t\$'"; @@ -312,7 +366,7 @@ check_taint_not $2, "\t\$2"; $_ = $a; # untaint $_ s/(\w)/\L$1/; # this must taint -check_taint $_, "\t\$_"; +check_taint $_, '$_ (wasn\'t tainted) from s/(\w)/\L$1/,'; check_taint $&, "\t\$&"; check_taint $`, "\t\$`"; check_taint $', "\t\$'"; @@ -323,7 +377,7 @@ check_taint_not $2, "\t\$2"; $_ = $a; # untaint $_ s/(\w)/\u$1/; # this must taint -check_taint $_, "\t\$_"; +check_taint $_, '$_ (wasn\'t tainted) from s/(\w)/\u$1/'; check_taint $&, "\t\$&"; check_taint $`, "\t\$`"; check_taint $', "\t\$'"; @@ -334,7 +388,7 @@ check_taint_not $2, "\t\$2"; $_ = $a; # untaint $_ s/(\w)/\U$1/; # this must taint -check_taint $_, "\t\$_"; +check_taint $_, '$_ (wasn\'t tainted) from s/(\w)/\U$1/'; check_taint $&, "\t\$&"; check_taint $`, "\t\$`"; check_taint $', "\t\$'"; @@ -344,7 +398,7 @@ check_taint_not $2, "\t\$2"; # After all this tainting $a should be cool. -check_taint_not $a, "\t\$a"; +check_taint_not $a, '$a still not tainted'; "a" =~ /([a-z])/; check_taint_not $1, '"a" =~ /([a-z])/'; @@ -358,168 +412,283 @@ check_taint_not $1, '"foo.bar_baz" =~ /^(.*)[._](.*?)$/'; use locale ':not_characters'; # engage restricted locale with different # tainting rules - - check_taint_not $a; - - check_taint_not uc($a); - check_taint_not "\U$a"; - check_taint_not ucfirst($a); - check_taint_not "\u$a"; - check_taint_not lc($a); - check_taint_not fc($a); - check_taint_not "\L$a"; - check_taint_not "\F$a"; - check_taint_not lcfirst($a); - check_taint_not "\l$a"; - - check_taint_not sprintf('%e', 123.456); - check_taint_not sprintf('%f', 123.456); - check_taint_not sprintf('%g', 123.456); - check_taint_not sprintf('%d', 123.456); - check_taint_not sprintf('%x', 123.456); + check_taint_not $a, '$a'; + + check_taint_not uc($a), 'uc($a)'; + check_taint_not "\U$a", '"\U$a"'; + check_taint_not ucfirst($a), 'ucfirst($a)'; + check_taint_not "\u$a", '"\u$a"'; + check_taint_not lc($a), 'lc($a)'; + check_taint_not fc($a), 'fc($a)'; + check_taint_not "\L$a", '"\L$a"'; + check_taint_not "\F$a", '"\F$a"'; + check_taint_not lcfirst($a), 'lcfirst($a)'; + check_taint_not "\l$a", '"\l$a"'; + + check_taint_not sprintf('%e', 123.456), "sprintf('%e', 123.456)"; + check_taint_not sprintf('%f', 123.456), "sprintf('%f', 123.456)"; + check_taint_not sprintf('%g', 123.456), "sprintf('%g', 123.456)"; + check_taint_not sprintf('%d', 123.456), "sprintf('%d', 123.456)"; + check_taint_not sprintf('%x', 123.456), "sprintf('%x', 123.456)"; $_ = $a; # untaint $_ - $_ = uc($a); # taint $_ + $_ = uc($a); - check_taint_not $_; + check_taint_not $_, '$_ = uc($a)'; - /(\w)/; # taint $&, $`, $', $+, $1. - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; + /(\w)/; + check_taint_not $&, "\$& from /(\\w)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; /(.)/; # untaint $&, $`, $', $+, $1. - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; - - /(\W)/; # taint $&, $`, $', $+, $1. - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; - - /(\s)/; # taint $&, $`, $', $+, $1. - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; - - /(\S)/; # taint $&, $`, $', $+, $1. - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; + check_taint_not $&, "\$& from /(.)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + + /(\W)/; + check_taint_not $&, "\$& from /(\\W)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; - $_ = $a; # untaint $_ + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /(.)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + + /(\s)/; + check_taint_not $&, "\$& from /(\\s)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; - check_taint_not $_; + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /(.)/"; - /(b)/; # this must not taint - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; + /(\S)/; + check_taint_not $&, "\$& from /(\\S)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; - $_ = $a; # untaint $_ + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /(.)/"; + + "0" =~ /(\d)/; + check_taint_not $&, "\$& from /(\\d)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /(.)/"; + + /(\D)/; + check_taint_not $&, "\$& from /(\\D)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /(.)/"; + + /([[:alnum:]])/; + check_taint_not $&, "\$& from /([[:alnum:]])/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /(.)/"; + + /([[:^alnum:]])/; + check_taint_not $&, "\$& from /([[:^alnum:]])/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + + "a" =~ /(a)|(\w)/; + check_taint_not $&, "\$& from /(a)|(\\w)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + ok($1 eq 'a', ("\t" x 5) . "\$1 is 'a'"); + ok(! defined $2, ("\t" x 5) . "\$2 is undefined"); + check_taint_not $2, "\t\$2"; + check_taint_not $3, "\t\$3"; + + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /(.)/"; + + "\N{CYRILLIC SMALL LETTER A}" =~ /(\N{CYRILLIC CAPITAL LETTER A})/i; + check_taint_not $&, "\$& from /(\\N{CYRILLIC CAPITAL LETTER A})/i"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + ok($1 eq "\N{CYRILLIC SMALL LETTER A}", ("\t" x 4) . "\t\$1 is 'small cyrillic a'"); + check_taint_not $2, "\t\$2"; + + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /./"; + + "(\N{KELVIN SIGN})" =~ /(\N{KELVIN SIGN})/i; + check_taint_not $&, "\$& from /(\\N{KELVIN SIGN})/i"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; - check_taint_not $_; + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /(.)/"; + + "a:" =~ /(.)\b(.)/; + check_taint_not $&, "\$& from /(.)\\b(.)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + check_taint_not $3, "\t\$3"; - $b = uc($a); # taint $b - s/(.+)/$b/; # this must taint only the $_ + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /./"; - check_taint_not $_; - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; + "aa" =~ /(.)\B(.)/; + check_taint_not $&, "\$& from /(.)\\B(.)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + check_taint_not $3, "\t\$3"; + + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /./"; + + "aaa" =~ /(.).(\1)/i; # notaint because not locale dependent + check_taint_not $&, "\$ & from /(.).(\\1)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + check_taint_not $3, "\t\$3"; + + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$ & from /./"; $_ = $a; # untaint $_ - s/(.+)/b/; # this must not taint - check_taint_not $_; - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; + check_taint_not $_, 'untainting $_ works'; + + /(b)/; + check_taint_not $&, "\$ & from /(b)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + + $_ = $a; # untaint $_ + + check_taint_not $_, 'untainting $_ works'; + + s/(.+)/b/; + check_taint_not $_, '$_ (wasn\'t tainted) from s/(.+)/b/'; + check_taint_not $&, "\t\$&"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; $b = $a; # untaint $b ($b = $a) =~ s/\w/$&/; - check_taint_not $b; # $b should be tainted. - check_taint_not $a; # $a should be not. + check_taint_not $b, '$b from ($b = $a) =~ s/\w/$&/'; + check_taint_not $a, '$a from ($b = $a) =~ s/\w/$&/'; $_ = $a; # untaint $_ - s/(\w)/\l$1/; # this must taint - check_taint_not $_; - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; + s/(\w)/\l$1/; + check_taint_not $_, '$_ (wasn\'t tainted) from s/(\w)/\l$1/,'; # this must taint + check_taint_not $&, "\t\$&"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; $_ = $a; # untaint $_ - s/(\w)/\L$1/; # this must taint - check_taint_not $_; - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; + s/(\w)/\L$1/; + check_taint_not $_, '$_ (wasn\'t tainted) from s/(\w)/\L$1/,'; + check_taint_not $&, "\t\$&"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; $_ = $a; # untaint $_ - s/(\w)/\u$1/; # this must taint - check_taint_not $_; - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; + s/(\w)/\u$1/; + check_taint_not $_, '$_ (wasn\'t tainted) from s/(\w)/\u$1/'; + check_taint_not $&, "\t\$&"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; $_ = $a; # untaint $_ - s/(\w)/\U$1/; # this must taint - check_taint_not $_; - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; + s/(\w)/\U$1/; + check_taint_not $_, '$_ (wasn\'t tainted) from s/(\w)/\U$1/'; + check_taint_not $&, "\t\$&"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; # After all this tainting $a should be cool. - check_taint_not $a; + check_taint_not $a, '$a still not tainted'; "a" =~ /([a-z])/; check_taint_not $1, '"a" =~ /([a-z])/'; "foo.bar_baz" =~ /^(.*)[._](.*?)$/; # Bug 120675 check_taint_not $1, '"foo.bar_baz" =~ /^(.*)[._](.*?)$/'; + } # Here are in scope of 'use locale' diff --git a/lib/utf8.pm b/lib/utf8.pm index 43c7277..4980c7c 100644 --- a/lib/utf8.pm +++ b/lib/utf8.pm @@ -2,7 +2,7 @@ package utf8; $utf8::hint_bits = 0x00800000; -our $VERSION = '1.13'; +our $VERSION = '1.14'; sub import { $^H |= $utf8::hint_bits; @@ -42,6 +42,14 @@ utf8 - Perl pragma to enable/disable UTF-8 (or UTF-EBCDIC) in source code utf8::encode($string); # "\x{100}" becomes "\xc4\x80" utf8::decode($string); # "\xc4\x80" becomes "\x{100}" + # Convert a code point from the platform native character set to + # Unicode, and vice-versa. + $unicode = utf8::native_to_unicode(ord('A')); # returns 65 on both + # ASCII and EBCDIC + # platforms + $native = utf8::unicode_to_native(65); # returns 65 on ASCII + # platforms; 193 on EBCDIC + $flag = utf8::is_utf8($string); # since Perl 5.8.1 $flag = utf8::valid($string); @@ -99,7 +107,7 @@ you should not say that unless you really want to have UTF-8 source code. =over 4 -=item * $num_octets = utf8::upgrade($string) +=item * C<$num_octets = utf8::upgrade($string)> Converts in-place the internal representation of the string from an octet sequence in the native encoding (Latin-1 or EBCDIC) to I<UTF-X>. The @@ -114,7 +122,7 @@ B<Note that this function does not handle arbitrary encodings.> Therefore Encode is recommended for the general purposes; see also L<Encode>. -=item * $success = utf8::downgrade($string[, $fail_ok]) +=item * C<$success = utf8::downgrade($string[, $fail_ok])> Converts in-place the internal representation of the string from I<UTF-X> to the equivalent octet sequence in the native encoding (Latin-1 @@ -135,7 +143,7 @@ B<Note that this function does not handle arbitrary encodings.> Therefore Encode is recommended for the general purposes; see also L<Encode>. -=item * utf8::encode($string) +=item * C<utf8::encode($string)> Converts in-place the character sequence to the corresponding octet sequence in I<UTF-X>. That is, every (possibly wide) character gets @@ -144,14 +152,14 @@ individual I<UTF-X> bytes of the character. The UTF8 flag is turned off. Returns nothing. my $a = "\x{100}"; # $a contains one character, with ord 0x100 - utf8::encode($a); # $a contains two characters, with ords 0xc4 and - # 0x80 + utf8::encode($a); # $a contains two characters, with ords (on + # ASCII platforms) 0xc4 and 0x80 B<Note that this function does not handle arbitrary encodings.> Therefore Encode is recommended for the general purposes; see also L<Encode>. -=item * $success = utf8::decode($string) +=item * C<$success = utf8::decode($string)> Attempts to convert in-place the octet sequence encoded as I<UTF-X> to the corresponding character sequence. That is, it replaces each sequence of @@ -161,20 +169,47 @@ turned on only if the source string contains multiple-byte I<UTF-X> characters. If I<$string> is invalid as I<UTF-X>, returns false; otherwise returns true. - my $a = "\xc4\x80"; # $a contains two characters, with ords - # 0xc4 and 0x80 - utf8::decode($a); # $a contains one character, with ord 0x100 + my $a = "\xc4\x80"; # $a contains two characters, with ords + # 0xc4 and 0x80 + utf8::decode($a); # On ASCII platforms, $a contains one char, + # with ord 0x100. On EBCDIC platforms, $a + # is unchanged and the function returns FALSE. + +(C<"\xc4\x80"> is not a valid sequence of bytes in any UTF-8-encoded +character(s) in the EBCDIC code pages that Perl supports, which is why the +above example returns failure on them. What does decode into C<\x{100}> +depends on the platform. It is C<"\x8C\x41"> in IBM-1047.) B<Note that this function does not handle arbitrary encodings.> Therefore Encode is recommended for the general purposes; see also L<Encode>. -=item * $flag = utf8::is_utf8($string) +=item * C<$unicode = utf8::native_to_unicode($code_point)> + +This takes an unsigned integer (which represents the ordinal number of a +character (or a code point) on the platform the program is being run on) and +returns its Unicode equivalent value. Since ASCII platforms natively use the +Unicode code points, this function returns its input on them. On EBCDIC +platforms it converts from EBCIDC to Unicode. + +A meaningless value will currently be returned if the input is not an unsigned +integer. + +=item * C<$native = utf8::unicode_to_native($code_point)> + +This is the inverse of C<utf8::native_to_unicode()>, converting the other +direction. Again, on ASCII platforms, this returns its input, but on EBCDIC +platforms it will find the native platform code point, given any Unicode one. + +A meaningless value will currently be returned if the input is not an unsigned +integer. + +=item * C<$flag = utf8::is_utf8($string)> (Since Perl 5.8.1) Test whether I<$string> is marked internally as encoded in UTF-8. Functionally the same as Encode::is_utf8(). -=item * $flag = utf8::valid($string) +=item * C<$flag = utf8::valid($string)> [INTERNAL] Test whether I<$string> is in a consistent state regarding UTF-8. Will return true if it is well-formed UTF-8 and has the UTF-8 flag diff --git a/pod/perlebcdic.pod b/pod/perlebcdic.pod index 28f5981..c7efd98 100644 --- a/pod/perlebcdic.pod +++ b/pod/perlebcdic.pod @@ -125,7 +125,7 @@ This causes a problem with the semantics of the pack/unpack "U", which are supposed to pack Unicode code points to characters and back to numbers. The problem is: which code points to use for code points less than 256? (for 256 and over there's no problem: Unicode code points are used) -In EBCDIC, for the low 256 the EBCDIC code points are used. This +In EBCDIC, the EBCDIC code points are used for the low 256. This means that the equivalences pack("U", ord($character)) eq $character @@ -142,12 +142,8 @@ equal I<A with acute> or chr(101), and unpack("U", "A") would equal =item * -Many of the remaining problems seem to be related to case-insensitive matching - -=item * - The extensions Unicode::Collate and Unicode::Normalized are not -supported under EBCDIC, likewise for the encoding pragma. +supported under EBCDIC, likewise for the (now deprecated) encoding pragma. =back @@ -269,7 +265,8 @@ might want to write: open(FH,"<perlebcdic.pod") or die "Could not open perlebcdic.pod: $!"; while (<FH>) { - if (/(.{29})(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\.?(\d*)\s+(\d+)\.?(\d*)/) + if (/(.{29})(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\.?(\d*) + \s+(\d+)\.?(\d*)/x) { if ($7 ne '' && $9 ne '') { printf( @@ -310,7 +307,8 @@ Or, in order to retain the UTF-x code points in hexadecimal: open(FH,"<perlebcdic.pod") or die "Could not open perlebcdic.pod: $!"; while (<FH>) { - if (/(.{29})(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\.?(\d*)\s+(\d+)\.?(\d*)/) + if (/(.{29})(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\.?(\d*) + \s+(\d+)\.?(\d*)/x) { if ($7 ne '' && $9 ne '') { printf( @@ -330,8 +328,8 @@ Or, in order to retain the UTF-x code points in hexadecimal: ISO - 8859-1 POS- - CCSID CCSID CCSID IX- + 8859-1 POS- CCSID + CCSID CCSID CCSID IX- 1047 chr 0819 0037 1047 BC UTF-8 UTF-EBCDIC --------------------------------------------------------------------- <NUL> 0 0 0 0 0 0 @@ -973,17 +971,23 @@ work on any platform as follows: sub Is_c0 { my $char = substr(shift,0,1); - return $char =~ /[[:cntrl:]]/ - && $char =~ /[[:ascii:]]/ - && ! Is_delete($char); + return $char =~ /[[:cntrl:]]/a && ! Is_delete($char); + + # Alternatively: + # return $char =~ /[[:cntrl:]]/ + # && $char =~ /[[:ascii:]]/ + # && ! Is_delete($char); } sub Is_print_ascii { my $char = substr(shift,0,1); - return $char =~ /[[:print:]]/ && $char =~ /[[:ascii:]]/; + return $char =~ /[[:print:]]/a; # Alternatively: + # return $char =~ /[[:print:]]/ && $char =~ /[[:ascii:]]/; + + # Or # return $char # =~ /[ !"\#\$%&'()*+,\-.\/0-9:;<=>?\@A-Z[\\\]^_`a-z{|}~]/; } @@ -1012,7 +1016,8 @@ to use the characters in the range explicitly: sub Is_latin_1 { my $char = substr(shift,0,1); - $char =~ /[ ¡¢£¤¥¦§¨©ª«¬Â®¯°±²³´µ¶·¸¹º»¼½¾¿ÃÃÃÃÃà ÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃà áâãäåæçèéêëìÃîïðñòóôõö÷øù ... [15 chars truncated] + $char =~ /[ ¡¢£¤¥¦§¨©ª«¬Â®¯°±²³´µ¶·¸¹º»¼½¾¿ÃÃÃÃÃà ÃÃÃÃÃÃÃÃÃÃ] + [ÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃà áâãäåæçèéêëìÃîïðñòóôõö÷øùúûüýþÿ]/x; } Although that form may run into trouble in network transit (due to the @@ -1112,65 +1117,24 @@ may also be expressed as either of: http://www.pvhp.com/%7epvhp/ where 7E is the hexadecimal ASCII code point for '~'. Here is an example -of decoding such a URL under CCSID 1047: +of decoding such a URL in any EBCDIC code page: $url = 'http://www.pvhp.com/%7Epvhp/'; - # this array assumes code page 1047 - my @a2e_1047 = ( - 0, 1, 2, 3, 55, 45, 46, 47, 22, 5, 21, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 60, 61, 50, 38, 24, 25, 63, 39, 28, 29, 30, 31, - 64, 90,127,123, 91,108, 80,125, 77, 93, 92, 78,107, 96, 75, 97, - 240,241,242,243,244,245,246,247,248,249,122, 94, 76,126,110,111, - 124,193,194,195,196,197,198,199,200,201,209,210,211,212,213,214, - 215,216,217,226,227,228,229,230,231,232,233,173,224,189, 95,109, - 121,129,130,131,132,133,134,135,136,137,145,146,147,148,149,150, - 151,152,153,162,163,164,165,166,167,168,169,192, 79,208,161, 7, - 32, 33, 34, 35, 36, 37, 6, 23, 40, 41, 42, 43, 44, 9, 10, 27, - 48, 49, 26, 51, 52, 53, 54, 8, 56, 57, 58, 59, 4, 20, 62,255, - 65,170, 74,177,159,178,106,181,187,180,154,138,176,202,175,188, - 144,143,234,250,190,160,182,179,157,218,155,139,183,184,185,171, - 100,101, 98,102, 99,103,158,104,116,113,114,115,120,117,118,119, - 172,105,237,238,235,239,236,191,128,253,254,251,252,186,174, 89, - 68, 69, 66, 70, 67, 71,156, 72, 84, 81, 82, 83, 88, 85, 86, 87, - 140, 73,205,206,203,207,204,225,112,221,222,219,220,141,142,223 - ); - $url =~ s/%([0-9a-fA-F]{2})/pack("c",$a2e_1047[hex($1)])/ge; + $url =~ s/%([0-9a-fA-F]{2})/ + pack("c",utf8::unicode_to_native(hex($1)))/xge; Conversely, here is a partial solution for the task of encoding such -a URL under the 1047 code page: +a URL in any EBCDIC code page: $url = 'http://www.pvhp.com/~pvhp/'; - # this array assumes code page 1047 - my @e2a_1047 = ( - 0, 1, 2, 3,156, 9,134,127,151,141,142, 11, 12, 13, 14, 15, - 16, 17, 18, 19,157, 10, 8,135, 24, 25,146,143, 28, 29, 30, 31, - 128,129,130,131,132,133, 23, 27,136,137,138,139,140, 5, 6, 7, - 144,145, 22,147,148,149,150, 4,152,153,154,155, 20, 21,158, 26, - 32,160,226,228,224,225,227,229,231,241,162, 46, 60, 40, 43,124, - 38,233,234,235,232,237,238,239,236,223, 33, 36, 42, 41, 59, 94, - 45, 47,194,196,192,193,195,197,199,209,166, 44, 37, 95, 62, 63, - 248,201,202,203,200,205,206,207,204, 96, 58, 35, 64, 39, 61, 34, - 216, 97, 98, 99,100,101,102,103,104,105,171,187,240,253,254,177, - 176,106,107,108,109,110,111,112,113,114,170,186,230,184,198,164, - 181,126,115,116,117,118,119,120,121,122,161,191,208, 91,222,174, - 172,163,165,183,169,167,182,188,189,190,221,168,175, 93,180,215, - 123, 65, 66, 67, 68, 69, 70, 71, 72, 73,173,244,246,242,243,245, - 125, 74, 75, 76, 77, 78, 79, 80, 81, 82,185,251,252,249,250,255, - 92,247, 83, 84, 85, 86, 87, 88, 89, 90,178,212,214,210,211,213, - 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,179,219,220,217,218,159 - ); # The following regular expression does not address the # mappings for: ('.' => '%2E', '/' => '%2F', ':' => '%3A') $url =~ s/([\t "#%&\(\),;<=>\?\@\[\\\]^`{|}~])/ - sprintf("%%%02X",$e2a_1047[ord($1)])/xge; + sprintf("%%%02X",utf8::native_to_unicode(ord($1)))/xge; where a more complete solution would split the URL into components and apply a full s/// substitution only to the appropriate parts. -In the remaining examples a @e2a or @a2e array may be employed -but the assignment will not be shown explicitly. For code page 1047 -you could use the @a2e_1047 or @e2a_1047 arrays just shown. - =head2 uu encoding and decoding The C<u> template to pack() or unpack() will render EBCDIC data in EBCDIC @@ -1196,19 +1160,17 @@ following will print "Yes indeed\n" on either an ASCII or EBCDIC computer: print "indeed\n"; } -Here is a very spartan uudecoder that will work on EBCDIC provided -that the @e2a array is filled in appropriately: +Here is a very spartan uudecoder that will work on EBCDIC: #!/usr/local/bin/perl - @e2a = ( # this must be filled in - ); $_ = <> until ($mode,$file) = /^begin\s*(\d*)\s*(\S*)/; open(OUT, "> $file") if $file ne ""; while(<>) { last if /^end/; next if /[a-z]/; - next unless int(((($e2a[ord()] - 32 ) & 077) + 2) / 3) == - int(length() / 4); + next unless int((((utf8::native_to_unicode(ord()) - 32 ) & 077) + + 2) / 3) + == int(length() / 4); print OUT unpack("u", $_); } close(OUT); @@ -1224,36 +1186,28 @@ the printable set using: $qp_string =~ s/([=\x00-\x1F\x80-\xFF])/sprintf("=%02X",ord($1))/ge; Whereas a QP encoder that works on both ASCII and EBCDIC platforms -would look somewhat like the following (where the EBCDIC branch @e2a -array is omitted for brevity): +would look somewhat like the following: - if (ord('A') == 65) { # ASCII - $delete = "\x7F"; # ASCII - @e2a = (0 .. 255) # ASCII to ASCII identity map - } - else { # EBCDIC - $delete = "\x07"; # EBCDIC - @e2a = # EBCDIC to ASCII map (as shown above) - } + $delete = utf8::unicode_to_native(ord("\x7F")); $qp_string =~ - s/([^ !"\#\$%&'()*+,\-.\/0-9:;<>?\@A-Z[\\\]^_`a-z{|}~$delete])/ - sprintf("=%02X",$e2a[ord($1)])/xge; + s/([^[:print:]$delete])/ + sprintf("=%02X",utf8::native_to_unicode(ord($1)))/xage; (although in production code the substitutions might be done -in the EBCDIC branch with the @e2a array and separately in the +in the EBCDIC branch with the function call and separately in the ASCII branch without the expense of the identity map). Such QP strings can be decoded with: # This QP decoder is limited to ASCII only - $string =~ s/=([0-9A-Fa-f][0-9A-Fa-f])/chr hex $1/ge; + $string =~ s/=([[:xdigit:][[:xdigit:])/chr hex $1/ge; $string =~ s/=[\n\r]+$//; Whereas a QP decoder that works on both ASCII and EBCDIC platforms -would look somewhat like the following (where the @a2e array is -omitted for brevity): +would look somewhat like the following: - $string =~ s/=([0-9A-Fa-f][0-9A-Fa-f])/chr $a2e[hex $1]/ge; + $string =~ s/=([[:xdigit:][:xdigit:]])/ + chr utf8::native_to_unicode(hex $1)/xge; $string =~ s/=[\n\r]+$//; =head2 Caesarean ciphers diff --git a/sv.c b/sv.c index bb7481a..13ea53c 100644 --- a/sv.c +++ b/sv.c @@ -1584,6 +1584,9 @@ Perl_sv_grow(pTHX_ SV *const sv, STRLEN newlen) if (newlen < minlen) newlen = minlen; #ifndef PERL_UNWARANTED_CHUMMINESS_WITH_MALLOC + + /* Don't round up on the first allocation, as odds are pretty good that + * the initial request is accurate as to what is really needed */ if (SvLEN(sv)) { newlen = PERL_STRLEN_ROUNDUP(newlen); } diff --git a/t/porting/known_pod_issues.dat b/t/porting/known_pod_issues.dat index 4f3d2c0..b53fc85 100644 --- a/t/porting/known_pod_issues.dat +++ b/t/porting/known_pod_issues.dat @@ -247,7 +247,6 @@ pod/perldebtut.pod Verbatim line length including indents exceeds 79 by 22 pod/perldebug.pod Verbatim line length including indents exceeds 79 by 3 pod/perldsc.pod Verbatim line length including indents exceeds 79 by 4 pod/perldtrace.pod Verbatim line length including indents exceeds 79 by 26 -pod/perlebcdic.pod Verbatim line length including indents exceeds 79 by 3 pod/perlfunc.pod ? Should you be using F<...> or maybe L<...> instead of 1 pod/perlgit.pod Verbatim line length including indents exceeds 79 by 12 pod/perlgpl.pod Verbatim line length including indents exceeds 79 by 50 diff --git a/utfebcdic.h b/utfebcdic.h index 7eec66f..23b4ae2 100644 --- a/utfebcdic.h +++ b/utfebcdic.h @@ -84,6 +84,8 @@ * pages. Best is to convert to I8 before sending them, as the I8 * representation is the same no matter what the underlying code page is. * + * tr16 also says that NEL and LF be swapped. We don't do that. + * * EBCDIC characters above 0xFF are the same as Unicode in Perl's * implementation of all 3 encodings, so for those Step 1 is trivial. * -- Perl5 Master Repository
