Change 18299 by rgs@rgs-home on 2002/12/12 20:35:29
Integrate from maint-5.8 : changes 18290-1, 18293-5, 18297
Affected files ...
... //depot/perl/ext/POSIX/POSIX.pod#31 integrate
... //depot/perl/ext/POSIX/t/is.t#2 integrate
... //depot/perl/ext/POSIX/t/posix.t#16 integrate
... //depot/perl/pod/perlre.pod#80 integrate
... //depot/perl/pod/perlretut.pod#32 integrate
... //depot/perl/regexec.c#294 integrate
... //depot/perl/t/op/pat.t#185 integrate
... //depot/perl/t/op/subst.t#37 integrate
Differences ...
==== //depot/perl/ext/POSIX/POSIX.pod#31 (text) ====
Index: perl/ext/POSIX/POSIX.pod
--- perl/ext/POSIX/POSIX.pod#30~17911~ Tue Sep 17 12:29:53 2002
+++ perl/ext/POSIX/POSIX.pod Thu Dec 12 12:35:29 2002
@@ -580,15 +580,20 @@
=item isalnum
-This is identical to the C function, except that it can apply to a single
-character or to a whole string. Consider using regular expressions and the
-C</[[:alnum:]]/> construct instead, or possibly the C</\w/> construct.
+This is identical to the C function, except that it can apply to a
+single character or to a whole string. Note that locale settings may
+affect what characters are considered C<isalnum>. Does not work on
+Unicode characters at code point 256 or higher. Consider using regular
+expressions and the C</[[:alnum:]]/> construct instead, or possibly
+the C</\w/> construct.
=item isalpha
-This is identical to the C function, except that it can apply to a single
-character or to a whole string. Consider using regular expressions and the
-C</[[:alpha:]]/> construct instead.
+This is identical to the C function, except that it can apply to
+a single character or to a whole string. Note that locale settings
+may affect what characters are considered C<isalpha>. Does not work
+on Unicode characters at code point 256 or higher. Consider using regular
+expressions and the C</[[:alpha:]]/> construct instead.
=item isatty
@@ -597,60 +602,82 @@
=item iscntrl
-This is identical to the C function, except that it can apply to a single
-character or to a whole string. Consider using regular expressions and the
-C</[[:cntrl:]]/> construct instead.
+This is identical to the C function, except that it can apply to
+a single character or to a whole string. Note that locale settings
+may affect what characters are considered C<iscntrl>. Does not work
+on Unicode characters at code point 256 or higher. Consider using regular
+expressions and the C</[[:cntrl:]]/> construct instead.
=item isdigit
-This is identical to the C function, except that it can apply to a single
-character or to a whole string. Consider using regular expressions and the
-C</[[:digit:]]/> construct instead, or the C</\d/> construct.
+This is identical to the C function, except that it can apply to
+a single character or to a whole string. Note that locale settings
+may affect what characters are considered C<isdigit> (unlikely, but
+still possible). Does not work on Unicode characters at code point 256
+or higher. Consider using regular expressions and the C</[[:digit:]]/>
+construct instead, or the C</\d/> construct.
=item isgraph
-This is identical to the C function, except that it can apply to a single
-character or to a whole string. Consider using regular expressions and the
-C</[[:graph:]]/> construct instead.
+This is identical to the C function, except that it can apply to
+a single character or to a whole string. Note that locale settings
+may affect what characters are considered C<isgraph>. Does not work
+on Unicode characters at code point 256 or higher. Consider using regular
+expressions and the C</[[:graph:]]/> construct instead.
=item islower
-This is identical to the C function, except that it can apply to a single
-character or to a whole string. Consider using regular expressions and the
-C</[[:lower:]]/> construct instead. Do B<not> use C</[a-z]/>.
+This is identical to the C function, except that it can apply to
+a single character or to a whole string. Note that locale settings
+may affect what characters are considered C<islower>. Does not work
+on Unicode characters at code point 256 or higher. Consider using regular
+expressions and the C</[[:lower:]]/> construct instead. Do B<not> use
+C</[a-z]/>.
=item isprint
-This is identical to the C function, except that it can apply to a single
-character or to a whole string. Consider using regular expressions and the
-C</[[:print:]]/> construct instead.
+This is identical to the C function, except that it can apply to
+a single character or to a whole string. Note that locale settings
+may affect what characters are considered C<isprint>. Does not work
+on Unicode characters at code point 256 or higher. Consider using regular
+expressions and the C</[[:print:]]/> construct instead.
=item ispunct
-This is identical to the C function, except that it can apply to a single
-character or to a whole string. Consider using regular expressions and the
-C</[[:punct:]]/> construct instead.
+This is identical to the C function, except that it can apply to
+a single character or to a whole string. Note that locale settings
+may affect what characters are considered C<ispunct>. Does not work
+on Unicode characters at code point 256 or higher. Consider using regular
+expressions and the C</[[:punct:]]/> construct instead.
=item isspace
-This is identical to the C function, except that it can apply to a single
-character or to a whole string. Consider using regular expressions and the
-C</[[:space:]]/> construct instead, or the C</\s/> construct.
-(Note that C</\s/> and C</[[:space:]]/> are slightly different in that
-C</[[:space:]]/> can normally match a vertical tab, while C</\s/> does
-not.)
+This is identical to the C function, except that it can apply to
+a single character or to a whole string. Note that locale settings
+may affect what characters are considered C<isspace>. Does not work
+on Unicode characters at code point 256 or higher. Consider using regular
+expressions and the C</[[:space:]]/> construct instead, or the C</\s/>
+construct. (Note that C</\s/> and C</[[:space:]]/> are slightly
+different in that C</[[:space:]]/> can normally match a vertical tab,
+while C</\s/> does not.)
=item isupper
-This is identical to the C function, except that it can apply to a single
-character or to a whole string. Consider using regular expressions and the
-C</[[:upper:]]/> construct instead. Do B<not> use C</[A-Z]/>.
+This is identical to the C function, except that it can apply to
+a single character or to a whole string. Note that locale settings
+may affect what characters are considered C<isupper>. Does not work
+on Unicode characters at code point 256 or higher. Consider using regular
+expressions and the C</[[:upper:]]/> construct instead. Do B<not> use
+C</[A-Z]/>.
=item isxdigit
This is identical to the C function, except that it can apply to a single
-character or to a whole string. Consider using regular expressions and the
-C</[[:xdigit:]]/> construct instead, or simply C</[0-9a-f]/i>.
+character or to a whole string. Note that locale settings may affect what
+characters are considered C<isxdigit> (unlikely, but still possible).
+Does not work on Unicode characters at code point 256 or higher.
+Consider using regular expressions and the C</[[:xdigit:]]/>
+construct instead, or simply C</[0-9a-f]/i>.
=item kill
@@ -1224,12 +1251,23 @@
year (C<year>) is given in years since 1900. I.e., the year 1995 is 95; the
year 2001 is 101. Consult your system's C<strftime()> manpage for details
about these and the other arguments.
+
If you want your code to be portable, your format (C<fmt>) argument
should use only the conversion specifiers defined by the ANSI C
-standard. These are C<aAbBcdHIjmMpSUwWxXyYZ%>.
-The given arguments are made consistent
-as though by calling C<mktime()> before calling your system's
-C<strftime()> function, except that the C<isdst> value is not affected.
+standard (C89, to play safe). These are C<aAbBcdHIjmMpSUwWxXyYZ%>.
+But even then, the B<results> of some of the conversion specifiers are
+non-portable. For example, the specifiers C<aAbBcpZ> change according
+to the locale settings of the user, and both how to set locales (the
+locale names) and what output to expect are non-standard.
+The specifier C<c> changes according to the timezone settings of the
+user and the timezone computation rules of the operating system.
+The C<Z> specifier is notoriously unportable since the names of
+timezones are non-standard. Sticking to the numeric specifiers is the
+safest route.
+
+The given arguments are made consistent as though by calling
+C<mktime()> before calling your system's C<strftime()> function,
+except that the C<isdst> value is not affected.
The string for Tuesday, December 12, 1995.
==== //depot/perl/ext/POSIX/t/is.t#2 (text) ====
Index: perl/ext/POSIX/t/is.t
--- perl/ext/POSIX/t/is.t#1~18221~ Sun Dec 1 18:18:19 2002
+++ perl/ext/POSIX/t/is.t Thu Dec 12 12:35:29 2002
@@ -10,12 +10,14 @@
}
}
-
use POSIX;
use strict ;
-$| = 1;
+# E.g. \t might or might not be isprint() depending on the locale,
+# so let's reset to the default.
+setlocale(LC_ALL, 'C') if $Config{d_setlocale};
+$| = 1;
# List of characters (and strings) to feed to the is<xxx> functions.
#
==== //depot/perl/ext/POSIX/t/posix.t#16 (text) ====
Index: perl/ext/POSIX/t/posix.t
--- perl/ext/POSIX/t/posix.t#15~18267~ Sun Dec 8 16:10:09 2002
+++ perl/ext/POSIX/t/posix.t Thu Dec 12 12:35:29 2002
@@ -11,8 +11,7 @@
}
require "./test.pl";
-plan(tests => 66);
-
+plan(tests => 61);
use POSIX qw(fcntl_h signal_h limits_h _exit getcwd open read strftime write
errno);
@@ -182,26 +181,6 @@
try_strftime("Wed Mar 01 00:00:00 2000 061", 0,0,0, 1,2,100);
try_strftime("Fri Mar 31 00:00:00 2000 091", 0,0,0, 31,2,100);
&POSIX::setlocale(&POSIX::LC_TIME, $lc) if $Config{d_setlocale};
-
-SKIP: {
- # XXX wait for smokers to see which OSs else to skip
- skip("No mktime and/or tm_gmtoff", 5)
- if !$Config{d_mktime} || !$Config{d_tm_tm_gmtoff} || !$Config{d_tm_tm_zone};
- local $ENV{TZ} = "Europe/Berlin";
-
- # May fail for ancient FreeBSD versions.
- # %z is not included in POSIX, but valid on Linux and FreeBSD.
- foreach $def ([1000,'Sun Sep 9 03:46:40 2001 +0200 CEST'],
- [900, 'Thu Jul 9 18:00:00 1998 +0200 CEST'],
- [800, 'Tue May 9 08:13:20 1995 +0200 CEST'],
- [700, 'Sat Mar 7 21:26:40 1992 +0100 CET'],
- [600, 'Thu Jan 5 11:40:00 1989 +0100 CET'],
- ) {
- my($t, $expected) = @$def;
- my @tm = localtime($t*1000000);
- is(strftime("%c %z %Z",@tm), $expected, "validating zone setting: $expected");
- }
-}
{
for my $test (0, 1) {
==== //depot/perl/pod/perlre.pod#80 (text) ====
Index: perl/pod/perlre.pod
--- perl/pod/perlre.pod#79~17729~ Fri Aug 16 19:33:15 2002
+++ perl/pod/perlre.pod Thu Dec 12 12:35:29 2002
@@ -188,6 +188,7 @@
\C Match a single C char (octet) even under Unicode.
NOTE: breaks up characters into their UTF-8 bytes,
so you may end up with malformed pieces of UTF-8.
+ Unsupported in lookbehind.
A C<\w> matches a single alphanumeric character (an alphabetic
character, or a decimal digit) or C<_>, not a whole word. Use C<\w+>
==== //depot/perl/pod/perlretut.pod#32 (text) ====
Index: perl/pod/perlretut.pod
--- perl/pod/perlretut.pod#31~16012~ Fri Apr 19 18:46:03 2002
+++ perl/pod/perlretut.pod Thu Dec 12 12:35:29 2002
@@ -1707,7 +1707,7 @@
The last regexp matches, but is dangerous because the string
I<character> position is no longer synchronized to the string I<byte>
position. This generates the warning 'Malformed UTF-8
-character'. C<\C> is best used for matching the binary data in strings
+character'. The C<\C> is best used for matching the binary data in strings
with binary data intermixed with Unicode characters.
Let us now discuss the rest of the character classes. Just as with
@@ -2003,6 +2003,10 @@
$x =~ /foo(?!bar)/; # doesn't match, 'bar' follows 'foo'
$x =~ /foo(?!baz)/; # matches, 'baz' doesn't follow 'foo'
$x =~ /(?<!\s)foo/; # matches, there is no \s before 'foo'
+
+The C<\C> is unsupported in lookbehind, because the already
+treacherous definition of C<\C> would become even more so
+when going backwards.
=head2 Using independent subexpressions to prevent backtracking
==== //depot/perl/regexec.c#294 (text) ====
Index: perl/regexec.c
--- perl/regexec.c#293~18085~ Mon Nov 4 04:17:12 2002
+++ perl/regexec.c Thu Dec 12 12:35:29 2002
@@ -1882,9 +1882,12 @@
goto phooey;
}
else if ((c = prog->regstclass)) {
- if (minlen && PL_regkind[(U8)OP(prog->regstclass)] != EXACT)
+ if (minlen) {
+ I32 op = (U8)OP(prog->regstclass);
/* don't bother with what can't match */
- strend = HOPc(strend, -(minlen - 1));
+ if (PL_regkind[op] != EXACT && op != CANY)
+ strend = HOPc(strend, -(minlen - 1));
+ }
DEBUG_r({
SV *prop = sv_newmortal();
char *s0;
@@ -2269,17 +2272,17 @@
regprop(prop, scan);
{
char *s0 =
- do_utf8 ?
+ do_utf8 && OP(scan) != CANY ?
pv_uni_display(dsv0, (U8*)(locinput - pref_len),
pref0_len, 60, UNI_DISPLAY_REGEX) :
locinput - pref_len;
int len0 = do_utf8 ? strlen(s0) : pref0_len;
- char *s1 = do_utf8 ?
+ char *s1 = do_utf8 && OP(scan) != CANY ?
pv_uni_display(dsv1, (U8*)(locinput - pref_len + pref0_len),
pref_len - pref0_len, 60, UNI_DISPLAY_REGEX) :
locinput - pref_len + pref0_len;
int len1 = do_utf8 ? strlen(s1) : pref_len - pref0_len;
- char *s2 = do_utf8 ?
+ char *s2 = do_utf8 && OP(scan) != CANY ?
pv_uni_display(dsv2, (U8*)locinput,
PL_regeol - locinput, 60, UNI_DISPLAY_REGEX) :
locinput;
==== //depot/perl/t/op/pat.t#185 (xtext) ====
Index: perl/t/op/pat.t
--- perl/t/op/pat.t#184~18280~ Tue Dec 10 13:30:10 2002
+++ perl/t/op/pat.t Thu Dec 12 12:35:29 2002
@@ -6,7 +6,7 @@
$| = 1;
-print "1..942\n";
+print "1..968\n";
BEGIN {
chdir 't' if -d 't';
@@ -3006,4 +3006,53 @@
++$test;
}
-# last test 942
+{
+ print "# [perl #15763]\n";
+
+ $a = "x\x{100}";
+ chop $a; # but leaves the UTF-8 flag
+ $a .= "y"; # 1 byte before "y"
+
+ ok($a =~ /^\C/, 'match one \C on 1-byte UTF-8');
+ ok($a =~ /^\C{1}/, 'match \C{1}');
+
+ ok($a =~ /^\Cy/, 'match \Cy');
+ ok($a =~ /^\C{1}y/, 'match \C{1}y');
+
+ $a = "\x{100}y"; # 2 bytes before "y"
+
+ ok($a =~ /^\C/, 'match one \C on 2-byte UTF-8');
+ ok($a =~ /^\C{1}/, 'match \C{1}');
+ ok($a =~ /^\C\C/, 'match two \C');
+ ok($a =~ /^\C{2}/, 'match \C{2}');
+
+ ok($a =~ /^\C\C\C/, 'match three \C on 2-byte UTF-8 and a byte');
+ ok($a =~ /^\C{3}/, 'match \C{3}');
+
+ ok($a =~ /^\C\Cy/, 'match two \Cy');
+ ok($a =~ /^\C{2}y/, 'match \C{2}y');
+
+ ok($a !~ /^\C\C\Cy/, 'not match three \Cy');
+ ok($a !~ /^\C{2}\Cy/, 'not match \C{2}\Cy');
+
+ $a = "\x{1000}y"; # 3 bytes before "y"
+
+ ok($a =~ /^\C/, 'match one \C on three-byte UTF-8');
+ ok($a =~ /^\C{1}/, 'match \C{1}');
+ ok($a =~ /^\C\C/, 'match two \C');
+ ok($a =~ /^\C{2}/, 'match \C{2}');
+ ok($a =~ /^\C\C\C/, 'match three \C');
+ ok($a =~ /^\C{3}/, 'match \C{3}');
+
+ ok($a =~ /^\C\C\C\C/, 'match four \C on three-byte UTF-8 and a byte');
+ ok($a =~ /^\C{4}/, 'match \C{4}');
+
+ ok($a =~ /^\C\C\Cy/, 'match three \Cy');
+ ok($a =~ /^\C{3}y/, 'match \C{3}y');
+
+ ok($a !~ /^\C\C\C\Cy/, 'not match four \Cy');
+ ok($a !~ /^\C{4}y/, 'not match \C{4}y');
+}
+
+# last test 968
+
==== //depot/perl/t/op/subst.t#37 (xtext) ====
Index: perl/t/op/subst.t
--- perl/t/op/subst.t#36~18221~ Sun Dec 1 18:18:19 2002
+++ perl/t/op/subst.t Thu Dec 12 12:35:29 2002
@@ -7,7 +7,7 @@
}
require './test.pl';
-plan( tests => 124 );
+plan( tests => 125 );
$x = 'foo';
$_ = "x";
@@ -494,9 +494,19 @@
$_ = 'aaaa';
$r = 'x';
$s = s/a(?{})/$r/g;
-is("<$_> <$s>", "<xxxx> <4>", "perl #7806");
+is("<$_> <$s>", "<xxxx> <4>", "[perl #7806]");
$_ = 'aaaa';
$s = s/a(?{})//g;
-is("<$_> <$s>", "<> <4>", "perl #7806");
+is("<$_> <$s>", "<> <4>", "[perl #7806]");
+# [perl #19048] Coredump in silly replacement
+{
+ local $^W = 0;
+ $_="abcdef\n";
+ s!.!!eg;
+ is($_, "\n", "[perl #19048]");
+}
+
+
+
End of Patch.