In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/534752c1d25d7c52c702337927c37e40c4df103d?hp=cc890588b0ce25c7e37126933582b6d944ad1584>
- Log ----------------------------------------------------------------- commit 534752c1d25d7c52c702337927c37e40c4df103d Author: Karl Williamson <[email protected]> Date: Thu Aug 11 17:43:13 2016 -0600 utf8.c: Remove an EBCDIC-only path Rewriting this slightly allows an #ifdef EBCDIC to be removed. I am copying from code written by [email protected] and included in the Encode distribution. M utf8.c commit cee69f79c11928d8dc4009829e006913b0c02059 Author: Karl Williamson <[email protected]> Date: Thu Aug 11 16:53:58 2016 -0600 regexec.c: White-space only M regexec.c commit 594feacd4bcd5777c4b50f0f656cd58e8506c5cc Author: Karl Williamson <[email protected]> Date: Thu Aug 11 16:53:35 2016 -0600 lib/locale.t: Add some extra info to a debug statement M lib/locale.t commit 389cb6df94a20dfb9291b46c419d7626abcec7ff Author: Karl Williamson <[email protected]> Date: Thu Aug 11 16:49:57 2016 -0600 lib/locale.t: Generalize test To test for cased letters, we need to include lower as well as upper, folded. I happen to know that internally currently, this doesn't make a difference, but the test should still be written so it will work properly if that were to change. M lib/locale.t commit 1065615957bc3821a00a319d54f18f443fc2861e Author: Karl Williamson <[email protected]> Date: Thu Aug 11 16:42:20 2016 -0600 Take advantage of SvGROW's return value I had not realized that SvGROW returned the new string pointer. Using that makes a one-step process from a two-step process. I examined the code for other possible occurrences, and found others where it seemed that the two-step seemed clearer, so left those alone. M perl.c M pp.c M toke.c ----------------------------------------------------------------------- Summary of changes: lib/locale.t | 13 ++++++++----- perl.c | 3 +-- pp.c | 15 +++++---------- regexec.c | 2 +- toke.c | 3 +-- utf8.c | 12 ++++-------- 6 files changed, 20 insertions(+), 28 deletions(-) diff --git a/lib/locale.t b/lib/locale.t index 0b7f415..e8cedbc 100644 --- a/lib/locale.t +++ b/lib/locale.t @@ -979,7 +979,7 @@ foreach my $Locale (@Locale) { @{$posixes{'punct'}} = grep /[[:punct:]]/, map {chr } 0..255; @{$posixes{'upper'}} = grep /[[:upper:]]/, map {chr } 0..255; @{$posixes{'xdigit'}} = grep /[[:xdigit:]]/, map {chr } 0..255; - @{$posixes{'cased'}} = grep /[[:upper:]]/i, map {chr } 0..255; + @{$posixes{'cased'}} = grep /[[:upper:][:lower:]]/i, map {chr } 0..255; # Sieve the uppercase and the lowercase. @@ -1010,7 +1010,7 @@ foreach my $Locale (@Locale) { @{$posixes{'punct'}} = grep /[[:punct:]]/, map {chr } 0..255; @{$posixes{'upper'}} = grep /[[:upper:]]/, map {chr } 0..255; @{$posixes{'xdigit'}} = grep /[[:xdigit:]]/, map {chr } 0..255; - @{$posixes{'cased'}} = grep /[[:upper:]]/i, map {chr } 0..255; + @{$posixes{'cased'}} = grep /[[:upper:][:lower:]]/i, map {chr } 0..255; for (@{$posixes{'word'}}) { if (/[^\d_]/) { # skip digits and the _ if (uc($_) eq $_) { @@ -1204,7 +1204,7 @@ foreach my $Locale (@Locale) { (/[[:xdigit:]]/ xor /[[:^xdigit:]]/) || # effectively is what [:cased:] would be if it existed. - (/[[:upper:]]/i xor /[[:^upper:]]/i); + (/[[:upper:][:lower:]]/i xor /[^[:upper:][:lower:]]/i); } else { push @f, $_ unless (/[[:alpha:]]/ xor /[[:^alpha:]]/) || @@ -1220,7 +1220,7 @@ foreach my $Locale (@Locale) { (/[[:upper:]]/ xor /[[:^upper:]]/) || (/[[:word:]]/ xor /[[:^word:]]/) || (/[[:xdigit:]]/ xor /[[:^xdigit:]]/) || - (/[[:upper:]]/i xor /[[:^upper:]]/i); + (/[[:upper:][:lower:]]/i xor /[^[:upper:][:lower:]]/i); } } report_multi_result($Locale, $locales_test_number, \@f); @@ -1988,12 +1988,15 @@ foreach my $Locale (@Locale) { foreach my $err (keys %!) { use Errno; $! = eval "&Errno::$err"; # Convert to strerror() output + my $errnum = 0+$!; my $strerror = "$!"; if ("$strerror" =~ /\P{ASCII}/) { $ok14 = utf8::is_utf8($strerror); no locale; $ok14_5 = "$!" !~ /\P{ASCII}/; - debug(disp_str("non-ASCII \$!=$!")) if ! $ok14_5; + debug( disp_str( + "non-ASCII \$! for error $errnum='$strerror'")) + if ! $ok14_5; last; } } diff --git a/perl.c b/perl.c index 21a8b30..6618535 100644 --- a/perl.c +++ b/perl.c @@ -3224,8 +3224,7 @@ Perl_moreswitches(pTHX_ const char *s) s--; } PL_rs = newSVpvs(""); - SvGROW(PL_rs, (STRLEN)(UVCHR_SKIP(rschar) + 1)); - tmps = (U8*)SvPVX(PL_rs); + tmps = (U8*) SvGROW(PL_rs, (STRLEN)(UVCHR_SKIP(rschar) + 1)); uvchr_to_utf8(tmps, rschar); SvCUR_set(PL_rs, UVCHR_SKIP(rschar)); SvUTF8_on(PL_rs); diff --git a/pp.c b/pp.c index 124bf63..49b6abe 100644 --- a/pp.c +++ b/pp.c @@ -4109,8 +4109,7 @@ PP(pp_uc) * allocate without allocating too much. Such is life. * See corresponding comment in lc code for another option * */ - SvGROW(dest, min); - d = (U8*)SvPVX(dest) + o; + d = o + (U8*) SvGROW(dest, min); } Copy(tmpbuf, d, ulen, U8); d += ulen; @@ -4174,8 +4173,7 @@ PP(pp_uc) * ASCII. If not enough room, grow the string */ if (SvLEN(dest) < ++min) { const UV o = d - (U8*)SvPVX_const(dest); - SvGROW(dest, min); - d = (U8*)SvPVX(dest) + o; + d = o + (U8*) SvGROW(dest, min); } *d++ = 'S'; *d = 'S'; /* upper case is 'SS' */ continue; /* Back to the tight loop; still in ASCII */ @@ -4325,8 +4323,7 @@ PP(pp_lc) * Another option would be to grow an extra byte or two more * each time we need to grow, which would cut down the million * to 500K, with little waste */ - SvGROW(dest, min); - d = (U8*)SvPVX(dest) + o; + d = o + (U8*) SvGROW(dest, min); } /* Copy the newly lowercased letter to the output buffer we're @@ -4520,8 +4517,7 @@ PP(pp_fc) if (ulen > u && (SvLEN(dest) < (min += ulen - u))) { const UV o = d - (U8*)SvPVX_const(dest); - SvGROW(dest, min); - d = (U8*)SvPVX(dest) + o; + d = o + (U8*) SvGROW(dest, min); } Copy(tmpbuf, d, ulen, U8); @@ -4600,8 +4596,7 @@ PP(pp_fc) * becomes "ss", which may require growing the SV. */ if (SvLEN(dest) < ++min) { const UV o = d - (U8*)SvPVX_const(dest); - SvGROW(dest, min); - d = (U8*)SvPVX(dest) + o; + d = o + (U8*) SvGROW(dest, min); } *(d)++ = 's'; *d = 's'; diff --git a/regexec.c b/regexec.c index 60ff2a0..380e378 100644 --- a/regexec.c +++ b/regexec.c @@ -458,7 +458,7 @@ S_isFOO_lc(pTHX_ const U8 classnum, const U8 character) case _CC_ENUM_ALPHA: return isALPHA_LC(character); case _CC_ENUM_ASCII: return isASCII_LC(character); case _CC_ENUM_BLANK: return isBLANK_LC(character); - case _CC_ENUM_CASED: return isLOWER_LC(character) + case _CC_ENUM_CASED: return isLOWER_LC(character) || isUPPER_LC(character); case _CC_ENUM_CNTRL: return isCNTRL_LC(character); case _CC_ENUM_DIGIT: return isDIGIT_LC(character); diff --git a/toke.c b/toke.c index b0e3736..ddc7e15 100644 --- a/toke.c +++ b/toke.c @@ -3098,8 +3098,7 @@ S_scan_const(pTHX_ char *start) /* Subtract 3 for the bytes that were already accounted for * (min, max, and the hyphen) */ - SvGROW(sv, SvLEN(sv) + grow - 3); - d = SvPVX(sv) + save_offset; /* refresh d after realloc */ + d = save_offset + SvGROW(sv, SvLEN(sv) + grow - 3); /* Here, we expand out the range. On ASCII platforms, the * compiler should optimize out the 'convert_unicode==TRUE' diff --git a/utf8.c b/utf8.c index 7d72c55..0b7cbda 100644 --- a/utf8.c +++ b/utf8.c @@ -638,14 +638,10 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) /* Here is not a continuation byte, nor an invariant. The only thing left * is a start byte (possibly for an overlong) */ -#ifdef EBCDIC - uv = NATIVE_UTF8_TO_I8(uv); -#endif - - /* Remove the leading bits that indicate the number of bytes in the - * character's whole UTF-8 sequence, leaving just the bits that are part of - * the value */ - uv &= UTF_START_MASK(expectlen); + /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits + * that indicate the number of bytes in the character's whole UTF-8 + * sequence, leaving just the bits that are part of the value. */ + uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen); /* Now, loop through the remaining bytes in the character's sequence, * accumulating each into the working value as we go. Be sure to not look -- Perl5 Master Repository
