In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/66b092637d94a05d3e747623f03f2bfc0679dcb5?hp=51238c0c8ece0331d937dcc2ff1271fa7f8ae110>
- Log ----------------------------------------------------------------- commit 66b092637d94a05d3e747623f03f2bfc0679dcb5 Author: Karl Williamson <[email protected]> Date: Tue Dec 27 21:25:39 2016 -0700 toke.c: Clarify comment M toke.c commit 188d22cf717ab070718f1459212a881ded22bef4 Author: Karl Williamson <[email protected]> Date: Tue Dec 27 21:15:43 2016 -0700 toke.c: Keep better track for avoiding work in tr/// As explained in the comments, tr/// creates byte map tables for bytes 0-255 if everything is below 256. If something is higher than that, special handling is required (currently a swash), and so there is no point in creating the tables here by expanding each range. Prior to this commit, it realized this on a per-range basis, not expanding ranges which extend above 255. This commit changes that so that if at any time it finds something above 255, it remembers that and skips the expansion of all future ranges encountered. This was done without adding any extra tests outside the specific tr/// code. M toke.c commit ec282617d074afafad84a1049786659375a2d285 Author: Karl Williamson <[email protected]> Date: Tue Dec 27 21:20:13 2016 -0700 toke.c: Fix comment that said the opposite of what was meant M toke.c commit 87dd6ea7d465d7912157b79f9d292353cc5a188e Author: Karl Williamson <[email protected]> Date: Tue Dec 27 20:55:13 2016 -0700 toke.c: Simplify expression Unless there's a reason it's needed, I prefer not to set a variable within an 'if', as it increases the reader's cognitive load. 
M toke.c commit 5dfbfbd5a7bf2502dbdafd98ce9e7698d2b7d230 Author: Karl Williamson <[email protected]> Date: Tue Dec 27 20:52:20 2016 -0700 SV_UTF8_NO_ENCODING is no longer used It once had meaning under 'use encoding'; M sv.h M toke.c commit 1e02a1751026b404f3991e6e884bfdb449c82216 Author: Karl Williamson <[email protected]> Date: Thu Aug 18 21:41:08 2016 -0600 toke.c: White-space only Indent after previous commit added new blocks, and a couple other white space adjustments M toke.c commit af9be36c89322d2469f27b3c98c20c32044697fe Author: Karl Williamson <[email protected]> Date: Thu Aug 18 18:54:13 2016 -0600 toke.c: Potentially avoid work when converting to UTF-8 Some code points < 256 are the same whether represented in UTF-8, or not. Others change to require 2 bytes to represent in UTF-8. When parsing a string, using UTF-8 is avoided unless necessary, because of the extra overhead required for processing UTF-8. This means that when, during the parse, we discover we need to convert to UTF-8, we have to, in effect, reparse whatever we have so far to make sure those code points that differ under UTF-8 get their proper representation. This reparsing would not be necessary if we knew that the string doesn't have such code points. It turns out that keeping track of having seen UTF-8 variant code points is cheap, requiring no extra branch instructions. And the payoff is potentially large, avoiding having to reparse the string. This commit changes the code to keep track of that. M toke.c commit 8d0042a898ed988cce011b3be47ac52b8c37b727 Author: Karl Williamson <[email protected]> Date: Sun Dec 25 21:53:04 2016 -0700 toke.c: Reorder if/else branches It is easier to read code if the trivial branch follows immediately after the 'if'. M toke.c commit 2916094f340070aac2552b1836d61c69607559de Author: Karl Williamson <[email protected]> Date: Tue Dec 27 19:37:33 2016 -0700 toke.c: Fix too-small SvGROW() This might be a bug on EBCDIC, but not ASCII platforms. 
The code forgot that SvGROW takes a total size, and not an incremental size. So, this is most likely a no-op, but I believe there are no cases on ASCII platforms where this actually needs to grow, and on EBCDIC, it would only be very large, way above Unicode, code points. Grows in later iterations of the loop would recover to grow to the correct size, unless this EBCDIC escape sequence was the final thing there. M toke.c commit b73e60d1ecb44d0f32cd92f3b9ba898ee62ab49b Author: Karl Williamson <[email protected]> Date: Tue Dec 27 19:20:08 2016 -0700 toke.c: Simplify a couple of expressions By storing an intermediate result in a temporary, the final expression is clearer. M toke.c commit 94beb10175a0e77e01d71e76b4cdccb303547783 Author: Karl Williamson <[email protected]> Date: Tue Dec 27 19:45:07 2016 -0700 toke.c: Convert sv_grow to SvGROW The latter is preferred M toke.c ----------------------------------------------------------------------- Summary of changes: sv.h | 2 +- toke.c | 222 ++++++++++++++++++++++++++++++++++++++++++++--------------------- 2 files changed, 151 insertions(+), 73 deletions(-) diff --git a/sv.h b/sv.h index ee6587909c..e311ff2c02 100644 --- a/sv.h +++ b/sv.h @@ -1806,10 +1806,10 @@ Like C<sv_utf8_upgrade>, but doesn't do magic on C<sv>. #define SvSHARED_HASH(sv) (0 + SvSHARED_HEK_FROM_PV(SvPVX_const(sv))->hek_hash) /* flag values for sv_*_flags functions */ +#define SV_UTF8_NO_ENCODING 0 /* No longer used */ #define SV_IMMEDIATE_UNREF 1 #define SV_GMAGIC 2 #define SV_COW_DROP_PV 4 -#define SV_UTF8_NO_ENCODING 8 #define SV_NOSTEAL 16 #define SV_CONST_RETURN 32 #define SV_MUTABLE_RETURN 64 diff --git a/toke.c b/toke.c index 78879070fb..c5971c76ef 100644 --- a/toke.c +++ b/toke.c @@ -2785,15 +2785,17 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e) In transliterations: characters are VERY literal, except for - not at the start or end - of the string, which indicates a range. 
If the range is in bytes, + of the string, which indicates a range. However some backslash sequences + are recognized: \r, \n, and the like + \007 \o{}, \x{}, \N{} + If all elements in the transliteration are below 256, scan_const expands the range to the full set of intermediate characters. If the range is in utf8, the hyphen is replaced with a certain range mark which will be handled by pmtrans() in op.c. In double-quoted strings: backslashes: - double-quoted style: \r and \n - constants: \x31, etc. + all those recognized in transliterations deprecated backrefs: \1 (in substitution replacements) case and quoting: \U \Q \E stops on @ and $ @@ -2851,11 +2853,18 @@ S_scan_const(pTHX_ char *start) bool didrange = FALSE; /* did we just finish a range? */ bool in_charclass = FALSE; /* within /[...]/ */ bool has_utf8 = FALSE; /* Output constant is UTF8 */ + bool has_above_latin1 = FALSE; /* does something require special + handling in tr/// ? */ bool this_utf8 = cBOOL(UTF); /* Is the source string assumed to be UTF8? But, this can show as true when the source isn't utf8, as for example when it is entirely composed of hex constants */ + STRLEN utf8_variant_count = 0; /* When not in UTF-8, this counts the + number of characters found so far + that will expand (into 2 bytes) + should we have to convert to + UTF-8) */ SV *res; /* result from charnames */ STRLEN offset_to_max; /* The offset in the output to where the range high-end character is temporarily placed */ @@ -2911,15 +2920,15 @@ S_scan_const(pTHX_ char *start) * flag is set and we fix up the range. * * Ranges entirely within Latin1 are expanded out entirely, in - * order to avoid the significant overhead of making a swash. - * Ranges that extend above Latin1 have to have a swash, so there - * is no advantage to abbreviating them here, so they are stored - * here as Min, ILLEGAL_UTF8_BYTE, Max. The illegal byte signifies - * a hyphen without any possible ambiguity. 
On EBCDIC machines, if - * the range is expressed as Unicode, the Latin1 portion is - * expanded out even if the entire range extends above Latin1. - * This is because each code point in it has to be processed here - * individually to get its native translation */ + * order to make the transliteration a simple table look-up. + * Ranges that extend above Latin1 have to be done differently, so + * there is no advantage to expanding them here, so they are + * stored here as Min, ILLEGAL_UTF8_BYTE, Max. The illegal byte + * signifies a hyphen without any possible ambiguity. On EBCDIC + * machines, if the range is expressed as Unicode, the Latin1 + * portion is expanded out even if the range extends above + * Latin1. This is because each code point in it has to be + * processed here individually to get its native translation */ if (! dorange) { @@ -2938,6 +2947,15 @@ S_scan_const(pTHX_ char *start) non_portable_endpoint = 0; backslash_N = 0; #endif + /* The tests here and the following 'else' for being above + * Latin1 suffice to find all such occurences in the + * constant, except those added by a backslash escape + * sequence, like \x{100}. And all those set + * 'has_above_latin1' as appropriate */ + if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) { + has_above_latin1 = TRUE; + } + /* Drops down to generic code to process current byte */ } else { @@ -2957,6 +2975,10 @@ S_scan_const(pTHX_ char *start) * pointer). 
We'll finish processing the range the next * time through the loop */ offset_to_max = d - SvPVX_const(sv); + + if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) { + has_above_latin1 = TRUE; + } } } /* End of not a range */ else { @@ -3001,13 +3023,13 @@ S_scan_const(pTHX_ char *start) * Unicode value (\N{...}), or if the range is a subset of * [A-Z] or [a-z], and both ends are literal characters, * like 'A', and not like \x{C1} */ - if ((convert_unicode - = cBOOL(backslash_N) /* \N{} forces Unicode, hence + convert_unicode = + cBOOL(backslash_N) /* \N{} forces Unicode, hence portable range */ || ( ! non_portable_endpoint && (( isLOWER_A(range_min) && isLOWER_A(range_max)) - || (isUPPER_A(range_min) && isUPPER_A(range_max)))) - )) { + || (isUPPER_A(range_min) && isUPPER_A(range_max)))); + if (convert_unicode) { /* Special handling is needed for these portable ranges. * They are defined to all be in Unicode terms, which @@ -3058,14 +3080,15 @@ S_scan_const(pTHX_ char *start) if (has_utf8) { - /* We try to avoid creating a swash. If the upper end of - * this range is below 256, this range won't force a swash; - * otherwise it does force a swash, and as long as we have - * to have one, we might as well not expand things out. - * But if it's EBCDIC, we may have to look at each - * character below 256 if we have to convert to/from - * Unicode values */ - if (range_max > 255 + /* If everything in the transliteration is below 256, we + * can avoid special handling later. A translation table + * of each of those bytes is created. And so we expand out + * all ranges to their constituent code points. But if + * we've encountered something above 255, the expanding + * won't help, so skip doing that. But if it's EBCDIC, we + * may have to look at each character below 256 if we have + * to convert to/from Unicode values */ + if ( has_above_latin1 #ifdef EBCDIC && (range_min > 255 || ! 
convert_unicode) #endif @@ -3202,8 +3225,7 @@ S_scan_const(pTHX_ char *start) if (!esc) in_charclass = TRUE; } - - else if (*s == ']' && PL_lex_inpat && in_charclass) { + else if (*s == ']' && PL_lex_inpat && in_charclass) { char *s1 = s-1; int esc = 0; while (s1 >= start && *s1-- == '\\') @@ -3396,38 +3418,55 @@ S_scan_const(pTHX_ char *start) } else { if (!has_utf8 && uv > 255) { - /* Might need to recode whatever we have accumulated so - * far if it contains any chars variant in utf8 or - * utf-ebcdic. */ - SvCUR_set(sv, d - SvPVX_const(sv)); - SvPOK_on(sv); - *d = '\0'; - /* See Note on sizing above. */ - sv_utf8_upgrade_flags_grow( - sv, - SV_GMAGIC|SV_FORCE_UTF8_UPGRADE - /* Above-latin1 in string - * implies no encoding */ - |SV_UTF8_NO_ENCODING, - UVCHR_SKIP(uv) + (STRLEN)(send - s) + 1); - d = SvPVX(sv) + SvCUR(sv); - has_utf8 = TRUE; + /* Here, 'uv' won't fit unless we convert to UTF-8. + * If we've only seen invariants so far, all we have to + * do is turn on the flag */ + if (utf8_variant_count == 0) { + SvUTF8_on(sv); + } + else { + SvCUR_set(sv, d - SvPVX_const(sv)); + SvPOK_on(sv); + *d = '\0'; + + sv_utf8_upgrade_flags_grow( + sv, + SV_GMAGIC|SV_FORCE_UTF8_UPGRADE, + + /* Since we're having to grow here, + * make sure we have enough room for + * this escape and a NUL, so the + * code immediately below won't have + * to actually grow again */ + UVCHR_SKIP(uv) + + (STRLEN)(send - s) + 1); + d = SvPVX(sv) + SvCUR(sv); + } + + has_above_latin1 = TRUE; + has_utf8 = TRUE; } - if (has_utf8) { + if (! has_utf8) { + *d++ = (char)uv; + utf8_variant_count++; + } + else { /* Usually, there will already be enough room in 'sv' * since such escapes are likely longer than any UTF-8 * sequence they can end up as. This isn't the case on * EBCDIC where \x{40000000} contains 12 bytes, and the * UTF-8 for it contains 14. And, we have to allow for * a trailing NUL. 
It probably can't happen on ASCII - * platforms, but be safe */ - const STRLEN needed = d - SvPVX(sv) + UVCHR_SKIP(uv) + * platforms, but be safe. See Note on sizing above. */ + const STRLEN needed = d - SvPVX(sv) + + UVCHR_SKIP(uv) + + (send - s) + 1; if (UNLIKELY(needed > SvLEN(sv))) { SvCUR_set(sv, d - SvPVX_const(sv)); - d = sv_grow(sv, needed) + SvCUR(sv); + d = SvCUR(sv) + SvGROW(sv, needed); } d = (char*)uvchr_to_utf8((U8*)d, uv); @@ -3438,9 +3477,6 @@ S_scan_const(pTHX_ char *start) (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); } - } - else { - *d++ = (char)uv; } } #ifdef EBCDIC @@ -3554,16 +3590,27 @@ S_scan_const(pTHX_ char *start) if (! has_utf8 && ( uv > 0xFF || PL_lex_inwhat != OP_TRANS)) { + /* See Note on sizing above. */ + const STRLEN extra = OFFUNISKIP(uv) + (send - e) + 1; + SvCUR_set(sv, d - SvPVX_const(sv)); SvPOK_on(sv); *d = '\0'; - /* See Note on sizing above. */ - sv_utf8_upgrade_flags_grow( - sv, - SV_GMAGIC|SV_FORCE_UTF8_UPGRADE, - OFFUNISKIP(uv) + (STRLEN)(send - e) + 1); - d = SvPVX(sv) + SvCUR(sv); + + if (utf8_variant_count == 0) { + SvUTF8_on(sv); + d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra); + } + else { + sv_utf8_upgrade_flags_grow( + sv, + SV_GMAGIC|SV_FORCE_UTF8_UPGRADE, + extra); + d = SvPVX(sv) + SvCUR(sv); + } + has_utf8 = TRUE; + has_above_latin1 = TRUE; } /* Add the (Unicode) code point to the output. */ @@ -3708,34 +3755,49 @@ S_scan_const(pTHX_ char *start) (int) (e + 1 - start), start)); goto end_backslash_N; } + + if (SvUTF8(res) && UTF8_IS_ABOVE_LATIN1(*str)) { + has_above_latin1 = TRUE; + } + } else if (! SvUTF8(res)) { /* Make sure \N{} return is UTF-8. This is because * \N{} implies Unicode semantics, and scalars have * to be in utf8 to guarantee those semantics; but * not needed in tr/// */ - sv_utf8_upgrade_flags(res, SV_UTF8_NO_ENCODING); + sv_utf8_upgrade_flags(res, 0); str = SvPV_const(res, len); } /* Upgrade destination to be utf8 if this new * component is */ if (! 
has_utf8 && SvUTF8(res)) { + /* See Note on sizing above. */ + const STRLEN extra = len + (send - s) + 1; + SvCUR_set(sv, d - SvPVX_const(sv)); SvPOK_on(sv); *d = '\0'; - /* See Note on sizing above. */ - sv_utf8_upgrade_flags_grow(sv, + + if (utf8_variant_count == 0) { + SvUTF8_on(sv); + d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra); + } + else { + sv_utf8_upgrade_flags_grow(sv, SV_GMAGIC|SV_FORCE_UTF8_UPGRADE, - len + (STRLEN)(send - s) + 1); - d = SvPVX(sv) + SvCUR(sv); + extra); + d = SvPVX(sv) + SvCUR(sv); + } has_utf8 = TRUE; } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */ /* See Note on sizing above. (NOTE: SvCUR() is not * set correctly here). */ + const STRLEN extra = len + (send - e) + 1; const STRLEN off = d - SvPVX_const(sv); - d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1); + d = off + SvGROW(sv, off + extra); } Copy(str, d, len, char); d += len; @@ -3799,12 +3861,17 @@ S_scan_const(pTHX_ char *start) * to/from UTF-8. * * If the input has the same representation in UTF-8 as not, it will be - * a single byte, and we don't care about UTF8ness; or if neither - * source nor output is UTF-8, just copy the byte */ - if (NATIVE_BYTE_IS_INVARIANT((U8)(*s)) || (! this_utf8 && ! has_utf8)) - { + * a single byte, and we don't care about UTF8ness; just copy the byte */ + if (NATIVE_BYTE_IS_INVARIANT((U8)(*s))) { *d++ = *s++; } + else if (! this_utf8 && ! has_utf8) { + /* If neither source nor output is UTF-8, is also a single byte, + * just copy it; but this byte counts should we later have to + * convert to UTF-8 */ + *d++ = *s++; + utf8_variant_count++; + } else if (this_utf8 && has_utf8) { /* Both UTF-8, can just copy */ const STRLEN len = UTF8SKIP(s); @@ -3821,23 +3888,34 @@ S_scan_const(pTHX_ char *start) const UV nextuv = (this_utf8) ? 
utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s); - const STRLEN need = UVCHR_SKIP(nextuv); + STRLEN need = UVCHR_SKIP(nextuv); + if (!has_utf8) { SvCUR_set(sv, d - SvPVX_const(sv)); SvPOK_on(sv); *d = '\0'; - /* See Note on sizing above. */ - sv_utf8_upgrade_flags_grow(sv, - SV_GMAGIC|SV_FORCE_UTF8_UPGRADE, - need + (STRLEN)(send - s) + 1); - d = SvPVX(sv) + SvCUR(sv); + + /* See Note on sizing above. */ + need += (STRLEN)(send - s) + 1; + + if (utf8_variant_count == 0) { + SvUTF8_on(sv); + d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + need); + } + else { + sv_utf8_upgrade_flags_grow(sv, + SV_GMAGIC|SV_FORCE_UTF8_UPGRADE, + need); + d = SvPVX(sv) + SvCUR(sv); + } has_utf8 = TRUE; } else if (need > len) { /* encoded value larger than old, may need extra space (NOTE: * SvCUR() is not set correctly here). See Note on sizing * above. */ + const STRLEN extra = need + (send - s) + 1; const STRLEN off = d - SvPVX_const(sv); - d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off; + d = off + SvGROW(sv, off + extra); } s += len; -- Perl5 Master Repository
