In perl.git, the branch blead has been updated <http://perl5.git.perl.org/perl.git/commitdiff/c7d255944c0b238f9cec18e728822535d42a9ed2?hp=76f2ffcdf8c40f8ff5966aa85d388596131ff8fe>
- Log ----------------------------------------------------------------- commit c7d255944c0b238f9cec18e728822535d42a9ed2 Author: Karl Williamson <[email protected]> Date: Mon Nov 24 13:19:21 2014 -0700 Make /[\N{}-\N{}]/ match Unicodely on EBCDIC This makes [\N{U+06}-\N{U+09}] match U+06, U+07, U+08, U+09 even on EBCDIC platforms, allowing one to write portable ranges. For 1047 EBCDIC this would match 0x2E, 0x2F, 0x16, and 0x05. Thanks to Yaroslave Kuzmin for finding a bug in an earlier incarnation of this patch. M pod/perlre.pod M pod/perlrecharclass.pod M regcomp.c commit 22e7ef05c1f7a7fcd58d10d6e720579b9bbea728 Author: Karl Williamson <[email protected]> Date: Thu Nov 13 10:59:34 2014 -0700 toke.c: Add comment M toke.c ----------------------------------------------------------------------- Summary of changes: pod/perlre.pod | 10 ++++-- pod/perlrecharclass.pod | 18 +++++++++-- regcomp.c | 82 +++++++++++++++++++++++++++++++++---------------- toke.c | 4 ++- 4 files changed, 82 insertions(+), 32 deletions(-) diff --git a/pod/perlre.pod b/pod/perlre.pod index 891eb34..f11e5ff 100644 --- a/pod/perlre.pod +++ b/pod/perlre.pod @@ -2312,8 +2312,14 @@ Note also that the whole range idea is rather unportable between character sets--and even within character sets they may cause results you probably didn't expect. A sound principle is to use only ranges that begin from and end at either alphabetics of equal case ([a-e], -[A-E]), or digits ([0-9]). Anything else is unsafe. If in doubt, -spell out the character sets in full. +[A-E]), or digits ([0-9]). Anything else is unsafe or unclear. If in +doubt, spell out the character sets in full. Specifying the end points +of the range using the C<\N{...}> syntax, using Unicode names or code +points makes the range portable, but still likely not easily +understandable to someone reading the code. 
For example, +C<[\N{U+04}-\N{U+07}]> means to match the Unicode code points +C<\N{U+04}>, C<\N{U+05}>, C<\N{U+06}>, and C<\N{U+07}>, whatever their +native values may be on the platform. Characters may be specified using a metacharacter syntax much like that used in C: "\n" matches a newline, "\t" a tab, "\r" a carriage return, diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod index c79c9a0..fb5868d 100644 --- a/pod/perlrecharclass.pod +++ b/pod/perlrecharclass.pod @@ -608,10 +608,22 @@ Examples: # hyphen ('-'), or the letter 'm'. ['-?] # Matches any of the characters '()*+,-./0123456789:;<=>? # (But not on an EBCDIC platform). - -Perl guarantees that the ranges C<A-Z>, C<a-z>, C<0-9>, and any + [\N{APOSTROPHE}-\N{QUESTION MARK}] + # Matches any of the characters '()*+,-./0123456789:;<=>? + # even on an EBCDIC platform. + [\N{U+27}-\N{U+3F}] # Same. (U+27 is "'", and U+3F is "?") + +As the final two examples above show, you can achieve portability to +non-ASCII platforms by using the C<\N{...}> form for the range +endpoints. These indicate that the specified range is to be interpreted +using Unicode values, so C<[\N{U+27}-\N{U+3F}]> means to match +C<\N{U+27}>, C<\N{U+28}>, C<\N{U+29}>, ..., C<\N{U+3D}>, C<\N{U+3E}>, +and C<\N{U+3F}>, whatever the native code point versions for those are. + +Perl also guarantees that the ranges C<A-Z>, C<a-z>, C<0-9>, and any subranges of these match what an English-only speaker would expect them -to match. That is, C<[A-Z]> matches the 26 ASCII uppercase letters; +to match on any platform. That is, C<[A-Z]> matches the 26 ASCII +uppercase letters; C<[a-z]> matches the 26 lowercase letters; and C<[0-9]> matches the 10 digits. Subranges, like C<[h-k]>, match correspondingly, in this case just the four letters C<"h">, C<"i">, C<"j">, and C<"k">.
This is the diff --git a/regcomp.c b/regcomp.c index 442d0ba..85a142e 100644 --- a/regcomp.c +++ b/regcomp.c @@ -13742,6 +13742,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* In a range, counts how many 0-2 of the ends of it came from literals, * not escapes. Thus we can tell if 'A' was input vs \x{C1} */ UV literal_endpoint = 0; + + /* Is the range unicode? which means on a platform that isn't 1-1 native + * to Unicode (i.e. non-ASCII), each code point in it should be considered + * to be a Unicode value. */ + bool unicode_range = FALSE; #endif bool invert = FALSE; /* Is this class to be complemented */ @@ -13947,8 +13952,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } /* Here, is a single code point, and <value> contains it */ #ifdef EBCDIC - /* We consider named characters to be literal characters */ + /* We consider named characters to be literal characters, + * and they are Unicode */ literal_endpoint++; + unicode_range = TRUE; #endif } break; @@ -14406,8 +14413,23 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * minus sign */ if (range) { +#ifdef EBCDIC + /* For unicode ranges, we have to test that the Unicode as opposed + * to the native values are not decreasing. 
(Above 255, there + * is no difference between native and Unicode) */ + if (unicode_range && prevvalue < 255 && value < 255) { + if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) { + goto backwards_range; + } + } + else +#endif if (prevvalue > value) /* b-a */ { - const int w = RExC_parse - rangebegin; + int w; +#ifdef EBCDIC + backwards_range: +#endif + w = RExC_parse - rangebegin; vFAIL2utf8f( "Invalid [] range \"%"UTF8f"\"", UTF8fARG(UTF, w, rangebegin)); @@ -14542,32 +14564,40 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, cp_foldable_list = _add_range_to_invlist(cp_foldable_list, prevvalue, value); #else - SV* this_range = _new_invlist(1); - _append_range_to_invlist(this_range, prevvalue, value); - - /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous. - * If this range was specified using something like 'i-j', we want - * to include only the 'i' and the 'j', and not anything in - * between, so exclude non-ASCII, non-alphabetics from it. - * However, if the range was specified with something like - * [\x89-\x91] or [\x89-j], all code points within it should be - * included. literal_endpoint==2 means both ends of the range used - * a literal character, not \x{foo} */ - if (literal_endpoint == 2 - && ((isLOWER_A(prevvalue) && isLOWER_A(value)) - || (isUPPER_A(prevvalue) && isUPPER_A(value)))) + /* On non-ASCII platforms, for ranges that span all of 0..255, and + * ones that don't require special handling, we can just add the + * range like we do for ASCII platforms */ + if ((UNLIKELY(prevvalue == 0) && value >= 255) + || ! 
(prevvalue < 256 + && (unicode_range + || (literal_endpoint == 2 + && ((isLOWER_A(prevvalue) && isLOWER_A(value)) + || (isUPPER_A(prevvalue) + && isUPPER_A(value))))))) { - _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII], - &this_range); - - /* Since 'this_range' now only contains ascii, the intersection - * of it with anything will still yield only ascii */ - _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA], - &this_range); + cp_foldable_list = _add_range_to_invlist(cp_foldable_list, + prevvalue, value); + } + else { + /* Here, requires special handling. This can be because it is + * a range whose code points are considered to be Unicode, and + * so must be individually translated into native, or because + * it's a subrange of 'A-Z' or 'a-z' which each aren't + * contiguous in EBCDIC, but we have defined them to include + * only the "expected" upper or lower case ASCII alphabetics. + * Subranges above 255 are the same in native and Unicode, so + * can be added as a range */ + U8 start = NATIVE_TO_LATIN1(prevvalue); + unsigned j; + U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255; + for (j = start; j <= end; j++) { + cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j)); + } + if (value > 255) { + cp_foldable_list = _add_range_to_invlist(cp_foldable_list, + 256, value); + } } - _invlist_union(cp_foldable_list, this_range, &cp_foldable_list); - literal_endpoint = 0; - SvREFCNT_dec_NN(this_range); #endif } diff --git a/toke.c b/toke.c index 059c463..9a01103 100644 --- a/toke.c +++ b/toke.c @@ -8588,7 +8588,9 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni) * * Because all ASCII characters have the same representation whether * encoded in UTF-8 or not, we can use the foo_A macros below and '\0' and - * '{' without knowing if is UTF-8 or not */ + * '{' without knowing if is UTF-8 or not. 
+ * EBCDIC already uses the rules that ASCII platforms will use after the + * deprecation cycle; see comment below about the deprecation. */ #ifdef EBCDIC # define VALID_LEN_ONE_IDENT(s, is_utf8) \ (isGRAPH_A(*(s)) || ((is_utf8) \ -- Perl5 Master Repository
