[perl.git] branch blead, updated. v5.21.6-75-gc7d2559

Karl Williamson Mon, 24 Nov 2014 12:44:28 -0800

In perl.git, the branch blead has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/c7d255944c0b238f9cec18e728822535d42a9ed2?hp=76f2ffcdf8c40f8ff5966aa85d388596131ff8fe>


- Log -----------------------------------------------------------------
commit c7d255944c0b238f9cec18e728822535d42a9ed2
Author: Karl Williamson <[email protected]>
Date:   Mon Nov 24 13:19:21 2014 -0700

    Make /[\N{}-\N{}]/ match Unicodely on EBCDIC
    
    This makes [\N{U+06}-\N{U+09}] match U+06, U+07, U+08, U+09 even on
    EBCDIC platforms, allowing one to write portable ranges.  For 1047
    EBCDIC this would match 0x2E, 0x2F, 0x16, and 0x05.
    
    Thanks to Yaroslave Kuzmin for finding a bug in an earlier incarnation
    of this patch.

M       pod/perlre.pod
M       pod/perlrecharclass.pod
M       regcomp.c

commit 22e7ef05c1f7a7fcd58d10d6e720579b9bbea728
Author: Karl Williamson <[email protected]>
Date:   Thu Nov 13 10:59:34 2014 -0700

    toke.c: Add comment

M       toke.c
-----------------------------------------------------------------------

Summary of changes:
 pod/perlre.pod          | 10 ++++--
 pod/perlrecharclass.pod | 18 +++++++++--
 regcomp.c               | 82 +++++++++++++++++++++++++++++++++----------------
 toke.c                  |  4 ++-
 4 files changed, 82 insertions(+), 32 deletions(-)

diff --git a/pod/perlre.pod b/pod/perlre.pod
index 891eb34..f11e5ff 100644
--- a/pod/perlre.pod
+++ b/pod/perlre.pod
@@ -2312,8 +2312,14 @@ Note also that the whole range idea is rather unportable between
 character sets--and even within character sets they may cause results
 you probably didn't expect.  A sound principle is to use only ranges
 that begin from and end at either alphabetics of equal case ([a-e],
-[A-E]), or digits ([0-9]).  Anything else is unsafe.  If in doubt,
-spell out the character sets in full.
+[A-E]), or digits ([0-9]).  Anything else is unsafe or unclear.  If in
+doubt, spell out the character sets in full.  Specifying the end points
+of the range using the C<\N{...}> syntax, using Unicode names or code
+points, makes the range portable, but still likely not easily
+understandable to someone reading the code.  For example,
+C<[\N{U+04}-\N{U+07}]> means to match the Unicode code points
+C<\N{U+04}>, C<\N{U+05}>, C<\N{U+06}>, and C<\N{U+07}>, whatever their
+native values may be on the platform.
 
 Characters may be specified using a metacharacter syntax much like that
 used in C: "\n" matches a newline, "\t" a tab, "\r" a carriage return,
diff --git a/pod/perlrecharclass.pod b/pod/perlrecharclass.pod
index c79c9a0..fb5868d 100644
--- a/pod/perlrecharclass.pod
+++ b/pod/perlrecharclass.pod
@@ -608,10 +608,22 @@ Examples:
              #  hyphen ('-'), or the letter 'm'.
  ['-?]       #  Matches any of the characters  '()*+,-./0123456789:;<=>?
              #  (But not on an EBCDIC platform).
-
-Perl guarantees that the ranges C<A-Z>, C<a-z>, C<0-9>, and any
+ [\N{APOSTROPHE}-\N{QUESTION MARK}]
+             #  Matches any of the characters  '()*+,-./0123456789:;<=>?
+             #  even on an EBCDIC platform.
+ [\N{U+27}-\N{U+3F}] # Same. (U+27 is "'", and U+3F is "?")
+
+As the final two examples above show, you can achieve portability to
+non-ASCII platforms by using the C<\N{...}> form for the range
+endpoints.  These indicate that the specified range is to be interpreted
+using Unicode values, so C<[\N{U+27}-\N{U+3F}]> means to match
+C<\N{U+27}>, C<\N{U+28}>, C<\N{U+29}>, ..., C<\N{U+3D}>, C<\N{U+3E}>,
+and C<\N{U+3F}>, whatever the native code point versions for those are.
+
+Perl also guarantees that the ranges C<A-Z>, C<a-z>, C<0-9>, and any
 subranges of these match what an English-only speaker would expect them
-to match.  That is, C<[A-Z]> matches the 26 ASCII uppercase letters;
+to match on any platform.  That is, C<[A-Z]> matches the 26 ASCII
+uppercase letters;
 C<[a-z]> matches the 26 lowercase letters; and C<[0-9]> matches the 10
 digits.  Subranges, like C<[h-k]>, match correspondingly, in this case
 just the four letters C<"h">, C<"i">, C<"j">, and C<"k">.  This is the
diff --git a/regcomp.c b/regcomp.c
index 442d0ba..85a142e 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -13742,6 +13742,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
     /* In a range, counts how many 0-2 of the ends of it came from literals,
      * not escapes.  Thus we can tell if 'A' was input vs \x{C1} */
     UV literal_endpoint = 0;
+
+    /* Is the range Unicode?  If so, on a platform whose native code points
+     * aren't 1-1 with Unicode (i.e. non-ASCII), each code point in it should
+     * be considered to be a Unicode value.  */
+    bool unicode_range = FALSE;
 #endif
     bool invert = FALSE;    /* Is this class to be complemented */
 
@@ -13947,8 +13952,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                     }
                     /* Here, is a single code point, and <value> contains it */
 #ifdef EBCDIC
-                    /* We consider named characters to be literal characters */
+                    /* We consider named characters to be literal characters,
+                     * and they are Unicode */
                     literal_endpoint++;
+                    unicode_range = TRUE;
 #endif
                 }
                 break;
@@ -14406,8 +14413,23 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
          * minus sign */
 
        if (range) {
+#ifdef EBCDIC
+            /* For unicode ranges, we have to test that the Unicode as opposed
+             * to the native values are not decreasing.  (Above 255, there is
+             * no difference between native and Unicode) */
+           if (unicode_range && prevvalue < 255 && value < 255) {
+                if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
+                    goto backwards_range;
+                }
+            }
+            else
+#endif
            if (prevvalue > value) /* b-a */ {
-               const int w = RExC_parse - rangebegin;
+               int w;
+#ifdef EBCDIC
+              backwards_range:
+#endif
+                w = RExC_parse - rangebegin;
                 vFAIL2utf8f(
                     "Invalid [] range \"%"UTF8f"\"",
                     UTF8fARG(UTF, w, rangebegin));
@@ -14542,32 +14564,40 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
             cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
                                                      prevvalue, value);
 #else
-            SV* this_range = _new_invlist(1);
-            _append_range_to_invlist(this_range, prevvalue, value);
-
-            /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
-             * If this range was specified using something like 'i-j', we want
-             * to include only the 'i' and the 'j', and not anything in
-             * between, so exclude non-ASCII, non-alphabetics from it.
-             * However, if the range was specified with something like
-             * [\x89-\x91] or [\x89-j], all code points within it should be
-             * included.  literal_endpoint==2 means both ends of the range used
-             * a literal character, not \x{foo} */
-           if (literal_endpoint == 2
-                && ((isLOWER_A(prevvalue) && isLOWER_A(value))
-                    || (isUPPER_A(prevvalue) && isUPPER_A(value))))
+            /* On non-ASCII platforms, for ranges that span all of 0..255, and
+             * ones that don't require special handling, we can just add the
+             * range like we do for ASCII platforms */
+            if ((UNLIKELY(prevvalue == 0) && value >= 255)
+                || ! (prevvalue < 256
+                      && (unicode_range
+                          || (literal_endpoint == 2
+                              && ((isLOWER_A(prevvalue) && isLOWER_A(value))
+                                  || (isUPPER_A(prevvalue)
+                                      && isUPPER_A(value)))))))
             {
-                _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII],
-                                      &this_range);
-
-                /* Since 'this_range' now only contains ascii, the intersection
-                 * of it with anything will still yield only ascii */
-                _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
-                                      &this_range);
+                cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
+                                                         prevvalue, value);
+            }
+            else {
+                /* Here, requires special handling.  This can be because it is
+                 * a range whose code points are considered to be Unicode, and
+                 * so must be individually translated into native, or because
+                 * it's a subrange of 'A-Z' or 'a-z' which each aren't
+                 * contiguous in EBCDIC, but we have defined them to include
+                 * only the "expected" upper or lower case ASCII alphabetics.
+                 * Subranges above 255 are the same in native and Unicode, so
+                 * can be added as a range */
+                U8 start = NATIVE_TO_LATIN1(prevvalue);
+                unsigned j;
+                U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
+                for (j = start; j <= end; j++) {
+                    cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
+                }
+                if (value > 255) {
+                    cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
+                                                             256, value);
+                }
             }
-            _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
-            literal_endpoint = 0;
-            SvREFCNT_dec_NN(this_range);
 #endif
         }
 
diff --git a/toke.c b/toke.c
index 059c463..9a01103 100644
--- a/toke.c
+++ b/toke.c
@@ -8588,7 +8588,9 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
  *
  *      Because all ASCII characters have the same representation whether
  *      encoded in UTF-8 or not, we can use the foo_A macros below and '\0' and
- *      '{' without knowing if is UTF-8 or not */
+ *      '{' without knowing if it is UTF-8 or not.
+ * EBCDIC already uses the rules that ASCII platforms will use after the
+ * deprecation cycle; see comment below about the deprecation. */
 #ifdef EBCDIC
 #   define VALID_LEN_ONE_IDENT(s, is_utf8)                                    \
     (isGRAPH_A(*(s)) || ((is_utf8)                                            \

--
Perl5 Master Repository

[perl.git] branch blead, updated. v5.21.6-75-gc7d2559

Reply via email to