In perl.git, the branch blead has been updated

<https://perl5.git.perl.org/perl.git/commitdiff/4f89f09f9b3db7f542b7029c55aad389a59c9a02?hp=d7bcd45a8b00fa6be02a62e097629473b2a9becc>

- Log -----------------------------------------------------------------
commit 4f89f09f9b3db7f542b7029c55aad389a59c9a02
Merge: d7bcd45a8b 2c03e801f9
Author: Karl Williamson <[email protected]>
Date:   Sun Oct 6 11:08:40 2019 -0600

    Merge branch 'Remove EBCDIC special handling' into blead
    
    It turns out that only a single number is needed to distinguish between
    basic UTF-8 and UTF-EBCDIC: the number of bits of real information that
    each continuation byte carries.  In UTF-8 it is 6 (2 bits reserved for
    syntax); in UTF-EBCDIC it is 5.
    
    Everything else stems from reasonable decisions based on this
    fundamental difference.  So all the other constants can be common
    between the two systems, using compile-time shifts and masks.
    
    For Perl's extended UTF-8-like encoding, another constant is needed,
    which is the number of continuation bytes appended when the start byte
    is 8 bits.  For both systems, that number is the minimum required to be
    able to encode a 64-bit integer. (There are other ways to extend the
    encoding, including some that are infinitely so.  But Perl chose to just
    append a fixed number of bytes, so it isn't extensible.  But it has the
    advantage of needing to rely only on the first byte to know how many
    more are coming.)
    
    This commit consolidates various constants that differed between the
    two systems, but were unnecessarily so.  There are other constants that
    remain that differ between the two files; these are for convenience.

commit 2c03e801f9133bacfe39d2a12decdd9d2b3b075a
Author: Karl Williamson <[email protected]>
Date:   Sun Oct 6 10:50:49 2019 -0600

    Make defn of UTF_IS_CONTINUED common
    
    This can be derived from other values, removing an EBCDIC dependency

commit ab2e28c2f2b8f2edf930448a1c0182a8bd4f469f
Author: Karl Williamson <[email protected]>
Date:   Wed Oct 2 20:37:17 2019 -0600

    Make defn of UVCHR_IS_INVARIANT common
    
    This can be derived from other values, removing an EBCDIC dependency

commit 2dc97505e86018c7ceba8c96fd84f477c8dd45d3
Author: Karl Williamson <[email protected]>
Date:   Wed Oct 2 18:08:32 2019 -0600

    Make defn of OFFUNI_IS_INVARIANT common
    
    This can be derived from other values, removing an EBCDIC dependency

commit 7c88d61e18cab1244ecd155556e1f0b3563a7e4a
Author: Karl Williamson <[email protected]>
Date:   Wed Oct 2 18:03:26 2019 -0600

    Make defn of UTF8_IS_DOWNGRADEABLE_START common
    
    This can be derived from other values, removing an EBCDIC dependency

commit 1df634280fbf565fc9e9ada123c12a82404aa817
Author: Karl Williamson <[email protected]>
Date:   Wed Oct 2 17:56:01 2019 -0600

    Make defn of UTF_IS_ABOVE_LATIN1 common
    
    This can be derived from other values, removing an EBCDIC dependency

commit 4bab39bc1904f776c12d31a54ff5abe06fc9c103
Author: Karl Williamson <[email protected]>
Date:   Sun Oct 6 10:50:12 2019 -0600

    Make defn of UTF8_IS_START common
    
    This can be derived from other values, removing an EBCDIC dependency

commit f4225fa0e24724a97c2ff1d4e608353ca1537506
Author: Karl Williamson <[email protected]>
Date:   Wed Oct 2 17:13:31 2019 -0600

    Make defn of UTF8_IS_CONTINUATION common
    
    This can be derived from other values, removing an EBCDIC dependency

commit 38f458ffd56c0eb9f5df18cb6693ca326a4b1374
Author: Karl Williamson <[email protected]>
Date:   Wed Oct 2 17:07:50 2019 -0600

    Make defn of UTF_CONTINUATION_MARK common
    
    This can be derived from other values, removing an EBCDIC dependency

commit 9f3cfb7a26dab519dbc83ef02bd3fbf084cb6fc3
Author: Karl Williamson <[email protected]>
Date:   Wed Oct 2 16:48:38 2019 -0600

    Make defn of UTF_IS_CONTINUATION_MASK common
    
    This variable can be defined from the same base in both UTF-8 and
    UTF-EBCDIC, and doing so eliminates an EBCDIC dependency.

-----------------------------------------------------------------------

Summary of changes:
 utf8.h      | 125 +++++++++++++++++++++++++++++++-----------------------------
 utfebcdic.h |  43 +--------------------
 2 files changed, 65 insertions(+), 103 deletions(-)

diff --git a/utf8.h b/utf8.h
index 472527c4a1..889324e587 100644
--- a/utf8.h
+++ b/utf8.h
@@ -272,67 +272,7 @@ Perl's extended UTF-8 means we can have start bytes up through FF, though any
 beginning with FF yields a code point that is too large for 32-bit ASCII
 platforms.  FF signals to use 13 bytes for the encoded character.  This breaks
 the paradigm that the number of leading bits gives how many total bytes there
-are in the character.
-
-=cut
-*/
-
-/* Is the representation of the Unicode code point 'cp' the same regardless of
- * being encoded in UTF-8 or not? */
-#define OFFUNI_IS_INVARIANT(cp)     isASCII(cp)
-
-/*
-=for apidoc Am|bool|UVCHR_IS_INVARIANT|UV cp
-
-Evaluates to 1 if the representation of code point C<cp> is the same whether or
-not it is encoded in UTF-8; otherwise evaluates to 0.  UTF-8 invariant
-characters can be copied as-is when converting to/from UTF-8, saving time.
-C<cp> is Unicode if above 255; otherwise is platform-native.
-
-=cut
- */
-
-#define UVCHR_IS_INVARIANT(cp)      OFFUNI_IS_INVARIANT(cp)
-
-/* This defines the bits that are to be in the continuation bytes of a multi-byte
- * UTF-8 encoded character that mark it is a continuation byte. */
-#define UTF_CONTINUATION_MARK          0x80
-
-/* Misleadingly named: is the UTF8-encoded byte 'c' part of a variant sequence
- * in UTF-8?  This is the inverse of UTF8_IS_INVARIANT.  The |0 makes sure this
- * isn't mistakenly called with a ptr argument */
-#define UTF8_IS_CONTINUED(c)  (__ASSERT_(FITS_IN_8_BITS(c))                 \
-                               ((U8)((c) | 0)) &  UTF_CONTINUATION_MARK)
-
-/* Is the byte 'c' the first byte of a multi-byte UTF8-8 encoded sequence?
- * This doesn't catch invariants (they are single-byte).  It also excludes the
- * illegal overlong sequences that begin with C0 and C1.  The |0 makes sure
- * this isn't mistakenly called with a ptr argument */
-#define UTF8_IS_START(c)      (__ASSERT_(FITS_IN_8_BITS(c))                 \
-                               ((U8)((c) | 0)) >= 0xc2)
-
-/* For use in UTF8_IS_CONTINUATION() below */
-#define UTF_IS_CONTINUATION_MASK    0xC0
-
-/* Is the byte 'c' part of a multi-byte UTF8-8 encoded sequence, and not the
- * first byte thereof?  The |0 makes sure this isn't mistakenly called with a
- * ptr argument */
-#define UTF8_IS_CONTINUATION(c)     (__ASSERT_(FITS_IN_8_BITS(c))           \
-     (((U8)((c) | 0)) & UTF_IS_CONTINUATION_MASK) == UTF_CONTINUATION_MARK)
-
-/* Is the UTF8-encoded byte 'c' the first byte of a two byte sequence?  Use
- * UTF8_IS_NEXT_CHAR_DOWNGRADEABLE() instead if the input isn't known to
- * be well-formed.  Masking with 0xfe allows the low bit to be 0 or 1; thus
- * this matches 0xc[23].  The |0 makes sure this isn't mistakenly called with a
- * ptr argument */
-#define UTF8_IS_DOWNGRADEABLE_START(c) (__ASSERT_(FITS_IN_8_BITS(c))       \
-                                         (((U8)((c) | 0)) & 0xfe) == 0xc2)
-
-/* Is the UTF8-encoded byte 'c' the first byte of a sequence of bytes that
- * represent a code point > 255?  The |0 makes sure this isn't mistakenly
- * called with a ptr argument */
-#define UTF8_IS_ABOVE_LATIN1(c)     (__ASSERT_(FITS_IN_8_BITS(c))           \
-                                     ((U8)((c) | 0)) >= 0xc4)
+are in the character. */
 
 /* This is the number of low-order bits a continuation byte in a UTF-8 encoded
  * sequence contributes to the specification of the code point.  In the bit
@@ -360,6 +300,39 @@ C<cp> is Unicode if above 255; otherwise is platform-native.
  * UTF-8, 0x1F in UTF-EBCDIC. */
 #define UTF_CONTINUATION_MASK  ((U8) ((1U << UTF_ACCUMULATION_SHIFT) - 1))
 
+/* For use in UTF8_IS_CONTINUATION().  This turns out to be 0xC0 in UTF-8,
+ * E0 in UTF-EBCDIC */
+#define UTF_IS_CONTINUATION_MASK    ((U8) (0xFF << UTF_ACCUMULATION_SHIFT))
+
+/* This defines the bits that are to be in the continuation bytes of a
+ * multi-byte UTF-8 encoded character that mark it is a continuation byte.
+ * This turns out to be 0x80 in UTF-8, 0xA0 in UTF-EBCDIC.  (khw doesn't know
+ * the underlying reason that B0 works here) */
+#define UTF_CONTINUATION_MARK       (UTF_IS_CONTINUATION_MASK & 0xB0)
+
+/* Is the byte 'c' part of a multi-byte UTF8-8 encoded sequence, and not the
+ * first byte thereof? */
+#define UTF8_IS_CONTINUATION(c)     (__ASSERT_(FITS_IN_8_BITS(c))           \
+            (((NATIVE_UTF8_TO_I8(c) & UTF_IS_CONTINUATION_MASK)             \
+                                                == UTF_CONTINUATION_MARK)))
+
+/* Is the representation of the Unicode code point 'cp' the same regardless of
+ * being encoded in UTF-8 or not? This is a fundamental property of
+ * UTF-8,EBCDIC */
+#define OFFUNI_IS_INVARIANT(c) (((WIDEST_UTYPE)(c)) < UTF_CONTINUATION_MARK)
+
+/*
+=for apidoc Am|bool|UVCHR_IS_INVARIANT|UV cp
+
+Evaluates to 1 if the representation of code point C<cp> is the same whether or
+not it is encoded in UTF-8; otherwise evaluates to 0.  UTF-8 invariant
+characters can be copied as-is when converting to/from UTF-8, saving time.
+C<cp> is Unicode if above 255; otherwise is platform-native.
+
+=cut
+ */
+#define UVCHR_IS_INVARIANT(cp)  (OFFUNI_IS_INVARIANT(NATIVE_TO_UNI(cp)))
+
 /* Internal macro to be used only in this file to aid in constructing other
  * publicly accessible macros.
  * The number of bytes required to express this uv in UTF-8, for just those
@@ -418,6 +391,31 @@ encoded as UTF-8.  C<cp> is a native (ASCII or EBCDIC) code point if less than
  */
 #define UVCHR_SKIP(uv) ( UVCHR_IS_INVARIANT(uv) ? 1 : __BASE_UNI_SKIP(uv))
 
+#define UTF_MIN_START_BYTE                                                  \
+     ((UTF_CONTINUATION_MARK >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))
+
+/* Is the byte 'c' the first byte of a multi-byte UTF8-8 encoded sequence?
+ * This doesn't catch invariants (they are single-byte).  It also excludes the
+ * illegal overlong sequences that begin with C0 and C1 on ASCII platforms, and
+ * C0-C4 I8 start bytes on EBCDIC ones */
+#define UTF8_IS_START(c)    (__ASSERT_(FITS_IN_8_BITS(c))                   \
+                             (NATIVE_UTF8_TO_I8(c) >= UTF_MIN_START_BYTE))
+
+#define UTF_MIN_ABOVE_LATIN1_BYTE                                           \
+                    ((0x100 >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))
+
+/* Is the UTF8-encoded byte 'c' the first byte of a sequence of bytes that
+ * represent a code point > 255? */
+#define UTF8_IS_ABOVE_LATIN1(c)     (__ASSERT_(FITS_IN_8_BITS(c))           \
+                        (NATIVE_UTF8_TO_I8(c) >= UTF_MIN_ABOVE_LATIN1_BYTE))
+
+/* Is the UTF8-encoded byte 'c' the first byte of a two byte sequence?  Use
+ * UTF8_IS_NEXT_CHAR_DOWNGRADEABLE() instead if the input isn't known to
+ * be well-formed. */
+#define UTF8_IS_DOWNGRADEABLE_START(c) (__ASSERT_(FITS_IN_8_BITS(c))       \
+                inRANGE(NATIVE_UTF8_TO_I8(c),                               \
+                        UTF_MIN_START_BYTE, UTF_MIN_ABOVE_LATIN1_BYTE - 1))
+
 /* The largest code point representable by two UTF-8 bytes on this platform.
  * As explained in the comments for __COMMON_UNI_SKIP, 32 start bytes with
  * UTF_ACCUMULATION_SHIFT bits of information each */
@@ -585,6 +583,11 @@ with a ptr argument.
  * above show, doesn't matter as to its implementation */
 #define NATIVE_BYTE_IS_INVARIANT(c)    UVCHR_IS_INVARIANT(c)
 
+/* Misleadingly named: is the UTF8-encoded byte 'c' part of a variant sequence
+ * in UTF-8?  This is the inverse of UTF8_IS_INVARIANT. */
+#define UTF8_IS_CONTINUED(c)  (__ASSERT_(FITS_IN_8_BITS(c))                 \
+                               (! UTF8_IS_INVARIANT(c)))
+
 /* The macros in the next 4 sets are used to generate the two utf8 or utfebcdic
  * bytes from an ordinal that is known to fit into exactly two (not one) bytes;
  * it must be less than 0x3FF to work across both encodings. */
diff --git a/utfebcdic.h b/utfebcdic.h
index d8278a1e72..4a66637bbb 100644
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -202,54 +202,13 @@ possible to UTF-8-encode a single code point in different ways, but that is
 explicitly forbidden, and the shortest possible encoding should always be used
 (and that is what Perl does). */
 
-/* This is a fundamental property of UTF-EBCDIC */
-#define OFFUNI_IS_INVARIANT(c) (((UV)(c)) <  0xA0)
-
-/* It turns out that on EBCDIC platforms, the invariants are the characters
- * that have ASCII equivalents, plus the C1 controls.  Since the C0 controls
- * and DELETE are ASCII, this is the same as: (isASCII(uv) || isCNTRL_L1(uv))
- * */
-#define UVCHR_IS_INVARIANT(uv) cBOOL(FITS_IN_8_BITS(uv)                        
\
-   && (PL_charclass[(U8) (uv)] & (_CC_mask(_CC_ASCII) | _CC_mask(_CC_CNTRL))))
-
-/* UTF-EBCDIC semantic macros - We used to transform back into I8 and then
- * compare, but now only have to do a single lookup by using a bit in
- * l1_char_class_tab.h.
- * Comments as to the meaning of each are given at their corresponding utf8.h
+/* Comments as to the meaning of each are given at their corresponding utf8.h
  * definitions. */
 
-#define UTF8_IS_START(c)               _generic_isCC(c, _CC_UTF8_IS_START)
-
-#define UTF_IS_CONTINUATION_MASK    0xE0
-
-#define UTF8_IS_CONTINUATION(c)                _generic_isCC(c, _CC_UTF8_IS_CONTINUATION)
-
-/* The above instead could be written as this:
-#define UTF8_IS_CONTINUATION(c)                                                
 \
-            (((NATIVE_UTF8_TO_I8(c) & UTF_IS_CONTINUATION_MASK)                
 \
-                                                == UTF_CONTINUATION_MARK)
- */
-
-/* Equivalent to ! UVCHR_IS_INVARIANT(c) */
-#define UTF8_IS_CONTINUED(c)           cBOOL(FITS_IN_8_BITS(c)                 
\
-   && ! (PL_charclass[(U8) (c)] & (_CC_mask(_CC_ASCII) | _CC_mask(_CC_CNTRL))))
-
-#define UTF8_IS_DOWNGRADEABLE_START(c)   _generic_isCC(c,                      
 \
-                                              _CC_UTF8_IS_DOWNGRADEABLE_START)
-
-/* Equivalent to (UTF8_IS_START(c) && ! UTF8_IS_DOWNGRADEABLE_START(c))
- * Makes sure that the START bit is set and the DOWNGRADEABLE bit isn't */
-#define UTF8_IS_ABOVE_LATIN1(c) cBOOL(FITS_IN_8_BITS(c)                        
 \
-  && ((PL_charclass[(U8) (c)] & ( _CC_mask(_CC_UTF8_IS_START)                  
 \
-                                 |_CC_mask(_CC_UTF8_IS_DOWNGRADEABLE_START)))  
 \
-                        == _CC_mask(_CC_UTF8_IS_START)))
-
 #define isUTF8_POSSIBLY_PROBLEMATIC(c)                                         
 \
                 _generic_isCC(c, _CC_UTF8_START_BYTE_IS_FOR_AT_LEAST_SURROGATE)
 
-#define UTF_CONTINUATION_MARK          0xA0
 #define UTF_ACCUMULATION_SHIFT         5
-
 /* ^? is defined to be APC on EBCDIC systems.  See the definition of toCTRL()
  * for more */
 #define QUESTION_MARK_CTRL   LATIN1_TO_NATIVE(0x9F)

-- 
Perl5 Master Repository

Reply via email to