In perl.git, the branch blead has been updated <https://perl5.git.perl.org/perl.git/commitdiff/4f89f09f9b3db7f542b7029c55aad389a59c9a02?hp=d7bcd45a8b00fa6be02a62e097629473b2a9becc>
- Log ----------------------------------------------------------------- commit 4f89f09f9b3db7f542b7029c55aad389a59c9a02 Merge: d7bcd45a8b 2c03e801f9 Author: Karl Williamson <[email protected]> Date: Sun Oct 6 11:08:40 2019 -0600 Merge branch 'Remove EBCDCIC special handling' into blead It turns out that only a single number is needed to distinguish between basic UTF-8 and UTF-EBCDIC. And that is the number of bits of real information in each continuation byte. In UTF-8 it is 6 (2 bits reserved for syntax); in UTF-EBCDIC it is 5. Everything else stems from reasonable decisions based on this fundamental difference. So all the other constants can be common between the two systems, using compile-time shifts and masks. For Perl's extended UTF-8-like encoding, another constant is needed, which is the number of continuation bytes appended when the start byte is 8 bits. For both systems, that number is the minimum required to be able to encode a 64-bit integer. (There are other ways to extend the encoding, including some that are infinitely so. But Perl chose to just append a fixed number of bytes, so it isn't extensible. But it has the advantage of needing to rely only on the first byte to know how many more are coming.) This commit consolidates various constants that differed between the two systems, but were unnecessarily so. There are other constants that remain that differ between the two files; these are for convenience. 
commit 2c03e801f9133bacfe39d2a12decdd9d2b3b075a Author: Karl Williamson <[email protected]> Date: Sun Oct 6 10:50:49 2019 -0600 Make defn of UTF_IS_CONTINUED common This can be derived from other values, removing an EBCDIC dependency commit ab2e28c2f2b8f2edf930448a1c0182a8bd4f469f Author: Karl Williamson <[email protected]> Date: Wed Oct 2 20:37:17 2019 -0600 Make defn of UVCHR_IS_INVARIANT common This can be derived from other values, removing an EBCDIC dependency commit 2dc97505e86018c7ceba8c96fd84f477c8dd45d3 Author: Karl Williamson <[email protected]> Date: Wed Oct 2 18:08:32 2019 -0600 Make defn of OFFUNI_IS_INVARIANT common This can be derived from other values, removing an EBCDIC dependency commit 7c88d61e18cab1244ecd155556e1f0b3563a7e4a Author: Karl Williamson <[email protected]> Date: Wed Oct 2 18:03:26 2019 -0600 Make defn of UTF8_IS_DOWNGRADEABLE_START common This can be derived from other values, removing an EBCDIC dependency commit 1df634280fbf565fc9e9ada123c12a82404aa817 Author: Karl Williamson <[email protected]> Date: Wed Oct 2 17:56:01 2019 -0600 Make defn of UTF_IS_ABOVE_LATIN1 common This can be derived from other values, removing an EBCDIC dependency commit 4bab39bc1904f776c12d31a54ff5abe06fc9c103 Author: Karl Williamson <[email protected]> Date: Sun Oct 6 10:50:12 2019 -0600 Make defn of UTF8_IS_START common This can be derived from other values, removing an EBCDIC dependency commit f4225fa0e24724a97c2ff1d4e608353ca1537506 Author: Karl Williamson <[email protected]> Date: Wed Oct 2 17:13:31 2019 -0600 Make defn of UTF8_IS_CONTINUATION common This can be derived from other values, removing an EBCDIC dependency commit 38f458ffd56c0eb9f5df18cb6693ca326a4b1374 Author: Karl Williamson <[email protected]> Date: Wed Oct 2 17:07:50 2019 -0600 Make defn of UTF_CONTINUATION_MARK common This can be derived from other values, removing an EBCDIC dependency commit 9f3cfb7a26dab519dbc83ef02bd3fbf084cb6fc3 Author: Karl Williamson <[email protected]> Date: Wed 
Oct 2 16:48:38 2019 -0600 Make defn of UTF_IS_CONTINUATION_MASK common This variable can be defined from the same base in both UTF-8 and UTF-EBCDIC, and doing so eliminates an EBCDIC dependency. ----------------------------------------------------------------------- Summary of changes: utf8.h | 125 +++++++++++++++++++++++++++++++----------------------------- utfebcdic.h | 43 +-------------------- 2 files changed, 65 insertions(+), 103 deletions(-) diff --git a/utf8.h b/utf8.h index 472527c4a1..889324e587 100644 --- a/utf8.h +++ b/utf8.h @@ -272,67 +272,7 @@ Perl's extended UTF-8 means we can have start bytes up through FF, though any beginning with FF yields a code point that is too large for 32-bit ASCII platforms. FF signals to use 13 bytes for the encoded character. This breaks the paradigm that the number of leading bits gives how many total bytes there -are in the character. - -=cut -*/ - -/* Is the representation of the Unicode code point 'cp' the same regardless of - * being encoded in UTF-8 or not? */ -#define OFFUNI_IS_INVARIANT(cp) isASCII(cp) - -/* -=for apidoc Am|bool|UVCHR_IS_INVARIANT|UV cp - -Evaluates to 1 if the representation of code point C<cp> is the same whether or -not it is encoded in UTF-8; otherwise evaluates to 0. UTF-8 invariant -characters can be copied as-is when converting to/from UTF-8, saving time. -C<cp> is Unicode if above 255; otherwise is platform-native. - -=cut - */ - -#define UVCHR_IS_INVARIANT(cp) OFFUNI_IS_INVARIANT(cp) - -/* This defines the bits that are to be in the continuation bytes of a multi-byte - * UTF-8 encoded character that mark it is a continuation byte. */ -#define UTF_CONTINUATION_MARK 0x80 - -/* Misleadingly named: is the UTF8-encoded byte 'c' part of a variant sequence - * in UTF-8? This is the inverse of UTF8_IS_INVARIANT. 
The |0 makes sure this - * isn't mistakenly called with a ptr argument */ -#define UTF8_IS_CONTINUED(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ - ((U8)((c) | 0)) & UTF_CONTINUATION_MARK) - -/* Is the byte 'c' the first byte of a multi-byte UTF8-8 encoded sequence? - * This doesn't catch invariants (they are single-byte). It also excludes the - * illegal overlong sequences that begin with C0 and C1. The |0 makes sure - * this isn't mistakenly called with a ptr argument */ -#define UTF8_IS_START(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ - ((U8)((c) | 0)) >= 0xc2) - -/* For use in UTF8_IS_CONTINUATION() below */ -#define UTF_IS_CONTINUATION_MASK 0xC0 - -/* Is the byte 'c' part of a multi-byte UTF8-8 encoded sequence, and not the - * first byte thereof? The |0 makes sure this isn't mistakenly called with a - * ptr argument */ -#define UTF8_IS_CONTINUATION(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ - (((U8)((c) | 0)) & UTF_IS_CONTINUATION_MASK) == UTF_CONTINUATION_MARK) - -/* Is the UTF8-encoded byte 'c' the first byte of a two byte sequence? Use - * UTF8_IS_NEXT_CHAR_DOWNGRADEABLE() instead if the input isn't known to - * be well-formed. Masking with 0xfe allows the low bit to be 0 or 1; thus - * this matches 0xc[23]. The |0 makes sure this isn't mistakenly called with a - * ptr argument */ -#define UTF8_IS_DOWNGRADEABLE_START(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ - (((U8)((c) | 0)) & 0xfe) == 0xc2) - -/* Is the UTF8-encoded byte 'c' the first byte of a sequence of bytes that - * represent a code point > 255? The |0 makes sure this isn't mistakenly - * called with a ptr argument */ -#define UTF8_IS_ABOVE_LATIN1(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ - ((U8)((c) | 0)) >= 0xc4) +are in the character. */ /* This is the number of low-order bits a continuation byte in a UTF-8 encoded * sequence contributes to the specification of the code point. In the bit @@ -360,6 +300,39 @@ C<cp> is Unicode if above 255; otherwise is platform-native. * UTF-8, 0x1F in UTF-EBCDIC. 
*/ #define UTF_CONTINUATION_MASK ((U8) ((1U << UTF_ACCUMULATION_SHIFT) - 1)) +/* For use in UTF8_IS_CONTINUATION(). This turns out to be 0xC0 in UTF-8, + * E0 in UTF-EBCDIC */ +#define UTF_IS_CONTINUATION_MASK ((U8) (0xFF << UTF_ACCUMULATION_SHIFT)) + +/* This defines the bits that are to be in the continuation bytes of a + * multi-byte UTF-8 encoded character that mark it is a continuation byte. + * This turns out to be 0x80 in UTF-8, 0xA0 in UTF-EBCDIC. (khw doesn't know + * the underlying reason that B0 works here) */ +#define UTF_CONTINUATION_MARK (UTF_IS_CONTINUATION_MASK & 0xB0) + +/* Is the byte 'c' part of a multi-byte UTF8-8 encoded sequence, and not the + * first byte thereof? */ +#define UTF8_IS_CONTINUATION(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ + (((NATIVE_UTF8_TO_I8(c) & UTF_IS_CONTINUATION_MASK) \ + == UTF_CONTINUATION_MARK))) + +/* Is the representation of the Unicode code point 'cp' the same regardless of + * being encoded in UTF-8 or not? This is a fundamental property of + * UTF-8,EBCDIC */ +#define OFFUNI_IS_INVARIANT(c) (((WIDEST_UTYPE)(c)) < UTF_CONTINUATION_MARK) + +/* +=for apidoc Am|bool|UVCHR_IS_INVARIANT|UV cp + +Evaluates to 1 if the representation of code point C<cp> is the same whether or +not it is encoded in UTF-8; otherwise evaluates to 0. UTF-8 invariant +characters can be copied as-is when converting to/from UTF-8, saving time. +C<cp> is Unicode if above 255; otherwise is platform-native. + +=cut + */ +#define UVCHR_IS_INVARIANT(cp) (OFFUNI_IS_INVARIANT(NATIVE_TO_UNI(cp))) + /* Internal macro to be used only in this file to aid in constructing other * publicly accessible macros. * The number of bytes required to express this uv in UTF-8, for just those @@ -418,6 +391,31 @@ encoded as UTF-8. C<cp> is a native (ASCII or EBCDIC) code point if less than */ #define UVCHR_SKIP(uv) ( UVCHR_IS_INVARIANT(uv) ? 
1 : __BASE_UNI_SKIP(uv)) +#define UTF_MIN_START_BYTE \ + ((UTF_CONTINUATION_MARK >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2)) + +/* Is the byte 'c' the first byte of a multi-byte UTF8-8 encoded sequence? + * This doesn't catch invariants (they are single-byte). It also excludes the + * illegal overlong sequences that begin with C0 and C1 on ASCII platforms, and + * C0-C4 I8 start bytes on EBCDIC ones */ +#define UTF8_IS_START(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ + (NATIVE_UTF8_TO_I8(c) >= UTF_MIN_START_BYTE)) + +#define UTF_MIN_ABOVE_LATIN1_BYTE \ + ((0x100 >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2)) + +/* Is the UTF8-encoded byte 'c' the first byte of a sequence of bytes that + * represent a code point > 255? */ +#define UTF8_IS_ABOVE_LATIN1(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ + (NATIVE_UTF8_TO_I8(c) >= UTF_MIN_ABOVE_LATIN1_BYTE)) + +/* Is the UTF8-encoded byte 'c' the first byte of a two byte sequence? Use + * UTF8_IS_NEXT_CHAR_DOWNGRADEABLE() instead if the input isn't known to + * be well-formed. */ +#define UTF8_IS_DOWNGRADEABLE_START(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ + inRANGE(NATIVE_UTF8_TO_I8(c), \ + UTF_MIN_START_BYTE, UTF_MIN_ABOVE_LATIN1_BYTE - 1)) + /* The largest code point representable by two UTF-8 bytes on this platform. * As explained in the comments for __COMMON_UNI_SKIP, 32 start bytes with * UTF_ACCUMULATION_SHIFT bits of information each */ @@ -585,6 +583,11 @@ with a ptr argument. * above show, doesn't matter as to its implementation */ #define NATIVE_BYTE_IS_INVARIANT(c) UVCHR_IS_INVARIANT(c) +/* Misleadingly named: is the UTF8-encoded byte 'c' part of a variant sequence + * in UTF-8? This is the inverse of UTF8_IS_INVARIANT. */ +#define UTF8_IS_CONTINUED(c) (__ASSERT_(FITS_IN_8_BITS(c)) \ + (! UTF8_IS_INVARIANT(c))) + /* The macros in the next 4 sets are used to generate the two utf8 or utfebcdic * bytes from an ordinal that is known to fit into exactly two (not one) bytes; * it must be less than 0x3FF to work across both encodings. 
*/ diff --git a/utfebcdic.h b/utfebcdic.h index d8278a1e72..4a66637bbb 100644 --- a/utfebcdic.h +++ b/utfebcdic.h @@ -202,54 +202,13 @@ possible to UTF-8-encode a single code point in different ways, but that is explicitly forbidden, and the shortest possible encoding should always be used (and that is what Perl does). */ -/* This is a fundamental property of UTF-EBCDIC */ -#define OFFUNI_IS_INVARIANT(c) (((UV)(c)) < 0xA0) - -/* It turns out that on EBCDIC platforms, the invariants are the characters - * that have ASCII equivalents, plus the C1 controls. Since the C0 controls - * and DELETE are ASCII, this is the same as: (isASCII(uv) || isCNTRL_L1(uv)) - * */ -#define UVCHR_IS_INVARIANT(uv) cBOOL(FITS_IN_8_BITS(uv) \ - && (PL_charclass[(U8) (uv)] & (_CC_mask(_CC_ASCII) | _CC_mask(_CC_CNTRL)))) - -/* UTF-EBCDIC semantic macros - We used to transform back into I8 and then - * compare, but now only have to do a single lookup by using a bit in - * l1_char_class_tab.h. - * Comments as to the meaning of each are given at their corresponding utf8.h +/* Comments as to the meaning of each are given at their corresponding utf8.h * definitions. */ -#define UTF8_IS_START(c) _generic_isCC(c, _CC_UTF8_IS_START) - -#define UTF_IS_CONTINUATION_MASK 0xE0 - -#define UTF8_IS_CONTINUATION(c) _generic_isCC(c, _CC_UTF8_IS_CONTINUATION) - -/* The above instead could be written as this: -#define UTF8_IS_CONTINUATION(c) \ - (((NATIVE_UTF8_TO_I8(c) & UTF_IS_CONTINUATION_MASK) \ - == UTF_CONTINUATION_MARK) - */ - -/* Equivalent to ! UVCHR_IS_INVARIANT(c) */ -#define UTF8_IS_CONTINUED(c) cBOOL(FITS_IN_8_BITS(c) \ - && ! (PL_charclass[(U8) (c)] & (_CC_mask(_CC_ASCII) | _CC_mask(_CC_CNTRL)))) - -#define UTF8_IS_DOWNGRADEABLE_START(c) _generic_isCC(c, \ - _CC_UTF8_IS_DOWNGRADEABLE_START) - -/* Equivalent to (UTF8_IS_START(c) && ! 
UTF8_IS_DOWNGRADEABLE_START(c)) - * Makes sure that the START bit is set and the DOWNGRADEABLE bit isn't */ -#define UTF8_IS_ABOVE_LATIN1(c) cBOOL(FITS_IN_8_BITS(c) \ - && ((PL_charclass[(U8) (c)] & ( _CC_mask(_CC_UTF8_IS_START) \ - |_CC_mask(_CC_UTF8_IS_DOWNGRADEABLE_START))) \ - == _CC_mask(_CC_UTF8_IS_START))) - #define isUTF8_POSSIBLY_PROBLEMATIC(c) \ _generic_isCC(c, _CC_UTF8_START_BYTE_IS_FOR_AT_LEAST_SURROGATE) -#define UTF_CONTINUATION_MARK 0xA0 #define UTF_ACCUMULATION_SHIFT 5 - /* ^? is defined to be APC on EBCDIC systems. See the definition of toCTRL() * for more */ #define QUESTION_MARK_CTRL LATIN1_TO_NATIVE(0x9F) -- Perl5 Master Repository
