[perl.git] branch smoke-me/khw-encode, updated. v5.25.4-51-g9c03449

Karl Williamson Tue, 30 Aug 2016 20:46:25 -0700

In perl.git, the branch smoke-me/khw-encode has been updated

<http://perl5.git.perl.org/perl.git/commitdiff/9c03449800417dd02cc1af613951a1002490a52a?hp=655eab69b102d09f7e6643cbb78877097b23d994>


- Log -----------------------------------------------------------------
commit 9c03449800417dd02cc1af613951a1002490a52a
Author: Karl Williamson <k...@cpan.org>
Date:   Tue Aug 30 21:26:17 2016 -0600

    squash
-----------------------------------------------------------------------

Summary of changes:
 cpan/Encode/Encode.xs | 81 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 78 insertions(+), 3 deletions(-)

diff --git a/cpan/Encode/Encode.xs b/cpan/Encode/Encode.xs
index c4d061c..8c029c1 100644
--- a/cpan/Encode/Encode.xs
+++ b/cpan/Encode/Encode.xs
@@ -315,7 +315,7 @@ strict_utf8(pTHX_ SV* sv)
 static bool
 is_strict_utf8_string_loc(const U8* const s, STRLEN const len, const U8 **ep)
 {
-    /* Returns a boolean giving whether or not the input string from 's' to 
+    /* Returns a boolean giving whether or not the input string from 's' to
      * ('s' + 'len' - 1) is well-formed UTF-8 that is entirely Unicode code
      * points that aren't surrogates nor non-character code points.  *ep is set
      * to point to 1 byte beyond the end of the final valid input character */
@@ -342,6 +342,79 @@ is_strict_utf8_string_loc(const U8* const s, STRLEN const 
len, const U8 **ep)
     return TRUE;
 }
 
+static bool
+is_strict_utf8_valid_partial_char(const U8* const s, const U8 * const e)
+{
+    /* Returns a boolean giving whether or not the input string from 's' to
+     * ('e' - 1) is the initial, but incomplete, sequence of well-formed UTF-8
+     * for some Unicode code point, as far as it goes.
+     *
+     * It also checks that if the sequence were to be followed by any valid
+     * UTF-8 to form a complete character, that that character could not be a
+     * surrogate nor non-Unicode code point.  Only the first one or two bytes
+     * of the UTF-8 representation is needed to make this determination.  To
+     * determine if the UTF-8 could be a non-character, a complete sequence is
+     * required, so that is not done here.  Hence if the sequence, when
+     * completed, could be something other than a surrogate, non-character code
+     * point, or above-Unicode code point, TRUE is returned.
+     */
+
+    if (! is_utf8_valid_partial_char(s, e)) {
+        return FALSE;
+    }
+
+    /* The code below is derived from this table.  Keep in mind that legal
+     * continuation bytes range between \x80..\xBF for UTF-8, and \xA0..\xBF
+     * for I8.
+     *              UTF-8            UTF-EBCDIC I8
+     *   U+D800: \xED\xA0\x80      \xF1\xB6\xA0\xA0      First surrogate
+     *   U+DFFF: \xED\xBF\xBF      \xF1\xB7\xBF\xBF      Final surrogate
+     * U+110000: \xF4\x90\x80\x80  \xF9\xA2\xA0\xA0\xA0  First above Unicode
+     */
+
+#ifndef EBCDIC
+
+    if (UNLIKELY(*s > 0xF4)) {
+        return FALSE;           /* Above UTF-8 */
+    }
+
+    if (e - s > 1) {
+        if (UNLIKELY(*s == 0xF4 && *(s + 1) >= 0x90)) {
+            return FALSE;   /* Above UTF-8 */
+        }
+
+        if (UNLIKELY(*s == 0xED && *(s + 1) >= 0xA0)) {
+            return FALSE;   /* Surrogate */
+        }
+    }
+
+#else
+
+    {
+        const U8 s0 = NATIVE_UTF8_TO_I8(*s);
+
+        if (UNLIKELY(s0 > 0xF9)) {
+            return FALSE;           /* Above UTF-8 */
+        }
+
+        if (e - s > 1) {
+            if (UNLIKELY(s0 == 0xF9 && NATIVE_UTF8_TO_I8(*(s + 1)) >= 0xA2)) {
+                return FALSE;   /* Above UTF-8 */
+            }
+
+            if (UNLIKELY(s0 == 0xF1 && (   NATIVE_UTF8_TO_I8(*(s + 1)) = 0xB6
+                                        || NATIVE_UTF8_TO_I8(*(s + 1)) = 
0xB7)))
+            {
+                return FALSE;   /* Surrogate */
+            }
+        }
+    }
+
+#endif
+
+    return TRUE;
+}
+
 static U8*
 process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
              bool encode, bool strict, bool stop_at_partial)
@@ -392,7 +465,7 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
 
         bool valid = (strict)
                      ? is_strict_utf8_string_loc(s, e - s, &e_or_where_failed)
-                     : is_utf8_string_loc       (s, e - s, &e_or_where_failed);
+                     :        is_utf8_string_loc(s, e - s, &e_or_where_failed);
         STRLEN len = e_or_where_failed - s;
 
         Move(s, d, len, U8);
@@ -401,7 +474,9 @@ process_utf8(pTHX_ SV* dst, U8* s, U8* e, SV *check_sv,
 
         if (    LIKELY(valid)
             || (  (stop_at_partial || (check & ENCODE_STOP_AT_PARTIAL))
-                && is_utf8_valid_partial_char(s, e)))
+                && ((strict)
+                    ? is_strict_utf8_valid_partial_char(s, e)
+                    :        is_utf8_valid_partial_char(s, e))))
         {
             break;
         }

--
Perl5 Master Repository

[perl.git] branch smoke-me/khw-encode, updated. v5.25.4-51-g9c03449

Reply via email to