[4/6] lucy-clownfish git commit: Move some functions from StrHelp to Str

nwellnhof Sat, 06 Aug 2016 07:30:06 -0700

Move some functions from StrHelp to Str

- utf8_valid
- validate_utf8
- is_whitespace
- encode_utf8_char



Project: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/commit/ed2010ca
Tree: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/tree/ed2010ca
Diff: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/diff/ed2010ca

Branch: refs/heads/master
Commit: ed2010caec0af7ccdcaff76a05cdb516166a6ad4
Parents: 64a1000
Author: Nick Wellnhofer <[email protected]>
Authored: Tue Aug 2 18:46:35 2016 +0200
Committer: Nick Wellnhofer <[email protected]>
Committed: Tue Aug 2 19:05:14 2016 +0200

----------------------------------------------------------------------
 runtime/core/Clownfish/CharBuf.c                |   3 +-
 runtime/core/Clownfish/String.c                 | 180 +++++++++++-
 runtime/core/Clownfish/String.cfh               |  33 +++
 runtime/core/Clownfish/Util/StringHelper.c      | 176 -----------
 runtime/core/Clownfish/Util/StringHelper.cfh    |  34 ---
 .../perl/buildlib/Clownfish/Build/Binding.pm    |   2 +-
 runtime/perl/xs/XSBind.c                        |   1 -
 runtime/python/cfext/CFBind.c                   |   3 +-
 runtime/ruby/ext/Bind.c                         |   1 -
 runtime/ruby/ext/Clownfish.c                    |   1 -
 runtime/test/Clownfish/Test/TestCharBuf.c       |   3 +-
 runtime/test/Clownfish/Test/TestString.c        | 294 ++++++++++++++++++-
 .../test/Clownfish/Test/Util/TestStringHelper.c | 294 +------------------
 13 files changed, 522 insertions(+), 503 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/core/Clownfish/CharBuf.c
----------------------------------------------------------------------
diff --git a/runtime/core/Clownfish/CharBuf.c b/runtime/core/Clownfish/CharBuf.c
index 2dbae91..30f54dd 100644
--- a/runtime/core/Clownfish/CharBuf.c
+++ b/runtime/core/Clownfish/CharBuf.c
@@ -30,7 +30,6 @@
 #include "Clownfish/Err.h"
 #include "Clownfish/String.h"
 #include "Clownfish/Util/Memory.h"
-#include "Clownfish/Util/StringHelper.h"
 #include "Clownfish/Class.h"
 
 // Append trusted UTF-8 to the CharBuf.
@@ -290,7 +289,7 @@ CB_Cat_Char_IMP(CharBuf *self, int32_t code_point) {
     size_t old_size = self->size;
     SI_add_grow_and_oversize(self, old_size, MAX_UTF8_BYTES);
     char *end = self->ptr + old_size;
-    size_t count = StrHelp_encode_utf8_char(code_point, (uint8_t*)end);
+    size_t count = Str_encode_utf8_char(code_point, (uint8_t*)end);
     self->size += count;
 }
 

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/core/Clownfish/String.c
----------------------------------------------------------------------
diff --git a/runtime/core/Clownfish/String.c b/runtime/core/Clownfish/String.c
index 0353ffd..0de7f28 100644
--- a/runtime/core/Clownfish/String.c
+++ b/runtime/core/Clownfish/String.c
@@ -19,6 +19,7 @@
 #define CFISH_USE_SHORT_NAMES
 
 #include <string.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <ctype.h>
 
@@ -29,7 +30,6 @@
 #include "Clownfish/CharBuf.h"
 #include "Clownfish/Err.h"
 #include "Clownfish/Util/Memory.h"
-#include "Clownfish/Util/StringHelper.h"
 
 #define STACK_ITER(string, byte_offset) \
     S_new_stack_iter(alloca(sizeof(StringIterator)), string, byte_offset)
@@ -40,6 +40,178 @@ S_memmem(String *self, const char *substring, size_t size);
 static StringIterator*
 S_new_stack_iter(void *allocation, String *string, size_t byte_offset);
 
+// Return a pointer to the first invalid UTF-8 sequence, or NULL if
+// the UTF-8 is valid.
+static const uint8_t*
+S_find_invalid_utf8(const uint8_t *string, size_t size) {
+    const uint8_t *const end = string + size;
+    while (string < end) {
+        const uint8_t *start = string;
+        const uint8_t header_byte = *string++;
+
+        if (header_byte < 0x80) {
+            // ASCII
+            ;
+        }
+        else if (header_byte < 0xE0) {
+            // Disallow non-shortest-form ASCII and continuation bytes.
+            if (header_byte < 0xC2)         { return start; }
+            // Two-byte sequence.
+            if (string == end)              { return start; }
+            if ((*string++ & 0xC0) != 0x80) { return start; }
+        }
+        else if (header_byte < 0xF0) {
+            // Three-byte sequence.
+            if (end - string < 2)           { return start; }
+            if (header_byte == 0xED) {
+                // Disallow UTF-16 surrogates.
+                if (*string < 0x80 || *string > 0x9F) {
+                    return start;
+                }
+            }
+            else if (!(header_byte & 0x0F)) {
+                // Disallow non-shortest-form.
+                if (!(*string & 0x20)) {
+                    return start;
+                }
+            }
+            if ((*string++ & 0xC0) != 0x80) { return start; }
+            if ((*string++ & 0xC0) != 0x80) { return start; }
+        }
+        else {
+            if (header_byte > 0xF4)         { return start; }
+            // Four-byte sequence.
+            if (end - string < 3)           { return start; }
+            if (!(header_byte & 0x07)) {
+                // Disallow non-shortest-form.
+                if (!(*string & 0x30)) {
+                    return start;
+                }
+            }
+            else if (header_byte == 0xF4) {
+                // Code point larger than 0x10FFFF.
+                if (*string >= 0x90) {
+                    return start;
+                }
+            }
+            if ((*string++ & 0xC0) != 0x80) { return start; }
+            if ((*string++ & 0xC0) != 0x80) { return start; }
+            if ((*string++ & 0xC0) != 0x80) { return start; }
+        }
+    }
+
+    return NULL;
+}
+
+bool
+Str_utf8_valid(const char *ptr, size_t size) {
+    return S_find_invalid_utf8((const uint8_t*)ptr, size) == NULL;
+}
+
+void
+Str_validate_utf8(const char *ptr, size_t size, const char *file, int line,
+                  const char *func) {
+    const uint8_t *string  = (const uint8_t*)ptr;
+    const uint8_t *invalid = S_find_invalid_utf8(string, size);
+    if (invalid == NULL) { return; }
+
+    CharBuf *buf = CB_new(0);
+    CB_Cat_Trusted_Utf8(buf, "Invalid UTF-8", 13);
+
+    if (invalid > string) {
+        const uint8_t *prefix = invalid;
+        size_t num_code_points = 0;
+
+        // Skip up to 20 code points backwards.
+        while (prefix > string) {
+            prefix -= 1;
+
+            if ((*prefix & 0xC0) != 0x80) {
+                num_code_points += 1;
+                if (num_code_points >= 20) { break; }
+            }
+        }
+
+        CB_Cat_Trusted_Utf8(buf, " after '", 8);
+        CB_Cat_Trusted_Utf8(buf, (const char*)prefix, invalid - prefix);
+        CB_Cat_Trusted_Utf8(buf, "'", 1);
+    }
+
+    CB_Cat_Trusted_Utf8(buf, ":", 1);
+
+    // Append offending bytes as hex.
+    const uint8_t *end = string + size;
+    const uint8_t *max = invalid + 5;
+    for (const uint8_t *byte = invalid; byte < end && byte < max; byte++) {
+        char hex[4];
+        sprintf(hex, " %02X", *byte);
+        CB_Cat_Trusted_Utf8(buf, hex, 3);
+    }
+
+    String *mess = CB_Yield_String(buf);
+    DECREF(buf);
+
+    Err *err = Err_new(mess);
+    Err_Add_Frame(err, file, line, func);
+    Err_do_throw(err);
+}
+
+bool
+Str_is_whitespace(int32_t code_point) {
+    switch (code_point) {
+            // <control-0009>..<control-000D>
+        case 0x0009: case 0x000A: case 0x000B: case 0x000C: case 0x000D:
+        case 0x0020: // SPACE
+        case 0x0085: // <control-0085>
+        case 0x00A0: // NO-BREAK SPACE
+        case 0x1680: // OGHAM SPACE MARK
+            // EN QUAD..HAIR SPACE
+        case 0x2000: case 0x2001: case 0x2002: case 0x2003: case 0x2004:
+        case 0x2005: case 0x2006: case 0x2007: case 0x2008: case 0x2009:
+        case 0x200A:
+        case 0x2028: // LINE SEPARATOR
+        case 0x2029: // PARAGRAPH SEPARATOR
+        case 0x202F: // NARROW NO-BREAK SPACE
+        case 0x205F: // MEDIUM MATHEMATICAL SPACE
+        case 0x3000: // IDEOGRAPHIC SPACE
+            return true;
+
+        default:
+            return false;
+    }
+}
+
+uint32_t
+Str_encode_utf8_char(int32_t code_point, void *buffer) {
+    uint8_t *buf = (uint8_t*)buffer;
+    if (code_point <= 0x7F) { // ASCII
+        buf[0] = (uint8_t)code_point;
+        return 1;
+    }
+    else if (code_point <= 0x07FF) { // 2 byte range
+        buf[0] = (uint8_t)(0xC0 | (code_point >> 6));
+        buf[1] = (uint8_t)(0x80 | (code_point & 0x3f));
+        return 2;
+    }
+    else if (code_point <= 0xFFFF) { // 3 byte range
+        buf[0] = (uint8_t)(0xE0 | (code_point  >> 12));
+        buf[1] = (uint8_t)(0x80 | ((code_point >> 6) & 0x3F));
+        buf[2] = (uint8_t)(0x80 | (code_point        & 0x3f));
+        return 3;
+    }
+    else if (code_point <= 0x10FFFF) { // 4 byte range
+        buf[0] = (uint8_t)(0xF0 | (code_point  >> 18));
+        buf[1] = (uint8_t)(0x80 | ((code_point >> 12) & 0x3F));
+        buf[2] = (uint8_t)(0x80 | ((code_point >> 6)  & 0x3F));
+        buf[3] = (uint8_t)(0x80 | (code_point         & 0x3f));
+        return 4;
+    }
+    else {
+        THROW(ERR, "Illegal Unicode code point: %u32", code_point);
+        UNREACHABLE_RETURN(uint32_t);
+    }
+}
+
 String*
 Str_new_from_utf8(const char *utf8, size_t size) {
     VALIDATE_UTF8(utf8, size);
@@ -122,7 +294,7 @@ String*
 Str_new_from_char(int32_t code_point) {
     const size_t MAX_UTF8_BYTES = 4;
     char   *ptr  = (char*)MALLOCATE(MAX_UTF8_BYTES + 1);
-    size_t  size = StrHelp_encode_utf8_char(code_point, (uint8_t*)ptr);
+    size_t  size = Str_encode_utf8_char(code_point, (uint8_t*)ptr);
     ptr[size] = '\0';
 
     String *self = (String*)Class_Make_Obj(STRING);
@@ -740,7 +912,7 @@ StrIter_Skip_Whitespace_IMP(StringIterator *self) {
     int32_t code_point;
 
     while (STR_OOB != (code_point = StrIter_Next(self))) {
-        if (!StrHelp_is_whitespace(code_point)) { break; }
+        if (!Str_is_whitespace(code_point)) { break; }
         byte_offset = self->byte_offset;
         ++num_skipped;
     }
@@ -756,7 +928,7 @@ StrIter_Skip_Whitespace_Back_IMP(StringIterator *self) {
     int32_t code_point;
 
     while (STR_OOB != (code_point = StrIter_Prev(self))) {
-        if (!StrHelp_is_whitespace(code_point)) { break; }
+        if (!Str_is_whitespace(code_point)) { break; }
         byte_offset = self->byte_offset;
         ++num_skipped;
     }

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/core/Clownfish/String.cfh
----------------------------------------------------------------------
diff --git a/runtime/core/Clownfish/String.cfh b/runtime/core/Clownfish/String.cfh
index 72f60a1..6b3323e 100644
--- a/runtime/core/Clownfish/String.cfh
+++ b/runtime/core/Clownfish/String.cfh
@@ -24,6 +24,9 @@ __C__
 // For CFISH_ALLOCA_OBJ.
 #include "Clownfish/Class.h"
 
+// For CFISH_ERR_FUNC_MACRO.
+#include "Clownfish/Err.h"
+
 __END_C__
 
 /**
@@ -37,6 +40,31 @@ public final class Clownfish::String nickname Str
     size_t      size;
     String     *origin;
 
+    /** Return true if the string is valid UTF-8, false otherwise.
+     */
+    public inert bool
+    utf8_valid(const char *ptr, size_t len);
+
+    /** Throws an error if the string isn't valid UTF-8.
+     */
+    public inert void
+    validate_utf8(const char *text, size_t size, const char *file, int line,
+                  const char *func);
+
+    /** Returns true if the code point qualifies as Unicode whitespace.
+     */
+    public inert bool
+    is_whitespace(int32_t code_point);
+
+    /** Encode a Unicode code point to a UTF-8 sequence.
+     *
+     * @param code_point A legal unicode code point.
+     * @param buffer Write buffer which must hold at least 4 bytes (the
+     * maximum legal length for a UTF-8 char).
+     */
+    inert uint32_t
+    encode_utf8_char(int32_t code_point, void *buffer);
+
     /** Return a String which holds a copy of the supplied UTF-8 character
      * data after checking for validity.
      *
@@ -506,6 +534,10 @@ public final class Clownfish::StringIterator nickname StrIter
 
 __C__
 
+#define CFISH_VALIDATE_UTF8(text, size) \
+    cfish_Str_validate_utf8(text, size, \
+                            __FILE__, __LINE__, CFISH_ERR_FUNC_MACRO)
+
 #define CFISH_SSTR_BLANK() \
     cfish_Str_init_stack_string(CFISH_ALLOCA_OBJ(CFISH_STRING), "", 0)
 
@@ -519,6 +551,7 @@ __C__
 #define CFISH_STR_OOB       -1
 
 #ifdef CFISH_USE_SHORT_NAMES
+  #define VALIDATE_UTF8          CFISH_VALIDATE_UTF8
   #define SSTR_BLANK             CFISH_SSTR_BLANK
   #define SSTR_WRAP_C            CFISH_SSTR_WRAP_C
   #define SSTR_WRAP_UTF8         CFISH_SSTR_WRAP_UTF8

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/core/Clownfish/Util/StringHelper.c
----------------------------------------------------------------------
diff --git a/runtime/core/Clownfish/Util/StringHelper.c b/runtime/core/Clownfish/Util/StringHelper.c
index 256c9e0..7b8e9d8 100644
--- a/runtime/core/Clownfish/Util/StringHelper.c
+++ b/runtime/core/Clownfish/Util/StringHelper.c
@@ -15,10 +15,6 @@
  */
 
 #define C_CFISH_STRINGHELPER
-#include <string.h>
-#include <stddef.h>
-#include <stdio.h>
-
 #define CFISH_USE_SHORT_NAMES
 
 #include "Clownfish/Util/StringHelper.h"
@@ -79,178 +75,6 @@ StrHelp_to_base36(uint64_t num, void *buffer) {
     return size;
 }
 
-// Return a pointer to the first invalid UTF-8 sequence, or NULL if
-// the UTF-8 is valid.
-static const uint8_t*
-S_find_invalid_utf8(const uint8_t *string, size_t size) {
-    const uint8_t *const end = string + size;
-    while (string < end) {
-        const uint8_t *start = string;
-        const uint8_t header_byte = *string++;
-
-        if (header_byte < 0x80) {
-            // ASCII
-            ;
-        }
-        else if (header_byte < 0xE0) {
-            // Disallow non-shortest-form ASCII and continuation bytes.
-            if (header_byte < 0xC2)         { return start; }
-            // Two-byte sequence.
-            if (string == end)              { return start; }
-            if ((*string++ & 0xC0) != 0x80) { return start; }
-        }
-        else if (header_byte < 0xF0) {
-            // Three-byte sequence.
-            if (end - string < 2)           { return start; }
-            if (header_byte == 0xED) {
-                // Disallow UTF-16 surrogates.
-                if (*string < 0x80 || *string > 0x9F) {
-                    return start;
-                }
-            }
-            else if (!(header_byte & 0x0F)) {
-                // Disallow non-shortest-form.
-                if (!(*string & 0x20)) {
-                    return start;
-                }
-            }
-            if ((*string++ & 0xC0) != 0x80) { return start; }
-            if ((*string++ & 0xC0) != 0x80) { return start; }
-        }
-        else {
-            if (header_byte > 0xF4)         { return start; }
-            // Four-byte sequence.
-            if (end - string < 3)           { return start; }
-            if (!(header_byte & 0x07)) {
-                // Disallow non-shortest-form.
-                if (!(*string & 0x30)) {
-                    return start;
-                }
-            }
-            else if (header_byte == 0xF4) {
-                // Code point larger than 0x10FFFF.
-                if (*string >= 0x90) {
-                    return start;
-                }
-            }
-            if ((*string++ & 0xC0) != 0x80) { return start; }
-            if ((*string++ & 0xC0) != 0x80) { return start; }
-            if ((*string++ & 0xC0) != 0x80) { return start; }
-        }
-    }
-
-    return NULL;
-}
-
-bool
-StrHelp_utf8_valid(const char *ptr, size_t size) {
-    return S_find_invalid_utf8((const uint8_t*)ptr, size) == NULL;
-}
-
-void
-StrHelp_validate_utf8(const char *ptr, size_t size, const char *file,
-                      int line, const char *func) {
-    const uint8_t *string  = (const uint8_t*)ptr;
-    const uint8_t *invalid = S_find_invalid_utf8(string, size);
-    if (invalid == NULL) { return; }
-
-    CharBuf *buf = CB_new(0);
-    CB_Cat_Trusted_Utf8(buf, "Invalid UTF-8", 13);
-
-    if (invalid > string) {
-        const uint8_t *prefix = invalid;
-        size_t num_code_points = 0;
-
-        // Skip up to 20 code points backwards.
-        while (prefix > string) {
-            prefix -= 1;
-
-            if ((*prefix & 0xC0) != 0x80) {
-                num_code_points += 1;
-                if (num_code_points >= 20) { break; }
-            }
-        }
-
-        CB_Cat_Trusted_Utf8(buf, " after '", 8);
-        CB_Cat_Trusted_Utf8(buf, (const char*)prefix, invalid - prefix);
-        CB_Cat_Trusted_Utf8(buf, "'", 1);
-    }
-
-    CB_Cat_Trusted_Utf8(buf, ":", 1);
-
-    // Append offending bytes as hex.
-    const uint8_t *end = string + size;
-    const uint8_t *max = invalid + 5;
-    for (const uint8_t *byte = invalid; byte < end && byte < max; byte++) {
-        char hex[4];
-        sprintf(hex, " %02X", *byte);
-        CB_Cat_Trusted_Utf8(buf, hex, 3);
-    }
-
-    String *mess = CB_Yield_String(buf);
-    DECREF(buf);
-
-    Err *err = Err_new(mess);
-    Err_Add_Frame(err, file, line, func);
-    Err_do_throw(err);
-}
-
-bool
-StrHelp_is_whitespace(int32_t code_point) {
-    switch (code_point) {
-            // <control-0009>..<control-000D>
-        case 0x0009: case 0x000A: case 0x000B: case 0x000C: case 0x000D:
-        case 0x0020: // SPACE
-        case 0x0085: // <control-0085>
-        case 0x00A0: // NO-BREAK SPACE
-        case 0x1680: // OGHAM SPACE MARK
-            // EN QUAD..HAIR SPACE
-        case 0x2000: case 0x2001: case 0x2002: case 0x2003: case 0x2004:
-        case 0x2005: case 0x2006: case 0x2007: case 0x2008: case 0x2009:
-        case 0x200A:
-        case 0x2028: // LINE SEPARATOR
-        case 0x2029: // PARAGRAPH SEPARATOR
-        case 0x202F: // NARROW NO-BREAK SPACE
-        case 0x205F: // MEDIUM MATHEMATICAL SPACE
-        case 0x3000: // IDEOGRAPHIC SPACE
-            return true;
-
-        default:
-            return false;
-    }
-}
-
-uint32_t
-StrHelp_encode_utf8_char(int32_t code_point, void *buffer) {
-    uint8_t *buf = (uint8_t*)buffer;
-    if (code_point <= 0x7F) { // ASCII
-        buf[0] = (uint8_t)code_point;
-        return 1;
-    }
-    else if (code_point <= 0x07FF) { // 2 byte range
-        buf[0] = (uint8_t)(0xC0 | (code_point >> 6));
-        buf[1] = (uint8_t)(0x80 | (code_point & 0x3f));
-        return 2;
-    }
-    else if (code_point <= 0xFFFF) { // 3 byte range
-        buf[0] = (uint8_t)(0xE0 | (code_point  >> 12));
-        buf[1] = (uint8_t)(0x80 | ((code_point >> 6) & 0x3F));
-        buf[2] = (uint8_t)(0x80 | (code_point        & 0x3f));
-        return 3;
-    }
-    else if (code_point <= 0x10FFFF) { // 4 byte range
-        buf[0] = (uint8_t)(0xF0 | (code_point  >> 18));
-        buf[1] = (uint8_t)(0x80 | ((code_point >> 12) & 0x3F));
-        buf[2] = (uint8_t)(0x80 | ((code_point >> 6)  & 0x3F));
-        buf[3] = (uint8_t)(0x80 | (code_point         & 0x3f));
-        return 4;
-    }
-    else {
-        THROW(ERR, "Illegal Unicode code point: %u32", code_point);
-        UNREACHABLE_RETURN(uint32_t);
-    }
-}
-
 const char*
 StrHelp_back_utf8_char(const char *ptr, const char *start) {
     while (--ptr >= start) {

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/core/Clownfish/Util/StringHelper.cfh
----------------------------------------------------------------------
diff --git a/runtime/core/Clownfish/Util/StringHelper.cfh b/runtime/core/Clownfish/Util/StringHelper.cfh
index 1264bea..1e915e6 100644
--- a/runtime/core/Clownfish/Util/StringHelper.cfh
+++ b/runtime/core/Clownfish/Util/StringHelper.cfh
@@ -16,10 +16,6 @@
 
 parcel Clownfish;
 
-__C__
-#include "Clownfish/Err.h"
-__END_C__
-
 inert class Clownfish::Util::StringHelper nickname StrHelp {
 
     /* A table where the values indicate the number of bytes in a UTF-8
@@ -43,31 +39,6 @@ inert class Clownfish::Util::StringHelper nickname StrHelp {
     inert size_t
     to_base36(uint64_t value, void *buffer);
 
-    /** Return true if the string is valid UTF-8, false otherwise.
-     */
-    inert bool
-    utf8_valid(const char *ptr, size_t len);
-
-    /** Throws an error if the string isn't valid UTF-8.
-     */
-    inert void
-    validate_utf8(const char *text, size_t size, const char *file, int line,
-                  const char *func);
-
-    /** Returns true if the code point qualifies as Unicode whitespace.
-     */
-    inert bool
-    is_whitespace(int32_t code_point);
-
-    /** Encode a Unicode code point to a UTF-8 sequence.
-     *
-     * @param code_point A legal unicode code point.
-     * @param buffer Write buffer which must hold at least 4 bytes (the
-     * maximum legal length for a UTF-8 char).
-     */
-    inert uint32_t
-    encode_utf8_char(int32_t code_point, void *buffer);
-
     /** Return the first non-continuation byte before the supplied pointer.
      * If backtracking progresses beyond the supplied start, return NULL.
      */
@@ -76,17 +47,12 @@ inert class Clownfish::Util::StringHelper nickname StrHelp {
 }
 
 __C__
-#define CFISH_VALIDATE_UTF8(text, size) \
-    cfish_StrHelp_validate_utf8(text, size, \
-                                __FILE__, __LINE__, CFISH_ERR_FUNC_MACRO)
-
 /** The maximum number of bytes encoded by to_base36(), including the
  * terminating NULL.
  */
 #define cfish_StrHelp_MAX_BASE36_BYTES 14
 #ifdef CFISH_USE_SHORT_NAMES
   #define StrHelp_MAX_BASE36_BYTES cfish_StrHelp_MAX_BASE36_BYTES
-  #define VALIDATE_UTF8            CFISH_VALIDATE_UTF8
 #endif
 __END_C__
 

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/perl/buildlib/Clownfish/Build/Binding.pm
----------------------------------------------------------------------
diff --git a/runtime/perl/buildlib/Clownfish/Build/Binding.pm b/runtime/perl/buildlib/Clownfish/Build/Binding.pm
index 71b0ff8..ecc83b5 100644
--- a/runtime/perl/buildlib/Clownfish/Build/Binding.pm
+++ b/runtime/perl/buildlib/Clownfish/Build/Binding.pm
@@ -1010,7 +1010,7 @@ CODE:
 {
     STRLEN len;
     char *ptr = SvPV(sv, len);
-    RETVAL = cfish_StrHelp_utf8_valid(ptr, len);
+    RETVAL = cfish_Str_utf8_valid(ptr, len);
 }
 OUTPUT: RETVAL
 

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/perl/xs/XSBind.c
----------------------------------------------------------------------
diff --git a/runtime/perl/xs/XSBind.c b/runtime/perl/xs/XSBind.c
index ab9ee82..b566f8d 100644
--- a/runtime/perl/xs/XSBind.c
+++ b/runtime/perl/xs/XSBind.c
@@ -33,7 +33,6 @@
 #include "Clownfish/PtrHash.h"
 #include "Clownfish/TestHarness/TestUtils.h"
 #include "Clownfish/Util/Atomic.h"
-#include "Clownfish/Util/StringHelper.h"
 #include "Clownfish/Util/Memory.h"
 
 #define XSBIND_REFCOUNT_FLAG   1

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/python/cfext/CFBind.c
----------------------------------------------------------------------
diff --git a/runtime/python/cfext/CFBind.c b/runtime/python/cfext/CFBind.c
index 0703880..536cb1d 100644
--- a/runtime/python/cfext/CFBind.c
+++ b/runtime/python/cfext/CFBind.c
@@ -39,7 +39,6 @@
 #include "Clownfish/TestHarness/TestUtils.h"
 #include "Clownfish/Util/Atomic.h"
 #include "Clownfish/Util/Memory.h"
-#include "Clownfish/Util/StringHelper.h"
 #include "Clownfish/Vector.h"
 
 static bool Err_initialized;
@@ -195,7 +194,7 @@ S_maybe_py_to_cfish(PyObject *py_obj, cfish_Class *klass, bool increment,
         Py_ssize_t size;
         char *ptr = PyUnicode_AsUTF8AndSize(py_obj, &size);
         // TODO: Can we guarantee that Python will always supply valid UTF-8?
-        if (!ptr || !cfish_StrHelp_utf8_valid(ptr, size)) {
+        if (!ptr || !cfish_Str_utf8_valid(ptr, size)) {
             return false;
         }
         *obj_ptr = (cfish_Obj*)cfish_Str_new_from_trusted_utf8(ptr, size);

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/ruby/ext/Bind.c
----------------------------------------------------------------------
diff --git a/runtime/ruby/ext/Bind.c b/runtime/ruby/ext/Bind.c
index a12b1e1..70c0a9e 100644
--- a/runtime/ruby/ext/Bind.c
+++ b/runtime/ruby/ext/Bind.c
@@ -16,7 +16,6 @@
 
 #include "ruby.h"
 #include "Bind.h"
-#include "Clownfish/Util/StringHelper.h"
 
 VALUE
 Bind_cfish_to_ruby(cfish_Obj *obj) {

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/ruby/ext/Clownfish.c
----------------------------------------------------------------------
diff --git a/runtime/ruby/ext/Clownfish.c b/runtime/ruby/ext/Clownfish.c
index 972d2db..8993ad6 100644
--- a/runtime/ruby/ext/Clownfish.c
+++ b/runtime/ruby/ext/Clownfish.c
@@ -17,7 +17,6 @@
 #include "ruby.h"
 
 #include "Clownfish/Util/Memory.h"
-#include "Clownfish/Util/StringHelper.h"
 #include "Clownfish/String.h"
 #include "Clownfish/Test/TestCharBuf.h"
 #include "Clownfish/Test.h"

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/test/Clownfish/Test/TestCharBuf.c
----------------------------------------------------------------------
diff --git a/runtime/test/Clownfish/Test/TestCharBuf.c b/runtime/test/Clownfish/Test/TestCharBuf.c
index 0782ce2..9cf5bbf 100644
--- a/runtime/test/Clownfish/Test/TestCharBuf.c
+++ b/runtime/test/Clownfish/Test/TestCharBuf.c
@@ -32,7 +32,6 @@
 #include "Clownfish/Test.h"
 #include "Clownfish/TestHarness/TestBatchRunner.h"
 #include "Clownfish/TestHarness/TestUtils.h"
-#include "Clownfish/Util/StringHelper.h"
 #include "Clownfish/Class.h"
 
 static char smiley[] = { (char)0xE2, (char)0x98, (char)0xBA, 0 };
@@ -116,7 +115,7 @@ test_roundtrip(TestBatchRunner *runner) {
         size_t size = Str_Get_Size(str);
 
         // Verify that utf8_valid agrees.
-        if (!StrHelp_utf8_valid(start, size)) {
+        if (!Str_utf8_valid(start, size)) {
             break;
         }
 

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/test/Clownfish/Test/TestString.c
----------------------------------------------------------------------
diff --git a/runtime/test/Clownfish/Test/TestString.c b/runtime/test/Clownfish/Test/TestString.c
index d89b5fe..d557546 100644
--- a/runtime/test/Clownfish/Test/TestString.c
+++ b/runtime/test/Clownfish/Test/TestString.c
@@ -38,6 +38,25 @@ static char smiley[] = { (char)0xE2, (char)0x98, (char)0xBA, 0 };
 static uint32_t smiley_len = 3;
 static int32_t smiley_cp  = 0x263A;
 
+static const uint8_t UTF8_COUNT[256] = {
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+};
+
 TestString*
 TestStr_new() {
     return (TestString*)Class_Make_Obj(TESTSTRING);
@@ -71,6 +90,274 @@ S_smiley_with_whitespace(size_t *num_spaces_ptr) {
     return retval;
 }
 
+/* This alternative implementation of utf8_valid() is (presumably) slower, but
+ * it implements the standard in a more linear, easy-to-grok way.
+ */
+#define TRAIL_OK(n) (n >= 0x80 && n <= 0xBF)
+static bool
+S_utf8_valid_alt(const char *maybe_utf8, size_t size) {
+    const uint8_t *string = (const uint8_t*)maybe_utf8;
+    const uint8_t *const end = string + size;
+    while (string < end) {
+        int count = UTF8_COUNT[*string];
+        bool valid = false;
+        if (count == 1) {
+            if (string[0] <= 0x7F) {
+                valid = true;
+            }
+        }
+        else if (count == 2) {
+            if (string[0] >= 0xC2 && string[0] <= 0xDF) {
+                if (TRAIL_OK(string[1])) {
+                    valid = true;
+                }
+            }
+        }
+        else if (count == 3) {
+            if (string[0] == 0xE0) {
+                if (string[1] >= 0xA0 && string[1] <= 0xBF
+                    && TRAIL_OK(string[2])
+                   ) {
+                    valid = true;
+                }
+            }
+            else if (string[0] >= 0xE1 && string[0] <= 0xEC) {
+                if (TRAIL_OK(string[1])
+                    && TRAIL_OK(string[2])
+                   ) {
+                    valid = true;
+                }
+            }
+            else if (string[0] == 0xED) {
+                if (string[1] >= 0x80 && string[1] <= 0x9F
+                    && TRAIL_OK(string[2])
+                   ) {
+                    valid = true;
+                }
+            }
+            else if (string[0] >= 0xEE && string[0] <= 0xEF) {
+                if (TRAIL_OK(string[1])
+                    && TRAIL_OK(string[2])
+                   ) {
+                    valid = true;
+                }
+            }
+        }
+        else if (count == 4) {
+            if (string[0] == 0xF0) {
+                if (string[1] >= 0x90 && string[1] <= 0xBF
+                    && TRAIL_OK(string[2])
+                    && TRAIL_OK(string[3])
+                   ) {
+                    valid = true;
+                }
+            }
+            else if (string[0] >= 0xF1 && string[0] <= 0xF3) {
+                if (TRAIL_OK(string[1])
+                    && TRAIL_OK(string[2])
+                    && TRAIL_OK(string[3])
+                   ) {
+                    valid = true;
+                }
+            }
+            else if (string[0] == 0xF4) {
+                if (string[1] >= 0x80 && string[1] <= 0x8F
+                    && TRAIL_OK(string[2])
+                    && TRAIL_OK(string[3])
+                   ) {
+                    valid = true;
+                }
+            }
+        }
+
+        if (!valid) {
+            return false;
+        }
+        string += count;
+    }
+
+    if (string != end) {
+        return false;
+    }
+
+    return true;
+}
+
+static void
+test_all_code_points(TestBatchRunner *runner) {
+    int32_t code_point;
+    for (code_point = 0; code_point <= 0x10FFFF; code_point++) {
+        char buffer[4];
+        uint32_t size = Str_encode_utf8_char(code_point, buffer);
+        char *start = buffer;
+
+        // Verify length returned by encode_utf8_char().
+        if (size != UTF8_COUNT[(unsigned char)buffer[0]]) {
+            break;
+        }
+        // Verify that utf8_valid() agrees with alternate implementation.
+        if (!!Str_utf8_valid(start, size)
+            != !!S_utf8_valid_alt(start, size)
+           ) {
+            break;
+        }
+    }
+    if (code_point == 0x110000) {
+        PASS(runner, "Successfully round tripped 0 - 0x10FFFF");
+    }
+    else {
+        FAIL(runner, "Failed round trip at 0x%.1X", (unsigned)code_point);
+    }
+}
+
+static void
+S_test_validity(TestBatchRunner *runner, const char *content, size_t size,
+                bool expected, const char *description) {
+    bool sane = Str_utf8_valid(content, size);
+    bool double_check = S_utf8_valid_alt(content, size);
+    if (sane != double_check) {
+        FAIL(runner, "Disagreement: %s", description);
+    }
+    else {
+        TEST_TRUE(runner, sane == expected, "%s", description);
+    }
+}
+
+static void
+test_utf8_valid(TestBatchRunner *runner) {
+    // Musical symbol G clef:
+    // Code point: U+1D11E
+    // UTF-16:     0xD834 0xDD1E
+    // UTF-8       0xF0 0x9D 0x84 0x9E
+    S_test_validity(runner, "\xF0\x9D\x84\x9E", 4, true,
+                    "Musical symbol G clef");
+    S_test_validity(runner, "\xED\xA0\xB4\xED\xB4\x9E", 6, false,
+                    "G clef as UTF-8 encoded UTF-16 surrogates");
+    S_test_validity(runner, ".\xED\xA0\xB4.", 5, false,
+                    "Isolated high surrogate");
+    S_test_validity(runner, ".\xED\xB4\x9E.", 5, false,
+                    "Isolated low surrogate");
+
+    // Shortest form.
+    S_test_validity(runner, ".\xC1\x9C.", 4, false,
+                    "Non-shortest form ASCII backslash");
+    S_test_validity(runner, ".\xC0\xAF.", 4, false,
+                    "Non-shortest form ASCII slash");
+    S_test_validity(runner, ".\xC0\x80.", 4, false,
+                    "Non-shortest form ASCII NUL character");
+    S_test_validity(runner, ".\xE0\x9F\xBF.", 5, false,
+                    "Non-shortest form three byte sequence");
+    S_test_validity(runner, ".\xF0\x8F\xBF\xBF.", 6, false,
+                    "Non-shortest form four byte sequence");
+
+    // Range.
+    S_test_validity(runner, "\xF8\x88\x80\x80\x80", 5, false, "5-byte UTF-8");
+    S_test_validity(runner, "\xF4\x8F\xBF\xBF", 4, true,
+                    "Code point 0x10FFFF");
+    S_test_validity(runner, "\xF4\x90\x80\x80", 4, false,
+                    "Code point 0x110000 too large");
+    S_test_validity(runner, "\xF5\x80\x80\x80", 4, false,
+                    "Sequence starting with 0xF5");
+
+    // Truncated sequences.
+    S_test_validity(runner, "\xC2", 1, false,
+                    "Truncated two byte sequence");
+    S_test_validity(runner, "\xE2\x98", 2, false,
+                    "Truncated three byte sequence");
+    S_test_validity(runner, "\xF0\x9D\x84", 3, false,
+                    "Truncated four byte sequence");
+
+    // Bad continuations.
+    S_test_validity(runner, "\xE2\x98\xBA\xE2\x98\xBA", 6, true,
+                    "SmileySmiley");
+    S_test_validity(runner, "\xE2\xBA\xE2\x98\xBA", 5, false,
+                    "missing first continuation byte");
+    S_test_validity(runner, "\xE2\x98\xE2\x98\xBA", 5, false,
+                    "missing second continuation byte");
+    S_test_validity(runner, "\xE2\xE2\x98\xBA", 4, false,
+                    "missing both continuation bytes");
+    S_test_validity(runner, "\xBA\xE2\x98\xBA\xE2\xBA", 5, false,
+                    "missing first continuation byte (end)");
+    S_test_validity(runner, "\xE2\x98\xBA\xE2\x98", 5, false,
+                    "missing second continuation byte (end)");
+    S_test_validity(runner, "\xE2\x98\xBA\xE2", 4, false,
+                    "missing both continuation bytes (end)");
+    S_test_validity(runner, "\xBA\xE2\x98\xBA", 4, false,
+                    "isolated continuation byte 0xBA");
+    S_test_validity(runner, "\x98\xE2\x98\xBA", 4, false,
+                    "isolated continuation byte 0x98");
+    S_test_validity(runner, "\xE2\x98\xBA\xBA", 4, false,
+                    "isolated continuation byte 0xBA (end)");
+    S_test_validity(runner, "\xE2\x98\xBA\x98", 4, false,
+                    "isolated continuation byte 0x98 (end)");
+    S_test_validity(runner, "\xF0xxxx", 5, false,
+                    "missing continuation byte 2/4");
+    S_test_validity(runner, "\xF0\x9Dxxxx", 5, false,
+                    "missing continuation byte 3/4");
+    S_test_validity(runner, "\xF0\x9D\x84xx", 5, false,
+                    "missing continuation byte 4/4");
+}
+
+static void
+S_validate_utf8(void *context) {
+    const char *text = (const char*)context;
+    Str_validate_utf8(text, strlen(text), "src.c", 17, "fn");
+}
+
+static void
+test_validate_utf8(TestBatchRunner *runner) {
+    {
+        Err *error = Err_trap(S_validate_utf8, "Sigma\xC1\x9C.");
+        TEST_TRUE(runner, error != NULL, "validate_utf8 throws");
+        String *mess = Err_Get_Mess(error);
+        const char *expected = "Invalid UTF-8 after 'Sigma': C1 9C 2E\n";
+        bool ok = Str_Starts_With_Utf8(mess, expected, strlen(expected));
+        TEST_TRUE(runner, ok, "validate_utf8 throws correct error message");
+        DECREF(error);
+    }
+
+    {
+        Err *error = Err_trap(S_validate_utf8,
+                              "xxx123456789\xE2\x93\xAA"
+                              "1234567890\xC1\x9C.");
+        String *mess = Err_Get_Mess(error);
+        const char *expected =
+            "Invalid UTF-8 after '123456789\xE2\x93\xAA"
+            "1234567890': C1 9C 2E\n";
+        bool ok = Str_Starts_With_Utf8(mess, expected, strlen(expected));
+        TEST_TRUE(runner, ok, "validate_utf8 truncates long prefix");
+        DECREF(error);
+    }
+}
+
+static void
+test_is_whitespace(TestBatchRunner *runner) {
+    TEST_TRUE(runner, Str_is_whitespace(' '), "space is whitespace");
+    TEST_TRUE(runner, Str_is_whitespace('\n'), "newline is whitespace");
+    TEST_TRUE(runner, Str_is_whitespace('\t'), "tab is whitespace");
+    TEST_TRUE(runner, Str_is_whitespace('\v'),
+              "vertical tab is whitespace");
+    TEST_FALSE(runner, Str_is_whitespace('a'), "'a' isn't whitespace");
+    TEST_FALSE(runner, Str_is_whitespace(0), "NULL isn't whitespace");
+    TEST_FALSE(runner, Str_is_whitespace(0x263A),
+               "Smiley isn't whitespace");
+}
+
+static void
+S_encode_utf8_char(void *context) {
+    int32_t *code_point_ptr = (int32_t*)context;
+    char buffer[4];
+    Str_encode_utf8_char(*code_point_ptr, buffer);
+}
+
+static void
+test_encode_utf8_char(TestBatchRunner *runner) {
+    int32_t code_point = 0x110000;
+    Err *error = Err_trap(S_encode_utf8_char, &code_point);
+    TEST_TRUE(runner, error != NULL, "Encode code point 0x110000 throws");
+    DECREF(error);
+}
+
 static void
 test_new(TestBatchRunner *runner) {
     static char chars[] = "A string " SMILEY " with a smile.";
@@ -813,7 +1100,12 @@ test_iterator_substring(TestBatchRunner *runner) {
 
 void
 TestStr_Run_IMP(TestString *self, TestBatchRunner *runner) {
-    TestBatchRunner_Plan(runner, (TestBatch*)self, 158);
+    TestBatchRunner_Plan(runner, (TestBatch*)self, 200);
+    test_all_code_points(runner);
+    test_utf8_valid(runner);
+    test_validate_utf8(runner);
+    test_is_whitespace(runner);
+    test_encode_utf8_char(runner);
     test_new(runner);
     test_Cat(runner);
     test_Clone(runner);

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/test/Clownfish/Test/Util/TestStringHelper.c
----------------------------------------------------------------------
diff --git a/runtime/test/Clownfish/Test/Util/TestStringHelper.c b/runtime/test/Clownfish/Test/Util/TestStringHelper.c
index d009a58..2caee84 100644
--- a/runtime/test/Clownfish/Test/Util/TestStringHelper.c
+++ b/runtime/test/Clownfish/Test/Util/TestStringHelper.c
@@ -28,104 +28,11 @@
 #include "Clownfish/Util/StringHelper.h"
 #include "Clownfish/Class.h"
 
-/* This alternative implementation of utf8_valid() is (presumably) slower, but
- * it implements the standard in a more linear, easy-to-grok way.
- */
-#define TRAIL_OK(n) (n >= 0x80 && n <= 0xBF)
 TestStringHelper*
 TestStrHelp_new() {
     return (TestStringHelper*)Class_Make_Obj(TESTSTRINGHELPER);
 }
 
-static bool
-S_utf8_valid_alt(const char *maybe_utf8, size_t size) {
-    const uint8_t *string = (const uint8_t*)maybe_utf8;
-    const uint8_t *const end = string + size;
-    while (string < end) {
-        int count = StrHelp_UTF8_COUNT[*string];
-        bool valid = false;
-        if (count == 1) {
-            if (string[0] <= 0x7F) {
-                valid = true;
-            }
-        }
-        else if (count == 2) {
-            if (string[0] >= 0xC2 && string[0] <= 0xDF) {
-                if (TRAIL_OK(string[1])) {
-                    valid = true;
-                }
-            }
-        }
-        else if (count == 3) {
-            if (string[0] == 0xE0) {
-                if (string[1] >= 0xA0 && string[1] <= 0xBF
-                    && TRAIL_OK(string[2])
-                   ) {
-                    valid = true;
-                }
-            }
-            else if (string[0] >= 0xE1 && string[0] <= 0xEC) {
-                if (TRAIL_OK(string[1])
-                    && TRAIL_OK(string[2])
-                   ) {
-                    valid = true;
-                }
-            }
-            else if (string[0] == 0xED) {
-                if (string[1] >= 0x80 && string[1] <= 0x9F
-                    && TRAIL_OK(string[2])
-                   ) {
-                    valid = true;
-                }
-            }
-            else if (string[0] >= 0xEE && string[0] <= 0xEF) {
-                if (TRAIL_OK(string[1])
-                    && TRAIL_OK(string[2])
-                   ) {
-                    valid = true;
-                }
-            }
-        }
-        else if (count == 4) {
-            if (string[0] == 0xF0) {
-                if (string[1] >= 0x90 && string[1] <= 0xBF
-                    && TRAIL_OK(string[2])
-                    && TRAIL_OK(string[3])
-                   ) {
-                    valid = true;
-                }
-            }
-            else if (string[0] >= 0xF1 && string[0] <= 0xF3) {
-                if (TRAIL_OK(string[1])
-                    && TRAIL_OK(string[2])
-                    && TRAIL_OK(string[3])
-                   ) {
-                    valid = true;
-                }
-            }
-            else if (string[0] == 0xF4) {
-                if (string[1] >= 0x80 && string[1] <= 0x8F
-                    && TRAIL_OK(string[2])
-                    && TRAIL_OK(string[3])
-                   ) {
-                    valid = true;
-                }
-            }
-        }
-
-        if (!valid) {
-            return false;
-        }
-        string += count;
-    }
-
-    if (string != end) {
-        return false;
-    }
-
-    return true;
-}
-
 static void
 test_overlap(TestBatchRunner *runner) {
     size_t result;
@@ -157,210 +64,41 @@ test_to_base36(TestBatchRunner *runner) {
 }
 
 static void
-test_utf8_round_trip(TestBatchRunner *runner) {
+test_back_utf8_char(TestBatchRunner *runner) {
+    char buffer[4];
+    char *buf = buffer + 1;
+    uint32_t len = Str_encode_utf8_char(0x263A, buffer);
+    char *end = buffer + len;
+    TEST_TRUE(runner, StrHelp_back_utf8_char(end, buffer) == buffer,
+              "back_utf8_char");
+    TEST_TRUE(runner, StrHelp_back_utf8_char(end, buf) == NULL,
+              "back_utf8_char returns NULL rather than back up beyond start");
+    TEST_TRUE(runner, StrHelp_back_utf8_char(buffer, buffer) == NULL,
+              "back_utf8_char returns NULL when end == start");
+
     int32_t code_point;
     for (code_point = 0; code_point <= 0x10FFFF; code_point++) {
-        char buffer[4];
-        uint32_t size = StrHelp_encode_utf8_char(code_point, buffer);
+        uint32_t size = Str_encode_utf8_char(code_point, buffer);
         char *start = buffer;
         char *end   = start + size;
 
-        // Verify length returned by encode_utf8_char().
-        if (size != StrHelp_UTF8_COUNT[(unsigned char)buffer[0]]) {
-            break;
-        }
-        // Verify that utf8_valid() agrees with alternate implementation.
-        if (!!StrHelp_utf8_valid(start, size)
-            != !!S_utf8_valid_alt(start, size)
-           ) {
-            break;
-        }
-
-        // Verify back_utf8_char().
         if (StrHelp_back_utf8_char(end, start) != start) {
             break;
         }
     }
     if (code_point == 0x110000) {
-        PASS(runner, "Successfully round tripped 0 - 0x10FFFF");
+        PASS(runner, "back_utf8_char works for code points 0 - 0x10FFFF");
     }
     else {
-        FAIL(runner, "Failed round trip at 0x%.1X", (unsigned)code_point);
-    }
-}
-
-static void
-S_test_validity(TestBatchRunner *runner, const char *content, size_t size,
-                bool expected, const char *description) {
-    bool sane = StrHelp_utf8_valid(content, size);
-    bool double_check = S_utf8_valid_alt(content, size);
-    if (sane != double_check) {
-        FAIL(runner, "Disagreement: %s", description);
-    }
-    else {
-        TEST_TRUE(runner, sane == expected, "%s", description);
-    }
-}
-
-static void
-test_utf8_valid(TestBatchRunner *runner) {
-    // Musical symbol G clef:
-    // Code point: U+1D11E
-    // UTF-16:     0xD834 0xDD1E
-    // UTF-8       0xF0 0x9D 0x84 0x9E
-    S_test_validity(runner, "\xF0\x9D\x84\x9E", 4, true,
-                    "Musical symbol G clef");
-    S_test_validity(runner, "\xED\xA0\xB4\xED\xB4\x9E", 6, false,
-                    "G clef as UTF-8 encoded UTF-16 surrogates");
-    S_test_validity(runner, ".\xED\xA0\xB4.", 5, false,
-                    "Isolated high surrogate");
-    S_test_validity(runner, ".\xED\xB4\x9E.", 5, false,
-                    "Isolated low surrogate");
-
-    // Shortest form.
-    S_test_validity(runner, ".\xC1\x9C.", 4, false,
-                    "Non-shortest form ASCII backslash");
-    S_test_validity(runner, ".\xC0\xAF.", 4, false,
-                    "Non-shortest form ASCII slash");
-    S_test_validity(runner, ".\xC0\x80.", 4, false,
-                    "Non-shortest form ASCII NUL character");
-    S_test_validity(runner, ".\xE0\x9F\xBF.", 5, false,
-                    "Non-shortest form three byte sequence");
-    S_test_validity(runner, ".\xF0\x8F\xBF\xBF.", 6, false,
-                    "Non-shortest form four byte sequence");
-
-    // Range.
-    S_test_validity(runner, "\xF8\x88\x80\x80\x80", 5, false, "5-byte UTF-8");
-    S_test_validity(runner, "\xF4\x8F\xBF\xBF", 4, true,
-                    "Code point 0x10FFFF");
-    S_test_validity(runner, "\xF4\x90\x80\x80", 4, false,
-                    "Code point 0x110000 too large");
-    S_test_validity(runner, "\xF5\x80\x80\x80", 4, false,
-                    "Sequence starting with 0xF5");
-
-    // Truncated sequences.
-    S_test_validity(runner, "\xC2", 1, false,
-                    "Truncated two byte sequence");
-    S_test_validity(runner, "\xE2\x98", 2, false,
-                    "Truncated three byte sequence");
-    S_test_validity(runner, "\xF0\x9D\x84", 3, false,
-                    "Truncated four byte sequence");
-
-    // Bad continuations.
-    S_test_validity(runner, "\xE2\x98\xBA\xE2\x98\xBA", 6, true,
-                    "SmileySmiley");
-    S_test_validity(runner, "\xE2\xBA\xE2\x98\xBA", 5, false,
-                    "missing first continuation byte");
-    S_test_validity(runner, "\xE2\x98\xE2\x98\xBA", 5, false,
-                    "missing second continuation byte");
-    S_test_validity(runner, "\xE2\xE2\x98\xBA", 4, false,
-                    "missing both continuation bytes");
-    S_test_validity(runner, "\xBA\xE2\x98\xBA\xE2\xBA", 5, false,
-                    "missing first continuation byte (end)");
-    S_test_validity(runner, "\xE2\x98\xBA\xE2\x98", 5, false,
-                    "missing second continuation byte (end)");
-    S_test_validity(runner, "\xE2\x98\xBA\xE2", 4, false,
-                    "missing both continuation bytes (end)");
-    S_test_validity(runner, "\xBA\xE2\x98\xBA", 4, false,
-                    "isolated continuation byte 0xBA");
-    S_test_validity(runner, "\x98\xE2\x98\xBA", 4, false,
-                    "isolated continuation byte 0x98");
-    S_test_validity(runner, "\xE2\x98\xBA\xBA", 4, false,
-                    "isolated continuation byte 0xBA (end)");
-    S_test_validity(runner, "\xE2\x98\xBA\x98", 4, false,
-                    "isolated continuation byte 0x98 (end)");
-    S_test_validity(runner, "\xF0xxxx", 5, false,
-                    "missing continuation byte 2/4");
-    S_test_validity(runner, "\xF0\x9Dxxxx", 5, false,
-                    "missing continuation byte 3/4");
-    S_test_validity(runner, "\xF0\x9D\x84xx", 5, false,
-                    "missing continuation byte 4/4");
-}
-
-static void
-S_validate_utf8(void *context) {
-    const char *text = (const char*)context;
-    StrHelp_validate_utf8(text, strlen(text), "src.c", 17, "fn");
-}
-
-static void
-test_validate_utf8(TestBatchRunner *runner) {
-    {
-        Err *error = Err_trap(S_validate_utf8, "Sigma\xC1\x9C.");
-        TEST_TRUE(runner, error != NULL, "validate_utf8 throws");
-        String *mess = Err_Get_Mess(error);
-        const char *expected = "Invalid UTF-8 after 'Sigma': C1 9C 2E\n";
-        bool ok = Str_Starts_With_Utf8(mess, expected, strlen(expected));
-        TEST_TRUE(runner, ok, "validate_utf8 throws correct error message");
-        DECREF(error);
+        FAIL(runner, "Failed back_utf8_char at 0x%.1X", (unsigned)code_point);
     }
-
-    {
-        Err *error = Err_trap(S_validate_utf8,
-                              "xxx123456789\xE2\x93\xAA"
-                              "1234567890\xC1\x9C.");
-        String *mess = Err_Get_Mess(error);
-        const char *expected =
-            "Invalid UTF-8 after '123456789\xE2\x93\xAA"
-            "1234567890': C1 9C 2E\n";
-        bool ok = Str_Starts_With_Utf8(mess, expected, strlen(expected));
-        TEST_TRUE(runner, ok, "validate_utf8 truncates long prefix");
-        DECREF(error);
-    }
-}
-
-static void
-test_is_whitespace(TestBatchRunner *runner) {
-    TEST_TRUE(runner, StrHelp_is_whitespace(' '), "space is whitespace");
-    TEST_TRUE(runner, StrHelp_is_whitespace('\n'), "newline is whitespace");
-    TEST_TRUE(runner, StrHelp_is_whitespace('\t'), "tab is whitespace");
-    TEST_TRUE(runner, StrHelp_is_whitespace('\v'),
-              "vertical tab is whitespace");
-    TEST_FALSE(runner, StrHelp_is_whitespace('a'), "'a' isn't whitespace");
-    TEST_FALSE(runner, StrHelp_is_whitespace(0), "NULL isn't whitespace");
-    TEST_FALSE(runner, StrHelp_is_whitespace(0x263A),
-               "Smiley isn't whitespace");
-}
-
-static void
-S_encode_utf8_char(void *context) {
-    int32_t *code_point_ptr = (int32_t*)context;
-    char buffer[4];
-    StrHelp_encode_utf8_char(*code_point_ptr, buffer);
-}
-
-static void
-test_encode_utf8_char(TestBatchRunner *runner) {
-    int32_t code_point = 0x110000;
-    Err *error = Err_trap(S_encode_utf8_char, &code_point);
-    TEST_TRUE(runner, error != NULL, "Encode code point 0x110000 throws");
-    DECREF(error);
-}
-
-static void
-test_back_utf8_char(TestBatchRunner *runner) {
-    char buffer[4];
-    char *buf = buffer + 1;
-    uint32_t len = StrHelp_encode_utf8_char(0x263A, buffer);
-    char *end = buffer + len;
-    TEST_TRUE(runner, StrHelp_back_utf8_char(end, buffer) == buffer,
-              "back_utf8_char");
-    TEST_TRUE(runner, StrHelp_back_utf8_char(end, buf) == NULL,
-              "back_utf8_char returns NULL rather than back up beyond start");
-    TEST_TRUE(runner, StrHelp_back_utf8_char(buffer, buffer) == NULL,
-              "back_utf8_char returns NULL when end == start");
 }
 
 void
 TestStrHelp_Run_IMP(TestStringHelper *self, TestBatchRunner *runner) {
-    TestBatchRunner_Plan(runner, (TestBatch*)self, 55);
+    TestBatchRunner_Plan(runner, (TestBatch*)self, 14);
     test_overlap(runner);
     test_to_base36(runner);
-    test_utf8_round_trip(runner);
-    test_utf8_valid(runner);
-    test_validate_utf8(runner);
-    test_is_whitespace(runner);
-    test_encode_utf8_char(runner);
     test_back_utf8_char(runner);
 }

[4/6] lucy-clownfish git commit: Move some functions from StrHelp to Str

Reply via email to