Move some functions from StrHelp to Str: utf8_valid, validate_utf8, is_whitespace, and encode_utf8_char
Project: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/repo Commit: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/commit/ed2010ca Tree: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/tree/ed2010ca Diff: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/diff/ed2010ca Branch: refs/heads/master Commit: ed2010caec0af7ccdcaff76a05cdb516166a6ad4 Parents: 64a1000 Author: Nick Wellnhofer <[email protected]> Authored: Tue Aug 2 18:46:35 2016 +0200 Committer: Nick Wellnhofer <[email protected]> Committed: Tue Aug 2 19:05:14 2016 +0200 ---------------------------------------------------------------------- runtime/core/Clownfish/CharBuf.c | 3 +- runtime/core/Clownfish/String.c | 180 +++++++++++- runtime/core/Clownfish/String.cfh | 33 +++ runtime/core/Clownfish/Util/StringHelper.c | 176 ----------- runtime/core/Clownfish/Util/StringHelper.cfh | 34 --- .../perl/buildlib/Clownfish/Build/Binding.pm | 2 +- runtime/perl/xs/XSBind.c | 1 - runtime/python/cfext/CFBind.c | 3 +- runtime/ruby/ext/Bind.c | 1 - runtime/ruby/ext/Clownfish.c | 1 - runtime/test/Clownfish/Test/TestCharBuf.c | 3 +- runtime/test/Clownfish/Test/TestString.c | 294 ++++++++++++++++++- .../test/Clownfish/Test/Util/TestStringHelper.c | 294 +------------------ 13 files changed, 522 insertions(+), 503 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/core/Clownfish/CharBuf.c ---------------------------------------------------------------------- diff --git a/runtime/core/Clownfish/CharBuf.c b/runtime/core/Clownfish/CharBuf.c index 2dbae91..30f54dd 100644 --- a/runtime/core/Clownfish/CharBuf.c +++ b/runtime/core/Clownfish/CharBuf.c @@ -30,7 +30,6 @@ #include "Clownfish/Err.h" #include "Clownfish/String.h" #include "Clownfish/Util/Memory.h" -#include "Clownfish/Util/StringHelper.h" #include "Clownfish/Class.h" // Append trusted UTF-8 to the CharBuf. 
@@ -290,7 +289,7 @@ CB_Cat_Char_IMP(CharBuf *self, int32_t code_point) { size_t old_size = self->size; SI_add_grow_and_oversize(self, old_size, MAX_UTF8_BYTES); char *end = self->ptr + old_size; - size_t count = StrHelp_encode_utf8_char(code_point, (uint8_t*)end); + size_t count = Str_encode_utf8_char(code_point, (uint8_t*)end); self->size += count; } http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/core/Clownfish/String.c ---------------------------------------------------------------------- diff --git a/runtime/core/Clownfish/String.c b/runtime/core/Clownfish/String.c index 0353ffd..0de7f28 100644 --- a/runtime/core/Clownfish/String.c +++ b/runtime/core/Clownfish/String.c @@ -19,6 +19,7 @@ #define CFISH_USE_SHORT_NAMES #include <string.h> +#include <stdio.h> #include <stdlib.h> #include <ctype.h> @@ -29,7 +30,6 @@ #include "Clownfish/CharBuf.h" #include "Clownfish/Err.h" #include "Clownfish/Util/Memory.h" -#include "Clownfish/Util/StringHelper.h" #define STACK_ITER(string, byte_offset) \ S_new_stack_iter(alloca(sizeof(StringIterator)), string, byte_offset) @@ -40,6 +40,178 @@ S_memmem(String *self, const char *substring, size_t size); static StringIterator* S_new_stack_iter(void *allocation, String *string, size_t byte_offset); +// Return a pointer to the first invalid UTF-8 sequence, or NULL if +// the UTF-8 is valid. +static const uint8_t* +S_find_invalid_utf8(const uint8_t *string, size_t size) { + const uint8_t *const end = string + size; + while (string < end) { + const uint8_t *start = string; + const uint8_t header_byte = *string++; + + if (header_byte < 0x80) { + // ASCII + ; + } + else if (header_byte < 0xE0) { + // Disallow non-shortest-form ASCII and continuation bytes. + if (header_byte < 0xC2) { return start; } + // Two-byte sequence. + if (string == end) { return start; } + if ((*string++ & 0xC0) != 0x80) { return start; } + } + else if (header_byte < 0xF0) { + // Three-byte sequence. 
+ if (end - string < 2) { return start; } + if (header_byte == 0xED) { + // Disallow UTF-16 surrogates. + if (*string < 0x80 || *string > 0x9F) { + return start; + } + } + else if (!(header_byte & 0x0F)) { + // Disallow non-shortest-form. + if (!(*string & 0x20)) { + return start; + } + } + if ((*string++ & 0xC0) != 0x80) { return start; } + if ((*string++ & 0xC0) != 0x80) { return start; } + } + else { + if (header_byte > 0xF4) { return start; } + // Four-byte sequence. + if (end - string < 3) { return start; } + if (!(header_byte & 0x07)) { + // Disallow non-shortest-form. + if (!(*string & 0x30)) { + return start; + } + } + else if (header_byte == 0xF4) { + // Code point larger than 0x10FFFF. + if (*string >= 0x90) { + return start; + } + } + if ((*string++ & 0xC0) != 0x80) { return start; } + if ((*string++ & 0xC0) != 0x80) { return start; } + if ((*string++ & 0xC0) != 0x80) { return start; } + } + } + + return NULL; +} + +bool +Str_utf8_valid(const char *ptr, size_t size) { + return S_find_invalid_utf8((const uint8_t*)ptr, size) == NULL; +} + +void +Str_validate_utf8(const char *ptr, size_t size, const char *file, int line, + const char *func) { + const uint8_t *string = (const uint8_t*)ptr; + const uint8_t *invalid = S_find_invalid_utf8(string, size); + if (invalid == NULL) { return; } + + CharBuf *buf = CB_new(0); + CB_Cat_Trusted_Utf8(buf, "Invalid UTF-8", 13); + + if (invalid > string) { + const uint8_t *prefix = invalid; + size_t num_code_points = 0; + + // Skip up to 20 code points backwards. + while (prefix > string) { + prefix -= 1; + + if ((*prefix & 0xC0) != 0x80) { + num_code_points += 1; + if (num_code_points >= 20) { break; } + } + } + + CB_Cat_Trusted_Utf8(buf, " after '", 8); + CB_Cat_Trusted_Utf8(buf, (const char*)prefix, invalid - prefix); + CB_Cat_Trusted_Utf8(buf, "'", 1); + } + + CB_Cat_Trusted_Utf8(buf, ":", 1); + + // Append offending bytes as hex. 
+ const uint8_t *end = string + size; + const uint8_t *max = invalid + 5; + for (const uint8_t *byte = invalid; byte < end && byte < max; byte++) { + char hex[4]; + sprintf(hex, " %02X", *byte); + CB_Cat_Trusted_Utf8(buf, hex, 3); + } + + String *mess = CB_Yield_String(buf); + DECREF(buf); + + Err *err = Err_new(mess); + Err_Add_Frame(err, file, line, func); + Err_do_throw(err); +} + +bool +Str_is_whitespace(int32_t code_point) { + switch (code_point) { + // <control-0009>..<control-000D> + case 0x0009: case 0x000A: case 0x000B: case 0x000C: case 0x000D: + case 0x0020: // SPACE + case 0x0085: // <control-0085> + case 0x00A0: // NO-BREAK SPACE + case 0x1680: // OGHAM SPACE MARK + // EN QUAD..HAIR SPACE + case 0x2000: case 0x2001: case 0x2002: case 0x2003: case 0x2004: + case 0x2005: case 0x2006: case 0x2007: case 0x2008: case 0x2009: + case 0x200A: + case 0x2028: // LINE SEPARATOR + case 0x2029: // PARAGRAPH SEPARATOR + case 0x202F: // NARROW NO-BREAK SPACE + case 0x205F: // MEDIUM MATHEMATICAL SPACE + case 0x3000: // IDEOGRAPHIC SPACE + return true; + + default: + return false; + } +} + +uint32_t +Str_encode_utf8_char(int32_t code_point, void *buffer) { + uint8_t *buf = (uint8_t*)buffer; + if (code_point <= 0x7F) { // ASCII + buf[0] = (uint8_t)code_point; + return 1; + } + else if (code_point <= 0x07FF) { // 2 byte range + buf[0] = (uint8_t)(0xC0 | (code_point >> 6)); + buf[1] = (uint8_t)(0x80 | (code_point & 0x3f)); + return 2; + } + else if (code_point <= 0xFFFF) { // 3 byte range + buf[0] = (uint8_t)(0xE0 | (code_point >> 12)); + buf[1] = (uint8_t)(0x80 | ((code_point >> 6) & 0x3F)); + buf[2] = (uint8_t)(0x80 | (code_point & 0x3f)); + return 3; + } + else if (code_point <= 0x10FFFF) { // 4 byte range + buf[0] = (uint8_t)(0xF0 | (code_point >> 18)); + buf[1] = (uint8_t)(0x80 | ((code_point >> 12) & 0x3F)); + buf[2] = (uint8_t)(0x80 | ((code_point >> 6) & 0x3F)); + buf[3] = (uint8_t)(0x80 | (code_point & 0x3f)); + return 4; + } + else { + THROW(ERR, "Illegal 
Unicode code point: %u32", code_point); + UNREACHABLE_RETURN(uint32_t); + } +} + String* Str_new_from_utf8(const char *utf8, size_t size) { VALIDATE_UTF8(utf8, size); @@ -122,7 +294,7 @@ String* Str_new_from_char(int32_t code_point) { const size_t MAX_UTF8_BYTES = 4; char *ptr = (char*)MALLOCATE(MAX_UTF8_BYTES + 1); - size_t size = StrHelp_encode_utf8_char(code_point, (uint8_t*)ptr); + size_t size = Str_encode_utf8_char(code_point, (uint8_t*)ptr); ptr[size] = '\0'; String *self = (String*)Class_Make_Obj(STRING); @@ -740,7 +912,7 @@ StrIter_Skip_Whitespace_IMP(StringIterator *self) { int32_t code_point; while (STR_OOB != (code_point = StrIter_Next(self))) { - if (!StrHelp_is_whitespace(code_point)) { break; } + if (!Str_is_whitespace(code_point)) { break; } byte_offset = self->byte_offset; ++num_skipped; } @@ -756,7 +928,7 @@ StrIter_Skip_Whitespace_Back_IMP(StringIterator *self) { int32_t code_point; while (STR_OOB != (code_point = StrIter_Prev(self))) { - if (!StrHelp_is_whitespace(code_point)) { break; } + if (!Str_is_whitespace(code_point)) { break; } byte_offset = self->byte_offset; ++num_skipped; } http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/core/Clownfish/String.cfh ---------------------------------------------------------------------- diff --git a/runtime/core/Clownfish/String.cfh b/runtime/core/Clownfish/String.cfh index 72f60a1..6b3323e 100644 --- a/runtime/core/Clownfish/String.cfh +++ b/runtime/core/Clownfish/String.cfh @@ -24,6 +24,9 @@ __C__ // For CFISH_ALLOCA_OBJ. #include "Clownfish/Class.h" +// For CFISH_ERR_FUNC_MACRO. +#include "Clownfish/Err.h" + __END_C__ /** @@ -37,6 +40,31 @@ public final class Clownfish::String nickname Str size_t size; String *origin; + /** Return true if the string is valid UTF-8, false otherwise. + */ + public inert bool + utf8_valid(const char *ptr, size_t len); + + /** Throws an error if the string isn't valid UTF-8. 
+ */ + public inert void + validate_utf8(const char *text, size_t size, const char *file, int line, + const char *func); + + /** Returns true if the code point qualifies as Unicode whitespace. + */ + public inert bool + is_whitespace(int32_t code_point); + + /** Encode a Unicode code point to a UTF-8 sequence. + * + * @param code_point A legal unicode code point. + * @param buffer Write buffer which must hold at least 4 bytes (the + * maximum legal length for a UTF-8 char). + */ + inert uint32_t + encode_utf8_char(int32_t code_point, void *buffer); + /** Return a String which holds a copy of the supplied UTF-8 character * data after checking for validity. * @@ -506,6 +534,10 @@ public final class Clownfish::StringIterator nickname StrIter __C__ +#define CFISH_VALIDATE_UTF8(text, size) \ + cfish_Str_validate_utf8(text, size, \ + __FILE__, __LINE__, CFISH_ERR_FUNC_MACRO) + #define CFISH_SSTR_BLANK() \ cfish_Str_init_stack_string(CFISH_ALLOCA_OBJ(CFISH_STRING), "", 0) @@ -519,6 +551,7 @@ __C__ #define CFISH_STR_OOB -1 #ifdef CFISH_USE_SHORT_NAMES + #define VALIDATE_UTF8 CFISH_VALIDATE_UTF8 #define SSTR_BLANK CFISH_SSTR_BLANK #define SSTR_WRAP_C CFISH_SSTR_WRAP_C #define SSTR_WRAP_UTF8 CFISH_SSTR_WRAP_UTF8 http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/core/Clownfish/Util/StringHelper.c ---------------------------------------------------------------------- diff --git a/runtime/core/Clownfish/Util/StringHelper.c b/runtime/core/Clownfish/Util/StringHelper.c index 256c9e0..7b8e9d8 100644 --- a/runtime/core/Clownfish/Util/StringHelper.c +++ b/runtime/core/Clownfish/Util/StringHelper.c @@ -15,10 +15,6 @@ */ #define C_CFISH_STRINGHELPER -#include <string.h> -#include <stddef.h> -#include <stdio.h> - #define CFISH_USE_SHORT_NAMES #include "Clownfish/Util/StringHelper.h" @@ -79,178 +75,6 @@ StrHelp_to_base36(uint64_t num, void *buffer) { return size; } -// Return a pointer to the first invalid UTF-8 sequence, or NULL if -// the UTF-8 is valid. 
-static const uint8_t* -S_find_invalid_utf8(const uint8_t *string, size_t size) { - const uint8_t *const end = string + size; - while (string < end) { - const uint8_t *start = string; - const uint8_t header_byte = *string++; - - if (header_byte < 0x80) { - // ASCII - ; - } - else if (header_byte < 0xE0) { - // Disallow non-shortest-form ASCII and continuation bytes. - if (header_byte < 0xC2) { return start; } - // Two-byte sequence. - if (string == end) { return start; } - if ((*string++ & 0xC0) != 0x80) { return start; } - } - else if (header_byte < 0xF0) { - // Three-byte sequence. - if (end - string < 2) { return start; } - if (header_byte == 0xED) { - // Disallow UTF-16 surrogates. - if (*string < 0x80 || *string > 0x9F) { - return start; - } - } - else if (!(header_byte & 0x0F)) { - // Disallow non-shortest-form. - if (!(*string & 0x20)) { - return start; - } - } - if ((*string++ & 0xC0) != 0x80) { return start; } - if ((*string++ & 0xC0) != 0x80) { return start; } - } - else { - if (header_byte > 0xF4) { return start; } - // Four-byte sequence. - if (end - string < 3) { return start; } - if (!(header_byte & 0x07)) { - // Disallow non-shortest-form. - if (!(*string & 0x30)) { - return start; - } - } - else if (header_byte == 0xF4) { - // Code point larger than 0x10FFFF. 
- if (*string >= 0x90) { - return start; - } - } - if ((*string++ & 0xC0) != 0x80) { return start; } - if ((*string++ & 0xC0) != 0x80) { return start; } - if ((*string++ & 0xC0) != 0x80) { return start; } - } - } - - return NULL; -} - -bool -StrHelp_utf8_valid(const char *ptr, size_t size) { - return S_find_invalid_utf8((const uint8_t*)ptr, size) == NULL; -} - -void -StrHelp_validate_utf8(const char *ptr, size_t size, const char *file, - int line, const char *func) { - const uint8_t *string = (const uint8_t*)ptr; - const uint8_t *invalid = S_find_invalid_utf8(string, size); - if (invalid == NULL) { return; } - - CharBuf *buf = CB_new(0); - CB_Cat_Trusted_Utf8(buf, "Invalid UTF-8", 13); - - if (invalid > string) { - const uint8_t *prefix = invalid; - size_t num_code_points = 0; - - // Skip up to 20 code points backwards. - while (prefix > string) { - prefix -= 1; - - if ((*prefix & 0xC0) != 0x80) { - num_code_points += 1; - if (num_code_points >= 20) { break; } - } - } - - CB_Cat_Trusted_Utf8(buf, " after '", 8); - CB_Cat_Trusted_Utf8(buf, (const char*)prefix, invalid - prefix); - CB_Cat_Trusted_Utf8(buf, "'", 1); - } - - CB_Cat_Trusted_Utf8(buf, ":", 1); - - // Append offending bytes as hex. 
- const uint8_t *end = string + size; - const uint8_t *max = invalid + 5; - for (const uint8_t *byte = invalid; byte < end && byte < max; byte++) { - char hex[4]; - sprintf(hex, " %02X", *byte); - CB_Cat_Trusted_Utf8(buf, hex, 3); - } - - String *mess = CB_Yield_String(buf); - DECREF(buf); - - Err *err = Err_new(mess); - Err_Add_Frame(err, file, line, func); - Err_do_throw(err); -} - -bool -StrHelp_is_whitespace(int32_t code_point) { - switch (code_point) { - // <control-0009>..<control-000D> - case 0x0009: case 0x000A: case 0x000B: case 0x000C: case 0x000D: - case 0x0020: // SPACE - case 0x0085: // <control-0085> - case 0x00A0: // NO-BREAK SPACE - case 0x1680: // OGHAM SPACE MARK - // EN QUAD..HAIR SPACE - case 0x2000: case 0x2001: case 0x2002: case 0x2003: case 0x2004: - case 0x2005: case 0x2006: case 0x2007: case 0x2008: case 0x2009: - case 0x200A: - case 0x2028: // LINE SEPARATOR - case 0x2029: // PARAGRAPH SEPARATOR - case 0x202F: // NARROW NO-BREAK SPACE - case 0x205F: // MEDIUM MATHEMATICAL SPACE - case 0x3000: // IDEOGRAPHIC SPACE - return true; - - default: - return false; - } -} - -uint32_t -StrHelp_encode_utf8_char(int32_t code_point, void *buffer) { - uint8_t *buf = (uint8_t*)buffer; - if (code_point <= 0x7F) { // ASCII - buf[0] = (uint8_t)code_point; - return 1; - } - else if (code_point <= 0x07FF) { // 2 byte range - buf[0] = (uint8_t)(0xC0 | (code_point >> 6)); - buf[1] = (uint8_t)(0x80 | (code_point & 0x3f)); - return 2; - } - else if (code_point <= 0xFFFF) { // 3 byte range - buf[0] = (uint8_t)(0xE0 | (code_point >> 12)); - buf[1] = (uint8_t)(0x80 | ((code_point >> 6) & 0x3F)); - buf[2] = (uint8_t)(0x80 | (code_point & 0x3f)); - return 3; - } - else if (code_point <= 0x10FFFF) { // 4 byte range - buf[0] = (uint8_t)(0xF0 | (code_point >> 18)); - buf[1] = (uint8_t)(0x80 | ((code_point >> 12) & 0x3F)); - buf[2] = (uint8_t)(0x80 | ((code_point >> 6) & 0x3F)); - buf[3] = (uint8_t)(0x80 | (code_point & 0x3f)); - return 4; - } - else { - THROW(ERR, 
"Illegal Unicode code point: %u32", code_point); - UNREACHABLE_RETURN(uint32_t); - } -} - const char* StrHelp_back_utf8_char(const char *ptr, const char *start) { while (--ptr >= start) { http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/core/Clownfish/Util/StringHelper.cfh ---------------------------------------------------------------------- diff --git a/runtime/core/Clownfish/Util/StringHelper.cfh b/runtime/core/Clownfish/Util/StringHelper.cfh index 1264bea..1e915e6 100644 --- a/runtime/core/Clownfish/Util/StringHelper.cfh +++ b/runtime/core/Clownfish/Util/StringHelper.cfh @@ -16,10 +16,6 @@ parcel Clownfish; -__C__ -#include "Clownfish/Err.h" -__END_C__ - inert class Clownfish::Util::StringHelper nickname StrHelp { /* A table where the values indicate the number of bytes in a UTF-8 @@ -43,31 +39,6 @@ inert class Clownfish::Util::StringHelper nickname StrHelp { inert size_t to_base36(uint64_t value, void *buffer); - /** Return true if the string is valid UTF-8, false otherwise. - */ - inert bool - utf8_valid(const char *ptr, size_t len); - - /** Throws an error if the string isn't valid UTF-8. - */ - inert void - validate_utf8(const char *text, size_t size, const char *file, int line, - const char *func); - - /** Returns true if the code point qualifies as Unicode whitespace. - */ - inert bool - is_whitespace(int32_t code_point); - - /** Encode a Unicode code point to a UTF-8 sequence. - * - * @param code_point A legal unicode code point. - * @param buffer Write buffer which must hold at least 4 bytes (the - * maximum legal length for a UTF-8 char). - */ - inert uint32_t - encode_utf8_char(int32_t code_point, void *buffer); - /** Return the first non-continuation byte before the supplied pointer. * If backtracking progresses beyond the supplied start, return NULL. 
*/ @@ -76,17 +47,12 @@ inert class Clownfish::Util::StringHelper nickname StrHelp { } __C__ -#define CFISH_VALIDATE_UTF8(text, size) \ - cfish_StrHelp_validate_utf8(text, size, \ - __FILE__, __LINE__, CFISH_ERR_FUNC_MACRO) - /** The maximum number of bytes encoded by to_base36(), including the * terminating NULL. */ #define cfish_StrHelp_MAX_BASE36_BYTES 14 #ifdef CFISH_USE_SHORT_NAMES #define StrHelp_MAX_BASE36_BYTES cfish_StrHelp_MAX_BASE36_BYTES - #define VALIDATE_UTF8 CFISH_VALIDATE_UTF8 #endif __END_C__ http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/perl/buildlib/Clownfish/Build/Binding.pm ---------------------------------------------------------------------- diff --git a/runtime/perl/buildlib/Clownfish/Build/Binding.pm b/runtime/perl/buildlib/Clownfish/Build/Binding.pm index 71b0ff8..ecc83b5 100644 --- a/runtime/perl/buildlib/Clownfish/Build/Binding.pm +++ b/runtime/perl/buildlib/Clownfish/Build/Binding.pm @@ -1010,7 +1010,7 @@ CODE: { STRLEN len; char *ptr = SvPV(sv, len); - RETVAL = cfish_StrHelp_utf8_valid(ptr, len); + RETVAL = cfish_Str_utf8_valid(ptr, len); } OUTPUT: RETVAL http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/perl/xs/XSBind.c ---------------------------------------------------------------------- diff --git a/runtime/perl/xs/XSBind.c b/runtime/perl/xs/XSBind.c index ab9ee82..b566f8d 100644 --- a/runtime/perl/xs/XSBind.c +++ b/runtime/perl/xs/XSBind.c @@ -33,7 +33,6 @@ #include "Clownfish/PtrHash.h" #include "Clownfish/TestHarness/TestUtils.h" #include "Clownfish/Util/Atomic.h" -#include "Clownfish/Util/StringHelper.h" #include "Clownfish/Util/Memory.h" #define XSBIND_REFCOUNT_FLAG 1 http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/python/cfext/CFBind.c ---------------------------------------------------------------------- diff --git a/runtime/python/cfext/CFBind.c b/runtime/python/cfext/CFBind.c index 0703880..536cb1d 100644 --- a/runtime/python/cfext/CFBind.c 
+++ b/runtime/python/cfext/CFBind.c @@ -39,7 +39,6 @@ #include "Clownfish/TestHarness/TestUtils.h" #include "Clownfish/Util/Atomic.h" #include "Clownfish/Util/Memory.h" -#include "Clownfish/Util/StringHelper.h" #include "Clownfish/Vector.h" static bool Err_initialized; @@ -195,7 +194,7 @@ S_maybe_py_to_cfish(PyObject *py_obj, cfish_Class *klass, bool increment, Py_ssize_t size; char *ptr = PyUnicode_AsUTF8AndSize(py_obj, &size); // TODO: Can we guarantee that Python will always supply valid UTF-8? - if (!ptr || !cfish_StrHelp_utf8_valid(ptr, size)) { + if (!ptr || !cfish_Str_utf8_valid(ptr, size)) { return false; } *obj_ptr = (cfish_Obj*)cfish_Str_new_from_trusted_utf8(ptr, size); http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/ruby/ext/Bind.c ---------------------------------------------------------------------- diff --git a/runtime/ruby/ext/Bind.c b/runtime/ruby/ext/Bind.c index a12b1e1..70c0a9e 100644 --- a/runtime/ruby/ext/Bind.c +++ b/runtime/ruby/ext/Bind.c @@ -16,7 +16,6 @@ #include "ruby.h" #include "Bind.h" -#include "Clownfish/Util/StringHelper.h" VALUE Bind_cfish_to_ruby(cfish_Obj *obj) { http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/ruby/ext/Clownfish.c ---------------------------------------------------------------------- diff --git a/runtime/ruby/ext/Clownfish.c b/runtime/ruby/ext/Clownfish.c index 972d2db..8993ad6 100644 --- a/runtime/ruby/ext/Clownfish.c +++ b/runtime/ruby/ext/Clownfish.c @@ -17,7 +17,6 @@ #include "ruby.h" #include "Clownfish/Util/Memory.h" -#include "Clownfish/Util/StringHelper.h" #include "Clownfish/String.h" #include "Clownfish/Test/TestCharBuf.h" #include "Clownfish/Test.h" http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/test/Clownfish/Test/TestCharBuf.c ---------------------------------------------------------------------- diff --git a/runtime/test/Clownfish/Test/TestCharBuf.c b/runtime/test/Clownfish/Test/TestCharBuf.c index 0782ce2..9cf5bbf 
100644 --- a/runtime/test/Clownfish/Test/TestCharBuf.c +++ b/runtime/test/Clownfish/Test/TestCharBuf.c @@ -32,7 +32,6 @@ #include "Clownfish/Test.h" #include "Clownfish/TestHarness/TestBatchRunner.h" #include "Clownfish/TestHarness/TestUtils.h" -#include "Clownfish/Util/StringHelper.h" #include "Clownfish/Class.h" static char smiley[] = { (char)0xE2, (char)0x98, (char)0xBA, 0 }; @@ -116,7 +115,7 @@ test_roundtrip(TestBatchRunner *runner) { size_t size = Str_Get_Size(str); // Verify that utf8_valid agrees. - if (!StrHelp_utf8_valid(start, size)) { + if (!Str_utf8_valid(start, size)) { break; } http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/test/Clownfish/Test/TestString.c ---------------------------------------------------------------------- diff --git a/runtime/test/Clownfish/Test/TestString.c b/runtime/test/Clownfish/Test/TestString.c index d89b5fe..d557546 100644 --- a/runtime/test/Clownfish/Test/TestString.c +++ b/runtime/test/Clownfish/Test/TestString.c @@ -38,6 +38,25 @@ static char smiley[] = { (char)0xE2, (char)0x98, (char)0xBA, 0 }; static uint32_t smiley_len = 3; static int32_t smiley_cp = 0x263A; +static const uint8_t UTF8_COUNT[256] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; 
+ TestString* TestStr_new() { return (TestString*)Class_Make_Obj(TESTSTRING); @@ -71,6 +90,274 @@ S_smiley_with_whitespace(size_t *num_spaces_ptr) { return retval; } +/* This alternative implementation of utf8_valid() is (presumably) slower, but + * it implements the standard in a more linear, easy-to-grok way. + */ +#define TRAIL_OK(n) (n >= 0x80 && n <= 0xBF) +static bool +S_utf8_valid_alt(const char *maybe_utf8, size_t size) { + const uint8_t *string = (const uint8_t*)maybe_utf8; + const uint8_t *const end = string + size; + while (string < end) { + int count = UTF8_COUNT[*string]; + bool valid = false; + if (count == 1) { + if (string[0] <= 0x7F) { + valid = true; + } + } + else if (count == 2) { + if (string[0] >= 0xC2 && string[0] <= 0xDF) { + if (TRAIL_OK(string[1])) { + valid = true; + } + } + } + else if (count == 3) { + if (string[0] == 0xE0) { + if (string[1] >= 0xA0 && string[1] <= 0xBF + && TRAIL_OK(string[2]) + ) { + valid = true; + } + } + else if (string[0] >= 0xE1 && string[0] <= 0xEC) { + if (TRAIL_OK(string[1]) + && TRAIL_OK(string[2]) + ) { + valid = true; + } + } + else if (string[0] == 0xED) { + if (string[1] >= 0x80 && string[1] <= 0x9F + && TRAIL_OK(string[2]) + ) { + valid = true; + } + } + else if (string[0] >= 0xEE && string[0] <= 0xEF) { + if (TRAIL_OK(string[1]) + && TRAIL_OK(string[2]) + ) { + valid = true; + } + } + } + else if (count == 4) { + if (string[0] == 0xF0) { + if (string[1] >= 0x90 && string[1] <= 0xBF + && TRAIL_OK(string[2]) + && TRAIL_OK(string[3]) + ) { + valid = true; + } + } + else if (string[0] >= 0xF1 && string[0] <= 0xF3) { + if (TRAIL_OK(string[1]) + && TRAIL_OK(string[2]) + && TRAIL_OK(string[3]) + ) { + valid = true; + } + } + else if (string[0] == 0xF4) { + if (string[1] >= 0x80 && string[1] <= 0x8F + && TRAIL_OK(string[2]) + && TRAIL_OK(string[3]) + ) { + valid = true; + } + } + } + + if (!valid) { + return false; + } + string += count; + } + + if (string != end) { + return false; + } + + return true; +} + 
+static void +test_all_code_points(TestBatchRunner *runner) { + int32_t code_point; + for (code_point = 0; code_point <= 0x10FFFF; code_point++) { + char buffer[4]; + uint32_t size = Str_encode_utf8_char(code_point, buffer); + char *start = buffer; + + // Verify length returned by encode_utf8_char(). + if (size != UTF8_COUNT[(unsigned char)buffer[0]]) { + break; + } + // Verify that utf8_valid() agrees with alternate implementation. + if (!!Str_utf8_valid(start, size) + != !!S_utf8_valid_alt(start, size) + ) { + break; + } + } + if (code_point == 0x110000) { + PASS(runner, "Successfully round tripped 0 - 0x10FFFF"); + } + else { + FAIL(runner, "Failed round trip at 0x%.1X", (unsigned)code_point); + } +} + +static void +S_test_validity(TestBatchRunner *runner, const char *content, size_t size, + bool expected, const char *description) { + bool sane = Str_utf8_valid(content, size); + bool double_check = S_utf8_valid_alt(content, size); + if (sane != double_check) { + FAIL(runner, "Disagreement: %s", description); + } + else { + TEST_TRUE(runner, sane == expected, "%s", description); + } +} + +static void +test_utf8_valid(TestBatchRunner *runner) { + // Musical symbol G clef: + // Code point: U+1D11E + // UTF-16: 0xD834 0xDD1E + // UTF-8 0xF0 0x9D 0x84 0x9E + S_test_validity(runner, "\xF0\x9D\x84\x9E", 4, true, + "Musical symbol G clef"); + S_test_validity(runner, "\xED\xA0\xB4\xED\xB4\x9E", 6, false, + "G clef as UTF-8 encoded UTF-16 surrogates"); + S_test_validity(runner, ".\xED\xA0\xB4.", 5, false, + "Isolated high surrogate"); + S_test_validity(runner, ".\xED\xB4\x9E.", 5, false, + "Isolated low surrogate"); + + // Shortest form. 
+ S_test_validity(runner, ".\xC1\x9C.", 4, false, + "Non-shortest form ASCII backslash"); + S_test_validity(runner, ".\xC0\xAF.", 4, false, + "Non-shortest form ASCII slash"); + S_test_validity(runner, ".\xC0\x80.", 4, false, + "Non-shortest form ASCII NUL character"); + S_test_validity(runner, ".\xE0\x9F\xBF.", 5, false, + "Non-shortest form three byte sequence"); + S_test_validity(runner, ".\xF0\x8F\xBF\xBF.", 6, false, + "Non-shortest form four byte sequence"); + + // Range. + S_test_validity(runner, "\xF8\x88\x80\x80\x80", 5, false, "5-byte UTF-8"); + S_test_validity(runner, "\xF4\x8F\xBF\xBF", 4, true, + "Code point 0x10FFFF"); + S_test_validity(runner, "\xF4\x90\x80\x80", 4, false, + "Code point 0x110000 too large"); + S_test_validity(runner, "\xF5\x80\x80\x80", 4, false, + "Sequence starting with 0xF5"); + + // Truncated sequences. + S_test_validity(runner, "\xC2", 1, false, + "Truncated two byte sequence"); + S_test_validity(runner, "\xE2\x98", 2, false, + "Truncated three byte sequence"); + S_test_validity(runner, "\xF0\x9D\x84", 3, false, + "Truncated four byte sequence"); + + // Bad continuations. 
+ S_test_validity(runner, "\xE2\x98\xBA\xE2\x98\xBA", 6, true, + "SmileySmiley"); + S_test_validity(runner, "\xE2\xBA\xE2\x98\xBA", 5, false, + "missing first continuation byte"); + S_test_validity(runner, "\xE2\x98\xE2\x98\xBA", 5, false, + "missing second continuation byte"); + S_test_validity(runner, "\xE2\xE2\x98\xBA", 4, false, + "missing both continuation bytes"); + S_test_validity(runner, "\xBA\xE2\x98\xBA\xE2\xBA", 5, false, + "missing first continuation byte (end)"); + S_test_validity(runner, "\xE2\x98\xBA\xE2\x98", 5, false, + "missing second continuation byte (end)"); + S_test_validity(runner, "\xE2\x98\xBA\xE2", 4, false, + "missing both continuation bytes (end)"); + S_test_validity(runner, "\xBA\xE2\x98\xBA", 4, false, + "isolated continuation byte 0xBA"); + S_test_validity(runner, "\x98\xE2\x98\xBA", 4, false, + "isolated continuation byte 0x98"); + S_test_validity(runner, "\xE2\x98\xBA\xBA", 4, false, + "isolated continuation byte 0xBA (end)"); + S_test_validity(runner, "\xE2\x98\xBA\x98", 4, false, + "isolated continuation byte 0x98 (end)"); + S_test_validity(runner, "\xF0xxxx", 5, false, + "missing continuation byte 2/4"); + S_test_validity(runner, "\xF0\x9Dxxxx", 5, false, + "missing continuation byte 3/4"); + S_test_validity(runner, "\xF0\x9D\x84xx", 5, false, + "missing continuation byte 4/4"); +} + +static void +S_validate_utf8(void *context) { + const char *text = (const char*)context; + Str_validate_utf8(text, strlen(text), "src.c", 17, "fn"); +} + +static void +test_validate_utf8(TestBatchRunner *runner) { + { + Err *error = Err_trap(S_validate_utf8, "Sigma\xC1\x9C."); + TEST_TRUE(runner, error != NULL, "validate_utf8 throws"); + String *mess = Err_Get_Mess(error); + const char *expected = "Invalid UTF-8 after 'Sigma': C1 9C 2E\n"; + bool ok = Str_Starts_With_Utf8(mess, expected, strlen(expected)); + TEST_TRUE(runner, ok, "validate_utf8 throws correct error message"); + DECREF(error); + } + + { + Err *error = Err_trap(S_validate_utf8, + 
"xxx123456789\xE2\x93\xAA" + "1234567890\xC1\x9C."); + String *mess = Err_Get_Mess(error); + const char *expected = + "Invalid UTF-8 after '123456789\xE2\x93\xAA" + "1234567890': C1 9C 2E\n"; + bool ok = Str_Starts_With_Utf8(mess, expected, strlen(expected)); + TEST_TRUE(runner, ok, "validate_utf8 truncates long prefix"); + DECREF(error); + } +} + +static void +test_is_whitespace(TestBatchRunner *runner) { + TEST_TRUE(runner, Str_is_whitespace(' '), "space is whitespace"); + TEST_TRUE(runner, Str_is_whitespace('\n'), "newline is whitespace"); + TEST_TRUE(runner, Str_is_whitespace('\t'), "tab is whitespace"); + TEST_TRUE(runner, Str_is_whitespace('\v'), + "vertical tab is whitespace"); + TEST_FALSE(runner, Str_is_whitespace('a'), "'a' isn't whitespace"); + TEST_FALSE(runner, Str_is_whitespace(0), "NULL isn't whitespace"); + TEST_FALSE(runner, Str_is_whitespace(0x263A), + "Smiley isn't whitespace"); +} + +static void +S_encode_utf8_char(void *context) { + int32_t *code_point_ptr = (int32_t*)context; + char buffer[4]; + Str_encode_utf8_char(*code_point_ptr, buffer); +} + +static void +test_encode_utf8_char(TestBatchRunner *runner) { + int32_t code_point = 0x110000; + Err *error = Err_trap(S_encode_utf8_char, &code_point); + TEST_TRUE(runner, error != NULL, "Encode code point 0x110000 throws"); + DECREF(error); +} + static void test_new(TestBatchRunner *runner) { static char chars[] = "A string " SMILEY " with a smile."; @@ -813,7 +1100,12 @@ test_iterator_substring(TestBatchRunner *runner) { void TestStr_Run_IMP(TestString *self, TestBatchRunner *runner) { - TestBatchRunner_Plan(runner, (TestBatch*)self, 158); + TestBatchRunner_Plan(runner, (TestBatch*)self, 200); + test_all_code_points(runner); + test_utf8_valid(runner); + test_validate_utf8(runner); + test_is_whitespace(runner); + test_encode_utf8_char(runner); test_new(runner); test_Cat(runner); test_Clone(runner); 
http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/ed2010ca/runtime/test/Clownfish/Test/Util/TestStringHelper.c ---------------------------------------------------------------------- diff --git a/runtime/test/Clownfish/Test/Util/TestStringHelper.c b/runtime/test/Clownfish/Test/Util/TestStringHelper.c index d009a58..2caee84 100644 --- a/runtime/test/Clownfish/Test/Util/TestStringHelper.c +++ b/runtime/test/Clownfish/Test/Util/TestStringHelper.c @@ -28,104 +28,11 @@ #include "Clownfish/Util/StringHelper.h" #include "Clownfish/Class.h" -/* This alternative implementation of utf8_valid() is (presumably) slower, but - * it implements the standard in a more linear, easy-to-grok way. - */ -#define TRAIL_OK(n) (n >= 0x80 && n <= 0xBF) TestStringHelper* TestStrHelp_new() { return (TestStringHelper*)Class_Make_Obj(TESTSTRINGHELPER); } -static bool -S_utf8_valid_alt(const char *maybe_utf8, size_t size) { - const uint8_t *string = (const uint8_t*)maybe_utf8; - const uint8_t *const end = string + size; - while (string < end) { - int count = StrHelp_UTF8_COUNT[*string]; - bool valid = false; - if (count == 1) { - if (string[0] <= 0x7F) { - valid = true; - } - } - else if (count == 2) { - if (string[0] >= 0xC2 && string[0] <= 0xDF) { - if (TRAIL_OK(string[1])) { - valid = true; - } - } - } - else if (count == 3) { - if (string[0] == 0xE0) { - if (string[1] >= 0xA0 && string[1] <= 0xBF - && TRAIL_OK(string[2]) - ) { - valid = true; - } - } - else if (string[0] >= 0xE1 && string[0] <= 0xEC) { - if (TRAIL_OK(string[1]) - && TRAIL_OK(string[2]) - ) { - valid = true; - } - } - else if (string[0] == 0xED) { - if (string[1] >= 0x80 && string[1] <= 0x9F - && TRAIL_OK(string[2]) - ) { - valid = true; - } - } - else if (string[0] >= 0xEE && string[0] <= 0xEF) { - if (TRAIL_OK(string[1]) - && TRAIL_OK(string[2]) - ) { - valid = true; - } - } - } - else if (count == 4) { - if (string[0] == 0xF0) { - if (string[1] >= 0x90 && string[1] <= 0xBF - && TRAIL_OK(string[2]) - && 
TRAIL_OK(string[3]) - ) { - valid = true; - } - } - else if (string[0] >= 0xF1 && string[0] <= 0xF3) { - if (TRAIL_OK(string[1]) - && TRAIL_OK(string[2]) - && TRAIL_OK(string[3]) - ) { - valid = true; - } - } - else if (string[0] == 0xF4) { - if (string[1] >= 0x80 && string[1] <= 0x8F - && TRAIL_OK(string[2]) - && TRAIL_OK(string[3]) - ) { - valid = true; - } - } - } - - if (!valid) { - return false; - } - string += count; - } - - if (string != end) { - return false; - } - - return true; -} - static void test_overlap(TestBatchRunner *runner) { size_t result; @@ -157,210 +64,41 @@ test_to_base36(TestBatchRunner *runner) { } static void -test_utf8_round_trip(TestBatchRunner *runner) { +test_back_utf8_char(TestBatchRunner *runner) { + char buffer[4]; + char *buf = buffer + 1; + uint32_t len = Str_encode_utf8_char(0x263A, buffer); + char *end = buffer + len; + TEST_TRUE(runner, StrHelp_back_utf8_char(end, buffer) == buffer, + "back_utf8_char"); + TEST_TRUE(runner, StrHelp_back_utf8_char(end, buf) == NULL, + "back_utf8_char returns NULL rather than back up beyond start"); + TEST_TRUE(runner, StrHelp_back_utf8_char(buffer, buffer) == NULL, + "back_utf8_char returns NULL when end == start"); + int32_t code_point; for (code_point = 0; code_point <= 0x10FFFF; code_point++) { - char buffer[4]; - uint32_t size = StrHelp_encode_utf8_char(code_point, buffer); + uint32_t size = Str_encode_utf8_char(code_point, buffer); char *start = buffer; char *end = start + size; - // Verify length returned by encode_utf8_char(). - if (size != StrHelp_UTF8_COUNT[(unsigned char)buffer[0]]) { - break; - } - // Verify that utf8_valid() agrees with alternate implementation. - if (!!StrHelp_utf8_valid(start, size) - != !!S_utf8_valid_alt(start, size) - ) { - break; - } - - // Verify back_utf8_char(). 
if (StrHelp_back_utf8_char(end, start) != start) { break; } } if (code_point == 0x110000) { - PASS(runner, "Successfully round tripped 0 - 0x10FFFF"); + PASS(runner, "back_utf8_char works for code points 0 - 0x10FFFF"); } else { - FAIL(runner, "Failed round trip at 0x%.1X", (unsigned)code_point); - } -} - -static void -S_test_validity(TestBatchRunner *runner, const char *content, size_t size, - bool expected, const char *description) { - bool sane = StrHelp_utf8_valid(content, size); - bool double_check = S_utf8_valid_alt(content, size); - if (sane != double_check) { - FAIL(runner, "Disagreement: %s", description); - } - else { - TEST_TRUE(runner, sane == expected, "%s", description); - } -} - -static void -test_utf8_valid(TestBatchRunner *runner) { - // Musical symbol G clef: - // Code point: U+1D11E - // UTF-16: 0xD834 0xDD1E - // UTF-8 0xF0 0x9D 0x84 0x9E - S_test_validity(runner, "\xF0\x9D\x84\x9E", 4, true, - "Musical symbol G clef"); - S_test_validity(runner, "\xED\xA0\xB4\xED\xB4\x9E", 6, false, - "G clef as UTF-8 encoded UTF-16 surrogates"); - S_test_validity(runner, ".\xED\xA0\xB4.", 5, false, - "Isolated high surrogate"); - S_test_validity(runner, ".\xED\xB4\x9E.", 5, false, - "Isolated low surrogate"); - - // Shortest form. - S_test_validity(runner, ".\xC1\x9C.", 4, false, - "Non-shortest form ASCII backslash"); - S_test_validity(runner, ".\xC0\xAF.", 4, false, - "Non-shortest form ASCII slash"); - S_test_validity(runner, ".\xC0\x80.", 4, false, - "Non-shortest form ASCII NUL character"); - S_test_validity(runner, ".\xE0\x9F\xBF.", 5, false, - "Non-shortest form three byte sequence"); - S_test_validity(runner, ".\xF0\x8F\xBF\xBF.", 6, false, - "Non-shortest form four byte sequence"); - - // Range. 
- S_test_validity(runner, "\xF8\x88\x80\x80\x80", 5, false, "5-byte UTF-8"); - S_test_validity(runner, "\xF4\x8F\xBF\xBF", 4, true, - "Code point 0x10FFFF"); - S_test_validity(runner, "\xF4\x90\x80\x80", 4, false, - "Code point 0x110000 too large"); - S_test_validity(runner, "\xF5\x80\x80\x80", 4, false, - "Sequence starting with 0xF5"); - - // Truncated sequences. - S_test_validity(runner, "\xC2", 1, false, - "Truncated two byte sequence"); - S_test_validity(runner, "\xE2\x98", 2, false, - "Truncated three byte sequence"); - S_test_validity(runner, "\xF0\x9D\x84", 3, false, - "Truncated four byte sequence"); - - // Bad continuations. - S_test_validity(runner, "\xE2\x98\xBA\xE2\x98\xBA", 6, true, - "SmileySmiley"); - S_test_validity(runner, "\xE2\xBA\xE2\x98\xBA", 5, false, - "missing first continuation byte"); - S_test_validity(runner, "\xE2\x98\xE2\x98\xBA", 5, false, - "missing second continuation byte"); - S_test_validity(runner, "\xE2\xE2\x98\xBA", 4, false, - "missing both continuation bytes"); - S_test_validity(runner, "\xBA\xE2\x98\xBA\xE2\xBA", 5, false, - "missing first continuation byte (end)"); - S_test_validity(runner, "\xE2\x98\xBA\xE2\x98", 5, false, - "missing second continuation byte (end)"); - S_test_validity(runner, "\xE2\x98\xBA\xE2", 4, false, - "missing both continuation bytes (end)"); - S_test_validity(runner, "\xBA\xE2\x98\xBA", 4, false, - "isolated continuation byte 0xBA"); - S_test_validity(runner, "\x98\xE2\x98\xBA", 4, false, - "isolated continuation byte 0x98"); - S_test_validity(runner, "\xE2\x98\xBA\xBA", 4, false, - "isolated continuation byte 0xBA (end)"); - S_test_validity(runner, "\xE2\x98\xBA\x98", 4, false, - "isolated continuation byte 0x98 (end)"); - S_test_validity(runner, "\xF0xxxx", 5, false, - "missing continuation byte 2/4"); - S_test_validity(runner, "\xF0\x9Dxxxx", 5, false, - "missing continuation byte 3/4"); - S_test_validity(runner, "\xF0\x9D\x84xx", 5, false, - "missing continuation byte 4/4"); -} - -static void 
-S_validate_utf8(void *context) { - const char *text = (const char*)context; - StrHelp_validate_utf8(text, strlen(text), "src.c", 17, "fn"); -} - -static void -test_validate_utf8(TestBatchRunner *runner) { - { - Err *error = Err_trap(S_validate_utf8, "Sigma\xC1\x9C."); - TEST_TRUE(runner, error != NULL, "validate_utf8 throws"); - String *mess = Err_Get_Mess(error); - const char *expected = "Invalid UTF-8 after 'Sigma': C1 9C 2E\n"; - bool ok = Str_Starts_With_Utf8(mess, expected, strlen(expected)); - TEST_TRUE(runner, ok, "validate_utf8 throws correct error message"); - DECREF(error); + FAIL(runner, "Failed back_utf8_char at 0x%.1X", (unsigned)code_point); } - - { - Err *error = Err_trap(S_validate_utf8, - "xxx123456789\xE2\x93\xAA" - "1234567890\xC1\x9C."); - String *mess = Err_Get_Mess(error); - const char *expected = - "Invalid UTF-8 after '123456789\xE2\x93\xAA" - "1234567890': C1 9C 2E\n"; - bool ok = Str_Starts_With_Utf8(mess, expected, strlen(expected)); - TEST_TRUE(runner, ok, "validate_utf8 truncates long prefix"); - DECREF(error); - } -} - -static void -test_is_whitespace(TestBatchRunner *runner) { - TEST_TRUE(runner, StrHelp_is_whitespace(' '), "space is whitespace"); - TEST_TRUE(runner, StrHelp_is_whitespace('\n'), "newline is whitespace"); - TEST_TRUE(runner, StrHelp_is_whitespace('\t'), "tab is whitespace"); - TEST_TRUE(runner, StrHelp_is_whitespace('\v'), - "vertical tab is whitespace"); - TEST_FALSE(runner, StrHelp_is_whitespace('a'), "'a' isn't whitespace"); - TEST_FALSE(runner, StrHelp_is_whitespace(0), "NULL isn't whitespace"); - TEST_FALSE(runner, StrHelp_is_whitespace(0x263A), - "Smiley isn't whitespace"); -} - -static void -S_encode_utf8_char(void *context) { - int32_t *code_point_ptr = (int32_t*)context; - char buffer[4]; - StrHelp_encode_utf8_char(*code_point_ptr, buffer); -} - -static void -test_encode_utf8_char(TestBatchRunner *runner) { - int32_t code_point = 0x110000; - Err *error = Err_trap(S_encode_utf8_char, &code_point); - 
TEST_TRUE(runner, error != NULL, "Encode code point 0x110000 throws"); - DECREF(error); -} - -static void -test_back_utf8_char(TestBatchRunner *runner) { - char buffer[4]; - char *buf = buffer + 1; - uint32_t len = StrHelp_encode_utf8_char(0x263A, buffer); - char *end = buffer + len; - TEST_TRUE(runner, StrHelp_back_utf8_char(end, buffer) == buffer, - "back_utf8_char"); - TEST_TRUE(runner, StrHelp_back_utf8_char(end, buf) == NULL, - "back_utf8_char returns NULL rather than back up beyond start"); - TEST_TRUE(runner, StrHelp_back_utf8_char(buffer, buffer) == NULL, - "back_utf8_char returns NULL when end == start"); } void TestStrHelp_Run_IMP(TestStringHelper *self, TestBatchRunner *runner) { - TestBatchRunner_Plan(runner, (TestBatch*)self, 55); + TestBatchRunner_Plan(runner, (TestBatch*)self, 14); test_overlap(runner); test_to_base36(runner); - test_utf8_round_trip(runner); - test_utf8_valid(runner); - test_validate_utf8(runner); - test_is_whitespace(runner); - test_encode_utf8_char(runner); test_back_utf8_char(runner); }
