Improve error message in case of invalid UTF-8 Add a new string helper StrHelp_validate_utf8 and a corresponding macro that validates a UTF-8 string and throws an error if invalid UTF-8 is encountered. The error message shows where the invalid UTF-8 was found and prints the offending bytes as hex.
Obsoletes the old hack that wrote an error message directly to stderr. First part of CLOWNFISH-97. Project: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/repo Commit: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/commit/cb0bb2ca Tree: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/tree/cb0bb2ca Diff: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/diff/cb0bb2ca Branch: refs/heads/master Commit: cb0bb2ca3f77158c5620e4db0ec227f5719582e7 Parents: 946d25a Author: Nick Wellnhofer <[email protected]> Authored: Thu May 12 15:46:17 2016 +0200 Committer: Nick Wellnhofer <[email protected]> Committed: Fri May 13 15:55:08 2016 +0200 ---------------------------------------------------------------------- runtime/core/Clownfish/CharBuf.c | 24 +---- runtime/core/Clownfish/String.c | 37 +------- .../core/Clownfish/Test/Util/TestStringHelper.c | 33 ++++++- runtime/core/Clownfish/Util/StringHelper.c | 94 ++++++++++++++++---- runtime/core/Clownfish/Util/StringHelper.cfh | 15 ++++ 5 files changed, 128 insertions(+), 75 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/cb0bb2ca/runtime/core/Clownfish/CharBuf.c ---------------------------------------------------------------------- diff --git a/runtime/core/Clownfish/CharBuf.c b/runtime/core/Clownfish/CharBuf.c index 5589079..eb4f92f 100644 --- a/runtime/core/Clownfish/CharBuf.c +++ b/runtime/core/Clownfish/CharBuf.c @@ -57,16 +57,6 @@ S_grow_and_oversize(CharBuf *self, size_t min_size); static void S_overflow_error(); -// Helper function for throwing invalid UTF-8 error. Since THROW uses -// a String internally, calling THROW with invalid UTF-8 would create an -// infinite loop -- so we fwrite some of the bogus text to stderr and -// invoke THROW with a generic message. -#define DIE_INVALID_UTF8(text, size) \ - S_die_invalid_utf8(text, size, __FILE__, __LINE__, CFISH_ERR_FUNC_MACRO) -static void -S_die_invalid_utf8(const char *text, size_t size, const char *file, int line, - const char *func); - // Helper function for throwing invalid pattern error. static void S_die_invalid_pattern(const char *pattern); @@ -104,16 +94,6 @@ CB_Grow_IMP(CharBuf *self, size_t size) { } static void -S_die_invalid_utf8(const char *text, size_t size, const char *file, int line, - const char *func) { - fprintf(stderr, "Invalid UTF-8, aborting: '"); - fwrite(text, sizeof(char), size < 200 ? size : 200, stderr); - if (size > 200) { fwrite("[...]", sizeof(char), 5, stderr); } - fprintf(stderr, "' (length %lu)\n", (unsigned long)size); - Err_throw_at(ERR, file, line, func, "Invalid UTF-8"); -} - -static void S_die_invalid_pattern(const char *pattern) { size_t pattern_len = strlen(pattern); fprintf(stderr, "Invalid pattern, aborting: '"); @@ -310,9 +290,7 @@ CB_Clone_IMP(CharBuf *self) { void CB_Cat_Utf8_IMP(CharBuf *self, const char* ptr, size_t size) { - if (!StrHelp_utf8_valid(ptr, size)) { - DIE_INVALID_UTF8(ptr, size); - } + VALIDATE_UTF8(ptr, size); SI_cat_utf8(self, ptr, size); } http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/cb0bb2ca/runtime/core/Clownfish/String.c ---------------------------------------------------------------------- diff --git a/runtime/core/Clownfish/String.c b/runtime/core/Clownfish/String.c index d30a04a..767a34e 100644 --- a/runtime/core/Clownfish/String.c +++ b/runtime/core/Clownfish/String.c @@ -19,7 +19,6 @@ #define CFISH_USE_SHORT_NAMES #include <string.h> -#include <stdio.h> #include <stdlib.h> #include <ctype.h> @@ -35,16 +34,6 @@ #define STACK_ITER(string, byte_offset) \ S_new_stack_iter(alloca(sizeof(StringIterator)), string, byte_offset) -// Helper function for throwing invalid UTF-8 error. Since THROW uses -// a String internally, calling THROW with invalid UTF-8 would create an -// infinite loop -- so we fwrite some of the bogus text to stderr and -// invoke THROW with a generic message. -#define DIE_INVALID_UTF8(text, size) \ - S_die_invalid_utf8(text, size, __FILE__, __LINE__, CFISH_ERR_FUNC_MACRO) -static void -S_die_invalid_utf8(const char *text, size_t size, const char *file, int line, - const char *func); - static const char* S_memmem(String *self, const char *substring, size_t size); @@ -53,9 +42,7 @@ S_new_stack_iter(void *allocation, String *string, size_t byte_offset); String* Str_new_from_utf8(const char *utf8, size_t size) { - if (!StrHelp_utf8_valid(utf8, size)) { - DIE_INVALID_UTF8(utf8, size); - } + VALIDATE_UTF8(utf8, size); String *self = (String*)Class_Make_Obj(STRING); return Str_init_from_trusted_utf8(self, utf8, size); } @@ -85,9 +72,7 @@ Str_init_from_trusted_utf8(String *self, const char *utf8, size_t size) { String* Str_new_steal_utf8(char *utf8, size_t size) { - if (!StrHelp_utf8_valid(utf8, size)) { - DIE_INVALID_UTF8(utf8, size); - } + VALIDATE_UTF8(utf8, size); String *self = (String*)Class_Make_Obj(STRING); return Str_init_steal_trusted_utf8(self, utf8, size); } @@ -108,9 +93,7 @@ Str_init_steal_trusted_utf8(String *self, char *utf8, size_t size) { String* Str_new_wrap_utf8(const char *utf8, size_t size) { - if (!StrHelp_utf8_valid(utf8, size)) { - DIE_INVALID_UTF8(utf8, size); - } + VALIDATE_UTF8(utf8, size); String *self = (String*)Class_Make_Obj(STRING); return Str_init_wrap_trusted_utf8(self, utf8, size); } @@ -208,16 +191,6 @@ Str_Hash_Sum_IMP(String *self) { return hashvalue; } -static void -S_die_invalid_utf8(const char *text, size_t size, const char *file, int line, - const char *func) { - fprintf(stderr, "Invalid UTF-8, aborting: '"); - fwrite(text, sizeof(char), size < 200 ? size : 200, stderr); - if (size > 200) { fwrite("[...]", sizeof(char), 5, stderr); } - fprintf(stderr, "' (length %lu)\n", (unsigned long)size); - Err_throw_at(ERR, file, line, func, "Invalid UTF-8"); -} - String* Str_To_String_IMP(String *self) { return (String*)INCREF(self); @@ -297,9 +270,7 @@ Str_Cat_IMP(String *self, String *other) { String* Str_Cat_Utf8_IMP(String *self, const char* ptr, size_t size) { - if (!StrHelp_utf8_valid(ptr, size)) { - DIE_INVALID_UTF8(ptr, size); - } + VALIDATE_UTF8(ptr, size); return Str_Cat_Trusted_Utf8(self, ptr, size); } http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/cb0bb2ca/runtime/core/Clownfish/Test/Util/TestStringHelper.c ---------------------------------------------------------------------- diff --git a/runtime/core/Clownfish/Test/Util/TestStringHelper.c b/runtime/core/Clownfish/Test/Util/TestStringHelper.c index 64529d4..03895b6 100644 --- a/runtime/core/Clownfish/Test/Util/TestStringHelper.c +++ b/runtime/core/Clownfish/Test/Util/TestStringHelper.c @@ -257,6 +257,36 @@ test_utf8_valid(TestBatchRunner *runner) { } static void +S_validate_utf8(void *context) { + const char *text = (const char*)context; + StrHelp_validate_utf8(text, strlen(text), "src.c", 17, "fn"); +} + +static void +test_validate_utf8(TestBatchRunner *runner) { + { + Err *error = Err_trap(S_validate_utf8, "Sigma\xC1\x9C."); + TEST_TRUE(runner, error != NULL, "validate_utf8 throws"); + String *mess = Err_Get_Mess(error); + const char *expected = "Invalid UTF-8 after 'Sigma': C1 9C 2E\n"; + bool ok = Str_Starts_With_Utf8(mess, expected, strlen(expected)); + TEST_TRUE(runner, ok, "validate_utf8 throws correct error message"); + } + + { + Err *error = Err_trap(S_validate_utf8, + "xxx123456789\xE2\x93\xAA" + "1234567890\xC1\x9C."); + String *mess = Err_Get_Mess(error); + const char *expected = + "Invalid UTF-8 after '123456789\xE2\x93\xAA" + "1234567890': C1 9C 2E\n"; + bool ok = Str_Starts_With_Utf8(mess, expected, strlen(expected)); + TEST_TRUE(runner, ok, "validate_utf8 truncates long prefix"); + } +} + +static void test_is_whitespace(TestBatchRunner *runner) { TEST_TRUE(runner, StrHelp_is_whitespace(' '), "space is whitespace"); TEST_TRUE(runner, StrHelp_is_whitespace('\n'), "newline is whitespace"); @@ -285,11 +315,12 @@ test_back_utf8_char(TestBatchRunner *runner) { void TestStrHelp_Run_IMP(TestStringHelper *self, TestBatchRunner *runner) { - TestBatchRunner_Plan(runner, (TestBatch*)self, 39); + TestBatchRunner_Plan(runner, (TestBatch*)self, 42); test_overlap(runner); test_to_base36(runner); test_utf8_round_trip(runner); test_utf8_valid(runner); + test_validate_utf8(runner); test_is_whitespace(runner); test_back_utf8_char(runner); } http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/cb0bb2ca/runtime/core/Clownfish/Util/StringHelper.c ---------------------------------------------------------------------- diff --git a/runtime/core/Clownfish/Util/StringHelper.c b/runtime/core/Clownfish/Util/StringHelper.c index 59418b2..5b9de2f 100644 --- a/runtime/core/Clownfish/Util/StringHelper.c +++ b/runtime/core/Clownfish/Util/StringHelper.c @@ -17,11 +17,14 @@ #define C_CFISH_STRINGHELPER #include <string.h> #include <stddef.h> +#include <stdio.h> #define CFISH_USE_SHORT_NAMES #include "Clownfish/Util/StringHelper.h" +#include "Clownfish/CharBuf.h" #include "Clownfish/Err.h" +#include "Clownfish/String.h" #include "Clownfish/Util/Memory.h" const uint8_t cfish_StrHelp_UTF8_COUNT[] = { @@ -76,11 +79,13 @@ StrHelp_to_base36(uint64_t num, void *buffer) { return size; } -bool -StrHelp_utf8_valid(const char *ptr, size_t size) { - const uint8_t *string = (const uint8_t*)ptr; +// Return a pointer to the first invalid UTF-8 sequence, or NULL if +// the UTF-8 is valid. +static const uint8_t* +S_find_invalid_utf8(const uint8_t *string, size_t size) { const uint8_t *const end = string + size; while (string < end) { + const uint8_t *start = string; const uint8_t header_byte = *string++; int count = StrHelp_UTF8_COUNT[header_byte] & 0x7; switch (count & 0x7) { @@ -88,43 +93,96 @@ StrHelp_utf8_valid(const char *ptr, size_t size) { // ASCII break; case 2: - if (string == end) { return false; } + if (string == end) { return start; } // Disallow non-shortest-form ASCII. - if (!(header_byte & 0x1E)) { return false; } - if ((*string++ & 0xC0) != 0x80) { return false; } + if (!(header_byte & 0x1E)) { return start; } + if ((*string++ & 0xC0) != 0x80) { return start; } break; case 3: - if (end - string < 2) { return false; } + if (end - string < 2) { return start; } if (header_byte == 0xED) { if (*string < 0x80 || *string > 0x9F) { - return false; + return start; } } else if (!(header_byte & 0x0F)) { if (!(*string & 0x20)) { - return false; + return start; } } - if ((*string++ & 0xC0) != 0x80) { return false; } - if ((*string++ & 0xC0) != 0x80) { return false; } + if ((*string++ & 0xC0) != 0x80) { return start; } + if ((*string++ & 0xC0) != 0x80) { return start; } break; case 4: - if (end - string < 3) { return false; } + if (end - string < 3) { return start; } if (!(header_byte & 0x07)) { if (!(*string & 0x30)) { - return false; + return start; } } - if ((*string++ & 0xC0) != 0x80) { return false; } - if ((*string++ & 0xC0) != 0x80) { return false; } - if ((*string++ & 0xC0) != 0x80) { return false; } + if ((*string++ & 0xC0) != 0x80) { return start; } + if ((*string++ & 0xC0) != 0x80) { return start; } + if ((*string++ & 0xC0) != 0x80) { return start; } break; default: - return false; + return start; } } - return true; + return NULL; +} + +bool +StrHelp_utf8_valid(const char *ptr, size_t size) { + return S_find_invalid_utf8((const uint8_t*)ptr, size) == NULL; +} + +void +StrHelp_validate_utf8(const char *ptr, size_t size, const char *file, + int line, const char *func) { + const uint8_t *string = (const uint8_t*)ptr; + const uint8_t *invalid = S_find_invalid_utf8(string, size); + if (invalid == NULL) { return; } + + CharBuf *buf = CB_new(0); + CB_Cat_Trusted_Utf8(buf, "Invalid UTF-8", 13); + + if (invalid > string) { + const uint8_t *prefix = invalid; + size_t num_code_points = 0; + + // Skip up to 20 code points backwards. + while (prefix > string) { + prefix -= 1; + + if ((*prefix & 0xC0) != 0x80) { + num_code_points += 1; + if (num_code_points >= 20) { break; } + } + } + + CB_Cat_Trusted_Utf8(buf, " after '", 8); + CB_Cat_Trusted_Utf8(buf, (const char*)prefix, invalid - prefix); + CB_Cat_Trusted_Utf8(buf, "'", 1); + } + + CB_Cat_Trusted_Utf8(buf, ":", 1); + + // Append offending bytes as hex. + const uint8_t *end = string + size; + const uint8_t *max = invalid + 5; + for (const uint8_t *byte = invalid; byte < end && byte < max; byte++) { + char hex[4]; + sprintf(hex, " %02X", *byte); + CB_Cat_Trusted_Utf8(buf, hex, 3); + } + + String *mess = CB_Yield_String(buf); + DECREF(buf); + + Err *err = Err_new(mess); + Err_Add_Frame(err, file, line, func); + Err_do_throw(err); } bool http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/cb0bb2ca/runtime/core/Clownfish/Util/StringHelper.cfh ---------------------------------------------------------------------- diff --git a/runtime/core/Clownfish/Util/StringHelper.cfh b/runtime/core/Clownfish/Util/StringHelper.cfh index 90fbbd5..796aa2d 100644 --- a/runtime/core/Clownfish/Util/StringHelper.cfh +++ b/runtime/core/Clownfish/Util/StringHelper.cfh @@ -16,6 +16,10 @@ parcel Clownfish; +__C__ +#include "Clownfish/Err.h" +__END_C__ + inert class Clownfish::Util::StringHelper nickname StrHelp { /* A table where the values indicate the number of bytes in a UTF-8 @@ -44,6 +48,12 @@ inert class Clownfish::Util::StringHelper nickname StrHelp { inert bool utf8_valid(const char *ptr, size_t len); + /** Throws an error if the string isn't valid UTF-8. + */ + inert void + validate_utf8(const char *text, size_t size, const char *file, int line, + const char *func); + /** Returns true if the code point qualifies as Unicode whitespace. */ inert bool @@ -71,12 +81,17 @@ inert class Clownfish::Util::StringHelper nickname StrHelp { } __C__ +#define CFISH_VALIDATE_UTF8(text, size) \ + cfish_StrHelp_validate_utf8(text, size, \ + __FILE__, __LINE__, CFISH_ERR_FUNC_MACRO) + /** The maximum number of bytes encoded by to_base36(), including the * terminating NULL. */ #define cfish_StrHelp_MAX_BASE36_BYTES 14 #ifdef CFISH_USE_SHORT_NAMES #define StrHelp_MAX_BASE36_BYTES cfish_StrHelp_MAX_BASE36_BYTES + #define VALIDATE_UTF8 CFISH_VALIDATE_UTF8 #endif __END_C__
