Improve error message in case of invalid UTF-8

Add a new string helper StrHelp_validate_utf8 and a corresponding
macro that validates a UTF-8 string and throws an error if invalid
UTF-8 is encountered. The error message shows where the invalid UTF-8
was found and prints the offending bytes as hex.

Obsoletes the old hack that wrote an error message directly to stderr.

First part of CLOWNFISH-97.


Project: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/commit/cb0bb2ca
Tree: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/tree/cb0bb2ca
Diff: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/diff/cb0bb2ca

Branch: refs/heads/master
Commit: cb0bb2ca3f77158c5620e4db0ec227f5719582e7
Parents: 946d25a
Author: Nick Wellnhofer <[email protected]>
Authored: Thu May 12 15:46:17 2016 +0200
Committer: Nick Wellnhofer <[email protected]>
Committed: Fri May 13 15:55:08 2016 +0200

----------------------------------------------------------------------
 runtime/core/Clownfish/CharBuf.c                | 24 +----
 runtime/core/Clownfish/String.c                 | 37 +-------
 .../core/Clownfish/Test/Util/TestStringHelper.c | 33 ++++++-
 runtime/core/Clownfish/Util/StringHelper.c      | 94 ++++++++++++++++----
 runtime/core/Clownfish/Util/StringHelper.cfh    | 15 ++++
 5 files changed, 128 insertions(+), 75 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/cb0bb2ca/runtime/core/Clownfish/CharBuf.c
----------------------------------------------------------------------
diff --git a/runtime/core/Clownfish/CharBuf.c b/runtime/core/Clownfish/CharBuf.c
index 5589079..eb4f92f 100644
--- a/runtime/core/Clownfish/CharBuf.c
+++ b/runtime/core/Clownfish/CharBuf.c
@@ -57,16 +57,6 @@ S_grow_and_oversize(CharBuf *self, size_t min_size);
 static void
 S_overflow_error();
 
-// Helper function for throwing invalid UTF-8 error. Since THROW uses
-// a String internally, calling THROW with invalid UTF-8 would create an
-// infinite loop -- so we fwrite some of the bogus text to stderr and
-// invoke THROW with a generic message.
-#define DIE_INVALID_UTF8(text, size) \
-    S_die_invalid_utf8(text, size, __FILE__, __LINE__, CFISH_ERR_FUNC_MACRO)
-static void
-S_die_invalid_utf8(const char *text, size_t size, const char *file, int line,
-                   const char *func);
-
 // Helper function for throwing invalid pattern error.
 static void
 S_die_invalid_pattern(const char *pattern);
@@ -104,16 +94,6 @@ CB_Grow_IMP(CharBuf *self, size_t size) {
 }
 
 static void
-S_die_invalid_utf8(const char *text, size_t size, const char *file, int line,
-                   const char *func) {
-    fprintf(stderr, "Invalid UTF-8, aborting: '");
-    fwrite(text, sizeof(char), size < 200 ? size : 200, stderr);
-    if (size > 200) { fwrite("[...]", sizeof(char), 5, stderr); }
-    fprintf(stderr, "' (length %lu)\n", (unsigned long)size);
-    Err_throw_at(ERR, file, line, func, "Invalid UTF-8");
-}
-
-static void
 S_die_invalid_pattern(const char *pattern) {
     size_t  pattern_len = strlen(pattern);
     fprintf(stderr, "Invalid pattern, aborting: '");
@@ -310,9 +290,7 @@ CB_Clone_IMP(CharBuf *self) {
 
 void
 CB_Cat_Utf8_IMP(CharBuf *self, const char* ptr, size_t size) {
-    if (!StrHelp_utf8_valid(ptr, size)) {
-        DIE_INVALID_UTF8(ptr, size);
-    }
+    VALIDATE_UTF8(ptr, size);
     SI_cat_utf8(self, ptr, size);
 }
 

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/cb0bb2ca/runtime/core/Clownfish/String.c
----------------------------------------------------------------------
diff --git a/runtime/core/Clownfish/String.c b/runtime/core/Clownfish/String.c
index d30a04a..767a34e 100644
--- a/runtime/core/Clownfish/String.c
+++ b/runtime/core/Clownfish/String.c
@@ -19,7 +19,6 @@
 #define CFISH_USE_SHORT_NAMES
 
 #include <string.h>
-#include <stdio.h>
 #include <stdlib.h>
 #include <ctype.h>
 
@@ -35,16 +34,6 @@
 #define STACK_ITER(string, byte_offset) \
     S_new_stack_iter(alloca(sizeof(StringIterator)), string, byte_offset)
 
-// Helper function for throwing invalid UTF-8 error. Since THROW uses
-// a String internally, calling THROW with invalid UTF-8 would create an
-// infinite loop -- so we fwrite some of the bogus text to stderr and
-// invoke THROW with a generic message.
-#define DIE_INVALID_UTF8(text, size) \
-    S_die_invalid_utf8(text, size, __FILE__, __LINE__, CFISH_ERR_FUNC_MACRO)
-static void
-S_die_invalid_utf8(const char *text, size_t size, const char *file, int line,
-                   const char *func);
-
 static const char*
 S_memmem(String *self, const char *substring, size_t size);
 
@@ -53,9 +42,7 @@ S_new_stack_iter(void *allocation, String *string, size_t 
byte_offset);
 
 String*
 Str_new_from_utf8(const char *utf8, size_t size) {
-    if (!StrHelp_utf8_valid(utf8, size)) {
-        DIE_INVALID_UTF8(utf8, size);
-    }
+    VALIDATE_UTF8(utf8, size);
     String *self = (String*)Class_Make_Obj(STRING);
     return Str_init_from_trusted_utf8(self, utf8, size);
 }
@@ -85,9 +72,7 @@ Str_init_from_trusted_utf8(String *self, const char *utf8, 
size_t size) {
 
 String*
 Str_new_steal_utf8(char *utf8, size_t size) {
-    if (!StrHelp_utf8_valid(utf8, size)) {
-        DIE_INVALID_UTF8(utf8, size);
-    }
+    VALIDATE_UTF8(utf8, size);
     String *self = (String*)Class_Make_Obj(STRING);
     return Str_init_steal_trusted_utf8(self, utf8, size);
 }
@@ -108,9 +93,7 @@ Str_init_steal_trusted_utf8(String *self, char *utf8, size_t 
size) {
 
 String*
 Str_new_wrap_utf8(const char *utf8, size_t size) {
-    if (!StrHelp_utf8_valid(utf8, size)) {
-        DIE_INVALID_UTF8(utf8, size);
-    }
+    VALIDATE_UTF8(utf8, size);
     String *self = (String*)Class_Make_Obj(STRING);
     return Str_init_wrap_trusted_utf8(self, utf8, size);
 }
@@ -208,16 +191,6 @@ Str_Hash_Sum_IMP(String *self) {
     return hashvalue;
 }
 
-static void
-S_die_invalid_utf8(const char *text, size_t size, const char *file, int line,
-                   const char *func) {
-    fprintf(stderr, "Invalid UTF-8, aborting: '");
-    fwrite(text, sizeof(char), size < 200 ? size : 200, stderr);
-    if (size > 200) { fwrite("[...]", sizeof(char), 5, stderr); }
-    fprintf(stderr, "' (length %lu)\n", (unsigned long)size);
-    Err_throw_at(ERR, file, line, func, "Invalid UTF-8");
-}
-
 String*
 Str_To_String_IMP(String *self) {
     return (String*)INCREF(self);
@@ -297,9 +270,7 @@ Str_Cat_IMP(String *self, String *other) {
 
 String*
 Str_Cat_Utf8_IMP(String *self, const char* ptr, size_t size) {
-    if (!StrHelp_utf8_valid(ptr, size)) {
-        DIE_INVALID_UTF8(ptr, size);
-    }
+    VALIDATE_UTF8(ptr, size);
     return Str_Cat_Trusted_Utf8(self, ptr, size);
 }
 

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/cb0bb2ca/runtime/core/Clownfish/Test/Util/TestStringHelper.c
----------------------------------------------------------------------
diff --git a/runtime/core/Clownfish/Test/Util/TestStringHelper.c 
b/runtime/core/Clownfish/Test/Util/TestStringHelper.c
index 64529d4..03895b6 100644
--- a/runtime/core/Clownfish/Test/Util/TestStringHelper.c
+++ b/runtime/core/Clownfish/Test/Util/TestStringHelper.c
@@ -257,6 +257,36 @@ test_utf8_valid(TestBatchRunner *runner) {
 }
 
 static void
+S_validate_utf8(void *context) {
+    const char *text = (const char*)context;
+    StrHelp_validate_utf8(text, strlen(text), "src.c", 17, "fn");
+}
+
+static void
+test_validate_utf8(TestBatchRunner *runner) {
+    {
+        Err *error = Err_trap(S_validate_utf8, "Sigma\xC1\x9C.");
+        TEST_TRUE(runner, error != NULL, "validate_utf8 throws");
+        String *mess = Err_Get_Mess(error);
+        const char *expected = "Invalid UTF-8 after 'Sigma': C1 9C 2E\n";
+        bool ok = Str_Starts_With_Utf8(mess, expected, strlen(expected));
+        TEST_TRUE(runner, ok, "validate_utf8 throws correct error message");
+    }
+
+    {
+        Err *error = Err_trap(S_validate_utf8,
+                              "xxx123456789\xE2\x93\xAA"
+                              "1234567890\xC1\x9C.");
+        String *mess = Err_Get_Mess(error);
+        const char *expected =
+            "Invalid UTF-8 after '123456789\xE2\x93\xAA"
+            "1234567890': C1 9C 2E\n";
+        bool ok = Str_Starts_With_Utf8(mess, expected, strlen(expected));
+        TEST_TRUE(runner, ok, "validate_utf8 truncates long prefix");
+    }
+}
+
+static void
 test_is_whitespace(TestBatchRunner *runner) {
     TEST_TRUE(runner, StrHelp_is_whitespace(' '), "space is whitespace");
     TEST_TRUE(runner, StrHelp_is_whitespace('\n'), "newline is whitespace");
@@ -285,11 +315,12 @@ test_back_utf8_char(TestBatchRunner *runner) {
 
 void
 TestStrHelp_Run_IMP(TestStringHelper *self, TestBatchRunner *runner) {
-    TestBatchRunner_Plan(runner, (TestBatch*)self, 39);
+    TestBatchRunner_Plan(runner, (TestBatch*)self, 42);
     test_overlap(runner);
     test_to_base36(runner);
     test_utf8_round_trip(runner);
     test_utf8_valid(runner);
+    test_validate_utf8(runner);
     test_is_whitespace(runner);
     test_back_utf8_char(runner);
 }

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/cb0bb2ca/runtime/core/Clownfish/Util/StringHelper.c
----------------------------------------------------------------------
diff --git a/runtime/core/Clownfish/Util/StringHelper.c 
b/runtime/core/Clownfish/Util/StringHelper.c
index 59418b2..5b9de2f 100644
--- a/runtime/core/Clownfish/Util/StringHelper.c
+++ b/runtime/core/Clownfish/Util/StringHelper.c
@@ -17,11 +17,14 @@
 #define C_CFISH_STRINGHELPER
 #include <string.h>
 #include <stddef.h>
+#include <stdio.h>
 
 #define CFISH_USE_SHORT_NAMES
 
 #include "Clownfish/Util/StringHelper.h"
+#include "Clownfish/CharBuf.h"
 #include "Clownfish/Err.h"
+#include "Clownfish/String.h"
 #include "Clownfish/Util/Memory.h"
 
 const uint8_t cfish_StrHelp_UTF8_COUNT[] = {
@@ -76,11 +79,13 @@ StrHelp_to_base36(uint64_t num, void *buffer) {
     return size;
 }
 
-bool
-StrHelp_utf8_valid(const char *ptr, size_t size) {
-    const uint8_t *string    = (const uint8_t*)ptr;
+// Return a pointer to the first invalid UTF-8 sequence, or NULL if
+// the UTF-8 is valid.
+static const uint8_t*
+S_find_invalid_utf8(const uint8_t *string, size_t size) {
     const uint8_t *const end = string + size;
     while (string < end) {
+        const uint8_t *start = string;
         const uint8_t header_byte = *string++;
         int count = StrHelp_UTF8_COUNT[header_byte] & 0x7;
         switch (count & 0x7) {
@@ -88,43 +93,96 @@ StrHelp_utf8_valid(const char *ptr, size_t size) {
                 // ASCII
                 break;
             case 2:
-                if (string == end)              { return false; }
+                if (string == end)              { return start; }
                 // Disallow non-shortest-form ASCII.
-                if (!(header_byte & 0x1E))      { return false; }
-                if ((*string++ & 0xC0) != 0x80) { return false; }
+                if (!(header_byte & 0x1E))      { return start; }
+                if ((*string++ & 0xC0) != 0x80) { return start; }
                 break;
             case 3:
-                if (end - string < 2)           { return false; }
+                if (end - string < 2)           { return start; }
                 if (header_byte == 0xED) {
                     if (*string < 0x80 || *string > 0x9F) {
-                        return false;
+                        return start;
                     }
                 }
                 else if (!(header_byte & 0x0F)) {
                     if (!(*string & 0x20)) {
-                        return false;
+                        return start;
                     }
                 }
-                if ((*string++ & 0xC0) != 0x80) { return false; }
-                if ((*string++ & 0xC0) != 0x80) { return false; }
+                if ((*string++ & 0xC0) != 0x80) { return start; }
+                if ((*string++ & 0xC0) != 0x80) { return start; }
                 break;
             case 4:
-                if (end - string < 3)           { return false; }
+                if (end - string < 3)           { return start; }
                 if (!(header_byte & 0x07)) {
                     if (!(*string & 0x30)) {
-                        return false;
+                        return start;
                     }
                 }
-                if ((*string++ & 0xC0) != 0x80) { return false; }
-                if ((*string++ & 0xC0) != 0x80) { return false; }
-                if ((*string++ & 0xC0) != 0x80) { return false; }
+                if ((*string++ & 0xC0) != 0x80) { return start; }
+                if ((*string++ & 0xC0) != 0x80) { return start; }
+                if ((*string++ & 0xC0) != 0x80) { return start; }
                 break;
             default:
-                return false;
+                return start;
         }
     }
 
-    return true;
+    return NULL;
+}
+
+bool
+StrHelp_utf8_valid(const char *ptr, size_t size) {
+    return S_find_invalid_utf8((const uint8_t*)ptr, size) == NULL;
+}
+
+void
+StrHelp_validate_utf8(const char *ptr, size_t size, const char *file,
+                      int line, const char *func) {
+    const uint8_t *string  = (const uint8_t*)ptr;
+    const uint8_t *invalid = S_find_invalid_utf8(string, size);
+    if (invalid == NULL) { return; }
+
+    CharBuf *buf = CB_new(0);
+    CB_Cat_Trusted_Utf8(buf, "Invalid UTF-8", 13);
+
+    if (invalid > string) {
+        const uint8_t *prefix = invalid;
+        size_t num_code_points = 0;
+
+        // Skip up to 20 code points backwards.
+        while (prefix > string) {
+            prefix -= 1;
+
+            if ((*prefix & 0xC0) != 0x80) {
+                num_code_points += 1;
+                if (num_code_points >= 20) { break; }
+            }
+        }
+
+        CB_Cat_Trusted_Utf8(buf, " after '", 8);
+        CB_Cat_Trusted_Utf8(buf, (const char*)prefix, invalid - prefix);
+        CB_Cat_Trusted_Utf8(buf, "'", 1);
+    }
+
+    CB_Cat_Trusted_Utf8(buf, ":", 1);
+
+    // Append offending bytes as hex.
+    const uint8_t *end = string + size;
+    const uint8_t *max = invalid + 5;
+    for (const uint8_t *byte = invalid; byte < end && byte < max; byte++) {
+        char hex[4];
+        sprintf(hex, " %02X", *byte);
+        CB_Cat_Trusted_Utf8(buf, hex, 3);
+    }
+
+    String *mess = CB_Yield_String(buf);
+    DECREF(buf);
+
+    Err *err = Err_new(mess);
+    Err_Add_Frame(err, file, line, func);
+    Err_do_throw(err);
 }
 
 bool

http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/cb0bb2ca/runtime/core/Clownfish/Util/StringHelper.cfh
----------------------------------------------------------------------
diff --git a/runtime/core/Clownfish/Util/StringHelper.cfh 
b/runtime/core/Clownfish/Util/StringHelper.cfh
index 90fbbd5..796aa2d 100644
--- a/runtime/core/Clownfish/Util/StringHelper.cfh
+++ b/runtime/core/Clownfish/Util/StringHelper.cfh
@@ -16,6 +16,10 @@
 
 parcel Clownfish;
 
+__C__
+#include "Clownfish/Err.h"
+__END_C__
+
 inert class Clownfish::Util::StringHelper nickname StrHelp {
 
     /* A table where the values indicate the number of bytes in a UTF-8
@@ -44,6 +48,12 @@ inert class Clownfish::Util::StringHelper nickname StrHelp {
     inert bool
     utf8_valid(const char *ptr, size_t len);
 
+    /** Throws an error if the string isn't valid UTF-8.
+     */
+    inert void
+    validate_utf8(const char *text, size_t size, const char *file, int line,
+                  const char *func);
+
     /** Returns true if the code point qualifies as Unicode whitespace.
      */
     inert bool
@@ -71,12 +81,17 @@ inert class Clownfish::Util::StringHelper nickname StrHelp {
 }
 
 __C__
+#define CFISH_VALIDATE_UTF8(text, size) \
+    cfish_StrHelp_validate_utf8(text, size, \
+                                __FILE__, __LINE__, CFISH_ERR_FUNC_MACRO)
+
 /** The maximum number of bytes encoded by to_base36(), including the
  * terminating NULL.
  */
 #define cfish_StrHelp_MAX_BASE36_BYTES 14
 #ifdef CFISH_USE_SHORT_NAMES
   #define StrHelp_MAX_BASE36_BYTES cfish_StrHelp_MAX_BASE36_BYTES
+  #define VALIDATE_UTF8            CFISH_VALIDATE_UTF8
 #endif
 __END_C__
 

Reply via email to