Code points above 0x10FFFF are invalid

Additional test cases will be added when addressing CLOWNFISH-95.

Fixes CLOWNFISH-99.


Project: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/commit/fbaafea1
Tree: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/tree/fbaafea1
Diff: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/diff/fbaafea1

Branch: refs/heads/master
Commit: fbaafea1cce828d79513c6e652ba80a23aa398f2
Parents: 732f750
Author: Nick Wellnhofer <[email protected]>
Authored: Sun May 15 11:42:10 2016 +0200
Committer: Nick Wellnhofer <[email protected]>
Committed: Sun May 15 12:12:12 2016 +0200

----------------------------------------------------------------------
 runtime/core/Clownfish/Util/StringHelper.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/fbaafea1/runtime/core/Clownfish/Util/StringHelper.c
----------------------------------------------------------------------
diff --git a/runtime/core/Clownfish/Util/StringHelper.c b/runtime/core/Clownfish/Util/StringHelper.c
index 5b9de2f..2f33e37 100644
--- a/runtime/core/Clownfish/Util/StringHelper.c
+++ b/runtime/core/Clownfish/Util/StringHelper.c
@@ -43,7 +43,7 @@ const uint8_t cfish_StrHelp_UTF8_COUNT[] = {
     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
-    4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0,
+    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 };
 
 size_t
@@ -101,11 +101,13 @@ S_find_invalid_utf8(const uint8_t *string, size_t size) {
             case 3:
                 if (end - string < 2)           { return start; }
                 if (header_byte == 0xED) {
+                    // Disallow UTF-16 surrogates.
                     if (*string < 0x80 || *string > 0x9F) {
                         return start;
                     }
                 }
                 else if (!(header_byte & 0x0F)) {
+                    // Disallow non-shortest-form.
                     if (!(*string & 0x20)) {
                         return start;
                     }
@@ -116,10 +118,17 @@ S_find_invalid_utf8(const uint8_t *string, size_t size) {
             case 4:
                 if (end - string < 3)           { return start; }
                 if (!(header_byte & 0x07)) {
+                    // Disallow non-shortest-form.
                     if (!(*string & 0x30)) {
                         return start;
                     }
                 }
+                else if (header_byte == 0xF4) {
+                    // Code point larger than 0x10FFFF.
+                    if (*string >= 0x90) {
+                        return start;
+                    }
+                }
                 if ((*string++ & 0xC0) != 0x80) { return start; }
                 if ((*string++ & 0xC0) != 0x80) { return start; }
                 if ((*string++ & 0xC0) != 0x80) { return start; }

Reply via email to