Code points above 0x10FFFF are invalid. Additional test cases will be added when addressing CLOWNFISH-95.
Fixes CLOWNFISH-99. Project: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/repo Commit: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/commit/fbaafea1 Tree: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/tree/fbaafea1 Diff: http://git-wip-us.apache.org/repos/asf/lucy-clownfish/diff/fbaafea1 Branch: refs/heads/master Commit: fbaafea1cce828d79513c6e652ba80a23aa398f2 Parents: 732f750 Author: Nick Wellnhofer <[email protected]> Authored: Sun May 15 11:42:10 2016 +0200 Committer: Nick Wellnhofer <[email protected]> Committed: Sun May 15 12:12:12 2016 +0200 ---------------------------------------------------------------------- runtime/core/Clownfish/Util/StringHelper.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucy-clownfish/blob/fbaafea1/runtime/core/Clownfish/Util/StringHelper.c ---------------------------------------------------------------------- diff --git a/runtime/core/Clownfish/Util/StringHelper.c b/runtime/core/Clownfish/Util/StringHelper.c index 5b9de2f..2f33e37 100644 --- a/runtime/core/Clownfish/Util/StringHelper.c +++ b/runtime/core/Clownfish/Util/StringHelper.c @@ -43,7 +43,7 @@ const uint8_t cfish_StrHelp_UTF8_COUNT[] = { 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, + 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; size_t @@ -101,11 +101,13 @@ S_find_invalid_utf8(const uint8_t *string, size_t size) { case 3: if (end - string < 2) { return start; } if (header_byte == 0xED) { + // Disallow UTF-16 surrogates. if (*string < 0x80 || *string > 0x9F) { return start; } } else if (!(header_byte & 0x0F)) { + // Disallow non-shortest-form. 
if (!(*string & 0x20)) { return start; } @@ -116,10 +118,17 @@ S_find_invalid_utf8(const uint8_t *string, size_t size) { case 4: if (end - string < 3) { return start; } if (!(header_byte & 0x07)) { + // Disallow non-shortest-form. if (!(*string & 0x30)) { return start; } } + else if (header_byte == 0xF4) { + // Code point larger than 0x10FFFF. + if (*string >= 0x90) { + return start; + } + } if ((*string++ & 0xC0) != 0x80) { return start; } if ((*string++ & 0xC0) != 0x80) { return start; } if ((*string++ & 0xC0) != 0x80) { return start; }
