Changeset: fe8055a86502 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=fe8055a86502
Modified Files:
	gdk/gdk_string.c
Branch: Apr2019
Log Message:
Check that strings are properly encoded UTF-8 before allowing them in string heap. diffs (102 lines): diff --git a/gdk/gdk_string.c b/gdk/gdk_string.c --- a/gdk/gdk_string.c +++ b/gdk/gdk_string.c @@ -258,6 +258,41 @@ strPut(Heap *h, var_t *dst, const char * } /* the string was not found in the heap, we need to enter it */ + if (v[0] != '\200' || v[1] != '\0') { + /* check that string is correctly encoded UTF-8; there + * was no need to do this earlier: if the string was + * found above, it must have gone through here in the + * past */ + int nutf8 = 0; + int m = 0; + for (size_t i = 0; v[i]; i++) { + if (nutf8 > 0) { + if ((v[i] & 0xC0) != 0x80 || + (m != 0 && (v[i] & m) == 0)) { + badutf8: + GDKerror("strPut: incorrectly encoded UTF-8"); + return 0; + } + m = 0; + nutf8--; + } else if ((v[i] & 0xE0) == 0xC0) { + nutf8 = 1; + if ((v[i] & 0x1E) == 0) + goto badutf8; + } else if ((v[i] & 0xF0) == 0xE0) { + nutf8 = 2; + if ((v[i] & 0x0F) == 0) + m = 0x20; + } else if ((v[i] & 0xF8) == 0xF0) { + nutf8 = 3; + if ((v[i] & 0x07) == 0) + m = 0x30; + } else if ((v[i] & 0x80) != 0) { + goto badutf8; + } + } + } + pad = GDK_VARALIGN - (h->free & (GDK_VARALIGN - 1)); if (elimbase == 0) { /* i.e. 
h->free < GDK_ELIMLIMIT */ if (pad < sizeof(stridx_t)) { @@ -310,56 +345,6 @@ strPut(Heap *h, var_t *dst, const char * /* insert string */ pos = h->free + pad + extralen; *dst = (var_t) pos; -#ifndef NDEBUG - /* just before inserting into the heap, make sure that the - * string is actually UTF-8 (if we encountered a return - * statement before this, the string was already in the heap, - * and hence already checked) */ - if (v[0] != '\200' || v[1] != '\0') { - /* not str_nil, must be UTF-8 */ - size_t i; - - for (i = 0; v[i] != '\0'; i++) { - /* check that v[i] is the start of a validly - * coded UTF-8 sequence: this involves - * checking that the first byte is a valid - * start byte and is followed by the correct - * number of follow-up bytes, but also that - * the sequence cannot be shorter */ - if ((v[i] & 0x80) == 0) { - /* 0aaaaaaa */ - continue; - } else if ((v[i] & 0xE0) == 0xC0) { - /* 110bbbba 10aaaaaa - * one of the b's must be set*/ - assert(v[i] & 0x4D); - i++; - assert((v[i] & 0xC0) == 0x80); - } else if ((v[i] & 0xF0) == 0xE0) { - /* 1110cccc 10cbbbba 10aaaaaa - * one of the c's must be set*/ - assert(v[i] & 0x0F || v[i + 1] & 0x20); - i++; - assert((v[i] & 0xC0) == 0x80); - i++; - assert((v[i] & 0xC0) == 0x80); - } else if ((v[i] & 0xF8) == 0xF0) { - /* 11110ddd 10ddcccc 10cbbbba 10aaaaaa - * one of the d's must be set */ - assert(v[i] & 0x07 || v[i + 1] & 0x30); - i++; - assert((v[i] & 0xC0) == 0x80); - i++; - assert((v[i] & 0xC0) == 0x80); - i++; - assert((v[i] & 0xC0) == 0x80); - } else { - /* this will fail */ - assert((v[i] & 0x80) == 0); - } - } - } -#endif memcpy(h->base + pos, v, len); if (h->hashash) { ((BUN *) (h->base + pos))[-1] = strhash; _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list