Changeset: 450f65bcb3e5 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/450f65bcb3e5
Modified Files:
clients/odbc/driver/CMakeLists.txt
clients/odbc/driver/ODBCUtil.c
common/stream/CMakeLists.txt
common/stream/stdio_stream.c
common/stream/winio.c
common/utils/CMakeLists.txt
common/utils/mutils.c
Branch: default
Log Message:
Use utf-8 decode function in more places.
diffs (truncated from 430 to 300 lines):
diff --git a/clients/odbc/driver/CMakeLists.txt b/clients/odbc/driver/CMakeLists.txt
--- a/clients/odbc/driver/CMakeLists.txt
+++ b/clients/odbc/driver/CMakeLists.txt
@@ -133,6 +133,7 @@ target_link_libraries(MonetODBC
monetdb_config_header
mutils
mapi
+ mutf8
${ODBCINST_LIBRARIES})
install(TARGETS
diff --git a/clients/odbc/driver/ODBCUtil.c b/clients/odbc/driver/ODBCUtil.c
--- a/clients/odbc/driver/ODBCUtil.c
+++ b/clients/odbc/driver/ODBCUtil.c
@@ -35,6 +35,7 @@
#include "ODBCUtil.h"
#include "ODBCDbc.h"
#include <float.h>
+#include "mutf8.h"
#ifdef WIN32
@@ -199,7 +200,6 @@ ODBCutf82wchar(const SQLCHAR *src,
{
SQLLEN i = 0;
SQLINTEGER j = 0;
- uint32_t c;
if (buf == NULL)
buflen = 0;
@@ -220,51 +220,25 @@ ODBCutf82wchar(const SQLCHAR *src,
else if (length < 0)
return "Invalid length parameter";
+ uint32_t state = 0, codepoint = 0;
while (j < length && i + 1 < buflen && src[j]) {
- if ((src[j+0] & 0x80) == 0) {
- buf[i++] = src[j+0];
- j += 1;
- } else if (j + 1 < length
- && (src[j+0] & 0xE0) == 0xC0
- && (src[j+1] & 0xC0) == 0x80
- && (src[j+0] & 0x1E) != 0) {
- buf[i++] = (src[j+0] & 0x1F) << 6
- | (src[j+1] & 0x3F);
- j += 2;
- } else if (j + 2 < length
- && (src[j+0] & 0xF0) == 0xE0
- && (src[j+1] & 0xC0) == 0x80
- && (src[j+2] & 0xC0) == 0x80
- && ((src[j+0] & 0x0F) != 0
- || (src[j+1] & 0x20) != 0)) {
- buf[i++] = (src[j+0] & 0x0F) << 12
- | (src[j+1] & 0x3F) << 6
- | (src[j+2] & 0x3F);
- j += 3;
- } else if (j + 3 < length
- && (src[j+0] & 0xF8) == 0xF0
- && (src[j+1] & 0xC0) == 0x80
- && (src[j+2] & 0xC0) == 0x80
- && (src[j+3] & 0xC0) == 0x80
- && ((src[j+0] & 0x07) != 0
- || (src[j+1] & 0x30) != 0)) {
- c = (src[j+0] & 0x07) << 18
- | (src[j+1] & 0x3F) << 12
- | (src[j+2] & 0x3F) << 6
- | (src[j+3] & 0x3F);
- if (c > 0x10FFFF || (c & 0x1FF800) == 0x00D800)
- return "Illegal code point";
+ switch (decode(&state, &codepoint, (uint8_t) src[j++])) {
+ case UTF8_ACCEPT:
#if SIZEOF_SQLWCHAR == 2
- if (i + 2 >= buflen)
- break;
- buf[i++] = 0xD7C0 + (c >> 10);
- buf[i++] = 0xDC00 + (c & 0x03FF);
+ if (codepoint <= 0xFFFF) {
+ buf[i++] = (SQLWCHAR) codepoint;
+ } else {
+ buf[i++] = (SQLWCHAR) (0xD7C0 + (codepoint >> 10));
+ buf[i++] = (SQLWCHAR) (0xDC00 + (codepoint & 0x3FF));
+ }
#else
- buf[i++] = c;
+ buf[i++] = (SQLWCHAR) codepoint;
#endif
- j += 4;
- } else {
+ break;
+ case UTF8_REJECT:
return "Illegal code point";
+ default:
+ break;
}
}
if (buflen > 0)
@@ -272,40 +246,22 @@ ODBCutf82wchar(const SQLCHAR *src,
if (consumed)
*consumed = (size_t) j;
while (j < length && src[j]) {
- i++;
- if ((src[j+0] & 0x80) == 0) {
- j += 1;
- } else if (j + 1 < length
- && (src[j+0] & 0xE0) == 0xC0
- && (src[j+1] & 0xC0) == 0x80
- && (src[j+0] & 0x1E) != 0) {
- j += 2;
- } else if (j + 2 < length
- && (src[j+0] & 0xF0) == 0xE0
- && (src[j+1] & 0xC0) == 0x80
- && (src[j+2] & 0xC0) == 0x80
- && ((src[j+0] & 0x0F) != 0
- || (src[j+1] & 0x20) != 0)) {
- j += 3;
- } else if (j + 3 < length
- && (src[j+0] & 0xF8) == 0xF0
- && (src[j+1] & 0xC0) == 0x80
- && (src[j+2] & 0xC0) == 0x80
- && (src[j+3] & 0xC0) == 0x80
- && ((src[j+0] & 0x07) != 0
- || (src[j+1] & 0x30) != 0)) {
- c = (src[j+0] & 0x07) << 18
- | (src[j+1] & 0x3F) << 12
- | (src[j+2] & 0x3F) << 6
- | (src[j+3] & 0x3F);
- if (c > 0x10FFFF || (c & 0x1FF800) == 0x00D800)
- return "Illegal code point";
+ switch (decode(&state, &codepoint, (uint8_t) src[j++])) {
+ case UTF8_ACCEPT:
#if SIZEOF_SQLWCHAR == 2
+ if (codepoint <= 0xFFFF) {
+ i++;
+ } else {
+ i += 2;
+ }
+#else
i++;
#endif
- j += 4;
- } else {
+ break;
+ case UTF8_REJECT:
return "Illegal code point";
+ default:
+ break;
}
}
if (buflenout)
diff --git a/common/stream/CMakeLists.txt b/common/stream/CMakeLists.txt
--- a/common/stream/CMakeLists.txt
+++ b/common/stream/CMakeLists.txt
@@ -68,6 +68,7 @@ target_link_libraries(stream
$<$<BOOL:${LZ4_FOUND}>:LZ4::LZ4>
$<$<BOOL:${OPENSSL_FOUND}>:OpenSSL::SSL>
matomic
+ mutf8
monetdb_config_header
$<$<PLATFORM_ID:Windows>:ws2_32>
Threads::Threads)
diff --git a/common/stream/stdio_stream.c b/common/stream/stdio_stream.c
--- a/common/stream/stdio_stream.c
+++ b/common/stream/stdio_stream.c
@@ -15,6 +15,7 @@
#include "monetdb_config.h"
#include "stream.h"
#include "stream_internal.h"
+#include "mutf8.h"
/* ------------------------------------------------------------------ */
@@ -177,43 +178,22 @@ utf8towchar(const char *src)
{
wchar_t *dest;
size_t i = 0;
- size_t j = 0;
- uint32_t c;
+ uint32_t state = 0, codepoint = 0;
/* count how many wchar_t's we need, while also checking for
* correctness of the input */
- while (src[j]) {
- i++;
- if ((src[j+0] & 0x80) == 0) {
- j += 1;
- } else if ((src[j+0] & 0xE0) == 0xC0
- && (src[j+1] & 0xC0) == 0x80
- && (src[j+0] & 0x1E) != 0) {
- j += 2;
- } else if ((src[j+0] & 0xF0) == 0xE0
- && (src[j+1] & 0xC0) == 0x80
- && (src[j+2] & 0xC0) == 0x80
- && ((src[j+0] & 0x0F) != 0
- || (src[j+1] & 0x20) != 0)) {
- j += 3;
- } else if ((src[j+0] & 0xF8) == 0xF0
- && (src[j+1] & 0xC0) == 0x80
- && (src[j+2] & 0xC0) == 0x80
- && (src[j+3] & 0xC0) == 0x80) {
- c = (src[j+0] & 0x07) << 18
- | (src[j+1] & 0x3F) << 12
- | (src[j+2] & 0x3F) << 6
- | (src[j+3] & 0x3F);
- if (c < 0x10000
- || c > 0x10FFFF
- || (c & 0x1FF800) == 0x00D800)
- return NULL;
+ for (size_t j = 0; src[j]; j++) {
+ switch (decode(&state, &codepoint, (uint8_t) src[j])) {
+ case UTF8_ACCEPT:
+ i++;
#if SIZEOF_WCHAR_T == 2
- i++;
+ i += (codepoint > 0xFFFF);
#endif
- j += 4;
- } else {
+ break;
+ case UTF8_REJECT:
return NULL;
+ default:
+ break;
}
}
dest = malloc((i + 1) * sizeof(wchar_t));
@@ -221,32 +201,27 @@ utf8towchar(const char *src)
return NULL;
/* go through the source string again, this time we can skip
* the correctness tests */
- i = j = 0;
- while (src[j]) {
- if ((src[j+0] & 0x80) == 0) {
- dest[i++] = src[j+0];
- j += 1;
- } else if ((src[j+0] & 0xE0) == 0xC0) {
- dest[i++] = (src[j+0] & 0x1F) << 6
- | (src[j+1] & 0x3F);
- j += 2;
- } else if ((src[j+0] & 0xF0) == 0xE0) {
- dest[i++] = (src[j+0] & 0x0F) << 12
- | (src[j+1] & 0x3F) << 6
- | (src[j+2] & 0x3F);
- j += 3;
- } else if ((src[j+0] & 0xF8) == 0xF0) {
- c = (src[j+0] & 0x07) << 18
- | (src[j+1] & 0x3F) << 12
- | (src[j+2] & 0x3F) << 6
- | (src[j+3] & 0x3F);
+ i = 0;
+ for (size_t j = 0; src[j]; j++) {
+ switch (decode(&state, &codepoint, (uint8_t) src[j])) {
+ case UTF8_ACCEPT:
#if SIZEOF_WCHAR_T == 2
- dest[i++] = 0xD7C0 + (c >> 10);
- dest[i++] = 0xDC00 + (c & 0x03FF);
+ if (codepoint <= 0xFFFF) {
+ dest[i++] = (wchar_t) codepoint;
+ } else {
+ dest[i++] = (wchar_t) (0xD7C0 + (codepoint >> 10));
+ dest[i++] = (wchar_t) (0xDC00 + (codepoint & 0x3FF));
+ }
#else
- dest[i++] = c;
+ dest[i++] = (wchar_t) codepoint;
#endif
- j += 4;
+ break;
+ case UTF8_REJECT:
+ /* cannot happen because of first loop */
+ free(dest);
+ return NULL;
+ default:
+ break;
}
}
dest[i] = 0;
diff --git a/common/stream/winio.c b/common/stream/winio.c
--- a/common/stream/winio.c
+++ b/common/stream/winio.c
@@ -214,7 +214,7 @@ console_write(stream *restrict s, const
mnstr_set_error(s, MNSTR_WRITE_ERROR, "encoding error %d", __LINE__);
return -1;
} else if (c->ch > 0xFFFF) {
- c->wbuf[c->len++] = 0xD800 | ((c->ch >> 10) - (1 << 6));
+ c->wbuf[c->len++] = 0xD7C0 + (c->ch >> 10);
c->wbuf[c->len++] = 0xDC00 | (c->ch & 0x03FF);
} else {
c->wbuf[c->len++] = c->ch;
@@ -268,7 +268,7 @@ console_write(stream *restrict s, const
mnstr_set_error(s, MNSTR_WRITE_ERROR, "encoding error %d", __LINE__);
return -1;
} else if (ch > 0xFFFF) {
- c->wbuf[c->len++] = 0xD800 | ((ch >> 10) - (1 << 6));
+ c->wbuf[c->len++] = 0xD7C0 + (ch >> 10);
c->wbuf[c->len++] = 0xDC00 | (ch & 0x03FF);
} else {
c->wbuf[c->len++] = ch;
diff --git a/common/utils/CMakeLists.txt b/common/utils/CMakeLists.txt
--- a/common/utils/CMakeLists.txt
+++ b/common/utils/CMakeLists.txt
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]