MonetDB: ascii-flag - Some more UTF-8 stuff.

Sjoerd Mullender via checkin-list Fri, 03 May 2024 06:50:10 -0700

Changeset: e764818e0173 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/e764818e0173
Modified Files:
        clients/mapiclient/mclient.c
Branch: ascii-flag
Log Message:


Some more UTF-8 stuff.


diffs (truncated from 313 to 300 lines):

diff --git a/clients/mapiclient/mclient.c b/clients/mapiclient/mclient.c
--- a/clients/mapiclient/mclient.c
+++ b/clients/mapiclient/mclient.c
@@ -103,9 +103,6 @@ typedef int64_t timertype;
 
 static timertype t0, t1;       /* used for timing */
 
-#define UTF8BOM                "\xEF\xBB\xBF"  /* UTF-8 encoding of Unicode BOM */
-#define UTF8BOMLENGTH  3       /* length of above */
-
 /* Pagination and simple ASCII-based rendering is provided for SQL
  * sessions. The result set size is limited by the cache size of the
  * Mapi Library. It is sufficiently large to accommodate most result
@@ -173,6 +170,53 @@ static char *nullstring = default_nullst
 #include <ctype.h>
 #include "mhelp.h"
 
+/* The code starting here, and up to and including the function decode
+ * below are copyright Bjoern Hoehrmann per the below notice.
+ *
+ * The function decode provides a fast way to check for valid UTF-8
+ * sequences and returns the value of the codepoint as well.  */
+
+// Copyright (c) 2008-2009 Bjoern Hoehrmann <bjo...@hoehrmann.de>
+// See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
+
+#define UTF8_ACCEPT 0
+#define UTF8_REJECT 12
+
+static const uint8_t utf8d[] = {
+       // The first part of the table maps bytes to character classes that
+       // to reduce the size of the transition table and create bitmasks.
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+        7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+        8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+       10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
+
+       // The second part is a transition table that maps a combination
+       // of a state of the automaton and a character class to a state.
+        0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
+       12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
+       12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
+       12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
+       12,36,12,12,12,12,12,12,12,12,12,12,
+};
+
+static inline uint32_t
+decode(uint32_t *state, uint32_t *codep, uint32_t byte)
+{
+       uint32_t type = utf8d[byte];
+
+       *codep = (*state != UTF8_ACCEPT) ?
+               (byte & 0x3fu) | (*codep << 6) :
+               (0xff >> type) & (byte);
+
+       *state = utf8d[256 + *state + type];
+       return *state;
+}
+/* end code copyright by Bjoern Hoehrmann */
+
 static timertype
 gettime(void)
 {
@@ -590,69 +634,32 @@ static size_t
 utf8strlenmax(char *s, char *e, size_t max, char **t)
 {
        size_t len = 0, len0 = 0;
-       int c;
-       int n;
        char *t0 = s;
 
        assert(max == 0 || t != NULL);
        if (s == NULL)
                return 0;
-       c = 0;
-       n = 0;
-       while (*s != 0 && (e == NULL || s < e)) {
-               if (*s == '\n') {
-                       assert(n == 0);
-                       if (max) {
-                               *t = s;
-                               return len;
+
+       uint32_t state = 0, codepoint = 0;
+       while (*s && (e == NULL || s < e)) {
+               switch (decode(&state, &codepoint, (uint8_t) *s++)) {
+               case UTF8_ACCEPT:
+                       if (codepoint == '\n') {
+                               if (max) {
+                                       *t = s;
+                                       return len;
+                               }
+                               len++;
+                       } else if (codepoint == '\t') {
+                               len++;                  /* rendered as single space */
+                       } else if (codepoint <= 0x1F || codepoint == 0177) {
+                               len += 4;               /* control, rendered as "\\%03o" */
+                       } else if (0x80 <= codepoint && codepoint <= 0x9F) {
+                               len += 6;               /* control, rendered as "u\\%04x" */
+                       } else {
+                               /* charwidth() returning -1 is caught by the above */
+                               len += charwidth(codepoint);
                        }
-                       len++;
-                       n = 0;
-               } else if (*s == '\t') {
-                       assert(n == 0);
-                       len++;  /* rendered as single space */
-                       n = 0;
-               } else if ((unsigned char) *s <= 0x1F || *s == '\177') {
-                       assert(n == 0);
-                       len += 4;
-                       n = 0;
-               } else if ((*s & 0x80) == 0) {
-                       assert(n == 0);
-                       len++;
-                       n = 0;
-               } else if ((*s & 0xC0) == 0x80) {
-                       c = (c << 6) | (*s & 0x3F);
-                       if (--n == 0) {
-                               /* last byte of a multi-byte character */
-                               n = charwidth(c);
-                               if (n >= 0)
-                                       len += n;
-                               else
-                               len++;          /* assume width 1 if unprintable */
-                               n = 0;
-                       }
-               } else if ((*s & 0xE0) == 0xC0) {
-                       assert(n == 0);
-                       n = 1;
-                       c = *s & 0x1F;
-               } else if ((*s & 0xF0) == 0xE0) {
-                       assert(n == 0);
-                       n = 2;
-                       c = *s & 0x0F;
-               } else if ((*s & 0xF8) == 0xF0) {
-                       assert(n == 0);
-                       n = 3;
-                       c = *s & 0x07;
-               } else if ((*s & 0xFC) == 0xF8) {
-                       assert(n == 0);
-                       n = 4;
-                       c = *s & 0x03;
-               } else {
-                       assert(0);
-                       n = 0;
-               }
-               s++;
-               if (n == 0) {
                        if (max != 0) {
                                if (len > max) {
                                        *t = t0;
@@ -665,6 +672,13 @@ utf8strlenmax(char *s, char *e, size_t m
                        }
                        t0 = s;
                        len0 = len;
+                       break;
+               case UTF8_REJECT:
+                       /* shouldn't happen */
+                       assert(0);
+                       break;
+               default:
+                       break;
                }
        }
        if (max != 0)
@@ -787,10 +801,19 @@ SQLrow(int *len, int *numeric, char **re
                                                                mnstr_printf(toConsole, "\\%03o", (unsigned char) *p);
                                                        else if (*p == '\302' &&
                                                                 (p[1] & 0xE0) == 0x80) {
+                                                               /* U+0080 - U+009F control character */
                                                                mnstr_printf(toConsole, "\\u%04x", (unsigned) ((p[1] & 0x3F) | 0x80));
                                                                p++;
-                                                       } else
+                                                       } else if (((unsigned char) *p & 0x80) == 0) {
                                                                mnstr_write(toConsole, p, 1, 1);
+                                                       } else {
+                                                               /* do a complete UTF-8 character
+                                                                * sequence in one go */
+                                                               char *q = p;
+                                                               while (((unsigned char) *++p & 0xC0) == 0x80)
+                                                                       ;
+                                                               mnstr_write(toConsole, q, p-- - q, 1);
+                                                       }
                                                }
                                                mnstr_printf(toConsole, "...%*s",
                                                             len[i] - 2 - (int) utf8strlen(rest[i], t),
@@ -805,10 +828,19 @@ SQLrow(int *len, int *numeric, char **re
                                                                mnstr_printf(toConsole, "\\%03o", (unsigned char) *p);
                                                        else if (*p == '\302' &&
                                                                 (p[1] & 0xE0) == 0x80) {
+                                                               /* U+0080 - U+009F control character */
                                                                mnstr_printf(toConsole, "\\u%04x", (unsigned) ((p[1] & 0x3F) | 0x80));
                                                                p++;
-                                                       } else
+                                                       } else if (((unsigned char) *p & 0x80) == 0) {
                                                                mnstr_write(toConsole, p, 1, 1);
+                                                       } else {
+                                                               /* do a complete UTF-8 character
+                                                                * sequence in one go */
+                                                               char *q = p;
+                                                               while (((unsigned char) *++p & 0xC0) == 0x80)
+                                                                       ;
+                                                               mnstr_write(toConsole, q, p-- - q, 1);
+                                                       }
                                                }
                                                mnstr_write(toConsole, " ", 1, 1);
                                                if (!numeric[i])
@@ -839,24 +871,32 @@ SQLrow(int *len, int *numeric, char **re
                                                             rest[i]);
                                        }
                                        if (!numeric[i]) {
-                                               char *p;
                                                /* replace tabs with a
                                                 * single space to
                                                 * avoid screwup the
                                                 * width
                                                 * calculations */
                                                mnstr_write(toConsole, " ", 1, 1);
-                                               for (p = rest[i]; *p; p++) {
+                                               for (char *p = rest[i]; *p; p++) {
                                                        if (*p == '\t')
                                                                mnstr_write(toConsole, " ", 1, 1);
                                                        else if ((unsigned char) *p <= 0x1F || *p == '\177')
                                                                mnstr_printf(toConsole, "\\%03o", (unsigned char) *p);
                                                        else if (*p == '\302' &&
                                                                 (p[1] & 0xE0) == 0x80) {
+                                                               /* U+0080 - U+009F control character */
                                                                mnstr_printf(toConsole, "\\u%04x", (unsigned) ((p[1] & 0x3F) | 0x80));
                                                                p++;
-                                                       } else
+                                                       } else if (((unsigned char) *p & 0x80) == 0) {
                                                                mnstr_write(toConsole, p, 1, 1);
+                                                       } else {
+                                                               /* do a complete UTF-8 character
+                                                                * sequence in one go */
+                                                               char *q = p;
+                                                               while (((unsigned char) *++p & 0xC0) == 0x80)
+                                                                       ;
+                                                               mnstr_write(toConsole, q, p-- - q, 1);
+                                                       }
                                                }
                                                mnstr_printf(toConsole, " %*s",
                                                             (int) (len[i] - ulen),
@@ -887,32 +927,37 @@ XMLprdata(const char *val)
 {
        if (val == NULL)
                return;
-       while (*val) {
-               if (*val == '&')
-                       mnstr_printf(toConsole, "&amp;");
-               else if (*val == '<')
-                       mnstr_printf(toConsole, "&lt;");
-               else if (*val == '>')
-                       mnstr_printf(toConsole, "&gt;");
-               else if (*val == '"')
-                       mnstr_printf(toConsole, "&quot;");
-               else if (*val == '\'')
-                       mnstr_printf(toConsole, "&apos;");
-               else if ((*val & 0xFF) < 0x20)  /* control character */
-                       mnstr_printf(toConsole, "&#%d;", *val & 0xFF);
-               else if ((*val & 0x80) != 0 /* && encoding != NULL */ ) {
-                       int n;
-                       unsigned int m;
-                       unsigned int c = *val & 0x7F;
-
-                       for (n = 0, m = 0x40; c & m; n++, m >>= 1)
-                               c &= ~m;
-                       while (--n >= 0)
-                               c = (c << 6) | (*++val & 0x3F);
-                       mnstr_printf(toConsole, "&#x%x;", c);
-               } else
-                       mnstr_write(toConsole, val, 1, 1);
-               val++;
+       for (uint32_t state = 0, codepoint = 0; *val; val++) {
+               if (decode(&state, &codepoint, (uint8_t) *val) == UTF8_ACCEPT) {
+                       switch (codepoint) {
+                       case '&':
+                               mnstr_printf(toConsole, "&amp;");
+                               break;
+                       case '<':
+                               mnstr_printf(toConsole, "&lt;");
+                               break;
+                       case '>':
+                               mnstr_printf(toConsole, "&gt;");
+                               break;
+                       case '"':
+                               mnstr_printf(toConsole, "&quot;");
+                               break;
+                       case '\'':
+                               mnstr_printf(toConsole, "&apos;");
+                               break;
+                       default:
+                               if ((codepoint & ~0x80) <= 0x1F || codepoint == 0177) {
+                                       /* control character */
_______________________________________________
checkin-list mailing list -- checkin-list@monetdb.org
To unsubscribe send an email to checkin-list-le...@monetdb.org

MonetDB: ascii-flag - Some more UTF-8 stuff.

Reply via email to