Changeset: f6acdb431fb1 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/f6acdb431fb1
Modified Files:
        gdk/gdk.h
        gdk/gdk_string.c
Branch: ascii-flag
Log Message:

Do some special processing for ASCII prefixes.  Also clear properties.


diffs (truncated from 465 to 300 lines):

diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -2341,8 +2341,8 @@ gdk_export gdk_return BATfirstn(BAT **to
 
 #include "gdk_calc.h"
 
-gdk_export gdk_return GDKtoupper(char **buf, size_t *buflen, const char *s);
-gdk_export gdk_return GDKtolower(char **buf, size_t *buflen, const char *s);
+gdk_export gdk_return GDKtoupper(char **restrict buf, size_t *restrict buflen, const char *restrict s);
+gdk_export gdk_return GDKtolower(char **restrict buf, size_t *restrict buflen, const char *restrict s);
 gdk_export int GDKstrncasecmp(const char *str1, const char *str2, size_t l1, size_t l2);
 gdk_export int GDKstrcasecmp(const char *s1, const char *s2);
 gdk_export char *GDKstrcasestr(const char *haystack, const char *needle);
diff --git a/gdk/gdk_string.c b/gdk/gdk_string.c
--- a/gdk/gdk_string.c
+++ b/gdk/gdk_string.c
@@ -1460,17 +1460,83 @@ GDKanalytical_str_group_concat(BAT *r, B
  * For the first byte of a UTF-8 encoding, use the value as index into
  * the table.  If the value is zero, there are no conversions for any
  * UTF-8 string starting with this byte (this includes both multi-byte
- * sequences and single-byte sequences).  For a single-byte sequence, if
- * the value is not zero, it is the converted codepoint.  For a
- * multi-byte sequence, if the value is not zero, it is an offset into
- * the same table.  The next byte is added to the offset and again used
- * as index into the table (including the top two bits which are always
- * 1 and 0 respectively).  The process then repeats: if zero, no
- * conversions for any sequence starting with the bytes looked up so
- * far, if non-zero, if this is the last byte of a sequence, it is the
- * converted codepoint, and otherwise a (new) offset into the same
- * table. */
-static int lowercase[4288] = {
+ * sequences and single-byte sequences, though note that for single-byte
+ * sequences (ASCII-compatible) the table is filled in completely at no
+ * extra cost).  For a single-byte sequence, if the value is not zero,
+ * it is the converted codepoint.  For a multi-byte sequence, if the
+ * value is not zero, it is an offset into the same table.  The next
+ * byte is added to the offset and again used as index into the table
+ * (including the top two bits which are always 1 and 0 respectively).
+ * The process then repeats: if zero, no conversions for any sequence
+ * starting with the bytes looked up so far, if non-zero, if this is the
+ * last byte of a sequence, it is the converted codepoint, and otherwise
+ * a (new) offset into the same table. */
+static const int lowercase[4288] = {
+       [0x0] = 0x0,            /* U+0000: <control> */
+       [0x1] = 0x1,            /* U+0001: <control> */
+       [0x2] = 0x2,            /* U+0002: <control> */
+       [0x3] = 0x3,            /* U+0003: <control> */
+       [0x4] = 0x4,            /* U+0004: <control> */
+       [0x5] = 0x5,            /* U+0005: <control> */
+       [0x6] = 0x6,            /* U+0006: <control> */
+       [0x7] = 0x7,            /* U+0007: <control> */
+       [0x8] = 0x8,            /* U+0008: <control> */
+       [0x9] = 0x9,            /* U+0009: <control> */
+       [0xA] = 0xA,            /* U+000A: <control> */
+       [0xB] = 0xB,            /* U+000B: <control> */
+       [0xC] = 0xC,            /* U+000C: <control> */
+       [0xD] = 0xD,            /* U+000D: <control> */
+       [0xE] = 0xE,            /* U+000E: <control> */
+       [0xF] = 0xF,            /* U+000F: <control> */
+       [0x10] = 0x10,          /* U+0010: <control> */
+       [0x11] = 0x11,          /* U+0011: <control> */
+       [0x12] = 0x12,          /* U+0012: <control> */
+       [0x13] = 0x13,          /* U+0013: <control> */
+       [0x14] = 0x14,          /* U+0014: <control> */
+       [0x15] = 0x15,          /* U+0015: <control> */
+       [0x16] = 0x16,          /* U+0016: <control> */
+       [0x17] = 0x17,          /* U+0017: <control> */
+       [0x18] = 0x18,          /* U+0018: <control> */
+       [0x19] = 0x19,          /* U+0019: <control> */
+       [0x1A] = 0x1A,          /* U+001A: <control> */
+       [0x1B] = 0x1B,          /* U+001B: <control> */
+       [0x1C] = 0x1C,          /* U+001C: <control> */
+       [0x1D] = 0x1D,          /* U+001D: <control> */
+       [0x1E] = 0x1E,          /* U+001E: <control> */
+       [0x1F] = 0x1F,          /* U+001F: <control> */
+       [0x20] = 0x20,          /* U+0020: SPACE */
+       [0x21] = 0x21,          /* U+0021: EXCLAMATION MARK */
+       [0x22] = 0x22,          /* U+0022: QUOTATION MARK */
+       [0x23] = 0x23,          /* U+0023: NUMBER SIGN */
+       [0x24] = 0x24,          /* U+0024: DOLLAR SIGN */
+       [0x25] = 0x25,          /* U+0025: PERCENT SIGN */
+       [0x26] = 0x26,          /* U+0026: AMPERSAND */
+       [0x27] = 0x27,          /* U+0027: APOSTROPHE */
+       [0x28] = 0x28,          /* U+0028: LEFT PARENTHESIS */
+       [0x29] = 0x29,          /* U+0029: RIGHT PARENTHESIS */
+       [0x2A] = 0x2A,          /* U+002A: ASTERISK */
+       [0x2B] = 0x2B,          /* U+002B: PLUS SIGN */
+       [0x2C] = 0x2C,          /* U+002C: COMMA */
+       [0x2D] = 0x2D,          /* U+002D: HYPHEN-MINUS */
+       [0x2E] = 0x2E,          /* U+002E: FULL STOP */
+       [0x2F] = 0x2F,          /* U+002F: SOLIDUS */
+       [0x30] = 0x30,          /* U+0030: DIGIT ZERO */
+       [0x31] = 0x31,          /* U+0031: DIGIT ONE */
+       [0x32] = 0x32,          /* U+0032: DIGIT TWO */
+       [0x33] = 0x33,          /* U+0033: DIGIT THREE */
+       [0x34] = 0x34,          /* U+0034: DIGIT FOUR */
+       [0x35] = 0x35,          /* U+0035: DIGIT FIVE */
+       [0x36] = 0x36,          /* U+0036: DIGIT SIX */
+       [0x37] = 0x37,          /* U+0037: DIGIT SEVEN */
+       [0x38] = 0x38,          /* U+0038: DIGIT EIGHT */
+       [0x39] = 0x39,          /* U+0039: DIGIT NINE */
+       [0x3A] = 0x3A,          /* U+003A: COLON */
+       [0x3B] = 0x3B,          /* U+003B: SEMICOLON */
+       [0x3C] = 0x3C,          /* U+003C: LESS-THAN SIGN */
+       [0x3D] = 0x3D,          /* U+003D: EQUALS SIGN */
+       [0x3E] = 0x3E,          /* U+003E: GREATER-THAN SIGN */
+       [0x3F] = 0x3F,          /* U+003F: QUESTION MARK */
+       [0x40] = 0x40,          /* U+0040: COMMERCIAL AT */
        [0x41] = 0x61,          /* U+0041: LATIN CAPITAL LETTER A */
        [0x42] = 0x62,          /* U+0042: LATIN CAPITAL LETTER B */
        [0x43] = 0x63,          /* U+0043: LATIN CAPITAL LETTER C */
@@ -1497,6 +1563,43 @@ static int lowercase[4288] = {
        [0x58] = 0x78,          /* U+0058: LATIN CAPITAL LETTER X */
        [0x59] = 0x79,          /* U+0059: LATIN CAPITAL LETTER Y */
        [0x5A] = 0x7A,          /* U+005A: LATIN CAPITAL LETTER Z */
+       [0x5B] = 0x5B,          /* U+005B: LEFT SQUARE BRACKET */
+       [0x5C] = 0x5C,          /* U+005C: REVERSE SOLIDUS */
+       [0x5D] = 0x5D,          /* U+005D: RIGHT SQUARE BRACKET */
+       [0x5E] = 0x5E,          /* U+005E: CIRCUMFLEX ACCENT */
+       [0x5F] = 0x5F,          /* U+005F: LOW LINE */
+       [0x60] = 0x60,          /* U+0060: GRAVE ACCENT */
+       [0x61] = 0x61,          /* U+0061: LATIN SMALL LETTER A */
+       [0x62] = 0x62,          /* U+0062: LATIN SMALL LETTER B */
+       [0x63] = 0x63,          /* U+0063: LATIN SMALL LETTER C */
+       [0x64] = 0x64,          /* U+0064: LATIN SMALL LETTER D */
+       [0x65] = 0x65,          /* U+0065: LATIN SMALL LETTER E */
+       [0x66] = 0x66,          /* U+0066: LATIN SMALL LETTER F */
+       [0x67] = 0x67,          /* U+0067: LATIN SMALL LETTER G */
+       [0x68] = 0x68,          /* U+0068: LATIN SMALL LETTER H */
+       [0x69] = 0x69,          /* U+0069: LATIN SMALL LETTER I */
+       [0x6A] = 0x6A,          /* U+006A: LATIN SMALL LETTER J */
+       [0x6B] = 0x6B,          /* U+006B: LATIN SMALL LETTER K */
+       [0x6C] = 0x6C,          /* U+006C: LATIN SMALL LETTER L */
+       [0x6D] = 0x6D,          /* U+006D: LATIN SMALL LETTER M */
+       [0x6E] = 0x6E,          /* U+006E: LATIN SMALL LETTER N */
+       [0x6F] = 0x6F,          /* U+006F: LATIN SMALL LETTER O */
+       [0x70] = 0x70,          /* U+0070: LATIN SMALL LETTER P */
+       [0x71] = 0x71,          /* U+0071: LATIN SMALL LETTER Q */
+       [0x72] = 0x72,          /* U+0072: LATIN SMALL LETTER R */
+       [0x73] = 0x73,          /* U+0073: LATIN SMALL LETTER S */
+       [0x74] = 0x74,          /* U+0074: LATIN SMALL LETTER T */
+       [0x75] = 0x75,          /* U+0075: LATIN SMALL LETTER U */
+       [0x76] = 0x76,          /* U+0076: LATIN SMALL LETTER V */
+       [0x77] = 0x77,          /* U+0077: LATIN SMALL LETTER W */
+       [0x78] = 0x78,          /* U+0078: LATIN SMALL LETTER X */
+       [0x79] = 0x79,          /* U+0079: LATIN SMALL LETTER Y */
+       [0x7A] = 0x7A,          /* U+007A: LATIN SMALL LETTER Z */
+       [0x7B] = 0x7B,          /* U+007B: LEFT CURLY BRACKET */
+       [0x7C] = 0x7C,          /* U+007C: VERTICAL LINE */
+       [0x7D] = 0x7D,          /* U+007D: RIGHT CURLY BRACKET */
+       [0x7E] = 0x7E,          /* U+007E: TILDE */
+       [0x7F] = 0x7F,          /* U+007F: <control> */
        [0xC3] = 256 - 0x80,    /* 303 ... */
        [256+0x0] = 0xE0,       /* U+00C0: LATIN CAPITAL LETTER A WITH GRAVE */
        [256+0x1] = 0xE1,       /* U+00C1: LATIN CAPITAL LETTER A WITH ACUTE */
@@ -2969,7 +3072,104 @@ static int lowercase[4288] = {
        [4224+0x21] = 0x1E943,  /* U+1E921: ADLAM CAPITAL LETTER SHA */
 };
 
-static int uppercase[4544] = {
+static const int uppercase[4544] = {
+       [0x0] = 0x0,            /* U+0000: <control> */
+       [0x1] = 0x1,            /* U+0001: <control> */
+       [0x2] = 0x2,            /* U+0002: <control> */
+       [0x3] = 0x3,            /* U+0003: <control> */
+       [0x4] = 0x4,            /* U+0004: <control> */
+       [0x5] = 0x5,            /* U+0005: <control> */
+       [0x6] = 0x6,            /* U+0006: <control> */
+       [0x7] = 0x7,            /* U+0007: <control> */
+       [0x8] = 0x8,            /* U+0008: <control> */
+       [0x9] = 0x9,            /* U+0009: <control> */
+       [0xA] = 0xA,            /* U+000A: <control> */
+       [0xB] = 0xB,            /* U+000B: <control> */
+       [0xC] = 0xC,            /* U+000C: <control> */
+       [0xD] = 0xD,            /* U+000D: <control> */
+       [0xE] = 0xE,            /* U+000E: <control> */
+       [0xF] = 0xF,            /* U+000F: <control> */
+       [0x10] = 0x10,          /* U+0010: <control> */
+       [0x11] = 0x11,          /* U+0011: <control> */
+       [0x12] = 0x12,          /* U+0012: <control> */
+       [0x13] = 0x13,          /* U+0013: <control> */
+       [0x14] = 0x14,          /* U+0014: <control> */
+       [0x15] = 0x15,          /* U+0015: <control> */
+       [0x16] = 0x16,          /* U+0016: <control> */
+       [0x17] = 0x17,          /* U+0017: <control> */
+       [0x18] = 0x18,          /* U+0018: <control> */
+       [0x19] = 0x19,          /* U+0019: <control> */
+       [0x1A] = 0x1A,          /* U+001A: <control> */
+       [0x1B] = 0x1B,          /* U+001B: <control> */
+       [0x1C] = 0x1C,          /* U+001C: <control> */
+       [0x1D] = 0x1D,          /* U+001D: <control> */
+       [0x1E] = 0x1E,          /* U+001E: <control> */
+       [0x1F] = 0x1F,          /* U+001F: <control> */
+       [0x20] = 0x20,          /* U+0020: SPACE */
+       [0x21] = 0x21,          /* U+0021: EXCLAMATION MARK */
+       [0x22] = 0x22,          /* U+0022: QUOTATION MARK */
+       [0x23] = 0x23,          /* U+0023: NUMBER SIGN */
+       [0x24] = 0x24,          /* U+0024: DOLLAR SIGN */
+       [0x25] = 0x25,          /* U+0025: PERCENT SIGN */
+       [0x26] = 0x26,          /* U+0026: AMPERSAND */
+       [0x27] = 0x27,          /* U+0027: APOSTROPHE */
+       [0x28] = 0x28,          /* U+0028: LEFT PARENTHESIS */
+       [0x29] = 0x29,          /* U+0029: RIGHT PARENTHESIS */
+       [0x2A] = 0x2A,          /* U+002A: ASTERISK */
+       [0x2B] = 0x2B,          /* U+002B: PLUS SIGN */
+       [0x2C] = 0x2C,          /* U+002C: COMMA */
+       [0x2D] = 0x2D,          /* U+002D: HYPHEN-MINUS */
+       [0x2E] = 0x2E,          /* U+002E: FULL STOP */
+       [0x2F] = 0x2F,          /* U+002F: SOLIDUS */
+       [0x30] = 0x30,          /* U+0030: DIGIT ZERO */
+       [0x31] = 0x31,          /* U+0031: DIGIT ONE */
+       [0x32] = 0x32,          /* U+0032: DIGIT TWO */
+       [0x33] = 0x33,          /* U+0033: DIGIT THREE */
+       [0x34] = 0x34,          /* U+0034: DIGIT FOUR */
+       [0x35] = 0x35,          /* U+0035: DIGIT FIVE */
+       [0x36] = 0x36,          /* U+0036: DIGIT SIX */
+       [0x37] = 0x37,          /* U+0037: DIGIT SEVEN */
+       [0x38] = 0x38,          /* U+0038: DIGIT EIGHT */
+       [0x39] = 0x39,          /* U+0039: DIGIT NINE */
+       [0x3A] = 0x3A,          /* U+003A: COLON */
+       [0x3B] = 0x3B,          /* U+003B: SEMICOLON */
+       [0x3C] = 0x3C,          /* U+003C: LESS-THAN SIGN */
+       [0x3D] = 0x3D,          /* U+003D: EQUALS SIGN */
+       [0x3E] = 0x3E,          /* U+003E: GREATER-THAN SIGN */
+       [0x3F] = 0x3F,          /* U+003F: QUESTION MARK */
+       [0x40] = 0x40,          /* U+0040: COMMERCIAL AT */
+       [0x41] = 0x41,          /* U+0041: LATIN CAPITAL LETTER A */
+       [0x42] = 0x42,          /* U+0042: LATIN CAPITAL LETTER B */
+       [0x43] = 0x43,          /* U+0043: LATIN CAPITAL LETTER C */
+       [0x44] = 0x44,          /* U+0044: LATIN CAPITAL LETTER D */
+       [0x45] = 0x45,          /* U+0045: LATIN CAPITAL LETTER E */
+       [0x46] = 0x46,          /* U+0046: LATIN CAPITAL LETTER F */
+       [0x47] = 0x47,          /* U+0047: LATIN CAPITAL LETTER G */
+       [0x48] = 0x48,          /* U+0048: LATIN CAPITAL LETTER H */
+       [0x49] = 0x49,          /* U+0049: LATIN CAPITAL LETTER I */
+       [0x4A] = 0x4A,          /* U+004A: LATIN CAPITAL LETTER J */
+       [0x4B] = 0x4B,          /* U+004B: LATIN CAPITAL LETTER K */
+       [0x4C] = 0x4C,          /* U+004C: LATIN CAPITAL LETTER L */
+       [0x4D] = 0x4D,          /* U+004D: LATIN CAPITAL LETTER M */
+       [0x4E] = 0x4E,          /* U+004E: LATIN CAPITAL LETTER N */
+       [0x4F] = 0x4F,          /* U+004F: LATIN CAPITAL LETTER O */
+       [0x50] = 0x50,          /* U+0050: LATIN CAPITAL LETTER P */
+       [0x51] = 0x51,          /* U+0051: LATIN CAPITAL LETTER Q */
+       [0x52] = 0x52,          /* U+0052: LATIN CAPITAL LETTER R */
+       [0x53] = 0x53,          /* U+0053: LATIN CAPITAL LETTER S */
+       [0x54] = 0x54,          /* U+0054: LATIN CAPITAL LETTER T */
+       [0x55] = 0x55,          /* U+0055: LATIN CAPITAL LETTER U */
+       [0x56] = 0x56,          /* U+0056: LATIN CAPITAL LETTER V */
+       [0x57] = 0x57,          /* U+0057: LATIN CAPITAL LETTER W */
+       [0x58] = 0x58,          /* U+0058: LATIN CAPITAL LETTER X */
+       [0x59] = 0x59,          /* U+0059: LATIN CAPITAL LETTER Y */
+       [0x5A] = 0x5A,          /* U+005A: LATIN CAPITAL LETTER Z */
+       [0x5B] = 0x5B,          /* U+005B: LEFT SQUARE BRACKET */
+       [0x5C] = 0x5C,          /* U+005C: REVERSE SOLIDUS */
+       [0x5D] = 0x5D,          /* U+005D: RIGHT SQUARE BRACKET */
+       [0x5E] = 0x5E,          /* U+005E: CIRCUMFLEX ACCENT */
+       [0x5F] = 0x5F,          /* U+005F: LOW LINE */
+       [0x60] = 0x60,          /* U+0060: GRAVE ACCENT */
        [0x61] = 0x41,          /* U+0061: LATIN SMALL LETTER A */
        [0x62] = 0x42,          /* U+0062: LATIN SMALL LETTER B */
        [0x63] = 0x43,          /* U+0063: LATIN SMALL LETTER C */
@@ -2996,6 +3196,11 @@ static int uppercase[4544] = {
        [0x78] = 0x58,          /* U+0078: LATIN SMALL LETTER X */
        [0x79] = 0x59,          /* U+0079: LATIN SMALL LETTER Y */
        [0x7A] = 0x5A,          /* U+007A: LATIN SMALL LETTER Z */
+       [0x7B] = 0x7B,          /* U+007B: LEFT CURLY BRACKET */
+       [0x7C] = 0x7C,          /* U+007C: VERTICAL LINE */
+       [0x7D] = 0x7D,          /* U+007D: RIGHT CURLY BRACKET */
+       [0x7E] = 0x7E,          /* U+007E: TILDE */
+       [0x7F] = 0x7F,          /* U+007F: <control> */
        [0xC2] = 256 - 0x80,    /* 302 ... */
        [256+0x35] = 0x39C,     /* U+00B5: MICRO SIGN */
        [0xC3] = 320 - 0x80,    /* 303 ... */
@@ -4495,62 +4700,93 @@ static int uppercase[4544] = {
  * without error), the current buffer is in *buf, and the current size
  * in *buflen. */
 static gdk_return
-convertcase(char **buf, size_t *buflen, const uint8_t *s, const int *convtab)
+convertcase(char **restrict buf, size_t *restrict buflen,
+           const uint8_t *restrict s, const int *restrict convtab)
 {
        uint8_t *dst = (uint8_t *) *buf;
        size_t dstoff = 0;
-       size_t bl = *buflen;
+       size_t bl;
 
-       if (*buf == NULL)
-               bl = 0;
-       while (*s) {
-               /* we are at the start of a Unicode codepoint encoded in
-                * UTF-8 */
-               if (dstoff + 5 > bl) {
-                       /* make sure we have enough space for the
-                        * largest codepoint, i.e. 4 bytes plus
-                        * terminting NUL */
-                       size_t newlen = bl + 1024;
-                       dst = GDKrealloc(*buf, newlen);
-                       if (dst == NULL) {
-                               *buflen = bl;
_______________________________________________
checkin-list mailing list -- checkin-list@monetdb.org
To unsubscribe send an email to checkin-list-le...@monetdb.org

Reply via email to