Changeset: f6acdb431fb1 for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/f6acdb431fb1 Modified Files: gdk/gdk.h gdk/gdk_string.c Branch: ascii-flag Log Message:
Do some special processing for ASCII prefixes. Also clear properties. diffs (truncated from 465 to 300 lines): diff --git a/gdk/gdk.h b/gdk/gdk.h --- a/gdk/gdk.h +++ b/gdk/gdk.h @@ -2341,8 +2341,8 @@ gdk_export gdk_return BATfirstn(BAT **to #include "gdk_calc.h" -gdk_export gdk_return GDKtoupper(char **buf, size_t *buflen, const char *s); -gdk_export gdk_return GDKtolower(char **buf, size_t *buflen, const char *s); +gdk_export gdk_return GDKtoupper(char **restrict buf, size_t *restrict buflen, const char *restrict s); +gdk_export gdk_return GDKtolower(char **restrict buf, size_t *restrict buflen, const char *restrict s); gdk_export int GDKstrncasecmp(const char *str1, const char *str2, size_t l1, size_t l2); gdk_export int GDKstrcasecmp(const char *s1, const char *s2); gdk_export char *GDKstrcasestr(const char *haystack, const char *needle); diff --git a/gdk/gdk_string.c b/gdk/gdk_string.c --- a/gdk/gdk_string.c +++ b/gdk/gdk_string.c @@ -1460,17 +1460,83 @@ GDKanalytical_str_group_concat(BAT *r, B * For the first byte of a UTF-8 encoding, use the value as index into * the table. If the value is zero, there are no conversions for any * UTF-8 string starting with this byte (this includes both multi-byte - * sequences and single-byte sequences). For a single-byte sequence, if - * the value is not zero, it is the converted codepoint. For a - * multi-byte sequence, if the value is not zero, it is an offset into - * the same table. The next byte is added to the offset and again used - * as index into the table (including the top two bits which are always - * 1 and 0 respectively). The process then repeats: if zero, no - * conversions for any sequence starting with the bytes looked up so - * far, if non-zero, if this is the last byte of a sequence, it is the - * converted codepoint, and otherwise a (new) offset into the same - * table. 
*/ -static int lowercase[4288] = { + * sequences and single-byte sequences, though note that for single-byte + * sequences (ASCII-compatible) the table is filled in completely at no + * extra cost). For a single-byte sequence, if the value is not zero, + * it is the converted codepoint. For a multi-byte sequence, if the + * value is not zero, it is an offset into the same table. The next + * byte is added to the offset and again used as index into the table + * (including the top two bits which are always 1 and 0 respectively). + * The process then repeats: if zero, no conversions for any sequence + * starting with the bytes looked up so far, if non-zero, if this is the + * last byte of a sequence, it is the converted codepoint, and otherwise + * a (new) offset into the same table. */ +static const int lowercase[4288] = { + [0x0] = 0x0, /* U+0000: <control> */ + [0x1] = 0x1, /* U+0001: <control> */ + [0x2] = 0x2, /* U+0002: <control> */ + [0x3] = 0x3, /* U+0003: <control> */ + [0x4] = 0x4, /* U+0004: <control> */ + [0x5] = 0x5, /* U+0005: <control> */ + [0x6] = 0x6, /* U+0006: <control> */ + [0x7] = 0x7, /* U+0007: <control> */ + [0x8] = 0x8, /* U+0008: <control> */ + [0x9] = 0x9, /* U+0009: <control> */ + [0xA] = 0xA, /* U+000A: <control> */ + [0xB] = 0xB, /* U+000B: <control> */ + [0xC] = 0xC, /* U+000C: <control> */ + [0xD] = 0xD, /* U+000D: <control> */ + [0xE] = 0xE, /* U+000E: <control> */ + [0xF] = 0xF, /* U+000F: <control> */ + [0x10] = 0x10, /* U+0010: <control> */ + [0x11] = 0x11, /* U+0011: <control> */ + [0x12] = 0x12, /* U+0012: <control> */ + [0x13] = 0x13, /* U+0013: <control> */ + [0x14] = 0x14, /* U+0014: <control> */ + [0x15] = 0x15, /* U+0015: <control> */ + [0x16] = 0x16, /* U+0016: <control> */ + [0x17] = 0x17, /* U+0017: <control> */ + [0x18] = 0x18, /* U+0018: <control> */ + [0x19] = 0x19, /* U+0019: <control> */ + [0x1A] = 0x1A, /* U+001A: <control> */ + [0x1B] = 0x1B, /* U+001B: <control> */ + [0x1C] = 0x1C, /* U+001C: <control> */ + [0x1D] 
= 0x1D, /* U+001D: <control> */ + [0x1E] = 0x1E, /* U+001E: <control> */ + [0x1F] = 0x1F, /* U+001F: <control> */ + [0x20] = 0x20, /* U+0020: SPACE */ + [0x21] = 0x21, /* U+0021: EXCLAMATION MARK */ + [0x22] = 0x22, /* U+0022: QUOTATION MARK */ + [0x23] = 0x23, /* U+0023: NUMBER SIGN */ + [0x24] = 0x24, /* U+0024: DOLLAR SIGN */ + [0x25] = 0x25, /* U+0025: PERCENT SIGN */ + [0x26] = 0x26, /* U+0026: AMPERSAND */ + [0x27] = 0x27, /* U+0027: APOSTROPHE */ + [0x28] = 0x28, /* U+0028: LEFT PARENTHESIS */ + [0x29] = 0x29, /* U+0029: RIGHT PARENTHESIS */ + [0x2A] = 0x2A, /* U+002A: ASTERISK */ + [0x2B] = 0x2B, /* U+002B: PLUS SIGN */ + [0x2C] = 0x2C, /* U+002C: COMMA */ + [0x2D] = 0x2D, /* U+002D: HYPHEN-MINUS */ + [0x2E] = 0x2E, /* U+002E: FULL STOP */ + [0x2F] = 0x2F, /* U+002F: SOLIDUS */ + [0x30] = 0x30, /* U+0030: DIGIT ZERO */ + [0x31] = 0x31, /* U+0031: DIGIT ONE */ + [0x32] = 0x32, /* U+0032: DIGIT TWO */ + [0x33] = 0x33, /* U+0033: DIGIT THREE */ + [0x34] = 0x34, /* U+0034: DIGIT FOUR */ + [0x35] = 0x35, /* U+0035: DIGIT FIVE */ + [0x36] = 0x36, /* U+0036: DIGIT SIX */ + [0x37] = 0x37, /* U+0037: DIGIT SEVEN */ + [0x38] = 0x38, /* U+0038: DIGIT EIGHT */ + [0x39] = 0x39, /* U+0039: DIGIT NINE */ + [0x3A] = 0x3A, /* U+003A: COLON */ + [0x3B] = 0x3B, /* U+003B: SEMICOLON */ + [0x3C] = 0x3C, /* U+003C: LESS-THAN SIGN */ + [0x3D] = 0x3D, /* U+003D: EQUALS SIGN */ + [0x3E] = 0x3E, /* U+003E: GREATER-THAN SIGN */ + [0x3F] = 0x3F, /* U+003F: QUESTION MARK */ + [0x40] = 0x40, /* U+0040: COMMERCIAL AT */ [0x41] = 0x61, /* U+0041: LATIN CAPITAL LETTER A */ [0x42] = 0x62, /* U+0042: LATIN CAPITAL LETTER B */ [0x43] = 0x63, /* U+0043: LATIN CAPITAL LETTER C */ @@ -1497,6 +1563,43 @@ static int lowercase[4288] = { [0x58] = 0x78, /* U+0058: LATIN CAPITAL LETTER X */ [0x59] = 0x79, /* U+0059: LATIN CAPITAL LETTER Y */ [0x5A] = 0x7A, /* U+005A: LATIN CAPITAL LETTER Z */ + [0x5B] = 0x5B, /* U+005B: LEFT SQUARE BRACKET */ + [0x5C] = 0x5C, /* U+005C: REVERSE SOLIDUS */ + [0x5D] = 
0x5D, /* U+005D: RIGHT SQUARE BRACKET */ + [0x5E] = 0x5E, /* U+005E: CIRCUMFLEX ACCENT */ + [0x5F] = 0x5F, /* U+005F: LOW LINE */ + [0x60] = 0x60, /* U+0060: GRAVE ACCENT */ + [0x61] = 0x61, /* U+0061: LATIN SMALL LETTER A */ + [0x62] = 0x62, /* U+0062: LATIN SMALL LETTER B */ + [0x63] = 0x63, /* U+0063: LATIN SMALL LETTER C */ + [0x64] = 0x64, /* U+0064: LATIN SMALL LETTER D */ + [0x65] = 0x65, /* U+0065: LATIN SMALL LETTER E */ + [0x66] = 0x66, /* U+0066: LATIN SMALL LETTER F */ + [0x67] = 0x67, /* U+0067: LATIN SMALL LETTER G */ + [0x68] = 0x68, /* U+0068: LATIN SMALL LETTER H */ + [0x69] = 0x69, /* U+0069: LATIN SMALL LETTER I */ + [0x6A] = 0x6A, /* U+006A: LATIN SMALL LETTER J */ + [0x6B] = 0x6B, /* U+006B: LATIN SMALL LETTER K */ + [0x6C] = 0x6C, /* U+006C: LATIN SMALL LETTER L */ + [0x6D] = 0x6D, /* U+006D: LATIN SMALL LETTER M */ + [0x6E] = 0x6E, /* U+006E: LATIN SMALL LETTER N */ + [0x6F] = 0x6F, /* U+006F: LATIN SMALL LETTER O */ + [0x70] = 0x70, /* U+0070: LATIN SMALL LETTER P */ + [0x71] = 0x71, /* U+0071: LATIN SMALL LETTER Q */ + [0x72] = 0x72, /* U+0072: LATIN SMALL LETTER R */ + [0x73] = 0x73, /* U+0073: LATIN SMALL LETTER S */ + [0x74] = 0x74, /* U+0074: LATIN SMALL LETTER T */ + [0x75] = 0x75, /* U+0075: LATIN SMALL LETTER U */ + [0x76] = 0x76, /* U+0076: LATIN SMALL LETTER V */ + [0x77] = 0x77, /* U+0077: LATIN SMALL LETTER W */ + [0x78] = 0x78, /* U+0078: LATIN SMALL LETTER X */ + [0x79] = 0x79, /* U+0079: LATIN SMALL LETTER Y */ + [0x7A] = 0x7A, /* U+007A: LATIN SMALL LETTER Z */ + [0x7B] = 0x7B, /* U+007B: LEFT CURLY BRACKET */ + [0x7C] = 0x7C, /* U+007C: VERTICAL LINE */ + [0x7D] = 0x7D, /* U+007D: RIGHT CURLY BRACKET */ + [0x7E] = 0x7E, /* U+007E: TILDE */ + [0x7F] = 0x7F, /* U+007F: <control> */ [0xC3] = 256 - 0x80, /* 303 ... 
*/ [256+0x0] = 0xE0, /* U+00C0: LATIN CAPITAL LETTER A WITH GRAVE */ [256+0x1] = 0xE1, /* U+00C1: LATIN CAPITAL LETTER A WITH ACUTE */ @@ -2969,7 +3072,104 @@ static int lowercase[4288] = { [4224+0x21] = 0x1E943, /* U+1E921: ADLAM CAPITAL LETTER SHA */ }; -static int uppercase[4544] = { +static const int uppercase[4544] = { + [0x0] = 0x0, /* U+0000: <control> */ + [0x1] = 0x1, /* U+0001: <control> */ + [0x2] = 0x2, /* U+0002: <control> */ + [0x3] = 0x3, /* U+0003: <control> */ + [0x4] = 0x4, /* U+0004: <control> */ + [0x5] = 0x5, /* U+0005: <control> */ + [0x6] = 0x6, /* U+0006: <control> */ + [0x7] = 0x7, /* U+0007: <control> */ + [0x8] = 0x8, /* U+0008: <control> */ + [0x9] = 0x9, /* U+0009: <control> */ + [0xA] = 0xA, /* U+000A: <control> */ + [0xB] = 0xB, /* U+000B: <control> */ + [0xC] = 0xC, /* U+000C: <control> */ + [0xD] = 0xD, /* U+000D: <control> */ + [0xE] = 0xE, /* U+000E: <control> */ + [0xF] = 0xF, /* U+000F: <control> */ + [0x10] = 0x10, /* U+0010: <control> */ + [0x11] = 0x11, /* U+0011: <control> */ + [0x12] = 0x12, /* U+0012: <control> */ + [0x13] = 0x13, /* U+0013: <control> */ + [0x14] = 0x14, /* U+0014: <control> */ + [0x15] = 0x15, /* U+0015: <control> */ + [0x16] = 0x16, /* U+0016: <control> */ + [0x17] = 0x17, /* U+0017: <control> */ + [0x18] = 0x18, /* U+0018: <control> */ + [0x19] = 0x19, /* U+0019: <control> */ + [0x1A] = 0x1A, /* U+001A: <control> */ + [0x1B] = 0x1B, /* U+001B: <control> */ + [0x1C] = 0x1C, /* U+001C: <control> */ + [0x1D] = 0x1D, /* U+001D: <control> */ + [0x1E] = 0x1E, /* U+001E: <control> */ + [0x1F] = 0x1F, /* U+001F: <control> */ + [0x20] = 0x20, /* U+0020: SPACE */ + [0x21] = 0x21, /* U+0021: EXCLAMATION MARK */ + [0x22] = 0x22, /* U+0022: QUOTATION MARK */ + [0x23] = 0x23, /* U+0023: NUMBER SIGN */ + [0x24] = 0x24, /* U+0024: DOLLAR SIGN */ + [0x25] = 0x25, /* U+0025: PERCENT SIGN */ + [0x26] = 0x26, /* U+0026: AMPERSAND */ + [0x27] = 0x27, /* U+0027: APOSTROPHE */ + [0x28] = 0x28, /* U+0028: LEFT PARENTHESIS */ + 
[0x29] = 0x29, /* U+0029: RIGHT PARENTHESIS */ + [0x2A] = 0x2A, /* U+002A: ASTERISK */ + [0x2B] = 0x2B, /* U+002B: PLUS SIGN */ + [0x2C] = 0x2C, /* U+002C: COMMA */ + [0x2D] = 0x2D, /* U+002D: HYPHEN-MINUS */ + [0x2E] = 0x2E, /* U+002E: FULL STOP */ + [0x2F] = 0x2F, /* U+002F: SOLIDUS */ + [0x30] = 0x30, /* U+0030: DIGIT ZERO */ + [0x31] = 0x31, /* U+0031: DIGIT ONE */ + [0x32] = 0x32, /* U+0032: DIGIT TWO */ + [0x33] = 0x33, /* U+0033: DIGIT THREE */ + [0x34] = 0x34, /* U+0034: DIGIT FOUR */ + [0x35] = 0x35, /* U+0035: DIGIT FIVE */ + [0x36] = 0x36, /* U+0036: DIGIT SIX */ + [0x37] = 0x37, /* U+0037: DIGIT SEVEN */ + [0x38] = 0x38, /* U+0038: DIGIT EIGHT */ + [0x39] = 0x39, /* U+0039: DIGIT NINE */ + [0x3A] = 0x3A, /* U+003A: COLON */ + [0x3B] = 0x3B, /* U+003B: SEMICOLON */ + [0x3C] = 0x3C, /* U+003C: LESS-THAN SIGN */ + [0x3D] = 0x3D, /* U+003D: EQUALS SIGN */ + [0x3E] = 0x3E, /* U+003E: GREATER-THAN SIGN */ + [0x3F] = 0x3F, /* U+003F: QUESTION MARK */ + [0x40] = 0x40, /* U+0040: COMMERCIAL AT */ + [0x41] = 0x41, /* U+0041: LATIN CAPITAL LETTER A */ + [0x42] = 0x42, /* U+0042: LATIN CAPITAL LETTER B */ + [0x43] = 0x43, /* U+0043: LATIN CAPITAL LETTER C */ + [0x44] = 0x44, /* U+0044: LATIN CAPITAL LETTER D */ + [0x45] = 0x45, /* U+0045: LATIN CAPITAL LETTER E */ + [0x46] = 0x46, /* U+0046: LATIN CAPITAL LETTER F */ + [0x47] = 0x47, /* U+0047: LATIN CAPITAL LETTER G */ + [0x48] = 0x48, /* U+0048: LATIN CAPITAL LETTER H */ + [0x49] = 0x49, /* U+0049: LATIN CAPITAL LETTER I */ + [0x4A] = 0x4A, /* U+004A: LATIN CAPITAL LETTER J */ + [0x4B] = 0x4B, /* U+004B: LATIN CAPITAL LETTER K */ + [0x4C] = 0x4C, /* U+004C: LATIN CAPITAL LETTER L */ + [0x4D] = 0x4D, /* U+004D: LATIN CAPITAL LETTER M */ + [0x4E] = 0x4E, /* U+004E: LATIN CAPITAL LETTER N */ + [0x4F] = 0x4F, /* U+004F: LATIN CAPITAL LETTER O */ + [0x50] = 0x50, /* U+0050: LATIN CAPITAL LETTER P */ + [0x51] = 0x51, /* U+0051: LATIN CAPITAL LETTER Q */ + [0x52] = 0x52, /* U+0052: LATIN CAPITAL LETTER R */ + [0x53] = 
0x53, /* U+0053: LATIN CAPITAL LETTER S */ + [0x54] = 0x54, /* U+0054: LATIN CAPITAL LETTER T */ + [0x55] = 0x55, /* U+0055: LATIN CAPITAL LETTER U */ + [0x56] = 0x56, /* U+0056: LATIN CAPITAL LETTER V */ + [0x57] = 0x57, /* U+0057: LATIN CAPITAL LETTER W */ + [0x58] = 0x58, /* U+0058: LATIN CAPITAL LETTER X */ + [0x59] = 0x59, /* U+0059: LATIN CAPITAL LETTER Y */ + [0x5A] = 0x5A, /* U+005A: LATIN CAPITAL LETTER Z */ + [0x5B] = 0x5B, /* U+005B: LEFT SQUARE BRACKET */ + [0x5C] = 0x5C, /* U+005C: REVERSE SOLIDUS */ + [0x5D] = 0x5D, /* U+005D: RIGHT SQUARE BRACKET */ + [0x5E] = 0x5E, /* U+005E: CIRCUMFLEX ACCENT */ + [0x5F] = 0x5F, /* U+005F: LOW LINE */ + [0x60] = 0x60, /* U+0060: GRAVE ACCENT */ [0x61] = 0x41, /* U+0061: LATIN SMALL LETTER A */ [0x62] = 0x42, /* U+0062: LATIN SMALL LETTER B */ [0x63] = 0x43, /* U+0063: LATIN SMALL LETTER C */ @@ -2996,6 +3196,11 @@ static int uppercase[4544] = { [0x78] = 0x58, /* U+0078: LATIN SMALL LETTER X */ [0x79] = 0x59, /* U+0079: LATIN SMALL LETTER Y */ [0x7A] = 0x5A, /* U+007A: LATIN SMALL LETTER Z */ + [0x7B] = 0x7B, /* U+007B: LEFT CURLY BRACKET */ + [0x7C] = 0x7C, /* U+007C: VERTICAL LINE */ + [0x7D] = 0x7D, /* U+007D: RIGHT CURLY BRACKET */ + [0x7E] = 0x7E, /* U+007E: TILDE */ + [0x7F] = 0x7F, /* U+007F: <control> */ [0xC2] = 256 - 0x80, /* 302 ... */ [256+0x35] = 0x39C, /* U+00B5: MICRO SIGN */ [0xC3] = 320 - 0x80, /* 303 ... */ @@ -4495,62 +4700,93 @@ static int uppercase[4544] = { * without error), the current buffer is in *buf, and the current size * in *buflen. 
*/ static gdk_return -convertcase(char **buf, size_t *buflen, const uint8_t *s, const int *convtab) +convertcase(char **restrict buf, size_t *restrict buflen, + const uint8_t *restrict s, const int *restrict convtab) { uint8_t *dst = (uint8_t *) *buf; size_t dstoff = 0; - size_t bl = *buflen; + size_t bl; - if (*buf == NULL) - bl = 0; - while (*s) { - /* we are at the start of a Unicode codepoint encoded in - * UTF-8 */ - if (dstoff + 5 > bl) { - /* make sure we have enough space for the - * largest codepoint, i.e. 4 bytes plus - * terminating NUL */ - size_t newlen = bl + 1024; - dst = GDKrealloc(*buf, newlen); - if (dst == NULL) { - *buflen = bl; _______________________________________________ checkin-list mailing list -- checkin-list@monetdb.org To unsubscribe send an email to checkin-list-le...@monetdb.org