Changeset: 6616adb34787 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/6616adb34787
Modified Files:
        clients/Tests/MAL-signatures-hge.test
        clients/Tests/MAL-signatures.test
        clients/Tests/exports.stable.out
        gdk/gdk.h
        gdk/gdk_string.c
        monetdb5/modules/atoms/str.c
        monetdb5/modules/kernel/batstr.c
        monetdb5/modules/mal/pcre.c
Branch: ascii-flag
Log Message:

Implemented case folding and used it for ILIKE.


diffs (truncated from 2394 to 300 lines):

diff --git a/clients/Tests/MAL-signatures-hge.test b/clients/Tests/MAL-signatures-hge.test
--- a/clients/Tests/MAL-signatures-hge.test
+++ b/clients/Tests/MAL-signatures-hge.test
@@ -34709,6 +34709,16 @@ pattern batstr.asciify(X_0:bat[:str], X_
 BATSTRasciify;
 Transform BAT of strings from UTF8 to ASCII
 batstr
+caseFold
+pattern batstr.caseFold(X_0:bat[:str]):bat[:str] 
+STRbatCaseFold;
+Fold the case of a string.
+batstr
+caseFold
+pattern batstr.caseFold(X_0:bat[:str], X_1:bat[:oid]):bat[:str] 
+STRbatCaseFold;
+Fold the case of a string.
+batstr
 contains
 pattern batstr.contains(X_0:bat[:str], X_1:bat[:str]):bat[:bit] 
 BATSTRcontains;
@@ -50684,6 +50694,11 @@ command str.asciify(X_0:str):str
 STRasciify;
 Transform string from UTF8 to ASCII
 str
+caseFold
+command str.caseFold(X_0:str):str 
+STRcasefold;
+Fold the case of a string.
+str
 contains
 pattern str.contains(X_0:str, X_1:str):bit 
 STRcontains;
diff --git a/clients/Tests/MAL-signatures.test b/clients/Tests/MAL-signatures.test
--- a/clients/Tests/MAL-signatures.test
+++ b/clients/Tests/MAL-signatures.test
@@ -25759,6 +25759,16 @@ pattern batstr.asciify(X_0:bat[:str], X_
 BATSTRasciify;
 Transform BAT of strings from UTF8 to ASCII
 batstr
+caseFold
+pattern batstr.caseFold(X_0:bat[:str]):bat[:str] 
+STRbatCaseFold;
+Fold the case of a string.
+batstr
+caseFold
+pattern batstr.caseFold(X_0:bat[:str], X_1:bat[:oid]):bat[:str] 
+STRbatCaseFold;
+Fold the case of a string.
+batstr
 contains
 pattern batstr.contains(X_0:bat[:str], X_1:bat[:str]):bat[:bit] 
 BATSTRcontains;
@@ -39009,6 +39019,11 @@ command str.asciify(X_0:str):str
 STRasciify;
 Transform string from UTF8 to ASCII
 str
+caseFold
+command str.caseFold(X_0:str):str 
+STRcasefold;
+Fold the case of a string.
+str
 contains
 pattern str.contains(X_0:str, X_1:str):bit 
 STRcontains;
diff --git a/clients/Tests/exports.stable.out b/clients/Tests/exports.stable.out
--- a/clients/Tests/exports.stable.out
+++ b/clients/Tests/exports.stable.out
@@ -109,6 +109,7 @@ dbl BATcalcvariance_population(dbl *avgp
 dbl BATcalcvariance_sample(dbl *avgp, BAT *b);
 BAT *BATcalcxor(BAT *b1, BAT *b2, BAT *s1, BAT *s2);
 BAT *BATcalcxorcst(BAT *b, const ValRecord *v, BAT *s);
+BAT *BATcasefold(BAT *b, BAT *s);
 bool BATcheckorderidx(BAT *b);
 gdk_return BATclear(BAT *b, bool force);
 void BATcommit(BAT *b, BUN size);
@@ -271,6 +272,7 @@ gdk_return GDKanalyticalsum(BAT *r, BAT 
 gdk_return GDKanalyticalwindowbounds(BAT *r, BAT *b, BAT *p, BAT *l, const void *restrict bound, int tp1, int tp2, int unit, bool preceding, oid first_half);
 gdk_return GDKasciify(char **restrict buf, size_t *restrict buflen, const char *restrict s);
 int GDKatomcnt;
+gdk_return GDKcasefold(char **restrict buf, size_t *restrict buflen, const char *restrict s);
 void GDKclrerr(void);
 gdk_return GDKcopyenv(BAT **key, BAT **val, bool writable);
 gdk_return GDKcreatedir(const char *nme);
diff --git a/gdk/gdk.h b/gdk/gdk.h
--- a/gdk/gdk.h
+++ b/gdk/gdk.h
@@ -2343,11 +2343,13 @@ gdk_export gdk_return BATfirstn(BAT **to
 
 gdk_export gdk_return GDKtoupper(char **restrict buf, size_t *restrict buflen, const char *restrict s);
 gdk_export gdk_return GDKtolower(char **restrict buf, size_t *restrict buflen, const char *restrict s);
+gdk_export gdk_return GDKcasefold(char **restrict buf, size_t *restrict buflen, const char *restrict s);
 gdk_export int GDKstrncasecmp(const char *str1, const char *str2, size_t l1, size_t l2);
 gdk_export int GDKstrcasecmp(const char *s1, const char *s2);
 gdk_export char *GDKstrcasestr(const char *haystack, const char *needle);
+gdk_export BAT *BATtoupper(BAT *b, BAT *s);
 gdk_export BAT *BATtolower(BAT *b, BAT *s);
-gdk_export BAT *BATtoupper(BAT *b, BAT *s);
+gdk_export BAT *BATcasefold(BAT *b, BAT *s);
 gdk_export gdk_return GDKasciify(char **restrict buf, size_t *restrict buflen, const char *restrict s);
 gdk_export BAT *BATasciify(BAT *b, BAT *s);
 
diff --git a/gdk/gdk_string.c b/gdk/gdk_string.c
--- a/gdk/gdk_string.c
+++ b/gdk/gdk_string.c
@@ -1473,8 +1473,80 @@ GDKanalytical_str_group_concat(BAT *r, B
  * a (new) offset into the same table. */
 static const char *const specialcase[] = {
        NULL,
+       "ss",
+       "i\xCC\x87",
+       "\xCA\xBCn",
+       "j\xCC\x8C",
+       "\xCE\xB9\xCC\x88\xCC\x81",
+       "\xCF\x85\xCC\x88\xCC\x81",
+       "\xD5\xA5\xD6\x82",
+       "h\xCC\xB1",
+       "t\xCC\x88",
+       "w\xCC\x8A",
+       "y\xCC\x8A",
+       "a\xCA\xBE",
+       "\xCF\x85\xCC\x93",
+       "\xCF\x85\xCC\x93\xCC\x80",
+       "\xCF\x85\xCC\x93\xCC\x81",
+       "\xCF\x85\xCC\x93\xCD\x82",
+       "\xE1\xBC\x80\xCE\xB9",
+       "\xE1\xBC\x81\xCE\xB9",
+       "\xE1\xBC\x82\xCE\xB9",
+       "\xE1\xBC\x83\xCE\xB9",
+       "\xE1\xBC\x84\xCE\xB9",
+       "\xE1\xBC\x85\xCE\xB9",
+       "\xE1\xBC\x86\xCE\xB9",
+       "\xE1\xBC\x87\xCE\xB9",
+       "\xE1\xBC\xA0\xCE\xB9",
+       "\xE1\xBC\xA1\xCE\xB9",
+       "\xE1\xBC\xA2\xCE\xB9",
+       "\xE1\xBC\xA3\xCE\xB9",
+       "\xE1\xBC\xA4\xCE\xB9",
+       "\xE1\xBC\xA5\xCE\xB9",
+       "\xE1\xBC\xA6\xCE\xB9",
+       "\xE1\xBC\xA7\xCE\xB9",
+       "\xE1\xBD\xA0\xCE\xB9",
+       "\xE1\xBD\xA1\xCE\xB9",
+       "\xE1\xBD\xA2\xCE\xB9",
+       "\xE1\xBD\xA3\xCE\xB9",
+       "\xE1\xBD\xA4\xCE\xB9",
+       "\xE1\xBD\xA5\xCE\xB9",
+       "\xE1\xBD\xA6\xCE\xB9",
+       "\xE1\xBD\xA7\xCE\xB9",
+       "\xE1\xBD\xB0\xCE\xB9",
+       "\xCE\xB1\xCE\xB9",
+       "\xCE\xAC\xCE\xB9",
+       "\xCE\xB1\xCD\x82",
+       "\xCE\xB1\xCD\x82\xCE\xB9",
+       "\xE1\xBD\xB4\xCE\xB9",
+       "\xCE\xB7\xCE\xB9",
+       "\xCE\xAE\xCE\xB9",
+       "\xCE\xB7\xCD\x82",
+       "\xCE\xB7\xCD\x82\xCE\xB9",
+       "\xCE\xB9\xCC\x88\xCC\x80",
+       "\xCE\xB9\xCD\x82",
+       "\xCE\xB9\xCC\x88\xCD\x82",
+       "\xCF\x85\xCC\x88\xCC\x80",
+       "\xCF\x81\xCC\x93",
+       "\xCF\x85\xCD\x82",
+       "\xCF\x85\xCC\x88\xCD\x82",
+       "\xE1\xBD\xBC\xCE\xB9",
+       "\xCF\x89\xCE\xB9",
+       "\xCF\x8E\xCE\xB9",
+       "\xCF\x89\xCD\x82",
+       "\xCF\x89\xCD\x82\xCE\xB9",
+       "ff",
+       "fi",
+       "fl",
+       "ffi",
+       "ffl",
+       "st",
+       "\xD5\xB4\xD5\xB6",
+       "\xD5\xB4\xD5\xA5",
+       "\xD5\xB4\xD5\xAB",
+       "\xD5\xBE\xD5\xB6",
+       "\xD5\xB4\xD5\xAD",
        "SS",
-       "i\xCC\x87",
        "FF",
        "FI",
        "FL",
@@ -1548,6 +1620,38 @@ static const char *const specialcase[] =
        "\xCE\xA9\xCD\x82\xCE\x99",
 };
 static const int lowercase[4288] = {
+       [0x00] = 0x0000,        /* U+0000: <control> */
+       [0x01] = 0x0001,        /* U+0001: <control> */
+       [0x02] = 0x0002,        /* U+0002: <control> */
+       [0x03] = 0x0003,        /* U+0003: <control> */
+       [0x04] = 0x0004,        /* U+0004: <control> */
+       [0x05] = 0x0005,        /* U+0005: <control> */
+       [0x06] = 0x0006,        /* U+0006: <control> */
+       [0x07] = 0x0007,        /* U+0007: <control> */
+       [0x08] = 0x0008,        /* U+0008: <control> */
+       [0x09] = 0x0009,        /* U+0009: <control> */
+       [0x0A] = 0x000A,        /* U+000A: <control> */
+       [0x0B] = 0x000B,        /* U+000B: <control> */
+       [0x0C] = 0x000C,        /* U+000C: <control> */
+       [0x0D] = 0x000D,        /* U+000D: <control> */
+       [0x0E] = 0x000E,        /* U+000E: <control> */
+       [0x0F] = 0x000F,        /* U+000F: <control> */
+       [0x10] = 0x0010,        /* U+0010: <control> */
+       [0x11] = 0x0011,        /* U+0011: <control> */
+       [0x12] = 0x0012,        /* U+0012: <control> */
+       [0x13] = 0x0013,        /* U+0013: <control> */
+       [0x14] = 0x0014,        /* U+0014: <control> */
+       [0x15] = 0x0015,        /* U+0015: <control> */
+       [0x16] = 0x0016,        /* U+0016: <control> */
+       [0x17] = 0x0017,        /* U+0017: <control> */
+       [0x18] = 0x0018,        /* U+0018: <control> */
+       [0x19] = 0x0019,        /* U+0019: <control> */
+       [0x1A] = 0x001A,        /* U+001A: <control> */
+       [0x1B] = 0x001B,        /* U+001B: <control> */
+       [0x1C] = 0x001C,        /* U+001C: <control> */
+       [0x1D] = 0x001D,        /* U+001D: <control> */
+       [0x1E] = 0x001E,        /* U+001E: <control> */
+       [0x1F] = 0x001F,        /* U+001F: <control> */
        [0x20] = 0x0020,        /* U+0020: SPACE */
        [0x21] = 0x0021,        /* U+0021: EXCLAMATION MARK */
        [0x22] = 0x0022,        /* U+0022: QUOTATION MARK */
@@ -1643,6 +1747,7 @@ static const int lowercase[4288] = {
        [0x7C] = 0x007C,        /* U+007C: VERTICAL LINE */
        [0x7D] = 0x007D,        /* U+007D: RIGHT CURLY BRACKET */
        [0x7E] = 0x007E,        /* U+007E: TILDE */
+       [0x7F] = 0x007F,        /* U+007F: <control> */
        [0xC3] = 256 - 0x80,    /* 303 ... */
        [256+0x00] = 0x00E0,    /* U+00C0: LATIN CAPITAL LETTER A WITH GRAVE */
        [256+0x01] = 0x00E1,    /* U+00C1: LATIN CAPITAL LETTER A WITH ACUTE */
@@ -3115,6 +3220,38 @@ static const int lowercase[4288] = {
        [4224+0x21] = 0x1E943,  /* U+1E921: ADLAM CAPITAL LETTER SHA */
 };
 static const int uppercase[4608] = {
+       [0x00] = 0x0000,        /* U+0000: <control> */
+       [0x01] = 0x0001,        /* U+0001: <control> */
+       [0x02] = 0x0002,        /* U+0002: <control> */
+       [0x03] = 0x0003,        /* U+0003: <control> */
+       [0x04] = 0x0004,        /* U+0004: <control> */
+       [0x05] = 0x0005,        /* U+0005: <control> */
+       [0x06] = 0x0006,        /* U+0006: <control> */
+       [0x07] = 0x0007,        /* U+0007: <control> */
+       [0x08] = 0x0008,        /* U+0008: <control> */
+       [0x09] = 0x0009,        /* U+0009: <control> */
+       [0x0A] = 0x000A,        /* U+000A: <control> */
+       [0x0B] = 0x000B,        /* U+000B: <control> */
+       [0x0C] = 0x000C,        /* U+000C: <control> */
+       [0x0D] = 0x000D,        /* U+000D: <control> */
+       [0x0E] = 0x000E,        /* U+000E: <control> */
+       [0x0F] = 0x000F,        /* U+000F: <control> */
+       [0x10] = 0x0010,        /* U+0010: <control> */
+       [0x11] = 0x0011,        /* U+0011: <control> */
+       [0x12] = 0x0012,        /* U+0012: <control> */
+       [0x13] = 0x0013,        /* U+0013: <control> */
+       [0x14] = 0x0014,        /* U+0014: <control> */
+       [0x15] = 0x0015,        /* U+0015: <control> */
+       [0x16] = 0x0016,        /* U+0016: <control> */
+       [0x17] = 0x0017,        /* U+0017: <control> */
+       [0x18] = 0x0018,        /* U+0018: <control> */
+       [0x19] = 0x0019,        /* U+0019: <control> */
+       [0x1A] = 0x001A,        /* U+001A: <control> */
+       [0x1B] = 0x001B,        /* U+001B: <control> */
+       [0x1C] = 0x001C,        /* U+001C: <control> */
+       [0x1D] = 0x001D,        /* U+001D: <control> */
+       [0x1E] = 0x001E,        /* U+001E: <control> */
+       [0x1F] = 0x001F,        /* U+001F: <control> */
        [0x20] = 0x0020,        /* U+0020: SPACE */
        [0x21] = 0x0021,        /* U+0021: EXCLAMATION MARK */
        [0x22] = 0x0022,        /* U+0022: QUOTATION MARK */
@@ -3210,10 +3347,11 @@ static const int uppercase[4608] = {
        [0x7C] = 0x007C,        /* U+007C: VERTICAL LINE */
        [0x7D] = 0x007D,        /* U+007D: RIGHT CURLY BRACKET */
        [0x7E] = 0x007E,        /* U+007E: TILDE */
+       [0x7F] = 0x007F,        /* U+007F: <control> */
        [0xC2] = 256 - 0x80,    /* 302 ... */
        [256+0x35] = 0x039C,    /* U+00B5: MICRO SIGN */
        [0xC3] = 320 - 0x80,    /* 303 ... */
-       [320+0x1F] = -1,        /* U+00DF: LATIN SMALL LETTER SHARP S */
+       [320+0x1F] = -74,       /* U+00DF: LATIN SMALL LETTER SHARP S */
        [320+0x20] = 0x00C0,    /* U+00E0: LATIN SMALL LETTER A WITH GRAVE */
        [320+0x21] = 0x00C1,    /* U+00E1: LATIN SMALL LETTER A WITH ACUTE */
        [320+0x22] = 0x00C2,    /* U+00E2: LATIN SMALL LETTER A WITH CIRCUMFLEX */
@@ -3283,7 +3421,7 @@ static const int uppercase[4608] = {
        [448+0x04] = 0x0143,    /* U+0144: LATIN SMALL LETTER N WITH ACUTE */
        [448+0x06] = 0x0145,    /* U+0146: LATIN SMALL LETTER N WITH CEDILLA */
        [448+0x08] = 0x0147,    /* U+0148: LATIN SMALL LETTER N WITH CARON */
-       [448+0x09] = -15,       /* U+0149: LATIN SMALL LETTER N PRECEDED BY APOSTROPHE */
+       [448+0x09] = -87,       /* U+0149: LATIN SMALL LETTER N PRECEDED BY APOSTROPHE */
        [448+0x0B] = 0x014A,    /* U+014B: LATIN SMALL LETTER ENG */
        [448+0x0D] = 0x014C,    /* U+014D: LATIN SMALL LETTER O WITH MACRON */
        [448+0x0F] = 0x014E,    /* U+014F: LATIN SMALL LETTER O WITH BREVE */
@@ -3358,7 +3496,7 @@ static const int uppercase[4608] = {
        [576+0x2B] = 0x01EA,    /* U+01EB: LATIN SMALL LETTER O WITH OGONEK */
        [576+0x2D] = 0x01EC,    /* U+01ED: LATIN SMALL LETTER O WITH OGONEK AND MACRON */
        [576+0x2F] = 0x01EE,    /* U+01EF: LATIN SMALL LETTER EZH WITH CARON */
-       [576+0x30] = -18,       /* U+01F0: LATIN SMALL LETTER J WITH CARON */
+       [576+0x30] = -90,       /* U+01F0: LATIN SMALL LETTER J WITH CARON */
_______________________________________________
checkin-list mailing list -- checkin-list@monetdb.org
To unsubscribe send an email to checkin-list-le...@monetdb.org

Reply via email to