Changeset: f6605069493d for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=f6605069493d
Modified Files:
        monetdb5/modules/atoms/str.c
        sql/test/SQLancer/Tests/sqlancer03.sql
        sql/test/SQLancer/Tests/sqlancer03.stable.out
Branch: Jun2020
Log Message:

Handle UTF-8 strings in the str.insert function


diffs (238 lines):

diff --git a/monetdb5/modules/atoms/str.c b/monetdb5/modules/atoms/str.c
--- a/monetdb5/modules/atoms/str.c
+++ b/monetdb5/modules/atoms/str.c
@@ -3156,6 +3156,59 @@ UTF8_strtail(const char *s, int pos)
        return (str) s;
 }
 
+static inline str
+UTF8_strncpy(char *restrict dst, const char *restrict s, int n)
+{
+       UTF8_assert(s);
+       while (*s && n) {
+               if ((*s & 0xF8) == 0xF0) {
+                       /* 4 byte UTF-8 sequence */
+                       *dst++ = *s++;
+                       *dst++ = *s++;
+                       *dst++ = *s++;
+                       *dst++ = *s++;
+               } else if ((*s & 0xF0) == 0xE0) {
+                       /* 3 byte UTF-8 sequence */
+                       *dst++ = *s++;
+                       *dst++ = *s++;
+                       *dst++ = *s++;
+               } else if ((*s & 0xE0) == 0xC0) {
+                       /* 2 byte UTF-8 sequence */
+                       *dst++ = *s++;
+                       *dst++ = *s++;
+               } else {
+                       /* 1 byte UTF-8 "sequence" */
+                       *dst++ = *s++;
+               }
+               n--;
+       }
+       *dst = '\0';
+       return dst;
+}
+
+static inline str
+UTF8_offset(char *restrict s, int n)
+{
+       UTF8_assert(s);
+       while (*s && n) {
+               if ((*s & 0xF8) == 0xF0) {
+                       /* 4 byte UTF-8 sequence */
+                       s += 4;
+               } else if ((*s & 0xF0) == 0xE0) {
+                       /* 3 byte UTF-8 sequence */
+                       s += 3;
+               } else if ((*s & 0xE0) == 0xC0) {
+                       /* 2 byte UTF-8 sequence */
+                       s += 2;
+               } else {
+                       /* 1 byte UTF-8 "sequence" */
+                       s++;
+               }
+               n--;
+       }
+       return s;
+}
+
 static str
 convertCase(BAT *from, BAT *to, str *res, const char *src, const char *malfunc)
 {
@@ -4120,20 +4173,18 @@ STRlocate(int *ret, const str *needle, c
 }
 
 str
-STRinsert(str *ret, const str *s, const int *start, const int *l, const str *s2)
+STRinsert(str *ret, const str *input, const int *start, const int *nchars, const str *input2)
 {
-       str v;
-       int strt = *start;
-       if (strNil(*s) || strNil(*s2) || is_int_nil(*start) || is_int_nil(*l)) {
+       str v, s = *input, s2 = *input2;
+       int strt = *start, l = *nchars;
+
+       if (strNil(s) || strNil(s2) || is_int_nil(strt) || is_int_nil(l)) {
                if ((*ret = GDKstrdup(str_nil)) == NULL)
                        throw(MAL, "str.insert", SQLSTATE(HY013) MAL_MALLOC_FAIL);
        } else {
-               size_t l1 = strlen(*s);
-               size_t l2 = strlen(*s2);
+               size_t l1 = UTF8_strlen(s);
 
-               if (l1 + l2 + 1 >= INT_MAX)
-                       throw(MAL, "str.insert", SQLSTATE(HY013) MAL_MALLOC_FAIL);
-               if (*l < 0)
+               if (l < 0)
                        throw(MAL, "str.insert", SQLSTATE(42000) "The number of characters for insert function must be non negative");
                if (strt < 0) {
                        if ((size_t) -strt <= l1)
@@ -4143,15 +4194,14 @@ STRinsert(str *ret, const str *s, const 
                }
                if ((size_t) strt > l1)
                        strt = (int) l1;
-               v = *ret = GDKmalloc(strlen(*s) + strlen(*s2) + 1);
+               v = *ret = GDKmalloc(strlen(s) + strlen(s2) + 1);
                if (v == NULL)
                        throw(MAL, "str.insert", SQLSTATE(HY013) MAL_MALLOC_FAIL);
                if (strt > 0)
-                       strncpy(v, *s, strt);
-               v[strt] = 0;
-               strcpy(v + strt, *s2);
-               if (strt + *l < (int) l1)
-                       strcat(v, *s + strt + *l);
+                       v = UTF8_strncpy(v, s, strt);
+               strcpy(v, s2);
+               if (strt + l < (int) l1)
+                       strcat(v, UTF8_offset(s, strt + l));
        }
        return MAL_SUCCEED;
 }
diff --git a/sql/test/SQLancer/Tests/sqlancer03.sql b/sql/test/SQLancer/Tests/sqlancer03.sql
--- a/sql/test/SQLancer/Tests/sqlancer03.sql
+++ b/sql/test/SQLancer/Tests/sqlancer03.sql
@@ -24,3 +24,44 @@ SELECT sql_min(sql_max(NULL, ''), '');
 SELECT ALL length(upper(MIN(ALL CAST(((trim(CAST(r'' AS STRING(659)), CAST(r'o3%+i]抔DCöf▟nßOpNbybಜ7' AS STRING)))||(sql_min(sql_max(NULL, r''), splitpart(r'x', r',7+.', t0.c1)))) AS STRING(151))))), 0.4179268710155164 
 FROM v0 LEFT OUTER JOIN t0 ON NOT (t0.c0) WHERE t0.c0 GROUP BY 0.3584962, CAST(t0.c1 AS STRING(601)), t0.c1;
 ROLLBACK;
+
+START TRANSACTION; -- Bug 6919
+CREATE TABLE "sys"."t0" (
+       "c0" INTEGER       NOT NULL,
+       "c1" DOUBLE,
+       CONSTRAINT "t0_c0_pkey" PRIMARY KEY ("c0")
+);
+COPY 29 RECORDS INTO "sys"."t0" FROM stdin USING DELIMITERS E'\t',E'\n','"';
+6      0.01926179604972278
+7      0.01926179604972278
+8      0.01926179604972278
+9      0.01926179604972278
+10     0.01926179604972278
+11     0.01926179604972278
+12     0.01926179604972278
+13     0.01926179604972278
+14     0.01926179604972278
+15     0.01926179604972278
+16     0.01926179604972278
+17     0.01926179604972278
+954233931      0.01926179604972278
+-890980732     0.01926179604972278
+18     0.9441921149477416
+19     0.8647722974466762
+20     0.6303259287607281
+21     0.7198562388857971
+22     1905034025
+1927464158     0.827299544139285
+421223489      0.03854140660184213
+-906851618     0.01926179604972278
+23     0.44641096314987394
+24     0.5358519423727929
+25     0.8490801972106654
+911090097      1
+-708085857     0.7843275143974144
+26     1130231849
+27     0.1052118441396751
+
+select "insert"('屁{珙', 1, 1, '1'), "insert"('屁{珙', 1, 1, '抔'), "insert"('屁抔珙', 1, 1, 'ಜ'), "insert"('a', 0, 1, 'ಜ'), "insert"('a', 0, 0, 'ಜ');
+select "insert"('屁{珙', 1, 1, '1'), "insert"('屁{珙', 1, 1, '抔'), "insert"('屁抔珙', 1, 1, 'ಜ') from t0;
+ROLLBACK;
diff --git a/sql/test/SQLancer/Tests/sqlancer03.stable.out b/sql/test/SQLancer/Tests/sqlancer03.stable.out
--- a/sql/test/SQLancer/Tests/sqlancer03.stable.out
+++ b/sql/test/SQLancer/Tests/sqlancer03.stable.out
@@ -60,6 +60,74 @@ stdout of test 'sqlancer03` in directory
 % int, decimal # type
 % 1,   19 # length
 #ROLLBACK;
+#START TRANSACTION; -- Bug 6919
+#CREATE TABLE "sys"."t0" (
+#      "c0" INTEGER       NOT NULL,
+#      "c1" DOUBLE,
+#      CONSTRAINT "t0_c0_pkey" PRIMARY KEY ("c0")
+#);
+#COPY 29 RECORDS INTO "sys"."t0" FROM stdin USING DELIMITERS E'\t',E'\n','"';
+#6     0.01926179604972278
+#7     0.01926179604972278
+#8     0.01926179604972278
+#9     0.01926179604972278
+#10    0.01926179604972278
+#11    0.01926179604972278
+#12    0.01926179604972278
+#13    0.01926179604972278
+#14    0.01926179604972278
+#15    0.01926179604972278
+#16    0.01926179604972278
+#17    0.01926179604972278
+#954233931     0.01926179604972278
+#-890980732    0.01926179604972278
+#18    0.9441921149477416
+#19    0.8647722974466762
+#20    0.6303259287607281
+#21    0.7198562388857971
+#22    1905034025
+[ 29   ]
+#select "insert"('屁{珙', 1, 1, '1'), "insert"('屁{珙', 1, 1, '抔'), "insert"('屁抔珙', 1, 1, 'ಜ'), "insert"('a', 0, 1, 'ಜ'), "insert"('a', 0, 0, 'ಜ');
+% .%2, .%3,    .%4,    .%5,    .%6 # table_name
+% %2,  %3,     %4,     %5,     %6 # name
+% clob,        clob,   clob,   clob,   clob # type
+% 5,   6,      5,      1,      2 # length
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙",  "ಜ",    "ಜa"    ]
+#select "insert"('屁{珙', 1, 1, '1'), "insert"('屁{珙', 1, 1, '抔'), "insert"('屁抔珙', 1, 1, 'ಜ') from t0;
+% .%1, .%2,    .%3 # table_name
+% %1,  %2,     %3 # name
+% clob,        clob,   clob # type
+% 5,   6,      5 # length
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+[ "屁1珙",       "屁抔珙",  "屁ಜ珙"   ]
+#ROLLBACK;
 
 # 17:14:16 >  
 # 17:14:16 >  "Done."
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to