This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new 1b108604d5e branch-2.1: [fix](function) fix error result in split_by_string with utf8 chars #40710 (#50689)
1b108604d5e is described below

commit 1b108604d5eeb2ec2f0639a42f6cd465351c9a84
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Thu May 8 19:15:52 2025 +0800

    branch-2.1: [fix](function) fix error result in split_by_string with utf8 chars #40710 (#50689)
    
    Cherry-picked from #40710
    
    Co-authored-by: Mryange <[email protected]>
---
 be/src/vec/functions/function_string.h             |  83 ++++++++++++---------
 .../string_functions/test_split_by_string.out      | Bin 1880 -> 2000 bytes
 .../string_functions/test_split_by_string.groovy   |   2 +
 3 files changed, 49 insertions(+), 36 deletions(-)

diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h
index 69613662a49..7411f944cbe 100644
--- a/be/src/vec/functions/function_string.h
+++ b/be/src/vec/functions/function_string.h
@@ -2618,18 +2618,8 @@ private:
                 continue;
             }
             if (delimiter_ref.size == 0) {
-                for (size_t str_pos = 0; str_pos < str_ref.size;) {
-                    const size_t str_offset = str_pos;
-                    const size_t old_size = column_string_chars.size();
-                    str_pos++;
-                    const size_t new_size = old_size + 1;
-                    column_string_chars.resize(new_size);
-                    memcpy(column_string_chars.data() + old_size, str_ref.data 
+ str_offset, 1);
-                    (*dest_nested_null_map).push_back(false);
-                    string_pos++;
-                    dest_pos++;
-                    column_string_offsets.push_back(string_pos);
-                }
+                split_empty_delimiter(str_ref, column_string_chars, 
column_string_offsets,
+                                      dest_nested_null_map, string_pos, 
dest_pos);
             } else {
                 for (size_t str_pos = 0; str_pos <= str_ref.size;) {
                     const size_t str_offset = str_pos;
@@ -2684,18 +2674,8 @@ private:
                 continue;
             }
             if (delimiter_ref.size == 0) {
-                for (size_t str_pos = 0; str_pos < str_ref.size;) {
-                    const size_t str_offset = str_pos;
-                    const size_t old_size = column_string_chars.size();
-                    str_pos++;
-                    const size_t new_size = old_size + 1;
-                    column_string_chars.resize(new_size);
-                    memcpy(column_string_chars.data() + old_size, str_ref.data 
+ str_offset, 1);
-                    (*dest_nested_null_map).push_back(false);
-                    string_pos++;
-                    dest_pos++;
-                    column_string_offsets.push_back(string_pos);
-                }
+                split_empty_delimiter(str_ref, column_string_chars, 
column_string_offsets,
+                                      dest_nested_null_map, string_pos, 
dest_pos);
             } else {
                 for (size_t str_pos = 0; str_pos <= str_ref.size;) {
                     const size_t str_offset = str_pos;
@@ -2736,18 +2716,8 @@ private:
             const StringRef delimiter_ref = delimiter_col.get_data_at(i);
 
             if (delimiter_ref.size == 0) {
-                for (size_t str_pos = 0; str_pos < str_ref.size;) {
-                    const size_t str_offset = str_pos;
-                    const size_t old_size = column_string_chars.size();
-                    str_pos++;
-                    const size_t new_size = old_size + 1;
-                    column_string_chars.resize(new_size);
-                    memcpy(column_string_chars.data() + old_size, str_ref.data 
+ str_offset, 1);
-                    (*dest_nested_null_map).push_back(false);
-                    string_pos++;
-                    dest_pos++;
-                    column_string_offsets.push_back(string_pos);
-                }
+                split_empty_delimiter(str_ref, column_string_chars, 
column_string_offsets,
+                                      dest_nested_null_map, string_pos, 
dest_pos);
             } else {
                 for (size_t str_pos = 0; str_pos <= str_ref.size;) {
                     const size_t str_offset = str_pos;
@@ -2781,6 +2751,47 @@ private:
         }
         return pos - old_size;
     }
+
+    void split_empty_delimiter(const StringRef& str_ref, ColumnString::Chars& 
column_string_chars,
+                               ColumnString::Offsets& column_string_offsets,
+                               NullMapType* dest_nested_null_map, 
ColumnArray::Offset64& string_pos,
+                               ColumnArray::Offset64& dest_pos) const {
+        const size_t old_size = column_string_chars.size();
+        const size_t new_size = old_size + str_ref.size;
+        column_string_chars.resize(new_size);
+        memcpy(column_string_chars.data() + old_size, str_ref.data, 
str_ref.size);
+        if (simd::VStringFunctions::is_ascii(str_ref)) {
+            const auto size = str_ref.size;
+
+            dest_nested_null_map->resize_fill(dest_nested_null_map->size() + 
size, false);
+
+            const auto old_size = column_string_offsets.size();
+            const auto new_size = old_size + size;
+            column_string_offsets.resize(new_size);
+            std::iota(column_string_offsets.data() + old_size,
+                      column_string_offsets.data() + new_size, string_pos + 1);
+
+            string_pos += size;
+            dest_pos += size;
+            // The above code is equivalent to the code in the following 
comment.
+            // for (size_t i = 0; i < str_ref.size; i++) {
+            //     string_pos++;
+            //     column_string_offsets.push_back(string_pos);
+            //     (*dest_nested_null_map).push_back(false);
+            //     dest_pos++;
+            // }
+        } else {
+            for (size_t i = 0, utf8_char_len = 0; i < str_ref.size; i += 
utf8_char_len) {
+                utf8_char_len = UTF8_BYTE_LENGTH[(unsigned 
char)str_ref.data[i]];
+
+                string_pos += utf8_char_len;
+                column_string_offsets.push_back(string_pos);
+
+                (*dest_nested_null_map).push_back(false);
+                dest_pos++;
+            }
+        }
+    }
 };
 
 class FunctionCountSubString : public IFunction {
diff --git a/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out b/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out
index c46fa2bd27e..0a335392197 100644
Binary files a/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out and b/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out differ
diff --git a/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy b/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy
index 2ec70e36124..6b7f32b5aae 100644
--- a/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy
+++ b/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy
@@ -18,6 +18,7 @@
 suite("test_split_by_string") {
     // split by char
     qt_sql "select split_by_string('abcde','');"
+    qt_sql "select split_by_string('你a好b世c界','');"
     qt_sql "select split_by_string('12553','');"
     qt_sql "select split_by_string('','');"
     qt_sql "select split_by_string('',',');"
@@ -70,6 +71,7 @@ suite("test_split_by_string") {
     sql """ INSERT INTO ${tableName1} VALUES(9, 'a,b,c,', ',') """
     sql """ INSERT INTO ${tableName1} VALUES(10, null, ',') """
     sql """ INSERT INTO ${tableName1} VALUES(11, 'a,b,c,12345,', ',') """
+    sql """ INSERT INTO ${tableName1} VALUES(12, '你a好b世c界', '') """
 
     qt_sql "SELECT *, split_by_string(v1, v2) FROM ${tableName1} ORDER BY k1"
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to