This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
new 1b108604d5e branch-2.1: [fix](function) fix error result in
split_by_string with utf8 chars #40710 (#50689)
1b108604d5e is described below
commit 1b108604d5eeb2ec2f0639a42f6cd465351c9a84
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Thu May 8 19:15:52 2025 +0800
branch-2.1: [fix](function) fix error result in split_by_string with utf8
chars #40710 (#50689)
Cherry-picked from #40710
Co-authored-by: Mryange <[email protected]>
---
be/src/vec/functions/function_string.h | 83 ++++++++++++---------
.../string_functions/test_split_by_string.out | Bin 1880 -> 2000 bytes
.../string_functions/test_split_by_string.groovy | 2 +
3 files changed, 49 insertions(+), 36 deletions(-)
diff --git a/be/src/vec/functions/function_string.h
b/be/src/vec/functions/function_string.h
index 69613662a49..7411f944cbe 100644
--- a/be/src/vec/functions/function_string.h
+++ b/be/src/vec/functions/function_string.h
@@ -2618,18 +2618,8 @@ private:
continue;
}
if (delimiter_ref.size == 0) {
- for (size_t str_pos = 0; str_pos < str_ref.size;) {
- const size_t str_offset = str_pos;
- const size_t old_size = column_string_chars.size();
- str_pos++;
- const size_t new_size = old_size + 1;
- column_string_chars.resize(new_size);
- memcpy(column_string_chars.data() + old_size, str_ref.data
+ str_offset, 1);
- (*dest_nested_null_map).push_back(false);
- string_pos++;
- dest_pos++;
- column_string_offsets.push_back(string_pos);
- }
+ split_empty_delimiter(str_ref, column_string_chars,
column_string_offsets,
+ dest_nested_null_map, string_pos,
dest_pos);
} else {
for (size_t str_pos = 0; str_pos <= str_ref.size;) {
const size_t str_offset = str_pos;
@@ -2684,18 +2674,8 @@ private:
continue;
}
if (delimiter_ref.size == 0) {
- for (size_t str_pos = 0; str_pos < str_ref.size;) {
- const size_t str_offset = str_pos;
- const size_t old_size = column_string_chars.size();
- str_pos++;
- const size_t new_size = old_size + 1;
- column_string_chars.resize(new_size);
- memcpy(column_string_chars.data() + old_size, str_ref.data
+ str_offset, 1);
- (*dest_nested_null_map).push_back(false);
- string_pos++;
- dest_pos++;
- column_string_offsets.push_back(string_pos);
- }
+ split_empty_delimiter(str_ref, column_string_chars,
column_string_offsets,
+ dest_nested_null_map, string_pos,
dest_pos);
} else {
for (size_t str_pos = 0; str_pos <= str_ref.size;) {
const size_t str_offset = str_pos;
@@ -2736,18 +2716,8 @@ private:
const StringRef delimiter_ref = delimiter_col.get_data_at(i);
if (delimiter_ref.size == 0) {
- for (size_t str_pos = 0; str_pos < str_ref.size;) {
- const size_t str_offset = str_pos;
- const size_t old_size = column_string_chars.size();
- str_pos++;
- const size_t new_size = old_size + 1;
- column_string_chars.resize(new_size);
- memcpy(column_string_chars.data() + old_size, str_ref.data
+ str_offset, 1);
- (*dest_nested_null_map).push_back(false);
- string_pos++;
- dest_pos++;
- column_string_offsets.push_back(string_pos);
- }
+ split_empty_delimiter(str_ref, column_string_chars,
column_string_offsets,
+ dest_nested_null_map, string_pos,
dest_pos);
} else {
for (size_t str_pos = 0; str_pos <= str_ref.size;) {
const size_t str_offset = str_pos;
@@ -2781,6 +2751,47 @@ private:
}
return pos - old_size;
}
+
+ /// Splits `str_ref` into single characters (the empty-delimiter case) and
+ /// appends them to the nested string column of the result array.
+ ///
+ /// The raw bytes of `str_ref` are copied once into `column_string_chars`;
+ /// afterwards one end offset and one `false` null flag are appended per
+ /// character. For pure-ASCII input every byte is a character. Otherwise the
+ /// character width is looked up in `UTF8_BYTE_LENGTH` (defined elsewhere in
+ /// the project) using the first byte of each character.
+ ///
+ /// @param str_ref               input string to split
+ /// @param column_string_chars   byte buffer of the nested string column
+ /// @param column_string_offsets end offsets of the nested string column
+ /// @param dest_nested_null_map  null map of the nested column (must not be null)
+ /// @param string_pos            running byte offset; advanced by `str_ref.size`
+ /// @param dest_pos              running element count; advanced by the number
+ ///                              of characters emitted
+ void split_empty_delimiter(const StringRef& str_ref, ColumnString::Chars& column_string_chars,
+                            ColumnString::Offsets& column_string_offsets,
+                            NullMapType* dest_nested_null_map, ColumnArray::Offset64& string_pos,
+                            ColumnArray::Offset64& dest_pos) const {
+     // Copy the whole input in one shot; only the offsets differ per character.
+     const size_t chars_old_size = column_string_chars.size();
+     column_string_chars.resize(chars_old_size + str_ref.size);
+     memcpy(column_string_chars.data() + chars_old_size, str_ref.data, str_ref.size);
+
+     if (simd::VStringFunctions::is_ascii(str_ref)) {
+         const auto char_count = str_ref.size;
+
+         dest_nested_null_map->resize_fill(dest_nested_null_map->size() + char_count, false);
+
+         // One offset per byte: string_pos + 1, string_pos + 2, ...
+         const auto offsets_old_size = column_string_offsets.size();
+         const auto offsets_new_size = offsets_old_size + char_count;
+         column_string_offsets.resize(offsets_new_size);
+         std::iota(column_string_offsets.data() + offsets_old_size,
+                   column_string_offsets.data() + offsets_new_size, string_pos + 1);
+
+         string_pos += char_count;
+         dest_pos += char_count;
+         // The bulk operations above are equivalent to:
+         // for (size_t i = 0; i < str_ref.size; i++) {
+         //     string_pos++;
+         //     column_string_offsets.push_back(string_pos);
+         //     (*dest_nested_null_map).push_back(false);
+         //     dest_pos++;
+         // }
+     } else {
+         for (size_t i = 0; i < str_ref.size;) {
+             size_t utf8_char_len = UTF8_BYTE_LENGTH[static_cast<unsigned char>(str_ref.data[i])];
+             // A truncated trailing sequence must not advance string_pos past
+             // the bytes that were actually copied above; otherwise this and
+             // every later offset would point outside column_string_chars.
+             const size_t remaining = str_ref.size - i;
+             if (utf8_char_len > remaining) {
+                 utf8_char_len = remaining;
+             }
+
+             string_pos += utf8_char_len;
+             column_string_offsets.push_back(string_pos);
+
+             dest_nested_null_map->push_back(false);
+             ++dest_pos;
+
+             i += utf8_char_len;
+         }
+     }
+ }
};
class FunctionCountSubString : public IFunction {
diff --git
a/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out
b/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out
index c46fa2bd27e..0a335392197 100644
Binary files
a/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out
and
b/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out
differ
diff --git
a/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy
b/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy
index 2ec70e36124..6b7f32b5aae 100644
---
a/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy
+++
b/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy
@@ -18,6 +18,7 @@
suite("test_split_by_string") {
// split by char
qt_sql "select split_by_string('abcde','');"
+ qt_sql "select split_by_string('你a好b世c界','');"
qt_sql "select split_by_string('12553','');"
qt_sql "select split_by_string('','');"
qt_sql "select split_by_string('',',');"
@@ -70,6 +71,7 @@ suite("test_split_by_string") {
sql """ INSERT INTO ${tableName1} VALUES(9, 'a,b,c,', ',') """
sql """ INSERT INTO ${tableName1} VALUES(10, null, ',') """
sql """ INSERT INTO ${tableName1} VALUES(11, 'a,b,c,12345,', ',') """
+ sql """ INSERT INTO ${tableName1} VALUES(12, '你a好b世c界', '') """
qt_sql "SELECT *, split_by_string(v1, v2) FROM ${tableName1} ORDER BY k1"
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org