This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.0 by this push:
     new 2e447160fd4 [fix](split_by_string) Fix split by string core on column string (#28030) (#28365)
2e447160fd4 is described below

commit 2e447160fd405c3ed0829f12409185d882864c90
Author: zhiqiang <[email protected]>
AuthorDate: Thu Dec 14 13:21:27 2023 +0800

    [fix](split_by_string) Fix split by string core on column string (#28030) (#28365)
---
 be/src/vec/functions/function_string.h             | 102 +++++++++++++++++----
 .../string_functions/test_split_by_string.out      |  20 ++++
 .../string_functions/test_split_by_string.groovy   |  60 ++++++++++++
 3 files changed, 165 insertions(+), 17 deletions(-)

diff --git a/be/src/vec/functions/function_string.h b/be/src/vec/functions/function_string.h
index 37a21a3ea5b..e48fbd263ec 100644
--- a/be/src/vec/functions/function_string.h
+++ b/be/src/vec/functions/function_string.h
@@ -1803,6 +1803,7 @@ public:
         const auto& [right_column, right_const] =
                 unpack_if_const(block.get_by_position(arguments[1]).column);
 
+        DataTypePtr right_column_type = 
block.get_by_position(arguments[1]).type;
         DataTypePtr src_column_type = block.get_by_position(arguments[0]).type;
         auto dest_column_ptr = 
ColumnArray::create(make_nullable(src_column_type)->create_column(),
                                                    
ColumnArray::ColumnOffsets::create());
@@ -1818,27 +1819,42 @@ public:
         dest_nested_column = dest_nullable_col->get_nested_column_ptr();
         dest_nested_null_map = 
&dest_nullable_col->get_null_map_column().get_data();
 
-        if (auto col_left = 
check_and_get_column<ColumnString>(src_column.get())) {
-            if (auto col_right = 
check_and_get_column<ColumnString>(right_column.get())) {
-                if (right_const) {
-                    _execute_constant(*col_left, col_right->get_data_at(0), 
*dest_nested_column,
-                                      dest_offsets, dest_nested_null_map);
-                } else {
-                    _execute_vector(*col_left, *col_right, 
*dest_nested_column, dest_offsets,
-                                    dest_nested_null_map);
-                }
+        auto col_left = check_and_get_column<ColumnString>(src_column.get());
+        if (!col_left) {
+            return Status::InternalError("Left operator of function {} can not 
be {}", get_name(),
+                                         src_column_type->get_name());
+        }
 
-                block.replace_by_position(result, std::move(dest_column_ptr));
-                return Status::OK();
-            }
+        auto col_right = 
check_and_get_column<ColumnString>(right_column.get());
+        if (!col_right) {
+            return Status::InternalError("Right operator of function {} can 
not be {}", get_name(),
+                                         right_column_type->get_name());
+        }
+
+        // split_by_string(ColumnString, "xxx")
+        if (right_const) {
+            _execute_constant_delimiter(*col_left, col_right->get_data_at(0), 
*dest_nested_column,
+                                        dest_offsets, dest_nested_null_map);
+        } else if (left_const) {
+            // split_by_string("xxx", ColumnString)
+            _execute_constant_src_string(col_left->get_data_at(0), *col_right, 
*dest_nested_column,
+                                         dest_offsets, dest_nested_null_map);
+        } else {
+            // split_by_string(ColumnString, ColumnString)
+            _execute_vector(*col_left, *col_right, *dest_nested_column, 
dest_offsets,
+                            dest_nested_null_map);
         }
-        return Status::RuntimeError("unimplements function {}", get_name());
+
+        block.replace_by_position(result, std::move(dest_column_ptr));
+
+        return Status::OK();
     }
 
 private:
-    void _execute_constant(const ColumnString& src_column_string, const 
StringRef& delimiter_ref,
-                           IColumn& dest_nested_column, 
ColumnArray::Offsets64& dest_offsets,
-                           NullMapType* dest_nested_null_map) {
+    void _execute_constant_delimiter(const ColumnString& src_column_string,
+                                     const StringRef& delimiter_ref, IColumn& 
dest_nested_column,
+                                     ColumnArray::Offsets64& dest_offsets,
+                                     NullMapType* dest_nested_null_map) const {
         ColumnString& dest_column_string = 
reinterpret_cast<ColumnString&>(dest_nested_column);
         ColumnString::Chars& column_string_chars = 
dest_column_string.get_chars();
         ColumnString::Offsets& column_string_offsets = 
dest_column_string.get_offsets();
@@ -1958,7 +1974,59 @@ private:
         }
     }
 
-    size_t split_str(size_t& pos, const StringRef str_ref, StringRef 
delimiter_ref) {
+    void _execute_constant_src_string(const StringRef& str_ref, const 
ColumnString& delimiter_col,
+                                      IColumn& dest_nested_column,
+                                      ColumnArray::Offsets64& dest_offsets,
+                                      NullMapType* dest_nested_null_map) const 
{
+        ColumnString& dest_column_string = 
reinterpret_cast<ColumnString&>(dest_nested_column);
+        ColumnString::Chars& column_string_chars = 
dest_column_string.get_chars();
+        ColumnString::Offsets& column_string_offsets = 
dest_column_string.get_offsets();
+        column_string_chars.reserve(0);
+
+        ColumnArray::Offset64 string_pos = 0;
+        ColumnArray::Offset64 dest_pos = 0;
+        const ColumnArray::Offset64 delimiter_offsets_size = 
delimiter_col.get_offsets().size();
+
+        for (size_t i = 0; i < delimiter_offsets_size; ++i) {
+            const StringRef delimiter_ref = delimiter_col.get_data_at(i);
+
+            if (delimiter_ref.size == 0) {
+                for (size_t str_pos = 0; str_pos < str_ref.size;) {
+                    const size_t str_offset = str_pos;
+                    const size_t old_size = column_string_chars.size();
+                    str_pos++;
+                    const size_t new_size = old_size + 1;
+                    column_string_chars.resize(new_size);
+                    memcpy(column_string_chars.data() + old_size, str_ref.data 
+ str_offset, 1);
+                    (*dest_nested_null_map).push_back(false);
+                    string_pos++;
+                    dest_pos++;
+                    column_string_offsets.push_back(string_pos);
+                }
+            } else {
+                for (size_t str_pos = 0; str_pos <= str_ref.size;) {
+                    const size_t str_offset = str_pos;
+                    const size_t old_size = column_string_chars.size();
+                    const size_t split_part_size = split_str(str_pos, str_ref, 
delimiter_ref);
+                    str_pos += delimiter_ref.size;
+                    const size_t new_size = old_size + split_part_size;
+                    column_string_chars.resize(new_size);
+                    if (split_part_size > 0) {
+                        memcpy_small_allow_read_write_overflow15(
+                                column_string_chars.data() + old_size, 
str_ref.data + str_offset,
+                                split_part_size);
+                    }
+                    (*dest_nested_null_map).push_back(false);
+                    string_pos += split_part_size;
+                    dest_pos++;
+                    column_string_offsets.push_back(string_pos);
+                }
+            }
+            dest_offsets.push_back(dest_pos);
+        }
+    }
+
+    size_t split_str(size_t& pos, const StringRef str_ref, StringRef 
delimiter_ref) const {
         size_t old_size = pos;
         size_t str_size = str_ref.size;
         while (pos < str_size &&
diff --git a/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out b/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out
index 00d9ad99781..c46fa2bd27e 100644
--- a/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out
+++ b/regression-test/data/query_p0/sql_functions/string_functions/test_split_by_string.out
@@ -87,3 +87,23 @@
 9      a,b,c,  ,       ["a", "b", "c", ""]
 10     \N      ,       \N
 
+-- !sql_1 --
+1      ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"]
+2      ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"]
+
+-- !sql_2 --
+3      ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"]
+4      ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"] ["a", "b", "c"]
+
+-- !sql_3 --
+1      []      []      []      []
+2      []      []      []      []
+3      ["a", ",", "b", ",", "c"]       ["a", ",", "b", ",", "c"]       ["a", 
",", "b", ",", "c"]       ["a", ",", "b", ",", "c"]
+4      ["a", ",", "b", ",", "c"]       ["a", ",", "b", ",", "c"]       ["a", 
",", "b", ",", "c"]       ["a", ",", "b", ",", "c"]
+
+-- !sql_4 --
+1      []      []      []      []
+2      []      []      []      []
+3      [""]    [""]    [""]    [""]
+4      [""]    [""]    [""]    [""]
+
diff --git a/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy b/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy
index d3f05885181..2ec70e36124 100644
--- a/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy
+++ b/regression-test/suites/query_p0/sql_functions/string_functions/test_split_by_string.groovy
@@ -102,4 +102,64 @@ suite("test_split_by_string") {
 
 
     qt_sql "SELECT *, split_by_string(v1, v2) FROM ${tableName2} ORDER BY k1"
+
+    // Case where both of operator are column string is covered by above test. 
+    sql """DROP TABLE IF EXISTS test_split_by_string_2"""
+    sql """
+            CREATE TABLE IF NOT EXISTS test_split_by_string_2 (
+              `rid` INT NULL,
+              `str` TEXT NULL,
+              `vc`  VARCHAR(5) NULL,
+              `chr`   CHAR(5) NULL,
+              `txt` TEXT NULL
+            ) ENGINE=OLAP
+            DUPLICATE KEY(`rid`)
+            DISTRIBUTED BY HASH(`rid`) BUCKETS 1
+            PROPERTIES (
+            "replication_allocation" = "tag.location.default: 1",
+            "storage_format" = "V2"
+            )
+        """
+    sql """ INSERT INTO test_split_by_string_2 
+            VALUES (1, "", "", "", ""),
+                   (2, "", "", "", ""),
+                   (3, "a,b,c", "a,b,c", "a,b,c", "a,b,c"),
+                   (4, "a,b,c", "a,b,c", "a,b,c", "a,b,c")
+        """
+    // Left operator is const, right operator is column string
+    qt_sql_1 """ 
+                SELECT rid, 
+                      split_by_string("abc", str),
+                      split_by_string("abc", vc),
+                      split_by_string("abc", chr),
+                      split_by_string("abc", txt)
+                FROM test_split_by_string_2 WHERE rid=1 OR rid=2 ORDER BY rid;
+              """
+    // Left operator is column string, right operator is const
+    qt_sql_2 """
+                SELECT rid, 
+                      split_by_string(str, ","),
+                      split_by_string(vc, ","),
+                      split_by_string(chr, ","),
+                      split_by_string(txt, ",")
+                FROM test_split_by_string_2 WHERE rid=3 OR rid=4 ORDER BY rid;
+             """
+
+    // Empty string
+    qt_sql_3 """
+                SELECT rid, 
+                      split_by_string(str, ""),
+                      split_by_string(vc, ""),
+                      split_by_string(chr, ""),
+                      split_by_string(txt, "")
+                FROM test_split_by_string_2 ORDER BY rid;
+             """
+    qt_sql_4 """
+                SELECT rid, 
+                      split_by_string("", str),
+                      split_by_string("", vc),
+                      split_by_string("", chr),
+                      split_by_string("", txt)
+                FROM test_split_by_string_2 ORDER BY rid;
+             """
 }
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to