This is an automated email from the ASF dual-hosted git repository.

lihaopeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new cee07d62ce6 [fix](function) fix Substring/SubReplace error result with 
input utf8 string (#40929)
cee07d62ce6 is described below

commit cee07d62ce6fcd9370f5789daec4571a09af41a4
Author: Mryange <[email protected]>
AuthorDate: Thu Sep 19 09:29:31 2024 +0800

    [fix](function) fix Substring/SubReplace error result with input utf8 
string (#40929)
    
    ```
    
    mysql [(none)]>select sub_replace("你好世界","a",1);
    +-------------------------------------+
    | sub_replace('你好世界', 'a', 1)     |
    +-------------------------------------+
    | �a�好世界                             |
    +-------------------------------------+
    
    
    
    mysql [(none)]>select SUBSTRING('中文测试',5);
    +------------------------------------------+
    | substring('中文测试', 5, 2147483647)     |
    +------------------------------------------+
    | 中文测试                                 |
    +------------------------------------------+
    1 row in set (0.04 sec)
    
    
    
    now
    mysql [(none)]>select sub_replace("你好世界","a",1);
    +-------------------------------------+
    | sub_replace('你好世界', 'a', 1)     |
    +-------------------------------------+
    | 你a世界                             |
    +-------------------------------------+
    1 row in set (0.05 sec)
    
    mysql [(none)]>select SUBSTRING('中文测试',5);
    +------------------------------------------+
    | substring('中文测试', 5, 2147483647)     |
    +------------------------------------------+
    |                                          |
    +------------------------------------------+
    1 row in set (0.13 sec)
    ```
---
 be/src/vec/functions/function_string.h             | 132 +++++++++++++++------
 .../string_functions/test_string_function.out      |  60 ++++++++++
 .../string_functions/test_string_function.out      | Bin 4590 -> 4838 bytes
 .../string_functions/test_string_function.groovy   |  23 ++++
 .../string_functions/test_string_function.groovy   |  10 ++
 5 files changed, 188 insertions(+), 37 deletions(-)

diff --git a/be/src/vec/functions/function_string.h 
b/be/src/vec/functions/function_string.h
index 53c300f50aa..4ae8cbf5ff2 100644
--- a/be/src/vec/functions/function_string.h
+++ b/be/src/vec/functions/function_string.h
@@ -242,9 +242,11 @@ private:
             const char* str_data = (char*)chars.data() + offsets[i - 1];
             int start_value = is_const ? start[0] : start[i];
             int len_value = is_const ? len[0] : len[i];
-
+            // Unsigned numbers cannot be used here because start_value can be 
negative.
+            int char_len = simd::VStringFunctions::get_char_len(str_data, 
str_size);
             // return empty string if start > src.length
-            if (start_value > str_size || str_size == 0 || start_value == 0 || 
len_value <= 0) {
+            // Here, start_value is compared against the length of the 
character.
+            if (start_value > char_len || str_size == 0 || start_value == 0 || 
len_value <= 0) {
                 StringOP::push_empty_string(i, res_chars, res_offsets);
                 continue;
             }
@@ -3386,8 +3388,6 @@ public:
         return get_variadic_argument_types_impl().size();
     }
 
-    bool use_default_implementation_for_nulls() const override { return false; 
}
-
     Status execute_impl(FunctionContext* context, Block& block, const 
ColumnNumbers& arguments,
                         size_t result, size_t input_rows_count) const override 
{
         return Impl::execute_impl(context, block, arguments, result, 
input_rows_count);
@@ -3398,59 +3398,116 @@ struct SubReplaceImpl {
     static Status replace_execute(Block& block, const ColumnNumbers& 
arguments, size_t result,
                                   size_t input_rows_count) {
         auto res_column = ColumnString::create();
-        auto result_column = assert_cast<ColumnString*>(res_column.get());
+        auto* result_column = assert_cast<ColumnString*>(res_column.get());
         auto args_null_map = ColumnUInt8::create(input_rows_count, 0);
         ColumnPtr argument_columns[4];
+        bool col_const[4];
         for (int i = 0; i < 4; ++i) {
-            argument_columns[i] =
-                    
block.get_by_position(arguments[i]).column->convert_to_full_column_if_const();
-            if (auto* nullable = 
check_and_get_column<ColumnNullable>(*argument_columns[i])) {
-                // Danger: Here must dispose the null map data first! Because
-                // argument_columns[i]=nullable->get_nested_column_ptr(); will 
release the mem
-                // of column nullable mem of null map
-                VectorizedUtils::update_null_map(args_null_map->get_data(),
-                                                 
nullable->get_null_map_data());
-                argument_columns[i] = nullable->get_nested_column_ptr();
-            }
+            std::tie(argument_columns[i], col_const[i]) =
+                    
unpack_if_const(block.get_by_position(arguments[i]).column);
         }
+        const auto* data_column = assert_cast<const 
ColumnString*>(argument_columns[0].get());
+        const auto* mask_column = assert_cast<const 
ColumnString*>(argument_columns[1].get());
+        const auto* start_column =
+                assert_cast<const 
ColumnVector<Int32>*>(argument_columns[2].get());
+        const auto* length_column =
+                assert_cast<const 
ColumnVector<Int32>*>(argument_columns[3].get());
 
-        auto data_column = assert_cast<const 
ColumnString*>(argument_columns[0].get());
-        auto mask_column = assert_cast<const 
ColumnString*>(argument_columns[1].get());
-        auto start_column = assert_cast<const 
ColumnVector<Int32>*>(argument_columns[2].get());
-        auto length_column = assert_cast<const 
ColumnVector<Int32>*>(argument_columns[3].get());
-
-        vector(data_column, mask_column, start_column->get_data(), 
length_column->get_data(),
-               args_null_map->get_data(), result_column, input_rows_count);
-
+        std::visit(
+                [&](auto origin_str_const, auto new_str_const, auto 
start_const, auto len_const) {
+                    if (simd::VStringFunctions::is_ascii(
+                                StringRef {data_column->get_chars().data(), 
data_column->size()})) {
+                        vector_ascii<origin_str_const, new_str_const, 
start_const, len_const>(
+                                data_column, mask_column, 
start_column->get_data(),
+                                length_column->get_data(), 
args_null_map->get_data(), result_column,
+                                input_rows_count);
+                    } else {
+                        vector_utf8<origin_str_const, new_str_const, 
start_const, len_const>(
+                                data_column, mask_column, 
start_column->get_data(),
+                                length_column->get_data(), 
args_null_map->get_data(), result_column,
+                                input_rows_count);
+                    }
+                },
+                vectorized::make_bool_variant(col_const[0]),
+                vectorized::make_bool_variant(col_const[1]),
+                vectorized::make_bool_variant(col_const[2]),
+                vectorized::make_bool_variant(col_const[3]));
         block.get_by_position(result).column =
                 ColumnNullable::create(std::move(res_column), 
std::move(args_null_map));
         return Status::OK();
     }
 
 private:
-    static void vector(const ColumnString* data_column, const ColumnString* 
mask_column,
-                       const PaddedPODArray<Int32>& start, const 
PaddedPODArray<Int32>& length,
-                       NullMap& args_null_map, ColumnString* result_column,
-                       size_t input_rows_count) {
+    template <bool origin_str_const, bool new_str_const, bool start_const, 
bool len_const>
+    static void vector_ascii(const ColumnString* data_column, const 
ColumnString* mask_column,
+                             const PaddedPODArray<Int32>& args_start,
+                             const PaddedPODArray<Int32>& args_length, 
NullMap& args_null_map,
+                             ColumnString* result_column, size_t 
input_rows_count) {
         ColumnString::Chars& res_chars = result_column->get_chars();
         ColumnString::Offsets& res_offsets = result_column->get_offsets();
         for (size_t row = 0; row < input_rows_count; ++row) {
-            StringRef origin_str = data_column->get_data_at(row);
-            StringRef new_str = mask_column->get_data_at(row);
-            size_t origin_str_len = origin_str.size;
+            StringRef origin_str =
+                    
data_column->get_data_at(index_check_const<origin_str_const>(row));
+            StringRef new_str = 
mask_column->get_data_at(index_check_const<new_str_const>(row));
+            const auto start = args_start[index_check_const<start_const>(row)];
+            const auto length = args_length[index_check_const<len_const>(row)];
+            const size_t origin_str_len = origin_str.size;
             //input is null, start < 0, len < 0, str_size <= start. return NULL
-            if (args_null_map[row] || start[row] < 0 || length[row] < 0 ||
-                origin_str_len <= start[row]) {
+            if (args_null_map[row] || start < 0 || length < 0 || 
origin_str_len <= start) {
                 res_offsets.push_back(res_chars.size());
                 args_null_map[row] = 1;
             } else {
                 std::string_view replace_str = new_str.to_string_view();
                 std::string result = origin_str.to_string();
-                result.replace(start[row], length[row], replace_str);
+                result.replace(start, length, replace_str);
                 result_column->insert_data(result.data(), result.length());
             }
         }
     }
+
+    template <bool origin_str_const, bool new_str_const, bool start_const, 
bool len_const>
+    static void vector_utf8(const ColumnString* data_column, const 
ColumnString* mask_column,
+                            const PaddedPODArray<Int32>& args_start,
+                            const PaddedPODArray<Int32>& args_length, NullMap& 
args_null_map,
+                            ColumnString* result_column, size_t 
input_rows_count) {
+        ColumnString::Chars& res_chars = result_column->get_chars();
+        ColumnString::Offsets& res_offsets = result_column->get_offsets();
+
+        for (size_t row = 0; row < input_rows_count; ++row) {
+            StringRef origin_str =
+                    
data_column->get_data_at(index_check_const<origin_str_const>(row));
+            StringRef new_str = 
mask_column->get_data_at(index_check_const<new_str_const>(row));
+            const auto start = args_start[index_check_const<start_const>(row)];
+            const auto length = args_length[index_check_const<len_const>(row)];
+            //input is null, start < 0, len < 0 return NULL
+            if (args_null_map[row] || start < 0 || length < 0) {
+                res_offsets.push_back(res_chars.size());
+                args_null_map[row] = 1;
+                continue;
+            }
+
+            const auto [start_byte_len, start_char_len] =
+                    
simd::VStringFunctions::iterate_utf8_with_limit_length(origin_str.begin(),
+                                                                           
origin_str.end(), start);
+
+            // start >= orgin.size
+            DCHECK(start_char_len <= start);
+            if (start_byte_len == origin_str.size) {
+                res_offsets.push_back(res_chars.size());
+                args_null_map[row] = 1;
+                continue;
+            }
+
+            auto [end_byte_len, end_char_len] =
+                    simd::VStringFunctions::iterate_utf8_with_limit_length(
+                            origin_str.begin() + start_byte_len, 
origin_str.end(), length);
+            DCHECK(end_char_len <= length);
+            std::string_view replace_str = new_str.to_string_view();
+            std::string result = origin_str.to_string();
+            result.replace(start_byte_len, end_byte_len, replace_str);
+            result_column->insert_data(result.data(), result.length());
+        }
+    }
 };
 
 struct SubReplaceThreeImpl {
@@ -3467,13 +3524,14 @@ struct SubReplaceThreeImpl {
 
         auto str_col =
                 
block.get_by_position(arguments[1]).column->convert_to_full_column_if_const();
-        if (auto* nullable = check_and_get_column<const 
ColumnNullable>(*str_col)) {
+        if (const auto* nullable = check_and_get_column<const 
ColumnNullable>(*str_col)) {
             str_col = nullable->get_nested_column_ptr();
         }
-        auto& str_offset = assert_cast<const 
ColumnString*>(str_col.get())->get_offsets();
-
+        const auto* str_column = assert_cast<const 
ColumnString*>(str_col.get());
+        // use utf8 len
         for (int i = 0; i < input_rows_count; ++i) {
-            strlen_data[i] = str_offset[i] - str_offset[i - 1];
+            StringRef str_ref = str_column->get_data_at(i);
+            strlen_data[i] = 
simd::VStringFunctions::get_char_len(str_ref.data, str_ref.size);
         }
 
         block.insert({std::move(params), std::make_shared<DataTypeInt32>(), 
"strlen"});
diff --git 
a/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out
 
b/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out
index e8305c284ff..d85794989f7 100644
--- 
a/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out
+++ 
b/regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out
@@ -386,3 +386,63 @@ tNEW-STRorigin str
 -- !sql --
 d***is
 
+-- !sub_replace_utf8_sql1 --
+你a世界
+
+-- !sub_replace_utf8_sql2 --
+你ab界
+
+-- !sub_replace_utf8_sql3 --
+你ab
+
+-- !sub_replace_utf8_sql4 --
+你abcd我界
+
+-- !sub_replace_utf8_sql5 --
+\N
+
+-- !sub_replace_utf8_sql6 --
+大家世界
+
+-- !sub_replace_utf8_sql7 --
+你大家114514
+
+-- !sub_replace_utf8_sql8 --
+\N
+
+-- !sub_replace_utf8_sql9 --
+\N
+
+-- !sub_replace_utf8_sql10 --
+\N
+
+-- !sub_replace_utf8_sql1 --
+你a世界
+
+-- !sub_replace_utf8_sql2 --
+你ab界
+
+-- !sub_replace_utf8_sql3 --
+你ab
+
+-- !sub_replace_utf8_sql4 --
+你abcd我界
+
+-- !sub_replace_utf8_sql5 --
+\N
+
+-- !sub_replace_utf8_sql6 --
+大家世界
+
+-- !sub_replace_utf8_sql7 --
+你大家114514
+
+-- !sub_replace_utf8_sql8 --
+\N
+
+-- !sub_replace_utf8_sql9 --
+\N
+
+-- !sub_replace_utf8_sql10 --
+\N
+
diff --git 
a/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out
 
b/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out
index dfcf50a244b..cadf5039794 100644
Binary files 
a/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out
 and 
b/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out
 differ
diff --git 
a/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy
 
b/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy
index 20c8294b114..6e9cd947bc2 100644
--- 
a/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy
+++ 
b/regression-test/suites/nereids_p0/sql_functions/string_functions/test_string_function.groovy
@@ -191,4 +191,27 @@ suite("test_string_function") {
 
     qt_sql "select sub_replace(\"this is origin str\",\"NEW-STR\",1);"
     qt_sql "select sub_replace(\"doris\",\"***\",1,2);"
+    sql """ set debug_skip_fold_constant = true;"""
+    qt_sub_replace_utf8_sql1 " select sub_replace('你好世界','a',1);"
+    qt_sub_replace_utf8_sql2 " select sub_replace('你好世界','ab',1);"
+    qt_sub_replace_utf8_sql3 " select sub_replace('你好世界','ab',1,20);"
+    qt_sub_replace_utf8_sql4 " select sub_replace('你好世界','abcd我',1,2);"
+    qt_sub_replace_utf8_sql5 " select sub_replace('你好世界','a',6);"
+    qt_sub_replace_utf8_sql6 " select sub_replace('你好世界','大家',0);"
+    qt_sub_replace_utf8_sql7 " select sub_replace('你好世界','大家114514',1,20);"
+    qt_sub_replace_utf8_sql8 " select sub_replace('你好世界','大家114514',6,20);"
+    qt_sub_replace_utf8_sql9 " select sub_replace('你好世界','大家',4);"
+    qt_sub_replace_utf8_sql10 " select sub_replace('你好世界','大家',-1);"
+    sql """ set debug_skip_fold_constant = false;"""
+    qt_sub_replace_utf8_sql1 " select sub_replace('你好世界','a',1);"
+    qt_sub_replace_utf8_sql2 " select sub_replace('你好世界','ab',1);"
+    qt_sub_replace_utf8_sql3 " select sub_replace('你好世界','ab',1,20);"
+    qt_sub_replace_utf8_sql4 " select sub_replace('你好世界','abcd我',1,2);"
+    qt_sub_replace_utf8_sql5 " select sub_replace('你好世界','a',6);"
+    qt_sub_replace_utf8_sql6 " select sub_replace('你好世界','大家',0);"
+    qt_sub_replace_utf8_sql7 " select sub_replace('你好世界','大家114514',1,20);"
+    qt_sub_replace_utf8_sql8 " select sub_replace('你好世界','大家114514',6,20);"
+    qt_sub_replace_utf8_sql9 " select sub_replace('你好世界','大家',4);"
+    qt_sub_replace_utf8_sql10 " select sub_replace('你好世界','大家',-1);"
+
 }
diff --git 
a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy
 
b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy
index b71d339a538..6e18fb57eeb 100644
--- 
a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy
+++ 
b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy
@@ -228,6 +228,16 @@ suite("test_string_function", "arrow_flight_sql") {
     qt_sql "select substring('abcdef',3,-1);"
     qt_sql "select substring('abcdef',-3,-1);"
     qt_sql "select substring('abcdef',10,1);"
+    sql """ set debug_skip_fold_constant = true;"""
+    qt_substring_utf8_sql "select substring('中文测试',5);"
+    qt_substring_utf8_sql "select substring('中文测试',4);"
+    qt_substring_utf8_sql "select substring('中文测试',2,2);"
+    qt_substring_utf8_sql "select substring('中文测试',-1,2);"
+    sql """ set debug_skip_fold_constant = false;"""
+    qt_substring_utf8_sql "select substring('中文测试',5);"
+    qt_substring_utf8_sql "select substring('中文测试',4);"
+    qt_substring_utf8_sql "select substring('中文测试',2,2);"
+    qt_substring_utf8_sql "select substring('中文测试',-1,2);"
 
     sql """ drop table if exists test_string_function; """
     sql """ create table test_string_function (


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to