This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 3884c1aba86 [fix](function) fix tokenize function incorrect result when first argument is const (#62699)
3884c1aba86 is described below

commit 3884c1aba86ebf1076c628e4d25d9d22007046eb
Author: Jack <[email protected]>
AuthorDate: Wed May 27 16:05:50 2026 +0800

    [fix](function) fix tokenize function incorrect result when first argument is const (#62699)
    
    ## Proposed changes
    
    Fix a bug in the `tokenize` function where `unpack_if_const` unwraps a
    `ColumnConst` first argument to its inner data column (which has only
    1 row), while `_do_tokenize` and `_do_tokenize_none` iterate over the
    rows of the unwrapped column they are passed. As a result, only 1 output
    row was produced instead of `input_rows_count` rows when the first
    argument is a constant.
    
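    For example, `SELECT tokenize('hello world', 'parser=english') FROM
    table_with_many_rows` would previously return only 1 row instead of one
    row per row of the table.

    Note: per the comment on the regression test added in this commit,
    calls whose arguments are all constant are short-circuited by the
    generic constant-argument path, so the affected branch is reached only
    when the first argument is constant and the second is not; the query
    above is illustrative.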
    
    The fix wraps the single-row result in a `ColumnConst` of
    `input_rows_count` rows when the first argument was const, which is the
    standard pattern used throughout the Doris codebase for handling const
    columns in function execution.
    
    ## Further comments
    
    Related Jira: DORIS-25296
    
    ## Checklist(Required)
    
    1. Does it affect the results of the existing test cases (Yes/No): No
    2. Does it need to update the document (Yes/No): No
    3. Is there a risk of compatibility changes (Yes/No): No
---
 be/src/exprs/function/function_tokenize.cpp       | 18 ++++-
 be/src/exprs/function/function_tokenize.h         |  2 +-
 be/test/exprs/function/function_tokenize_test.cpp | 94 +++++++++++++++++++++++
 3 files changed, 110 insertions(+), 4 deletions(-)

diff --git a/be/src/exprs/function/function_tokenize.cpp 
b/be/src/exprs/function/function_tokenize.cpp
index e8d923f981a..a3d616d6357 100644
--- a/be/src/exprs/function/function_tokenize.cpp
+++ b/be/src/exprs/function/function_tokenize.cpp
@@ -30,6 +30,7 @@
 #include "core/block/block.h"
 #include "core/block/column_with_type_and_name.h"
 #include "core/column/column.h"
+#include "core/column/column_const.h"
 #include "core/data_type/data_type_nullable.h"
 #include "core/data_type/data_type_number.h"
 #include "core/string_ref.h"
@@ -134,7 +135,7 @@ void FunctionTokenize::_do_tokenize(const ColumnString& 
src_column_string,
 
 Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& 
block,
                                       const ColumnNumbers& arguments, uint32_t 
result,
-                                      size_t /*input_rows_count*/) const {
+                                      size_t input_rows_count) const {
     DCHECK_EQ(arguments.size(), 2);
     const auto& [src_column, left_const] =
             unpack_if_const(block.get_by_position(arguments[0]).column);
@@ -165,7 +166,13 @@ Status FunctionTokenize::execute_impl(FunctionContext* 
/*context*/, Block& block
             if (config.analyzer_name.empty() &&
                 config.parser_type == InvertedIndexParserType::PARSER_NONE) {
                 _do_tokenize_none(*col_left, dest_column_ptr);
-                block.replace_by_position(result, std::move(dest_column_ptr));
+                if (left_const) {
+                    block.replace_by_position(
+                            result,
+                            ColumnConst::create(std::move(dest_column_ptr), 
input_rows_count));
+                } else {
+                    block.replace_by_position(result, 
std::move(dest_column_ptr));
+                }
                 return Status::OK();
             }
 
@@ -196,7 +203,12 @@ Status FunctionTokenize::execute_impl(FunctionContext* 
/*context*/, Block& block
             analyzer_ctx.analyzer = analyzer_holder;
             _do_tokenize(*col_left, analyzer_ctx, support_phrase, 
dest_column_ptr);
 
-            block.replace_by_position(result, std::move(dest_column_ptr));
+            if (left_const) {
+                block.replace_by_position(
+                        result, 
ColumnConst::create(std::move(dest_column_ptr), input_rows_count));
+            } else {
+                block.replace_by_position(result, std::move(dest_column_ptr));
+            }
             return Status::OK();
         }
     }
diff --git a/be/src/exprs/function/function_tokenize.h 
b/be/src/exprs/function/function_tokenize.h
index 533b411458b..01828d120a5 100644
--- a/be/src/exprs/function/function_tokenize.h
+++ b/be/src/exprs/function/function_tokenize.h
@@ -70,6 +70,6 @@ public:
     void _do_tokenize_none(const ColumnString& src_column_string,
                            const MutableColumnPtr& dest_column_ptr) const;
     Status execute_impl(FunctionContext* /*context*/, Block& block, const 
ColumnNumbers& arguments,
-                        uint32_t result, size_t /*input_rows_count*/) const 
override;
+                        uint32_t result, size_t input_rows_count) const 
override;
 };
 } // namespace doris
diff --git a/be/test/exprs/function/function_tokenize_test.cpp 
b/be/test/exprs/function/function_tokenize_test.cpp
index 322ee56c893..a17974d7c22 100644
--- a/be/test/exprs/function/function_tokenize_test.cpp
+++ b/be/test/exprs/function/function_tokenize_test.cpp
@@ -25,6 +25,7 @@
 
 #include "core/block/block.h"
 #include "core/block/column_with_type_and_name.h"
+#include "core/column/column_const.h"
 #include "core/column/column_string.h"
 #include "core/data_type/data_type_string.h"
 #include "exprs/function/simple_function_factory.h"
@@ -203,6 +204,99 @@ TEST_F(FunctionTokenizeTest, ParserNonePropertyFormats) {
     }
 }
 
+// Regression test: a const first argument must still yield input_rows_count rows.
+//
+// The fix in FunctionTokenize::execute_impl wraps the dest column in a
+// ColumnConst(input_rows_count) when the first argument was already a ColumnConst.
+// The generic PreparedFunctionImpl::default_implementation_for_constant_arguments
+// path only short-circuits when *all* arguments are const, so to reach the new
+// left_const branch we need arg0 const but arg1 non-const. FE rejects non-literal
+// second arguments for tokenize, so this path is only exercisable from a unit
+// test that builds the block directly.
+TEST_F(FunctionTokenizeTest, ConstFirstArgPreservesRowCount) {
+    const size_t input_rows_count = 5;
+    const std::string input_str = "Hello World Test";
+    const std::string properties = "parser='english'";
+
+    auto input_inner = ColumnString::create();
+    input_inner->insert_data(input_str.data(), input_str.size());
+    auto input_const = ColumnConst::create(std::move(input_inner), 
input_rows_count);
+
+    auto properties_column = ColumnString::create();
+    for (size_t i = 0; i < input_rows_count; ++i) {
+        properties_column->insert_data(properties.data(), properties.size());
+    }
+
+    auto string_type = std::make_shared<DataTypeString>();
+
+    Block block;
+    block.insert(ColumnWithTypeAndName(std::move(input_const), string_type, 
"input"));
+    block.insert(ColumnWithTypeAndName(std::move(properties_column), 
string_type, "properties"));
+    block.insert(ColumnWithTypeAndName(string_type->create_column(), 
string_type, "result"));
+
+    ColumnNumbers arguments = {0, 1};
+    uint32_t result = 2;
+
+    auto status = _function->execute(nullptr, block, arguments, result, 
input_rows_count);
+    ASSERT_TRUE(status.ok()) << status.to_string();
+
+    auto result_column = block.get_by_position(result).column;
+    ASSERT_EQ(result_column->size(), input_rows_count);
+
+    // Source is constant, so every row must carry the same tokenized value.
+    StringRef first_token = result_column->get_data_at(0);
+    ASSERT_GT(first_token.size, 0);
+    EXPECT_NE(std::string(first_token.data, first_token.size).find("hello"), 
std::string::npos);
+    EXPECT_NE(std::string(first_token.data, first_token.size).find("world"), 
std::string::npos);
+    for (size_t i = 1; i < input_rows_count; ++i) {
+        StringRef row_i = result_column->get_data_at(i);
+        EXPECT_EQ(row_i.size, first_token.size);
+        EXPECT_EQ(memcmp(row_i.data, first_token.data, first_token.size), 0);
+    }
+}
+
+// Same check for the PARSER_NONE early-return branch in execute_impl.
+TEST_F(FunctionTokenizeTest, ConstFirstArgParserNonePreservesRowCount) {
+    const size_t input_rows_count = 4;
+    const std::string input_str = "Hello World";
+    const std::string properties = "parser='none'";
+
+    auto input_inner = ColumnString::create();
+    input_inner->insert_data(input_str.data(), input_str.size());
+    auto input_const = ColumnConst::create(std::move(input_inner), 
input_rows_count);
+
+    auto properties_column = ColumnString::create();
+    for (size_t i = 0; i < input_rows_count; ++i) {
+        properties_column->insert_data(properties.data(), properties.size());
+    }
+
+    auto string_type = std::make_shared<DataTypeString>();
+
+    Block block;
+    block.insert(ColumnWithTypeAndName(std::move(input_const), string_type, 
"input"));
+    block.insert(ColumnWithTypeAndName(std::move(properties_column), 
string_type, "properties"));
+    block.insert(ColumnWithTypeAndName(string_type->create_column(), 
string_type, "result"));
+
+    ColumnNumbers arguments = {0, 1};
+    uint32_t result = 2;
+
+    auto status = _function->execute(nullptr, block, arguments, result, 
input_rows_count);
+    ASSERT_TRUE(status.ok()) << status.to_string();
+
+    auto result_column = block.get_by_position(result).column;
+    ASSERT_EQ(result_column->size(), input_rows_count);
+
+    StringRef first_token = result_column->get_data_at(0);
+    const std::string expected = R"([{
+        "token": "Hello World"
+    }])";
+    EXPECT_EQ(std::string(first_token.data, first_token.size), expected);
+    for (size_t i = 1; i < input_rows_count; ++i) {
+        StringRef row_i = result_column->get_data_at(i);
+        EXPECT_EQ(std::string(row_i.data, row_i.size), expected);
+    }
+}
+
 // Test error cases
 TEST_F(FunctionTokenizeTest, InvalidParser) {
     std::vector<std::string> input_strings = {"Test String"};


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to