This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 3884c1aba86 [fix](function) fix tokenize function incorrect result
when first argument is const (#62699)
3884c1aba86 is described below
commit 3884c1aba86ebf1076c628e4d25d9d22007046eb
Author: Jack <[email protected]>
AuthorDate: Wed May 27 16:05:50 2026 +0800
[fix](function) fix tokenize function incorrect result when first argument
is const (#62699)
## Proposed changes
Fix a bug in the `tokenize` function where `unpack_if_const` unwraps a
`ColumnConst` to its inner data column (which has only 1 row), but
`_do_tokenize` and `_do_tokenize_none` iterate over the rows of that
unwrapped column. As a result, only 1 output row is produced instead
of `input_rows_count` rows when the first argument is a constant.
For example, `SELECT tokenize('hello world', 'parser=english') FROM
table_with_many_rows` would previously return only 1 row instead of the
expected number of rows matching the table.
The fix wraps the 1-row result in a `ColumnConst` of `input_rows_count`
rows when the first argument was const, which is the standard pattern used
throughout the Doris codebase for handling const columns in function execution.
## Further comments
Related Jira: DORIS-25296
## Checklist(Required)
1. Does it affect the results of the existing test cases (Yes/No): No
2. Does it need to update the document (Yes/No): No
3. Is there a risk of compatibility changes (Yes/No): No
---
be/src/exprs/function/function_tokenize.cpp | 18 ++++-
be/src/exprs/function/function_tokenize.h | 2 +-
be/test/exprs/function/function_tokenize_test.cpp | 94 +++++++++++++++++++++++
3 files changed, 110 insertions(+), 4 deletions(-)
diff --git a/be/src/exprs/function/function_tokenize.cpp
b/be/src/exprs/function/function_tokenize.cpp
index e8d923f981a..a3d616d6357 100644
--- a/be/src/exprs/function/function_tokenize.cpp
+++ b/be/src/exprs/function/function_tokenize.cpp
@@ -30,6 +30,7 @@
#include "core/block/block.h"
#include "core/block/column_with_type_and_name.h"
#include "core/column/column.h"
+#include "core/column/column_const.h"
#include "core/data_type/data_type_nullable.h"
#include "core/data_type/data_type_number.h"
#include "core/string_ref.h"
@@ -134,7 +135,7 @@ void FunctionTokenize::_do_tokenize(const ColumnString&
src_column_string,
Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block&
block,
const ColumnNumbers& arguments, uint32_t
result,
- size_t /*input_rows_count*/) const {
+ size_t input_rows_count) const {
DCHECK_EQ(arguments.size(), 2);
const auto& [src_column, left_const] =
unpack_if_const(block.get_by_position(arguments[0]).column);
@@ -165,7 +166,13 @@ Status FunctionTokenize::execute_impl(FunctionContext*
/*context*/, Block& block
if (config.analyzer_name.empty() &&
config.parser_type == InvertedIndexParserType::PARSER_NONE) {
_do_tokenize_none(*col_left, dest_column_ptr);
- block.replace_by_position(result, std::move(dest_column_ptr));
+ if (left_const) {
+ block.replace_by_position(
+ result,
+ ColumnConst::create(std::move(dest_column_ptr),
input_rows_count));
+ } else {
+ block.replace_by_position(result,
std::move(dest_column_ptr));
+ }
return Status::OK();
}
@@ -196,7 +203,12 @@ Status FunctionTokenize::execute_impl(FunctionContext*
/*context*/, Block& block
analyzer_ctx.analyzer = analyzer_holder;
_do_tokenize(*col_left, analyzer_ctx, support_phrase,
dest_column_ptr);
- block.replace_by_position(result, std::move(dest_column_ptr));
+ if (left_const) {
+ block.replace_by_position(
+ result,
ColumnConst::create(std::move(dest_column_ptr), input_rows_count));
+ } else {
+ block.replace_by_position(result, std::move(dest_column_ptr));
+ }
return Status::OK();
}
}
diff --git a/be/src/exprs/function/function_tokenize.h
b/be/src/exprs/function/function_tokenize.h
index 533b411458b..01828d120a5 100644
--- a/be/src/exprs/function/function_tokenize.h
+++ b/be/src/exprs/function/function_tokenize.h
@@ -70,6 +70,6 @@ public:
void _do_tokenize_none(const ColumnString& src_column_string,
const MutableColumnPtr& dest_column_ptr) const;
Status execute_impl(FunctionContext* /*context*/, Block& block, const
ColumnNumbers& arguments,
- uint32_t result, size_t /*input_rows_count*/) const
override;
+ uint32_t result, size_t input_rows_count) const
override;
};
} // namespace doris
diff --git a/be/test/exprs/function/function_tokenize_test.cpp
b/be/test/exprs/function/function_tokenize_test.cpp
index 322ee56c893..a17974d7c22 100644
--- a/be/test/exprs/function/function_tokenize_test.cpp
+++ b/be/test/exprs/function/function_tokenize_test.cpp
@@ -25,6 +25,7 @@
#include "core/block/block.h"
#include "core/block/column_with_type_and_name.h"
+#include "core/column/column_const.h"
#include "core/column/column_string.h"
#include "core/data_type/data_type_string.h"
#include "exprs/function/simple_function_factory.h"
@@ -203,6 +204,99 @@ TEST_F(FunctionTokenizeTest, ParserNonePropertyFormats) {
}
}
+// Regression for const-first-argument row count.
+//
+// The fix in FunctionTokenize::execute_impl wraps the dest column in a
+// ColumnConst(input_rows_count) when the first argument was already a
+// ColumnConst. The generic PreparedFunctionImpl::default_implementation_for_constant_arguments
+// path only short-circuits when *all* arguments are const,
+// so to reach the new left_const branch we need arg0 const but arg1 non-const.
+// FE rejects non-literal second arguments for tokenize, so this path is only
+// exercisable from a unit test that builds the block directly.
+TEST_F(FunctionTokenizeTest, ConstFirstArgPreservesRowCount) {
+ const size_t input_rows_count = 5;
+ const std::string input_str = "Hello World Test";
+ const std::string properties = "parser='english'";
+
+ auto input_inner = ColumnString::create();
+ input_inner->insert_data(input_str.data(), input_str.size());
+ auto input_const = ColumnConst::create(std::move(input_inner),
input_rows_count);
+
+ auto properties_column = ColumnString::create();
+ for (size_t i = 0; i < input_rows_count; ++i) {
+ properties_column->insert_data(properties.data(), properties.size());
+ }
+
+ auto string_type = std::make_shared<DataTypeString>();
+
+ Block block;
+ block.insert(ColumnWithTypeAndName(std::move(input_const), string_type,
"input"));
+ block.insert(ColumnWithTypeAndName(std::move(properties_column),
string_type, "properties"));
+ block.insert(ColumnWithTypeAndName(string_type->create_column(),
string_type, "result"));
+
+ ColumnNumbers arguments = {0, 1};
+ uint32_t result = 2;
+
+ auto status = _function->execute(nullptr, block, arguments, result,
input_rows_count);
+ ASSERT_TRUE(status.ok()) << status.to_string();
+
+ auto result_column = block.get_by_position(result).column;
+ ASSERT_EQ(result_column->size(), input_rows_count);
+
+ // Source is constant, so every row must carry the same tokenized value.
+ StringRef first_token = result_column->get_data_at(0);
+ ASSERT_GT(first_token.size, 0);
+ EXPECT_NE(std::string(first_token.data, first_token.size).find("hello"),
std::string::npos);
+ EXPECT_NE(std::string(first_token.data, first_token.size).find("world"),
std::string::npos);
+ for (size_t i = 1; i < input_rows_count; ++i) {
+ StringRef row_i = result_column->get_data_at(i);
+ EXPECT_EQ(row_i.size, first_token.size);
+ EXPECT_EQ(memcmp(row_i.data, first_token.data, first_token.size), 0);
+ }
+}
+
+// Same check for the PARSER_NONE early-return branch in execute_impl.
+TEST_F(FunctionTokenizeTest, ConstFirstArgParserNonePreservesRowCount) {
+ const size_t input_rows_count = 4;
+ const std::string input_str = "Hello World";
+ const std::string properties = "parser='none'";
+
+ auto input_inner = ColumnString::create();
+ input_inner->insert_data(input_str.data(), input_str.size());
+ auto input_const = ColumnConst::create(std::move(input_inner),
input_rows_count);
+
+ auto properties_column = ColumnString::create();
+ for (size_t i = 0; i < input_rows_count; ++i) {
+ properties_column->insert_data(properties.data(), properties.size());
+ }
+
+ auto string_type = std::make_shared<DataTypeString>();
+
+ Block block;
+ block.insert(ColumnWithTypeAndName(std::move(input_const), string_type,
"input"));
+ block.insert(ColumnWithTypeAndName(std::move(properties_column),
string_type, "properties"));
+ block.insert(ColumnWithTypeAndName(string_type->create_column(),
string_type, "result"));
+
+ ColumnNumbers arguments = {0, 1};
+ uint32_t result = 2;
+
+ auto status = _function->execute(nullptr, block, arguments, result,
input_rows_count);
+ ASSERT_TRUE(status.ok()) << status.to_string();
+
+ auto result_column = block.get_by_position(result).column;
+ ASSERT_EQ(result_column->size(), input_rows_count);
+
+ StringRef first_token = result_column->get_data_at(0);
+ const std::string expected = R"([{
+ "token": "Hello World"
+ }])";
+ EXPECT_EQ(std::string(first_token.data, first_token.size), expected);
+ for (size_t i = 1; i < input_rows_count; ++i) {
+ StringRef row_i = result_column->get_data_at(i);
+ EXPECT_EQ(std::string(row_i.data, row_i.size), expected);
+ }
+}
+
// Test error cases
TEST_F(FunctionTokenizeTest, InvalidParser) {
std::vector<std::string> input_strings = {"Test String"};
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]