Copilot commented on code in PR #54224:
URL: https://github.com/apache/doris/pull/54224#discussion_r2250300446
##########
be/src/vec/functions/function_tokenize.cpp:
##########
@@ -63,6 +63,35 @@ Status parse(const std::string& str, std::map<std::string,
std::string>& result)
return Status::OK();
}
+void FunctionTokenize::_do_tokenize_none(const ColumnString& src_column_string,
+ const MutableColumnPtr&
dest_column_ptr) const {
+ ColumnArray::Offset64 src_offsets_size =
src_column_string.get_offsets().size();
+ for (size_t i = 0; i < src_offsets_size; i++) {
+ const StringRef tokenize_str = src_column_string.get_data_at(i);
+
+ rapidjson::Document doc;
+ doc.SetArray();
+ rapidjson::Document::AllocatorType& allocator = doc.GetAllocator();
+
+ rapidjson::Value obj(rapidjson::kObjectType);
+ obj.AddMember(
+ "token",
+ rapidjson::Value(tokenize_str.data,
Review Comment:
Using `tokenize_str.data` directly could be unsafe: `StringRef::data` is not
guaranteed to be null-terminated, so any rapidjson call that assumes a
C-string would read past the end. Prefer a `rapidjson::Value` constructor that
takes an explicit length together with the allocator (so rapidjson copies the
bytes). Note the suggested `tokenize_str.to_string().c_str()` is only safe if
rapidjson copies the string before the full expression ends — the temporary
`std::string` is destroyed at the end of the statement, so a non-copying
(reference) overload would leave a dangling pointer.
```suggestion
rapidjson::Value(tokenize_str.to_string().c_str(),
```
##########
be/src/vec/functions/function_tokenize.h:
##########
@@ -67,11 +67,9 @@ class FunctionTokenize : public IFunction {
}
void _do_tokenize(const ColumnString& src_column_string, InvertedIndexCtx&
inverted_index_ctx,
const MutableColumnPtr& dest_column_ptr) const;
+ void _do_tokenize_none(const ColumnString& src_column_string,
+ const MutableColumnPtr& dest_column_ptr) const;
Status execute_impl(FunctionContext* /*context*/, Block& block, const
ColumnNumbers& arguments,
uint32_t result, size_t /*input_rows_count*/) const
override;
};
Review Comment:
The `register_function_tokenize` function declaration is missing from the
header file. It should be declared here: it is defined in the .cpp file and is
called from outside that translation unit — the new
`function_tokenize_test.cpp` in this PR includes this header and calls
`register_function_tokenize(factory)`, which will not compile without a
visible declaration.
```suggestion
};
void register_function_tokenize(SimpleFunctionFactory& factory);
```
##########
be/test/vec/function/function_tokenize_test.cpp:
##########
@@ -0,0 +1,220 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vec/functions/function_tokenize.h"
+
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "vec/columns/column_string.h"
+#include "vec/core/block.h"
+#include "vec/core/column_with_type_and_name.h"
+#include "vec/data_types/data_type_string.h"
+#include "vec/functions/simple_function_factory.h"
+
+namespace doris::vectorized {
+
+class FunctionTokenizeTest : public ::testing::Test {
+public:
+ void SetUp() override {
+ // Register the tokenize function
+ SimpleFunctionFactory factory;
+ register_function_tokenize(factory);
+
+ // Create argument template for tokenize function (input string,
properties string)
+ auto input_type = std::make_shared<DataTypeString>();
+ auto properties_type = std::make_shared<DataTypeString>();
+ auto return_type = std::make_shared<DataTypeString>();
+
+ ColumnsWithTypeAndName argument_template = {{nullptr, input_type,
"input"},
+ {nullptr, properties_type,
"properties"}};
+
+ _function = factory.get_function("tokenize", argument_template,
return_type, {},
+
BeExecVersionManager::get_newest_version());
+ ASSERT_NE(_function, nullptr);
+ }
+
+protected:
+ FunctionBasePtr _function;
+
+ // Helper function to create a block with string columns
+ Block create_test_block(const std::vector<std::string>& input_strings,
+ const std::string& properties_str) {
+ Block block;
+
+ // Create input string column
+ auto input_column = ColumnString::create();
+ for (const auto& str : input_strings) {
+ input_column->insert_data(str.data(), str.size());
+ }
+ auto input_type = std::make_shared<DataTypeString>();
+ block.insert(ColumnWithTypeAndName(std::move(input_column),
input_type, "input"));
+
+ // Create properties string column
+ auto properties_column = ColumnString::create();
+ properties_column->insert_data(properties_str.data(),
properties_str.size());
+ auto properties_type = std::make_shared<DataTypeString>();
+ block.insert(
+ ColumnWithTypeAndName(std::move(properties_column),
properties_type, "properties"));
+
+ // Add result column
+ auto result_type = std::make_shared<DataTypeString>();
+ auto result_column = result_type->create_column();
+ block.insert(ColumnWithTypeAndName(std::move(result_column),
result_type, "result"));
+
+ return block;
+ }
+
+ // Helper function to execute tokenize function
+ std::vector<std::string> execute_tokenize(const std::vector<std::string>&
input_strings,
+ const std::string&
properties_str) {
+ Block block = create_test_block(input_strings, properties_str);
+ ColumnNumbers arguments = {0, 1}; // input column and properties column
+ uint32_t result = 2; // result column index
+ size_t input_rows_count = input_strings.size();
+
+ auto status = _function->execute(nullptr, block, arguments, result,
input_rows_count);
+ EXPECT_TRUE(status.ok()) << "Error executing tokenize: " <<
status.to_string();
+
+ // Extract results
+ std::vector<std::string> results;
+ auto result_column = block.get_by_position(result).column;
+ const auto* string_column = assert_cast<const
ColumnString*>(result_column.get());
+
+ for (size_t i = 0; i < input_strings.size(); ++i) {
+ StringRef result_str = string_column->get_data_at(i);
+ results.emplace_back(result_str.to_string());
+ }
+
+ return results;
+ }
+};
+
+// Test parser=none functionality
+TEST_F(FunctionTokenizeTest, ParserNone) {
+ std::vector<std::string> input_strings = {"Hello World!", "This is a
test.",
+ "Multiple words here",
+ "", // empty string
+ "Single"};
+
+ std::string properties = "parser='none'";
+ auto results = execute_tokenize(input_strings, properties);
+
+ ASSERT_EQ(results.size(), input_strings.size());
+
+ // For parser=none, each input should return as a single token in JSON
array format
+ EXPECT_EQ(results[0], R"([{
Review Comment:
[nitpick] The expected JSON strings are hardcoded with specific formatting.
Consider using a JSON parsing library to compare the actual JSON structure
rather than string comparison, which makes tests brittle to formatting changes.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]