This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new 6dce942d889 branch-4.0: [Enhancement](ai) relax the matching
restriction of some AI-Functions #58077 (#58113)
6dce942d889 is described below
commit 6dce942d88907018b5b8c7a61bcd6e80c48869d7
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Thu Nov 20 09:12:15 2025 +0800
branch-4.0: [Enhancement](ai) relax the matching restriction of some
AI-Functions #58077 (#58113)
Cherry-picked from #58077
Co-authored-by: linrrarity <[email protected]>
---
be/src/vec/functions/ai/ai_functions.h | 38 +++++++-
be/test/ai/ai_function_test.cpp | 154 +++++++++++++++++++++++++++++++++
2 files changed, 190 insertions(+), 2 deletions(-)
diff --git a/be/src/vec/functions/ai/ai_functions.h b/be/src/vec/functions/ai/ai_functions.h
index 7bb4909b457..c31a7659614 100644
--- a/be/src/vec/functions/ai/ai_functions.h
+++ b/be/src/vec/functions/ai/ai_functions.h
@@ -20,6 +20,9 @@
#include <gen_cpp/FrontendService.h>
#include <gen_cpp/PaloInternalService_types.h>
+#include <algorithm>
+#include <cctype>
+#include <cstdlib>
#include <memory>
#include <string>
#include <type_traits>
@@ -122,8 +125,14 @@ public:
}
case PrimitiveType::TYPE_BOOLEAN: { // boolean for AI_FILTER
#ifdef BE_TEST
- string_result = "0";
+ const char* test_result = std::getenv("AI_TEST_RESULT");
+ if (test_result != nullptr) {
+ string_result = test_result;
+ } else {
+ string_result = "0";
+ }
#endif
+ trim_string(string_result);
if (string_result != "1" && string_result != "0") {
return Status::RuntimeError("Failed to parse boolean value: " +
string_result);
@@ -133,7 +142,22 @@ public:
break;
}
case PrimitiveType::TYPE_FLOAT: { // float for AI_SIMILARITY
- assert_cast<ColumnFloat32&>(*col_result).insert_value(std::stof(string_result));
+#ifdef BE_TEST
+ const char* test_result = std::getenv("AI_TEST_RESULT");
+ if (test_result != nullptr) {
+ string_result = test_result;
+ } else {
+ string_result = "0.0";
+ }
+#endif
+ trim_string(string_result);
+ try {
+ float float_value = std::stof(string_result);
+ assert_cast<ColumnFloat32&>(*col_result).insert_value(float_value);
+ } catch (...) {
+ return Status::RuntimeError("Failed to parse float value: " +
+ string_result);
+ }
break;
}
default:
@@ -147,6 +171,16 @@ public:
}
private:
+ // Trim whitespace and newlines from string
+ static void trim_string(std::string& str) {
+ str.erase(str.begin(), std::find_if(str.begin(), str.end(),
+ [](unsigned char ch) { return
!std::isspace(ch); }));
+ str.erase(std::find_if(str.rbegin(), str.rend(),
+ [](unsigned char ch) { return
!std::isspace(ch); })
+ .base(),
+ str.end());
+ }
+
// The ai resource must be literal
Status _init_from_resource(FunctionContext* context, const Block& block,
const ColumnNumbers& arguments, TAIResource&
config,
diff --git a/be/test/ai/ai_function_test.cpp b/be/test/ai/ai_function_test.cpp
index f288a4b7b92..74a46240be8 100644
--- a/be/test/ai/ai_function_test.cpp
+++ b/be/test/ai/ai_function_test.cpp
@@ -295,6 +295,79 @@ TEST(AIFunctionTest, AISimilarityTest) {
ASSERT_EQ(prompt, "Text 1: I like this dish\nText 2: This dish is very good");
}
+TEST(AIFunctionTest, AISimilarityExecuteTest) {
+ auto runtime_state = std::make_unique<MockRuntimeState>();
+ auto ctx = FunctionContext::create_context(runtime_state.get(), {}, {});
+
+ std::vector<std::string> resources = {"mock_resource"};
+ std::vector<std::string> text1 = {"I like this dish"};
+ std::vector<std::string> text2 = {"This dish is very good"};
+ auto col_resource = ColumnHelper::create_column<DataTypeString>(resources);
+ auto col_text1 = ColumnHelper::create_column<DataTypeString>(text1);
+ auto col_text2 = ColumnHelper::create_column<DataTypeString>(text2);
+
+ Block block;
+ block.insert({std::move(col_resource), std::make_shared<DataTypeString>(),
"resource"});
+ block.insert({std::move(col_text1), std::make_shared<DataTypeString>(),
"text1"});
+ block.insert({std::move(col_text2), std::make_shared<DataTypeString>(),
"text2"});
+ block.insert({nullptr, std::make_shared<DataTypeFloat32>(), "result"});
+
+ ColumnNumbers arguments = {0, 1, 2};
+ size_t result_idx = 3;
+
+ auto similarity_func = FunctionAISimilarity::create();
+ Status exec_status =
+ similarity_func->execute_impl(ctx.get(), block, arguments,
result_idx, text1.size());
+
+ ASSERT_TRUE(exec_status.ok());
+}
+
+TEST(AIFunctionTest, AISimilarityTrimWhitespace) {
+ auto runtime_state = std::make_unique<MockRuntimeState>();
+ auto ctx = FunctionContext::create_context(runtime_state.get(), {}, {});
+
+ std::vector<std::pair<std::string, float>> test_cases = {
+ {"0.5", 0.5f}, {"1.0", 1.0f}, {"0.0", 0.0f},
{" 0.5", 0.5f},
+ {"0.5 ", 0.5f}, {" 0.5 ", 0.5f}, {"\n0.8", 0.8f},
{"0.3\n", 0.3f},
+ {"\n0.7\n", 0.7f}, {"\t0.2\t", 0.2f}, {" \n\t0.9 \n\t", 0.9f},
{" 0.1 ", 0.1f},
+ {"\r\n0.6\r\n", 0.6f}};
+
+ for (const auto& test_case : test_cases) {
+ setenv("AI_TEST_RESULT", test_case.first.c_str(), 1);
+
+ std::vector<std::string> resources = {"mock_resource"};
+ std::vector<std::string> text1 = {"Test text 1"};
+ std::vector<std::string> text2 = {"Test text 2"};
+ auto col_resource =
ColumnHelper::create_column<DataTypeString>(resources);
+ auto col_text1 = ColumnHelper::create_column<DataTypeString>(text1);
+ auto col_text2 = ColumnHelper::create_column<DataTypeString>(text2);
+
+ Block block;
+ block.insert({std::move(col_resource),
std::make_shared<DataTypeString>(), "resource"});
+ block.insert({std::move(col_text1),
std::make_shared<DataTypeString>(), "text1"});
+ block.insert({std::move(col_text2),
std::make_shared<DataTypeString>(), "text2"});
+ block.insert({nullptr, std::make_shared<DataTypeFloat32>(), "result"});
+
+ ColumnNumbers arguments = {0, 1, 2};
+ size_t result_idx = 3;
+
+ auto similarity_func = FunctionAISimilarity::create();
+ Status exec_status = similarity_func->execute_impl(ctx.get(), block,
arguments, result_idx,
+ text1.size());
+
+ ASSERT_TRUE(exec_status.ok()) << "Failed for test case: '" <<
test_case.first << "'";
+
+ const auto& res_col =
+ assert_cast<const
ColumnFloat32&>(*block.get_by_position(result_idx).column);
+ float val = res_col.get_data()[0];
+ ASSERT_FLOAT_EQ(val, test_case.second)
+ << "Failed for test case: '" << test_case.first
+ << "', expected: " << test_case.second << ", got: " << val;
+ }
+
+ unsetenv("AI_TEST_RESULT");
+}
+
TEST(AIFunctionTest, AIFilterTest) {
FunctionAIFilter function;
@@ -343,6 +416,87 @@ TEST(AIFunctionTest, AIFilterExecuteTest) {
ASSERT_TRUE(val == 0);
}
+TEST(AIFunctionTest, AIFilterTrimWhitespace) {
+ auto runtime_state = std::make_unique<MockRuntimeState>();
+ auto ctx = FunctionContext::create_context(runtime_state.get(), {}, {});
+
+ std::vector<std::pair<std::string, UInt8>> test_cases = {
+ {"0", 0}, {"1", 1}, {" 0", 0}, {"0 ", 0},
+ {" 0 ", 0}, {"\n0", 0}, {"0\n", 0}, {"\n0\n", 0},
+ {"\t1\t", 1}, {" \n\t1 \n\t", 1}, {" 1 ", 1}, {"\r\n0\r\n", 0}};
+
+ for (const auto& test_case : test_cases) {
+ setenv("AI_TEST_RESULT", test_case.first.c_str(), 1);
+
+ std::vector<std::string> resources = {"mock_resource"};
+ std::vector<std::string> texts = {"Test input"};
+ auto col_resource =
ColumnHelper::create_column<DataTypeString>(resources);
+ auto col_text = ColumnHelper::create_column<DataTypeString>(texts);
+
+ Block block;
+ block.insert({std::move(col_resource),
std::make_shared<DataTypeString>(), "resource"});
+ block.insert({std::move(col_text), std::make_shared<DataTypeString>(),
"text"});
+ block.insert({nullptr, std::make_shared<DataTypeBool>(), "result"});
+
+ ColumnNumbers arguments = {0, 1};
+ size_t result_idx = 2;
+
+ auto filter_func = FunctionAIFilter::create();
+ Status exec_status =
+ filter_func->execute_impl(ctx.get(), block, arguments,
result_idx, texts.size());
+
+ ASSERT_TRUE(exec_status.ok()) << "Failed for test case: '" <<
test_case.first << "'";
+
+ const auto& res_col =
+ assert_cast<const
ColumnUInt8&>(*block.get_by_position(result_idx).column);
+ UInt8 val = res_col.get_data()[0];
+ ASSERT_EQ(val, test_case.second)
+ << "Failed for test case: '" << test_case.first
+ << "', expected: " << (int)test_case.second << ", got: " <<
(int)val;
+ }
+
+ unsetenv("AI_TEST_RESULT");
+}
+
+TEST(AIFunctionTest, AIFilterInvalidValue) {
+ auto runtime_state = std::make_unique<MockRuntimeState>();
+ auto ctx = FunctionContext::create_context(runtime_state.get(), {}, {});
+
+ std::vector<std::string> invalid_cases = {
+ "2", "maybe", "ok", "", " ", "01", "0.5", "sure",
"truee", "falsee",
+ "yess", "noo", "true", "false", "yes", "no", "TRUE", "FALSE",
"YES", "NO"};
+
+ for (const auto& invalid_value : invalid_cases) {
+ setenv("AI_TEST_RESULT", invalid_value.c_str(), 1);
+
+ std::vector<std::string> resources = {"mock_resource"};
+ std::vector<std::string> texts = {"Test input"};
+ auto col_resource =
ColumnHelper::create_column<DataTypeString>(resources);
+ auto col_text = ColumnHelper::create_column<DataTypeString>(texts);
+
+ Block block;
+ block.insert({std::move(col_resource),
std::make_shared<DataTypeString>(), "resource"});
+ block.insert({std::move(col_text), std::make_shared<DataTypeString>(),
"text"});
+ block.insert({nullptr, std::make_shared<DataTypeBool>(), "result"});
+
+ ColumnNumbers arguments = {0, 1};
+ size_t result_idx = 2;
+
+ auto filter_func = FunctionAIFilter::create();
+ Status exec_status =
+ filter_func->execute_impl(ctx.get(), block, arguments,
result_idx, texts.size());
+
+ ASSERT_FALSE(exec_status.ok())
+ << "Should have failed for invalid value: '" << invalid_value
<< "'";
+ ASSERT_TRUE(exec_status.to_string().find("Failed to parse boolean value") !=
+ std::string::npos)
+ << "Error message should mention boolean parsing for value: '"
<< invalid_value
+ << "'";
+ }
+
+ unsetenv("AI_TEST_RESULT");
+}
+
TEST(AIFunctionTest, ResourceNotFound) {
auto runtime_state = std::make_unique<MockRuntimeState>();
auto ctx = FunctionContext::create_context(runtime_state.get(), {}, {});
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]