kou commented on code in PR #39441:
URL: https://github.com/apache/arrow/pull/39441#discussion_r1444073352
##########
cpp/src/gandiva/gdv_string_function_stubs.cc:
##########
@@ -986,6 +1003,32 @@ arrow::Status
ExportedStringFunctions::AddMappings(Engine* engine) const {
engine->AddGlobalMappingForFunc("translate_utf8_utf8_utf8",
types->i8_ptr_type() /*return_type*/, args,
reinterpret_cast<void*>(translate_utf8_utf8_utf8));
+
+ // gdv_fn_regex_like_utf8_utf8
Review Comment:
```suggestion
// gdv_fn_regexp_like_utf8_utf8
```
##########
cpp/src/gandiva/regex_functions_holder.cc:
##########
@@ -275,4 +275,82 @@ const char* ExtractHolder::operator()(ExecutionContext*
ctx, const char* user_in
return result_buffer;
}
+Result<std::shared_ptr<RegexpLikeHolder>> RegexpLikeHolder::Make(
+ const FunctionNode& node) {
+ ARROW_RETURN_IF(
+ node.children().size() < 2,
+ Status::Invalid("'regexp_like' function requires at least two
parameters"));
+ ARROW_RETURN_IF(
+ node.children().size() > 3,
+ Status::Invalid("'regexp_like' function requires at most three
parameters"));
+ auto pattern =
arrow::internal::checked_cast<LiteralNode*>(node.children().at(1).get());
+ ARROW_RETURN_IF(
+ pattern == nullptr,
+ Status::Invalid(
+ "'regexp_like' function requires a literal as the second
parameter"));
+
+ auto pattern_type = pattern->return_type()->id();
+ ARROW_RETURN_IF(
+ !(pattern_type == arrow::Type::STRING || pattern_type ==
arrow::Type::BINARY),
+ Status::Invalid(
+ "'regexp_like' function requires a string literal as the second
parameter"));
+
+ if (node.children().size() > 2) {
+ auto parameter =
+
arrow::internal::checked_cast<LiteralNode*>(node.children().at(2).get());
+ if (parameter != nullptr) {
+ auto parameter_type = parameter->return_type()->id();
+ ARROW_RETURN_IF(
+ !arrow::is_binary_like(parameter_type),
+ Status::Invalid(
+ "'regexp_like' function requires a string literal as the third
parameter"));
+ return RegexpLikeHolder::Make(std::get<std::string>(pattern->holder()),
true,
+
std::get<std::string>(parameter->holder()));
+ }
+ }
+ return RegexpLikeHolder::Make(std::get<std::string>(pattern->holder()),
false, "");
+}
+
+Result<std::shared_ptr<RegexpLikeHolder>> RegexpLikeHolder::Make(
+ const std::string& regex_pattern, bool used_match_parameter,
+ const std::string& match_parameter) {
+ RE2::Options regex_op;
+ // set re2 use posix regex expression which also called ERE in postgre sql
+ regex_op.set_posix_syntax(true);
+ // oracle's regex_like will default treat source str as single line
Review Comment:
```suggestion
// Oracle's regex_like will default treat source string as single line
```
##########
cpp/src/gandiva/regex_functions_holder_test.cc:
##########
@@ -635,4 +635,134 @@ TEST_F(TestExtractHolder, TestErrorWhileBuildingHolder) {
execution_context_.Reset();
}
+class TestRegexpLikeHolder : public ::testing::Test {
+ protected:
+ ExecutionContext execution_context_;
+};
+
+TEST_F(TestRegexpLikeHolder, TestRegexpLikeUseNonParameter) {
+ EXPECT_OK_AND_ASSIGN(auto regex_like_holder, RegexpLikeHolder::Make("ast",
false, ""));
+ auto& regex_like = *regex_like_holder;
Review Comment:
```suggestion
EXPECT_OK_AND_ASSIGN(auto regexp_like_holder,
RegexpLikeHolder::Make("ast", false, ""));
auto& regexp_like = *regexp_like_holder;
```
Other tests have similar code.
##########
cpp/src/gandiva/regex_functions_holder.cc:
##########
@@ -275,4 +275,82 @@ const char* ExtractHolder::operator()(ExecutionContext*
ctx, const char* user_in
return result_buffer;
}
+Result<std::shared_ptr<RegexpLikeHolder>> RegexpLikeHolder::Make(
+ const FunctionNode& node) {
+ ARROW_RETURN_IF(
+ node.children().size() < 2,
+ Status::Invalid("'regexp_like' function requires at least two
parameters"));
+ ARROW_RETURN_IF(
+ node.children().size() > 3,
+ Status::Invalid("'regexp_like' function requires at most three
parameters"));
+ auto pattern =
arrow::internal::checked_cast<LiteralNode*>(node.children().at(1).get());
+ ARROW_RETURN_IF(
+ pattern == nullptr,
+ Status::Invalid(
+ "'regexp_like' function requires a literal as the second
parameter"));
+
+ auto pattern_type = pattern->return_type()->id();
+ ARROW_RETURN_IF(
+ !(pattern_type == arrow::Type::STRING || pattern_type ==
arrow::Type::BINARY),
+ Status::Invalid(
+ "'regexp_like' function requires a string literal as the second
parameter"));
+
+ if (node.children().size() > 2) {
+ auto parameter =
+
arrow::internal::checked_cast<LiteralNode*>(node.children().at(2).get());
+ if (parameter != nullptr) {
+ auto parameter_type = parameter->return_type()->id();
+ ARROW_RETURN_IF(
+ !arrow::is_binary_like(parameter_type),
+ Status::Invalid(
+ "'regexp_like' function requires a string literal as the third
parameter"));
+ return RegexpLikeHolder::Make(std::get<std::string>(pattern->holder()),
true,
+
std::get<std::string>(parameter->holder()));
+ }
+ }
+ return RegexpLikeHolder::Make(std::get<std::string>(pattern->holder()),
false, "");
+}
+
+Result<std::shared_ptr<RegexpLikeHolder>> RegexpLikeHolder::Make(
+ const std::string& regex_pattern, bool used_match_parameter,
+ const std::string& match_parameter) {
+ RE2::Options regex_op;
+ // set re2 use posix regex expression which also called ERE in postgre sql
Review Comment:
```suggestion
// set RE2 use Posix regex expression which also called ERE in PostgreSQL
```
##########
cpp/src/gandiva/regex_functions_holder.cc:
##########
@@ -275,4 +275,82 @@ const char* ExtractHolder::operator()(ExecutionContext*
ctx, const char* user_in
return result_buffer;
}
+Result<std::shared_ptr<RegexpLikeHolder>> RegexpLikeHolder::Make(
+ const FunctionNode& node) {
+ ARROW_RETURN_IF(
+ node.children().size() < 2,
+ Status::Invalid("'regexp_like' function requires at least two
parameters"));
+ ARROW_RETURN_IF(
+ node.children().size() > 3,
+ Status::Invalid("'regexp_like' function requires at most three
parameters"));
+ auto pattern =
arrow::internal::checked_cast<LiteralNode*>(node.children().at(1).get());
+ ARROW_RETURN_IF(
+ pattern == nullptr,
+ Status::Invalid(
+ "'regexp_like' function requires a literal as the second
parameter"));
+
+ auto pattern_type = pattern->return_type()->id();
+ ARROW_RETURN_IF(
+ !(pattern_type == arrow::Type::STRING || pattern_type ==
arrow::Type::BINARY),
+ Status::Invalid(
+ "'regexp_like' function requires a string literal as the second
parameter"));
+
+ if (node.children().size() > 2) {
+ auto parameter =
+
arrow::internal::checked_cast<LiteralNode*>(node.children().at(2).get());
+ if (parameter != nullptr) {
+ auto parameter_type = parameter->return_type()->id();
+ ARROW_RETURN_IF(
+ !arrow::is_binary_like(parameter_type),
+ Status::Invalid(
+ "'regexp_like' function requires a string literal as the third
parameter"));
+ return RegexpLikeHolder::Make(std::get<std::string>(pattern->holder()),
true,
+
std::get<std::string>(parameter->holder()));
+ }
+ }
+ return RegexpLikeHolder::Make(std::get<std::string>(pattern->holder()),
false, "");
+}
+
+Result<std::shared_ptr<RegexpLikeHolder>> RegexpLikeHolder::Make(
+ const std::string& regex_pattern, bool used_match_parameter,
+ const std::string& match_parameter) {
+ RE2::Options regex_op;
+ // set re2 use posix regex expression which also called ERE in postgre sql
+ regex_op.set_posix_syntax(true);
+ // oracle's regex_like will default treat source str as single line
+ regex_op.set_one_line(true);
+
+ if (used_match_parameter) {
+ for (auto& parameter : match_parameter) {
+ switch (parameter) {
+ case 'i':
+ regex_op.set_case_sensitive(false);
+ break;
+ case 'c':
+ regex_op.set_case_sensitive(true);
+ break;
+ case 'n':
+ regex_op.set_dot_nl(true);
+ break;
+ case 'm':
+ regex_op.set_one_line(false);
+ break;
+ default:
+ ARROW_RETURN_NOT_OK(Status::Invalid("Invalid match parameter '",
parameter,
+ "':Only 'i', 'c', 'n', 'm' are
allowed"));
+ }
+ }
+ }
+
+ std::shared_ptr<RegexpLikeHolder> lholder;
+
+ lholder =
+ std::shared_ptr<RegexpLikeHolder>(new RegexpLikeHolder(regex_pattern,
regex_op));
+
+ ARROW_RETURN_IF(!lholder->regex_.ok(),
+ Status::Invalid("Building posix regex pattern '",
regex_pattern,
Review Comment:
```suggestion
Status::Invalid("Building Posix regular expression '",
regex_pattern,
```
##########
cpp/src/gandiva/regex_functions_holder_test.cc:
##########
@@ -635,4 +635,134 @@ TEST_F(TestExtractHolder, TestErrorWhileBuildingHolder) {
execution_context_.Reset();
}
+class TestRegexpLikeHolder : public ::testing::Test {
+ protected:
+ ExecutionContext execution_context_;
+};
+
+TEST_F(TestRegexpLikeHolder, TestRegexpLikeUseNonParameter) {
+ EXPECT_OK_AND_ASSIGN(auto regex_like_holder, RegexpLikeHolder::Make("ast",
false, ""));
+ auto& regex_like = *regex_like_holder;
+ std::string source_string = "fast";
+
+ auto ret =
+ regex_like(source_string.c_str(),
static_cast<int32_t>(source_string.length()));
+ EXPECT_TRUE(ret);
+
+ source_string = "FAST";
+ ret = regex_like(source_string.c_str(),
static_cast<int32_t>(source_string.length()));
+ EXPECT_TRUE(!ret);
+}
+
+TEST_F(TestRegexpLikeHolder, TestRegexLikeHolderUseIParameter) {
+ EXPECT_OK_AND_ASSIGN(auto regex_like_holder, RegexpLikeHolder::Make("ast",
true, "i"));
+
+ auto& regex_like = *regex_like_holder;
+ std::string source_string = "FAST";
+
+ auto ret =
+ regex_like(source_string.c_str(),
static_cast<int32_t>(source_string.length()));
+ EXPECT_TRUE(ret);
+}
+
+TEST_F(TestRegexpLikeHolder, TestRegexpLikeHolderUseCParameter) {
+ EXPECT_OK_AND_ASSIGN(auto regex_like_holder, RegexpLikeHolder::Make("ast",
true, "c"));
+
+ auto& regex_like = *regex_like_holder;
+ std::string source_string = "FAST";
+
+ auto ret =
+ regex_like(source_string.c_str(),
static_cast<int32_t>(source_string.length()));
+ EXPECT_TRUE(!ret);
+
+ std::string lower_source_string = "fast";
+ ret = regex_like(lower_source_string.c_str(),
+ static_cast<int32_t>(lower_source_string.length()));
+ EXPECT_TRUE(ret);
+}
+
+TEST_F(TestRegexpLikeHolder, TestRegexpLikeHolderUseICParameter) {
+ // In oracle, if specify multiple contradictory values in regex_like, Oracle
uses the
Review Comment:
```suggestion
// In Oracle, if specify multiple contradictory values in regex_like,
Oracle uses the
```
##########
cpp/src/gandiva/tests/filter_test.cc:
##########
@@ -449,4 +449,82 @@ TEST_F(TestFilter, TestLike) {
EXPECT_ARROW_ARRAY_EQUALS(exp, selection_vector->ToArray());
}
+TEST_F(TestFilter, TestRegexPLike) {
Review Comment:
```suggestion
TEST_F(TestFilter, TestRegexpLike) {
```
##########
cpp/src/gandiva/regex_functions_holder.cc:
##########
@@ -275,4 +275,82 @@ const char* ExtractHolder::operator()(ExecutionContext*
ctx, const char* user_in
return result_buffer;
}
+Result<std::shared_ptr<RegexpLikeHolder>> RegexpLikeHolder::Make(
+ const FunctionNode& node) {
+ ARROW_RETURN_IF(
+ node.children().size() < 2,
+ Status::Invalid("'regexp_like' function requires at least two
parameters"));
+ ARROW_RETURN_IF(
+ node.children().size() > 3,
+ Status::Invalid("'regexp_like' function requires at most three
parameters"));
+ auto pattern =
arrow::internal::checked_cast<LiteralNode*>(node.children().at(1).get());
+ ARROW_RETURN_IF(
+ pattern == nullptr,
+ Status::Invalid(
+ "'regexp_like' function requires a literal as the second
parameter"));
+
+ auto pattern_type = pattern->return_type()->id();
+ ARROW_RETURN_IF(
+ !(pattern_type == arrow::Type::STRING || pattern_type ==
arrow::Type::BINARY),
+ Status::Invalid(
+ "'regexp_like' function requires a string literal as the second
parameter"));
+
+ if (node.children().size() > 2) {
+ auto parameter =
+
arrow::internal::checked_cast<LiteralNode*>(node.children().at(2).get());
+ if (parameter != nullptr) {
+ auto parameter_type = parameter->return_type()->id();
+ ARROW_RETURN_IF(
+ !arrow::is_binary_like(parameter_type),
+ Status::Invalid(
+ "'regexp_like' function requires a string literal as the third
parameter"));
+ return RegexpLikeHolder::Make(std::get<std::string>(pattern->holder()),
true,
+
std::get<std::string>(parameter->holder()));
+ }
+ }
+ return RegexpLikeHolder::Make(std::get<std::string>(pattern->holder()),
false, "");
+}
+
+Result<std::shared_ptr<RegexpLikeHolder>> RegexpLikeHolder::Make(
+ const std::string& regex_pattern, bool used_match_parameter,
Review Comment:
How about removing the redundant "regex_" prefix? In this context
(`RegexpLikeHolder`), the "regex_" prefix is redundant.
```suggestion
const std::string& pattern, bool used_match_parameter,
```
##########
cpp/src/gandiva/gdv_string_function_stubs.cc:
##########
@@ -986,6 +1003,32 @@ arrow::Status
ExportedStringFunctions::AddMappings(Engine* engine) const {
engine->AddGlobalMappingForFunc("translate_utf8_utf8_utf8",
types->i8_ptr_type() /*return_type*/, args,
reinterpret_cast<void*>(translate_utf8_utf8_utf8));
+
+ // gdv_fn_regex_like_utf8_utf8
+ args = {
+ types->i64_type(), // int64_t holder_ptr
+ types->i8_ptr_type(), // const char* source_string
+ types->i32_type(), // int source_string_len
+ types->i8_ptr_type(), // const char* pattern
+ types->i32_type() // int pattern_len
+ };
+ engine->AddGlobalMappingForFunc("gdv_fn_regexp_like_utf8_utf8",
+ types->i1_type() /*return_type*/, args,
+
reinterpret_cast<void*>(gdv_fn_regexp_like_utf8_uft8));
+
+ // gdv_fn_regex_like_utf8_utf8_utf8
Review Comment:
```suggestion
// gdv_fn_regexp_like_utf8_utf8_utf8
```
##########
cpp/src/gandiva/regex_functions_holder.cc:
##########
@@ -275,4 +275,82 @@ const char* ExtractHolder::operator()(ExecutionContext*
ctx, const char* user_in
return result_buffer;
}
+Result<std::shared_ptr<RegexpLikeHolder>> RegexpLikeHolder::Make(
+ const FunctionNode& node) {
+ ARROW_RETURN_IF(
+ node.children().size() < 2,
+ Status::Invalid("'regexp_like' function requires at least two
parameters"));
+ ARROW_RETURN_IF(
+ node.children().size() > 3,
+ Status::Invalid("'regexp_like' function requires at most three
parameters"));
+ auto pattern =
arrow::internal::checked_cast<LiteralNode*>(node.children().at(1).get());
+ ARROW_RETURN_IF(
+ pattern == nullptr,
+ Status::Invalid(
+ "'regexp_like' function requires a literal as the second
parameter"));
+
+ auto pattern_type = pattern->return_type()->id();
+ ARROW_RETURN_IF(
+ !(pattern_type == arrow::Type::STRING || pattern_type ==
arrow::Type::BINARY),
+ Status::Invalid(
+ "'regexp_like' function requires a string literal as the second
parameter"));
+
+ if (node.children().size() > 2) {
+ auto parameter =
+
arrow::internal::checked_cast<LiteralNode*>(node.children().at(2).get());
+ if (parameter != nullptr) {
+ auto parameter_type = parameter->return_type()->id();
+ ARROW_RETURN_IF(
+ !arrow::is_binary_like(parameter_type),
+ Status::Invalid(
+ "'regexp_like' function requires a string literal as the third
parameter"));
+ return RegexpLikeHolder::Make(std::get<std::string>(pattern->holder()),
true,
+
std::get<std::string>(parameter->holder()));
+ }
+ }
+ return RegexpLikeHolder::Make(std::get<std::string>(pattern->holder()),
false, "");
+}
+
+Result<std::shared_ptr<RegexpLikeHolder>> RegexpLikeHolder::Make(
+ const std::string& regex_pattern, bool used_match_parameter,
+ const std::string& match_parameter) {
+ RE2::Options regex_op;
+ // set re2 use posix regex expression which also called ERE in postgre sql
+ regex_op.set_posix_syntax(true);
+ // oracle's regex_like will default treat source str as single line
+ regex_op.set_one_line(true);
+
+ if (used_match_parameter) {
+ for (auto& parameter : match_parameter) {
+ switch (parameter) {
+ case 'i':
+ regex_op.set_case_sensitive(false);
+ break;
+ case 'c':
+ regex_op.set_case_sensitive(true);
+ break;
+ case 'n':
+ regex_op.set_dot_nl(true);
+ break;
+ case 'm':
+ regex_op.set_one_line(false);
+ break;
+ default:
+ ARROW_RETURN_NOT_OK(Status::Invalid("Invalid match parameter '",
parameter,
+ "':Only 'i', 'c', 'n', 'm' are
allowed"));
+ }
+ }
+ }
+
+ std::shared_ptr<RegexpLikeHolder> lholder;
+
+ lholder =
+ std::shared_ptr<RegexpLikeHolder>(new RegexpLikeHolder(regex_pattern,
regex_op));
Review Comment:
* How about using `auto`?
* How about using just `holder`, since we have only one holder in this
context?
```suggestion
auto holder =
std::shared_ptr<RegexpLikeHolder>(new RegexpLikeHolder(regex_pattern,
regex_op));
```
##########
cpp/src/gandiva/regex_functions_holder.cc:
##########
@@ -275,4 +275,82 @@ const char* ExtractHolder::operator()(ExecutionContext*
ctx, const char* user_in
return result_buffer;
}
+Result<std::shared_ptr<RegexpLikeHolder>> RegexpLikeHolder::Make(
+ const FunctionNode& node) {
+ ARROW_RETURN_IF(
+ node.children().size() < 2,
+ Status::Invalid("'regexp_like' function requires at least two
parameters"));
+ ARROW_RETURN_IF(
+ node.children().size() > 3,
+ Status::Invalid("'regexp_like' function requires at most three
parameters"));
+ auto pattern =
arrow::internal::checked_cast<LiteralNode*>(node.children().at(1).get());
+ ARROW_RETURN_IF(
+ pattern == nullptr,
+ Status::Invalid(
+ "'regexp_like' function requires a literal as the second
parameter"));
+
+ auto pattern_type = pattern->return_type()->id();
+ ARROW_RETURN_IF(
+ !(pattern_type == arrow::Type::STRING || pattern_type ==
arrow::Type::BINARY),
+ Status::Invalid(
+ "'regexp_like' function requires a string literal as the second
parameter"));
+
+ if (node.children().size() > 2) {
+ auto parameter =
+
arrow::internal::checked_cast<LiteralNode*>(node.children().at(2).get());
+ if (parameter != nullptr) {
+ auto parameter_type = parameter->return_type()->id();
+ ARROW_RETURN_IF(
+ !arrow::is_binary_like(parameter_type),
+ Status::Invalid(
+ "'regexp_like' function requires a string literal as the third
parameter"));
+ return RegexpLikeHolder::Make(std::get<std::string>(pattern->holder()),
true,
+
std::get<std::string>(parameter->holder()));
+ }
+ }
+ return RegexpLikeHolder::Make(std::get<std::string>(pattern->holder()),
false, "");
+}
+
+Result<std::shared_ptr<RegexpLikeHolder>> RegexpLikeHolder::Make(
+ const std::string& regex_pattern, bool used_match_parameter,
+ const std::string& match_parameter) {
+ RE2::Options regex_op;
Review Comment:
How about using `options` for this variable? The "regex_" prefix is redundant in
this context and "op" isn't a suitable abbreviation. ("op" is usually read as an
abbreviation of "operator" or something similar.)
```suggestion
RE2::Options options;
```
##########
cpp/src/gandiva/regex_functions_holder.h:
##########
@@ -150,4 +150,27 @@ class GANDIVA_EXPORT ExtractHolder : public FunctionHolder
{
int32_t num_groups_pattern_; // number of groups that user defined inside
the regex
};
+class GANDIVA_EXPORT RegexpLikeHolder : public FunctionHolder {
+ public:
+ ~RegexpLikeHolder() override = default;
+
+ static Result<std::shared_ptr<RegexpLikeHolder>> Make(const FunctionNode&
node);
+
+ static Result<std::shared_ptr<RegexpLikeHolder>> Make(
+ const std::string& regex_pattern, bool used_match_parameter,
+ const std::string& match_parameter);
+
+ bool operator()(const char* source_string, int32_t source_string_len) {
+ std::string source_str(source_string, source_string_len);
+ return RE2::PartialMatch(source_str, regex_);
+ }
+
+ private:
+ explicit RegexpLikeHolder(const std::string& pattern, RE2::Options regex_op)
+ : regex_pattern_(pattern), regex_(pattern, regex_op) {}
+
+ std::string regex_pattern_;
+ RE2 regex_;
Review Comment:
```suggestion
RE2 regexp_;
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]