kou commented on code in PR #39441:
URL: https://github.com/apache/arrow/pull/39441#discussion_r1444073352


##########
cpp/src/gandiva/gdv_string_function_stubs.cc:
##########
@@ -986,6 +1003,32 @@ arrow::Status 
ExportedStringFunctions::AddMappings(Engine* engine) const {
   engine->AddGlobalMappingForFunc("translate_utf8_utf8_utf8",
                                   types->i8_ptr_type() /*return_type*/, args,
                                   
reinterpret_cast<void*>(translate_utf8_utf8_utf8));
+
+  // gdv_fn_regex_like_utf8_utf8

Review Comment:
   ```suggestion
     // gdv_fn_regexp_like_utf8_utf8
   ```



##########
cpp/src/gandiva/regex_functions_holder.cc:
##########
@@ -275,4 +275,82 @@ const char* ExtractHolder::operator()(ExecutionContext* 
ctx, const char* user_in
   return result_buffer;
 }
 
+Result<std::shared_ptr<RegexpLikeHolder>> RegexpLikeHolder::Make(
+    const FunctionNode& node) {
+  ARROW_RETURN_IF(
+      node.children().size() < 2,
+      Status::Invalid("'regexp_like' function requires at least two 
parameters"));
+  ARROW_RETURN_IF(
+      node.children().size() > 3,
+      Status::Invalid("'regexp_like' function requires at most three 
parameters"));
+  auto pattern = 
arrow::internal::checked_cast<LiteralNode*>(node.children().at(1).get());
+  ARROW_RETURN_IF(
+      pattern == nullptr,
+      Status::Invalid(
+          "'regexp_like' function requires a literal as the second 
parameter"));
+
+  auto pattern_type = pattern->return_type()->id();
+  ARROW_RETURN_IF(
+      !(pattern_type == arrow::Type::STRING || pattern_type == 
arrow::Type::BINARY),
+      Status::Invalid(
+          "'regexp_like' function requires a string literal as the second 
parameter"));
+
+  if (node.children().size() > 2) {
+    auto parameter =
+        
arrow::internal::checked_cast<LiteralNode*>(node.children().at(2).get());
+    if (parameter != nullptr) {
+      auto parameter_type = parameter->return_type()->id();
+      ARROW_RETURN_IF(
+          !arrow::is_binary_like(parameter_type),
+          Status::Invalid(
+              "'regexp_like' function requires a string literal as the third 
parameter"));
+      return RegexpLikeHolder::Make(std::get<std::string>(pattern->holder()), 
true,
+                                    
std::get<std::string>(parameter->holder()));
+    }
+  }
+  return RegexpLikeHolder::Make(std::get<std::string>(pattern->holder()), 
false, "");
+}
+
+Result<std::shared_ptr<RegexpLikeHolder>> RegexpLikeHolder::Make(
+    const std::string& regex_pattern, bool used_match_parameter,
+    const std::string& match_parameter) {
+  RE2::Options regex_op;
+  // set re2 use posix regex expression which also called ERE in postgre sql
+  regex_op.set_posix_syntax(true);
+  // oracle's regex_like will default treat source str as single line

Review Comment:
   ```suggestion
     // Oracle's regex_like will default treat source string as single line
   ```



##########
cpp/src/gandiva/regex_functions_holder_test.cc:
##########
@@ -635,4 +635,134 @@ TEST_F(TestExtractHolder, TestErrorWhileBuildingHolder) {
   execution_context_.Reset();
 }
 
+class TestRegexpLikeHolder : public ::testing::Test {
+ protected:
+  ExecutionContext execution_context_;
+};
+
+TEST_F(TestRegexpLikeHolder, TestRegexpLikeUseNonParameter) {
+  EXPECT_OK_AND_ASSIGN(auto regex_like_holder, RegexpLikeHolder::Make("ast", 
false, ""));
+  auto& regex_like = *regex_like_holder;

Review Comment:
   ```suggestion
     EXPECT_OK_AND_ASSIGN(auto regexp_like_holder, 
RegexpLikeHolder::Make("ast", false, ""));
     auto& regexp_like = *regexp_like_holder;
   ```
   
   Other tests have similar code.



##########
cpp/src/gandiva/regex_functions_holder.cc:
##########
@@ -275,4 +275,82 @@ const char* ExtractHolder::operator()(ExecutionContext* 
ctx, const char* user_in
   return result_buffer;
 }
 
+Result<std::shared_ptr<RegexpLikeHolder>> RegexpLikeHolder::Make(
+    const FunctionNode& node) {
+  ARROW_RETURN_IF(
+      node.children().size() < 2,
+      Status::Invalid("'regexp_like' function requires at least two 
parameters"));
+  ARROW_RETURN_IF(
+      node.children().size() > 3,
+      Status::Invalid("'regexp_like' function requires at most three 
parameters"));
+  auto pattern = 
arrow::internal::checked_cast<LiteralNode*>(node.children().at(1).get());
+  ARROW_RETURN_IF(
+      pattern == nullptr,
+      Status::Invalid(
+          "'regexp_like' function requires a literal as the second 
parameter"));
+
+  auto pattern_type = pattern->return_type()->id();
+  ARROW_RETURN_IF(
+      !(pattern_type == arrow::Type::STRING || pattern_type == 
arrow::Type::BINARY),
+      Status::Invalid(
+          "'regexp_like' function requires a string literal as the second 
parameter"));
+
+  if (node.children().size() > 2) {
+    auto parameter =
+        
arrow::internal::checked_cast<LiteralNode*>(node.children().at(2).get());
+    if (parameter != nullptr) {
+      auto parameter_type = parameter->return_type()->id();
+      ARROW_RETURN_IF(
+          !arrow::is_binary_like(parameter_type),
+          Status::Invalid(
+              "'regexp_like' function requires a string literal as the third 
parameter"));
+      return RegexpLikeHolder::Make(std::get<std::string>(pattern->holder()), 
true,
+                                    
std::get<std::string>(parameter->holder()));
+    }
+  }
+  return RegexpLikeHolder::Make(std::get<std::string>(pattern->holder()), 
false, "");
+}
+
+Result<std::shared_ptr<RegexpLikeHolder>> RegexpLikeHolder::Make(
+    const std::string& regex_pattern, bool used_match_parameter,
+    const std::string& match_parameter) {
+  RE2::Options regex_op;
+  // set re2 use posix regex expression which also called ERE in postgre sql

Review Comment:
   ```suggestion
     // set RE2 use Posix regex expression which also called ERE in PostgreSQL
   ```



##########
cpp/src/gandiva/regex_functions_holder.cc:
##########
@@ -275,4 +275,82 @@ const char* ExtractHolder::operator()(ExecutionContext* 
ctx, const char* user_in
   return result_buffer;
 }
 
+Result<std::shared_ptr<RegexpLikeHolder>> RegexpLikeHolder::Make(
+    const FunctionNode& node) {
+  ARROW_RETURN_IF(
+      node.children().size() < 2,
+      Status::Invalid("'regexp_like' function requires at least two 
parameters"));
+  ARROW_RETURN_IF(
+      node.children().size() > 3,
+      Status::Invalid("'regexp_like' function requires at most three 
parameters"));
+  auto pattern = 
arrow::internal::checked_cast<LiteralNode*>(node.children().at(1).get());
+  ARROW_RETURN_IF(
+      pattern == nullptr,
+      Status::Invalid(
+          "'regexp_like' function requires a literal as the second 
parameter"));
+
+  auto pattern_type = pattern->return_type()->id();
+  ARROW_RETURN_IF(
+      !(pattern_type == arrow::Type::STRING || pattern_type == 
arrow::Type::BINARY),
+      Status::Invalid(
+          "'regexp_like' function requires a string literal as the second 
parameter"));
+
+  if (node.children().size() > 2) {
+    auto parameter =
+        
arrow::internal::checked_cast<LiteralNode*>(node.children().at(2).get());
+    if (parameter != nullptr) {
+      auto parameter_type = parameter->return_type()->id();
+      ARROW_RETURN_IF(
+          !arrow::is_binary_like(parameter_type),
+          Status::Invalid(
+              "'regexp_like' function requires a string literal as the third 
parameter"));
+      return RegexpLikeHolder::Make(std::get<std::string>(pattern->holder()), 
true,
+                                    
std::get<std::string>(parameter->holder()));
+    }
+  }
+  return RegexpLikeHolder::Make(std::get<std::string>(pattern->holder()), 
false, "");
+}
+
+Result<std::shared_ptr<RegexpLikeHolder>> RegexpLikeHolder::Make(
+    const std::string& regex_pattern, bool used_match_parameter,
+    const std::string& match_parameter) {
+  RE2::Options regex_op;
+  // set re2 use posix regex expression which also called ERE in postgre sql
+  regex_op.set_posix_syntax(true);
+  // oracle's regex_like will default treat source str as single line
+  regex_op.set_one_line(true);
+
+  if (used_match_parameter) {
+    for (auto& parameter : match_parameter) {
+      switch (parameter) {
+        case 'i':
+          regex_op.set_case_sensitive(false);
+          break;
+        case 'c':
+          regex_op.set_case_sensitive(true);
+          break;
+        case 'n':
+          regex_op.set_dot_nl(true);
+          break;
+        case 'm':
+          regex_op.set_one_line(false);
+          break;
+        default:
+          ARROW_RETURN_NOT_OK(Status::Invalid("Invalid match parameter '", 
parameter,
+                                              "':Only 'i', 'c', 'n', 'm' are 
allowed"));
+      }
+    }
+  }
+
+  std::shared_ptr<RegexpLikeHolder> lholder;
+
+  lholder =
+      std::shared_ptr<RegexpLikeHolder>(new RegexpLikeHolder(regex_pattern, 
regex_op));
+
+  ARROW_RETURN_IF(!lholder->regex_.ok(),
+                  Status::Invalid("Building posix regex pattern '", 
regex_pattern,

Review Comment:
   ```suggestion
                     Status::Invalid("Building Posix regular expression '", 
regex_pattern,
   ```



##########
cpp/src/gandiva/regex_functions_holder_test.cc:
##########
@@ -635,4 +635,134 @@ TEST_F(TestExtractHolder, TestErrorWhileBuildingHolder) {
   execution_context_.Reset();
 }
 
+class TestRegexpLikeHolder : public ::testing::Test {
+ protected:
+  ExecutionContext execution_context_;
+};
+
+TEST_F(TestRegexpLikeHolder, TestRegexpLikeUseNonParameter) {
+  EXPECT_OK_AND_ASSIGN(auto regex_like_holder, RegexpLikeHolder::Make("ast", 
false, ""));
+  auto& regex_like = *regex_like_holder;
+  std::string source_string = "fast";
+
+  auto ret =
+      regex_like(source_string.c_str(), 
static_cast<int32_t>(source_string.length()));
+  EXPECT_TRUE(ret);
+
+  source_string = "FAST";
+  ret = regex_like(source_string.c_str(), 
static_cast<int32_t>(source_string.length()));
+  EXPECT_TRUE(!ret);
+}
+
+TEST_F(TestRegexpLikeHolder, TestRegexLikeHolderUseIParameter) {
+  EXPECT_OK_AND_ASSIGN(auto regex_like_holder, RegexpLikeHolder::Make("ast", 
true, "i"));
+
+  auto& regex_like = *regex_like_holder;
+  std::string source_string = "FAST";
+
+  auto ret =
+      regex_like(source_string.c_str(), 
static_cast<int32_t>(source_string.length()));
+  EXPECT_TRUE(ret);
+}
+
+TEST_F(TestRegexpLikeHolder, TestRegexpLikeHolderUseCParameter) {
+  EXPECT_OK_AND_ASSIGN(auto regex_like_holder, RegexpLikeHolder::Make("ast", 
true, "c"));
+
+  auto& regex_like = *regex_like_holder;
+  std::string source_string = "FAST";
+
+  auto ret =
+      regex_like(source_string.c_str(), 
static_cast<int32_t>(source_string.length()));
+  EXPECT_TRUE(!ret);
+
+  std::string lower_source_string = "fast";
+  ret = regex_like(lower_source_string.c_str(),
+                   static_cast<int32_t>(lower_source_string.length()));
+  EXPECT_TRUE(ret);
+}
+
+TEST_F(TestRegexpLikeHolder, TestRegexpLikeHolderUseICParameter) {
+  // In oracle, if specify multiple contradictory values in regex_like, Oracle 
uses the

Review Comment:
   ```suggestion
     // In Oracle, if specify multiple contradictory values in regex_like, 
Oracle uses the
   ```



##########
cpp/src/gandiva/tests/filter_test.cc:
##########
@@ -449,4 +449,82 @@ TEST_F(TestFilter, TestLike) {
   EXPECT_ARROW_ARRAY_EQUALS(exp, selection_vector->ToArray());
 }
 
+TEST_F(TestFilter, TestRegexPLike) {

Review Comment:
   ```suggestion
   TEST_F(TestFilter, TestRegexpLike) {
   ```



##########
cpp/src/gandiva/regex_functions_holder.cc:
##########
@@ -275,4 +275,82 @@ const char* ExtractHolder::operator()(ExecutionContext* 
ctx, const char* user_in
   return result_buffer;
 }
 
+Result<std::shared_ptr<RegexpLikeHolder>> RegexpLikeHolder::Make(
+    const FunctionNode& node) {
+  ARROW_RETURN_IF(
+      node.children().size() < 2,
+      Status::Invalid("'regexp_like' function requires at least two 
parameters"));
+  ARROW_RETURN_IF(
+      node.children().size() > 3,
+      Status::Invalid("'regexp_like' function requires at most three 
parameters"));
+  auto pattern = 
arrow::internal::checked_cast<LiteralNode*>(node.children().at(1).get());
+  ARROW_RETURN_IF(
+      pattern == nullptr,
+      Status::Invalid(
+          "'regexp_like' function requires a literal as the second 
parameter"));
+
+  auto pattern_type = pattern->return_type()->id();
+  ARROW_RETURN_IF(
+      !(pattern_type == arrow::Type::STRING || pattern_type == 
arrow::Type::BINARY),
+      Status::Invalid(
+          "'regexp_like' function requires a string literal as the second 
parameter"));
+
+  if (node.children().size() > 2) {
+    auto parameter =
+        
arrow::internal::checked_cast<LiteralNode*>(node.children().at(2).get());
+    if (parameter != nullptr) {
+      auto parameter_type = parameter->return_type()->id();
+      ARROW_RETURN_IF(
+          !arrow::is_binary_like(parameter_type),
+          Status::Invalid(
+              "'regexp_like' function requires a string literal as the third 
parameter"));
+      return RegexpLikeHolder::Make(std::get<std::string>(pattern->holder()), 
true,
+                                    
std::get<std::string>(parameter->holder()));
+    }
+  }
+  return RegexpLikeHolder::Make(std::get<std::string>(pattern->holder()), 
false, "");
+}
+
+Result<std::shared_ptr<RegexpLikeHolder>> RegexpLikeHolder::Make(
+    const std::string& regex_pattern, bool used_match_parameter,

Review Comment:
   How about removing the redundant "regex_" prefix? In this context 
(`RegexpLikeHolder`), "regex_" is redundant.
   
   ```suggestion
       const std::string& pattern, bool used_match_parameter,
   ```



##########
cpp/src/gandiva/gdv_string_function_stubs.cc:
##########
@@ -986,6 +1003,32 @@ arrow::Status 
ExportedStringFunctions::AddMappings(Engine* engine) const {
   engine->AddGlobalMappingForFunc("translate_utf8_utf8_utf8",
                                   types->i8_ptr_type() /*return_type*/, args,
                                   
reinterpret_cast<void*>(translate_utf8_utf8_utf8));
+
+  // gdv_fn_regex_like_utf8_utf8
+  args = {
+      types->i64_type(),     // int64_t holder_ptr
+      types->i8_ptr_type(),  // const char* source_string
+      types->i32_type(),     // int source_string_len
+      types->i8_ptr_type(),  // const char* pattern
+      types->i32_type()      // int pattern_len
+  };
+  engine->AddGlobalMappingForFunc("gdv_fn_regexp_like_utf8_utf8",
+                                  types->i1_type() /*return_type*/, args,
+                                  
reinterpret_cast<void*>(gdv_fn_regexp_like_utf8_uft8));
+
+  // gdv_fn_regex_like_utf8_utf8_utf8

Review Comment:
   ```suggestion
     // gdv_fn_regexp_like_utf8_utf8_utf8
   ```



##########
cpp/src/gandiva/regex_functions_holder.cc:
##########
@@ -275,4 +275,82 @@ const char* ExtractHolder::operator()(ExecutionContext* 
ctx, const char* user_in
   return result_buffer;
 }
 
+Result<std::shared_ptr<RegexpLikeHolder>> RegexpLikeHolder::Make(
+    const FunctionNode& node) {
+  ARROW_RETURN_IF(
+      node.children().size() < 2,
+      Status::Invalid("'regexp_like' function requires at least two 
parameters"));
+  ARROW_RETURN_IF(
+      node.children().size() > 3,
+      Status::Invalid("'regexp_like' function requires at most three 
parameters"));
+  auto pattern = 
arrow::internal::checked_cast<LiteralNode*>(node.children().at(1).get());
+  ARROW_RETURN_IF(
+      pattern == nullptr,
+      Status::Invalid(
+          "'regexp_like' function requires a literal as the second 
parameter"));
+
+  auto pattern_type = pattern->return_type()->id();
+  ARROW_RETURN_IF(
+      !(pattern_type == arrow::Type::STRING || pattern_type == 
arrow::Type::BINARY),
+      Status::Invalid(
+          "'regexp_like' function requires a string literal as the second 
parameter"));
+
+  if (node.children().size() > 2) {
+    auto parameter =
+        
arrow::internal::checked_cast<LiteralNode*>(node.children().at(2).get());
+    if (parameter != nullptr) {
+      auto parameter_type = parameter->return_type()->id();
+      ARROW_RETURN_IF(
+          !arrow::is_binary_like(parameter_type),
+          Status::Invalid(
+              "'regexp_like' function requires a string literal as the third 
parameter"));
+      return RegexpLikeHolder::Make(std::get<std::string>(pattern->holder()), 
true,
+                                    
std::get<std::string>(parameter->holder()));
+    }
+  }
+  return RegexpLikeHolder::Make(std::get<std::string>(pattern->holder()), 
false, "");
+}
+
+Result<std::shared_ptr<RegexpLikeHolder>> RegexpLikeHolder::Make(
+    const std::string& regex_pattern, bool used_match_parameter,
+    const std::string& match_parameter) {
+  RE2::Options regex_op;
+  // set re2 use posix regex expression which also called ERE in postgre sql
+  regex_op.set_posix_syntax(true);
+  // oracle's regex_like will default treat source str as single line
+  regex_op.set_one_line(true);
+
+  if (used_match_parameter) {
+    for (auto& parameter : match_parameter) {
+      switch (parameter) {
+        case 'i':
+          regex_op.set_case_sensitive(false);
+          break;
+        case 'c':
+          regex_op.set_case_sensitive(true);
+          break;
+        case 'n':
+          regex_op.set_dot_nl(true);
+          break;
+        case 'm':
+          regex_op.set_one_line(false);
+          break;
+        default:
+          ARROW_RETURN_NOT_OK(Status::Invalid("Invalid match parameter '", 
parameter,
+                                              "':Only 'i', 'c', 'n', 'm' are 
allowed"));
+      }
+    }
+  }
+
+  std::shared_ptr<RegexpLikeHolder> lholder;
+
+  lholder =
+      std::shared_ptr<RegexpLikeHolder>(new RegexpLikeHolder(regex_pattern, 
regex_op));

Review Comment:
   * How about using `auto`?
   * How about using just `holder`? Because we have only one holder in this 
context.
   
   ```suggestion
     auto holder =
         std::shared_ptr<RegexpLikeHolder>(new RegexpLikeHolder(regex_pattern, 
regex_op));
   ```



##########
cpp/src/gandiva/regex_functions_holder.cc:
##########
@@ -275,4 +275,82 @@ const char* ExtractHolder::operator()(ExecutionContext* 
ctx, const char* user_in
   return result_buffer;
 }
 
+Result<std::shared_ptr<RegexpLikeHolder>> RegexpLikeHolder::Make(
+    const FunctionNode& node) {
+  ARROW_RETURN_IF(
+      node.children().size() < 2,
+      Status::Invalid("'regexp_like' function requires at least two 
parameters"));
+  ARROW_RETURN_IF(
+      node.children().size() > 3,
+      Status::Invalid("'regexp_like' function requires at most three 
parameters"));
+  auto pattern = 
arrow::internal::checked_cast<LiteralNode*>(node.children().at(1).get());
+  ARROW_RETURN_IF(
+      pattern == nullptr,
+      Status::Invalid(
+          "'regexp_like' function requires a literal as the second 
parameter"));
+
+  auto pattern_type = pattern->return_type()->id();
+  ARROW_RETURN_IF(
+      !(pattern_type == arrow::Type::STRING || pattern_type == 
arrow::Type::BINARY),
+      Status::Invalid(
+          "'regexp_like' function requires a string literal as the second 
parameter"));
+
+  if (node.children().size() > 2) {
+    auto parameter =
+        
arrow::internal::checked_cast<LiteralNode*>(node.children().at(2).get());
+    if (parameter != nullptr) {
+      auto parameter_type = parameter->return_type()->id();
+      ARROW_RETURN_IF(
+          !arrow::is_binary_like(parameter_type),
+          Status::Invalid(
+              "'regexp_like' function requires a string literal as the third 
parameter"));
+      return RegexpLikeHolder::Make(std::get<std::string>(pattern->holder()), 
true,
+                                    
std::get<std::string>(parameter->holder()));
+    }
+  }
+  return RegexpLikeHolder::Make(std::get<std::string>(pattern->holder()), 
false, "");
+}
+
+Result<std::shared_ptr<RegexpLikeHolder>> RegexpLikeHolder::Make(
+    const std::string& regex_pattern, bool used_match_parameter,
+    const std::string& match_parameter) {
+  RE2::Options regex_op;

Review Comment:
   How about using `options` for this variable? The "regex_" prefix is redundant in 
this context and "op" isn't a suitable abbreviation. (We may use "op" as an 
abbreviation of "operator" or something.)
   
   ```suggestion
     RE2::Options options;
   ```



##########
cpp/src/gandiva/regex_functions_holder.h:
##########
@@ -150,4 +150,27 @@ class GANDIVA_EXPORT ExtractHolder : public FunctionHolder 
{
   int32_t num_groups_pattern_;  // number of groups that user defined inside 
the regex
 };
 
+class GANDIVA_EXPORT RegexpLikeHolder : public FunctionHolder {
+ public:
+  ~RegexpLikeHolder() override = default;
+
+  static Result<std::shared_ptr<RegexpLikeHolder>> Make(const FunctionNode& 
node);
+
+  static Result<std::shared_ptr<RegexpLikeHolder>> Make(
+      const std::string& regex_pattern, bool used_match_parameter,
+      const std::string& match_parameter);
+
+  bool operator()(const char* source_string, int32_t source_string_len) {
+    std::string source_str(source_string, source_string_len);
+    return RE2::PartialMatch(source_str, regex_);
+  }
+
+ private:
+  explicit RegexpLikeHolder(const std::string& pattern, RE2::Options regex_op)
+      : regex_pattern_(pattern), regex_(pattern, regex_op) {}
+
+  std::string regex_pattern_;
+  RE2 regex_;

Review Comment:
   ```suggestion
     RE2 regexp_;
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to