This is an automated email from the ASF dual-hosted git repository.
ravindra pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new a8479e9c25 ARROW-13052: [Gandiva][C++] Add regexp_extract function
a8479e9c25 is described below
commit a8479e9c252482438b6fc2bc0383ac5cf6a09d59
Author: Johnnathan <[email protected]>
AuthorDate: Wed May 11 15:36:53 2022 +0530
ARROW-13052: [Gandiva][C++] Add regexp_extract function
Implements the REGEXP_EXTRACT function based on [the Hive
implementation](https://www.revisitclass.com/hadoop/regexp_extract-function-in-hive-with-examples/).
Closes #13015 from Johnnathanalmeida/feature/add-regexp-extract
Authored-by: Johnnathan <[email protected]>
Signed-off-by: Pindikura Ravindra <[email protected]>
---
cpp/src/gandiva/CMakeLists.txt | 6 +-
cpp/src/gandiva/function_holder_registry.h | 4 +-
cpp/src/gandiva/function_registry_string.cc | 6 +
cpp/src/gandiva/gdv_string_function_stubs.cc | 30 +-
cpp/src/gandiva/like_holder.h | 68 --
cpp/src/gandiva/like_holder_test.cc | 317 ----------
.../{like_holder.cc => regex_functions_holder.cc} | 113 +++-
.../{replace_holder.h => regex_functions_holder.h} | 62 ++
cpp/src/gandiva/regex_functions_holder_test.cc | 701 +++++++++++++++++++++
cpp/src/gandiva/replace_holder.cc | 61 --
cpp/src/gandiva/replace_holder_test.cc | 129 ----
cpp/src/gandiva/tests/projector_test.cc | 50 ++
12 files changed, 962 insertions(+), 585 deletions(-)
diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt
index 71faf9a38e..46e819ccc6 100644
--- a/cpp/src/gandiva/CMakeLists.txt
+++ b/cpp/src/gandiva/CMakeLists.txt
@@ -87,11 +87,10 @@ set(SRC_FILES
interval_holder.cc
llvm_generator.cc
llvm_types.cc
- like_holder.cc
literal_holder.cc
projector.cc
regex_util.cc
- replace_holder.cc
+ regex_functions_holder.cc
selection_vector.cc
tree_expr_builder.cc
to_date_holder.cc
@@ -236,8 +235,7 @@ add_gandiva_test(internals-test
lru_cache_test.cc
to_date_holder_test.cc
simple_arena_test.cc
- like_holder_test.cc
- replace_holder_test.cc
+ regex_functions_holder_test.cc
decimal_type_util_test.cc
random_generator_holder_test.cc
hash_utils_test.cc
diff --git a/cpp/src/gandiva/function_holder_registry.h
b/cpp/src/gandiva/function_holder_registry.h
index bddf32034f..97a03db347 100644
--- a/cpp/src/gandiva/function_holder_registry.h
+++ b/cpp/src/gandiva/function_holder_registry.h
@@ -25,10 +25,9 @@
#include "arrow/status.h"
#include "gandiva/function_holder.h"
#include "gandiva/interval_holder.h"
-#include "gandiva/like_holder.h"
#include "gandiva/node.h"
#include "gandiva/random_generator_holder.h"
-#include "gandiva/replace_holder.h"
+#include "gandiva/regex_functions_holder.h"
#include "gandiva/to_date_holder.h"
namespace gandiva {
@@ -67,6 +66,7 @@ class FunctionHolderRegistry {
{"random",
LAMBDA_MAKER(RandomGeneratorHolder)},
{"rand", LAMBDA_MAKER(RandomGeneratorHolder)},
{"regexp_replace",
LAMBDA_MAKER(ReplaceHolder)},
+ {"regexp_extract",
LAMBDA_MAKER(ExtractHolder)},
{"castintervalday",
LAMBDA_MAKER(IntervalDaysHolder)},
{"castintervalyear",
LAMBDA_MAKER(IntervalYearsHolder)}};
return maker_map;
diff --git a/cpp/src/gandiva/function_registry_string.cc
b/cpp/src/gandiva/function_registry_string.cc
index 4890ec8838..c1b6ef1648 100644
--- a/cpp/src/gandiva/function_registry_string.cc
+++ b/cpp/src/gandiva/function_registry_string.cc
@@ -255,6 +255,12 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
NativeFunction::kNeedsFunctionHolder |
NativeFunction::kCanReturnErrors),
+ NativeFunction("regexp_extract", {}, DataTypeVector{utf8(), utf8(),
int32()},
+ utf8(), kResultNullIfNull,
"gdv_fn_regexp_extract_utf8_utf8_int32",
+ NativeFunction::kNeedsContext |
+ NativeFunction::kNeedsFunctionHolder |
+ NativeFunction::kCanReturnErrors),
+
NativeFunction("concatOperator", {}, DataTypeVector{utf8(), utf8()},
utf8(),
kResultNullIfNull, "concatOperator_utf8_utf8",
NativeFunction::kNeedsContext),
diff --git a/cpp/src/gandiva/gdv_string_function_stubs.cc
b/cpp/src/gandiva/gdv_string_function_stubs.cc
index 862c6e91cb..1948d3a3e1 100644
--- a/cpp/src/gandiva/gdv_string_function_stubs.cc
+++ b/cpp/src/gandiva/gdv_string_function_stubs.cc
@@ -31,9 +31,8 @@
#include "gandiva/engine.h"
#include "gandiva/exported_funcs.h"
#include "gandiva/formatting_utils.h"
-#include "gandiva/like_holder.h"
#include "gandiva/precompiled/types.h"
-#include "gandiva/replace_holder.h"
+#include "gandiva/regex_functions_holder.h"
extern "C" {
@@ -68,6 +67,19 @@ const char* gdv_fn_regexp_replace_utf8_utf8(
out_length);
}
+const char* gdv_fn_regexp_extract_utf8_utf8_int32(int64_t ptr, int64_t
holder_ptr,
+ const char* data, int32_t
data_len,
+ const char* /*pattern*/,
+ int32_t /*pattern_len*/,
+ int32_t extract_index,
+ int32_t* out_length) {
+ gandiva::ExecutionContext* context =
reinterpret_cast<gandiva::ExecutionContext*>(ptr);
+
+ gandiva::ExtractHolder* holder =
reinterpret_cast<gandiva::ExtractHolder*>(holder_ptr);
+
+ return (*holder)(context, data, data_len, extract_index, out_length);
+}
+
#define GDV_FN_CAST_VARLEN_TYPE_FROM_TYPE(IN_TYPE, CAST_NAME, ARROW_TYPE)
\
GANDIVA_EXPORT
\
const char* gdv_fn_cast##CAST_NAME##_##IN_TYPE##_int64(
\
@@ -495,6 +507,20 @@ void ExportedStringFunctions::AddMappings(Engine* engine)
const {
"gdv_fn_regexp_replace_utf8_utf8", types->i8_ptr_type() /*return_type*/,
args,
reinterpret_cast<void*>(gdv_fn_regexp_replace_utf8_utf8));
+ // gdv_fn_regexp_extract_utf8_utf8_int32
+ args = {types->i64_type(), // int64_t ptr
+ types->i64_type(), // int64_t holder_ptr
+ types->i8_ptr_type(), // const char* data
+ types->i32_type(), // int data_len
+ types->i8_ptr_type(), // const char* pattern
+ types->i32_type(), // int pattern_len
+ types->i32_type(), // int32_t extract_index
+ types->i32_ptr_type()}; // int32_t* out_length
+
+ engine->AddGlobalMappingForFunc(
+ "gdv_fn_regexp_extract_utf8_utf8_int32", types->i8_ptr_type()
/*return_type*/, args,
+ reinterpret_cast<void*>(gdv_fn_regexp_extract_utf8_utf8_int32));
+
// gdv_fn_castVARCHAR_int32_int64
args = {types->i64_type(), // int64_t execution_context
types->i32_type(), // int32_t value
diff --git a/cpp/src/gandiva/like_holder.h b/cpp/src/gandiva/like_holder.h
deleted file mode 100644
index 73e58017de..0000000000
--- a/cpp/src/gandiva/like_holder.h
+++ /dev/null
@@ -1,68 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#include <memory>
-#include <string>
-
-#include <re2/re2.h>
-
-#include "arrow/status.h"
-
-#include "gandiva/function_holder.h"
-#include "gandiva/node.h"
-#include "gandiva/visibility.h"
-
-namespace gandiva {
-
-/// Function Holder for SQL 'like'
-class GANDIVA_EXPORT LikeHolder : public FunctionHolder {
- public:
- ~LikeHolder() override = default;
-
- static Status Make(const FunctionNode& node, std::shared_ptr<LikeHolder>*
holder);
-
- static Status Make(const std::string& sql_pattern,
std::shared_ptr<LikeHolder>* holder);
-
- static Status Make(const std::string& sql_pattern, const std::string&
escape_char,
- std::shared_ptr<LikeHolder>* holder);
-
- static Status Make(const std::string& sql_pattern,
std::shared_ptr<LikeHolder>* holder,
- RE2::Options regex_op);
-
- // Try and optimise a function node with a "like" pattern.
- static const FunctionNode TryOptimize(const FunctionNode& node);
-
- /// Return true if the data matches the pattern.
- bool operator()(const std::string& data) { return RE2::FullMatch(data,
regex_); }
-
- private:
- explicit LikeHolder(const std::string& pattern) : pattern_(pattern),
regex_(pattern) {}
-
- LikeHolder(const std::string& pattern, RE2::Options regex_op)
- : pattern_(pattern), regex_(pattern, regex_op) {}
-
- std::string pattern_; // posix pattern string, to help debugging
- RE2 regex_; // compiled regex for the pattern
-
- static RE2 starts_with_regex_; // pre-compiled pattern for matching
starts_with
- static RE2 ends_with_regex_; // pre-compiled pattern for matching
ends_with
- static RE2 is_substr_regex_; // pre-compiled pattern for matching
is_substr
-};
-
-} // namespace gandiva
diff --git a/cpp/src/gandiva/like_holder_test.cc
b/cpp/src/gandiva/like_holder_test.cc
deleted file mode 100644
index 76a7754298..0000000000
--- a/cpp/src/gandiva/like_holder_test.cc
+++ /dev/null
@@ -1,317 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "gandiva/like_holder.h"
-#include "gandiva/regex_util.h"
-
-#include <memory>
-#include <vector>
-
-#include <gtest/gtest.h>
-
-namespace gandiva {
-
-class TestLikeHolder : public ::testing::Test {
- public:
- RE2::Options regex_op;
- FunctionNode BuildLike(std::string pattern) {
- auto field = std::make_shared<FieldNode>(arrow::field("in",
arrow::utf8()));
- auto pattern_node =
- std::make_shared<LiteralNode>(arrow::utf8(), LiteralHolder(pattern),
false);
- return FunctionNode("like", {field, pattern_node}, arrow::boolean());
- }
-
- FunctionNode BuildLike(std::string pattern, char escape_char) {
- auto field = std::make_shared<FieldNode>(arrow::field("in",
arrow::utf8()));
- auto pattern_node =
- std::make_shared<LiteralNode>(arrow::utf8(), LiteralHolder(pattern),
false);
- auto escape_char_node = std::make_shared<LiteralNode>(
- arrow::int8(), LiteralHolder((int8_t)escape_char), false);
- return FunctionNode("like", {field, pattern_node, escape_char_node},
- arrow::boolean());
- }
-};
-
-TEST_F(TestLikeHolder, TestMatchAny) {
- std::shared_ptr<LikeHolder> like_holder;
-
- auto status = LikeHolder::Make("ab%", &like_holder, regex_op);
- EXPECT_EQ(status.ok(), true) << status.message();
-
- auto& like = *like_holder;
- EXPECT_TRUE(like("ab"));
- EXPECT_TRUE(like("abc"));
- EXPECT_TRUE(like("abcd"));
-
- EXPECT_FALSE(like("a"));
- EXPECT_FALSE(like("cab"));
-}
-
-TEST_F(TestLikeHolder, TestMatchOne) {
- std::shared_ptr<LikeHolder> like_holder;
-
- auto status = LikeHolder::Make("ab_", &like_holder, regex_op);
- EXPECT_EQ(status.ok(), true) << status.message();
-
- auto& like = *like_holder;
- EXPECT_TRUE(like("abc"));
- EXPECT_TRUE(like("abd"));
-
- EXPECT_FALSE(like("a"));
- EXPECT_FALSE(like("abcd"));
- EXPECT_FALSE(like("dabc"));
-}
-
-TEST_F(TestLikeHolder, TestPcreSpecial) {
- std::shared_ptr<LikeHolder> like_holder;
-
- auto status = LikeHolder::Make(".*ab_", &like_holder, regex_op);
- EXPECT_EQ(status.ok(), true) << status.message();
-
- auto& like = *like_holder;
- EXPECT_TRUE(like(".*abc")); // . and * aren't special in sql regex
- EXPECT_FALSE(like("xxabc"));
-}
-
-TEST_F(TestLikeHolder, TestRegexEscape) {
- std::string res;
- auto status = RegexUtil::SqlLikePatternToPcre("#%hello#_abc_def##", '#',
res);
- EXPECT_TRUE(status.ok()) << status.message();
-
- EXPECT_EQ(res, "%hello_abc.def#");
-}
-
-TEST_F(TestLikeHolder, TestDot) {
- std::shared_ptr<LikeHolder> like_holder;
-
- auto status = LikeHolder::Make("abc.", &like_holder, regex_op);
- EXPECT_EQ(status.ok(), true) << status.message();
-
- auto& like = *like_holder;
- EXPECT_FALSE(like("abcd"));
-}
-
-TEST_F(TestLikeHolder, TestMatchSubString) {
- std::shared_ptr<LikeHolder> like_holder;
-
- auto status = LikeHolder::Make("%abc%", "\\", &like_holder);
- EXPECT_EQ(status.ok(), true) << status.message();
-
- auto& like = *like_holder;
- EXPECT_TRUE(like("abc"));
- EXPECT_FALSE(like("xxabdc"));
-
- status = LikeHolder::Make("%ab-.^$*+?()[]{}|—/c\\%%", "\\", &like_holder);
- EXPECT_EQ(status.ok(), true) << status.message();
-
- auto& like_reserved_char = *like_holder;
- EXPECT_TRUE(like_reserved_char("XXab-.^$*+?()[]{}|—/c%d"));
- EXPECT_FALSE(like_reserved_char("xxad-.^$*+?()[]{}|—/c"));
-}
-
-TEST_F(TestLikeHolder, TestOptimise) {
- // optimise for 'starts_with'
- auto fnode = LikeHolder::TryOptimize(BuildLike("xy 123z%"));
- EXPECT_EQ(fnode.descriptor()->name(), "starts_with");
- EXPECT_EQ(fnode.ToString(), "bool starts_with((string) in, (const string)
'xy 123z')");
-
- // optimise for 'ends_with'
- fnode = LikeHolder::TryOptimize(BuildLike("%xyz"));
- EXPECT_EQ(fnode.descriptor()->name(), "ends_with");
- EXPECT_EQ(fnode.ToString(), "bool ends_with((string) in, (const string)
'xyz')");
-
- // optimise for 'is_substr'
- fnode = LikeHolder::TryOptimize(BuildLike("%abc%"));
- EXPECT_EQ(fnode.descriptor()->name(), "is_substr");
- EXPECT_EQ(fnode.ToString(), "bool is_substr((string) in, (const string)
'abc')");
-
- // optimise for 'is_substr with special characters'
- fnode = LikeHolder::TryOptimize(BuildLike("%ab-c%"));
- EXPECT_EQ(fnode.descriptor()->name(), "is_substr");
- EXPECT_EQ(fnode.ToString(), "bool is_substr((string) in, (const string)
'ab-c')");
-
- // optimise for 'ends_with with special characters'
- fnode = LikeHolder::TryOptimize(BuildLike("%ab-c"));
- EXPECT_EQ(fnode.descriptor()->name(), "ends_with");
- EXPECT_EQ(fnode.ToString(),
- "bool ends_with((string) in, (const string) "
- "'ab-c')");
-
- // optimise for 'starts_with with special characters'
- fnode = LikeHolder::TryOptimize(BuildLike("ab-c%"));
- EXPECT_EQ(fnode.descriptor()->name(), "starts_with");
- EXPECT_EQ(fnode.ToString(),
- "bool starts_with((string) in, (const string) "
- "'ab-c')");
-
- // no optimisation for others.
- fnode = LikeHolder::TryOptimize(BuildLike("xyz_"));
- EXPECT_EQ(fnode.descriptor()->name(), "like");
-
- fnode = LikeHolder::TryOptimize(BuildLike("_xyz"));
- EXPECT_EQ(fnode.descriptor()->name(), "like");
-
- fnode = LikeHolder::TryOptimize(BuildLike("_xyz_"));
- EXPECT_EQ(fnode.descriptor()->name(), "like");
-
- fnode = LikeHolder::TryOptimize(BuildLike("%xyz_"));
- EXPECT_EQ(fnode.descriptor()->name(), "like");
-
- fnode = LikeHolder::TryOptimize(BuildLike("x_yz%"));
- EXPECT_EQ(fnode.descriptor()->name(), "like");
-
- // no optimisation for escaped pattern.
- fnode = LikeHolder::TryOptimize(BuildLike("\\%xyz", '\\'));
- EXPECT_EQ(fnode.descriptor()->name(), "like");
- EXPECT_EQ(fnode.ToString(),
- "bool like((string) in, (const string) '\\%xyz', (const int8)
\\)");
-}
-
-TEST_F(TestLikeHolder, TestMatchOneEscape) {
- std::shared_ptr<LikeHolder> like_holder;
-
- auto status = LikeHolder::Make("ab\\_", "\\", &like_holder);
- EXPECT_EQ(status.ok(), true) << status.message();
-
- auto& like = *like_holder;
-
- EXPECT_TRUE(like("ab_"));
-
- EXPECT_FALSE(like("abc"));
- EXPECT_FALSE(like("abd"));
- EXPECT_FALSE(like("a"));
- EXPECT_FALSE(like("abcd"));
- EXPECT_FALSE(like("dabc"));
-}
-
-TEST_F(TestLikeHolder, TestMatchManyEscape) {
- std::shared_ptr<LikeHolder> like_holder;
-
- auto status = LikeHolder::Make("ab\\%", "\\", &like_holder);
- EXPECT_EQ(status.ok(), true) << status.message();
-
- auto& like = *like_holder;
-
- EXPECT_TRUE(like("ab%"));
-
- EXPECT_FALSE(like("abc"));
- EXPECT_FALSE(like("abd"));
- EXPECT_FALSE(like("a"));
- EXPECT_FALSE(like("abcd"));
- EXPECT_FALSE(like("dabc"));
-}
-
-TEST_F(TestLikeHolder, TestMatchEscape) {
- std::shared_ptr<LikeHolder> like_holder;
-
- auto status = LikeHolder::Make("ab\\\\", "\\", &like_holder);
- EXPECT_EQ(status.ok(), true) << status.message();
-
- auto& like = *like_holder;
-
- EXPECT_TRUE(like("ab\\"));
-
- EXPECT_FALSE(like("abc"));
-}
-
-TEST_F(TestLikeHolder, TestEmptyEscapeChar) {
- std::shared_ptr<LikeHolder> like_holder;
-
- auto status = LikeHolder::Make("ab\\_", "", &like_holder);
- EXPECT_EQ(status.ok(), true) << status.message();
-
- auto& like = *like_holder;
-
- EXPECT_TRUE(like("ab\\c"));
- EXPECT_TRUE(like("ab\\_"));
-
- EXPECT_FALSE(like("ab\\_d"));
- EXPECT_FALSE(like("ab__"));
-}
-
-TEST_F(TestLikeHolder, TestMultipleEscapeChar) {
- std::shared_ptr<LikeHolder> like_holder;
-
- auto status = LikeHolder::Make("ab\\_", "\\\\", &like_holder);
- EXPECT_EQ(status.ok(), false) << status.message();
-}
-class TestILikeHolder : public ::testing::Test {
- public:
- RE2::Options regex_op;
- FunctionNode BuildILike(std::string pattern) {
- auto field = std::make_shared<FieldNode>(arrow::field("in",
arrow::utf8()));
- auto pattern_node =
- std::make_shared<LiteralNode>(arrow::utf8(), LiteralHolder(pattern),
false);
- return FunctionNode("ilike", {field, pattern_node}, arrow::boolean());
- }
-};
-
-TEST_F(TestILikeHolder, TestMatchAny) {
- std::shared_ptr<LikeHolder> like_holder;
-
- regex_op.set_case_sensitive(false);
- auto status = LikeHolder::Make("ab%", &like_holder, regex_op);
- EXPECT_EQ(status.ok(), true) << status.message();
-
- auto& like = *like_holder;
- EXPECT_TRUE(like("ab"));
- EXPECT_TRUE(like("aBc"));
- EXPECT_TRUE(like("ABCD"));
-
- EXPECT_FALSE(like("a"));
- EXPECT_FALSE(like("cab"));
-}
-
-TEST_F(TestILikeHolder, TestMatchOne) {
- std::shared_ptr<LikeHolder> like_holder;
-
- regex_op.set_case_sensitive(false);
- auto status = LikeHolder::Make("Ab_", &like_holder, regex_op);
- EXPECT_EQ(status.ok(), true) << status.message();
-
- auto& like = *like_holder;
- EXPECT_TRUE(like("abc"));
- EXPECT_TRUE(like("aBd"));
-
- EXPECT_FALSE(like("A"));
- EXPECT_FALSE(like("Abcd"));
- EXPECT_FALSE(like("DaBc"));
-}
-
-TEST_F(TestILikeHolder, TestPcreSpecial) {
- std::shared_ptr<LikeHolder> like_holder;
-
- regex_op.set_case_sensitive(false);
- auto status = LikeHolder::Make(".*aB_", &like_holder, regex_op);
- EXPECT_EQ(status.ok(), true) << status.message();
-
- auto& like = *like_holder;
- EXPECT_TRUE(like(".*Abc")); // . and * aren't special in sql regex
- EXPECT_FALSE(like("xxAbc"));
-}
-
-TEST_F(TestILikeHolder, TestDot) {
- std::shared_ptr<LikeHolder> like_holder;
-
- regex_op.set_case_sensitive(false);
- auto status = LikeHolder::Make("aBc.", &like_holder, regex_op);
- EXPECT_EQ(status.ok(), true) << status.message();
-
- auto& like = *like_holder;
- EXPECT_FALSE(like("abcd"));
-}
-} // namespace gandiva
diff --git a/cpp/src/gandiva/like_holder.cc
b/cpp/src/gandiva/regex_functions_holder.cc
similarity index 62%
rename from cpp/src/gandiva/like_holder.cc
rename to cpp/src/gandiva/regex_functions_holder.cc
index 3391c7ec16..b1e2e59cb2 100644
--- a/cpp/src/gandiva/like_holder.cc
+++ b/cpp/src/gandiva/regex_functions_holder.cc
@@ -15,13 +15,13 @@
// specific language governing permissions and limitations
// under the License.
-#include "gandiva/like_holder.h"
-
+#include "gandiva/regex_functions_holder.h"
#include <regex>
#include "gandiva/node.h"
#include "gandiva/regex_util.h"
namespace gandiva {
+
RE2 LikeHolder::starts_with_regex_(R"(([^\.\*])*\.\*)");
RE2 LikeHolder::ends_with_regex_(R"(\.\*([^\.\*])*)");
RE2 LikeHolder::is_substr_regex_(R"(\.\*([^\.\*])*\.\*)");
@@ -163,4 +163,113 @@ Status LikeHolder::Make(const std::string& sql_pattern,
*holder = lholder;
return Status::OK();
}
+
+Status ReplaceHolder::Make(const FunctionNode& node,
+ std::shared_ptr<ReplaceHolder>* holder) {
+ ARROW_RETURN_IF(node.children().size() != 3,
+ Status::Invalid("'replace' function requires three
parameters"));
+
+ auto literal = dynamic_cast<LiteralNode*>(node.children().at(1).get());
+ ARROW_RETURN_IF(
+ literal == nullptr,
+ Status::Invalid("'replace' function requires a literal as the second
parameter"));
+
+ auto literal_type = literal->return_type()->id();
+ ARROW_RETURN_IF(
+ !(literal_type == arrow::Type::STRING || literal_type ==
arrow::Type::BINARY),
+ Status::Invalid(
+ "'replace' function requires a string literal as the second
parameter"));
+
+ return Make(arrow::util::get<std::string>(literal->holder()), holder);
+}
+
+Status ReplaceHolder::Make(const std::string& sql_pattern,
+ std::shared_ptr<ReplaceHolder>* holder) {
+ auto lholder = std::shared_ptr<ReplaceHolder>(new
ReplaceHolder(sql_pattern));
+ ARROW_RETURN_IF(!lholder->regex_.ok(),
+ Status::Invalid("Building RE2 pattern '", sql_pattern, "'
failed"));
+
+ *holder = lholder;
+ return Status::OK();
+}
+
+void ReplaceHolder::return_error(ExecutionContext* context, std::string& data,
+ std::string& replace_string) {
+ std::string err_msg = "Error replacing '" + replace_string + "' on the given
string '" +
+ data + "' for the given pattern: " + pattern_;
+ context->set_error_msg(err_msg.c_str());
+}
+
+Status ExtractHolder::Make(const FunctionNode& node,
+ std::shared_ptr<ExtractHolder>* holder) {
+ ARROW_RETURN_IF(node.children().size() != 3,
+ Status::Invalid("'extract' function requires three
parameters"));
+
+ auto literal = dynamic_cast<LiteralNode*>(node.children().at(1).get());
+ ARROW_RETURN_IF(
+ literal == nullptr ||
!IsArrowStringLiteral(literal->return_type()->id()),
+ Status::Invalid("'extract' function requires a literal as the second
parameter"));
+
+ return ExtractHolder::Make(arrow::util::get<std::string>(literal->holder()),
holder);
+}
+
+Status ExtractHolder::Make(const std::string& sql_pattern,
+ std::shared_ptr<ExtractHolder>* holder) {
+ auto lholder = std::shared_ptr<ExtractHolder>(new
ExtractHolder(sql_pattern));
+ ARROW_RETURN_IF(!lholder->regex_.ok(),
+ Status::Invalid("Building RE2 pattern '", sql_pattern, "'
failed"));
+
+ *holder = lholder;
+ return Status::OK();
+}
+
+const char* ExtractHolder::operator()(ExecutionContext* ctx, const char*
user_input,
+ int32_t user_input_len, int32_t
extract_index,
+ int32_t* out_length) {
+ if (extract_index < 0 || extract_index >= num_groups_pattern_) {
+ ctx->set_error_msg("Index to extract out of range");
+ *out_length = 0;
+ return "";
+ }
+
+ std::string user_input_as_str(user_input, user_input_len);
+
+ // Create the vectors that will store the arguments to be captured by the
regex
+ // groups.
+ std::vector<std::string> arguments_as_str(num_groups_pattern_);
+ std::vector<RE2::Arg> arguments(num_groups_pattern_);
+ std::vector<RE2::Arg*> arguments_ptrs(num_groups_pattern_);
+
+ for (int32_t i = 0; i < num_groups_pattern_; i++) {
+ // Bind argument to string from vector.
+ arguments[i] = &arguments_as_str[i];
+ // Save pointer to argument.
+ arguments_ptrs[i] = &arguments[i];
+ }
+
+ re2::StringPiece piece(user_input_as_str);
+ if (!RE2::FindAndConsumeN(&piece, regex_, arguments_ptrs.data(),
num_groups_pattern_)) {
+ *out_length = 0;
+ return "";
+ }
+
+ auto out_str = arguments_as_str[extract_index];
+ *out_length = static_cast<int32_t>(out_str.size());
+
+ // Handle the case where the extracted group is an empty string
+ if (*out_length == 0) {
+ return "";
+ }
+
+ char* result_buffer =
reinterpret_cast<char*>(ctx->arena()->Allocate(*out_length));
+ if (result_buffer == NULLPTR) {
+ ctx->set_error_msg("Could not allocate memory for result");
+ *out_length = 0;
+ return "";
+ }
+
+ memcpy(result_buffer, out_str.data(), *out_length);
+ return result_buffer;
+}
+
} // namespace gandiva
diff --git a/cpp/src/gandiva/replace_holder.h
b/cpp/src/gandiva/regex_functions_holder.h
similarity index 54%
rename from cpp/src/gandiva/replace_holder.h
rename to cpp/src/gandiva/regex_functions_holder.h
index 79150d7aa4..8e0fe44269 100644
--- a/cpp/src/gandiva/replace_holder.h
+++ b/cpp/src/gandiva/regex_functions_holder.h
@@ -30,6 +30,41 @@
namespace gandiva {
+/// Function Holder for SQL 'like'
+class GANDIVA_EXPORT LikeHolder : public FunctionHolder {
+ public:
+ ~LikeHolder() override = default;
+
+ static Status Make(const FunctionNode& node, std::shared_ptr<LikeHolder>*
holder);
+
+ static Status Make(const std::string& sql_pattern,
std::shared_ptr<LikeHolder>* holder);
+
+ static Status Make(const std::string& sql_pattern, const std::string&
escape_char,
+ std::shared_ptr<LikeHolder>* holder);
+
+ static Status Make(const std::string& sql_pattern,
std::shared_ptr<LikeHolder>* holder,
+ RE2::Options regex_op);
+
+ // Try and optimise a function node with a "like" pattern.
+ static const FunctionNode TryOptimize(const FunctionNode& node);
+
+ /// Return true if the data matches the pattern.
+ bool operator()(const std::string& data) { return RE2::FullMatch(data,
regex_); }
+
+ private:
+ explicit LikeHolder(const std::string& pattern) : pattern_(pattern),
regex_(pattern) {}
+
+ LikeHolder(const std::string& pattern, RE2::Options regex_op)
+ : pattern_(pattern), regex_(pattern, regex_op) {}
+
+ std::string pattern_; // posix pattern string, to help debugging
+ RE2 regex_; // compiled regex for the pattern
+
+ static RE2 starts_with_regex_; // pre-compiled pattern for matching
starts_with
+ static RE2 ends_with_regex_; // pre-compiled pattern for matching
ends_with
+ static RE2 is_substr_regex_; // pre-compiled pattern for matching
is_substr
+};
+
/// Function Holder for 'replace'
class GANDIVA_EXPORT ReplaceHolder : public FunctionHolder {
public:
@@ -94,4 +129,31 @@ class GANDIVA_EXPORT ReplaceHolder : public FunctionHolder {
RE2 regex_; // compiled regex for the pattern
};
+/// Function Holder for 'regexp_extract' function
+class GANDIVA_EXPORT ExtractHolder : public FunctionHolder {
+ public:
+ ~ExtractHolder() override = default;
+
+ static Status Make(const FunctionNode& node, std::shared_ptr<ExtractHolder>*
holder);
+
+ static Status Make(const std::string& sql_pattern,
+ std::shared_ptr<ExtractHolder>* holder);
+
+ /// Extracts the matching text from a string using a regex
+ const char* operator()(ExecutionContext* ctx, const char* user_input,
+ int32_t user_input_len, int32_t extract_index,
+ int32_t* out_length);
+
+ private:
+ // The pattern is wrapped in an outer capturing group so that the text matched
by the
+ // entire regex can be returned when the user asks for group "0". This is
needed
+ // because the RE2 library does not provide that behavior by
default.
+ explicit ExtractHolder(const std::string& pattern) : regex_("(" + pattern +
")") {
+ num_groups_pattern_ = regex_.NumberOfCapturingGroups();
+ }
+
+ RE2 regex_; // compiled regex for the pattern
+ int32_t num_groups_pattern_; // number of capturing groups in regex_, including
the outer wrapper group
+};
+
} // namespace gandiva
diff --git a/cpp/src/gandiva/regex_functions_holder_test.cc
b/cpp/src/gandiva/regex_functions_holder_test.cc
new file mode 100644
index 0000000000..584674a20f
--- /dev/null
+++ b/cpp/src/gandiva/regex_functions_holder_test.cc
@@ -0,0 +1,701 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "gandiva/regex_functions_holder.h"
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include <memory>
+#include <vector>
+#include "gandiva/regex_util.h"
+
+namespace gandiva {
+
+class TestLikeHolder : public ::testing::Test {
+ public:
+  RE2::Options regex_op;  // default options; passed to LikeHolder::Make by the tests
+  FunctionNode BuildLike(std::string pattern) {  // builds like(in, '<pattern>')
+    auto field = std::make_shared<FieldNode>(arrow::field("in", arrow::utf8()));
+    auto pattern_node =
+        std::make_shared<LiteralNode>(arrow::utf8(), LiteralHolder(pattern), false);
+    return FunctionNode("like", {field, pattern_node}, arrow::boolean());
+  }
+  // Overload that also supplies the escape character as a third, int8 literal.
+  FunctionNode BuildLike(std::string pattern, char escape_char) {
+    auto field = std::make_shared<FieldNode>(arrow::field("in", arrow::utf8()));
+    auto pattern_node =
+        std::make_shared<LiteralNode>(arrow::utf8(), LiteralHolder(pattern), false);
+    auto escape_char_node = std::make_shared<LiteralNode>(
+        arrow::int8(), LiteralHolder(static_cast<int8_t>(escape_char)), false);
+    return FunctionNode("like", {field, pattern_node, escape_char_node},
+                        arrow::boolean());
+  }
+};
+
+TEST_F(TestLikeHolder, TestMatchAny) {
+  // A trailing '%' accepts any suffix, including an empty one.
+  std::shared_ptr<LikeHolder> holder;
+  const auto make_status = LikeHolder::Make("ab%", &holder, regex_op);
+  EXPECT_TRUE(make_status.ok()) << make_status.message();
+
+  LikeHolder& matches = *holder;
+  EXPECT_TRUE(matches("ab"));
+  EXPECT_TRUE(matches("abc"));
+  EXPECT_TRUE(matches("abcd"));
+
+  // Inputs that do not begin with "ab" are rejected.
+  EXPECT_FALSE(matches("a"));
+  EXPECT_FALSE(matches("cab"));
+}
+
+TEST_F(TestLikeHolder, TestMatchOne) {
+  // '_' stands for exactly one character.
+  std::shared_ptr<LikeHolder> holder;
+  const auto make_status = LikeHolder::Make("ab_", &holder, regex_op);
+  EXPECT_TRUE(make_status.ok()) << make_status.message();
+
+  LikeHolder& matches = *holder;
+  EXPECT_TRUE(matches("abc"));
+  EXPECT_TRUE(matches("abd"));
+
+  EXPECT_FALSE(matches("a"));
+  EXPECT_FALSE(matches("abcd"));
+  EXPECT_FALSE(matches("dabc"));
+}
+
+TEST_F(TestLikeHolder, TestPcreSpecial) {
+  // Regex metacharacters inside a SQL pattern must be matched literally.
+  std::shared_ptr<LikeHolder> holder;
+  const auto make_status = LikeHolder::Make(".*ab_", &holder, regex_op);
+  EXPECT_TRUE(make_status.ok()) << make_status.message();
+
+  LikeHolder& matches = *holder;
+  EXPECT_TRUE(matches(".*abc"));  // '.' and '*' carry no special meaning in SQL LIKE
+  EXPECT_FALSE(matches("xxabc"));
+}
+
+TEST_F(TestLikeHolder, TestRegexEscape) {
+  std::string res;
+  auto status = RegexUtil::SqlLikePatternToPcre("#%hello#_abc_def##", '#', res);
+  EXPECT_TRUE(status.ok()) << status.message();
+  // Escaped '%' and '_' stay literal, a bare '_' becomes '.', and "##" becomes '#'.
+  EXPECT_EQ(res, "%hello_abc.def#");
+}
+
+TEST_F(TestLikeHolder, TestDot) {
+  // A '.' in the SQL pattern is a literal dot, not the regex "any character".
+  std::shared_ptr<LikeHolder> holder;
+  const auto make_status = LikeHolder::Make("abc.", &holder, regex_op);
+  EXPECT_TRUE(make_status.ok()) << make_status.message();
+
+  LikeHolder& matches = *holder;
+  EXPECT_FALSE(matches("abcd"));
+}
+
+TEST_F(TestLikeHolder, TestMatchSubString) {
+  std::shared_ptr<LikeHolder> holder;
+
+  auto make_status = LikeHolder::Make("%abc%", "\\", &holder);
+  EXPECT_TRUE(make_status.ok()) << make_status.message();
+
+  LikeHolder& contains_abc = *holder;
+  EXPECT_TRUE(contains_abc("abc"));
+  EXPECT_FALSE(contains_abc("xxabdc"));
+
+  // Regex metacharacters are matched literally; the escaped '%' is a literal percent.
+  make_status = LikeHolder::Make("%ab-.^$*+?()[]{}|—/c\\%%", "\\", &holder);
+  EXPECT_TRUE(make_status.ok()) << make_status.message();
+  LikeHolder& contains_reserved = *holder;
+  EXPECT_TRUE(contains_reserved("XXab-.^$*+?()[]{}|—/c%d"));
+  EXPECT_FALSE(contains_reserved("xxad-.^$*+?()[]{}|—/c"));
+}
+
+TEST_F(TestLikeHolder, TestOptimise) {
+  // "prefix%" is rewritten to 'starts_with' with the prefix as its literal
+  auto fnode = LikeHolder::TryOptimize(BuildLike("xy 123z%"));
+  EXPECT_EQ(fnode.descriptor()->name(), "starts_with");
+  EXPECT_EQ(fnode.ToString(), "bool starts_with((string) in, (const string) 'xy 123z')");
+
+  // "%suffix" is rewritten to 'ends_with'
+  fnode = LikeHolder::TryOptimize(BuildLike("%xyz"));
+  EXPECT_EQ(fnode.descriptor()->name(), "ends_with");
+  EXPECT_EQ(fnode.ToString(), "bool ends_with((string) in, (const string) 'xyz')");
+
+  // "%infix%" is rewritten to 'is_substr'
+  fnode = LikeHolder::TryOptimize(BuildLike("%abc%"));
+  EXPECT_EQ(fnode.descriptor()->name(), "is_substr");
+  EXPECT_EQ(fnode.ToString(), "bool is_substr((string) in, (const string) 'abc')");
+
+  // 'is_substr' is still chosen when the literal holds a special character
+  fnode = LikeHolder::TryOptimize(BuildLike("%ab-c%"));
+  EXPECT_EQ(fnode.descriptor()->name(), "is_substr");
+  EXPECT_EQ(fnode.ToString(), "bool is_substr((string) in, (const string) 'ab-c')");
+
+  // 'ends_with' is still chosen when the literal holds a special character
+  fnode = LikeHolder::TryOptimize(BuildLike("%ab-c"));
+  EXPECT_EQ(fnode.descriptor()->name(), "ends_with");
+  EXPECT_EQ(fnode.ToString(),
+            "bool ends_with((string) in, (const string) "
+            "'ab-c')");
+
+  // 'starts_with' is still chosen when the literal holds a special character
+  fnode = LikeHolder::TryOptimize(BuildLike("ab-c%"));
+  EXPECT_EQ(fnode.descriptor()->name(), "starts_with");
+  EXPECT_EQ(fnode.ToString(),
+            "bool starts_with((string) in, (const string) "
+            "'ab-c')");
+
+  // patterns containing the '_' wildcard are left as 'like'
+  fnode = LikeHolder::TryOptimize(BuildLike("xyz_"));
+  EXPECT_EQ(fnode.descriptor()->name(), "like");
+
+  fnode = LikeHolder::TryOptimize(BuildLike("_xyz"));
+  EXPECT_EQ(fnode.descriptor()->name(), "like");
+
+  fnode = LikeHolder::TryOptimize(BuildLike("_xyz_"));
+  EXPECT_EQ(fnode.descriptor()->name(), "like");
+
+  fnode = LikeHolder::TryOptimize(BuildLike("%xyz_"));
+  EXPECT_EQ(fnode.descriptor()->name(), "like");
+
+  fnode = LikeHolder::TryOptimize(BuildLike("x_yz%"));
+  EXPECT_EQ(fnode.descriptor()->name(), "like");
+
+  // a call that carries an escape character is never rewritten
+  fnode = LikeHolder::TryOptimize(BuildLike("\\%xyz", '\\'));
+  EXPECT_EQ(fnode.descriptor()->name(), "like");
+  EXPECT_EQ(fnode.ToString(),
+            "bool like((string) in, (const string) '\\%xyz', (const int8) \\)");
+}
+
+TEST_F(TestLikeHolder, TestMatchOneEscape) {
+  // With '\' as the escape character, "\_" matches only a literal underscore.
+  std::shared_ptr<LikeHolder> holder;
+  const auto make_status = LikeHolder::Make("ab\\_", "\\", &holder);
+  EXPECT_TRUE(make_status.ok()) << make_status.message();
+
+  LikeHolder& matches = *holder;
+
+  EXPECT_TRUE(matches("ab_"));
+
+  EXPECT_FALSE(matches("abc"));
+  EXPECT_FALSE(matches("abd"));
+  EXPECT_FALSE(matches("a"));
+  EXPECT_FALSE(matches("abcd"));
+  EXPECT_FALSE(matches("dabc"));
+}
+
+TEST_F(TestLikeHolder, TestMatchManyEscape) {
+  // With '\' as the escape character, "\%" matches only a literal percent sign.
+  std::shared_ptr<LikeHolder> holder;
+  const auto make_status = LikeHolder::Make("ab\\%", "\\", &holder);
+  EXPECT_TRUE(make_status.ok()) << make_status.message();
+
+  LikeHolder& matches = *holder;
+
+  EXPECT_TRUE(matches("ab%"));
+
+  EXPECT_FALSE(matches("abc"));
+  EXPECT_FALSE(matches("abd"));
+  EXPECT_FALSE(matches("a"));
+  EXPECT_FALSE(matches("abcd"));
+  EXPECT_FALSE(matches("dabc"));
+}
+
+TEST_F(TestLikeHolder, TestMatchEscape) {
+  // A doubled escape character stands for one literal backslash.
+  std::shared_ptr<LikeHolder> holder;
+  const auto make_status = LikeHolder::Make("ab\\\\", "\\", &holder);
+  EXPECT_TRUE(make_status.ok()) << make_status.message();
+
+  LikeHolder& matches = *holder;
+
+  EXPECT_TRUE(matches("ab\\"));
+
+  EXPECT_FALSE(matches("abc"));
+}
+
+TEST_F(TestLikeHolder, TestEmptyEscapeChar) {
+  // With no escape character the backslash is literal and '_' is still a wildcard.
+  std::shared_ptr<LikeHolder> holder;
+  const auto make_status = LikeHolder::Make("ab\\_", "", &holder);
+  EXPECT_TRUE(make_status.ok()) << make_status.message();
+
+  LikeHolder& matches = *holder;
+
+  EXPECT_TRUE(matches("ab\\c"));
+  EXPECT_TRUE(matches("ab\\_"));
+
+  EXPECT_FALSE(matches("ab\\_d"));
+  EXPECT_FALSE(matches("ab__"));
+}
+
+TEST_F(TestLikeHolder, TestMultipleEscapeChar) {
+  // An escape string longer than one character must be rejected.
+  std::shared_ptr<LikeHolder> holder;
+  const auto make_status = LikeHolder::Make("ab\\_", "\\\\", &holder);
+  EXPECT_FALSE(make_status.ok()) << make_status.message();
+}
+
+class TestILikeHolder : public ::testing::Test {
+ public:
+  RE2::Options regex_op;  // each test turns case sensitivity off before building a holder
+  FunctionNode BuildILike(std::string pattern) {  // builds ilike(in, '<pattern>')
+    auto field = std::make_shared<FieldNode>(arrow::field("in", arrow::utf8()));
+    auto pattern_node =
+        std::make_shared<LiteralNode>(arrow::utf8(), LiteralHolder(pattern), false);
+    return FunctionNode("ilike", {field, pattern_node}, arrow::boolean());
+  }
+};
+
+TEST_F(TestILikeHolder, TestMatchAny) {
+  // Case-insensitive "ab%": any casing of the "ab" prefix, then any suffix.
+  std::shared_ptr<LikeHolder> holder;
+  regex_op.set_case_sensitive(false);
+  const auto make_status = LikeHolder::Make("ab%", &holder, regex_op);
+  EXPECT_TRUE(make_status.ok()) << make_status.message();
+
+  LikeHolder& matches = *holder;
+  EXPECT_TRUE(matches("ab"));
+  EXPECT_TRUE(matches("aBc"));
+  EXPECT_TRUE(matches("ABCD"));
+
+  EXPECT_FALSE(matches("a"));
+  EXPECT_FALSE(matches("cab"));
+}
+
+TEST_F(TestILikeHolder, TestMatchOne) {
+  // Case-insensitive "Ab_": any casing of "ab" followed by exactly one character.
+  std::shared_ptr<LikeHolder> holder;
+  regex_op.set_case_sensitive(false);
+  const auto make_status = LikeHolder::Make("Ab_", &holder, regex_op);
+  EXPECT_TRUE(make_status.ok()) << make_status.message();
+
+  LikeHolder& matches = *holder;
+  EXPECT_TRUE(matches("abc"));
+  EXPECT_TRUE(matches("aBd"));
+
+  EXPECT_FALSE(matches("A"));
+  EXPECT_FALSE(matches("Abcd"));
+  EXPECT_FALSE(matches("DaBc"));
+}
+
+TEST_F(TestILikeHolder, TestPcreSpecial) {
+  // Regex metacharacters stay literal in the case-insensitive variant as well.
+  std::shared_ptr<LikeHolder> holder;
+  regex_op.set_case_sensitive(false);
+  const auto make_status = LikeHolder::Make(".*aB_", &holder, regex_op);
+  EXPECT_TRUE(make_status.ok()) << make_status.message();
+
+  LikeHolder& matches = *holder;
+  EXPECT_TRUE(matches(".*Abc"));  // '.' and '*' carry no special meaning in SQL LIKE
+  EXPECT_FALSE(matches("xxAbc"));
+}
+
+TEST_F(TestILikeHolder, TestDot) {
+  // A '.' in the pattern is a literal dot even when case is ignored.
+  std::shared_ptr<LikeHolder> holder;
+  regex_op.set_case_sensitive(false);
+  const auto make_status = LikeHolder::Make("aBc.", &holder, regex_op);
+  EXPECT_TRUE(make_status.ok()) << make_status.message();
+
+  LikeHolder& matches = *holder;
+  EXPECT_FALSE(matches("abcd"));
+}
+
+class TestReplaceHolder : public ::testing::Test {  // fixture for the ReplaceHolder tests
+ protected:
+ ExecutionContext execution_context_;  // passed to each replace call; read via has_error()
+};
+
+TEST_F(TestReplaceHolder, TestMultipleReplace) {
+  std::shared_ptr<ReplaceHolder> replace_holder;
+
+  auto status = ReplaceHolder::Make("ana", &replace_holder);
+  EXPECT_EQ(status.ok(), true) << status.message();
+
+  std::string input_string = "banana";
+  std::string replace_string;
+  int32_t out_length = 0;
+  // replace_string is empty, so every match of "ana" is simply removed.
+  auto& replace = *replace_holder;
+  const char* ret =
+      replace(&execution_context_, input_string.c_str(),
+              static_cast<int32_t>(input_string.length()), replace_string.c_str(),
+              static_cast<int32_t>(replace_string.length()), &out_length);
+  std::string ret_as_str(ret, out_length);
+  EXPECT_EQ(out_length, 3);
+  EXPECT_EQ(ret_as_str, "bna");
+
+  input_string = "bananaana";
+  // Two separate matches: b[ana]n a[ana] leaves "b", "n" and "a".
+  ret = replace(&execution_context_, input_string.c_str(),
+                static_cast<int32_t>(input_string.length()), replace_string.c_str(),
+                static_cast<int32_t>(replace_string.length()), &out_length);
+  ret_as_str = std::string(ret, out_length);
+  EXPECT_EQ(out_length, 3);
+  EXPECT_EQ(ret_as_str, "bna");
+
+  input_string = "bananana";
+  // Matches do not overlap: b[ana]n[ana] leaves "bn".
+  ret = replace(&execution_context_, input_string.c_str(),
+                static_cast<int32_t>(input_string.length()), replace_string.c_str(),
+                static_cast<int32_t>(replace_string.length()), &out_length);
+  ret_as_str = std::string(ret, out_length);
+  EXPECT_EQ(out_length, 2);
+  EXPECT_EQ(ret_as_str, "bn");
+
+  input_string = "anaana";
+  // The whole input is consumed; an empty result must not be reported as an error.
+  ret = replace(&execution_context_, input_string.c_str(),
+                static_cast<int32_t>(input_string.length()), replace_string.c_str(),
+                static_cast<int32_t>(replace_string.length()), &out_length);
+  ret_as_str = std::string(ret, out_length);
+  EXPECT_EQ(out_length, 0);
+  EXPECT_FALSE(execution_context_.has_error());
+  EXPECT_EQ(ret_as_str, "");
+}
+
+TEST_F(TestReplaceHolder, TestNoMatchPattern) {
+  std::shared_ptr<ReplaceHolder> replace_holder;
+
+  auto status = ReplaceHolder::Make("ana", &replace_holder);
+  EXPECT_EQ(status.ok(), true) << status.message();
+
+  std::string input_string = "apple";
+  std::string replace_string;
+  int32_t out_length = 0;
+  // An input without any match must come back unchanged.
+  auto& replace = *replace_holder;
+  const char* ret =
+      replace(&execution_context_, input_string.c_str(),
+              static_cast<int32_t>(input_string.length()), replace_string.c_str(),
+              static_cast<int32_t>(replace_string.length()), &out_length);
+  std::string ret_as_string(ret, out_length);
+  EXPECT_EQ(out_length, 5);
+  EXPECT_EQ(ret_as_string, "apple");
+}
+
+TEST_F(TestReplaceHolder, TestReplaceSameSize) {
+  std::shared_ptr<ReplaceHolder> replace_holder;
+
+  auto status = ReplaceHolder::Make("a", &replace_holder);
+  EXPECT_EQ(status.ok(), true) << status.message();
+
+  std::string input_string = "ananindeua";
+  std::string replace_string = "b";
+  int32_t out_length = 0;
+  // Pattern and replacement are both one character long, so the length is preserved.
+  auto& replace = *replace_holder;
+  const char* ret =
+      replace(&execution_context_, input_string.c_str(),
+              static_cast<int32_t>(input_string.length()), replace_string.c_str(),
+              static_cast<int32_t>(replace_string.length()), &out_length);
+  std::string ret_as_string(ret, out_length);
+  EXPECT_EQ(out_length, 10);
+  EXPECT_EQ(ret_as_string, "bnbnindeub");
+}
+
+TEST_F(TestReplaceHolder, TestReplaceInvalidPattern) {
+  // A lone "+" is not a valid regex, so Make must report failure.
+  std::shared_ptr<ReplaceHolder> holder;
+  const auto make_status = ReplaceHolder::Make("+", &holder);
+  EXPECT_FALSE(make_status.ok()) << make_status.message();
+
+  execution_context_.Reset();
+}
+
+// Fixture for the tests of the REGEXP_EXTRACT function (ExtractHolder)
+class TestExtractHolder : public ::testing::Test {
+ protected:
+ ExecutionContext execution_context_;  // passed to each extract call; read via has_error()
+};
+
+TEST_F(TestExtractHolder, TestSimpleExtract) {
+  std::shared_ptr<ExtractHolder> extract_holder;
+
+  // Two groups of word characters separated by one space
+  auto status = ExtractHolder::Make(R"((\w+) (\w+))", &extract_holder);
+  EXPECT_EQ(status.ok(), true) << status.message();
+
+  std::string input_string = "John Doe";
+  int32_t extract_index = 2;  // Retrieve the surname
+  int32_t out_length = 0;
+
+  auto& extract = *extract_holder;
+  const char* ret =
+      extract(&execution_context_, input_string.c_str(),
+              static_cast<int32_t>(input_string.length()), extract_index, &out_length);
+  std::string ret_as_str(ret, out_length);
+  EXPECT_EQ(out_length, 3);
+  EXPECT_EQ(ret_as_str, "Doe");
+
+  input_string = "Ringo Beast";
+  extract_index = 1;  // Retrieve the first name
+
+  ret = extract(&execution_context_, input_string.c_str(),
+                static_cast<int32_t>(input_string.length()), extract_index, &out_length);
+  ret_as_str = std::string(ret, out_length);
+  EXPECT_EQ(out_length, 5);
+  EXPECT_EQ(ret_as_str, "Ringo");
+
+  input_string = "Paul Test";
+  extract_index = 0;  // Retrieve the whole match
+
+  ret = extract(&execution_context_, input_string.c_str(),
+                static_cast<int32_t>(input_string.length()), extract_index, &out_length);
+  ret_as_str = std::string(ret, out_length);
+  EXPECT_EQ(out_length, 9);
+  EXPECT_EQ(ret_as_str, "Paul Test");
+
+  status = ExtractHolder::Make(R"((\w+) (\w+) - (\d+))", &extract_holder);
+  EXPECT_EQ(status.ok(), true) << status.message();
+
+  auto& extract2 = *extract_holder;
+  // Index 0 returns only the matched span, wherever it sits inside the input.
+  input_string = "John Doe - 124";
+  extract_index = 0;  // Retrieve the whole match
+
+  ret = extract2(&execution_context_, input_string.c_str(),
+                 static_cast<int32_t>(input_string.length()), extract_index, &out_length);
+  ret_as_str = std::string(ret, out_length);
+  EXPECT_EQ(out_length, 14);
+  EXPECT_EQ(ret_as_str, "John Doe - 124");
+
+  input_string = "John Doe - 124 MoreString";
+  extract_index = 0;  // Retrieve the whole match, without the trailing text
+
+  ret = extract2(&execution_context_, input_string.c_str(),
+                 static_cast<int32_t>(input_string.length()), extract_index, &out_length);
+  ret_as_str = std::string(ret, out_length);
+  EXPECT_EQ(out_length, 14);
+  EXPECT_EQ(ret_as_str, "John Doe - 124");
+
+  input_string = "MoreString John Doe - 124";
+  extract_index = 0;  // Retrieve the whole match, without the leading text
+
+  ret = extract2(&execution_context_, input_string.c_str(),
+                 static_cast<int32_t>(input_string.length()), extract_index, &out_length);
+  ret_as_str = std::string(ret, out_length);
+  EXPECT_EQ(out_length, 14);
+  EXPECT_EQ(ret_as_str, "John Doe - 124");
+
+  // One run of word characters nested inside two groups
+  status = ExtractHolder::Make(R"(((\w+)))", &extract_holder);
+  EXPECT_EQ(status.ok(), true) << status.message();
+
+  auto& extract_word = *extract_holder;
+
+  input_string = "路%$大a";
+  extract_index = 0;  // Retrieve the whole match
+
+  ret = extract_word(&execution_context_, input_string.c_str(),
+                     static_cast<int32_t>(input_string.length()), extract_index,
+                     &out_length);
+  ret_as_str = std::string(ret, out_length);
+  EXPECT_EQ(out_length, 1);
+  EXPECT_EQ(ret_as_str, "a");
+
+  input_string = "b路%$大";
+  extract_index = 0;  // Retrieve the whole match
+
+  ret = extract_word(&execution_context_, input_string.c_str(),
+                     static_cast<int32_t>(input_string.length()), extract_index,
+                     &out_length);
+  ret_as_str = std::string(ret, out_length);
+  EXPECT_EQ(out_length, 1);
+  EXPECT_EQ(ret_as_str, "b");
+
+  input_string = "路%c$大";
+  extract_index = 0;  // Retrieve the whole match
+
+  ret = extract_word(&execution_context_, input_string.c_str(),
+                     static_cast<int32_t>(input_string.length()), extract_index,
+                     &out_length);
+  ret_as_str = std::string(ret, out_length);
+  EXPECT_EQ(out_length, 1);
+  EXPECT_EQ(ret_as_str, "c");
+
+  input_string = "路%c$大";
+  extract_index = 1;  // Retrieve the outer group of the pattern
+
+  ret = extract_word(&execution_context_, input_string.c_str(),
+                     static_cast<int32_t>(input_string.length()), extract_index,
+                     &out_length);
+  ret_as_str = std::string(ret, out_length);
+  EXPECT_EQ(out_length, 1);
+  EXPECT_EQ(ret_as_str, "c");
+
+  input_string = "路%c$大";
+  extract_index = 2;  // Retrieve the inner group of the pattern
+
+  ret = extract_word(&execution_context_, input_string.c_str(),
+                     static_cast<int32_t>(input_string.length()), extract_index,
+                     &out_length);
+  ret_as_str = std::string(ret, out_length);
+  EXPECT_EQ(out_length, 1);
+  EXPECT_EQ(ret_as_str, "c");
+
+  input_string = "路%c$大";
+  extract_index = 3;  // One past the last group: must raise an error
+
+  ret = extract_word(&execution_context_, input_string.c_str(),
+                     static_cast<int32_t>(input_string.length()), extract_index,
+                     &out_length);
+  ret_as_str = std::string(ret, out_length);
+  EXPECT_EQ(out_length, 0);
+  EXPECT_TRUE(execution_context_.has_error());
+}
+
+TEST_F(TestExtractHolder, TestNoMatches) {
+  std::shared_ptr<ExtractHolder> extract_holder;
+
+  // Two groups of word characters separated by one space
+  auto status = ExtractHolder::Make(R"((\w+) (\w+))", &extract_holder);
+  EXPECT_EQ(status.ok(), true) << status.message();
+
+  std::string input_string = "John";
+  int32_t extract_index = 2;  // The regex will not match the input string
+  int32_t out_length = 0;
+
+  auto& extract = *extract_holder;
+  const char* ret =
+      extract(&execution_context_, input_string.c_str(),
+              static_cast<int32_t>(input_string.length()), extract_index, &out_length);
+  std::string ret_as_str(ret, out_length);
+  EXPECT_EQ(out_length, 0);  // no match gives an empty result, not an error
+  EXPECT_FALSE(execution_context_.has_error());
+
+  // Pattern to match only numbers
+  status = ExtractHolder::Make(R"(\d+)", &extract_holder);
+  EXPECT_EQ(status.ok(), true) << status.message();
+
+  auto& extract_numbers = *extract_holder;
+
+  input_string = "12345";
+  extract_index = 0;  // Retrieve the whole match
+
+  ret = extract_numbers(&execution_context_, input_string.c_str(),
+                        static_cast<int32_t>(input_string.length()), extract_index,
+                        &out_length);
+  ret_as_str = std::string(ret, out_length);
+  EXPECT_EQ(out_length, 5);
+  EXPECT_EQ(ret_as_str, "12345");
+
+  input_string = "12345A";
+  extract_index = 0;  // Retrieve the whole match; the trailing letter is left out
+
+  ret = extract_numbers(&execution_context_, input_string.c_str(),
+                        static_cast<int32_t>(input_string.length()), extract_index,
+                        &out_length);
+  ret_as_str = std::string(ret, out_length);
+  EXPECT_EQ(out_length, 5);
+  EXPECT_FALSE(execution_context_.has_error());
+  EXPECT_EQ(ret_as_str, "12345");
+}
+
+TEST_F(TestExtractHolder, TestInvalidRange) {
+  std::shared_ptr<ExtractHolder> extract_holder;
+
+  // Two groups of word characters separated by one space
+  auto status = ExtractHolder::Make(R"((\w+) (\w+))", &extract_holder);
+  EXPECT_EQ(status.ok(), true) << status.message();
+
+  std::string input_string = "John Doe";
+  int32_t extract_index = -1;  // negative indexes are invalid
+  int32_t out_length = 0;
+
+  auto& extract = *extract_holder;
+  const char* ret =
+      extract(&execution_context_, input_string.c_str(),
+              static_cast<int32_t>(input_string.length()), extract_index, &out_length);
+  std::string ret_as_str(ret, out_length);
+  EXPECT_EQ(out_length, 0);
+  EXPECT_TRUE(execution_context_.has_error());
+
+  execution_context_.Reset();
+
+  // The test regex has two capturing groups, so the highest index
+  // allowed for it is 2
+  extract_index = 3;
+
+  ret = extract(&execution_context_, input_string.c_str(),
+                static_cast<int32_t>(input_string.length()), extract_index, &out_length);
+  ret_as_str = std::string(ret, out_length);
+  EXPECT_EQ(out_length, 0);
+  EXPECT_TRUE(execution_context_.has_error());
+
+  execution_context_.Reset();
+}
+
+TEST_F(TestExtractHolder, TestExtractInvalidPattern) {
+  // A lone "+" is not a valid regex, so Make must report failure.
+  std::shared_ptr<ExtractHolder> holder;
+  const auto make_status = ExtractHolder::Make("+", &holder);
+  EXPECT_FALSE(make_status.ok()) << make_status.message();
+
+  execution_context_.Reset();
+}
+
+TEST_F(TestExtractHolder, TestErrorWhileBuildingHolder) {
+  std::shared_ptr<ExtractHolder> extract_holder;
+
+  // Create function with incorrect number of params (two instead of three)
+  auto field = std::make_shared<FieldNode>(arrow::field("in", arrow::utf8()));
+  auto pattern_node = std::make_shared<LiteralNode>(
+      arrow::utf8(), LiteralHolder(R"((\w+) (\w+))"), false);
+  auto function_node =
+      FunctionNode("regexp_extract", {field, pattern_node}, arrow::utf8());
+
+  auto status = ExtractHolder::Make(function_node, &extract_holder);
+  EXPECT_EQ(status.ok(), false);
+  EXPECT_THAT(status.message(),
+              ::testing::HasSubstr("'extract' function requires three parameters"));
+
+  execution_context_.Reset();
+
+  // Create function with non-utf8 literal parameter as pattern
+  field = std::make_shared<FieldNode>(arrow::field("in", arrow::utf8()));
+  pattern_node = std::make_shared<LiteralNode>(arrow::int32(), LiteralHolder(2), false);
+  auto index_node = std::make_shared<FieldNode>(arrow::field("idx", arrow::int32()));
+  function_node =
+      FunctionNode("regexp_extract", {field, pattern_node, index_node}, arrow::utf8());
+
+  status = ExtractHolder::Make(function_node, &extract_holder);
+  EXPECT_EQ(status.ok(), false);
+  EXPECT_THAT(status.message(),
+              ::testing::HasSubstr(
+                  "'extract' function requires a literal as the second parameter"));
+
+  execution_context_.Reset();
+
+  // Create function that passes a field, not a literal, as the pattern
+  field = std::make_shared<FieldNode>(arrow::field("in", arrow::utf8()));
+  auto pattern_as_node =
+      std::make_shared<FieldNode>(arrow::field("pattern", arrow::utf8()));
+  index_node = std::make_shared<FieldNode>(arrow::field("idx", arrow::int32()));
+  function_node =
+      FunctionNode("regexp_extract", {field, pattern_as_node, index_node}, arrow::utf8());
+
+  status = ExtractHolder::Make(function_node, &extract_holder);
+  EXPECT_EQ(status.ok(), false);
+  EXPECT_THAT(status.message(),
+              ::testing::HasSubstr(
+                  "'extract' function requires a literal as the second parameter"));
+
+  execution_context_.Reset();
+}
+
+} // namespace gandiva
diff --git a/cpp/src/gandiva/replace_holder.cc
b/cpp/src/gandiva/replace_holder.cc
deleted file mode 100644
index 1bcbe13802..0000000000
--- a/cpp/src/gandiva/replace_holder.cc
+++ /dev/null
@@ -1,61 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "gandiva/replace_holder.h"
-
-#include "gandiva/node.h"
-#include "gandiva/regex_util.h"
-
-namespace gandiva {
-
-Status ReplaceHolder::Make(const FunctionNode& node,
- std::shared_ptr<ReplaceHolder>* holder) {
- ARROW_RETURN_IF(node.children().size() != 3,
- Status::Invalid("'replace' function requires three
parameters"));
-
- auto literal = dynamic_cast<LiteralNode*>(node.children().at(1).get());
- ARROW_RETURN_IF(
- literal == nullptr,
- Status::Invalid("'replace' function requires a literal as the second
parameter"));
-
- auto literal_type = literal->return_type()->id();
- ARROW_RETURN_IF(
- !(literal_type == arrow::Type::STRING || literal_type ==
arrow::Type::BINARY),
- Status::Invalid(
- "'replace' function requires a string literal as the second
parameter"));
-
- return Make(arrow::util::get<std::string>(literal->holder()), holder);
-}
-
-Status ReplaceHolder::Make(const std::string& sql_pattern,
- std::shared_ptr<ReplaceHolder>* holder) {
- auto lholder = std::shared_ptr<ReplaceHolder>(new
ReplaceHolder(sql_pattern));
- ARROW_RETURN_IF(!lholder->regex_.ok(),
- Status::Invalid("Building RE2 pattern '", sql_pattern, "'
failed"));
-
- *holder = lholder;
- return Status::OK();
-}
-
-void ReplaceHolder::return_error(ExecutionContext* context, std::string& data,
- std::string& replace_string) {
- std::string err_msg = "Error replacing '" + replace_string + "' on the given
string '" +
- data + "' for the given pattern: " + pattern_;
- context->set_error_msg(err_msg.c_str());
-}
-
-} // namespace gandiva
diff --git a/cpp/src/gandiva/replace_holder_test.cc
b/cpp/src/gandiva/replace_holder_test.cc
deleted file mode 100644
index b0830d4f00..0000000000
--- a/cpp/src/gandiva/replace_holder_test.cc
+++ /dev/null
@@ -1,129 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#include "gandiva/replace_holder.h"
-
-#include <gtest/gtest.h>
-
-#include <memory>
-#include <vector>
-
-namespace gandiva {
-
-class TestReplaceHolder : public ::testing::Test {
- protected:
- ExecutionContext execution_context_;
-};
-
-TEST_F(TestReplaceHolder, TestMultipleReplace) {
- std::shared_ptr<ReplaceHolder> replace_holder;
-
- auto status = ReplaceHolder::Make("ana", &replace_holder);
- EXPECT_EQ(status.ok(), true) << status.message();
-
- std::string input_string = "banana";
- std::string replace_string;
- int32_t out_length = 0;
-
- auto& replace = *replace_holder;
- const char* ret =
- replace(&execution_context_, input_string.c_str(),
- static_cast<int32_t>(input_string.length()),
replace_string.c_str(),
- static_cast<int32_t>(replace_string.length()), &out_length);
- std::string ret_as_str(ret, out_length);
- EXPECT_EQ(out_length, 3);
- EXPECT_EQ(ret_as_str, "bna");
-
- input_string = "bananaana";
-
- ret = replace(&execution_context_, input_string.c_str(),
- static_cast<int32_t>(input_string.length()),
replace_string.c_str(),
- static_cast<int32_t>(replace_string.length()), &out_length);
- ret_as_str = std::string(ret, out_length);
- EXPECT_EQ(out_length, 3);
- EXPECT_EQ(ret_as_str, "bna");
-
- input_string = "bananana";
-
- ret = replace(&execution_context_, input_string.c_str(),
- static_cast<int32_t>(input_string.length()),
replace_string.c_str(),
- static_cast<int32_t>(replace_string.length()), &out_length);
- ret_as_str = std::string(ret, out_length);
- EXPECT_EQ(out_length, 2);
- EXPECT_EQ(ret_as_str, "bn");
-
- input_string = "anaana";
-
- ret = replace(&execution_context_, input_string.c_str(),
- static_cast<int32_t>(input_string.length()),
replace_string.c_str(),
- static_cast<int32_t>(replace_string.length()), &out_length);
- ret_as_str = std::string(ret, out_length);
- EXPECT_EQ(out_length, 0);
- EXPECT_FALSE(execution_context_.has_error());
- EXPECT_EQ(ret_as_str, "");
-}
-
-TEST_F(TestReplaceHolder, TestNoMatchPattern) {
- std::shared_ptr<ReplaceHolder> replace_holder;
-
- auto status = ReplaceHolder::Make("ana", &replace_holder);
- EXPECT_EQ(status.ok(), true) << status.message();
-
- std::string input_string = "apple";
- std::string replace_string;
- int32_t out_length = 0;
-
- auto& replace = *replace_holder;
- const char* ret =
- replace(&execution_context_, input_string.c_str(),
- static_cast<int32_t>(input_string.length()),
replace_string.c_str(),
- static_cast<int32_t>(replace_string.length()), &out_length);
- std::string ret_as_string(ret, out_length);
- EXPECT_EQ(out_length, 5);
- EXPECT_EQ(ret_as_string, "apple");
-}
-
-TEST_F(TestReplaceHolder, TestReplaceSameSize) {
- std::shared_ptr<ReplaceHolder> replace_holder;
-
- auto status = ReplaceHolder::Make("a", &replace_holder);
- EXPECT_EQ(status.ok(), true) << status.message();
-
- std::string input_string = "ananindeua";
- std::string replace_string = "b";
- int32_t out_length = 0;
-
- auto& replace = *replace_holder;
- const char* ret =
- replace(&execution_context_, input_string.c_str(),
- static_cast<int32_t>(input_string.length()),
replace_string.c_str(),
- static_cast<int32_t>(replace_string.length()), &out_length);
- std::string ret_as_string(ret, out_length);
- EXPECT_EQ(out_length, 10);
- EXPECT_EQ(ret_as_string, "bnbnindeub");
-}
-
-TEST_F(TestReplaceHolder, TestReplaceInvalidPattern) {
- std::shared_ptr<ReplaceHolder> replace_holder;
-
- auto status = ReplaceHolder::Make("+", &replace_holder);
- EXPECT_EQ(status.ok(), false) << status.message();
-
- execution_context_.Reset();
-}
-
-} // namespace gandiva
diff --git a/cpp/src/gandiva/tests/projector_test.cc
b/cpp/src/gandiva/tests/projector_test.cc
index 06c1dbdf08..93fda6973a 100644
--- a/cpp/src/gandiva/tests/projector_test.cc
+++ b/cpp/src/gandiva/tests/projector_test.cc
@@ -2624,4 +2624,54 @@ TEST_F(TestProjector, TestNextDay) {
// Validate results
EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
}
+
+TEST_F(TestProjector, TestRegexpExtract) {
+ // Input schema: f0 is the utf8 text to search, f1 is the int32 capture-group index
+ auto field0 = field("f0", arrow::utf8());
+ auto field1 = field("f1", arrow::int32());
+ auto schema = arrow::schema({field0, field1});
+
+ // Output field that receives the extracted utf8 text
+ auto field_extract = field("extract", arrow::utf8());
+
+ // Pattern with three capture groups, laid out as: <word> <word> - <digits>
+ std::string pattern(R"((\w+) (\w+) - (\d+))");
+ auto literal = TreeExprBuilder::MakeStringLiteral(pattern);
+ auto node0 = TreeExprBuilder::MakeField(field0);
+ auto node1 = TreeExprBuilder::MakeField(field1);
+
+ // Build the expression regexp_extract(f0, pattern, f1) with a utf8 result
+ auto regexp_extract_func = TreeExprBuilder::MakeFunction(
+ "regexp_extract", {node0, literal, node1}, arrow::utf8());
+ auto extract_expr = TreeExprBuilder::MakeExpression(regexp_extract_func,
field_extract);
+
+ std::shared_ptr<Projector> projector;
+ auto status = Projector::Make(schema, {extract_expr}, TestConfiguration(),
&projector);
+ EXPECT_TRUE(status.ok()) << status.message();
+
+ // Sample rows: groups 1-3, index 0, text around the match, and a non-matching string
+ int num_records = 7;
+ auto array0 = MakeArrowArrayUtf8(
+ {"John Doe - 124", "John Doe - 124", "John Doe - 124", "John Doe - 124",
+ "John Doe - 124 MoreString", "MoreString John Doe - 124",
"stringthatdonotmatch"},
+ {true, true, true, true, true, true, true});
+ auto array1 = MakeArrowArrayInt32({1, 2, 3, 0, 0, 3, 0},
+ {true, true, true, true, true, true,
true});
+ // Expected: index 0 gives the whole matched text; a non-matching row gives ""
+ auto exp_extract = MakeArrowArrayUtf8(
+ {"John", "Doe", "124", "John Doe - 124", "John Doe - 124", "124", ""},
+ {true, true, true, true, true, true, true});
+
+ // Assemble the input record batch from both columns
+ auto in = arrow::RecordBatch::Make(schema, num_records, {array0, array1});
+
+ // Evaluate the expression over the batch
+ arrow::ArrayVector outputs;
+ status = projector->Evaluate(*in, pool_, &outputs);
+ EXPECT_TRUE(status.ok()) << status.message();
+
+ // The single output column must equal the expected extraction results
+ EXPECT_ARROW_ARRAY_EQUALS(exp_extract, outputs.at(0));
+}
+
} // namespace gandiva