This is an automated email from the ASF dual-hosted git repository.
praveenbingo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new a06a0f4 ARROW-9328: [C++][Gandiva] Add LTRIM, RTRIM, BTRIM functions for string
a06a0f4 is described below
commit a06a0f4c6b3268bbbc8da77521f4a229d77a9c94
Author: Sagnik Chakraborty <[email protected]>
AuthorDate: Thu Jul 23 17:57:28 2020 +0530
ARROW-9328: [C++][Gandiva] Add LTRIM, RTRIM, BTRIM functions for string
Closes #7641 from sagnikc-dremio/master and squashes the following commits:
4a9985fc5 <Sagnik Chakraborty> ARROW-9328: Add LTRIM, RTRIM, BTRIM functions for string
Authored-by: Sagnik Chakraborty <[email protected]>
Signed-off-by: Praveen <[email protected]>
---
cpp/src/gandiva/function_registry_string.cc | 13 +-
cpp/src/gandiva/precompiled/string_ops.cc | 189 ++++++++++++++++--
cpp/src/gandiva/precompiled/string_ops_test.cc | 258 ++++++++++++++++++++++++-
cpp/src/gandiva/precompiled/types.h | 24 ++-
4 files changed, 461 insertions(+), 23 deletions(-)
diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc
index 436168d..dd32c19 100644
--- a/cpp/src/gandiva/function_registry_string.cc
+++ b/cpp/src/gandiva/function_registry_string.cc
@@ -55,7 +55,9 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
UNARY_UNSAFE_NULL_IF_NULL(length, {}, utf8, int32),
UNARY_UNSAFE_NULL_IF_NULL(lengthUtf8, {}, binary, int32),
UNARY_UNSAFE_NULL_IF_NULL(reverse, {}, utf8, utf8),
- UNARY_UNSAFE_NULL_IF_NULL(trim, {}, utf8, utf8),
+ UNARY_UNSAFE_NULL_IF_NULL(ltrim, {}, utf8, utf8),
+ UNARY_UNSAFE_NULL_IF_NULL(rtrim, {}, utf8, utf8),
+ UNARY_UNSAFE_NULL_IF_NULL(btrim, {}, utf8, utf8),
UNARY_SAFE_NULL_NEVER_BOOL_FN(isnull, {}),
UNARY_SAFE_NULL_NEVER_BOOL_FN(isnotnull, {}),
@@ -83,6 +85,15 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
kResultNullIfNull, "gdv_fn_like_utf8_utf8",
NativeFunction::kNeedsFunctionHolder),
+ NativeFunction("ltrim", {}, DataTypeVector{utf8(), utf8()}, utf8(),
+ kResultNullIfNull, "ltrim_utf8_utf8", NativeFunction::kNeedsContext),
+
+ NativeFunction("rtrim", {}, DataTypeVector{utf8(), utf8()}, utf8(),
+ kResultNullIfNull, "rtrim_utf8_utf8", NativeFunction::kNeedsContext),
+
+ NativeFunction("btrim", {}, DataTypeVector{utf8(), utf8()}, utf8(),
+ kResultNullIfNull, "btrim_utf8_utf8", NativeFunction::kNeedsContext),
+
NativeFunction("substr", {"substring"},
DataTypeVector{utf8(), int64() /*offset*/, int64() /*length*/},
utf8(), kResultNullIfNull, "substr_utf8_int64_int64",
diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc
index 102532c..f6ef79c 100644
--- a/cpp/src/gandiva/precompiled/string_ops.cc
+++ b/cpp/src/gandiva/precompiled/string_ops.cc
@@ -16,7 +16,6 @@
// under the License.
// String functions
-
#include "arrow/util/value_parsing.h"
extern "C" {
@@ -286,10 +285,48 @@ const char* reverse_utf8(gdv_int64 context, const char* data, gdv_int32 data_len
return ret;
}
-// Trim a utf8 sequence
+// Trims whitespaces from the left end of the input utf8 sequence
+FORCE_INLINE
+const char* ltrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
+ int32_t* out_len) {
+ if (data_len == 0) {
+ *out_len = 0;
+ return "";
+ }
+
+ gdv_int32 start = 0;
+ // start denotes the first position of non-space characters in the input string
+ while (start < data_len && data[start] == ' ') {
+ ++start;
+ }
+
+ *out_len = data_len - start;
+ return data + start;
+}
+
+// Trims whitespaces from the right end of the input utf8 sequence
FORCE_INLINE
-const char* trim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
- int32_t* out_len) {
+const char* rtrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
+ int32_t* out_len) {
+ if (data_len == 0) {
+ *out_len = 0;
+ return "";
+ }
+
+ gdv_int32 end = data_len - 1;
+ // end denotes the last position of non-space characters in the input string
+ while (end >= 0 && data[end] == ' ') {
+ --end;
+ }
+
+ *out_len = end + 1;
+ return data;
+}
+
+// Trims whitespaces from both the ends of the input utf8 sequence
+FORCE_INLINE
+const char* btrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
+ int32_t* out_len) {
if (data_len == 0) {
*out_len = 0;
return "";
@@ -305,21 +342,145 @@ const char* trim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
--end;
}
- // string with no leading/trailing spaces, return original string
- if (start == 0 && end == data_len - 1) {
- *out_len = data_len;
- return data;
+ // string has some leading/trailing spaces and some non-space characters
+ *out_len = end - start + 1;
+ return data + start;
+}
+
+// Trims characters present in the trim text from the left end of the base text
+FORCE_INLINE
+const char* ltrim_utf8_utf8(gdv_int64 context, const char* basetext,
+ gdv_int32 basetext_len, const char* trimtext,
+ gdv_int32 trimtext_len, int32_t* out_len) {
+ if (basetext_len == 0) {
+ *out_len = 0;
+ return "";
+ } else if (trimtext_len == 0) {
+ *out_len = basetext_len;
+ return basetext;
+ }
+
+ gdv_int32 start_ptr, char_len;
+ // scan the base text from left to right and increment the start pointer till
+ // there is a character which is not present in the trim text
+ for (start_ptr = 0; start_ptr < basetext_len; start_ptr += char_len) {
+ char_len = utf8_char_length(basetext[start_ptr]);
+ if (char_len == 0 || start_ptr + char_len > basetext_len) {
+ // invalid byte or incomplete glyph
+ set_error_for_invalid_utf(context, basetext[start_ptr]);
+ *out_len = 0;
+ return "";
+ }
+ if (!is_substr_utf8_utf8(trimtext, trimtext_len, basetext + start_ptr, char_len)) {
+ break;
+ }
}
- // string with all spaces
- if (start > end) {
+ *out_len = basetext_len - start_ptr;
+ return basetext + start_ptr;
+}
+
+// Trims characters present in the trim text from the right end of the base text
+FORCE_INLINE
+const char* rtrim_utf8_utf8(gdv_int64 context, const char* basetext,
+ gdv_int32 basetext_len, const char* trimtext,
+ gdv_int32 trimtext_len, int32_t* out_len) {
+ if (basetext_len == 0) {
*out_len = 0;
return "";
+ } else if (trimtext_len == 0) {
+ *out_len = basetext_len;
+ return basetext;
+ }
+
+ gdv_int32 char_len, end_ptr, byte_cnt = 1;
+ // scan the base text from right to left and decrement the end pointer till
+ // there is a character which is not present in the trim text
+ for (end_ptr = basetext_len - 1; end_ptr >= 0; --end_ptr) {
+ char_len = utf8_char_length(basetext[end_ptr]);
+ if (char_len == 0) { // trailing bytes of multibyte character
+ ++byte_cnt;
+ continue;
+ }
+ // this is the first byte of a character, hence check if char_len = char_cnt
+ if (byte_cnt != char_len) { // invalid byte or incomplete glyph
+ set_error_for_invalid_utf(context, basetext[end_ptr]);
+ *out_len = 0;
+ return "";
+ }
+ byte_cnt = 1; // reset the counter*/
+ if (!is_substr_utf8_utf8(trimtext, trimtext_len, basetext + end_ptr, char_len)) {
+ break;
+ }
}
- // string has some leading/trailing spaces and some non-space characters
- *out_len = end - start + 1;
- return data + start;
+ // when all characters in the basetext are part of the trimtext
+ if (end_ptr == -1) {
+ *out_len = 0;
+ return "";
+ }
+
+ end_ptr += utf8_char_length(basetext[end_ptr]); // point to the next character
+ *out_len = end_ptr;
+ return basetext;
+}
+
+// Trims characters present in the trim text from both ends of the base text
+FORCE_INLINE
+const char* btrim_utf8_utf8(gdv_int64 context, const char* basetext,
+ gdv_int32 basetext_len, const char* trimtext,
+ gdv_int32 trimtext_len, int32_t* out_len) {
+ if (basetext_len == 0) {
+ *out_len = 0;
+ return "";
+ } else if (trimtext_len == 0) {
+ *out_len = basetext_len;
+ return basetext;
+ }
+
+ gdv_int32 start_ptr, end_ptr, char_len, byte_cnt = 1;
+ // scan the base text from left to right and increment the start and decrement the
+ // end pointers till there are characters which are not present in the trim text
+ for (start_ptr = 0; start_ptr < basetext_len; start_ptr += char_len) {
+ char_len = utf8_char_length(basetext[start_ptr]);
+ if (char_len == 0 || start_ptr + char_len > basetext_len) {
+ // invalid byte or incomplete glyph
+ set_error_for_invalid_utf(context, basetext[start_ptr]);
+ *out_len = 0;
+ return "";
+ }
+ if (!is_substr_utf8_utf8(trimtext, trimtext_len, basetext + start_ptr, char_len)) {
+ break;
+ }
+ }
+ for (end_ptr = basetext_len - 1; end_ptr >= start_ptr; --end_ptr) {
+ char_len = utf8_char_length(basetext[end_ptr]);
+ if (char_len == 0) { // trailing byte in multibyte character
+ ++byte_cnt;
+ continue;
+ }
+ // this is the first byte of a character, hence check if char_len = char_cnt
+ if (byte_cnt != char_len) { // invalid byte or incomplete glyph
+ set_error_for_invalid_utf(context, basetext[end_ptr]);
+ *out_len = 0;
+ return "";
+ }
+ byte_cnt = 1; // reset the counter*/
+ if (!is_substr_utf8_utf8(trimtext, trimtext_len, basetext + end_ptr, char_len)) {
+ break;
+ }
+ }
+
+ // when all characters are trimmed, start_ptr has been incremented to basetext_len and
+ // end_ptr still points to basetext_len - 1, hence we need to handle this case
+ if (start_ptr > end_ptr) {
+ *out_len = 0;
+ return "";
+ }
+
+ end_ptr += utf8_char_length(basetext[end_ptr]); // point to the next character
+ *out_len = end_ptr - start_ptr;
+ return basetext + start_ptr;
}
// Truncates the string to given length
@@ -680,7 +841,7 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text,
int32_t len) { \
gdv_##OUT_TYPE val = 0; \
int32_t trimmed_len; \
- data = trim_utf8(context, data, len, &trimmed_len); \
+ data = btrim_utf8(context, data, len, &trimmed_len); \
if (!arrow::internal::StringConverter<ARROW_TYPE>::Convert(data, trimmed_len, \
&val)) { \
std::string err = "Failed to cast the string " + std::string(data, trimmed_len) + \
diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc
index 3d90fcc..88345d5 100644
--- a/cpp/src/gandiva/precompiled/string_ops_test.cc
+++ b/cpp/src/gandiva/precompiled/string_ops_test.cc
@@ -426,29 +426,275 @@ TEST(TestStringOps, TestReverse) {
ctx.Reset();
}
-TEST(TestStringOps, TestTrim) {
+TEST(TestStringOps, TestLtrim) {
gandiva::ExecutionContext ctx;
uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
gdv_int32 out_len = 0;
const char* out_str;
- out_str = trim_utf8(ctx_ptr, "TestString", 10, &out_len);
+ out_str = ltrim_utf8(ctx_ptr, "TestString ", 12, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestString ");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = ltrim_utf8(ctx_ptr, " TestString ", 18, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestString ");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = ltrim_utf8(ctx_ptr, " Test çåå†bD", 18, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "Test çåå†bD");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = ltrim_utf8(ctx_ptr, "", 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = ltrim_utf8(ctx_ptr, " ", 6, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = ltrim_utf8_utf8(ctx_ptr, "", 0, "TestString", 10, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = ltrim_utf8_utf8(ctx_ptr, "TestString", 10, "", 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestString");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = ltrim_utf8_utf8(ctx_ptr, "abcbbaccabbcdef", 15, "abc", 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "def");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = ltrim_utf8_utf8(ctx_ptr, "abcbbaccabbcdef", 15, "ababbac", 7, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "def");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = ltrim_utf8_utf8(ctx_ptr, "ååçåå†eç†Dd", 21, "çåå†", 9, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "eç†Dd");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = ltrim_utf8_utf8(ctx_ptr, "ç†ååçåå†", 18, "çåå†", 9, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+
+ std::string d(
+ "aa\xc3"
+ "bcd");
+ out_str =
+ ltrim_utf8_utf8(ctx_ptr, d.data(), static_cast<int>(d.length()), "a", 1, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len),
+ "\xc3"
+ "bcd");
+ EXPECT_FALSE(ctx.has_error());
+
+ std::string e(
+ "åå\xe0\xa0"
+ "bcd");
+ out_str =
+ ltrim_utf8_utf8(ctx_ptr, e.data(), static_cast<int>(e.length()), "å", 2, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len),
+ "\xE0\xa0"
+ "bcd");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = ltrim_utf8_utf8(ctx_ptr, "TestString", 10, "abcd", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestString");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = ltrim_utf8_utf8(ctx_ptr, "acbabbcabb", 10, "abcbd", 5, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+}
+
+TEST(TestStringOps, TestRtrim) {
+ gandiva::ExecutionContext ctx;
+ uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+ gdv_int32 out_len = 0;
+ const char* out_str;
+
+ out_str = rtrim_utf8(ctx_ptr, " TestString", 12, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), " TestString");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = rtrim_utf8(ctx_ptr, " TestString ", 18, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), " TestString");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = rtrim_utf8(ctx_ptr, "Test çåå†bD ", 20, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "Test çåå†bD");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = rtrim_utf8(ctx_ptr, "", 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = rtrim_utf8(ctx_ptr, " ", 6, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = rtrim_utf8_utf8(ctx_ptr, "", 0, "TestString", 10, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = rtrim_utf8_utf8(ctx_ptr, "TestString", 10, "", 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestString");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = rtrim_utf8_utf8(ctx_ptr, "TestString", 10, "ring", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestSt");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = rtrim_utf8_utf8(ctx_ptr, "defabcbbaccabbc", 15, "abc", 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "def");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = rtrim_utf8_utf8(ctx_ptr, "defabcbbaccabbc", 15, "ababbac", 7, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "def");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = rtrim_utf8_utf8(ctx_ptr, "eDdç†ååçåå†", 21, "çåå†", 9, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "eDd");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = rtrim_utf8_utf8(ctx_ptr, "ç†ååçåå†", 18, "çåå†", 9, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+
+ std::string d(
+ "\xc3"
+ "aaa");
+ out_str =
+ rtrim_utf8_utf8(ctx_ptr, d.data(), static_cast<int>(d.length()), "a", 1, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_TRUE(ctx.has_error());
+ ctx.Reset();
+
+ std::string e(
+ "\xe0\xa0"
+ "åå");
+ out_str =
+ rtrim_utf8_utf8(ctx_ptr, e.data(), static_cast<int>(e.length()), "å", 2, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_TRUE(ctx.has_error());
+ ctx.Reset();
+
+ out_str = rtrim_utf8_utf8(ctx_ptr, "åeçå", 7, "çå", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "åe");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = rtrim_utf8_utf8(ctx_ptr, "TestString", 10, "abcd", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestString");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = rtrim_utf8_utf8(ctx_ptr, "acbabbcabb", 10, "abcbd", 5, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+}
+
+TEST(TestStringOps, TestBtrim) {
+ gandiva::ExecutionContext ctx;
+ uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+ gdv_int32 out_len = 0;
+ const char* out_str;
+
+ out_str = btrim_utf8(ctx_ptr, "TestString", 10, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "TestString");
EXPECT_FALSE(ctx.has_error());
- out_str = trim_utf8(ctx_ptr, " TestString ", 18, &out_len);
+ out_str = btrim_utf8(ctx_ptr, " TestString ", 18, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "TestString");
EXPECT_FALSE(ctx.has_error());
- out_str = trim_utf8(ctx_ptr, " Test çåå†bD ", 21, &out_len);
+ out_str = btrim_utf8(ctx_ptr, " Test çåå†bD ", 21, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "Test çåå†bD");
EXPECT_FALSE(ctx.has_error());
- out_str = trim_utf8(ctx_ptr, "", 0, &out_len);
+ out_str = btrim_utf8(ctx_ptr, "", 0, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "");
EXPECT_FALSE(ctx.has_error());
- out_str = trim_utf8(ctx_ptr, " ", 6, &out_len);
+ out_str = btrim_utf8(ctx_ptr, " ", 6, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = btrim_utf8_utf8(ctx_ptr, "", 0, "TestString", 10, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = btrim_utf8_utf8(ctx_ptr, "TestString", 10, "Test", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "String");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = btrim_utf8_utf8(ctx_ptr, "TestString", 10, "String", 6, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "Tes");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = btrim_utf8_utf8(ctx_ptr, "TestString", 10, "", 0, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestString");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = btrim_utf8_utf8(ctx_ptr, "abcbbadefccabbc", 15, "abc", 3, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "def");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = btrim_utf8_utf8(ctx_ptr, "abcbbadefccabbc", 15, "ababbac", 7, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "def");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = btrim_utf8_utf8(ctx_ptr, "ååçåå†Ddeç†", 21, "çåå†", 9, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "Dde");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = btrim_utf8_utf8(ctx_ptr, "ç†ååçåå†", 18, "çåå†", 9, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_FALSE(ctx.has_error());
+ ctx.Reset();
+
+ std::string d(
+ "acd\xc3"
+ "aaa");
+ out_str =
+ btrim_utf8_utf8(ctx_ptr, d.data(), static_cast<int>(d.length()), "a", 1, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_TRUE(ctx.has_error());
+ ctx.Reset();
+
+ std::string e(
+ "åbc\xe0\xa0"
+ "åå");
+ out_str =
+ btrim_utf8_utf8(ctx_ptr, e.data(), static_cast<int>(e.length()), "å", 2, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "");
+ EXPECT_TRUE(ctx.has_error());
+ ctx.Reset();
+
+ std::string f(
+ "aa\xc3"
+ "bcd");
+ out_str =
+ btrim_utf8_utf8(ctx_ptr, f.data(), static_cast<int>(f.length()), "a", 1, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len),
+ "\xc3"
+ "bcd");
+ EXPECT_FALSE(ctx.has_error());
+
+ std::string g(
+ "åå\xe0\xa0"
+ "bcå");
+ out_str =
+ btrim_utf8_utf8(ctx_ptr, g.data(), static_cast<int>(g.length()), "å", 2, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len),
+ "\xe0\xa0"
+ "bc");
+
+ out_str = btrim_utf8_utf8(ctx_ptr, "åe†çå", 10, "çå", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "e†");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = btrim_utf8_utf8(ctx_ptr, "TestString", 10, "abcd", 4, &out_len);
+ EXPECT_EQ(std::string(out_str, out_len), "TestString");
+ EXPECT_FALSE(ctx.has_error());
+
+ out_str = btrim_utf8_utf8(ctx_ptr, "acbabbcabb", 10, "abcbd", 5, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "");
EXPECT_FALSE(ctx.has_error());
}
diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h
index 3a30dae..77f1589 100644
--- a/cpp/src/gandiva/precompiled/types.h
+++ b/cpp/src/gandiva/precompiled/types.h
@@ -165,6 +165,8 @@ bool is_substr_utf8_utf8(const char* data, gdv_int32 data_len, const char* subst
gdv_int32 utf8_length(gdv_int64 context, const char* data, gdv_int32 data_len);
+gdv_int32 utf8_last_char_pos(gdv_int64 context, const char* data, gdv_int32 data_len);
+
gdv_date64 castDATE_utf8(int64_t execution_context, const char* input,
gdv_int32 length);
gdv_date64 castDATE_int64(gdv_int64 date);
@@ -200,8 +202,26 @@ const char* lower_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
const char* reverse_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
int32_t* out_len);
-const char* trim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
- int32_t* out_len);
+const char* ltrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
+ int32_t* out_len);
+
+const char* rtrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
+ int32_t* out_len);
+
+const char* btrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
+ int32_t* out_len);
+
+const char* ltrim_utf8_utf8(gdv_int64 context, const char* basetext,
+ gdv_int32 basetext_len, const char* trimtext,
+ gdv_int32 trimtext_len, int32_t* out_len);
+
+const char* rtrim_utf8_utf8(gdv_int64 context, const char* basetext,
+ gdv_int32 basetext_len, const char* trimtext,
+ gdv_int32 trimtext_len, int32_t* out_len);
+
+const char* btrim_utf8_utf8(gdv_int64 context, const char* basetext,
+ gdv_int32 basetext_len, const char* trimtext,
+ gdv_int32 trimtext_len, int32_t* out_len);
gdv_int32 locate_utf8_utf8(gdv_int64 context, const char* sub_str, gdv_int32 sub_str_len,
const char* str, gdv_int32 str_len);