[arrow] branch master updated: ARROW-9328: [C++][Gandiva] Add LTRIM, RTRIM, BTRIM functions for string

praveenbingo Thu, 23 Jul 2020 05:29:13 -0700

This is an automated email from the ASF dual-hosted git repository.

praveenbingo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/master by this push:
     new a06a0f4  ARROW-9328: [C++][Gandiva] Add LTRIM, RTRIM, BTRIM functions 
for string
a06a0f4 is described below

commit a06a0f4c6b3268bbbc8da77521f4a229d77a9c94
Author: Sagnik Chakraborty <[email protected]>
AuthorDate: Thu Jul 23 17:57:28 2020 +0530

    ARROW-9328: [C++][Gandiva] Add LTRIM, RTRIM, BTRIM functions for string
    
    Closes #7641 from sagnikc-dremio/master and squashes the following commits:
    
    4a9985fc5 <Sagnik Chakraborty> ARROW-9328:  Add LTRIM, RTRIM, BTRIM 
functions for string
    
    Authored-by: Sagnik Chakraborty <[email protected]>
    Signed-off-by: Praveen <[email protected]>
---
 cpp/src/gandiva/function_registry_string.cc    |  13 +-
 cpp/src/gandiva/precompiled/string_ops.cc      | 189 ++++++++++++++++--
 cpp/src/gandiva/precompiled/string_ops_test.cc | 258 ++++++++++++++++++++++++-
 cpp/src/gandiva/precompiled/types.h            |  24 ++-
 4 files changed, 461 insertions(+), 23 deletions(-)

diff --git a/cpp/src/gandiva/function_registry_string.cc 
b/cpp/src/gandiva/function_registry_string.cc
index 436168d..dd32c19 100644
--- a/cpp/src/gandiva/function_registry_string.cc
+++ b/cpp/src/gandiva/function_registry_string.cc
@@ -55,7 +55,9 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
       UNARY_UNSAFE_NULL_IF_NULL(length, {}, utf8, int32),
       UNARY_UNSAFE_NULL_IF_NULL(lengthUtf8, {}, binary, int32),
       UNARY_UNSAFE_NULL_IF_NULL(reverse, {}, utf8, utf8),
-      UNARY_UNSAFE_NULL_IF_NULL(trim, {}, utf8, utf8),
+      UNARY_UNSAFE_NULL_IF_NULL(ltrim, {}, utf8, utf8),
+      UNARY_UNSAFE_NULL_IF_NULL(rtrim, {}, utf8, utf8),
+      UNARY_UNSAFE_NULL_IF_NULL(btrim, {}, utf8, utf8),
 
       UNARY_SAFE_NULL_NEVER_BOOL_FN(isnull, {}),
       UNARY_SAFE_NULL_NEVER_BOOL_FN(isnotnull, {}),
@@ -83,6 +85,15 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
                      kResultNullIfNull, "gdv_fn_like_utf8_utf8",
                      NativeFunction::kNeedsFunctionHolder),
 
+      NativeFunction("ltrim", {}, DataTypeVector{utf8(), utf8()}, utf8(),
+                     kResultNullIfNull, "ltrim_utf8_utf8", 
NativeFunction::kNeedsContext),
+
+      NativeFunction("rtrim", {}, DataTypeVector{utf8(), utf8()}, utf8(),
+                     kResultNullIfNull, "rtrim_utf8_utf8", 
NativeFunction::kNeedsContext),
+
+      NativeFunction("btrim", {}, DataTypeVector{utf8(), utf8()}, utf8(),
+                     kResultNullIfNull, "btrim_utf8_utf8", 
NativeFunction::kNeedsContext),
+
       NativeFunction("substr", {"substring"},
                      DataTypeVector{utf8(), int64() /*offset*/, int64() 
/*length*/},
                      utf8(), kResultNullIfNull, "substr_utf8_int64_int64",
diff --git a/cpp/src/gandiva/precompiled/string_ops.cc 
b/cpp/src/gandiva/precompiled/string_ops.cc
index 102532c..f6ef79c 100644
--- a/cpp/src/gandiva/precompiled/string_ops.cc
+++ b/cpp/src/gandiva/precompiled/string_ops.cc
@@ -16,7 +16,6 @@
 // under the License.
 
 // String functions
-
 #include "arrow/util/value_parsing.h"
 
 extern "C" {
@@ -286,10 +285,48 @@ const char* reverse_utf8(gdv_int64 context, const char* 
data, gdv_int32 data_len
   return ret;
 }
 
-// Trim a utf8 sequence
+// Trims whitespace from the left end of the input utf8 sequence
+FORCE_INLINE
+const char* ltrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
+                       int32_t* out_len) {
+  if (data_len == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  gdv_int32 start = 0;
+  // start denotes the first position of non-space characters in the input 
string
+  while (start < data_len && data[start] == ' ') {
+    ++start;
+  }
+
+  *out_len = data_len - start;
+  return data + start;
+}
+
+// Trims whitespace from the right end of the input utf8 sequence
 FORCE_INLINE
-const char* trim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
-                      int32_t* out_len) {
+const char* rtrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
+                       int32_t* out_len) {
+  if (data_len == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  gdv_int32 end = data_len - 1;
+  // end denotes the last position of non-space characters in the input string
+  while (end >= 0 && data[end] == ' ') {
+    --end;
+  }
+
+  *out_len = end + 1;
+  return data;
+}
+
+// Trims whitespace from both ends of the input utf8 sequence
+FORCE_INLINE
+const char* btrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
+                       int32_t* out_len) {
   if (data_len == 0) {
     *out_len = 0;
     return "";
@@ -305,21 +342,145 @@ const char* trim_utf8(gdv_int64 context, const char* 
data, gdv_int32 data_len,
     --end;
   }
 
-  // string with no leading/trailing spaces, return original string
-  if (start == 0 && end == data_len - 1) {
-    *out_len = data_len;
-    return data;
+  // string has some leading/trailing spaces and some non-space characters
+  *out_len = end - start + 1;
+  return data + start;
+}
+
+// Trims characters present in the trim text from the left end of the base text
+FORCE_INLINE
+const char* ltrim_utf8_utf8(gdv_int64 context, const char* basetext,
+                            gdv_int32 basetext_len, const char* trimtext,
+                            gdv_int32 trimtext_len, int32_t* out_len) {
+  if (basetext_len == 0) {
+    *out_len = 0;
+    return "";
+  } else if (trimtext_len == 0) {
+    *out_len = basetext_len;
+    return basetext;
+  }
+
+  gdv_int32 start_ptr, char_len;
+  // scan the base text from left to right and increment the start pointer till
+  // there is a character which is not present in the trim text
+  for (start_ptr = 0; start_ptr < basetext_len; start_ptr += char_len) {
+    char_len = utf8_char_length(basetext[start_ptr]);
+    if (char_len == 0 || start_ptr + char_len > basetext_len) {
+      // invalid byte or incomplete glyph
+      set_error_for_invalid_utf(context, basetext[start_ptr]);
+      *out_len = 0;
+      return "";
+    }
+    if (!is_substr_utf8_utf8(trimtext, trimtext_len, basetext + start_ptr, 
char_len)) {
+      break;
+    }
   }
 
-  // string with all spaces
-  if (start > end) {
+  *out_len = basetext_len - start_ptr;
+  return basetext + start_ptr;
+}
+
+// Trims characters present in the trim text from the right end of the base 
text
+FORCE_INLINE
+const char* rtrim_utf8_utf8(gdv_int64 context, const char* basetext,
+                            gdv_int32 basetext_len, const char* trimtext,
+                            gdv_int32 trimtext_len, int32_t* out_len) {
+  if (basetext_len == 0) {
     *out_len = 0;
     return "";
+  } else if (trimtext_len == 0) {
+    *out_len = basetext_len;
+    return basetext;
+  }
+
+  gdv_int32 char_len, end_ptr, byte_cnt = 1;
+  // scan the base text from right to left and decrement the end pointer till
+  // there is a character which is not present in the trim text
+  for (end_ptr = basetext_len - 1; end_ptr >= 0; --end_ptr) {
+    char_len = utf8_char_length(basetext[end_ptr]);
+    if (char_len == 0) {  // trailing bytes of multibyte character
+      ++byte_cnt;
+      continue;
+    }
+    // this is the first byte of a character, hence check if char_len == 
byte_cnt
+    if (byte_cnt != char_len) {  // invalid byte or incomplete glyph
+      set_error_for_invalid_utf(context, basetext[end_ptr]);
+      *out_len = 0;
+      return "";
+    }
+    byte_cnt = 1;  // reset the counter
+    if (!is_substr_utf8_utf8(trimtext, trimtext_len, basetext + end_ptr, 
char_len)) {
+      break;
+    }
   }
 
-  // string has some leading/trailing spaces and some non-space characters
-  *out_len = end - start + 1;
-  return data + start;
+  // when all characters in the basetext are part of the trimtext
+  if (end_ptr == -1) {
+    *out_len = 0;
+    return "";
+  }
+
+  end_ptr += utf8_char_length(basetext[end_ptr]);  // point to the next 
character
+  *out_len = end_ptr;
+  return basetext;
+}
+
+// Trims characters present in the trim text from both ends of the base text
+FORCE_INLINE
+const char* btrim_utf8_utf8(gdv_int64 context, const char* basetext,
+                            gdv_int32 basetext_len, const char* trimtext,
+                            gdv_int32 trimtext_len, int32_t* out_len) {
+  if (basetext_len == 0) {
+    *out_len = 0;
+    return "";
+  } else if (trimtext_len == 0) {
+    *out_len = basetext_len;
+    return basetext;
+  }
+
+  gdv_int32 start_ptr, end_ptr, char_len, byte_cnt = 1;
+  // advance the start pointer from the left, then move the end pointer back
+  // from the right, until each reaches a character not present in the trim text
+  for (start_ptr = 0; start_ptr < basetext_len; start_ptr += char_len) {
+    char_len = utf8_char_length(basetext[start_ptr]);
+    if (char_len == 0 || start_ptr + char_len > basetext_len) {
+      // invalid byte or incomplete glyph
+      set_error_for_invalid_utf(context, basetext[start_ptr]);
+      *out_len = 0;
+      return "";
+    }
+    if (!is_substr_utf8_utf8(trimtext, trimtext_len, basetext + start_ptr, 
char_len)) {
+      break;
+    }
+  }
+  for (end_ptr = basetext_len - 1; end_ptr >= start_ptr; --end_ptr) {
+    char_len = utf8_char_length(basetext[end_ptr]);
+    if (char_len == 0) {  // trailing byte in multibyte character
+      ++byte_cnt;
+      continue;
+    }
+    // this is the first byte of a character, hence check if char_len == 
byte_cnt
+    if (byte_cnt != char_len) {  // invalid byte or incomplete glyph
+      set_error_for_invalid_utf(context, basetext[end_ptr]);
+      *out_len = 0;
+      return "";
+    }
+    byte_cnt = 1;  // reset the counter
+    if (!is_substr_utf8_utf8(trimtext, trimtext_len, basetext + end_ptr, 
char_len)) {
+      break;
+    }
+  }
+
+  // when all characters are trimmed, start_ptr has been incremented to 
basetext_len and
+  // end_ptr still points to basetext_len - 1, hence we need to handle this 
case
+  if (start_ptr > end_ptr) {
+    *out_len = 0;
+    return "";
+  }
+
+  end_ptr += utf8_char_length(basetext[end_ptr]);  // point to the next 
character
+  *out_len = end_ptr - start_ptr;
+  return basetext + start_ptr;
 }
 
 // Truncates the string to given length
@@ -680,7 +841,7 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context, const 
char* text,
                                         int32_t len) {                         
         \
     gdv_##OUT_TYPE val = 0;                                                    
         \
     int32_t trimmed_len;                                                       
         \
-    data = trim_utf8(context, data, len, &trimmed_len);                        
         \
+    data = btrim_utf8(context, data, len, &trimmed_len);                       
         \
     if (!arrow::internal::StringConverter<ARROW_TYPE>::Convert(data, 
trimmed_len,       \
                                                                &val)) {        
         \
       std::string err = "Failed to cast the string " + std::string(data, 
trimmed_len) + \
diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc 
b/cpp/src/gandiva/precompiled/string_ops_test.cc
index 3d90fcc..88345d5 100644
--- a/cpp/src/gandiva/precompiled/string_ops_test.cc
+++ b/cpp/src/gandiva/precompiled/string_ops_test.cc
@@ -426,29 +426,275 @@ TEST(TestStringOps, TestReverse) {
   ctx.Reset();
 }
 
-TEST(TestStringOps, TestTrim) {
+TEST(TestStringOps, TestLtrim) {
   gandiva::ExecutionContext ctx;
   uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
   gdv_int32 out_len = 0;
   const char* out_str;
 
-  out_str = trim_utf8(ctx_ptr, "TestString", 10, &out_len);
+  out_str = ltrim_utf8(ctx_ptr, "TestString  ", 12, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString  ");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = ltrim_utf8(ctx_ptr, "      TestString  ", 18, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString  ");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = ltrim_utf8(ctx_ptr, " Test  çåå†bD", 18, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Test  çåå†bD");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = ltrim_utf8(ctx_ptr, "", 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = ltrim_utf8(ctx_ptr, "      ", 6, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = ltrim_utf8_utf8(ctx_ptr, "", 0, "TestString", 10, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = ltrim_utf8_utf8(ctx_ptr, "TestString", 10, "", 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = ltrim_utf8_utf8(ctx_ptr, "abcbbaccabbcdef", 15, "abc", 3, 
&out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "def");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = ltrim_utf8_utf8(ctx_ptr, "abcbbaccabbcdef", 15, "ababbac", 7, 
&out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "def");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = ltrim_utf8_utf8(ctx_ptr, "ååçåå†eç†Dd", 21, "çåå†", 9, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "eç†Dd");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = ltrim_utf8_utf8(ctx_ptr, "ç†ååçåå†", 18, "çåå†", 9, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  std::string d(
+      "aa\xc3"
+      "bcd");
+  out_str =
+      ltrim_utf8_utf8(ctx_ptr, d.data(), static_cast<int>(d.length()), "a", 1, 
&out_len);
+  EXPECT_EQ(std::string(out_str, out_len),
+            "\xc3"
+            "bcd");
+  EXPECT_FALSE(ctx.has_error());
+
+  std::string e(
+      "åå\xe0\xa0"
+      "bcd");
+  out_str =
+      ltrim_utf8_utf8(ctx_ptr, e.data(), static_cast<int>(e.length()), "å", 2, 
&out_len);
+  EXPECT_EQ(std::string(out_str, out_len),
+            "\xE0\xa0"
+            "bcd");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = ltrim_utf8_utf8(ctx_ptr, "TestString", 10, "abcd", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = ltrim_utf8_utf8(ctx_ptr, "acbabbcabb", 10, "abcbd", 5, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+}
+
+TEST(TestStringOps, TestRtrim) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+  gdv_int32 out_len = 0;
+  const char* out_str;
+
+  out_str = rtrim_utf8(ctx_ptr, "  TestString", 12, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "  TestString");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = rtrim_utf8(ctx_ptr, "  TestString      ", 18, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "  TestString");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = rtrim_utf8(ctx_ptr, "Test  çåå†bD   ", 20, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Test  çåå†bD");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = rtrim_utf8(ctx_ptr, "", 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = rtrim_utf8(ctx_ptr, "      ", 6, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = rtrim_utf8_utf8(ctx_ptr, "", 0, "TestString", 10, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = rtrim_utf8_utf8(ctx_ptr, "TestString", 10, "", 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = rtrim_utf8_utf8(ctx_ptr, "TestString", 10, "ring", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestSt");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = rtrim_utf8_utf8(ctx_ptr, "defabcbbaccabbc", 15, "abc", 3, 
&out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "def");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = rtrim_utf8_utf8(ctx_ptr, "defabcbbaccabbc", 15, "ababbac", 7, 
&out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "def");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = rtrim_utf8_utf8(ctx_ptr, "eDdç†ååçåå†", 21, "çåå†", 9, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "eDd");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = rtrim_utf8_utf8(ctx_ptr, "ç†ååçåå†", 18, "çåå†", 9, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  std::string d(
+      "\xc3"
+      "aaa");
+  out_str =
+      rtrim_utf8_utf8(ctx_ptr, d.data(), static_cast<int>(d.length()), "a", 1, 
&out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_TRUE(ctx.has_error());
+  ctx.Reset();
+
+  std::string e(
+      "\xe0\xa0"
+      "åå");
+  out_str =
+      rtrim_utf8_utf8(ctx_ptr, e.data(), static_cast<int>(e.length()), "å", 2, 
&out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_TRUE(ctx.has_error());
+  ctx.Reset();
+
+  out_str = rtrim_utf8_utf8(ctx_ptr, "åeçå", 7, "çå", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "åe");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = rtrim_utf8_utf8(ctx_ptr, "TestString", 10, "abcd", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = rtrim_utf8_utf8(ctx_ptr, "acbabbcabb", 10, "abcbd", 5, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+}
+
+TEST(TestStringOps, TestBtrim) {
+  gandiva::ExecutionContext ctx;
+  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
+  gdv_int32 out_len = 0;
+  const char* out_str;
+
+  out_str = btrim_utf8(ctx_ptr, "TestString", 10, &out_len);
   EXPECT_EQ(std::string(out_str, out_len), "TestString");
   EXPECT_FALSE(ctx.has_error());
 
-  out_str = trim_utf8(ctx_ptr, "      TestString  ", 18, &out_len);
+  out_str = btrim_utf8(ctx_ptr, "      TestString  ", 18, &out_len);
   EXPECT_EQ(std::string(out_str, out_len), "TestString");
   EXPECT_FALSE(ctx.has_error());
 
-  out_str = trim_utf8(ctx_ptr, " Test  çåå†bD   ", 21, &out_len);
+  out_str = btrim_utf8(ctx_ptr, " Test  çåå†bD   ", 21, &out_len);
   EXPECT_EQ(std::string(out_str, out_len), "Test  çåå†bD");
   EXPECT_FALSE(ctx.has_error());
 
-  out_str = trim_utf8(ctx_ptr, "", 0, &out_len);
+  out_str = btrim_utf8(ctx_ptr, "", 0, &out_len);
   EXPECT_EQ(std::string(out_str, out_len), "");
   EXPECT_FALSE(ctx.has_error());
 
-  out_str = trim_utf8(ctx_ptr, "      ", 6, &out_len);
+  out_str = btrim_utf8(ctx_ptr, "      ", 6, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = btrim_utf8_utf8(ctx_ptr, "", 0, "TestString", 10, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = btrim_utf8_utf8(ctx_ptr, "TestString", 10, "Test", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "String");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = btrim_utf8_utf8(ctx_ptr, "TestString", 10, "String", 6, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Tes");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = btrim_utf8_utf8(ctx_ptr, "TestString", 10, "", 0, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = btrim_utf8_utf8(ctx_ptr, "abcbbadefccabbc", 15, "abc", 3, 
&out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "def");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = btrim_utf8_utf8(ctx_ptr, "abcbbadefccabbc", 15, "ababbac", 7, 
&out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "def");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = btrim_utf8_utf8(ctx_ptr, "ååçåå†Ddeç†", 21, "çåå†", 9, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Dde");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = btrim_utf8_utf8(ctx_ptr, "ç†ååçåå†", 18, "çåå†", 9, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_FALSE(ctx.has_error());
+  ctx.Reset();
+
+  std::string d(
+      "acd\xc3"
+      "aaa");
+  out_str =
+      btrim_utf8_utf8(ctx_ptr, d.data(), static_cast<int>(d.length()), "a", 1, 
&out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_TRUE(ctx.has_error());
+  ctx.Reset();
+
+  std::string e(
+      "åbc\xe0\xa0"
+      "åå");
+  out_str =
+      btrim_utf8_utf8(ctx_ptr, e.data(), static_cast<int>(e.length()), "å", 2, 
&out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "");
+  EXPECT_TRUE(ctx.has_error());
+  ctx.Reset();
+
+  std::string f(
+      "aa\xc3"
+      "bcd");
+  out_str =
+      btrim_utf8_utf8(ctx_ptr, f.data(), static_cast<int>(f.length()), "a", 1, 
&out_len);
+  EXPECT_EQ(std::string(out_str, out_len),
+            "\xc3"
+            "bcd");
+  EXPECT_FALSE(ctx.has_error());
+
+  std::string g(
+      "åå\xe0\xa0"
+      "bcå");
+  out_str =
+      btrim_utf8_utf8(ctx_ptr, g.data(), static_cast<int>(g.length()), "å", 2, 
&out_len);
+  EXPECT_EQ(std::string(out_str, out_len),
+            "\xe0\xa0"
+            "bc");
+
+  out_str = btrim_utf8_utf8(ctx_ptr, "åe†çå", 10, "çå", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "e†");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = btrim_utf8_utf8(ctx_ptr, "TestString", 10, "abcd", 4, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "TestString");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = btrim_utf8_utf8(ctx_ptr, "acbabbcabb", 10, "abcbd", 5, &out_len);
   EXPECT_EQ(std::string(out_str, out_len), "");
   EXPECT_FALSE(ctx.has_error());
 }
diff --git a/cpp/src/gandiva/precompiled/types.h 
b/cpp/src/gandiva/precompiled/types.h
index 3a30dae..77f1589 100644
--- a/cpp/src/gandiva/precompiled/types.h
+++ b/cpp/src/gandiva/precompiled/types.h
@@ -165,6 +165,8 @@ bool is_substr_utf8_utf8(const char* data, gdv_int32 
data_len, const char* subst
 
 gdv_int32 utf8_length(gdv_int64 context, const char* data, gdv_int32 data_len);
 
+gdv_int32 utf8_last_char_pos(gdv_int64 context, const char* data, gdv_int32 
data_len);
+
 gdv_date64 castDATE_utf8(int64_t execution_context, const char* input, 
gdv_int32 length);
 
 gdv_date64 castDATE_int64(gdv_int64 date);
@@ -200,8 +202,26 @@ const char* lower_utf8(gdv_int64 context, const char* 
data, gdv_int32 data_len,
 const char* reverse_utf8(gdv_int64 context, const char* data, gdv_int32 
data_len,
                          int32_t* out_len);
 
-const char* trim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
-                      int32_t* out_len);
+const char* ltrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
+                       int32_t* out_len);
+
+const char* rtrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
+                       int32_t* out_len);
+
+const char* btrim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
+                       int32_t* out_len);
+
+const char* ltrim_utf8_utf8(gdv_int64 context, const char* basetext,
+                            gdv_int32 basetext_len, const char* trimtext,
+                            gdv_int32 trimtext_len, int32_t* out_len);
+
+const char* rtrim_utf8_utf8(gdv_int64 context, const char* basetext,
+                            gdv_int32 basetext_len, const char* trimtext,
+                            gdv_int32 trimtext_len, int32_t* out_len);
+
+const char* btrim_utf8_utf8(gdv_int64 context, const char* basetext,
+                            gdv_int32 basetext_len, const char* trimtext,
+                            gdv_int32 trimtext_len, int32_t* out_len);
 
 gdv_int32 locate_utf8_utf8(gdv_int64 context, const char* sub_str, gdv_int32 
sub_str_len,
                            const char* str, gdv_int32 str_len);

[arrow] branch master updated: ARROW-9328: [C++][Gandiva] Add LTRIM, RTRIM, BTRIM functions for string

Reply via email to