[GitHub] [arrow] projjal commented on a change in pull request #7641: ARROW-9328: [C++][Gandiva] Add LTRIM, RTRIM, BTRIM functions for string

GitBox Thu, 23 Jul 2020 01:58:35 -0700


projjal commented on a change in pull request #7641:
URL: https://github.com/apache/arrow/pull/7641#discussion_r459302381




##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -305,21 +342,161 @@ const char* trim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
     --end;
   }
 
-  // string with no leading/trailing spaces, return original string
-  if (start == 0 && end == data_len - 1) {
-    *out_len = data_len;
-    return data;
+  // string has some leading/trailing spaces and some non-space characters
+  *out_len = end - start + 1;
+  return data + start;
+}
+
+// Trims characters present in the trim text from the left end of the base text
+FORCE_INLINE
+const char* ltrim_utf8_utf8(gdv_int64 context, const char* basetext,
+                            gdv_int32 basetext_len, const char* trimtext,
+                            gdv_int32 trimtext_len, int32_t* out_len) {
+  if (basetext_len == 0) {
+    *out_len = 0;
+    return "";
+  } else if (trimtext_len == 0) {
+    *out_len = basetext_len;
+    return basetext;
+  }
+
+  gdv_int32 start_ptr, char_len;
+  // scan the base text from left to right and increment the start pointer till
+  // there is a character which is not present in the trim text
+  for (start_ptr = 0; start_ptr < basetext_len; start_ptr += char_len) {
+    char_len = utf8_char_length(basetext[start_ptr]);
+    if (char_len == 0 || start_ptr + char_len > basetext_len) {
+      // invalid byte or incomplete glyph
+      set_error_for_invalid_utf(context, basetext[start_ptr]);
+      *out_len = 0;
+      return "";
+    }
+    for (gdv_int32 char_ptr = 1; char_ptr < char_len; ++char_ptr) {
+      if ((basetext[start_ptr + char_ptr] & 0xC0) != 0x80) {
+        // bytes following head-byte of glyph
+        set_error_for_invalid_utf(context, basetext[start_ptr + char_ptr]);
+        *out_len = 0;
+        return "";
+      }
+    }
+    if (!is_substr_utf8_utf8(trimtext, trimtext_len, basetext + start_ptr, char_len)) {
+      break;
+    }
   }
 
-  // string with all spaces
-  if (start > end) {
+  *out_len = basetext_len - start_ptr;
+  return basetext + start_ptr;
+}
+
+// Trims characters present in the trim text from the right end of the base text
+FORCE_INLINE
+const char* rtrim_utf8_utf8(gdv_int64 context, const char* basetext,
+                            gdv_int32 basetext_len, const char* trimtext,
+                            gdv_int32 trimtext_len, int32_t* out_len) {
+  if (basetext_len == 0) {
     *out_len = 0;
     return "";
+  } else if (trimtext_len == 0) {
+    *out_len = basetext_len;
+    return basetext;
+  }
+
+  gdv_int32 char_len, end_ptr, char_cnt = 1;
+  // scan the base text from right to left and decrement the end pointer till
+  // there is a character which is not present in the trim text
+  for (end_ptr = basetext_len - 1; end_ptr >= 0; --end_ptr) {
+    char_len = utf8_char_length(basetext[end_ptr]);
+    if (char_len == 0) {  // trailing bytes of multibyte character
+      ++char_cnt;

Review comment:
       `byte_cnt` would be a better name than `char_cnt`, since this variable counts bytes rather than characters.

##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -305,21 +342,161 @@ const char* trim_utf8(gdv_int64 context, const char* data, gdv_int32 data_len,
     --end;
   }
 
-  // string with no leading/trailing spaces, return original string
-  if (start == 0 && end == data_len - 1) {
-    *out_len = data_len;
-    return data;
+  // string has some leading/trailing spaces and some non-space characters
+  *out_len = end - start + 1;
+  return data + start;
+}
+
+// Trims characters present in the trim text from the left end of the base text
+FORCE_INLINE
+const char* ltrim_utf8_utf8(gdv_int64 context, const char* basetext,
+                            gdv_int32 basetext_len, const char* trimtext,
+                            gdv_int32 trimtext_len, int32_t* out_len) {
+  if (basetext_len == 0) {
+    *out_len = 0;
+    return "";
+  } else if (trimtext_len == 0) {
+    *out_len = basetext_len;
+    return basetext;
+  }
+
+  gdv_int32 start_ptr, char_len;
+  // scan the base text from left to right and increment the start pointer till
+  // there is a character which is not present in the trim text
+  for (start_ptr = 0; start_ptr < basetext_len; start_ptr += char_len) {
+    char_len = utf8_char_length(basetext[start_ptr]);
+    if (char_len == 0 || start_ptr + char_len > basetext_len) {
+      // invalid byte or incomplete glyph
+      set_error_for_invalid_utf(context, basetext[start_ptr]);
+      *out_len = 0;
+      return "";
+    }
+    for (gdv_int32 char_ptr = 1; char_ptr < char_len; ++char_ptr) {

Review comment:
       I am not sure this check is really needed. Verifying every byte seems
like extra overhead unless this function is really meant to validate the entire
UTF-8 string. The check at L388 is required because otherwise it may crash.
   If you do think it is required, we should also do this in rtrim so that
both functions behave consistently.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] projjal commented on a change in pull request #7641: ARROW-9328: [C++][Gandiva] Add LTRIM, RTRIM, BTRIM functions for string

Reply via email to