[GitHub] [arrow] projjal commented on a change in pull request #10155: ARROW-12534: [C++][Gandiva] Implement LEFT and RIGHT functions on Gandiva for string input values

GitBox Wed, 28 Apr 2021 05:41:28 -0700


projjal commented on a change in pull request #10155:
URL: https://github.com/apache/arrow/pull/10155#discussion_r622128465




##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1481,6 +1481,82 @@ const char* split_part(gdv_int64 context, const char* 
text, gdv_int32 text_len,
   return "";
 }
 
+FORCE_INLINE
+const char* left(gdv_int64 context, const char* text, gdv_int32 text_len,
+                 gdv_int32 number, gdv_int32* out_len) {
+  // returns the 'number' left most characters of a given text
+  if (text_len == 0 || number == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // initially counts the number of utf8 characters in the defined text
+  int32_t charCount = utf8_length(context, text, text_len);
+  // charCount is zero if input has invalid utf8 char
+  if (charCount == 0) {

Review comment:
       I think it would be better to treat each invalid byte as a single char
and pad accordingly, instead of returning an empty string.

##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1481,6 +1481,82 @@ const char* split_part(gdv_int64 context, const char* 
text, gdv_int32 text_len,
   return "";
 }
 
+FORCE_INLINE
+const char* left(gdv_int64 context, const char* text, gdv_int32 text_len,
+                 gdv_int32 number, gdv_int32* out_len) {
+  // returns the 'number' left most characters of a given text
+  if (text_len == 0 || number == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // initially counts the number of utf8 characters in the defined text
+  int32_t charCount = utf8_length(context, text, text_len);
+  // charCount is zero if input has invalid utf8 char
+  if (charCount == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  int32_t endCharPos;  // the char result end position (inclusive)
+  if (number > 0) {
+    // case where left('abc', 5) -> 'abc'
+    endCharPos = (charCount < number) ? charCount : number;
+  } else if (number < 0) {
+    // case where left('abc', -5) ==> ''
+    endCharPos = (charCount + number > 0) ? charCount + number : 0;
+  } else {
+    endCharPos = 0;
+  }
+
+  *out_len = utf8_byte_pos(context, text, text_len, endCharPos);
+  return text;
+}
+
+FORCE_INLINE
+const char* right(gdv_int64 context, const char* text, gdv_int32 text_len,
+                  gdv_int32 number, gdv_int32* out_len) {
+  // returns the 'number' left most characters of a given text
+  if (text_len == 0 || number == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // initially counts the number of utf8 characters in the defined text
+  int32_t charCount = utf8_length(context, text, text_len);
+  // charCount is zero if input has invalid utf8 char
+  if (charCount == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  int32_t startCharPos;  // the char result start position (inclusive)
+  int32_t endCharLen;    // the char result end position (inclusive)
+  if (number > 0) {
+    // case where right('abc', 5) ==> 'abc' startCharPos=1.
+    startCharPos = (charCount - number + 1 > 1) ? charCount - number + 1 : 1;

Review comment:
       The condition can be simplified to (charCount > number). Also, why use
1-based indexing here?

##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1481,6 +1481,82 @@ const char* split_part(gdv_int64 context, const char* 
text, gdv_int32 text_len,
   return "";
 }
 
+FORCE_INLINE
+const char* left(gdv_int64 context, const char* text, gdv_int32 text_len,
+                 gdv_int32 number, gdv_int32* out_len) {
+  // returns the 'number' left most characters of a given text
+  if (text_len == 0 || number == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // initially counts the number of utf8 characters in the defined text
+  int32_t charCount = utf8_length(context, text, text_len);
+  // charCount is zero if input has invalid utf8 char
+  if (charCount == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  int32_t endCharPos;  // the char result end position (inclusive)
+  if (number > 0) {
+    // case where left('abc', 5) -> 'abc'
+    endCharPos = (charCount < number) ? charCount : number;
+  } else if (number < 0) {
+    // case where left('abc', -5) ==> ''
+    endCharPos = (charCount + number > 0) ? charCount + number : 0;
+  } else {
+    endCharPos = 0;
+  }
+
+  *out_len = utf8_byte_pos(context, text, text_len, endCharPos);
+  return text;
+}
+
+FORCE_INLINE
+const char* right(gdv_int64 context, const char* text, gdv_int32 text_len,
+                  gdv_int32 number, gdv_int32* out_len) {
+  // returns the 'number' left most characters of a given text
+  if (text_len == 0 || number == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // initially counts the number of utf8 characters in the defined text
+  int32_t charCount = utf8_length(context, text, text_len);
+  // charCount is zero if input has invalid utf8 char
+  if (charCount == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  int32_t startCharPos;  // the char result start position (inclusive)
+  int32_t endCharLen;    // the char result end position (inclusive)
+  if (number > 0) {
+    // case where right('abc', 5) ==> 'abc' startCharPos=1.
+    startCharPos = (charCount - number + 1 > 1) ? charCount - number + 1 : 1;
+    endCharLen = charCount - startCharPos + 1;
+  } else {
+    startCharPos = ((number > 0) ? number : number * -1) + 1;
+    endCharLen = charCount - startCharPos + 1;
+  }

Review comment:
       Can you add a comment at the top of each function detailing all the cases?

##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1481,6 +1481,82 @@ const char* split_part(gdv_int64 context, const char* 
text, gdv_int32 text_len,
   return "";
 }
 
+FORCE_INLINE
+const char* left(gdv_int64 context, const char* text, gdv_int32 text_len,
+                 gdv_int32 number, gdv_int32* out_len) {
+  // returns the 'number' left most characters of a given text
+  if (text_len == 0 || number == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // initially counts the number of utf8 characters in the defined text
+  int32_t charCount = utf8_length(context, text, text_len);
+  // charCount is zero if input has invalid utf8 char
+  if (charCount == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  int32_t endCharPos;  // the char result end position (inclusive)
+  if (number > 0) {
+    // case where left('abc', 5) -> 'abc'
+    endCharPos = (charCount < number) ? charCount : number;
+  } else if (number < 0) {
+    // case where left('abc', -5) ==> ''
+    endCharPos = (charCount + number > 0) ? charCount + number : 0;
+  } else {
+    endCharPos = 0;
+  }
+
+  *out_len = utf8_byte_pos(context, text, text_len, endCharPos);
+  return text;
+}
+
+FORCE_INLINE
+const char* right(gdv_int64 context, const char* text, gdv_int32 text_len,
+                  gdv_int32 number, gdv_int32* out_len) {
+  // returns the 'number' left most characters of a given text
+  if (text_len == 0 || number == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // initially counts the number of utf8 characters in the defined text
+  int32_t charCount = utf8_length(context, text, text_len);
+  // charCount is zero if input has invalid utf8 char
+  if (charCount == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  int32_t startCharPos;  // the char result start position (inclusive)
+  int32_t endCharLen;    // the char result end position (inclusive)
+  if (number > 0) {
+    // case where right('abc', 5) ==> 'abc' startCharPos=1.
+    startCharPos = (charCount - number + 1 > 1) ? charCount - number + 1 : 1;
+    endCharLen = charCount - startCharPos + 1;
+  } else {
+    startCharPos = ((number > 0) ? number : number * -1) + 1;

Review comment:
       number < 0 is already guaranteed in this block, so why check it again?
Also, startCharPos may become negative if number is too large.

##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1481,6 +1481,82 @@ const char* split_part(gdv_int64 context, const char* 
text, gdv_int32 text_len,
   return "";
 }
 
+FORCE_INLINE
+const char* left(gdv_int64 context, const char* text, gdv_int32 text_len,
+                 gdv_int32 number, gdv_int32* out_len) {
+  // returns the 'number' left most characters of a given text
+  if (text_len == 0 || number == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // initially counts the number of utf8 characters in the defined text
+  int32_t charCount = utf8_length(context, text, text_len);
+  // charCount is zero if input has invalid utf8 char
+  if (charCount == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  int32_t endCharPos;  // the char result end position (inclusive)
+  if (number > 0) {
+    // case where left('abc', 5) -> 'abc'
+    endCharPos = (charCount < number) ? charCount : number;
+  } else if (number < 0) {
+    // case where left('abc', -5) ==> ''
+    endCharPos = (charCount + number > 0) ? charCount + number : 0;
+  } else {
+    endCharPos = 0;
+  }
+
+  *out_len = utf8_byte_pos(context, text, text_len, endCharPos);
+  return text;
+}
+
+FORCE_INLINE
+const char* right(gdv_int64 context, const char* text, gdv_int32 text_len,
+                  gdv_int32 number, gdv_int32* out_len) {
+  // returns the 'number' left most characters of a given text
+  if (text_len == 0 || number == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // initially counts the number of utf8 characters in the defined text
+  int32_t charCount = utf8_length(context, text, text_len);

Review comment:
       nit: keep the naming consistent: use either camel case or snake case

##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1481,6 +1481,82 @@ const char* split_part(gdv_int64 context, const char* 
text, gdv_int32 text_len,
   return "";
 }
 
+FORCE_INLINE
+const char* left(gdv_int64 context, const char* text, gdv_int32 text_len,
+                 gdv_int32 number, gdv_int32* out_len) {
+  // returns the 'number' left most characters of a given text
+  if (text_len == 0 || number == 0) {
+    *out_len = 0;
+    return "";
+  }
+
+  // initially counts the number of utf8 characters in the defined text
+  int32_t charCount = utf8_length(context, text, text_len);

Review comment:
       Instead of traversing the text twice, you can do it in a single pass.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

[GitHub] [arrow] projjal commented on a change in pull request #10155: ARROW-12534: [C++][Gandiva] Implement LEFT and RIGHT functions on Gandiva for string input values

Reply via email to