projjal commented on a change in pull request #10173:
URL: https://github.com/apache/arrow/pull/10173#discussion_r621930991
##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1422,6 +1422,239 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context,
const char* text,
out_len);
}
+FORCE_INLINE
+const char* lpad(gdv_int64 context, const char* text, gdv_int32 text_len,
+ gdv_int32 return_length, const char* fill_text, gdv_int32
fill_text_len,
+ gdv_int32* out_len) {
+ // if the text length or the defined return length (number of characters to
return)
+ // is <=0, then return an empty string.
+ if (text_len == 0 || return_length <= 0) {
+ *out_len = 0;
+ return "";
+ }
+
+ // initially counts the number of utf8 characters in the defined text and
fill_text
+ int32_t text_char_count = utf8_length(context, text, text_len);
+ int32_t fill_char_count = utf8_length(context, fill_text, fill_text_len);
+ // text_char_count is zero if input has invalid utf8 char
+ // fill_char_count is zero if fill_text_len is > 0 and its value has invalid
utf8 char
+ if (text_char_count == 0 || (fill_text_len > 0 && fill_char_count == 0)) {
+ *out_len = 0;
+ return "";
+ }
+
+ if (return_length == text_char_count ||
+ (return_length > text_char_count && fill_text_len == 0)) {
+ // case where the return length is same as the text's length, or if it
need to
+ // fill into text but "fill_text" is empty, then return text directly.
+ *out_len = text_len;
+ return text;
+ } else if (return_length < text_char_count) {
+ // case where it truncates the result on return length.
+ *out_len = utf8_byte_pos(context, text, text_len, return_length);
+ return text;
+ } else {
+ // case (return_length > text_char_count)
+ // case where it needs to copy "fill_text" on the string left. The total
number
+ // of chars to copy is given by (return_length - text_char_count)
+ char* ret =
+ reinterpret_cast<gdv_binary>(gdv_fn_context_arena_malloc(context,
return_length));
+ if (ret == nullptr) {
+ gdv_fn_context_set_error_msg(context,
+ "Could not allocate memory for output
string");
+ *out_len = 0;
+ return "";
+ }
+ // try to fulfill the return string with the "fill_text" continuously
+ int32_t copied_chars_count = 0;
+ int32_t copied_chars_position = 0;
+ while (copied_chars_count < return_length - text_char_count) {
+ int32_t char_len;
+ int32_t fill_index;
+ // for each char, evaluate its length to consider it when mem copying
+ for (fill_index = 0; fill_index < fill_text_len; fill_index += char_len)
{
+ if (copied_chars_count >= return_length - text_char_count) {
+ break;
+ }
+ char_len = utf8_char_length(fill_text[fill_index]);
+ copied_chars_count++;
+ }
+ memcpy(ret + copied_chars_position, fill_text, fill_index);
+ copied_chars_position += fill_index;
+ }
+ // after fulfilling the text, copy the main string
+ memcpy(ret + copied_chars_position, text, text_len);
+ *out_len = copied_chars_position + text_len;
+ return ret;
+ }
+}
+
+FORCE_INLINE
+const char* rpad(gdv_int64 context, const char* text, gdv_int32 text_len,
+ gdv_int32 return_length, const char* fill_text, gdv_int32
fill_text_len,
+ gdv_int32* out_len) {
+ // if the text length or the defined return length (number of characters to
return)
+ // is <=0, then return an empty string.
+ if (text_len == 0 || return_length <= 0) {
+ *out_len = 0;
+ return "";
+ }
+
+ // initially counts the number of utf8 characters in the defined text and
fill_text
+ int32_t text_char_count = utf8_length(context, text, text_len);
+ int32_t fill_char_count = utf8_length(context, fill_text, fill_text_len);
+ // text_char_count is zero if input has invalid utf8 char
+ // fill_char_count is zero if fill_text_len is > 0 and its value has invalid
utf8 char
+ if (text_char_count == 0 || (fill_text_len > 0 && fill_char_count == 0)) {
+ *out_len = 0;
+ return "";
+ }
+
+ if (return_length == text_char_count ||
+ (return_length > text_char_count && fill_text_len == 0)) {
+ // case where the return length is same as the text's length, or if it
need to
+ // fill into text but "fill_text" is empty, then return text directly.
+ *out_len = text_len;
+ return text;
+ } else if (return_length < text_char_count) {
+ // case where it truncates the result on return length.
+ *out_len = utf8_byte_pos(context, text, text_len, return_length);
+ return text;
+ } else {
+ // case (return_length > text_char_count)
+ // case where it needs to copy "fill_text" on the string right
+ char* ret =
+ reinterpret_cast<gdv_binary>(gdv_fn_context_arena_malloc(context,
return_length));
+ if (ret == nullptr) {
+ gdv_fn_context_set_error_msg(context,
+ "Could not allocate memory for output
string");
+ *out_len = 0;
+ return "";
+ }
+ // fulfill the initial text copying the main input string
+ memcpy(ret, text, text_len);
+ // try to fulfill the return string with the "fill_text" continuously
+ int32_t copied_chars_count = 0;
+ int32_t copied_chars_position = 0;
+ while (text_char_count + copied_chars_count < return_length) {
+ int32_t char_len;
+ int32_t fill_length;
+ // for each char, evaluate its length to consider it when mem copying
+ for (fill_length = 0; fill_length < fill_text_len; fill_length +=
char_len) {
+ if (text_char_count + copied_chars_count >= return_length) {
+ break;
+ }
+ char_len = utf8_char_length(fill_text[fill_length]);
+ copied_chars_count++;
+ }
+ memcpy(ret + text_len + copied_chars_position, fill_text, fill_length);
+ copied_chars_position += fill_length;
+ }
+ *out_len = copied_chars_position + text_len;
+ return ret;
+ }
+}
+
+FORCE_INLINE
+const char* lpad_no_fill_text(gdv_int64 context, const char* text, gdv_int32
text_len,
Review comment:
just have a single lpad & rpad impl and call that function with ' ' for
no_fill case
##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1422,6 +1422,239 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context,
const char* text,
out_len);
}
+FORCE_INLINE
+const char* lpad(gdv_int64 context, const char* text, gdv_int32 text_len,
+ gdv_int32 return_length, const char* fill_text, gdv_int32
fill_text_len,
+ gdv_int32* out_len) {
+ // if the text length or the defined return length (number of characters to
return)
+ // is <=0, then return an empty string.
+ if (text_len == 0 || return_length <= 0) {
+ *out_len = 0;
+ return "";
+ }
+
+ // initially counts the number of utf8 characters in the defined text and
fill_text
+ int32_t text_char_count = utf8_length(context, text, text_len);
+ int32_t fill_char_count = utf8_length(context, fill_text, fill_text_len);
+ // text_char_count is zero if input has invalid utf8 char
+ // fill_char_count is zero if fill_text_len is > 0 and its value has invalid
utf8 char
+ if (text_char_count == 0 || (fill_text_len > 0 && fill_char_count == 0)) {
Review comment:
i think its better to add a new function that ignores invalid utf8 chars
(ie count as size 1) instead of silently returning empty string
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]