[GitHub] [arrow] projjal commented on a change in pull request #9844: ARROW-12146: [C++][Gandiva] Implement CONVERT_FROM(expression, replacement char) function

GitBox Wed, 07 Apr 2021 06:13:17 -0700


projjal commented on a change in pull request #9844:
URL: https://github.com/apache/arrow/pull/9844#discussion_r608629370




##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1246,6 +1257,54 @@ const char* convert_fromUTF8_binary(gdv_int64 context, 
const char* bin_in, gdv_i
   return ret;
 }
 
+FORCE_INLINE
+const char* convert_replace_invalid_fromUTF8_binary(int64_t context, const 
char* text_in,
+                                                    int32_t text_len,
+                                                    const char* 
char_to_replace,
+                                                    int32_t 
char_to_replace_len,
+                                                    int32_t* out_len) {
+  if (char_to_replace_len > 1) {

Review comment:
       char_to_replace can be empty string in which case return the original 
string

##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1246,6 +1257,54 @@ const char* convert_fromUTF8_binary(gdv_int64 context, 
const char* bin_in, gdv_i
   return ret;
 }
 
+FORCE_INLINE
+const char* convert_replace_invalid_fromUTF8_binary(int64_t context, const 
char* text_in,
+                                                    int32_t text_len,
+                                                    const char* 
char_to_replace,
+                                                    int32_t 
char_to_replace_len,
+                                                    int32_t* out_len) {
+  if (char_to_replace_len > 1) {
+    gdv_fn_context_set_error_msg(context, "Replacement of multiple bytes not 
supported");
+    *out_len = 0;
+    return "";
+  }
+  // actually the convert_replace function replaces the invalid bytes with a 
single byte

Review comment:
       nit: better to rephrase as only supports ascii character.

##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1246,6 +1257,54 @@ const char* convert_fromUTF8_binary(gdv_int64 context, 
const char* bin_in, gdv_i
   return ret;
 }
 
+FORCE_INLINE
+const char* convert_replace_invalid_fromUTF8_binary(int64_t context, const 
char* text_in,
+                                                    int32_t text_len,
+                                                    const char* 
char_to_replace,
+                                                    int32_t 
char_to_replace_len,
+                                                    int32_t* out_len) {
+  if (char_to_replace_len > 1) {
+    gdv_fn_context_set_error_msg(context, "Replacement of multiple bytes not 
supported");
+    *out_len = 0;
+    return "";
+  }
+  // actually the convert_replace function replaces the invalid bytes with a 
single byte
+  // so the output length will be the same as the input length
+  *out_len = text_len;
+  char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
*out_len));
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for 
output string");
+    *out_len = 0;
+    return "";
+  }
+  int32_t valid_bytes_to_cpy = 0;
+  int32_t out_byte_counter = 0;
+  int32_t char_len;
+  // scan the base text from left to right and increment the start pointer till
+  // looking for invalid chars to substitute
+  for (int text_index = 0; text_index < text_len; text_index += char_len) {
+    char_len = utf8_char_length(text_in[text_index]);
+    // only memory copy the bytes when detect invalid char
+    if (char_len == 0 || text_index + char_len > text_len ||
+        !validate_utf8_following_bytes(text_in, char_len, text_index)) {
+      // define char_len = 1 to increase text_index by 1 (as ASCII char fits 
in 1 byte)
+      char_len = 1;
+      // first copy the valid bytes until now and then replace the invalid 
character
+      memcpy(ret + out_byte_counter, text_in + out_byte_counter, 
valid_bytes_to_cpy);
+      memcpy(ret + out_byte_counter + valid_bytes_to_cpy, char_to_replace, 
char_len);
+      out_byte_counter += valid_bytes_to_cpy + char_len;
+      valid_bytes_to_cpy = 0;
+      continue;
+    }
+    valid_bytes_to_cpy += char_len;
+  }
+  // if there are still valid bytes to copy, do it
+  if (valid_bytes_to_cpy != 0) {

Review comment:
       If there are are no invalid chars (which should be in the majority of 
cases) you can skip the memcpy and return the original string.

##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1246,6 +1257,54 @@ const char* convert_fromUTF8_binary(gdv_int64 context, 
const char* bin_in, gdv_i
   return ret;
 }
 
+FORCE_INLINE
+const char* convert_replace_invalid_fromUTF8_binary(int64_t context, const 
char* text_in,
+                                                    int32_t text_len,
+                                                    const char* 
char_to_replace,
+                                                    int32_t 
char_to_replace_len,
+                                                    int32_t* out_len) {
+  if (char_to_replace_len > 1) {
+    gdv_fn_context_set_error_msg(context, "Replacement of multiple bytes not 
supported");
+    *out_len = 0;
+    return "";
+  }
+  // actually the convert_replace function replaces the invalid bytes with a 
single byte
+  // so the output length will be the same as the input length
+  *out_len = text_len;
+  char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
*out_len));
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for 
output string");
+    *out_len = 0;
+    return "";
+  }
+  int32_t valid_bytes_to_cpy = 0;
+  int32_t out_byte_counter = 0;
+  int32_t char_len;
+  // scan the base text from left to right and increment the start pointer till
+  // looking for invalid chars to substitute
+  for (int text_index = 0; text_index < text_len; text_index += char_len) {
+    char_len = utf8_char_length(text_in[text_index]);
+    // only memory copy the bytes when detect invalid char
+    if (char_len == 0 || text_index + char_len > text_len ||
+        !validate_utf8_following_bytes(text_in, char_len, text_index)) {
+      // define char_len = 1 to increase text_index by 1 (as ASCII char fits 
in 1 byte)
+      char_len = 1;
+      // first copy the valid bytes until now and then replace the invalid 
character
+      memcpy(ret + out_byte_counter, text_in + out_byte_counter, 
valid_bytes_to_cpy);
+      memcpy(ret + out_byte_counter + valid_bytes_to_cpy, char_to_replace, 
char_len);

Review comment:
       nit: you can do normal assignment: ret[out_byte_counter + 
valid_bytes_to_cpy] = char_to_replace[0]




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [arrow] projjal commented on a change in pull request #9844: ARROW-12146: [C++][Gandiva] Implement CONVERT_FROM(expression, replacement char) function

Reply via email to