projjal commented on a change in pull request #9844:
URL: https://github.com/apache/arrow/pull/9844#discussion_r605111131



##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1246,6 +1258,52 @@ const char* convert_fromUTF8_binary(gdv_int64 context, 
const char* bin_in, gdv_i
   return ret;
 }
 
+FORCE_INLINE
+const char* convert_replace_invalid_fromUTF8_binary(
+    gdv_int64 context, const char* text_in, gdv_int32 text_len,
+    const char* char_to_replace, gdv_int32 char_to_replace_len, gdv_int32* 
out_len) {
+  if (char_to_replace_len > 1) {
+    gdv_fn_context_set_error_msg(context, "Replacement of multiple bytes not 
supported");
+    *out_len = 0;
+    return "";
+  }
+  // actually the convert_replace function replaces the invalid bytes with a 
single byte
+  // so the output length will be the same as the input length
+  *out_len = text_len;
+  char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
*out_len));
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for 
output string");
+    *out_len = 0;
+    return "";
+  }
+  gdv_int32 out_byte_counter = 0;
+  gdv_int32 char_len;
+  // scan the base text from left to right and increment the start pointer till
+  // looking for invalid chars to substitute
+  for (int text_index = 0; text_index < text_len; text_index += char_len) {
+    char_len = utf8_char_length(text_in[text_index]);
+    if (char_len == 0) {
+      memcpy(ret + out_byte_counter, char_to_replace, 1);
+      out_byte_counter += 1;
+      // define char_len = 1 to increase text_index by 1 (as ASCII char fits 
in 1 byte)
+      char_len = 1;
+      continue;
+    }
+    // if the char length is greater than 0, execute another validation on MSBs
+    char* invalid_char = reinterpret_cast<char*>(malloc(char_len));
+    if (text_index + char_len > text_len || validate_utf8_following_bytes(
+        text_in, char_len, text_index, invalid_char) == 0) {
+      memcpy(ret + out_byte_counter, char_to_replace, char_len);
+    } else {
+      memcpy(ret + out_byte_counter, text_in + text_index, char_len);

Review comment:
       The loop currently does a separate memcpy for every character. You can 
keep track of the run of valid chars across loop iterations and do a single 
memcpy for that run, either when an invalid char is detected or at the end. 




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to