projjal commented on a change in pull request #9844:
URL: https://github.com/apache/arrow/pull/9844#discussion_r604801995



##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1246,6 +1246,44 @@ const char* convert_fromUTF8_binary(gdv_int64 context, 
const char* bin_in, gdv_i
   return ret;
 }
 
+FORCE_INLINE
+const char* convert_replace_invalid_fromUTF8_binary(
+    gdv_int64 context, const char* text_in, gdv_int32 text_len,
+    const char* char_to_replace, gdv_int32 char_to_replace_len, gdv_int32* 
out_len) {
+  if (char_to_replace_len > 1) {
+    gdv_fn_context_set_error_msg(context, "Replacement of multiple bytes not 
supported");
+    *out_len = 0;
+    return "";
+  }
+  // actually the convert_replace function replaces the invalid bytes with a 
single byte
+  // so the output length will be the same as the input length
+  *out_len = text_len;
+  char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
*out_len));
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for 
output string");
+    *out_len = 0;
+    return "";
+  }
+  gdv_int32 out_byte_counter = 0;
+  gdv_int32 char_len;
+  // scan the base text from left to right and increment the start pointer till
+  // looking for invalid chars to substitute
+  for (int text_index = 0; text_index < text_len; text_index += char_len) {
+    char_len = utf8_char_length(text_in[text_index]);

Review comment:
       Since the input will be ASCII most of the time, you can optimise the loop 
for that case.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Reply via email to