projjal commented on a change in pull request #9844:
URL: https://github.com/apache/arrow/pull/9844#discussion_r604303465



##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1246,6 +1246,39 @@ const char* convert_fromUTF8_binary(gdv_int64 context, 
const char* bin_in, gdv_i
   return ret;
 }
 
+FORCE_INLINE
+const char* convert_replace_invalid_fromUTF8_binary(
+    gdv_int64 context, const char* text_in, gdv_int32 text_len,
+    const char* char_to_replace, gdv_int32 /*char_to_replace_len*/, gdv_int32* 
out_len) {
+  // actually the convert_replace function replaces the invalid bytes with a 
single byte
+  // so the output length will be the same as the input length
+  *out_len = text_len;
+  char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
*out_len));
+  if (ret == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for 
output string");
+    *out_len = 0;
+    return "";
+  }
+  gdv_int32 out_byte_counter = 0;
+  gdv_int32 char_len;
+  // scan the base text from left to right and increment the start pointer till
+  // looking for invalid chars to substitute
+  for (int text_index = 0; text_index < text_len; text_index += char_len) {
+    char_len = utf8_char_length(text_in[text_index]);
+    if (char_len == 0 || text_index + char_len > text_len) {

Review comment:
       utf8_char_length doesn't fully validate the UTF-8 character (it is only 
given the first byte). You need to call a validation function in the if condition.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to