projjal commented on a change in pull request #9844:
URL: https://github.com/apache/arrow/pull/9844#discussion_r604801337
##########
File path: cpp/src/gandiva/precompiled/string_ops.cc
##########
@@ -1246,6 +1246,39 @@ const char* convert_fromUTF8_binary(gdv_int64 context,
const char* bin_in, gdv_i
return ret;
}
+FORCE_INLINE
+const char* convert_replace_invalid_fromUTF8_binary(
+ gdv_int64 context, const char* text_in, gdv_int32 text_len,
+ const char* char_to_replace, gdv_int32 /*char_to_replace_len*/, gdv_int32*
out_len) {
+ // actually the convert_replace function replaces the invalid bytes with a
single byte
+ // so the output length will be the same as the input length
+ *out_len = text_len;
+ char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context,
*out_len));
+ if (ret == nullptr) {
+ gdv_fn_context_set_error_msg(context, "Could not allocate memory for
output string");
+ *out_len = 0;
+ return "";
+ }
+ gdv_int32 out_byte_counter = 0;
+ gdv_int32 char_len;
+ // scan the base text from left to right and increment the start pointer till
+ // looking for invalid chars to substitute
+ for (int text_index = 0; text_index < text_len; text_index += char_len) {
+ char_len = utf8_char_length(text_in[text_index]);
+ if (char_len == 0 || text_index + char_len > text_len) {
Review comment:
It should be simple to write. You can check the logic in the utf8_length
function. Basically, it just checks whether the MSBs of each byte are correct
according to the UTF-8 format.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]