vvellanki commented on a change in pull request #11551:
URL: https://github.com/apache/arrow/pull/11551#discussion_r753694207
##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -794,6 +812,144 @@ const char* gdv_fn_initcap_utf8(int64_t context, const
char* data, int32_t data_
*out_len = out_idx;
return out;
}
+
+GANDIVA_EXPORT
+const char* gdv_mask_first_n_utf8_int32(int64_t context, const char* data,
+ int32_t data_len, int32_t n_to_mask,
+ int32_t* out_len) {
+ if (data_len <= 0) {
+ *out_len = 0;
+ return nullptr;
+ }
+
+ *out_len = data_len;
+
+ char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context,
*out_len));
+ if (out == nullptr) {
+ gdv_fn_context_set_error_msg(context, "Could not allocate memory for
output string");
+ *out_len = 0;
+ return nullptr;
+ }
+
+ int num_masked;
+ for (num_masked = 0; num_masked < n_to_mask; num_masked++) {
+ unsigned char char_single_byte = data[num_masked];
+ if (char_single_byte > 127) {
+ // found a multi-byte utf-8 char
+ break;
+ }
+ out[num_masked] = mask_array[char_single_byte];
+ }
+
+ utf8proc_int32_t utf8_char;
+ utf8proc_ssize_t multi_byte_masked = num_masked;
+ while (num_masked < n_to_mask) {
+ auto char_len = utf8proc_iterate(
+ reinterpret_cast<const utf8proc_uint8_t*>(data + multi_byte_masked),
data_len,
+ &utf8_char);
+ multi_byte_masked += char_len;
+
+ if (char_len == 1) {
+ out[num_masked] = mask_array[utf8_char];
+ num_masked++;
+ continue;
+ }
+
+ switch (utf8proc_category(utf8_char)) {
+ case 1:
+ out[num_masked] = 'X';
+ break;
+ case 2:
+ out[num_masked] = 'x';
+ break;
+ case 9:
+ out[num_masked] = 'n';
+ break;
+ case 10:
+ out[num_masked] = 'n';
+ break;
+ default:
+ out[num_masked] = utf8_char;
Review comment:
This is incorrect. utf8_char is a multi-byte character and out is a char*,
so you cannot assign it directly; you have to use memcpy() to copy its bytes.
Also, num_masked has to be incremented accordingly, i.e., num_masked += char_len
for this case.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]