augustoasilva commented on a change in pull request #11551:
URL: https://github.com/apache/arrow/pull/11551#discussion_r756463198



##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -794,6 +812,187 @@ const char* gdv_fn_initcap_utf8(int64_t context, const 
char* data, int32_t data_
   *out_len = out_idx;
   return out;
 }
+
+GANDIVA_EXPORT
+const char* gdv_mask_first_n_utf8_int32(int64_t context, const char* data,
+                                        int32_t data_len, int32_t n_to_mask,
+                                        int32_t* out_len) {
+  if (data_len <= 0) {
+    *out_len = 0;
+    return nullptr;
+  }
+
+  if (n_to_mask > data_len) {
+    n_to_mask = data_len;
+  }
+
+  *out_len = data_len;
+
+  if (n_to_mask <= 0) {
+    return data;
+  }
+
+  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
*out_len));
+  if (out == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for 
output string");
+    *out_len = 0;
+    return nullptr;
+  }
+
+  int bytes_masked;
+  for (bytes_masked = 0; bytes_masked < n_to_mask; bytes_masked++) {
+    unsigned char char_single_byte = data[bytes_masked];
+    if (char_single_byte > 127) {
+      // found a multi-byte utf-8 char
+      break;
+    }
+    out[bytes_masked] = mask_array[char_single_byte];
+  }
+
+  int data_idx = bytes_masked;
+  int out_idx = bytes_masked;
+
+  // Handle multibyte utf8 characters
+  utf8proc_int32_t utf8_char;
+  while (data_idx < n_to_mask) {

Review comment:
       That makes sense. I added a test case with 3 Chinese characters (each one 
is 3 bytes long) and n_to_mask = 4, and there was indeed an error, so I've 
changed the while condition and it now works.
   
   I will change it from data_idx to chars_masked, since that seems more 
accurate. What do you think?




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to