vvellanki commented on a change in pull request #11551:
URL: https://github.com/apache/arrow/pull/11551#discussion_r754827342



##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -794,6 +812,188 @@ const char* gdv_fn_initcap_utf8(int64_t context, const 
char* data, int32_t data_
   *out_len = out_idx;
   return out;
 }
+
+GANDIVA_EXPORT
+const char* gdv_mask_first_n_utf8_int32(int64_t context, const char* data,
+                                        int32_t data_len, int32_t n_to_mask,
+                                        int32_t* out_len) {
+  if (data_len <= 0) {
+    *out_len = 0;
+    return nullptr;
+  }
+
+  *out_len = data_len;
+
+  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
*out_len));
+  if (out == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for 
output string");
+    *out_len = 0;
+    return nullptr;
+  }
+
+  if (n_to_mask < 0) {

Review comment:
       Change this to `if (n_to_mask <= 0)`, so that a request to mask zero
   characters also takes this copy-and-return path.

##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -794,6 +812,188 @@ const char* gdv_fn_initcap_utf8(int64_t context, const 
char* data, int32_t data_
   *out_len = out_idx;
   return out;
 }
+
+GANDIVA_EXPORT
+const char* gdv_mask_first_n_utf8_int32(int64_t context, const char* data,
+                                        int32_t data_len, int32_t n_to_mask,
+                                        int32_t* out_len) {
+  if (data_len <= 0) {
+    *out_len = 0;
+    return nullptr;
+  }
+
+  *out_len = data_len;
+
+  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
*out_len));
+  if (out == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for 
output string");
+    *out_len = 0;
+    return nullptr;
+  }
+
+  if (n_to_mask < 0) {
+    memcpy(out, data, data_len);
+    return out;
+  }
+
+  int num_masked;
+  for (num_masked = 0; num_masked < n_to_mask; num_masked++) {
+    unsigned char char_single_byte = data[num_masked];
+    if (char_single_byte > 127) {
+      // found a multi-byte utf-8 char
+      break;
+    }
+    out[num_masked] = mask_array[char_single_byte];
+  }
+
+  utf8proc_int32_t utf8_char;
+  int char_counter = num_masked;
+  int out_idx = num_masked;
+  while (char_counter < n_to_mask) {
+    auto char_len =
+        utf8proc_iterate(reinterpret_cast<const utf8proc_uint8_t*>(data + 
num_masked),
+                         data_len, &utf8_char);
+
+    if (char_len == 1) {
+      out[char_counter] = mask_array[utf8_char];

Review comment:
       The output array should be written at idx_in_out (the output index,
   `out_idx` in the current code), not at char_counter, since char_counter
   tracks the number of masked characters rather than the write position.
   

##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -794,6 +812,188 @@ const char* gdv_fn_initcap_utf8(int64_t context, const 
char* data, int32_t data_
   *out_len = out_idx;
   return out;
 }
+
+GANDIVA_EXPORT
+const char* gdv_mask_first_n_utf8_int32(int64_t context, const char* data,
+                                        int32_t data_len, int32_t n_to_mask,
+                                        int32_t* out_len) {
+  if (data_len <= 0) {
+    *out_len = 0;
+    return nullptr;
+  }
+
+  *out_len = data_len;
+
+  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
*out_len));
+  if (out == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for 
output string");
+    *out_len = 0;
+    return nullptr;
+  }
+
+  if (n_to_mask < 0) {
+    memcpy(out, data, data_len);
+    return out;
+  }
+
+  int num_masked;
+  for (num_masked = 0; num_masked < n_to_mask; num_masked++) {
+    unsigned char char_single_byte = data[num_masked];
+    if (char_single_byte > 127) {
+      // found a multi-byte utf-8 char
+      break;
+    }
+    out[num_masked] = mask_array[char_single_byte];
+  }
+
+  utf8proc_int32_t utf8_char;
+  int char_counter = num_masked;

Review comment:
       It is better to think in terms of three variables:
   idx_in_out = num_masked; // index in the output array; this is where the
   write should happen
   idx_in_in = num_masked; // index in the input array
   num_masked // the number of chars masked so far
   
   The while loop at line 852 should be: while (num_masked < n_to_mask)
   
   The code is quite confusing as written. Please rewrite it using these
   variables and update them correctly in each path.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to