vvellanki commented on a change in pull request #11551:
URL: https://github.com/apache/arrow/pull/11551#discussion_r756018526



##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -794,6 +812,187 @@ const char* gdv_fn_initcap_utf8(int64_t context, const 
char* data, int32_t data_
   *out_len = out_idx;
   return out;
 }
+
+GANDIVA_EXPORT
+const char* gdv_mask_first_n_utf8_int32(int64_t context, const char* data,
+                                        int32_t data_len, int32_t n_to_mask,
+                                        int32_t* out_len) {
+  if (data_len <= 0) {
+    *out_len = 0;
+    return nullptr;
+  }
+
+  if (n_to_mask > data_len) {
+    n_to_mask = data_len;
+  }
+
+  *out_len = data_len;
+
+  if (n_to_mask <= 0) {
+    return data;
+  }
+
+  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
*out_len));
+  if (out == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for 
output string");
+    *out_len = 0;
+    return nullptr;
+  }
+
+  int bytes_masked;
+  for (bytes_masked = 0; bytes_masked < n_to_mask; bytes_masked++) {
+    unsigned char char_single_byte = data[bytes_masked];
+    if (char_single_byte > 127) {
+      // found a multi-byte utf-8 char
+      break;
+    }
+    out[bytes_masked] = mask_array[char_single_byte];
+  }
+
+  int data_idx = bytes_masked;
+  int out_idx = bytes_masked;
+
+  // Handle multibyte utf8 characters
+  utf8proc_int32_t utf8_char;
+  while (data_idx < n_to_mask) {

Review comment:
       This condition is incorrect. It assumes that there are enough UTF-8 
characters to mask n_to_mask times.
   
   Suppose there are 3 UTF-8 characters, each 3 bytes long. The input length is 
9 and n_to_mask is 4. The loop should terminate after 3 iterations in this case, 
not after 4 iterations.
   
   The condition should be:
   while ((data_idx < n_to_mask) &&
              (bytes_masked < data_len))
   
   Also, data_idx is an odd name: it actually refers to the number of times the 
mask has been applied.

##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -794,6 +812,187 @@ const char* gdv_fn_initcap_utf8(int64_t context, const 
char* data, int32_t data_
   *out_len = out_idx;
   return out;
 }
+
+GANDIVA_EXPORT
+const char* gdv_mask_first_n_utf8_int32(int64_t context, const char* data,
+                                        int32_t data_len, int32_t n_to_mask,
+                                        int32_t* out_len) {
+  if (data_len <= 0) {
+    *out_len = 0;
+    return nullptr;
+  }
+
+  if (n_to_mask > data_len) {
+    n_to_mask = data_len;
+  }
+
+  *out_len = data_len;
+
+  if (n_to_mask <= 0) {
+    return data;
+  }
+
+  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
*out_len));
+  if (out == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for 
output string");
+    *out_len = 0;
+    return nullptr;
+  }
+
+  int bytes_masked;
+  for (bytes_masked = 0; bytes_masked < n_to_mask; bytes_masked++) {
+    unsigned char char_single_byte = data[bytes_masked];
+    if (char_single_byte > 127) {
+      // found a multi-byte utf-8 char
+      break;
+    }
+    out[bytes_masked] = mask_array[char_single_byte];
+  }
+
+  int data_idx = bytes_masked;
+  int out_idx = bytes_masked;
+
+  // Handle multibyte utf8 characters
+  utf8proc_int32_t utf8_char;
+  while (data_idx < n_to_mask) {
+    auto char_len =
+        utf8proc_iterate(reinterpret_cast<const utf8proc_uint8_t*>(data + 
bytes_masked),
+                         data_len, &utf8_char);
+
+    switch (utf8proc_category(utf8_char)) {
+      case 1:
+        out[out_idx] = 'X';
+        out_idx += 1;
+        break;
+      case 2:
+        out[out_idx] = 'x';
+        out_idx += 1;
+        break;
+      case 9:
+        out[out_idx] = 'n';
+        out_idx += 1;
+        break;
+      case 10:
+        out[out_idx] = 'n';
+        out_idx += 1;
+        break;
+      default:
+        memcpy(out + out_idx, data + bytes_masked, char_len);
+        out_idx += static_cast<int>(char_len);
+        break;
+    }
+    bytes_masked += static_cast<int>(char_len);
+    data_idx++;
+  }
+
+  // Correct the out_len after masking multibyte characters with single byte 
characters
+  *out_len = *out_len - (bytes_masked - out_idx);
+
+  if (data_idx < data_len) {

Review comment:
       This `if` condition is also wrong. It should be:
   if (bytes_masked < data_len) { // i.e. there are input bytes remaining 
after masking n_to_mask chars
   }
   
   Please add unit tests that cover the bugs caught during code review. This is 
the only way we can ensure that new bugs are not introduced when someone changes 
this code later.

##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -794,6 +812,187 @@ const char* gdv_fn_initcap_utf8(int64_t context, const 
char* data, int32_t data_
   *out_len = out_idx;
   return out;
 }
+
+GANDIVA_EXPORT
+const char* gdv_mask_first_n_utf8_int32(int64_t context, const char* data,
+                                        int32_t data_len, int32_t n_to_mask,
+                                        int32_t* out_len) {
+  if (data_len <= 0) {
+    *out_len = 0;
+    return nullptr;
+  }
+
+  if (n_to_mask > data_len) {
+    n_to_mask = data_len;
+  }
+
+  *out_len = data_len;
+
+  if (n_to_mask <= 0) {
+    return data;
+  }
+
+  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
*out_len));
+  if (out == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for 
output string");
+    *out_len = 0;
+    return nullptr;
+  }
+
+  int bytes_masked;
+  for (bytes_masked = 0; bytes_masked < n_to_mask; bytes_masked++) {
+    unsigned char char_single_byte = data[bytes_masked];
+    if (char_single_byte > 127) {
+      // found a multi-byte utf-8 char
+      break;
+    }
+    out[bytes_masked] = mask_array[char_single_byte];
+  }
+
+  int data_idx = bytes_masked;
+  int out_idx = bytes_masked;
+
+  // Handle multibyte utf8 characters
+  utf8proc_int32_t utf8_char;
+  while (data_idx < n_to_mask) {
+    auto char_len =
+        utf8proc_iterate(reinterpret_cast<const utf8proc_uint8_t*>(data + 
bytes_masked),
+                         data_len, &utf8_char);
+
+    switch (utf8proc_category(utf8_char)) {
+      case 1:
+        out[out_idx] = 'X';
+        out_idx += 1;
+        break;
+      case 2:
+        out[out_idx] = 'x';
+        out_idx += 1;
+        break;
+      case 9:
+        out[out_idx] = 'n';
+        out_idx += 1;
+        break;
+      case 10:
+        out[out_idx] = 'n';
+        out_idx += 1;
+        break;
+      default:
+        memcpy(out + out_idx, data + bytes_masked, char_len);
+        out_idx += static_cast<int>(char_len);
+        break;
+    }
+    bytes_masked += static_cast<int>(char_len);
+    data_idx++;
+  }
+
+  // Correct the out_len after masking multibyte characters with single byte 
characters
+  *out_len = *out_len - (bytes_masked - out_idx);

Review comment:
       Good one - I was wondering if out_len is correct. This change is correct.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to