vvellanki commented on a change in pull request #11551:
URL: https://github.com/apache/arrow/pull/11551#discussion_r743405369



##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -794,6 +795,88 @@ const char* gdv_fn_initcap_utf8(int64_t context, const 
char* data, int32_t data_
   *out_len = out_idx;
   return out;
 }
+
+GANDIVA_EXPORT
+const char* gdv_fn_mask_first_n(int64_t context, const char* data, int32_t 
data_len,
+                                int32_t n_to_mask, int32_t* out_len) {
+  if (data_len <= 0) {
+    *out_len = 0;
+    return nullptr;
+  }
+
+  if (n_to_mask < 0) {
+    n_to_mask = n_to_mask * (-1);
+  }
+
+  *out_len = data_len;
+
+  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
*out_len));
+  if (out == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for 
output string");
+    *out_len = 0;
+    return nullptr;
+  }
+
+  // do the masking
+  for (int i = 0; i < data_len; ++i) {
+    if(isdigit(data[i]) && i < n_to_mask) {
+      out[i] = 'n';
+      continue;
+    }
+    if(isupper(data[i]) && i < n_to_mask) {
+      out[i] = 'X';
+      continue;
+    }
+    if(islower(data[i]) && i < n_to_mask) {
+      out[i] = 'x';
+      continue;
+    }
+    out[i] = data[i];
+  }
+
+  return out;
+}
+
+GANDIVA_EXPORT
+const char* gdv_fn_mask_last_n(int64_t context, const char* data, int32_t 
data_len,
+                               int32_t n_to_mask, int32_t* out_len) {
+  if (data_len <= 0) {
+    *out_len = 0;
+    return nullptr;
+  }
+
+  if (n_to_mask < 0) {
+    n_to_mask = n_to_mask * (-1);
+  }
+
+  *out_len = data_len;
+
+  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
*out_len));
+  if (out == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for 
output string");
+    *out_len = 0;
+    return nullptr;
+  }
+
+  // do the masking
+  for (int i = 0; i < data_len; ++i) {

Review comment:
       Same here... This should be a memcpy() followed by a loop over only the 
last n chars at the end of the string.
   
   Also, use the mask_array to make the loop efficient.

##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -794,6 +795,88 @@ const char* gdv_fn_initcap_utf8(int64_t context, const 
char* data, int32_t data_
   *out_len = out_idx;
   return out;
 }
+
+GANDIVA_EXPORT
+const char* gdv_fn_mask_first_n(int64_t context, const char* data, int32_t 
data_len,
+                                int32_t n_to_mask, int32_t* out_len) {
+  if (data_len <= 0) {
+    *out_len = 0;
+    return nullptr;
+  }
+
+  if (n_to_mask < 0) {

Review comment:
       Not sure about this. Can you check how this is implemented in Hive?

##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -794,6 +795,88 @@ const char* gdv_fn_initcap_utf8(int64_t context, const 
char* data, int32_t data_
   *out_len = out_idx;
   return out;
 }
+
+GANDIVA_EXPORT
+const char* gdv_fn_mask_first_n(int64_t context, const char* data, int32_t 
data_len,
+                                int32_t n_to_mask, int32_t* out_len) {
+  if (data_len <= 0) {
+    *out_len = 0;
+    return nullptr;
+  }
+
+  if (n_to_mask < 0) {
+    n_to_mask = n_to_mask * (-1);
+  }
+
+  *out_len = data_len;
+
+  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
*out_len));
+  if (out == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for 
output string");
+    *out_len = 0;
+    return nullptr;
+  }
+
+  // do the masking
+  for (int i = 0; i < data_len; ++i) {
+    if(isdigit(data[i]) && i < n_to_mask) {

Review comment:
       Can you check how Hive implements this for UTF-8 strings? This logic 
works for ASCII input, but it doesn't work for UTF-8 input.

##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -794,6 +795,88 @@ const char* gdv_fn_initcap_utf8(int64_t context, const 
char* data, int32_t data_
   *out_len = out_idx;
   return out;
 }
+
+GANDIVA_EXPORT
+const char* gdv_fn_mask_first_n(int64_t context, const char* data, int32_t 
data_len,
+                                int32_t n_to_mask, int32_t* out_len) {
+  if (data_len <= 0) {
+    *out_len = 0;
+    return nullptr;
+  }
+
+  if (n_to_mask < 0) {
+    n_to_mask = n_to_mask * (-1);
+  }
+
+  *out_len = data_len;
+
+  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
*out_len));
+  if (out == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for 
output string");
+    *out_len = 0;
+    return nullptr;
+  }
+
+  // do the masking
+  for (int i = 0; i < data_len; ++i) {

Review comment:
       This loop should only run from i = 0 while i < n_to_mask.
   
   After this loop, you should just use memcpy() to copy the remaining bytes.

##########
File path: cpp/src/gandiva/gdv_function_stubs.cc
##########
@@ -794,6 +795,88 @@ const char* gdv_fn_initcap_utf8(int64_t context, const 
char* data, int32_t data_
   *out_len = out_idx;
   return out;
 }
+
+GANDIVA_EXPORT
+const char* gdv_fn_mask_first_n(int64_t context, const char* data, int32_t 
data_len,
+                                int32_t n_to_mask, int32_t* out_len) {
+  if (data_len <= 0) {
+    *out_len = 0;
+    return nullptr;
+  }
+
+  if (n_to_mask < 0) {
+    n_to_mask = n_to_mask * (-1);
+  }
+
+  *out_len = data_len;
+
+  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
*out_len));
+  if (out == nullptr) {
+    gdv_fn_context_set_error_msg(context, "Could not allocate memory for 
output string");
+    *out_len = 0;
+    return nullptr;
+  }
+
+  // do the masking
+  for (int i = 0; i < data_len; ++i) {
+    if(isdigit(data[i]) && i < n_to_mask) {

Review comment:
       This is going to be slow... Please construct a static array as follows:
   static const char mask_array[256] = {
    (char)0, (char)1, ....
    'n', 'n', // for the ascii values of '0' to '9'
    'X', 'X', // for the ascii values of 'A' to 'Z'
    'x', 'x', // for the ascii values of 'a' to 'z'
   };
   
   With this array, the loop body becomes:
   out[i] = mask_array[static_cast<unsigned char>(data[i])];




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to