Copilot commented on code in PR #49813:
URL: https://github.com/apache/arrow/pull/49813#discussion_r3147133915


##########
cpp/src/gandiva/precompiled/string_ops.cc:
##########
@@ -2824,13 +2766,22 @@ const char* elt_int32_utf8_utf8_utf8_utf8_utf8(
 FORCE_INLINE
 const char* to_hex_binary(int64_t context, const char* text, int32_t text_len,
                           int32_t* out_len) {
-  if (text_len == 0) {
+  if (ARROW_PREDICT_FALSE(text_len <= 0)) {
     *out_len = 0;
     return "";
   }
 
-  auto ret =
-      reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, text_len * 
2 + 1));
+  int32_t alloc_length = 0;
+
+  // Check overflow for text_len
+  if (ARROW_PREDICT_FALSE(
+          arrow::internal::AddWithOverflow(1, (2 * text_len), &alloc_length))) 
{
+    gdv_fn_context_set_error_msg(context, "Memory allocation size too large");
+    *out_len = 0;
+    return "";
+  }

Review Comment:
   `(2 * text_len)` is evaluated before `AddWithOverflow`, so for `text_len > 
INT32_MAX/2` this can overflow a signed integer (undefined behavior) prior to 
the intended check. Compute `2 * text_len` via `MultiplyWithOverflow` (or widen 
to int64) and then add 1 with `AddWithOverflow` to keep the allocation length 
calculation well-defined and safe.



##########
cpp/src/gandiva/precompiled/string_ops.cc:
##########
@@ -1924,9 +1924,17 @@ const char* quote_utf8(gdv_int64 context, const char* 
in, gdv_int32 in_len,
     *out_len = 0;
     return "";
   }
+
+  int32_t alloc_length = 0;
+  if (ARROW_PREDICT_FALSE(
+          arrow::internal::AddWithOverflow(2, (2 * in_len), &alloc_length))) {
+    gdv_fn_context_set_error_msg(context, "Memory allocation size too large");
+    *out_len = 0;
+    return "";
+  }

Review Comment:
   The overflow check computes `(2 * in_len)` before calling `AddWithOverflow`. 
If `in_len > INT32_MAX/2`, the multiplication can overflow a signed integer 
(undefined behavior) before the overflow helper runs, defeating the safety 
check. Use `MultiplyWithOverflow(2, in_len, &tmp)` (or widen to int64) and then 
`AddWithOverflow(tmp, 2, &alloc_length)` to keep all arithmetic overflow-safe.



##########
cpp/src/gandiva/precompiled/string_ops.cc:
##########
@@ -2444,182 +2452,172 @@ void concat_word(char* out_buf, int* out_idx, const 
char* in_buf, int in_len,
   *out_idx += in_len;
 }
 
-FORCE_INLINE
-const char* concat_ws_utf8_utf8(int64_t context, const char* separator,
-                                int32_t separator_len, bool separator_validity,
-                                const char* word1, int32_t word1_len, bool 
word1_validity,
-                                const char* word2, int32_t word2_len, bool 
word2_validity,
-                                bool* out_valid, int32_t* out_len) {
-  *out_len = 0;
-  int numValidInput = 0;
-  // If separator is null, always return null
-  if (!separator_validity) {
-    *out_len = 0;
-    *out_valid = false;
-    return "";
-  }
+// Helper structure to maintain state during safe length accumulation
+struct SafeLengthState {
+  int32_t total_len = 0;
+  int32_t num_valid = 0;
+  bool overflow = false;
+};
 
-  if (word1_validity) {
-    *out_len += word1_len;
-    numValidInput++;
-  }
-  if (word2_validity) {
-    *out_len += word2_len;
-    numValidInput++;
+// Helper to safely add a word length
+static inline bool safe_accumulate_word(SafeLengthState& state, int32_t 
word_len,
+                                        bool word_validity) {
+  if (not word_validity) return true;
+
+  int32_t temp = 0;
+  if (ARROW_PREDICT_FALSE(
+          arrow::internal::AddWithOverflow(state.total_len, word_len, &temp))) 
{
+    state.overflow = true;
+    return false;
   }
+  state.total_len = temp;
+  state.num_valid++;
+  return true;
+}
 
-  *out_len += separator_len * (numValidInput > 1 ? numValidInput - 1 : 0);
-  if (*out_len == 0) {
-    *out_valid = true;
-    return "";
+// Helper to safely add separators based on number of valid words
+static inline bool safe_add_separators(SafeLengthState* state, int32_t 
separator_len) {
+  if (state->num_valid <= 1) return true;
+
+  int32_t sep_total = 0;
+  int32_t temp = 0;
+
+  if (ARROW_PREDICT_FALSE(arrow::internal::MultiplyWithOverflow(
+          separator_len, state->num_valid - 1, &sep_total))) {
+    state->overflow = true;
+    return false;
   }
 
-  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
*out_len));
-  if (out == nullptr) {
-    gdv_fn_context_set_error_msg(context, "Could not allocate memory for 
output string");
-    *out_len = 0;
-    *out_valid = false;
-    return "";
+  if (ARROW_PREDICT_FALSE(
+          arrow::internal::AddWithOverflow(state->total_len, sep_total, 
&temp))) {
+    state->overflow = true;
+    return false;
   }
 
-  char* tmp = out;
-  int out_idx = 0;
-  bool seenAnyValidInput = false;
+  state->total_len = temp;
+  return true;
+}
 
-  concat_word(tmp, &out_idx, word1, word1_len, word1_validity, separator, 
separator_len,
-              &seenAnyValidInput);
-  concat_word(tmp, &out_idx, word2, word2_len, word2_validity, separator, 
separator_len,
-              &seenAnyValidInput);
+// Helper to handle overflow failure (sets output parameters and returns 
nullptr)
+static inline const char* handle_overflow_failure(bool* out_valid, int32_t* 
out_len) {
+  *out_len = 0;
+  *out_valid = false;
+  return "";
+}
 
+// Helper to handle empty result (all words invalid)
+static inline const char* handle_empty_result(bool* out_valid, int32_t* 
out_len) {
+  *out_len = 0;
   *out_valid = true;
-  *out_len = out_idx;
-  return out;
+  return "";
 }
 
-FORCE_INLINE
-const char* concat_ws_utf8_utf8_utf8(
-    int64_t context, const char* separator, int32_t separator_len,
-    bool separator_validity, const char* word1, int32_t word1_len, bool 
word1_validity,
-    const char* word2, int32_t word2_len, bool word2_validity, const char* 
word3,
-    int32_t word3_len, bool word3_validity, bool* out_valid, int32_t* out_len) 
{
+struct WordArg {
+  const char* data;
+  int32_t len;
+  bool valid;
+};
+
+static inline const char* concat_ws_impl(int64_t context, const char* 
separator,
+                                         int32_t separator_len, bool 
separator_validity,
+                                         bool* out_valid, int32_t* out_len,
+                                         std::initializer_list<WordArg> words) 
{
   *out_len = 0;
-  int numValidInput = 0;
-  // If separator is null, always return null
-  if (!separator_validity) {
-    *out_len = 0;
+
+  // Separator validity check
+  if (not separator_validity) {
     *out_valid = false;
     return "";
   }

Review Comment:
   `concat_ws_impl` does not validate that `separator_len` and each `word*_len` 
are non-negative when their validity flags are true. A negative length will 
propagate into `concat_word`, where it is passed to `memcpy` as a (converted) 
huge `size_t`, causing out-of-bounds reads/writes. Add explicit guards (e.g., 
treat negative lengths as invalid input: set an error, `*out_valid=false`, 
`*out_len=0`, and return "").



##########
cpp/src/gandiva/precompiled/string_ops.cc:
##########
@@ -2444,182 +2452,172 @@ void concat_word(char* out_buf, int* out_idx, const 
char* in_buf, int in_len,
   *out_idx += in_len;
 }
 
-FORCE_INLINE
-const char* concat_ws_utf8_utf8(int64_t context, const char* separator,
-                                int32_t separator_len, bool separator_validity,
-                                const char* word1, int32_t word1_len, bool 
word1_validity,
-                                const char* word2, int32_t word2_len, bool 
word2_validity,
-                                bool* out_valid, int32_t* out_len) {
-  *out_len = 0;
-  int numValidInput = 0;
-  // If separator is null, always return null
-  if (!separator_validity) {
-    *out_len = 0;
-    *out_valid = false;
-    return "";
-  }
+// Helper structure to maintain state during safe length accumulation
+struct SafeLengthState {
+  int32_t total_len = 0;
+  int32_t num_valid = 0;
+  bool overflow = false;
+};
 
-  if (word1_validity) {
-    *out_len += word1_len;
-    numValidInput++;
-  }
-  if (word2_validity) {
-    *out_len += word2_len;
-    numValidInput++;
+// Helper to safely add a word length
+static inline bool safe_accumulate_word(SafeLengthState& state, int32_t 
word_len,
+                                        bool word_validity) {
+  if (not word_validity) return true;
+
+  int32_t temp = 0;
+  if (ARROW_PREDICT_FALSE(
+          arrow::internal::AddWithOverflow(state.total_len, word_len, &temp))) 
{
+    state.overflow = true;
+    return false;
   }
+  state.total_len = temp;
+  state.num_valid++;
+  return true;
+}
 
-  *out_len += separator_len * (numValidInput > 1 ? numValidInput - 1 : 0);
-  if (*out_len == 0) {
-    *out_valid = true;
-    return "";
+// Helper to safely add separators based on number of valid words
+static inline bool safe_add_separators(SafeLengthState* state, int32_t 
separator_len) {
+  if (state->num_valid <= 1) return true;
+
+  int32_t sep_total = 0;
+  int32_t temp = 0;
+
+  if (ARROW_PREDICT_FALSE(arrow::internal::MultiplyWithOverflow(
+          separator_len, state->num_valid - 1, &sep_total))) {
+    state->overflow = true;
+    return false;
   }
 
-  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 
*out_len));
-  if (out == nullptr) {
-    gdv_fn_context_set_error_msg(context, "Could not allocate memory for 
output string");
-    *out_len = 0;
-    *out_valid = false;
-    return "";
+  if (ARROW_PREDICT_FALSE(
+          arrow::internal::AddWithOverflow(state->total_len, sep_total, 
&temp))) {
+    state->overflow = true;
+    return false;
   }
 
-  char* tmp = out;
-  int out_idx = 0;
-  bool seenAnyValidInput = false;
+  state->total_len = temp;
+  return true;
+}
 
-  concat_word(tmp, &out_idx, word1, word1_len, word1_validity, separator, 
separator_len,
-              &seenAnyValidInput);
-  concat_word(tmp, &out_idx, word2, word2_len, word2_validity, separator, 
separator_len,
-              &seenAnyValidInput);
+// Helper to handle overflow failure (sets output parameters and returns 
nullptr)
+static inline const char* handle_overflow_failure(bool* out_valid, int32_t* 
out_len) {
+  *out_len = 0;
+  *out_valid = false;
+  return "";
+}
 
+// Helper to handle empty result (all words invalid)
+static inline const char* handle_empty_result(bool* out_valid, int32_t* 
out_len) {
+  *out_len = 0;
   *out_valid = true;
-  *out_len = out_idx;
-  return out;
+  return "";

Review Comment:
   `handle_overflow_failure` / `handle_empty_result` are defined but not used 
anywhere in this translation unit. This can trigger `-Wunused-function` 
warnings (often promoted to errors) and adds dead code. Either remove these 
helpers or refactor the existing early-return paths to call them.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to