Copilot commented on code in PR #49813:
URL: https://github.com/apache/arrow/pull/49813#discussion_r3165002347
##########
cpp/src/gandiva/precompiled/string_ops_test.cc:
##########
@@ -2498,6 +2561,25 @@ TEST(TestStringOps, TestToHex) {
output = std::string(out_str, out_len);
EXPECT_EQ(out_len, 2 * in_len);
EXPECT_EQ(output, "090A090A090A090A0A0A092061206C657474405D6572");
+ ctx.Reset();
+
+ int32_t bad_text_len = std::numeric_limits<int32_t>::max() / 2 + 20;
+ out_str = to_hex_binary(ctx_ptr, binary_string, bad_text_len, &out_len);
+ EXPECT_EQ(out_len, 0);
+ EXPECT_STREQ(out_str, "");
+ ctx.Reset();
+
+ bad_text_len = (std::numeric_limits<int32_t>::max() / 2) + 1;
+ out_str = to_hex_binary(ctx_ptr, binary_string, bad_text_len, &out_len);
+ EXPECT_EQ(out_len, 0);
+ EXPECT_STREQ(out_str, "");
Review Comment:
The overflow test cases for `to_hex_binary` reset the context but never
assert that the expected error was set. Since `to_hex_binary` now calls
`gdv_fn_context_set_error_msg` on overflow, please assert `ctx.has_error()` /
`ctx.get_error()` for these `bad_text_len` inputs before calling `ctx.Reset()`
to ensure the overflow path is actually exercised.
```suggestion
EXPECT_STREQ(out_str, "");
EXPECT_TRUE(ctx.has_error());
EXPECT_NE(ctx.get_error(), "");
ctx.Reset();
bad_text_len = (std::numeric_limits<int32_t>::max() / 2) + 1;
out_str = to_hex_binary(ctx_ptr, binary_string, bad_text_len, &out_len);
EXPECT_EQ(out_len, 0);
EXPECT_STREQ(out_str, "");
EXPECT_TRUE(ctx.has_error());
EXPECT_NE(ctx.get_error(), "");
```
##########
cpp/src/gandiva/precompiled/string_ops.cc:
##########
@@ -2444,182 +2462,176 @@ void concat_word(char* out_buf, int* out_idx, const
char* in_buf, int in_len,
*out_idx += in_len;
}
-FORCE_INLINE
-const char* concat_ws_utf8_utf8(int64_t context, const char* separator,
- int32_t separator_len, bool separator_validity,
- const char* word1, int32_t word1_len, bool
word1_validity,
- const char* word2, int32_t word2_len, bool
word2_validity,
- bool* out_valid, int32_t* out_len) {
- *out_len = 0;
- int numValidInput = 0;
- // If separator is null, always return null
- if (!separator_validity) {
- *out_len = 0;
- *out_valid = false;
- return "";
- }
+// Helper structure to maintain state during safe length accumulation
+struct SafeLengthState {
+ int32_t total_len = 0;
+ int32_t num_valid = 0;
+ bool overflow = false;
+};
- if (word1_validity) {
- *out_len += word1_len;
- numValidInput++;
+// Helper to safely add a word length
+static inline bool safe_accumulate_word(SafeLengthState& state, int32_t
word_len,
+ bool word_validity) {
+ if (not word_validity) return true;
+
+ if (word_len < 0) {
+ return false;
}
- if (word2_validity) {
- *out_len += word2_len;
- numValidInput++;
+
+ int32_t temp = 0;
+ if (ARROW_PREDICT_FALSE(
+ arrow::internal::AddWithOverflow(state.total_len, word_len, &temp)))
{
+ state.overflow = true;
+ return false;
}
+ state.total_len = temp;
+ state.num_valid++;
+ return true;
+}
- *out_len += separator_len * (numValidInput > 1 ? numValidInput - 1 : 0);
- if (*out_len == 0) {
- *out_valid = true;
- return "";
+// Helper to safely add separators based on number of valid words
+static inline bool safe_add_separators(SafeLengthState* state, int32_t
separator_len) {
+ if (state->num_valid <= 1) return true;
+
+ int32_t sep_total = 0;
+ int32_t temp = 0;
+
+ if (ARROW_PREDICT_FALSE(arrow::internal::MultiplyWithOverflow(
+ separator_len, state->num_valid - 1, &sep_total))) {
+ state->overflow = true;
+ return false;
}
- char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context,
*out_len));
- if (out == nullptr) {
- gdv_fn_context_set_error_msg(context, "Could not allocate memory for
output string");
- *out_len = 0;
- *out_valid = false;
- return "";
+ if (ARROW_PREDICT_FALSE(
+ arrow::internal::AddWithOverflow(state->total_len, sep_total,
&temp))) {
+ state->overflow = true;
+ return false;
}
- char* tmp = out;
- int out_idx = 0;
- bool seenAnyValidInput = false;
+ state->total_len = temp;
+ return true;
+}
- concat_word(tmp, &out_idx, word1, word1_len, word1_validity, separator,
separator_len,
- &seenAnyValidInput);
- concat_word(tmp, &out_idx, word2, word2_len, word2_validity, separator,
separator_len,
- &seenAnyValidInput);
+// Helper to handle overflow failure (sets output parameters and returns empty
string)
+static inline const char* handle_overflow_failure(bool* out_valid, int32_t*
out_len) {
+ *out_len = 0;
+ *out_valid = false;
+ return "";
+}
+// Helper to handle empty result (all words invalid)
+static inline const char* handle_empty_result(bool* out_valid, int32_t*
out_len) {
+ *out_len = 0;
*out_valid = true;
- *out_len = out_idx;
- return out;
+ return "";
}
-FORCE_INLINE
-const char* concat_ws_utf8_utf8_utf8(
- int64_t context, const char* separator, int32_t separator_len,
- bool separator_validity, const char* word1, int32_t word1_len, bool
word1_validity,
- const char* word2, int32_t word2_len, bool word2_validity, const char*
word3,
- int32_t word3_len, bool word3_validity, bool* out_valid, int32_t* out_len)
{
+struct WordArg {
+ const char* data;
+ int32_t len;
+ bool valid;
+};
+
+static inline const char* concat_ws_impl(int64_t context, const char*
separator,
+ int32_t separator_len, bool
separator_validity,
+ bool* out_valid, int32_t* out_len,
+ std::initializer_list<WordArg> words)
{
*out_len = 0;
- int numValidInput = 0;
- // If separator is null, always return null
- if (!separator_validity) {
- *out_len = 0;
+
+ // Separator validity check
+ if (not separator_validity) {
*out_valid = false;
return "";
}
-
- if (word1_validity) {
- *out_len += word1_len;
- numValidInput++;
- }
- if (word2_validity) {
- *out_len += word2_len;
- numValidInput++;
+ if (separator_len < 0) {
+ *out_valid = false;
+ return "";
}
- if (word3_validity) {
- *out_len += word3_len;
- numValidInput++;
+
+ SafeLengthState state;
+
+ // Accumulate all word lengths safely
+ for (const WordArg& w : words) {
+ if (not safe_accumulate_word(state, w.len, w.valid)) {
+ *out_len = 0;
+ *out_valid = false;
+ return "";
+ }
+ if (state.overflow) {
+ return handle_overflow_failure(out_valid, out_len);
+ }
Review Comment:
In `concat_ws_impl`, the `if (state.overflow)` branch is effectively
unreachable: `safe_accumulate_word()` returns `false` immediately when
`AddWithOverflow` triggers, so the preceding `if (not safe_accumulate_word(...))`
branch returns from the function before `state.overflow` is ever checked.
Consider removing `state.overflow` (and this check) or refactoring
`safe_accumulate_word()` / the loop to use a single overflow-handling path, to
avoid dead logic and future confusion.
```suggestion
```
##########
cpp/src/gandiva/gdv_string_function_stubs.cc:
##########
@@ -445,8 +485,12 @@ const char* gdv_fn_substring_index(int64_t context, const
char* txt, int32_t txt
return out;
} else {
+ if (txt_len < 0) {
+ *out_len = 0;
+ return "";
+ }
Review Comment:
`gdv_fn_substring_index` already rejects negative `txt_len` near the top of
the function, so the additional `if (txt_len < 0)` check in this `else` branch
is redundant/unreachable. Removing it would simplify the control flow and avoid
duplicated validation logic.
```suggestion
```
##########
cpp/src/gandiva/precompiled/string_ops_test.cc:
##########
@@ -1165,6 +1165,16 @@ TEST(TestStringOps, TestQuote) {
out_str = quote_utf8(ctx_ptr, "'''''''''", 9, &out_len);
EXPECT_EQ(std::string(out_str, out_len), "'\\'\\'\\'\\'\\'\\'\\'\\'\\''");
EXPECT_FALSE(ctx.has_error());
+
+ int32_t bad_in_len = std::numeric_limits<int32_t>::max() / 2 + 1;
+ out_str = quote_utf8(ctx_ptr, "YYZ", bad_in_len, &out_len);
+ EXPECT_EQ(out_len, 0);
+ EXPECT_STREQ(out_str, "");
+
+ bad_in_len = std::numeric_limits<int32_t>::max() / 2 + 20;
+ out_str = quote_utf8(ctx_ptr, "ABCDE", bad_in_len, &out_len);
+ EXPECT_EQ(out_len, 0);
+ EXPECT_STREQ(out_str, "");
Review Comment:
The new overflow cases for `quote_utf8` only assert that the function
returns an empty string and `out_len == 0`. Since the overflow path sets an
error message on the execution context, the test should also assert
`ctx.has_error()` (and ideally the error text), and should isolate each case
from the next (either by resetting the context afterwards or, as in the
suggestion below, by using a fresh `ExecutionContext` per case) so the
behavior is fully validated.
```suggestion
{
gandiva::ExecutionContext overflow_ctx;
uint64_t overflow_ctx_ptr = reinterpret_cast<gdv_int64>(&overflow_ctx);
out_str = quote_utf8(overflow_ctx_ptr, "YYZ", bad_in_len, &out_len);
EXPECT_EQ(out_len, 0);
EXPECT_STREQ(out_str, "");
EXPECT_TRUE(overflow_ctx.has_error());
}
bad_in_len = std::numeric_limits<int32_t>::max() / 2 + 20;
{
gandiva::ExecutionContext overflow_ctx;
uint64_t overflow_ctx_ptr = reinterpret_cast<gdv_int64>(&overflow_ctx);
out_str = quote_utf8(overflow_ctx_ptr, "ABCDE", bad_in_len, &out_len);
EXPECT_EQ(out_len, 0);
EXPECT_STREQ(out_str, "");
EXPECT_TRUE(overflow_ctx.has_error());
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org