Copilot commented on code in PR #49813:
URL: https://github.com/apache/arrow/pull/49813#discussion_r3291445715
##########
cpp/src/gandiva/gdv_string_function_stubs.cc:
##########
@@ -751,6 +800,11 @@ const char* translate_utf8_utf8_utf8(int64_t context,
const char* in, int32_t in
break;
}
+ // Updating len to char in this position
+ len_char_from = gdv_fn_utf8_char_length(from[from_for]);
+ // Making copy to std::string with length for this char position
+ std::string copy_from_compare(from + from_for, len_char_from);
+
Review Comment:
The inner multibyte loop over `from` advances by `len_char_from`
(`from_for += len_char_from`), and `len_char_from` is computed via
`gdv_fn_utf8_char_length(from[from_for])`, which returns 0 for invalid leading
bytes. If `from` contains an invalid byte with the high bit set, `from_for`
never advances and the loop does not terminate. Add validation for
`len_char_from <= 0` (and similarly for `len_char_to`) to avoid infinite loops
and to report invalid UTF-8.
##########
cpp/src/gandiva/gdv_string_function_stubs.cc:
##########
@@ -367,6 +396,15 @@ const char* gdv_fn_substring_index(int64_t context, const
char* txt, int32_t txt
return "";
}
+ if (ARROW_PREDICT_FALSE(txt_len < 0)) {
+ *out_len = 0;
+ return "";
+ }
+ if (ARROW_PREDICT_FALSE(pat_len < 0)) {
Review Comment:
`gdv_fn_substring_index` now guards against negative `txt_len` / `pat_len`
by returning an empty string, but it doesn't set an error message. This differs
from other negative-length checks in this file (e.g., cast macros /
`is_datalen_valid`) that set a context error, and makes invalid inputs fail
silently. Consider calling `gdv_fn_context_set_error_msg` for negative lengths
so callers can detect misuse.
##########
cpp/src/gandiva/precompiled/string_ops.cc:
##########
@@ -2444,182 +2462,178 @@ void concat_word(char* out_buf, int* out_idx, const
char* in_buf, int in_len,
*out_idx += in_len;
}
-FORCE_INLINE
-const char* concat_ws_utf8_utf8(int64_t context, const char* separator,
- int32_t separator_len, bool separator_validity,
- const char* word1, int32_t word1_len, bool
word1_validity,
- const char* word2, int32_t word2_len, bool
word2_validity,
- bool* out_valid, int32_t* out_len) {
- *out_len = 0;
- int numValidInput = 0;
- // If separator is null, always return null
- if (!separator_validity) {
- *out_len = 0;
- *out_valid = false;
- return "";
- }
+// Helper structure to maintain state during safe length accumulation
+struct SafeLengthState {
+ int32_t total_len = 0;
+ int32_t num_valid = 0;
+ bool overflow = false;
+};
+
+// Helper to safely add a word length
+static inline bool safe_accumulate_word(int64_t context, SafeLengthState&
state,
+ int32_t word_len, bool word_validity) {
+ if (!word_validity) return true;
- if (word1_validity) {
- *out_len += word1_len;
- numValidInput++;
+ if (word_len < 0) {
+ gdv_fn_context_set_error_msg(context, "Invalid word length.");
+ return false;
}
- if (word2_validity) {
- *out_len += word2_len;
- numValidInput++;
+
+ int32_t temp = 0;
+ if (ARROW_PREDICT_FALSE(
+ arrow::internal::AddWithOverflow(state.total_len, word_len, &temp)))
{
+ gdv_fn_context_set_error_msg(context, "Overflow in addition detected.");
+ state.overflow = true;
+ return false;
}
+ state.total_len = temp;
+ state.num_valid++;
+ return true;
+}
- *out_len += separator_len * (numValidInput > 1 ? numValidInput - 1 : 0);
- if (*out_len == 0) {
- *out_valid = true;
- return "";
+// Helper to safely add separators based on number of valid words
+static inline bool safe_add_separators(int64_t context, SafeLengthState* state,
+ int32_t separator_len) {
+ if (state->num_valid <= 1) return true;
+
+ int32_t sep_total = 0;
+ int32_t temp = 0;
+
+ if (ARROW_PREDICT_FALSE(arrow::internal::MultiplyWithOverflow(
+ separator_len, state->num_valid - 1, &sep_total))) {
+ gdv_fn_context_set_error_msg(context, "Overflow in multiplication
detected.");
+ state->overflow = true;
+ return false;
}
- char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context,
*out_len));
- if (out == nullptr) {
- gdv_fn_context_set_error_msg(context, "Could not allocate memory for
output string");
- *out_len = 0;
- *out_valid = false;
- return "";
+ if (ARROW_PREDICT_FALSE(
+ arrow::internal::AddWithOverflow(state->total_len, sep_total,
&temp))) {
+ gdv_fn_context_set_error_msg(context, "Overflow in addition detected.");
+ state->overflow = true;
+ return false;
}
- char* tmp = out;
- int out_idx = 0;
- bool seenAnyValidInput = false;
+ state->total_len = temp;
+ return true;
+}
- concat_word(tmp, &out_idx, word1, word1_len, word1_validity, separator,
separator_len,
- &seenAnyValidInput);
- concat_word(tmp, &out_idx, word2, word2_len, word2_validity, separator,
separator_len,
- &seenAnyValidInput);
+// Helper to handle overflow failure (sets output parameters and returns empty
string)
+static inline const char* handle_overflow_failure(bool* out_valid, int32_t*
out_len) {
+ *out_len = 0;
+ *out_valid = false;
+ return "";
+}
+// Helper to handle empty result (all words invalid)
+static inline const char* handle_empty_result(bool* out_valid, int32_t*
out_len) {
+ *out_len = 0;
*out_valid = true;
- *out_len = out_idx;
- return out;
+ return "";
}
-FORCE_INLINE
-const char* concat_ws_utf8_utf8_utf8(
- int64_t context, const char* separator, int32_t separator_len,
- bool separator_validity, const char* word1, int32_t word1_len, bool
word1_validity,
- const char* word2, int32_t word2_len, bool word2_validity, const char*
word3,
- int32_t word3_len, bool word3_validity, bool* out_valid, int32_t* out_len)
{
+struct WordArg {
+ const char* data;
+ int32_t len;
+ bool valid;
+};
+
+static inline const char* concat_ws_impl(int64_t context, const char*
separator,
+ int32_t separator_len, bool
separator_validity,
+ bool* out_valid, int32_t* out_len,
+ std::initializer_list<WordArg> words)
{
*out_len = 0;
- int numValidInput = 0;
- // If separator is null, always return null
+
+ // Separator validity check
if (!separator_validity) {
- *out_len = 0;
*out_valid = false;
return "";
}
-
- if (word1_validity) {
- *out_len += word1_len;
- numValidInput++;
- }
- if (word2_validity) {
- *out_len += word2_len;
- numValidInput++;
+ if (separator_len < 0) {
Review Comment:
`concat_ws_impl` treats a negative `separator_len` as invalid and returns
`out_valid=false`, but it doesn't set an error message on the execution
context. This makes negative-length failures silent, unlike the other
invalid-length/overflow paths here that call `gdv_fn_context_set_error_msg`.
Consider setting a clear error (and keeping `*out_len = 0`) when `separator_len
< 0` so callers/tests can distinguish invalid input from NULL separator
handling.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org
For queries about this service, please contact Infrastructure at:
users@infra.apache.org