This is an automated email from the ASF dual-hosted git repository. github-merge-queue[bot] pushed a commit to branch gh-readonly-queue/main/pr-22293-0add0469eb87dbb7fd9b6f78ded41ca7c8b46b6f in repository https://gitbox.apache.org/repos/asf/datafusion.git
commit f4a66a2370894cc4b1fb6feacca6a9aaafac01e4
Author: Sean Kenneth Doherty <[email protected]>
AuthorDate: Wed May 27 13:27:47 2026 -0500

fix: guard repeat array length overflow (#22293)

## Which issue does this PR close?

- Closes #22217.

## Rationale for this change

The array execution path for `repeat(string, count)` calculated `string.len() * count` before checking the configured string-size limit. For very large counts, that multiplication can overflow and panic instead of returning the same string-size overflow error used by the scalar path.

## What changes are included in this PR?

- Adds checked count conversion and repeated-length calculation helpers.
- Uses checked multiplication and checked total-capacity accumulation in the array path.
- Adds Rust and sqllogictest coverage for the one-row columnar reproducer from the issue.

## Are these changes tested?

- `cargo fmt --all`
- `TMPDIR=/home/sean/Projects/datafusion-repeat-overflow/target/tmp cargo test -p datafusion-functions string::repeat::tests::test_repeat_string_array_overflow -- --nocapture`
- `TMPDIR=/home/sean/Projects/datafusion-repeat-overflow/target/tmp cargo test --profile=ci --test sqllogictests -- string/string_literal.slt`
- `TMPDIR=/home/sean/Projects/datafusion-repeat-overflow/target/tmp cargo clippy --all-targets --all-features -- -D warnings`
- `git diff --check`

## Are there any user-facing changes?

Invalid oversized `repeat` results in the columnar path now return a normal DataFusion string-size overflow error instead of panicking.
--------- Co-authored-by: Andrew Lamb <[email protected]> --- datafusion/functions/src/string/repeat.rs | 67 +++++++++++++++++----- .../test_files/string/string_literal.slt | 4 ++ 2 files changed, 57 insertions(+), 14 deletions(-) diff --git a/datafusion/functions/src/string/repeat.rs b/datafusion/functions/src/string/repeat.rs index b551d2ac70..a53f1e2e4f 100644 --- a/datafusion/functions/src/string/repeat.rs +++ b/datafusion/functions/src/string/repeat.rs @@ -26,7 +26,9 @@ use arrow::datatypes::DataType::{LargeUtf8, Utf8, Utf8View}; use datafusion_common::cast::as_int64_array; use datafusion_common::types::{NativeType, logical_int64, logical_string}; use datafusion_common::utils::take_function_args; -use datafusion_common::{DataFusionError, Result, ScalarValue, exec_err, internal_err}; +use datafusion_common::{ + DataFusionError, Result, ScalarValue, exec_datafusion_err, exec_err, internal_err, +}; use datafusion_expr::{ColumnarValue, Documentation, Volatility}; use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature}; use datafusion_expr_common::signature::{Coercion, TypeSignatureClass}; @@ -166,7 +168,21 @@ fn compute_repeat(s: &str, count: i64, max_size: usize) -> Result<String> { if count <= 0 { return Ok(String::new()); } - let result_len = s.len().saturating_mul(count as usize); + let result_len = repeat_len(s.len(), count, max_size)?; + debug_assert!(result_len <= max_size); + let count = repeat_count(count, max_size)?; + Ok(s.repeat(count)) +} + +fn repeat_len(string_len: usize, count: i64, max_size: usize) -> Result<usize> { + let count = repeat_count(count, max_size)?; + let result_len = string_len.checked_mul(count).ok_or_else(|| { + exec_datafusion_err!( + "string size overflow on repeat, max size is {}, but got {}", + max_size, + usize::MAX + ) + })?; if result_len > max_size { return exec_err!( "string size overflow on repeat, max size is {}, but got {}", @@ -174,7 +190,18 @@ fn compute_repeat(s: &str, count: i64, max_size: usize) -> 
Result<String> { result_len ); } - Ok(s.repeat(count as usize)) + Ok(result_len) +} + +fn repeat_count(count: i64, max_size: usize) -> Result<usize> { + match usize::try_from(count) { + Ok(count) => Ok(count), + Err(_) => exec_err!( + "string size overflow on repeat, max size is {}, but got {}", + max_size, + usize::MAX + ), + } } /// Repeats string the specified number of times. @@ -227,22 +254,22 @@ fn calculate_capacities<'a, S>( where S: StringArrayType<'a>, { - let mut total_capacity = 0; - let mut max_item_capacity = 0; + let mut total_capacity = 0usize; + let mut max_item_capacity = 0usize; string_array.iter().zip(number_array.iter()).try_for_each( |(string, number)| -> Result<(), DataFusionError> { match (string, number) { (Some(string), Some(number)) if number >= 0 => { - let item_capacity = string.len() * number as usize; - if item_capacity > max_str_len { - return exec_err!( - "string size overflow on repeat, max size is {}, but got {}", - max_str_len, - number as usize * string.len() - ); - } - total_capacity += item_capacity; + let item_capacity = repeat_len(string.len(), number, max_str_len)?; + total_capacity = + total_capacity.checked_add(item_capacity).ok_or_else(|| { + exec_datafusion_err!( + "string size overflow on repeat, max size is {}, but got {}", + max_str_len, + usize::MAX + ) + })?; max_item_capacity = max_item_capacity.max(item_capacity); } _ => (), @@ -487,6 +514,18 @@ mod tests { assert_sliced_offset_output::<StringArray>(result); } + #[test] + fn test_repeat_string_array_overflow() { + let strings: ArrayRef = Arc::new(StringArray::from(vec![Some("abc")])); + let counts: ArrayRef = Arc::new(Int64Array::from(vec![Some(i64::MAX)])); + + let err = super::repeat(&strings, &counts).unwrap_err().to_string(); + assert!( + err.contains("string size overflow on repeat"), + "unexpected error: {err}" + ); + } + #[test] fn test_repeat_sliced_large_string_with_null_offset() { let (strings, counts) = diff --git 
a/datafusion/sqllogictest/test_files/string/string_literal.slt b/datafusion/sqllogictest/test_files/string/string_literal.slt index 97f2a40c13..d7547bf145 100644 --- a/datafusion/sqllogictest/test_files/string/string_literal.slt +++ b/datafusion/sqllogictest/test_files/string/string_literal.slt @@ -391,6 +391,10 @@ SELECT repeat(arrow_cast('foo', 'Dictionary(Int32, Utf8)'), 3) ---- foofoofoo +query error DataFusion error: Execution error: string size overflow on repeat, max size is 2147483647, but got \d+ +SELECT repeat(x, 9223372036854775807) +FROM (VALUES ('abc')) AS t(x); + query T SELECT arrow_typeof(repeat('foo', 3)) ---- --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
