This is an automated email from the ASF dual-hosted git repository.

github-merge-queue[bot] pushed a commit to branch 
gh-readonly-queue/main/pr-22293-0add0469eb87dbb7fd9b6f78ded41ca7c8b46b6f
in repository https://gitbox.apache.org/repos/asf/datafusion.git

commit f4a66a2370894cc4b1fb6feacca6a9aaafac01e4
Author: Sean Kenneth Doherty <[email protected]>
AuthorDate: Wed May 27 13:27:47 2026 -0500

    fix: guard repeat array length overflow (#22293)
    
    ## Which issue does this PR close?
    
    - Closes #22217.
    
    ## Rationale for this change
    
    The array execution path for `repeat(string, count)` calculated
    `string.len() * count` before checking the configured string-size limit.
    For very large counts, that multiplication can overflow and panic
    instead of returning the same string-size overflow error used by the
    scalar path.
    
    ## What changes are included in this PR?
    
    - Adds checked count conversion and repeated-length calculation helpers.
    - Uses checked multiplication and checked total-capacity accumulation in
    the array path.
    - Adds Rust and sqllogictest coverage for the one-row columnar
    reproducer from the issue.
    
    ## Are these changes tested?
    
    - `cargo fmt --all`
    - `TMPDIR=/home/sean/Projects/datafusion-repeat-overflow/target/tmp
    cargo test -p datafusion-functions
    string::repeat::tests::test_repeat_string_array_overflow -- --nocapture`
    - `TMPDIR=/home/sean/Projects/datafusion-repeat-overflow/target/tmp
    cargo test --profile=ci --test sqllogictests --
    string/string_literal.slt`
    - `TMPDIR=/home/sean/Projects/datafusion-repeat-overflow/target/tmp
    cargo clippy --all-targets --all-features -- -D warnings`
    - `git diff --check`
    
    ## Are there any user-facing changes?
    
    Invalid oversized `repeat` results in the columnar path now return a
    normal DataFusion string-size overflow error instead of panicking.
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 datafusion/functions/src/string/repeat.rs          | 67 +++++++++++++++++-----
 .../test_files/string/string_literal.slt           |  4 ++
 2 files changed, 57 insertions(+), 14 deletions(-)

diff --git a/datafusion/functions/src/string/repeat.rs b/datafusion/functions/src/string/repeat.rs
index b551d2ac70..a53f1e2e4f 100644
--- a/datafusion/functions/src/string/repeat.rs
+++ b/datafusion/functions/src/string/repeat.rs
@@ -26,7 +26,9 @@ use arrow::datatypes::DataType::{LargeUtf8, Utf8, Utf8View};
 use datafusion_common::cast::as_int64_array;
 use datafusion_common::types::{NativeType, logical_int64, logical_string};
 use datafusion_common::utils::take_function_args;
-use datafusion_common::{DataFusionError, Result, ScalarValue, exec_err, internal_err};
+use datafusion_common::{
+    DataFusionError, Result, ScalarValue, exec_datafusion_err, exec_err, internal_err,
+};
 use datafusion_expr::{ColumnarValue, Documentation, Volatility};
 use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature};
 use datafusion_expr_common::signature::{Coercion, TypeSignatureClass};
@@ -166,7 +168,21 @@ fn compute_repeat(s: &str, count: i64, max_size: usize) -> Result<String> {
     if count <= 0 {
         return Ok(String::new());
     }
-    let result_len = s.len().saturating_mul(count as usize);
+    let result_len = repeat_len(s.len(), count, max_size)?;
+    debug_assert!(result_len <= max_size);
+    let count = repeat_count(count, max_size)?;
+    Ok(s.repeat(count))
+}
+
+fn repeat_len(string_len: usize, count: i64, max_size: usize) -> Result<usize> {
+    let count = repeat_count(count, max_size)?;
+    let result_len = string_len.checked_mul(count).ok_or_else(|| {
+        exec_datafusion_err!(
+            "string size overflow on repeat, max size is {}, but got {}",
+            max_size,
+            usize::MAX
+        )
+    })?;
     if result_len > max_size {
         return exec_err!(
             "string size overflow on repeat, max size is {}, but got {}",
@@ -174,7 +190,18 @@ fn compute_repeat(s: &str, count: i64, max_size: usize) -> Result<String> {
             result_len
         );
     }
-    Ok(s.repeat(count as usize))
+    Ok(result_len)
+}
+
+fn repeat_count(count: i64, max_size: usize) -> Result<usize> {
+    match usize::try_from(count) {
+        Ok(count) => Ok(count),
+        Err(_) => exec_err!(
+            "string size overflow on repeat, max size is {}, but got {}",
+            max_size,
+            usize::MAX
+        ),
+    }
 }
 
 /// Repeats string the specified number of times.
@@ -227,22 +254,22 @@ fn calculate_capacities<'a, S>(
 where
     S: StringArrayType<'a>,
 {
-    let mut total_capacity = 0;
-    let mut max_item_capacity = 0;
+    let mut total_capacity = 0usize;
+    let mut max_item_capacity = 0usize;
 
     string_array.iter().zip(number_array.iter()).try_for_each(
         |(string, number)| -> Result<(), DataFusionError> {
             match (string, number) {
                 (Some(string), Some(number)) if number >= 0 => {
-                    let item_capacity = string.len() * number as usize;
-                    if item_capacity > max_str_len {
-                        return exec_err!(
-                            "string size overflow on repeat, max size is {}, but got {}",
-                            max_str_len,
-                            number as usize * string.len()
-                        );
-                    }
-                    total_capacity += item_capacity;
+                    let item_capacity = repeat_len(string.len(), number, max_str_len)?;
+                    total_capacity =
+                        total_capacity.checked_add(item_capacity).ok_or_else(|| {
+                            exec_datafusion_err!(
+                                "string size overflow on repeat, max size is {}, but got {}",
+                                max_str_len,
+                                usize::MAX
+                            )
+                        })?;
                     max_item_capacity = max_item_capacity.max(item_capacity);
                 }
                 _ => (),
@@ -487,6 +514,18 @@ mod tests {
         assert_sliced_offset_output::<StringArray>(result);
     }
 
+    #[test]
+    fn test_repeat_string_array_overflow() {
+        let strings: ArrayRef = Arc::new(StringArray::from(vec![Some("abc")]));
+        let counts: ArrayRef = Arc::new(Int64Array::from(vec![Some(i64::MAX)]));
+
+        let err = super::repeat(&strings, &counts).unwrap_err().to_string();
+        assert!(
+            err.contains("string size overflow on repeat"),
+            "unexpected error: {err}"
+        );
+    }
+
     #[test]
     fn test_repeat_sliced_large_string_with_null_offset() {
         let (strings, counts) =
diff --git a/datafusion/sqllogictest/test_files/string/string_literal.slt b/datafusion/sqllogictest/test_files/string/string_literal.slt
index 97f2a40c13..d7547bf145 100644
--- a/datafusion/sqllogictest/test_files/string/string_literal.slt
+++ b/datafusion/sqllogictest/test_files/string/string_literal.slt
@@ -391,6 +391,10 @@ SELECT repeat(arrow_cast('foo', 'Dictionary(Int32, Utf8)'), 3)
 ----
 foofoofoo
 
+query error DataFusion error: Execution error: string size overflow on repeat, max size is 2147483647, but got \d+
+SELECT repeat(x, 9223372036854775807)
+FROM (VALUES ('abc')) AS t(x);
+
 query T
 SELECT arrow_typeof(repeat('foo', 3))
 ----


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to