This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 794660482b Prevent overflow and panics when casting DATE to TIMESTAMP
by validating bounds (#18761)
794660482b is described below
commit 794660482bb21a9f974c6345d4b9a631009a316b
Author: kosiew <[email protected]>
AuthorDate: Wed Nov 19 21:47:01 2025 +0800
Prevent overflow and panics when casting DATE to TIMESTAMP by validating
bounds (#18761)
## Which issue does this PR close?
* Closes #17534.
## Rationale for this change
This change ensures that casting from `Date32` or `Date64` to timestamp
types behaves safely and predictably. Previously, extreme date values
(e.g., `9999-12-31`) could cause integer overflow during unit
conversion, leading to panics in debug builds and silently incorrect
results in release builds. This patch introduces explicit bounds checking
so these cases return clear, user-facing errors instead of panicking.
### Before
```
❯ cargo run --bin datafusion-cli -- --command "SELECT CAST(DATE
'9999-12-31' As timestamp);"
Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.48s
Running `target/debug/datafusion-cli --command 'SELECT CAST(DATE
'\''9999-12-31'\'' As timestamp);'`
DataFusion CLI v51.0.0
thread 'main' panicked at
/Users/kosiew/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/arrow-cast-57.0.0/src/cast/mod.rs:2079:58:
attempt to multiply with overflow
```
### After
```
❯ cargo run --bin datafusion-cli -- --command "SELECT CAST(DATE
'9999-12-31' As timestamp);"
Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.64s
Running `target/debug/datafusion-cli --command 'SELECT CAST(DATE
'\''9999-12-31'\'' As timestamp);'`
DataFusion CLI v51.0.0
Error: Optimizer rule 'simplify_expressions' failed
caused by
Execution error: Cannot cast Date32 value 2932896 to Timestamp(ns):
converted value exceeds the representable i64 range
```
## What changes are included in this PR?
* Introduces `date_to_timestamp_multiplier` and
`ensure_timestamp_in_bounds` to centralize safe conversion logic.
* Adds bounds validation for both scalar and array-based casts,
preventing overflow when multiplying date values.
* Enhances error messages with consistent timestamp type formatting.
* Integrates bounds checks into `ScalarValue::cast_to` and
`ColumnarValue::cast_to`.
* Adds comprehensive tests covering formatting, overflow detection,
scalar casts, array casts, and SQL-level behavior.
## Are these changes tested?
Yes. The PR includes new unit tests validating:
* Formatting of timestamp types in error messages.
* Overflow detection for both `Date32` and `Date64` to nanoseconds.
* Array-based overflow handling.
* SQL-level behavior for casting large date literals.
## Are there any user-facing changes?
Yes. Users now receive clear and consistent error messages when
attempting to cast dates that fall outside the representable timestamp
range (for nanosecond precision, roughly the years 1677 through 2262).
Instead of panicking or silently overflowing, the cast returns a
descriptive execution error.
## LLM-generated code disclosure
This pull request includes LLM-generated content that has been manually
reviewed and tested.
---
datafusion/common/src/scalar/mod.rs | 174 ++++++++++++++++++++++++++-
datafusion/core/tests/sql/select.rs | 17 +++
datafusion/expr-common/src/columnar_value.rs | 103 ++++++++++++++--
3 files changed, 283 insertions(+), 11 deletions(-)
diff --git a/datafusion/common/src/scalar/mod.rs
b/datafusion/common/src/scalar/mod.rs
index 787bd78b1d..3fd5a37224 100644
--- a/datafusion/common/src/scalar/mod.rs
+++ b/datafusion/common/src/scalar/mod.rs
@@ -92,6 +92,103 @@ use chrono::{Duration, NaiveDate};
use half::f16;
pub use struct_builder::ScalarStructBuilder;
+const SECONDS_PER_DAY: i64 = 86_400;
+const MILLIS_PER_DAY: i64 = SECONDS_PER_DAY * 1_000;
+const MICROS_PER_DAY: i64 = MILLIS_PER_DAY * 1_000;
+const NANOS_PER_DAY: i64 = MICROS_PER_DAY * 1_000;
+const MICROS_PER_MILLISECOND: i64 = 1_000;
+const NANOS_PER_MILLISECOND: i64 = 1_000_000;
+
+/// Returns the multiplier that converts the input date representation into the
+/// desired timestamp unit, if the conversion requires a multiplication that can
+/// overflow an `i64`.
+pub fn date_to_timestamp_multiplier(
+ source_type: &DataType,
+ target_type: &DataType,
+) -> Option<i64> {
+ let DataType::Timestamp(target_unit, _) = target_type else {
+ return None;
+ };
+
+ // Only `Timestamp` target types have a time unit; otherwise no
+ // multiplier applies (handled above). The function returns `Some(m)`
+ // when converting the `source_type` to `target_type` requires a
+ // multiplication that could overflow `i64`. It returns `None` when
+ // the conversion is a division or otherwise doesn't require a
+ // multiplication (e.g. Date64 -> Second).
+ match source_type {
+ // Date32 stores days since epoch. Converting to any timestamp
+ // unit requires multiplying by the per-day factor (seconds,
+ // milliseconds, microseconds, nanoseconds).
+ DataType::Date32 => Some(match target_unit {
+ TimeUnit::Second => SECONDS_PER_DAY,
+ TimeUnit::Millisecond => MILLIS_PER_DAY,
+ TimeUnit::Microsecond => MICROS_PER_DAY,
+ TimeUnit::Nanosecond => NANOS_PER_DAY,
+ }),
+
+ // Date64 stores milliseconds since epoch. Converting to
+ // seconds is a division (no multiplication), so return `None`.
+ // Converting to milliseconds is 1:1, so it also returns `None`.
+ // Converting to micro/nano multiplies by 1_000 / 1_000_000.
+ DataType::Date64 => match target_unit {
+ TimeUnit::Second => None,
+ // Converting Date64 (ms since epoch) to millisecond timestamps
+ // is an identity conversion and does not require multiplication.
+ // Returning `None` indicates no multiplication-based overflow
+ // check is necessary.
+ TimeUnit::Millisecond => None,
+ TimeUnit::Microsecond => Some(MICROS_PER_MILLISECOND),
+ TimeUnit::Nanosecond => Some(NANOS_PER_MILLISECOND),
+ },
+
+ _ => None,
+ }
+}
+
+/// Ensures the provided value can be represented as a timestamp with the given
+/// multiplier. Returns an [`DataFusionError::Execution`] when the converted
+/// value would overflow the timestamp range.
+pub fn ensure_timestamp_in_bounds(
+ value: i64,
+ multiplier: i64,
+ source_type: &DataType,
+ target_type: &DataType,
+) -> Result<()> {
+ if multiplier <= 1 {
+ return Ok(());
+ }
+
+ if value.checked_mul(multiplier).is_none() {
+ let target = format_timestamp_type_for_error(target_type);
+ _exec_err!(
+ "Cannot cast {} value {} to {}: converted value exceeds the
representable i64 range",
+ source_type,
+ value,
+ target
+ )
+ } else {
+ Ok(())
+ }
+}
+
+/// Format a `DataType::Timestamp` into a short, stable string used in
+/// user-facing error messages.
+pub(crate) fn format_timestamp_type_for_error(target_type: &DataType) ->
String {
+ match target_type {
+ DataType::Timestamp(unit, _) => {
+ let s = match unit {
+ TimeUnit::Second => "s",
+ TimeUnit::Millisecond => "ms",
+ TimeUnit::Microsecond => "us",
+ TimeUnit::Nanosecond => "ns",
+ };
+ format!("Timestamp({s})")
+ }
+ other => format!("{other}"),
+ }
+}
+
/// A dynamically typed, nullable single value.
///
/// While an arrow [`Array`]) stores one or more values of the same type, in a
@@ -3619,11 +3716,27 @@ impl ScalarValue {
target_type: &DataType,
cast_options: &CastOptions<'static>,
) -> Result<Self> {
+ let source_type = self.data_type();
+ if let Some(multiplier) = date_to_timestamp_multiplier(&source_type,
target_type)
+ {
+ if let Some(value) = self.date_scalar_value_as_i64() {
+ ensure_timestamp_in_bounds(value, multiplier, &source_type,
target_type)?;
+ }
+ }
+
let scalar_array = self.to_array()?;
let cast_arr = cast_with_options(&scalar_array, target_type,
cast_options)?;
ScalarValue::try_from_array(&cast_arr, 0)
}
+ fn date_scalar_value_as_i64(&self) -> Option<i64> {
+ match self {
+ ScalarValue::Date32(Some(value)) => Some(i64::from(*value)),
+ ScalarValue::Date64(Some(value)) => Some(*value),
+ _ => None,
+ }
+ }
+
fn eq_array_decimal32(
array: &ArrayRef,
index: usize,
@@ -4991,7 +5104,7 @@ mod tests {
use arrow::buffer::{Buffer, NullBuffer, OffsetBuffer};
use arrow::compute::{is_null, kernels};
use arrow::datatypes::{
- ArrowNumericType, Fields, Float64Type, DECIMAL256_MAX_PRECISION,
+ ArrowNumericType, Fields, Float64Type, TimeUnit,
DECIMAL256_MAX_PRECISION,
};
use arrow::error::ArrowError;
use arrow::util::pretty::pretty_format_columns;
@@ -5024,6 +5137,52 @@ mod tests {
assert_eq!(actual, &expected);
}
+ #[test]
+ fn test_format_timestamp_type_for_error_and_bounds() {
+ // format helper
+ let ts_ns = format_timestamp_type_for_error(&DataType::Timestamp(
+ TimeUnit::Nanosecond,
+ None,
+ ));
+ assert_eq!(ts_ns, "Timestamp(ns)");
+
+ let ts_us = format_timestamp_type_for_error(&DataType::Timestamp(
+ TimeUnit::Microsecond,
+ None,
+ ));
+ assert_eq!(ts_us, "Timestamp(us)");
+
+ // ensure_timestamp_in_bounds: Date32 non-overflow
+ let ok = ensure_timestamp_in_bounds(
+ 1000,
+ NANOS_PER_DAY,
+ &DataType::Date32,
+ &DataType::Timestamp(TimeUnit::Nanosecond, None),
+ );
+ assert!(ok.is_ok());
+
+ // Date32 overflow -- known large day value (9999-12-31 -> 2932896)
+ let err = ensure_timestamp_in_bounds(
+ 2932896,
+ NANOS_PER_DAY,
+ &DataType::Date32,
+ &DataType::Timestamp(TimeUnit::Nanosecond, None),
+ );
+ assert!(err.is_err());
+ let msg = err.unwrap_err().to_string();
+ assert!(msg.contains("Cannot cast Date32 value 2932896 to
Timestamp(ns): converted value exceeds the representable i64 range"));
+
+ // Date64 overflow for ns (millis * 1_000_000)
+ let overflow_millis: i64 = (i64::MAX / NANOS_PER_MILLISECOND) + 1;
+ let err2 = ensure_timestamp_in_bounds(
+ overflow_millis,
+ NANOS_PER_MILLISECOND,
+ &DataType::Date64,
+ &DataType::Timestamp(TimeUnit::Nanosecond, None),
+ );
+ assert!(err2.is_err());
+ }
+
#[test]
fn test_scalar_value_from_for_struct() {
let boolean = Arc::new(BooleanArray::from(vec![false]));
@@ -8605,6 +8764,19 @@ mod tests {
assert!(dense_scalar.is_null());
}
+ #[test]
+ fn cast_date_to_timestamp_overflow_returns_error() {
+ let scalar = ScalarValue::Date32(Some(i32::MAX));
+ let err = scalar
+ .cast_to(&DataType::Timestamp(TimeUnit::Nanosecond, None))
+ .expect_err("expected cast to fail");
+ assert!(
+ err.to_string()
+ .contains("converted value exceeds the representable i64
range"),
+ "unexpected error: {err}"
+ );
+ }
+
#[test]
fn null_dictionary_scalar_produces_null_dictionary_array() {
let dictionary_scalar = ScalarValue::Dictionary(
diff --git a/datafusion/core/tests/sql/select.rs
b/datafusion/core/tests/sql/select.rs
index 28f0dcb8bd..84899137e5 100644
--- a/datafusion/core/tests/sql/select.rs
+++ b/datafusion/core/tests/sql/select.rs
@@ -414,3 +414,20 @@ async fn test_select_no_projection() -> Result<()> {
");
Ok(())
}
+
+#[tokio::test]
+async fn test_select_cast_date_literal_to_timestamp_overflow() -> Result<()> {
+ let ctx = SessionContext::new();
+ let err = ctx
+ .sql("SELECT CAST(DATE '9999-12-31' AS TIMESTAMP)")
+ .await?
+ .collect()
+ .await
+ .unwrap_err();
+
+ assert_contains!(
+ err.to_string(),
+ "Cannot cast Date32 value 2932896 to Timestamp(ns): converted value
exceeds the representable i64 range"
+ );
+ Ok(())
+}
diff --git a/datafusion/expr-common/src/columnar_value.rs
b/datafusion/expr-common/src/columnar_value.rs
index d508164b6b..585b47a980 100644
--- a/datafusion/expr-common/src/columnar_value.rs
+++ b/datafusion/expr-common/src/columnar_value.rs
@@ -17,12 +17,19 @@
//! [`ColumnarValue`] represents the result of evaluating an expression.
-use arrow::array::{Array, ArrayRef, NullArray};
-use arrow::compute::{kernels, CastOptions};
-use arrow::datatypes::DataType;
-use arrow::util::pretty::pretty_format_columns;
-use datafusion_common::format::DEFAULT_CAST_OPTIONS;
-use datafusion_common::{internal_err, Result, ScalarValue};
+use arrow::{
+ array::{Array, ArrayRef, Date32Array, Date64Array, NullArray},
+ compute::{kernels, max, min, CastOptions},
+ datatypes::DataType,
+ util::pretty::pretty_format_columns,
+};
+use datafusion_common::internal_datafusion_err;
+use datafusion_common::{
+ format::DEFAULT_CAST_OPTIONS,
+ internal_err,
+ scalar::{date_to_timestamp_multiplier, ensure_timestamp_in_bounds},
+ Result, ScalarValue,
+};
use std::fmt;
use std::sync::Arc;
@@ -275,9 +282,14 @@ impl ColumnarValue {
) -> Result<ColumnarValue> {
let cast_options =
cast_options.cloned().unwrap_or(DEFAULT_CAST_OPTIONS);
match self {
- ColumnarValue::Array(array) => Ok(ColumnarValue::Array(
- kernels::cast::cast_with_options(array, cast_type,
&cast_options)?,
- )),
+ ColumnarValue::Array(array) => {
+ ensure_date_array_timestamp_bounds(array, cast_type)?;
+ Ok(ColumnarValue::Array(kernels::cast::cast_with_options(
+ array,
+ cast_type,
+ &cast_options,
+ )?))
+ }
ColumnarValue::Scalar(scalar) => Ok(ColumnarValue::Scalar(
scalar.cast_to_with_options(cast_type, &cast_options)?,
)),
@@ -285,6 +297,59 @@ impl ColumnarValue {
}
}
+fn ensure_date_array_timestamp_bounds(
+ array: &ArrayRef,
+ cast_type: &DataType,
+) -> Result<()> {
+ let source_type = array.data_type().clone();
+ let Some(multiplier) = date_to_timestamp_multiplier(&source_type,
cast_type) else {
+ return Ok(());
+ };
+
+ if multiplier <= 1 {
+ return Ok(());
+ }
+
+ // Use compute kernels to find min/max instead of iterating all elements
+ let (min_val, max_val): (Option<i64>, Option<i64>) = match &source_type {
+ DataType::Date32 => {
+ let arr = array
+ .as_any()
+ .downcast_ref::<Date32Array>()
+ .ok_or_else(|| {
+ internal_datafusion_err!(
+ "Expected Date32Array but found {}",
+ array.data_type()
+ )
+ })?;
+ (min(arr).map(|v| v as i64), max(arr).map(|v| v as i64))
+ }
+ DataType::Date64 => {
+ let arr = array
+ .as_any()
+ .downcast_ref::<Date64Array>()
+ .ok_or_else(|| {
+ internal_datafusion_err!(
+ "Expected Date64Array but found {}",
+ array.data_type()
+ )
+ })?;
+ (min(arr), max(arr))
+ }
+ _ => return Ok(()), // Not a date type, nothing to do
+ };
+
+ // Only validate the min and max values instead of all elements
+ if let Some(min) = min_val {
+ ensure_timestamp_in_bounds(min, multiplier, &source_type, cast_type)?;
+ }
+ if let Some(max) = max_val {
+ ensure_timestamp_in_bounds(max, multiplier, &source_type, cast_type)?;
+ }
+
+ Ok(())
+}
+
// Implement Display trait for ColumnarValue
impl fmt::Display for ColumnarValue {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
@@ -312,7 +377,10 @@ impl fmt::Display for ColumnarValue {
#[cfg(test)]
mod tests {
use super::*;
- use arrow::array::Int32Array;
+ use arrow::{
+ array::{Date64Array, Int32Array},
+ datatypes::TimeUnit,
+ };
#[test]
fn into_array_of_size() {
@@ -484,4 +552,19 @@ mod tests {
)
);
}
+
+ #[test]
+ fn cast_date64_array_to_timestamp_overflow() {
+ let overflow_value = i64::MAX / 1_000_000 + 1;
+ let array: ArrayRef =
Arc::new(Date64Array::from(vec![Some(overflow_value)]));
+ let value = ColumnarValue::Array(array);
+ let result =
+ value.cast_to(&DataType::Timestamp(TimeUnit::Nanosecond, None),
None);
+ let err = result.expect_err("expected overflow to be detected");
+ assert!(
+ err.to_string()
+ .contains("converted value exceeds the representable i64
range"),
+ "unexpected error: {err}"
+ );
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]