This is an automated email from the ASF dual-hosted git repository.
Jefffrey pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 2f923f7298 fix(arrow-cast): support full Date32 range when parsing
extended-year dates (#9961)
2f923f7298 is described below
commit 2f923f72989ab9df0cb02c749891c5ab3093f743
Author: Swanand Mulay <[email protected]>
AuthorDate: Fri May 22 12:28:39 2026 +0530
fix(arrow-cast): support full Date32 range when parsing extended-year dates
(#9961)
`Date32Type::parse` previously used `chrono::NaiveDate`, which caps at
roughly ±262,143 years and therefore rejected valid ISO 8601
extended-year inputs such as `+2739877-01-03`.
Because the Gregorian calendar repeats exactly every 400-year era
(146,097 days), we find the era that contains the year, then compute and
validate the date within that era. We recover the absolute day count by
adding `era * 146,097`.
Claude Code was used to help come up with this approach.
# Which issue does this PR close?
<!--
We generally require a GitHub issue to be filed for all bug fixes and
enhancements and this helps us generate change logs for our releases.
You can link an issue to this PR using the GitHub syntax.
-->
- Closes #9960
# Rationale for this change
<!--
Why are you proposing this change? If this is already explained clearly
in the issue then this section is not needed.
Explaining clearly why changes are proposed helps reviewers understand
your changes and offer better suggestions for fixes.
-->
Supporting the full range of dates allows dependents such as delta-rs to
parse data written or managed by other engines, such as Spark, that
support the full Date32 range.
Changing the `parse_date()` signature is another option, but it would
require changes to Date64 as well.
# What changes are included in this PR?
<!--
There is no need to duplicate the description in the issue here but it
is sometimes worth providing a summary of the individual changes in this
PR.
-->
Calculate the number of days without converting the full extended year
to a `NaiveDate`, and add tests for it.
# Are these changes tested?
<!--
We typically require tests for all PRs in order to:
1. Prevent the code from being accidentally broken by subsequent changes
2. Serve as another way to document the expected behavior of the code
If tests are not included in your PR, please explain why (for example,
are they covered by existing tests)?
If this PR claims a performance improvement, please include evidence
such as benchmark results.
-->
Added new tests; existing tests are also relied on for verification.
# Are there any user-facing changes?
<!--
If there are user-facing changes then we may require documentation to be
updated before approving the PR.
If there are any breaking changes to public APIs, please call them out.
-->
Possibly: some inputs that previously produced `None` / an error are now
parsed successfully.
Signed-off-by: Swanand Mulay <[email protected]>
---
arrow-cast/src/parse.rs | 92 ++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 75 insertions(+), 17 deletions(-)
diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs
index a23f421c34..e7c6f90e75 100644
--- a/arrow-cast/src/parse.rs
+++ b/arrow-cast/src/parse.rs
@@ -585,6 +585,32 @@ const EPOCH_DAYS_FROM_CE: i32 = 719_163;
/// Error message if nanosecond conversion request beyond supported interval
const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented
as nanoseconds have to be between 1677-09-21T00:12:44.0 and
2262-04-11T23:47:16.854775804";
+/// Parse the ISO 8601 signed extended-year form (`±YYYY[Y...]-MM-DD`) into
+/// raw `(year, month, day)` components, without validating the calendar date.
+///
+/// The caller must have already verified that `string` begins with `+` or `-`;
+/// the year must have at least 4 digits. Returns `None` if the shape is
+/// malformed or any component fails to parse numerically.
+fn parse_extended_ymd(string: &str) -> Option<(i32, u32, u32)> {
+ debug_assert!(string.starts_with('+') || string.starts_with('-'));
+ // Skip the sign and look for the hyphen that terminates the year digits.
+ // Per ISO 8601 the unsigned year part must be at least 4 digits.
+ let rest = &string[1..];
+ let hyphen = rest.find('-')?;
+ if hyphen < 4 {
+ return None;
+ }
+ // The year substring is the sign and the digits (but not the separator),
+ // e.g. for "+10999-12-31", hyphen is 5 and s[..6] is "+10999".
+ let year: i32 = string[..hyphen + 1].parse().ok()?;
+ // The remainder should begin with a '-' which we strip off, leaving the
month-day part.
+ let remainder = string[hyphen + 1..].strip_prefix('-')?;
+ let mut parts = remainder.splitn(2, '-');
+ let month: u32 = parts.next()?.parse().ok()?;
+ let day: u32 = parts.next()?.parse().ok()?;
+ Some((year, month, day))
+}
+
fn parse_date(string: &str) -> Option<NaiveDate> {
// If the date has an extended (signed) year such as "+10999-12-31" or
"-0012-05-06"
//
@@ -594,21 +620,7 @@ fn parse_date(string: &str) -> Option<NaiveDate> {
//
// [ISO 8601]:
https://docs.oracle.com/en/java/javase/17/docs/api/java.base/java/time/format/DateTimeFormatter.html#ISO_LOCAL_DATE
if string.starts_with('+') || string.starts_with('-') {
- // Skip the sign and look for the hyphen that terminates the year
digits.
- // According to ISO 8601 the unsigned part must be at least 4 digits.
- let rest = &string[1..];
- let hyphen = rest.find('-')?;
- if hyphen < 4 {
- return None;
- }
- // The year substring is the sign and the digits (but not the
separator)
- // e.g. for "+10999-12-31", hyphen is 5 and s[..6] is "+10999"
- let year: i32 = string[..hyphen + 1].parse().ok()?;
- // The remainder should begin with a '-' which we strip off, leaving
the month-day part.
- let remainder = string[hyphen + 1..].strip_prefix('-')?;
- let mut parts = remainder.splitn(2, '-');
- let month: u32 = parts.next()?.parse().ok()?;
- let day: u32 = parts.next()?.parse().ok()?;
+ let (year, month, day) = parse_extended_ymd(string)?;
return NaiveDate::from_ymd_opt(year, month, day);
}
@@ -679,10 +691,30 @@ fn parse_date(string: &str) -> Option<NaiveDate> {
NaiveDate::from_ymd_opt(year as _, month as _, day as _)
}
+/// Parse a date string into days since 1970-01-01, covering the full
+/// `Date32` range (years ≈ ±5,881,580) for the signed extended-year form.
+///
+/// The Gregorian calendar repeats exactly every 400 years (146,097 days), so
+/// we fold the year into `[0, 400)`, validate the folded date, and add
+/// `era * 146_097` to recover the absolute day count.
+///
+/// For all other inputs, behavior matches [`parse_date`].
+fn parse_date_to_days(string: &str) -> Option<i32> {
+ if string.starts_with('+') || string.starts_with('-') {
+ let (year, month, day) = parse_extended_ymd(string)?;
+ let y = year as i64;
+ let era = y.div_euclid(400);
+ let yoe = y.rem_euclid(400) as i32;
+ let nd = NaiveDate::from_ymd_opt(yoe, month, day)?;
+ let in_era = (nd.num_days_from_ce() - EPOCH_DAYS_FROM_CE) as i64;
+ return i32::try_from(era * 146_097 + in_era).ok();
+ }
+ parse_date(string).map(|nd| nd.num_days_from_ce() - EPOCH_DAYS_FROM_CE)
+}
+
impl Parser for Date32Type {
fn parse(string: &str) -> Option<i32> {
- let date = parse_date(string)?;
- Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE)
+ parse_date_to_days(string)
}
fn parse_formatted(string: &str, format: &str) -> Option<i32> {
@@ -1797,6 +1829,32 @@ mod tests {
}
}
+ #[test]
+ fn parse_date32_extended_year() {
+ // `Date32` covers any i32 days-from-epoch, verify we can parse it
+ let cases: &[(&str, i32)] = &[
+ ("+1970-01-01", 0),
+ ("+2024-01-01", 19_723),
+ ("-0001-01-01", -719_893),
+ ("+29349-01-26", 10_000_000),
+ ("+2739877-01-03", 1_000_000_000),
+ // Extremes of the Date32 representable range.
+ ("+5881580-07-11", i32::MAX),
+ ("-5877641-06-23", i32::MIN),
+ ];
+ for (input, expected) in cases {
+ assert_eq!(Date32Type::parse(input), Some(*expected), "input:
{input}");
+ }
+
+ // One past Date32::MAX / MIN overflows i32 days-from-epoch.
+ assert_eq!(Date32Type::parse("+5881580-07-12"), None);
+ assert_eq!(Date32Type::parse("-5877641-06-22"), None);
+ // Invalid calendar dates still rejected regardless of year magnitude.
+ assert_eq!(Date32Type::parse("+2739877-02-30"), None);
+ assert_eq!(Date32Type::parse("+2739877-13-01"), None);
+ assert_eq!(Date32Type::parse("-2739877-02-30"), None);
+ }
+
#[test]
fn parse_time64_nanos() {
assert_eq!(