This is an automated email from the ASF dual-hosted git repository.

Jefffrey pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new 2f923f7298 fix(arrow-cast): support full Date32 range when parsing 
extended-year dates (#9961)
2f923f7298 is described below

commit 2f923f72989ab9df0cb02c749891c5ab3093f743
Author: Swanand Mulay <[email protected]>
AuthorDate: Fri May 22 12:28:39 2026 +0530

    fix(arrow-cast): support full Date32 range when parsing extended-year dates (#9961)
    
    `Date32Type::parse` previously used `chrono::NaiveDate`, which caps at
    roughly ±262,143 years and rejected valid ISO 8601 extended-year inputs
    like `+2739877-01-03`.
    
    Because the Gregorian calendar repeats every 400-year era (146,097 days),
    we find the era containing the year, then calculate and validate the date
    within that era. We recover the absolute day count by adding
    era * 146,097.
    
    
    This approach was developed with help from Claude Code.
    
    # Which issue does this PR close?
    
    <!--
    We generally require a GitHub issue to be filed for all bug fixes and
    enhancements and this helps us generate change logs for our releases.
    You can link an issue to this PR using the GitHub syntax.
    -->
    
    - Closes #9960
    
    # Rationale for this change
    
    <!--
    Why are you proposing this change? If this is already explained clearly
    in the issue then this section is not needed.
    Explaining clearly why changes are proposed helps reviewers understand
    your changes and offer better suggestions for fixes.
    -->
    
    Supporting the full range of dates allows dependents such as delta-rs to
    parse data written or managed by other engines, such as Spark, that
    support the full Date32 range.
    
    
    Changing the `parse_date()` signature is another option, but it would
    require changes to Date64 as well.
    
    # What changes are included in this PR?
    
    <!--
    There is no need to duplicate the description in the issue here but it
    is sometimes worth providing a summary of the individual changes in this
    PR.
    -->
    
    Calculates the number of days without converting the full extended year
    to a `NaiveDate`, and adds tests for it.
    
    # Are these changes tested?
    
    <!--
    We typically require tests for all PRs in order to:
    1. Prevent the code from being accidentally broken by subsequent changes
    2. Serve as another way to document the expected behavior of the code
    
    If tests are not included in your PR, please explain why (for example,
    are they covered by existing tests)?
    
    If this PR claims a performance improvement, please include evidence
    such as benchmark results.
    -->
    
    Added new tests; existing tests are relied on to verify unchanged behavior.
    
    # Are there any user-facing changes?
    
    <!--
    If there are user-facing changes then we may require documentation to be
    updated before approving the PR.
    
    If there are any breaking changes to public APIs, please call them out.
    -->
    
    Possibly: some inputs that previously produced `None` / an error are now
    parsed successfully.
    
    Signed-off-by: Swanand Mulay <[email protected]>
---
 arrow-cast/src/parse.rs | 92 ++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 75 insertions(+), 17 deletions(-)

diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs
index a23f421c34..e7c6f90e75 100644
--- a/arrow-cast/src/parse.rs
+++ b/arrow-cast/src/parse.rs
@@ -585,6 +585,32 @@ const EPOCH_DAYS_FROM_CE: i32 = 719_163;
 /// Error message if nanosecond conversion request beyond supported interval
 const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be represented 
as nanoseconds have to be between 1677-09-21T00:12:44.0 and 
2262-04-11T23:47:16.854775804";
 
+/// Parse the ISO 8601 signed extended-year form (`±YYYY[Y...]-MM-DD`) into
+/// raw `(year, month, day)` components, without validating the calendar date.
+///
+/// The caller must have already verified that `string` begins with `+` or `-`;
+/// the year must have at least 4 digits. Returns `None` if the shape is
+/// malformed or any component fails to parse numerically.
+fn parse_extended_ymd(string: &str) -> Option<(i32, u32, u32)> {
+    debug_assert!(string.starts_with('+') || string.starts_with('-'));
+    // Skip the sign and look for the hyphen that terminates the year digits.
+    // Per ISO 8601 the unsigned year part must be at least 4 digits.
+    let rest = &string[1..];
+    let hyphen = rest.find('-')?;
+    if hyphen < 4 {
+        return None;
+    }
+    // The year substring is the sign and the digits (but not the separator),
+    // e.g. for "+10999-12-31", hyphen is 5 and s[..6] is "+10999".
+    let year: i32 = string[..hyphen + 1].parse().ok()?;
+    // The remainder should begin with a '-' which we strip off, leaving the 
month-day part.
+    let remainder = string[hyphen + 1..].strip_prefix('-')?;
+    let mut parts = remainder.splitn(2, '-');
+    let month: u32 = parts.next()?.parse().ok()?;
+    let day: u32 = parts.next()?.parse().ok()?;
+    Some((year, month, day))
+}
+
 fn parse_date(string: &str) -> Option<NaiveDate> {
     // If the date has an extended (signed) year such as "+10999-12-31" or 
"-0012-05-06"
     //
@@ -594,21 +620,7 @@ fn parse_date(string: &str) -> Option<NaiveDate> {
     //
     // [ISO 8601]: 
https://docs.oracle.com/en/java/javase/17/docs/api/java.base/java/time/format/DateTimeFormatter.html#ISO_LOCAL_DATE
     if string.starts_with('+') || string.starts_with('-') {
-        // Skip the sign and look for the hyphen that terminates the year 
digits.
-        // According to ISO 8601 the unsigned part must be at least 4 digits.
-        let rest = &string[1..];
-        let hyphen = rest.find('-')?;
-        if hyphen < 4 {
-            return None;
-        }
-        // The year substring is the sign and the digits (but not the 
separator)
-        // e.g. for "+10999-12-31", hyphen is 5 and s[..6] is "+10999"
-        let year: i32 = string[..hyphen + 1].parse().ok()?;
-        // The remainder should begin with a '-' which we strip off, leaving 
the month-day part.
-        let remainder = string[hyphen + 1..].strip_prefix('-')?;
-        let mut parts = remainder.splitn(2, '-');
-        let month: u32 = parts.next()?.parse().ok()?;
-        let day: u32 = parts.next()?.parse().ok()?;
+        let (year, month, day) = parse_extended_ymd(string)?;
         return NaiveDate::from_ymd_opt(year, month, day);
     }
 
@@ -679,10 +691,30 @@ fn parse_date(string: &str) -> Option<NaiveDate> {
     NaiveDate::from_ymd_opt(year as _, month as _, day as _)
 }
 
+/// Parse a date string into days since 1970-01-01, covering the full
+/// `Date32` range (years ≈ ±5,881,580) for the signed extended-year form.
+///
+/// The Gregorian calendar repeats exactly every 400 years (146,097 days), so
+/// we fold the year into `[0, 400)`, validate the folded date, and add
+/// `era * 146_097` to recover the absolute day count.
+///
+/// For all other inputs, behavior matches [`parse_date`].
+fn parse_date_to_days(string: &str) -> Option<i32> {
+    if string.starts_with('+') || string.starts_with('-') {
+        let (year, month, day) = parse_extended_ymd(string)?;
+        let y = year as i64;
+        let era = y.div_euclid(400);
+        let yoe = y.rem_euclid(400) as i32;
+        let nd = NaiveDate::from_ymd_opt(yoe, month, day)?;
+        let in_era = (nd.num_days_from_ce() - EPOCH_DAYS_FROM_CE) as i64;
+        return i32::try_from(era * 146_097 + in_era).ok();
+    }
+    parse_date(string).map(|nd| nd.num_days_from_ce() - EPOCH_DAYS_FROM_CE)
+}
+
 impl Parser for Date32Type {
     fn parse(string: &str) -> Option<i32> {
-        let date = parse_date(string)?;
-        Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE)
+        parse_date_to_days(string)
     }
 
     fn parse_formatted(string: &str, format: &str) -> Option<i32> {
@@ -1797,6 +1829,32 @@ mod tests {
         }
     }
 
+    #[test]
+    fn parse_date32_extended_year() {
+        // `Date32` covers any i32 days-from-epoch, verify we can parse it
+        let cases: &[(&str, i32)] = &[
+            ("+1970-01-01", 0),
+            ("+2024-01-01", 19_723),
+            ("-0001-01-01", -719_893),
+            ("+29349-01-26", 10_000_000),
+            ("+2739877-01-03", 1_000_000_000),
+            // Extremes of the Date32 representable range.
+            ("+5881580-07-11", i32::MAX),
+            ("-5877641-06-23", i32::MIN),
+        ];
+        for (input, expected) in cases {
+            assert_eq!(Date32Type::parse(input), Some(*expected), "input: 
{input}");
+        }
+
+        // One past Date32::MAX / MIN overflows i32 days-from-epoch.
+        assert_eq!(Date32Type::parse("+5881580-07-12"), None);
+        assert_eq!(Date32Type::parse("-5877641-06-22"), None);
+        // Invalid calendar dates still rejected regardless of year magnitude.
+        assert_eq!(Date32Type::parse("+2739877-02-30"), None);
+        assert_eq!(Date32Type::parse("+2739877-13-01"), None);
+        assert_eq!(Date32Type::parse("-2739877-02-30"), None);
+    }
+
     #[test]
     fn parse_time64_nanos() {
         assert_eq!(

Reply via email to