This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new c5a9953f2a Clarify interval comparison behavior with documentation and 
tests (#5192)
c5a9953f2a is described below

commit c5a9953f2a9193e9f31366219d2ee1853215a6d0
Author: Andrew Lamb <[email protected]>
AuthorDate: Fri Dec 8 13:38:12 2023 -0500

    Clarify interval comparison behavior with documentation and tests (#5192)
    
    * Clarify interval comparison behavior with documentation and tests
    
    * refine language
---
 arrow-array/src/array/primitive_array.rs |   8 ++-
 arrow-array/src/types.rs                 |  71 ++++++++++++++++++-
 arrow-ord/src/comparison.rs              | 113 +++++++++++++++++++++++++++++++
 arrow-ord/src/ord.rs                     |  67 ++++++++++++++++++
 4 files changed, 255 insertions(+), 4 deletions(-)

diff --git a/arrow-array/src/array/primitive_array.rs 
b/arrow-array/src/array/primitive_array.rs
index 1112acacfc..2296cebd46 100644
--- a/arrow-array/src/array/primitive_array.rs
+++ b/arrow-array/src/array/primitive_array.rs
@@ -352,12 +352,18 @@ pub type Time64MicrosecondArray = 
PrimitiveArray<Time64MicrosecondType>;
 pub type Time64NanosecondArray = PrimitiveArray<Time64NanosecondType>;
 
 /// A [`PrimitiveArray`] of “calendar” intervals in months
+///
+/// See [`IntervalYearMonthType`] for details on representation and caveats.
 pub type IntervalYearMonthArray = PrimitiveArray<IntervalYearMonthType>;
 
 /// A [`PrimitiveArray`] of “calendar” intervals in days and milliseconds
+///
+/// See [`IntervalDayTimeType`] for details on representation and caveats.
 pub type IntervalDayTimeArray = PrimitiveArray<IntervalDayTimeType>;
 
-/// A [`PrimitiveArray`] of “calendar” intervals in  months, days, and 
nanoseconds
+/// A [`PrimitiveArray`] of “calendar” intervals in  months, days, and 
nanoseconds.
+///
+/// See [`IntervalMonthDayNanoType`] for details on representation and caveats.
 pub type IntervalMonthDayNanoArray = PrimitiveArray<IntervalMonthDayNanoType>;
 
 /// A [`PrimitiveArray`] of elapsed durations in seconds
diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs
index 16d0e822d0..6e177838c4 100644
--- a/arrow-array/src/types.rs
+++ b/arrow-array/src/types.rs
@@ -213,19 +213,84 @@ make_type!(
     IntervalYearMonthType,
     i32,
     DataType::Interval(IntervalUnit::YearMonth),
-    "A “calendar” interval type in months."
+    "A “calendar” interval stored as the number of whole months."
 );
 make_type!(
     IntervalDayTimeType,
     i64,
     DataType::Interval(IntervalUnit::DayTime),
-    "A “calendar” interval type in days and milliseconds."
+    r#"A “calendar” interval type in days and milliseconds.
+
+## Representation
+This type is stored as a single 64 bit integer, interpreted as two i32 fields:
+1. the number of elapsed days
+2. The number of milliseconds (no leap seconds),
+
+```text
+ ┌──────────────┬──────────────┐
+ │     Days     │ Milliseconds │
+ │  (32 bits)   │  (32 bits)   │
+ └──────────────┴──────────────┘
+ 0              31            63 bit offset
+```
+Please see the [Arrow 
Spec](https://github.com/apache/arrow/blob/081b4022fe6f659d8765efc82b3f4787c5039e3c/format/Schema.fbs#L406-L408)
 for more details
+
+## Note on Comparing and Ordering for Calendar Types
+
+Values of `IntervalDayTimeType` are compared using their binary representation,
+which can lead to surprising results. Please see the description of ordering on
+[`IntervalMonthDayNanoType`] for more details
+"#
 );
 make_type!(
     IntervalMonthDayNanoType,
     i128,
     DataType::Interval(IntervalUnit::MonthDayNano),
-    "A “calendar” interval type in months, days, and nanoseconds."
+    r#"A “calendar” interval type in months, days, and nanoseconds.
+
+## Representation
+This type is stored as a single 128 bit integer,
+interpreted as three different signed integral fields:
+
+1. The number of months (32 bits)
+2. The number days (32 bits)
+2. The number of nanoseconds (64 bits).
+
+Nanoseconds does not allow for leap seconds.
+Each field is independent (e.g. there is no constraint that the quantity of
+nanoseconds represents less than a day's worth of time).
+
+```text
+┌──────────────────────────────┬─────────────┬──────────────┐
+│            Nanos             │    Days     │    Months    │
+│          (64 bits)           │ (32 bits)   │  (32 bits)   │
+└──────────────────────────────┴─────────────┴──────────────┘
+  0                            63            95           127 bit offset
+```
+Please see the [Arrow 
Spec](https://github.com/apache/arrow/blob/081b4022fe6f659d8765efc82b3f4787c5039e3c/format/Schema.fbs#L409-L415)
 for more details
+
+## Note on Comparing and Ordering for Calendar Types
+Values of `IntervalMonthDayNanoType` are compared using their binary 
representation,
+which can lead to surprising results.
+
+Spans of time measured in calendar units are not fixed in absolute size (e.g.
+number of seconds) which makes defining comparisons and ordering non trivial.
+For example `1 month` is 28 days for February but `1 month` is 31 days
+in December.
+
+This makes the seemingly simple operation of comparing two intervals
+complicated in practice. For example is `1 month` more or less than `30 days`? 
The
+answer depends on what month you are talking about.
+
+This crate defines comparisons for calendar types using their binary
+representation which is fast and efficient, but leads
+to potentially surprising results.
+
+For example a
+`IntervalMonthDayNano` of `1 month` will compare as **greater** than a
+`IntervalMonthDayNano` of `100 days` because the binary representation of `1 
month`
+is larger than the binary representation of 100 days.
+"#
 );
 make_type!(
     DurationSecondType,
diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs
index 4dbb395192..4d552b038a 100644
--- a/arrow-ord/src/comparison.rs
+++ b/arrow-ord/src/comparison.rs
@@ -1407,6 +1407,48 @@ mod tests {
             vec![6, 7, 8, 9, 10, 6, 7, 8, 9, 10],
             vec![false, false, true, false, false, false, false, true, false, 
false]
         );
+
+        cmp_vec!(
+            eq,
+            eq_dyn,
+            IntervalYearMonthArray,
+            vec![
+                IntervalYearMonthType::make_value(1, 2),
+                IntervalYearMonthType::make_value(2, 1),
+                // 1 year
+                IntervalYearMonthType::make_value(1, 0),
+            ],
+            vec![
+                IntervalYearMonthType::make_value(1, 2),
+                IntervalYearMonthType::make_value(1, 2),
+                // NB 12 months is treated as equal to a year (as the 
underlying
+                // type stores number of months)
+                IntervalYearMonthType::make_value(0, 12),
+            ],
+            vec![true, false, true]
+        );
+
+        cmp_vec!(
+            eq,
+            eq_dyn,
+            IntervalMonthDayNanoArray,
+            vec![
+                IntervalMonthDayNanoType::make_value(1, 2, 3),
+                IntervalMonthDayNanoType::make_value(3, 2, 1),
+                // 1 month
+                IntervalMonthDayNanoType::make_value(1, 0, 0),
+                IntervalMonthDayNanoType::make_value(1, 0, 0),
+            ],
+            vec![
+                IntervalMonthDayNanoType::make_value(1, 2, 3),
+                IntervalMonthDayNanoType::make_value(1, 2, 3),
+                // 30 days is not treated as a month
+                IntervalMonthDayNanoType::make_value(0, 30, 0),
+                // 100 days
+                IntervalMonthDayNanoType::make_value(0, 100, 0),
+            ],
+            vec![true, false, false, false]
+        );
     }
 
     #[test]
@@ -1660,6 +1702,77 @@ mod tests {
             vec![6, 7, 8, 9, 10, 6, 7, 8, 9, 10],
             vec![false, false, false, true, true, false, false, false, true, 
true]
         );
+
+        cmp_vec!(
+            lt,
+            lt_dyn,
+            IntervalDayTimeArray,
+            vec![
+                IntervalDayTimeType::make_value(1, 0),
+                IntervalDayTimeType::make_value(0, 1000),
+                IntervalDayTimeType::make_value(1, 1000),
+                IntervalDayTimeType::make_value(1, 3000),
+                // 90M milliseconds
+                IntervalDayTimeType::make_value(0, 90_000_000),
+            ],
+            vec![
+                IntervalDayTimeType::make_value(0, 1000),
+                IntervalDayTimeType::make_value(1, 0),
+                IntervalDayTimeType::make_value(10, 0),
+                IntervalDayTimeType::make_value(2, 1),
+                // NB even though 1 day is less than 90M milliseconds long,
+                // it compares as greater because the underlying type stores
+                // days and milliseconds as different fields
+                IntervalDayTimeType::make_value(0, 12),
+            ],
+            vec![false, true, true, true ,false]
+        );
+
+        cmp_vec!(
+            lt,
+            lt_dyn,
+            IntervalYearMonthArray,
+            vec![
+                IntervalYearMonthType::make_value(1, 2),
+                IntervalYearMonthType::make_value(2, 1),
+                IntervalYearMonthType::make_value(1, 2),
+                // 1 year
+                IntervalYearMonthType::make_value(1, 0),
+            ],
+            vec![
+                IntervalYearMonthType::make_value(1, 2),
+                IntervalYearMonthType::make_value(1, 2),
+                IntervalYearMonthType::make_value(2, 1),
+                // NB 12 months is treated as equal to a year (as the 
underlying
+                // type stores number of months)
+                IntervalYearMonthType::make_value(0, 12),
+            ],
+            vec![false, false, true, false]
+        );
+
+        cmp_vec!(
+            lt,
+            lt_dyn,
+            IntervalMonthDayNanoArray,
+            vec![
+                IntervalMonthDayNanoType::make_value(1, 2, 3),
+                IntervalMonthDayNanoType::make_value(3, 2, 1),
+                // 1 month
+                IntervalMonthDayNanoType::make_value(1, 0, 0),
+                IntervalMonthDayNanoType::make_value(1, 2, 0),
+                IntervalMonthDayNanoType::make_value(1, 0, 0),
+            ],
+            vec![
+                IntervalMonthDayNanoType::make_value(1, 2, 3),
+                IntervalMonthDayNanoType::make_value(1, 2, 3),
+                IntervalMonthDayNanoType::make_value(2, 0, 0),
+                // 30 days is not treated as a month
+                IntervalMonthDayNanoType::make_value(0, 30, 0),
+                // 100 days (note is treated as greater than 1 month as the 
underlying integer representation)
+                IntervalMonthDayNanoType::make_value(0, 100, 0),
+            ],
+            vec![false, false, true, false, false]
+        );
     }
 
     #[test]
diff --git a/arrow-ord/src/ord.rs b/arrow-ord/src/ord.rs
index 28ca07cce2..f6bd39c9cd 100644
--- a/arrow-ord/src/ord.rs
+++ b/arrow-ord/src/ord.rs
@@ -216,6 +216,73 @@ pub mod tests {
         assert_eq!(Ordering::Greater, cmp(1, 0));
     }
 
+    #[test]
+    fn test_interval_day_time() {
+        let array = IntervalDayTimeArray::from(vec![
+            // 0 days, 1 second
+            IntervalDayTimeType::make_value(0, 1000),
+            // 1 day, 2 milliseconds
+            IntervalDayTimeType::make_value(1, 2),
+            // 90M milliseconds (which is more than is in 1 day)
+            IntervalDayTimeType::make_value(0, 90_000_000),
+        ]);
+
+        let cmp = build_compare(&array, &array).unwrap();
+
+        assert_eq!(Ordering::Less, cmp(0, 1));
+        assert_eq!(Ordering::Greater, cmp(1, 0));
+
+        // somewhat confusingly, while 90M milliseconds is more than 1 day,
+        // it will compare less as the comparison is done on the underlying
+        // values not field by field
+        assert_eq!(Ordering::Greater, cmp(1, 2));
+        assert_eq!(Ordering::Less, cmp(2, 1));
+    }
+
+    #[test]
+    fn test_interval_year_month() {
+        let array = IntervalYearMonthArray::from(vec![
+            // 1 year, 0 months
+            IntervalYearMonthType::make_value(1, 0),
+            // 0 years, 13 months
+            IntervalYearMonthType::make_value(0, 13),
+            // 1 year, 1 month
+            IntervalYearMonthType::make_value(1, 1),
+        ]);
+
+        let cmp = build_compare(&array, &array).unwrap();
+
+        assert_eq!(Ordering::Less, cmp(0, 1));
+        assert_eq!(Ordering::Greater, cmp(1, 0));
+
+        // the underlying representation is months, so both quantities are the 
same
+        assert_eq!(Ordering::Equal, cmp(1, 2));
+        assert_eq!(Ordering::Equal, cmp(2, 1));
+    }
+
+    #[test]
+    fn test_interval_month_day_nano() {
+        let array = IntervalMonthDayNanoArray::from(vec![
+            // 100 days
+            IntervalMonthDayNanoType::make_value(0, 100, 0),
+            // 1 month
+            IntervalMonthDayNanoType::make_value(1, 0, 0),
+            // 100 day, 1 nanoseconds
+            IntervalMonthDayNanoType::make_value(0, 100, 2),
+        ]);
+
+        let cmp = build_compare(&array, &array).unwrap();
+
+        assert_eq!(Ordering::Less, cmp(0, 1));
+        assert_eq!(Ordering::Greater, cmp(1, 0));
+
+        // somewhat confusingly, while 100 days is more than 1 month in all 
cases
+        // it will compare less as the comparison is done on the underlying
+        // values not field by field
+        assert_eq!(Ordering::Greater, cmp(1, 2));
+        assert_eq!(Ordering::Less, cmp(2, 1));
+    }
+
     #[test]
     fn test_decimal() {
         let array = vec![Some(5_i128), Some(2_i128), Some(3_i128)]

Reply via email to