This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new c5a9953f2a Clarify interval comparison behavior with documentation and
tests (#5192)
c5a9953f2a is described below
commit c5a9953f2a9193e9f31366219d2ee1853215a6d0
Author: Andrew Lamb <[email protected]>
AuthorDate: Fri Dec 8 13:38:12 2023 -0500
Clarify interval comparison behavior with documentation and tests (#5192)
* Clarify interval comparison behavior with documentation and tests
* refine language
---
arrow-array/src/array/primitive_array.rs | 8 ++-
arrow-array/src/types.rs | 71 ++++++++++++++++++-
arrow-ord/src/comparison.rs | 113 +++++++++++++++++++++++++++++++
arrow-ord/src/ord.rs | 67 ++++++++++++++++++
4 files changed, 255 insertions(+), 4 deletions(-)
diff --git a/arrow-array/src/array/primitive_array.rs
b/arrow-array/src/array/primitive_array.rs
index 1112acacfc..2296cebd46 100644
--- a/arrow-array/src/array/primitive_array.rs
+++ b/arrow-array/src/array/primitive_array.rs
@@ -352,12 +352,18 @@ pub type Time64MicrosecondArray =
PrimitiveArray<Time64MicrosecondType>;
pub type Time64NanosecondArray = PrimitiveArray<Time64NanosecondType>;
/// A [`PrimitiveArray`] of “calendar” intervals in months
+///
+/// See [`IntervalYearMonthType`] for details on representation and caveats.
pub type IntervalYearMonthArray = PrimitiveArray<IntervalYearMonthType>;
/// A [`PrimitiveArray`] of “calendar” intervals in days and milliseconds
+///
+/// See [`IntervalDayTimeType`] for details on representation and caveats.
pub type IntervalDayTimeArray = PrimitiveArray<IntervalDayTimeType>;
-/// A [`PrimitiveArray`] of “calendar” intervals in months, days, and
nanoseconds
+/// A [`PrimitiveArray`] of “calendar” intervals in months, days, and
nanoseconds.
+///
+/// See [`IntervalMonthDayNanoType`] for details on representation and caveats.
pub type IntervalMonthDayNanoArray = PrimitiveArray<IntervalMonthDayNanoType>;
/// A [`PrimitiveArray`] of elapsed durations in seconds
diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs
index 16d0e822d0..6e177838c4 100644
--- a/arrow-array/src/types.rs
+++ b/arrow-array/src/types.rs
@@ -213,19 +213,84 @@ make_type!(
IntervalYearMonthType,
i32,
DataType::Interval(IntervalUnit::YearMonth),
- "A “calendar” interval type in months."
+ "A “calendar” interval stored as the number of whole months."
);
make_type!(
IntervalDayTimeType,
i64,
DataType::Interval(IntervalUnit::DayTime),
- "A “calendar” interval type in days and milliseconds."
+ r#"A “calendar” interval type in days and milliseconds.
+
+## Representation
+This type is stored as a single 64 bit integer, interpreted as two i32 fields:
+1. the number of elapsed days
+2. The number of milliseconds (no leap seconds),
+
+```text
+ ┌──────────────┬──────────────┐
+ │ Days │ Milliseconds │
+ │ (32 bits) │ (32 bits) │
+ └──────────────┴──────────────┘
+ 0 31 63 bit offset
+```
+Please see the [Arrow
Spec](https://github.com/apache/arrow/blob/081b4022fe6f659d8765efc82b3f4787c5039e3c/format/Schema.fbs#L406-L408)
for more details
+
+## Note on Comparing and Ordering for Calendar Types
+
+Values of `IntervalDayTimeType` are compared using their binary representation,
+which can lead to surprising results. Please see the description of ordering on
+[`IntervalMonthDayNanoType`] for more details
+"#
);
make_type!(
IntervalMonthDayNanoType,
i128,
DataType::Interval(IntervalUnit::MonthDayNano),
- "A “calendar” interval type in months, days, and nanoseconds."
+ r#"A “calendar” interval type in months, days, and nanoseconds.
+
+## Representation
+This type is stored as a single 128 bit integer,
+interpreted as three different signed integral fields:
+
+1. The number of months (32 bits)
+2. The number days (32 bits)
+2. The number of nanoseconds (64 bits).
+
+Nanoseconds does not allow for leap seconds.
+Each field is independent (e.g. there is no constraint that the quantity of
+nanoseconds represents less than a day's worth of time).
+
+```text
+┌──────────────────────────────┬─────────────┬──────────────┐
+│ Nanos │ Days │ Months │
+│ (64 bits) │ (32 bits) │ (32 bits) │
+└──────────────────────────────┴─────────────┴──────────────┘
+ 0 63 95 127 bit offset
+```
+Please see the [Arrow
Spec](https://github.com/apache/arrow/blob/081b4022fe6f659d8765efc82b3f4787c5039e3c/format/Schema.fbs#L409-L415)
for more details
+
+## Note on Comparing and Ordering for Calendar Types
+Values of `IntervalMonthDayNanoType` are compared using their binary
representation,
+which can lead to surprising results.
+
+Spans of time measured in calendar units are not fixed in absolute size (e.g.
+number of seconds) which makes defining comparisons and ordering non trivial.
+For example `1 month` is 28 days for February but `1 month` is 31 days
+in December.
+
+This makes the seemingly simple operation of comparing two intervals
+complicated in practice. For example is `1 month` more or less than `30 days`?
The
+answer depends on what month you are talking about.
+
+This crate defines comparisons for calendar types using their binary
+representation which is fast and efficient, but leads
+to potentially surprising results.
+
+For example a
+`IntervalMonthDayNano` of `1 month` will compare as **greater** than a
+`IntervalMonthDayNano` of `100 days` because the binary representation of `1
month`
+is larger than the binary representation of 100 days.
+"#
);
make_type!(
DurationSecondType,
diff --git a/arrow-ord/src/comparison.rs b/arrow-ord/src/comparison.rs
index 4dbb395192..4d552b038a 100644
--- a/arrow-ord/src/comparison.rs
+++ b/arrow-ord/src/comparison.rs
@@ -1407,6 +1407,48 @@ mod tests {
vec![6, 7, 8, 9, 10, 6, 7, 8, 9, 10],
vec![false, false, true, false, false, false, false, true, false,
false]
);
+
+ cmp_vec!(
+ eq,
+ eq_dyn,
+ IntervalYearMonthArray,
+ vec![
+ IntervalYearMonthType::make_value(1, 2),
+ IntervalYearMonthType::make_value(2, 1),
+ // 1 year
+ IntervalYearMonthType::make_value(1, 0),
+ ],
+ vec![
+ IntervalYearMonthType::make_value(1, 2),
+ IntervalYearMonthType::make_value(1, 2),
+ // NB 12 months is treated as equal to a year (as the
underlying
+ // type stores number of months)
+ IntervalYearMonthType::make_value(0, 12),
+ ],
+ vec![true, false, true]
+ );
+
+ cmp_vec!(
+ eq,
+ eq_dyn,
+ IntervalMonthDayNanoArray,
+ vec![
+ IntervalMonthDayNanoType::make_value(1, 2, 3),
+ IntervalMonthDayNanoType::make_value(3, 2, 1),
+ // 1 month
+ IntervalMonthDayNanoType::make_value(1, 0, 0),
+ IntervalMonthDayNanoType::make_value(1, 0, 0),
+ ],
+ vec![
+ IntervalMonthDayNanoType::make_value(1, 2, 3),
+ IntervalMonthDayNanoType::make_value(1, 2, 3),
+ // 30 days is not treated as a month
+ IntervalMonthDayNanoType::make_value(0, 30, 0),
+ // 100 days
+ IntervalMonthDayNanoType::make_value(0, 100, 0),
+ ],
+ vec![true, false, false, false]
+ );
}
#[test]
@@ -1660,6 +1702,77 @@ mod tests {
vec![6, 7, 8, 9, 10, 6, 7, 8, 9, 10],
vec![false, false, false, true, true, false, false, false, true,
true]
);
+
+ cmp_vec!(
+ lt,
+ lt_dyn,
+ IntervalDayTimeArray,
+ vec![
+ IntervalDayTimeType::make_value(1, 0),
+ IntervalDayTimeType::make_value(0, 1000),
+ IntervalDayTimeType::make_value(1, 1000),
+ IntervalDayTimeType::make_value(1, 3000),
+ // 90M milliseconds
+ IntervalDayTimeType::make_value(0, 90_000_000),
+ ],
+ vec![
+ IntervalDayTimeType::make_value(0, 1000),
+ IntervalDayTimeType::make_value(1, 0),
+ IntervalDayTimeType::make_value(10, 0),
+ IntervalDayTimeType::make_value(2, 1),
+ // NB even though 1 day is less than 90M milliseconds long,
+ // it compares as greater because the underlying type stores
+ // days and milliseconds as different fields
+ IntervalDayTimeType::make_value(0, 12),
+ ],
+ vec![false, true, true, true ,false]
+ );
+
+ cmp_vec!(
+ lt,
+ lt_dyn,
+ IntervalYearMonthArray,
+ vec![
+ IntervalYearMonthType::make_value(1, 2),
+ IntervalYearMonthType::make_value(2, 1),
+ IntervalYearMonthType::make_value(1, 2),
+ // 1 year
+ IntervalYearMonthType::make_value(1, 0),
+ ],
+ vec![
+ IntervalYearMonthType::make_value(1, 2),
+ IntervalYearMonthType::make_value(1, 2),
+ IntervalYearMonthType::make_value(2, 1),
+ // NB 12 months is treated as equal to a year (as the
underlying
+ // type stores number of months)
+ IntervalYearMonthType::make_value(0, 12),
+ ],
+ vec![false, false, true, false]
+ );
+
+ cmp_vec!(
+ lt,
+ lt_dyn,
+ IntervalMonthDayNanoArray,
+ vec![
+ IntervalMonthDayNanoType::make_value(1, 2, 3),
+ IntervalMonthDayNanoType::make_value(3, 2, 1),
+ // 1 month
+ IntervalMonthDayNanoType::make_value(1, 0, 0),
+ IntervalMonthDayNanoType::make_value(1, 2, 0),
+ IntervalMonthDayNanoType::make_value(1, 0, 0),
+ ],
+ vec![
+ IntervalMonthDayNanoType::make_value(1, 2, 3),
+ IntervalMonthDayNanoType::make_value(1, 2, 3),
+ IntervalMonthDayNanoType::make_value(2, 0, 0),
+ // 30 days is not treated as a month
+ IntervalMonthDayNanoType::make_value(0, 30, 0),
+ // 100 days (note is treated as greater than 1 month as the
underlying integer representation)
+ IntervalMonthDayNanoType::make_value(0, 100, 0),
+ ],
+ vec![false, false, true, false, false]
+ );
}
#[test]
diff --git a/arrow-ord/src/ord.rs b/arrow-ord/src/ord.rs
index 28ca07cce2..f6bd39c9cd 100644
--- a/arrow-ord/src/ord.rs
+++ b/arrow-ord/src/ord.rs
@@ -216,6 +216,73 @@ pub mod tests {
assert_eq!(Ordering::Greater, cmp(1, 0));
}
+ #[test]
+ fn test_interval_day_time() {
+ let array = IntervalDayTimeArray::from(vec![
+ // 0 days, 1 second
+ IntervalDayTimeType::make_value(0, 1000),
+ // 1 day, 2 milliseconds
+ IntervalDayTimeType::make_value(1, 2),
+ // 90M milliseconds (which is more than is in 1 day)
+ IntervalDayTimeType::make_value(0, 90_000_000),
+ ]);
+
+ let cmp = build_compare(&array, &array).unwrap();
+
+ assert_eq!(Ordering::Less, cmp(0, 1));
+ assert_eq!(Ordering::Greater, cmp(1, 0));
+
+ // somewhat confusingly, while 90M milliseconds is more than 1 day,
+ // it will compare less as the comparison is done on the underlying
+ // values not field by field
+ assert_eq!(Ordering::Greater, cmp(1, 2));
+ assert_eq!(Ordering::Less, cmp(2, 1));
+ }
+
+ #[test]
+ fn test_interval_year_month() {
+ let array = IntervalYearMonthArray::from(vec![
+ // 1 year, 0 months
+ IntervalYearMonthType::make_value(1, 0),
+ // 0 years, 13 months
+ IntervalYearMonthType::make_value(0, 13),
+ // 1 year, 1 month
+ IntervalYearMonthType::make_value(1, 1),
+ ]);
+
+ let cmp = build_compare(&array, &array).unwrap();
+
+ assert_eq!(Ordering::Less, cmp(0, 1));
+ assert_eq!(Ordering::Greater, cmp(1, 0));
+
+ // the underlying representation is months, so both quantities are the
same
+ assert_eq!(Ordering::Equal, cmp(1, 2));
+ assert_eq!(Ordering::Equal, cmp(2, 1));
+ }
+
+ #[test]
+ fn test_interval_month_day_nano() {
+ let array = IntervalMonthDayNanoArray::from(vec![
+ // 100 days
+ IntervalMonthDayNanoType::make_value(0, 100, 0),
+ // 1 month
+ IntervalMonthDayNanoType::make_value(1, 0, 0),
+ // 100 day, 1 nanoseconds
+ IntervalMonthDayNanoType::make_value(0, 100, 2),
+ ]);
+
+ let cmp = build_compare(&array, &array).unwrap();
+
+ assert_eq!(Ordering::Less, cmp(0, 1));
+ assert_eq!(Ordering::Greater, cmp(1, 0));
+
+ // somewhat confusingly, while 100 days is more than 1 month in all
cases
+ // it will compare less as the comparison is done on the underlying
+ // values not field by field
+ assert_eq!(Ordering::Greater, cmp(1, 2));
+ assert_eq!(Ordering::Less, cmp(2, 1));
+ }
+
#[test]
fn test_decimal() {
let array = vec![Some(5_i128), Some(2_i128), Some(3_i128)]