NGA-TRAN commented on code in PR #5982:
URL: https://github.com/apache/arrow-datafusion/pull/5982#discussion_r1164706474
##########
datafusion/core/tests/sqllogictests/test_files/timestamps.slt:
##########
@@ -500,6 +500,118 @@ FROM (
(TIMESTAMP '2021-06-10 17:19:10Z', TIMESTAMP '2001-01-01T00:00:00Z', 0.3)
) as t (time, origin, val)
+# MONTH/YEAR INTERVAL ON ARRAY DATA STILL NOT PROVIDE CORRECT ANSWERS YET
Review Comment:
Test on array data (equivalent to testing on table data). These tests do not
yet return the correct answers. See below for my comment on how to fix this in
my next PR
##########
datafusion/core/tests/sqllogictests/test_files/timestamps.slt:
##########
@@ -500,6 +500,118 @@ FROM (
(TIMESTAMP '2021-06-10 17:19:10Z', TIMESTAMP '2001-01-01T00:00:00Z', 0.3)
) as t (time, origin, val)
+# MONTH/YEAR INTERVAL ON ARRAY DATA STILL NOT PROVIDE CORRECT ANSWERS YET
+# month interval in date_bin with default start time
+query P
+select date_bin('1 month', column1)
+from (values
+ (timestamp '2022-01-01 00:00:00'),
+ (timestamp '2022-01-01 01:00:00'),
+ (timestamp '2022-01-02 00:00:00'),
+ (timestamp '2022-02-02 00:00:00'),
+ (timestamp '2022-02-15 00:00:00'),
+ (timestamp '2022-03-31 00:00:00')
+) as sq
+----
+2021-12-29T00:00:00
+2021-12-29T00:00:00
+2021-12-29T00:00:00
+2022-01-28T00:00:00
+2022-01-28T00:00:00
+2022-03-29T00:00:00
+
+# month interval with specified start time
+query P
+select date_bin('1 month', column1, TIMESTAMP '1970-01-01T00:00:00Z')
+from (values
+ (timestamp '2022-01-01 00:00:00'),
+ (timestamp '2022-01-01 01:00:00'),
+ (timestamp '2022-01-02 00:00:00'),
+ (timestamp '2022-02-02 00:00:00'),
+ (timestamp '2022-02-15 00:00:00'),
+ (timestamp '2022-03-31 00:00:00')
+) as sq
+----
+2021-12-29T00:00:00
+2021-12-29T00:00:00
+2021-12-29T00:00:00
+2022-01-28T00:00:00
+2022-01-28T00:00:00
+2022-03-29T00:00:00
+
+# year interval in date_bin with default start time
+query P
+select date_bin('1 year', column1)
+from (values
+ (timestamp '2022-01-01 00:00:00'),
+ (timestamp '2022-01-01 01:00:00'),
+ (timestamp '2022-01-02 00:00:00'),
+ (timestamp '2022-02-02 00:00:00'),
+ (timestamp '2022-02-15 00:00:00'),
+ (timestamp '2022-03-31 00:00:00')
+) as sq
+----
+2021-04-03T00:00:00
+2021-04-03T00:00:00
+2021-04-03T00:00:00
+2021-04-03T00:00:00
+2021-04-03T00:00:00
+2022-03-29T00:00:00
+
+# month interval on constant
Review Comment:
From here down, all the tests did not work before (error: not supported) but
now working
##########
datafusion/physical-expr/src/datetime_expressions.rs:
##########
@@ -429,7 +515,18 @@ fn date_bin_impl(
)),
};
- let f = |x: Option<i64>| x.map(|x| date_bin_single(stride, x, origin));
+ let f = move |x: Option<i64>| {
+ x.map(|x| match stride {
+ Interval::Nanoseconds(stride) => {
+ let y = DateBinNanosInterval::new();
+ y.date_bin_interval(stride, x, origin)
+ }
+ Interval::Months(stride) => {
+ let y = DateBinMonthsInterval::new();
+ y.date_bin_interval(stride, x, origin)
+ }
+ })
+ };
Review Comment:
This is the closure
##########
datafusion/physical-expr/src/datetime_expressions.rs:
##########
@@ -366,6 +432,17 @@ pub fn date_bin(args: &[ColumnarValue]) ->
Result<ColumnarValue> {
}
}
+enum Interval {
+ Nanoseconds(i64),
+ Months(i64),
+}
+
+// Supported intervals:
+// 1. IntervalDayTime: this means that the stride is in days, hours, minutes,
seconds and milliseconds
+// We will assume month interval won't be converted into this type
+// TODO (my next PR): for array data, the stride was converted into
ScalarValue::IntervalDayTime somwhere
+// for month interval. I need to find that and make it
ScalarValue::IntervalMonthDayNano instead
Review Comment:
This is how to support month interval on array data. I think the change will
be minimal but I need to figure out where
##########
datafusion/physical-expr/src/datetime_expressions.rs:
##########
@@ -333,21 +333,87 @@ pub fn date_trunc(args: &[ColumnarValue]) ->
Result<ColumnarValue> {
})
}
-fn date_bin_single(stride: i64, source: i64, origin: i64) -> i64 {
+trait DateBinInterval {
+ fn date_bin_interval(&self, stride: i64, source: i64, origin: i64) -> i64;
+}
Review Comment:
I create this trait for it to work with an available closure because I now
need 2 different ways to handle date_bin intervals nanoseconds and months
##########
datafusion/core/tests/sqllogictests/test_files/timestamps.slt:
##########
@@ -421,7 +421,7 @@ SELECT DATE_BIN(INTERVAL '5 microseconds', TIMESTAMP
'2022-08-03 14:38:50.000006
2022-08-03T14:38:50.000005
# Does not support months for Month-Day-Nano interval
-statement error This feature is not implemented: DATE_BIN stride does not
support month intervals
+statement error DataFusion error: This feature is not implemented: DATE_BIN
stride does not support combination of month, day and nanosecond intervals
Review Comment:
new error message I have added in this PR
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]