NGA-TRAN commented on code in PR #5982:
URL: https://github.com/apache/arrow-datafusion/pull/5982#discussion_r1166052425
##########
datafusion/core/tests/sqllogictests/test_files/timestamps.slt:
##########
@@ -500,6 +500,118 @@ FROM (
(TIMESTAMP '2021-06-10 17:19:10Z', TIMESTAMP '2001-01-01T00:00:00Z', 0.3)
) as t (time, origin, val)
+# MONTH/YEAR INTERVAL ON ARRAY DATA STILL NOT PROVIDE CORRECT ANSWERS YET
+# month interval in date_bin with default start time
+query P
+select date_bin('1 month', column1)
+from (values
+ (timestamp '2022-01-01 00:00:00'),
+ (timestamp '2022-01-01 01:00:00'),
+ (timestamp '2022-01-02 00:00:00'),
+ (timestamp '2022-02-02 00:00:00'),
+ (timestamp '2022-02-15 00:00:00'),
+ (timestamp '2022-03-31 00:00:00')
+) as sq
+----
+2021-12-29T00:00:00
+2021-12-29T00:00:00
+2021-12-29T00:00:00
+2022-01-28T00:00:00
+2022-01-28T00:00:00
+2022-03-29T00:00:00
+
+# month interval with specified start time
+query P
+select date_bin('1 month', column1, TIMESTAMP '1970-01-01T00:00:00Z')
+from (values
+ (timestamp '2022-01-01 00:00:00'),
+ (timestamp '2022-01-01 01:00:00'),
+ (timestamp '2022-01-02 00:00:00'),
+ (timestamp '2022-02-02 00:00:00'),
+ (timestamp '2022-02-15 00:00:00'),
+ (timestamp '2022-03-31 00:00:00')
+) as sq
+----
+2021-12-29T00:00:00
+2021-12-29T00:00:00
+2021-12-29T00:00:00
+2022-01-28T00:00:00
+2022-01-28T00:00:00
+2022-03-29T00:00:00
+
+# year interval in date_bin with default start time
+query P
+select date_bin('1 year', column1)
+from (values
+ (timestamp '2022-01-01 00:00:00'),
+ (timestamp '2022-01-01 01:00:00'),
+ (timestamp '2022-01-02 00:00:00'),
+ (timestamp '2022-02-02 00:00:00'),
+ (timestamp '2022-02-15 00:00:00'),
+ (timestamp '2022-03-31 00:00:00')
+) as sq
+----
+2021-04-03T00:00:00
+2021-04-03T00:00:00
+2021-04-03T00:00:00
+2021-04-03T00:00:00
+2021-04-03T00:00:00
+2022-03-29T00:00:00
+
+# month interval on constant
+query P
+SELECT DATE_BIN(INTERVAL '1 month', TIMESTAMP '2022-01-01 00:00:00Z',
TIMESTAMP '1970-01-01T00:00:00Z');
Review Comment:
Hah, by removing these, I have found that without keyword `INTERVAL`, the
month is converted into days even on constant time. That is the problem I will
work on next. Array data works correctly if there is `INTERVAL` keyword
##########
datafusion/physical-expr/src/datetime_expressions.rs:
##########
@@ -333,21 +333,87 @@ pub fn date_trunc(args: &[ColumnarValue]) ->
Result<ColumnarValue> {
})
}
-fn date_bin_single(stride: i64, source: i64, origin: i64) -> i64 {
+trait DateBinInterval {
+ fn date_bin_interval(&self, stride: i64, source: i64, origin: i64) -> i64;
+}
Review Comment:
This trait is no longer needed from the comment below
##########
datafusion/physical-expr/src/datetime_expressions.rs:
##########
@@ -333,21 +333,87 @@ pub fn date_trunc(args: &[ColumnarValue]) ->
Result<ColumnarValue> {
})
}
-fn date_bin_single(stride: i64, source: i64, origin: i64) -> i64 {
+trait DateBinInterval {
+ fn date_bin_interval(&self, stride: i64, source: i64, origin: i64) -> i64;
+}
+
+struct DateBinNanosInterval;
+impl DateBinNanosInterval {
+ fn new() -> Self {
+ Self {}
+ }
+}
+impl DateBinInterval for DateBinNanosInterval {
+ fn date_bin_interval(&self, stride: i64, source: i64, origin: i64) -> i64 {
+ date_bin_nanos_interval(stride, source, origin)
+ }
+}
+
+struct DateBinMonthsInterval;
+impl DateBinMonthsInterval {
+ fn new() -> Self {
+ Self {}
+ }
+}
+impl DateBinInterval for DateBinMonthsInterval {
+ fn date_bin_interval(&self, stride: i64, source: i64, origin: i64) -> i64 {
+ date_bin_months_interval(stride, source, origin)
+ }
+}
+
+// return time in nanoseconds that the source timestamp falls into based on
the stride and origin
+fn date_bin_nanos_interval(stride_nanos: i64, source: i64, origin: i64) -> i64
{
let time_diff = source - origin;
// distance to bin
- let time_delta = time_diff - (time_diff % stride);
+ let time_delta = time_diff - (time_diff % stride_nanos);
- let time_delta = if time_diff < 0 && stride > 1 {
+ let time_delta = if time_diff < 0 && stride_nanos > 1 {
// The origin is later than the source timestamp, round down to the
previous bin
- time_delta - stride
+ time_delta - stride_nanos
} else {
time_delta
};
origin + time_delta
}
+// return time in nanoseconds that the source timestamp falls into based on
the stride and origin
+fn date_bin_months_interval(stride_months: i64, source: i64, origin: i64) ->
i64 {
+ // convert source and origin to DateTime<Utc>
+ let source_date = to_utc_date_time(source);
+ let origin_date = to_utc_date_time(origin);
+
+ // calculate the number of months between the source and origin
+ let month_diff = (source_date.year() - origin_date.year()) * 12
+ + source_date.month() as i32
+ - origin_date.month() as i32;
+
+ // distance from origin to bin
+ let month_delta = month_diff - (month_diff % stride_months as i32);
+
+ let month_delta = if month_diff < 0 && stride_months > 1 {
+ // The origin is later than the source timestamp, round down to the
previous bin
+ month_delta - stride_months as i32
+ } else {
+ month_delta
+ };
Review Comment:
Done
##########
datafusion/physical-expr/src/datetime_expressions.rs:
##########
@@ -366,6 +432,17 @@ pub fn date_bin(args: &[ColumnarValue]) ->
Result<ColumnarValue> {
}
}
+enum Interval {
Review Comment:
It works. Thanks Andrew
##########
datafusion/physical-expr/src/datetime_expressions.rs:
##########
@@ -333,21 +333,87 @@ pub fn date_trunc(args: &[ColumnarValue]) ->
Result<ColumnarValue> {
})
}
-fn date_bin_single(stride: i64, source: i64, origin: i64) -> i64 {
+trait DateBinInterval {
+ fn date_bin_interval(&self, stride: i64, source: i64, origin: i64) -> i64;
+}
+
+struct DateBinNanosInterval;
+impl DateBinNanosInterval {
+ fn new() -> Self {
+ Self {}
+ }
+}
+impl DateBinInterval for DateBinNanosInterval {
+ fn date_bin_interval(&self, stride: i64, source: i64, origin: i64) -> i64 {
+ date_bin_nanos_interval(stride, source, origin)
+ }
+}
+
+struct DateBinMonthsInterval;
+impl DateBinMonthsInterval {
+ fn new() -> Self {
+ Self {}
+ }
+}
+impl DateBinInterval for DateBinMonthsInterval {
+ fn date_bin_interval(&self, stride: i64, source: i64, origin: i64) -> i64 {
+ date_bin_months_interval(stride, source, origin)
+ }
+}
+
+// return time in nanoseconds that the source timestamp falls into based on
the stride and origin
+fn date_bin_nanos_interval(stride_nanos: i64, source: i64, origin: i64) -> i64
{
Review Comment:
This is also no longer needed after removing the trait
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]