alamb commented on code in PR #5982:
URL: https://github.com/apache/arrow-datafusion/pull/5982#discussion_r1165928265


##########
datafusion/physical-expr/src/datetime_expressions.rs:
##########
@@ -366,6 +432,17 @@ pub fn date_bin(args: &[ColumnarValue]) -> 
Result<ColumnarValue> {
     }
 }
 
+enum Interval {

Review Comment:
   Since we have this enum anyways, I wonder if there is any need for the trait?
   
   For example, what if the code looked like
   
   ```rust
   impl Inteval {
     // return time in nanoseconds that the source timestamp falls into based 
on the stride and origin
     fn bin(source: i64, origin: i64) -> i64 {
       match self {
         Self::Nanoseconds(v) => date_bin_nanos_interval(v, source, origin),
         Self::Months(v) => date_bin_months_interval(v, source, origin)
       }
     }
   ```
   
   And then the  closure looks like
   ```
       let f = |x: Option<i64>| x.map(move |x| stride.bin( x, origin));
   ```
   
   🤔 
   
   I don't think you would need a trait at all



##########
datafusion/physical-expr/src/datetime_expressions.rs:
##########
@@ -333,21 +333,87 @@ pub fn date_trunc(args: &[ColumnarValue]) -> 
Result<ColumnarValue> {
     })
 }
 
-fn date_bin_single(stride: i64, source: i64, origin: i64) -> i64 {
+trait DateBinInterval {
+    fn date_bin_interval(&self, stride: i64, source: i64, origin: i64) -> i64;
+}
+
+struct DateBinNanosInterval;
+impl DateBinNanosInterval {
+    fn new() -> Self {
+        Self {}
+    }
+}
+impl DateBinInterval for DateBinNanosInterval {
+    fn date_bin_interval(&self, stride: i64, source: i64, origin: i64) -> i64 {
+        date_bin_nanos_interval(stride, source, origin)
+    }
+}
+
+struct DateBinMonthsInterval;
+impl DateBinMonthsInterval {
+    fn new() -> Self {
+        Self {}
+    }
+}
+impl DateBinInterval for DateBinMonthsInterval {
+    fn date_bin_interval(&self, stride: i64, source: i64, origin: i64) -> i64 {
+        date_bin_months_interval(stride, source, origin)
+    }
+}
+
+// return time in nanoseconds that the source timestamp falls into based on 
the stride and origin
+fn date_bin_nanos_interval(stride_nanos: i64, source: i64, origin: i64) -> i64 
{

Review Comment:
   stylistically I wonder if you considered inlining the function definition 
into the trait? It might be less verbose and put the comments closer to where 
they were alled



##########
datafusion/core/tests/sqllogictests/test_files/timestamps.slt:
##########
@@ -500,6 +500,118 @@ FROM (
     (TIMESTAMP '2021-06-10 17:19:10Z', TIMESTAMP '2001-01-01T00:00:00Z', 0.3)
   ) as t (time, origin, val)
 
+# MONTH/YEAR INTERVAL ON ARRAY DATA STILL NOT PROVIDE CORRECT ANSWERS YET

Review Comment:
   Maybe we can add a link to the relevant ticket:
   ```suggestion
   # MONTH/YEAR INTERVAL ON ARRAY DATA STILL NOT PROVIDE CORRECT ANSWERS YET
   # https://github.com/apache/arrow-datafusion/issues/5689
   ```



##########
datafusion/core/tests/sqllogictests/test_files/timestamps.slt:
##########
@@ -500,6 +500,118 @@ FROM (
     (TIMESTAMP '2021-06-10 17:19:10Z', TIMESTAMP '2001-01-01T00:00:00Z', 0.3)
   ) as t (time, origin, val)
 
+# MONTH/YEAR INTERVAL ON ARRAY DATA STILL NOT PROVIDE CORRECT ANSWERS YET
+# month interval in date_bin with default start time
+query P
+select date_bin('1 month', column1)
+from (values
+  (timestamp '2022-01-01 00:00:00'),
+  (timestamp '2022-01-01 01:00:00'),
+  (timestamp '2022-01-02 00:00:00'), 
+  (timestamp '2022-02-02 00:00:00'),
+  (timestamp '2022-02-15 00:00:00'),
+  (timestamp '2022-03-31 00:00:00') 
+) as sq
+----
+2021-12-29T00:00:00
+2021-12-29T00:00:00
+2021-12-29T00:00:00
+2022-01-28T00:00:00
+2022-01-28T00:00:00
+2022-03-29T00:00:00
+
+# month interval with specified start time
+query P
+select date_bin('1 month', column1, TIMESTAMP '1970-01-01T00:00:00Z')
+from (values
+  (timestamp '2022-01-01 00:00:00'),
+  (timestamp '2022-01-01 01:00:00'),
+  (timestamp '2022-01-02 00:00:00'), 
+  (timestamp '2022-02-02 00:00:00'),
+  (timestamp '2022-02-15 00:00:00'),
+  (timestamp '2022-03-31 00:00:00') 
+) as sq
+----
+2021-12-29T00:00:00
+2021-12-29T00:00:00
+2021-12-29T00:00:00
+2022-01-28T00:00:00
+2022-01-28T00:00:00
+2022-03-29T00:00:00
+
+# year interval in date_bin with default start time
+query P
+select date_bin('1 year', column1)
+from (values
+  (timestamp '2022-01-01 00:00:00'),
+  (timestamp '2022-01-01 01:00:00'),
+  (timestamp '2022-01-02 00:00:00'), 
+  (timestamp '2022-02-02 00:00:00'),
+  (timestamp '2022-02-15 00:00:00'),
+  (timestamp '2022-03-31 00:00:00') 
+) as sq
+----
+2021-04-03T00:00:00
+2021-04-03T00:00:00
+2021-04-03T00:00:00
+2021-04-03T00:00:00
+2021-04-03T00:00:00
+2022-03-29T00:00:00
+
+# month interval on constant
+query P
+SELECT DATE_BIN(INTERVAL '1 month', TIMESTAMP '2022-01-01 00:00:00Z', 
TIMESTAMP '1970-01-01T00:00:00Z');

Review Comment:
   I think you could write these tests more concisely without the explicit 
`TIMESTAMP` and `INTERVAL` syntax. Something like 
   
   ```suggestion
   SELECT DATE_BIN('1 month', '2022-01-01 00:00:00Z', '1970-01-01T00:00:00Z');
   ```
   
   I 



##########
datafusion/physical-expr/src/datetime_expressions.rs:
##########
@@ -333,21 +333,87 @@ pub fn date_trunc(args: &[ColumnarValue]) -> 
Result<ColumnarValue> {
     })
 }
 
-fn date_bin_single(stride: i64, source: i64, origin: i64) -> i64 {
+trait DateBinInterval {
+    fn date_bin_interval(&self, stride: i64, source: i64, origin: i64) -> i64;
+}

Review Comment:
   Could you please add some comments about what the `i64` return value of this 
trait is? Is it a width in terms of nanoseconds, for example? What is source 
and how it is different than stride or origin?
   
   It looks from reading below it is actually the "bin value" for the timestamp 
at `source`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to