This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 44e3876758 Add additional test coverage for aggregaes using
dates/times/timestamps/decimals (#6939)
44e3876758 is described below
commit 44e3876758bd234547ed07afd6e48c7fc62ea525
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed Jul 12 17:59:22 2023 -0400
Add additional test coverage for aggregaes using
dates/times/timestamps/decimals (#6939)
* Add additional test coverage for aggregaes using
dates/times/timestamps/decimals
* Add coverage for date32/date64
---
.../tests/sqllogictests/test_files/aggregate.slt | 232 +++++++++++++++++----
1 file changed, 194 insertions(+), 38 deletions(-)
diff --git a/datafusion/core/tests/sqllogictests/test_files/aggregate.slt
b/datafusion/core/tests/sqllogictests/test_files/aggregate.slt
index bf1ab2cbd1..95cf51d571 100644
--- a/datafusion/core/tests/sqllogictests/test_files/aggregate.slt
+++ b/datafusion/core/tests/sqllogictests/test_files/aggregate.slt
@@ -39,9 +39,10 @@ WITH HEADER ROW
LOCATION '../../testing/data/csv/aggregate_test_100.csv'
statement ok
-CREATE TABLE d_table (c1 decimal(10,3)) as values
-(110.000), (110.001), (110.002), (110.003), (110.004), (110.005), (110.006),
(110.007), (110.008), (110.009),
-(-100.000),(-100.001),(-100.002),(-100.003),(-100.004),(-100.005),(-100.006),(-100.007),(-100.008),(-100.009)
+CREATE TABLE d_table (c1 decimal(10,3), c2 varchar)
+as values
+(110.000, 'A'), (110.001, 'A'), (110.002, 'A'), (110.003, 'A'), (110.004,
'A'), (110.005, 'A'), (110.006, 'A'), (110.007, 'A'), (110.008, 'A'), (110.009,
'A'),
+(-100.000, 'B'),(-100.001, 'B'),(-100.002, 'B'),(-100.003, 'B'),(-100.004,
'B'),(-100.005, 'B'),(-100.006, 'B'),(-100.007, 'B'),(-100.008, 'B'),(-100.009,
'B')
statement ok
CREATE TABLE median_table (
@@ -448,7 +449,7 @@ drop table cpu;
# this test is to show create table as and select into works in the same way
statement ok
-SELECT * INTO cpu
+SELECT * INTO cpu
FROM (VALUES
('host0', 90.1),
('host1', 90.2),
@@ -1483,22 +1484,6 @@ NULL 2
statement ok
drop table the_nulls;
-# All supported timestamp types
-
-# "nanos" --> TimestampNanosecondArray
-# "micros" --> TimestampMicrosecondArray
-# "millis" --> TimestampMillisecondArray
-# "secs" --> TimestampSecondArray
-# "names" --> StringArray
-
-statement ok
-create table t_source
-as values
- ('2018-11-13T17:11:10.011375885995', 'Row 0'),
- ('2011-12-13T11:13:10.12345', 'Row 1'),
- (null, 'Row 2'),
- ('2021-01-01T05:11:10.432', 'Row 3');
-
statement ok
create table bit_aggregate_functions (
c1 SMALLINT NOT NULL,
@@ -1568,6 +1553,22 @@ SELECT bool_or(distinct c1), bool_or(distinct c2),
bool_or(distinct c3), bool_or
----
true true true false true true false NULL
+# All supported timestamp types
+
+# "nanos" --> TimestampNanosecondArray
+# "micros" --> TimestampMicrosecondArray
+# "millis" --> TimestampMillisecondArray
+# "secs" --> TimestampSecondArray
+# "names" --> StringArray
+
+statement ok
+create table t_source
+as values
+ ('2018-11-13T17:11:10.011375885995', 'Row 0', 'X'),
+ ('2011-12-13T11:13:10.12345', 'Row 1', 'X'),
+ (null, 'Row 2', 'Y'),
+ ('2021-01-01T05:11:10.432', 'Row 3', 'Y');
+
statement ok
create table t as
select
@@ -1575,29 +1576,38 @@ select
arrow_cast(column1, 'Timestamp(Microsecond, None)') as micros,
arrow_cast(column1, 'Timestamp(Millisecond, None)') as millis,
arrow_cast(column1, 'Timestamp(Second, None)') as secs,
- column2 as names
+ column2 as names,
+ column3 as tag
from t_source;
# Demonstate the contents
-query PPPPT
+query PPPPTT
select * from t;
----
-2018-11-13T17:11:10.011375885 2018-11-13T17:11:10.011375
2018-11-13T17:11:10.011 2018-11-13T17:11:10 Row 0
-2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123
2011-12-13T11:13:10 Row 1
-NULL NULL NULL NULL Row 2
-2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432
2021-01-01T05:11:10 Row 3
+2018-11-13T17:11:10.011375885 2018-11-13T17:11:10.011375
2018-11-13T17:11:10.011 2018-11-13T17:11:10 Row 0 X
+2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123
2011-12-13T11:13:10 Row 1 X
+NULL NULL NULL NULL Row 2 Y
+2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432
2021-01-01T05:11:10 Row 3 Y
# aggregate_timestamps_sum
-statement error Error during planning: The function Sum does not support
inputs of type Timestamp\(Nanosecond, None\)
+statement error DataFusion error: Error during planning: The function Sum does
not support inputs of type Timestamp\(Nanosecond, None\)\.
SELECT sum(nanos), sum(micros), sum(millis), sum(secs) FROM t;
+statement error DataFusion error: Error during planning: The function Sum does
not support inputs of type Timestamp\(Nanosecond, None\)\.
+SELECT tag, sum(nanos), sum(micros), sum(millis), sum(secs) FROM t GROUP BY
tag ORDER BY tag;
+
# aggregate_timestamps_count
query IIII
SELECT count(nanos), count(micros), count(millis), count(secs) FROM t;
----
3 3 3 3
+query TIIII
+SELECT tag, count(nanos), count(micros), count(millis), count(secs) FROM t
GROUP BY tag ORDER BY tag;
+----
+X 2 2 2 2
+Y 1 1 1 1
# aggregate_timestamps_min
query PPPP
@@ -1605,18 +1615,32 @@ SELECT min(nanos), min(micros), min(millis), min(secs)
FROM t;
----
2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123
2011-12-13T11:13:10
+query TPPPP
+SELECT tag, min(nanos), min(micros), min(millis), min(secs) FROM t GROUP BY
tag ORDER BY tag;
+----
+X 2011-12-13T11:13:10.123450 2011-12-13T11:13:10.123450
2011-12-13T11:13:10.123 2011-12-13T11:13:10
+Y 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432
2021-01-01T05:11:10
+
# aggregate_timestamps_max
query PPPP
SELECT max(nanos), max(micros), max(millis), max(secs) FROM t;
----
2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432
2021-01-01T05:11:10
+query TPPPP
+SELECT tag, max(nanos), max(micros), max(millis), max(secs) FROM t GROUP BY
tag ORDER BY tag
+----
+X 2018-11-13T17:11:10.011375885 2018-11-13T17:11:10.011375
2018-11-13T17:11:10.011 2018-11-13T17:11:10
+Y 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432
2021-01-01T05:11:10
# aggregate_timestamps_avg
-statement error Error during planning: The function Avg does not support
inputs of type Timestamp\(Nanosecond, None\).
+statement error DataFusion error: Error during planning: The function Avg does
not support inputs of type Timestamp\(Nanosecond, None\)\.
SELECT avg(nanos), avg(micros), avg(millis), avg(secs) FROM t
+statement error DataFusion error: Error during planning: The function Avg does
not support inputs of type Timestamp\(Nanosecond, None\)\.
+SELECT tag, avg(nanos), avg(micros), avg(millis), avg(secs) FROM t GROUP BY
tag ORDER BY tag;
+
statement ok
drop table t_source;
@@ -1624,6 +1648,101 @@ drop table t_source;
statement ok
drop table t;
+
+# All supported Date tpes
+
+# "date32" --> Date32Array
+# "date64" --> Date64Array
+# "names" --> StringArray
+
+statement ok
+create table t_source
+as values
+ ('2018-11-13', 'Row 0', 'X'),
+ ('2011-12-13', 'Row 1', 'X'),
+ (null, 'Row 2', 'Y'),
+ ('2021-01-01', 'Row 3', 'Y');
+
+statement ok
+create table t as
+select
+ arrow_cast(column1, 'Date32') as date32,
+ -- Workaround https://github.com/apache/arrow-rs/issues/4512 is fixed, can
use this
+ -- arrow_cast(column1, 'Date64') as date64,
+ arrow_cast(arrow_cast(column1, 'Date32'), 'Date64') as date64,
+ column2 as names,
+ column3 as tag
+from t_source;
+
+# Demonstate the contents
+query DDTT
+select * from t;
+----
+2018-11-13 2018-11-13T00:00:00 Row 0 X
+2011-12-13 2011-12-13T00:00:00 Row 1 X
+NULL NULL Row 2 Y
+2021-01-01 2021-01-01T00:00:00 Row 3 Y
+
+
+# aggregate_timestamps_sum
+statement error DataFusion error: Error during planning: The function Sum does
not support inputs of type Date32\.
+SELECT sum(date32), sum(date64) FROM t;
+
+statement error DataFusion error: Error during planning: The function Sum does
not support inputs of type Date32\.
+SELECT tag, sum(date32), sum(date64) FROM t GROUP BY tag ORDER BY tag;
+
+# aggregate_timestamps_count
+query II
+SELECT count(date32), count(date64) FROM t;
+----
+3 3
+
+query TII
+SELECT tag, count(date32), count(date64) FROM t GROUP BY tag ORDER BY tag;
+----
+X 2 2
+Y 1 1
+
+# aggregate_timestamps_min
+query DD
+SELECT min(date32), min(date64) FROM t;
+----
+2011-12-13 2011-12-13T00:00:00
+
+query TDD
+SELECT tag, min(date32), min(date64) FROM t GROUP BY tag ORDER BY tag;
+----
+X 2011-12-13 2011-12-13T00:00:00
+Y 2021-01-01 2021-01-01T00:00:00
+
+# aggregate_timestamps_max
+query DD
+SELECT max(date32), max(date64) FROM t;
+----
+2021-01-01 2021-01-01T00:00:00
+
+query TDD
+SELECT tag, max(date32), max(date64) FROM t GROUP BY tag ORDER BY tag
+----
+X 2018-11-13 2018-11-13T00:00:00
+Y 2021-01-01 2021-01-01T00:00:00
+
+
+# aggregate_timestamps_avg
+statement error DataFusion error: Error during planning: The function Avg does
not support inputs of type Date32\.
+SELECT avg(date32), avg(date64) FROM t
+
+statement error DataFusion error: Error during planning: The function Avg does
not support inputs of type Date32\.
+SELECT tag, avg(date32), avg(date64) FROM t GROUP BY tag ORDER BY tag;
+
+
+statement ok
+drop table t_source;
+
+statement ok
+drop table t;
+
+
# All supported time types
# Columns are named:
@@ -1636,10 +1755,10 @@ drop table t;
statement ok
create table t_source
as values
- ('18:06:30.243620451', 'Row 0'),
- ('20:08:28.161121654', 'Row 1'),
- ('19:11:04.156423842', 'Row 2'),
- ('21:06:28.247821084', 'Row 3');
+ ('18:06:30.243620451', 'Row 0', 'A'),
+ ('20:08:28.161121654', 'Row 1', 'A'),
+ ('19:11:04.156423842', 'Row 2', 'B'),
+ ('21:06:28.247821084', 'Row 3', 'B');
statement ok
@@ -1649,28 +1768,38 @@ select
arrow_cast(column1, 'Time64(Microsecond)') as micros,
arrow_cast(column1, 'Time32(Millisecond)') as millis,
arrow_cast(column1, 'Time32(Second)') as secs,
- column2 as names
+ column2 as names,
+ column3 as tag
from t_source;
# Demonstate the contents
-query DDDDT
+query DDDDTT
select * from t;
----
-18:06:30.243620451 18:06:30.243620 18:06:30.243 18:06:30 Row 0
-20:08:28.161121654 20:08:28.161121 20:08:28.161 20:08:28 Row 1
-19:11:04.156423842 19:11:04.156423 19:11:04.156 19:11:04 Row 2
-21:06:28.247821084 21:06:28.247821 21:06:28.247 21:06:28 Row 3
+18:06:30.243620451 18:06:30.243620 18:06:30.243 18:06:30 Row 0 A
+20:08:28.161121654 20:08:28.161121 20:08:28.161 20:08:28 Row 1 A
+19:11:04.156423842 19:11:04.156423 19:11:04.156 19:11:04 Row 2 B
+21:06:28.247821084 21:06:28.247821 21:06:28.247 21:06:28 Row 3 B
# aggregate_times_sum
statement error DataFusion error: Error during planning: The function Sum does
not support inputs of type Time64\(Nanosecond\).
SELECT sum(nanos), sum(micros), sum(millis), sum(secs) FROM t
+statement error DataFusion error: Error during planning: The function Sum does
not support inputs of type Time64\(Nanosecond\)\.
+SELECT tag, sum(nanos), sum(micros), sum(millis), sum(secs) FROM t GROUP BY
tag ORDER BY tag
+
# aggregate_times_count
query IIII
SELECT count(nanos), count(micros), count(millis), count(secs) FROM t
----
4 4 4 4
+query TIIII
+SELECT tag, count(nanos), count(micros), count(millis), count(secs) FROM t
GROUP BY tag ORDER BY tag
+----
+A 2 2 2 2
+B 2 2 2 2
+
# aggregate_times_min
query DDDD
@@ -1678,17 +1807,32 @@ SELECT min(nanos), min(micros), min(millis), min(secs)
FROM t
----
18:06:30.243620451 18:06:30.243620 18:06:30.243 18:06:30
+query TDDDD
+SELECT tag, min(nanos), min(micros), min(millis), min(secs) FROM t GROUP BY
tag ORDER BY tag
+----
+A 18:06:30.243620451 18:06:30.243620 18:06:30.243 18:06:30
+B 19:11:04.156423842 19:11:04.156423 19:11:04.156 19:11:04
+
# aggregate_times_max
query DDDD
SELECT max(nanos), max(micros), max(millis), max(secs) FROM t
----
21:06:28.247821084 21:06:28.247821 21:06:28.247 21:06:28
+query TDDDD
+SELECT tag, max(nanos), max(micros), max(millis), max(secs) FROM t GROUP BY
tag ORDER BY tag
+----
+A 20:08:28.161121654 20:08:28.161121 20:08:28.161 20:08:28
+B 21:06:28.247821084 21:06:28.247821 21:06:28.247 21:06:28
+
# aggregate_times_avg
statement error DataFusion error: Error during planning: The function Avg does
not support inputs of type Time64\(Nanosecond\).
SELECT avg(nanos), avg(micros), avg(millis), avg(secs) FROM t
+statement error DataFusion error: Error during planning: The function Avg does
not support inputs of type Time64\(Nanosecond\)\.
+SELECT tag, avg(nanos), avg(micros), avg(millis), avg(secs) FROM t GROUP BY
tag ORDER BY tag;
+
statement ok
drop table t_source;
@@ -1710,6 +1854,12 @@ select sum(c1), arrow_typeof(sum(c1)) from d_table;
----
100 Decimal128(20, 3)
+query TRT
+select c2, sum(c1), arrow_typeof(sum(c1)) from d_table GROUP BY c2 ORDER BY c2;
+----
+A 1100.045 Decimal128(20, 3)
+B -1000.045 Decimal128(20, 3)
+
# aggregate_decimal_avg
query RT
@@ -1717,6 +1867,12 @@ select avg(c1), arrow_typeof(avg(c1)) from d_table
----
5 Decimal128(14, 7)
+query TRT
+select c2, avg(c1), arrow_typeof(avg(c1)) from d_table GROUP BY c2 ORDER BY c2
+----
+A 110.0045 Decimal128(14, 7)
+B -100.0045 Decimal128(14, 7)
+
# Use PostgresSQL dialect
statement ok
set datafusion.sql_parser.dialect = 'Postgres';