This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new bc0ed23b9e fix: Timestamp with timezone not considered `join on`
(#8150)
bc0ed23b9e is described below
commit bc0ed23b9ecc113cdfd9a5d68d581be9c4cbf914
Author: L_B__ <[email protected]>
AuthorDate: Sat Nov 18 03:34:47 2023 +0800
fix: Timestamp with timezone not considered `join on` (#8150)
* fix: Timestamp with timezone not considerd in
* Add Test For Explain HashJoin On Timestamp with Tz
---------
Co-authored-by: ackingliu <[email protected]>
---
datafusion/expr/src/utils.rs | 2 +-
datafusion/sqllogictest/test_files/joins.slt | 77 ++++++++++++++++++++++++++++
2 files changed, 78 insertions(+), 1 deletion(-)
diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs
index 8f13bf5f61..ff95ff10e7 100644
--- a/datafusion/expr/src/utils.rs
+++ b/datafusion/expr/src/utils.rs
@@ -901,7 +901,7 @@ pub fn can_hash(data_type: &DataType) -> bool {
DataType::UInt64 => true,
DataType::Float32 => true,
DataType::Float64 => true,
- DataType::Timestamp(time_unit, None) => match time_unit {
+ DataType::Timestamp(time_unit, _) => match time_unit {
TimeUnit::Second => true,
TimeUnit::Millisecond => true,
TimeUnit::Microsecond => true,
diff --git a/datafusion/sqllogictest/test_files/joins.slt
b/datafusion/sqllogictest/test_files/joins.slt
index fa3a6cff8c..737b43b5a9 100644
--- a/datafusion/sqllogictest/test_files/joins.slt
+++ b/datafusion/sqllogictest/test_files/joins.slt
@@ -140,6 +140,17 @@ SELECT
FROM
test_timestamps_table_source;
+# create a table of timestamps with time zone
+statement ok
+CREATE TABLE test_timestamps_tz_table as
+SELECT
+ arrow_cast(ts::timestamp::bigint, 'Timestamp(Nanosecond, Some("UTC"))') as
nanos,
+ arrow_cast(ts::timestamp::bigint / 1000, 'Timestamp(Microsecond,
Some("UTC"))') as micros,
+ arrow_cast(ts::timestamp::bigint / 1000000, 'Timestamp(Millisecond,
Some("UTC"))') as millis,
+ arrow_cast(ts::timestamp::bigint / 1000000000, 'Timestamp(Second,
Some("UTC"))') as secs,
+ names
+FROM
+ test_timestamps_table_source;
statement ok
@@ -2462,6 +2473,16 @@ test_timestamps_table
NULL NULL NULL NULL Row 2
2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432
2021-01-01T05:11:10 Row 3
+# show the contents of the timestamp with timezone table
+query PPPPT
+select * from
+test_timestamps_tz_table
+----
+2018-11-13T17:11:10.011375885Z 2018-11-13T17:11:10.011375Z
2018-11-13T17:11:10.011Z 2018-11-13T17:11:10Z Row 0
+2011-12-13T11:13:10.123450Z 2011-12-13T11:13:10.123450Z
2011-12-13T11:13:10.123Z 2011-12-13T11:13:10Z Row 1
+NULL NULL NULL NULL Row 2
+2021-01-01T05:11:10.432Z 2021-01-01T05:11:10.432Z 2021-01-01T05:11:10.432Z
2021-01-01T05:11:10Z Row 3
+
# test timestamp join on nanos datatype
query PPPPTPPPPT rowsort
SELECT * FROM test_timestamps_table as t1 JOIN (SELECT * FROM
test_timestamps_table ) as t2 ON t1.nanos = t2.nanos;
@@ -2470,6 +2491,14 @@ SELECT * FROM test_timestamps_table as t1 JOIN (SELECT *
FROM test_timestamps_ta
2018-11-13T17:11:10.011375885 2018-11-13T17:11:10.011375
2018-11-13T17:11:10.011 2018-11-13T17:11:10 Row 0 2018-11-13T17:11:10.011375885
2018-11-13T17:11:10.011375 2018-11-13T17:11:10.011 2018-11-13T17:11:10 Row 0
2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432
2021-01-01T05:11:10 Row 3 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432
2021-01-01T05:11:10.432 2021-01-01T05:11:10 Row 3
+# test timestamp with timezone join on nanos datatype
+query PPPPTPPPPT rowsort
+SELECT * FROM test_timestamps_tz_table as t1 JOIN (SELECT * FROM
test_timestamps_tz_table ) as t2 ON t1.nanos = t2.nanos;
+----
+2011-12-13T11:13:10.123450Z 2011-12-13T11:13:10.123450Z
2011-12-13T11:13:10.123Z 2011-12-13T11:13:10Z Row 1 2011-12-13T11:13:10.123450Z
2011-12-13T11:13:10.123450Z 2011-12-13T11:13:10.123Z 2011-12-13T11:13:10Z Row 1
+2018-11-13T17:11:10.011375885Z 2018-11-13T17:11:10.011375Z
2018-11-13T17:11:10.011Z 2018-11-13T17:11:10Z Row 0
2018-11-13T17:11:10.011375885Z 2018-11-13T17:11:10.011375Z
2018-11-13T17:11:10.011Z 2018-11-13T17:11:10Z Row 0
+2021-01-01T05:11:10.432Z 2021-01-01T05:11:10.432Z 2021-01-01T05:11:10.432Z
2021-01-01T05:11:10Z Row 3 2021-01-01T05:11:10.432Z 2021-01-01T05:11:10.432Z
2021-01-01T05:11:10.432Z 2021-01-01T05:11:10Z Row 3
+
# test timestamp join on micros datatype
query PPPPTPPPPT rowsort
SELECT * FROM test_timestamps_table as t1 JOIN (SELECT * FROM
test_timestamps_table ) as t2 ON t1.micros = t2.micros
@@ -2478,6 +2507,14 @@ SELECT * FROM test_timestamps_table as t1 JOIN (SELECT *
FROM test_timestamps_ta
2018-11-13T17:11:10.011375885 2018-11-13T17:11:10.011375
2018-11-13T17:11:10.011 2018-11-13T17:11:10 Row 0 2018-11-13T17:11:10.011375885
2018-11-13T17:11:10.011375 2018-11-13T17:11:10.011 2018-11-13T17:11:10 Row 0
2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432
2021-01-01T05:11:10 Row 3 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432
2021-01-01T05:11:10.432 2021-01-01T05:11:10 Row 3
+# test timestamp with timezone join on micros datatype
+query PPPPTPPPPT rowsort
+SELECT * FROM test_timestamps_tz_table as t1 JOIN (SELECT * FROM
test_timestamps_tz_table ) as t2 ON t1.micros = t2.micros
+----
+2011-12-13T11:13:10.123450Z 2011-12-13T11:13:10.123450Z
2011-12-13T11:13:10.123Z 2011-12-13T11:13:10Z Row 1 2011-12-13T11:13:10.123450Z
2011-12-13T11:13:10.123450Z 2011-12-13T11:13:10.123Z 2011-12-13T11:13:10Z Row 1
+2018-11-13T17:11:10.011375885Z 2018-11-13T17:11:10.011375Z
2018-11-13T17:11:10.011Z 2018-11-13T17:11:10Z Row 0
2018-11-13T17:11:10.011375885Z 2018-11-13T17:11:10.011375Z
2018-11-13T17:11:10.011Z 2018-11-13T17:11:10Z Row 0
+2021-01-01T05:11:10.432Z 2021-01-01T05:11:10.432Z 2021-01-01T05:11:10.432Z
2021-01-01T05:11:10Z Row 3 2021-01-01T05:11:10.432Z 2021-01-01T05:11:10.432Z
2021-01-01T05:11:10.432Z 2021-01-01T05:11:10Z Row 3
+
# test timestamp join on millis datatype
query PPPPTPPPPT rowsort
SELECT * FROM test_timestamps_table as t1 JOIN (SELECT * FROM
test_timestamps_table ) as t2 ON t1.millis = t2.millis
@@ -2486,6 +2523,46 @@ SELECT * FROM test_timestamps_table as t1 JOIN (SELECT *
FROM test_timestamps_ta
2018-11-13T17:11:10.011375885 2018-11-13T17:11:10.011375
2018-11-13T17:11:10.011 2018-11-13T17:11:10 Row 0 2018-11-13T17:11:10.011375885
2018-11-13T17:11:10.011375 2018-11-13T17:11:10.011 2018-11-13T17:11:10 Row 0
2021-01-01T05:11:10.432 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432
2021-01-01T05:11:10 Row 3 2021-01-01T05:11:10.432 2021-01-01T05:11:10.432
2021-01-01T05:11:10.432 2021-01-01T05:11:10 Row 3
+# test timestamp with timezone join on millis datatype
+query PPPPTPPPPT rowsort
+SELECT * FROM test_timestamps_tz_table as t1 JOIN (SELECT * FROM
test_timestamps_tz_table ) as t2 ON t1.millis = t2.millis
+----
+2011-12-13T11:13:10.123450Z 2011-12-13T11:13:10.123450Z
2011-12-13T11:13:10.123Z 2011-12-13T11:13:10Z Row 1 2011-12-13T11:13:10.123450Z
2011-12-13T11:13:10.123450Z 2011-12-13T11:13:10.123Z 2011-12-13T11:13:10Z Row 1
+2018-11-13T17:11:10.011375885Z 2018-11-13T17:11:10.011375Z
2018-11-13T17:11:10.011Z 2018-11-13T17:11:10Z Row 0
2018-11-13T17:11:10.011375885Z 2018-11-13T17:11:10.011375Z
2018-11-13T17:11:10.011Z 2018-11-13T17:11:10Z Row 0
+2021-01-01T05:11:10.432Z 2021-01-01T05:11:10.432Z 2021-01-01T05:11:10.432Z
2021-01-01T05:11:10Z Row 3 2021-01-01T05:11:10.432Z 2021-01-01T05:11:10.432Z
2021-01-01T05:11:10.432Z 2021-01-01T05:11:10Z Row 3
+
+####
+# Config setup
+####
+
+statement ok
+set datafusion.explain.logical_plan_only = false;
+
+statement ok
+set datafusion.optimizer.prefer_hash_join = true;
+
+# explain hash join on timestamp with timezone type
+query TT
+EXPLAIN SELECT * FROM test_timestamps_tz_table as t1 JOIN
test_timestamps_tz_table as t2 ON t1.millis = t2.millis
+----
+logical_plan
+Inner Join: t1.millis = t2.millis
+--SubqueryAlias: t1
+----TableScan: test_timestamps_tz_table projection=[nanos, micros, millis,
secs, names]
+--SubqueryAlias: t2
+----TableScan: test_timestamps_tz_table projection=[nanos, micros, millis,
secs, names]
+physical_plan
+CoalesceBatchesExec: target_batch_size=2
+--HashJoinExec: mode=Partitioned, join_type=Inner, on=[(millis@2, millis@2)]
+----CoalesceBatchesExec: target_batch_size=2
+------RepartitionExec: partitioning=Hash([millis@2], 2), input_partitions=2
+--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+----------MemoryExec: partitions=1, partition_sizes=[1]
+----CoalesceBatchesExec: target_batch_size=2
+------RepartitionExec: partitioning=Hash([millis@2], 2), input_partitions=2
+--------RepartitionExec: partitioning=RoundRobinBatch(2), input_partitions=1
+----------MemoryExec: partitions=1, partition_sizes=[1]
+
# left_join_using_2
query II
SELECT t1.c1, t2.c2 FROM test_partition_table t1 JOIN test_partition_table t2
USING (c2) ORDER BY t2.c2;