This is an automated email from the ASF dual-hosted git repository.
pvary pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 94b383c82a HIVE-26193: Fix Iceberg partitioned tables null bucket
handling (Peter Vary reviewed by Marton Bod and Adam Szita) (#3261)
94b383c82a is described below
commit 94b383c82a7c5d2e619c3bab84dfca88b4591b4e
Author: pvary <[email protected]>
AuthorDate: Mon May 2 08:35:29 2022 +0200
HIVE-26193: Fix Iceberg partitioned tables null bucket handling (Peter Vary
reviewed by Marton Bod and Adam Szita) (#3261)
---
.../iceberg/mr/hive/GenericUDFIcebergBucket.java | 2 +-
.../queries/positive/dynamic_partition_writes.q | 4 +-
.../positive/dynamic_partition_writes.q.out | 43 ++++++++++++++--------
.../hive/ql/exec/vector/VectorizedRowBatchCtx.java | 4 +-
4 files changed, 32 insertions(+), 21 deletions(-)
diff --git
a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/GenericUDFIcebergBucket.java
b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/GenericUDFIcebergBucket.java
index cab4bb11ab..52b0a1edbf 100644
---
a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/GenericUDFIcebergBucket.java
+++
b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/GenericUDFIcebergBucket.java
@@ -185,7 +185,7 @@ public class GenericUDFIcebergBucket extends GenericUDF {
public Object evaluate(DeferredObject[] arguments) throws HiveException {
DeferredObject argument = arguments[0];
- if (argument == null) {
+ if (argument == null || argument.get() == null) {
return null;
} else {
evaluator.apply(argument);
diff --git
a/iceberg/iceberg-handler/src/test/queries/positive/dynamic_partition_writes.q
b/iceberg/iceberg-handler/src/test/queries/positive/dynamic_partition_writes.q
index 93094529f6..622e870350 100644
---
a/iceberg/iceberg-handler/src/test/queries/positive/dynamic_partition_writes.q
+++
b/iceberg/iceberg-handler/src/test/queries/positive/dynamic_partition_writes.q
@@ -10,9 +10,9 @@ drop table if exists tbl_target_mixed;
create external table tbl_src (a int, b string, c bigint) stored by iceberg
stored as orc;
-insert into tbl_src values (1, 'EUR', 10), (2, 'EUR', 10), (3, 'USD', 11), (4,
'EUR', 12), (5, 'HUF', 30), (6, 'USD', 10), (7, 'USD', 100), (8, 'PLN', 20),
(9, 'PLN', 11), (10, 'CZK', 5);
+insert into tbl_src values (1, 'EUR', 10), (2, 'EUR', 10), (3, 'USD', 11), (4,
'EUR', 12), (5, 'HUF', 30), (6, 'USD', 10), (7, 'USD', 100), (8, 'PLN', 20),
(9, 'PLN', 11), (10, 'CZK', 5), (12, NULL, NULL);
--need at least 2 files to ensure ClusteredWriter encounters out-of-order
records
-insert into tbl_src values (10, 'EUR', 12), (20, 'EUR', 11), (30, 'USD', 100),
(40, 'EUR', 10), (50, 'HUF', 30), (60, 'USD', 12), (70, 'USD', 20), (80, 'PLN',
100), (90, 'PLN', 18), (100, 'CZK', 12);
+insert into tbl_src values (10, 'EUR', 12), (20, 'EUR', 11), (30, 'USD', 100),
(40, 'EUR', 10), (50, 'HUF', 30), (60, 'USD', 12), (70, 'USD', 20), (80, 'PLN',
100), (90, 'PLN', 18), (100, 'CZK', 12), (110, NULL, NULL);
create external table tbl_target_identity (a int) partitioned by (ccy string)
stored by iceberg stored as orc;
explain insert overwrite table tbl_target_identity select a, b from tbl_src;
diff --git
a/iceberg/iceberg-handler/src/test/results/positive/dynamic_partition_writes.q.out
b/iceberg/iceberg-handler/src/test/results/positive/dynamic_partition_writes.q.out
index 36fb39c091..53be8c39ed 100644
---
a/iceberg/iceberg-handler/src/test/results/positive/dynamic_partition_writes.q.out
+++
b/iceberg/iceberg-handler/src/test/results/positive/dynamic_partition_writes.q.out
@@ -22,19 +22,19 @@ POSTHOOK: query: create external table tbl_src (a int, b
string, c bigint) store
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@tbl_src
-PREHOOK: query: insert into tbl_src values (1, 'EUR', 10), (2, 'EUR', 10), (3,
'USD', 11), (4, 'EUR', 12), (5, 'HUF', 30), (6, 'USD', 10), (7, 'USD', 100),
(8, 'PLN', 20), (9, 'PLN', 11), (10, 'CZK', 5)
+PREHOOK: query: insert into tbl_src values (1, 'EUR', 10), (2, 'EUR', 10), (3,
'USD', 11), (4, 'EUR', 12), (5, 'HUF', 30), (6, 'USD', 10), (7, 'USD', 100),
(8, 'PLN', 20), (9, 'PLN', 11), (10, 'CZK', 5), (12, NULL, NULL)
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@tbl_src
-POSTHOOK: query: insert into tbl_src values (1, 'EUR', 10), (2, 'EUR', 10),
(3, 'USD', 11), (4, 'EUR', 12), (5, 'HUF', 30), (6, 'USD', 10), (7, 'USD',
100), (8, 'PLN', 20), (9, 'PLN', 11), (10, 'CZK', 5)
+POSTHOOK: query: insert into tbl_src values (1, 'EUR', 10), (2, 'EUR', 10),
(3, 'USD', 11), (4, 'EUR', 12), (5, 'HUF', 30), (6, 'USD', 10), (7, 'USD',
100), (8, 'PLN', 20), (9, 'PLN', 11), (10, 'CZK', 5), (12, NULL, NULL)
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@tbl_src
-PREHOOK: query: insert into tbl_src values (10, 'EUR', 12), (20, 'EUR', 11),
(30, 'USD', 100), (40, 'EUR', 10), (50, 'HUF', 30), (60, 'USD', 12), (70,
'USD', 20), (80, 'PLN', 100), (90, 'PLN', 18), (100, 'CZK', 12)
+PREHOOK: query: insert into tbl_src values (10, 'EUR', 12), (20, 'EUR', 11),
(30, 'USD', 100), (40, 'EUR', 10), (50, 'HUF', 30), (60, 'USD', 12), (70,
'USD', 20), (80, 'PLN', 100), (90, 'PLN', 18), (100, 'CZK', 12), (110, NULL,
NULL)
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@tbl_src
-POSTHOOK: query: insert into tbl_src values (10, 'EUR', 12), (20, 'EUR', 11),
(30, 'USD', 100), (40, 'EUR', 10), (50, 'HUF', 30), (60, 'USD', 12), (70,
'USD', 20), (80, 'PLN', 100), (90, 'PLN', 18), (100, 'CZK', 12)
+POSTHOOK: query: insert into tbl_src values (10, 'EUR', 12), (20, 'EUR', 11),
(30, 'USD', 100), (40, 'EUR', 10), (50, 'HUF', 30), (60, 'USD', 12), (70,
'USD', 20), (80, 'PLN', 100), (90, 'PLN', 18), (100, 'CZK', 12), (110, NULL,
NULL)
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@tbl_src
@@ -76,9 +76,9 @@ Stage-3
<-Map 1 [SIMPLE_EDGE] vectorized
PARTITION_ONLY_SHUFFLE [RS_13]
PartitionCols:_col1
- Select Operator [SEL_12] (rows=20 width=91)
+ Select Operator [SEL_12] (rows=22 width=87)
Output:["_col0","_col1"]
- TableScan [TS_0] (rows=20 width=91)
+ TableScan [TS_0] (rows=22 width=87)
default@tbl_src,tbl_src,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b"]
Reducer 3 vectorized
File Output Operator [FS_21]
@@ -90,7 +90,7 @@ Stage-3
PARTITION_ONLY_SHUFFLE [RS_16]
Group By Operator [GBY_15] (rows=1 width=400)
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8"],aggregations:["min(a)","max(a)","count(1)","count(a)","compute_bit_vector_hll(a)","max(length(ccy))","avg(COALESCE(length(ccy),0))","count(ccy)","compute_bit_vector_hll(ccy)"]
- Select Operator [SEL_14] (rows=20 width=91)
+ Select Operator [SEL_14] (rows=22 width=87)
Output:["a","ccy"]
Please refer to the previous Select Operator
[SEL_12]
@@ -121,6 +121,7 @@ POSTHOOK: Output: hdfs://### HDFS PATH ###
9 PLN
10 CZK
10 EUR
+12 NULL
20 EUR
30 USD
40 EUR
@@ -130,6 +131,7 @@ POSTHOOK: Output: hdfs://### HDFS PATH ###
80 PLN
90 PLN
100 CZK
+110 NULL
PREHOOK: query: create external table tbl_target_bucket (a int, ccy string)
partitioned by spec (bucket (2, ccy)) stored by iceberg stored as orc
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
@@ -168,9 +170,9 @@ Stage-3
<-Map 1 [SIMPLE_EDGE] vectorized
PARTITION_ONLY_SHUFFLE [RS_13]
PartitionCols:iceberg_bucket(_col1, 2)
- Select Operator [SEL_12] (rows=20 width=91)
+ Select Operator [SEL_12] (rows=22 width=87)
Output:["_col0","_col1"]
- TableScan [TS_0] (rows=20 width=91)
+ TableScan [TS_0] (rows=22 width=87)
default@tbl_src,tbl_src,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b"]
Reducer 3 vectorized
File Output Operator [FS_21]
@@ -182,7 +184,7 @@ Stage-3
PARTITION_ONLY_SHUFFLE [RS_16]
Group By Operator [GBY_15] (rows=1 width=400)
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8"],aggregations:["min(a)","max(a)","count(1)","count(a)","compute_bit_vector_hll(a)","max(length(ccy))","avg(COALESCE(length(ccy),0))","count(ccy)","compute_bit_vector_hll(ccy)"]
- Select Operator [SEL_14] (rows=20 width=91)
+ Select Operator [SEL_14] (rows=22 width=87)
Output:["a","ccy"]
Please refer to the previous Select Operator
[SEL_12]
@@ -213,6 +215,7 @@ POSTHOOK: Output: hdfs://### HDFS PATH ###
9 PLN
10 CZK
10 EUR
+12 NULL
20 EUR
30 USD
40 EUR
@@ -222,6 +225,7 @@ POSTHOOK: Output: hdfs://### HDFS PATH ###
80 PLN
90 PLN
100 CZK
+110 NULL
PREHOOK: query: create external table tbl_target_mixed (a int, ccy string, c
bigint) partitioned by spec (ccy, bucket (3, c)) stored by iceberg stored as orc
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
@@ -260,9 +264,9 @@ Stage-3
<-Map 1 [SIMPLE_EDGE] vectorized
PARTITION_ONLY_SHUFFLE [RS_13]
PartitionCols:_col1, iceberg_bucket(_col2, 3)
- Select Operator [SEL_12] (rows=20 width=99)
+ Select Operator [SEL_12] (rows=22 width=94)
Output:["_col0","_col1","_col2"]
- TableScan [TS_0] (rows=20 width=99)
+ TableScan [TS_0] (rows=22 width=94)
default@tbl_src,tbl_src,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b","c"]
Reducer 3 vectorized
File Output Operator [FS_21]
@@ -274,7 +278,7 @@ Stage-3
PARTITION_ONLY_SHUFFLE [RS_16]
Group By Operator [GBY_15] (rows=1 width=568)
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9","_col10","_col11","_col12"],aggregations:["min(a)","max(a)","count(1)","count(a)","compute_bit_vector_hll(a)","max(length(ccy))","avg(COALESCE(length(ccy),0))","count(ccy)","compute_bit_vector_hll(ccy)","min(c)","max(c)","count(c)","compute_bit_vector_hll(c)"]
- Select Operator [SEL_14] (rows=20 width=99)
+ Select Operator [SEL_14] (rows=22 width=94)
Output:["a","ccy","c"]
Please refer to the previous Select Operator
[SEL_12]
@@ -305,6 +309,7 @@ POSTHOOK: Output: hdfs://### HDFS PATH ###
9 PLN 11
10 CZK 5
10 EUR 12
+12 NULL NULL
20 EUR 11
30 USD 100
40 EUR 10
@@ -314,6 +319,7 @@ POSTHOOK: Output: hdfs://### HDFS PATH ###
80 PLN 100
90 PLN 18
100 CZK 12
+110 NULL NULL
PREHOOK: query: select * from default.tbl_target_mixed.partitions
PREHOOK: type: QUERY
PREHOOK: Input: default@tbl_target_mixed
@@ -324,14 +330,15 @@ POSTHOOK: Input: default@tbl_target_mixed
POSTHOOK: Output: hdfs://### HDFS PATH ###
{"ccy":"EUR","c_bucket":0} 1 1
{"ccy":"EUR","c_bucket":1} 2 1
-{"ccy":"HUF","c_bucket":1} 2 1
{"ccy":"EUR","c_bucket":2} 3 1
{"ccy":"USD","c_bucket":1} 3 1
{"ccy":"CZK","c_bucket":1} 1 1
{"ccy":"USD","c_bucket":0} 2 1
{"ccy":"USD","c_bucket":2} 1 1
{"ccy":"CZK","c_bucket":2} 1 1
+{"ccy":"HUF","c_bucket":1} 2 1
{"ccy":"PLN","c_bucket":2} 1 1
+{"ccy":null,"c_bucket":null} 2 1
{"ccy":"PLN","c_bucket":0} 2 1
{"ccy":"PLN","c_bucket":1} 1 1
PREHOOK: query: select * from default.tbl_target_mixed.files
@@ -342,6 +349,7 @@ POSTHOOK: query: select * from
default.tbl_target_mixed.files
POSTHOOK: type: QUERY
POSTHOOK: Input: default@tbl_target_mixed
POSTHOOK: Output: hdfs://### HDFS PATH ###
+0 hdfs://### HDFS PATH ### ORC 0
{"ccy":null,"c_bucket":null} 2 417 {1:7,2:5,3:5} {1:2,2:2,3:2}
{1:0,2:2,3:2} {} {1: