This is an automated email from the ASF dual-hosted git repository.
sbadhya pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 00d0b6d7b94 HIVE-27938: Iceberg: Fix java.lang.ClassCastException
during vectorized reads on partition columns (#5048) (Simhadri Govindappa
reviewed by Sourabh Badhya)
00d0b6d7b94 is described below
commit 00d0b6d7b94e3db00d671542efa3c3cf1ad14714
Author: Simhadri Govindappa <[email protected]>
AuthorDate: Thu Feb 1 15:30:38 2024 +0530
HIVE-27938: Iceberg: Fix java.lang.ClassCastException during vectorized
reads on partition columns (#5048) (Simhadri Govindappa reviewed by Sourabh
Badhya)
---
.../mapreduce/HiveIdentityPartitionConverters.java | 69 ++++++++++
.../iceberg/mr/mapreduce/IcebergInputFormat.java | 5 +-
.../positive/iceberg_partition_vectorized_read.q | 24 ++++
.../iceberg_partition_vectorized_read.q.out | 139 +++++++++++++++++++++
4 files changed, 235 insertions(+), 2 deletions(-)
diff --git
a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/HiveIdentityPartitionConverters.java
b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/HiveIdentityPartitionConverters.java
new file mode 100644
index 00000000000..6c51de9dabb
--- /dev/null
+++
b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/HiveIdentityPartitionConverters.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.iceberg.mr.mapreduce;
+
+import java.math.BigDecimal;
+import org.apache.avro.generic.GenericData;
+import org.apache.hadoop.hive.common.type.Date;
+import org.apache.hadoop.hive.common.type.HiveDecimal;
+import org.apache.hadoop.hive.common.type.Timestamp;
+import org.apache.iceberg.types.Type;
+import org.apache.iceberg.types.Types;
+import org.apache.iceberg.util.DateTimeUtil;
+
+public class HiveIdentityPartitionConverters {
+
+ private HiveIdentityPartitionConverters() {
+ }
+
+ public static Object convertConstant(Type type, Object value) {
+ if (value == null) {
+ return null;
+ }
+
+ switch (type.typeId()) {
+ case STRING:
+ return value.toString();
+ case TIME:
+ return DateTimeUtil.timeFromMicros((Long) value);
+ case DATE:
+ return Date.ofEpochDay((Integer) value);
+ case TIMESTAMP:
+ if (((Types.TimestampType) type).shouldAdjustToUTC()) {
+ return DateTimeUtil.timestamptzFromMicros((Long)
value).toOffsetTime();
+ } else {
+ return new Timestamp(DateTimeUtil.timestampFromMicros((Long) value));
+ }
+ case DECIMAL:
+ if (value.getClass().isAssignableFrom(BigDecimal.class)) {
+ return HiveDecimal.create((BigDecimal) value);
+ }
+ return value;
+ case FIXED:
+ if (value instanceof GenericData.Fixed) {
+ return ((GenericData.Fixed) value).bytes();
+ }
+ return value;
+ default:
+ }
+ return value;
+ }
+
+}
diff --git
a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java
b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java
index 3ec1a3b3b7a..754d78e4d93 100644
---
a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java
+++
b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/mapreduce/IcebergInputFormat.java
@@ -391,7 +391,7 @@ public class IcebergInputFormat<T> extends
InputFormat<Void, T> {
"Vectorized read is unsupported for Hive 2 integration.");
Path path = new Path(task.file().path().toString());
- Map<Integer, ?> idToConstant = constantsMap(task, IdentityPartitionConverters::convertConstant);
+ Map<Integer, ?> idToConstant = constantsMap(task, HiveIdentityPartitionConverters::convertConstant);
Expression residual = HiveIcebergInputFormat.residualForTask(task,
context.getConfiguration());
// TODO: We have to take care of the EncryptionManager when LLAP and
vectorization is used
@@ -544,7 +544,8 @@ public class IcebergInputFormat<T> extends
InputFormat<Void, T> {
Types.StructType partitionType = Partitioning.partitionType(table);
return PartitionUtil.constantsMap(task, partitionType, converter);
} else if (projectsIdentityPartitionColumns) {
- return PartitionUtil.constantsMap(task, converter);
+ Types.StructType partitionType = Partitioning.partitionType(table);
+ return PartitionUtil.constantsMap(task, partitionType, converter);
} else {
return Collections.emptyMap();
}
diff --git
a/iceberg/iceberg-handler/src/test/queries/positive/iceberg_partition_vectorized_read.q
b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_partition_vectorized_read.q
new file mode 100644
index 00000000000..506f6948871
--- /dev/null
+++
b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_partition_vectorized_read.q
@@ -0,0 +1,24 @@
-- Enable vectorized execution so partition-column reads exercise the
-- vectorized Iceberg path fixed by HIVE-27938 (ClassCastException on
-- identity-partition columns).
set hive.vectorized.execution.enabled=true;

-- DATE identity partition: scan, aggregate and distinct over the partition column.
CREATE EXTERNAL TABLE ice_date (`col1` int, `day` date, `calday` date) PARTITIONED BY SPEC (calday) stored by
iceberg tblproperties('format-version'='2');
insert into ice_date values(1, '2020-11-20', '2020-11-20'), (1, '2020-11-20', '2020-11-20');
select * from ice_date;
select count(calday) from ice_date;
select distinct(calday) from ice_date;


-- TIMESTAMP identity partition: same coverage for timestamp constants.
CREATE EXTERNAL TABLE ice_timestamp (`col1` int, `day` date, `times` timestamp) PARTITIONED BY SPEC (times) stored
by iceberg tblproperties('format-version'='2');
insert into ice_timestamp values(1, '2020-11-20', '2020-11-20'), (1, '2020-11-20', '2020-11-20');
select * from ice_timestamp;
select count(times) from ice_timestamp;
select distinct(times) from ice_timestamp;


-- DECIMAL identity partition: also filters on a non-partition decimal column.
CREATE EXTERNAL TABLE ice_decimal (`col1` int, `decimalA` decimal(5,2), `decimalC` decimal(5,2)) PARTITIONED BY SPEC
(decimalC) stored by iceberg tblproperties('format-version'='2');
insert into ice_decimal values(1, 122.91, 102.21), (1, 12.32, 200.12);
select * from ice_decimal;
select distinct(decimalc) from ice_decimal;
select count(decimala) from ice_decimal where decimala=122.91;
diff --git
a/iceberg/iceberg-handler/src/test/results/positive/iceberg_partition_vectorized_read.q.out
b/iceberg/iceberg-handler/src/test/results/positive/iceberg_partition_vectorized_read.q.out
new file mode 100644
index 00000000000..3cc643380fd
--- /dev/null
+++
b/iceberg/iceberg-handler/src/test/results/positive/iceberg_partition_vectorized_read.q.out
@@ -0,0 +1,139 @@
+PREHOOK: query: CREATE EXTERNAL TABLE ice_date (`col1` int, `day` date,
`calday` date) PARTITIONED BY SPEC (calday) stored by
+iceberg tblproperties('format-version'='2')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@ice_date
+POSTHOOK: query: CREATE EXTERNAL TABLE ice_date (`col1` int, `day` date,
`calday` date) PARTITIONED BY SPEC (calday) stored by
+iceberg tblproperties('format-version'='2')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@ice_date
+PREHOOK: query: insert into ice_date values(1, '2020-11-20', '2020-11-20'),
(1, '2020-11-20', '2020-11-20')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@ice_date
+POSTHOOK: query: insert into ice_date values(1, '2020-11-20', '2020-11-20'),
(1, '2020-11-20', '2020-11-20')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@ice_date
+PREHOOK: query: select * from ice_date
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ice_date
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select * from ice_date
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ice_date
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1 2020-11-20 2020-11-20
+1 2020-11-20 2020-11-20
+PREHOOK: query: select count(calday) from ice_date
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ice_date
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select count(calday) from ice_date
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ice_date
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+2
+PREHOOK: query: select distinct(calday) from ice_date
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ice_date
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select distinct(calday) from ice_date
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ice_date
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+2020-11-20
+PREHOOK: query: CREATE EXTERNAL TABLE ice_timestamp (`col1` int, `day` date,
`times` timestamp) PARTITIONED BY SPEC (times) stored
+by iceberg tblproperties('format-version'='2')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@ice_timestamp
+POSTHOOK: query: CREATE EXTERNAL TABLE ice_timestamp (`col1` int, `day`
date, `times` timestamp) PARTITIONED BY SPEC (times) stored
+by iceberg tblproperties('format-version'='2')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@ice_timestamp
+PREHOOK: query: insert into ice_timestamp values(1, '2020-11-20',
'2020-11-20'), (1, '2020-11-20', '2020-11-20')
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@ice_timestamp
+POSTHOOK: query: insert into ice_timestamp values(1, '2020-11-20',
'2020-11-20'), (1, '2020-11-20', '2020-11-20')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@ice_timestamp
+PREHOOK: query: select * from ice_timestamp
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ice_timestamp
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select * from ice_timestamp
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ice_timestamp
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1 2020-11-20 2020-11-20 00:00:00
+1 2020-11-20 2020-11-20 00:00:00
+PREHOOK: query: select count(times) from ice_timestamp
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ice_timestamp
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select count(times) from ice_timestamp
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ice_timestamp
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+2
+PREHOOK: query: select distinct(times) from ice_timestamp
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ice_timestamp
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select distinct(times) from ice_timestamp
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ice_timestamp
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+2020-11-20 00:00:00
+PREHOOK: query: CREATE EXTERNAL TABLE ice_decimal (`col1` int, `decimalA`
decimal(5,2), `decimalC` decimal(5,2)) PARTITIONED BY SPEC
+(decimalC) stored by iceberg tblproperties('format-version'='2')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@ice_decimal
+POSTHOOK: query: CREATE EXTERNAL TABLE ice_decimal (`col1` int, `decimalA`
decimal(5,2), `decimalC` decimal(5,2)) PARTITIONED BY SPEC
+(decimalC) stored by iceberg tblproperties('format-version'='2')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@ice_decimal
+PREHOOK: query: insert into ice_decimal values(1, 122.91, 102.21), (1, 12.32,
200.12)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@ice_decimal
+POSTHOOK: query: insert into ice_decimal values(1, 122.91, 102.21), (1, 12.32,
200.12)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@ice_decimal
+PREHOOK: query: select * from ice_decimal
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ice_decimal
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select * from ice_decimal
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ice_decimal
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1 122.91 102.21
+1 12.32 200.12
+PREHOOK: query: select distinct(decimalc) from ice_decimal
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ice_decimal
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select distinct(decimalc) from ice_decimal
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ice_decimal
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+102.21
+200.12
+PREHOOK: query: select count(decimala) from ice_decimal where decimala=122.91
+PREHOOK: type: QUERY
+PREHOOK: Input: default@ice_decimal
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select count(decimala) from ice_decimal where decimala=122.91
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@ice_decimal
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1