This is an automated email from the ASF dual-hosted git repository.

dkuzmenko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git

The following commit(s) were added to refs/heads/master by this push:
     new a8a0ae782be  HIVE-27158: Store Hive column stats in Puffin files for Iceberg tables (Simhadri Govindappa, reviewed by Ayush Saxena, Denys Kuzmenko, Rajesh Balamohan, Zsolt Miskolczi)

a8a0ae782be is described below

commit a8a0ae782be87d1198006ea3cb508f14070231b7
Author: SimhadriGovindappa <[email protected]>
AuthorDate: Wed Apr 19 15:31:29 2023 +0530

    HIVE-27158: Store Hive column stats in Puffin files for Iceberg tables (Simhadri Govindappa, reviewed by Ayush Saxena, Denys Kuzmenko, Rajesh Balamohan, Zsolt Miskolczi)

    Closes #4131
---
.../java/org/apache/hadoop/hive/conf/HiveConf.java | 5 +-
.../iceberg/mr/hive/HiveIcebergStorageHandler.java | 103 +++-
.../src/test/queries/positive/col_stats.q | 58 ++
.../positive/use_basic_stats_from_iceberg.q | 4 +-
.../positive/vectorized_iceberg_read_mixed.q | 8 +
.../src/test/results/positive/col_stats.q.out | 615 +++++++++++++++++++++
.../positive/dynamic_partition_writes.q.out | 22 +-
.../llap/vectorized_iceberg_read_mixed.q.out | 110 +++-
.../positive/vectorized_iceberg_read_mixed.q.out | 71 ++-
.../hive/ql/metadata/HiveStorageHandler.java | 41 ++
.../hadoop/hive/ql/stats/ColStatsProcessor.java | 3 +
.../apache/hadoop/hive/ql/stats/StatsUtils.java | 8 +-
12 files changed, 1022 insertions(+), 26 deletions(-)
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 8b666164212..7e6903a39d6 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -2205,9 +2205,8 @@ public class HiveConf extends Configuration {
         "padding tolerance config (hive.exec.orc.block.padding.tolerance)."),
     HIVE_ORC_CODEC_POOL("hive.use.orc.codec.pool", false,
         "Whether to use codec pool in ORC. Disable if there are bugs with codec reuse."),
-    HIVE_USE_STATS_FROM("hive.use.stats.from","iceberg","Use stats from iceberg table snapshot for query " +
-        "planning. This has three values metastore, puffin and iceberg"),
-
+    HIVE_ICEBERG_STATS_SOURCE("hive.iceberg.stats.source", "iceberg",
+        "Use stats from iceberg table snapshot for query planning. This has two values metastore and iceberg"),
     HIVEUSEEXPLICITRCFILEHEADER("hive.exec.rcfile.use.explicit.header", true,
         "If this is set the header for RCFiles will simply be RCF. If this is not\n" +
         "set the header will be that borrowed from sequence files, e.g. SEQ- followed\n" +
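
As a small illustration (not part of the patch), the renamed property can be resolved through the existing HiveConf API, the same call the storage handler uses further down; the standalone class name here is hypothetical.

    import org.apache.hadoop.hive.conf.HiveConf;

    public class StatsSourceLookupSketch {
      public static void main(String[] args) {
        HiveConf conf = new HiveConf();
        // Resolves to the default "iceberg" unless hive.iceberg.stats.source is set, e.g. to "metastore".
        String statsSource = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_ICEBERG_STATS_SOURCE).toLowerCase();
        System.out.println("Column stats will be read from: " + statsSource);
      }
    }
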
diff --git a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
index bcadebbf4c0..db69d6c34c8 100644
--- a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
+++ b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
@@ -23,6 +23,7 @@ import java.io.IOException;
import java.io.Serializable;
import java.net.URI;
import java.net.URISyntaxException;
+import java.nio.ByteBuffer;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
@@ -35,14 +36,20 @@ import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.apache.commons.collections4.ListUtils;
+import org.apache.commons.lang3.SerializationUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.common.StatsSetupConst;
import org.apache.hadoop.hive.common.type.Date;
import org.apache.hadoop.hive.common.type.SnapshotContext;
import org.apache.hadoop.hive.common.type.Timestamp;
+import org.apache.hadoop.hive.conf.Constants;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaHook;
+import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.EnvironmentContext;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.LockType;
@@ -112,6 +119,12 @@ import org.apache.iceberg.exceptions.NoSuchTableException;
import org.apache.iceberg.hadoop.HadoopConfigurable;
import org.apache.iceberg.mr.Catalogs;
import org.apache.iceberg.mr.InputFormatConfig;
+import org.apache.iceberg.puffin.Blob;
+import org.apache.iceberg.puffin.BlobMetadata;
+import org.apache.iceberg.puffin.Puffin;
+import org.apache.iceberg.puffin.PuffinCompressionCodec;
+import org.apache.iceberg.puffin.PuffinReader;
+import org.apache.iceberg.puffin.PuffinWriter;
import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
import org.apache.iceberg.relocated.com.google.common.base.Splitter;
@@ -121,7 +134,10 @@ import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
import org.apache.iceberg.relocated.com.google.common.collect.Lists;
import org.apache.iceberg.relocated.com.google.common.collect.Maps;
import org.apache.iceberg.relocated.com.google.common.collect.Sets;
+import org.apache.iceberg.relocated.com.google.common.collect.Streams;
import org.apache.iceberg.types.Types;
+import org.apache.iceberg.util.ByteBuffers;
+import org.apache.iceberg.util.Pair;
import org.apache.iceberg.util.SerializationUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -136,6 +152,7 @@ public class HiveIcebergStorageHandler implements HiveStoragePredicateHandler, H
private static final String PUFFIN = "puffin";
public static final String COPY_ON_WRITE = "copy-on-write";
public static final String MERGE_ON_READ = "merge-on-read";
+ public static final String STATS = "/stats/";
/**
* Function template for producing a custom sort expression function:
* Takes the source column index and the bucket count to creat a function where Iceberg bucket UDF is used to build
@@ -318,7 +335,7 @@ public class HiveIcebergStorageHandler implements HiveStoragePredicateHandler, H
org.apache.hadoop.hive.ql.metadata.Table hmsTable = partish.getTable();
// For write queries where rows got modified, don't fetch from cache as values could have changed.
Table table = getTable(hmsTable);
- String statsSource = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_USE_STATS_FROM).toLowerCase();
+ String statsSource = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_ICEBERG_STATS_SOURCE).toLowerCase();
Map<String, String> stats = Maps.newHashMap();
switch (statsSource) {
case ICEBERG:
@@ -361,6 +378,90 @@ public class HiveIcebergStorageHandler implements HiveStoragePredicateHandler, H
return table;
}
+  @Override
+  public boolean canSetColStatistics(org.apache.hadoop.hive.ql.metadata.Table hmsTable) {
+    Table table = IcebergTableUtil.getTable(conf, hmsTable.getTTable());
+    return table.currentSnapshot() != null ? getStatsSource().equals(ICEBERG) : false;
+  }
+
+  @Override
+  public boolean setColStatistics(org.apache.hadoop.hive.ql.metadata.Table hmsTable,
+      List<ColumnStatistics> colStats) {
+    Table tbl = IcebergTableUtil.getTable(conf, hmsTable.getTTable());
+    String snapshotId = String.format("%s-STATS-%d", tbl.name(), tbl.currentSnapshot().snapshotId());
+    invalidateStats(getStatsPath(tbl));
+    byte[] serializeColStats = SerializationUtils.serialize((Serializable) colStats);
+    try (PuffinWriter writer = Puffin.write(tbl.io().newOutputFile(getStatsPath(tbl).toString()))
+        .createdBy(Constants.HIVE_ENGINE).build()) {
+      writer.add(
+          new Blob(
+              tbl.name() + "-" + snapshotId,
+              ImmutableList.of(1),
+              tbl.currentSnapshot().snapshotId(),
+              tbl.currentSnapshot().sequenceNumber(),
+              ByteBuffer.wrap(serializeColStats),
+              PuffinCompressionCodec.NONE,
+              ImmutableMap.of()));
+      writer.finish();
+    } catch (IOException e) {
+      LOG.error(String.valueOf(e));
+    }
+    return false;
+  }
+
+  @Override
+  public boolean canProvideColStatistics(org.apache.hadoop.hive.ql.metadata.Table hmsTable) {
+    Table table = IcebergTableUtil.getTable(conf, hmsTable.getTTable());
+    if (canSetColStatistics(hmsTable)) {
+      Path statsPath = getStatsPath(table);
+      try (FileSystem fs = statsPath.getFileSystem(conf)) {
+        if (fs.exists(statsPath)) {
+          return true;
+        }
+      } catch (IOException e) {
+        LOG.warn("Exception when trying to find Iceberg column stats for table:{} , snapshot:{} , " +
+            "statsPath: {} , stack trace: {}", table.name(), table.currentSnapshot(), statsPath, e);
+      }
+    }
+    return false;
+  }
+
+  @Override
+  public List<ColumnStatisticsObj> getColStatistics(org.apache.hadoop.hive.ql.metadata.Table hmsTable) {
+    Table table = IcebergTableUtil.getTable(conf, hmsTable.getTTable());
+    String statsPath = getStatsPath(table).toString();
+    LOG.info("Using stats from puffin file at: {}", statsPath);
+    try (PuffinReader reader = Puffin.read(table.io().newInputFile(statsPath)).build()) {
+      List<BlobMetadata> blobMetadata = reader.fileMetadata().blobs();
+      Map<BlobMetadata, List<ColumnStatistics>> collect =
+          Streams.stream(reader.readAll(blobMetadata)).collect(Collectors.toMap(Pair::first,
+              blobMetadataByteBufferPair -> SerializationUtils.deserialize(
+                  ByteBuffers.toByteArray(blobMetadataByteBufferPair.second()))));
+      return collect.get(blobMetadata.get(0)).get(0).getStatsObj();
+    } catch (IOException e) {
+      LOG.error("Error when trying to read iceberg col stats from puffin files: {}", e);
+    }
+    return null;
+  }
+
+  private String getStatsSource() {
+    return HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_ICEBERG_STATS_SOURCE, ICEBERG).toLowerCase();
+  }
+
+  private Path getStatsPath(Table table) {
+    return new Path(table.location() + STATS + table.name() + table.currentSnapshot().snapshotId());
+  }
+
+  private void invalidateStats(Path statsPath) {
+    try (FileSystem fs = statsPath.getFileSystem(conf)) {
+      if (fs.exists(statsPath)) {
+        fs.delete(statsPath, true);
+      }
+    } catch (IOException e) {
+      LOG.error("Failed to invalidate stale column stats: {}", e);
+    }
+  }
+
   /**
    * No need for exclusive locks when writing, since Iceberg tables use optimistic concurrency when writing
    * and only lock the table during the commit operation.
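
For readers unfamiliar with the Puffin format used above, here is a minimal, self-contained sketch (not part of this commit) of the same round trip the handler performs: serialize a payload with commons-lang3, write it as a single uncompressed Puffin blob, then list and deserialize the blobs again. The local file path, blob type string and payload are illustrative assumptions; the handler itself goes through the table's FileIO as shown in setColStatistics and getColStatistics.

    import java.io.File;
    import java.io.Serializable;
    import java.nio.ByteBuffer;
    import java.util.ArrayList;
    import org.apache.commons.lang3.SerializationUtils;
    import org.apache.iceberg.Files;
    import org.apache.iceberg.puffin.Blob;
    import org.apache.iceberg.puffin.BlobMetadata;
    import org.apache.iceberg.puffin.Puffin;
    import org.apache.iceberg.puffin.PuffinCompressionCodec;
    import org.apache.iceberg.puffin.PuffinReader;
    import org.apache.iceberg.puffin.PuffinWriter;
    import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
    import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap;
    import org.apache.iceberg.util.ByteBuffers;
    import org.apache.iceberg.util.Pair;

    public class PuffinRoundTripSketch {
      public static void main(String[] args) throws Exception {
        File statsFile = new File("/tmp/example-stats.puffin");  // illustrative location
        ArrayList<String> payload = new ArrayList<>();           // stands in for List<ColumnStatistics>
        payload.add("example column stats");

        // Write: one blob holding the serialized payload, uncompressed, mirroring setColStatistics.
        byte[] serialized = SerializationUtils.serialize((Serializable) payload);
        try (PuffinWriter writer = Puffin.write(Files.localOutput(statsFile)).createdBy("example").build()) {
          writer.add(new Blob("example-blob-type", ImmutableList.of(1), 1L, 1L,
              ByteBuffer.wrap(serialized), PuffinCompressionCodec.NONE, ImmutableMap.of()));
          writer.finish();
        }

        // Read: list the blobs from the footer and deserialize them, mirroring getColStatistics.
        try (PuffinReader reader = Puffin.read(Files.localInput(statsFile)).build()) {
          for (Pair<BlobMetadata, ByteBuffer> blob : reader.readAll(reader.fileMetadata().blobs())) {
            ArrayList<String> roundTripped = SerializationUtils.deserialize(ByteBuffers.toByteArray(blob.second()));
            System.out.println(roundTripped);
          }
        }
      }
    }
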
diff --git a/iceberg/iceberg-handler/src/test/queries/positive/col_stats.q b/iceberg/iceberg-handler/src/test/queries/positive/col_stats.q
new file mode 100644
index 00000000000..d7c4d811a8b
--- /dev/null
+++ b/iceberg/iceberg-handler/src/test/queries/positive/col_stats.q
@@ -0,0 +1,58 @@
+-- Mask random uuid
+--! qt:replace:/(\s+uuid\s+)\S+(\s*)/$1#Masked#$2/
+set hive.stats.autogather=true;
+set hive.stats.column.autogather=true;
+
+set hive.iceberg.stats.source=iceberg;
+drop table if exists tbl_ice_puffin;
+create external table tbl_ice_puffin(a int, b string, c int) stored by iceberg tblproperties ('format-version'='2');
+insert into tbl_ice_puffin values (1, 'one', 50), (2, 'two', 51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5, 'five', 54), (111, 'one', 55), (333, 'two', 56);
+insert into tbl_ice_puffin values (1, 'one', 50), (2, 'two', 51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5, 'five', 54), (111, 'one', 55), (333, 'two', 56);
+explain select * from tbl_ice_puffin order by a, b, c;
+select * from tbl_ice_puffin order by a, b, c;
+desc formatted tbl_ice_puffin b;
+update tbl_ice_puffin set b='two' where b='one' or b='three';
+analyze table tbl_ice_puffin compute statistics for columns;
+explain select * from tbl_ice_puffin order by a, b, c;
+select * from tbl_ice_puffin order by a, b, c;
+select count(*) from tbl_ice_puffin ;
+desc formatted tbl_ice_puffin b;
+
+
+-- Test if hive.iceberg.stats.source is empty
+set hive.iceberg.stats.source= ;
+drop table if exists tbl_ice_puffin;
+create external table tbl_ice_puffin(a int, b string, c int) stored by iceberg tblproperties ('format-version'='2');
+insert into tbl_ice_puffin values (1, 'one', 50), (2, 'two', 51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5, 'five', 54), (111, 'one', 55), (333, 'two', 56);
+explain select * from tbl_ice_puffin order by a, b, c;
+
+
+set hive.iceberg.stats.source=iceberg;
+drop table if exists tbl_ice_puffin;
+create external table tbl_ice_puffin(a int, b string, c int) stored by iceberg tblproperties ('format-version'='2');
+insert into tbl_ice_puffin values (1, 'one', 50), (2, 'two', 51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5, 'five', 54), (111, 'one', 55), (333, 'two', 56);
+explain select * from tbl_ice_puffin order by a, b, c;
+select * from tbl_ice_puffin order by a, b, c;
+select count(*) from tbl_ice_puffin ;
+desc formatted tbl_ice_puffin a;
+
+
+set hive.iceberg.stats.source=metastore;
+
+drop table if exists tbl_ice;
+create external table tbl_ice(a int, b string, c int) stored by iceberg tblproperties ('format-version'='2');
+insert into tbl_ice values (1, 'one', 50), (2, 'two', 51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5, 'five', 54), (111, 'one', 55), (333, 'two', 56);
+explain select * from tbl_ice order by a, b, c;
+select * from tbl_ice order by a, b, c;
+select count(*) from tbl_ice ;
+
+set hive.iceberg.stats.source=iceberg;
+delete from tbl_ice_puffin where a = 2;
+explain select * from tbl_ice order by a, b, c;
+select count(*) from tbl_ice ;
+
+create table t1 (a int) stored by iceberg tblproperties ('format-version'='2');
+create table t2 (b int) stored by iceberg tblproperties ('format-version'='2');
+describe formatted t1;
+describe formatted t2;
+explain select * from t1 join t2 on t1.a = t2.b;
\ No newline at end of file
diff --git a/iceberg/iceberg-handler/src/test/queries/positive/use_basic_stats_from_iceberg.q b/iceberg/iceberg-handler/src/test/queries/positive/use_basic_stats_from_iceberg.q
index 90e2d95d1df..d80f420c42c 100644
--- a/iceberg/iceberg-handler/src/test/queries/positive/use_basic_stats_from_iceberg.q
+++ b/iceberg/iceberg-handler/src/test/queries/positive/use_basic_stats_from_iceberg.q
@@ -4,13 +4,13 @@ set hive.stats.autogather=true;
set hive.stats.column.autogather=true;
drop table if exists tbl_ice;
-set hive.use.stats.from = metastore;
+set hive.iceberg.stats.source=metastore;
create external table tbl_ice(a int, b string, c int) stored by iceberg tblproperties ('format-version'='2');
insert into tbl_ice values (1, 'one', 50), (2, 'two', 51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5, 'five', 54), (111, 'one', 55), (333, 'two', 56);
explain select * from tbl_ice order by a, b, c;
drop table if exists tbl_ice;
-set hive.use.stats.from = iceberg;
+set hive.iceberg.stats.source = iceberg;
create external table tbl_ice(a int, b string, c int) stored by iceberg tblproperties ('format-version'='2');
insert into tbl_ice values (1, 'one', 50), (2, 'two', 51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5, 'five', 54), (111, 'one', 55), (333, 'two', 56);
explain select * from tbl_ice order by a, b, c;
diff --git a/iceberg/iceberg-handler/src/test/queries/positive/vectorized_iceberg_read_mixed.q b/iceberg/iceberg-handler/src/test/queries/positive/vectorized_iceberg_read_mixed.q
index b630b1f802d..99069bc266a 100644
--- a/iceberg/iceberg-handler/src/test/queries/positive/vectorized_iceberg_read_mixed.q
+++ b/iceberg/iceberg-handler/src/test/queries/positive/vectorized_iceberg_read_mixed.q
@@ -46,6 +46,14 @@ explain select max(t_float), t_double, t_boolean, t_int, t_bigint, t_binary, t_s
select max(t_float), t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal from tbl_ice_mixed_all_types
 group by t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal;
+create external table t1 stored as orc as select * from tbl_ice_mixed_all_types ;
+
+explain select max(t_float), t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal from tbl_ice_mixed_all_types
+ group by t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal;
+select max(t_float), t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal from tbl_ice_mixed_all_types
+ group by t_double, t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal;
+
+
create external table tbl_ice_mixed_parted (
a int,
b string
diff --git a/iceberg/iceberg-handler/src/test/results/positive/col_stats.q.out b/iceberg/iceberg-handler/src/test/results/positive/col_stats.q.out
new file mode 100644
index 00000000000..b1f13fa76b5
--- /dev/null
+++ b/iceberg/iceberg-handler/src/test/results/positive/col_stats.q.out
@@ -0,0 +1,615 @@
+PREHOOK: query: drop table if exists tbl_ice_puffin
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table if exists tbl_ice_puffin
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create external table tbl_ice_puffin(a int, b string, c int)
stored by iceberg tblproperties ('format-version'='2')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tbl_ice_puffin
+POSTHOOK: query: create external table tbl_ice_puffin(a int, b string, c int)
stored by iceberg tblproperties ('format-version'='2')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tbl_ice_puffin
+PREHOOK: query: insert into tbl_ice_puffin values (1, 'one', 50), (2, 'two',
51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5,
'five', 54), (111, 'one', 55), (333, 'two', 56)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@tbl_ice_puffin
+POSTHOOK: query: insert into tbl_ice_puffin values (1, 'one', 50), (2, 'two',
51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5,
'five', 54), (111, 'one', 55), (333, 'two', 56)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@tbl_ice_puffin
+PREHOOK: query: insert into tbl_ice_puffin values (1, 'one', 50), (2, 'two',
51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5,
'five', 54), (111, 'one', 55), (333, 'two', 56)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@tbl_ice_puffin
+POSTHOOK: query: insert into tbl_ice_puffin values (1, 'one', 50), (2, 'two',
51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5,
'five', 54), (111, 'one', 55), (333, 'two', 56)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@tbl_ice_puffin
+PREHOOK: query: explain select * from tbl_ice_puffin order by a, b, c
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain select * from tbl_ice_puffin order by a, b, c
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
+
+Stage-0
+ Fetch Operator
+ limit:-1
+ Stage-1
+ Reducer 2 vectorized
+ File Output Operator [FS_8]
+ Select Operator [SEL_7] (rows=18 width=95)
+ Output:["_col0","_col1","_col2"]
+ <-Map 1 [SIMPLE_EDGE] vectorized
+ SHUFFLE [RS_6]
+ Select Operator [SEL_5] (rows=18 width=95)
+ Output:["_col0","_col1","_col2"]
+ TableScan [TS_0] (rows=18 width=95)
+
default@tbl_ice_puffin,tbl_ice_puffin,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b","c"]
+
+PREHOOK: query: select * from tbl_ice_puffin order by a, b, c
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select * from tbl_ice_puffin order by a, b, c
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1 one 50
+1 one 50
+2 two 51
+2 two 51
+2 two 51
+2 two 51
+2 two 51
+2 two 51
+3 three 52
+3 three 52
+4 four 53
+4 four 53
+5 five 54
+5 five 54
+111 one 55
+111 one 55
+333 two 56
+333 two 56
+PREHOOK: query: desc formatted tbl_ice_puffin b
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: query: desc formatted tbl_ice_puffin b
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@tbl_ice_puffin
+col_name b
+data_type string
+min
+max
+num_nulls 0
+distinct_count 5
+avg_col_len 3.4444444444444446
+max_col_len 5
+num_trues
+num_falses
+bit_vector HL
+comment
+COLUMN_STATS_ACCURATE
{\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"a\":\"true\",\"b\":\"true\",\"c\":\"true\"}}
+PREHOOK: query: update tbl_ice_puffin set b='two' where b='one' or b='three'
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: default@tbl_ice_puffin
+PREHOOK: Output: default@tbl_ice_puffin
+POSTHOOK: query: update tbl_ice_puffin set b='two' where b='one' or b='three'
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: default@tbl_ice_puffin
+POSTHOOK: Output: default@tbl_ice_puffin
+PREHOOK: query: analyze table tbl_ice_puffin compute statistics for columns
+PREHOOK: type: ANALYZE_TABLE
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: default@tbl_ice_puffin
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: analyze table tbl_ice_puffin compute statistics for columns
+POSTHOOK: type: ANALYZE_TABLE
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: default@tbl_ice_puffin
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+PREHOOK: query: explain select * from tbl_ice_puffin order by a, b, c
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain select * from tbl_ice_puffin order by a, b, c
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
+
+Stage-0
+ Fetch Operator
+ limit:-1
+ Stage-1
+ Reducer 2 vectorized
+ File Output Operator [FS_8]
+ Select Operator [SEL_7] (rows=24 width=95)
+ Output:["_col0","_col1","_col2"]
+ <-Map 1 [SIMPLE_EDGE] vectorized
+ SHUFFLE [RS_6]
+ Select Operator [SEL_5] (rows=24 width=95)
+ Output:["_col0","_col1","_col2"]
+ TableScan [TS_0] (rows=24 width=95)
+
default@tbl_ice_puffin,tbl_ice_puffin,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b","c"]
+
+PREHOOK: query: select * from tbl_ice_puffin order by a, b, c
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select * from tbl_ice_puffin order by a, b, c
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1 two 50
+1 two 50
+2 two 51
+2 two 51
+2 two 51
+2 two 51
+2 two 51
+2 two 51
+3 two 52
+3 two 52
+4 four 53
+4 four 53
+5 five 54
+5 five 54
+111 two 55
+111 two 55
+333 two 56
+333 two 56
+PREHOOK: query: select count(*) from tbl_ice_puffin
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select count(*) from tbl_ice_puffin
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+18
+PREHOOK: query: desc formatted tbl_ice_puffin b
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: query: desc formatted tbl_ice_puffin b
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@tbl_ice_puffin
+col_name b
+data_type string
+min
+max
+num_nulls 0
+distinct_count 3
+avg_col_len 3.2222222222222223
+max_col_len 4
+num_trues
+num_falses
+bit_vector HL
+comment
+COLUMN_STATS_ACCURATE
{\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"a\":\"true\",\"b\":\"true\",\"c\":\"true\"}}
+PREHOOK: query: drop table if exists tbl_ice_puffin
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: default@tbl_ice_puffin
+POSTHOOK: query: drop table if exists tbl_ice_puffin
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: default@tbl_ice_puffin
+PREHOOK: query: create external table tbl_ice_puffin(a int, b string, c int)
stored by iceberg tblproperties ('format-version'='2')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tbl_ice_puffin
+POSTHOOK: query: create external table tbl_ice_puffin(a int, b string, c int)
stored by iceberg tblproperties ('format-version'='2')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tbl_ice_puffin
+PREHOOK: query: insert into tbl_ice_puffin values (1, 'one', 50), (2, 'two',
51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5,
'five', 54), (111, 'one', 55), (333, 'two', 56)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@tbl_ice_puffin
+POSTHOOK: query: insert into tbl_ice_puffin values (1, 'one', 50), (2, 'two',
51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5,
'five', 54), (111, 'one', 55), (333, 'two', 56)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@tbl_ice_puffin
+PREHOOK: query: explain select * from tbl_ice_puffin order by a, b, c
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain select * from tbl_ice_puffin order by a, b, c
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
+
+Stage-0
+ Fetch Operator
+ limit:-1
+ Stage-1
+ Reducer 2 vectorized
+ File Output Operator [FS_8]
+ Select Operator [SEL_7] (rows=9 width=95)
+ Output:["_col0","_col1","_col2"]
+ <-Map 1 [SIMPLE_EDGE] vectorized
+ SHUFFLE [RS_6]
+ Select Operator [SEL_5] (rows=9 width=95)
+ Output:["_col0","_col1","_col2"]
+ TableScan [TS_0] (rows=9 width=95)
+
default@tbl_ice_puffin,tbl_ice_puffin,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b","c"]
+
+PREHOOK: query: drop table if exists tbl_ice_puffin
+PREHOOK: type: DROPTABLE
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: default@tbl_ice_puffin
+POSTHOOK: query: drop table if exists tbl_ice_puffin
+POSTHOOK: type: DROPTABLE
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: default@tbl_ice_puffin
+PREHOOK: query: create external table tbl_ice_puffin(a int, b string, c int)
stored by iceberg tblproperties ('format-version'='2')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tbl_ice_puffin
+POSTHOOK: query: create external table tbl_ice_puffin(a int, b string, c int)
stored by iceberg tblproperties ('format-version'='2')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tbl_ice_puffin
+PREHOOK: query: insert into tbl_ice_puffin values (1, 'one', 50), (2, 'two',
51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5,
'five', 54), (111, 'one', 55), (333, 'two', 56)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@tbl_ice_puffin
+POSTHOOK: query: insert into tbl_ice_puffin values (1, 'one', 50), (2, 'two',
51),(2, 'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5,
'five', 54), (111, 'one', 55), (333, 'two', 56)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@tbl_ice_puffin
+PREHOOK: query: explain select * from tbl_ice_puffin order by a, b, c
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain select * from tbl_ice_puffin order by a, b, c
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
+
+Stage-0
+ Fetch Operator
+ limit:-1
+ Stage-1
+ Reducer 2 vectorized
+ File Output Operator [FS_8]
+ Select Operator [SEL_7] (rows=9 width=95)
+ Output:["_col0","_col1","_col2"]
+ <-Map 1 [SIMPLE_EDGE] vectorized
+ SHUFFLE [RS_6]
+ Select Operator [SEL_5] (rows=9 width=95)
+ Output:["_col0","_col1","_col2"]
+ TableScan [TS_0] (rows=9 width=95)
+
default@tbl_ice_puffin,tbl_ice_puffin,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b","c"]
+
+PREHOOK: query: select * from tbl_ice_puffin order by a, b, c
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select * from tbl_ice_puffin order by a, b, c
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1 one 50
+2 two 51
+2 two 51
+2 two 51
+3 three 52
+4 four 53
+5 five 54
+111 one 55
+333 two 56
+PREHOOK: query: select count(*) from tbl_ice_puffin
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select count(*) from tbl_ice_puffin
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+9
+PREHOOK: query: desc formatted tbl_ice_puffin a
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: query: desc formatted tbl_ice_puffin a
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@tbl_ice_puffin
+col_name a
+data_type int
+min 1
+max 333
+num_nulls 0
+distinct_count 7
+avg_col_len
+max_col_len
+num_trues
+num_falses
+bit_vector HL
+comment
+COLUMN_STATS_ACCURATE
{\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"a\":\"true\",\"b\":\"true\",\"c\":\"true\"}}
+PREHOOK: query: drop table if exists tbl_ice
+PREHOOK: type: DROPTABLE
+POSTHOOK: query: drop table if exists tbl_ice
+POSTHOOK: type: DROPTABLE
+PREHOOK: query: create external table tbl_ice(a int, b string, c int) stored
by iceberg tblproperties ('format-version'='2')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tbl_ice
+POSTHOOK: query: create external table tbl_ice(a int, b string, c int) stored
by iceberg tblproperties ('format-version'='2')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tbl_ice
+PREHOOK: query: insert into tbl_ice values (1, 'one', 50), (2, 'two', 51),(2,
'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5, 'five', 54),
(111, 'one', 55), (333, 'two', 56)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@tbl_ice
+POSTHOOK: query: insert into tbl_ice values (1, 'one', 50), (2, 'two', 51),(2,
'two', 51),(2, 'two', 51), (3, 'three', 52), (4, 'four', 53), (5, 'five', 54),
(111, 'one', 55), (333, 'two', 56)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@tbl_ice
+PREHOOK: query: explain select * from tbl_ice order by a, b, c
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain select * from tbl_ice order by a, b, c
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
+
+Stage-0
+ Fetch Operator
+ limit:-1
+ Stage-1
+ Reducer 2 vectorized
+ File Output Operator [FS_8]
+ Select Operator [SEL_7] (rows=9 width=95)
+ Output:["_col0","_col1","_col2"]
+ <-Map 1 [SIMPLE_EDGE] vectorized
+ SHUFFLE [RS_6]
+ Select Operator [SEL_5] (rows=9 width=95)
+ Output:["_col0","_col1","_col2"]
+ TableScan [TS_0] (rows=9 width=95)
+
default@tbl_ice,tbl_ice,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b","c"]
+
+PREHOOK: query: select * from tbl_ice order by a, b, c
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select * from tbl_ice order by a, b, c
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1 one 50
+2 two 51
+2 two 51
+2 two 51
+3 three 52
+4 four 53
+5 five 54
+111 one 55
+333 two 56
+PREHOOK: query: select count(*) from tbl_ice
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select count(*) from tbl_ice
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+9
+PREHOOK: query: delete from tbl_ice_puffin where a = 2
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_puffin
+PREHOOK: Output: default@tbl_ice_puffin
+POSTHOOK: query: delete from tbl_ice_puffin where a = 2
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_puffin
+POSTHOOK: Output: default@tbl_ice_puffin
+PREHOOK: query: explain select * from tbl_ice order by a, b, c
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain select * from tbl_ice order by a, b, c
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
+
+Stage-0
+ Fetch Operator
+ limit:-1
+ Stage-1
+ Reducer 2 vectorized
+ File Output Operator [FS_8]
+ Select Operator [SEL_7] (rows=9 width=95)
+ Output:["_col0","_col1","_col2"]
+ <-Map 1 [SIMPLE_EDGE] vectorized
+ SHUFFLE [RS_6]
+ Select Operator [SEL_5] (rows=9 width=95)
+ Output:["_col0","_col1","_col2"]
+ TableScan [TS_0] (rows=9 width=95)
+
default@tbl_ice,tbl_ice,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b","c"]
+
+PREHOOK: query: select count(*) from tbl_ice
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select count(*) from tbl_ice
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+9
+PREHOOK: query: create table t1 (a int) stored by iceberg tblproperties
('format-version'='2')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t1
+POSTHOOK: query: create table t1 (a int) stored by iceberg tblproperties
('format-version'='2')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t1
+PREHOOK: query: create table t2 (b int) stored by iceberg tblproperties
('format-version'='2')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t2
+POSTHOOK: query: create table t2 (b int) stored by iceberg tblproperties
('format-version'='2')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t2
+PREHOOK: query: describe formatted t1
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@t1
+POSTHOOK: query: describe formatted t1
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@t1
+# col_name data_type comment
+a int
+
+# Detailed Table Information
+Database: default
+#### A masked pattern was here ####
+Retention: 0
+#### A masked pattern was here ####
+Table Type: MANAGED_TABLE
+Table Parameters:
+ COLUMN_STATS_ACCURATE
{\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"a\":\"true\"}}
+ bucketing_version 2
+ current-schema
{\"type\":\"struct\",\"schema-id\":0,\"fields\":[{\"id\":1,\"name\":\"a\",\"required\":false,\"type\":\"int\"}]}
+ engine.hive.enabled true
+ format-version 2
+ iceberg.orc.files.only false
+ metadata_location hdfs://### HDFS PATH ###
+ numFiles 0
+ numRows 0
+ rawDataSize 0
+ serialization.format 1
+ snapshot-count 0
+ storage_handler
org.apache.iceberg.mr.hive.HiveIcebergStorageHandler
+ table_type ICEBERG
+ totalSize 0
+#### A masked pattern was here ####
+ uuid #Masked#
+ write.delete.mode merge-on-read
+ write.merge.mode merge-on-read
+ write.update.mode merge-on-read
+
+# Storage Information
+SerDe Library: org.apache.iceberg.mr.hive.HiveIcebergSerDe
+InputFormat: org.apache.iceberg.mr.hive.HiveIcebergInputFormat
+OutputFormat: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat
+Compressed: No
+Sort Columns: []
+PREHOOK: query: describe formatted t2
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@t2
+POSTHOOK: query: describe formatted t2
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@t2
+# col_name data_type comment
+b int
+
+# Detailed Table Information
+Database: default
+#### A masked pattern was here ####
+Retention: 0
+#### A masked pattern was here ####
+Table Type: MANAGED_TABLE
+Table Parameters:
+ COLUMN_STATS_ACCURATE
{\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"b\":\"true\"}}
+ bucketing_version 2
+ current-schema
{\"type\":\"struct\",\"schema-id\":0,\"fields\":[{\"id\":1,\"name\":\"b\",\"required\":false,\"type\":\"int\"}]}
+ engine.hive.enabled true
+ format-version 2
+ iceberg.orc.files.only false
+ metadata_location hdfs://### HDFS PATH ###
+ numFiles 0
+ numRows 0
+ rawDataSize 0
+ serialization.format 1
+ snapshot-count 0
+ storage_handler
org.apache.iceberg.mr.hive.HiveIcebergStorageHandler
+ table_type ICEBERG
+ totalSize 0
+#### A masked pattern was here ####
+ uuid #Masked#
+ write.delete.mode merge-on-read
+ write.merge.mode merge-on-read
+ write.update.mode merge-on-read
+
+# Storage Information
+SerDe Library: org.apache.iceberg.mr.hive.HiveIcebergSerDe
+InputFormat: org.apache.iceberg.mr.hive.HiveIcebergInputFormat
+OutputFormat: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat
+Compressed: No
+Sort Columns: []
+PREHOOK: query: explain select * from t1 join t2 on t1.a = t2.b
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t1
+PREHOOK: Input: default@t2
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain select * from t1 join t2 on t1.a = t2.b
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t1
+POSTHOOK: Input: default@t2
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 3 (SIMPLE_EDGE)
+
+Stage-0
+ Fetch Operator
+ limit:-1
+ Stage-1
+ Reducer 2
+ File Output Operator [FS_10]
+ Merge Join Operator [MERGEJOIN_25] (rows=1 width=4)
+ Conds:RS_28._col0=RS_31._col0(Inner),Output:["_col0","_col1"]
+ <-Map 1 [SIMPLE_EDGE] vectorized
+ SHUFFLE [RS_28]
+ PartitionCols:_col0
+ Select Operator [SEL_27] (rows=1 width=4)
+ Output:["_col0"]
+ Filter Operator [FIL_26] (rows=1 width=4)
+ predicate:a is not null
+ TableScan [TS_0] (rows=1 width=4)
+ default@t1,t1,Tbl:COMPLETE,Col:NONE,Output:["a"]
+ <-Map 3 [SIMPLE_EDGE] vectorized
+ SHUFFLE [RS_31]
+ PartitionCols:_col0
+ Select Operator [SEL_30] (rows=1 width=4)
+ Output:["_col0"]
+ Filter Operator [FIL_29] (rows=1 width=4)
+ predicate:b is not null
+ TableScan [TS_3] (rows=1 width=4)
+ default@t2,t2,Tbl:COMPLETE,Col:NONE,Output:["b"]
+
diff --git a/iceberg/iceberg-handler/src/test/results/positive/dynamic_partition_writes.q.out b/iceberg/iceberg-handler/src/test/results/positive/dynamic_partition_writes.q.out
index 2cf955f898c..7e7a5eab1e3 100644
--- a/iceberg/iceberg-handler/src/test/results/positive/dynamic_partition_writes.q.out
+++ b/iceberg/iceberg-handler/src/test/results/positive/dynamic_partition_writes.q.out
@@ -76,9 +76,9 @@ Stage-3
<-Map 1 [SIMPLE_EDGE] vectorized
PARTITION_ONLY_SHUFFLE [RS_13]
PartitionCols:_col1
- Select Operator [SEL_12] (rows=22 width=87)
+ Select Operator [SEL_12] (rows=22 width=91)
Output:["_col0","_col1"]
- TableScan [TS_0] (rows=22 width=87)
+ TableScan [TS_0] (rows=22 width=91)
default@tbl_src,tbl_src,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b"]
Reducer 3 vectorized
File Output Operator [FS_21]
@@ -90,7 +90,7 @@ Stage-3
PARTITION_ONLY_SHUFFLE [RS_16]
Group By Operator [GBY_15] (rows=1 width=400)
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8"],aggregations:["min(a)","max(a)","count(1)","count(a)","compute_bit_vector_hll(a)","max(length(ccy))","avg(COALESCE(length(ccy),0))","count(ccy)","compute_bit_vector_hll(ccy)"]
- Select Operator [SEL_14] (rows=22 width=87)
+ Select Operator [SEL_14] (rows=22 width=91)
Output:["a","ccy"]
Please refer to the previous Select Operator
[SEL_12]
@@ -170,9 +170,9 @@ Stage-3
<-Map 1 [SIMPLE_EDGE] vectorized
PARTITION_ONLY_SHUFFLE [RS_13]
PartitionCols:iceberg_bucket(_col1, 2)
- Select Operator [SEL_12] (rows=22 width=87)
+ Select Operator [SEL_12] (rows=22 width=91)
Output:["_col0","_col1"]
- TableScan [TS_0] (rows=22 width=87)
+ TableScan [TS_0] (rows=22 width=91)
default@tbl_src,tbl_src,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b"]
Reducer 3 vectorized
File Output Operator [FS_21]
@@ -184,7 +184,7 @@ Stage-3
PARTITION_ONLY_SHUFFLE [RS_16]
Group By Operator [GBY_15] (rows=1 width=400)
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8"],aggregations:["min(a)","max(a)","count(1)","count(a)","compute_bit_vector_hll(a)","max(length(ccy))","avg(COALESCE(length(ccy),0))","count(ccy)","compute_bit_vector_hll(ccy)"]
- Select Operator [SEL_14] (rows=22 width=87)
+ Select Operator [SEL_14] (rows=22 width=91)
Output:["a","ccy"]
Please refer to the previous Select Operator
[SEL_12]
@@ -264,9 +264,9 @@ Stage-3
<-Map 1 [SIMPLE_EDGE] vectorized
PARTITION_ONLY_SHUFFLE [RS_13]
PartitionCols:_col1, iceberg_bucket(_col2, 3)
- Select Operator [SEL_12] (rows=22 width=94)
+ Select Operator [SEL_12] (rows=22 width=99)
Output:["_col0","_col1","_col2"]
- TableScan [TS_0] (rows=22 width=94)
+ TableScan [TS_0] (rows=22 width=99)
default@tbl_src,tbl_src,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b","c"]
Reducer 3 vectorized
File Output Operator [FS_21]
@@ -278,7 +278,7 @@ Stage-3
PARTITION_ONLY_SHUFFLE [RS_16]
Group By Operator [GBY_15] (rows=1 width=568)
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9","_col10","_col11","_col12"],aggregations:["min(a)","max(a)","count(1)","count(a)","compute_bit_vector_hll(a)","max(length(ccy))","avg(COALESCE(length(ccy),0))","count(ccy)","compute_bit_vector_hll(ccy)","min(c)","max(c)","count(c)","compute_bit_vector_hll(c)"]
- Select Operator [SEL_14] (rows=22 width=94)
+ Select Operator [SEL_14] (rows=22 width=99)
Output:["a","ccy","c"]
Please refer to the previous Select Operator
[SEL_12]
@@ -403,7 +403,7 @@ Stage-3
Output:["_col0","_col1","_col2"]
Filter Operator [FIL_14] (rows=4 width=99)
predicate:(b = 'EUR')
- TableScan [TS_0] (rows=22 width=94)
+ TableScan [TS_0] (rows=22 width=99)
default@tbl_src,tbl_src,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b","c"]
Reducer 3 vectorized
File Output Operator [FS_24]
@@ -461,7 +461,7 @@ Stage-3
Output:["_col0","_col1","_col2"]
Filter Operator [FIL_12] (rows=1 width=99)
predicate:((c = 100L) and (b = 'USD'))
- TableScan [TS_0] (rows=22 width=94)
+ TableScan [TS_0] (rows=22 width=99)
default@tbl_src,tbl_src,Tbl:COMPLETE,Col:COMPLETE,Output:["a","b","c"]
PARTITION_ONLY_SHUFFLE [RS_17]
Group By Operator [GBY_16] (rows=1 width=568)
diff --git a/iceberg/iceberg-handler/src/test/results/positive/llap/vectorized_iceberg_read_mixed.q.out b/iceberg/iceberg-handler/src/test/results/positive/llap/vectorized_iceberg_read_mixed.q.out
index bdef92e60f6..50ce82dc248 100644
--- a/iceberg/iceberg-handler/src/test/results/positive/llap/vectorized_iceberg_read_mixed.q.out
+++ b/iceberg/iceberg-handler/src/test/results/positive/llap/vectorized_iceberg_read_mixed.q.out
@@ -589,13 +589,13 @@ STAGE PLANS:
minReductionHashAggr: 0.99
mode: hash
outputColumnNames: _col0, _col1, _col2, _col3, _col4,
_col5, _col6, _col7, _col8, _col9
- Statistics: Num rows: 2 Data size: 746 Basic stats:
COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 1 Data size: 373 Basic stats:
COMPLETE Column stats: COMPLETE
Reduce Output Operator
key expressions: _col0 (type: double), _col1 (type:
boolean), _col2 (type: int), _col3 (type: bigint), _col4 (type: binary), _col5
(type: string), _col6 (type: timestamp), _col7 (type: date), _col8 (type:
decimal(4,2))
null sort order: zzzzzzzzz
sort order: +++++++++
Map-reduce partition columns: _col0 (type: double),
_col1 (type: boolean), _col2 (type: int), _col3 (type: bigint), _col4 (type:
binary), _col5 (type: string), _col6 (type: timestamp), _col7 (type: date),
_col8 (type: decimal(4,2))
- Statistics: Num rows: 2 Data size: 746 Basic stats:
COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 1 Data size: 373 Basic stats:
COMPLETE Column stats: COMPLETE
value expressions: _col9 (type: float)
Execution mode: vectorized, llap
LLAP IO: all inputs (cache only)
@@ -607,14 +607,116 @@ STAGE PLANS:
keys: KEY._col0 (type: double), KEY._col1 (type: boolean),
KEY._col2 (type: int), KEY._col3 (type: bigint), KEY._col4 (type: binary),
KEY._col5 (type: string), KEY._col6 (type: timestamp), KEY._col7 (type: date),
KEY._col8 (type: decimal(4,2))
mode: mergepartial
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5,
_col6, _col7, _col8, _col9
- Statistics: Num rows: 2 Data size: 746 Basic stats: COMPLETE
Column stats: COMPLETE
+ Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE
Column stats: COMPLETE
Select Operator
expressions: _col9 (type: float), _col0 (type: double),
_col1 (type: boolean), _col2 (type: int), _col3 (type: bigint), _col4 (type:
binary), _col5 (type: string), _col6 (type: timestamp), _col7 (type: date),
_col8 (type: decimal(4,2))
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5,
_col6, _col7, _col8, _col9
+ Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE
Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 1 Data size: 373 Basic stats:
COMPLETE Column stats: COMPLETE
+ table:
+ input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde:
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select max(t_float), t_double, t_boolean, t_int, t_bigint,
t_binary, t_string, t_timestamp, t_date, t_decimal from tbl_ice_mixed_all_types
+ group by t_double, t_boolean, t_int, t_bigint, t_binary, t_string,
t_timestamp, t_date, t_decimal
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_mixed_all_types
+#### A masked pattern was here ####
+POSTHOOK: query: select max(t_float), t_double, t_boolean, t_int, t_bigint,
t_binary, t_string, t_timestamp, t_date, t_decimal from tbl_ice_mixed_all_types
+ group by t_double, t_boolean, t_int, t_bigint, t_binary, t_string,
t_timestamp, t_date, t_decimal
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_mixed_all_types
+#### A masked pattern was here ####
+1.1 1.2 false 4 567890123456789 6 col7 2012-10-03
19:58:08 1234-09-02 10.01
+5.1 6.2 true 40 567890123456780 8 col07 2012-10-03
19:58:09 1234-09-03 10.02
+PREHOOK: query: create external table t1 stored as orc as select * from
tbl_ice_mixed_all_types
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@tbl_ice_mixed_all_types
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t1
+POSTHOOK: query: create external table t1 stored as orc as select * from
tbl_ice_mixed_all_types
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@tbl_ice_mixed_all_types
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t1
+POSTHOOK: Lineage: t1.t_bigint SIMPLE
[(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_bigint,
type:bigint, comment:null), ]
+POSTHOOK: Lineage: t1.t_binary SIMPLE
[(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_binary,
type:binary, comment:null), ]
+POSTHOOK: Lineage: t1.t_boolean SIMPLE
[(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_boolean,
type:boolean, comment:null), ]
+POSTHOOK: Lineage: t1.t_date SIMPLE
[(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_date,
type:date, comment:null), ]
+POSTHOOK: Lineage: t1.t_decimal SIMPLE
[(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_decimal,
type:decimal(4,2), comment:null), ]
+POSTHOOK: Lineage: t1.t_double SIMPLE
[(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_double,
type:double, comment:null), ]
+POSTHOOK: Lineage: t1.t_float SIMPLE
[(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_float,
type:float, comment:null), ]
+POSTHOOK: Lineage: t1.t_int SIMPLE
[(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_int,
type:int, comment:null), ]
+POSTHOOK: Lineage: t1.t_string SIMPLE
[(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_string,
type:string, comment:null), ]
+POSTHOOK: Lineage: t1.t_timestamp SIMPLE
[(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_timestamp,
type:timestamp, comment:null), ]
+PREHOOK: query: explain select max(t_float), t_double, t_boolean, t_int,
t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal from
tbl_ice_mixed_all_types
+ group by t_double, t_boolean, t_int, t_bigint, t_binary, t_string,
t_timestamp, t_date, t_decimal
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_mixed_all_types
+#### A masked pattern was here ####
+POSTHOOK: query: explain select max(t_float), t_double, t_boolean, t_int,
t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal from
tbl_ice_mixed_all_types
+ group by t_double, t_boolean, t_int, t_bigint, t_binary, t_string,
t_timestamp, t_date, t_decimal
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_mixed_all_types
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: tbl_ice_mixed_all_types
Statistics: Num rows: 2 Data size: 746 Basic stats: COMPLETE
Column stats: COMPLETE
+ Group By Operator
+ aggregations: max(t_float)
+ keys: t_double (type: double), t_boolean (type: boolean),
t_int (type: int), t_bigint (type: bigint), t_binary (type: binary), t_string
(type: string), t_timestamp (type: timestamp), t_date (type: date), t_decimal
(type: decimal(4,2))
+ minReductionHashAggr: 0.99
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4,
_col5, _col6, _col7, _col8, _col9
+ Statistics: Num rows: 1 Data size: 373 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: double), _col1 (type:
boolean), _col2 (type: int), _col3 (type: bigint), _col4 (type: binary), _col5
(type: string), _col6 (type: timestamp), _col7 (type: date), _col8 (type:
decimal(4,2))
+ null sort order: zzzzzzzzz
+ sort order: +++++++++
+ Map-reduce partition columns: _col0 (type: double),
_col1 (type: boolean), _col2 (type: int), _col3 (type: bigint), _col4 (type:
binary), _col5 (type: string), _col6 (type: timestamp), _col7 (type: date),
_col8 (type: decimal(4,2))
+ Statistics: Num rows: 1 Data size: 373 Basic stats:
COMPLETE Column stats: COMPLETE
+ value expressions: _col9 (type: float)
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs (cache only)
+ Reducer 2
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: max(VALUE._col0)
+ keys: KEY._col0 (type: double), KEY._col1 (type: boolean),
KEY._col2 (type: int), KEY._col3 (type: bigint), KEY._col4 (type: binary),
KEY._col5 (type: string), KEY._col6 (type: timestamp), KEY._col7 (type: date),
KEY._col8 (type: decimal(4,2))
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5,
_col6, _col7, _col8, _col9
+ Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE
Column stats: COMPLETE
+ Select Operator
+ expressions: _col9 (type: float), _col0 (type: double),
_col1 (type: boolean), _col2 (type: int), _col3 (type: bigint), _col4 (type:
binary), _col5 (type: string), _col6 (type: timestamp), _col7 (type: date),
_col8 (type: decimal(4,2))
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5,
_col6, _col7, _col8, _col9
+ Statistics: Num rows: 1 Data size: 373 Basic stats: COMPLETE
Column stats: COMPLETE
File Output Operator
compressed: false
- Statistics: Num rows: 2 Data size: 746 Basic stats:
COMPLETE Column stats: COMPLETE
+ Statistics: Num rows: 1 Data size: 373 Basic stats:
COMPLETE Column stats: COMPLETE
table:
input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
diff --git a/iceberg/iceberg-handler/src/test/results/positive/vectorized_iceberg_read_mixed.q.out b/iceberg/iceberg-handler/src/test/results/positive/vectorized_iceberg_read_mixed.q.out
index a43950aa6ac..34696284306 100644
--- a/iceberg/iceberg-handler/src/test/results/positive/vectorized_iceberg_read_mixed.q.out
+++ b/iceberg/iceberg-handler/src/test/results/positive/vectorized_iceberg_read_mixed.q.out
@@ -538,14 +538,79 @@ Stage-0
Stage-1
Reducer 2 vectorized
File Output Operator [FS_11]
- Select Operator [SEL_10] (rows=2 width=373)
+ Select Operator [SEL_10] (rows=1 width=373)
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9"]
- Group By Operator [GBY_9] (rows=2 width=373)
+ Group By Operator [GBY_9] (rows=1 width=373)
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9"],aggregations:["max(VALUE._col0)"],keys:KEY._col0,
KEY._col1, KEY._col2, KEY._col3, KEY._col4, KEY._col5, KEY._col6, KEY._col7,
KEY._col8
<-Map 1 [SIMPLE_EDGE] vectorized
SHUFFLE [RS_8]
PartitionCols:_col0, _col1, _col2, _col3, _col4, _col5, _col6,
_col7, _col8
- Group By Operator [GBY_7] (rows=2 width=373)
+ Group By Operator [GBY_7] (rows=1 width=373)
+
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9"],aggregations:["max(t_float)"],keys:t_double,
t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal
+ TableScan [TS_0] (rows=2 width=373)
+
default@tbl_ice_mixed_all_types,tbl_ice_mixed_all_types,Tbl:COMPLETE,Col:COMPLETE,Output:["t_float","t_double","t_boolean","t_int","t_bigint","t_binary","t_string","t_timestamp","t_date","t_decimal"]
+
+PREHOOK: query: select max(t_float), t_double, t_boolean, t_int, t_bigint,
t_binary, t_string, t_timestamp, t_date, t_decimal from tbl_ice_mixed_all_types
+ group by t_double, t_boolean, t_int, t_bigint, t_binary, t_string,
t_timestamp, t_date, t_decimal
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_mixed_all_types
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select max(t_float), t_double, t_boolean, t_int, t_bigint,
t_binary, t_string, t_timestamp, t_date, t_decimal from tbl_ice_mixed_all_types
+ group by t_double, t_boolean, t_int, t_bigint, t_binary, t_string,
t_timestamp, t_date, t_decimal
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_mixed_all_types
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1.1 1.2 false 4 567890123456789 6 col7 2012-10-03
19:58:08 1234-09-02 10.01
+5.1 6.2 true 40 567890123456780 8 col07 2012-10-03
19:58:09 1234-09-03 10.02
+PREHOOK: query: create external table t1 stored as orc as select * from
tbl_ice_mixed_all_types
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@tbl_ice_mixed_all_types
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t1
+POSTHOOK: query: create external table t1 stored as orc as select * from
tbl_ice_mixed_all_types
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@tbl_ice_mixed_all_types
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t1
+POSTHOOK: Lineage: t1.t_bigint SIMPLE
[(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_bigint,
type:bigint, comment:null), ]
+POSTHOOK: Lineage: t1.t_binary SIMPLE
[(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_binary,
type:binary, comment:null), ]
+POSTHOOK: Lineage: t1.t_boolean SIMPLE
[(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_boolean,
type:boolean, comment:null), ]
+POSTHOOK: Lineage: t1.t_date SIMPLE
[(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_date,
type:date, comment:null), ]
+POSTHOOK: Lineage: t1.t_decimal SIMPLE
[(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_decimal,
type:decimal(4,2), comment:null), ]
+POSTHOOK: Lineage: t1.t_double SIMPLE
[(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_double,
type:double, comment:null), ]
+POSTHOOK: Lineage: t1.t_float SIMPLE
[(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_float,
type:float, comment:null), ]
+POSTHOOK: Lineage: t1.t_int SIMPLE
[(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_int,
type:int, comment:null), ]
+POSTHOOK: Lineage: t1.t_string SIMPLE
[(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_string,
type:string, comment:null), ]
+POSTHOOK: Lineage: t1.t_timestamp SIMPLE
[(tbl_ice_mixed_all_types)tbl_ice_mixed_all_types.FieldSchema(name:t_timestamp,
type:timestamp, comment:null), ]
+PREHOOK: query: explain select max(t_float), t_double, t_boolean, t_int,
t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal from
tbl_ice_mixed_all_types
+ group by t_double, t_boolean, t_int, t_bigint, t_binary, t_string,
t_timestamp, t_date, t_decimal
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice_mixed_all_types
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain select max(t_float), t_double, t_boolean, t_int,
t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal from
tbl_ice_mixed_all_types
+ group by t_double, t_boolean, t_int, t_bigint, t_binary, t_string,
t_timestamp, t_date, t_decimal
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice_mixed_all_types
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
+
+Stage-0
+ Fetch Operator
+ limit:-1
+ Stage-1
+ Reducer 2 vectorized
+ File Output Operator [FS_11]
+ Select Operator [SEL_10] (rows=1 width=373)
+
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9"]
+ Group By Operator [GBY_9] (rows=1 width=373)
+
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9"],aggregations:["max(VALUE._col0)"],keys:KEY._col0,
KEY._col1, KEY._col2, KEY._col3, KEY._col4, KEY._col5, KEY._col6, KEY._col7,
KEY._col8
+ <-Map 1 [SIMPLE_EDGE] vectorized
+ SHUFFLE [RS_8]
+ PartitionCols:_col0, _col1, _col2, _col3, _col4, _col5, _col6,
_col7, _col8
+ Group By Operator [GBY_7] (rows=1 width=373)
Output:["_col0","_col1","_col2","_col3","_col4","_col5","_col6","_col7","_col8","_col9"],aggregations:["max(t_float)"],keys:t_double,
t_boolean, t_int, t_bigint, t_binary, t_string, t_timestamp, t_date, t_decimal
TableScan [TS_0] (rows=2 width=373)
default@tbl_ice_mixed_all_types,tbl_ice_mixed_all_types,Tbl:COMPLETE,Col:COMPLETE,Output:["t_float","t_double","t_boolean","t_int","t_bigint","t_binary","t_string","t_timestamp","t_date","t_decimal"]
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java
b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java
index aff2f51cbc1..65e14af478a 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/metadata/HiveStorageHandler.java
@@ -29,6 +29,8 @@ import
org.apache.hadoop.hive.common.classification.InterfaceStability;
import org.apache.hadoop.hive.common.type.SnapshotContext;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.HiveMetaHook;
+import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.EnvironmentContext;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.LockType;
@@ -42,6 +44,7 @@ import org.apache.hadoop.hive.ql.hooks.WriteEntity;
import org.apache.hadoop.hive.ql.parse.AlterTableExecuteSpec;
import org.apache.hadoop.hive.ql.parse.TransformSpec;
import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.ColStatistics;
import org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx;
import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
import org.apache.hadoop.hive.ql.plan.FileSinkDesc;
@@ -245,6 +248,44 @@ public interface HiveStorageHandler extends Configurable {
return false;
}
+  /**
+   * Return some col statistics (Lower bounds, Upper bounds, Null value counts, NaN, total counts) calculated by
+   * the underlying storage handler implementation.
+   * @param table
+   * @return A List of Column Statistics Objects, can be null
+   */
+  default List<ColumnStatisticsObj> getColStatistics(org.apache.hadoop.hive.ql.metadata.Table table) {
+    return null;
+  }
+
+  /**
+   * Set column stats for non-native tables
+   * @param table
+   * @param colStats
+   * @return boolean
+   */
+  default boolean setColStatistics(org.apache.hadoop.hive.ql.metadata.Table table,
+      List<ColumnStatistics> colStats) {
+    return false;
+  }
+
+  /**
+   * Check if the storage handler can provide col statistics.
+   * @param tbl
+   * @return true if the storage handler can supply the col statistics
+   */
+  default boolean canProvideColStatistics(org.apache.hadoop.hive.ql.metadata.Table tbl) {
+    return false;
+  }
+
+  /**
+   * Check if the storage handler can set col statistics.
+   * @return true if the storage handler can set the col statistics
+   */
+  default boolean canSetColStatistics(org.apache.hadoop.hive.ql.metadata.Table tbl) {
+    return false;
+  }
+
   /**
    * Check if CTAS and CMV operations should behave in a direct-insert manner (i.e. no move task).
    * <p>
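
As a usage illustration (not part of this commit), a storage handler can opt into these hooks roughly as sketched below; ColStatsProcessor then pushes computed stats into setColStatistics and StatsUtils reads them back through getColStatistics, as the next two hunks show. The class name and the in-memory "store" field are invented for illustration; a real handler (such as the Iceberg one above) would persist the statistics next to the table data.

    import java.util.List;
    import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
    import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
    import org.apache.hadoop.hive.ql.metadata.DefaultStorageHandler;
    import org.apache.hadoop.hive.ql.metadata.Table;

    public class ExampleStatsStorageHandler extends DefaultStorageHandler {

      private List<ColumnStatistics> store;  // stands in for handler-managed storage, e.g. a Puffin file

      @Override
      public boolean canSetColStatistics(Table table) {
        return true;  // advertise that this handler persists column stats itself
      }

      @Override
      public boolean setColStatistics(Table table, List<ColumnStatistics> colStats) {
        store = colStats;  // a real handler would serialize these alongside the table
        return true;
      }

      @Override
      public boolean canProvideColStatistics(Table table) {
        return store != null;  // only offer stats once something has been written
      }

      @Override
      public List<ColumnStatisticsObj> getColStatistics(Table table) {
        return store == null ? null : store.get(0).getStatsObj();
      }
    }
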
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/ColStatsProcessor.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/ColStatsProcessor.java
index 95d4b439d16..e2ee8ae07b4 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/stats/ColStatsProcessor.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/ColStatsProcessor.java
@@ -218,6 +218,9 @@ public class ColStatsProcessor implements IStatsProcessor {
     }
     start = System.currentTimeMillis();
+    if (tbl != null && tbl.isNonNative() && tbl.getStorageHandler().canSetColStatistics(tbl)) {
+      tbl.getStorageHandler().setColStatistics(tbl, colStats);
+    }
     db.setPartitionColumnStatistics(request);
     end = System.currentTimeMillis();
     LOG.info("Time taken to update " + colStats.size() + " stats : " + ((end - start)/1000F) + " seconds.");
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
index 9c1926a747e..a758bcdecd1 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/StatsUtils.java
@@ -1069,8 +1069,12 @@ public class StatsUtils {
     }
     if (fetchColStats && !colStatsToRetrieve.isEmpty()) {
       try {
-        List<ColumnStatisticsObj> colStat = Hive.get().getTableColumnStatistics(
-            dbName, tabName, colStatsToRetrieve, false);
+        List<ColumnStatisticsObj> colStat;
+        if (table.isNonNative() && table.getStorageHandler().canProvideColStatistics(table)) {
+          colStat = table.getStorageHandler().getColStatistics(table);
+        } else {
+          colStat = Hive.get().getTableColumnStatistics(dbName, tabName, colStatsToRetrieve, false);
+        }
         stats = convertColStats(colStat, tabName);
       } catch (HiveException e) {
         LOG.error("Failed to retrieve table statistics: ", e);