[hive] branch master updated: HIVE-26628: Iceberg table is created when running explain ctas command (Krisztian Kasa, reviewed by Denys Kuzmenko)

krisztiankasa Tue, 15 Nov 2022 09:27:57 -0800

This is an automated email from the ASF dual-hosted git repository.

krisztiankasa pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git



The following commit(s) were added to refs/heads/master by this push:
     new 76a214cbd15 HIVE-26628: Iceberg table is created when running explain 
ctas command (Krisztian Kasa, reviewed by Denys Kuzmenko)
76a214cbd15 is described below

commit 76a214cbd15d322d4806f7ffed246275fd847089
Author: Krisztian Kasa <[email protected]>
AuthorDate: Tue Nov 15 18:27:31 2022 +0100

    HIVE-26628: Iceberg table is created when running explain ctas command 
(Krisztian Kasa, reviewed by Denys Kuzmenko)
---
 .../org/apache/hadoop/hive/conf/Constants.java     |   2 +
 .../apache/iceberg/mr/hive/HiveIcebergSerDe.java   |  11 +-
 .../iceberg/mr/hive/HiveIcebergStorageHandler.java |  61 ++++-
 .../positive/ctas_iceberg_partitioned_orc.q        |  19 ++
 .../positive/ctas_iceberg_partitioned_orc.q.out    | 304 +++++++++++++++++++++
 .../hive/ql/ddl/table/create/CreateTableDesc.java  |  11 +-
 .../hadoop/hive/ql/parse/SemanticAnalyzer.java     |  81 +++---
 .../apache/hadoop/hive/ql/plan/PartitionDesc.java  |   2 +-
 .../org/apache/hadoop/hive/ql/plan/PlanUtils.java  |  30 +-
 .../org/apache/hadoop/hive/ql/plan/TableDesc.java  |  14 +-
 10 files changed, 469 insertions(+), 66 deletions(-)

diff --git a/common/src/java/org/apache/hadoop/hive/conf/Constants.java 
b/common/src/java/org/apache/hadoop/hive/conf/Constants.java
index c923c2bcce3..870fefd148b 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/Constants.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/Constants.java
@@ -105,4 +105,6 @@ public class Constants {
 
   public static final String HTTP_HEADER_REQUEST_TRACK = "Request-Track";
   public static final String TIME_POSTFIX_REQUEST_TRACK = "_TIME";
+
+  public static final String EXPLAIN_CTAS_LOCATION = "explainCtasLocation";
 }
diff --git 
a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSerDe.java
 
b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSerDe.java
index f9f01e3b2f0..6a6aa0884f0 100644
--- 
a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSerDe.java
+++ 
b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergSerDe.java
@@ -28,6 +28,7 @@ import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 import javax.annotation.Nullable;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hive.conf.Constants;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.metastore.api.FieldSchema;
 import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
@@ -121,12 +122,14 @@ public class HiveIcebergSerDe extends AbstractSerDe {
         this.partitionColumns = ImmutableList.of();
         // create table for CTAS
         if (e instanceof NoSuchTableException &&
-            
Boolean.parseBoolean(serDeProperties.getProperty(hive_metastoreConstants.TABLE_IS_CTAS)))
 {
+                
Boolean.parseBoolean(serDeProperties.getProperty(hive_metastoreConstants.TABLE_IS_CTAS)))
 {
           if (!Catalogs.hiveCatalog(configuration, serDeProperties)) {
             throw new SerDeException(CTAS_EXCEPTION_MSG);
           }
 
-          createTableForCTAS(configuration, serDeProperties);
+          if (!serDeProperties.containsKey(Constants.EXPLAIN_CTAS_LOCATION)) {
+            createTableForCTAS(configuration, serDeProperties);
+          }
         }
       }
     }
@@ -321,4 +324,8 @@ public class HiveIcebergSerDe extends AbstractSerDe {
   public boolean shouldStoreFieldsInMetastore(Map<String, String> tableParams) 
{
     return true;
   }
+
+  public Schema getTableSchema() {
+    return tableSchema;
+  }
 }
diff --git 
a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
 
b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
index ada8eeb2b9e..f96b5e1bfae 100644
--- 
a/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
+++ 
b/iceberg/iceberg-handler/src/main/java/org/apache/iceberg/mr/hive/HiveIcebergStorageHandler.java
@@ -35,11 +35,13 @@ import java.util.function.BiFunction;
 import java.util.function.Function;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
+import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hive.common.StatsSetupConst;
 import org.apache.hadoop.hive.common.type.Date;
 import org.apache.hadoop.hive.common.type.SnapshotContext;
 import org.apache.hadoop.hive.common.type.Timestamp;
+import org.apache.hadoop.hive.conf.Constants;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.metastore.HiveMetaHook;
 import org.apache.hadoop.hive.metastore.api.EnvironmentContext;
@@ -90,6 +92,7 @@ import org.apache.iceberg.BaseTable;
 import org.apache.iceberg.FileFormat;
 import org.apache.iceberg.ManifestFile;
 import org.apache.iceberg.NullOrder;
+import org.apache.iceberg.PartitionSpec;
 import org.apache.iceberg.PartitionSpecParser;
 import org.apache.iceberg.RowLevelOperationMode;
 import org.apache.iceberg.Schema;
@@ -102,6 +105,7 @@ import org.apache.iceberg.SortField;
 import org.apache.iceberg.SortOrder;
 import org.apache.iceberg.Table;
 import org.apache.iceberg.TableProperties;
+import org.apache.iceberg.exceptions.NoSuchTableException;
 import org.apache.iceberg.hadoop.HadoopConfigurable;
 import org.apache.iceberg.mr.Catalogs;
 import org.apache.iceberg.mr.InputFormatConfig;
@@ -804,31 +808,60 @@ public class HiveIcebergStorageHandler implements 
HiveStoragePredicateHandler, H
   @VisibleForTesting
   static void overlayTableProperties(Configuration configuration, TableDesc 
tableDesc, Map<String, String> map) {
     Properties props = tableDesc.getProperties();
-    Table table = IcebergTableUtil.getTable(configuration, props);
-    String schemaJson = SchemaParser.toJson(table.schema());
 
     Maps.fromProperties(props).entrySet().stream()
         .filter(entry -> !map.containsKey(entry.getKey())) // map overrides 
tableDesc properties
         .forEach(entry -> map.put(entry.getKey(), entry.getValue()));
 
-    map.put(InputFormatConfig.TABLE_IDENTIFIER, 
props.getProperty(Catalogs.NAME));
-    map.put(InputFormatConfig.TABLE_LOCATION, table.location());
-    map.put(InputFormatConfig.TABLE_SCHEMA, schemaJson);
-    props.put(InputFormatConfig.PARTITION_SPEC, 
PartitionSpecParser.toJson(table.spec()));
+    String location;
+    Schema schema;
+    PartitionSpec spec;
+    try {
+      Table table = IcebergTableUtil.getTable(configuration, props);
+      location = table.location();
+      schema = table.schema();
+      spec = table.spec();
+
+      // serialize table object into config
+      Table serializableTable = SerializableTable.copyOf(table);
+      checkAndSkipIoConfigSerialization(configuration, serializableTable);
+      map.put(InputFormatConfig.SERIALIZED_TABLE_PREFIX + 
tableDesc.getTableName(),
+          SerializationUtil.serializeToBase64(serializableTable));
+    } catch (NoSuchTableException ex) {
+      if 
(!(StringUtils.isNotBlank(props.getProperty(hive_metastoreConstants.TABLE_IS_CTAS))
 &&
+              
StringUtils.isNotBlank(props.getProperty(Constants.EXPLAIN_CTAS_LOCATION)))) {
+        throw ex;
+      }
 
-    // serialize table object into config
-    Table serializableTable = SerializableTable.copyOf(table);
-    checkAndSkipIoConfigSerialization(configuration, serializableTable);
-    map.put(InputFormatConfig.SERIALIZED_TABLE_PREFIX + 
tableDesc.getTableName(),
-        SerializationUtil.serializeToBase64(serializableTable));
+      location = map.get(hive_metastoreConstants.META_TABLE_LOCATION);
+      if (StringUtils.isBlank(location)) {
+        location = props.getProperty(Constants.EXPLAIN_CTAS_LOCATION);
+      }
 
-    // We need to remove this otherwise the job.xml will be invalid as column 
comments are separated with '\0' and
-    // the serialization utils fail to serialize this character
-    map.remove("columns.comments");
+      try {
+        AbstractSerDe serDe = tableDesc.getDeserializer(configuration);
+        HiveIcebergSerDe icebergSerDe = (HiveIcebergSerDe) serDe;
+        schema = icebergSerDe.getTableSchema();
+        spec = IcebergTableUtil.spec(configuration, 
icebergSerDe.getTableSchema());
+      } catch (Exception e) {
+        throw new RuntimeException(e);
+      }
+    }
 
+    map.put(InputFormatConfig.TABLE_IDENTIFIER, 
props.getProperty(Catalogs.NAME));
+    if (StringUtils.isNotBlank(location)) {
+      map.put(InputFormatConfig.TABLE_LOCATION, location);
+    }
+    String schemaJson = SchemaParser.toJson(schema);
+    map.put(InputFormatConfig.TABLE_SCHEMA, schemaJson);
     // save schema into table props as well to avoid repeatedly hitting the 
HMS during serde initializations
     // this is an exception to the interface documentation, but it's a safe 
operation to add this property
     props.put(InputFormatConfig.TABLE_SCHEMA, schemaJson);
+    props.put(InputFormatConfig.PARTITION_SPEC, 
PartitionSpecParser.toJson(spec));
+
+    // We need to remove this otherwise the job.xml will be invalid as column 
comments are separated with '\0' and
+    // the serialization utils fail to serialize this character
+    map.remove("columns.comments");
   }
 
   /**
diff --git 
a/iceberg/iceberg-handler/src/test/queries/positive/ctas_iceberg_partitioned_orc.q
 
b/iceberg/iceberg-handler/src/test/queries/positive/ctas_iceberg_partitioned_orc.q
new file mode 100644
index 00000000000..3897c0e4ca0
--- /dev/null
+++ 
b/iceberg/iceberg-handler/src/test/queries/positive/ctas_iceberg_partitioned_orc.q
@@ -0,0 +1,19 @@
+set 
hive.query.lifetime.hooks=org.apache.iceberg.mr.hive.HiveIcebergQueryLifeTimeHook;
+--! qt:replace:/(\s+uuid\s+)\S+(\s*)/$1#Masked#$2/
+set hive.explain.user=false;
+
+create table source(a int, b string, c int);
+
+insert into source values (1, 'one', 3);
+insert into source values (1, 'two', 4);
+
+explain extended
+create external table tbl_ice partitioned by spec (bucket(16, a), truncate(3, 
b)) stored by iceberg stored as orc tblproperties ('format-version'='2') as
+select a, b, c from source;
+
+create external table tbl_ice partitioned by spec (bucket(16, a), truncate(3, 
b)) stored by iceberg stored as orc tblproperties ('format-version'='2') as
+select a, b, c from source;
+
+describe formatted tbl_ice;
+
+select * from tbl_ice;
diff --git 
a/iceberg/iceberg-handler/src/test/results/positive/ctas_iceberg_partitioned_orc.q.out
 
b/iceberg/iceberg-handler/src/test/results/positive/ctas_iceberg_partitioned_orc.q.out
new file mode 100644
index 00000000000..863d41cc5ae
--- /dev/null
+++ 
b/iceberg/iceberg-handler/src/test/results/positive/ctas_iceberg_partitioned_orc.q.out
@@ -0,0 +1,304 @@
+PREHOOK: query: create table source(a int, b string, c int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@source
+POSTHOOK: query: create table source(a int, b string, c int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@source
+PREHOOK: query: insert into source values (1, 'one', 3)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@source
+POSTHOOK: query: insert into source values (1, 'one', 3)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@source
+POSTHOOK: Lineage: source.a SCRIPT []
+POSTHOOK: Lineage: source.b SCRIPT []
+POSTHOOK: Lineage: source.c SCRIPT []
+PREHOOK: query: insert into source values (1, 'two', 4)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@source
+POSTHOOK: query: insert into source values (1, 'two', 4)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@source
+POSTHOOK: Lineage: source.a SCRIPT []
+POSTHOOK: Lineage: source.b SCRIPT []
+POSTHOOK: Lineage: source.c SCRIPT []
+PREHOOK: query: explain extended
+create external table tbl_ice partitioned by spec (bucket(16, a), truncate(3, 
b)) stored by iceberg stored as orc tblproperties ('format-version'='2') as
+select a, b, c from source
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@source
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tbl_ice
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain extended
+create external table tbl_ice partitioned by spec (bucket(16, a), truncate(3, 
b)) stored by iceberg stored as orc tblproperties ('format-version'='2') as
+select a, b, c from source
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@source
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tbl_ice
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+OPTIMIZED SQL: SELECT `a`, `b`, `c`
+FROM `default`.`source`
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-2 depends on stages: Stage-1
+  Stage-3 depends on stages: Stage-0, Stage-2
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: source
+                  Statistics: Num rows: 2 Data size: 190 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  GatherStats: false
+                  Select Operator
+                    expressions: a (type: int), b (type: string), c (type: int)
+                    outputColumnNames: _col0, _col1, _col2
+                    Statistics: Num rows: 2 Data size: 190 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    File Output Operator
+                      bucketingVersion: 2
+                      compressed: false
+                      GlobalTableId: 1
+                      directory: hdfs://### HDFS PATH ###
+                      NumFilesPerFileSink: 1
+                      Statistics: Num rows: 2 Data size: 190 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      Stats Publishing Key Prefix: hdfs://### HDFS PATH ###
+                      table:
+                          input format: 
org.apache.iceberg.mr.hive.HiveIcebergInputFormat
+                          jobProperties:
+                            bucketing_version -1
+                            columns a,b,c
+                            columns.types int:string:int
+                            created_with_ctas true
+                            format-version 2
+                            iceberg.mr.operation.type.default.tbl_ice OTHER
+                            iceberg.mr.table.identifier default.tbl_ice
+                            iceberg.mr.table.location hdfs://### HDFS PATH ###
+                            iceberg.mr.table.schema 
{"type":"struct","schema-id":0,"fields":[{"id":1,"name":"a","required":false,"type":"int"},{"id":2,"name":"b","required":false,"type":"string"},{"id":3,"name":"c","required":false,"type":"int"}]}
+                            mapred.output.committer.class 
org.apache.iceberg.mr.hive.HiveIcebergStorageHandler$HiveIcebergNoJobCommitter
+                            name default.tbl_ice
+                            serialization.format 1
+                            serialization.lib 
org.apache.iceberg.mr.hive.HiveIcebergSerDe
+                            storage_handler 
org.apache.iceberg.mr.hive.HiveIcebergStorageHandler
+                            write.format.default orc
+                          output format: 
org.apache.iceberg.mr.hive.HiveIcebergOutputFormat
+                          properties:
+                            bucketing_version -1
+                            columns a,b,c
+                            columns.types int:string:int
+                            format-version 2
+                            iceberg.mr.operation.type.default.tbl_ice OTHER
+                            iceberg.mr.table.partition.spec 
{"spec-id":0,"fields":[{"name":"a_bucket","transform":"bucket[16]","source-id":1,"field-id":1000},{"name":"b_trunc","transform":"truncate[3]","source-id":2,"field-id":1001}]}
+                            iceberg.mr.table.schema 
{"type":"struct","schema-id":0,"fields":[{"id":1,"name":"a","required":false,"type":"int"},{"id":2,"name":"b","required":false,"type":"string"},{"id":3,"name":"c","required":false,"type":"int"}]}
+                            name default.tbl_ice
+                            serialization.format 1
+                            serialization.lib 
org.apache.iceberg.mr.hive.HiveIcebergSerDe
+                            storage_handler 
org.apache.iceberg.mr.hive.HiveIcebergStorageHandler
+                            write.format.default orc
+                          serde: org.apache.iceberg.mr.hive.HiveIcebergSerDe
+                          name: default.tbl_ice
+                      TotalFiles: 1
+                      GatherStats: true
+                      MultiFileSpray: false
+                    Select Operator
+                      expressions: _col0 (type: int), _col1 (type: string), 
_col2 (type: int)
+                      outputColumnNames: col1, col2, col3
+                      Statistics: Num rows: 2 Data size: 190 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      Group By Operator
+                        aggregations: min(col1), max(col1), count(1), 
count(col1), compute_bit_vector_hll(col1), max(length(col2)), 
avg(COALESCE(length(col2),0)), count(col2), compute_bit_vector_hll(col2), 
min(col3), max(col3), count(col3), compute_bit_vector_hll(col3)
+                        minReductionHashAggr: 0.5
+                        mode: hash
+                        outputColumnNames: _col0, _col1, _col2, _col3, _col4, 
_col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12
+                        Statistics: Num rows: 1 Data size: 560 Basic stats: 
COMPLETE Column stats: COMPLETE
+                        Reduce Output Operator
+                          bucketingVersion: 2
+                          null sort order: 
+                          numBuckets: -1
+                          sort order: 
+                          Statistics: Num rows: 1 Data size: 560 Basic stats: 
COMPLETE Column stats: COMPLETE
+                          tag: -1
+                          value expressions: _col0 (type: int), _col1 (type: 
int), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: binary), _col5 
(type: int), _col6 (type: struct<count:bigint,sum:double,input:int>), _col7 
(type: bigint), _col8 (type: binary), _col9 (type: int), _col10 (type: int), 
_col11 (type: bigint), _col12 (type: binary)
+                          auto parallelism: false
+            Execution mode: vectorized
+            Path -> Alias:
+              hdfs://### HDFS PATH ### [source]
+            Path -> Partition:
+              hdfs://### HDFS PATH ### 
+                Partition
+                  base file name: source
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  properties:
+                    bucket_count -1
+                    bucketing_version 2
+                    column.name.delimiter ,
+                    columns a,b,c
+                    columns.types int:string:int
+#### A masked pattern was here ####
+                    location hdfs://### HDFS PATH ###
+                    name default.source
+                    serialization.format 1
+                    serialization.lib 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                
+                    input format: org.apache.hadoop.mapred.TextInputFormat
+                    output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                    properties:
+                      bucketing_version 2
+                      column.name.delimiter ,
+                      columns a,b,c
+                      columns.comments 
+                      columns.types int:string:int
+#### A masked pattern was here ####
+                      location hdfs://### HDFS PATH ###
+                      name default.source
+                      serialization.format 1
+                      serialization.lib 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    name: default.source
+                  name: default.source
+            Truncated Path -> Alias:
+              /source [source]
+        Reducer 2 
+            Execution mode: vectorized
+            Needs Tagging: false
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: min(VALUE._col0), max(VALUE._col1), 
count(VALUE._col2), count(VALUE._col3), compute_bit_vector_hll(VALUE._col4), 
max(VALUE._col5), avg(VALUE._col6), count(VALUE._col7), 
compute_bit_vector_hll(VALUE._col8), min(VALUE._col9), max(VALUE._col10), 
count(VALUE._col11), compute_bit_vector_hll(VALUE._col12)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, 
_col6, _col7, _col8, _col9, _col10, _col11, _col12
+                Statistics: Num rows: 1 Data size: 492 Basic stats: COMPLETE 
Column stats: COMPLETE
+                Select Operator
+                  expressions: 'LONG' (type: string), UDFToLong(_col0) (type: 
bigint), UDFToLong(_col1) (type: bigint), (_col2 - _col3) (type: bigint), 
COALESCE(ndv_compute_bit_vector(_col4),0) (type: bigint), _col4 (type: binary), 
'STRING' (type: string), UDFToLong(COALESCE(_col5,0)) (type: bigint), 
COALESCE(_col6,0) (type: double), (_col2 - _col7) (type: bigint), 
COALESCE(ndv_compute_bit_vector(_col8),0) (type: bigint), _col8 (type: binary), 
'LONG' (type: string), UDFToLong(_col9) (typ [...]
+                  outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, 
_col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, 
_col16, _col17
+                  Statistics: Num rows: 1 Data size: 794 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  File Output Operator
+                    bucketingVersion: 2
+                    compressed: false
+                    GlobalTableId: 0
+                    directory: hdfs://### HDFS PATH ###
+                    NumFilesPerFileSink: 1
+                    Statistics: Num rows: 1 Data size: 794 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Stats Publishing Key Prefix: hdfs://### HDFS PATH ###
+                    table:
+                        input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                        output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                        properties:
+                          bucketing_version -1
+                          columns 
_col0,_col1,_col2,_col3,_col4,_col5,_col6,_col7,_col8,_col9,_col10,_col11,_col12,_col13,_col14,_col15,_col16,_col17
+                          columns.types 
string:bigint:bigint:bigint:bigint:binary:string:bigint:double:bigint:bigint:binary:string:bigint:bigint:bigint:bigint:binary
+                          escape.delim \
+                          hive.serialization.extend.additional.nesting.levels 
true
+                          serialization.escape.crlf true
+                          serialization.format 1
+                          serialization.lib 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                        serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                    TotalFiles: 1
+                    GatherStats: false
+                    MultiFileSpray: false
+
+  Stage: Stage-2
+    Dependency Collection
+
+  Stage: Stage-3
+    Stats Work
+      Basic Stats Work:
+          Stats Aggregation Key Prefix: hdfs://### HDFS PATH ###
+      Column Stats Desc:
+          Columns: a, b, c
+          Column Types: int, string, int
+          Table: default.tbl_ice
+          Is Table Level Stats: true
+
+  Stage: Stage-0
+    Move Operator
+      files:
+          hdfs directory: true
+          source: hdfs://### HDFS PATH ###
+          destination: hdfs://### HDFS PATH ###
+
+PREHOOK: query: create external table tbl_ice partitioned by spec (bucket(16, 
a), truncate(3, b)) stored by iceberg stored as orc tblproperties 
('format-version'='2') as
+select a, b, c from source
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@source
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tbl_ice
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: create external table tbl_ice partitioned by spec (bucket(16, 
a), truncate(3, b)) stored by iceberg stored as orc tblproperties 
('format-version'='2') as
+select a, b, c from source
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@source
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tbl_ice
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+PREHOOK: query: describe formatted tbl_ice
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@tbl_ice
+POSTHOOK: query: describe formatted tbl_ice
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@tbl_ice
+# col_name             data_type               comment             
+a                      int                                         
+b                      string                                      
+c                      int                                         
+                
+# Partition Transform Information               
+# col_name             transform_type           
+a                      BUCKET[16]               
+b                      TRUNCATE[3]              
+                
+# Detailed Table Information            
+Database:              default                  
+#### A masked pattern was here ####
+Retention:             2147483647               
+#### A masked pattern was here ####
+Table Type:            EXTERNAL_TABLE           
+Table Parameters:               
+       COLUMN_STATS_ACCURATE   
{\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"a\":\"true\",\"b\":\"true\",\"c\":\"true\"}}
+       EXTERNAL                TRUE                
+       bucketing_version       -1                  
+       engine.hive.enabled     true                
+       iceberg.orc.files.only  true                
+       metadata_location       hdfs://### HDFS PATH ###
+       numFiles                2                   
+       numRows                 2                   
+       previous_metadata_location      hdfs://### HDFS PATH ###
+       serialization.format    1                   
+       storage_handler         
org.apache.iceberg.mr.hive.HiveIcebergStorageHandler
+       table_type              ICEBERG             
+       totalSize               812                 
+#### A masked pattern was here ####
+       uuid                    #Masked#
+       write.format.default    orc                 
+                
+# Storage Information           
+SerDe Library:         org.apache.iceberg.mr.hive.HiveIcebergSerDe      
+InputFormat:           org.apache.iceberg.mr.hive.HiveIcebergInputFormat       
 
+OutputFormat:          org.apache.iceberg.mr.hive.HiveIcebergOutputFormat      
 
+Compressed:            No                       
+Sort Columns:          []                       
+PREHOOK: query: select * from tbl_ice
+PREHOOK: type: QUERY
+PREHOOK: Input: default@tbl_ice
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: select * from tbl_ice
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@tbl_ice
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+1      one     3
+1      two     4
diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/ddl/table/create/CreateTableDesc.java 
b/ql/src/java/org/apache/hadoop/hive/ql/ddl/table/create/CreateTableDesc.java
index eeac3a6d339..75179f38cac 100644
--- 
a/ql/src/java/org/apache/hadoop/hive/ql/ddl/table/create/CreateTableDesc.java
+++ 
b/ql/src/java/org/apache/hadoop/hive/ql/ddl/table/create/CreateTableDesc.java
@@ -21,7 +21,6 @@ package org.apache.hadoop.hive.ql.ddl.table.create;
 
 import java.io.Serializable;
 import java.util.ArrayList;
-import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
@@ -73,6 +72,8 @@ import org.apache.hadoop.mapred.OutputFormat;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import static org.apache.hadoop.hive.conf.Constants.EXPLAIN_CTAS_LOCATION;
+
 /**
  * DDL task description for CREATE TABLE commands.
  */
@@ -518,10 +519,10 @@ public class CreateTableDesc implements DDLDesc, 
Serializable {
 
   @Explain(displayName = "table properties")
   public Map<String, String> getTblPropsExplain() { // only for displaying plan
-    HashMap<String, String> copy = new HashMap<>(tblProps);
-    copy.remove(hive_metastoreConstants.TABLE_IS_CTAS);
-    copy.remove(hive_metastoreConstants.TABLE_BUCKETING_VERSION);
-    return copy;
+    return PlanUtils.getPropertiesForExplain(tblProps,
+            EXPLAIN_CTAS_LOCATION,
+            hive_metastoreConstants.TABLE_IS_CTAS,
+            hive_metastoreConstants.TABLE_BUCKETING_VERSION);
   }
 
   /**
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java 
b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
index 4373643aea4..a298e5355b2 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/SemanticAnalyzer.java
@@ -21,10 +21,12 @@ package org.apache.hadoop.hive.ql.parse;
 import static java.util.Objects.nonNull;
 import static org.apache.commons.lang3.StringUtils.isNotBlank;
 import static org.apache.hadoop.hive.common.AcidConstants.SOFT_DELETE_TABLE;
+import static org.apache.hadoop.hive.conf.Constants.EXPLAIN_CTAS_LOCATION;
 import static 
org.apache.hadoop.hive.conf.HiveConf.ConfVars.DYNAMICPARTITIONCONVERT;
 import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVEARCHIVEENABLED;
 import static 
org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVE_DEFAULT_STORAGE_HANDLER;
 import static org.apache.hadoop.hive.conf.HiveConf.ConfVars.HIVESTATSDBCLASS;
+import static 
org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_LOCATION;
 import static 
org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_STORAGE;
 import static 
org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.TABLE_IS_CTAS;
 import static 
org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.DEFAULT_TABLE_TYPE;
@@ -7731,36 +7733,57 @@ public class SemanticAnalyzer extends 
BaseSemanticAnalyzer {
         isDestTempFile = false;
       }
 
-      if (tblDesc == null) {
-        if (viewDesc != null) {
-          tableDescriptor = PlanUtils.getTableDesc(viewDesc, cols, colTypes);
-        } else if (qb.getIsQuery()) {
-          Class<? extends Deserializer> serdeClass = LazySimpleSerDe.class;
-          String fileFormat = conf.getResultFileFormat().toString();
-          if (SessionState.get().getIsUsingThriftJDBCBinarySerDe()) {
-            serdeClass = ThriftJDBCBinarySerDe.class;
-            fileFormat = ResultFileFormat.SEQUENCEFILE.toString();
-            // Set the fetch formatter to be a no-op for the ListSinkOperator, 
since we'll
-            // write out formatted thrift objects to SequenceFile
-            conf.set(SerDeUtils.LIST_SINK_OUTPUT_FORMATTER, 
NoOpFetchFormatter.class.getName());
-          } else if (fileFormat.equals(PlanUtils.LLAP_OUTPUT_FORMAT_KEY)) {
-            // If this output format is Llap, check to see if Arrow is 
requested
-            boolean useArrow = HiveConf.getBoolVar(conf, 
HiveConf.ConfVars.LLAP_OUTPUT_FORMAT_ARROW);
-            serdeClass = useArrow ? ArrowColumnarBatchSerDe.class : 
LazyBinarySerDe2.class;
-          }
-          tableDescriptor = PlanUtils.getDefaultQueryOutputTableDesc(cols, 
colTypes, fileFormat,
+      try {
+        if (tblDesc == null) {
+          if (viewDesc != null) {
+            destinationTable = viewDesc.toTable(conf);
+            tableDescriptor = PlanUtils.getTableDesc(viewDesc, cols, colTypes);
+          } else if (qb.getIsQuery()) {
+            Class<? extends Deserializer> serdeClass = LazySimpleSerDe.class;
+            String fileFormat = conf.getResultFileFormat().toString();
+            if (SessionState.get().getIsUsingThriftJDBCBinarySerDe()) {
+              serdeClass = ThriftJDBCBinarySerDe.class;
+              fileFormat = ResultFileFormat.SEQUENCEFILE.toString();
+              // Set the fetch formatter to be a no-op for the 
ListSinkOperator, since we'll
+              // write out formatted thrift objects to SequenceFile
+              conf.set(SerDeUtils.LIST_SINK_OUTPUT_FORMATTER, 
NoOpFetchFormatter.class.getName());
+            } else if (fileFormat.equals(PlanUtils.LLAP_OUTPUT_FORMAT_KEY)) {
+              // If this output format is Llap, check to see if Arrow is 
requested
+              boolean useArrow = HiveConf.getBoolVar(conf, 
HiveConf.ConfVars.LLAP_OUTPUT_FORMAT_ARROW);
+              serdeClass = useArrow ? ArrowColumnarBatchSerDe.class : 
LazyBinarySerDe2.class;
+            }
+            tableDescriptor = PlanUtils.getDefaultQueryOutputTableDesc(cols, 
colTypes, fileFormat,
                 serdeClass);
+            destinationTable = null;
+          } else {
+            tableDescriptor = 
PlanUtils.getDefaultTableDesc(qb.getDirectoryDesc(), cols, colTypes);
+          }
         } else {
-          tableDescriptor = 
PlanUtils.getDefaultTableDesc(qb.getDirectoryDesc(), cols, colTypes);
+          destinationTable = 
db.getTranslateTableDryrun(tblDesc.toTable(conf).getTTable());
+          if (ctx.isExplainPlan() &&
+                  tblDesc.getTblProps().containsKey(TABLE_IS_CTAS) &&
+                  !tblDesc.getTblProps().containsKey(META_TABLE_LOCATION)) {
+            if (destinationTable.getDataLocation() == null) {
+              // no metastore.metadata.transformer.class was set
+              tblDesc.getTblProps().put(EXPLAIN_CTAS_LOCATION, new 
Warehouse(conf).getDefaultTablePath(
+                      destinationTable.getDbName(),
+                      destinationTable.getTableName(),
+                      
Boolean.parseBoolean(destinationTable.getParameters().get("EXTERNAL"))).toString());
+            } else {
+              tblDesc.getTblProps().put(EXPLAIN_CTAS_LOCATION, 
destinationTable.getDataLocation().toString());
+            }
+          }
+          tableDescriptor = PlanUtils.getTableDesc(tblDesc, cols, colTypes);
         }
-      } else {
-        tableDescriptor = PlanUtils.getTableDesc(tblDesc, cols, colTypes);
+      } catch (HiveException | MetaException e) {
+        throw new SemanticException(e);
       }
 
+
       // if available, set location in table desc properties
       if (tblDesc != null && tblDesc.getLocation() != null && tableDescriptor 
!= null &&
-          
!tableDescriptor.getProperties().containsKey(hive_metastoreConstants.META_TABLE_LOCATION))
 {
-        
tableDescriptor.getProperties().setProperty(hive_metastoreConstants.META_TABLE_LOCATION,
 tblDesc.getLocation());
+          !tableDescriptor.getProperties().containsKey(META_TABLE_LOCATION)) {
+        tableDescriptor.getProperties().setProperty(META_TABLE_LOCATION, 
tblDesc.getLocation());
       }
 
       // We need a specific rowObjectInspector in this case
@@ -7773,13 +7796,6 @@ public class SemanticAnalyzer extends 
BaseSemanticAnalyzer {
 
       boolean isDfsDir = (destType == QBMetaData.DEST_DFS_FILE);
 
-      try {
-        destinationTable = tblDesc != null ? 
db.getTranslateTableDryrun(tblDesc.toTable(conf).getTTable()) :
-                viewDesc != null ? viewDesc.toTable(conf) : null;
-      } catch (HiveException e) {
-        throw new SemanticException(e);
-      }
-
       destTableIsFullAcid = AcidUtils.isFullAcidTable(destinationTable);
 
       // Data organization (DISTRIBUTED, SORTED, CLUSTERED) for materialized 
view
@@ -13884,7 +13900,7 @@ public class SemanticAnalyzer extends 
BaseSemanticAnalyzer {
           throw new SemanticException(e);
         }
       }
-      if (!SessionStateUtil.addResource(conf, 
hive_metastoreConstants.META_TABLE_LOCATION, tblLocation)) {
+      if (!SessionStateUtil.addResource(conf, META_TABLE_LOCATION, 
tblLocation)) {
         throw new SemanticException(
             "Query state attached to Session state must be not null. Table 
location cannot be saved.");
       }
@@ -14009,6 +14025,9 @@ public class SemanticAnalyzer extends 
BaseSemanticAnalyzer {
           isTransactional, isManaged, new String[]{qualifiedTabName.getDb(), 
qualifiedTabName.getTable()}, isDefaultTableTypeChanged);
       isExt = isExternalTableChanged(tblProps, isTransactional, isExt, 
isDefaultTableTypeChanged);
       tblProps.put(TABLE_IS_CTAS, "true");
+      if (ctx.isExplainPlan()) {
+        tblProps.put(EXPLAIN_CTAS_LOCATION, "");
+      }
       addDbAndTabToOutputs(new String[] {qualifiedTabName.getDb(), 
qualifiedTabName.getTable()},
           TableType.MANAGED_TABLE, isTemporary, tblProps, storageFormat);
       tableDesc = new CreateTableDesc(qualifiedTabName, isExt, isTemporary, 
cols,
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java 
b/ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java
index 875a093abb0..5358e100660 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/PartitionDesc.java
@@ -213,7 +213,7 @@ public class PartitionDesc implements Serializable, 
Cloneable {
 
   @Explain(displayName = "properties", explainLevels = { Level.EXTENDED })
   public Map getPropertiesExplain() {
-    return PlanUtils.getPropertiesExplain(getProperties());
+    return PlanUtils.getPropertiesForExplain(getProperties());
   }
 
   public void setProperties(final Properties properties) {
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java 
b/ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java
index 36f75afcb77..80bab489620 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/PlanUtils.java
@@ -18,6 +18,7 @@
 
 package org.apache.hadoop.hive.ql.plan;
 
+import static org.apache.hadoop.hive.conf.Constants.EXPLAIN_CTAS_LOCATION;
 import static 
org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.TABLE_IS_CTAS;
 import static org.apache.hive.common.util.HiveStringUtils.quoteComments;
 
@@ -26,14 +27,12 @@ import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.Comparator;
-import java.util.EnumSet;
 import java.util.HashMap;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Properties;
 import java.util.Set;
-import java.util.stream.Collectors;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hive.common.JavaUtils;
@@ -1223,11 +1222,13 @@ public final class PlanUtils {
     return LazySimpleSerDe.class;
   }
 
+  private static final String[] FILTER_OUT_FROM_EXPLAIN = {TABLE_IS_CTAS, 
EXPLAIN_CTAS_LOCATION};
+
   /**
    * Get a Map of table or partition properties to be used in explain extended 
output.
    * Do some filtering to make output readable and/or concise.
    */
-  static Map getPropertiesExplain(Properties properties) {
+  static Map<Object, Object> getPropertiesForExplain(Properties properties) {
     if (properties != null) {
       Map<Object, Object> clone = null;
       String value = properties.getProperty("columns.comments");
@@ -1244,11 +1245,13 @@ public final class PlanUtils {
         }
         clone.remove(StatsSetupConst.NUM_ERASURE_CODED_FILES);
       }
-      if (properties.containsKey(TABLE_IS_CTAS)) {
-        if (clone == null) {
-          clone = new HashMap<>(properties);
+      for (String key : FILTER_OUT_FROM_EXPLAIN) {
+        if (properties.containsKey(key)) {
+          if (clone == null) {
+            clone = new HashMap<>(properties);
+          }
+          clone.remove(key);
         }
-        clone.remove(TABLE_IS_CTAS);
       }
       if (clone != null) {
         return clone;
@@ -1256,4 +1259,17 @@ public final class PlanUtils {
     }
     return properties;
   }
+
+  public static Map<String, String> getPropertiesForExplain(Map<String, 
String> properties, String... propertiesToRemove) {
+    if (properties == null) {
+      return Collections.emptyMap();
+    }
+
+    Map<String, String> clone = new HashMap<>(properties);
+    for (String key : propertiesToRemove) {
+      clone.remove(key);
+    }
+
+    return clone;
+  }
 }
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/TableDesc.java 
b/ql/src/java/org/apache/hadoop/hive/ql/plan/TableDesc.java
index 4ead854c1a9..12366eff289 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/TableDesc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/TableDesc.java
@@ -26,14 +26,10 @@ import org.apache.hadoop.hive.ql.io.HiveFileFormatUtils;
 import org.apache.hadoop.hive.ql.plan.Explain.Level;
 import org.apache.hadoop.hive.serde.serdeConstants;
 import org.apache.hadoop.hive.serde2.AbstractSerDe;
-import org.apache.hadoop.hive.serde2.Deserializer;
 import org.apache.hadoop.hive.serde2.SerDeException;
-import org.apache.hadoop.hive.serde2.SerDeUtils;
 import org.apache.hadoop.mapred.InputFormat;
 import org.apache.hadoop.mapred.OutputFormat;
 import org.apache.hive.common.util.ReflectionUtil;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 import java.io.Serializable;
 import java.util.Enumeration;
@@ -41,6 +37,8 @@ import java.util.LinkedHashMap;
 import java.util.Map;
 import java.util.Properties;
 
+import static org.apache.hadoop.hive.conf.Constants.EXPLAIN_CTAS_LOCATION;
+
 /**
  * TableDesc.
  *
@@ -129,7 +127,7 @@ public class TableDesc implements Serializable, Cloneable {
 
   @Explain(displayName = "properties", explainLevels = { Level.EXTENDED })
   public Map getPropertiesExplain() {
-    return PlanUtils.getPropertiesExplain(getProperties());
+    return PlanUtils.getPropertiesForExplain(getProperties());
   }
 
   public void setProperties(final Properties properties) {
@@ -141,11 +139,15 @@ public class TableDesc implements Serializable, Cloneable 
{
     this.jobProperties = jobProperties;
   }
 
-  @Explain(displayName = "jobProperties", explainLevels = { Level.EXTENDED })
   public Map<String, String> getJobProperties() {
     return jobProperties;
   }
 
+  @Explain(displayName = "jobProperties", explainLevels = { Level.EXTENDED })
+  public Map<String, String> getJobPropertiesExplain() {
+    return PlanUtils.getPropertiesForExplain(jobProperties, 
EXPLAIN_CTAS_LOCATION);
+  }
+
   public void setJobSecrets(Map<String, String> jobSecrets) {
     this.jobSecrets = jobSecrets;
   }

[hive] branch master updated: HIVE-26628: Iceberg table is created when running explain ctas command (Krisztian Kasa, reviewed by Denys Kuzmenko)

Reply via email to