kasakrisz commented on code in PR #3745:
URL: https://github.com/apache/hive/pull/3745#discussion_r1031171049
##########
iceberg/iceberg-handler/src/test/results/positive/ctas_iceberg_partitioned_orc.q.out:
##########
@@ -0,0 +1,304 @@
+PREHOOK: query: create table source(a int, b string, c int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@source
+POSTHOOK: query: create table source(a int, b string, c int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@source
+PREHOOK: query: insert into source values (1, 'one', 3)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@source
+POSTHOOK: query: insert into source values (1, 'one', 3)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@source
+POSTHOOK: Lineage: source.a SCRIPT []
+POSTHOOK: Lineage: source.b SCRIPT []
+POSTHOOK: Lineage: source.c SCRIPT []
+PREHOOK: query: insert into source values (1, 'two', 4)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@source
+POSTHOOK: query: insert into source values (1, 'two', 4)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@source
+POSTHOOK: Lineage: source.a SCRIPT []
+POSTHOOK: Lineage: source.b SCRIPT []
+POSTHOOK: Lineage: source.c SCRIPT []
+PREHOOK: query: explain extended
+create external table tbl_ice partitioned by spec (bucket(16, a), truncate(3, b)) stored by iceberg stored as orc tblproperties ('format-version'='2') as
+select a, b, c from source
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@source
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tbl_ice
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain extended
+create external table tbl_ice partitioned by spec (bucket(16, a), truncate(3, b)) stored by iceberg stored as orc tblproperties ('format-version'='2') as
+select a, b, c from source
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@source
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tbl_ice
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+OPTIMIZED SQL: SELECT `a`, `b`, `c`
+FROM `default`.`source`
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-3 depends on stages: Stage-0, Stage-2
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: source
+                  Statistics: Num rows: 2 Data size: 190 Basic stats: COMPLETE Column stats: COMPLETE
+ GatherStats: false
+ Select Operator
+ expressions: a (type: int), b (type: string), c (type: int)
+ outputColumnNames: _col0, _col1, _col2
+                    Statistics: Num rows: 2 Data size: 190 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ bucketingVersion: 2
+ compressed: false
+ GlobalTableId: 1
+ directory: hdfs://### HDFS PATH ###
+ NumFilesPerFileSink: 1
+                      Statistics: Num rows: 2 Data size: 190 Basic stats: COMPLETE Column stats: COMPLETE
+ Stats Publishing Key Prefix: hdfs://### HDFS PATH ###
+ table:
+                          input format: org.apache.iceberg.mr.hive.HiveIcebergInputFormat
+ jobProperties:
+ bucketing_version -1
+ columns a,b,c
+ columns.types int:string:int
+ created_with_ctas true
+ format-version 2
+ iceberg.mr.operation.type.default.tbl_ice OTHER
+ iceberg.mr.table.identifier default.tbl_ice
+ iceberg.mr.table.location hdfs://### HDFS PATH ###
+                            iceberg.mr.table.schema {"type":"struct","schema-id":0,"fields":[{"id":1,"name":"a","required":false,"type":"int"},{"id":2,"name":"b","required":false,"type":"string"},{"id":3,"name":"c","required":false,"type":"int"}]}
+                            mapred.output.committer.class org.apache.iceberg.mr.hive.HiveIcebergStorageHandler$HiveIcebergNoJobCommitter
+ name default.tbl_ice
+ serialization.format 1
+                            serialization.lib org.apache.iceberg.mr.hive.HiveIcebergSerDe
+                            storage_handler org.apache.iceberg.mr.hive.HiveIcebergStorageHandler
+ write.format.default orc
+                          output format: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat
+ properties:
+ bucketing_version -1
+ columns a,b,c
+ columns.types int:string:int
+ format-version 2
+ iceberg.mr.operation.type.default.tbl_ice OTHER
+                            iceberg.mr.table.partition.spec {"spec-id":0,"fields":[{"name":"a_bucket","transform":"bucket[16]","source-id":1,"field-id":1000},{"name":"b_trunc","transform":"truncate[3]","source-id":2,"field-id":1001}]}
+                            iceberg.mr.table.schema {"type":"struct","schema-id":0,"fields":[{"id":1,"name":"a","required":false,"type":"int"},{"id":2,"name":"b","required":false,"type":"string"},{"id":3,"name":"c","required":false,"type":"int"}]}
+ name default.tbl_ice
+ serialization.format 1
+                            serialization.lib org.apache.iceberg.mr.hive.HiveIcebergSerDe
+                            storage_handler org.apache.iceberg.mr.hive.HiveIcebergStorageHandler
+ write.format.default orc
+ serde: org.apache.iceberg.mr.hive.HiveIcebergSerDe
+ name: default.tbl_ice
+ TotalFiles: 1
+ GatherStats: true
+ MultiFileSpray: false
+ Select Operator
+                      expressions: _col0 (type: int), _col1 (type: string), _col2 (type: int)
+ outputColumnNames: col1, col2, col3
+                      Statistics: Num rows: 2 Data size: 190 Basic stats: COMPLETE Column stats: COMPLETE
+ Group By Operator
+                        aggregations: min(col1), max(col1), count(1), count(col1), compute_bit_vector_hll(col1), max(length(col2)), avg(COALESCE(length(col2),0)), count(col2), compute_bit_vector_hll(col2), min(col3), max(col3), count(col3), compute_bit_vector_hll(col3)
+ minReductionHashAggr: 0.5
+ mode: hash
+                        outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12
+                        Statistics: Num rows: 1 Data size: 560 Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ bucketingVersion: 2
+ null sort order:
+ numBuckets: -1
+ sort order:
+                          Statistics: Num rows: 1 Data size: 560 Basic stats: COMPLETE Column stats: COMPLETE
+ tag: -1
+                          value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: binary), _col5 (type: int), _col6 (type: struct<count:bigint,sum:double,input:int>), _col7 (type: bigint), _col8 (type: binary), _col9 (type: int), _col10 (type: int), _col11 (type: bigint), _col12 (type: binary)
+ auto parallelism: false
+ Execution mode: vectorized
+ Path -> Alias:
+ hdfs://### HDFS PATH ### [source]
+ Path -> Partition:
+ hdfs://### HDFS PATH ###
+ Partition
+ base file name: source
+ input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ bucket_count -1
+ bucketing_version 2
+ column.name.delimiter ,
+ columns a,b,c
+ columns.types int:string:int
+#### A masked pattern was here ####
+ location hdfs://### HDFS PATH ###
+ name default.source
+ serialization.format 1
+                    serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ bucketing_version 2
+ column.name.delimiter ,
+ columns a,b,c
+ columns.comments
+ columns.types int:string:int
+#### A masked pattern was here ####
+ location hdfs://### HDFS PATH ###
+ name default.source
+ serialization.format 1
+                      serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.source
+ name: default.source
+ Truncated Path -> Alias:
+ /source [source]
+ Reducer 2
+ Execution mode: vectorized
+ Needs Tagging: false
+ Reduce Operator Tree:
+ Group By Operator
+                aggregations: min(VALUE._col0), max(VALUE._col1), count(VALUE._col2), count(VALUE._col3), compute_bit_vector_hll(VALUE._col4), max(VALUE._col5), avg(VALUE._col6), count(VALUE._col7), compute_bit_vector_hll(VALUE._col8), min(VALUE._col9), max(VALUE._col10), count(VALUE._col11), compute_bit_vector_hll(VALUE._col12)
+ mode: mergepartial
+                outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12
+                Statistics: Num rows: 1 Data size: 492 Basic stats: COMPLETE Column stats: COMPLETE
+ Select Operator
+                  expressions: 'LONG' (type: string), UDFToLong(_col0) (type: bigint), UDFToLong(_col1) (type: bigint), (_col2 - _col3) (type: bigint), COALESCE(ndv_compute_bit_vector(_col4),0) (type: bigint), _col4 (type: binary), 'STRING' (type: string), UDFToLong(COALESCE(_col5,0)) (type: bigint), COALESCE(_col6,0) (type: double), (_col2 - _col7) (type: bigint), COALESCE(ndv_compute_bit_vector(_col8),0) (type: bigint), _col8 (type: binary), 'LONG' (type: string), UDFToLong(_col9) (type: bigint), UDFToLong(_col10) (type: bigint), (_col2 - _col11) (type: bigint), COALESCE(ndv_compute_bit_vector(_col12),0) (type: bigint), _col12 (type: binary)
+                  outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, _col16, _col17
+                  Statistics: Num rows: 1 Data size: 794 Basic stats: COMPLETE Column stats: COMPLETE
+ File Output Operator
+ bucketingVersion: 2
+ compressed: false
+ GlobalTableId: 0
+ directory: hdfs://### HDFS PATH ###
+ NumFilesPerFileSink: 1
+                    Statistics: Num rows: 1 Data size: 794 Basic stats: COMPLETE Column stats: COMPLETE
+ Stats Publishing Key Prefix: hdfs://### HDFS PATH ###
+ table:
+                        input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                        output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ bucketing_version -1
+                          columns _col0,_col1,_col2,_col3,_col4,_col5,_col6,_col7,_col8,_col9,_col10,_col11,_col12,_col13,_col14,_col15,_col16,_col17
+                          columns.types string:bigint:bigint:bigint:bigint:binary:string:bigint:double:bigint:bigint:binary:string:bigint:bigint:bigint:bigint:binary
+ escape.delim \
+                          hive.serialization.extend.additional.nesting.levels true
+ serialization.escape.crlf true
+ serialization.format 1
+                          serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                        serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-2
+ Dependency Collection
+
+ Stage: Stage-3
+ Stats Work
+ Basic Stats Work:
+ Stats Aggregation Key Prefix: hdfs://### HDFS PATH ###
+ Column Stats Desc:
+ Columns: a, b, c
+ Column Types: int, string, int
+ Table: default.tbl_ice
+ Is Table Level Stats: true
+
+ Stage: Stage-0
+ Move Operator
+ files:
+ hdfs directory: true
+ source: hdfs://### HDFS PATH ###
+ destination: hdfs://### HDFS PATH ###
+
+PREHOOK: query: create external table tbl_ice partitioned by spec (bucket(16, a), truncate(3, b)) stored by iceberg stored as orc tblproperties ('format-version'='2') as
+select a, b, c from source
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@source
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tbl_ice
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: create external table tbl_ice partitioned by spec (bucket(16, a), truncate(3, b)) stored by iceberg stored as orc tblproperties ('format-version'='2') as
+select a, b, c from source
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@source
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tbl_ice
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+PREHOOK: query: describe formatted tbl_ice
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@tbl_ice
+POSTHOOK: query: describe formatted tbl_ice
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@tbl_ice
+# col_name data_type comment
+a int
+b string
+c int
+
+# Partition Transform Information
+# col_name transform_type
+a BUCKET[16]
+b TRUNCATE[3]
+
+# Detailed Table Information
+Database: default
+#### A masked pattern was here ####
+Retention: 2147483647
+#### A masked pattern was here ####
+Table Type: EXTERNAL_TABLE
+Table Parameters:
+	COLUMN_STATS_ACCURATE	{\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"a\":\"true\",\"b\":\"true\",\"c\":\"true\"}}
+ EXTERNAL TRUE
+ bucketing_version -1
+ engine.hive.enabled true
+ iceberg.orc.files.only true
+ metadata_location hdfs://### HDFS PATH ###
+ numFiles 2
+ numRows 2
+ previous_metadata_location hdfs://### HDFS PATH ###
+ serialization.format 1
+	storage_handler	org.apache.iceberg.mr.hive.HiveIcebergStorageHandler
+ table_type ICEBERG
+ totalSize 812
+#### A masked pattern was here ####
+ uuid #Masked#
+ write.format.default orc
+
Review Comment:
@ayushtkn
Thanks for the notice.
I'll try to fix this in a follow-up patch focusing on the way the CTAS plan is generated and the statement is executed:
https://issues.apache.org/jira/browse/HIVE-26771
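
For context, here is a minimal sketch of the CTAS shape this q file exercises (table names, partition transforms, and properties are taken from the test output above; it assumes a Hive build with the Iceberg storage handler available):

```sql
-- Source table feeding the CTAS, as created in the test above.
create table source (a int, b string, c int);
insert into source values (1, 'one', 3), (1, 'two', 4);

-- Partitioned Iceberg CTAS: bucket(16) on column a, truncate(3) on column b,
-- stored as ORC with Iceberg format-version 2.
create external table tbl_ice
partitioned by spec (bucket(16, a), truncate(3, b))
stored by iceberg
stored as orc
tblproperties ('format-version'='2')
as select a, b, c from source;
```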
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]