[
https://issues.apache.org/jira/browse/HIVE-26628?focusedWorklogId=828606&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-828606
]
ASF GitHub Bot logged work on HIVE-26628:
-----------------------------------------
Author: ASF GitHub Bot
Created on: 24/Nov/22 08:16
Start Date: 24/Nov/22 08:16
Worklog Time Spent: 10m
Work Description: kasakrisz commented on code in PR #3745:
URL: https://github.com/apache/hive/pull/3745#discussion_r1031171049
##########
iceberg/iceberg-handler/src/test/results/positive/ctas_iceberg_partitioned_orc.q.out:
##########
@@ -0,0 +1,304 @@
+PREHOOK: query: create table source(a int, b string, c int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@source
+POSTHOOK: query: create table source(a int, b string, c int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@source
+PREHOOK: query: insert into source values (1, 'one', 3)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@source
+POSTHOOK: query: insert into source values (1, 'one', 3)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@source
+POSTHOOK: Lineage: source.a SCRIPT []
+POSTHOOK: Lineage: source.b SCRIPT []
+POSTHOOK: Lineage: source.c SCRIPT []
+PREHOOK: query: insert into source values (1, 'two', 4)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@source
+POSTHOOK: query: insert into source values (1, 'two', 4)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@source
+POSTHOOK: Lineage: source.a SCRIPT []
+POSTHOOK: Lineage: source.b SCRIPT []
+POSTHOOK: Lineage: source.c SCRIPT []
+PREHOOK: query: explain extended
+create external table tbl_ice partitioned by spec (bucket(16, a), truncate(3, b)) stored by iceberg stored as orc tblproperties ('format-version'='2') as
+select a, b, c from source
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@source
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tbl_ice
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: explain extended
+create external table tbl_ice partitioned by spec (bucket(16, a), truncate(3, b)) stored by iceberg stored as orc tblproperties ('format-version'='2') as
+select a, b, c from source
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@source
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tbl_ice
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+OPTIMIZED SQL: SELECT `a`, `b`, `c`
+FROM `default`.`source`
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-2 depends on stages: Stage-1
+ Stage-3 depends on stages: Stage-0, Stage-2
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: source
+ Statistics: Num rows: 2 Data size: 190 Basic stats: COMPLETE
Column stats: COMPLETE
+ GatherStats: false
+ Select Operator
+ expressions: a (type: int), b (type: string), c (type: int)
+ outputColumnNames: _col0, _col1, _col2
+ Statistics: Num rows: 2 Data size: 190 Basic stats:
COMPLETE Column stats: COMPLETE
+ File Output Operator
+ bucketingVersion: 2
+ compressed: false
+ GlobalTableId: 1
+ directory: hdfs://### HDFS PATH ###
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 2 Data size: 190 Basic stats:
COMPLETE Column stats: COMPLETE
+ Stats Publishing Key Prefix: hdfs://### HDFS PATH ###
+ table:
+ input format:
org.apache.iceberg.mr.hive.HiveIcebergInputFormat
+ jobProperties:
+ bucketing_version -1
+ columns a,b,c
+ columns.types int:string:int
+ created_with_ctas true
+ format-version 2
+ iceberg.mr.operation.type.default.tbl_ice OTHER
+ iceberg.mr.table.identifier default.tbl_ice
+ iceberg.mr.table.location hdfs://### HDFS PATH ###
+ iceberg.mr.table.schema
{"type":"struct","schema-id":0,"fields":[{"id":1,"name":"a","required":false,"type":"int"},{"id":2,"name":"b","required":false,"type":"string"},{"id":3,"name":"c","required":false,"type":"int"}]}
+ mapred.output.committer.class
org.apache.iceberg.mr.hive.HiveIcebergStorageHandler$HiveIcebergNoJobCommitter
+ name default.tbl_ice
+ serialization.format 1
+ serialization.lib
org.apache.iceberg.mr.hive.HiveIcebergSerDe
+ storage_handler
org.apache.iceberg.mr.hive.HiveIcebergStorageHandler
+ write.format.default orc
+ output format:
org.apache.iceberg.mr.hive.HiveIcebergOutputFormat
+ properties:
+ bucketing_version -1
+ columns a,b,c
+ columns.types int:string:int
+ format-version 2
+ iceberg.mr.operation.type.default.tbl_ice OTHER
+ iceberg.mr.table.partition.spec
{"spec-id":0,"fields":[{"name":"a_bucket","transform":"bucket[16]","source-id":1,"field-id":1000},{"name":"b_trunc","transform":"truncate[3]","source-id":2,"field-id":1001}]}
+ iceberg.mr.table.schema
{"type":"struct","schema-id":0,"fields":[{"id":1,"name":"a","required":false,"type":"int"},{"id":2,"name":"b","required":false,"type":"string"},{"id":3,"name":"c","required":false,"type":"int"}]}
+ name default.tbl_ice
+ serialization.format 1
+ serialization.lib
org.apache.iceberg.mr.hive.HiveIcebergSerDe
+ storage_handler
org.apache.iceberg.mr.hive.HiveIcebergStorageHandler
+ write.format.default orc
+ serde: org.apache.iceberg.mr.hive.HiveIcebergSerDe
+ name: default.tbl_ice
+ TotalFiles: 1
+ GatherStats: true
+ MultiFileSpray: false
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: string),
_col2 (type: int)
+ outputColumnNames: col1, col2, col3
+ Statistics: Num rows: 2 Data size: 190 Basic stats:
COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: min(col1), max(col1), count(1),
count(col1), compute_bit_vector_hll(col1), max(length(col2)),
avg(COALESCE(length(col2),0)), count(col2), compute_bit_vector_hll(col2),
min(col3), max(col3), count(col3), compute_bit_vector_hll(col3)
+ minReductionHashAggr: 0.5
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4,
_col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12
+ Statistics: Num rows: 1 Data size: 560 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ bucketingVersion: 2
+ null sort order:
+ numBuckets: -1
+ sort order:
+ Statistics: Num rows: 1 Data size: 560 Basic stats:
COMPLETE Column stats: COMPLETE
+ tag: -1
+ value expressions: _col0 (type: int), _col1 (type:
int), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: binary), _col5
(type: int), _col6 (type: struct<count:bigint,sum:double,input:int>), _col7
(type: bigint), _col8 (type: binary), _col9 (type: int), _col10 (type: int),
_col11 (type: bigint), _col12 (type: binary)
+ auto parallelism: false
+ Execution mode: vectorized
+ Path -> Alias:
+ hdfs://### HDFS PATH ### [source]
+ Path -> Partition:
+ hdfs://### HDFS PATH ###
+ Partition
+ base file name: source
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ bucket_count -1
+ bucketing_version 2
+ column.name.delimiter ,
+ columns a,b,c
+ columns.types int:string:int
+#### A masked pattern was here ####
+ location hdfs://### HDFS PATH ###
+ name default.source
+ serialization.format 1
+ serialization.lib
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ input format: org.apache.hadoop.mapred.TextInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+ properties:
+ bucketing_version 2
+ column.name.delimiter ,
+ columns a,b,c
+ columns.comments
+ columns.types int:string:int
+#### A masked pattern was here ####
+ location hdfs://### HDFS PATH ###
+ name default.source
+ serialization.format 1
+ serialization.lib
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ name: default.source
+ name: default.source
+ Truncated Path -> Alias:
+ /source [source]
+ Reducer 2
+ Execution mode: vectorized
+ Needs Tagging: false
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: min(VALUE._col0), max(VALUE._col1),
count(VALUE._col2), count(VALUE._col3), compute_bit_vector_hll(VALUE._col4),
max(VALUE._col5), avg(VALUE._col6), count(VALUE._col7),
compute_bit_vector_hll(VALUE._col8), min(VALUE._col9), max(VALUE._col10),
count(VALUE._col11), compute_bit_vector_hll(VALUE._col12)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5,
_col6, _col7, _col8, _col9, _col10, _col11, _col12
+ Statistics: Num rows: 1 Data size: 492 Basic stats: COMPLETE
Column stats: COMPLETE
+ Select Operator
+ expressions: 'LONG' (type: string), UDFToLong(_col0) (type:
bigint), UDFToLong(_col1) (type: bigint), (_col2 - _col3) (type: bigint),
COALESCE(ndv_compute_bit_vector(_col4),0) (type: bigint), _col4 (type: binary),
'STRING' (type: string), UDFToLong(COALESCE(_col5,0)) (type: bigint),
COALESCE(_col6,0) (type: double), (_col2 - _col7) (type: bigint),
COALESCE(ndv_compute_bit_vector(_col8),0) (type: bigint), _col8 (type: binary),
'LONG' (type: string), UDFToLong(_col9) (type: bigint), UDFToLong(_col10)
(type: bigint), (_col2 - _col11) (type: bigint),
COALESCE(ndv_compute_bit_vector(_col12),0) (type: bigint), _col12 (type: binary)
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5,
_col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15,
_col16, _col17
+ Statistics: Num rows: 1 Data size: 794 Basic stats: COMPLETE
Column stats: COMPLETE
+ File Output Operator
+ bucketingVersion: 2
+ compressed: false
+ GlobalTableId: 0
+ directory: hdfs://### HDFS PATH ###
+ NumFilesPerFileSink: 1
+ Statistics: Num rows: 1 Data size: 794 Basic stats:
COMPLETE Column stats: COMPLETE
+ Stats Publishing Key Prefix: hdfs://### HDFS PATH ###
+ table:
+ input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ properties:
+ bucketing_version -1
+ columns
_col0,_col1,_col2,_col3,_col4,_col5,_col6,_col7,_col8,_col9,_col10,_col11,_col12,_col13,_col14,_col15,_col16,_col17
+ columns.types
string:bigint:bigint:bigint:bigint:binary:string:bigint:double:bigint:bigint:binary:string:bigint:bigint:bigint:bigint:binary
+ escape.delim \
+ hive.serialization.extend.additional.nesting.levels
true
+ serialization.escape.crlf true
+ serialization.format 1
+ serialization.lib
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ serde:
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+ TotalFiles: 1
+ GatherStats: false
+ MultiFileSpray: false
+
+ Stage: Stage-2
+ Dependency Collection
+
+ Stage: Stage-3
+ Stats Work
+ Basic Stats Work:
+ Stats Aggregation Key Prefix: hdfs://### HDFS PATH ###
+ Column Stats Desc:
+ Columns: a, b, c
+ Column Types: int, string, int
+ Table: default.tbl_ice
+ Is Table Level Stats: true
+
+ Stage: Stage-0
+ Move Operator
+ files:
+ hdfs directory: true
+ source: hdfs://### HDFS PATH ###
+ destination: hdfs://### HDFS PATH ###
+
+PREHOOK: query: create external table tbl_ice partitioned by spec (bucket(16, a), truncate(3, b)) stored by iceberg stored as orc tblproperties ('format-version'='2') as
+select a, b, c from source
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@source
+PREHOOK: Output: database:default
+PREHOOK: Output: default@tbl_ice
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: create external table tbl_ice partitioned by spec (bucket(16, a), truncate(3, b)) stored by iceberg stored as orc tblproperties ('format-version'='2') as
+select a, b, c from source
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@source
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@tbl_ice
+POSTHOOK: Output: hdfs://### HDFS PATH ###
+PREHOOK: query: describe formatted tbl_ice
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@tbl_ice
+POSTHOOK: query: describe formatted tbl_ice
+POSTHOOK: type: DESCTABLE
+POSTHOOK: Input: default@tbl_ice
+# col_name data_type comment
+a int
+b string
+c int
+
+# Partition Transform Information
+# col_name transform_type
+a BUCKET[16]
+b TRUNCATE[3]
+
+# Detailed Table Information
+Database: default
+#### A masked pattern was here ####
+Retention: 2147483647
+#### A masked pattern was here ####
+Table Type: EXTERNAL_TABLE
+Table Parameters:
+ COLUMN_STATS_ACCURATE
{\"BASIC_STATS\":\"true\",\"COLUMN_STATS\":{\"a\":\"true\",\"b\":\"true\",\"c\":\"true\"}}
+ EXTERNAL TRUE
+ bucketing_version -1
+ engine.hive.enabled true
+ iceberg.orc.files.only true
+ metadata_location hdfs://### HDFS PATH ###
+ numFiles 2
+ numRows 2
+ previous_metadata_location hdfs://### HDFS PATH ###
+ serialization.format 1
+ storage_handler
org.apache.iceberg.mr.hive.HiveIcebergStorageHandler
+ table_type ICEBERG
+ totalSize 812
+#### A masked pattern was here ####
+ uuid #Masked#
+ write.format.default orc
+
Review Comment:
@ayushtkn
Thanks for the notice.
I'll try to fix this in a follow-up patch focusing on the way the CTAS plan is generated and the statement is executed.
https://issues.apache.org/jira/browse/HIVE-26771
Issue Time Tracking
-------------------
Worklog Id: (was: 828606)
Time Spent: 10h 20m (was: 10h 10m)
> Iceberg table is created when running explain ctas command
> ----------------------------------------------------------
>
> Key: HIVE-26628
> URL: https://issues.apache.org/jira/browse/HIVE-26628
> Project: Hive
> Issue Type: Bug
> Components: StorageHandler
> Reporter: Krisztian Kasa
> Assignee: Krisztian Kasa
> Priority: Major
> Labels: pull-request-available
> Fix For: 4.0.0
>
> Time Spent: 10h 20m
> Remaining Estimate: 0h
>
> {code}
> create table source(a int, b string, c int);
> explain
> create table tbl_ice stored by iceberg stored as orc tblproperties ('format-version'='2') as
> select a, b, c from source;
> create table tbl_ice stored by iceberg stored as orc tblproperties ('format-version'='2') as
> select a, b, c from source;
> {code}
> {code}
> org.apache.hadoop.hive.ql.parse.SemanticException: org.apache.hadoop.hive.ql.parse.SemanticException: Table already exists: default.tbl_ice
> at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.analyzeCreateTable(SemanticAnalyzer.java:13963)
> at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.genResolvedParseTree(SemanticAnalyzer.java:12528)
> at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.analyzeInternal(SemanticAnalyzer.java:12693)
> at org.apache.hadoop.hive.ql.parse.CalcitePlanner.analyzeInternal(CalcitePlanner.java:460)
> at org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:317)
> at org.apache.hadoop.hive.ql.Compiler.analyze(Compiler.java:224)
> at org.apache.hadoop.hive.ql.Compiler.compile(Compiler.java:106)
> at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:522)
> at org.apache.hadoop.hive.ql.Driver.compileInternal(Driver.java:474)
> at org.apache.hadoop.hive.ql.Driver.compileAndRespond(Driver.java:439)
> at org.apache.hadoop.hive.ql.Driver.compileAndRespond(Driver.java:433)
> at org.apache.hadoop.hive.ql.reexec.ReExecDriver.compileAndRespond(ReExecDriver.java:121)
> at org.apache.hadoop.hive.ql.reexec.ReExecDriver.run(ReExecDriver.java:227)
> at org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:255)
> at org.apache.hadoop.hive.cli.CliDriver.processCmd1(CliDriver.java:200)
> at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:126)
> at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:421)
> at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:352)
> at org.apache.hadoop.hive.ql.QTestUtil.executeClientInternal(QTestUtil.java:727)
> at org.apache.hadoop.hive.ql.QTestUtil.executeClient(QTestUtil.java:697)
> at org.apache.hadoop.hive.cli.control.CoreCliDriver.runTest(CoreCliDriver.java:114)
> at org.apache.hadoop.hive.cli.control.CliAdapter.runTest(CliAdapter.java:157)
> at org.apache.hadoop.hive.cli.TestIcebergLlapLocalCliDriver.testCliDriver(TestIcebergLlapLocalCliDriver.java:60)
> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
> at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
> at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
> at java.lang.reflect.Method.invoke(Method.java:498)
> at org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:59)
> at org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12)
> at org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:56)
> at org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17)
> at org.apache.hadoop.hive.cli.control.CliAdapter$2$1.evaluate(CliAdapter.java:135)
> at org.junit.runners.ParentRunner$3.evaluate(ParentRunner.java:306)
> at org.junit.runners.BlockJUnit4ClassRunner$1.evaluate(BlockJUnit4ClassRunner.java:100)
> at org.junit.runners.ParentRunner.runLeaf(ParentRunner.java:366)
> at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:103)
> at org.junit.runners.BlockJUnit4ClassRunner.runChild(BlockJUnit4ClassRunner.java:63)
> at org.junit.runners.ParentRunner$4.run(ParentRunner.java:331)
> at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:79)
> at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:329)
> at org.junit.runners.ParentRunner.access$100(ParentRunner.java:66)
> at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:293)
> at org.junit.runners.ParentRunner.run(ParentRunner.java:413)
> at org.junit.runners.Suite.runChild(Suite.java:128)
> at org.junit.runners.Suite.runChild(Suite.java:27)
> at org.junit.runners.ParentRunner$4.run(ParentRunner.java:331)
> at org.junit.runners.ParentRunner$1.schedule(ParentRunner.java:79)
> at org.junit.runners.ParentRunner.runChildren(ParentRunner.java:329)
> at org.junit.runners.ParentRunner.access$100(ParentRunner.java:66)
> at org.junit.runners.ParentRunner$2.evaluate(ParentRunner.java:293)
> at org.apache.hadoop.hive.cli.control.CliAdapter$1$1.evaluate(CliAdapter.java:95)
> at org.junit.rules.RunRules.evaluate(RunRules.java:20)
> at org.junit.runners.ParentRunner$3.evaluate(ParentRunner.java:306)
> at org.junit.runners.ParentRunner.run(ParentRunner.java:413)
> at org.apache.maven.surefire.junit4.JUnit4Provider.execute(JUnit4Provider.java:365)
> at org.apache.maven.surefire.junit4.JUnit4Provider.executeWithRerun(JUnit4Provider.java:273)
> at org.apache.maven.surefire.junit4.JUnit4Provider.executeTestSet(JUnit4Provider.java:238)
> at org.apache.maven.surefire.junit4.JUnit4Provider.invoke(JUnit4Provider.java:159)
> at org.apache.maven.surefire.booter.ForkedBooter.runSuitesInProcess(ForkedBooter.java:377)
> at org.apache.maven.surefire.booter.ForkedBooter.execute(ForkedBooter.java:138)
> at org.apache.maven.surefire.booter.ForkedBooter.run(ForkedBooter.java:465)
> at org.apache.maven.surefire.booter.ForkedBooter.main(ForkedBooter.java:451)
> Caused by: org.apache.hadoop.hive.ql.parse.SemanticException: Table already exists: default.tbl_ice
> at org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.analyzeCreateTable(SemanticAnalyzer.java:13960)
> ... 61 more
> {code}
> The EXPLAIN ... command creates the Iceberg table default.tbl_ice, so the CTAS
> command executed after it fails with "Table already exists: default.tbl_ice".
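>
> A quick way to see the symptom without running the second CTAS (a minimal sketch;
> DESCRIBE is just one way to check whether the table got materialized):
> {code}
> -- run right after the EXPLAIN statement above, before the real CTAS
> describe formatted tbl_ice;
> -- with this bug the DESCRIBE succeeds because EXPLAIN already created the table;
> -- it should instead report that tbl_ice does not exist
> {code}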
--
This message was sent by Atlassian Jira
(v8.20.10#820010)