(hive) branch master updated: HIVE-28127: Exception when rebuilding materialized view with calculated columns on iceberg sources (Krisztian Kasa, reviewed by Denys Kuzmenko)

krisztiankasa Mon, 15 Apr 2024 23:59:27 -0700

This is an automated email from the ASF dual-hosted git repository.

krisztiankasa pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git



The following commit(s) were added to refs/heads/master by this push:
     new f06cc292042 HIVE-28127: Exception when rebuilding materialized view with calculated columns on iceberg sources (Krisztian Kasa, reviewed by Denys Kuzmenko)
f06cc292042 is described below

commit f06cc2920424817da6405e0efe268ce6cd64a363
Author: Krisztian Kasa <[email protected]>
AuthorDate: Tue Apr 16 08:57:43 2024 +0200

    HIVE-28127: Exception when rebuilding materialized view with calculated columns on iceberg sources (Krisztian Kasa, reviewed by Denys Kuzmenko)
---
 .../src/test/queries/positive/mv_iceberg_orc5.q    |   6 +-
 .../test/results/positive/mv_iceberg_orc5.q.out    | 345 +++++++++++++++++++--
 .../AlterMaterializedViewRebuildAnalyzer.java      |   7 +-
 .../alter/rebuild/MaterializedViewASTBuilder.java  |  16 +-
 .../NativeAcidMaterializedViewASTBuilder.java      |   7 +
 .../NonNativeAcidMaterializedViewASTBuilder.java   |  27 ++
 6 files changed, 373 insertions(+), 35 deletions(-)

diff --git 
a/iceberg/iceberg-handler/src/test/queries/positive/mv_iceberg_orc5.q 
b/iceberg/iceberg-handler/src/test/queries/positive/mv_iceberg_orc5.q
index dd55b918d5b..149a2b4a3ed 100644
--- a/iceberg/iceberg-handler/src/test/queries/positive/mv_iceberg_orc5.q
+++ b/iceberg/iceberg-handler/src/test/queries/positive/mv_iceberg_orc5.q
@@ -1,7 +1,9 @@
 -- MV source tables are iceberg and MV has aggregate. It also has avg which is 
calculated from sum and count.
 -- SORT_QUERY_RESULTS
 --! qt:replace:/(.*fromVersion=\[)\S+(\].*)/$1#Masked#$2/
+--! qt:replace:/(\s+Version\sinterval\sfrom\:\s+)\d+(\s*)/$1#Masked#/
 
+set hive.explain.user=false;
 set hive.support.concurrency=true;
 set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
 
@@ -13,7 +15,7 @@ create external table tbl_ice_v2(d int, e string, f int) 
stored by iceberg store
 insert into tbl_ice values (1, 'one', 50), (4, 'four', 53), (5, 'five', 54);
 insert into tbl_ice_v2 values (1, 'one v2', 50), (4, 'four v2', 53), (5, 'five 
v2', 54);
 
-create materialized view mat2 as
+create materialized view mat2 stored by iceberg stored as orc tblproperties 
('format-version'='2') as
 select tbl_ice.b, tbl_ice.c, sum(tbl_ice_v2.f), count(tbl_ice_v2.f), 
avg(tbl_ice_v2.f)
 from tbl_ice
 join tbl_ice_v2 on tbl_ice.a=tbl_ice_v2.d where tbl_ice.c > 52
@@ -25,6 +27,8 @@ insert into tbl_ice_v2 values (1, 'one v2', 50), (4, 'four 
v2', 53), (5, 'five v
 
 explain cbo
 alter materialized view mat2 rebuild;
+explain
+alter materialized view mat2 rebuild;
 alter materialized view mat2 rebuild;
 
 select * from mat2;
diff --git 
a/iceberg/iceberg-handler/src/test/results/positive/mv_iceberg_orc5.q.out 
b/iceberg/iceberg-handler/src/test/results/positive/mv_iceberg_orc5.q.out
index 9279dfeec0a..bdee0374413 100644
--- a/iceberg/iceberg-handler/src/test/results/positive/mv_iceberg_orc5.q.out
+++ b/iceberg/iceberg-handler/src/test/results/positive/mv_iceberg_orc5.q.out
@@ -36,7 +36,7 @@ POSTHOOK: query: insert into tbl_ice_v2 values (1, 'one v2', 
50), (4, 'four v2',
 POSTHOOK: type: QUERY
 POSTHOOK: Input: _dummy_database@_dummy_table
 POSTHOOK: Output: default@tbl_ice_v2
-PREHOOK: query: create materialized view mat2 as
+PREHOOK: query: create materialized view mat2 stored by iceberg stored as orc 
tblproperties ('format-version'='2') as
 select tbl_ice.b, tbl_ice.c, sum(tbl_ice_v2.f), count(tbl_ice_v2.f), 
avg(tbl_ice_v2.f)
 from tbl_ice
 join tbl_ice_v2 on tbl_ice.a=tbl_ice_v2.d where tbl_ice.c > 52
@@ -46,7 +46,8 @@ PREHOOK: Input: default@tbl_ice
 PREHOOK: Input: default@tbl_ice_v2
 PREHOOK: Output: database:default
 PREHOOK: Output: default@mat2
-POSTHOOK: query: create materialized view mat2 as
+PREHOOK: Output: hdfs://### HDFS PATH ###
+POSTHOOK: query: create materialized view mat2 stored by iceberg stored as orc 
tblproperties ('format-version'='2') as
 select tbl_ice.b, tbl_ice.c, sum(tbl_ice_v2.f), count(tbl_ice_v2.f), 
avg(tbl_ice_v2.f)
 from tbl_ice
 join tbl_ice_v2 on tbl_ice.a=tbl_ice_v2.d where tbl_ice.c > 52
@@ -56,6 +57,7 @@ POSTHOOK: Input: default@tbl_ice
 POSTHOOK: Input: default@tbl_ice_v2
 POSTHOOK: Output: database:default
 POSTHOOK: Output: default@mat2
+POSTHOOK: Output: hdfs://### HDFS PATH ###
 POSTHOOK: Lineage: mat2._c2 EXPRESSION 
[(tbl_ice_v2)tbl_ice_v2.FieldSchema(name:f, type:int, comment:null), ]
 POSTHOOK: Lineage: mat2._c3 EXPRESSION 
[(tbl_ice_v2)tbl_ice_v2.FieldSchema(name:f, type:int, comment:null), ]
 POSTHOOK: Lineage: mat2._c4 EXPRESSION 
[(tbl_ice_v2)tbl_ice_v2.FieldSchema(name:f, type:int, comment:null), ]
@@ -84,6 +86,7 @@ PREHOOK: Input: default@mat2
 PREHOOK: Input: default@tbl_ice
 PREHOOK: Input: default@tbl_ice_v2
 PREHOOK: Output: default@mat2
+PREHOOK: Output: default@mat2
 POSTHOOK: query: explain cbo
 alter materialized view mat2 rebuild
 POSTHOOK: type: ALTER_MATERIALIZED_VIEW_REBUILD
@@ -91,22 +94,323 @@ POSTHOOK: Input: default@mat2
 POSTHOOK: Input: default@tbl_ice
 POSTHOOK: Input: default@tbl_ice_v2
 POSTHOOK: Output: default@mat2
+POSTHOOK: Output: default@mat2
 CBO PLAN:
-HiveProject(b=[$0], c=[$1], _o__c2=[$2], _o__c3=[COALESCE($3, 0:BIGINT)], 
_o__c4=[/(CAST($2):DOUBLE, COALESCE($3, 0:BIGINT))])
-  HiveAggregate(group=[{0, 1}], agg#0=[sum($2)], agg#1=[sum($3)])
-    HiveProject($f0=[$0], $f1=[$1], $f2=[$2], $f3=[$3])
-      HiveUnion(all=[true])
-        HiveProject(b=[$0], c=[$1], $f2=[$2], $f3=[$3])
-          HiveAggregate(group=[{1, 2}], agg#0=[sum($4)], agg#1=[count($4)])
-            HiveJoin(condition=[=($0, $3)], joinType=[inner], 
algorithm=[none], cost=[not available])
-              HiveProject(a=[$0], b=[$1], c=[$2])
-                HiveFilter(condition=[AND(>($2, 52), IS NOT NULL($0))])
-                  HiveTableScan(table=[[default, tbl_ice]], 
table:alias=[tbl_ice], fromVersion=[#Masked#])
-              HiveProject(d=[$0], f=[$2])
-                HiveFilter(condition=[IS NOT NULL($0)])
-                  HiveTableScan(table=[[default, tbl_ice_v2]], 
table:alias=[tbl_ice_v2], fromVersion=[#Masked#])
-        HiveProject(b=[$0], c=[$1], _c2=[$2], _c3=[$3])
-          HiveTableScan(table=[[default, mat2]], table:alias=[default.mat2])
+HiveProject(b=[$5], c=[$6], _o__c2=[CASE(IS NULL($2), $7, IS NULL($7), $2, 
+($7, $2))], _o__c3=[CASE(IS NULL($3), $8, IS NULL($8), $3, +($8, $3))], 
_o__c4=[/(CAST(CASE(IS NULL($2), $7, IS NULL($7), $2, +($7, $2))):DOUBLE, 
CASE(IS NULL($3), $8, IS NULL($8), $3, +($8, $3)))])
+  HiveFilter(condition=[OR($4, IS NULL($4))])
+    HiveJoin(condition=[AND(IS NOT DISTINCT FROM($0, $5), IS NOT DISTINCT 
FROM($1, $6))], joinType=[right], algorithm=[none], cost=[not available])
+      HiveProject(b=[$0], c=[$1], _c2=[$2], _c3=[$3], $f4=[true])
+        HiveTableScan(table=[[default, mat2]], table:alias=[default.mat2])
+      HiveProject(b=[$0], c=[$1], $f2=[$2], $f3=[$3])
+        HiveAggregate(group=[{1, 2}], agg#0=[sum($4)], agg#1=[count($4)])
+          HiveJoin(condition=[=($0, $3)], joinType=[inner], algorithm=[none], 
cost=[not available])
+            HiveProject(a=[$0], b=[$1], c=[$2])
+              HiveFilter(condition=[AND(>($2, 52), IS NOT NULL($0))])
+                HiveTableScan(table=[[default, tbl_ice]], 
table:alias=[tbl_ice], fromVersion=[#Masked#])
+            HiveProject(d=[$0], f=[$2])
+              HiveFilter(condition=[IS NOT NULL($0)])
+                HiveTableScan(table=[[default, tbl_ice_v2]], 
table:alias=[tbl_ice_v2], fromVersion=[#Masked#])
+
+PREHOOK: query: explain
+alter materialized view mat2 rebuild
+PREHOOK: type: ALTER_MATERIALIZED_VIEW_REBUILD
+PREHOOK: Input: default@mat2
+PREHOOK: Input: default@tbl_ice
+PREHOOK: Input: default@tbl_ice_v2
+PREHOOK: Output: default@mat2
+PREHOOK: Output: default@mat2
+POSTHOOK: query: explain
+alter materialized view mat2 rebuild
+POSTHOOK: type: ALTER_MATERIALIZED_VIEW_REBUILD
+POSTHOOK: Input: default@mat2
+POSTHOOK: Input: default@tbl_ice
+POSTHOOK: Input: default@tbl_ice_v2
+POSTHOOK: Output: default@mat2
+POSTHOOK: Output: default@mat2
+STAGE DEPENDENCIES:
+  Stage-3 is a root stage
+  Stage-4 depends on stages: Stage-3
+  Stage-0 depends on stages: Stage-4
+  Stage-5 depends on stages: Stage-0
+  Stage-6 depends on stages: Stage-5
+
+STAGE PLANS:
+  Stage: Stage-3
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE), Reducer 8 (SIMPLE_EDGE)
+        Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+        Reducer 4 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
+        Reducer 5 <- Reducer 2 (CUSTOM_SIMPLE_EDGE)
+        Reducer 7 <- Map 6 (SIMPLE_EDGE), Map 9 (SIMPLE_EDGE)
+        Reducer 8 <- Reducer 7 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: default.mat2
+                  Statistics: Num rows: 2 Data size: 232 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  Select Operator
+                    expressions: b (type: string), c (type: int), _c2 (type: 
bigint), _c3 (type: bigint), true (type: boolean), PARTITION__SPEC__ID (type: 
int), PARTITION__HASH (type: bigint), FILE__PATH (type: string), ROW__POSITION 
(type: bigint), _c4 (type: double)
+                    outputColumnNames: _col0, _col1, _col2, _col3, _col4, 
_col5, _col6, _col7, _col8, _col9
+                    Statistics: Num rows: 2 Data size: 648 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Reduce Output Operator
+                      key expressions: _col0 (type: string), _col1 (type: int)
+                      null sort order: zz
+                      sort order: ++
+                      Map-reduce partition columns: _col0 (type: string), 
_col1 (type: int)
+                      Statistics: Num rows: 2 Data size: 648 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      value expressions: _col2 (type: bigint), _col3 (type: 
bigint), _col4 (type: boolean), _col5 (type: int), _col6 (type: bigint), _col7 
(type: string), _col8 (type: bigint), _col9 (type: double)
+            Execution mode: vectorized
+        Map 6 
+            Map Operator Tree:
+                TableScan
+                  alias: tbl_ice
+                  filterExpr: ((c > 52) and a is not null) (type: boolean)
+                  Statistics: Num rows: 8 Data size: 768 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  Version interval from: #Masked#
+                  Filter Operator
+                    predicate: ((c > 52) and a is not null) (type: boolean)
+                    Statistics: Num rows: 4 Data size: 384 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: a (type: int), b (type: string), c (type: 
int)
+                      outputColumnNames: _col0, _col1, _col2
+                      Statistics: Num rows: 4 Data size: 384 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: int)
+                        null sort order: z
+                        sort order: +
+                        Map-reduce partition columns: _col0 (type: int)
+                        Statistics: Num rows: 4 Data size: 384 Basic stats: 
COMPLETE Column stats: COMPLETE
+                        value expressions: _col1 (type: string), _col2 (type: 
int)
+            Execution mode: vectorized
+        Map 9 
+            Map Operator Tree:
+                TableScan
+                  alias: tbl_ice_v2
+                  filterExpr: d is not null (type: boolean)
+                  Statistics: Num rows: 6 Data size: 48 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  Version interval from: #Masked#
+                  Filter Operator
+                    predicate: d is not null (type: boolean)
+                    Statistics: Num rows: 6 Data size: 48 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Select Operator
+                      expressions: d (type: int), f (type: int)
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 6 Data size: 48 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: int)
+                        null sort order: z
+                        sort order: +
+                        Map-reduce partition columns: _col0 (type: int)
+                        Statistics: Num rows: 6 Data size: 48 Basic stats: 
COMPLETE Column stats: COMPLETE
+                        value expressions: _col1 (type: int)
+            Execution mode: vectorized
+        Reducer 2 
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Right Outer Join 0 to 1
+                keys:
+                  0 _col0 (type: string), _col1 (type: int)
+                  1 _col0 (type: string), _col1 (type: int)
+                nullSafes: [true, true]
+                outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, 
_col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13
+                Statistics: Num rows: 6 Data size: 1620 Basic stats: COMPLETE 
Column stats: COMPLETE
+                Filter Operator
+                  predicate: _col4 (type: boolean)
+                  Statistics: Num rows: 1 Data size: 432 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  Select Operator
+                    expressions: _col5 (type: int), _col6 (type: bigint), 
_col7 (type: string), _col8 (type: bigint), _col0 (type: string), _col1 (type: 
int), _col2 (type: bigint), _col3 (type: bigint), _col9 (type: double)
+                    outputColumnNames: _col0, _col1, _col2, _col3, _col4, 
_col5, _col6, _col7, _col8
+                    Statistics: Num rows: 1 Data size: 320 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Reduce Output Operator
+                      key expressions: _col0 (type: int), _col1 (type: 
bigint), _col2 (type: string), _col3 (type: bigint)
+                      null sort order: aaaa
+                      sort order: ++++
+                      Statistics: Num rows: 1 Data size: 320 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      value expressions: _col4 (type: string), _col5 (type: 
int), _col6 (type: bigint), _col7 (type: bigint), _col8 (type: double)
+                Filter Operator
+                  predicate: _col4 (type: boolean)
+                  Statistics: Num rows: 1 Data size: 432 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  Select Operator
+                    expressions: _col10 (type: string), _col11 (type: int), 
CASE WHEN (_col2 is null) THEN (_col12) WHEN (_col12 is null) THEN (_col2) ELSE 
((_col12 + _col2)) END (type: bigint), CASE WHEN (_col3 is null) THEN (_col13) 
WHEN (_col13 is null) THEN (_col3) ELSE ((_col13 + _col3)) END (type: bigint), 
(UDFToDouble(CASE WHEN (_col2 is null) THEN (_col12) WHEN (_col12 is null) THEN 
(_col2) ELSE ((_col12 + _col2)) END) / CASE WHEN (_col3 is null) THEN (_col13) 
WHEN (_col13 is nul [...]
+                    outputColumnNames: _col0, _col1, _col2, _col3, _col4
+                    Statistics: Num rows: 1 Data size: 116 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    File Output Operator
+                      compressed: false
+                      Statistics: Num rows: 1 Data size: 116 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      table:
+                          input format: 
org.apache.iceberg.mr.hive.HiveIcebergInputFormat
+                          output format: 
org.apache.iceberg.mr.hive.HiveIcebergOutputFormat
+                          serde: org.apache.iceberg.mr.hive.HiveIcebergSerDe
+                          name: default.mat2
+                    Select Operator
+                      expressions: _col0 (type: string), _col1 (type: int), 
_col2 (type: bigint), _col3 (type: bigint), _col4 (type: double)
+                      outputColumnNames: b, c, _c2, _c3, _c4
+                      Statistics: Num rows: 1 Data size: 116 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      Group By Operator
+                        aggregations: max(length(b)), 
avg(COALESCE(length(b),0)), count(1), count(b), compute_bit_vector_hll(b), 
min(c), max(c), count(c), compute_bit_vector_hll(c), min(_c2), max(_c2), 
count(_c2), compute_bit_vector_hll(_c2), min(_c3), max(_c3), count(_c3), 
compute_bit_vector_hll(_c3), min(_c4), max(_c4), count(_c4), 
compute_bit_vector_hll(_c4)
+                        minReductionHashAggr: 0.4
+                        mode: hash
+                        outputColumnNames: _col0, _col1, _col2, _col3, _col4, 
_col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, 
_col15, _col16, _col17, _col18, _col19, _col20
+                        Statistics: Num rows: 1 Data size: 904 Basic stats: 
COMPLETE Column stats: COMPLETE
+                        Reduce Output Operator
+                          null sort order: 
+                          sort order: 
+                          Statistics: Num rows: 1 Data size: 904 Basic stats: 
COMPLETE Column stats: COMPLETE
+                          value expressions: _col0 (type: int), _col1 (type: 
struct<count:bigint,sum:double,input:int>), _col2 (type: bigint), _col3 (type: 
bigint), _col4 (type: binary), _col5 (type: int), _col6 (type: int), _col7 
(type: bigint), _col8 (type: binary), _col9 (type: bigint), _col10 (type: 
bigint), _col11 (type: bigint), _col12 (type: binary), _col13 (type: bigint), 
_col14 (type: bigint), _col15 (type: bigint), _col16 (type: binary), _col17 
(type: double), _col18 (type: dou [...]
+                Filter Operator
+                  predicate: _col4 is null (type: boolean)
+                  Statistics: Num rows: 4 Data size: 1080 Basic stats: 
COMPLETE Column stats: COMPLETE
+                  Select Operator
+                    expressions: _col10 (type: string), _col11 (type: int), 
CASE WHEN (_col2 is null) THEN (_col12) WHEN (_col12 is null) THEN (_col2) ELSE 
((_col12 + _col2)) END (type: bigint), CASE WHEN (_col3 is null) THEN (_col13) 
WHEN (_col13 is null) THEN (_col3) ELSE ((_col13 + _col3)) END (type: bigint), 
(UDFToDouble(CASE WHEN (_col2 is null) THEN (_col12) WHEN (_col12 is null) THEN 
(_col2) ELSE ((_col12 + _col2)) END) / CASE WHEN (_col3 is null) THEN (_col13) 
WHEN (_col13 is nul [...]
+                    outputColumnNames: _col0, _col1, _col2, _col3, _col4
+                    Statistics: Num rows: 4 Data size: 432 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    File Output Operator
+                      compressed: false
+                      Statistics: Num rows: 4 Data size: 432 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      table:
+                          input format: 
org.apache.iceberg.mr.hive.HiveIcebergInputFormat
+                          output format: 
org.apache.iceberg.mr.hive.HiveIcebergOutputFormat
+                          serde: org.apache.iceberg.mr.hive.HiveIcebergSerDe
+                          name: default.mat2
+                    Select Operator
+                      expressions: _col0 (type: string), _col1 (type: int), 
_col2 (type: bigint), _col3 (type: bigint), _col4 (type: double)
+                      outputColumnNames: b, c, _c2, _c3, _c4
+                      Statistics: Num rows: 4 Data size: 432 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      Group By Operator
+                        aggregations: max(length(b)), 
avg(COALESCE(length(b),0)), count(1), count(b), compute_bit_vector_hll(b), 
min(c), max(c), count(c), compute_bit_vector_hll(c), min(_c2), max(_c2), 
count(_c2), compute_bit_vector_hll(_c2), min(_c3), max(_c3), count(_c3), 
compute_bit_vector_hll(_c3), min(_c4), max(_c4), count(_c4), 
compute_bit_vector_hll(_c4)
+                        minReductionHashAggr: 0.75
+                        mode: hash
+                        outputColumnNames: _col0, _col1, _col2, _col3, _col4, 
_col5, _col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, 
_col15, _col16, _col17, _col18, _col19, _col20
+                        Statistics: Num rows: 1 Data size: 904 Basic stats: 
COMPLETE Column stats: COMPLETE
+                        Reduce Output Operator
+                          null sort order: 
+                          sort order: 
+                          Statistics: Num rows: 1 Data size: 904 Basic stats: 
COMPLETE Column stats: COMPLETE
+                          value expressions: _col0 (type: int), _col1 (type: 
struct<count:bigint,sum:double,input:int>), _col2 (type: bigint), _col3 (type: 
bigint), _col4 (type: binary), _col5 (type: int), _col6 (type: int), _col7 
(type: bigint), _col8 (type: binary), _col9 (type: bigint), _col10 (type: 
bigint), _col11 (type: bigint), _col12 (type: binary), _col13 (type: bigint), 
_col14 (type: bigint), _col15 (type: bigint), _col16 (type: binary), _col17 
(type: double), _col18 (type: dou [...]
+        Reducer 3 
+            Execution mode: vectorized
+            Reduce Operator Tree:
+              Select Operator
+                expressions: KEY.reducesinkkey0 (type: int), 
KEY.reducesinkkey1 (type: bigint), KEY.reducesinkkey2 (type: string), 
KEY.reducesinkkey3 (type: bigint), VALUE._col0 (type: string), VALUE._col1 
(type: int), VALUE._col2 (type: bigint), VALUE._col3 (type: bigint), 
VALUE._col4 (type: double)
+                outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, 
_col6, _col7, _col8
+                Statistics: Num rows: 1 Data size: 320 Basic stats: COMPLETE 
Column stats: COMPLETE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 320 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  table:
+                      input format: 
org.apache.iceberg.mr.hive.HiveIcebergInputFormat
+                      output format: 
org.apache.iceberg.mr.hive.HiveIcebergOutputFormat
+                      serde: org.apache.iceberg.mr.hive.HiveIcebergSerDe
+                      name: default.mat2
+        Reducer 4 
+            Execution mode: vectorized
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: max(VALUE._col0), avg(VALUE._col1), 
count(VALUE._col2), count(VALUE._col3), compute_bit_vector_hll(VALUE._col4), 
min(VALUE._col5), max(VALUE._col6), count(VALUE._col7), 
compute_bit_vector_hll(VALUE._col8), min(VALUE._col9), max(VALUE._col10), 
count(VALUE._col11), compute_bit_vector_hll(VALUE._col12), min(VALUE._col13), 
max(VALUE._col14), count(VALUE._col15), compute_bit_vector_hll(VALUE._col16), 
min(VALUE._col17), max(VALUE._col18), count(VALUE._col19), comp [...]
+                mode: mergepartial
+                outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, 
_col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, 
_col16, _col17, _col18, _col19, _col20
+                Statistics: Num rows: 1 Data size: 836 Basic stats: COMPLETE 
Column stats: COMPLETE
+                Select Operator
+                  expressions: 'STRING' (type: string), 
UDFToLong(COALESCE(_col0,0)) (type: bigint), COALESCE(_col1,0) (type: double), 
(_col2 - _col3) (type: bigint), COALESCE(ndv_compute_bit_vector(_col4),0) 
(type: bigint), _col4 (type: binary), 'LONG' (type: string), UDFToLong(_col5) 
(type: bigint), UDFToLong(_col6) (type: bigint), (_col2 - _col7) (type: 
bigint), COALESCE(ndv_compute_bit_vector(_col8),0) (type: bigint), _col8 (type: 
binary), 'LONG' (type: string), _col9 (type: bigint), [...]
+                  outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, 
_col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, 
_col16, _col17, _col18, _col19, _col20, _col21, _col22, _col23, _col24, _col25, 
_col26, _col27, _col28, _col29
+                  Statistics: Num rows: 1 Data size: 1324 Basic stats: 
COMPLETE Column stats: COMPLETE
+                  File Output Operator
+                    compressed: false
+                    Statistics: Num rows: 1 Data size: 1324 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    table:
+                        input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                        output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                        serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+        Reducer 5 
+            Execution mode: vectorized
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: max(VALUE._col0), avg(VALUE._col1), 
count(VALUE._col2), count(VALUE._col3), compute_bit_vector_hll(VALUE._col4), 
min(VALUE._col5), max(VALUE._col6), count(VALUE._col7), 
compute_bit_vector_hll(VALUE._col8), min(VALUE._col9), max(VALUE._col10), 
count(VALUE._col11), compute_bit_vector_hll(VALUE._col12), min(VALUE._col13), 
max(VALUE._col14), count(VALUE._col15), compute_bit_vector_hll(VALUE._col16), 
min(VALUE._col17), max(VALUE._col18), count(VALUE._col19), comp [...]
+                mode: mergepartial
+                outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, 
_col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, 
_col16, _col17, _col18, _col19, _col20
+                Statistics: Num rows: 1 Data size: 836 Basic stats: COMPLETE 
Column stats: COMPLETE
+                Select Operator
+                  expressions: 'STRING' (type: string), 
UDFToLong(COALESCE(_col0,0)) (type: bigint), COALESCE(_col1,0) (type: double), 
(_col2 - _col3) (type: bigint), COALESCE(ndv_compute_bit_vector(_col4),0) 
(type: bigint), _col4 (type: binary), 'LONG' (type: string), UDFToLong(_col5) 
(type: bigint), UDFToLong(_col6) (type: bigint), (_col2 - _col7) (type: 
bigint), COALESCE(ndv_compute_bit_vector(_col8),0) (type: bigint), _col8 (type: 
binary), 'LONG' (type: string), _col9 (type: bigint), [...]
+                  outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, 
_col6, _col7, _col8, _col9, _col10, _col11, _col12, _col13, _col14, _col15, 
_col16, _col17, _col18, _col19, _col20, _col21, _col22, _col23, _col24, _col25, 
_col26, _col27, _col28, _col29
+                  Statistics: Num rows: 1 Data size: 1324 Basic stats: 
COMPLETE Column stats: COMPLETE
+                  File Output Operator
+                    compressed: false
+                    Statistics: Num rows: 1 Data size: 1324 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    table:
+                        input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                        output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                        serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+        Reducer 7 
+            Reduce Operator Tree:
+              Merge Join Operator
+                condition map:
+                     Inner Join 0 to 1
+                keys:
+                  0 _col0 (type: int)
+                  1 _col0 (type: int)
+                outputColumnNames: _col1, _col2, _col4
+                Statistics: Num rows: 6 Data size: 576 Basic stats: COMPLETE 
Column stats: COMPLETE
+                Group By Operator
+                  aggregations: sum(_col4), count(_col4)
+                  keys: _col1 (type: string), _col2 (type: int)
+                  minReductionHashAggr: 0.4
+                  mode: hash
+                  outputColumnNames: _col0, _col1, _col2, _col3
+                  Statistics: Num rows: 4 Data size: 432 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  Reduce Output Operator
+                    key expressions: _col0 (type: string), _col1 (type: int)
+                    null sort order: zz
+                    sort order: ++
+                    Map-reduce partition columns: _col0 (type: string), _col1 
(type: int)
+                    Statistics: Num rows: 4 Data size: 432 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    value expressions: _col2 (type: bigint), _col3 (type: 
bigint)
+        Reducer 8 
+            Execution mode: vectorized
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: sum(VALUE._col0), count(VALUE._col1)
+                keys: KEY._col0 (type: string), KEY._col1 (type: int)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1, _col2, _col3
+                Statistics: Num rows: 4 Data size: 432 Basic stats: COMPLETE 
Column stats: COMPLETE
+                Reduce Output Operator
+                  key expressions: _col0 (type: string), _col1 (type: int)
+                  null sort order: zz
+                  sort order: ++
+                  Map-reduce partition columns: _col0 (type: string), _col1 
(type: int)
+                  Statistics: Num rows: 4 Data size: 432 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  value expressions: _col2 (type: bigint), _col3 (type: bigint)
+
+  Stage: Stage-4
+    Dependency Collection
+
+  Stage: Stage-0
+    Move Operator
+      tables:
+          replace: false
+          table:
+              input format: org.apache.iceberg.mr.hive.HiveIcebergInputFormat
+              output format: org.apache.iceberg.mr.hive.HiveIcebergOutputFormat
+              serde: org.apache.iceberg.mr.hive.HiveIcebergSerDe
+              name: default.mat2
+
+  Stage: Stage-5
+    Stats Work
+      Basic Stats Work:
+      Column Stats Desc:
+          Columns: b, c, _c2, _c3, _c4
+          Column Types: string, int, bigint, bigint, double
+          Table: default.mat2
+
+  Stage: Stage-6
+    Materialized View Update
+      name: default.mat2
+      update creation metadata: true
 
 PREHOOK: query: alter materialized view mat2 rebuild
 PREHOOK: type: ALTER_MATERIALIZED_VIEW_REBUILD
@@ -114,17 +418,14 @@ PREHOOK: Input: default@mat2
 PREHOOK: Input: default@tbl_ice
 PREHOOK: Input: default@tbl_ice_v2
 PREHOOK: Output: default@mat2
+PREHOOK: Output: default@mat2
 POSTHOOK: query: alter materialized view mat2 rebuild
 POSTHOOK: type: ALTER_MATERIALIZED_VIEW_REBUILD
 POSTHOOK: Input: default@mat2
 POSTHOOK: Input: default@tbl_ice
 POSTHOOK: Input: default@tbl_ice_v2
 POSTHOOK: Output: default@mat2
-POSTHOOK: Lineage: mat2._c2 EXPRESSION 
[(tbl_ice_v2)tbl_ice_v2.FieldSchema(name:f, type:int, comment:null), 
(mat2)default.mat2.FieldSchema(name:_c2, type:bigint, comment:null), ]
-POSTHOOK: Lineage: mat2._c3 EXPRESSION 
[(tbl_ice_v2)tbl_ice_v2.FieldSchema(name:f, type:int, comment:null), 
(mat2)default.mat2.FieldSchema(name:_c3, type:bigint, comment:null), ]
-POSTHOOK: Lineage: mat2._c4 EXPRESSION 
[(tbl_ice_v2)tbl_ice_v2.FieldSchema(name:f, type:int, comment:null), 
(mat2)default.mat2.FieldSchema(name:_c2, type:bigint, comment:null), 
(mat2)default.mat2.FieldSchema(name:_c3, type:bigint, comment:null), ]
-POSTHOOK: Lineage: mat2.b EXPRESSION [(tbl_ice)tbl_ice.FieldSchema(name:b, 
type:string, comment:null), (mat2)default.mat2.FieldSchema(name:b, type:string, 
comment:null), ]
-POSTHOOK: Lineage: mat2.c EXPRESSION [(tbl_ice)tbl_ice.FieldSchema(name:c, 
type:int, comment:null), (mat2)default.mat2.FieldSchema(name:c, type:int, 
comment:null), ]
+POSTHOOK: Output: default@mat2
 PREHOOK: query: select * from mat2
 PREHOOK: type: QUERY
 PREHOOK: Input: default@mat2
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/ddl/view/materialized/alter/rebuild/AlterMaterializedViewRebuildAnalyzer.java b/ql/src/java/org/apache/hadoop/hive/ql/ddl/view/materialized/alter/rebuild/AlterMaterializedViewRebuildAnalyzer.java
index 3c6d973b1a7..855ef466ebd 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/ddl/view/materialized/alter/rebuild/AlterMaterializedViewRebuildAnalyzer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/ddl/view/materialized/alter/rebuild/AlterMaterializedViewRebuildAnalyzer.java
@@ -609,10 +609,11 @@ public class AlterMaterializedViewRebuildAnalyzer extends CalcitePlanner {
     ASTNode selectNodeInputROJ = new ASTSearcher().simpleBreadthFirstSearch(
             subqueryNodeInputROJ, HiveParser.TOK_SUBQUERY, HiveParser.TOK_QUERY,
             HiveParser.TOK_INSERT, HiveParser.TOK_SELECT);
-    astBuilder.createAcidSortNodes(TableName.getDbTable(
+    astBuilder.appendDeleteSelectNodes(
+        selectNodeInputROJ,
+        TableName.getDbTable(
             materializationNode.getChild(0).getText(),
-            materializationNode.getChild(1).getText()))
-            .forEach(astNode -> ParseDriver.adaptor.addChild(selectNodeInputROJ, astNode));
+            materializationNode.getChild(1).getText()));
     // 4) Transform first INSERT branch into an UPDATE
     // 4.1) Modifying filter condition.
     ASTNode whereClauseInUpdate = findWhereClause(updateInsertNode);
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/ddl/view/materialized/alter/rebuild/MaterializedViewASTBuilder.java b/ql/src/java/org/apache/hadoop/hive/ql/ddl/view/materialized/alter/rebuild/MaterializedViewASTBuilder.java
index ab4e2c04d68..57decf2827c 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/ddl/view/materialized/alter/rebuild/MaterializedViewASTBuilder.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/ddl/view/materialized/alter/rebuild/MaterializedViewASTBuilder.java
@@ -32,9 +32,7 @@ abstract class MaterializedViewASTBuilder {
     return createAcidSortNodesInternal(inputNode.getText());
   }
 
-  public List<ASTNode> createAcidSortNodes(String tableName) {
-    return wrapIntoSelExpr(createAcidSortNodesInternal(tableName));
-  }
+  public abstract void appendDeleteSelectNodes(ASTNode selectNode, String tableName);
 
   protected abstract List<ASTNode> createAcidSortNodesInternal(String tableName);
 
@@ -55,13 +53,13 @@ abstract class MaterializedViewASTBuilder {
   }
 
   public List<ASTNode> wrapIntoSelExpr(List<ASTNode> expressionNodes) {
-    return expressionNodes.stream().map(expressionNode -> {
-      ASTNode selectExpr = (ASTNode) ParseDriver.adaptor.create(
-              HiveParser.TOK_SELEXPR, "TOK_SELEXPR");
+    return expressionNodes.stream().map(this::wrapIntoSelExpr).collect(Collectors.toList());
+  }
 
-      ParseDriver.adaptor.addChild(selectExpr, expressionNode);
-      return selectExpr;
-    }).collect(Collectors.toList());
+  public ASTNode wrapIntoSelExpr(ASTNode expressionNode) {
+    ASTNode selectExpr = (ASTNode) ParseDriver.adaptor.create(HiveParser.TOK_SELEXPR, "TOK_SELEXPR");
+    ParseDriver.adaptor.addChild(selectExpr, expressionNode);
+    return selectExpr;
   }
 
   public ASTNode createSortNodes(List<ASTNode> sortKeyNodes) {
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/ddl/view/materialized/alter/rebuild/NativeAcidMaterializedViewASTBuilder.java b/ql/src/java/org/apache/hadoop/hive/ql/ddl/view/materialized/alter/rebuild/NativeAcidMaterializedViewASTBuilder.java
index d88075bd36d..d920cbb61d9 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/ddl/view/materialized/alter/rebuild/NativeAcidMaterializedViewASTBuilder.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/ddl/view/materialized/alter/rebuild/NativeAcidMaterializedViewASTBuilder.java
@@ -20,6 +20,7 @@ package org.apache.hadoop.hive.ql.ddl.view.materialized.alter.rebuild;
 
 import org.apache.hadoop.hive.ql.metadata.VirtualColumn;
 import org.apache.hadoop.hive.ql.parse.ASTNode;
+import org.apache.hadoop.hive.ql.parse.ParseDriver;
 
 import java.util.List;
 
@@ -31,6 +32,12 @@ public class NativeAcidMaterializedViewASTBuilder extends MaterializedViewASTBui
     return wrapIntoSelExpr(singletonList(createQualifiedColumnNode(tableName, VirtualColumn.ROWID.getName())));
   }
 
+  @Override
+  public void appendDeleteSelectNodes(ASTNode selectNode, String tableName) {
+    wrapIntoSelExpr(createAcidSortNodesInternal(tableName))
+        .forEach(astNode -> ParseDriver.adaptor.addChild(selectNode, astNode));
+  }
+
   @Override
   protected List<ASTNode> createAcidSortNodesInternal(String tableName) {
     return singletonList(createQualifiedColumnNode(tableName, VirtualColumn.ROWID.getName()));
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/ddl/view/materialized/alter/rebuild/NonNativeAcidMaterializedViewASTBuilder.java b/ql/src/java/org/apache/hadoop/hive/ql/ddl/view/materialized/alter/rebuild/NonNativeAcidMaterializedViewASTBuilder.java
index f9d50a809d2..4e6eb2c2560 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/ddl/view/materialized/alter/rebuild/NonNativeAcidMaterializedViewASTBuilder.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/ddl/view/materialized/alter/rebuild/NonNativeAcidMaterializedViewASTBuilder.java
@@ -18,11 +18,16 @@
 
 package org.apache.hadoop.hive.ql.ddl.view.materialized.alter.rebuild;
 
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
 import org.apache.hadoop.hive.ql.Context;
 import org.apache.hadoop.hive.ql.metadata.Table;
 import org.apache.hadoop.hive.ql.parse.ASTNode;
+import org.apache.hadoop.hive.ql.parse.HiveParser;
+import org.apache.hadoop.hive.ql.parse.ParseDriver;
 
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 import java.util.stream.Collectors;
 
 public class NonNativeAcidMaterializedViewASTBuilder extends MaterializedViewASTBuilder {
@@ -39,6 +44,28 @@ public class NonNativeAcidMaterializedViewASTBuilder extends MaterializedViewAST
             .collect(Collectors.toList()));
   }
 
+  @Override
+  public void appendDeleteSelectNodes(ASTNode selectNode, String tableName) {
+    Set<String> selectedColumns = new HashSet<>(selectNode.getChildCount());
+
+    for (int i = 0; i < selectNode.getChildCount(); ++i) {
+      ASTNode selectExpr = (ASTNode) selectNode.getChild(i);
+      ASTNode expression = (ASTNode) selectExpr.getChild(0);
+      if (expression.getType() == HiveParser.DOT &&
+          expression.getChildCount() == 2 &&
+          expression.getChild(0).getType() == HiveParser.TOK_TABLE_OR_COL) {
+        selectedColumns.add(expression.getChild(1).getText());
+      }
+    }
+
+    for (FieldSchema fieldSchema : mvTable.getStorageHandler().acidSelectColumns(mvTable, Context.Operation.DELETE)) {
+      if (!selectedColumns.contains(fieldSchema.getName())) {
+        ParseDriver.adaptor.addChild(selectNode, wrapIntoSelExpr(
+            createQualifiedColumnNode(tableName, fieldSchema.getName())));
+      }
+    }
+  }
+
   @Override
   protected List<ASTNode> createAcidSortNodesInternal(String tableName) {
     return mvTable.getStorageHandler().acidSortColumns(mvTable, Context.Operation.DELETE).stream()

(hive) branch master updated: HIVE-28127: Exception when rebuilding materialized view with calculated columns on iceberg sources (Krisztian Kasa, reviewed by Denys Kuzmenko)

Reply via email to