This is an automated email from the ASF dual-hosted git repository.
krisztiankasa pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new f829ff986e9 HIVE-27727: Materialized view query rewrite fails if query
has decimal derived aggregate (Krisztian Kasa, reviewed by Stamatis Zampetakis)
f829ff986e9 is described below
commit f829ff986e919cc004c19770030b823d35b2281f
Author: Krisztian Kasa <[email protected]>
AuthorDate: Wed Sep 27 05:53:57 2023 +0200
HIVE-27727: Materialized view query rewrite fails if query has decimal
derived aggregate (Krisztian Kasa, reviewed by Stamatis Zampetakis)
---
.../ql/optimizer/calcite/HiveTypeSystemImpl.java | 11 ++--
.../hadoop/hive/ql/udf/generic/GenericUDAFSum.java | 3 +-
.../materialized_view_create_rewrite_11.q | 18 +++++++
.../clientpositive/llap/decimal_precision.q.out | 2 +-
.../llap/materialized_view_create_rewrite_11.q.out | 62 ++++++++++++++++++++++
.../llap/materialized_view_create_rewrite_6.q.out | 6 +--
.../llap/vector_decimal_precision.q.out | 12 ++---
.../perf/tpcds30tb/tez/cbo_query13.q.out | 2 +-
.../perf/tpcds30tb/tez/query13.q.out | 2 +-
9 files changed, 101 insertions(+), 17 deletions(-)
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveTypeSystemImpl.java
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveTypeSystemImpl.java
index 5e301fdbcbd..229801bb154 100644
---
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveTypeSystemImpl.java
+++
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveTypeSystemImpl.java
@@ -22,6 +22,8 @@ import org.apache.calcite.rel.type.RelDataTypeFactory;
import org.apache.calcite.rel.type.RelDataTypeSystemImpl;
import org.apache.calcite.sql.type.SqlTypeName;
+import static
org.apache.hadoop.hive.ql.udf.generic.GenericUDAFSum.SUM_RESULT_PRECISION_INCREASE;
+
public class HiveTypeSystemImpl extends RelDataTypeSystemImpl {
// TODO: This should come from type system; Currently there is no definition
// in type system for this.
@@ -178,10 +180,11 @@ public class HiveTypeSystemImpl extends
RelDataTypeSystemImpl {
// In Hive, SUM aggregate on decimal column will add 10 to compute
// the output precision; see
//
GenericUDAFSum.GenericUDAFSumHiveDecimal#getOutputDecimalTypeInfoForSum
- return typeFactory.createSqlType(
- SqlTypeName.DECIMAL,
- Math.min(MAX_DECIMAL_PRECISION, argumentType.getPrecision() + 10),
- argumentType.getScale());
+ return typeFactory.createTypeWithNullability(
+ typeFactory.createSqlType(
+ SqlTypeName.DECIMAL,
+ Math.min(MAX_DECIMAL_PRECISION, argumentType.getPrecision() +
SUM_RESULT_PRECISION_INCREASE),
+ argumentType.getScale()), argumentType.isNullable());
}
return argumentType;
}
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFSum.java
b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFSum.java
index 40c7a7d7b5e..a78b5f5734d 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFSum.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDAFSum.java
@@ -60,6 +60,7 @@ import org.slf4j.LoggerFactory;
public class GenericUDAFSum extends AbstractGenericUDAFResolver {
static final Logger LOG =
LoggerFactory.getLogger(GenericUDAFSum.class.getName());
+ public static final int SUM_RESULT_PRECISION_INCREASE = 10;
@Override
public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters)
@@ -252,7 +253,7 @@ public class GenericUDAFSum extends
AbstractGenericUDAFResolver {
// 10b rows. The scale is the same as the input.
DecimalTypeInfo outputTypeInfo = null;
if (mode == Mode.PARTIAL1 || mode == Mode.COMPLETE) {
- int precision = Math.min(HiveDecimal.MAX_PRECISION, inputPrecision +
10);
+ int precision = Math.min(HiveDecimal.MAX_PRECISION, inputPrecision +
SUM_RESULT_PRECISION_INCREASE);
outputTypeInfo = TypeInfoFactory.getDecimalTypeInfo(precision,
inputScale);
} else {
outputTypeInfo = TypeInfoFactory.getDecimalTypeInfo(inputPrecision,
inputScale);
diff --git
a/ql/src/test/queries/clientpositive/materialized_view_create_rewrite_11.q
b/ql/src/test/queries/clientpositive/materialized_view_create_rewrite_11.q
new file mode 100644
index 00000000000..b7933e5949d
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/materialized_view_create_rewrite_11.q
@@ -0,0 +1,18 @@
+-- Test rewrite when the query has an aggregate that can be derived from MV aggregates:
avg(x) = sum(x)/count(x)
+
+set hive.support.concurrency=true;
+set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager;
+
+set hive.stats.autogather=false;
+
+create table t1 (a int, b decimal(3,2)) stored as orc TBLPROPERTIES
('transactional'='true');
+
+insert into t1 values (1,1), (2,1), (3,3);
+
+create materialized view mv1 as
+select a, sum(b), count(b) from t1 group by a;
+
+explain cbo
+select a, avg(b) from t1 group by a;
+
+select a, avg(b) from t1 group by a;
diff --git a/ql/src/test/results/clientpositive/llap/decimal_precision.q.out
b/ql/src/test/results/clientpositive/llap/decimal_precision.q.out
index 32c7fc7844a..66dca467fd8 100644
--- a/ql/src/test/results/clientpositive/llap/decimal_precision.q.out
+++ b/ql/src/test/results/clientpositive/llap/decimal_precision.q.out
@@ -575,7 +575,7 @@ STAGE PLANS:
outputColumnNames: _col0, _col1
Statistics: Num rows: 1 Data size: 232 Basic stats: COMPLETE
Column stats: NONE
Select Operator
- expressions: CAST( (_col0 / _col1) AS decimal(24,14)) (type:
decimal(24,14)), CAST( _col0 AS decimal(30,10)) (type: decimal(30,10))
+ expressions: CAST( (_col0 / _col1) AS decimal(24,14)) (type:
decimal(24,14)), _col0 (type: decimal(30,10))
outputColumnNames: _col0, _col1
Statistics: Num rows: 1 Data size: 232 Basic stats: COMPLETE
Column stats: NONE
File Output Operator
diff --git
a/ql/src/test/results/clientpositive/llap/materialized_view_create_rewrite_11.q.out
b/ql/src/test/results/clientpositive/llap/materialized_view_create_rewrite_11.q.out
new file mode 100644
index 00000000000..34087c975a5
--- /dev/null
+++
b/ql/src/test/results/clientpositive/llap/materialized_view_create_rewrite_11.q.out
@@ -0,0 +1,62 @@
+PREHOOK: query: create table t1 (a int, b decimal(3,2)) stored as orc
TBLPROPERTIES ('transactional'='true')
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t1
+POSTHOOK: query: create table t1 (a int, b decimal(3,2)) stored as orc
TBLPROPERTIES ('transactional'='true')
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t1
+PREHOOK: query: insert into t1 values (1,1), (2,1), (3,3)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@t1
+POSTHOOK: query: insert into t1 values (1,1), (2,1), (3,3)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@t1
+POSTHOOK: Lineage: t1.a SCRIPT []
+POSTHOOK: Lineage: t1.b SCRIPT []
+PREHOOK: query: create materialized view mv1 as
+select a, sum(b), count(b) from t1 group by a
+PREHOOK: type: CREATE_MATERIALIZED_VIEW
+PREHOOK: Input: default@t1
+PREHOOK: Output: database:default
+PREHOOK: Output: default@mv1
+POSTHOOK: query: create materialized view mv1 as
+select a, sum(b), count(b) from t1 group by a
+POSTHOOK: type: CREATE_MATERIALIZED_VIEW
+POSTHOOK: Input: default@t1
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@mv1
+POSTHOOK: Lineage: mv1._c1 EXPRESSION [(t1)t1.FieldSchema(name:b,
type:decimal(3,2), comment:null), ]
+POSTHOOK: Lineage: mv1._c2 EXPRESSION [(t1)t1.FieldSchema(name:b,
type:decimal(3,2), comment:null), ]
+POSTHOOK: Lineage: mv1.a SIMPLE [(t1)t1.FieldSchema(name:a, type:int,
comment:null), ]
+PREHOOK: query: explain cbo
+select a, avg(b) from t1 group by a
+PREHOOK: type: QUERY
+PREHOOK: Input: default@mv1
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: explain cbo
+select a, avg(b) from t1 group by a
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@mv1
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+CBO PLAN:
+HiveProject(a=[$0], _o__c1=[CAST(/($1, $2)):DECIMAL(7, 6)])
+ HiveTableScan(table=[[default, mv1]], table:alias=[default.mv1])
+
+PREHOOK: query: select a, avg(b) from t1 group by a
+PREHOOK: type: QUERY
+PREHOOK: Input: default@mv1
+PREHOOK: Input: default@t1
+#### A masked pattern was here ####
+POSTHOOK: query: select a, avg(b) from t1 group by a
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@mv1
+POSTHOOK: Input: default@t1
+#### A masked pattern was here ####
+3 3.000000
+1 1.000000
+2 1.000000
diff --git
a/ql/src/test/results/clientpositive/llap/materialized_view_create_rewrite_6.q.out
b/ql/src/test/results/clientpositive/llap/materialized_view_create_rewrite_6.q.out
index 49c23f6da6d..baeed8afc97 100644
---
a/ql/src/test/results/clientpositive/llap/materialized_view_create_rewrite_6.q.out
+++
b/ql/src/test/results/clientpositive/llap/materialized_view_create_rewrite_6.q.out
@@ -246,7 +246,7 @@ POSTHOOK: Input: default@t2
POSTHOOK: Output: default@mat1
POSTHOOK: Output: default@mat1
CBO PLAN:
-HiveProject(a=[$5], _o__c1=[CAST(CASE(IS NULL($1), $6, +($6, $1))):DECIMAL(17,
2)], _o__c2=[CASE(IS NULL($2), $7, +($7, $2))], _o__c3=[CAST(/(CAST(CASE(IS
NULL($1), $6, +($6, $1))):DECIMAL(17, 2), CASE(IS NULL($2), $7, +($7,
$2)))):DECIMAL(11, 6)], _o__c4=[CASE(IS NULL($3), $8, +($8, $3))])
+HiveProject(a=[$5], _o__c1=[CAST(CASE(IS NULL($1), $6, IS NULL($6), $1, +($6,
$1))):DECIMAL(17, 2)], _o__c2=[CASE(IS NULL($2), $7, +($7, $2))],
_o__c3=[CAST(/(CAST(CASE(IS NULL($1), $6, IS NULL($6), $1, +($6,
$1))):DECIMAL(17, 2), CASE(IS NULL($2), $7, +($7, $2)))):DECIMAL(11, 6)],
_o__c4=[CASE(IS NULL($3), $8, +($8, $3))])
HiveFilter(condition=[OR(AND($4, OR(AND(IS NULL($3), =($8, 0)), AND(=(+($8,
$3), 0), IS NOT NULL($3)))), AND(IS NULL($4), OR(AND(IS NULL($3), >($8, 0)),
AND(>(+($8, $3), 0), IS NOT NULL($3)))), AND($4, OR(AND(IS NULL($3), >($8, 0)),
AND(>(+($8, $3), 0), IS NOT NULL($3)))))])
HiveJoin(condition=[IS NOT DISTINCT FROM($0, $5)], joinType=[right],
algorithm=[none], cost=[not available])
HiveProject(a=[$0], _c1=[$1], _c2=[$2], _c4=[$4], $f4=[true])
@@ -413,7 +413,7 @@ STAGE PLANS:
predicate: (_col4 and ((_col3 is null and (_col9 > 0L)) or
(((_col9 + _col3) > 0) and _col3 is not null))) (type: boolean)
Statistics: Num rows: 1 Data size: 429 Basic stats: COMPLETE
Column stats: COMPLETE
Select Operator
- expressions: _col6 (type: char(15)), CAST( if(_col1 is
null, _col7, (_col7 + _col1)) AS decimal(17,2)) (type: decimal(17,2)), if(_col2
is null, _col8, (_col8 + _col2)) (type: bigint), CAST( (CAST( if(_col1 is null,
_col7, (_col7 + _col1)) AS decimal(17,2)) / if(_col2 is null, _col8, (_col8 +
_col2))) AS decimal(11,6)) (type: decimal(11,6)), if(_col3 is null, _col9,
(_col9 + _col3)) (type: bigint)
+ expressions: _col6 (type: char(15)), CAST( CASE WHEN
(_col1 is null) THEN (_col7) WHEN (_col7 is null) THEN (_col1) ELSE ((_col7 +
_col1)) END AS decimal(17,2)) (type: decimal(17,2)), if(_col2 is null, _col8,
(_col8 + _col2)) (type: bigint), CAST( (CAST( CASE WHEN (_col1 is null) THEN
(_col7) WHEN (_col7 is null) THEN (_col1) ELSE ((_col7 + _col1)) END AS
decimal(17,2)) / if(_col2 is null, _col8, (_col8 + _col2))) AS decimal(11,6))
(type: decimal(11,6)), if(_col3 is n [...]
outputColumnNames: _col0, _col1, _col2, _col3, _col4
Statistics: Num rows: 1 Data size: 333 Basic stats:
COMPLETE Column stats: COMPLETE
File Output Operator
@@ -444,7 +444,7 @@ STAGE PLANS:
predicate: (_col4 is null and ((_col3 is null and (_col9 >
0L)) or (((_col9 + _col3) > 0) and _col3 is not null))) (type: boolean)
Statistics: Num rows: 1 Data size: 429 Basic stats: COMPLETE
Column stats: COMPLETE
Select Operator
- expressions: _col6 (type: char(15)), CAST( if(_col1 is
null, _col7, (_col7 + _col1)) AS decimal(17,2)) (type: decimal(17,2)), if(_col2
is null, _col8, (_col8 + _col2)) (type: bigint), CAST( (CAST( if(_col1 is null,
_col7, (_col7 + _col1)) AS decimal(17,2)) / if(_col2 is null, _col8, (_col8 +
_col2))) AS decimal(11,6)) (type: decimal(11,6)), if(_col3 is null, _col9,
(_col9 + _col3)) (type: bigint)
+ expressions: _col6 (type: char(15)), CAST( CASE WHEN
(_col1 is null) THEN (_col7) WHEN (_col7 is null) THEN (_col1) ELSE ((_col7 +
_col1)) END AS decimal(17,2)) (type: decimal(17,2)), if(_col2 is null, _col8,
(_col8 + _col2)) (type: bigint), CAST( (CAST( CASE WHEN (_col1 is null) THEN
(_col7) WHEN (_col7 is null) THEN (_col1) ELSE ((_col7 + _col1)) END AS
decimal(17,2)) / if(_col2 is null, _col8, (_col8 + _col2))) AS decimal(11,6))
(type: decimal(11,6)), if(_col3 is n [...]
outputColumnNames: _col0, _col1, _col2, _col3, _col4
Statistics: Num rows: 1 Data size: 333 Basic stats:
COMPLETE Column stats: COMPLETE
File Output Operator
diff --git
a/ql/src/test/results/clientpositive/llap/vector_decimal_precision.q.out
b/ql/src/test/results/clientpositive/llap/vector_decimal_precision.q.out
index b215702ffd3..25ab3f04fd3 100644
--- a/ql/src/test/results/clientpositive/llap/vector_decimal_precision.q.out
+++ b/ql/src/test/results/clientpositive/llap/vector_decimal_precision.q.out
@@ -664,13 +664,13 @@ STAGE PLANS:
outputColumnNames: _col0, _col1
Statistics: Num rows: 1 Data size: 120 Basic stats: COMPLETE
Column stats: COMPLETE
Select Operator
- expressions: CAST( (_col0 / _col1) AS decimal(24,14)) (type:
decimal(24,14)), CAST( _col0 AS decimal(30,10)) (type: decimal(30,10))
+ expressions: CAST( (_col0 / _col1) AS decimal(24,14)) (type:
decimal(24,14)), _col0 (type: decimal(30,10))
outputColumnNames: _col0, _col1
Select Vectorization:
className: VectorSelectOperator
native: true
- projectedOutputColumnNums: [4, 5]
- selectExpressions: CastDecimalToDecimal(col
3:decimal(38,18))(children: DecimalColDivideDecimalColumn(col 0:decimal(30,10),
col 2:decimal(19,0))(children: CastLongToDecimal(col 1:bigint) ->
2:decimal(19,0)) -> 3:decimal(38,18)) -> 4:decimal(24,14),
CastDecimalToDecimal(col 0:decimal(30,10)) -> 5:decimal(30,10)
+ projectedOutputColumnNums: [4, 0]
+ selectExpressions: CastDecimalToDecimal(col
3:decimal(38,18))(children: DecimalColDivideDecimalColumn(col 0:decimal(30,10),
col 2:decimal(19,0))(children: CastLongToDecimal(col 1:bigint) ->
2:decimal(19,0)) -> 3:decimal(38,18)) -> 4:decimal(24,14)
Statistics: Num rows: 1 Data size: 224 Basic stats: COMPLETE
Column stats: COMPLETE
File Output Operator
compressed: false
@@ -1288,13 +1288,13 @@ STAGE PLANS:
outputColumnNames: _col0, _col1
Statistics: Num rows: 1 Data size: 232 Basic stats: COMPLETE
Column stats: NONE
Select Operator
- expressions: CAST( (_col0 / _col1) AS decimal(24,14)) (type:
decimal(24,14)), CAST( _col0 AS decimal(30,10)) (type: decimal(30,10))
+ expressions: CAST( (_col0 / _col1) AS decimal(24,14)) (type:
decimal(24,14)), _col0 (type: decimal(30,10))
outputColumnNames: _col0, _col1
Select Vectorization:
className: VectorSelectOperator
native: true
- projectedOutputColumnNums: [4, 5]
- selectExpressions: CastDecimalToDecimal(col
3:decimal(38,18))(children: DecimalColDivideDecimalColumn(col 0:decimal(30,10),
col 2:decimal(19,0))(children: CastLongToDecimal(col 1:bigint) ->
2:decimal(19,0)) -> 3:decimal(38,18)) -> 4:decimal(24,14),
CastDecimalToDecimal(col 0:decimal(30,10)) -> 5:decimal(30,10)
+ projectedOutputColumnNums: [4, 0]
+ selectExpressions: CastDecimalToDecimal(col
3:decimal(38,18))(children: DecimalColDivideDecimalColumn(col 0:decimal(30,10),
col 2:decimal(19,0))(children: CastLongToDecimal(col 1:bigint) ->
2:decimal(19,0)) -> 3:decimal(38,18)) -> 4:decimal(24,14)
Statistics: Num rows: 1 Data size: 232 Basic stats: COMPLETE
Column stats: NONE
File Output Operator
compressed: false
diff --git
a/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/cbo_query13.q.out
b/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/cbo_query13.q.out
index aaeea7af89a..f7b701009cc 100644
--- a/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/cbo_query13.q.out
+++ b/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/cbo_query13.q.out
@@ -1,5 +1,5 @@
CBO PLAN:
-HiveProject(_o__c0=[/(CAST($0):DOUBLE, $1)], _o__c1=[CAST(/($2,
$3)):DECIMAL(11, 6)], _o__c2=[CAST(/($4, $5)):DECIMAL(11, 6)],
_o__c3=[CAST($4):DECIMAL(17, 2)])
+HiveProject(_o__c0=[/(CAST($0):DOUBLE, $1)], _o__c1=[CAST(/($2,
$3)):DECIMAL(11, 6)], _o__c2=[CAST(/($4, $5)):DECIMAL(11, 6)], _o__c3=[$4])
HiveAggregate(group=[{}], agg#0=[sum($3)], agg#1=[count($3)],
agg#2=[sum($4)], agg#3=[count($4)], agg#4=[sum($5)], agg#5=[count($5)])
HiveJoin(condition=[AND(=($21, $0), OR(AND($22, $23, $10, $19), AND($24,
$25, $11, $20), AND($26, $27, $12, $20)))], joinType=[inner], algorithm=[none],
cost=[not available])
HiveJoin(condition=[=($1, $18)], joinType=[inner], algorithm=[none],
cost=[not available])
diff --git
a/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query13.q.out
b/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query13.q.out
index 0b11dce5b5e..113957a1384 100644
--- a/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query13.q.out
+++ b/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query13.q.out
@@ -200,7 +200,7 @@ STAGE PLANS:
outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
Statistics: Num rows: 1 Data size: 256 Basic stats: COMPLETE
Column stats: COMPLETE
Select Operator
- expressions: (UDFToDouble(_col0) / _col1) (type: double),
CAST( (_col2 / _col3) AS decimal(11,6)) (type: decimal(11,6)), CAST( (_col4 /
_col5) AS decimal(11,6)) (type: decimal(11,6)), CAST( _col4 AS decimal(17,2))
(type: decimal(17,2))
+ expressions: (UDFToDouble(_col0) / _col1) (type: double),
CAST( (_col2 / _col3) AS decimal(11,6)) (type: decimal(11,6)), CAST( (_col4 /
_col5) AS decimal(11,6)) (type: decimal(11,6)), _col4 (type: decimal(17,2))
outputColumnNames: _col0, _col1, _col2, _col3
Statistics: Num rows: 1 Data size: 344 Basic stats: COMPLETE
Column stats: COMPLETE
File Output Operator