[
https://issues.apache.org/jira/browse/CALCITE-2158?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16479476#comment-16479476
]
Volodymyr Vysotskyi commented on CALCITE-2158:
----------------------------------------------
Realized that this aggregate call may be used to check that aggregate call
returns the single value, so it is not needed to remove it from the aggregate.
But the real problem appears in the case when the query has several correlated
subqueries. This test:
{code:java}
@Test public void testRelDecorrelatorWithComplexFilters() {
final HepProgram program = HepProgram.builder()
.build();
final String sql = "SELECT cs1.sal\n"
+ "FROM emp cs1,\n"
+ " emp cs3\n"
+ "WHERE cs1.ename = cs3.ename\n"
+ "AND EXISTS\n"
+ "(SELECT *\n"
+ "FROM emp cs2\n"
+ "WHERE cs1.sal = cs2.sal\n"
+ "AND cs1.deptno <> cs2.deptno)"
+ "AND EXISTS"
+ "(SELECT *\n"
+ "FROM emp cr1\n"
+ "WHERE cs1.sal = cr1.sal)";
sql(sql)
.withDecorrelation(true)
.expand(true)
.with(program)
.checkUnchanged();
}
{code}
Returns plan with join with true in condition
{noformat}
LogicalProject(SAL=[$5])
LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],
SAL=[$5], COMM=[$6], DEPTNO=[$7], SLACKER=[$8], EMPNO0=[$9], ENAME0=[$10],
JOB0=[$11], MGR0=[$12], HIREDATE0=[$13], SAL0=[$14], COMM0=[$15],
DEPTNO0=[$16], SLACKER0=[$17], $f0=[$18], $f019=[$20])
LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],
SAL=[$5], COMM=[$6], DEPTNO=[$7], SLACKER=[$8], EMPNO0=[$9], ENAME0=[$10],
JOB0=[$11], MGR0=[$12], HIREDATE0=[$13], SAL0=[$14], COMM0=[$15],
DEPTNO0=[$16], SLACKER0=[$17], $f0=[$18], SAL9=[CAST($19):INTEGER],
$f1=[CAST($20):BOOLEAN])
LogicalJoin(condition=[=($5, $19)], joinType=[inner])
LogicalFilter(condition=[AND(=($1, $10), IS NOT NULL($18))])
LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3],
HIREDATE=[$4], SAL=[$5], COMM=[$6], DEPTNO=[$7], SLACKER=[$8], EMPNO0=[$9],
ENAME0=[$10], JOB0=[$11], MGR0=[$12], HIREDATE0=[$13], SAL0=[$14], COMM0=[$15],
DEPTNO0=[$16], SLACKER0=[$17], $f0=[$20])
LogicalJoin(condition=[AND(=($5, $18), =($7, $19))],
joinType=[left])
LogicalJoin(condition=[true], joinType=[inner])
LogicalTableScan(table=[[CATALOG, SALES, EMP]])
LogicalTableScan(table=[[CATALOG, SALES, EMP]])
LogicalAggregate(group=[{0, 1}], agg#0=[MIN($2)])
LogicalProject(SAL0=[$1], DEPTNO0=[$2], $f0=[$0])
LogicalProject($f0=[true], SAL0=[$9], DEPTNO0=[$10])
LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3],
HIREDATE=[$4], SAL=[$5], COMM=[$6], DEPTNO=[$7], SLACKER=[$8], SAL0=[$9],
DEPTNO0=[$10])
LogicalJoin(condition=[AND(=($9, $5), <>($10, $7))],
joinType=[inner])
LogicalTableScan(table=[[CATALOG, SALES, EMP]])
LogicalAggregate(group=[{0, 1}])
LogicalProject(SAL=[$5], DEPTNO=[$7])
LogicalJoin(condition=[true], joinType=[inner])
LogicalTableScan(table=[[CATALOG, SALES, EMP]])
LogicalTableScan(table=[[CATALOG, SALES, EMP]])
LogicalAggregate(group=[{0}], agg#0=[MIN($1)])
LogicalProject(SAL9=[$1], $f0=[$0])
LogicalProject($f0=[true], SAL9=[$9])
LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3],
HIREDATE=[$4], SAL=[$5], COMM=[$6], DEPTNO=[$7], SLACKER=[$8], SAL9=[$5])
LogicalTableScan(table=[[CATALOG, SALES, EMP]])
{noformat}
It happens because {{HepPlanner}} in {{RelDecorrelator}} partially optimizes
input plan:
{noformat}
LogicalProject(SAL=[$5]): rowcount = 1.0, cumulative cost = 4049.750000000001,
id = 67
LogicalFilter(condition=[AND(=($1, $10), IS NOT NULL($18), IS NOT
NULL($19))]): rowcount = 1.0, cumulative cost = 4048.750000000001, id = 65
LogicalCorrelate(correlation=[$cor2], joinType=[left],
requiredColumns=[{5}]): rowcount = 1.0, cumulative cost = 4047.750000000001, id
= 63
LogicalCorrelate(correlation=[$cor0], joinType=[left],
requiredColumns=[{5, 7}]): rowcount = 1.0, cumulative cost =
4002.9000000000005, id = 52
LogicalJoin(condition=[true], joinType=[inner]): rowcount = 196.0,
cumulative cost = 224.0, id = 41
LogicalTableScan(table=[[CATALOG, SALES, EMP]]): rowcount = 14.0,
cumulative cost = 14.0, id = 22
LogicalTableScan(table=[[CATALOG, SALES, EMP]]): rowcount = 14.0,
cumulative cost = 14.0, id = 23
LogicalAggregate(group=[{}], agg#0=[MIN($0)]): rowcount = 1.0,
cumulative cost = 18.275000000000002, id = 50
LogicalProject($f0=[true]): rowcount = 1.05, cumulative cost =
17.150000000000002, id = 48
LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3],
HIREDATE=[$4], SAL=[$5], COMM=[$6], DEPTNO=[$7], SLACKER=[$8]): rowcount =
1.05, cumulative cost = 16.1, id = 46
LogicalFilter(condition=[AND(=($cor0.SAL, $5), <>($cor0.DEPTNO,
$7))]): rowcount = 1.05, cumulative cost = 15.05, id = 44
LogicalTableScan(table=[[CATALOG, SALES, EMP]]): rowcount =
14.0, cumulative cost = 14.0, id = 25
LogicalAggregate(group=[{}], agg#0=[MIN($0)]): rowcount = 1.0, cumulative
cost = 21.425000000000004, id = 61
LogicalProject($f0=[true]): rowcount = 2.1, cumulative cost =
20.300000000000004, id = 59
LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3],
HIREDATE=[$4], SAL=[$5], COMM=[$6], DEPTNO=[$7], SLACKER=[$8]): rowcount = 2.1,
cumulative cost = 18.200000000000003, id = 57
LogicalFilter(condition=[=($cor2.SAL, $5)]): rowcount = 2.1,
cumulative cost = 16.1, id = 55
LogicalTableScan(table=[[CATALOG, SALES, EMP]]): rowcount = 14.0,
cumulative cost = 14.0, id = 31
{noformat}
Plan after applying {{HepPlanner}} (stage before performing decorrelation
itself):
{noformat}
LogicalProject(SAL=[$5]): rowcount = 1.0, cumulative cost = 4051.8850000000007,
id = 97
LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],
SAL=[$5], COMM=[$6], DEPTNO=[$7], SLACKER=[$8], EMPNO0=[$9], ENAME0=[$10],
JOB0=[$11], MGR0=[$12], HIREDATE0=[$13], SAL0=[$14], COMM0=[$15],
DEPTNO0=[$16], SLACKER0=[$17], $f0=[$18], $f019=[$19]): rowcount = 1.0,
cumulative cost = 4050.8850000000007, id = 124
LogicalFilter(condition=[IS NOT NULL($19)]): rowcount = 1.0, cumulative
cost = 4049.8850000000007, id = 132
LogicalCorrelate(correlation=[$cor2], joinType=[left],
requiredColumns=[{5}]): rowcount = 1.0, cumulative cost = 4048.8850000000007,
id = 130
LogicalFilter(condition=[AND(=($1, $10), IS NOT NULL($18))]): rowcount
= 1.0, cumulative cost = 4004.9000000000005, id = 126
LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3],
HIREDATE=[$4], SAL=[$5], COMM=[$6], DEPTNO=[$7], SLACKER=[$8], EMPNO0=[$9],
ENAME0=[$10], JOB0=[$11], MGR0=[$12], HIREDATE0=[$13], SAL0=[$14], COMM0=[$15],
DEPTNO0=[$16], SLACKER0=[$17], $f0=[$18]): rowcount = 1.0, cumulative cost =
4003.9000000000005, id = 116
LogicalCorrelate(correlation=[$cor0], joinType=[left],
requiredColumns=[{5, 7}]): rowcount = 1.0, cumulative cost =
4002.9000000000005, id = 114
LogicalJoin(condition=[true], joinType=[inner]): rowcount =
196.0, cumulative cost = 224.0, id = 71
LogicalTableScan(table=[[CATALOG, SALES, EMP]]): rowcount =
14.0, cumulative cost = 14.0, id = 22
LogicalTableScan(table=[[CATALOG, SALES, EMP]]): rowcount =
14.0, cumulative cost = 14.0, id = 23
LogicalAggregate(group=[{}], agg#0=[MIN($0)]): rowcount = 1.0,
cumulative cost = 18.275000000000002, id = 80
LogicalProject($f0=[true]): rowcount = 1.05, cumulative cost =
17.150000000000002, id = 78
LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3],
HIREDATE=[$4], SAL=[$5], COMM=[$6], DEPTNO=[$7], SLACKER=[$8]): rowcount =
1.05, cumulative cost = 16.1, id = 76
LogicalFilter(condition=[AND(=($cor0.SAL, $5),
<>($cor0.DEPTNO, $7))]): rowcount = 1.05, cumulative cost = 15.05, id = 74
LogicalTableScan(table=[[CATALOG, SALES, EMP]]): rowcount
= 14.0, cumulative cost = 14.0, id = 25
LogicalAggregate(group=[{}], agg#0=[MIN($0)]): rowcount = 1.0,
cumulative cost = 21.425000000000004, id = 91
LogicalProject($f0=[true]): rowcount = 2.1, cumulative cost =
20.300000000000004, id = 89
LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3],
HIREDATE=[$4], SAL=[$5], COMM=[$6], DEPTNO=[$7], SLACKER=[$8]): rowcount = 2.1,
cumulative cost = 18.200000000000003, id = 87
LogicalFilter(condition=[=($cor2.SAL, $5)]): rowcount = 2.1,
cumulative cost = 16.1, id = 85
LogicalTableScan(table=[[CATALOG, SALES, EMP]]): rowcount =
14.0, cumulative cost = 14.0, id = 31
{noformat}
But some rules may be applied more times to receive more optimal plans because
for this case some joins and filters are split by correlates and after
decorrelating it is impossible to push those filters past aggregates.
> SubQuery with EXISTS clause creates redundant aggregate call
> ------------------------------------------------------------
>
> Key: CALCITE-2158
> URL: https://issues.apache.org/jira/browse/CALCITE-2158
> Project: Calcite
> Issue Type: Bug
> Reporter: Volodymyr Vysotskyi
> Assignee: Julian Hyde
> Priority: Major
>
> When {{SqlToRelConverter.Config.isExpand()}} returns true, subqueries are
> expanded in {{SqlToRelConverter}}.
> Then for the queries, like this:
> {code:sql}
> SELECT cs1.sal
> FROM emp cs1
> WHERE EXISTS
> (SELECT *
> FROM emp cs2
> WHERE cs1.sal = cs2.sal
> AND cs1.deptno <> cs2.deptno)
> {code}
> Calcite returns logical plan with excessive aggregate calls:
> {noformat}
> LogicalProject(SAL=[$5])
> LogicalFilter(condition=[IS NOT NULL($9)])
> LogicalCorrelate(correlation=[$cor0], joinType=[left],
> requiredColumns=[{5, 7}])
> LogicalTableScan(table=[[CATALOG, SALES, EMP]])
> LogicalAggregate(group=[{}], agg#0=[MIN($0)])
> LogicalProject($f0=[true])
> LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3],
> HIREDATE=[$4], SAL=[$5], COMM=[$6], DEPTNO=[$7], SLACKER=[$8])
> LogicalFilter(condition=[AND(=($cor0.SAL, $5), <>($cor0.DEPTNO,
> $7))])
> LogicalTableScan(table=[[CATALOG, SALES, EMP]])
> {noformat}
> But when {{SqlToRelConverter.Config.isExpand()}} returns false and
> SubQueryRemoveRule rules are applied to the logical plan with RexSubQuery,
> the resulting logical plan is correct and does not contain excessive
> aggregate calls:
> {noformat}
> LogicalProject(SAL=[$5])
> LogicalProject(EMPNO=[$0], ENAME=[$1], JOB=[$2], MGR=[$3], HIREDATE=[$4],
> SAL=[$5], COMM=[$6], DEPTNO=[$7], SLACKER=[$8])
> LogicalCorrelate(correlation=[$cor0], joinType=[inner],
> requiredColumns=[{5, 7}])
> LogicalTableScan(table=[[CATALOG, SALES, EMP]])
> LogicalAggregate(group=[{0}])
> LogicalProject(i=[true])
> LogicalFilter(condition=[AND(=($cor0.SAL, $5), <>($cor0.DEPTNO,
> $7))])
> LogicalTableScan(table=[[CATALOG, SALES, EMP]])
> {noformat}
> These cases may be observed using this unit test:
> {code:java}
> @Test public void testExistsExpand() {
> final HepProgram preProgram = HepProgram.builder()
> .addRuleInstance(SubQueryRemoveRule.FILTER)
> .addRuleInstance(SubQueryRemoveRule.PROJECT)
> .addRuleInstance(SubQueryRemoveRule.JOIN)
> .build();
> final HepProgram program = HepProgram.builder()
> .build();
> final String sql = "SELECT cs1.sal\n"
> + "FROM emp cs1 \n"
> + "WHERE EXISTS\n"
> + "(SELECT *\n"
> + "FROM emp cs2\n"
> + "WHERE cs1.sal = cs2.sal\n"
> + "AND cs1.deptno <> cs2.deptno)";
> sql(sql)
> .withDecorrelation(false)
> .withTrim(false)
> .expand(true) // change to false
> .withPre(preProgram)
> .with(program)
> .checkUnchanged();
> }
> {code}
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)