Repository: lens Updated Branches: refs/heads/master 8923ce7ff -> 9497c8367
LENS-827 : Expression pushdown for query optimisation in JDBC Project: http://git-wip-us.apache.org/repos/asf/lens/repo Commit: http://git-wip-us.apache.org/repos/asf/lens/commit/9497c836 Tree: http://git-wip-us.apache.org/repos/asf/lens/tree/9497c836 Diff: http://git-wip-us.apache.org/repos/asf/lens/diff/9497c836 Branch: refs/heads/master Commit: 9497c8367c0d2e314d64d6a080db116f3a399cfe Parents: 8923ce7 Author: Sushil Mohanty <[email protected]> Authored: Fri Oct 16 18:14:25 2015 +0530 Committer: Amareshwari Sriramadasu <[email protected]> Committed: Fri Oct 16 18:14:25 2015 +0530 ---------------------------------------------------------------------- .../lens/driver/jdbc/ColumnarSQLRewriter.java | 46 +++- .../driver/jdbc/TestColumnarSQLRewriter.java | 211 ++++++++++++------- 2 files changed, 171 insertions(+), 86 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lens/blob/9497c836/lens-driver-jdbc/src/main/java/org/apache/lens/driver/jdbc/ColumnarSQLRewriter.java ---------------------------------------------------------------------- diff --git a/lens-driver-jdbc/src/main/java/org/apache/lens/driver/jdbc/ColumnarSQLRewriter.java b/lens-driver-jdbc/src/main/java/org/apache/lens/driver/jdbc/ColumnarSQLRewriter.java index db278ee..9e7da72 100644 --- a/lens-driver-jdbc/src/main/java/org/apache/lens/driver/jdbc/ColumnarSQLRewriter.java +++ b/lens-driver-jdbc/src/main/java/org/apache/lens/driver/jdbc/ColumnarSQLRewriter.java @@ -389,23 +389,57 @@ public class ColumnarSQLRewriter implements QueryRewriter { } /** - * Check if expression is used in select + * Check if expression is answerable from fact, then push it to fact pushdown subquery * * @param node * @return true if expressions is used */ - public boolean isExpressionsUsed(ASTNode node) { + public boolean isExpressionsAnswerableFromFact(ASTNode node) { + boolean isAnswerable = true; for (int i = 0; i < node.getChildCount(); i++) { if (node.getChild(i).getType() == HiveParser.TOK_SELEXPR) { int cnt = getColumnCount((ASTNode) node.getChild(i)); if (cnt >= 2) { - return true; + if (cnt == getNumFactTableInExpressions((ASTNode) node.getChild(i), new MutableInt(0))) { + isAnswerable = true; + } else { + isAnswerable = false; + } } } } - return false; + return isAnswerable; } + /** + * Get number of fact columns used in the an expression + * + * @param node + * @param count + * @return Number of fact columns used in expression + */ + private int getNumFactTableInExpressions(ASTNode node, MutableInt count) { + + if (node == null) { + log.debug("ASTNode is null "); + return 0; + } + if (node.getToken().getType() == HiveParser.TOK_TABLE_OR_COL) { + String factAlias = getFactAlias(); + String table = node.getChild(0).getText(); + if (table.equals(factAlias)) { + count.add(1); + } + } + for (int i = 0; i < node.getChildCount(); i++) { + ASTNode child = (ASTNode) node.getChild(i); + getNumFactTableInExpressions(child, count); + } + + return count.intValue(); + } + + /* * Get filter conditions if user has specified a join condition for filter pushdown. */ @@ -971,10 +1005,8 @@ public class ColumnarSQLRewriter implements QueryRewriter { // Construct the final fact in-line query with keys, // measures and individual sub queries built. - - if (whereTree == null || joinTree == null || allSubQueries.length() == 0 - || aggColumn.isEmpty() || isExpressionsUsed(selectAST)) { + || aggColumn.isEmpty() || !isExpressionsAnswerableFromFact(selectAST)) { log.info("@@@Query not eligible for inner subquery rewrite"); // construct query without fact sub query constructQuery(selectTree, whereTree, groupByTree, havingTree, orderByTree, limit); http://git-wip-us.apache.org/repos/asf/lens/blob/9497c836/lens-driver-jdbc/src/test/java/org/apache/lens/driver/jdbc/TestColumnarSQLRewriter.java ---------------------------------------------------------------------- diff --git a/lens-driver-jdbc/src/test/java/org/apache/lens/driver/jdbc/TestColumnarSQLRewriter.java b/lens-driver-jdbc/src/test/java/org/apache/lens/driver/jdbc/TestColumnarSQLRewriter.java index ff6531b..bda1095 100644 --- a/lens-driver-jdbc/src/test/java/org/apache/lens/driver/jdbc/TestColumnarSQLRewriter.java +++ b/lens-driver-jdbc/src/test/java/org/apache/lens/driver/jdbc/TestColumnarSQLRewriter.java @@ -413,24 +413,34 @@ public class TestColumnarSQLRewriter { String actual = qtest.rewrite(query, conf, hconf); String expected = "select ( sales_fact___fact . time_key ), ( time_dim___time_dim . day_of_week ), " - + "date(( time_dim___time_dim . day )), ( item_dim___item_dim . item_key ), case when " - + "(sum(( sales_fact___fact . dollars_sold )) = 0 ) then 0.0 else sum(( sales_fact___fact . dollars_sold )) " - + "end dollars_sold , format(sum(( sales_fact___fact . units_sold )), 4 ), " - + "format(avg(( sales_fact___fact . dollars_sold )), '##################.###' ), " - + "min(( sales_fact___fact . dollars_sold )), max(( sales_fact___fact . dollars_sold )) " - + "from sales_fact sales_fact___fact inner join " - + "(select time_key, day_of_week, day from time_dim) time_dim___time_dim on " - + "(( sales_fact___fact . time_key ) = ( time_dim___time_dim . time_key )) inner join " - + "(select location_key, location_name from location_dim) location_dim___location_dim " - + "on (( sales_fact___fact . location_key ) = " - + "( location_dim___location_dim . location_key )) inner join (select item_key, " - + "item_name from item_dim) item_dim___item_dim " - + "on ((( sales_fact___fact . item_key ) = ( item_dim___item_dim . item_key )) and " - + "(( location_dim___location_dim . location_name ) = 'test123' )) where " - + "(( time_dim___time_dim . time_key ) between date_add( '2013-01-01' , interval 1 day) " - + "and date_sub( '2013-01-31' , interval 3 day) and (( item_dim___item_dim . item_name ) = 'item_1' )) " - + "group by ( sales_fact___fact . time_key ), ( time_dim___time_dim . day_of_week ), " - + "( time_dim___time_dim . day ), ( item_dim___item_dim . item_key ) order by dollars_sold asc"; + + "date(( time_dim___time_dim . day )), ( item_dim___item_dim . item_key ), " + + "case when (sum(alias2) = 0 ) then 0.0 else sum(alias2) end dollars_sold , " + + "format(sum(alias3), 4 ), format(avg(alias4), '##################.###' ), " + + "min(alias5), max(alias6) from (select sales_fact___fact.time_key, " + + "sales_fact___fact.location_key, sales_fact___fact.item_key,sum(( sales_fact___fact . " + + "dollars_sold )) as alias1, sum(( sales_fact___fact . dollars_sold )) as alias2, " + + "sum(( sales_fact___fact . units_sold )) as alias3, avg(( sales_fact___fact . dollars_sold )) " + + "as alias4, min(( sales_fact___fact . dollars_sold )) as alias5, max(( sales_fact___fact . " + + "dollars_sold )) as alias6 from sales_fact sales_fact___fact where sales_fact___fact.time_key " + + "in ( select time_dim .time_key from time_dim where ( time_dim. time_key ) between " + + "date_add( '2013-01-01' , interval 1 day) and date_sub( '2013-01-31' , interval 3 day) ) " + + "and sales_fact___fact.location_key in ( select location_dim .location_key from " + + "location_dim where (( location_dim. location_name ) = 'test123' ) ) and " + + "sales_fact___fact.item_key in ( select item_dim .item_key from item_dim where " + + "(( item_dim. item_name ) = 'item_1' ) ) group by sales_fact___fact.time_key, " + + "sales_fact___fact.location_key, sales_fact___fact.item_key) sales_fact___fact " + + "inner join (select time_key,day_of_week,day from time_dim) time_dim___time_dim on " + + "(( sales_fact___fact . time_key ) = ( time_dim___time_dim . time_key )) inner join " + + "(select location_key,location_name from location_dim) location_dim___location_dim on " + + "(( sales_fact___fact . location_key ) = ( location_dim___location_dim . location_key )) " + + "inner join (select item_key,item_name from item_dim) item_dim___item_dim on " + + "((( sales_fact___fact . item_key ) = ( item_dim___item_dim . item_key )) and " + + "(( location_dim___location_dim . location_name ) = 'test123' )) where " + + "(( time_dim___time_dim . time_key ) between date_add( '2013-01-01' , interval 1 day) " + + "and date_sub( '2013-01-31' , interval 3 day) and (( item_dim___item_dim . item_name ) = " + + "'item_1' )) group by ( sales_fact___fact . time_key ), ( time_dim___time_dim . day_of_week ), " + + "( time_dim___time_dim . day ), ( item_dim___item_dim . item_key ) " + + "order by dollars_sold asc"; compareQueries(expected, actual); } @@ -471,39 +481,60 @@ public class TestColumnarSQLRewriter { String actual = qtest.rewrite(query, conf, hconf); String expected = "select ( sales_fact___fact . time_key ), ( time_dim___time_dim . day_of_week ), " - + "( time_dim___time_dim . day ), case when (sum(( sales_fact___fact . dollars_sold )) = 0 ) " - + "then 0.0 else sum(( sales_fact___fact . dollars_sold )) end dollars_sold from sales_fact s" - + "ales_fact___fact inner join (select time_key, day_of_week, day from time_dim) " - + "time_dim___time_dim on (( sales_fact___fact . time_key ) " - + "= ( time_dim___time_dim . time_key )) inner join (select location_key, location_name from location_dim) " - + "location_dim___location_dim on " - + "((( sales_fact___fact . location_key ) = ( location_dim___location_dim . location_key )) and " - + "(( location_dim___location_dim . location_name ) = 'test123' )) where ( time_dim___time_dim . time_key ) " - + "between '2013-01-01' and '2013-01-05' group by ( sales_fact___fact . time_key ), " - + "( time_dim___time_dim . day_of_week ), ( time_dim___time_dim . day ) order by dollars_sold " - + "asc union all select ( sales_fact___fact . time_key ), ( time_dim___time_dim . day_of_week ), " - + "( time_dim___time_dim . day ), case when (sum(( sales_fact___fact . dollars_sold )) = 0 ) then 0.0 " - + "else sum(( sales_fact___fact . dollars_sold )) end dollars_sold from sales_fact sales_fact___fact " - + "inner join (select time_key, day_of_week, day from time_dim) time_dim___time_dim " - + "on (( sales_fact___fact . time_key ) = " - + "( time_dim___time_dim . time_key )) inner join (select location_key, " - + "location_name from location_dim) location_dim___location_dim on " - + "((( sales_fact___fact . location_key ) = ( location_dim___location_dim . location_key )) and " - + "(( location_dim___location_dim . location_name ) = 'test123' )) where ( time_dim___time_dim . time_key ) " - + "between '2013-02-01' and '2013-02-05' group by ( sales_fact___fact . time_key ), " - + "( time_dim___time_dim . day_of_week ), ( time_dim___time_dim . day ) order by dollars_sold asc union all " - + "select ( sales_fact___fact . time_key ), ( time_dim___time_dim . day_of_week ), " - + "( time_dim___time_dim . day ), case when (sum(( sales_fact___fact . dollars_sold )) = 0 ) then 0.0 " - + "else sum(( sales_fact___fact . dollars_sold )) end dollars_sold from sales_fact sales_fact___fact " - + "inner join (select time_key, day_of_week, day from time_dim) " - + "time_dim___time_dim on (( sales_fact___fact . time_key ) = " - + "( time_dim___time_dim . time_key )) inner join (select location_key, location_name from location_dim) " - + "location_dim___location_dim on " - + "((( sales_fact___fact . location_key ) = ( location_dim___location_dim . location_key )) and " - + "(( location_dim___location_dim . location_name ) = 'test123' )) where " - + "( time_dim___time_dim . time_key ) between '2013-03-01' and '2013-03-05' " - + " group by ( sales_fact___fact . time_key ), ( time_dim___time_dim . day_of_week ), " - + "( time_dim___time_dim . day ) order by dollars_sold asc"; + + "( time_dim___time_dim . day ), case when (sum(alias2) = 0 ) then 0.0 " + + "else sum(alias2) end dollars_sold from (select sales_fact___fact.time_key, " + + "sales_fact___fact.location_key,sum(( sales_fact___fact . dollars_sold )) as alias1, " + + "sum(( sales_fact___fact . dollars_sold )) as alias2 from sales_fact sales_fact___fact " + + "where sales_fact___fact.time_key in ( select time_dim .time_key from time_dim where " + + "( time_dim. time_key ) between '2013-01-01' and '2013-01-05' ) and " + + "sales_fact___fact.location_key in ( select location_dim .location_key from " + + "location_dim where (( location_dim. location_name ) = 'test123' ) ) group by " + + "sales_fact___fact.time_key, sales_fact___fact.location_key) sales_fact___fact " + + "inner join (select time_key,day_of_week,day from time_dim) time_dim___time_dim " + + "on (( sales_fact___fact . time_key ) = ( time_dim___time_dim . time_key )) " + + "inner join (select location_key,location_name from location_dim) " + + "location_dim___location_dim on ((( sales_fact___fact . location_key ) = " + + "( location_dim___location_dim . location_key )) and (( location_dim___location_dim . " + + "location_name ) = 'test123' )) where ( time_dim___time_dim . time_key ) " + + "between '2013-01-01' and '2013-01-05' group by ( sales_fact___fact . time_key ), " + + "( time_dim___time_dim . day_of_week ), ( time_dim___time_dim . day ) " + + "order by dollars_sold asc union all select ( sales_fact___fact . time_key ), " + + "( time_dim___time_dim . day_of_week ), ( time_dim___time_dim . day ), " + + "case when (sum(alias2) = 0 ) then 0.0 else sum(alias2) end dollars_sold " + + "from (select sales_fact___fact.time_key, sales_fact___fact.location_key," + + "sum(( sales_fact___fact . dollars_sold )) as alias1, sum(( sales_fact___fact . dollars_sold )) " + + "as alias2 from sales_fact sales_fact___fact where sales_fact___fact.time_key in " + + "( select time_dim .time_key from time_dim where ( time_dim. time_key ) between " + + " '2013-02-01' and '2013-02-05' ) and sales_fact___fact.location_key in (" + + " select location_dim .location_key from location_dim where (( location_dim. location_name ) " + + "= 'test123' ) ) group by sales_fact___fact.time_key, sales_fact___fact.location_key) " + + "sales_fact___fact inner join (select time_key,day_of_week,day from time_dim) " + + "time_dim___time_dim on (( sales_fact___fact . time_key ) = ( time_dim___time_dim . time_key )) " + + "inner join (select location_key,location_name from location_dim) location_dim___location_dim " + + "on ((( sales_fact___fact . location_key ) = ( location_dim___location_dim . location_key )) " + + "and (( location_dim___location_dim . location_name ) = 'test123' )) where " + + "( time_dim___time_dim . time_key ) between '2013-02-01' and '2013-02-05' " + + "group by ( sales_fact___fact . time_key ), ( time_dim___time_dim . day_of_week ), " + + "( time_dim___time_dim . day ) order by dollars_sold asc union all select " + + "( sales_fact___fact . time_key ), ( time_dim___time_dim . day_of_week ), " + + "( time_dim___time_dim . day ), case when (sum(alias2) = 0 ) then 0.0 " + + "else sum(alias2) end dollars_sold from (select sales_fact___fact.time_key, " + + "sales_fact___fact.location_key,sum(( sales_fact___fact . dollars_sold )) as alias1, " + + "sum(( sales_fact___fact . dollars_sold )) as alias2 from sales_fact sales_fact___fact " + + "where sales_fact___fact.time_key in ( select time_dim .time_key from time_dim where " + + "( time_dim. time_key ) between '2013-03-01' and '2013-03-05' ) " + + "and sales_fact___fact.location_key in ( select location_dim .location_key from " + + "location_dim where (( location_dim. location_name ) = 'test123' ) ) " + + "group by sales_fact___fact.time_key, sales_fact___fact.location_key) " + + "sales_fact___fact inner join (select time_key,day_of_week,day from time_dim) " + + "time_dim___time_dim on (( sales_fact___fact . time_key ) = " + + "( time_dim___time_dim . time_key )) inner join (select location_key,location_name " + + "from location_dim) location_dim___location_dim on ((( sales_fact___fact . location_key ) " + + "= ( location_dim___location_dim . location_key )) and " + + "(( location_dim___location_dim . location_name ) = 'test123' )) " + + "where ( time_dim___time_dim . time_key ) between '2013-03-01' and '2013-03-05' " + + "group by ( sales_fact___fact . time_key ), ( time_dim___time_dim . day_of_week ), " + + "( time_dim___time_dim . day ) order by dollars_sold asc"; compareQueries(expected, actual); } @@ -544,23 +575,37 @@ public class TestColumnarSQLRewriter { String actual = qtest.rewrite(query, conf, hconf); String expected = "select ( sales_fact___fact . time_key ), ( time_dim___time_dim . day_of_week ), " - + "( time_dim___time_dim . day ), ( item_dim___item_dim . item_key ), sum( case when " - + "(( sales_fact___fact . dollars_sold ) = 0 ) then 0.0 else ( sales_fact___fact . dollars_sold ) end ) " - + "dollars_sold , round(sum(( sales_fact___fact . units_sold )), 2 ), " - + "avg(( sales_fact___fact . dollars_sold )), " - + "min(( sales_fact___fact . dollars_sold )), max(( sales_fact___fact . dollars_sold )), location_name , " - + " from sales_fact sales_fact___fact inner join (select time_key, day_of_week, day" - + "from time_dim) time_dim___time_dim " - + "on (( sales_fact___fact . time_key ) = ( time_dim___time_dim . time_key )) " - + "inner join (select location_key, location_name from location_dim) " - + "location_dim___location_dim on (( sales_fact___fact . location_key ) = " - + "( location_dim___location_dim . location_key )) inner join (select item_key, " - + "item_name from item_dim) item_dim___item_dim on " - + "((( sales_fact___fact . item_key ) = ( item_dim___item_dim . item_key )) and " - + "(( location_dim___location_dim . location_name ) = 'test123' )) where (( time_dim___time_dim . time_key ) " - + "between '2013-01-01' and '2013-01-31' and (( item_dim___item_dim . item_name ) = 'item_1' )) " - + "group by ( sales_fact___fact . time_key ), ( time_dim___time_dim . day_of_week ), " - + "( time_dim___time_dim . day ), ( item_dim___item_dim . item_key ) order by dollars_sold desc"; + + "( time_dim___time_dim . day ), ( item_dim___item_dim . item_key ), sum(alias1) dollars_sold , " + + "round(sum(alias2), 2 ), avg(alias3), min(alias4), max(alias5), location_name " + + "from (select sales_fact___fact.time_key, sales_fact___fact.location_key, " + + "sales_fact___fact.item_key, sales_fact___fact.dollars_sold,sum( case " + + "when (( sales_fact___fact . dollars_sold ) = 0 ) then 0.0 else " + + "( sales_fact___fact . dollars_sold ) end ) as alias1, " + + "sum(( sales_fact___fact . units_sold )) as alias2, " + + "avg(( sales_fact___fact . dollars_sold )) as alias3, " + + "min(( sales_fact___fact . dollars_sold )) as alias4, " + + "max(( sales_fact___fact . dollars_sold )) as alias5 " + + "from sales_fact sales_fact___fact where sales_fact___fact.time_key " + + "in ( select time_dim .time_key from time_dim where " + + "( time_dim. time_key ) between '2013-01-01' and '2013-01-31' ) " + + "and sales_fact___fact.location_key in ( select location_dim .location_key " + + "from location_dim where (( location_dim. location_name ) = 'test123' ) ) " + + "and sales_fact___fact.item_key in ( select item_dim .item_key from item_dim " + + "where (( item_dim. item_name ) = 'item_1' ) ) group by sales_fact___fact.time_key, " + + "sales_fact___fact.location_key, sales_fact___fact.item_key, sales_fact___fact.dollars_sold) " + + "sales_fact___fact inner join (select time_key,day_of_week,day from time_dim) " + + "time_dim___time_dim on (( sales_fact___fact . time_key ) = " + + "( time_dim___time_dim . time_key )) inner join (select location_key," + + "location_name from location_dim) location_dim___location_dim on " + + "(( sales_fact___fact . location_key ) = ( location_dim___location_dim . location_key )) " + + "inner join (select item_key,item_name from item_dim) item_dim___item_dim on " + + "((( sales_fact___fact . item_key ) = ( item_dim___item_dim . item_key )) and " + + "(( location_dim___location_dim . location_name ) = 'test123' )) where " + + "(( time_dim___time_dim . time_key ) between '2013-01-01' and '2013-01-31' " + + "and (( item_dim___item_dim . item_name ) = 'item_1' )) group by " + + "( sales_fact___fact . time_key ), ( time_dim___time_dim . day_of_week ), " + + "( time_dim___time_dim . day ), ( item_dim___item_dim . item_key ) " + + "order by dollars_sold desc"; compareQueries(expected, actual); } @@ -694,17 +739,25 @@ public class TestColumnarSQLRewriter { String actual = qtest.rewrite(query, conf, hconf); String expected = "select ( sales_fact__db_sales_fact_fact . time_key ), ( time_dim___time_dim . day_of_week ), " - + "( time_dim___time_dim . day ), case when (sum(( sales_fact__db_sales_fact_fact . dollars_sold )) = 0 ) " - + "then 0.0 else sum(( sales_fact__db_sales_fact_fact . dollars_sold )) end dollars_sold " - + "from db.sales_fact sales_fact__db_sales_fact_fact inner join " - + "(select time_key, day_of_week, day from time_dim) time_dim___time_dim " - + "on (( sales_fact__db_sales_fact_fact . time_key ) = ( time_dim___time_dim . time_key )) " - + "inner join (select location_key, location_name from db.location_dim) location_dim__db_location_dim_ld on " - + "((( sales_fact__db_sales_fact_fact . location_key ) = ( location_dim__db_location_dim_ld . location_key )) " - + "and (( location_dim__db_location_dim_ld . location_name ) = 'test123' )) where " - + "( time_dim___time_dim . time_key ) between '2013-01-01' and '2013-01-31' " - + "group by ( sales_fact__db_sales_fact_fact . time_key ), ( time_dim___time_dim . day_of_week ), " - + "( time_dim___time_dim . day ) order by dollars_sold desc"; + + "( time_dim___time_dim . day ), case when (sum(alias2) = 0 ) " + + "then 0.0 else sum(alias2) end dollars_sold from (select sales_fact__db_sales_fact_fact.time_key, " + + "sales_fact__db_sales_fact_fact.location_key,sum(( sales_fact__db_sales_fact_fact . dollars_sold )) " + + "as alias1, sum(( sales_fact__db_sales_fact_fact . dollars_sold )) as alias2 from db.sales_fact " + + "sales_fact__db_sales_fact_fact where sales_fact__db_sales_fact_fact.time_key in " + + "( select time_dim .time_key from time_dim where ( time_dim. time_key ) " + + "between '2013-01-01' and '2013-01-31' ) and sales_fact__db_sales_fact_fact.location_key in " + + "( select location_dim .location_key from location_dim where (( location_dim. location_name ) " + + "= 'test123' ) ) group by sales_fact__db_sales_fact_fact.time_key, " + + "sales_fact__db_sales_fact_fact.location_key) sales_fact__db_sales_fact_fact " + + "inner join (select time_key,day_of_week,day from time_dim) time_dim___time_dim on " + + "(( sales_fact__db_sales_fact_fact . time_key ) = ( time_dim___time_dim . time_key )) " + + "inner join (select location_key,location_name from db.location_dim) " + + "location_dim__db_location_dim_ld on ((( sales_fact__db_sales_fact_fact . location_key ) " + + "= ( location_dim__db_location_dim_ld . location_key )) and (( location_dim__db_location_dim_ld . " + + "location_name ) = 'test123' )) where ( time_dim___time_dim . time_key ) between " + + "'2013-01-01' and '2013-01-31' group by ( sales_fact__db_sales_fact_fact . time_key ), " + + "( time_dim___time_dim . day_of_week ), ( time_dim___time_dim . day ) " + + "order by dollars_sold desc"; compareQueries(expected, actual); }
