Repository: incubator-impala Updated Branches: refs/heads/master 50b2b888c -> a9c405955
IMPALA-4232: qgen: Hive does not support aggregates inside specific analytic clauses Hive does not support aggregates inside the following analytic clauses: * AVG ... OVER * COUNT ... OVER * FIRSTVALUE ... OVER * LAG ... OVER * LASTVALUE ... OVER * LEAD ... OVER * MAX ... OVER * MIN ... OVER * SUM ... OVER So the following query works in Impala, but not in Hive: SELECT LEAD(SUM(1)) OVER (PARTITION BY MAX(a1.tinyint_col_3) ORDER BY MAX(a1.tinyint_col_3)) FROM table_1 a1; but the following query works in both Impala and Hive: SELECT LEAD(1) OVER (PARTITION BY MAX(a1.tinyint_col_3) ORDER BY MAX(a1.tinyint_col_3)) FROM table_1 a1; This patch modifies the qgen code so that it doesn't create aggregates inside the above analytic clauses, if the HiveProfile is used. A new method called get_analytic_funcs_that_cannot_contain_aggs() is added to the DefaultProfile and to the HiveProfile. The implementation in the DefaultProfile returns an empty list. The implementation in the HiveProfile returns the list of methods above. The QueryGenerator._create_agg_or_analytic_tree() method is modified so that it checks the get_analytic_funcs_that_cannot_contain_aggs() method when populating the possible function types of the next child in the function tree. If it finds any of these functions already exist in the tree, it ensures that the next child function cannot be an aggregate function. Misc Changes: A few miscellaneous changes were made that popped up during testing: * Fixed a possible NPE in _create_boolean_func_tree * Disabled ONLY_USE_EQUALITY_JOIN_PREDICATES for the HiveProfile, this should have been done in IMPALA-4101; Hive doesn't support all equality-joins, so specific types need to be disabled, see IMPALA-4101 for more details Testing: * Unit tests added: test_query_generator.py and test_hive_create_agg_or_analytic_tree.py * All unit tests pass * Tested locally against Hive * Tested against Impala via Leopard * Tested against Impala via the discrepancy searcher Change-Id: Ie1096c4cde7ea52a52b39e31cd93242da53b549f Reviewed-on: http://gerrit.cloudera.org:8080/4581 Reviewed-by: Taras Bobrovytsky <tbobrovyt...@cloudera.com> Tested-by: Taras Bobrovytsky <tbobrovyt...@cloudera.com> Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/b0f5e0a5 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/b0f5e0a5 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/b0f5e0a5 Branch: refs/heads/master Commit: b0f5e0a5d0e0abd9b2fccf04e1851ed2c46bd23f Parents: 50b2b88 Author: Sahil Takiar <stak...@cloudera.com> Authored: Thu Sep 29 18:07:16 2016 -0700 Committer: Taras Bobrovytsky <tbobrovyt...@cloudera.com> Committed: Fri Oct 7 22:18:30 2016 +0000 ---------------------------------------------------------------------- tests/comparison/query_generator.py | 42 ++++++++-- tests/comparison/query_profile.py | 27 ++++++- .../test_hive_create_agg_or_analytic_tree.py | 82 ++++++++++++++++++++ tests/comparison/tests/test_query_generator.py | 44 +++++++++++ 4 files changed, 189 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b0f5e0a5/tests/comparison/query_generator.py ---------------------------------------------------------------------- diff --git a/tests/comparison/query_generator.py b/tests/comparison/query_generator.py index a38aea4..3a923e0 100644 --- a/tests/comparison/query_generator.py +++ b/tests/comparison/query_generator.py @@ -580,7 +580,8 @@ class QueryGenerator(object): def _create_agg_func_tree(self, return_type): return self._create_agg_or_analytic_tree(return_type, agg_funcs=AGG_FUNCS) - def _create_agg_or_analytic_tree(self, return_type, agg_funcs=[], analytic_funcs=[]): + def _create_agg_or_analytic_tree(self, return_type, agg_funcs=[], analytic_funcs=[], + basic_funcs=FUNCS): '''Returns an instance of a function that is guaranteed to either be or contain an aggregate or analytic function. The arguments of the returned function will either be None or an instance of a function as in _create_func_tree. @@ -597,6 +598,10 @@ class QueryGenerator(object): to types that can be generated by permutations of available functions. If the max nested expr count in the query profile is at least one, then any return_type should be possible to generate but this is not guaranteed. + + basic_funcs is a list of allowed basic functions that will be used in the function + tree. By default, all basic funcs defined in funcs.py will be allowed. A basic + function is any non-aggregate, non-analytic function. ''' # The creation of aggregate and analytic functions is so similar that they are # combined here. "basic" function creation is much simpler so that is kept separate. @@ -620,7 +625,7 @@ class QueryGenerator(object): basic_signatures_by_return_type = self._group_signatures_by_return_type( self._find_matching_signatures( - self._funcs_to_allowed_signatures(FUNCS), + self._funcs_to_allowed_signatures(basic_funcs), allow_subquery=False)) agg_signatures_by_return_type = self._group_signatures_by_return_type( self._funcs_to_allowed_signatures(agg_funcs)) @@ -789,6 +794,12 @@ class QueryGenerator(object): else: chosen_func.parent = None + # Check if the func_tree contains any analytic functions returned by + # profile.get_analytic_funcs_that_cannot_contain_aggs; if it does then aggregate + # functions are not allowed in child operators + allow_agg_func = not self._func_tree_contains_funcs( + chosen_func, self.profile.get_analytic_funcs_that_cannot_contain_aggs()) + # Place the args of the chosen function into the appropriate pools. Aggregate and # analytic functions have different rules about which functions they accept as # arguments. If the chosen function is neither then the tree must be inspected to @@ -797,7 +808,8 @@ class QueryGenerator(object): node = chosen_func while True: if node.is_analytic: - child_null_arg_pools.add(AggFunc) + if allow_agg_func: + child_null_arg_pools.add(AggFunc) child_null_arg_pools.add(Func) break elif node.is_agg: @@ -842,6 +854,22 @@ class QueryGenerator(object): return root_func + def _func_tree_contains_funcs(self, func_tree, funcs): + """ + Returns True if any of the funcs are in the given func_tree. A func_tree must have + a parent property defined that links to the parent function. This method compares + functions based on their name. + """ + if not funcs: + return False + node = func_tree + func_names = set(func.name() for func in funcs) + while node: + if node.name() in func_names: + return True + node = node.parent + return False + def _create_analytic_select_item(self, table_exprs, select_item_exprs, @@ -1452,9 +1480,13 @@ class QueryGenerator(object): # the arg type needs to be preserved, just always assume that this is a child of # a relational function. accepts = return_type if return_type in relational_col_types else None - signature = self.profile.choose_func_signature(self._find_matching_signatures( + matching_signatures = self._find_matching_signatures( allowed_signatures, returns=return_type, accepts=accepts, - allow_subquery=allow_subquery)) + allow_subquery=allow_subquery) + if matching_signatures: + signature = self.profile.choose_func_signature(matching_signatures) + else: + continue func = signature.func(signature) if root_func: http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b0f5e0a5/tests/comparison/query_profile.py ---------------------------------------------------------------------- diff --git a/tests/comparison/query_profile.py b/tests/comparison/query_profile.py index 81e4f78..f58fa84 100644 --- a/tests/comparison/query_profile.py +++ b/tests/comparison/query_profile.py @@ -27,6 +27,15 @@ from db_types import ( TYPES, Timestamp) from funcs import ( + AnalyticAvg, + AnalyticCount, + AnalyticFirstValue, + AnalyticLag, + AnalyticLastValue, + AnalyticLead, + AnalyticMax, + AnalyticMin, + AnalyticSum, And, Coalesce, Equals, @@ -591,6 +600,13 @@ class DefaultProfile(object): IsNotDistinctFrom, IsNotDistinctFromOp, LessThan, LessThanOrEquals, NotEquals, NotIn) + def get_analytic_funcs_that_cannot_contain_aggs(self): + """ + Returns a list of analytic functions that should not contain aggregate functions + """ + return None + + class ImpalaNestedTypesProfile(DefaultProfile): def __init__(self): @@ -633,7 +649,8 @@ class TestFunctionProfile(DefaultProfile): class HiveProfile(DefaultProfile): def __init__(self): - super(HiveProfile, self).__init__() + super(HiveProfile, self).__init__() + self._probabilities['MISC']['ONLY_USE_EQUALITY_JOIN_PREDICATES'] = 0 def use_having_without_groupby(self): return False @@ -673,5 +690,13 @@ class HiveProfile(DefaultProfile): return [signature for signature in signatures if signature.func in (Equals, And) or len(signature.args) == 1] + def get_analytic_funcs_that_cannot_contain_aggs(self): + """ + Hive does not support aggregate functions inside AVG, COUNT, FIRSTVALUE, LAG, + LASTVALUE, LEAD, MAX, MIN, or SUM functions + """ + return (AnalyticAvg, AnalyticCount, AnalyticFirstValue, AnalyticLag, + AnalyticLastValue, AnalyticLead, AnalyticMax, AnalyticMin, AnalyticSum) + PROFILES = [var for var in locals().values() if isinstance(var, type) and var.__name__.endswith('Profile')] http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b0f5e0a5/tests/comparison/tests/hive/test_hive_create_agg_or_analytic_tree.py ---------------------------------------------------------------------- diff --git a/tests/comparison/tests/hive/test_hive_create_agg_or_analytic_tree.py b/tests/comparison/tests/hive/test_hive_create_agg_or_analytic_tree.py new file mode 100644 index 0000000..45f0de1 --- /dev/null +++ b/tests/comparison/tests/hive/test_hive_create_agg_or_analytic_tree.py @@ -0,0 +1,82 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import pytest + +from tests.comparison.db_types import Int +from tests.comparison.funcs import AGG_FUNCS +from tests.comparison.query_generator import QueryGenerator +from tests.comparison.query_profile import HiveProfile, DefaultProfile + + +def _idfn(analytic_func): + return analytic_func.name() + +@pytest.mark.parametrize('analytic_func', HiveProfile() + .get_analytic_funcs_that_cannot_contain_aggs(), ids=_idfn) +def test_hive_analytics_cannot_contain_aggs(analytic_func): + """ + Tests that the HiveProfile does not allow nested aggs inside specific analytic + functions. The list of analytic functions that cannot contain aggs is defined by + QueryProfile.test_hive_analytics_cannot_contain_aggs() + """ + + class FakeDefaultQueryProfile(DefaultProfile): + """ + A DefaultProfile that forces only nested expression in any expression trees + """ + + def __init__(self): + super(FakeDefaultQueryProfile, self).__init__() + self._bounds.update({'MAX_NESTED_EXPR_COUNT': (1, 1)}) + + class FakeHiveQueryProfile(HiveProfile): + """ + A HiveProfile that forces only nested expression in any expression trees + """ + + def __init__(self): + super(FakeHiveQueryProfile, self).__init__() + self._bounds.update({'MAX_NESTED_EXPR_COUNT': (1, 1)}) + + # Aggregate functions can only return specific types, such as Int, Number, or Float; + # while certain AnalyticFuncs can return non-numeric types such as FirstValue or + # LastValue. So for simplicity, these tests are only run against the Int return_type. + if Int in [signature.return_type for signature in analytic_func.signatures()]: + + # Generate an agg + analytic func tree using the FakeDefaultQueryProfile and ensure + # the root func is a analytic_func and that the tree contains an aggregate func. An + # empty list of funcs is passed into the _create_agg_or_analytic_tree so that no + # basic funcs are created. We can be sure that the root_func is an analytic_func + # because agg_funcs cannot contain analytic_funcs. + + qgen = QueryGenerator(FakeDefaultQueryProfile()) + func_tree = qgen._create_agg_or_analytic_tree(Int, agg_funcs=AGG_FUNCS, + analytic_funcs=[analytic_func], + basic_funcs=[]) + assert isinstance(func_tree, analytic_func) + assert func_tree.contains_agg + + # Do the same for the FakeHiveQueryProfile, but now check that the func_tree has no + # aggregates. + + qgen = QueryGenerator(FakeHiveQueryProfile()) + func_tree = qgen._create_agg_or_analytic_tree(Int, agg_funcs=AGG_FUNCS, + analytic_funcs=[analytic_func], + basic_funcs=[]) + assert isinstance(func_tree, analytic_func) + assert not func_tree.contains_agg http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/b0f5e0a5/tests/comparison/tests/test_query_generator.py ---------------------------------------------------------------------- diff --git a/tests/comparison/tests/test_query_generator.py b/tests/comparison/tests/test_query_generator.py new file mode 100644 index 0000000..d64fc3e --- /dev/null +++ b/tests/comparison/tests/test_query_generator.py @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from tests.comparison.db_types import Boolean +from tests.comparison.funcs import And, Equals, Or +from tests.comparison.query_generator import QueryGenerator +from tests.comparison.query_profile import DefaultProfile + + +def test_func_tree_contains_funcs(): + """ + Tests the QueryGenerator.func_tree_contains_funcs() method + """ + + qgen = QueryGenerator(DefaultProfile()) + + # Create a simple func_tree with only one function + and_func = And.create_from_args(Boolean(True), Boolean(True)) + and_func.parent = None + assert qgen._func_tree_contains_funcs(and_func, [And]) + assert not qgen._func_tree_contains_funcs(and_func, [Or]) + + # Create a func_tree that contains one parent, and two children + equals_func = Equals.create_from_args(Boolean(True), Boolean(True)) + and_func = And.create_from_args(equals_func, equals_func) + equals_func.parent = and_func + and_func.parent = None + assert qgen._func_tree_contains_funcs(equals_func, [And]) + assert qgen._func_tree_contains_funcs(equals_func, [Equals]) + assert not qgen._func_tree_contains_funcs(equals_func, [Or]) \ No newline at end of file