Repository: incubator-impala Updated Branches: refs/heads/master 872d5462b -> f5c7668f9
IMPALA-4101: qgen: Hive join predicates should only contains equality functions Background: Hive only supports equi-joins in its JOIN clause, while Postgres and Impala support more complex functions such as <, <=, >, >=, etc. This change modifies the QueryGenerator._create_relational_join_condition and QueryGenerator._create_boolean_func_tree methods to only construct equality join conditions under certain conditions. The _create_boolean_func_tree method is invoked via QueryGenerator -> create_query -> _create_from_clause -> _create_join_clause -> _create_relational_join_condition -> _create_boolean_func_tree. This method is invoked when constructing the JOIN, WHERE, and HAVING clauses. It creates a tree of functions that would typically be found in any of these clauses. Changes: The parameter "signatures" is added to the method _create_boolean_func_tree, and it lists out all the allowed signatures the function is allowed to use. Previously, this list of signatures was populated by calling _funcs_to_allowed_signatures(FUNCS), and if "signatures" is not specified, then the code defaults back to the results of that method. A new method in the DefaultProfile called get_allowed_join_signatures is introduced and returns a list of function signatures that are allowed within a JOIN clause. The DefaultProfile allows all given signatures, while the HiveProfile only allows for the Equals and And functions, as well as any function that operates over only one column. The reason for these restrictions is that Hive only allows equality joins, does not allow OR operators in the join clause, and has some restrictions on functions that operate over multiple different tables. This last restriction is somewhat subtle; if one side of the equals operator contains a function that operates over two different tables, the other side of the operator cannot contain either of those tables. While it is possible to have functions that take in multiple input parameters, the inputs must be taken from specific tables to prevent Hive from throwing a compile time exception. Adding support for this in qgen code will require significant effort and modification to some core methods (_create_relational_join_condition and _populate_func_with_vals), so it's best to disable these for Hive altogether. Note that the _create_boolean_func_tree still allows for OR operators due to some logic around its "and_or_fill_ratio" variable. The plan is to fix this in a future patch that specifically focuses on removing OR operators from Hive JOIN clauses. Minor change to discrepancy_searcher so that the logs print out "Hive" instead of "Impala" when running against Hive. Testing: * Added a new unit test that ensures the HiveProfile only returns equality joins * Unit tests pass * Tested against Hive locally * Tested against Impala via Leopard * Tested against Impala via the Discrepancy Checker Change-Id: Ibe8832a03cfa0d7ecc293ec6db6db2bcb34ab459 Reviewed-on: http://gerrit.cloudera.org:8080/4419 Reviewed-by: Taras Bobrovytsky <[email protected]> Tested-by: Taras Bobrovytsky <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/f5c7668f Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/f5c7668f Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/f5c7668f Branch: refs/heads/master Commit: f5c7668f9ea603f85dbeff13b4ef8f18f39f01a9 Parents: 872d546 Author: Sahil Takiar <[email protected]> Authored: Fri Sep 9 15:39:46 2016 -0700 Committer: Taras Bobrovytsky <[email protected]> Committed: Tue Sep 27 18:14:42 2016 +0000 ---------------------------------------------------------------------- tests/comparison/discrepancy_searcher.py | 15 +++-- tests/comparison/query_generator.py | 46 ++++++++----- tests/comparison/query_profile.py | 28 ++++++++ ...est_hive_create_relational_join_condition.py | 70 ++++++++++++++++++++ 4 files changed, 139 insertions(+), 20 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f5c7668f/tests/comparison/discrepancy_searcher.py ---------------------------------------------------------------------- diff --git a/tests/comparison/discrepancy_searcher.py b/tests/comparison/discrepancy_searcher.py index c8e5f45..11f65c0 100755 --- a/tests/comparison/discrepancy_searcher.py +++ b/tests/comparison/discrepancy_searcher.py @@ -78,6 +78,10 @@ class QueryResultComparator(object): flatten_dialect=flatten_dialect) @property + def test_db_type(self): + return self.test_conn.db_type + + @property def ref_db_type(self): return self.ref_conn.db_type @@ -85,7 +89,7 @@ class QueryResultComparator(object): '''Execute the query, compare the data, and return a ComparisonResult, which summarizes the outcome. ''' - comparison_result = ComparisonResult(query, self.ref_db_type) + comparison_result = ComparisonResult(query, self.test_db_type, self.ref_db_type) (ref_sql, ref_exception, ref_data_set, ref_cursor_description), (test_sql, test_exception, test_data_set, test_cursor_description) = \ self.query_executor.fetch_query_results(query) @@ -452,8 +456,9 @@ class QueryExecutor(object): class ComparisonResult(object): '''Represents a result.''' - def __init__(self, query, ref_db_type): + def __init__(self, query, test_db_type, ref_db_type): self.query = query + self.test_db_type = test_db_type self.ref_db_type = ref_db_type self.ref_sql = None self.test_sql = None @@ -474,8 +479,9 @@ class ComparisonResult(object): self._error_message = str(self.exception) elif (self.ref_row_count or self.test_row_count) and \ self.ref_row_count != self.test_row_count: - self._error_message = 'Row counts do not match: %s Impala rows vs %s %s rows' \ + self._error_message = 'Row counts do not match: %s %s rows vs %s %s rows' \ % (self.test_row_count, + self.test_db_type, self.ref_db_type, self.ref_row_count) elif self.mismatch_at_row_number is not None: @@ -489,10 +495,11 @@ class ComparisonResult(object): for idx, val in enumerate(self.ref_row) ) + ']' self._error_message = \ - 'Column %s in row %s does not match: %s Impala row vs %s %s row' \ + 'Column %s in row %s does not match: %s %s row vs %s %s row' \ % (self.mismatch_at_col_number, self.mismatch_at_row_number, test_row, + self.test_db_type, ref_row, self.ref_db_type) return self._error_message http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f5c7668f/tests/comparison/query_generator.py ---------------------------------------------------------------------- diff --git a/tests/comparison/query_generator.py b/tests/comparison/query_generator.py index 4453f5a..4685934 100644 --- a/tests/comparison/query_generator.py +++ b/tests/comparison/query_generator.py @@ -1208,9 +1208,17 @@ class QueryGenerator(object): if not table_exprs_by_col_types: raise Exception('Tables have no joinable columns in common') + join_signatures = self._funcs_to_allowed_signatures(FUNCS) + if self.profile.only_use_equality_join_predicates(): + join_signatures = [sig for sig in join_signatures if not + self.profile.is_non_equality_join_predicate(sig.func)] + else: + join_signatures = self.profile.get_allowed_join_signatures(join_signatures) + root_predicate, relational_predicates = self._create_boolean_func_tree( - require_relational_func=True, - relational_col_types=table_exprs_by_col_types.keys()) + require_relational_func=True, + relational_col_types=table_exprs_by_col_types.keys(), + allowed_signatures=join_signatures) for predicate in relational_predicates: left_table_expr = choice(candidiate_table_exprs) @@ -1344,7 +1352,8 @@ class QueryGenerator(object): return exprs_to_funcs def _create_boolean_func_tree(self, require_relational_func=False, - relational_col_types=None, allow_subquery=False): + relational_col_types=None, allow_subquery=False, + allowed_signatures=None): '''Creates a function that returns a Boolean. The returned value is a Tuple<Function, List<Function>> where the first function is the root of the tree and the list contains relational functions (though other relational functions @@ -1352,10 +1361,18 @@ class QueryGenerator(object): depending on the query profile in use. The leaf arguments of the tree will all be set to a NULL literal. - If 'require_relational_func' is True, at least one function in the tree will be - a relational function such as Equals, LessThanOrEquals, etc. - 'relational_col_types' can be used to restrict the signature of the chosen - relational function to the given types. + Args: + require_relational_func: If True, at least one function in the tree will be a + relational function such as Equals, LessThanOrEquals, etc. + + relational_col_types: Can be used to restrict the signature of the chosen + relational function to the given types. + + allow_subquery: If True, a subquery may be added to the function tree. + + allowed_signatures: A list of allowable signatures to be used inside the function + tree. This list is not definitive, functions outside this list may be added to + the function tree if they are explicitly added by the QueryProfile. ''' # To create a realistic expression, the nodes nearest the root of the tree will be # ANDs and ORs, their children will be relational functions, then other number of @@ -1404,7 +1421,8 @@ class QueryGenerator(object): # 'relational_col_types'. null_args_by_type = defaultdict(list) - signatures = self._funcs_to_allowed_signatures(FUNCS) + if not allowed_signatures: + allowed_signatures = self._funcs_to_allowed_signatures(FUNCS) if not relational_col_types: relational_col_types = tuple() @@ -1419,13 +1437,9 @@ class QueryGenerator(object): elif relational_count > 0: relational_count -= 1 is_relational = True - if self.profile.only_use_equality_join_predicates(): - signature = self.profile.choose_func_signature(self._find_matching_signatures( - Equals.signatures(), accepts_only=tuple(relational_col_types))) - else: - relational_signatures = self._find_matching_signatures( - signatures, accepts_only=tuple(relational_col_types), returns=Boolean) - signature = self.profile.choose_relational_func_signature(relational_signatures) + relational_signatures = self._find_matching_signatures( + allowed_signatures, accepts_only=tuple(relational_col_types), returns=Boolean) + signature = self.profile.choose_relational_func_signature(relational_signatures) else: if not root_func or Boolean in null_args_by_type: # Prefer to replace Boolean leaves to get a more realistic expression. @@ -1437,7 +1451,7 @@ class QueryGenerator(object): # a relational function. accepts = return_type if return_type in relational_col_types else None signature = self.profile.choose_func_signature(self._find_matching_signatures( - signatures, returns=return_type, accepts=accepts, + allowed_signatures, returns=return_type, accepts=accepts, allow_subquery=allow_subquery)) func = signature.func(signature) http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f5c7668f/tests/comparison/query_profile.py ---------------------------------------------------------------------- diff --git a/tests/comparison/query_profile.py b/tests/comparison/query_profile.py index 00835ef..81e4f78 100644 --- a/tests/comparison/query_profile.py +++ b/tests/comparison/query_profile.py @@ -575,6 +575,21 @@ class DefaultProfile(object): return False return True + def get_allowed_join_signatures(self, signatures): + """ + Returns all the function signatures that are allowed inside a JOIN clause. This + method is mutually exclusive with only_use_equality_join_predicates. This results of + this method are ignored if only_use_equality_join_predicates return True. + """ + return signatures + + def is_non_equality_join_predicate(self, func): + """ + Returns True if the given func is considered a non-equality join condition. + """ + return func in (GreaterThan, GreaterThanOrEquals, In, + IsNotDistinctFrom, IsNotDistinctFromOp, LessThan, + LessThanOrEquals, NotEquals, NotIn) class ImpalaNestedTypesProfile(DefaultProfile): @@ -644,6 +659,19 @@ class HiveProfile(DefaultProfile): return False return DefaultProfile.allow_func_signature(self, signature) + def get_allowed_join_signatures(self, signatures): + """ + Restricts the function signatures inside a JOIN clause to either be an Equals + operator, an And operator, or any operator that only takes in one argument. The reason + is that Hive only supports equi-joins, does not allow OR operators inside a JOIN, and + does not allow any other operator that operates over multiple columns. + + The reason ONLY_USE_EQUALITY_JOIN_PREDICATES is not sufficient to guarantee this is + that Hive needs to restrict the functions used based on the argument size of a + function. + """ + return [signature for signature in signatures if + signature.func in (Equals, And) or len(signature.args) == 1] PROFILES = [var for var in locals().values() if isinstance(var, type) and var.__name__.endswith('Profile')] http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/f5c7668f/tests/comparison/tests/hive/test_hive_create_relational_join_condition.py ---------------------------------------------------------------------- diff --git a/tests/comparison/tests/hive/test_hive_create_relational_join_condition.py b/tests/comparison/tests/hive/test_hive_create_relational_join_condition.py new file mode 100644 index 0000000..4d4eea1 --- /dev/null +++ b/tests/comparison/tests/hive/test_hive_create_relational_join_condition.py @@ -0,0 +1,70 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from tests.comparison.common import TableExprList, Column, Table +from tests.comparison.db_types import Int +from tests.comparison.funcs import Equals +from tests.comparison.query_generator import QueryGenerator +from tests.comparison.query_profile import HiveProfile + + +def test_hive_create_equality_only_joins(): + """ + Tests that QueryGenerator produces a join condition with only equality functions if the + HiveProfile is used. + """ + + class FakeHiveQueryProfile(HiveProfile): + """ + A fake QueryProfile that extends the HiveProfile, various weights are modified in + order to ensure that this test is deterministic. + """ + + def choose_join_condition_count(self): + """ + There should be only one operator in the JOIN condition + """ + return 1 + + def choose_conjunct_disjunct_fill_ratio(self): + """ + There should be no AND or OR operators + """ + return 0 + + def choose_relational_func_fill_ratio(self): + """ + Force all operators to be relational + """ + return 1 + + query_generator = QueryGenerator(FakeHiveQueryProfile()) + + # Create two tables that have one joinable Column + right_table_expr_list = TableExprList() + right_table = Table("right_table") + right_table.add_col(Column("right_table", "right_col", Int)) + right_table_expr_list.append(right_table) + + left_table_expr_list = TableExprList() + left_table = Table("left_table") + left_table.add_col(Column("left_table", "left_col", Int)) + left_table_expr_list.append(left_table) + + # Validate the root predicate is an Equals funcs + assert isinstance(query_generator._create_relational_join_condition( + right_table_expr_list, left_table_expr_list), Equals)
