This is an automated email from the ASF dual-hosted git repository. joemcdonnell pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/impala.git
commit 6eabff9eab8c96b9cc6e68506dc9173722a07ad1 Author: Riza Suminto <[email protected]> AuthorDate: Wed Aug 21 16:34:12 2024 -0700 IMPALA-13323: Remove redundant tests in test_join_queries.py Before this patch, test_join_queries.py was expensive to run in exhaustive exploration because it ran many test dimension permutations but never actually exercised some of the dimensions. The redundant tests are as follows: Have mt_dop dimension, but do not exercise it: - test_outer_to_inner_joins - test_single_node_nested_loop_joins Have batch_size dimension, but do not exercise it: - test_outer_to_inner_joins - test_single_node_nested_loop_joins - test_single_node_nested_loop_joins_exhaustive - test_semi_joins_exhaustive Have enable_outer_join_to_inner_transformation dimension, but do not exercise it: - All TestJoinQueries except test_outer_to_inner_joins test_miss_tuple_joins can also run with far fewer test dimensions because it mainly tests the correctness of predicate pushdown during planning. This patch reorganizes test_join_queries.py into several test classes where exec option dimensions are clearly declared and correctly exercised without duplication. The reduction is as follows: Before patch: 41 core tests, 526 exhaustive tests After patch: 28 core tests, 93 exhaustive tests Added validate_exec_option_dimension() in impala_test_suite.py to log a WARNING if there is a dimension whose name matches a query option name but is not also registered in the 'exec_option' dimension. That log can be observed in log files such as logs/ee_tests/results/TEST-impala-parallel.xml. 
Also fix couple flake8 issues found at impala_test_suite.py Change-Id: I76efa82dad59dbb9f58a36a4a2938b5f73a382f6 Reviewed-on: http://gerrit.cloudera.org:8080/21716 Tested-by: Impala Public Jenkins <[email protected]> Reviewed-by: Michael Smith <[email protected]> --- tests/common/impala_test_suite.py | 112 ++++++++++------ tests/query_test/test_join_queries.py | 238 ++++++++++++++++++---------------- 2 files changed, 200 insertions(+), 150 deletions(-) diff --git a/tests/common/impala_test_suite.py b/tests/common/impala_test_suite.py index 6d73765da..14c378ab0 100644 --- a/tests/common/impala_test_suite.py +++ b/tests/common/impala_test_suite.py @@ -60,7 +60,7 @@ from tests.common.test_result_verifier import ( verify_lineage, verify_raw_results, verify_runtime_profile) -from tests.common.test_vector import ImpalaTestDimension +from tests.common.test_vector import ImpalaTestDimension, EXEC_OPTION_KEY from tests.performance.query import Query from tests.performance.query_exec_functions import execute_using_jdbc from tests.performance.query_executor import JdbcQueryExecConfig @@ -98,6 +98,9 @@ from tests.util.thrift_util import create_transport from hive_metastore import ThriftHiveMetastore from thrift.protocol import TBinaryProtocol +# Import to validate query option names +from ImpalaService.ttypes import TImpalaQueryOptions + # Initializing the logger before conditional imports, since we will need it # for them. 
LOG = logging.getLogger('impala_test_suite') @@ -123,8 +126,8 @@ IMPALAD_HS2_PORT = int(pytest.config.option.impalad_hs2_port) IMPALAD_HS2_HOST_PORT = IMPALAD_HOSTNAME + ":" + str(IMPALAD_HS2_PORT) # Calculate the hs2 ports based on the first hs2 port and the deltas of the beeswax ports IMPALAD_HS2_HOST_PORT_LIST = [ - IMPALAD_HOSTNAME_LIST[i] + ':' + - str(IMPALAD_BEESWAX_PORT_LIST[i] - IMPALAD_BEESWAX_PORT + IMPALAD_HS2_PORT) + IMPALAD_HOSTNAME_LIST[i] + ':' + + str(IMPALAD_BEESWAX_PORT_LIST[i] - IMPALAD_BEESWAX_PORT + IMPALAD_HS2_PORT) for i in range(len(IMPALAD_HOST_PORT_LIST)) ] @@ -133,8 +136,8 @@ IMPALAD_HS2_HTTP_HOST_PORT = IMPALAD_HOSTNAME + ":" + str(IMPALAD_HS2_HTTP_PORT) # Calculate the hs2-http ports based on the first hs2-http port and the deltas of the # beeswax ports IMPALAD_HS2_HTTP_HOST_PORT_LIST = [ - IMPALAD_HOSTNAME_LIST[i] + ':' + - str(IMPALAD_BEESWAX_PORT_LIST[i] - IMPALAD_BEESWAX_PORT + IMPALAD_HS2_HTTP_PORT) + IMPALAD_HOSTNAME_LIST[i] + ':' + + str(IMPALAD_BEESWAX_PORT_LIST[i] - IMPALAD_BEESWAX_PORT + IMPALAD_HS2_HTTP_PORT) for i in range(len(IMPALAD_HOST_PORT_LIST)) ] @@ -164,6 +167,10 @@ VARZ_URL = 'http://{0}:25000/varz?json'.format(IMPALAD_HOSTNAME) GROUP_NAME = grp.getgrgid(pwd.getpwnam(getuser()).pw_gid).gr_name +EXEC_OPTION_NAMES = set([val.lower() + for val in TImpalaQueryOptions._VALUES_TO_NAMES.values()]) + + # Base class for Impala tests. All impala test cases should inherit from this class class ImpalaTestSuite(BaseTestSuite): @classmethod @@ -426,7 +433,7 @@ class ImpalaTestSuite(BaseTestSuite): # Restore all the changed query options. 
for query_option in query_options_changed: query_option = query_option.upper() - if not query_option in self.default_query_options: + if query_option not in self.default_query_options: continue default_val = self.default_query_options[query_option] query_str = 'SET ' + query_option + '="' + default_val + '"' @@ -535,7 +542,6 @@ class ImpalaTestSuite(BaseTestSuite): s = s.replace(k, v) return s - def __verify_exceptions(self, expected_strs, actual_str, use_db): """ Verifies that at least one of the strings in 'expected_str' is either: @@ -584,7 +590,8 @@ class ImpalaTestSuite(BaseTestSuite): .replace('$EXTERNAL_WAREHOUSE_DIR', EXTERNAL_WAREHOUSE_DIR) if use_db: - test_section[section_name] = test_section[section_name].replace('$DATABASE', use_db) + test_section[section_name] = test_section[section_name].replace( + '$DATABASE', use_db) result_section, type_section = 'RESULTS', 'TYPES' if vector.get_value('protocol').startswith('hs2'): # hs2 or hs2-http @@ -598,7 +605,6 @@ class ImpalaTestSuite(BaseTestSuite): result_section, type_section, pytest.config.option.update_results, replace_filenames_with_placeholder) - def run_test_case(self, test_file_name, vector, use_db=None, multiple_impalad=False, encoding=None, test_file_vars=None): """ @@ -618,8 +624,9 @@ class ImpalaTestSuite(BaseTestSuite): values in queries before they are executed. Callers need to avoid using reserved key names, see 'reserved_keywords' below. 
""" + self.validate_exec_option_dimension(vector) table_format_info = vector.get_value('table_format') - exec_options = vector.get_value('exec_option') + exec_options = vector.get_value(EXEC_OPTION_KEY) protocol = vector.get_value('protocol') target_impalad_clients = list() @@ -662,11 +669,12 @@ class ImpalaTestSuite(BaseTestSuite): for query in query.split(';'): set_pattern_match = SET_PATTERN.match(query) if set_pattern_match: - query_options_changed.append(set_pattern_match.groups()[0]) - assert set_pattern_match.groups()[0] not in vector.get_value("exec_option"), \ - "%s cannot be set in the '.test' file since it is in the test vector. " \ - "Consider deepcopy()-ing the vector and removing this option in the " \ - "python test." % set_pattern_match.groups()[0] + option_name = set_pattern_match.groups()[0] + query_options_changed.append(option_name) + assert option_name not in vector.get_value(EXEC_OPTION_KEY), ( + "{} cannot be set in the '.test' file since it is in the test vector. " + "Consider deepcopy()-ing the vector and removing this option in the " + "python test.".format(option_name)) result = self.__execute_query(target_impalad_client, query, user=user) finally: if len(query_options_changed) > 0: @@ -719,9 +727,9 @@ class ImpalaTestSuite(BaseTestSuite): query_section = test_section['HIVE_QUERY'] exec_fn = __exec_in_hive else: - assert 0, ('Error in test file %s. Test cases require a ' + - '-- QUERY or HIVE_QUERY section.\n%s') %\ - (test_file_name, pprint.pformat(test_section)) + assert 0, ('Error in test file {}. 
Test cases require a ' + '-- QUERY or HIVE_QUERY section.\n{}').format( + test_file_name, pprint.pformat(test_section)) # TODO: support running query tests against different scale factors query = QueryTestSectionReader.build_query( @@ -733,9 +741,6 @@ class ImpalaTestSuite(BaseTestSuite): result = None try: result = exec_fn(query, user=test_section.get('USER', '').strip() or None) - user = None - if 'USER' in test_section: - user = test_section['USER'].strip() except Exception as e: if 'CATCH' in test_section: self.__verify_exceptions(test_section['CATCH'], str(e), use_db) @@ -798,18 +803,18 @@ class ImpalaTestSuite(BaseTestSuite): test_section[rt_profile_info] = "".join(rt_profile) if 'LINEAGE' in test_section: - # Lineage flusher thread runs every 5s by default and is not configurable. Wait - # for that period. (TODO) Get rid of this for faster test execution. - time.sleep(5) - current_query_lineage = self.get_query_lineage(result.query_id, lineage_log_dir) - assert current_query_lineage is not "",\ - "No lineage found for query %s in dir %s" %\ - (result.query_id, lineage_log_dir) - if pytest.config.option.update_results: - test_section['LINEAGE'] = json.dumps(current_query_lineage, indent=2, - separators=(',', ': ')) - else: - verify_lineage(json.loads(test_section['LINEAGE']), current_query_lineage) + # Lineage flusher thread runs every 5s by default and is not configurable. Wait + # for that period. (TODO) Get rid of this for faster test execution. 
+ time.sleep(5) + current_query_lineage = self.get_query_lineage(result.query_id, lineage_log_dir) + assert current_query_lineage != "", ( + "No lineage found for query {} in dir {}".format( + result.query_id, lineage_log_dir)) + if pytest.config.option.update_results: + test_section['LINEAGE'] = json.dumps(current_query_lineage, indent=2, + separators=(',', ': ')) + else: + verify_lineage(json.loads(test_section['LINEAGE']), current_query_lineage) if 'DML_RESULTS' in test_section: assert 'ERRORS' not in test_section @@ -823,7 +828,7 @@ class ImpalaTestSuite(BaseTestSuite): update_section=pytest.config.option.update_results) if pytest.config.option.update_results: output_file = os.path.join(EE_TEST_LOGS_DIR, - test_file_name.replace('/','_') + ".test") + test_file_name.replace('/', '_') + ".test") write_test_file(output_file, sections, encoding=encoding) def get_query_lineage(self, query_id, lineage_dir): @@ -855,8 +860,8 @@ class ImpalaTestSuite(BaseTestSuite): @classmethod def change_database(cls, impala_client, table_format=None, db_name=None, scale_factor=None): - if db_name == None: - assert table_format != None + if db_name is None: + assert table_format is not None db_name = QueryTestSectionReader.get_db_name(table_format, scale_factor if scale_factor else '') query = 'use %s' % db_name @@ -932,14 +937,16 @@ class ImpalaTestSuite(BaseTestSuite): return end_time - start_time def execute_query_using_client(self, client, query, vector): + self.validate_exec_option_dimension(vector) self.change_database(client, vector.get_value('table_format')) - query_options = vector.get_value('exec_option') + query_options = vector.get_value(EXEC_OPTION_KEY) if query_options is not None: client.set_configuration(query_options) return client.execute(query) def execute_query_async_using_client(self, client, query, vector): + self.validate_exec_option_dimension(vector) self.change_database(client, vector.get_value('table_format')) - query_options = 
vector.get_value('exec_option') + query_options = vector.get_value(EXEC_OPTION_KEY) if query_options is not None: client.set_configuration(query_options) return client.execute_async(query) @@ -971,7 +978,7 @@ class ImpalaTestSuite(BaseTestSuite): assert len(result.data) <= 1, 'Multiple values returned from scalar' return result.data[0] if len(result.data) == 1 else None - def exec_and_compare_hive_and_impala_hs2(self, stmt, compare = lambda x, y: x == y): + def exec_and_compare_hive_and_impala_hs2(self, stmt, compare=lambda x, y: x == y): """Compare Hive and Impala results when executing the same statment over HS2""" # execute_using_jdbc expects a Query object. Convert the query string into a Query # object @@ -1025,7 +1032,7 @@ class ImpalaTestSuite(BaseTestSuite): def clone_table(self, src_tbl, dst_tbl, recover_partitions, vector): src_loc = self._get_table_location(src_tbl, vector) - self.client.execute("create external table {0} like {1} location '{2}'"\ + self.client.execute("create external table {0} like {1} location '{2}'" .format(dst_tbl, src_tbl, src_loc)) if recover_partitions: self.client.execute("alter table {0} recover partitions".format(dst_tbl)) @@ -1033,7 +1040,7 @@ class ImpalaTestSuite(BaseTestSuite): def appx_equals(self, a, b, diff_perc): """Returns True if 'a' and 'b' are within 'diff_perc' percent of each other, False otherwise. 
'diff_perc' must be a float in [0,1].""" - if a == b: return True # Avoid division by 0 + if a == b: return True # Avoid division by 0 assert abs(a - b) / float(max(abs(a), abs(b))) <= diff_perc def _get_table_location(self, table_name, vector): @@ -1349,6 +1356,29 @@ class ImpalaTestSuite(BaseTestSuite): str(e)) time.sleep(1) + def validate_exec_option_dimension(self, vector): + """Validate that test dimension with name matching query option name is + also registered in 'exec_option' dimension.""" + option_dim_names = [] + exec_option = dict() + for vector_value in vector.vector_values: + if vector_value.name == EXEC_OPTION_KEY: + exec_option = vector.get_value(EXEC_OPTION_KEY) + elif vector_value.name.lower() in EXEC_OPTION_NAMES: + option_dim_names.append(vector_value.name.lower()) + + if not option_dim_names: + return + + for name in option_dim_names: + # TODO: enforce these warnings by changing them into pytest.fail() + if name not in exec_option: + LOG.warn("Exec option {} declared as independent dimension but not inserted " + "into {} dimension.".format(name, EXEC_OPTION_KEY)) + elif vector.get_value(name) != exec_option[name]: + LOG.warn("{}[{}]={} does not match against dimension {}={}.".format( + EXEC_OPTION_KEY, name, exec_option[name], name, vector.get_value(name))) + @staticmethod def get_random_name(prefix='', length=5): """ diff --git a/tests/query_test/test_join_queries.py b/tests/query_test/test_join_queries.py index 8bfc7e7f0..2db2714fe 100644 --- a/tests/query_test/test_join_queries.py +++ b/tests/query_test/test_join_queries.py @@ -23,19 +23,36 @@ from copy import deepcopy from tests.common.impala_test_suite import ImpalaTestSuite from tests.common.skip import SkipIf, SkipIfFS -from tests.common.test_vector import ImpalaTestDimension from tests.common.test_dimensions import ( + add_exec_option_dimension, add_mandatory_exec_option, create_exec_option_dimension, + create_single_exec_option_dimension, create_table_format_dimension) -class 
TestJoinQueries(ImpalaTestSuite): - BATCH_SIZES = [0, 1] - MT_DOP_VALUES = [0, 4] - # Additional values for exhaustive tests. - MT_DOP_VALUES_EXHAUSTIVE = [1] - ENABLE_OUTER_JOIN_TO_INNER_TRANSFORMATION = ['false', 'true'] +ENABLE_OUTER_JOIN_TO_INNER_TRANSFORMATION = ['false', 'true'] + + +def batch_size_dim(cls): + if cls.exploration_strategy() == 'exhaustive': + return [0, 1] + else: + return [0] + + +def mt_dop_dim(cls): + if cls.exploration_strategy() == 'exhaustive': + return [0, 1, 4] + else: + return [0, 4] + + +class TestJoinBase(ImpalaTestSuite): + """The base class for test join classes. + Intended to provide subclasses with default test dimensions declaration + and constraints through add_test_dimensions() both in core and + exhaustive exploration.""" @classmethod def get_workload(cls): @@ -43,88 +60,105 @@ class TestJoinQueries(ImpalaTestSuite): @classmethod def add_test_dimensions(cls): - super(TestJoinQueries, cls).add_test_dimensions() - cls.ImpalaTestMatrix.add_dimension( - ImpalaTestDimension('batch_size', *TestJoinQueries.BATCH_SIZES)) - mt_dop_values = cls.MT_DOP_VALUES - if cls.exploration_strategy() == 'exhaustive': - mt_dop_values += cls.MT_DOP_VALUES_EXHAUSTIVE + super(TestJoinBase, cls).add_test_dimensions() + + # Set exec options cls.ImpalaTestMatrix.add_dimension( - ImpalaTestDimension('mt_dop', *mt_dop_values)) - # TODO: Look into splitting up join tests to accomodate hbase. - # Joins with hbase tables produce drastically different results. - cls.ImpalaTestMatrix.add_constraint(lambda v:\ + create_exec_option_dimension(batch_sizes=batch_size_dim(cls))) + + cls.ImpalaTestMatrix.add_constraint(lambda v: v.get_value('table_format').file_format in ['parquet']) - if cls.exploration_strategy() != 'exhaustive': - # Cut down on execution time when not running in exhaustive mode. 
- cls.ImpalaTestMatrix.add_constraint(lambda v: v.get_value('batch_size') != 1) - cls.ImpalaTestMatrix.add_dimension( - ImpalaTestDimension('enable_outer_join_to_inner_transformation', - *TestJoinQueries.ENABLE_OUTER_JOIN_TO_INNER_TRANSFORMATION)) +class TestJoinQueries(TestJoinBase): - def test_basic_joins(self, vector): - new_vector = deepcopy(vector) - new_vector.get_value('exec_option')['batch_size'] = vector.get_value('batch_size') - new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop') - self.run_test_case('QueryTest/joins', new_vector) + @classmethod + def add_test_dimensions(cls): + super(TestJoinQueries, cls).add_test_dimensions() + add_exec_option_dimension(cls, 'mt_dop', mt_dop_dim(cls)) - def test_single_node_joins_with_limits_exhaustive(self, vector): - if self.exploration_strategy() != 'exhaustive': pytest.skip() - new_vector = deepcopy(vector) - new_vector.get_value('exec_option')['num_nodes'] = 1 - new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop') - del new_vector.get_value('exec_option')['batch_size'] # .test file sets batch_size - self.run_test_case('QueryTest/single-node-joins-with-limits-exhaustive', new_vector) + def test_basic_joins(self, vector): + self.run_test_case('QueryTest/joins', vector) @SkipIfFS.hbase @SkipIf.skip_hbase def test_joins_against_hbase(self, vector): - new_vector = deepcopy(vector) - new_vector.get_value('exec_option')['batch_size'] = vector.get_value('batch_size') - new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop') - self.run_test_case('QueryTest/joins-against-hbase', new_vector) + # TODO: Look into splitting up join tests to accomodate hbase. + # Joins with hbase tables produce drastically different results. 
+ self.run_test_case('QueryTest/joins-against-hbase', vector) def test_outer_joins(self, vector): - new_vector = deepcopy(vector) - new_vector.get_value('exec_option')['batch_size'] = vector.get_value('batch_size') - new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop') - self.run_test_case('QueryTest/outer-joins', new_vector) + self.run_test_case('QueryTest/outer-joins', vector) - def test_outer_to_inner_joins(self, vector): - new_vector = deepcopy(vector) - new_vector.get_value('exec_option')['enable_outer_join_to_inner_transformation']\ - = vector.get_value('enable_outer_join_to_inner_transformation') - self.run_test_case('QueryTest/outer-to-inner-joins', new_vector) + def test_empty_build_joins(self, vector): + self.run_test_case('QueryTest/empty-build-joins', vector) + + +class TestSingleNodeJoins(TestJoinBase): + + @classmethod + def add_test_dimensions(cls): + super(TestSingleNodeJoins, cls).add_test_dimensions() + # Redeclare exec options with num_nodes=1, batch_size=0. + cls.ImpalaTestMatrix.add_dimension( + create_exec_option_dimension(cluster_sizes=[1], batch_sizes=[0])) def test_single_node_nested_loop_joins(self, vector): # Test the execution of nested-loops joins for join types that can only be # executed in a single node (right [outer|semi|anti] and full outer joins). + self.run_test_case('QueryTest/single-node-nlj', vector) + + +class TestSingleNodeJoinsExhaustive(TestJoinBase): + + @classmethod + def add_test_dimensions(cls): + super(TestSingleNodeJoinsExhaustive, cls).add_test_dimensions() + if cls.exploration_strategy() != 'exhaustive': + # skip this test if not in exhaustive exploration. + pytest.skip("Only run in exhaustive exploration.") + + # Redeclare exec options with num_nodes=1, batch_size=0. 
+ cls.ImpalaTestMatrix.add_dimension( + create_exec_option_dimension(cluster_sizes=[1], batch_sizes=[0])) + add_exec_option_dimension(cls, 'mt_dop', mt_dop_dim(cls)) + + def test_single_node_joins_with_limits_exhaustive(self, vector): new_vector = deepcopy(vector) - new_vector.get_value('exec_option')['num_nodes'] = 1 - self.run_test_case('QueryTest/single-node-nlj', new_vector) + del new_vector.get_value('exec_option')['batch_size'] # .test file sets batch_size + self.run_test_case('QueryTest/single-node-joins-with-limits-exhaustive', new_vector) def test_single_node_nested_loop_joins_exhaustive(self, vector): - if self.exploration_strategy() != 'exhaustive': pytest.skip() - new_vector = deepcopy(vector) - new_vector.get_value('exec_option')['num_nodes'] = 1 - new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop') - self.run_test_case('QueryTest/single-node-nlj-exhaustive', new_vector) + # Test the execution of nested-loops joins for join types that can only be + # executed in a single node (right [outer|semi|anti] and full outer joins). 
+ self.run_test_case('QueryTest/single-node-nlj-exhaustive', vector) - def test_empty_build_joins(self, vector): - new_vector = deepcopy(vector) - new_vector.get_value('exec_option')['batch_size'] = vector.get_value('batch_size') - new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop') - self.run_test_case('QueryTest/empty-build-joins', new_vector) + +class TestOuterJoinToInnerTransformation(TestJoinBase): + + @classmethod + def add_test_dimensions(cls): + super(TestOuterJoinToInnerTransformation, cls).add_test_dimensions() + add_exec_option_dimension(cls, 'enable_outer_join_to_inner_transformation', + ENABLE_OUTER_JOIN_TO_INNER_TRANSFORMATION) + + def test_outer_to_inner_joins(self, vector): + self.run_test_case('QueryTest/outer-to-inner-joins', vector) + + +class TestMissTupleJoins(TestJoinBase): + + @classmethod + def add_test_dimensions(cls): + super(TestMissTupleJoins, cls).add_test_dimensions() + # Only need to run with single exec option dimension. + cls.ImpalaTestMatrix.add_dimension(create_single_exec_option_dimension()) def test_miss_tuple_joins(self, vector, unique_database): - new_vector = deepcopy(vector) - new_vector.get_value('exec_option')['batch_size'] = vector.get_value('batch_size') - new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop') - self.run_test_case('QueryTest/miss-tuple-joins', new_vector, unique_database) + self.run_test_case('QueryTest/miss-tuple-joins', vector, unique_database) -class TestTPCHJoinQueries(ImpalaTestSuite): + +class TestTPCHJoinQueries(TestJoinBase): # Uses the TPC-H dataset in order to have larger joins. Needed for example to test # the repartitioning codepaths. 
@@ -135,75 +169,61 @@ class TestTPCHJoinQueries(ImpalaTestSuite): @classmethod def add_test_dimensions(cls): super(TestTPCHJoinQueries, cls).add_test_dimensions() - cls.ImpalaTestMatrix.add_dimension( - ImpalaTestDimension('batch_size', *TestJoinQueries.BATCH_SIZES)) - cls.ImpalaTestMatrix.add_constraint(lambda v:\ - v.get_value('table_format').file_format in ['parquet']) - - if cls.exploration_strategy() != 'exhaustive': - # Cut down on execution time when not running in exhaustive mode. - cls.ImpalaTestMatrix.add_constraint(lambda v: v.get_value('batch_size') != 1) @classmethod def teardown_class(cls): - cls.client.execute('set mem_limit = 0'); + cls.client.execute('set mem_limit = 0') super(TestTPCHJoinQueries, cls).teardown_class() def test_outer_joins(self, vector): - new_vector = deepcopy(vector) - new_vector.get_value('exec_option')['batch_size'] = vector.get_value('batch_size') - self.run_test_case('tpch-outer-joins', new_vector) + self.run_test_case('tpch-outer-joins', vector) -class TestSemiJoinQueries(ImpalaTestSuite): - @classmethod - def get_workload(cls): - return 'functional-query' + +class TestSemiJoinQueries(TestJoinBase): @classmethod def add_test_dimensions(cls): super(TestSemiJoinQueries, cls).add_test_dimensions() - cls.ImpalaTestMatrix.add_dimension( - ImpalaTestDimension('batch_size', *TestJoinQueries.BATCH_SIZES)) - # Joins with hbase tables produce drastically different results. - cls.ImpalaTestMatrix.add_constraint(lambda v:\ - v.get_value('table_format').file_format in ['parquet']) - - if cls.exploration_strategy() != 'exhaustive': - # Cut down on execution time when not running in exhaustive mode. 
- cls.ImpalaTestMatrix.add_constraint(lambda v: v.get_value('batch_size') != 1) def __load_semi_join_tables(self, db_name): # Create and load fresh test tables for semi/anti-join tests fq_tbl_name_a = '%s.SemiJoinTblA' % db_name self.client.execute('create table %s (a int, b int, c int)' % fq_tbl_name_a) - self.client.execute('insert into %s values(1,1,1)' % fq_tbl_name_a); - self.client.execute('insert into %s values(1,1,10)' % fq_tbl_name_a); - self.client.execute('insert into %s values(1,2,10)' % fq_tbl_name_a); - self.client.execute('insert into %s values(1,3,10)' % fq_tbl_name_a); - self.client.execute('insert into %s values(NULL,NULL,30)' % fq_tbl_name_a); - self.client.execute('insert into %s values(2,4,30)' % fq_tbl_name_a); - self.client.execute('insert into %s values(2,NULL,20)' % fq_tbl_name_a); + self.client.execute('insert into %s values(1,1,1)' % fq_tbl_name_a) + self.client.execute('insert into %s values(1,1,10)' % fq_tbl_name_a) + self.client.execute('insert into %s values(1,2,10)' % fq_tbl_name_a) + self.client.execute('insert into %s values(1,3,10)' % fq_tbl_name_a) + self.client.execute('insert into %s values(NULL,NULL,30)' % fq_tbl_name_a) + self.client.execute('insert into %s values(2,4,30)' % fq_tbl_name_a) + self.client.execute('insert into %s values(2,NULL,20)' % fq_tbl_name_a) fq_tbl_name_b = '%s.SemiJoinTblB' % db_name self.client.execute('create table %s (a int, b int, c int)' % fq_tbl_name_b) - self.client.execute('insert into %s values(1,1,1)' % fq_tbl_name_b); - self.client.execute('insert into %s values(1,1,10)' % fq_tbl_name_b); - self.client.execute('insert into %s values(1,2,5)' % fq_tbl_name_b); - self.client.execute('insert into %s values(1,NULL,10)' % fq_tbl_name_b); - self.client.execute('insert into %s values(2,10,NULL)' % fq_tbl_name_b); - self.client.execute('insert into %s values(3,NULL,NULL)' % fq_tbl_name_b); - self.client.execute('insert into %s values(3,NULL,50)' % fq_tbl_name_b); + self.client.execute('insert into %s 
values(1,1,1)' % fq_tbl_name_b) + self.client.execute('insert into %s values(1,1,10)' % fq_tbl_name_b) + self.client.execute('insert into %s values(1,2,5)' % fq_tbl_name_b) + self.client.execute('insert into %s values(1,NULL,10)' % fq_tbl_name_b) + self.client.execute('insert into %s values(2,10,NULL)' % fq_tbl_name_b) + self.client.execute('insert into %s values(3,NULL,NULL)' % fq_tbl_name_b) + self.client.execute('insert into %s values(3,NULL,50)' % fq_tbl_name_b) def test_semi_joins(self, vector, unique_database): - new_vector = deepcopy(vector) - new_vector.get_value('exec_option')['batch_size'] = vector.get_value('batch_size') self.__load_semi_join_tables(unique_database) - self.run_test_case('QueryTest/semi-joins', new_vector, unique_database) + self.run_test_case('QueryTest/semi-joins', vector, unique_database) + + +class TestSemiJoinQueriesExhaustive(TestJoinBase): + + @classmethod + def add_test_dimensions(cls): + super(TestSemiJoinQueriesExhaustive, cls).add_test_dimensions() + if cls.exploration_strategy() != 'exhaustive': + # skip this test if not in exhaustive exploration. + pytest.skip("Only run in exhaustive exploration.") @pytest.mark.execute_serially def test_semi_joins_exhaustive(self, vector): """Expensive and memory-intensive semi-join tests.""" - if self.exploration_strategy() != 'exhaustive': pytest.skip() self.run_test_case('QueryTest/semi-joins-exhaustive', vector) @@ -218,7 +238,7 @@ class TestSpillingHashJoin(ImpalaTestSuite): super(TestSpillingHashJoin, cls).add_test_dimensions() # To cut down on test execution time, only run in exhaustive. if cls.exploration_strategy() != 'exhaustive': - cls.ImpalaTestMatrix.add_constraint(lambda v: False) + pytest.skip("Only run in exhaustive exploration.") cls.ImpalaTestMatrix.add_constraint( lambda v: v.get_value('table_format').file_format == 'parquet') cls.ImpalaTestMatrix.add_constraint(lambda v:
