(impala) 01/02: IMPALA-13323: Remove redundant tests in test_join_queries.py

joemcdonnell Mon, 26 Aug 2024 18:04:01 -0700

This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git


commit 6eabff9eab8c96b9cc6e68506dc9173722a07ad1
Author: Riza Suminto <[email protected]>
AuthorDate: Wed Aug 21 16:34:12 2024 -0700

    IMPALA-13323: Remove redundant tests in test_join_queries.py
    
    Before this patch, test_join_queries.py was expensive to run in
    exhaustive exploration because it ran many test dimension
    permutations without ever exercising some of the dimensions. The
    redundant tests are as follows:
    
    Have mt_dop dimension, but not exercising it:
    - test_outer_to_inner_joins
    - test_single_node_nested_loop_joins
    
    Have batch_size dimension but not exercising it:
    - test_outer_to_inner_joins
    - test_single_node_nested_loop_joins
    - test_single_node_nested_loop_joins_exhaustive
    - test_semi_joins_exhaustive
    
    Have enable_outer_join_to_inner_transformation dimension but not
    exercising it:
    - All TestJoinQueries except test_outer_to_inner_joins
    
    test_miss_tuple_joins is also valid to run with far fewer test
    dimensions because it mainly tests the correctness of predicate
    pushdown during planning.
    
    This patch reorganizes test_join_queries.py into several test classes
    where exec option dimensions are clearly declared and correctly
    exercised without duplication. The reductions are as follows:
    
    Before patch:
    41 core tests, 526 exhaustive tests
    
    After patch:
    28 core tests, 93 exhaustive tests
    
    Added validate_exec_option_dimension() in impala_test_suite.py to log
    a WARNING if there is a dimension whose name matches a query option
    name but that is not also registered in the 'exec_option' dimension.
    That log can be observed in log files such as
    logs/ee_tests/results/TEST-impala-parallel.xml. Also fixed a couple of
    flake8 issues found in impala_test_suite.py.
    
    Change-Id: I76efa82dad59dbb9f58a36a4a2938b5f73a382f6
    Reviewed-on: http://gerrit.cloudera.org:8080/21716
    Tested-by: Impala Public Jenkins <[email protected]>
    Reviewed-by: Michael Smith <[email protected]>
---
 tests/common/impala_test_suite.py     | 112 ++++++++++------
 tests/query_test/test_join_queries.py | 238 ++++++++++++++++++----------------
 2 files changed, 200 insertions(+), 150 deletions(-)

diff --git a/tests/common/impala_test_suite.py 
b/tests/common/impala_test_suite.py
index 6d73765da..14c378ab0 100644
--- a/tests/common/impala_test_suite.py
+++ b/tests/common/impala_test_suite.py
@@ -60,7 +60,7 @@ from tests.common.test_result_verifier import (
     verify_lineage,
     verify_raw_results,
     verify_runtime_profile)
-from tests.common.test_vector import ImpalaTestDimension
+from tests.common.test_vector import ImpalaTestDimension, EXEC_OPTION_KEY
 from tests.performance.query import Query
 from tests.performance.query_exec_functions import execute_using_jdbc
 from tests.performance.query_executor import JdbcQueryExecConfig
@@ -98,6 +98,9 @@ from tests.util.thrift_util import create_transport
 from hive_metastore import ThriftHiveMetastore
 from thrift.protocol import TBinaryProtocol
 
+# Import to validate query option names
+from ImpalaService.ttypes import TImpalaQueryOptions
+
 # Initializing the logger before conditional imports, since we will need it
 # for them.
 LOG = logging.getLogger('impala_test_suite')
@@ -123,8 +126,8 @@ IMPALAD_HS2_PORT = 
int(pytest.config.option.impalad_hs2_port)
 IMPALAD_HS2_HOST_PORT = IMPALAD_HOSTNAME + ":" + str(IMPALAD_HS2_PORT)
 # Calculate the hs2 ports based on the first hs2 port and the deltas of the 
beeswax ports
 IMPALAD_HS2_HOST_PORT_LIST = [
-    IMPALAD_HOSTNAME_LIST[i] + ':' +
-    str(IMPALAD_BEESWAX_PORT_LIST[i] - IMPALAD_BEESWAX_PORT + IMPALAD_HS2_PORT)
+    IMPALAD_HOSTNAME_LIST[i] + ':'
+    + str(IMPALAD_BEESWAX_PORT_LIST[i] - IMPALAD_BEESWAX_PORT + 
IMPALAD_HS2_PORT)
     for i in range(len(IMPALAD_HOST_PORT_LIST))
 ]
 
@@ -133,8 +136,8 @@ IMPALAD_HS2_HTTP_HOST_PORT = IMPALAD_HOSTNAME + ":" + 
str(IMPALAD_HS2_HTTP_PORT)
 # Calculate the hs2-http ports based on the first hs2-http port and the deltas 
of the
 # beeswax ports
 IMPALAD_HS2_HTTP_HOST_PORT_LIST = [
-    IMPALAD_HOSTNAME_LIST[i] + ':' +
-    str(IMPALAD_BEESWAX_PORT_LIST[i] - IMPALAD_BEESWAX_PORT + 
IMPALAD_HS2_HTTP_PORT)
+    IMPALAD_HOSTNAME_LIST[i] + ':'
+    + str(IMPALAD_BEESWAX_PORT_LIST[i] - IMPALAD_BEESWAX_PORT + 
IMPALAD_HS2_HTTP_PORT)
     for i in range(len(IMPALAD_HOST_PORT_LIST))
 ]
 
@@ -164,6 +167,10 @@ VARZ_URL = 
'http://{0}:25000/varz?json'.format(IMPALAD_HOSTNAME)
 
 GROUP_NAME = grp.getgrgid(pwd.getpwnam(getuser()).pw_gid).gr_name
 
+EXEC_OPTION_NAMES = set([val.lower()
+  for val in TImpalaQueryOptions._VALUES_TO_NAMES.values()])
+
+
 # Base class for Impala tests. All impala test cases should inherit from this 
class
 class ImpalaTestSuite(BaseTestSuite):
   @classmethod
@@ -426,7 +433,7 @@ class ImpalaTestSuite(BaseTestSuite):
     # Restore all the changed query options.
     for query_option in query_options_changed:
       query_option = query_option.upper()
-      if not query_option in self.default_query_options:
+      if query_option not in self.default_query_options:
         continue
       default_val = self.default_query_options[query_option]
       query_str = 'SET ' + query_option + '="' + default_val + '"'
@@ -535,7 +542,6 @@ class ImpalaTestSuite(BaseTestSuite):
       s = s.replace(k, v)
     return s
 
-
   def __verify_exceptions(self, expected_strs, actual_str, use_db):
     """
     Verifies that at least one of the strings in 'expected_str' is either:
@@ -584,7 +590,8 @@ class ImpalaTestSuite(BaseTestSuite):
                                      .replace('$EXTERNAL_WAREHOUSE_DIR',
                                               EXTERNAL_WAREHOUSE_DIR)
         if use_db:
-          test_section[section_name] = 
test_section[section_name].replace('$DATABASE', use_db)
+          test_section[section_name] = test_section[section_name].replace(
+              '$DATABASE', use_db)
     result_section, type_section = 'RESULTS', 'TYPES'
     if vector.get_value('protocol').startswith('hs2'):
       # hs2 or hs2-http
@@ -598,7 +605,6 @@ class ImpalaTestSuite(BaseTestSuite):
                        result_section, type_section, 
pytest.config.option.update_results,
                        replace_filenames_with_placeholder)
 
-
   def run_test_case(self, test_file_name, vector, use_db=None, 
multiple_impalad=False,
       encoding=None, test_file_vars=None):
     """
@@ -618,8 +624,9 @@ class ImpalaTestSuite(BaseTestSuite):
     values in queries before they are executed. Callers need to avoid using 
reserved key
     names, see 'reserved_keywords' below.
     """
+    self.validate_exec_option_dimension(vector)
     table_format_info = vector.get_value('table_format')
-    exec_options = vector.get_value('exec_option')
+    exec_options = vector.get_value(EXEC_OPTION_KEY)
     protocol = vector.get_value('protocol')
 
     target_impalad_clients = list()
@@ -662,11 +669,12 @@ class ImpalaTestSuite(BaseTestSuite):
         for query in query.split(';'):
           set_pattern_match = SET_PATTERN.match(query)
           if set_pattern_match:
-            query_options_changed.append(set_pattern_match.groups()[0])
-            assert set_pattern_match.groups()[0] not in 
vector.get_value("exec_option"), \
-                "%s cannot be set in  the '.test' file since it is in the test 
vector. " \
-                "Consider deepcopy()-ing the vector and removing this option 
in the " \
-                "python test." % set_pattern_match.groups()[0]
+            option_name = set_pattern_match.groups()[0]
+            query_options_changed.append(option_name)
+            assert option_name not in vector.get_value(EXEC_OPTION_KEY), (
+                "{} cannot be set in  the '.test' file since it is in the test 
vector. "
+                "Consider deepcopy()-ing the vector and removing this option 
in the "
+                "python test.".format(option_name))
           result = self.__execute_query(target_impalad_client, query, 
user=user)
       finally:
         if len(query_options_changed) > 0:
@@ -719,9 +727,9 @@ class ImpalaTestSuite(BaseTestSuite):
         query_section = test_section['HIVE_QUERY']
         exec_fn = __exec_in_hive
       else:
-        assert 0, ('Error in test file %s. Test cases require a ' +
-            '-- QUERY or HIVE_QUERY section.\n%s') %\
-            (test_file_name, pprint.pformat(test_section))
+        assert 0, ('Error in test file {}. Test cases require a '
+            '-- QUERY or HIVE_QUERY section.\n{}').format(
+                test_file_name, pprint.pformat(test_section))
 
       # TODO: support running query tests against different scale factors
       query = QueryTestSectionReader.build_query(
@@ -733,9 +741,6 @@ class ImpalaTestSuite(BaseTestSuite):
       result = None
       try:
         result = exec_fn(query, user=test_section.get('USER', '').strip() or 
None)
-        user = None
-        if 'USER' in test_section:
-          user = test_section['USER'].strip()
       except Exception as e:
         if 'CATCH' in test_section:
           self.__verify_exceptions(test_section['CATCH'], str(e), use_db)
@@ -798,18 +803,18 @@ class ImpalaTestSuite(BaseTestSuite):
           test_section[rt_profile_info] = "".join(rt_profile)
 
       if 'LINEAGE' in test_section:
-         # Lineage flusher thread runs every 5s by default and is not 
configurable. Wait
-         # for that period. (TODO) Get rid of this for faster test execution.
-         time.sleep(5)
-         current_query_lineage = self.get_query_lineage(result.query_id, 
lineage_log_dir)
-         assert current_query_lineage is not "",\
-             "No lineage found for query %s in dir %s" %\
-             (result.query_id, lineage_log_dir)
-         if pytest.config.option.update_results:
-           test_section['LINEAGE'] = json.dumps(current_query_lineage, 
indent=2,
-               separators=(',', ': '))
-         else:
-           verify_lineage(json.loads(test_section['LINEAGE']), 
current_query_lineage)
+        # Lineage flusher thread runs every 5s by default and is not 
configurable. Wait
+        # for that period. (TODO) Get rid of this for faster test execution.
+        time.sleep(5)
+        current_query_lineage = self.get_query_lineage(result.query_id, 
lineage_log_dir)
+        assert current_query_lineage != "", (
+            "No lineage found for query {} in dir {}".format(
+              result.query_id, lineage_log_dir))
+        if pytest.config.option.update_results:
+          test_section['LINEAGE'] = json.dumps(current_query_lineage, indent=2,
+              separators=(',', ': '))
+        else:
+          verify_lineage(json.loads(test_section['LINEAGE']), 
current_query_lineage)
 
       if 'DML_RESULTS' in test_section:
         assert 'ERRORS' not in test_section
@@ -823,7 +828,7 @@ class ImpalaTestSuite(BaseTestSuite):
             update_section=pytest.config.option.update_results)
     if pytest.config.option.update_results:
       output_file = os.path.join(EE_TEST_LOGS_DIR,
-                                 test_file_name.replace('/','_') + ".test")
+                                 test_file_name.replace('/', '_') + ".test")
       write_test_file(output_file, sections, encoding=encoding)
 
   def get_query_lineage(self, query_id, lineage_dir):
@@ -855,8 +860,8 @@ class ImpalaTestSuite(BaseTestSuite):
   @classmethod
   def change_database(cls, impala_client, table_format=None,
       db_name=None, scale_factor=None):
-    if db_name == None:
-      assert table_format != None
+    if db_name is None:
+      assert table_format is not None
       db_name = QueryTestSectionReader.get_db_name(table_format,
           scale_factor if scale_factor else '')
     query = 'use %s' % db_name
@@ -932,14 +937,16 @@ class ImpalaTestSuite(BaseTestSuite):
     return end_time - start_time
 
   def execute_query_using_client(self, client, query, vector):
+    self.validate_exec_option_dimension(vector)
     self.change_database(client, vector.get_value('table_format'))
-    query_options = vector.get_value('exec_option')
+    query_options = vector.get_value(EXEC_OPTION_KEY)
     if query_options is not None: client.set_configuration(query_options)
     return client.execute(query)
 
   def execute_query_async_using_client(self, client, query, vector):
+    self.validate_exec_option_dimension(vector)
     self.change_database(client, vector.get_value('table_format'))
-    query_options = vector.get_value('exec_option')
+    query_options = vector.get_value(EXEC_OPTION_KEY)
     if query_options is not None: client.set_configuration(query_options)
     return client.execute_async(query)
 
@@ -971,7 +978,7 @@ class ImpalaTestSuite(BaseTestSuite):
     assert len(result.data) <= 1, 'Multiple values returned from scalar'
     return result.data[0] if len(result.data) == 1 else None
 
-  def exec_and_compare_hive_and_impala_hs2(self, stmt, compare = lambda x, y: 
x == y):
+  def exec_and_compare_hive_and_impala_hs2(self, stmt, compare=lambda x, y: x 
== y):
     """Compare Hive and Impala results when executing the same statment over 
HS2"""
     # execute_using_jdbc expects a Query object. Convert the query string into 
a Query
     # object
@@ -1025,7 +1032,7 @@ class ImpalaTestSuite(BaseTestSuite):
 
   def clone_table(self, src_tbl, dst_tbl, recover_partitions, vector):
     src_loc = self._get_table_location(src_tbl, vector)
-    self.client.execute("create external table {0} like {1} location '{2}'"\
+    self.client.execute("create external table {0} like {1} location '{2}'"
         .format(dst_tbl, src_tbl, src_loc))
     if recover_partitions:
       self.client.execute("alter table {0} recover partitions".format(dst_tbl))
@@ -1033,7 +1040,7 @@ class ImpalaTestSuite(BaseTestSuite):
   def appx_equals(self, a, b, diff_perc):
     """Returns True if 'a' and 'b' are within 'diff_perc' percent of each 
other,
     False otherwise. 'diff_perc' must be a float in [0,1]."""
-    if a == b: return True # Avoid division by 0
+    if a == b: return True  # Avoid division by 0
     assert abs(a - b) / float(max(abs(a), abs(b))) <= diff_perc
 
   def _get_table_location(self, table_name, vector):
@@ -1349,6 +1356,29 @@ class ImpalaTestSuite(BaseTestSuite):
             str(e))
         time.sleep(1)
 
+  def validate_exec_option_dimension(self, vector):
+    """Validate that test dimension with name matching query option name is
+    also registered in 'exec_option' dimension."""
+    option_dim_names = []
+    exec_option = dict()
+    for vector_value in vector.vector_values:
+      if vector_value.name == EXEC_OPTION_KEY:
+        exec_option = vector.get_value(EXEC_OPTION_KEY)
+      elif vector_value.name.lower() in EXEC_OPTION_NAMES:
+        option_dim_names.append(vector_value.name.lower())
+
+    if not option_dim_names:
+      return
+
+    for name in option_dim_names:
+      # TODO: enforce these warnings by changing them into pytest.fail()
+      if name not in exec_option:
+        LOG.warn("Exec option {} declared as independent dimension but not 
inserted "
+                 "into {} dimension.".format(name, EXEC_OPTION_KEY))
+      elif vector.get_value(name) != exec_option[name]:
+        LOG.warn("{}[{}]={} does not match against dimension {}={}.".format(
+          EXEC_OPTION_KEY, name, exec_option[name], name, 
vector.get_value(name)))
+
   @staticmethod
   def get_random_name(prefix='', length=5):
     """
diff --git a/tests/query_test/test_join_queries.py 
b/tests/query_test/test_join_queries.py
index 8bfc7e7f0..2db2714fe 100644
--- a/tests/query_test/test_join_queries.py
+++ b/tests/query_test/test_join_queries.py
@@ -23,19 +23,36 @@ from copy import deepcopy
 
 from tests.common.impala_test_suite import ImpalaTestSuite
 from tests.common.skip import SkipIf, SkipIfFS
-from tests.common.test_vector import ImpalaTestDimension
 from tests.common.test_dimensions import (
+    add_exec_option_dimension,
     add_mandatory_exec_option,
     create_exec_option_dimension,
+    create_single_exec_option_dimension,
     create_table_format_dimension)
 
 
-class TestJoinQueries(ImpalaTestSuite):
-  BATCH_SIZES = [0, 1]
-  MT_DOP_VALUES = [0, 4]
-  # Additional values for exhaustive tests.
-  MT_DOP_VALUES_EXHAUSTIVE = [1]
-  ENABLE_OUTER_JOIN_TO_INNER_TRANSFORMATION = ['false', 'true']
+ENABLE_OUTER_JOIN_TO_INNER_TRANSFORMATION = ['false', 'true']
+
+
+def batch_size_dim(cls):
+  if cls.exploration_strategy() == 'exhaustive':
+    return [0, 1]
+  else:
+    return [0]
+
+
+def mt_dop_dim(cls):
+  if cls.exploration_strategy() == 'exhaustive':
+    return [0, 1, 4]
+  else:
+    return [0, 4]
+
+
+class TestJoinBase(ImpalaTestSuite):
+  """The base class for test join classes.
+  Intended to provide subclasses with default test dimensions declaration
+  and constraints through add_test_dimensions() both in core and
+  exhaustive exploration."""
 
   @classmethod
   def get_workload(cls):
@@ -43,88 +60,105 @@ class TestJoinQueries(ImpalaTestSuite):
 
   @classmethod
   def add_test_dimensions(cls):
-    super(TestJoinQueries, cls).add_test_dimensions()
-    cls.ImpalaTestMatrix.add_dimension(
-        ImpalaTestDimension('batch_size', *TestJoinQueries.BATCH_SIZES))
-    mt_dop_values = cls.MT_DOP_VALUES
-    if cls.exploration_strategy() == 'exhaustive':
-      mt_dop_values += cls.MT_DOP_VALUES_EXHAUSTIVE
+    super(TestJoinBase, cls).add_test_dimensions()
+
+    # Set exec options
     cls.ImpalaTestMatrix.add_dimension(
-        ImpalaTestDimension('mt_dop', *mt_dop_values))
-    # TODO: Look into splitting up join tests to accomodate hbase.
-    # Joins with hbase tables produce drastically different results.
-    cls.ImpalaTestMatrix.add_constraint(lambda v:\
+        create_exec_option_dimension(batch_sizes=batch_size_dim(cls)))
+
+    cls.ImpalaTestMatrix.add_constraint(lambda v:
         v.get_value('table_format').file_format in ['parquet'])
 
-    if cls.exploration_strategy() != 'exhaustive':
-      # Cut down on execution time when not running in exhaustive mode.
-      cls.ImpalaTestMatrix.add_constraint(lambda v: v.get_value('batch_size') 
!= 1)
 
-    cls.ImpalaTestMatrix.add_dimension(
-        ImpalaTestDimension('enable_outer_join_to_inner_transformation',
-        *TestJoinQueries.ENABLE_OUTER_JOIN_TO_INNER_TRANSFORMATION))
+class TestJoinQueries(TestJoinBase):
 
-  def test_basic_joins(self, vector):
-    new_vector = deepcopy(vector)
-    new_vector.get_value('exec_option')['batch_size'] = 
vector.get_value('batch_size')
-    new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop')
-    self.run_test_case('QueryTest/joins', new_vector)
+  @classmethod
+  def add_test_dimensions(cls):
+    super(TestJoinQueries, cls).add_test_dimensions()
+    add_exec_option_dimension(cls, 'mt_dop', mt_dop_dim(cls))
 
-  def test_single_node_joins_with_limits_exhaustive(self, vector):
-    if self.exploration_strategy() != 'exhaustive': pytest.skip()
-    new_vector = deepcopy(vector)
-    new_vector.get_value('exec_option')['num_nodes'] = 1
-    new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop')
-    del new_vector.get_value('exec_option')['batch_size']  # .test file sets 
batch_size
-    self.run_test_case('QueryTest/single-node-joins-with-limits-exhaustive', 
new_vector)
+  def test_basic_joins(self, vector):
+    self.run_test_case('QueryTest/joins', vector)
 
   @SkipIfFS.hbase
   @SkipIf.skip_hbase
   def test_joins_against_hbase(self, vector):
-    new_vector = deepcopy(vector)
-    new_vector.get_value('exec_option')['batch_size'] = 
vector.get_value('batch_size')
-    new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop')
-    self.run_test_case('QueryTest/joins-against-hbase', new_vector)
+    # TODO: Look into splitting up join tests to accomodate hbase.
+    # Joins with hbase tables produce drastically different results.
+    self.run_test_case('QueryTest/joins-against-hbase', vector)
 
   def test_outer_joins(self, vector):
-    new_vector = deepcopy(vector)
-    new_vector.get_value('exec_option')['batch_size'] = 
vector.get_value('batch_size')
-    new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop')
-    self.run_test_case('QueryTest/outer-joins', new_vector)
+    self.run_test_case('QueryTest/outer-joins', vector)
 
-  def test_outer_to_inner_joins(self, vector):
-    new_vector = deepcopy(vector)
-    
new_vector.get_value('exec_option')['enable_outer_join_to_inner_transformation']\
-        = vector.get_value('enable_outer_join_to_inner_transformation')
-    self.run_test_case('QueryTest/outer-to-inner-joins', new_vector)
+  def test_empty_build_joins(self, vector):
+    self.run_test_case('QueryTest/empty-build-joins', vector)
+
+
+class TestSingleNodeJoins(TestJoinBase):
+
+  @classmethod
+  def add_test_dimensions(cls):
+    super(TestSingleNodeJoins, cls).add_test_dimensions()
+    # Redeclare exec options with num_nodes=1, batch_size=0.
+    cls.ImpalaTestMatrix.add_dimension(
+        create_exec_option_dimension(cluster_sizes=[1], batch_sizes=[0]))
 
   def test_single_node_nested_loop_joins(self, vector):
     # Test the execution of nested-loops joins for join types that can only be
     # executed in a single node (right [outer|semi|anti] and full outer joins).
+    self.run_test_case('QueryTest/single-node-nlj', vector)
+
+
+class TestSingleNodeJoinsExhaustive(TestJoinBase):
+
+  @classmethod
+  def add_test_dimensions(cls):
+    super(TestSingleNodeJoinsExhaustive, cls).add_test_dimensions()
+    if cls.exploration_strategy() != 'exhaustive':
+      # skip this test if not in exhaustive exploration.
+      pytest.skip("Only run in exhaustive exploration.")
+
+    # Redeclare exec options with num_nodes=1, batch_size=0.
+    cls.ImpalaTestMatrix.add_dimension(
+        create_exec_option_dimension(cluster_sizes=[1], batch_sizes=[0]))
+    add_exec_option_dimension(cls, 'mt_dop', mt_dop_dim(cls))
+
+  def test_single_node_joins_with_limits_exhaustive(self, vector):
     new_vector = deepcopy(vector)
-    new_vector.get_value('exec_option')['num_nodes'] = 1
-    self.run_test_case('QueryTest/single-node-nlj', new_vector)
+    del new_vector.get_value('exec_option')['batch_size']  # .test file sets 
batch_size
+    self.run_test_case('QueryTest/single-node-joins-with-limits-exhaustive', 
new_vector)
 
   def test_single_node_nested_loop_joins_exhaustive(self, vector):
-    if self.exploration_strategy() != 'exhaustive': pytest.skip()
-    new_vector = deepcopy(vector)
-    new_vector.get_value('exec_option')['num_nodes'] = 1
-    new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop')
-    self.run_test_case('QueryTest/single-node-nlj-exhaustive', new_vector)
+    # Test the execution of nested-loops joins for join types that can only be
+    # executed in a single node (right [outer|semi|anti] and full outer joins).
+    self.run_test_case('QueryTest/single-node-nlj-exhaustive', vector)
 
-  def test_empty_build_joins(self, vector):
-    new_vector = deepcopy(vector)
-    new_vector.get_value('exec_option')['batch_size'] = 
vector.get_value('batch_size')
-    new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop')
-    self.run_test_case('QueryTest/empty-build-joins', new_vector)
+
+class TestOuterJoinToInnerTransformation(TestJoinBase):
+
+  @classmethod
+  def add_test_dimensions(cls):
+    super(TestOuterJoinToInnerTransformation, cls).add_test_dimensions()
+    add_exec_option_dimension(cls, 'enable_outer_join_to_inner_transformation',
+                              ENABLE_OUTER_JOIN_TO_INNER_TRANSFORMATION)
+
+  def test_outer_to_inner_joins(self, vector):
+    self.run_test_case('QueryTest/outer-to-inner-joins', vector)
+
+
+class TestMissTupleJoins(TestJoinBase):
+
+  @classmethod
+  def add_test_dimensions(cls):
+    super(TestMissTupleJoins, cls).add_test_dimensions()
+    # Only need to run with single exec option dimension.
+    cls.ImpalaTestMatrix.add_dimension(create_single_exec_option_dimension())
 
   def test_miss_tuple_joins(self, vector, unique_database):
-    new_vector = deepcopy(vector)
-    new_vector.get_value('exec_option')['batch_size'] = 
vector.get_value('batch_size')
-    new_vector.get_value('exec_option')['mt_dop'] = vector.get_value('mt_dop')
-    self.run_test_case('QueryTest/miss-tuple-joins', new_vector, 
unique_database)
+    self.run_test_case('QueryTest/miss-tuple-joins', vector, unique_database)
 
-class TestTPCHJoinQueries(ImpalaTestSuite):
+
+class TestTPCHJoinQueries(TestJoinBase):
   # Uses the TPC-H dataset in order to have larger joins. Needed for example 
to test
   # the repartitioning codepaths.
 
@@ -135,75 +169,61 @@ class TestTPCHJoinQueries(ImpalaTestSuite):
   @classmethod
   def add_test_dimensions(cls):
     super(TestTPCHJoinQueries, cls).add_test_dimensions()
-    cls.ImpalaTestMatrix.add_dimension(
-        ImpalaTestDimension('batch_size', *TestJoinQueries.BATCH_SIZES))
-    cls.ImpalaTestMatrix.add_constraint(lambda v:\
-        v.get_value('table_format').file_format in ['parquet'])
-
-    if cls.exploration_strategy() != 'exhaustive':
-      # Cut down on execution time when not running in exhaustive mode.
-      cls.ImpalaTestMatrix.add_constraint(lambda v: v.get_value('batch_size') 
!= 1)
 
   @classmethod
   def teardown_class(cls):
-    cls.client.execute('set mem_limit = 0');
+    cls.client.execute('set mem_limit = 0')
     super(TestTPCHJoinQueries, cls).teardown_class()
 
   def test_outer_joins(self, vector):
-    new_vector = deepcopy(vector)
-    new_vector.get_value('exec_option')['batch_size'] = 
vector.get_value('batch_size')
-    self.run_test_case('tpch-outer-joins', new_vector)
+    self.run_test_case('tpch-outer-joins', vector)
 
-class TestSemiJoinQueries(ImpalaTestSuite):
-  @classmethod
-  def get_workload(cls):
-    return 'functional-query'
+
+class TestSemiJoinQueries(TestJoinBase):
 
   @classmethod
   def add_test_dimensions(cls):
     super(TestSemiJoinQueries, cls).add_test_dimensions()
-    cls.ImpalaTestMatrix.add_dimension(
-        ImpalaTestDimension('batch_size', *TestJoinQueries.BATCH_SIZES))
-    # Joins with hbase tables produce drastically different results.
-    cls.ImpalaTestMatrix.add_constraint(lambda v:\
-        v.get_value('table_format').file_format in ['parquet'])
-
-    if cls.exploration_strategy() != 'exhaustive':
-      # Cut down on execution time when not running in exhaustive mode.
-      cls.ImpalaTestMatrix.add_constraint(lambda v: v.get_value('batch_size') 
!= 1)
 
   def __load_semi_join_tables(self, db_name):
     # Create and load fresh test tables for semi/anti-join tests
     fq_tbl_name_a = '%s.SemiJoinTblA' % db_name
     self.client.execute('create table %s (a int, b int, c int)' % 
fq_tbl_name_a)
-    self.client.execute('insert into %s values(1,1,1)' % fq_tbl_name_a);
-    self.client.execute('insert into %s values(1,1,10)' % fq_tbl_name_a);
-    self.client.execute('insert into %s values(1,2,10)' % fq_tbl_name_a);
-    self.client.execute('insert into %s values(1,3,10)' % fq_tbl_name_a);
-    self.client.execute('insert into %s values(NULL,NULL,30)'  % 
fq_tbl_name_a);
-    self.client.execute('insert into %s values(2,4,30)' % fq_tbl_name_a);
-    self.client.execute('insert into %s values(2,NULL,20)' % fq_tbl_name_a);
+    self.client.execute('insert into %s values(1,1,1)' % fq_tbl_name_a)
+    self.client.execute('insert into %s values(1,1,10)' % fq_tbl_name_a)
+    self.client.execute('insert into %s values(1,2,10)' % fq_tbl_name_a)
+    self.client.execute('insert into %s values(1,3,10)' % fq_tbl_name_a)
+    self.client.execute('insert into %s values(NULL,NULL,30)' % fq_tbl_name_a)
+    self.client.execute('insert into %s values(2,4,30)' % fq_tbl_name_a)
+    self.client.execute('insert into %s values(2,NULL,20)' % fq_tbl_name_a)
 
     fq_tbl_name_b = '%s.SemiJoinTblB' % db_name
     self.client.execute('create table %s (a int, b int, c int)' % 
fq_tbl_name_b)
-    self.client.execute('insert into %s values(1,1,1)' % fq_tbl_name_b);
-    self.client.execute('insert into %s values(1,1,10)' % fq_tbl_name_b);
-    self.client.execute('insert into %s values(1,2,5)' % fq_tbl_name_b);
-    self.client.execute('insert into %s values(1,NULL,10)' % fq_tbl_name_b);
-    self.client.execute('insert into %s values(2,10,NULL)' % fq_tbl_name_b);
-    self.client.execute('insert into %s values(3,NULL,NULL)' % fq_tbl_name_b);
-    self.client.execute('insert into %s values(3,NULL,50)' % fq_tbl_name_b);
+    self.client.execute('insert into %s values(1,1,1)' % fq_tbl_name_b)
+    self.client.execute('insert into %s values(1,1,10)' % fq_tbl_name_b)
+    self.client.execute('insert into %s values(1,2,5)' % fq_tbl_name_b)
+    self.client.execute('insert into %s values(1,NULL,10)' % fq_tbl_name_b)
+    self.client.execute('insert into %s values(2,10,NULL)' % fq_tbl_name_b)
+    self.client.execute('insert into %s values(3,NULL,NULL)' % fq_tbl_name_b)
+    self.client.execute('insert into %s values(3,NULL,50)' % fq_tbl_name_b)
 
   def test_semi_joins(self, vector, unique_database):
-    new_vector = deepcopy(vector)
-    new_vector.get_value('exec_option')['batch_size'] = 
vector.get_value('batch_size')
     self.__load_semi_join_tables(unique_database)
-    self.run_test_case('QueryTest/semi-joins', new_vector, unique_database)
+    self.run_test_case('QueryTest/semi-joins', vector, unique_database)
+
+
+class TestSemiJoinQueriesExhaustive(TestJoinBase):
+
+  @classmethod
+  def add_test_dimensions(cls):
+    super(TestSemiJoinQueriesExhaustive, cls).add_test_dimensions()
+    if cls.exploration_strategy() != 'exhaustive':
+      # skip this test if not in exhaustive exploration.
+      pytest.skip("Only run in exhaustive exploration.")
 
   @pytest.mark.execute_serially
   def test_semi_joins_exhaustive(self, vector):
     """Expensive and memory-intensive semi-join tests."""
-    if self.exploration_strategy() != 'exhaustive': pytest.skip()
     self.run_test_case('QueryTest/semi-joins-exhaustive', vector)
 
 
@@ -218,7 +238,7 @@ class TestSpillingHashJoin(ImpalaTestSuite):
     super(TestSpillingHashJoin, cls).add_test_dimensions()
     # To cut down on test execution time, only run in exhaustive.
     if cls.exploration_strategy() != 'exhaustive':
-      cls.ImpalaTestMatrix.add_constraint(lambda v: False)
+      pytest.skip("Only run in exhaustive exploration.")
     cls.ImpalaTestMatrix.add_constraint(
         lambda v: v.get_value('table_format').file_format == 'parquet')
     cls.ImpalaTestMatrix.add_constraint(lambda v:

(impala) 01/02: IMPALA-13323: Remove redundant tests in test_join_queries.py

Reply via email to