Repository: impala Updated Branches: refs/heads/master 649f175df -> 971cf179f
IMPALA-7460 part 1: require user to install Paramiko and Fabric - Remove Fabric and Paramiko as requirements. They aren't needed by anything in buildall.sh. - Add a means to install into the impala-python virtual environment by hand. impala-pip is fine for this. - Add another requirements file for extended testing. The dependency situation is messy and untangling that out of impala-python and into lib/python should be out of the scope of IMPALA-7460. - Update core tests, which cover real regressions that have happened in the past, to run against locations that don't require a Paramiko import. This moves some logic out of concurrent_select.py into a thinner module. - Insulate ssh_util from globally-scoped import so that it only imports when needed. Testing: - This works in my development environment. - This works in my downstream stress and query gen environments. - This works when doing a full data load. - Impala still builds on a variety of OSs. Todo: - A subsequent review will update the versions. Change-Id: Ibf9010a0387b52c95b7bda5d1d4606eba1008b65 Reviewed-on: http://gerrit.cloudera.org:8080/11264 Reviewed-by: Michael Brown <[email protected]> Tested-by: Impala Public Jenkins <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/impala/repo Commit: http://git-wip-us.apache.org/repos/asf/impala/commit/971cf179 Tree: http://git-wip-us.apache.org/repos/asf/impala/tree/971cf179 Diff: http://git-wip-us.apache.org/repos/asf/impala/diff/971cf179 Branch: refs/heads/master Commit: 971cf179f674b312cd133397fe74ee8c5f29a215 Parents: 649f175 Author: Michael Brown <[email protected]> Authored: Mon Aug 20 10:24:37 2018 -0700 Committer: Impala Public Jenkins <[email protected]> Committed: Thu Aug 23 00:20:15 2018 +0000 ---------------------------------------------------------------------- bin/impala-pip | 21 ++++++ infra/python/deps/compiled-requirements.txt | 4 -- .../python/deps/extended-test-requirements.txt | 24 +++++++ tests/comparison/cluster.py | 4 +- tests/comparison/leopard/impala_docker_env.py | 7 +- tests/infra/test_stress_infra.py | 10 ++- tests/stress/concurrent_select.py | 69 ++------------------ tests/util/cluster_controller.py | 14 ++-- tests/util/parse_util.py | 42 ++++++++++++ tests/util/ssh_util.py | 10 ++- tests/util/test_file_parser.py | 34 +++++++++- 11 files changed, 157 insertions(+), 82 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/impala/blob/971cf179/bin/impala-pip ---------------------------------------------------------------------- diff --git a/bin/impala-pip b/bin/impala-pip new file mode 100755 index 0000000..0041600 --- /dev/null +++ b/bin/impala-pip @@ -0,0 +1,21 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +source "$(dirname "$0")/impala-python-common.sh" +exec "$PY_DIR/env/bin/python" "$PY_DIR/env/bin/pip" "$@" http://git-wip-us.apache.org/repos/asf/impala/blob/971cf179/infra/python/deps/compiled-requirements.txt ---------------------------------------------------------------------- diff --git a/infra/python/deps/compiled-requirements.txt b/infra/python/deps/compiled-requirements.txt index 2c5590e..72e939a 100644 --- a/infra/python/deps/compiled-requirements.txt +++ b/infra/python/deps/compiled-requirements.txt @@ -19,10 +19,6 @@ # after the toolchain is bootstrapped. Installed after requirements.txt argparse == 1.4.0 -Fabric == 1.10.2 - paramiko == 1.15.2 - ecdsa == 0.13 - pycrypto == 2.6.1 impyla == 0.14.0 bitarray == 0.8.1 sasl == 0.1.3 http://git-wip-us.apache.org/repos/asf/impala/blob/971cf179/infra/python/deps/extended-test-requirements.txt ---------------------------------------------------------------------- diff --git a/infra/python/deps/extended-test-requirements.txt b/infra/python/deps/extended-test-requirements.txt new file mode 100644 index 0000000..c62c99a --- /dev/null +++ b/infra/python/deps/extended-test-requirements.txt @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Requirements for use of test tooling (tests/comparison and tests/stress). + +# IMPALA-7460 TODO: upgrade Fabric and Paramiko +Fabric == 1.10.2 + paramiko == 1.15.2 + ecdsa == 0.13 + pycrypto == 2.6.1 http://git-wip-us.apache.org/repos/asf/impala/blob/971cf179/tests/comparison/cluster.py ---------------------------------------------------------------------- diff --git a/tests/comparison/cluster.py b/tests/comparison/cluster.py index 1cf9516..a2d8fc0 100644 --- a/tests/comparison/cluster.py +++ b/tests/comparison/cluster.py @@ -48,7 +48,6 @@ from zipfile import ZipFile from db_connection import HiveConnection, ImpalaConnection from tests.common.errors import Timeout from tests.util.shell_util import shell as local_shell -from tests.util.ssh_util import SshClient from tests.util.parse_util import parse_glog, parse_mem_to_mb LOG = logging.getLogger(os.path.splitext(os.path.basename(__file__))[0]) @@ -292,6 +291,9 @@ class CmCluster(Cluster): if clients: client = clients.pop() else: + # IMPALA-7460: Insulate this import away from the global context so as to avoid + # requiring Paramiko unless it's absolutely needed. + from tests.util.ssh_util import SshClient LOG.debug("Creating new SSH client for %s", host_name) client = SshClient() client.connect(host_name, username=self.ssh_user, key_filename=self.ssh_key_file) http://git-wip-us.apache.org/repos/asf/impala/blob/971cf179/tests/comparison/leopard/impala_docker_env.py ---------------------------------------------------------------------- diff --git a/tests/comparison/leopard/impala_docker_env.py b/tests/comparison/leopard/impala_docker_env.py index 38555a4..a837c00 100755 --- a/tests/comparison/leopard/impala_docker_env.py +++ b/tests/comparison/leopard/impala_docker_env.py @@ -18,7 +18,12 @@ '''This module generates a docker environment for a job''' from __future__ import division -from fabric.api import sudo, run, settings +try: + from fabric.api import sudo, run, settings +except ImportError as e: + raise Exception( + "Please run impala-pip install -r $IMPALA_HOME/infra/python/deps/extended-test-" + "requirements.txt:\n{0}".format(str(e))) from logging import getLogger from os.path import ( join as join_path, http://git-wip-us.apache.org/repos/asf/impala/blob/971cf179/tests/infra/test_stress_infra.py ---------------------------------------------------------------------- diff --git a/tests/infra/test_stress_infra.py b/tests/infra/test_stress_infra.py index 58f7625..7e97ffa 100644 --- a/tests/infra/test_stress_infra.py +++ b/tests/infra/test_stress_infra.py @@ -24,12 +24,10 @@ import pytest from decimal import Decimal from tests.common.impala_test_suite import ImpalaTestSuite -from tests.stress.concurrent_select import ( - EXPECTED_TPCDS_QUERIES_COUNT, - EXPECTED_TPCH_NESTED_QUERIES_COUNT, - EXPECTED_TPCH_QUERIES_COUNT, - load_tpc_queries, - match_memory_estimate) +from tests.util.parse_util import ( + EXPECTED_TPCDS_QUERIES_COUNT, EXPECTED_TPCH_NESTED_QUERIES_COUNT, + EXPECTED_TPCH_QUERIES_COUNT, match_memory_estimate) +from tests.util.test_file_parser import load_tpc_queries class TestStressInfra(ImpalaTestSuite): http://git-wip-us.apache.org/repos/asf/impala/blob/971cf179/tests/stress/concurrent_select.py ---------------------------------------------------------------------- diff --git a/tests/stress/concurrent_select.py b/tests/stress/concurrent_select.py index 64e06e1..71fc146 100755 --- a/tests/stress/concurrent_select.py +++ b/tests/stress/concurrent_select.py @@ -85,25 +85,13 @@ from tests.comparison.db_types import Int, TinyInt, SmallInt, BigInt from tests.comparison.model_translator import SqlWriter from tests.comparison.query_generator import QueryGenerator from tests.comparison.query_profile import DefaultProfile -from tests.util.parse_util import parse_mem_to_mb +from tests.util.parse_util import ( + EXPECTED_TPCDS_QUERIES_COUNT, EXPECTED_TPCH_NESTED_QUERIES_COUNT, + EXPECTED_TPCH_QUERIES_COUNT, match_memory_estimate, parse_mem_to_mb) from tests.util.thrift_util import op_handle_to_query_id LOG = logging.getLogger(os.path.splitext(os.path.basename(__file__))[0]) -# IMPALA-6715: Every so often the stress test or the TPC workload directories get -# changed, and the stress test loses the ability to run the full set of queries. Set -# these constants and assert that when a workload is used, all the queries we expect to -# use are there. -EXPECTED_TPCDS_QUERIES_COUNT = 71 -EXPECTED_TPCH_NESTED_QUERIES_COUNT = 22 -EXPECTED_TPCH_QUERIES_COUNT = 22 - -# Regex to extract the estimated memory from an explain plan. -# The unit prefixes can be found in -# fe/src/main/java/org/apache/impala/common/PrintUtils.java -MEM_ESTIMATE_PATTERN = re.compile( - r"Per-Host Resource Estimates: Memory=(\d+\.?\d*)(P|T|G|M|K)?B") - PROFILES_DIR = "profiles" RESULT_HASHES_DIR = "result_hashes" @@ -1230,25 +1218,10 @@ class QueryRunner(object): def load_tpc_queries(workload): """Returns a list of TPC queries. 'workload' should either be 'tpch' or 'tpcds'.""" LOG.info("Loading %s queries", workload) - queries = list() - query_dir = os.path.join( - os.path.dirname(__file__), "..", "..", "testdata", "workloads", workload, "queries") - # IMPALA-6715 and others from the past: This pattern enforces the queries we actually - # find. Both workload directories contain other queries that are not part of the TPC - # spec. - file_name_pattern = re.compile(r"^{0}-(q.*).test$".format(workload)) - for query_file in os.listdir(query_dir): - match = file_name_pattern.search(query_file) - if not match: - continue - file_path = os.path.join(query_dir, query_file) - file_queries = load_queries_from_test_file(file_path) - if len(file_queries) != 1: - raise Exception( - "Expected exactly 1 query to be in file %s but got %s" - % (file_path, len(file_queries))) - query = file_queries[0] - query.name = match.group(1) + queries = [] + for query_text in test_file_parser.load_tpc_queries(workload): + query = Query() + query.sql = query_text queries.append(query) return queries @@ -1537,34 +1510,6 @@ def populate_runtime_info(query, impala, converted_args, timeout_secs=maxint): LOG.debug("Query after populating runtime info: %s", query) -def match_memory_estimate(explain_lines): - """ - Given a list of strings from EXPLAIN output, find the estimated memory needed. This is - used as a binary search start point. - - Params: - explain_lines: list of str - - Returns: - 2-tuple str of memory limit in decimal string and units (one of 'P', 'T', 'G', 'M', - 'K', '' bytes) - - Raises: - Exception if no match found - """ - # IMPALA-6441: This method is a public, first class method so it can be importable and - # tested with actual EXPLAIN output to make sure we always find the start point. - mem_limit, units = None, None - for line in explain_lines: - regex_result = MEM_ESTIMATE_PATTERN.search(line) - if regex_result: - mem_limit, units = regex_result.groups() - break - if None in (mem_limit, units): - raise Exception('could not parse explain string:\n' + '\n'.join(explain_lines)) - return mem_limit, units - - def estimate_query_mem_mb_usage(query, query_runner): """Runs an explain plan then extracts and returns the estimated memory needed to run the query. http://git-wip-us.apache.org/repos/asf/impala/blob/971cf179/tests/util/cluster_controller.py ---------------------------------------------------------------------- diff --git a/tests/util/cluster_controller.py b/tests/util/cluster_controller.py index 6b45429..388fa49 100644 --- a/tests/util/cluster_controller.py +++ b/tests/util/cluster_controller.py @@ -15,17 +15,23 @@ # specific language governing permissions and limitations # under the License. -import fabric.decorators +try: + import fabric.decorators + from fabric.context_managers import hide, settings + from fabric.operations import local, run, sudo + from fabric.tasks import execute +except ImportError as e: + raise Exception( + "Please run impala-pip install -r $IMPALA_HOME/infra/python/deps/extended-test-" + "requirements.txt:\n{0}".format(str(e))) import logging import os from contextlib import contextmanager -from fabric.context_managers import hide, settings -from fabric.operations import local, run, sudo -from fabric.tasks import execute from textwrap import dedent LOG = logging.getLogger('cluster_controller') + class ClusterController(object): """A convenience wrapper around fabric.""" http://git-wip-us.apache.org/repos/asf/impala/blob/971cf179/tests/util/parse_util.py ---------------------------------------------------------------------- diff --git a/tests/util/parse_util.py b/tests/util/parse_util.py index 592716b..b84f635 100644 --- a/tests/util/parse_util.py +++ b/tests/util/parse_util.py @@ -18,8 +18,21 @@ import re from datetime import datetime +# IMPALA-6715: Every so often the stress test or the TPC workload directories get +# changed, and the stress test loses the ability to run the full set of queries. Set +# these constants and assert that when a workload is used, all the queries we expect to +# use are there. +EXPECTED_TPCDS_QUERIES_COUNT = 71 +EXPECTED_TPCH_NESTED_QUERIES_COUNT = 22 +EXPECTED_TPCH_QUERIES_COUNT = 22 +# Regex to extract the estimated memory from an explain plan. +# The unit prefixes can be found in +# fe/src/main/java/org/apache/impala/common/PrintUtils.java +MEM_ESTIMATE_PATTERN = re.compile( + r"Per-Host Resource Estimates: Memory=(\d+\.?\d*)(P|T|G|M|K)?B") NEW_GLOG_ENTRY_PATTERN = re.compile(r"[IWEF](?P<Time>\d{4} \d{2}:\d{2}:\d{2}\.\d{6}).*") + def parse_glog(text, start_time=None): '''Parses the log 'text' and returns a list of log entries. If a 'start_time' is provided only log entries that are after the time will be returned. @@ -71,6 +84,7 @@ def parse_mem_to_mb(mem, units): raise Exception('Unexpected memory unit "%s"' % units) return int(mem) + def parse_duration_string_ms(duration): """Parses a duration string of the form 1h2h3m4s5.6ms4.5us7.8ns into milliseconds.""" pattern = r'(?P<value>[0-9]+\.?[0-9]*?)(?P<units>\D+)' @@ -83,3 +97,31 @@ def parse_duration_string_ms(duration): times[parsed['units']] = float(parsed['value']) return (times['h'] * 60 * 60 + times['m'] * 60 + times['s']) * 1000 + times['ms'] + + +def match_memory_estimate(explain_lines): + """ + Given a list of strings from EXPLAIN output, find the estimated memory needed. This is + used as a binary search start point. + + Params: + explain_lines: list of str + + Returns: + 2-tuple str of memory limit in decimal string and units (one of 'P', 'T', 'G', 'M', + 'K', '' bytes) + + Raises: + Exception if no match found + """ + # IMPALA-6441: This method is a public, first class method so it can be importable and + # tested with actual EXPLAIN output to make sure we always find the start point. + mem_limit, units = None, None + for line in explain_lines: + regex_result = MEM_ESTIMATE_PATTERN.search(line) + if regex_result: + mem_limit, units = regex_result.groups() + break + if None in (mem_limit, units): + raise Exception('could not parse explain string:\n' + '\n'.join(explain_lines)) + return mem_limit, units http://git-wip-us.apache.org/repos/asf/impala/blob/971cf179/tests/util/ssh_util.py ---------------------------------------------------------------------- diff --git a/tests/util/ssh_util.py b/tests/util/ssh_util.py index 8a56d96..a26f1bc 100644 --- a/tests/util/ssh_util.py +++ b/tests/util/ssh_util.py @@ -18,13 +18,19 @@ import atexit import logging import os -import paramiko +try: + import paramiko +except ImportError as e: + raise Exception( + "Please run impala-pip install -r $IMPALA_HOME/infra/python/deps/extended-test-" + "requirements.txt:\n{0}".format(str(e))) import textwrap import time LOG = logging.getLogger(os.path.splitext(os.path.basename(__file__))[0]) -from tests.common.errors import Timeout +from tests.common.errors import Timeout # noqa:E402 + class SshClient(paramiko.SSHClient): """A paramiko SSH client modified to: http://git-wip-us.apache.org/repos/asf/impala/blob/971cf179/tests/util/test_file_parser.py ---------------------------------------------------------------------- diff --git a/tests/util/test_file_parser.py b/tests/util/test_file_parser.py index 9fa8d70..8d3c58a 100644 --- a/tests/util/test_file_parser.py +++ b/tests/util/test_file_parser.py @@ -20,9 +20,11 @@ import codecs import collections import logging +import os +import os.path import re + from collections import defaultdict -from os.path import isfile from textwrap import dedent LOG = logging.getLogger('impala_test_suite') @@ -102,7 +104,7 @@ def parse_table_constraints(constraints_file): schema_include = defaultdict(list) schema_exclude = defaultdict(list) schema_only = defaultdict(list) - if not isfile(constraints_file): + if not os.path.isfile(constraints_file): LOG.info('No schema constraints file file found') else: with open(constraints_file, 'rb') as constraints_file: @@ -313,3 +315,31 @@ def write_test_file(test_file_name, test_file_sections, encoding=None): test_file_text.append(section_value) test_file_text.append(SECTION_DELIMITER) test_file.write(('\n').join(test_file_text)) + + +def load_tpc_queries(workload): + """Returns a list of TPC queries. 'workload' should either be 'tpch' or 'tpcds'.""" + LOG.info("Loading %s queries", workload) + queries = list() + query_dir = os.path.join( + os.environ['IMPALA_HOME'], "testdata", "workloads", workload, "queries") + # IMPALA-6715 and others from the past: This pattern enforces the queries we actually + # find. Both workload directories contain other queries that are not part of the TPC + # spec. + file_name_pattern = re.compile(r"^{0}-(q.*).test$".format(workload)) + for query_file in os.listdir(query_dir): + match = file_name_pattern.search(query_file) + if not match: + continue + file_path = os.path.join(query_dir, query_file) + test_cases = parse_query_test_file(file_path) + file_queries = list() + for test_case in test_cases: + query_sql = remove_comments(test_case["QUERY"]) + file_queries.append(query_sql) + if len(file_queries) != 1: + raise Exception( + "Expected exactly 1 query to be in file %s but got %s" + % (file_path, len(file_queries))) + queries.append(file_queries[0]) + return queries
