Repository: incubator-impala
Updated Branches:
  refs/heads/master d1b92c8b5 -> e98d2f1c0
IMPALA-6160: Allow multiple statements in a Query object.

Testing:
- Reproduced problem with bin/run-workload.py.
- Ran bin/run-workload.py --workloads=tpch,targeted-perf,tpcds --impalads=localhost:21000,localhost:21001,localhost:21002 --results_json_file=$PWD/perf_results/IMPALA-6160.json --query_iterations=3 --table_formats=parquet/none --plan_first --query_names='.*'
  (Close to command line that single_node_perf_run.py builds.)
- Manually reviewed perf_results/IMPALA-6160.json to verify presence of plans and proper splitting of query batches.

Change-Id: Iac86af181b7c42655f21d2c1efd4652dd35d9297
Reviewed-on: http://gerrit.cloudera.org:8080/8513
Tested-by: Impala Public Jenkins
Reviewed-by: Jim Apple <[email protected]>

Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/a8c123b5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/a8c123b5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/a8c123b5

Branch: refs/heads/master
Commit: a8c123b55a78d39dd2e465e7641d4706b3aaaf4e
Parents: d1b92c8
Author: Tim Wood <[email protected]>
Authored: Thu Nov 9 16:12:14 2017 -0800
Committer: Jim Apple <[email protected]>
Committed: Wed Nov 15 19:38:30 2017 +0000

----------------------------------------------------------------------
tests/performance/query.py | 2 +-
tests/performance/query_executor.py | 83 +++++++++++++++++++++++---------
2 files changed, 60 insertions(+), 25 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a8c123b5/tests/performance/query.py
----------------------------------------------------------------------
diff --git a/tests/performance/query.py b/tests/performance/query.py
index c8301c0..5884a01 100644
--- a/tests/performance/query.py
+++ b/tests/performance/query.py
@@ -23,7 +23,7 @@ class Query(object):
"""Represents a query and all the
information neede to execute it Attributes: - query_str (str): The SQL query string. + query_str (str): SQL query string; contains 1 or more ;-delimited SQL statements. name (str): query name? scale_factor (str): for example 300gb, used to determine the database. test_vector (?): Specifies some parameters http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a8c123b5/tests/performance/query_executor.py ---------------------------------------------------------------------- diff --git a/tests/performance/query_executor.py b/tests/performance/query_executor.py index 2295f14..1f4bec2 100644 --- a/tests/performance/query_executor.py +++ b/tests/performance/query_executor.py @@ -31,6 +31,7 @@ import logging import os +import re from tests.performance.query import Query @@ -39,8 +40,16 @@ logging.basicConfig(level=logging.INFO, format='[%(name)s] %(threadName)s: %(mes LOG = logging.getLogger('query_executor') LOG.setLevel(level=logging.INFO) -# globals. +# Globals. hive_result_regex = 'Time taken: (\d*).(\d*) seconds' +# Match any CRUD statement that can follow EXPLAIN. +# The statement may begin with SQL line comments starting with -- +COMMENT_LINES_REGEX = r'(?:\s*--.*\n)*' +DDL_CRUD_PATTERN = \ + re.compile(COMMENT_LINES_REGEX + + r'\s*((CREATE|DELETE|INSERT|SELECT|UPDATE|UPSERT|WITH)\s|VALUES\s*\()', + re.IGNORECASE) + ## TODO: Split executors into their own modules. class QueryExecConfig(object): @@ -173,21 +182,21 @@ class BeeswaxQueryExecConfig(ImpalaQueryExecConfig): class QueryExecutor(object): - """Executes a query. + """Executes one or more queries. Args: name (str): eg. "hive" - query (Query): SQL query to be executed + query (Query): Container holding 1 or more ;-delimited SQL statements to be executed func (function): Function that accepts a QueryExecOption parameter and returns a ImpalaQueryResult. Eg. execute_using_impala_beeswax config (QueryExecOption) exit_on_error (boolean): Exit right after an error encountered. 
Attributes: - exec_func (function): Function that accepts a QueryExecOption parameter and returns a - ImpalaQueryResult. + exec_func (function): Function that accepts a QueryExecOption parameter and returns + an ImpalaQueryResult. exec_config (QueryExecOption) - query (Query): SQL query to be executed + query (Query): Container holding 1 or more ;-delimited SQL statements to be executed exit_on_error (boolean): Exit right after an error encountered. executor_name (str): eg. "hive" result (ImpalaQueryResult): Contains the result after execute method is called. @@ -211,26 +220,52 @@ class QueryExecutor(object): self.exec_config.impalad = impalad def execute(self, plan_first=False): - """Execute the query using the given execution function. + """Execute a set of SQL statements using the given execution function, + and return a result object. SQL statements can be the familiar SELECT, INSERT, + DELETE, UPDATE and UPSERT DML commands as well as utilities like SET, SHOW, + VERSION, etc. + + If plan_first is true, EXPLAIN the "explainable" queries in the set + first so timing does not include the initial metadata loading + required for planning. - If plan_first is true, EXPLAIN the query first so timing does not include the initial - metadata loading required for planning. + This function furnishes a query result object in self._result, for the last + query in the batch ONLY. 
""" - if plan_first: - LOG.debug('Planning %s' % self.query) - assert isinstance(self.query, Query) - self.query.query_str = 'EXPLAIN ' + self.query.query_str - try: - self.exec_func(self.query, self.exec_config) - finally: - self.query.query_str = self.query.query_str[len('EXPLAIN '):] - LOG.debug('Executing %s' % self.query) - self._result = self.exec_func(self.query, self.exec_config) - if not self._result.success: - if self.exit_on_error: - raise RuntimeError(self._result.query_error) - else: - LOG.info("Continuing execution") + assert isinstance(self.query, Query) + orig_query_str = self.query.query_str + try: + statements = self.query.query_str.split(';') + + if plan_first: + # Break out multiple statements in self.query + for stmt in statements: + ddl_crud_match = DDL_CRUD_PATTERN.match(stmt) + if not ddl_crud_match: + # Don't EXPLAIN this statement + continue + + self.query.query_str = 'EXPLAIN ' + stmt + ';' + LOG.debug('Planning %s' % self.query.query_str) + self.exec_func(self.query, self.exec_config) + + # Now actually execute + for stmt in statements: + self.query.query_str = stmt + ';' + LOG.debug('Executing %s' % self.query.query_str) + self._result = self.exec_func(self.query, self.exec_config) + + if not self._result.success: + if self.exit_on_error: + raise RuntimeError(self._result.query_error) + else: + LOG.info("Continuing execution") + finally: + self.query.query_str = orig_query_str + + # We do not need to restore the SET options we changed for the next batch + # (Query object) because the scheduler runs each Query on its own connection + # (XXX pretty strange for a performance test. :) @property def result(self):
