[1/5] incubator-impala git commit: IMPALA-6160: Allow multiple statements in a Query object.

tarmstrong Wed, 15 Nov 2017 13:58:27 -0800

Repository: incubator-impala
Updated Branches:
  refs/heads/master d1b92c8b5 -> e98d2f1c0



IMPALA-6160: Allow multiple statements in a Query object.

Testing:
- Reproduced problem with bin/run-workload.py.
- Ran bin/run-workload.py --workloads=tpch,targeted-perf,tpcds
  --impalads=localhost:21000,localhost:21001,localhost:21002
  --results_json_file=$PWD/perf_results/IMPALA-6160.json
  --query_iterations=3 --table_formats=parquet/none --plan_first
  --query_names='.*' (Close to command line that single_node_perf_run.py
  builds.)
- Manually reviewed perf_results/IMPALA-6160.json to verify presence of
  plans and proper splitting of query batches.

Change-Id: Iac86af181b7c42655f21d2c1efd4652dd35d9297
Reviewed-on: http://gerrit.cloudera.org:8080/8513
Tested-by: Impala Public Jenkins
Reviewed-by: Jim Apple <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/a8c123b5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/a8c123b5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/a8c123b5

Branch: refs/heads/master
Commit: a8c123b55a78d39dd2e465e7641d4706b3aaaf4e
Parents: d1b92c8
Author: Tim Wood <[email protected]>
Authored: Thu Nov 9 16:12:14 2017 -0800
Committer: Jim Apple <[email protected]>
Committed: Wed Nov 15 19:38:30 2017 +0000

----------------------------------------------------------------------
 tests/performance/query.py          |  2 +-
 tests/performance/query_executor.py | 83 +++++++++++++++++++++++---------
 2 files changed, 60 insertions(+), 25 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a8c123b5/tests/performance/query.py
----------------------------------------------------------------------
diff --git a/tests/performance/query.py b/tests/performance/query.py
index c8301c0..5884a01 100644
--- a/tests/performance/query.py
+++ b/tests/performance/query.py
@@ -23,7 +23,7 @@ class Query(object):
   """Represents a query and all the information neede to execute it
 
   Attributes:
-    query_str (str): The SQL query string.
+    query_str (str): SQL query string; contains 1 or more ;-delimited SQL statements.
     name (str): query name?
     scale_factor (str): for example 300gb, used to determine the database.
     test_vector (?): Specifies some parameters

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/a8c123b5/tests/performance/query_executor.py
----------------------------------------------------------------------
diff --git a/tests/performance/query_executor.py b/tests/performance/query_executor.py
index 2295f14..1f4bec2 100644
--- a/tests/performance/query_executor.py
+++ b/tests/performance/query_executor.py
@@ -31,6 +31,7 @@
 
 import logging
 import os
+import re
 
 from tests.performance.query import Query
 
@@ -39,8 +40,16 @@ logging.basicConfig(level=logging.INFO, format='[%(name)s] %(threadName)s: %(mes
 LOG = logging.getLogger('query_executor')
 LOG.setLevel(level=logging.INFO)
 
-# globals.
+# Globals.
 hive_result_regex = 'Time taken: (\d*).(\d*) seconds'
+# Match any CRUD statement that can follow EXPLAIN.
+# The statement may begin with SQL line comments starting with --
+COMMENT_LINES_REGEX = r'(?:\s*--.*\n)*'
+DDL_CRUD_PATTERN = \
+  re.compile(COMMENT_LINES_REGEX +
+      r'\s*((CREATE|DELETE|INSERT|SELECT|UPDATE|UPSERT|WITH)\s|VALUES\s*\()',
+      re.IGNORECASE)
+
 
 ## TODO: Split executors into their own modules.
 class QueryExecConfig(object):
@@ -173,21 +182,21 @@ class BeeswaxQueryExecConfig(ImpalaQueryExecConfig):
 
 
 class QueryExecutor(object):
-  """Executes a query.
+  """Executes one or more queries.
 
   Args:
     name (str): eg. "hive"
-    query (Query): SQL query to be executed
+    query (Query): Container holding 1 or more ;-delimited SQL statements to be executed
     func (function): Function that accepts a QueryExecOption parameter and returns a
       ImpalaQueryResult. Eg. execute_using_impala_beeswax
     config (QueryExecOption)
     exit_on_error (boolean): Exit right after an error encountered.
 
   Attributes:
-    exec_func (function): Function that accepts a QueryExecOption parameter and returns a
-      ImpalaQueryResult.
+    exec_func (function): Function that accepts a QueryExecOption parameter and returns
+      an ImpalaQueryResult.
     exec_config (QueryExecOption)
-    query (Query): SQL query to be executed
+    query (Query): Container holding 1 or more ;-delimited SQL statements to be executed
     exit_on_error (boolean): Exit right after an error encountered.
     executor_name (str): eg. "hive"
     result (ImpalaQueryResult): Contains the result after execute method is called.
@@ -211,26 +220,52 @@ class QueryExecutor(object):
       self.exec_config.impalad = impalad
 
   def execute(self, plan_first=False):
-    """Execute the query using the given execution function.
+    """Execute a set of SQL statements using the given execution function,
+    and return a result object.  SQL statements can be the familiar SELECT, INSERT,
+    DELETE, UPDATE and UPSERT DML commands as well as utilities like SET, SHOW,
+    VERSION, etc.
+
+    If plan_first is true, EXPLAIN the "explainable" queries in the set
+    first so timing does not include the initial metadata loading
+    required for planning.
 
-    If plan_first is true, EXPLAIN the query first so timing does not include the initial
-    metadata loading required for planning.
+    This function furnishes a query result object in self._result, for the last
+    query in the batch ONLY.
     """
-    if plan_first:
-      LOG.debug('Planning %s' % self.query)
-      assert isinstance(self.query, Query)
-      self.query.query_str = 'EXPLAIN ' + self.query.query_str
-      try:
-        self.exec_func(self.query, self.exec_config)
-      finally:
-        self.query.query_str = self.query.query_str[len('EXPLAIN '):]
-    LOG.debug('Executing %s' % self.query)
-    self._result = self.exec_func(self.query, self.exec_config)
-    if not self._result.success:
-      if self.exit_on_error:
-        raise RuntimeError(self._result.query_error)
-      else:
-        LOG.info("Continuing execution")
+    assert isinstance(self.query, Query)
+    orig_query_str = self.query.query_str
+    try:
+      statements = self.query.query_str.split(';')
+
+      if plan_first:
+        # Break out multiple statements in self.query
+        for stmt in statements:
+          ddl_crud_match = DDL_CRUD_PATTERN.match(stmt)
+          if not ddl_crud_match:
+            # Don't EXPLAIN this statement
+            continue
+
+          self.query.query_str = 'EXPLAIN ' + stmt + ';'
+          LOG.debug('Planning %s' % self.query.query_str)
+          self.exec_func(self.query, self.exec_config)
+
+      # Now actually execute
+      for stmt in statements:
+        self.query.query_str = stmt + ';'
+        LOG.debug('Executing %s' % self.query.query_str)
+        self._result = self.exec_func(self.query, self.exec_config)
+
+        if not self._result.success:
+          if self.exit_on_error:
+            raise RuntimeError(self._result.query_error)
+          else:
+            LOG.info("Continuing execution")
+    finally:
+      self.query.query_str = orig_query_str
+
+    # We do not need to restore the SET options we changed for the next batch
+    # (Query object) because the scheduler runs each Query on its own connection
+    # (XXX pretty strange for a performance test. :)
 
   @property
   def result(self):

[1/5] incubator-impala git commit: IMPALA-6160: Allow multiple statements in a Query object.

Reply via email to