Add a script to test performance on a developer machine

This is a migration from an old and broken script from another
repository. Example use:

    bin/single_node_perf_run.py --ninja --workloads targeted-perf \
      --load --scale 4 --iterations 20 --num_impalads 3 \
      --start_minicluster --query_names PERF_AGG-Q3 \
      $(git rev-parse HEAD~1) $(git rev-parse HEAD)

The script can load data, run benchmarks, and compare the statistics
of those runs for significant differences in performance. It glues
together buildall.sh, bin/load-data.py, bin/run-workload.py, and
tests/benchmark/report_benchmark_results.py.

Change-Id: I70ba7f3c28f612a370915615600bf8dcebcedbc9
Reviewed-on: http://gerrit.cloudera.org:8080/6818
Reviewed-by: Jim Apple <[email protected]>
Tested-by: Impala Public Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/07a71388
Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/07a71388
Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/07a71388

Branch: refs/heads/master
Commit: 07a71388172e8a1f6af2724527bd381b6fbb003a
Parents: 1f34a9e
Author: Jim Apple <[email protected]>
Authored: Wed Mar 15 18:46:08 2017 -0700
Committer: Impala Public Jenkins <[email protected]>
Committed: Wed May 31 08:10:48 2017 +0000

----------------------------------------------------------------------
 bin/load-data.py                           |   2 +
 bin/run-workload.py                        |   6 +-
 bin/single_node_perf_run.py                | 305 ++++++++++++++++++++++++
 testdata/bin/generate-schema-statements.py |  12 +-
 testdata/datasets/tpcds/preload            |  19 +-
 testdata/datasets/tpch/preload             |  19 +-
 tests/common/test_dimensions.py            |  12 +
 tests/performance/query_executor.py        |  22 +-
 tests/performance/scheduler.py             |   5 +-
 tests/performance/workload_runner.py       |   3 +-
 10 files changed, 385 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/07a71388/bin/load-data.py
----------------------------------------------------------------------
diff --git a/bin/load-data.py b/bin/load-data.py
index 1359e01..7b2ab23 100755
--- a/bin/load-data.py
+++ b/bin/load-data.py
@@ -181,6 +181,8 @@ def run_dataset_preload(dataset):
   dataset_preload_script = os.path.join(DATASET_DIR, dataset, "preload")
   if os.path.exists(dataset_preload_script):
     print("Running preload script for " + dataset)
+    if options.scale_factor > 1:
+      dataset_preload_script += " " + str(options.scale_factor)
     exec_cmd(dataset_preload_script, "Error executing preload script for " + dataset,
         exit_on_error=True)
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/07a71388/bin/run-workload.py
----------------------------------------------------------------------
diff --git a/bin/run-workload.py b/bin/run-workload.py
index 9037a11..9fd677d 100755
--- a/bin/run-workload.py
+++ b/bin/run-workload.py
@@ -75,7 +75,8 @@ parser.add_option("-x", "--workload_iterations", type="int", dest="workload_iter
 parser.add_option("--num_clients", type="int", dest="num_clients", default=1,
                   help="Number of clients (threads) to use when executing each query.")
 parser.add_option("--query_names", dest="query_names", default=str(),
-                  help="A comma-separated list of query names to execute.")
+                  help="A comma-separated list of regular expressions. A query is"
+                    " executed if it matches any of the expressions.")
 parser.add_option("--table_formats", dest="table_formats", default=str(),
                   help=("Override the default test vectors and run using only the"
                         " specified table formats. Ex. --table_formats=seq/snap/block"
@@ -83,6 +84,9 @@ parser.add_option("--table_formats", dest="table_formats", default=str(),
 parser.add_option("--shuffle_query_exec_order", dest="shuffle_queries",
                   action="store_true", default=False, help=("Randomizes the order "
                     "of query execution. Useful when the execution scope is a workload"))
+parser.add_option("--plan_first", dest="plan_first", action="store_true", default=False,
+                  help=("Runs EXPLAIN before running the query so that metadata loading"
+                        " is excluded from the timing"))
 
 parser.add_option("--use_kerberos", dest="use_kerberos", action="store_true",
                   default=False, help="If set, enables talking to a kerberized impalad")

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/07a71388/bin/single_node_perf_run.py
----------------------------------------------------------------------
diff --git a/bin/single_node_perf_run.py b/bin/single_node_perf_run.py
new file mode 100755
index 0000000..74ab944
--- /dev/null
+++ b/bin/single_node_perf_run.py
@@ -0,0 +1,305 @@
+#!/usr/bin/env impala-python
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Usage:
+# single_node_perf_run.py [options] git_hash_A [git_hash_B]
+#
+# When one hash is given, measures the performance on the specified workloads.
+# When two hashes are given, compares their performance. Output is in
+# $IMPALA_HOME/perf_results/latest.
+#
+# WARNING: This script will run git checkout. You should not touch the tree
+# while the script is running. You should start the script from a clean git
+# tree.
+#
+# WARNING: When --load is used, this script calls load_data.py which can
+# overwrite your TPC-H and TPC-DS data.
+#
+# Options:
+#   -h, --help            show this help message and exit
+#   --workloads=WORKLOADS
+#                         comma-separated list of workloads. Choices: tpch,
+#                         targeted-perf, tpcds. Default: targeted-perf
+#   --scale=SCALE         scale factor for the workloads
+#   --iterations=ITERATIONS
+#                         number of times to run each query
+#   --table_formats=TABLE_FORMATS
+#                         comma-separated list of table formats. Default:
+#                         parquet/none
+#   --num_impalads=NUM_IMPALADS
+#                         number of impalads. Default: 1
+#   --query_names=QUERY_NAMES
+#                         comma-separated list of regular expressions. A query
+#                         is executed if it matches any regular expression in
+#                         this list
+#   --load                load databases for the chosen workloads
+#   --start_minicluster   start a new Hadoop minicluster
+#   --ninja               use ninja, rather than Make, as the build tool
+
+from optparse import OptionParser
+from tempfile import mkdtemp
+
+import json
+import os
+import sh
+import subprocess
+import sys
+import textwrap
+
+from tests.common.test_dimensions import TableFormatInfo
+
+IMPALA_HOME = os.environ["IMPALA_HOME"]
+
+
+def load_data(db_to_load, table_formats, scale):
+  """Loads a database with a particular scale factor."""
+  subprocess.check_call(["{0}/bin/load-data.py".format(IMPALA_HOME),
+                         "--workloads", db_to_load, "--scale_factor", str(scale),
+                         "--table_formats", "text/none," + table_formats])
+  for table_format in table_formats.split(","):
+    suffix = TableFormatInfo.create_from_string(None, table_format).db_suffix()
+    db_name = db_to_load + scale + suffix
+    subprocess.check_call(["{0}/tests/util/compute_table_stats.py".format(IMPALA_HOME),
+                           "--stop_on_error", "--db_names", db_name])
+
+def get_git_hash_for_name(name):
+  return sh.git("rev-parse", name).strip()
+
+
+def build(git_hash, options):
+  """Builds Impala in release mode; doesn't build tests."""
+  sh.git.checkout(git_hash)
+  buildall = ["{0}/buildall.sh".format(IMPALA_HOME), "-notests", "-release", "-noclean"]
+  if options.ninja:
+    buildall += ["-ninja"]
+  subprocess.check_call(buildall)
+
+
+def start_minicluster():
+  subprocess.check_call(["{0}/bin/create-test-configuration.sh".format(IMPALA_HOME)])
+  subprocess.check_call(["{0}/testdata/bin/run-all.sh".format(IMPALA_HOME)])
+
+
+def start_impala(num_impalads):
+  subprocess.check_call(["{0}/bin/start-impala-cluster.py".format(IMPALA_HOME), "-s",
+                         str(num_impalads), "-c", str(num_impalads)])
+
+
+def run_workload(base_dir, workloads, options):
+  """Runs workload with the given options.
+
+  Returns the git hash of the current revision to identify the output file.
+  """
+  git_hash = get_git_hash_for_name("HEAD")
+
+  run_workload = ["{0}/bin/run-workload.py".format(IMPALA_HOME)]
+
+  impalads = ",".join(["localhost:{0}".format(21000 + i)
+                       for i in range(0, int(options.num_impalads))])
+
+  run_workload += ["--workloads={0}".format(workloads),
+                   "--impalads={0}".format(impalads),
+                   "--results_json_file={0}/{1}.json".format(base_dir, git_hash),
+                   "--query_iterations={0}".format(options.iterations),
+                   "--table_formats={0}".format(options.table_formats),
+                   "--plan_first"]
+
+  if options.query_names:
+    run_workload += ["--query_names={0}".format(options.query_names)]
+
+  subprocess.check_call(run_workload)
+
+
+def report_benchmark_results(file_a, file_b, description):
+  """Wrapper around report_benchmark_result.py."""
+  result = "{0}/perf_results/latest/performance_result.txt".format(IMPALA_HOME)
+  with open(result, "w") as f:
+    subprocess.check_call(
+      ["{0}/tests/benchmark/report_benchmark_results.py".format(IMPALA_HOME),
+       "--reference_result_file={0}".format(file_a),
+       "--input_result_file={0}".format(file_b),
+       '--report_description="{0}"'.format(description)],
+      stdout=f)
+  sh.cat(result, _out=sys.stdout)
+
+
+def compare(base_dir, hash_a, hash_b):
+  """Take the results of two performance runs and compare them."""
+  file_a = os.path.join(base_dir, hash_a + ".json")
+  file_b = os.path.join(base_dir, hash_b + ".json")
+  description = "{0} vs {1}".format(hash_a, hash_b)
+  report_benchmark_results(file_a, file_b, description)
+
+  # From the two json files extract the profiles and diff them
+  generate_profile_file(file_a, hash_a, base_dir)
+  generate_profile_file(file_b, hash_b, base_dir)
+
+  sh.diff("-u",
+          os.path.join(base_dir, hash_a + "_profile.txt"),
+          os.path.join(base_dir, hash_b + "_profile.txt"),
+          _out=os.path.join(IMPALA_HOME, "performance_result_profile_diff.txt"),
+          _ok_code=[0, 1])
+
+
+def generate_profile_file(name, hash, base_dir):
+  """Extracts runtime profiles from the JSON file 'name'.
+
+  Writes the runtime profiles back in a simple text file in the same directory.
+  """
+  with open(name) as fid:
+    data = json.load(fid)
+    with open(os.path.join(base_dir, hash + "_profile.txt"), "w+") as out:
+      # For each query
+      for key in data:
+        for iteration in data[key]:
+          out.write(iteration["runtime_profile"])
+          out.write("\n\n")
+
+
+def backup_workloads():
+  """Copy the workload folder to a temporary directory and returns its name.
+
+  Used to keep workloads from being clobbered by git checkout.
+  """
+  temp_dir = mkdtemp()
+  sh.cp(os.path.join(IMPALA_HOME, "testdata", "workloads"),
+        temp_dir, R=True, _out=sys.stdout, _err=sys.stderr)
+  print "Backed up workloads to {0}".format(temp_dir)
+  return temp_dir
+
+
+def restore_workloads(source):
+  """Restores the workload directory from source into the Impala tree."""
+  sh.cp(os.path.join(source, "workloads"), os.path.join(IMPALA_HOME, "testdata"),
+        R=True, _out=sys.stdout, _err=sys.stderr)
+
+
+def perf_ab_test(options, args):
+  """Does the main work: build, run tests, compare."""
+  hash_a = get_git_hash_for_name(args[0])
+
+  # Create the base directory to store the results in
+  results_path = os.path.join(IMPALA_HOME, "perf_results")
+  if not os.access(results_path, os.W_OK):
+    os.makedirs(results_path)
+
+  temp_dir = mkdtemp(dir=results_path, prefix="perf_run_")
+  latest = os.path.join(results_path, "latest")
+  if os.path.islink(latest):
+    os.remove(latest)
+  os.symlink(os.path.basename(temp_dir), latest)
+  workload_dir = backup_workloads()
+
+  build(hash_a, options)
+  restore_workloads(workload_dir)
+
+  if options.start_minicluster:
+    start_minicluster()
+  start_impala(options.num_impalads)
+
+  workloads = set(options.workloads.split(","))
+
+  if options.load:
+    WORKLOAD_TO_DATASET = {"tpch": "tpch", "tpcds": "tpcds", "targeted-perf": "tpch"}
+    datasets = set([WORKLOAD_TO_DATASET[workload] for workload in workloads])
+    for dataset in datasets:
+      load_data(dataset, options.table_formats, options.scale)
+
+  workloads = ",".join(["{0}:{1}".format(workload, options.scale)
+                        for workload in workloads])
+
+  run_workload(temp_dir, workloads, options)
+
+  if len(args) > 1 and args[1]:
+    hash_b = get_git_hash_for_name(args[1])
+    build(hash_b, options)
+    restore_workloads(workload_dir)
+    start_impala(options.num_impalads)
+    run_workload(temp_dir, workloads, options)
+    compare(temp_dir, hash_a, hash_b)
+
+
+def parse_options():
+  """Parse and return the options and positional arguments."""
+  parser = OptionParser()
+  parser.add_option("--workloads", default="targeted-perf",
+                    help="comma-separated list of workloads. Choices: tpch, "
+                    "targeted-perf, tpcds. Default: targeted-perf")
+  parser.add_option("--scale", help="scale factor for the workloads")
+  parser.add_option("--iterations", default=30, help="number of times to run each query")
+  parser.add_option("--table_formats", default="parquet/none", help="comma-separated "
+                    "list of table formats. Default: parquet/none")
+  parser.add_option("--num_impalads", default=3, help="number of impalads. Default: 1")
+  # Less commonly-used options:
+  parser.add_option("--query_names",
+                    help="comma-separated list of regular expressions. A query is "
+                    "executed if it matches any regular expression in this list")
+  parser.add_option("--load", action="store_true",
+                    help="load databases for the chosen workloads")
+  parser.add_option("--start_minicluster", action="store_true",
+                    help="start a new Hadoop minicluster")
+  parser.add_option("--ninja", action="store_true",
+                    help = "use ninja, rather than Make, as the build tool")
+
+  parser.set_usage(textwrap.dedent("""
+    single_node_perf_run.py [options] git_hash_A [git_hash_B]
+
+    When one hash is given, measures the performance on the specified workloads.
+    When two hashes are given, compares their performance. Output is in
+    $IMPALA_HOME/perf_results/latest.
+
+    WARNING: This script will run git checkout. You should not touch the tree
+    while the script is running. You should start the script from a clean git
+    tree.
+
+    WARNING: When --load is used, this script calls load_data.py which can
+    overwrite your TPC-H and TPC-DS data."""))
+
+  options, args = parser.parse_args()
+
+  if not 1 <= len(args) <= 2:
+    parser.print_usage(sys.stderr)
+    raise Exception("Invalid arguments: either 1 or 2 Git hashes allowed")
+
+  return options, args
+
+
+def main():
+  """A thin wrapper around perf_ab_test that restores git state after."""
+  options, args = parse_options()
+
+  os.chdir(IMPALA_HOME)
+
+  if sh.git("status", "--porcelain", "--untracked-files=no", _out=None).strip():
+    sh.git("status", "--porcelain", "--untracked-files=no", _out=sys.stdout)
+    raise Exception("Working copy is dirty. Consider 'git stash' and try again.")
+
+  # Save the current hash to be able to return to this place in the tree when done
+  current_hash = sh.git("rev-parse", "--abbrev-ref", "HEAD").strip()
+  if current_hash == "HEAD":
+    current_hash = get_git_hash_for_name("HEAD")
+
+  try:
+    perf_ab_test(options, args)
+  finally:
+    sh.git.checkout(current_hash)
+
+
+if __name__ == "__main__":
+  main()

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/07a71388/testdata/bin/generate-schema-statements.py
----------------------------------------------------------------------
diff --git a/testdata/bin/generate-schema-statements.py b/testdata/bin/generate-schema-statements.py
index 0bfbfef..a31eca1 100755
--- a/testdata/bin/generate-schema-statements.py
+++ b/testdata/bin/generate-schema-statements.py
@@ -429,16 +429,6 @@ def build_hbase_create_stmt(db_name, table_name, column_families):
   create_stmt.append("create '%s', %s" % (hbase_table_name, column_families))
   return create_stmt
 
-def build_db_suffix(file_format, codec, compression_type):
-  if file_format == 'text' and codec == 'none':
-    return ''
-  elif codec == 'none':
-    return '_%s' % (file_format)
-  elif compression_type == 'record':
-    return '_%s_record_%s' % (file_format, codec)
-  else:
-    return '_%s_%s' % (file_format, codec)
-
 # Does a hdfs directory listing and returns array with all the subdir names.
 def get_hdfs_subdirs_with_data(path):
   tmp_file = tempfile.TemporaryFile("w+")
@@ -505,7 +495,7 @@ def generate_statements(output_name, test_vectors, sections,
     table_format = '%s/%s/%s' % (file_format, codec, compression_type)
     for section in sections:
       table_name = section['BASE_TABLE_NAME'].strip()
-      db_suffix = build_db_suffix(file_format, codec, compression_type)
+      db_suffix = row.db_suffix()
       db_name = '{0}{1}'.format(data_set, options.scale_factor)
       db = '{0}{1}'.format(db_name, db_suffix)
 

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/07a71388/testdata/datasets/tpcds/preload
----------------------------------------------------------------------
diff --git a/testdata/datasets/tpcds/preload b/testdata/datasets/tpcds/preload
index 0deb745..631a1c2 100755
--- a/testdata/datasets/tpcds/preload
+++ b/testdata/datasets/tpcds/preload
@@ -21,6 +21,14 @@ trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)
 
 IMPALA_DATA=${IMPALA_HOME}/testdata/impala-data
 TPC_DS_DATA=${IMPALA_DATA}/tpcds
+
+SCALE_FACTOR=1
+if [[ $# == 1 && $1 -gt 1 ]]
+then
+  SCALE_FACTOR=$1
+  TPC_DS_DATA=${TPC_DS_DATA}${SCALE_FACTOR}
+fi
+
 TPC_DS_DIRNAME=tpc-ds-${IMPALA_TPC_DS_VERSION}
 TPC_DS_HOME=${IMPALA_TOOLCHAIN}/${TPC_DS_DIRNAME}
 
@@ -40,7 +48,16 @@ if [ ! -x ${TPC_DS_DSDGEN} ]; then
   echo "Could not find TPC-DS data generator executable: ${TPC_DS_DSDGEN}"
   exit 1
 fi
-${TPC_DS_DSDGEN} -force -verbose
+
+if [ -t 1 ]
+then
+  # Output is terminal, show progress verbosely
+  VERBOSITY='-verbose'
+else
+  VERBOSITY=''
+fi
+
+${TPC_DS_DSDGEN} -force ${VERBOSITY} -scale ${SCALE_FACTOR}
 # Impala expects each table to be in its own subdirectory.
 for FILE in *.dat; do
   FILE_DIR=${FILE%.dat}

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/07a71388/testdata/datasets/tpch/preload
----------------------------------------------------------------------
diff --git a/testdata/datasets/tpch/preload b/testdata/datasets/tpch/preload
index 64553d6..2b0cbb6 100755
--- a/testdata/datasets/tpch/preload
+++ b/testdata/datasets/tpch/preload
@@ -21,6 +21,14 @@ trap 'echo Error in $0 at line $LINENO: $(cd "'$PWD'" && awk "NR == $LINENO" $0)
 
 IMPALA_DATA=${IMPALA_HOME}/testdata/impala-data
 TPC_H_DATA=${IMPALA_DATA}/tpch
+
+SCALE_FACTOR=1
+if [[ $# == 1 && $1 -gt 1 ]]
+then
+  SCALE_FACTOR=$1
+  TPC_H_DATA=${TPC_H_DATA}${SCALE_FACTOR}
+fi
+
 TPC_H_HOME=${IMPALA_TOOLCHAIN}/tpc-h-${IMPALA_TPC_H_VERSION}
 TPC_H_DBGEN=${TPC_H_HOME}/bin/dbgen
 
@@ -38,7 +46,16 @@ rm -rf ${TPC_H_DATA}
 mkdir -p ${TPC_H_DATA}
 cd ${TPC_H_DATA}
 
-${TPC_H_DBGEN} -v -f
+if [ -t 1 ]
+then
+  # Output is terminal, show progress verbosely
+  VERBOSITY='-v'
+else
+  VERBOSITY=''
+fi
+
+${TPC_H_DBGEN} ${VERBOSITY} -f -s ${SCALE_FACTOR}
+
 # Impala expects each table to be in its own subdirectory.
 for FILE in *.tbl; do
   FILE_DIR=${FILE%.tbl}

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/07a71388/tests/common/test_dimensions.py
----------------------------------------------------------------------
diff --git a/tests/common/test_dimensions.py b/tests/common/test_dimensions.py
index 3f9c47c..20ccbeb 100644
--- a/tests/common/test_dimensions.py
+++ b/tests/common/test_dimensions.py
@@ -85,6 +85,18 @@ class TableFormatInfo(object):
       compression_str = 'none'
     return '%s/%s' % (self.file_format, compression_str)
 
+  def db_suffix(self):
+    if self.file_format == 'text' and self.compression_codec == 'none':
+      return ''
+    elif self.compression_codec == 'none':
+      return '_%s' % (self.file_format)
+    elif self.compression_type == 'record':
+      return '_%s_record_%s' % (self.file_format, self.compression_codec)
+    else:
+      return '_%s_%s' % (self.file_format, self.compression_codec)
+
+
+
 def create_uncompressed_text_dimension(workload):
   dataset = get_dataset_from_workload(workload)
   return ImpalaTestDimension('table_format',

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/07a71388/tests/performance/query_executor.py
----------------------------------------------------------------------
diff --git a/tests/performance/query_executor.py b/tests/performance/query_executor.py
index 5d9fc61..2295f14 100644
--- a/tests/performance/query_executor.py
+++ b/tests/performance/query_executor.py
@@ -32,6 +32,8 @@
 import logging
 import os
 
+from tests.performance.query import Query
+
 # Setup logging for this module.
 logging.basicConfig(level=logging.INFO, format='[%(name)s] %(threadName)s: %(message)s')
 LOG = logging.getLogger('query_executor')
@@ -175,7 +177,7 @@ class QueryExecutor(object):
 
   Args:
     name (str): eg. "hive"
-    query (str): string containing SQL query to be executed
+    query (Query): SQL query to be executed
    func (function): Function that accepts a QueryExecOption parameter and returns a
       ImpalaQueryResult. Eg. execute_using_impala_beeswax
     config (QueryExecOption)
@@ -185,7 +187,7 @@ class QueryExecutor(object):
    exec_func (function): Function that accepts a QueryExecOption parameter and returns a
       ImpalaQueryResult.
     exec_config (QueryExecOption)
-    query (str): string containing SQL query to be executed
+    query (Query): SQL query to be executed
     exit_on_error (boolean): Exit right after an error encountered.
     executor_name (str): eg. "hive"
    result (ImpalaQueryResult): Contains the result after execute method is called.
@@ -208,8 +210,20 @@ class QueryExecutor(object):
     if 'hive' not in self.executor_name:
       self.exec_config.impalad = impalad
 
-  def execute(self):
-    """Execute the query using the given execution function"""
+  def execute(self, plan_first=False):
+    """Execute the query using the given execution function.
+
+    If plan_first is true, EXPLAIN the query first so timing does not include the initial
+    metadata loading required for planning.
+    """
+    if plan_first:
+      LOG.debug('Planning %s' % self.query)
+      assert isinstance(self.query, Query)
+      self.query.query_str = 'EXPLAIN ' + self.query.query_str
+      try:
+        self.exec_func(self.query, self.exec_config)
+      finally:
+        self.query.query_str = self.query.query_str[len('EXPLAIN '):]
     LOG.debug('Executing %s' % self.query)
     self._result = self.exec_func(self.query, self.exec_config)
     if not self._result.success:

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/07a71388/tests/performance/scheduler.py
----------------------------------------------------------------------
diff --git a/tests/performance/scheduler.py b/tests/performance/scheduler.py
index 777bf84..8a97dd4 100644
--- a/tests/performance/scheduler.py
+++ b/tests/performance/scheduler.py
@@ -43,6 +43,7 @@ class Scheduler(object):
     num_clients (int): Number of concurrent clients.
    impalads (list of str): A list of impalads to connect to. Ignored when the executor
       is hive.
+    plan_first (boolean): EXPLAIN queries before executing them
 
   Attributes:
     query_executors (list of QueryExecutor): initialized query executors
@@ -51,6 +52,7 @@ class Scheduler(object):
     query_iterations (int): number of times each query executor will execute
    impalads (list of str?): list of impalads for execution. It is rotated after each execution.
     num_clients (int): Number of concurrent clients
+    plan_first (boolean): EXPLAIN queries before executing them
   """
   def __init__(self, **kwargs):
     self.query_executors = kwargs.get('query_executors')
@@ -59,6 +61,7 @@ class Scheduler(object):
     self.query_iterations = kwargs.get('query_iterations', 1)
     self.impalads = kwargs.get('impalads')
     self.num_clients = kwargs.get('num_clients', 1)
+    self.plan_first = kwargs.get('plan_first', False)
     self._exit = Event()
     self._results = list()
     self._result_dict_lock = Lock()
@@ -110,7 +113,7 @@ class Scheduler(object):
             exit(1)
           try:
             query_executor.prepare(self._get_next_impalad())
-            query_executor.execute()
+            query_executor.execute(plan_first=self.plan_first)
           # QueryExecutor only throws an exception if the query fails and abort_on_error
           # is set to True. If abort_on_error is False, then the exception is logged on
           # the console and execution moves on to the next query.

http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/07a71388/tests/performance/workload_runner.py
----------------------------------------------------------------------
diff --git a/tests/performance/workload_runner.py b/tests/performance/workload_runner.py
index 677b2fa..a50af9c 100644
--- a/tests/performance/workload_runner.py
+++ b/tests/performance/workload_runner.py
@@ -140,7 +140,8 @@ class WorkloadRunner(object):
         iterations=self.config.workload_iterations,
         query_iterations=self.config.query_iterations,
         impalads=self.config.impalads,
-        num_clients=self.config.num_clients)
+        num_clients=self.config.num_clients,
+        plan_first=getattr(self.config, 'plan_first', False))
 
     scheduler.run()
     self._results.extend(scheduler.results)

Reply via email to