This is an automated email from the ASF dual-hosted git repository.

joemcdonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/impala.git

commit 667403b2cb0c05fffd33dc6f8b53dba7f9957a0a
Author: Riza Suminto <[email protected]>
AuthorDate: Thu Apr 20 17:31:59 2023 -0700

    IMPALA-12090: Split runtime profiles made by single_node_perf_run.py
    
    single_node_perf_run.py produces a single text file containing all
    runtime profiles from a perf run of one git hash. This is handy, but
    the resulting text file can be very long, which makes it difficult to
    analyze individual profiles.
    
    This patch adds the --split_profiles and --no_split_profiles options to
    single_node_perf_run.py. If --split_profiles is specified, it will
    extract runtime profiles into individual files instead of a single long
    text file. Specifying --no_split_profiles will retain the old behavior
    of putting runtime profiles into a single combined text file. The
    default is to split profiles if neither option is specified. Files in
    the profile directory will look like this with --split_profiles:
    
    $ ls -1 perf_results/latest/2267d9d104cc3fb0740cba09acb369b4d7ae4f52_profiles/
    TPCDS-Q14-1_iter001.txt
    TPCDS-Q14-1_iter002.txt
    TPCDS-Q14-1_iter003.txt
    TPCDS-Q14-2_iter001.txt
    TPCDS-Q14-2_iter002.txt
    TPCDS-Q14-2_iter003.txt
    TPCDS-Q23-1_iter001.txt
    TPCDS-Q23-1_iter002.txt
    TPCDS-Q23-1_iter003.txt
    TPCDS-Q23-2_iter001.txt
    TPCDS-Q23-2_iter002.txt
    TPCDS-Q23-2_iter003.txt
    
    Testing:
    - Manually test-ran the script with selected queries from the tpcds
      workload, with either --split_profiles or --no_split_profiles.
    
    Change-Id: Ibc2d3cefd7ad61b76cbef74c734543ef9ca51795
    Reviewed-on: http://gerrit.cloudera.org:8080/19796
    Reviewed-by: Impala Public Jenkins <[email protected]>
    Tested-by: Impala Public Jenkins <[email protected]>
---
 bin/single_node_perf_run.py | 64 +++++++++++++++++++++++++++++++++++++--------
 1 file changed, 53 insertions(+), 11 deletions(-)

diff --git a/bin/single_node_perf_run.py b/bin/single_node_perf_run.py
index 1452c9fee..2c0418175 100755
--- a/bin/single_node_perf_run.py
+++ b/bin/single_node_perf_run.py
@@ -107,6 +107,7 @@ def load_data(db_to_load, table_formats, scale):
     
configured_call(["{0}/tests/util/compute_table_stats.py".format(IMPALA_HOME),
                      "--stop_on_error", "--db_names", db_name])
 
+
 def get_git_hash_for_name(name):
   return sh.git("rev-parse", name).strip()
 
@@ -169,7 +170,7 @@ def report_benchmark_results(file_a, file_b, description):
   sh.cat(result, _out=sys.stdout)
 
 
-def compare(base_dir, hash_a, hash_b):
+def compare(base_dir, hash_a, hash_b, options):
   """Take the results of two performance runs and compare them."""
   file_a = os.path.join(base_dir, hash_a + ".json")
   file_b = os.path.join(base_dir, hash_b + ".json")
@@ -177,14 +178,22 @@ def compare(base_dir, hash_a, hash_b):
   report_benchmark_results(file_a, file_b, description)
 
   # From the two json files extract the profiles and diff them
-  generate_profile_file(file_a, hash_a, base_dir)
-  generate_profile_file(file_b, hash_b, base_dir)
-
-  sh.diff("-u",
-          os.path.join(base_dir, hash_a + "_profile.txt"),
-          os.path.join(base_dir, hash_b + "_profile.txt"),
-          _out=os.path.join(IMPALA_HOME, 
"performance_result_profile_diff.txt"),
-          _ok_code=[0, 1])
+  if options.split_profiles:
+    generate_profile_files(file_a, hash_a, base_dir)
+    generate_profile_files(file_b, hash_b, base_dir)
+    sh.diff("-u",
+            os.path.join(base_dir, hash_a + "_profiles"),
+            os.path.join(base_dir, hash_b + "_profiles"),
+            _out=os.path.join(IMPALA_HOME, 
"performance_result_profile_diff.txt"),
+            _ok_code=[0, 1])
+  else:
+    generate_profile_file(file_a, hash_a, base_dir)
+    generate_profile_file(file_b, hash_b, base_dir)
+    sh.diff("-u",
+            os.path.join(base_dir, hash_a + "_profile.txt"),
+            os.path.join(base_dir, hash_b + "_profile.txt"),
+            _out=os.path.join(IMPALA_HOME, 
"performance_result_profile_diff.txt"),
+            _ok_code=[0, 1])
 
 
 def generate_profile_file(name, hash, base_dir):
@@ -202,6 +211,33 @@ def generate_profile_file(name, hash, base_dir):
           out.write("\n\n")
 
 
+def generate_profile_files(name, hash, base_dir):
+  """Extracts runtime profiles from the JSON file 'name'.
+
+  Writes the runtime profiles back as separated simple text file in 
'[hash]_profiles' dir
+  in base_dir.
+  """
+  profile_dir = os.path.join(base_dir, hash + "_profiles")
+  if not os.path.exists(profile_dir):
+    os.makedirs(profile_dir)
+  with open(name) as fid:
+    data = json.loads(fid.read().decode("utf-8", "ignore"))
+    iter_num = {}
+    # For each query
+    for key in data:
+      for iteration in data[key]:
+        query_name = iteration["query"]["name"]
+        if query_name in iter_num:
+          iter_num[query_name] += 1
+        else:
+          iter_num[query_name] = 1
+        curr_iter = iter_num[query_name]
+
+        file_name = "{}_iter{:03d}.txt".format(query_name, curr_iter)
+        with open(os.path.join(profile_dir, file_name), "w") as out:
+          out.write(iteration["runtime_profile"])
+
+
 def backup_workloads():
   """Copy the workload folder to a temporary directory and returns its name.
 
@@ -266,7 +302,7 @@ def perf_ab_test(options, args):
     restore_workloads(workload_dir)
     start_impala(options.num_impalads, options)
     run_workload(temp_dir, workloads, options)
-    compare(temp_dir, hash_a, hash_b)
+    compare(temp_dir, hash_a, hash_b, options)
 
 
 def parse_options():
@@ -289,10 +325,16 @@ def parse_options():
   parser.add_option("--start_minicluster", action="store_true",
                     help="start a new Hadoop minicluster")
   parser.add_option("--ninja", action="store_true",
-                    help = "use ninja, rather than Make, as the build tool")
+                    help="use ninja, rather than Make, as the build tool")
   parser.add_option("--impalad_args", dest="impalad_args", action="append", 
type="string",
                     default=[],
                     help="Additional arguments to pass to each Impalad during 
startup")
+  parser.add_option("--split_profiles", action="store_true", 
dest="split_profiles",
+                    default=True, help=("If specified, query profiles will be 
generated "
+                      "as separate files"))
+  parser.add_option("--no_split_profiles", action="store_false", 
dest="split_profiles",
+                    help=("If specified, query profiles will be generated as a 
"
+                      "single-combined file"))
 
   parser.set_usage(textwrap.dedent("""
     single_node_perf_run.py [options] git_hash_A [git_hash_B]

Reply via email to