Repository: systemml
Updated Branches:
  refs/heads/master aedceb611 -> d2efa65c8
[SYSTEMML-1451][Phase3] phase 3 work

- Offline CSV support
- Family bug fix
- Plots
- Doc Update
- Stats update
- Bug fix: train and predict now append the family name

Closes #604


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/d2efa65c
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/d2efa65c
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/d2efa65c

Branch: refs/heads/master
Commit: d2efa65c89e3f6eaf7073c314eb56a033c8c8d5f
Parents: aedceb6
Author: krishnakalyan3 <[email protected]>
Authored: Sat Aug 26 11:52:59 2017 -0700
Committer: Nakul Jindal <[email protected]>
Committed: Sat Aug 26 11:52:59 2017 -0700

----------------------------------------------------------------------
 docs/python-performance-test.md                 |  16 ++-
 pom.xml                                         |   2 +
 scripts/perftest/python/datagen.py              |   1 +
 .../perftest/python/google_docs/gdocs_utils.py  |  35 +++++
 scripts/perftest/python/google_docs/stats.py    | 134 +++++++++----------
 scripts/perftest/python/google_docs/update.py   |  55 ++++++--
 scripts/perftest/python/predict.py              |   9 +-
 scripts/perftest/python/requirements.txt        |   4 +
 scripts/perftest/python/run_perftest.py         |  51 ++++---
 scripts/perftest/python/train.py                |   9 +-
 scripts/perftest/python/utils_exec.py           |   6 +-
 scripts/perftest/python/utils_fs.py             |   6 +
 scripts/perftest/python/utils_misc.py           |   6 +-
 13 files changed, 221 insertions(+), 113 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/d2efa65c/docs/python-performance-test.md
----------------------------------------------------------------------
diff --git a/docs/python-performance-test.md b/docs/python-performance-test.md
index ce36c2d..25e1f35 100644
--- a/docs/python-performance-test.md
+++ b/docs/python-performance-test.md
@@ -148,6 +148,17 @@ Run performance test for all algorithms under the family `regression2` and log w
 
 Run performance test for all algorithms using HDFS.
 
+## Result Consolidation and Plotting
+We have two scripts: `stats.py` for pulling results from google docs, and `update.py` for updating results to google docs or the local file system.
+
+An example invocation of `update.py`:
+`./scripts/perftest/python/google_docs/update.py --file ../../temp/perf_test_singlenode.out --exec-type singlenode --tag 2 --append test.csv`
+The arguments are `--file`, the path of the perf-test output; `--exec-type`, the execution mode used to generate the perf-test output; `--tag`, the release version or a unique name; and `--append`, an optional argument that appends the results to a local csv file. If `--auth` is used instead of `--append`, it needs the location of the `google api key` file.
+
+An example invocation of `stats.py`:
+`./stats.py --auth ../key/client_json.json --exec-type singlenode --plot stats1_data-gen_none_dense_10k_100`
+The `--plot` argument takes the name of the composite key whose results you would like to compare. If this argument is not specified, the results are grouped by key.
+
 ## Operational Notes
 All performance test depend mainly on two scripts for execution `systemml-standalone.py` and `systemml-spark-submit.py`. Incase we need to change standalone or spark parameters we need to manually change these parameters in their respective scripts.
 
@@ -158,7 +169,7 @@ The logs contain the following information below comma separated.
 
 algorithm | run_type | intercept | data_shape | matrix_type | time_sec
 --- | --- | --- | --- | --- | --- |
-multinomial|data-gen|0|dense|10k_100| 0.33
+multinomial|data-gen|0|10k_100|dense| 0.33
 MultiLogReg|train|0|10k_100|dense|6.956
 MultiLogReg|predict|0|10k_100|dense|4.780
 
@@ -187,9 +198,12 @@ Matrix Shape | Approximate Data Size
 10M_1k|80GB
 100M_1k|800GB
 
+For example, the command below runs the performance test for all data sizes described above:
 `run_perftest.py --family binomial clustering multinomial regression1 regression2 stats1 stats2 --mat-shape 10k_1k 100k_1k 1M_1k 10M_1k 100M_1k --master yarn-client --temp-dir hdfs://localhost:9000/user/systemml`
 
+By default, data generated in `hybrid_spark` execution mode is stored in the current user's `hdfs` home directory.
+
 Note: Please use this command `pip3 install -r requirements.txt` before using the perftest scripts.
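The composite key that `--plot` expects simply strings the log fields together. A minimal
sketch of the assembly, assuming the field order from the table above (the helper name
`composite_key` is illustrative, not part of the scripts):

    # Builds a key like 'stats1_data-gen_none_dense_10k_100' from one
    # pipe-separated log row: algorithm|run_type|intercept|data_shape|matrix_type|time_sec
    def composite_key(log_row):
        algorithm, run_type, intercept, data_shape, matrix_type, _ = log_row.split('|')
        return '_'.join([algorithm, run_type, intercept, matrix_type, data_shape])

    print(composite_key('multinomial|data-gen|0|10k_100|dense|0.33'))
    # -> 'multinomial_data-gen_0_dense_10k_100'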
http://git-wip-us.apache.org/repos/asf/systemml/blob/d2efa65c/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 2ed9374..0ee382c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -897,6 +897,8 @@
 				<exclude>src/test/scripts/functions/jmlc/tfmtd_example/dummycoded.column.names</exclude>
 				<exclude>src/test/scripts/functions/jmlc/tfmtd_example2/column.names</exclude>
 				<exclude>src/test/scripts/functions/jmlc/tfmtd_frame_example/tfmtd_frame</exclude>
+				<!-- Perftest requirement file -->
+				<exclude>scripts/perftest/python/requirements.txt</exclude>
 			</excludes>
 		</configuration>
 	</plugin>


http://git-wip-us.apache.org/repos/asf/systemml/blob/d2efa65c/scripts/perftest/python/datagen.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/datagen.py b/scripts/perftest/python/datagen.py
index 72a0627..9b9edf1 100755
--- a/scripts/perftest/python/datagen.py
+++ b/scripts/perftest/python/datagen.py
@@ -243,6 +243,7 @@ def config_packets_datagen(algo_payload, matrix_type, matrix_shape, datagen_dir,
     This dictionary contains algorithms to be executed as keys and the path of configuration
     json files to be executed list of values.
     """
+
     config_bundle = {}
 
     distinct_families = set(map(lambda x: x[1], algo_payload))


http://git-wip-us.apache.org/repos/asf/systemml/blob/d2efa65c/scripts/perftest/python/google_docs/gdocs_utils.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/google_docs/gdocs_utils.py b/scripts/perftest/python/google_docs/gdocs_utils.py
new file mode 100755
index 0000000..fd85093
--- /dev/null
+++ b/scripts/perftest/python/google_docs/gdocs_utils.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+from oauth2client.service_account import ServiceAccountCredentials
+import gspread
+
+
+def auth(path, sheet_name):
+    """
+    Responsible for authorization
+    """
+
+    scope = ['https://spreadsheets.google.com/feeds']
+    creds = ServiceAccountCredentials.from_json_keyfile_name(path, scope)
+    gc = gspread.authorize(creds)
+    sheet = gc.open("Perf").worksheet(sheet_name)
+    return sheet
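A quick usage sketch for the new `gdocs_utils.auth` helper, assuming a service-account
JSON key and a worksheet named after the execution mode (the key-file path is illustrative):

    from gdocs_utils import auth

    # Opens the 'singlenode' worksheet of the "Perf" spreadsheet with a
    # service-account credential and reads it back as one dict per row.
    sheet = auth('../key/client_json.json', 'singlenode')
    records = sheet.get_all_records()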
http://git-wip-us.apache.org/repos/asf/systemml/blob/d2efa65c/scripts/perftest/python/google_docs/stats.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/google_docs/stats.py b/scripts/perftest/python/google_docs/stats.py
index 3b89abe..f282964 100755
--- a/scripts/perftest/python/google_docs/stats.py
+++ b/scripts/perftest/python/google_docs/stats.py
@@ -21,93 +21,91 @@
 # -------------------------------------------------------------
 
 import argparse
-from functools import reduce
+import os
 import pprint
-from oauth2client.service_account import ServiceAccountCredentials
-import gspread
+from os.path import join
+import matplotlib.pyplot as plt
+from gdocs_utils import auth
 
-# Get time difference between difference runs
-
-def auth(path, sheet_name):
-    """
-    Responsible for authorization
-    """
-    scope = ['https://spreadsheets.google.com/feeds']
-    creds = ServiceAccountCredentials.from_json_keyfile_name(path, scope)
-    gc = gspread.authorize(creds)
-    sheet = gc.open("Perf").worksheet(sheet_name)
-    return sheet
-
-
-def get_data(sheet, tag):
+# Dict
+# {algo_name: [{'algo_1.0': t1}, {'algo_2.0': t2}]}
+def get_formatted_data(sheet_data):
     """
-    Get time and algorithm from the sheet
+    Reads all the data from google sheets and transforms it into a dictionary
+    that can be used for plotting later
     """
-    time = sheet.find('time_{}'.format(tag))
-    algo = sheet.find('algo_{}'.format(tag))
-
-    time_col = sheet.col_values(time.col)
-    time_col = list(filter(lambda x: len(x) > 0, time_col))
-
-    algo_col = sheet.col_values(algo.col)
-    algo_col = list(filter(lambda x: len(x) > 0, algo_col))
-    return algo_col, time_col
+    algo_dict = {}
+
+    for i in sheet_data:
+        inn_count = 0
+        data = []
+        for key, val in i.items():
+            inn_count += 1
+            if inn_count < 3:
+                data.append(key)
+                data.append(val)
+
+            if inn_count == 2:
+                t1, v1, _, v2 = data
+                if len(str(v2)) > 0:
+                    if v1 not in algo_dict:
+                        algo_dict[v1] = [{t1: v2}]
+                    else:
+                        algo_dict[v1].append({t1: v2})
+                inn_count = 0
+                data = []
+    return algo_dict
 
 
-def get_data_dict(data_col):
+def plot(x, y, xlab, ylab, title):
     """
-    Return data as dictionary with key as algorithm and list time values
+    Save plots to the current folder based on the arguments
     """
-    data_dict = {}
-    all_algo = []
-    for algo, _ in data_col:
-        all_algo.append(algo)
-
-    flatten_algo = reduce(lambda x, y: x+y, all_algo)
-
-    # remove the header
-    filter_data = list(filter(lambda x: not x.startswith('algo_'), flatten_algo))
-    distict_algos = set(filter_data)
-
-    for algo_dist in distict_algos:
-        for algo, time in data_col:
-            for k, v in zip(algo, time):
-                if algo_dist == k:
-                    if algo_dist not in data_dict:
-                        data_dict[k] = [v]
-                    else:
-                        data_dict[k].append(v)
-    return data_dict
+    CWD = os.getcwd()
+    PATH = join(CWD, title)
+    width = .35
+    plt.bar(x, y, color="red", width=width)
+    plt.xticks(x)
+    plt.xlabel(xlab)
+    plt.ylabel(ylab)
+    plt.title(title)
+    plt.savefig(PATH + '.png')
+    print('Plot {} generated'.format(title))
+    return plt
 
 
 # Example Usage
-# ./stats.py --auth client_json.json --exec-mode singlenode --tags 1.0 2.0
+# ./stats.py --auth ../key/client_json.json --exec-type singlenode
 if __name__ == '__main__':
     execution_mode = ['hybrid_spark', 'singlenode']
 
     cparser = argparse.ArgumentParser(description='System-ML Statistics Script')
     cparser.add_argument('--auth', help='Location to read auth file',
                          required=True, metavar='')
-    cparser.add_argument('--exec-mode', help='Execution mode', choices=execution_mode,
+    cparser.add_argument('--exec-type', help='Execution mode', choices=execution_mode,
                          required=True, metavar='')
-    cparser.add_argument('--tags', help='Tagging header value',
-                         required=True, nargs='+')
+    cparser.add_argument('--plot', help='Algorithm to plot', metavar='')
 
     args = cparser.parse_args()
-    arg_dict = vars(args)
-    sheet = auth(args.auth, args.exec_mode)
-    all_data = sheet.get_all_records()
 
-    data_col = []
-    for tag in args.tags:
-        algo_col, time_col = get_data(sheet, tag)
-        data_col.append((algo_col, time_col))
-    data_dict = get_data_dict(data_col)
-
-    delta_algo = {}
-    for k, v in data_dict.items():
-        delta = float(v[0]) - float(v[1])
-        delta_algo[k] = delta
+    sheet = auth(args.auth, args.exec_type)
+    all_data = sheet.get_all_records()
 
-    pprint.pprint(delta_algo, width=1)
+    plot_data = get_formatted_data(all_data)
+    if args.plot is not None:
+        print(plot_data[args.plot])
+        title = args.plot
+        ylab = 'Time in sec'
+        xlab = 'Version'
+        x = []
+        y = []
+        for i in plot_data[args.plot]:
+            version = list(i.keys())[0]
+            time = list(i.values())[0]
+            y.append(time)
+            x.append(version)
+
+        x = list(map(lambda x: float(x.split('_')[1]), x))
+        plot(x, y, xlab, ylab, title)
+    else:
+        pprint.pprint(plot_data, width=1)
\ No newline at end of file
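The dictionary built by `get_formatted_data` maps each composite key to a list of
per-tag timings keyed by the `algo_<tag>` column headers; `--plot` then parses the tag
out of each header for the x-axis. A hypothetical shape (names and timings invented
for illustration):

    # plot_data = get_formatted_data(sheet.get_all_records())
    plot_data = {
        'MultiLogReg_train_0_10k_100_dense': [
            {'algo_1.0': 6.956},
            {'algo_2.0': 6.780},
        ],
    }
    # --plot MultiLogReg_train_0_10k_100_dense would then bar-plot
    # time (y) against versions 1.0 and 2.0 (x).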
http://git-wip-us.apache.org/repos/asf/systemml/blob/d2efa65c/scripts/perftest/python/google_docs/update.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/google_docs/update.py b/scripts/perftest/python/google_docs/update.py
index c2fed38..056fa9c 100755
--- a/scripts/perftest/python/google_docs/update.py
+++ b/scripts/perftest/python/google_docs/update.py
@@ -20,11 +20,12 @@
 #
 # -------------------------------------------------------------
 
+import sys
+import os.path
 import argparse
-import gspread
-from oauth2client.service_account import ServiceAccountCredentials
 import pandas as pd
-
+from oauth2client.service_account import ServiceAccountCredentials
+import gspread
 
 # Update data to google sheets
 
@@ -33,6 +34,7 @@ def parse_data(file_path):
     Skip reading 1st row : Header
     Skip reading last row : Footer
     """
+
     csv_file = pd.read_csv(file_path, sep=',', skiprows=1, skipfooter=1, engine='python')
     algo = csv_file['INFO:root:algorithm'].apply(lambda x: x.split(':')[-1])
     key = algo + '_' + csv_file['run_type'] + '_' + csv_file['intercept'] + '_' + \
@@ -44,6 +46,7 @@ def auth(path, sheet_name):
     """
     Responsible for authorization
     """
+
     scope = ['https://spreadsheets.google.com/feeds']
     creds = ServiceAccountCredentials.from_json_keyfile_name(path, scope)
     gc = gspread.authorize(creds)
@@ -64,6 +67,7 @@ def insert_values(sheet, key, col_num, header):
     """
     Insert data to google sheets based on the arguments
     """
+
     # Col Name
     sheet.update_cell(1, col_num, header)
     for id, val in enumerate(key):
@@ -74,6 +78,7 @@ def get_dim(sheet):
     """
     Get the dimensions of data
     """
+
     try:
         col_count = sheet.get_all_records()
     except:
@@ -83,6 +88,16 @@ def get_dim(sheet):
     return row, col
 
 
+def row_append(data_frame, file):
+    """
+    Append results to a local csv
+    """
+
+    append_df = pd.read_csv(file)
+    concat_data = pd.concat([data_frame, append_df], axis=1)
+    return concat_data
+
+
 # Example Usage
 # ./update.py --file ../temp/test.out --exec-mode singlenode --auth client_json.json --tag 3.0
 if __name__ == '__main__':
     execution_mode = ['hybrid_spark', 'singlenode']
 
     cparser = argparse.ArgumentParser(description='System-ML Update / Stat Script')
     cparser.add_argument('--file', help='Location of the current perf test outputs',
                          required=True, metavar='')
-    cparser.add_argument('--exec-mode', help='Backend Type', choices=execution_mode,
-                         required=True, metavar='')
-    cparser.add_argument('--auth', help='Location to read auth file',
+    cparser.add_argument('--exec-type', help='Backend Type', choices=execution_mode,
                          required=True, metavar='')
     cparser.add_argument('--tag', help='Tagging header value',
                          required=True, metavar='')
+    cparser.add_argument('--auth', help='Location to read auth file', metavar='')
+    cparser.add_argument('--append', help='Location to append the outputs', metavar='')
 
     args = cparser.parse_args()
-    arg_dict = vars(args)
 
-    # Authenticate and get sheet dimensions
-    sheet = auth(args.auth, args.exec_mode)
-    row, col = get_dim(sheet)
+    if args.auth is None and args.append is None:
+        sys.exit('Both --auth and --append cannot be empty')
 
-    # Read data from file and write to google docs
     algo, time = parse_data(args.file)
-    insert_pair(algo, time, col + 1, args.tag)
+
+    if args.append is not None:
+        schema_df = {'algo_{}'.format(args.tag): algo,
+                     'time_{}'.format(args.tag): time}
+        data_frame = pd.DataFrame(schema_df)
+        if os.path.isfile(args.append):
+            append_data = row_append(data_frame, args.append)
+            append_data.to_csv(args.append, sep=',', index=False)
+        else:
+            data_frame.to_csv(args.append, sep=',', index=False)
+
+    if args.auth is not None:
+        # Read data from file and write to google docs
+        algo, time = parse_data(args.file)
+
+        # Authenticate and get sheet dimensions
+        sheet = auth(args.auth, args.exec_type)
+        row, col = get_dim(sheet)
+
+        insert_pair(algo, time, col + 1, args.tag)
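With `--append`, each tagged run contributes an `algo_<tag>`/`time_<tag>` column pair,
and `row_append` concatenates the new columns alongside the existing csv. A minimal
sketch of that merge (values invented):

    import pandas as pd

    # New run (tag 2) placed next to a previously saved run (tag 1),
    # mirroring pd.concat([data_frame, append_df], axis=1) above.
    new = pd.DataFrame({'algo_2': ['MultiLogReg_train_0_10k_100_dense'],
                        'time_2': [6.780]})
    old = pd.DataFrame({'algo_1': ['MultiLogReg_train_0_10k_100_dense'],
                        'time_1': [6.956]})
    print(pd.concat([new, old], axis=1))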
http://git-wip-us.apache.org/repos/asf/systemml/blob/d2efa65c/scripts/perftest/python/predict.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/predict.py b/scripts/perftest/python/predict.py
index d2e44e6..21eed6a 100755
--- a/scripts/perftest/python/predict.py
+++ b/scripts/perftest/python/predict.py
@@ -239,10 +239,12 @@ def config_packets_predict(algo_payload, matrix_type, matrix_shape, datagen_dir,
     This dictionary contains algorithms to be executed as keys and the path of configuration
     json files to be executed list of values.
     """
+
     config_bundle = {}
 
-    for k, _ in algo_payload:
-        config_bundle[k] = []
+    for current_algo, current_family in algo_payload:
+        key_name = current_algo + '.' + current_family
+        config_bundle[key_name] = []
 
     for current_algo, current_family in algo_payload:
         current_matrix_type = mat_type_check(current_family, matrix_type, dense_algos)
@@ -268,6 +270,7 @@ def config_packets_predict(algo_payload, matrix_type, matrix_shape, datagen_dir,
             conf_path = globals()[algo_func](save_name, current_data_gen_dir[0],
                                              current_train_folder, predict_dir, config_dir)
-            config_bundle[current_algo].append(conf_path)
+            key_name = current_algo + '.' + current_family
+            config_bundle[key_name].append(conf_path)
 
     return config_bundle
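The keying fix matters for algorithms that belong to more than one family, such as
`m-svm` in both `binomial` and `multinomial`: keying the bundle by plain algorithm name
merged their configs into one list. A minimal illustration of the new
`algorithm.family` keys (the payload values are examples):

    algo_payload = [('m-svm', 'binomial'), ('m-svm', 'multinomial')]
    config_bundle = {}
    for current_algo, current_family in algo_payload:
        key_name = current_algo + '.' + current_family
        config_bundle[key_name] = []
    print(config_bundle)
    # {'m-svm.binomial': [], 'm-svm.multinomial': []}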
http://git-wip-us.apache.org/repos/asf/systemml/blob/d2efa65c/scripts/perftest/python/requirements.txt
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/requirements.txt b/scripts/perftest/python/requirements.txt
new file mode 100644
index 0000000..8e9553d
--- /dev/null
+++ b/scripts/perftest/python/requirements.txt
@@ -0,0 +1,4 @@
+numpy
+pandas
+gspread
+matplotlib


http://git-wip-us.apache.org/repos/asf/systemml/blob/d2efa65c/scripts/perftest/python/run_perftest.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/run_perftest.py b/scripts/perftest/python/run_perftest.py
index 20f5380..dffb7a2 100755
--- a/scripts/perftest/python/run_perftest.py
+++ b/scripts/perftest/python/run_perftest.py
@@ -233,10 +233,11 @@ def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, config_dir, mo
         conf_packet = config_packets_train(algos_to_run, mat_type, mat_shape, data_gen_dir,
                                            train_dir, DENSE_TYPE_ALGOS, train_config_dir)
-        for algo_name, config_files in conf_packet.items():
+        for algo_family_name, config_files in conf_packet.items():
             for config in config_files:
+                algo_name = algo_family_name.split('.')[0]
                 file_name = ML_TRAIN[algo_name]
-                algorithm_workflow(algo_name, exec_type, config, file_name, 'train', train_dir)
+                algorithm_workflow(algo_family_name, exec_type, config, file_name, 'train', train_dir)
 
     if 'predict' in mode:
         # Create config directories
@@ -255,10 +256,12 @@ def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, config_dir, mo
         conf_packet = config_packets_predict(algos_to_run, mat_type, mat_shape, data_gen_dir,
                                              train_dir, predict_dir, DENSE_TYPE_ALGOS,
                                              predict_config_dir)
-        for algo_name, config_files in conf_packet.items():
+
+        for algo_family_name, config_files in conf_packet.items():
             for config in config_files:
+                algo_name = algo_family_name.split('.')[0]
                 file_name = ML_PREDICT[algo_name]
-                algorithm_workflow(algo_name, exec_type, config, file_name, 'predict', predict_dir)
+                algorithm_workflow(algo_family_name, exec_type, config, file_name, 'predict', predict_dir)
 
 
 if __name__ == '__main__':
@@ -290,12 +293,28 @@ if __name__ == '__main__':
 
     # Families
     all_families = ML_ALGO.keys()
 
+    # Default Conf
+    default_conf = 'spark.driver.maxResultSize=0 ' \
+                   'spark.akka.frameSize=128 ' \
+                   'spark.network.timeout=6000s ' \
+                   'spark.rpc.askTimeout=6000s ' \
+                   'spark.memory.useLegacyMode=true ' \
+                   'spark.files.useFetchCache=false'
+
+    default_conf_big_job = 'spark.executor.extraJavaOptions=\"-Xmn5500m\" ' \
+                           'spark.executor.memory=\"-Xms50g\" ' \
+                           'spark.yarn.executor.memoryOverhead=8250 ' \
+                           'spark.driver.extraJavaOptions=\"-Xms20g -Xmn2g\"'
+
     # Argparse Module
     cparser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
                                       description='SystemML Performance Test Script')
     cparser.add_argument('--family', help='space separated list of classes of algorithms '
                                           '(available : ' + ', '.join(sorted(all_families)) + ')',
-                         metavar='', choices=all_families, nargs='+', default=' '.join(all_families))
+                         metavar='', choices=all_families, nargs='+', default=all_families)
     cparser.add_argument('--algo', help='space separated list of algorithm to run '
                                         '(Overrides --family, available : ' + ', '.join(sorted(all_algos)) + ')',
                          metavar='', choices=all_algos, nargs='+')
@@ -335,12 +354,13 @@ if __name__ == '__main__':
     cparser.add_argument('--num-executors', help='Number of executors to launch', metavar='')
     cparser.add_argument('--executor-memory', help='Memory per executor', metavar='')
     cparser.add_argument('--executor-cores', help='Number of cores', metavar='')
-    cparser.add_argument('--conf', help='Spark configuration file', nargs='+', metavar='')
+    cparser.add_argument('--conf', help='Spark configuration parameters, please use these '
+                                        'parameters for large performance tests ' + default_conf_big_job,
+                         default=default_conf, nargs='+', metavar='')
 
     # Single node execution mode options
     cparser.add_argument('-heapmem', help='maximum JVM heap memory', metavar='', default='8g')
 
-    # Args is a namespace
     args = cparser.parse_args()
     all_arg_dict = vars(args)
 
@@ -358,19 +378,7 @@ if __name__ == '__main__':
         print('length of --mat-type argument cannot be greater than two')
         sys.exit()
 
-    # Check for validity of input arguments
-    if args.family is not None:
-        for fam in args.family:
-            if fam not in ML_ALGO.keys():
-                print('{} family not present in the performance test suit'.format(fam))
-                sys.exit()
-
     if args.algo is not None:
-        for algo in args.algo:
-            if algo not in all_algos:
-                print('{} algorithm not present in the performance test suit'.format(args.algo))
-                sys.exit()
-
         # This section check the validity of dual datagen algorithms like m-svm
         algo_families = {}
         for current_algo in args.algo:
@@ -385,9 +393,8 @@ if __name__ == '__main__':
             input_families = set(args.family)
             common_families = input_families.intersection(valid_families)
             if len(common_families) == 0:
-                print('Please specify a valid family for {} and the '
-                      'valid families are {}'.format(current_algo, ' '.join(valid_families)))
-                sys.exit()
+                sys.exit('Please specify a valid family for {} and the '
+                         'valid families are {}'.format(current_algo, ' '.join(valid_families)))
 
     # Set level to 0 -> debug mode
     # Set level to 20 -> Plain metrics
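Downstream consumers still need the plain algorithm name to look up the DML script,
which is why the loops above split the composite key (a sketch; the key value is an
example):

    algo_family_name = 'm-svm.binomial'
    algo_name = algo_family_name.split('.')[0]  # 'm-svm' -> ML_TRAIN lookup
    # algorithm_workflow() receives the full algo_family_name, so
    # per-family outputs no longer collide.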
http://git-wip-us.apache.org/repos/asf/systemml/blob/d2efa65c/scripts/perftest/python/train.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/train.py b/scripts/perftest/python/train.py
index ec784d7..4717ff7 100755
--- a/scripts/perftest/python/train.py
+++ b/scripts/perftest/python/train.py
@@ -369,10 +369,12 @@ def config_packets_train(algo_payload, matrix_type, matrix_shape, datagen_dir, t
     This dictionary contains algorithms to be executed as keys and the path of configuration
     json files to be executed list of values.
     """
+
    config_bundle = {}
 
-    for k, _ in algo_payload:
-        config_bundle[k] = []
+    for current_algo, current_family in algo_payload:
+        key_name = current_algo + '.' + current_family
+        config_bundle[key_name] = []
 
     for current_algo, current_family in algo_payload:
         current_matrix_type = mat_type_check(current_family, matrix_type, dense_algos)
@@ -389,7 +391,8 @@ def config_packets_train(algo_payload, matrix_type, matrix_shape, datagen_dir, t
             algo_func = '_'.join([current_family] + [current_algo.lower().replace('-', '_')] +
                                  ['train'])
             conf_path = globals()[algo_func](save_name, current_datagen_dir, train_dir, config_dir)
-            config_bundle[current_algo].append(conf_path)
+            key_name = current_algo + '.' + current_family
+            config_bundle[key_name].append(conf_path)
 
     config_packets = {}
http://git-wip-us.apache.org/repos/asf/systemml/blob/d2efa65c/scripts/perftest/python/utils_exec.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/utils_exec.py b/scripts/perftest/python/utils_exec.py
index 92a267f..27f75f2 100755
--- a/scripts/perftest/python/utils_exec.py
+++ b/scripts/perftest/python/utils_exec.py
@@ -46,7 +46,8 @@ def subprocess_exec(cmd_string, log_file_path=None, extract=None):
     Based on extract we return the relevant string
     """
     # Debug
-    #print(cmd_string)
+    # print(cmd_string)
+
     exec_command = shlex.split(cmd_string)
     proc1 = subprocess.Popen(exec_command, stdout=subprocess.PIPE,
                              stderr=subprocess.PIPE)
@@ -81,6 +82,7 @@ def parse_hdfs_base(std_outs):
     return: String
     hdfs base uri
     """
+
     hdfs_uri = None
     for line in std_outs:
         if line.startswith('hdfs://'):
@@ -94,6 +96,7 @@ def write_logs(std_outs, log_file_path):
     """
     Write all logs to the specified location
     """
+
     with open(log_file_path, 'w') as log:
         log.write("\n".join(std_outs))
@@ -108,6 +111,7 @@ def get_all_logs(process):
     return: List, List
     Std out and Error as logs as list
     """
+
     out_arr = []
     while True:
         nextline = process.stdout.readline().decode('utf8').strip()


http://git-wip-us.apache.org/repos/asf/systemml/blob/d2efa65c/scripts/perftest/python/utils_fs.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/utils_fs.py b/scripts/perftest/python/utils_fs.py
index b3cc659..0e3a7b6 100755
--- a/scripts/perftest/python/utils_fs.py
+++ b/scripts/perftest/python/utils_fs.py
@@ -37,6 +37,7 @@ def create_dir_local(directory):
     directory: String
     Location to create a directory
     """
+
     if not os.path.exists(directory):
         os.makedirs(directory)
@@ -51,6 +52,7 @@ def write_success(time, path):
     path: String
     Location to write the SUCCESS file
     """
+
     if 'data-gen' in path:
         if path.startswith('hdfs') and len(time.split('.')) == 2:
             full_path = join(path, '_SUCCESS')
@@ -75,6 +77,7 @@ def check_SUCCESS_file_exists(path):
     return: Boolean
     Checks if the file _SUCCESS exists
     """
+
     if 'data-gen' in path:
         if path.startswith('hdfs'):
             full_path = join(path, '_SUCCESS')
@@ -93,6 +96,7 @@ def contains_dir(hdfs_dirs, sub_folder):
     """
     Support for Lambda Function to check if a HDFS subfolder is contained by the HDFS directory
     """
+
     if sub_folder in hdfs_dirs:
         return True
     else:
@@ -106,6 +110,7 @@ def check_hdfs_path(path):
     """
     Check if a path is present in HDFS
     """
+
     cmd = ['hdfs', 'dfs', '-test', '-e', path]
     return_code = subprocess_exec(' '.join(cmd))
     if return_code != 0:
@@ -137,6 +142,7 @@ def relevant_folders(path, algo, family, matrix_type, matrix_shape, mode):
     return: List
     List of folder locations to read data from
     """
+
     folders = []
 
     for current_matrix_type in matrix_type:
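`check_hdfs_path` relies on `hdfs dfs -test -e`, which exits 0 when the path exists.
The same check in isolation, assuming an `hdfs` client on the PATH (the helper name
`hdfs_path_exists` is illustrative):

    import subprocess

    # True when 'path' exists in HDFS; a non-zero exit code means absent.
    def hdfs_path_exists(path):
        return subprocess.call(['hdfs', 'dfs', '-test', '-e', path]) == 0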
http://git-wip-us.apache.org/repos/asf/systemml/blob/d2efa65c/scripts/perftest/python/utils_misc.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/utils_misc.py b/scripts/perftest/python/utils_misc.py
index 704f22b..f9904c5 100755
--- a/scripts/perftest/python/utils_misc.py
+++ b/scripts/perftest/python/utils_misc.py
@@ -55,8 +55,6 @@ def split_config_args(args):
     perftest_args_dict['mode'] = args['mode']
     perftest_args_dict['temp_dir'] = args['temp_dir']
 
-
-
     systemml_args_dict = {}
 
     if args['stats'] is not None:
@@ -88,7 +86,8 @@ def split_config_args(args):
             backend_args_dict['--executor-cores'] = args['executor_cores']
 
         if args['conf'] is not None:
-            backend_args_dict['--conf'] = ' '.join(args['conf'])
+            backend_args_dict['--conf'] = ''.join(args['conf'])
+
     elif exec_type == 'singlenode':
         if args['heapmem'] is not None:
             backend_args_dict['-heapmem'] = args['heapmem']
@@ -214,6 +213,7 @@ def exec_dml_and_parse_time(exec_type, dml_file_name, args, backend_args_dict, s
     algorithm = dml_file_name + '.dml'
 
     sup_args = ''.join(['{} {}'.format(k, v) for k, v in systemml_args_dict.items()])
+
     if exec_type == 'singlenode':
         exec_script = join(os.environ.get('SYSTEMML_HOME'), 'bin', 'systemml-standalone.py')
 
         singlenode_pre_args = ''.join([' {} {} '.format(k, v) for k, v in backend_args_dict.items()])
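Since `--conf` now carries the `default_conf` string by default, a run that overrides it
might look like the following (a hypothetical invocation; the value shown is one of the
defaults):

`./scripts/perftest/python/run_perftest.py --family binomial --master yarn-client --conf spark.network.timeout=6000s`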
