Repository: systemml
Updated Branches:
  refs/heads/master 0cb2f7f68 -> a725b2d2e
[MINOR]bug fixes & feature added in perf test & spark-submit python scripts Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/a725b2d2 Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/a725b2d2 Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/a725b2d2 Branch: refs/heads/master Commit: a725b2d2ebf6dcb56f4edb68376c3849c8991b27 Parents: 0cb2f7f Author: Nakul Jindal <[email protected]> Authored: Thu Sep 28 15:28:17 2017 -0700 Committer: Nakul Jindal <[email protected]> Committed: Thu Sep 28 15:28:17 2017 -0700 ---------------------------------------------------------------------- bin/systemml-spark-submit.py | 30 ++++++++----- scripts/perftest/python/datagen.py | 2 +- scripts/perftest/python/predict.py | 2 +- scripts/perftest/python/run_perftest.py | 19 ++++++--- scripts/perftest/python/train.py | 2 +- scripts/perftest/python/utils_misc.py | 63 +++++++++++++++------------- 6 files changed, 70 insertions(+), 48 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/a725b2d2/bin/systemml-spark-submit.py ---------------------------------------------------------------------- diff --git a/bin/systemml-spark-submit.py b/bin/systemml-spark-submit.py index b6426b3..b4da801 100755 --- a/bin/systemml-spark-submit.py +++ b/bin/systemml-spark-submit.py @@ -92,25 +92,35 @@ def spark_submit_entry(master, driver_memory, num_executors, executor_memory, ml_options.append(stats) if gpu is not None: ml_options.append('-gpu') - ml_options.append(gpu) + if gpu is not 'no_option': + ml_options.append(gpu) if len(ml_options) < 1: ml_options = '' # stats, explain, target_jars cmd_spark = [spark_path, '--class', 'org.apache.sysml.api.DMLScript', - '--master', master, '--driver-memory', driver_memory, - '--num-executors', num_executors, '--executor-memory', executor_memory, - '--executor-cores', executor_cores, '--conf', 
default_conf, + '--master', master, + '--driver-memory', driver_memory, + '--conf', default_conf, '--jars', cuda_jars, systemml_jars] + if num_executors is not None: + cmd_spark = cmd_spark + ['--num-executors', num_executors] + + if executor_memory is not None: + cmd_spark = cmd_spark + ['--executor-memory', executor_memory] + + if executor_cores is not None: + cmd_spark = cmd_spark + ['--executor-cores', executor_cores] + cmd_system_ml = ['-config', default_config, '-exec', 'hybrid_spark', '-f', script_file, ' '.join(ml_options)] cmd = cmd_spark + cmd_system_ml # Debug - # print(' '.join(cmd)) + print(' '.join(cmd)) return_code = os.system(' '.join(cmd)) return return_code @@ -120,10 +130,10 @@ if __name__ == '__main__': description='System-ML Spark Submit Script') # SPARK-SUBMIT Options cparser.add_argument('--master', default='local[*]', help='local, yarn-client, yarn-cluster', metavar='') - cparser.add_argument('--driver-memory', default='5G', help='Memory for driver (e.g. 512M)', metavar='') - cparser.add_argument('--num-executors', default='2', help='Number of executors to launch', metavar='') - cparser.add_argument('--executor-memory', default='2G', help='Memory per executor', metavar='') - cparser.add_argument('--executor-cores', default='1', help='Number of cores', metavar='') + cparser.add_argument('--driver-memory', default='8G', help='Memory for driver (e.g. 
512M, 1G)', metavar='') + cparser.add_argument('--num-executors', nargs=1, help='Number of executors to launch', metavar='') + cparser.add_argument('--executor-memory', nargs=1, help='Memory per executor', metavar='') + cparser.add_argument('--executor-cores', nargs=1, help='Number of executor cores', metavar='') cparser.add_argument('--conf', help='Spark configuration file', nargs='+', metavar='') # SYSTEM-ML Options @@ -138,7 +148,7 @@ if __name__ == '__main__': metavar='') cparser.add_argument('-gpu', help='uses CUDA instructions when reasonable, ' 'set <force> option to skip conservative memory estimates ' - 'and use GPU wherever possible', nargs='?') + 'and use GPU wherever possible', nargs='?', const='no_option') cparser.add_argument('-f', required=True, help='specifies dml/pydml file to execute; ' 'path can be local/hdfs/gpfs', metavar='') http://git-wip-us.apache.org/repos/asf/systemml/blob/a725b2d2/scripts/perftest/python/datagen.py ---------------------------------------------------------------------- diff --git a/scripts/perftest/python/datagen.py b/scripts/perftest/python/datagen.py index 9b9edf1..6794187 100755 --- a/scripts/perftest/python/datagen.py +++ b/scripts/perftest/python/datagen.py @@ -25,7 +25,7 @@ from os.path import join from utils_misc import split_rowcol, config_writer, mat_type_check # This file contains configuration settings for data generation -DATA_FORMAT = 'csv' +DATA_FORMAT = 'binary' MATRIX_TYPE_DICT = {'dense': '0.9', 'sparse': '0.01'} http://git-wip-us.apache.org/repos/asf/systemml/blob/a725b2d2/scripts/perftest/python/predict.py ---------------------------------------------------------------------- diff --git a/scripts/perftest/python/predict.py b/scripts/perftest/python/predict.py index 21eed6a..67467b1 100755 --- a/scripts/perftest/python/predict.py +++ b/scripts/perftest/python/predict.py @@ -26,7 +26,7 @@ from utils_misc import config_writer, mat_type_check from utils_fs import relevant_folders # Contains configuration 
setting for predicting -DATA_FORMAT = 'csv' +DATA_FORMAT = 'binary' def m_svm_predict(save_folder_name, datagen_dir, train_dir, predict_dir, config_dir): http://git-wip-us.apache.org/repos/asf/systemml/blob/a725b2d2/scripts/perftest/python/run_perftest.py ---------------------------------------------------------------------- diff --git a/scripts/perftest/python/run_perftest.py b/scripts/perftest/python/run_perftest.py index dffb7a2..f0b272a 100755 --- a/scripts/perftest/python/run_perftest.py +++ b/scripts/perftest/python/run_perftest.py @@ -141,7 +141,7 @@ def algorithm_workflow(algo, exec_type, config_path, dml_file_name, action_mode, return exit_flag_success -def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, config_dir, mode, temp_dir): +def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, config_dir, mode, temp_dir, file_system_type): """ This function is the entry point for performance testing @@ -168,6 +168,9 @@ def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, config_dir, mo temp_dir: String Location to store all output files created during perf test + + file_system_type: String + """ # algos to run is a list of tuples with # [(m-svm, binomial), (m-svm, multinomial)...] 
@@ -275,6 +278,7 @@ if __name__ == '__main__': mat_type = ['dense', 'sparse', 'all'] workload = ['data-gen', 'train', 'predict'] execution_mode = ['hybrid_spark', 'singlenode'] + file_system_type = ['hdfs', 'local'] # Default Arguments default_mat_shape = ['10k_100'] @@ -308,7 +312,6 @@ if __name__ == '__main__': 'spark.driver.extraJavaOptions=\"-Xms20g -Xmn2g\"' - # Argparse Module cparser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, description='SystemML Performance Test Script') @@ -335,8 +338,12 @@ if __name__ == '__main__': cparser.add_argument('--mode', default=workload, help='space separated list of types of workloads to run (available: data-gen, train, predict)', metavar='', choices=workload, nargs='+') - # Change this to temp-dir - cparser.add_argument('--temp-dir', help='define the file system to work on', metavar='') + cparser.add_argument('--temp-dir', help='the path on the file system to place the working temporary directory at', + metavar='') + cparser.add_argument('--file-system-type', choices=file_system_type, metavar='', + help='file system for temp directory, ' + 'supported types are \'hdfs\' for hybrid_spark and \'local\' for standalone;' + 'default for hybrid_spark is \'hdfs\' and for standalone is \'local\'') # Configuration Options cparser.add_argument('-stats', help='Monitor and report caching/recompilation statistics, ' @@ -347,7 +354,7 @@ if __name__ == '__main__': cparser.add_argument('-config', help='System-ML configuration file (e.g SystemML-config.xml)', metavar='') cparser.add_argument('-gpu', help='uses CUDA instructions when reasonable, ' 'set <force> option to skip conservative memory estimates ' - 'and use GPU wherever possible', nargs='?') + 'and use GPU wherever possible', nargs='?', const='no_option') # Spark Configuration Option cparser.add_argument('--master', help='local, yarn-client, yarn-cluster', metavar='') cparser.add_argument('--driver-memory', help='Memory for driver (e.g. 
512M)', metavar='') @@ -371,7 +378,7 @@ if __name__ == '__main__': perftest_args_dict, systemml_args_dict, backend_args_dict = split_config_args(all_arg_dict) # temp_dir hdfs / local path check - perftest_args_dict['temp_dir'] = get_default_dir(args.temp_dir, args.exec_type, default_config_dir) + perftest_args_dict['temp_dir'] = get_default_dir(args.file_system_type, args.temp_dir, args.exec_type, default_config_dir) # default_mat_type validity if len(args.mat_type) > 2: http://git-wip-us.apache.org/repos/asf/systemml/blob/a725b2d2/scripts/perftest/python/train.py ---------------------------------------------------------------------- diff --git a/scripts/perftest/python/train.py b/scripts/perftest/python/train.py index 4717ff7..a95950d 100755 --- a/scripts/perftest/python/train.py +++ b/scripts/perftest/python/train.py @@ -27,7 +27,7 @@ from functools import reduce from utils_fs import relevant_folders # Contains configuration setting for training -DATA_FORMAT = 'csv' +DATA_FORMAT = 'binary' def binomial_m_svm_train(save_folder_name, datagen_dir, train_dir, config_dir): http://git-wip-us.apache.org/repos/asf/systemml/blob/a725b2d2/scripts/perftest/python/utils_misc.py ---------------------------------------------------------------------- diff --git a/scripts/perftest/python/utils_misc.py b/scripts/perftest/python/utils_misc.py index 92dbc73..87b870b 100755 --- a/scripts/perftest/python/utils_misc.py +++ b/scripts/perftest/python/utils_misc.py @@ -54,30 +54,28 @@ def split_config_args(args): perftest_args_dict['filename'] = args['filename'] perftest_args_dict['mode'] = args['mode'] perftest_args_dict['temp_dir'] = args['temp_dir'] + perftest_args_dict['file_system_type'] = args['file_system_type'] systemml_args_dict = {} - if 'stats' in args.keys(): - if args['stats'] is not None: - systemml_args_dict['-stats'] = args['stats'] - else: - systemml_args_dict['-stats'] = '' + if args['stats'] is not None: + systemml_args_dict['-stats'] = args['stats'] + else: + 
systemml_args_dict['-stats'] = '' - if 'explain' in args.keys(): - if args['explain'] is not None: - systemml_args_dict['-explain'] = args['explain'] - else: - systemml_args_dict['-explain'] = '' + if args['explain'] is not None: + systemml_args_dict['-explain'] = args['explain'] + else: + systemml_args_dict['-explain'] = '' - if 'config' in args.keys(): - if args['config'] is not None: - systemml_args_dict['-config'] = args['config'] + if args['config'] is not None: + systemml_args_dict['-config'] = args['config'] - if 'gpu' in args.keys(): - if args['gpu'] is not None: - systemml_args_dict['-gpu'] = args['gpu'] - else: + if args['gpu'] is not None: + if args['gpu'] == 'no_option': systemml_args_dict['-gpu'] = '' + else: + systemml_args_dict['-gpu'] = args['gpu'] backend_args_dict = {} exec_type = args['exec_type'] @@ -373,8 +371,9 @@ def mat_type_check(current_family, matrix_types, dense_algos): return current_type -def get_default_dir(temp_dir, exec_mode, config_dir): +def get_default_dir(file_system_type, temp_dir, exec_mode, config_dir): """ + file_system_type: String temp_dir: String exec_mode: String config_dir: String @@ -390,17 +389,23 @@ def get_default_dir(temp_dir, exec_mode, config_dir): return temp_dir if exec_mode == 'hybrid_spark': - cmd = ['hdfs', 'getconf', '-confKey', 'fs.default.name'] - hdfs_base = subprocess_exec(' '.join(cmd), extract='hdfs_base') + if file_system_type == 'hdfs': + cmd = ['hdfs', 'getconf', '-confKey', 'fs.default.name'] + hdfs_base = subprocess_exec(' '.join(cmd), extract='hdfs_base') - if temp_dir is None: - hdfs_home = join(hdfs_base, 'user', getpass.getuser()) - check_hdfs_path(hdfs_home) - return hdfs_home + if temp_dir is None: + hdfs_home = join(hdfs_base, 'user', getpass.getuser()) + check_hdfs_path(hdfs_home) + return hdfs_home - if temp_dir is not None: - if temp_dir.startswith('hdfs'): + if temp_dir is not None: + if temp_dir.startswith('hdfs'): + return temp_dir + else: + hdfs_home = join(hdfs_base, 'user', 
getpass.getuser(), temp_dir) + return hdfs_home + else: + if temp_dir is None: + return config_dir + if temp_dir is not None: return temp_dir - else: - hdfs_home = join(hdfs_base, 'user', getpass.getuser(), temp_dir) - return hdfs_home
