Repository: systemml
Updated Branches:
  refs/heads/master a2bf0006f -> ce240af57
[MINOR] updates to performance scripts

Closes #618

Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/ce240af5
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/ce240af5
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/ce240af5

Branch: refs/heads/master
Commit: ce240af57fb68caa3a978a8bad62701cb55a139d
Parents: a2bf000
Author: Nakul Jindal <[email protected]>
Authored: Wed Aug 16 11:14:43 2017 -0700
Committer: Nakul Jindal <[email protected]>
Committed: Wed Aug 16 11:14:43 2017 -0700

----------------------------------------------------------------------
 bin/systemml-standalone.py              |  10 ++-
 scripts/perftest/python/run_perftest.py |  26 +++----
 scripts/perftest/python/utils_exec.py   |   1 +
 scripts/perftest/python/utils_misc.py   | 104 +++++++++++++++------------
 4 files changed, 82 insertions(+), 59 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/ce240af5/bin/systemml-standalone.py
----------------------------------------------------------------------
diff --git a/bin/systemml-standalone.py b/bin/systemml-standalone.py
index 4000e75..02aefcf 100755
--- a/bin/systemml-standalone.py
+++ b/bin/systemml-standalone.py
@@ -43,7 +43,7 @@ def default_classpath(systemml_home):
 
 #TODO
 # User dir, fix for SYSTEMML_1795
-def standalone_execution_entry(nvargs, args, config, explain, debug, stats, gpu, f):
+def standalone_execution_entry(nvargs, args, config, explain, debug, stats, gpu, heapmem, f):
     """
     This function is responsible for the execution of arguments via
     subprocess call in singlenode mode
@@ -57,7 +57,7 @@ def standalone_execution_entry(nvargs, args, config, explain, debug, stats, gpu,
     else:
         default_cp = ':'.join(default_classpath(systemml_home))
 
-    java_memory = '-Xmx8g -Xms4g -Xmn1g'
+    java_memory = '-Xmx' + heapmem + ' -Xms4g -Xmn1g'
 
     # Log4j
     log4j = log4j_path(systemml_home)
@@ -93,7 +93,10 @@ def standalone_execution_entry(nvargs, args, config, explain, debug, stats, gpu,
            '-f', script_file,
            '-exec', 'singlenode',
            '-config', default_config,
            ' '.join(ml_options)]
 
-    return_code = os.system(' '.join(cmd))
+    cmd = ' '.join(cmd)
+    print(cmd)
+
+    return_code = os.system(cmd)
     return return_code
 
@@ -115,6 +118,7 @@ if __name__ == '__main__':
     cparser.add_argument('-gpu', help='uses CUDA instructions when reasonable, '
                                       'set <force> option to skip conservative memory estimates '
                                       'and use GPU wherever possible', nargs='?')
+    cparser.add_argument('-heapmem', help='maximum JVM heap memory', metavar='', default='8g')
     cparser.add_argument('-f', required=True, help='specifies dml/pydml file to execute; '
                                                    'path can be local/hdfs/gpfs', metavar='')
 
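The new -heapmem flag replaces the previously hard-coded -Xmx8g and defaults
to '8g', so existing invocations behave as before. For illustration, an
invocation might look like the following (test.dml is a placeholder script,
not part of this patch):

    # run a DML script in singlenode mode with a 16g maximum JVM heap;
    # this yields java_memory = '-Xmx16g -Xms4g -Xmn1g'
    ./bin/systemml-standalone.py -heapmem 16g -f test.dml
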

http://git-wip-us.apache.org/repos/asf/systemml/blob/ce240af5/scripts/perftest/python/run_perftest.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/run_perftest.py b/scripts/perftest/python/run_perftest.py
index d430569..8c3d1fa 100755
--- a/scripts/perftest/python/run_perftest.py
+++ b/scripts/perftest/python/run_perftest.py
@@ -32,8 +32,7 @@ from datagen import config_packets_datagen
 from train import config_packets_train
 from predict import config_packets_predict
 from utils_misc import get_families, config_reader, \
-    exec_dml_and_parse_time, exec_test_data, check_predict, get_folder_metrics, args_dict_split, \
-    get_config_args
+    exec_dml_and_parse_time, exec_test_data, check_predict, get_folder_metrics, split_config_args
 from utils_fs import create_dir_local, write_success, check_SUCCESS_file_exists
 
 # A packet is a dictionary
@@ -84,8 +83,6 @@ ML_PREDICT = {'Kmeans': 'Kmeans-predict',
 
 DENSE_TYPE_ALGOS = ['clustering', 'stats1', 'stats2']
 
-sup_args_dict = {}
-
 
 # Responsible for execution and metric logging
 def algorithm_workflow(algo, exec_type, config_path, dml_file_name, action_mode, current_dir):
@@ -134,7 +131,7 @@ def algorithm_workflow(algo, exec_type, config_path, dml_file_name, action_mode,
     if exit_flag_success:
         time = 'data_exists'
     else:
-        time = exec_dml_and_parse_time(exec_type, dml_file_name, args, spark_args_dict, sup_args_dict, config_path)
+        time = exec_dml_and_parse_time(exec_type, dml_file_name, args, backend_args_dict, systemml_args_dict, config_path)
         write_success(time, temp_cwd)
 
     print('{},{},{},{},{},{}'.format(algo, action_mode, intercept, mat_type, mat_shape, time))
@@ -222,7 +219,7 @@ def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, config_dir, mo
         # The statistics families do not require a test data split
         if family_name not in ['stats1', 'stats2']:
             if not success_file:
-                exec_test_data(exec_type, spark_args_dict, sup_args_dict, data_gen_dir, config)
+                exec_test_data(exec_type, backend_args_dict, systemml_args_dict, data_gen_dir, config)
 
     if 'train' in mode:
         # Create config directories
@@ -297,7 +294,7 @@ if __name__ == '__main__':
                                      description='SystemML Performance Test Script')
     cparser.add_argument('--family', help='space separated list of classes of algorithms '
                                           '(available : ' + ', '.join(sorted(all_families)) + ')',
-                         metavar='', choices=all_families, nargs='+')
+                         metavar='', choices=all_families, nargs='+', default=' '.join(all_families))
     cparser.add_argument('--algo', help='space separated list of algorithms to run '
                                         '(overrides --family; available : ' + ', '.join(sorted(all_algos)) + ')',
                          metavar='', choices=all_algos, nargs='+')
@@ -329,7 +326,9 @@ if __name__ == '__main__':
     cparser.add_argument('-explain', help='explain plan; levels can be hops, runtime, '
                                           'recompile_hops, recompile_runtime', nargs='?', const='runtime', metavar='')
     cparser.add_argument('-config', help='SystemML configuration file (e.g. SystemML-config.xml)', metavar='')
-
+    cparser.add_argument('-gpu', help='uses CUDA instructions when reasonable, '
+                                      'set <force> option to skip conservative memory estimates '
+                                      'and use GPU wherever possible', nargs='?')
     # Spark Configuration Option
     cparser.add_argument('--master', help='local, yarn-client, yarn-cluster', metavar='')
     cparser.add_argument('--driver-memory', help='Memory for driver (e.g. 512M)', metavar='')
@@ -338,15 +337,18 @@ if __name__ == '__main__':
     cparser.add_argument('--executor-cores', help='Number of cores', metavar='')
     cparser.add_argument('--conf', help='Spark configuration file', nargs='+', metavar='')
 
+    # Single-node execution mode options
+    cparser.add_argument('-heapmem', help='maximum JVM heap memory', metavar='', default='8g')
+
+    # args is an argparse Namespace
     args = cparser.parse_args()
     all_arg_dict = vars(args)
 
-    arg_dict, config_dict, spark_dict = args_dict_split(all_arg_dict)
-
     create_dir_local(args.config_dir)
 
     # Global variables
-    sup_args_dict, spark_args_dict = get_config_args(config_dict, spark_dict, args.exec_type)
+    perftest_args_dict, systemml_args_dict, backend_args_dict = split_config_args(all_arg_dict)
 
     # Debug arguments
     # print(arg_dict)
@@ -395,8 +397,8 @@ if __name__ == '__main__':
     logging.info('algorithm,run_type,intercept,matrix_type,data_shape,time_sec')
 
     # Remove the filename item from the dictionary, as it's already been used to create the log above
-    del arg_dict['filename']
-    perf_test_entry(**arg_dict)
+    del perftest_args_dict['filename']
+    perf_test_entry(**perftest_args_dict)
 
     total_time = (time.time() - start_time)
     logging.info('Performance tests complete {0:.3f} secs \n'.format(total_time))
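Taken together, run_perftest.py now forwards the Spark options to spark-submit
in hybrid_spark mode and -heapmem to systemml-standalone.py in singlenode
mode. For illustration only (the --exec-type spelling is assumed from the
exec_type argument name, which is not shown in this hunk; all values are
made up):

    # hybrid_spark backend: Spark options become spark-submit arguments
    ./scripts/perftest/python/run_perftest.py --family binomial \
        --exec-type hybrid_spark --master yarn-client --driver-memory 8g -stats

    # singlenode backend: -heapmem is passed through to the standalone script
    ./scripts/perftest/python/run_perftest.py --family binomial \
        --exec-type singlenode -heapmem 16g
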

http://git-wip-us.apache.org/repos/asf/systemml/blob/ce240af5/scripts/perftest/python/utils_exec.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/utils_exec.py b/scripts/perftest/python/utils_exec.py
index 87ae3cd..cf98d0f 100755
--- a/scripts/perftest/python/utils_exec.py
+++ b/scripts/perftest/python/utils_exec.py
@@ -52,6 +52,7 @@ def subprocess_exec(cmd_string, log_file_path=None, extract=None):
 
     error_arr, out_arr = get_all_logs(proc1)
     std_outs = out_arr + error_arr
+    std_outs.insert(0, ' '.join(exec_command))
     return_code = proc1.returncode
 
     if log_file_path is not None:
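The one-line change above prepends the executed command to the captured
output, so each log now starts with the exact command that produced it. A
minimal, self-contained sketch of the same pattern (run_and_log and its file
names are hypothetical, not the module's API):

    import subprocess

    def run_and_log(exec_command, log_file_path):
        # run the command, capturing stdout and stderr
        proc = subprocess.Popen(exec_command,
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = proc.communicate()
        std_outs = out.decode().splitlines() + err.decode().splitlines()
        # prepend the command itself so the log file is self-describing
        std_outs.insert(0, ' '.join(exec_command))
        with open(log_file_path, 'w') as log:
            log.write('\n'.join(std_outs))
        return proc.returncode

    # usage: run_and_log(['echo', 'hello'], '/tmp/echo.log')
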

http://git-wip-us.apache.org/repos/asf/systemml/blob/ce240af5/scripts/perftest/python/utils_misc.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/utils_misc.py b/scripts/perftest/python/utils_misc.py
index e247ce8..a3c98c2 100755
--- a/scripts/perftest/python/utils_misc.py
+++ b/scripts/perftest/python/utils_misc.py
@@ -30,53 +30,68 @@ from utils_exec import subprocess_exec
 # This file contains all misc utility functions required by the performance test module
 
 
-def get_config_args(config_dict, spark_dict, exec_type):
+def split_config_args(args):
     """
     Based on the configuration parameters passed, build the configuration dictionaries used by the subprocess
 
-    config_dict: Dictionary
-        General configuration options
+    args: Dictionary
+        All parameters passed in
 
-    spark_dict: Dictionary
-        Spark configuration options
+    return: Dictionary, Dictionary, Dictionary
+        3 dictionaries - one for the perf tests, one for SystemML-specific args, one for backend options
+    """
+
+    perftest_args_dict = {}
+
+    perftest_args_dict['family'] = args['family']
+    perftest_args_dict['algo'] = args['algo']
+    perftest_args_dict['exec_type'] = args['exec_type']
+    perftest_args_dict['mat_type'] = args['mat_type']
+    perftest_args_dict['mat_shape'] = args['mat_shape']
+    perftest_args_dict['config_dir'] = args['config_dir']
+    perftest_args_dict['filename'] = args['filename']
+    perftest_args_dict['mode'] = args['mode']
+    perftest_args_dict['temp_dir'] = args['temp_dir']
 
-    exec_type: String
-        Contains the execution type singlenode / hybrid_spark
-
-    return: Dictionary, Dictionary
-        Based on the parameters passed we build to dictionary that need to be passed either at the
-        beginning or at the end
-    """
-    sup_args_dict = {}
+    systemml_args_dict = {}
 
-    if config_dict['stats'] is not None:
-        sup_args_dict['-stats'] = config_dict['stats']
+    if args['stats'] is not None:
+        systemml_args_dict['-stats'] = args['stats']
 
-    if config_dict['explain'] is not None:
-        sup_args_dict['-explain'] = config_dict['explain']
+    if args['explain'] is not None:
+        systemml_args_dict['-explain'] = args['explain']
 
-    if config_dict['config'] is not None:
-        sup_args_dict['-config'] = config_dict['config']
+    if args['config'] is not None:
+        systemml_args_dict['-config'] = args['config']
 
-    spark_args_dict = {}
+    if args['gpu'] is not None:
+        systemml_args_dict['-gpu'] = args['gpu']
+
+    backend_args_dict = {}
+    exec_type = args['exec_type']
+
     if exec_type == 'hybrid_spark':
-        if spark_dict['master'] is not None:
-            spark_args_dict['--master'] = spark_dict['master']
+        if args['master'] is not None:
+            backend_args_dict['--master'] = args['master']
 
-        if spark_dict['num_executors'] is not None:
-            spark_args_dict['--num-executors'] = spark_dict['num_executors']
+        if args['num_executors'] is not None:
+            backend_args_dict['--num-executors'] = args['num_executors']
 
-        if spark_dict['driver_memory'] is not None:
-            spark_args_dict['--driver-memory'] = spark_dict['driver_memory']
+        if args['driver_memory'] is not None:
+            backend_args_dict['--driver-memory'] = args['driver_memory']
 
-        if spark_dict['executor_cores'] is not None:
-            spark_args_dict['--executor-cores'] = spark_dict['executor_cores']
+        if args['executor_cores'] is not None:
+            backend_args_dict['--executor-cores'] = args['executor_cores']
 
-        if spark_dict['conf'] is not None:
-            spark_args_dict['--conf'] = ' '.join(spark_dict['conf'])
+        if args['conf'] is not None:
+            backend_args_dict['--conf'] = ' '.join(args['conf'])
+    elif exec_type == 'singlenode':
+        if args['heapmem'] is not None:
+            backend_args_dict['-heapmem'] = args['heapmem']
 
-    return sup_args_dict, spark_args_dict
+    return perftest_args_dict, systemml_args_dict, backend_args_dict
 
 
 def args_dict_split(all_arguments):
@@ -87,13 +102,14 @@ def args_dict_split(all_arguments):
         All input arguments parsed
 
     return: Dictionary, Dictionary, Dictionary, Dictionary
-        We return three dictionaries for init, script, spark arguments
+        We return four dictionaries for init, script, spark, and singlenode arguments
     """
     args_dict = dict(list(all_arguments.items())[0:9])
-    config_dict = dict(list(all_arguments.items())[9:12])
-    spark_dict = dict(list(all_arguments.items())[12:])
+    config_dict = dict(list(all_arguments.items())[9:13])
+    spark_dict = dict(list(all_arguments.items())[13:19])
+    singlenode_dict = dict(list(all_arguments.items())[19:])
 
-    return args_dict, config_dict, spark_dict
+    return args_dict, config_dict, spark_dict, singlenode_dict
 
 
 def get_families(current_algo, ml_algo):
@@ -166,7 +182,7 @@ def config_reader(read_path):
 
     return conf_file
 
 
-def exec_dml_and_parse_time(exec_type, dml_file_name, args, spark_args_dict, sup_args_dict, log_file_name=None):
+def exec_dml_and_parse_time(exec_type, dml_file_name, args, backend_args_dict, systemml_args_dict, log_file_name=None):
     """
     This function is responsible for executing the input arguments via a Python subprocess;
     we also extract the time taken from the output of this subprocess
 
@@ -180,10 +196,10 @@ def exec_dml_and_parse_time(exec_type, dml_file_name, args, spark_args_dict, sup
     args: Dictionary
         Key-value pairs depending on the arg type
 
-    spark_args_dict: Dictionary
-        Spark configuration arguments
+    backend_args_dict: Dictionary
+        Spark configuration arguments / singlenode config arguments
 
-    sup_args_dict: Dictionary
+    systemml_args_dict: Dictionary
         Supplementary arguments required by the script
 
     log_file_name: String
@@ -195,17 +211,17 @@ def exec_dml_and_parse_time(exec_type, dml_file_name, args, spark_args_dict, sup
 
     algorithm = dml_file_name + '.dml'
 
-    sup_args = ''.join(['{} {}'.format(k, v) for k, v in sup_args_dict.items()])
+    sup_args = ''.join(['{} {}'.format(k, v) for k, v in systemml_args_dict.items()])
     if exec_type == 'singlenode':
         exec_script = join(os.environ.get('SYSTEMML_HOME'), 'bin', 'systemml-standalone.py')
-
+        singlenode_pre_args = ''.join([' {} {} '.format(k, v) for k, v in backend_args_dict.items()])
         args = ''.join(['{} {}'.format(k, v) for k, v in args.items()])
-        cmd = [exec_script, '-f', algorithm, args, sup_args]
+        cmd = [exec_script, singlenode_pre_args, '-f', algorithm, args, sup_args]
         cmd_string = ' '.join(cmd)
 
     if exec_type == 'hybrid_spark':
         exec_script = join(os.environ.get('SYSTEMML_HOME'), 'bin', 'systemml-spark-submit.py')
-        spark_pre_args = ''.join([' {} {} '.format(k, v) for k, v in spark_args_dict.items()])
+        spark_pre_args = ''.join([' {} {} '.format(k, v) for k, v in backend_args_dict.items()])
         args = ''.join(['{} {}'.format(k, v) for k, v in args.items()])
         cmd = [exec_script, spark_pre_args, '-f', algorithm, args, sup_args]
         cmd_string = ' '.join(cmd)
@@ -238,7 +254,7 @@ def parse_time(raw_logs):
 
     return 'time_not_found'
 
 
-def exec_test_data(exec_type, spark_args_dict, sup_args_dict, datagen_path, config):
+def exec_test_data(exec_type, backend_args_dict, systemml_args_dict, datagen_path, config):
     """
     Creates the test data split from the given input path
 
@@ -256,7 +272,7 @@ def exec_test_data(exec_type, spark_args_dict, sup_args_dict, datagen_path, conf
     X_test = join(path, 'X_test.data')
     Y_test = join(path, 'Y_test.data')
     args = {'-args': ' '.join([X, Y, X_test, Y_test, 'csv'])}
-    exec_dml_and_parse_time(exec_type, test_split_script, args, spark_args_dict, sup_args_dict)
+    exec_dml_and_parse_time(exec_type, test_split_script, args, backend_args_dict, systemml_args_dict)
 
 
 def check_predict(current_algo, ml_predict):
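
To make the new three-way split concrete, an illustrative call to
split_config_args (all values are made up; the keys mirror the parser options
above):

    from utils_misc import split_config_args

    # hypothetical output of vars(cparser.parse_args()) in run_perftest.py
    all_arg_dict = {
        'family': ['binomial'], 'algo': None, 'exec_type': 'hybrid_spark',
        'mat_type': ['dense'], 'mat_shape': ['10k_100'], 'config_dir': 'temp',
        'filename': 'perf_test', 'mode': ['train'], 'temp_dir': 'temp',
        'stats': '10', 'explain': None, 'config': None, 'gpu': None,
        'master': 'yarn-client', 'driver_memory': '8g', 'num_executors': None,
        'executor_cores': None, 'conf': None, 'heapmem': '8g'}

    perftest, systemml, backend = split_config_args(all_arg_dict)
    # perftest -> the nine perf-test options (family, algo, ..., temp_dir)
    # systemml -> {'-stats': '10'}
    # backend  -> {'--master': 'yarn-client', '--driver-memory': '8g'}
    #             (with exec_type 'singlenode', backend would hold '-heapmem')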
