[SYSTEMML-1451] phase 2 work Completed these tasks as part of Phase 2 for Google Summer of Code '17 - Decouple systemml-spark-submit.py - Decouple systemml-standalone.py - Refactor perf test suite to accept args like debug, stats, config, etc. - Add HDFS support - Google Docs support - Compare SystemML with previous versions - Pylint, Comment - Extra arguments configuration Test - Windows Test - Doc update - systemml standalone comments - systemml spark submit comments
Closes #575 Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/e94374af Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/e94374af Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/e94374af Branch: refs/heads/master Commit: e94374afb2e6be5dc81524f9c7a5de09b9f4ba26 Parents: a2db1ad Author: krishnakalyan3 <krishnakaly...@gmail.com> Authored: Tue Aug 1 13:46:30 2017 -0700 Committer: Nakul Jindal <naku...@gmail.com> Committed: Tue Aug 1 13:46:30 2017 -0700 ---------------------------------------------------------------------- bin/systemml-spark-submit.py | 278 +++++++-------- bin/systemml-standalone.py | 256 +++++--------- bin/utils.py | 113 ++++++ docs/python-performance-test.md | 35 +- scripts/perftest/python/datagen.py | 141 ++++---- scripts/perftest/python/google_docs/stats.py | 113 ++++++ scripts/perftest/python/google_docs/update.py | 110 ++++++ scripts/perftest/python/predict.py | 156 ++++----- scripts/perftest/python/run_perftest.py | 135 ++++--- scripts/perftest/python/train.py | 257 +++++++------- scripts/perftest/python/utils.py | 390 --------------------- scripts/perftest/python/utils_exec.py | 137 ++++++++ scripts/perftest/python/utils_fs.py | 162 +++++++++ scripts/perftest/python/utils_misc.py | 347 ++++++++++++++++++ 14 files changed, 1580 insertions(+), 1050 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/e94374af/bin/systemml-spark-submit.py ---------------------------------------------------------------------- diff --git a/bin/systemml-spark-submit.py b/bin/systemml-spark-submit.py index 30974ec..b6426b3 100755 --- a/bin/systemml-spark-submit.py +++ b/bin/systemml-spark-submit.py @@ -21,167 +21,131 @@ # ------------------------------------------------------------- import os -import sys -from os.path import join, exists, abspath -from os import environ import glob -import 
argparse -import shutil +from os.path import join import platform - -if environ.get('SPARK_HOME') is None: - print('SPARK_HOME not set') - sys.exit(1) -else: - spark_home = environ.get('SPARK_HOME') +import argparse +from utils import get_env_systemml_home, get_env_spark_home, find_dml_file, log4j_path, config_path + + +def default_jars(systemml_home): + """ + return: String + Location of systemml and jcuda jars + """ + build_dir = join(systemml_home, 'target') + lib_dir = join(build_dir, 'lib') + systemml_jar = build_dir + os.sep + "SystemML.jar" + jcuda_jars = glob.glob(lib_dir + os.sep + "jcu*.jar") + target_jars = ','.join(jcuda_jars) + return target_jars, systemml_jar + + +def spark_submit_entry(master, driver_memory, num_executors, executor_memory, + executor_cores, conf, + nvargs, args, config, explain, debug, stats, gpu, f): + """ + This function is responsible for the execution of arguments via + subprocess call in hybrid_spark mode + """ + + spark_home = get_env_spark_home() + systemml_home = get_env_systemml_home() spark_path = join(spark_home, 'bin', 'spark-submit') + script_file = find_dml_file(systemml_home, f) + # Jars + cuda_jars, systemml_jars = default_jars(systemml_home) -# error help print -def print_usage_and_exit(): - print('Usage: ./systemml-spark-submit.py -f <dml-filename> [arguments]') - sys.exit(1) - -cparser = argparse.ArgumentParser(description='System-ML Spark Submit Script') - -# SPARK-SUBMIT Options -cparser.add_argument('--master', default='local[*]', help='local, yarn-client, yarn-cluster', metavar='') -cparser.add_argument('--driver-memory', default='5G', help='Memory for driver (e.g. 
512M)', metavar='') -cparser.add_argument('--num-executors', default='2', help='Number of executors to launch', metavar='') -cparser.add_argument('--executor-memory', default='2G', help='Memory per executor', metavar='') -cparser.add_argument('--executor-cores', default='1', help='Number of cores', metavar='') -cparser.add_argument('--conf', help='Spark configuration file', nargs='+', metavar='') - -# SYSTEM-ML Options -cparser.add_argument('-nvargs', help='List of attributeName-attributeValue pairs', nargs='+', metavar='') -cparser.add_argument('-args', help='List of positional argument values', metavar='', nargs='+') -cparser.add_argument('-config', help='System-ML configuration file (e.g SystemML-config.xml)', metavar='') -cparser.add_argument('-exec', default='hybrid_spark', help='System-ML backend (e.g spark, spark-hybrid)', metavar='') -cparser.add_argument('-explain', help='explains plan levels can be hops, runtime, ' - 'recompile_hops, recompile_runtime', nargs='?', const='runtime', metavar='') -cparser.add_argument('-debug', help='runs in debug mode', action='store_true') -cparser.add_argument('-stats', help='Monitor and report caching/recompilation statistics, ' - 'heavy hitter <count> is 10 unless overridden', nargs='?', const='10', metavar='') -cparser.add_argument('-gpu', help='uses CUDA instructions when reasonable, ' - 'set <force> option to skip conservative memory estimates ' - 'and use GPU wherever possible', nargs='?') -cparser.add_argument('-f', required=True, help='specifies dml/pydml file to execute; ' - 'path can be local/hdfs/gpfs', metavar='') - -args = cparser.parse_args() - -# Optional arguments -ml_options = [] -if args.nvargs is not None: - ml_options.append('-nvargs') - ml_options.append(' '.join(args.nvargs)) -if args.args is not None: - ml_options.append('-args') - ml_options.append(' '.join(args.args)) -if args.debug is not False: - ml_options.append('-debug') -if args.explain is not None: - ml_options.append('-explain') - 
ml_options.append(args.explain) -if args.gpu is not None: - ml_options.append('-gpu') - ml_options.append(args.gpu) -if args.stats is not None: - ml_options.append('-stats') - ml_options.append(args.stats) - -# Assign script file to name received from argparse module -script_file = args.f - -# find the systemML root path which contains the bin folder, the script folder and the target folder -# tolerate path with spaces -script_dir = os.path.dirname(os.path.realpath(__file__)) -project_root_dir = os.path.dirname(script_dir) -user_dir = os.getcwd() - -scripts_dir = join(project_root_dir, 'scripts') -build_dir = join(project_root_dir, 'target') -lib_dir = join(build_dir, 'lib') - -systemml_jar = build_dir + os.sep + "SystemML.jar" -jcuda_jars = glob.glob(lib_dir + os.sep + "jcu*.jar") -target_jars = ','.join(jcuda_jars) # Include all JCuda Jars - -log4j_properties_path = join(project_root_dir, 'conf', 'log4j.properties.template') - -build_err_msg = 'You must build the project before running this script.' -build_dir_err_msg = 'Could not find target directory ' + build_dir + '. ' + build_err_msg - -# check if the project had been built and the jar files exist -if not (exists(build_dir)): - print(build_dir_err_msg) - sys.exit(1) - -print('================================================================================') - -# if the present working directory is the project root or bin folder, then use the temp folder as user.dir -if user_dir == project_root_dir or user_dir == join(project_root_dir, 'bin'): - user_dir = join(project_root_dir, 'temp') - print('Output dir: ' + user_dir) - -# if the SystemML-config.xml does not exist, create it from the template -systemml_config_path = join(project_root_dir, 'conf', 'SystemML-config.xml') -systemml_template_config_path = join(project_root_dir, 'conf', 'SystemML-config.xml.template') -if not (exists(systemml_config_path)): - shutil.copyfile(systemml_template_config_path, systemml_config_path) - print('... 
created ' + systemml_config_path) - -# if SystemML-config.xml is provided as arguments -if args.config is None: - systemml_config_path_arg = systemml_config_path -else: - systemml_config_path_arg = args.config - - -# from http://stackoverflow.com/questions/1724693/find-a-file-in-python -def find_file(name, path): - for root, dirs, files in os.walk(path): - if name in files: - return join(root, name) - return None - -# if the script file path was omitted, try to complete the script path -if not (exists(script_file)): - script_file_name = abspath(script_file) - script_file_found = find_file(script_file, scripts_dir) - if script_file_found is None: - print('Could not find DML script: ' + script_file) - print_usage_and_exit() + # Log4j + log4j = log4j_path(systemml_home) + log4j_properties_path = 'spark.driver.extraJavaOptions=-Dlog4j.configuration=file:{}'.format(log4j) + if conf is None: + default_conf = log4j_properties_path else: - script_file = script_file_found - print('DML Script:' + script_file) - -default_conf = 'spark.driver.extraJavaOptions=-Dlog4j.configuration=file:{}'.format(log4j_properties_path) - -# Backslash problem in windows. 
-if platform.system() == 'Windows': - default_conf = default_conf.replace('\\', '//') - -if args.conf is not None: - conf = ' --conf '.join(args.conf + [default_conf]) -else: - conf = default_conf + default_conf = ' --conf '.join(conf + [log4j_properties_path]) -cmd_spark = [spark_path, '--class', 'org.apache.sysml.api.DMLScript', - '--master', args.master, '--driver-memory', args.driver_memory, - '--num-executors', args.num_executors, '--executor-memory', args.executor_memory, - '--executor-cores', args.executor_cores, '--conf', conf, '--jars', target_jars, - systemml_jar] - -cmd_system_ml = ['-config', systemml_config_path_arg, - '-exec', vars(args)['exec'], '-f', script_file, ' '.join(ml_options)] - -cmd = cmd_spark + cmd_system_ml - -return_code = os.system(' '.join(cmd)) -# For debugging -# print(' '.join(cmd)) - -if return_code != 0: - print('Failed to run SystemML. Exit code :' + str(return_code)) - print(' '.join(cmd)) + # Config XML + if config is None: + default_config = config_path(systemml_home) + else: + default_config = ' -config '.join([config] + [config_path(systemml_home)]) + + if platform.system() == 'Windows': + default_conf = default_conf.replace('\\', '//') + + # optional arguments + ml_options = [] + if nvargs is not None: + ml_options.append('-nvargs') + ml_options.append(' '.join(nvargs)) + if args is not None: + ml_options.append('-args') + ml_options.append(' '.join(args)) + if explain is not None: + ml_options.append('-explain') + ml_options.append(explain) + if debug is not False: + ml_options.append('-debug') + if stats is not None: + ml_options.append('-stats') + ml_options.append(stats) + if gpu is not None: + ml_options.append('-gpu') + ml_options.append(gpu) + + if len(ml_options) < 1: + ml_options = '' + + # stats, explain, target_jars + cmd_spark = [spark_path, '--class', 'org.apache.sysml.api.DMLScript', + '--master', master, '--driver-memory', driver_memory, + '--num-executors', num_executors, '--executor-memory', 
executor_memory, + '--executor-cores', executor_cores, '--conf', default_conf, + '--jars', cuda_jars, systemml_jars] + + cmd_system_ml = ['-config', default_config, + '-exec', 'hybrid_spark', '-f', script_file, ' '.join(ml_options)] + + cmd = cmd_spark + cmd_system_ml + + # Debug + # print(' '.join(cmd)) + return_code = os.system(' '.join(cmd)) + return return_code + + +if __name__ == '__main__': + cparser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description='System-ML Spark Submit Script') + # SPARK-SUBMIT Options + cparser.add_argument('--master', default='local[*]', help='local, yarn-client, yarn-cluster', metavar='') + cparser.add_argument('--driver-memory', default='5G', help='Memory for driver (e.g. 512M)', metavar='') + cparser.add_argument('--num-executors', default='2', help='Number of executors to launch', metavar='') + cparser.add_argument('--executor-memory', default='2G', help='Memory per executor', metavar='') + cparser.add_argument('--executor-cores', default='1', help='Number of cores', metavar='') + cparser.add_argument('--conf', help='Spark configuration file', nargs='+', metavar='') + + # SYSTEM-ML Options + cparser.add_argument('-nvargs', help='List of attributeName-attributeValue pairs', nargs='+', metavar='') + cparser.add_argument('-args', help='List of positional argument values', metavar='', nargs='+') + cparser.add_argument('-config', help='System-ML configuration file (e.g SystemML-config.xml)', metavar='') + cparser.add_argument('-explain', help='explains plan levels can be hops, runtime, ' + 'recompile_hops, recompile_runtime', nargs='?', const='runtime', metavar='') + cparser.add_argument('-debug', help='runs in debug mode', action='store_true') + cparser.add_argument('-stats', help='Monitor and report caching/recompilation statistics, ' + 'heavy hitter <count> is 10 unless overridden', nargs='?', const='10', + metavar='') + cparser.add_argument('-gpu', help='uses CUDA instructions when 
reasonable, ' + 'set <force> option to skip conservative memory estimates ' + 'and use GPU wherever possible', nargs='?') + cparser.add_argument('-f', required=True, help='specifies dml/pydml file to execute; ' + 'path can be local/hdfs/gpfs', metavar='') + + args = cparser.parse_args() + arg_dict = vars(args) + + return_code = spark_submit_entry(**arg_dict) + + if return_code != 0: + print('Failed to run SystemML. Exit code :' + str(return_code)) http://git-wip-us.apache.org/repos/asf/systemml/blob/e94374af/bin/systemml-standalone.py ---------------------------------------------------------------------- diff --git a/bin/systemml-standalone.py b/bin/systemml-standalone.py index a0ee8db..4000e75 100755 --- a/bin/systemml-standalone.py +++ b/bin/systemml-standalone.py @@ -21,180 +21,106 @@ #------------------------------------------------------------- import os -import shutil -import sys -from os.path import join, exists +from os.path import join +import argparse +import platform +from utils import get_env_systemml_home, find_dml_file, log4j_path, config_path -# error help print -def print_usage_and_exit(): - this_script = sys.argv[0] - print('Usage: ' + this_script + ' <dml-filename> [arguments]') - sys.exit(1) +def default_classpath(systemml_home): + """ + Classpath information required for excution + return: String + Classpath location of build, library and hadoop directories + """ + build_lib = join(systemml_home, 'target', '*') + lib_lib = join(systemml_home, 'target', 'lib', '*') + hadoop_lib = join(systemml_home, 'target', 'lib', 'hadoop', '*') -# from http://stackoverflow.com/questions/1724693/find-a-file-in-python -def find_file(name, path): - for root, dirs, files in os.walk(path): - if name in files: - return join(root, name) - return None + return build_lib, lib_lib, hadoop_lib -if len(sys.argv) < 2: - print('Wrong usage') - print_usage_and_exit() +#TODO +# User dir, fix for SYSTEMML_1795 +def standalone_execution_entry(nvargs, args, config, explain, 
debug, stats, gpu, f): + """ + This function is responsible for the execution of arguments via + subprocess call in singlenode mode + """ + systemml_home = get_env_systemml_home() + script_file = find_dml_file(systemml_home, f) -# find the systemML root path which contains the bin folder, the script folder and the target folder -# tolerate path with spaces -script_dir = os.path.dirname(os.path.realpath(__file__)) -project_root_dir = os.path.dirname(script_dir) -user_dir = os.getcwd() - -scripts_dir = join(project_root_dir, 'scripts') -build_dir = join(project_root_dir, 'target') -lib_dir = join(build_dir, 'lib') -dml_script_class = join(build_dir, 'classes', 'org', 'apache', 'sysml', 'api', 'DMLScript.class') -hadoop_home = join(lib_dir, 'hadoop') - - -build_err_msg = 'You must build the project before running this script.' -build_dir_err_msg = 'Could not find target directory ' + build_dir + '. ' + build_err_msg - -lib_dir_err_msg = 'Could not find required libraries.' + build_err_msg -dml_script_err_msg = 'Could not find ' + dml_script_class + '. 
' + build_err_msg - -# check if the project had been built and the jar files exist -if not(exists(build_dir)): - print(build_dir_err_msg) - sys.exit(1) -if not(exists(lib_dir)): - print(lib_dir_err_msg) - sys.exit(1) -if not(exists(dml_script_class)): - print(dml_script_err_msg) - sys.exit(1) - -print('================================================================================') - - -# if the present working directory is the project root or bin folder, then use the temp folder as user.dir -if user_dir == project_root_dir or user_dir == join(project_root_dir, 'bin'): - user_dir = join(project_root_dir, 'temp') - print('Output dir: ' + user_dir) - -# if the SystemML-config.xml does not exist, create it from the template -systemml_config_path = join(project_root_dir, 'conf', 'SystemML-config.xml') -systemml_template_config_path = join(project_root_dir, 'conf', 'SystemML-config.xml.template') -if not(exists(systemml_config_path)): - shutil.copyfile(systemml_template_config_path, systemml_config_path) - print('... created ' + systemml_config_path) - -# if the log4j.properties do not exist, create them from the template -log4j_properties_path = join(project_root_dir, 'conf', 'log4j.properties') -log4j_template_properties_path = join(project_root_dir, 'conf', 'log4j.properties.template') -if not(exists(log4j_properties_path)): - shutil.copyfile(log4j_template_properties_path, log4j_properties_path) - print('... 
created ' + log4j_properties_path) + if platform.system() == 'Windows': + default_cp = ';'.join(default_classpath(systemml_home)) + else: + default_cp = ':'.join(default_classpath(systemml_home)) + java_memory = '-Xmx8g -Xms4g -Xmn1g' -script_file = sys.argv[1] + # Log4j + log4j = log4j_path(systemml_home) + log4j_properties_path = '-Dlog4j.configuration=file:{}'.format(log4j) -# if the script file path was omitted, try to complete the script path -if not(exists(script_file)): - script_file_name = os.path.abspath(script_file) - script_file_found = find_file(script_file, scripts_dir) - if script_file_found is None: - print('Could not find DML script: ' + script_file) - print_usage_and_exit() + # Config + if config is None: + default_config = config_path(systemml_home) else: - script_file = script_file_found - print('DML Script:' + script_file) - -# add libraries which were generated by the build to the classpath -systemml_jar = join(build_dir, 'classes') - - -# For the *nix and windows, os.pathsep works reliably -# however for cygwin, the pathsep is set for *nix, which is ':' -# but the underlying java, which is a windows program requires ';' -# also all arguments passed to the JVM need to be converted to windows style -# if in the cygwin environment -lib_dir_star = join(lib_dir, '*') -if sys.platform == 'cygwin': - classpath_sep = os.pathsep - classpath_sep = ';' - lib_dir = os.popen('cygpath -pw ' + lib_dir).read().strip() - lib_dir_star = '"' + lib_dir + "\*" + '"' - systemml_jar = '"' + os.popen('cygpath -pw ' + systemml_jar).read().strip() + '"' - hadoop_home = '"' + os.popen('cygpath -pw ' + hadoop_home).read().strip() + '"' - log4j_properties_path = '"' + os.popen('cygpath -pw ' + log4j_properties_path).read().strip() + '"' - user_dir = '"' + os.popen('cygpath -pw ' + user_dir).read().strip() + '"' - script_file = '"' + os.popen('cygpath -pw ' + script_file).read().strip() + '"' - systemml_config_path = '"' + os.popen('cygpath -pw ' + 
systemml_config_path).read().strip() + '"' - classpath = lib_dir_star + '\\' + classpath_sep + systemml_jar -else: - #classpath = '"' + lib_dir_star + '"' + os.pathsep + '"' + systemml_jar + '"' - classpath = lib_dir_star + os.pathsep + systemml_jar - - -# Set the HADOOP_HOME environment variable -if 'HADOOP_HOME' not in os.environ: - os.environ['HADOOP_HOME'] = hadoop_home - - -print('================================================================================') - -# Set default Java options -systemml_default_java_opts = \ - '-Xmx8g -Xms4g -Xmn1g ' + \ - '-cp ' + classpath + ' ' + \ - '-Dlog4j.configuration=file:' + log4j_properties_path + ' ' \ - '-Duser.dir=' + user_dir -# '-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=8111' - - -# Reads in key-value pairs from the conf/systemml-env.sh file -def parse_env_file(env_file_path): - env_vars = {} - with open(env_file_path) as f: - for l in f: - l = l.strip() - if l and not(l.startswith('#')) and '=' in l: - k, v = l.split('=', 1) - k = k.strip() - v = v.strip() - if len(v) > 0: - # strip quotes - if v[0] == v[len(v) - 1] and v[0] in ['"', "'"]: - v = v[1:-1] - - env_vars[k] = v - return env_vars - - -# Add any custom Java options set by the user at command line, overriding defaults as necessary. -if 'SYSTEMML_JAVA_OPTS' in os.environ: - systemml_java_opts = os.environ['SYSTEMML_JAVA_OPTS'] - systemml_default_java_opts = systemml_default_java_opts + ' ' + systemml_java_opts - # del os.environ['SYSTEMML_JAVA_OPTS'] - -# Add any custom Java options set by the user in the environment variables file, -# overriding defaults as necessary. 
-systemml_env_path = join(project_root_dir, 'conf', 'systemml-env.sh') -if exists(systemml_env_path): - env_vars = parse_env_file(systemml_env_path) - os.environ.update(env_vars) - - -# Invoke the jar with options and arguments -cmd = ['java', systemml_default_java_opts, 'org.apache.sysml.api.DMLScript', '-f', script_file, '-exec singlenode', '-config', systemml_config_path] + sys.argv[2:] -# For debugging -# print(' '.join(cmd)) - -return_code = os.system(' '.join(cmd)) - -if return_code != 0: - print('Failed to run SystemML. Exit code :' + str(return_code)) - print(' '.join(cmd)) + default_config = config + + ml_options = [] + if nvargs is not None: + ml_options.append('-nvargs') + ml_options.append(' '.join(nvargs)) + if args is not None: + ml_options.append('-args') + ml_options.append(' '.join(args)) + if explain is not None: + ml_options.append('-explain') + ml_options.append(explain) + if debug is not False: + ml_options.append('-debug') + if stats is not None: + ml_options.append('-stats') + ml_options.append(stats) + if gpu is not None: + ml_options.append('-gpu') + ml_options.append(gpu) + + cmd = ['java', java_memory, log4j_properties_path, + '-cp', default_cp, 'org.apache.sysml.api.DMLScript', + '-f', script_file, '-exec', 'singlenode', '-config', default_config, + ' '.join(ml_options)] + + return_code = os.system(' '.join(cmd)) + return return_code + + +if __name__ == '__main__': + + cparser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description='System-ML Standalone Script') + + # SYSTEM-ML Options + cparser.add_argument('-nvargs', help='List of attributeName-attributeValue pairs', nargs='+', metavar='') + cparser.add_argument('-args', help='List of positional argument values', metavar='', nargs='+') + cparser.add_argument('-config', help='System-ML configuration file (e.g SystemML-config.xml)', metavar='') + cparser.add_argument('-explain', help='explains plan levels can be hops, runtime, ' + 'recompile_hops, 
recompile_runtime', nargs='?', const='runtime', metavar='') + cparser.add_argument('-debug', help='runs in debug mode', action='store_true') + cparser.add_argument('-stats', help='Monitor and report caching/recompilation statistics, ' + 'heavy hitter <count> is 10 unless overridden', nargs='?', const='10', + metavar='') + cparser.add_argument('-gpu', help='uses CUDA instructions when reasonable, ' + 'set <force> option to skip conservative memory estimates ' + 'and use GPU wherever possible', nargs='?') + cparser.add_argument('-f', required=True, help='specifies dml/pydml file to execute; ' + 'path can be local/hdfs/gpfs', metavar='') + + args = cparser.parse_args() + arg_dict = vars(args) + return_code = standalone_execution_entry(**arg_dict) + + if return_code != 0: + print('Failed to run SystemML. Exit code :' + str(return_code)) http://git-wip-us.apache.org/repos/asf/systemml/blob/e94374af/bin/utils.py ---------------------------------------------------------------------- diff --git a/bin/utils.py b/bin/utils.py new file mode 100644 index 0000000..6f40881 --- /dev/null +++ b/bin/utils.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# +# ------------------------------------------------------------- + +import sys +import os +from os.path import join, exists +from os import environ +import shutil + + +def get_env_systemml_home(): + """ + Env variable error check and path location + + return: String + Location of SYSTEMML_HOME + """ + systemml_home = os.environ.get('SYSTEMML_HOME') + if systemml_home is None: + print('SYSTEMML_HOME not found') + sys.exit() + + return systemml_home + + +def get_env_spark_home(): + """ + Env variable error check and path location + + return: String + Location of SPARK_HOME + """ + spark_home = environ.get('SPARK_HOME') + if spark_home is None: + print('SPARK_HOME not found') + sys.exit() + + return spark_home + + +def find_file(name, path): + """ + Responsible for finding a specific file recursively given a location + """ + for root, dirs, files in os.walk(path): + if name in files: + return join(root, name) + + +def find_dml_file(systemml_home, script_file): + """ + Find the location of DML script being executed + + return: String + Location of the dml script + """ + scripts_dir = join(systemml_home, 'scripts') + if not (exists(script_file)): + script_file = find_file(script_file, scripts_dir) + if script_file is None: + print('Could not find DML script: ' + script_file) + sys.exit() + + return script_file + + +def log4j_path(systemml_home): + """ + Create log4j.properties from the template if not exist + + return: String + Location of log4j.properties path + """ + log4j_properties_path = join(systemml_home, 'conf', 'log4j.properties') + log4j_template_properties_path = join(systemml_home, 'conf', 'log4j.properties.template') + if not (exists(log4j_properties_path)): + shutil.copyfile(log4j_template_properties_path, log4j_properties_path) + print('... 
created ' + log4j_properties_path) + return log4j_properties_path + + +def config_path(systemml_home): + """ + Create SystemML-config from the template if not exist + + return: String + Location of SystemML-config.xml + """ + systemml_config_path = join(systemml_home, 'conf', 'SystemML-config.xml') + systemml_template_config_path = join(systemml_home, 'conf', 'SystemML-config.xml.template') + if not (exists(systemml_config_path)): + shutil.copyfile(systemml_template_config_path, systemml_config_path) + print('... created ' + systemml_config_path) + return systemml_config_path http://git-wip-us.apache.org/repos/asf/systemml/blob/e94374af/docs/python-performance-test.md ---------------------------------------------------------------------- diff --git a/docs/python-performance-test.md b/docs/python-performance-test.md index c265bc6..02d3e34 100644 --- a/docs/python-performance-test.md +++ b/docs/python-performance-test.md @@ -11,10 +11,13 @@ Our performance tests suit contains `7` families namely `binomial`, `multinomial On a very high level use construct a string with arguments required to run each operation. Once this string is constructed we use the subprocess module to execute this string and extract time from the standard out. -We also use `json` module write our configurations to a json file. This ensure that our current operation is easy to debug. +We also use `json` module write our configurations to a json file. This ensure that our operations are easy to debug. +We have `7` files in performance test suit: -We have `5` files in performance test suit `run_perftest.py`, `datagen.py`, `train.py`, `predict.py` and `utils.py`. +- Entry File `run_perftest.py` +- Supporting Files `datagen.py`, `train.py`, `predict.py` +- Utility Files `utils_exec.py`, `utils_fs.py`, `utils_misc.py` `datagen.py`, `train.py` and `predict.py` generate a dictionary. Our key is the name of algorithm being processed and values is a list with path(s) where all the data required is present. 
We define this dictionary as a configuration packet. @@ -28,7 +31,7 @@ In `train.py` script we have functions required to generate training output. We The file `predict.py` contains all functions for all algorithms in the performance test that contain predict script. We return the required configuration packet as a result of this script, that contains key as the algorithm to run and values with location to read predict json files from. -In the file `utils.py` we have all the helper functions required in our performance test. These functions do operations like write `json` files, extract time from std out etc. +In the file(s) `utils_*.py` we have all the helper functions required in our performance test. These functions do operations like write `json` files, extract time from std out etc. ### Adding New Algorithms While adding a new algorithm we need know if it has to be part of the any pre existing family. If this algorithm depends on a new data generation script we would need to create a new family. Steps below to take below to add a new algorithm. @@ -75,7 +78,7 @@ Default setting for our performance test below: - Matrix size to 10,000 rows and 100 columns. - Execution mode `singlenode`. - Operation modes `data-gen`, `train` and `predict` in sequence. -- Matrix type set to `all`. Which will generate `dense` or / and `sparse` matrices for all relevant algorithms. +- Matrix type set to `all`. Which will generate `dense`, `sparse` matrices for all relevant algorithms. ### Examples Some examples of SystemML performance test with arguments shown below: @@ -104,6 +107,9 @@ Run performance test for the algorithms `m-svm` with `multinomial` family. Run o ` Run performance test for all algorithms under the family `regression2` and log with filename `new_log`. 
+`./scripts/perftest/python/run_perftest.py --family binomial clustering multinomial regression1 regression2 stats1 stats2 --config-dir /Users/krishna/open-source/systemml/scripts/perftest/temp3 --temp-dir hdfs://localhost:9000/temp3` +Run performance test for all algorithms using HDFS. + ### Operational Notes All performance test depend mainly on two scripts for execution `systemml-standalone.py` and `systemml-spark-submit.py`. Incase we need to change standalone or spark parameters we need to manually change these parameters in their respective scripts. @@ -117,13 +123,26 @@ multinomial|data-gen|0|dense|10k_100| 0.33 MultiLogReg|train|0|10k_100|dense|6.956 MultiLogReg|predict|0|10k_100|dense|4.780 -These logs can be found in `temp` folder (`$SYSTEMML_HOME/scripts/perftest/temp`) in-case not overridden by `--temp-dir`. This `temp` folders also contain the data generated during our performance test. +These logs and config `json` files can be found in `temp` folder (`$SYSTEMML_HOME/scripts/perftest/temp`) in-case not overridden by `--config-dir`. + +`--temp-dir` by default points to local file system. We can change this to point to a hdfs path by `--temp-dir hdfs://localhost:9000/temp` where all files generated during execution will be saved. + +Every time a script executes in `data-gen` mode successfully, we write a `_SUCCESS` file. If this file exists we ensures that re-runs of the same script is not possible. Support for configuration options like `-stats`, `-explain`, `--conf` have also been added. + +Results obtained by our performance tests can be automatically uploaded to google docs. + +`./update.py --file ../temp/singlenode.out --exec-mode singlenode --auth client_json.json --tag 1.0` + +In the example above `--tag` can be a major/minor systemml version and `--auth` points to the `json` key required by `google docs`. + +Currently we only support time difference between algorithms in different versions. 
This can be obtained by running the script below +`./stats.py --auth client_json.json --exec-mode singlenode --tags 1.0 2.0` -Every time a script executes in `data-gen` mode successfully, we write a `_SUCCESS` file. If this file exists we ensures that re-run of the same script is not possible as data already exists. +Note: Please `pip install gspread` (https://github.com/burnash/gspread) to use the google docs client. ### Troubleshooting We can debug the performance test by making changes in the following locations based on -- Please see `utils.py` function `exec_dml_and_parse_time`. In uncommenting the debug print statement in the function `exec_dml_and_parse_time`. This allows us to inspect the subprocess string being executed. +- Please see the function `subprocess_exec` in `utils_exec.py`. - Please see `run_perftest.py`. Changing the verbosity level to `0` allows us to log more information while the script runs. -- Eyeballing the json files generated and making sure the arguments are correct. +- Eyeballing the json files generated and making sure the configuration arguments are correct. 
http://git-wip-us.apache.org/repos/asf/systemml/blob/e94374af/scripts/perftest/python/datagen.py ---------------------------------------------------------------------- diff --git a/scripts/perftest/python/datagen.py b/scripts/perftest/python/datagen.py index 88a71f0..72a0627 100755 --- a/scripts/perftest/python/datagen.py +++ b/scripts/perftest/python/datagen.py @@ -22,7 +22,7 @@ import itertools from os.path import join -from utils import split_rowcol, config_writer, mat_type_check +from utils_misc import split_rowcol, config_writer, mat_type_check # This file contains configuration settings for data generation DATA_FORMAT = 'csv' @@ -33,42 +33,44 @@ MATRIX_TYPE_DICT = {'dense': '0.9', FAMILY_NO_MATRIX_TYPE = ['clustering', 'stats1', 'stats2'] -def multinomial_datagen(matrix_dim, matrix_type, datagen_dir): +def multinomial_datagen(matrix_dim, matrix_type, datagen_dir, config_dir): + path_name = '.'.join(['multinomial', matrix_type, str(matrix_dim)]) + datagen_write = join(datagen_dir, path_name) + save_path = join(config_dir, path_name) row, col = split_rowcol(matrix_dim) - path_name = '.'.join(['multinomial', matrix_type, str(matrix_dim)]) - full_path = join(datagen_dir, path_name) numSamples = row numFeatures = col sparsity = MATRIX_TYPE_DICT[matrix_type] num_categories = '150' intercept = '0' - X = join(full_path, 'X.data') - Y = join(full_path, 'Y.data') + X = join(datagen_write, 'X.data') + Y = join(datagen_write, 'Y.data') fmt = DATA_FORMAT config = [numSamples, numFeatures, sparsity, num_categories, intercept, X, Y, fmt, '1'] - config_writer(full_path + '.json', config) + config_writer(save_path + '.json', config) - return full_path + return save_path -def binomial_datagen(matrix_dim, matrix_type, datagen_dir): +def binomial_datagen(matrix_dim, matrix_type, datagen_dir, config_dir): + path_name = '.'.join(['binomial', matrix_type, str(matrix_dim)]) + datagen_write = join(datagen_dir, path_name) + save_path = join(config_dir, path_name) row, col = 
split_rowcol(matrix_dim) - path_name = '.'.join(['binomial', matrix_type, str(matrix_dim)]) - full_path = join(datagen_dir, path_name) numSamples = row numFeatures = col maxFeatureValue = '5' maxWeight = '5' - loc_weights = join(full_path, 'weight.data') - loc_data = join(full_path, 'X.data') - loc_labels = join(full_path, 'Y.data') + loc_weights = join(datagen_write, 'weight.data') + loc_data = join(datagen_write, 'X.data') + loc_labels = join(datagen_write, 'Y.data') noise = '1' intercept = '0' sparsity = MATRIX_TYPE_DICT[matrix_type] @@ -77,24 +79,25 @@ def binomial_datagen(matrix_dim, matrix_type, datagen_dir): config = [numSamples, numFeatures, maxFeatureValue, maxWeight, loc_weights, loc_data, loc_labels, noise, intercept, sparsity, fmt, tranform_labels] - config_writer(full_path + '.json', config) + config_writer(save_path + '.json', config) - return full_path + return save_path -def regression1_datagen(matrix_dim, matrix_type, datagen_dir): +def regression1_datagen(matrix_dim, matrix_type, datagen_dir, config_dir): + path_name = '.'.join(['regression1', matrix_type, str(matrix_dim)]) + datagen_write = join(datagen_dir, path_name) + save_path = join(config_dir, path_name) row, col = split_rowcol(matrix_dim) - path_name = '.'.join(['regression1', matrix_type, str(matrix_dim)]) - full_path = join(datagen_dir, path_name) numSamples = row numFeatures = col maxFeatureValue = '5' maxWeight = '5' - loc_weights = join(full_path, 'weight.data') - loc_data = join(full_path, 'X.data') - loc_labels = join(full_path, 'Y.data') + loc_weights = join(datagen_write, 'weight.data') + loc_data = join(datagen_write, 'X.data') + loc_labels = join(datagen_write, 'Y.data') noise = '1' intercept = '0' sparsity = MATRIX_TYPE_DICT[matrix_type] @@ -103,24 +106,25 @@ def regression1_datagen(matrix_dim, matrix_type, datagen_dir): config = [numSamples, numFeatures, maxFeatureValue, maxWeight, loc_weights, loc_data, loc_labels, noise, intercept, sparsity, fmt, tranform_labels] - 
config_writer(full_path + '.json', config) + config_writer(save_path + '.json', config) - return full_path + return save_path -def regression2_datagen(matrix_dim, matrix_type, datagen_dir): +def regression2_datagen(matrix_dim, matrix_type, datagen_dir, config_dir): + path_name = '.'.join(['regression2', matrix_type, str(matrix_dim)]) + datagen_write = join(datagen_dir, path_name) + save_path = join(config_dir, path_name) row, col = split_rowcol(matrix_dim) - path_name = '.'.join(['regression2', matrix_type, str(matrix_dim)]) - full_path = join(datagen_dir, path_name) numSamples = row numFeatures = col maxFeatureValue = '5' maxWeight = '5' - loc_weights = join(full_path, 'weight.data') - loc_data = join(full_path, 'X.data') - loc_labels = join(full_path, 'Y.data') + loc_weights = join(datagen_write, 'weight.data') + loc_data = join(datagen_write, 'X.data') + loc_labels = join(datagen_write, 'Y.data') noise = '1' intercept = '0' sparsity = MATRIX_TYPE_DICT[matrix_type] @@ -129,21 +133,22 @@ def regression2_datagen(matrix_dim, matrix_type, datagen_dir): config = [numSamples, numFeatures, maxFeatureValue, maxWeight, loc_weights, loc_data, loc_labels, noise, intercept, sparsity, fmt, tranform_labels] - config_writer(full_path + '.json', config) + config_writer(save_path + '.json', config) - return full_path + return save_path -def clustering_datagen(matrix_dim, matrix_type, datagen_dir): +def clustering_datagen(matrix_dim, matrix_type, datagen_dir, config_dir): - row, col = split_rowcol(matrix_dim) path_name = '.'.join(['clustering', matrix_type, str(matrix_dim)]) + datagen_write = join(datagen_dir, path_name) + save_path = join(config_dir, path_name) + row, col = split_rowcol(matrix_dim) - full_path = join(datagen_dir, path_name) - X = join(full_path, 'X.data') - Y = join(full_path, 'Y.data') - YbyC = join(full_path, 'YbyC.data') - C = join(full_path, 'C.data') + X = join(datagen_write, 'X.data') + Y = join(datagen_write, 'Y.data') + YbyC = join(datagen_write, 
'YbyC.data') + C = join(datagen_write, 'C.data') nc = '50' dc = '10.0' dr = '1.0' @@ -153,22 +158,24 @@ def clustering_datagen(matrix_dim, matrix_type, datagen_dir): config = dict(nr=row, nf=col, nc=nc, dc=dc, dr=dr, fbf=fbf, cbf=cbf, X=X, C=C, Y=Y, YbyC=YbyC, fmt=DATA_FORMAT) - config_writer(full_path + '.json', config) - return full_path + config_writer(save_path + '.json', config) + return save_path -def stats1_datagen(matrix_dim, matrix_type, datagen_dir): +def stats1_datagen(matrix_dim, matrix_type, datagen_dir, config_dir): - row, col = split_rowcol(matrix_dim) path_name = '.'.join(['stats1', matrix_type, str(matrix_dim)]) - full_path = join(datagen_dir, path_name) - - DATA = join(full_path, 'X.data') - TYPES = join(full_path, 'types') - TYPES1 = join(full_path, 'set1.types') - TYPES2 = join(full_path, 'set2.types') - INDEX1 = join(full_path, 'set1.indices') - INDEX2 = join(full_path, 'set2.indices') + datagen_write = join(datagen_dir, path_name) + save_path = join(config_dir, path_name) + + row, col = split_rowcol(matrix_dim) + + DATA = join(datagen_write, 'X.data') + TYPES = join(datagen_write, 'types') + TYPES1 = join(datagen_write, 'set1.types') + TYPES2 = join(datagen_write, 'set2.types') + INDEX1 = join(datagen_write, 'set1.indices') + INDEX2 = join(datagen_write, 'set2.indices') MAXDOMAIN = '1100' SETSIZE = '20' LABELSETSIZE = '10' @@ -184,30 +191,31 @@ def stats1_datagen(matrix_dim, matrix_type, datagen_dir): LABELSETSIZE=LABELSETSIZE, TYPES1=TYPES1, TYPES2=TYPES2, INDEX1=INDEX1, INDEX2=INDEX2, fmt=DATA_FORMAT) - config_writer(full_path + '.json', config) + config_writer(save_path + '.json', config) - return full_path + return save_path -def stats2_datagen(matrix_dim, matrix_type, datagen_dir): +def stats2_datagen(matrix_dim, matrix_type, datagen_dir, config_dir): - row, col = split_rowcol(matrix_dim) path_name = '.'.join(['stats2', matrix_type, str(matrix_dim)]) - full_path = join(datagen_dir, path_name) + datagen_write = join(datagen_dir, path_name) 
+ save_path = join(config_dir, path_name) + row, col = split_rowcol(matrix_dim) - D = join(full_path, 'X.data') - Xcid = join(full_path, 'Xcid.data') - Ycid = join(full_path, 'Ycid.data') - A = join(full_path, 'A.data') + D = join(datagen_write, 'X.data') + Xcid = join(datagen_write, 'Xcid.data') + Ycid = join(datagen_write, 'Ycid.data') + A = join(datagen_write, 'A.data') config = dict(nr=row, nf=col, D=D, Xcid=Xcid, Ycid=Ycid, A=A, fmt=DATA_FORMAT) - config_writer(full_path + '.json', config) - return full_path + config_writer(save_path + '.json', config) + return save_path -def config_packets_datagen(algo_payload, matrix_type, matrix_shape, datagen_dir, dense_algos): +def config_packets_datagen(algo_payload, matrix_type, matrix_shape, datagen_dir, dense_algos, config_dir): """ This function has two responsibilities. Generate the configuration files for datagen algorithms and return a dictionary that will be used for execution. @@ -228,13 +236,14 @@ def config_packets_datagen(algo_payload, matrix_type, matrix_shape, datagen_dir, dense_algos: List Algorithms that support only dense matrix type + config_dir: String + Location to store + return: Dictionary {string: list} This dictionary contains algorithms to be executed as keys and the path of configuration json files to be executed list of values. 
""" - config_bundle = {} - distinct_families = set(map(lambda x: x[1], algo_payload)) # Cross Product of all configurations @@ -249,7 +258,7 @@ def config_packets_datagen(algo_payload, matrix_type, matrix_shape, datagen_dir, config_packets[current_family] = [] for size, type in configs: family_func = current_family.lower() + '_datagen' - conf_path = globals()[family_func](size, type, datagen_dir) + conf_path = globals()[family_func](size, type, datagen_dir, config_dir) config_packets[current_family].append(conf_path) return config_packets http://git-wip-us.apache.org/repos/asf/systemml/blob/e94374af/scripts/perftest/python/google_docs/stats.py ---------------------------------------------------------------------- diff --git a/scripts/perftest/python/google_docs/stats.py b/scripts/perftest/python/google_docs/stats.py new file mode 100755 index 0000000..3b89abe --- /dev/null +++ b/scripts/perftest/python/google_docs/stats.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +# ------------------------------------------------------------- + +import argparse +from functools import reduce +import pprint +from oauth2client.service_account import ServiceAccountCredentials +import gspread + +# Get time difference between difference runs + + +def auth(path, sheet_name): + """ + Responsible for authorization + """ + scope = ['https://spreadsheets.google.com/feeds'] + creds = ServiceAccountCredentials.from_json_keyfile_name(path, scope) + gc = gspread.authorize(creds) + sheet = gc.open("Perf").worksheet(sheet_name) + return sheet + + +def get_data(sheet, tag): + """ + Get time and algorithm from the sheet + """ + time = sheet.find('time_{}'.format(tag)) + algo = sheet.find('algo_{}'.format(tag)) + + time_col = sheet.col_values(time.col) + time_col = list(filter(lambda x: len(x) > 0, time_col)) + + algo_col = sheet.col_values(algo.col) + algo_col = list(filter(lambda x: len(x) > 0, algo_col)) + return algo_col, time_col + + +def get_data_dict(data_col): + """ + Return data as dictionary with key as algorithm and list time values + """ + data_dict = {} + all_algo = [] + for algo, _ in data_col: + all_algo.append(algo) + + flatten_algo = reduce(lambda x, y: x+y, all_algo) + + # remove the header + filter_data = list(filter(lambda x: not x.startswith('algo_'), flatten_algo)) + distict_algos = set(filter_data) + + for algo_dist in distict_algos: + for algo, time in data_col: + for k, v in zip(algo, time): + if algo_dist == k: + if algo_dist not in data_dict: + data_dict[k] = [v] + else: + data_dict[k].append(v) + return data_dict + +# Example Usage +# ./stats.py --auth client_json.json --exec-mode singlenode --tags 1.0 2.0 +if __name__ == '__main__': + execution_mode = ['hybrid_spark', 'singlenode'] + + cparser = argparse.ArgumentParser(description='System-ML Statistics Script') + cparser.add_argument('--auth', help='Location to read auth file', + required=True, metavar='') + cparser.add_argument('--exec-mode', help='Execution mode', 
choices=execution_mode, + required=True, metavar='') + cparser.add_argument('--tags', help='Tagging header value', + required=True, nargs='+') + + args = cparser.parse_args() + arg_dict = vars(args) + sheet = auth(args.auth, args.exec_mode) + all_data = sheet.get_all_records() + + data_col = [] + for tag in args.tags: + algo_col, time_col = get_data(sheet, tag) + data_col.append((algo_col, time_col)) + + data_dict = get_data_dict(data_col) + + delta_algo = {} + for k, v in data_dict.items(): + delta = float(v[0]) - float(v[1]) + delta_algo[k] = delta + + pprint.pprint(delta_algo, width=1) http://git-wip-us.apache.org/repos/asf/systemml/blob/e94374af/scripts/perftest/python/google_docs/update.py ---------------------------------------------------------------------- diff --git a/scripts/perftest/python/google_docs/update.py b/scripts/perftest/python/google_docs/update.py new file mode 100755 index 0000000..c2fed38 --- /dev/null +++ b/scripts/perftest/python/google_docs/update.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +# ------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +# ------------------------------------------------------------- + +import argparse +import gspread +from oauth2client.service_account import ServiceAccountCredentials +import pandas as pd + +# Update data to google sheets + + +def parse_data(file_path): + """ + Skip reading 1st row : Header + Skip reading last row : Footer + """ + csv_file = pd.read_csv(file_path, sep=',', skiprows=1, skipfooter=1, engine='python') + algo = csv_file['INFO:root:algorithm'].apply(lambda x: x.split(':')[-1]) + key = algo + '_'+ csv_file['run_type'] + '_' + csv_file['intercept'] + '_' + \ + csv_file['matrix_type'] + '_' + csv_file['data_shape'] + return key, csv_file['time_sec'] + + +def auth(path, sheet_name): + """ + Responsible for authorization + """ + scope = ['https://spreadsheets.google.com/feeds'] + creds = ServiceAccountCredentials.from_json_keyfile_name(path, scope) + gc = gspread.authorize(creds) + sheet = gc.open("Perf").worksheet(sheet_name) + return sheet + + +def insert_pair(algo, time, start_col, tag): + """ + Wrapper function that calls insert_values to insert algo and time + """ + insert_values(sheet, algo, start_col, 'algo_{}'.format(tag)) + insert_values(sheet, time, start_col + 1, 'time_{}'.format(tag)) + print('Writing Complete') + + +def insert_values(sheet, key, col_num, header): + """ + Insert data to google sheets based on the arguments + """ + # Col Name + sheet.update_cell(1, col_num, header) + for id, val in enumerate(key): + sheet.update_cell(id + 2, col_num, val) + + +def get_dim(sheet): + """ + Get the dimensions of data + """ + try: + col_count = sheet.get_all_records() + except: + col_count = [[]] + row = len(col_count) + col = len(col_count[0]) + return row, col + + +# Example Usage +# ./update.py --file ../temp/test.out --exec-mode singlenode --auth client_json.json --tag 3.0 +if __name__ == '__main__': + execution_mode = ['hybrid_spark', 'singlenode'] + + cparser = argparse.ArgumentParser(description='System-ML Update / Stat Script') + 
cparser.add_argument('--file', help='Location of the current perf test outputs', + required=True, metavar='') + cparser.add_argument('--exec-mode', help='Backend Type', choices=execution_mode, + required=True, metavar='') + cparser.add_argument('--auth', help='Location to read auth file', + required=True, metavar='') + cparser.add_argument('--tag', help='Tagging header value', + required=True, metavar='') + + args = cparser.parse_args() + arg_dict = vars(args) + + # Authenticate and get sheet dimensions + sheet = auth(args.auth, args.exec_mode) + row, col = get_dim(sheet) + + # Read data from file and write to google docs + algo, time = parse_data(args.file) + insert_pair(algo, time, col + 1, args.tag) http://git-wip-us.apache.org/repos/asf/systemml/blob/e94374af/scripts/perftest/python/predict.py ---------------------------------------------------------------------- diff --git a/scripts/perftest/python/predict.py b/scripts/perftest/python/predict.py index 92d3af4..d2e44e6 100755 --- a/scripts/perftest/python/predict.py +++ b/scripts/perftest/python/predict.py @@ -22,47 +22,42 @@ import sys from os.path import join -from utils import config_writer, relevant_folders, mat_type_check +from utils_misc import config_writer, mat_type_check +from utils_fs import relevant_folders # Contains configuration setting for predicting DATA_FORMAT = 'csv' -def m_svm_predict(save_file_name, datagen_dir, train_dir, predict_dir): +def m_svm_predict(save_folder_name, datagen_dir, train_dir, predict_dir, config_dir): + save_path = join(config_dir, save_folder_name) X = join(datagen_dir, 'X_test.data') Y = join(datagen_dir, 'Y_test.data') - - icpt = save_file_name.split('.')[-1] + icpt = save_folder_name.split('.')[-1] model = join(train_dir, 'model.data') - fmt = DATA_FORMAT - - config = dict(X=X, Y=Y, icpt=icpt, model=model, fmt=fmt) - - full_path_predict = join(predict_dir, save_file_name) - config_writer(full_path_predict + '.json', config) + config = dict(X=X, Y=Y, icpt=icpt, 
model=model, fmt=DATA_FORMAT) + config_writer(save_path + '.json', config) - return full_path_predict + return save_path -def l2_svm_predict(save_file_name, datagen_dir, train_dir, predict_dir): +def l2_svm_predict(save_folder_name, datagen_dir, train_dir, predict_dir, config_dir): + save_path = join(config_dir, save_folder_name) X = join(datagen_dir, 'X_test.data') Y = join(datagen_dir, 'Y_test.data') - - icpt = save_file_name.split('.')[-1] + icpt = save_folder_name.split('.')[-1] model = join(train_dir, 'model.data') - fmt = DATA_FORMAT - - config = dict(X=X, Y=Y, icpt=icpt, model=model, fmt=fmt) + config = dict(X=X, Y=Y, icpt=icpt, model=model, fmt=DATA_FORMAT) + config_writer(save_path + '.json', config) - full_path_predict = join(predict_dir, save_file_name) - config_writer(full_path_predict + '.json', config) + return save_path - return full_path_predict +def multilogreg_predict(save_folder_name, datagen_dir, train_dir, predict_dir, config_dir): + save_path = join(config_dir, save_folder_name) -def multilogreg_predict(save_file_name, datagen_dir, train_dir, predict_dir): X = join(datagen_dir, 'X_test.data') Y = join(datagen_dir, 'Y_test.data') B = join(train_dir, 'B.data') @@ -70,156 +65,147 @@ def multilogreg_predict(save_file_name, datagen_dir, train_dir, predict_dir): dfam = '3' vpow = '-1' link = '2' - fmt = DATA_FORMAT - config = dict(dfam=dfam, vpow=vpow, link=link, fmt=fmt, X=X, B=B, Y=Y, M=M) + config = dict(dfam=dfam, vpow=vpow, link=link, fmt=DATA_FORMAT, X=X, B=B, Y=Y, M=M) - full_path_predict = join(predict_dir, save_file_name) - config_writer(full_path_predict + '.json', config) + config_writer(save_path + '.json', config) - return full_path_predict + return save_path -def naive_bayes_predict(save_file_name, datagen_dir, train_dir, predict_dir): +def naive_bayes_predict(save_folder_name, datagen_dir, train_dir, predict_dir, config_dir): + save_path = join(config_dir, save_folder_name) X = join(datagen_dir, 'X_test.data') Y = join(datagen_dir, 
'Y_test.data') - prior = join(train_dir, 'prior') conditionals = join(train_dir, 'conditionals') - fmt = DATA_FORMAT probabilities = join(train_dir, 'probabilities') - config = dict(X=X, Y=Y, prior=prior, conditionals=conditionals, fmt=fmt, probabilities=probabilities) + config = dict(X=X, Y=Y, prior=prior, conditionals=conditionals, fmt=DATA_FORMAT, + probabilities=probabilities) + config_writer(save_path + '.json', config) - full_path_predict = join(predict_dir, save_file_name) - config_writer(full_path_predict + '.json', config) + return save_path - return full_path_predict +def kmeans_predict(save_folder_name, datagen_dir, train_dir, predict_dir, config_dir): -def kmeans_predict(save_file_name, datagen_dir, train_dir, predict_dir): + save_path = join(config_dir, save_folder_name) + predict_write = join(predict_dir, save_folder_name) X = join(datagen_dir, 'X_test.data') C = join(datagen_dir, 'C.data') - full_path_predict = join(predict_dir, save_file_name) - prY = join(full_path_predict, 'prY.data') + prY = join(predict_write, 'prY.data') config = dict(X=X, C=C, prY=prY) - config_writer(full_path_predict + '.json', config) + config_writer(save_path + '.json', config) - return full_path_predict + return save_path -def linearregcg_predict(save_file_name, datagen_dir, train_dir, predict_dir): +def linearregcg_predict(save_folder_name, datagen_dir, train_dir, predict_dir, config_dir): + + save_path = join(config_dir, save_folder_name) + predict_write = join(predict_dir, save_folder_name) dfam = '1' link = '1' vpow = '0.0' lpow = '1.0' - X = join(datagen_dir, 'X_test.data') B = join(train_dir, 'B.data') Y = join(datagen_dir, 'Y_test.data') - - full_path_predict = join(predict_dir, save_file_name) - M = join(full_path_predict, 'M.data') - O = join(full_path_predict, 'O.data') - + M = join(predict_write, 'M.data') + O = join(predict_write, 'O.data') config = dict(dfam=dfam, link=link, vpow=vpow, lpow=lpow, fmt=DATA_FORMAT, X=X, B=B, Y=Y, M=M, O=O) - 
config_writer(full_path_predict + '.json', config) + config_writer(save_path + '.json', config) - return full_path_predict + return save_path -def linearregds_predict(save_file_name, datagen_dir, train_dir, predict_dir): +def linearregds_predict(save_folder_name, datagen_dir, train_dir, predict_dir, config_dir): + save_path = join(config_dir, save_folder_name) + predict_write = join(predict_dir, save_folder_name) dfam = '1' link = '1' vpow = '0.0' lpow = '1.0' - X = join(datagen_dir, 'X_test.data') B = join(train_dir, 'B.data') Y = join(datagen_dir, 'Y_test.data') - - full_path_predict = join(predict_dir, save_file_name) - M = join(full_path_predict, 'M.data') - O = join(full_path_predict, 'O.data') - + M = join(predict_write, 'M.data') + O = join(predict_write, 'O.data') config = dict(dfam=dfam, link=link, vpow=vpow, lpow=lpow, fmt=DATA_FORMAT, X=X, B=B, Y=Y, M=M, O=O) - config_writer(full_path_predict + '.json', config) + config_writer(save_path + '.json', config) - return full_path_predict + return save_path -def glm_poisson_predict(save_file_name, datagen_dir, train_dir, predict_dir): +def glm_poisson_predict(save_folder_name, datagen_dir, train_dir, predict_dir, config_dir): + save_path = join(config_dir, save_folder_name) + predict_write = join(predict_dir, save_folder_name) dfam = '1' link = '1' vpow = '1' lpow = '1.0' - X = join(datagen_dir, 'X_test.data') B = join(train_dir, 'B.data') Y = join(datagen_dir, 'Y_test.data') - - full_path_predict = join(predict_dir, save_file_name) - M = join(full_path_predict, 'M.data') - O = join(full_path_predict, 'O.data') + M = join(predict_write, 'M.data') + O = join(predict_write, 'O.data') config = dict(dfam=dfam, link=link, vpow=vpow, lpow=lpow, fmt=DATA_FORMAT, X=X, B=B, Y=Y, M=M, O=O) - config_writer(full_path_predict + '.json', config) + config_writer(save_path + '.json', config) - return full_path_predict + return save_path -def glm_binomial_predict(save_file_name, datagen_dir, train_dir, predict_dir): +def 
glm_binomial_predict(save_folder_name, datagen_dir, train_dir, predict_dir, config_dir): + save_path = join(config_dir, save_folder_name) + predict_write = join(predict_dir, save_folder_name) dfam = '2' link = '3' - X = join(datagen_dir, 'X_test.data') B = join(train_dir, 'B.data') Y = join(datagen_dir, 'Y_test.data') - - full_path_predict = join(predict_dir, save_file_name) - M = join(full_path_predict, 'M.data') - O = join(full_path_predict, 'O.data') + M = join(predict_write, 'M.data') + O = join(predict_write, 'O.data') config = dict(dfam=dfam, link=link, fmt=DATA_FORMAT, X=X, B=B, Y=Y, M=M, O=O) - config_writer(full_path_predict + '.json', config) + config_writer(save_path + '.json', config) - return full_path_predict + return save_path -def glm_gamma_predict(save_file_name, datagen_dir, train_dir, predict_dir): +def glm_gamma_predict(save_folder_name, datagen_dir, train_dir, predict_dir, config_dir): + save_path = join(config_dir, save_folder_name) + predict_write = join(predict_dir, save_folder_name) dfam = '1' link = '1' vpow = '2' lpow = '0' - X = join(datagen_dir, 'X_test.data') B = join(train_dir, 'B.data') Y = join(datagen_dir, 'Y_test.data') - - full_path_predict = join(predict_dir, save_file_name) - M = join(full_path_predict, 'M.data') - O = join(full_path_predict, 'O.data') - + M = join(predict_write, 'M.data') + O = join(predict_write, 'O.data') config = dict(dfam=dfam, link=link, vpow=vpow, lpow=lpow, fmt=DATA_FORMAT, X=X, B=B, Y=Y, M=M, O=O) - config_writer(full_path_predict + '.json', config) + config_writer(save_path + '.json', config) - return full_path_predict + return save_path -def config_packets_predict(algo_payload, matrix_type, matrix_shape, datagen_dir, train_dir, predict_dir, dense_algos): +def config_packets_predict(algo_payload, matrix_type, matrix_shape, datagen_dir, train_dir, predict_dir, dense_algos, config_dir): """ This function has two responsibilities. 
Generate the configuration files for prediction algorithms and return a dictionary that will be used for execution. @@ -246,6 +232,9 @@ def config_packets_predict(algo_payload, matrix_type, matrix_shape, datagen_dir, dense_algos: List Algorithms that support only dense matrix type + config_dir: String + Location to store to configuration json file + return: Dictionary {string: list} This dictionary contains algorithms to be executed as keys and the path of configuration json files to be executed list of values. @@ -267,6 +256,7 @@ def config_packets_predict(algo_payload, matrix_type, matrix_shape, datagen_dir, for current_train_folder in train_folders: current_data_gen_dir = relevant_folders(datagen_dir, current_algo, current_family, current_matrix_type, matrix_shape, 'data-gen') + if len(current_data_gen_dir) == 0: print('data-gen folders not present for {}'.format(current_family)) sys.exit() @@ -276,7 +266,7 @@ def config_packets_predict(algo_payload, matrix_type, matrix_shape, datagen_dir, # current_data_gen_dir has index 0 as we would expect one datagen for each algorithm conf_path = globals()[algo_func](save_name, current_data_gen_dir[0], - current_train_folder, predict_dir) + current_train_folder, predict_dir, config_dir) config_bundle[current_algo].append(conf_path) http://git-wip-us.apache.org/repos/asf/systemml/blob/e94374af/scripts/perftest/python/run_perftest.py ---------------------------------------------------------------------- diff --git a/scripts/perftest/python/run_perftest.py b/scripts/perftest/python/run_perftest.py index 3360285..a15d7e6 100755 --- a/scripts/perftest/python/run_perftest.py +++ b/scripts/perftest/python/run_perftest.py @@ -31,9 +31,10 @@ from datetime import datetime from datagen import config_packets_datagen from train import config_packets_train from predict import config_packets_predict -from utils import get_families, config_reader, create_dir, get_existence, \ - exec_dml_and_parse_time, exec_test_data, check_predict, 
get_folder_metrics - +from utils_misc import get_families, config_reader, \ + exec_dml_and_parse_time, exec_test_data, check_predict, get_folder_metrics, args_dict_split, \ + get_config_args +from utils_fs import create_dir_local, write_success, check_SUCCESS_file_exists # A packet is a dictionary # with key as the algorithm @@ -83,32 +84,35 @@ ML_PREDICT = {'Kmeans': 'Kmeans-predict', DENSE_TYPE_ALGOS = ['clustering', 'stats1', 'stats2'] +sup_args_dict = {} + # Responsible for execution and metric logging -def algorithm_workflow(algo, exec_type, config_path, dml_file_name, action_mode): +def algorithm_workflow(algo, exec_type, config_path, dml_file_name, action_mode, current_dir): """ This function is responsible for overall workflow. This does the following actions Check if the input is key value argument or list of positional args Execution and time Logging Metrics - - algo : String + algo: String Input algorithm specified - exec_type : String + exec_type: String Contains the execution type singlenode / hybrid_spark - config_path : String + config_path: String Path to read the json file from - dml_file_name : String + dml_file_name: String DML file name to be used while processing the arguments give - action_mode : String + action_mode: String Type of action data-gen, train ... 
- """ + current_dir: String + Current location of hdfs / local temp being processed + """ config_data = config_reader(config_path + '.json') if isinstance(config_data, dict): @@ -122,26 +126,24 @@ def algorithm_workflow(algo, exec_type, config_path, dml_file_name, action_mode) config_file_name = config_path.split('/')[-1] mat_type, mat_shape, intercept = get_folder_metrics(config_file_name, action_mode) - exit_flag_success = get_existence(config_path, action_mode) + temp_cwd = join(current_dir, config_file_name) + + # temp_dir_exist + exit_flag_success = check_SUCCESS_file_exists(temp_cwd) if exit_flag_success: - print('data already exists {}'.format(config_path)) time = 'data_exists' else: - time = exec_dml_and_parse_time(exec_type, dml_file_name, config_file_name, args) - - # Write a _SUCCESS file only if time is found and in data-gen action_mode - if len(time.split('.')) == 2 and action_mode == 'data-gen': - full_path = join(config_path, '_SUCCESS') - open(full_path, 'w').close() + time = exec_dml_and_parse_time(exec_type, dml_file_name, args, spark_args_dict, sup_args_dict) + write_success(time, temp_cwd) print('{},{},{},{},{},{}'.format(algo, action_mode, intercept, mat_type, mat_shape, time)) current_metrics = [algo, action_mode, intercept, mat_type, mat_shape, time] logging.info(','.join(current_metrics)) + return exit_flag_success -# Perf test entry point -def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, temp_dir, mode): +def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, config_dir, mode, temp_dir): """ This function is the entry point for performance testing @@ -160,13 +162,15 @@ def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, temp_dir, mode mat_shape: List Dimensions of the input matrix with rows and columns - temp_dir: String - Location to store all files created during perf test + config_dir: String + Location to store all configuration mode: List Type of workload to run. data-gen, train ... 
- """ + temp_dir: String + Location to store all output files created during perf test + """ # algos to run is a list of tuples with # [(m-svm, binomial), (m-svm, multinomial)...] # Basic block for execution of scripts @@ -202,48 +206,64 @@ def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, temp_dir, mode algos_to_run.append((current_algo, current_family)) if 'data-gen' in mode: + # Create config directories + data_gen_config_dir = join(config_dir, 'data-gen') + create_dir_local(data_gen_config_dir) + + # Create output path data_gen_dir = join(temp_dir, 'data-gen') - create_dir(data_gen_dir) conf_packet = config_packets_datagen(algos_to_run, mat_type, mat_shape, data_gen_dir, - DENSE_TYPE_ALGOS) + DENSE_TYPE_ALGOS, data_gen_config_dir) + for family_name, config_folders in conf_packet.items(): for config in config_folders: file_name = ML_GENDATA[family_name] - algorithm_workflow(family_name, exec_type, config, file_name, 'data-gen') + success_file = algorithm_workflow(family_name, exec_type, config, file_name, 'data-gen', data_gen_dir) # Statistic family do not require to be split if family_name not in ['stats1', 'stats2']: - exec_test_data(exec_type, config) + if not success_file: + exec_test_data(exec_type, spark_args_dict, sup_args_dict, data_gen_dir, config) if 'train' in mode: + # Create config directories + train_config_dir = join(config_dir, 'train') + create_dir_local(train_config_dir) + + # Create output path data_gen_dir = join(temp_dir, 'data-gen') train_dir = join(temp_dir, 'train') - create_dir(train_dir) + conf_packet = config_packets_train(algos_to_run, mat_type, mat_shape, data_gen_dir, - train_dir, DENSE_TYPE_ALGOS) + train_dir, DENSE_TYPE_ALGOS, train_config_dir) for algo_name, config_files in conf_packet.items(): for config in config_files: file_name = ML_TRAIN[algo_name] - algorithm_workflow(algo_name, exec_type, config, file_name, 'train') + algorithm_workflow(algo_name, exec_type, config, file_name, 'train', train_dir) if 
'predict' in mode: + # Create config directories + predict_config_dir = join(config_dir, 'predict') + create_dir_local(predict_config_dir) + + # Create output path data_gen_dir = join(temp_dir, 'data-gen') train_dir = join(temp_dir, 'train') predict_dir = join(temp_dir, 'predict') - create_dir(predict_dir) - algos_to_run_perdict = list(filter(lambda algo: check_predict(algo[0], ML_PREDICT), algos_to_run)) - if len(algos_to_run_perdict) < 1: + + algos_to_run = list(filter(lambda algo: check_predict(algo[0], ML_PREDICT), algos_to_run)) + if len(algos_to_run) < 1: # No algorithms with predict found pass - conf_packet = config_packets_predict(algos_to_run_perdict, mat_type, mat_shape, data_gen_dir, - train_dir, predict_dir, DENSE_TYPE_ALGOS) - + conf_packet = config_packets_predict(algos_to_run, mat_type, mat_shape, data_gen_dir, + train_dir, predict_dir, DENSE_TYPE_ALGOS, + predict_config_dir) for algo_name, config_files in conf_packet.items(): for config in config_files: file_name = ML_PREDICT[algo_name] - algorithm_workflow(algo_name, exec_type, config, file_name, 'predict') + algorithm_workflow(algo_name, exec_type, config, file_name, 'predict', predict_dir) -if __name__ == '__main__': +if __name__ == '__main__': # sys ml env set and error handling systemml_home = os.environ.get('SYSTEMML_HOME') if systemml_home is None: @@ -259,7 +279,6 @@ if __name__ == '__main__': # Default temp directory, contains everything generated in perftest default_temp_dir = join(systemml_home, 'scripts', 'perftest', 'temp') - create_dir(default_temp_dir) # Initialize time start_time = time.time() @@ -274,7 +293,8 @@ if __name__ == '__main__': all_families = ML_ALGO.keys() # Argparse Module - cparser = argparse.ArgumentParser(description='SystemML Performance Test Script') + cparser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description='SystemML Performance Test Script') cparser.add_argument('--family', help='space separated list of classes of 
algorithms ' '(available : ' + ', '.join(sorted(all_families)) + ')', metavar='', choices=all_families, nargs='+') @@ -290,17 +310,43 @@ if __name__ == '__main__': nargs='+') cparser.add_argument('--mat-shape', default=default_mat_shape, help='space separated list of shapes of matrices ' 'to generate (e.g 10k_1k, 20M_4k)', metavar='', nargs='+') - cparser.add_argument('--temp-dir', default=default_temp_dir, help='temporary directory ' + + cparser.add_argument('--config-dir', default=default_temp_dir, help='temporary directory ' 'where generated, training and prediction data is put', metavar='') cparser.add_argument('--filename', default='perf_test', help='name of the output file for the perf' ' metrics', metavar='') cparser.add_argument('--mode', default=workload, help='space separated list of types of workloads to run (available: data-gen, train, predict)', metavar='', choices=workload, nargs='+') + # Change this to temp-dir + cparser.add_argument('--temp-dir', default=default_temp_dir, + help='define the file system to work on', metavar='') + + # Configuration Options + cparser.add_argument('-stats', help='Monitor and report caching/recompilation statistics, ' + 'heavy hitter <count> is 10 unless overridden', nargs='?', const='10', + metavar='') + cparser.add_argument('-explain', help='explains plan levels can be hops, runtime, ' + 'recompile_hops, recompile_runtime', nargs='?', const='runtime', metavar='') + cparser.add_argument('-config', help='System-ML configuration file (e.g SystemML-config.xml)', metavar='') + + # Spark Configuration Option + cparser.add_argument('--master', help='local, yarn-client, yarn-cluster', metavar='') + cparser.add_argument('--driver-memory', help='Memory for driver (e.g. 
512M)', metavar='') + cparser.add_argument('--num-executors', help='Number of executors to launch', metavar='') + cparser.add_argument('--executor-memory', help='Memory per executor', metavar='') + cparser.add_argument('--executor-cores', help='Number of cores', metavar='') + cparser.add_argument('--conf', help='Spark configuration file', nargs='+', metavar='') # Args is a namespace args = cparser.parse_args() - arg_dict = vars(args) + all_arg_dict = vars(args) + arg_dict, config_dict, spark_dict = args_dict_split(all_arg_dict) + + create_dir_local(args.config_dir) + + # Global variables + sup_args_dict, spark_args_dict = get_config_args(config_dict, spark_dict, args.exec_type) # Debug arguments # print(arg_dict) @@ -344,13 +390,12 @@ if __name__ == '__main__': # Set level to 0 -> debug mode # Set level to 20 -> Plain metrics log_filename = args.filename + '_' + args.exec_type + '.out' - logging.basicConfig(filename=join(default_temp_dir, log_filename), level=20) + logging.basicConfig(filename=join(args.config_dir, log_filename), level=20) logging.info('New performance test started at {}'.format(time_now)) logging.info('algorithm,run_type,intercept,matrix_type,data_shape,time_sec') # Remove filename item from dictionary as its already used to create the log above del arg_dict['filename'] - perf_test_entry(**arg_dict) total_time = (time.time() - start_time)