[2/2] systemml git commit: [SYSTEMML-1451] phase 2 work

nakul02 Tue, 01 Aug 2017 13:47:25 -0700

[SYSTEMML-1451] phase 2 work

Completed these tasks as part of Phase 2 of Google Summer of Code '17
- Decouple systemml-spark-submit.py
- Decouple systemml-standalone.py
- Refactor perf test suite to accept args like debug, stats, config, etc.
- Add HDFS support
- Google Docs support
- Compare SystemML with previous versions
- Pylint, Comment
- Extra arguments configuration Test
- Windows Test
- Doc update
- systemml standalone comments
- systemml spark submit comments


Closes #575


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/e94374af
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/e94374af
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/e94374af

Branch: refs/heads/master
Commit: e94374afb2e6be5dc81524f9c7a5de09b9f4ba26
Parents: a2db1ad
Author: krishnakalyan3 <krishnakaly...@gmail.com>
Authored: Tue Aug 1 13:46:30 2017 -0700
Committer: Nakul Jindal <naku...@gmail.com>
Committed: Tue Aug 1 13:46:30 2017 -0700

----------------------------------------------------------------------
 bin/systemml-spark-submit.py                  | 278 +++++++--------
 bin/systemml-standalone.py                    | 256 +++++---------
 bin/utils.py                                  | 113 ++++++
 docs/python-performance-test.md               |  35 +-
 scripts/perftest/python/datagen.py            | 141 ++++----
 scripts/perftest/python/google_docs/stats.py  | 113 ++++++
 scripts/perftest/python/google_docs/update.py | 110 ++++++
 scripts/perftest/python/predict.py            | 156 ++++-----
 scripts/perftest/python/run_perftest.py       | 135 ++++---
 scripts/perftest/python/train.py              | 257 +++++++-------
 scripts/perftest/python/utils.py              | 390 ---------------------
 scripts/perftest/python/utils_exec.py         | 137 ++++++++
 scripts/perftest/python/utils_fs.py           | 162 +++++++++
 scripts/perftest/python/utils_misc.py         | 347 ++++++++++++++++++
 14 files changed, 1580 insertions(+), 1050 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/e94374af/bin/systemml-spark-submit.py
----------------------------------------------------------------------
diff --git a/bin/systemml-spark-submit.py b/bin/systemml-spark-submit.py
index 30974ec..b6426b3 100755
--- a/bin/systemml-spark-submit.py
+++ b/bin/systemml-spark-submit.py
@@ -21,167 +21,131 @@
 # -------------------------------------------------------------
 
 import os
-import sys
-from os.path import join, exists, abspath
-from os import environ
 import glob
-import argparse
-import shutil
+from os.path import join
 import platform
-
-if environ.get('SPARK_HOME') is None:
-    print('SPARK_HOME not set')
-    sys.exit(1)
-else:
-    spark_home = environ.get('SPARK_HOME')
+import argparse
+from utils import get_env_systemml_home, get_env_spark_home, find_dml_file, 
log4j_path, config_path
+
+
+def default_jars(systemml_home):
+    """
+    return: String
+    Location of systemml and jcuda jars
+    """
+    build_dir = join(systemml_home, 'target')
+    lib_dir = join(build_dir, 'lib')
+    systemml_jar = build_dir + os.sep + "SystemML.jar"
+    jcuda_jars = glob.glob(lib_dir + os.sep + "jcu*.jar")
+    target_jars = ','.join(jcuda_jars)
+    return target_jars, systemml_jar
+
+
+def spark_submit_entry(master, driver_memory, num_executors, executor_memory,
+                       executor_cores, conf,
+                       nvargs, args, config, explain, debug, stats, gpu, f):
+    """
+    This function is responsible for the execution of arguments via
+    subprocess call in hybrid_spark mode
+    """
+
+    spark_home = get_env_spark_home()
+    systemml_home = get_env_systemml_home()
     spark_path = join(spark_home, 'bin', 'spark-submit')
+    script_file = find_dml_file(systemml_home, f)
 
+    # Jars
+    cuda_jars, systemml_jars = default_jars(systemml_home)
 
-# error help print
-def print_usage_and_exit():
-    print('Usage: ./systemml-spark-submit.py -f <dml-filename> [arguments]')
-    sys.exit(1)
-
-cparser = argparse.ArgumentParser(description='System-ML Spark Submit Script')
-
-# SPARK-SUBMIT Options
-cparser.add_argument('--master', default='local[*]', help='local, yarn-client, 
yarn-cluster', metavar='')
-cparser.add_argument('--driver-memory', default='5G', help='Memory for driver 
(e.g. 512M)', metavar='')
-cparser.add_argument('--num-executors', default='2', help='Number of executors 
to launch', metavar='')
-cparser.add_argument('--executor-memory', default='2G', help='Memory per 
executor', metavar='')
-cparser.add_argument('--executor-cores', default='1', help='Number of cores', 
metavar='')
-cparser.add_argument('--conf', help='Spark configuration file', nargs='+', 
metavar='')
-
-# SYSTEM-ML Options
-cparser.add_argument('-nvargs', help='List of attributeName-attributeValue 
pairs', nargs='+', metavar='')
-cparser.add_argument('-args', help='List of positional argument values', 
metavar='', nargs='+')
-cparser.add_argument('-config', help='System-ML configuration file (e.g 
SystemML-config.xml)', metavar='')
-cparser.add_argument('-exec', default='hybrid_spark', help='System-ML backend 
(e.g spark, spark-hybrid)', metavar='')
-cparser.add_argument('-explain', help='explains plan levels can be hops, 
runtime, '
-                                      'recompile_hops, recompile_runtime', 
nargs='?', const='runtime', metavar='')
-cparser.add_argument('-debug', help='runs in debug mode', action='store_true')
-cparser.add_argument('-stats', help='Monitor and report caching/recompilation 
statistics, '
-                                    'heavy hitter <count> is 10 unless 
overridden', nargs='?', const='10', metavar='')
-cparser.add_argument('-gpu', help='uses CUDA instructions when reasonable, '
-                                  'set <force> option to skip conservative 
memory estimates '
-                                  'and use GPU wherever possible', nargs='?')
-cparser.add_argument('-f', required=True, help='specifies dml/pydml file to 
execute; '
-                                               'path can be local/hdfs/gpfs', 
metavar='')
-
-args = cparser.parse_args()
-
-# Optional arguments
-ml_options = []
-if args.nvargs is not None:
-    ml_options.append('-nvargs')
-    ml_options.append(' '.join(args.nvargs))
-if args.args is not None:
-    ml_options.append('-args')
-    ml_options.append(' '.join(args.args))
-if args.debug is not False:
-    ml_options.append('-debug')
-if args.explain is not None:
-    ml_options.append('-explain')
-    ml_options.append(args.explain)
-if args.gpu is not None:
-    ml_options.append('-gpu')
-    ml_options.append(args.gpu)
-if args.stats is not None:
-    ml_options.append('-stats')
-    ml_options.append(args.stats)
-
-# Assign script file to name received from argparse module
-script_file = args.f
-
-# find the systemML root path which contains the bin folder, the script folder 
and the target folder
-# tolerate path with spaces
-script_dir = os.path.dirname(os.path.realpath(__file__))
-project_root_dir = os.path.dirname(script_dir)
-user_dir = os.getcwd()
-
-scripts_dir = join(project_root_dir, 'scripts')
-build_dir = join(project_root_dir, 'target')
-lib_dir = join(build_dir, 'lib')
-
-systemml_jar = build_dir + os.sep + "SystemML.jar"
-jcuda_jars = glob.glob(lib_dir + os.sep + "jcu*.jar")
-target_jars = ','.join(jcuda_jars) # Include all JCuda Jars
-
-log4j_properties_path = join(project_root_dir, 'conf', 
'log4j.properties.template')
-
-build_err_msg = 'You must build the project before running this script.'
-build_dir_err_msg = 'Could not find target directory ' + build_dir + '. ' + 
build_err_msg
-
-# check if the project had been built and the jar files exist
-if not (exists(build_dir)):
-    print(build_dir_err_msg)
-    sys.exit(1)
-
-print('================================================================================')
-
-# if the present working directory is the project root or bin folder, then use 
the temp folder as user.dir
-if user_dir == project_root_dir or user_dir == join(project_root_dir, 'bin'):
-    user_dir = join(project_root_dir, 'temp')
-    print('Output dir: ' + user_dir)
-
-# if the SystemML-config.xml does not exist, create it from the template
-systemml_config_path = join(project_root_dir, 'conf', 'SystemML-config.xml')
-systemml_template_config_path = join(project_root_dir, 'conf', 
'SystemML-config.xml.template')
-if not (exists(systemml_config_path)):
-    shutil.copyfile(systemml_template_config_path, systemml_config_path)
-    print('... created ' + systemml_config_path)
-
-# if SystemML-config.xml is provided as arguments
-if args.config is None:
-    systemml_config_path_arg = systemml_config_path
-else:
-    systemml_config_path_arg = args.config
-
-
-# from http://stackoverflow.com/questions/1724693/find-a-file-in-python
-def find_file(name, path):
-    for root, dirs, files in os.walk(path):
-        if name in files:
-            return join(root, name)
-    return None
-
-# if the script file path was omitted, try to complete the script path
-if not (exists(script_file)):
-    script_file_name = abspath(script_file)
-    script_file_found = find_file(script_file, scripts_dir)
-    if script_file_found is None:
-        print('Could not find DML script: ' + script_file)
-        print_usage_and_exit()
+    # Log4j
+    log4j = log4j_path(systemml_home)
+    log4j_properties_path = 
'spark.driver.extraJavaOptions=-Dlog4j.configuration=file:{}'.format(log4j)
+    if conf is None:
+        default_conf = log4j_properties_path
     else:
-        script_file = script_file_found
-        print('DML Script:' + script_file)
-
-default_conf = 
'spark.driver.extraJavaOptions=-Dlog4j.configuration=file:{}'.format(log4j_properties_path)
-
-# Backslash problem in windows.
-if platform.system() == 'Windows':
-    default_conf = default_conf.replace('\\', '//')
-
-if args.conf is not None:
-    conf = ' --conf '.join(args.conf + [default_conf])
-else:
-    conf = default_conf
+        default_conf = ' --conf '.join(conf + [log4j_properties_path])
 
-cmd_spark = [spark_path, '--class', 'org.apache.sysml.api.DMLScript',
-             '--master', args.master, '--driver-memory', args.driver_memory,
-             '--num-executors', args.num_executors, '--executor-memory', 
args.executor_memory,
-             '--executor-cores', args.executor_cores, '--conf', conf, 
'--jars', target_jars,
-             systemml_jar]
-
-cmd_system_ml = ['-config', systemml_config_path_arg,
-                 '-exec', vars(args)['exec'], '-f', script_file, ' 
'.join(ml_options)]
-
-cmd = cmd_spark + cmd_system_ml
-
-return_code = os.system(' '.join(cmd))
-# For debugging
-# print(' '.join(cmd))
-
-if return_code != 0:
-    print('Failed to run SystemML. Exit code :' + str(return_code))
-    print(' '.join(cmd))
+    # Config XML
+    if config is None:
+        default_config = config_path(systemml_home)
+    else:
+        default_config = ' -config '.join([config] + 
[config_path(systemml_home)])
+
+    if platform.system() == 'Windows':
+        default_conf = default_conf.replace('\\', '//')
+
+    #  optional arguments
+    ml_options = []
+    if nvargs is not None:
+        ml_options.append('-nvargs')
+        ml_options.append(' '.join(nvargs))
+    if args is not None:
+        ml_options.append('-args')
+        ml_options.append(' '.join(args))
+    if explain is not None:
+        ml_options.append('-explain')
+        ml_options.append(explain)
+    if debug is not False:
+        ml_options.append('-debug')
+    if stats is not None:
+        ml_options.append('-stats')
+        ml_options.append(stats)
+    if gpu is not None:
+        ml_options.append('-gpu')
+        ml_options.append(gpu)
+
+    if len(ml_options) < 1:
+        ml_options = ''
+
+    # stats, explain, target_jars
+    cmd_spark = [spark_path, '--class', 'org.apache.sysml.api.DMLScript',
+                 '--master', master, '--driver-memory', driver_memory,
+                 '--num-executors', num_executors, '--executor-memory', 
executor_memory,
+                 '--executor-cores', executor_cores, '--conf', default_conf,
+                 '--jars', cuda_jars, systemml_jars]
+
+    cmd_system_ml = ['-config', default_config,
+                     '-exec', 'hybrid_spark', '-f', script_file, ' 
'.join(ml_options)]
+
+    cmd = cmd_spark + cmd_system_ml
+
+    # Debug
+    # print(' '.join(cmd))
+    return_code = os.system(' '.join(cmd))
+    return return_code
+
+
+if __name__ == '__main__':
+    cparser = 
argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+                                      description='System-ML Spark Submit 
Script')
+    # SPARK-SUBMIT Options
+    cparser.add_argument('--master', default='local[*]', help='local, 
yarn-client, yarn-cluster', metavar='')
+    cparser.add_argument('--driver-memory', default='5G', help='Memory for 
driver (e.g. 512M)', metavar='')
+    cparser.add_argument('--num-executors', default='2', help='Number of 
executors to launch', metavar='')
+    cparser.add_argument('--executor-memory', default='2G', help='Memory per 
executor', metavar='')
+    cparser.add_argument('--executor-cores', default='1', help='Number of 
cores', metavar='')
+    cparser.add_argument('--conf', help='Spark configuration file', nargs='+', 
metavar='')
+
+    # SYSTEM-ML Options
+    cparser.add_argument('-nvargs', help='List of attributeName-attributeValue 
pairs', nargs='+', metavar='')
+    cparser.add_argument('-args', help='List of positional argument values', 
metavar='', nargs='+')
+    cparser.add_argument('-config', help='System-ML configuration file (e.g 
SystemML-config.xml)', metavar='')
+    cparser.add_argument('-explain', help='explains plan levels can be hops, 
runtime, '
+                                          'recompile_hops, recompile_runtime', 
nargs='?', const='runtime', metavar='')
+    cparser.add_argument('-debug', help='runs in debug mode', 
action='store_true')
+    cparser.add_argument('-stats', help='Monitor and report 
caching/recompilation statistics, '
+                                        'heavy hitter <count> is 10 unless 
overridden', nargs='?', const='10',
+                                   metavar='')
+    cparser.add_argument('-gpu', help='uses CUDA instructions when reasonable, 
'
+                                      'set <force> option to skip conservative 
memory estimates '
+                                      'and use GPU wherever possible', 
nargs='?')
+    cparser.add_argument('-f', required=True, help='specifies dml/pydml file 
to execute; '
+                                                   'path can be 
local/hdfs/gpfs', metavar='')
+
+    args = cparser.parse_args()
+    arg_dict = vars(args)
+
+    return_code = spark_submit_entry(**arg_dict)
+
+    if return_code != 0:
+        print('Failed to run SystemML. Exit code :' + str(return_code))

http://git-wip-us.apache.org/repos/asf/systemml/blob/e94374af/bin/systemml-standalone.py
----------------------------------------------------------------------
diff --git a/bin/systemml-standalone.py b/bin/systemml-standalone.py
index a0ee8db..4000e75 100755
--- a/bin/systemml-standalone.py
+++ b/bin/systemml-standalone.py
@@ -21,180 +21,106 @@
 #-------------------------------------------------------------
 
 import os
-import shutil
-import sys
-from os.path import join, exists
+from os.path import join
+import argparse
+import platform
+from utils import get_env_systemml_home, find_dml_file, log4j_path, config_path
 
 
-# error help print
-def print_usage_and_exit():
-    this_script = sys.argv[0]
-    print('Usage: ' + this_script + ' <dml-filename> [arguments]')
-    sys.exit(1)
+def default_classpath(systemml_home):
+    """
+    Classpath information required for excution
 
+    return: String
+    Classpath location of build, library and hadoop directories
+    """
+    build_lib = join(systemml_home, 'target', '*')
+    lib_lib = join(systemml_home, 'target', 'lib', '*')
+    hadoop_lib = join(systemml_home, 'target', 'lib', 'hadoop', '*')
 
-# from http://stackoverflow.com/questions/1724693/find-a-file-in-python
-def find_file(name, path):
-    for root, dirs, files in os.walk(path):
-        if name in files:
-            return join(root, name)
-    return None
+    return build_lib, lib_lib, hadoop_lib
 
 
-if len(sys.argv) < 2:
-    print('Wrong usage')
-    print_usage_and_exit()
+#TODO
+# User dir, fix for SYSTEMML_1795
+def standalone_execution_entry(nvargs, args, config, explain, debug, stats, 
gpu, f):
+    """
+    This function is responsible for the execution of arguments via
+    subprocess call in singlenode mode
+    """
 
+    systemml_home = get_env_systemml_home()
+    script_file = find_dml_file(systemml_home, f)
 
-# find the systemML root path which contains the bin folder, the script folder 
and the target folder
-# tolerate path with spaces
-script_dir = os.path.dirname(os.path.realpath(__file__))
-project_root_dir = os.path.dirname(script_dir)
-user_dir = os.getcwd()
-
-scripts_dir = join(project_root_dir, 'scripts')
-build_dir = join(project_root_dir, 'target')
-lib_dir = join(build_dir, 'lib')
-dml_script_class = join(build_dir, 'classes', 'org', 'apache', 'sysml', 'api', 
'DMLScript.class')
-hadoop_home = join(lib_dir, 'hadoop')
-
-
-build_err_msg = 'You must build the project before running this script.'
-build_dir_err_msg = 'Could not find target directory ' + build_dir + '. ' + 
build_err_msg
-
-lib_dir_err_msg = 'Could not find required libraries.' + build_err_msg
-dml_script_err_msg = 'Could not find ' + dml_script_class + '. ' + 
build_err_msg
-
-# check if the project had been built and the jar files exist
-if not(exists(build_dir)):
-    print(build_dir_err_msg)
-    sys.exit(1)
-if not(exists(lib_dir)):
-    print(lib_dir_err_msg)
-    sys.exit(1)
-if not(exists(dml_script_class)):
-    print(dml_script_err_msg)
-    sys.exit(1)
-
-print('================================================================================')
-
-
-# if the present working directory is the project root or bin folder, then use 
the temp folder as user.dir
-if user_dir == project_root_dir or user_dir == join(project_root_dir, 'bin'):
-    user_dir = join(project_root_dir, 'temp')
-    print('Output dir: ' + user_dir)
-
-# if the SystemML-config.xml does not exist, create it from the template
-systemml_config_path = join(project_root_dir, 'conf', 'SystemML-config.xml')
-systemml_template_config_path = join(project_root_dir, 'conf', 
'SystemML-config.xml.template')
-if not(exists(systemml_config_path)):
-    shutil.copyfile(systemml_template_config_path, systemml_config_path)
-    print('... created ' + systemml_config_path)
-
-# if the log4j.properties do not exist, create them from the template
-log4j_properties_path = join(project_root_dir, 'conf', 'log4j.properties')
-log4j_template_properties_path = join(project_root_dir, 'conf', 
'log4j.properties.template')
-if not(exists(log4j_properties_path)):
-    shutil.copyfile(log4j_template_properties_path, log4j_properties_path)
-    print('... created ' + log4j_properties_path)
+    if platform.system() == 'Windows':
+        default_cp = ';'.join(default_classpath(systemml_home))
+    else:
+        default_cp = ':'.join(default_classpath(systemml_home))
 
+    java_memory = '-Xmx8g -Xms4g -Xmn1g'
 
-script_file = sys.argv[1]
+    # Log4j
+    log4j = log4j_path(systemml_home)
+    log4j_properties_path = '-Dlog4j.configuration=file:{}'.format(log4j)
 
-# if the script file path was omitted, try to complete the script path
-if not(exists(script_file)):
-    script_file_name = os.path.abspath(script_file)
-    script_file_found = find_file(script_file, scripts_dir)
-    if script_file_found is None:
-        print('Could not find DML script: ' + script_file)
-        print_usage_and_exit()
+    # Config
+    if config is None:
+        default_config = config_path(systemml_home)
     else:
-        script_file = script_file_found
-        print('DML Script:' + script_file)
-
-# add libraries which were generated by the build to the classpath
-systemml_jar = join(build_dir, 'classes')
-
-
-# For the *nix and windows, os.pathsep works reliably
-# however for cygwin, the pathsep is set for *nix, which is ':'
-# but the underlying java, which is a windows program requires ';'
-# also all arguments passed to the JVM need to be converted to windows style
-# if in the cygwin environment
-lib_dir_star = join(lib_dir, '*')
-if sys.platform == 'cygwin':
-    classpath_sep = os.pathsep
-    classpath_sep = ';'
-    lib_dir = os.popen('cygpath -pw ' + lib_dir).read().strip()
-    lib_dir_star = '"' + lib_dir + "\*" + '"'
-    systemml_jar = '"' + os.popen('cygpath -pw ' + 
systemml_jar).read().strip() + '"'
-    hadoop_home = '"' + os.popen('cygpath -pw ' + hadoop_home).read().strip() 
+ '"'
-    log4j_properties_path = '"' + os.popen('cygpath -pw ' + 
log4j_properties_path).read().strip() + '"'
-    user_dir = '"' + os.popen('cygpath -pw ' + user_dir).read().strip() + '"'
-    script_file = '"' + os.popen('cygpath -pw ' + script_file).read().strip() 
+ '"'
-    systemml_config_path = '"' + os.popen('cygpath -pw ' + 
systemml_config_path).read().strip() + '"'
-    classpath = lib_dir_star + '\\' + classpath_sep + systemml_jar
-else:
-    #classpath = '"' + lib_dir_star + '"' + os.pathsep + '"' + systemml_jar + 
'"'
-    classpath = lib_dir_star + os.pathsep + systemml_jar
-
-
-# Set the HADOOP_HOME environment variable
-if 'HADOOP_HOME' not in os.environ:
-    os.environ['HADOOP_HOME'] = hadoop_home
-
-
-print('================================================================================')
-
-# Set default Java options
-systemml_default_java_opts = \
-    '-Xmx8g -Xms4g -Xmn1g ' + \
-    '-cp ' + classpath + ' ' + \
-    '-Dlog4j.configuration=file:' + log4j_properties_path + ' ' \
-    '-Duser.dir=' + user_dir 
-#    '-agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=8111'
-
-
-# Reads in key-value pairs from the conf/systemml-env.sh file
-def parse_env_file(env_file_path):
-    env_vars = {}
-    with open(env_file_path) as f:
-        for l in f:
-            l = l.strip()
-            if l and not(l.startswith('#')) and '=' in l:
-                k, v = l.split('=', 1)
-                k = k.strip()
-                v = v.strip()
-                if len(v) > 0:
-                    # strip quotes
-                    if v[0] == v[len(v) - 1] and v[0] in ['"', "'"]:
-                        v = v[1:-1]
-
-                env_vars[k] = v
-    return env_vars
-
-
-# Add any custom Java options set by the user at command line, overriding 
defaults as necessary.
-if 'SYSTEMML_JAVA_OPTS' in os.environ:
-    systemml_java_opts = os.environ['SYSTEMML_JAVA_OPTS']
-    systemml_default_java_opts = systemml_default_java_opts + ' ' + 
systemml_java_opts
-    # del os.environ['SYSTEMML_JAVA_OPTS']
-
-# Add any custom Java options set by the user in the environment variables 
file,
-# overriding defaults as necessary.
-systemml_env_path = join(project_root_dir, 'conf', 'systemml-env.sh')
-if exists(systemml_env_path):
-    env_vars = parse_env_file(systemml_env_path)
-    os.environ.update(env_vars)
-
-
-# Invoke the jar with options and arguments
-cmd = ['java', systemml_default_java_opts, 'org.apache.sysml.api.DMLScript', 
'-f', script_file, '-exec singlenode', '-config', systemml_config_path] + 
sys.argv[2:]
-# For debugging
-# print(' '.join(cmd))
-
-return_code = os.system(' '.join(cmd))
-
-if return_code != 0:
-    print('Failed to run SystemML. Exit code :' + str(return_code))
-    print(' '.join(cmd))
+        default_config = config
+
+    ml_options = []
+    if nvargs is not None:
+        ml_options.append('-nvargs')
+        ml_options.append(' '.join(nvargs))
+    if args is not None:
+        ml_options.append('-args')
+        ml_options.append(' '.join(args))
+    if explain is not None:
+        ml_options.append('-explain')
+        ml_options.append(explain)
+    if debug is not False:
+        ml_options.append('-debug')
+    if stats is not None:
+        ml_options.append('-stats')
+        ml_options.append(stats)
+    if gpu is not None:
+        ml_options.append('-gpu')
+        ml_options.append(gpu)
+
+    cmd = ['java', java_memory, log4j_properties_path,
+           '-cp', default_cp, 'org.apache.sysml.api.DMLScript',
+           '-f', script_file, '-exec', 'singlenode', '-config', default_config,
+           ' '.join(ml_options)]
+
+    return_code = os.system(' '.join(cmd))
+    return return_code
+
+
+if __name__ == '__main__':
+
+    cparser = 
argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+                                      description='System-ML Standalone 
Script')
+
+    # SYSTEM-ML Options
+    cparser.add_argument('-nvargs', help='List of attributeName-attributeValue 
pairs', nargs='+', metavar='')
+    cparser.add_argument('-args', help='List of positional argument values', 
metavar='', nargs='+')
+    cparser.add_argument('-config', help='System-ML configuration file (e.g 
SystemML-config.xml)', metavar='')
+    cparser.add_argument('-explain', help='explains plan levels can be hops, 
runtime, '
+                                          'recompile_hops, recompile_runtime', 
nargs='?', const='runtime', metavar='')
+    cparser.add_argument('-debug', help='runs in debug mode', 
action='store_true')
+    cparser.add_argument('-stats', help='Monitor and report 
caching/recompilation statistics, '
+                                        'heavy hitter <count> is 10 unless 
overridden', nargs='?', const='10',
+                         metavar='')
+    cparser.add_argument('-gpu', help='uses CUDA instructions when reasonable, 
'
+                                      'set <force> option to skip conservative 
memory estimates '
+                                      'and use GPU wherever possible', 
nargs='?')
+    cparser.add_argument('-f', required=True, help='specifies dml/pydml file 
to execute; '
+                                                   'path can be 
local/hdfs/gpfs', metavar='')
+
+    args = cparser.parse_args()
+    arg_dict = vars(args)
+    return_code = standalone_execution_entry(**arg_dict)
+
+    if return_code != 0:
+        print('Failed to run SystemML. Exit code :' + str(return_code))

http://git-wip-us.apache.org/repos/asf/systemml/blob/e94374af/bin/utils.py
----------------------------------------------------------------------
diff --git a/bin/utils.py b/bin/utils.py
new file mode 100644
index 0000000..6f40881
--- /dev/null
+++ b/bin/utils.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+import sys
+import os
+from os.path import join, exists
+from os import environ
+import shutil
+
+
+def get_env_systemml_home():
+    """
+    Env variable error check and path location
+
+    return: String
+    Location of SYSTEMML_HOME
+    """
+    systemml_home = os.environ.get('SYSTEMML_HOME')
+    if systemml_home is None:
+        print('SYSTEMML_HOME not found')
+        sys.exit()
+
+    return systemml_home
+
+
+def get_env_spark_home():
+    """
+    Env variable error check and path location
+
+    return: String
+    Location of SPARK_HOME
+    """
+    spark_home = environ.get('SPARK_HOME')
+    if spark_home is None:
+        print('SPARK_HOME not found')
+        sys.exit()
+
+    return spark_home
+
+
+def find_file(name, path):
+    """
+    Responsible for finding a specific file recursively given a location
+    """
+    for root, dirs, files in os.walk(path):
+        if name in files:
+            return join(root, name)
+
+
+def find_dml_file(systemml_home, script_file):
+    """
+    Find the location of DML script being executed
+
+    return: String
+    Location of the dml script
+    """
+    scripts_dir = join(systemml_home, 'scripts')
+    if not (exists(script_file)):
+        script_file = find_file(script_file, scripts_dir)
+        if script_file is None:
+            print('Could not find DML script: ' + script_file)
+            sys.exit()
+
+    return script_file
+
+
+def log4j_path(systemml_home):
+    """
+    Create log4j.properties from the template if not exist
+
+    return: String
+    Location of log4j.properties path
+    """
+    log4j_properties_path = join(systemml_home, 'conf', 'log4j.properties')
+    log4j_template_properties_path = join(systemml_home, 'conf', 
'log4j.properties.template')
+    if not (exists(log4j_properties_path)):
+        shutil.copyfile(log4j_template_properties_path, log4j_properties_path)
+        print('... created ' + log4j_properties_path)
+    return log4j_properties_path
+
+
+def config_path(systemml_home):
+    """
+    Create SystemML-config from the template if not exist
+
+    return: String
+    Location of SystemML-config.xml
+    """
+    systemml_config_path = join(systemml_home, 'conf', 'SystemML-config.xml')
+    systemml_template_config_path = join(systemml_home, 'conf', 
'SystemML-config.xml.template')
+    if not (exists(systemml_config_path)):
+        shutil.copyfile(systemml_template_config_path, systemml_config_path)
+        print('... created ' + systemml_config_path)
+    return systemml_config_path

http://git-wip-us.apache.org/repos/asf/systemml/blob/e94374af/docs/python-performance-test.md
----------------------------------------------------------------------
diff --git a/docs/python-performance-test.md b/docs/python-performance-test.md
index c265bc6..02d3e34 100644
--- a/docs/python-performance-test.md
+++ b/docs/python-performance-test.md
@@ -11,10 +11,13 @@ Our performance tests suit contains `7` families namely 
`binomial`, `multinomial
 
 On a very high level use construct a string with arguments required to run 
each operation. Once this string is constructed we use the subprocess module to 
execute this string and extract time from the standard out. 
 
-We also use `json` module write our configurations to a json file. This ensure 
that our current operation is easy to debug.
+We also use `json` module write our configurations to a json file. This ensure 
that our operations are easy to debug.
 
+We have `7` files in performance test suit:
 
-We have `5` files in performance test suit `run_perftest.py`, `datagen.py`, 
`train.py`, `predict.py` and `utils.py`. 
+- Entry File `run_perftest.py`
+- Supporting Files `datagen.py`, `train.py`, `predict.py`
+- Utility Files `utils_exec.py`, `utils_fs.py`, `utils_misc.py`
 
 `datagen.py`, `train.py` and `predict.py` generate a dictionary. Our key is 
the name of algorithm being processed and values is a list with path(s) where 
all the data required is present. We define this dictionary as a configuration 
packet.
 
@@ -28,7 +31,7 @@ In `train.py` script we have functions required to generate 
training output. We
 
 The file `predict.py` contains all functions for all algorithms in the 
performance test that contain predict script. We return the required 
configuration packet as a result of this script, that contains key as the 
algorithm to run and values with location to read predict json files from.
 
-In the file `utils.py` we have all the helper functions required in our 
performance test. These functions do operations like write `json` files, 
extract time from std out etc.
+The `utils_*.py` files contain all the helper functions required by our 
performance test. These functions perform operations such as writing `json` files 
and extracting time from standard output.
  
 ### Adding New Algorithms
 While adding a new algorithm we need know if it has to be part of the any pre 
existing family. If this algorithm depends on a new data generation script we 
would need to create a new family. Steps below to take below to add a new 
algorithm.
@@ -75,7 +78,7 @@ Default setting for our performance test below:
 - Matrix size to 10,000 rows and 100 columns.
 - Execution mode `singlenode`.
 - Operation modes `data-gen`, `train` and `predict` in sequence.
-- Matrix type set to `all`. Which will generate `dense` or / and `sparse` 
matrices for all relevant algorithms.
+- Matrix type set to `all`, which will generate `dense` and `sparse` matrices for 
all relevant algorithms.
 
 ### Examples
 Some examples of SystemML performance test with arguments shown below:
@@ -104,6 +107,9 @@ Run performance test for the algorithms `m-svm` with 
`multinomial` family. Run o
 `
 Run performance test for all algorithms under the family `regression2` and log 
with filename `new_log`.
 
+`./scripts/perftest/python/run_perftest.py --family binomial clustering 
multinomial regression1 regression2 stats1 stats2 --config-dir 
/Users/krishna/open-source/systemml/scripts/perftest/temp3 --temp-dir 
hdfs://localhost:9000/temp3`
+Run performance test for all algorithms using HDFS.
+
 ### Operational Notes
 All performance test depend mainly on two scripts for execution 
`systemml-standalone.py` and `systemml-spark-submit.py`. Incase we need to 
change standalone or spark parameters we need to manually change these 
parameters in their respective scripts.
 
@@ -117,13 +123,26 @@ multinomial|data-gen|0|dense|10k_100| 0.33
 MultiLogReg|train|0|10k_100|dense|6.956
 MultiLogReg|predict|0|10k_100|dense|4.780
 
-These logs can be found in `temp` folder 
(`$SYSTEMML_HOME/scripts/perftest/temp`) in-case not overridden by 
`--temp-dir`. This `temp` folders also contain the data generated during our 
performance test.
+These logs and config `json` files can be found in the `temp` folder 
(`$SYSTEMML_HOME/scripts/perftest/temp`) unless overridden by 
`--config-dir`.
+
+`--temp-dir` by default points to the local file system. We can change this to 
point to an HDFS path with `--temp-dir hdfs://localhost:9000/temp`, where all files 
generated during execution will be saved.
+
+Every time a script executes in `data-gen` mode successfully, we write a 
`_SUCCESS` file. If this file exists, we ensure that the same script 
is not re-run. Support for configuration options like `-stats`, `-explain` and 
`--conf` has also been added.
+
+Results obtained by our performance tests can be automatically uploaded to 
google docs.
+
+`./update.py --file ../temp/singlenode.out --exec-mode singlenode --auth 
client_json.json --tag 1.0`
+
+In the example above `--tag` can be a major/minor systemml version and 
`--auth` points to the `json` key required by `google docs`.
+
+Currently we only support comparing the time taken by algorithms across different 
versions. This can be obtained by running the script below:
+`./stats.py --auth client_json.json --exec-mode singlenode --tags 1.0 2.0`
 
-Every time a script executes in `data-gen` mode successfully, we write a 
`_SUCCESS` file. If this file exists we ensures that re-run of the same script 
is not possible as data already exists.
+Note: Please `pip install gspread oauth2client` (see `https://github.com/burnash/gspread`) to use the google 
docs client.
 
 ### Troubleshooting
 We can debug the performance test by making changes in the following locations 
based on 
 
-- Please see `utils.py` function `exec_dml_and_parse_time`. In  uncommenting 
the debug print statement in the function `exec_dml_and_parse_time`. This 
allows us to inspect the subprocess string being executed.
+- Please see `utils_exec.py` function `subprocess_exec`.
 - Please see `run_perftest.py`. Changing the verbosity level to `0` allows us 
to log more information while the script runs.
-- Eyeballing the json files generated and making sure the arguments are 
correct.
+- Eyeballing the json files generated and making sure the configuration 
arguments are correct.

http://git-wip-us.apache.org/repos/asf/systemml/blob/e94374af/scripts/perftest/python/datagen.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/datagen.py 
b/scripts/perftest/python/datagen.py
index 88a71f0..72a0627 100755
--- a/scripts/perftest/python/datagen.py
+++ b/scripts/perftest/python/datagen.py
@@ -22,7 +22,7 @@
 
 import itertools
 from os.path import join
-from utils import split_rowcol, config_writer, mat_type_check
+from utils_misc import split_rowcol, config_writer, mat_type_check
 
 # This file contains configuration settings for data generation
 DATA_FORMAT = 'csv'
@@ -33,42 +33,44 @@ MATRIX_TYPE_DICT = {'dense': '0.9',
 FAMILY_NO_MATRIX_TYPE = ['clustering', 'stats1', 'stats2']
 
 
-def multinomial_datagen(matrix_dim, matrix_type, datagen_dir):
+def multinomial_datagen(matrix_dim, matrix_type, datagen_dir, config_dir):
+    path_name = '.'.join(['multinomial', matrix_type, str(matrix_dim)])
+    datagen_write = join(datagen_dir, path_name)
+    save_path = join(config_dir, path_name)
 
     row, col = split_rowcol(matrix_dim)
-    path_name = '.'.join(['multinomial', matrix_type, str(matrix_dim)])
-    full_path = join(datagen_dir, path_name)
 
     numSamples = row
     numFeatures = col
     sparsity = MATRIX_TYPE_DICT[matrix_type]
     num_categories = '150'
     intercept = '0'
-    X = join(full_path, 'X.data')
-    Y = join(full_path, 'Y.data')
+    X = join(datagen_write, 'X.data')
+    Y = join(datagen_write, 'Y.data')
     fmt = DATA_FORMAT
 
     config = [numSamples, numFeatures, sparsity, num_categories, intercept,
               X, Y, fmt, '1']
 
-    config_writer(full_path + '.json', config)
+    config_writer(save_path + '.json', config)
 
-    return full_path
+    return save_path
 
 
-def binomial_datagen(matrix_dim, matrix_type, datagen_dir):
+def binomial_datagen(matrix_dim, matrix_type, datagen_dir, config_dir):
+    path_name = '.'.join(['binomial', matrix_type, str(matrix_dim)])
+    datagen_write = join(datagen_dir, path_name)
+    save_path = join(config_dir, path_name)
 
     row, col = split_rowcol(matrix_dim)
-    path_name = '.'.join(['binomial', matrix_type, str(matrix_dim)])
-    full_path = join(datagen_dir, path_name)
 
     numSamples = row
     numFeatures = col
     maxFeatureValue = '5'
     maxWeight = '5'
-    loc_weights = join(full_path, 'weight.data')
-    loc_data = join(full_path, 'X.data')
-    loc_labels = join(full_path, 'Y.data')
+    loc_weights = join(datagen_write, 'weight.data')
+    loc_data = join(datagen_write, 'X.data')
+    loc_labels = join(datagen_write, 'Y.data')
     noise = '1'
     intercept = '0'
     sparsity = MATRIX_TYPE_DICT[matrix_type]
@@ -77,24 +79,25 @@ def binomial_datagen(matrix_dim, matrix_type, datagen_dir):
 
     config = [numSamples, numFeatures, maxFeatureValue, maxWeight, 
loc_weights, loc_data,
               loc_labels, noise, intercept, sparsity, fmt, tranform_labels]
-    config_writer(full_path + '.json', config)
+    config_writer(save_path + '.json', config)
 
-    return full_path
+    return save_path
 
 
-def regression1_datagen(matrix_dim, matrix_type, datagen_dir):
+def regression1_datagen(matrix_dim, matrix_type, datagen_dir, config_dir):
+    path_name = '.'.join(['regression1', matrix_type, str(matrix_dim)])
+    datagen_write = join(datagen_dir, path_name)
+    save_path = join(config_dir, path_name)
 
     row, col = split_rowcol(matrix_dim)
-    path_name = '.'.join(['regression1', matrix_type, str(matrix_dim)])
-    full_path = join(datagen_dir, path_name)
 
     numSamples = row
     numFeatures = col
     maxFeatureValue = '5'
     maxWeight = '5'
-    loc_weights = join(full_path, 'weight.data')
-    loc_data = join(full_path, 'X.data')
-    loc_labels = join(full_path, 'Y.data')
+    loc_weights = join(datagen_write, 'weight.data')
+    loc_data = join(datagen_write, 'X.data')
+    loc_labels = join(datagen_write, 'Y.data')
     noise = '1'
     intercept = '0'
     sparsity = MATRIX_TYPE_DICT[matrix_type]
@@ -103,24 +106,25 @@ def regression1_datagen(matrix_dim, matrix_type, 
datagen_dir):
 
     config = [numSamples, numFeatures, maxFeatureValue, maxWeight, 
loc_weights, loc_data,
               loc_labels, noise, intercept, sparsity, fmt, tranform_labels]
-    config_writer(full_path + '.json', config)
+    config_writer(save_path + '.json', config)
 
-    return full_path
+    return save_path
 
 
-def regression2_datagen(matrix_dim, matrix_type, datagen_dir):
+def regression2_datagen(matrix_dim, matrix_type, datagen_dir, config_dir):
+    path_name = '.'.join(['regression2', matrix_type, str(matrix_dim)])
+    datagen_write = join(datagen_dir, path_name)
+    save_path = join(config_dir, path_name)
 
     row, col = split_rowcol(matrix_dim)
-    path_name = '.'.join(['regression2', matrix_type, str(matrix_dim)])
-    full_path = join(datagen_dir, path_name)
 
     numSamples = row
     numFeatures = col
     maxFeatureValue = '5'
     maxWeight = '5'
-    loc_weights = join(full_path, 'weight.data')
-    loc_data = join(full_path, 'X.data')
-    loc_labels = join(full_path, 'Y.data')
+    loc_weights = join(datagen_write, 'weight.data')
+    loc_data = join(datagen_write, 'X.data')
+    loc_labels = join(datagen_write, 'Y.data')
     noise = '1'
     intercept = '0'
     sparsity = MATRIX_TYPE_DICT[matrix_type]
@@ -129,21 +133,22 @@ def regression2_datagen(matrix_dim, matrix_type, 
datagen_dir):
 
     config = [numSamples, numFeatures, maxFeatureValue, maxWeight, 
loc_weights, loc_data,
               loc_labels, noise, intercept, sparsity, fmt, tranform_labels]
-    config_writer(full_path + '.json', config)
+    config_writer(save_path + '.json', config)
 
-    return full_path
+    return save_path
 
 
-def clustering_datagen(matrix_dim, matrix_type, datagen_dir):
+def clustering_datagen(matrix_dim, matrix_type, datagen_dir, config_dir):
 
-    row, col = split_rowcol(matrix_dim)
     path_name = '.'.join(['clustering', matrix_type, str(matrix_dim)])
+    datagen_write = join(datagen_dir, path_name)
+    save_path = join(config_dir, path_name)
+    row, col = split_rowcol(matrix_dim)
 
-    full_path = join(datagen_dir, path_name)
-    X = join(full_path, 'X.data')
-    Y = join(full_path, 'Y.data')
-    YbyC = join(full_path, 'YbyC.data')
-    C = join(full_path, 'C.data')
+    X = join(datagen_write, 'X.data')
+    Y = join(datagen_write, 'Y.data')
+    YbyC = join(datagen_write, 'YbyC.data')
+    C = join(datagen_write, 'C.data')
     nc = '50'
     dc = '10.0'
     dr = '1.0'
@@ -153,22 +158,24 @@ def clustering_datagen(matrix_dim, matrix_type, 
datagen_dir):
     config = dict(nr=row, nf=col, nc=nc, dc=dc, dr=dr, fbf=fbf, cbf=cbf, X=X, 
C=C, Y=Y,
                   YbyC=YbyC, fmt=DATA_FORMAT)
 
-    config_writer(full_path + '.json', config)
-    return full_path
+    config_writer(save_path + '.json', config)
+    return save_path
 
 
-def stats1_datagen(matrix_dim, matrix_type, datagen_dir):
+def stats1_datagen(matrix_dim, matrix_type, datagen_dir, config_dir):
 
-    row, col = split_rowcol(matrix_dim)
     path_name = '.'.join(['stats1', matrix_type, str(matrix_dim)])
-    full_path = join(datagen_dir, path_name)
-
-    DATA = join(full_path, 'X.data')
-    TYPES = join(full_path, 'types')
-    TYPES1 = join(full_path, 'set1.types')
-    TYPES2 = join(full_path, 'set2.types')
-    INDEX1 = join(full_path, 'set1.indices')
-    INDEX2 = join(full_path, 'set2.indices')
+    datagen_write = join(datagen_dir, path_name)
+    save_path = join(config_dir, path_name)
+
+    row, col = split_rowcol(matrix_dim)
+
+    DATA = join(datagen_write, 'X.data')
+    TYPES = join(datagen_write, 'types')
+    TYPES1 = join(datagen_write, 'set1.types')
+    TYPES2 = join(datagen_write, 'set2.types')
+    INDEX1 = join(datagen_write, 'set1.indices')
+    INDEX2 = join(datagen_write, 'set2.indices')
     MAXDOMAIN = '1100'
     SETSIZE = '20'
     LABELSETSIZE = '10'
@@ -184,30 +191,31 @@ def stats1_datagen(matrix_dim, matrix_type, datagen_dir):
                   LABELSETSIZE=LABELSETSIZE, TYPES1=TYPES1, TYPES2=TYPES2, 
INDEX1=INDEX1,
                   INDEX2=INDEX2, fmt=DATA_FORMAT)
 
-    config_writer(full_path + '.json', config)
+    config_writer(save_path + '.json', config)
 
-    return full_path
+    return save_path
 
 
-def stats2_datagen(matrix_dim, matrix_type, datagen_dir):
+def stats2_datagen(matrix_dim, matrix_type, datagen_dir, config_dir):
 
-    row, col = split_rowcol(matrix_dim)
     path_name = '.'.join(['stats2', matrix_type, str(matrix_dim)])
-    full_path = join(datagen_dir, path_name)
+    datagen_write = join(datagen_dir, path_name)
+    save_path = join(config_dir, path_name)
+    row, col = split_rowcol(matrix_dim)
 
-    D = join(full_path, 'X.data')
-    Xcid = join(full_path, 'Xcid.data')
-    Ycid = join(full_path, 'Ycid.data')
-    A = join(full_path, 'A.data')
+    D = join(datagen_write, 'X.data')
+    Xcid = join(datagen_write, 'Xcid.data')
+    Ycid = join(datagen_write, 'Ycid.data')
+    A = join(datagen_write, 'A.data')
 
     config = dict(nr=row, nf=col, D=D, Xcid=Xcid, Ycid=Ycid,
                   A=A, fmt=DATA_FORMAT)
 
-    config_writer(full_path + '.json', config)
-    return full_path
+    config_writer(save_path + '.json', config)
+    return save_path
 
 
-def config_packets_datagen(algo_payload, matrix_type, matrix_shape, 
datagen_dir, dense_algos):
+def config_packets_datagen(algo_payload, matrix_type, matrix_shape, 
datagen_dir, dense_algos, config_dir):
     """
     This function has two responsibilities. Generate the configuration files 
for
     datagen algorithms and return a dictionary that will be used for execution.
@@ -228,13 +236,14 @@ def config_packets_datagen(algo_payload, matrix_type, 
matrix_shape, datagen_dir,
     dense_algos: List
     Algorithms that support only dense matrix type
 
+    config_dir: String
+    Location to store the configuration json files
+
     return: Dictionary {string: list}
     This dictionary contains algorithms to be executed as keys and the path of 
configuration
     json files to be executed list of values.
     """
-
     config_bundle = {}
-
     distinct_families = set(map(lambda x: x[1], algo_payload))
 
     # Cross Product of all configurations
@@ -249,7 +258,7 @@ def config_packets_datagen(algo_payload, matrix_type, 
matrix_shape, datagen_dir,
         config_packets[current_family] = []
         for size, type in configs:
             family_func = current_family.lower() + '_datagen'
-            conf_path = globals()[family_func](size, type, datagen_dir)
+            conf_path = globals()[family_func](size, type, datagen_dir, 
config_dir)
             config_packets[current_family].append(conf_path)
 
     return config_packets

http://git-wip-us.apache.org/repos/asf/systemml/blob/e94374af/scripts/perftest/python/google_docs/stats.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/google_docs/stats.py 
b/scripts/perftest/python/google_docs/stats.py
new file mode 100755
index 0000000..3b89abe
--- /dev/null
+++ b/scripts/perftest/python/google_docs/stats.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+import argparse
+from functools import reduce
+import pprint
+from oauth2client.service_account import ServiceAccountCredentials
+import gspread
+
+# Get time difference between different runs
+
+
+def auth(path, sheet_name):
+    """
+    Responsible for authorization
+    """
+    scope = ['https://spreadsheets.google.com/feeds']
+    creds = ServiceAccountCredentials.from_json_keyfile_name(path, scope)
+    gc = gspread.authorize(creds)
+    sheet = gc.open("Perf").worksheet(sheet_name)
+    return sheet
+
+
+def get_data(sheet, tag):
+    """
+    Get time and algorithm from the sheet
+    """
+    time = sheet.find('time_{}'.format(tag))
+    algo = sheet.find('algo_{}'.format(tag))
+
+    time_col = sheet.col_values(time.col)
+    time_col = list(filter(lambda x: len(x) > 0, time_col))
+
+    algo_col = sheet.col_values(algo.col)
+    algo_col = list(filter(lambda x: len(x) > 0, algo_col))
+    return algo_col, time_col
+
+
+def get_data_dict(data_col):
+    """
+    Return data as a dictionary with algorithm as key and a list of time values as value
+    """
+    data_dict = {}
+    all_algo = []
+    for algo, _ in data_col:
+        all_algo.append(algo)
+
+    flatten_algo = reduce(lambda x, y: x+y, all_algo)
+
+    # remove the header
+    filter_data = list(filter(lambda x: not x.startswith('algo_'), 
flatten_algo))
+    distict_algos = set(filter_data)
+
+    for algo_dist in distict_algos:
+        for algo, time in data_col:
+            for k, v in zip(algo, time):
+                if algo_dist == k:
+                    if algo_dist not in data_dict:
+                        data_dict[k] = [v]
+                    else:
+                        data_dict[k].append(v)
+    return data_dict
+
+# Example Usage
+# ./stats.py --auth client_json.json --exec-mode singlenode --tags 1.0 2.0
+if __name__ == '__main__':
+    execution_mode = ['hybrid_spark', 'singlenode']
+
+    cparser = argparse.ArgumentParser(description='System-ML Statistics 
Script')
+    cparser.add_argument('--auth', help='Location to read auth file',
+                         required=True, metavar='')
+    cparser.add_argument('--exec-mode', help='Execution mode', 
choices=execution_mode,
+                         required=True, metavar='')
+    cparser.add_argument('--tags', help='Tagging header value',
+                         required=True, nargs='+')
+
+    args = cparser.parse_args()
+    arg_dict = vars(args)
+    sheet = auth(args.auth, args.exec_mode)
+    all_data = sheet.get_all_records()
+
+    data_col = []
+    for tag in args.tags:
+        algo_col, time_col = get_data(sheet, tag)
+        data_col.append((algo_col, time_col))
+
+    data_dict = get_data_dict(data_col)
+
+    delta_algo = {}
+    for k, v in data_dict.items():
+        delta = float(v[0]) - float(v[1])
+        delta_algo[k] = delta
+
+    pprint.pprint(delta_algo, width=1)

http://git-wip-us.apache.org/repos/asf/systemml/blob/e94374af/scripts/perftest/python/google_docs/update.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/google_docs/update.py 
b/scripts/perftest/python/google_docs/update.py
new file mode 100755
index 0000000..c2fed38
--- /dev/null
+++ b/scripts/perftest/python/google_docs/update.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+import argparse
+import gspread
+from oauth2client.service_account import ServiceAccountCredentials
+import pandas as pd
+
+# Update data to google sheets
+
+
+def parse_data(file_path):
+    """
+    Skip reading 1st row : Header
+    Skip reading last row : Footer
+    """
+    csv_file = pd.read_csv(file_path, sep=',', skiprows=1, skipfooter=1, 
engine='python')
+    algo = csv_file['INFO:root:algorithm'].apply(lambda x: x.split(':')[-1])
+    key = algo + '_'+ csv_file['run_type'] + '_' + csv_file['intercept'] + '_' 
+ \
+                csv_file['matrix_type'] + '_' + csv_file['data_shape']
+    return key, csv_file['time_sec']
+
+
+def auth(path, sheet_name):
+    """
+    Responsible for authorization
+    """
+    scope = ['https://spreadsheets.google.com/feeds']
+    creds = ServiceAccountCredentials.from_json_keyfile_name(path, scope)
+    gc = gspread.authorize(creds)
+    sheet = gc.open("Perf").worksheet(sheet_name)
+    return sheet
+
+
+def insert_pair(algo, time, start_col, tag):
+    """
+    Wrapper function that calls insert_values to insert algo and time
+    """
+    insert_values(sheet, algo, start_col, 'algo_{}'.format(tag))
+    insert_values(sheet, time, start_col + 1, 'time_{}'.format(tag))
+    print('Writing Complete')
+
+
+def insert_values(sheet, key, col_num, header):
+    """
+    Insert data to google sheets based on the arguments
+    """
+    # Col Name
+    sheet.update_cell(1, col_num, header)
+    for id, val in enumerate(key):
+        sheet.update_cell(id + 2, col_num, val)
+
+
+def get_dim(sheet):
+    """
+    Get the dimensions of data
+    """
+    try:
+        col_count = sheet.get_all_records()
+    except:
+        col_count = [[]]
+    row = len(col_count)
+    col = len(col_count[0])
+    return row, col
+
+
+# Example Usage
+#  ./update.py --file ../temp/test.out --exec-mode singlenode --auth 
client_json.json --tag 3.0
+if __name__ == '__main__':
+    execution_mode = ['hybrid_spark', 'singlenode']
+
+    cparser = argparse.ArgumentParser(description='System-ML Update / Stat 
Script')
+    cparser.add_argument('--file', help='Location of the current perf test 
outputs',
+                         required=True, metavar='')
+    cparser.add_argument('--exec-mode', help='Backend Type', 
choices=execution_mode,
+                         required=True, metavar='')
+    cparser.add_argument('--auth', help='Location to read auth file',
+                         required=True, metavar='')
+    cparser.add_argument('--tag', help='Tagging header value',
+                         required=True, metavar='')
+
+    args = cparser.parse_args()
+    arg_dict = vars(args)
+
+    # Authenticate and get sheet dimensions
+    sheet = auth(args.auth, args.exec_mode)
+    row, col = get_dim(sheet)
+
+    # Read data from file and write to google docs
+    algo, time = parse_data(args.file)
+    insert_pair(algo, time, col + 1, args.tag)

http://git-wip-us.apache.org/repos/asf/systemml/blob/e94374af/scripts/perftest/python/predict.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/predict.py 
b/scripts/perftest/python/predict.py
index 92d3af4..d2e44e6 100755
--- a/scripts/perftest/python/predict.py
+++ b/scripts/perftest/python/predict.py
@@ -22,47 +22,42 @@
 
 import sys
 from os.path import join
-from utils import config_writer, relevant_folders, mat_type_check
+from utils_misc import config_writer, mat_type_check
+from utils_fs import relevant_folders
 
 # Contains configuration setting for predicting
 DATA_FORMAT = 'csv'
 
 
-def m_svm_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+def m_svm_predict(save_folder_name, datagen_dir, train_dir, predict_dir, 
config_dir):
+    save_path = join(config_dir, save_folder_name)
 
     X = join(datagen_dir, 'X_test.data')
     Y = join(datagen_dir, 'Y_test.data')
-
-    icpt = save_file_name.split('.')[-1]
+    icpt = save_folder_name.split('.')[-1]
     model = join(train_dir, 'model.data')
-    fmt = DATA_FORMAT
-
-    config = dict(X=X, Y=Y, icpt=icpt, model=model, fmt=fmt)
-
-    full_path_predict = join(predict_dir, save_file_name)
-    config_writer(full_path_predict + '.json', config)
+    config = dict(X=X, Y=Y, icpt=icpt, model=model, fmt=DATA_FORMAT)
+    config_writer(save_path + '.json', config)
 
-    return full_path_predict
+    return save_path
 
 
-def l2_svm_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+def l2_svm_predict(save_folder_name, datagen_dir, train_dir, predict_dir, 
config_dir):
+    save_path = join(config_dir, save_folder_name)
 
     X = join(datagen_dir, 'X_test.data')
     Y = join(datagen_dir, 'Y_test.data')
-
-    icpt = save_file_name.split('.')[-1]
+    icpt = save_folder_name.split('.')[-1]
     model = join(train_dir, 'model.data')
-    fmt = DATA_FORMAT
-
-    config = dict(X=X, Y=Y, icpt=icpt, model=model, fmt=fmt)
+    config = dict(X=X, Y=Y, icpt=icpt, model=model, fmt=DATA_FORMAT)
+    config_writer(save_path + '.json', config)
 
-    full_path_predict = join(predict_dir, save_file_name)
-    config_writer(full_path_predict + '.json', config)
+    return save_path
 
-    return full_path_predict
 
+def multilogreg_predict(save_folder_name, datagen_dir, train_dir, predict_dir, 
config_dir):
+    save_path = join(config_dir, save_folder_name)
 
-def multilogreg_predict(save_file_name, datagen_dir, train_dir, predict_dir):
     X = join(datagen_dir, 'X_test.data')
     Y = join(datagen_dir, 'Y_test.data')
     B = join(train_dir, 'B.data')
@@ -70,156 +65,147 @@ def multilogreg_predict(save_file_name, datagen_dir, 
train_dir, predict_dir):
     dfam = '3'
     vpow = '-1'
     link = '2'
-    fmt = DATA_FORMAT
 
-    config = dict(dfam=dfam, vpow=vpow, link=link, fmt=fmt, X=X, B=B, Y=Y, M=M)
+    config = dict(dfam=dfam, vpow=vpow, link=link, fmt=DATA_FORMAT, X=X, B=B, 
Y=Y, M=M)
 
-    full_path_predict = join(predict_dir, save_file_name)
-    config_writer(full_path_predict + '.json', config)
+    config_writer(save_path + '.json', config)
 
-    return full_path_predict
+    return save_path
 
 
-def naive_bayes_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+def naive_bayes_predict(save_folder_name, datagen_dir, train_dir, predict_dir, 
config_dir):
+    save_path = join(config_dir, save_folder_name)
 
     X = join(datagen_dir, 'X_test.data')
     Y = join(datagen_dir, 'Y_test.data')
-
     prior = join(train_dir, 'prior')
     conditionals = join(train_dir, 'conditionals')
-    fmt = DATA_FORMAT
     probabilities = join(train_dir, 'probabilities')
-    config = dict(X=X, Y=Y, prior=prior, conditionals=conditionals, fmt=fmt, 
probabilities=probabilities)
+    config = dict(X=X, Y=Y, prior=prior, conditionals=conditionals, 
fmt=DATA_FORMAT,
+                  probabilities=probabilities)
+    config_writer(save_path + '.json', config)
 
-    full_path_predict = join(predict_dir, save_file_name)
-    config_writer(full_path_predict + '.json', config)
+    return save_path
 
-    return full_path_predict
 
+def kmeans_predict(save_folder_name, datagen_dir, train_dir, predict_dir, 
config_dir):
 
-def kmeans_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+    save_path = join(config_dir, save_folder_name)
+    predict_write = join(predict_dir, save_folder_name)
 
     X = join(datagen_dir, 'X_test.data')
     C = join(datagen_dir, 'C.data')
 
-    full_path_predict = join(predict_dir, save_file_name)
-    prY = join(full_path_predict, 'prY.data')
+    prY = join(predict_write, 'prY.data')
 
     config = dict(X=X, C=C, prY=prY)
-    config_writer(full_path_predict + '.json', config)
+    config_writer(save_path + '.json', config)
 
-    return full_path_predict
+    return save_path
 
 
-def linearregcg_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+def linearregcg_predict(save_folder_name, datagen_dir, train_dir, predict_dir, 
config_dir):
+
+    save_path = join(config_dir, save_folder_name)
+    predict_write = join(predict_dir, save_folder_name)
 
     dfam = '1'
     link = '1'
     vpow = '0.0'
     lpow = '1.0'
-
     X = join(datagen_dir, 'X_test.data')
     B = join(train_dir, 'B.data')
     Y = join(datagen_dir, 'Y_test.data')
-
-    full_path_predict = join(predict_dir, save_file_name)
-    M = join(full_path_predict, 'M.data')
-    O = join(full_path_predict, 'O.data')
-
+    M = join(predict_write, 'M.data')
+    O = join(predict_write, 'O.data')
     config = dict(dfam=dfam, link=link, vpow=vpow, lpow=lpow, fmt=DATA_FORMAT, 
X=X,
                   B=B, Y=Y, M=M, O=O)
-    config_writer(full_path_predict + '.json', config)
+    config_writer(save_path + '.json', config)
 
-    return full_path_predict
+    return save_path
 
 
-def linearregds_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+def linearregds_predict(save_folder_name, datagen_dir, train_dir, predict_dir, 
config_dir):
+    save_path = join(config_dir, save_folder_name)
+    predict_write = join(predict_dir, save_folder_name)
 
     dfam = '1'
     link = '1'
     vpow = '0.0'
     lpow = '1.0'
-
     X = join(datagen_dir, 'X_test.data')
     B = join(train_dir, 'B.data')
     Y = join(datagen_dir, 'Y_test.data')
-
-    full_path_predict = join(predict_dir, save_file_name)
-    M = join(full_path_predict, 'M.data')
-    O = join(full_path_predict, 'O.data')
-
+    M = join(predict_write, 'M.data')
+    O = join(predict_write, 'O.data')
     config = dict(dfam=dfam, link=link, vpow=vpow, lpow=lpow, fmt=DATA_FORMAT, 
X=X,
                   B=B, Y=Y, M=M, O=O)
-    config_writer(full_path_predict + '.json', config)
+    config_writer(save_path + '.json', config)
 
-    return full_path_predict
+    return save_path
 
 
-def glm_poisson_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+def glm_poisson_predict(save_folder_name, datagen_dir, train_dir, predict_dir, 
config_dir):
+    save_path = join(config_dir, save_folder_name)
+    predict_write = join(predict_dir, save_folder_name)
 
     dfam = '1'
     link = '1'
     vpow = '1'
     lpow = '1.0'
-
     X = join(datagen_dir, 'X_test.data')
     B = join(train_dir, 'B.data')
     Y = join(datagen_dir, 'Y_test.data')
-
-    full_path_predict = join(predict_dir, save_file_name)
-    M = join(full_path_predict, 'M.data')
-    O = join(full_path_predict, 'O.data')
+    M = join(predict_write, 'M.data')
+    O = join(predict_write, 'O.data')
 
     config = dict(dfam=dfam, link=link, vpow=vpow, lpow=lpow, fmt=DATA_FORMAT, 
X=X,
                   B=B, Y=Y, M=M, O=O)
-    config_writer(full_path_predict + '.json', config)
+    config_writer(save_path + '.json', config)
 
-    return full_path_predict
+    return save_path
 
 
-def glm_binomial_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+def glm_binomial_predict(save_folder_name, datagen_dir, train_dir, 
predict_dir, config_dir):
+    save_path = join(config_dir, save_folder_name)
+    predict_write = join(predict_dir, save_folder_name)
 
     dfam = '2'
     link = '3'
-
     X = join(datagen_dir, 'X_test.data')
     B = join(train_dir, 'B.data')
     Y = join(datagen_dir, 'Y_test.data')
-
-    full_path_predict = join(predict_dir, save_file_name)
-    M = join(full_path_predict, 'M.data')
-    O = join(full_path_predict, 'O.data')
+    M = join(predict_write, 'M.data')
+    O = join(predict_write, 'O.data')
 
     config = dict(dfam=dfam, link=link, fmt=DATA_FORMAT, X=X,
                   B=B, Y=Y, M=M, O=O)
-    config_writer(full_path_predict + '.json', config)
+    config_writer(save_path + '.json', config)
 
-    return full_path_predict
+    return save_path
 
 
-def glm_gamma_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+def glm_gamma_predict(save_folder_name, datagen_dir, train_dir, predict_dir, 
config_dir):
+    save_path = join(config_dir, save_folder_name)
+    predict_write = join(predict_dir, save_folder_name)
 
     dfam = '1'
     link = '1'
     vpow = '2'
     lpow = '0'
-
     X = join(datagen_dir, 'X_test.data')
     B = join(train_dir, 'B.data')
     Y = join(datagen_dir, 'Y_test.data')
-
-    full_path_predict = join(predict_dir, save_file_name)
-    M = join(full_path_predict, 'M.data')
-    O = join(full_path_predict, 'O.data')
-
+    M = join(predict_write, 'M.data')
+    O = join(predict_write, 'O.data')
     config = dict(dfam=dfam, link=link, vpow=vpow, lpow=lpow, fmt=DATA_FORMAT, 
X=X,
                   B=B, Y=Y, M=M, O=O)
-    config_writer(full_path_predict + '.json', config)
+    config_writer(save_path + '.json', config)
 
-    return full_path_predict
+    return save_path
 
 
-def config_packets_predict(algo_payload, matrix_type, matrix_shape, 
datagen_dir, train_dir, predict_dir, dense_algos):
+def config_packets_predict(algo_payload, matrix_type, matrix_shape, 
datagen_dir, train_dir, predict_dir, dense_algos, config_dir):
     """
     This function has two responsibilities. Generate the configuration files 
for
     prediction algorithms and return a dictionary that will be used for 
execution.
@@ -246,6 +232,9 @@ def config_packets_predict(algo_payload, matrix_type, matrix_shape, datagen_dir,
     dense_algos: List
     Algorithms that support only dense matrix type
 
+    config_dir: String
+    Location to store the configuration json file
+
     return: Dictionary  {string: list}
     This dictionary contains algorithms to be executed as keys and the path of configuration
     json files to be executed list of values.
@@ -267,6 +256,7 @@ def config_packets_predict(algo_payload, matrix_type, matrix_shape, datagen_dir,
         for current_train_folder in train_folders:
             current_data_gen_dir = relevant_folders(datagen_dir, current_algo, current_family,
                                                     current_matrix_type, matrix_shape, 'data-gen')
+
             if len(current_data_gen_dir) == 0:
                 print('data-gen folders not present for {}'.format(current_family))
                 sys.exit()
@@ -276,7 +266,7 @@ def config_packets_predict(algo_payload, matrix_type, matrix_shape, datagen_dir,
 
             # current_data_gen_dir has index 0 as we would expect one datagen for each algorithm
             conf_path = globals()[algo_func](save_name, current_data_gen_dir[0],
-                                             current_train_folder, predict_dir)
+                                             current_train_folder, predict_dir, config_dir)
 
             config_bundle[current_algo].append(conf_path)
 

http://git-wip-us.apache.org/repos/asf/systemml/blob/e94374af/scripts/perftest/python/run_perftest.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/run_perftest.py b/scripts/perftest/python/run_perftest.py
index 3360285..a15d7e6 100755
--- a/scripts/perftest/python/run_perftest.py
+++ b/scripts/perftest/python/run_perftest.py
@@ -31,9 +31,10 @@ from datetime import datetime
 from datagen import config_packets_datagen
 from train import config_packets_train
 from predict import config_packets_predict
-from utils import get_families, config_reader, create_dir, get_existence, \
-    exec_dml_and_parse_time, exec_test_data, check_predict, get_folder_metrics
-
+from utils_misc import get_families, config_reader, \
+    exec_dml_and_parse_time, exec_test_data, check_predict, get_folder_metrics, args_dict_split, \
+    get_config_args
+from utils_fs import create_dir_local, write_success, check_SUCCESS_file_exists
 
 # A packet is a dictionary
 # with key as the algorithm
@@ -83,32 +84,35 @@ ML_PREDICT = {'Kmeans': 'Kmeans-predict',
 
 DENSE_TYPE_ALGOS = ['clustering', 'stats1', 'stats2']
 
+sup_args_dict = {}
+
 
 # Responsible for execution and metric logging
-def algorithm_workflow(algo, exec_type, config_path, dml_file_name, action_mode):
+def algorithm_workflow(algo, exec_type, config_path, dml_file_name, action_mode, current_dir):
     """
     This function is responsible for overall workflow. This does the following actions
     Check if the input is key value argument or list of positional args
     Execution and time
     Logging Metrics
 
-
-    algo : String
+    algo: String
     Input algorithm specified
 
-    exec_type : String
+    exec_type: String
     Contains the execution type singlenode / hybrid_spark
 
-    config_path : String
+    config_path: String
     Path to read the json file from
 
-    dml_file_name : String
+    dml_file_name: String
     DML file name to be used while processing the arguments give
 
-    action_mode : String
+    action_mode: String
     Type of action data-gen, train ...
-    """
 
+    current_dir: String
+    Current location of hdfs / local temp being processed
+    """
     config_data = config_reader(config_path + '.json')
 
     if isinstance(config_data, dict):
@@ -122,26 +126,24 @@ def algorithm_workflow(algo, exec_type, config_path, dml_file_name, action_mode)
     config_file_name = config_path.split('/')[-1]
     mat_type, mat_shape, intercept = get_folder_metrics(config_file_name, action_mode)
 
-    exit_flag_success = get_existence(config_path, action_mode)
+    temp_cwd = join(current_dir, config_file_name)
+
+    # temp_dir_exist
+    exit_flag_success = check_SUCCESS_file_exists(temp_cwd)
 
     if exit_flag_success:
-        print('data already exists {}'.format(config_path))
         time = 'data_exists'
     else:
-        time = exec_dml_and_parse_time(exec_type, dml_file_name, config_file_name, args)
-
-    # Write a _SUCCESS file only if time is found and in data-gen action_mode
-    if len(time.split('.')) == 2 and action_mode == 'data-gen':
-        full_path = join(config_path, '_SUCCESS')
-        open(full_path, 'w').close()
+        time = exec_dml_and_parse_time(exec_type, dml_file_name, args, spark_args_dict, sup_args_dict)
+        write_success(time, temp_cwd)
 
     print('{},{},{},{},{},{}'.format(algo, action_mode, intercept, mat_type, mat_shape, time))
     current_metrics = [algo, action_mode, intercept, mat_type, mat_shape, time]
     logging.info(','.join(current_metrics))
+    return exit_flag_success
 
 
-# Perf test entry point
-def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, temp_dir, mode):
+def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, config_dir, mode, temp_dir):
     """
     This function is the entry point for performance testing
 
@@ -160,13 +162,15 @@ def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, temp_dir, mode
     mat_shape: List
     Dimensions of the input matrix with rows and columns
 
-    temp_dir: String
-    Location to store all files created during perf test
+    config_dir: String
+    Location to store all configuration
 
     mode: List
     Type of workload to run. data-gen, train ...
-    """
 
+    temp_dir: String
+    Location to store all output files created during perf test
+    """
     # algos to run is a list of tuples with
     # [(m-svm, binomial), (m-svm, multinomial)...]
     # Basic block for execution of scripts
@@ -202,48 +206,64 @@ def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, temp_dir, mode
                 algos_to_run.append((current_algo, current_family))
 
     if 'data-gen' in mode:
+        # Create config directories
+        data_gen_config_dir = join(config_dir, 'data-gen')
+        create_dir_local(data_gen_config_dir)
+
+        # Create output path
         data_gen_dir = join(temp_dir, 'data-gen')
-        create_dir(data_gen_dir)
         conf_packet = config_packets_datagen(algos_to_run, mat_type, mat_shape, data_gen_dir,
-                                             DENSE_TYPE_ALGOS)
+                                             DENSE_TYPE_ALGOS, data_gen_config_dir)
+
         for family_name, config_folders in conf_packet.items():
             for config in config_folders:
                 file_name = ML_GENDATA[family_name]
-                algorithm_workflow(family_name, exec_type, config, file_name, 'data-gen')
+                success_file = algorithm_workflow(family_name, exec_type, config, file_name, 'data-gen', data_gen_dir)
                 # Statistic family do not require to be split
                 if family_name not in ['stats1', 'stats2']:
-                    exec_test_data(exec_type, config)
+                    if not success_file:
+                        exec_test_data(exec_type, spark_args_dict, sup_args_dict, data_gen_dir, config)
 
     if 'train' in mode:
+        # Create config directories
+        train_config_dir = join(config_dir, 'train')
+        create_dir_local(train_config_dir)
+
+        # Create output path
         data_gen_dir = join(temp_dir, 'data-gen')
         train_dir = join(temp_dir, 'train')
-        create_dir(train_dir)
+
         conf_packet = config_packets_train(algos_to_run, mat_type, mat_shape, data_gen_dir,
-                                           train_dir, DENSE_TYPE_ALGOS)
+                                           train_dir, DENSE_TYPE_ALGOS, train_config_dir)
         for algo_name, config_files in conf_packet.items():
             for config in config_files:
                 file_name = ML_TRAIN[algo_name]
-                algorithm_workflow(algo_name, exec_type, config, file_name, 'train')
+                algorithm_workflow(algo_name, exec_type, config, file_name, 'train', train_dir)
 
     if 'predict' in mode:
+        # Create config directories
+        predict_config_dir = join(config_dir, 'predict')
+        create_dir_local(predict_config_dir)
+
+        # Create output path
         data_gen_dir = join(temp_dir, 'data-gen')
         train_dir = join(temp_dir, 'train')
         predict_dir = join(temp_dir, 'predict')
-        create_dir(predict_dir)
-        algos_to_run_perdict = list(filter(lambda algo: check_predict(algo[0], ML_PREDICT), algos_to_run))
-        if len(algos_to_run_perdict) < 1:
+
+        algos_to_run = list(filter(lambda algo: check_predict(algo[0], ML_PREDICT), algos_to_run))
+        if len(algos_to_run) < 1:
             # No algorithms with predict found
             pass
-        conf_packet = config_packets_predict(algos_to_run_perdict, mat_type, mat_shape, data_gen_dir,
-                                             train_dir, predict_dir, DENSE_TYPE_ALGOS)
-
+        conf_packet = config_packets_predict(algos_to_run, mat_type, mat_shape, data_gen_dir,
+                                             train_dir, predict_dir, DENSE_TYPE_ALGOS,
+                                             predict_config_dir)
         for algo_name, config_files in conf_packet.items():
                 for config in config_files:
                     file_name = ML_PREDICT[algo_name]
-                    algorithm_workflow(algo_name, exec_type, config, file_name, 'predict')
+                    algorithm_workflow(algo_name, exec_type, config, file_name, 'predict', predict_dir)
 
-if __name__ == '__main__':
 
+if __name__ == '__main__':
     # sys ml env set and error handling
     systemml_home = os.environ.get('SYSTEMML_HOME')
     if systemml_home is None:
@@ -259,7 +279,6 @@ if __name__ == '__main__':
 
     # Default temp directory, contains everything generated in perftest
     default_temp_dir = join(systemml_home, 'scripts', 'perftest', 'temp')
-    create_dir(default_temp_dir)
 
     # Initialize time
     start_time = time.time()
@@ -274,7 +293,8 @@ if __name__ == '__main__':
     all_families = ML_ALGO.keys()
 
     # Argparse Module
-    cparser = argparse.ArgumentParser(description='SystemML Performance Test Script')
+    cparser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+                                      description='SystemML Performance Test Script')
     cparser.add_argument('--family', help='space separated list of classes of 
algorithms '
                          '(available : ' + ', '.join(sorted(all_families)) + 
')',
                          metavar='', choices=all_families, nargs='+')
@@ -290,17 +310,43 @@ if __name__ == '__main__':
                          nargs='+')
     cparser.add_argument('--mat-shape', default=default_mat_shape, help='space 
separated list of shapes of matrices '
                          'to generate (e.g 10k_1k, 20M_4k)', metavar='', 
nargs='+')
-    cparser.add_argument('--temp-dir', default=default_temp_dir, 
help='temporary directory '
+
+    cparser.add_argument('--config-dir', default=default_temp_dir, 
help='temporary directory '
                          'where generated, training and prediction data is 
put', metavar='')
     cparser.add_argument('--filename', default='perf_test', help='name of the 
output file for the perf'
                          ' metrics', metavar='')
     cparser.add_argument('--mode', default=workload,
                          help='space separated list of types of workloads to 
run (available: data-gen, train, predict)',
                          metavar='', choices=workload, nargs='+')
+    # Change this to temp-dir
+    cparser.add_argument('--temp-dir', default=default_temp_dir,
+                         help='define the file system to work on', metavar='')
+
+    # Configuration Options
+    cparser.add_argument('-stats', help='Monitor and report 
caching/recompilation statistics, '
+                                        'heavy hitter <count> is 10 unless 
overridden', nargs='?', const='10',
+                         metavar='')
+    cparser.add_argument('-explain', help='explains plan levels can be hops, 
runtime, '
+                                          'recompile_hops, recompile_runtime', 
nargs='?', const='runtime', metavar='')
+    cparser.add_argument('-config', help='System-ML configuration file (e.g 
SystemML-config.xml)', metavar='')
+
+    # Spark Configuration Option
+    cparser.add_argument('--master', help='local, yarn-client, yarn-cluster', 
metavar='')
+    cparser.add_argument('--driver-memory', help='Memory for driver (e.g. 
512M)', metavar='')
+    cparser.add_argument('--num-executors', help='Number of executors to 
launch', metavar='')
+    cparser.add_argument('--executor-memory', help='Memory per executor', 
metavar='')
+    cparser.add_argument('--executor-cores', help='Number of cores', 
metavar='')
+    cparser.add_argument('--conf', help='Spark configuration file', nargs='+', 
metavar='')
 
     # Args is a namespace
     args = cparser.parse_args()
-    arg_dict = vars(args)
+    all_arg_dict = vars(args)
+    arg_dict, config_dict, spark_dict = args_dict_split(all_arg_dict)
+
+    create_dir_local(args.config_dir)
+
+    # Global variables
+    sup_args_dict, spark_args_dict = get_config_args(config_dict, spark_dict, args.exec_type)
 
     # Debug arguments
     # print(arg_dict)
@@ -344,13 +390,12 @@ if __name__ == '__main__':
     # Set level to 0 -> debug mode
     # Set level to 20 -> Plain metrics
     log_filename = args.filename + '_' + args.exec_type + '.out'
-    logging.basicConfig(filename=join(default_temp_dir, log_filename), level=20)
+    logging.basicConfig(filename=join(args.config_dir, log_filename), level=20)
     logging.info('New performance test started at {}'.format(time_now))
     
logging.info('algorithm,run_type,intercept,matrix_type,data_shape,time_sec')
 
     # Remove filename item from dictionary as its already used to create the log above
     del arg_dict['filename']
-
     perf_test_entry(**arg_dict)
 
     total_time = (time.time() - start_time)

[2/2] systemml git commit: [SYSTEMML-1451] phase 2 work

Reply via email to