Repository: systemml
Updated Branches:
  refs/heads/master 31952e47d -> e7cfcadc9


[SYSTEMML-1451][GSoC Phase 1] Single script to run perf tests

- Single entry point to run perf tests in any combination of algorithms,
  families, matrix shapes & densities
- Reports time taken by a single perf test by parsing the output and
  grep-ing for the time
- Detects tests that did not run and reports in the generated log
- Robust error handling and reporting, informative help message
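- Example invocation (assumes SYSTEMML_HOME is set; see the help message
  of run_perftest.py for the full flag list):
    ./scripts/perftest/python/run_perftest.py --algo Kmeans \
        --mat-shape 10k_100 --mat-type dense --mode data-gen train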

Closes #537


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/e7cfcadc
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/e7cfcadc
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/e7cfcadc

Branch: refs/heads/master
Commit: e7cfcadc9b0e72637c67c8d6a6dcc62f62ba5177
Parents: 31952e4
Author: krishnakalyan3 <krishnakaly...@gmail.com>
Authored: Sun Jul 2 00:00:49 2017 -0700
Committer: Nakul Jindal <naku...@gmail.com>
Committed: Sun Jul 2 00:00:49 2017 -0700

----------------------------------------------------------------------
 scripts/perftest/python/datagen.py      | 252 ++++++++++++++++
 scripts/perftest/python/predict.py      | 285 +++++++++++++++++++
 scripts/perftest/python/run_perftest.py | 339 ++++++++++++++++++++++
 scripts/perftest/python/train.py        | 411 +++++++++++++++++++++++++++
 scripts/perftest/python/utils.py        | 296 +++++++++++++++++++
 5 files changed, 1583 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/e7cfcadc/scripts/perftest/python/datagen.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/datagen.py b/scripts/perftest/python/datagen.py
new file mode 100755
index 0000000..d9c49e9
--- /dev/null
+++ b/scripts/perftest/python/datagen.py
@@ -0,0 +1,252 @@
+#!/usr/bin/env python3
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+import itertools
+from os.path import join
+from utils import split_rowcol, config_writer
+
+# This file contains configuration settings for data generation
+DATA_FORMAT = 'csv'
+
+MATRIX_TYPE_DICT = {'dense': '0.9',
+                    'sparse': '0.01'}
+
+FAMILY_NO_MATRIX_TYPE = ['clustering', 'stats1', 'stats2']
+
+
+def multinomial_datagen(matrix_dim, matrix_type, datagen_dir):
+
+    row, col = split_rowcol(matrix_dim)
+    path_name = '.'.join(['multinomial', matrix_type, str(matrix_dim)])
+    full_path = join(datagen_dir, path_name)
+
+    numSamples = row
+    numFeatures = col
+    sparsity = MATRIX_TYPE_DICT[matrix_type]
+    num_categories = '150'
+    intercept = '0'
+    X = join(full_path, 'X.data')
+    Y = join(full_path, 'Y.data')
+    fmt = DATA_FORMAT
+
+    config = [numSamples, numFeatures, sparsity, num_categories, intercept,
+              X, Y, fmt, '1']
+
+    config_writer(full_path + '.json', config)
+
+    return full_path
+
+
+def binomial_datagen(matrix_dim, matrix_type, datagen_dir):
+
+    row, col = split_rowcol(matrix_dim)
+    path_name = '.'.join(['binomial', matrix_type, str(matrix_dim)])
+    full_path = join(datagen_dir, path_name)
+
+    numSamples = row
+    numFeatures = col
+    maxFeatureValue = '5'
+    maxWeight = '5'
+    loc_weights = join(full_path, 'weight.data')
+    loc_data = join(full_path, 'X.data')
+    loc_labels = join(full_path, 'Y.data')
+    noise = '1'
+    intercept = '0'
+    sparsity = MATRIX_TYPE_DICT[matrix_type]
+    transform_labels = '1'
+    fmt = DATA_FORMAT
+
+    config = [numSamples, numFeatures, maxFeatureValue, maxWeight, loc_weights, loc_data,
+              loc_labels, noise, intercept, sparsity, fmt, transform_labels]
+    config_writer(full_path + '.json', config)
+
+    return full_path
+
+
+def regression1_datagen(matrix_dim, matrix_type, datagen_dir):
+
+    row, col = split_rowcol(matrix_dim)
+    path_name = '.'.join(['regression1', matrix_type, str(matrix_dim)])
+    full_path = join(datagen_dir, path_name)
+
+    numSamples = row
+    numFeatures = col
+    maxFeatureValue = '5'
+    maxWeight = '5'
+    loc_weights = join(full_path, 'weight.data')
+    loc_data = join(full_path, 'X.data')
+    loc_labels = join(full_path, 'Y.data')
+    noise = '1'
+    intercept = '0'
+    sparsity = MATRIX_TYPE_DICT[matrix_type]
+    transform_labels = '1'
+    fmt = DATA_FORMAT
+
+    config = [numSamples, numFeatures, maxFeatureValue, maxWeight, loc_weights, loc_data,
+              loc_labels, noise, intercept, sparsity, fmt, transform_labels]
+    config_writer(full_path + '.json', config)
+
+    return full_path
+
+
+def regression2_datagen(matrix_dim, matrix_type, datagen_dir):
+
+    row, col = split_rowcol(matrix_dim)
+    path_name = '.'.join(['regression2', matrix_type, str(matrix_dim)])
+    full_path = join(datagen_dir, path_name)
+
+    numSamples = row
+    numFeatures = col
+    maxFeatureValue = '5'
+    maxWeight = '5'
+    loc_weights = join(full_path, 'weight.data')
+    loc_data = join(full_path, 'X.data')
+    loc_labels = join(full_path, 'Y.data')
+    noise = '1'
+    intercept = '0'
+    sparsity = MATRIX_TYPE_DICT[matrix_type]
+    transform_labels = '1'
+    fmt = DATA_FORMAT
+
+    config = [numSamples, numFeatures, maxFeatureValue, maxWeight, loc_weights, loc_data,
+              loc_labels, noise, intercept, sparsity, fmt, transform_labels]
+    config_writer(full_path + '.json', config)
+
+    return full_path
+
+
+def clustering_datagen(matrix_dim, matrix_type, datagen_dir):
+
+    row, col = split_rowcol(matrix_dim)
+    path_name = '.'.join(['clustering', matrix_type, str(matrix_dim)])
+
+    full_path = join(datagen_dir, path_name)
+    X = join(full_path, 'X.data')
+    Y = join(full_path, 'Y.data')
+    YbyC = join(full_path, 'YbyC.data')
+    C = join(full_path, 'C.data')
+    nc = '50'
+    dc = '10.0'
+    dr = '1.0'
+    fbf = '100.0'
+    cbf = '100.0'
+
+    config = dict(nr=row, nf=col, nc=nc, dc=dc, dr=dr, fbf=fbf, cbf=cbf, X=X, C=C, Y=Y,
+                  YbyC=YbyC, fmt=DATA_FORMAT)
+
+    config_writer(full_path + '.json', config)
+    return full_path
+
+
+def stats1_datagen(matrix_dim, matrix_type, datagen_dir):
+
+    row, col = split_rowcol(matrix_dim)
+    path_name = '.'.join(['stats1', matrix_type, str(matrix_dim)])
+    full_path = join(datagen_dir, path_name)
+
+    DATA = join(full_path, 'X.data')
+    TYPES = join(full_path, 'types')
+    TYPES1 = join(full_path, 'set1.types')
+    TYPES2 = join(full_path, 'set2.types')
+    INDEX1 = join(full_path, 'set1.indices')
+    INDEX2 = join(full_path, 'set2.indices')
+    MAXDOMAIN = '1100'
+    SETSIZE = '20'
+    LABELSETSIZE = '10'
+
+    # NC should be less than C and more than num0
+    # NC = 10 (old value)
+    # num0 = NC/2
+    # num0 < NC < C
+    # NC = C/2
+    NC = int(int(col)/2)
+
+    config = dict(R=row, C=col, NC=NC, MAXDOMAIN=MAXDOMAIN, DATA=DATA, TYPES=TYPES, SETSIZE=SETSIZE,
+                  LABELSETSIZE=LABELSETSIZE, TYPES1=TYPES1, TYPES2=TYPES2, INDEX1=INDEX1, INDEX2=INDEX2,
+                  fmt=DATA_FORMAT)
+
+    config_writer(full_path + '.json', config)
+
+    return full_path
+
+
+def stats2_datagen(matrix_dim, matrix_type, datagen_dir):
+
+    row, col = split_rowcol(matrix_dim)
+    path_name = '.'.join(['stats2', matrix_type, str(matrix_dim)])
+    full_path = join(datagen_dir, path_name)
+
+    D = join(full_path, 'X.data')
+    Xcid = join(full_path, 'Xcid.data')
+    Ycid = join(full_path, 'Ycid.data')
+    A = join(full_path, 'A.data')
+
+    config = dict(nr=row, nf=col, D=D, Xcid=Xcid, Ycid=Ycid,
+                  A=A, fmt=DATA_FORMAT)
+
+    config_writer(full_path + '.json', config)
+    return full_path
+
+
+def config_packets_datagen(algo_payload, matrix_type, matrix_shape, datagen_dir):
+    """
+    This function has two responsibilities. Generate the configuration files 
for
+    datagen algorithms and return a dictionary that will be used for execution.
+
+    algo_payload : List of tuples
+    The first tuple index contains algorithm name and the second index contains
+    family type.
+
+    matrix_type: String
+    Type of matrix to generate e.g dense or sparse
+
+    matrix_shape: String
+    Shape of matrix to generate e.g 100k_10
+
+    return: Dictionary {string: list}
+    This dictionary contains algorithms to be executed as keys and the path of 
configuration
+    json files to be executed list of values.
+    """
+
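+    # Example return value (one entry per family), e.g.:
+    # {'binomial': ['<datagen_dir>/binomial.dense.10k_100',
+    #               '<datagen_dir>/binomial.sparse.10k_100']}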
+    config_bundle = {}
+
+    distinct_families = set(map(lambda x: x[1], algo_payload))
+
+    # Cross Product of all configurations
+    for current_family in distinct_families:
+        if current_family in FAMILY_NO_MATRIX_TYPE:
+            config = list(itertools.product(matrix_shape, ['dense']))
+            config_bundle[current_family] = config
+        else:
+            config = list(itertools.product(matrix_shape, matrix_type))
+            # clustering : [[10k_1, dense], [10k_2, dense], ...]
+            config_bundle[current_family] = config
+
+    config_packets = {}
+    for current_family, configs in config_bundle.items():
+        config_packets[current_family] = []
+        for size, type in configs:
+            family_func = current_family.lower() + '_datagen'
+            conf_path = globals()[family_func](size, type, datagen_dir)
+            config_packets[current_family].append(conf_path)
+
+    return config_packets

http://git-wip-us.apache.org/repos/asf/systemml/blob/e7cfcadc/scripts/perftest/python/predict.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/predict.py b/scripts/perftest/python/predict.py
new file mode 100755
index 0000000..bc034da
--- /dev/null
+++ b/scripts/perftest/python/predict.py
@@ -0,0 +1,285 @@
+#!/usr/bin/env python3
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+import sys
+import os
+from os.path import join
+import glob
+from utils import create_dir, config_writer
+
+# Contains configuration settings for prediction
+DATA_FORMAT = 'csv'
+
+
+def m_svm_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+
+    X = join(datagen_dir, 'X_test.data')
+    Y = join(datagen_dir, 'Y_test.data')
+
+    icpt = save_file_name.split('.')[-1]
+    model = join(train_dir, 'model.data')
+    fmt = DATA_FORMAT
+
+    config = dict(X=X, Y=Y, icpt=icpt, model=model, fmt=fmt)
+
+    full_path_predict = join(predict_dir, save_file_name)
+    config_writer(full_path_predict + '.json', config)
+
+    return full_path_predict
+
+
+def l2_svm_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+
+    X = join(datagen_dir, 'X_test.data')
+    Y = join(datagen_dir, 'Y_test.data')
+
+    icpt = save_file_name.split('.')[-1]
+    model = join(train_dir, 'model.data')
+    fmt = DATA_FORMAT
+
+    config = dict(X=X, Y=Y, icpt=icpt, model=model, fmt=fmt)
+
+    full_path_predict = join(predict_dir, save_file_name)
+    config_writer(full_path_predict + '.json', config)
+
+    return full_path_predict
+
+
+def multilogreg_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+    X = join(datagen_dir, 'X_test.data')
+    Y = join(datagen_dir, 'Y_test.data')
+    B = join(train_dir, 'B.data')
+    M = join(train_dir, 'M.data')
+    dfam = '3'
+    vpow = '-1'
+    link = '2'
+    fmt = DATA_FORMAT
+
+    config = dict(dfam=dfam, vpow=vpow, link=link, fmt=fmt, X=X, B=B, Y=Y, M=M)
+
+    full_path_predict = join(predict_dir, save_file_name)
+    config_writer(full_path_predict + '.json', config)
+
+    return full_path_predict
+
+
+def naive_bayes_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+
+    X = join(datagen_dir, 'X_test.data')
+    Y = join(datagen_dir, 'Y_test.data')
+
+    prior = join(train_dir, 'prior')
+    conditionals = join(train_dir, 'conditionals')
+    fmt = DATA_FORMAT
+    probabilities = join(train_dir, 'probabilities')
+    config = dict(X=X, Y=Y, prior=prior, conditionals=conditionals, fmt=fmt, probabilities=probabilities)
+
+    full_path_predict = join(predict_dir, save_file_name)
+    config_writer(full_path_predict + '.json', config)
+
+    return full_path_predict
+
+
+def kmeans_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+
+    X = join(datagen_dir, 'X_test.data')
+    C = join(datagen_dir, 'C.data')
+
+    full_path_predict = join(predict_dir, save_file_name)
+    prY = join(full_path_predict, 'prY.data')
+
+    config = dict(X=X, C=C, prY=prY)
+    config_writer(full_path_predict + '.json', config)
+
+    return full_path_predict
+
+
+def linearregcg_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+
+    dfam = '1'
+    link = '1'
+    vpow = '0.0'
+    lpow = '1.0'
+
+    X = join(datagen_dir, 'X_test.data')
+    B = join(train_dir, 'B.data')
+    Y = join(datagen_dir, 'Y_test.data')
+
+    full_path_predict = join(predict_dir, save_file_name)
+    M = join(full_path_predict, 'M.data')
+    O = join(full_path_predict, 'O.data')
+
+    config = dict(dfam=dfam, link=link, vpow=vpow, lpow=lpow, fmt=DATA_FORMAT, X=X,
+                  B=B, Y=Y, M=M, O=O)
+    config_writer(full_path_predict + '.json', config)
+
+    return full_path_predict
+
+
+def linearregds_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+
+    dfam = '1'
+    link = '1'
+    vpow = '0.0'
+    lpow = '1.0'
+
+    X = join(datagen_dir, 'X_test.data')
+    B = join(train_dir, 'B.data')
+    Y = join(datagen_dir, 'Y_test.data')
+
+    full_path_predict = join(predict_dir, save_file_name)
+    M = join(full_path_predict, 'M.data')
+    O = join(full_path_predict, 'O.data')
+
+    config = dict(dfam=dfam, link=link, vpow=vpow, lpow=lpow, fmt=DATA_FORMAT, X=X,
+                  B=B, Y=Y, M=M, O=O)
+    config_writer(full_path_predict + '.json', config)
+
+    return full_path_predict
+
+
+def glm_poisson_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+
+    dfam = '1'
+    link = '1'
+    vpow = '1'
+    lpow = '1.0'
+
+    X = join(datagen_dir, 'X_test.data')
+    B = join(train_dir, 'B.data')
+    Y = join(datagen_dir, 'Y_test.data')
+
+    full_path_predict = join(predict_dir, save_file_name)
+    M = join(full_path_predict, 'M.data')
+    O = join(full_path_predict, 'O.data')
+
+    config = dict(dfam=dfam, link=link, vpow=vpow, lpow=lpow, fmt=DATA_FORMAT, X=X,
+                  B=B, Y=Y, M=M, O=O)
+    config_writer(full_path_predict + '.json', config)
+
+    return full_path_predict
+
+
+def glm_binomial_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+
+    dfam = '2'
+    link = '3'
+
+    X = join(datagen_dir, 'X_test.data')
+    B = join(train_dir, 'B.data')
+    Y = join(datagen_dir, 'Y_test.data')
+
+    full_path_predict = join(predict_dir, save_file_name)
+    M = join(full_path_predict, 'M.data')
+    O = join(full_path_predict, 'O.data')
+
+    config = dict(dfam=dfam, link=link, fmt=DATA_FORMAT, X=X,
+                  B=B, Y=Y, M=M, O=O)
+    config_writer(full_path_predict + '.json', config)
+
+    return full_path_predict
+
+
+def glm_gamma_predict(save_file_name, datagen_dir, train_dir, predict_dir):
+
+    dfam = '1'
+    link = '1'
+    vpow = '2'
+    lpow = '0'
+
+    X = join(datagen_dir, 'X_test.data')
+    B = join(train_dir, 'B.data')
+    Y = join(datagen_dir, 'Y_test.data')
+
+    full_path_predict = join(predict_dir, save_file_name)
+    M = join(full_path_predict, 'M.data')
+    O = join(full_path_predict, 'O.data')
+
+    config = dict(dfam=dfam, link=link, vpow=vpow, lpow=lpow, fmt=DATA_FORMAT, X=X,
+                  B=B, Y=Y, M=M, O=O)
+    config_writer(full_path_predict + '.json', config)
+
+    return full_path_predict
+
+
+def config_packets_predict(algo_payload, datagen_dir, train_dir, predict_dir):
+    """
+    This function has two responsibilities. Generate the configuration files 
for
+    prediction algorithms and return a dictionary that will be used for 
execution.
+
+    algo_payload : List of tuples
+    The first tuple index contains algorithm name and the second index contains
+    family type.
+
+    datagen_dir: String
+    Path of the data generation directory
+
+    train_dir: String
+    Path of the training directory
+
+    predict_dir: String
+    Path of the prediction directory
+
+    return: Dictionary  {string: list}
+    This dictionary contains algorithms to be executed as keys and the path of 
configuration
+    json files to be executed list of values.
+    """
+
+    algo_payload_distinct = set(map(lambda x: x[0], algo_payload))
+
+    config_bundle = {}
+
+    for k, v in algo_payload:
+        config_bundle[k] = []
+
+    for current_algo in algo_payload_distinct:
+        # Get all train folders related to the algorithm
+        train_path = join(train_dir, current_algo)
+        train_subdir = glob.glob(train_path + "*")
+        train_folders = list(filter(lambda x: os.path.isdir(x), train_subdir))
+
+        if len(train_folders) == 0:
+            print('training folders not present for {}'.format(current_algo))
+            sys.exit()
+
+        for current_train_folder in train_folders:
+            save_name = current_train_folder.split('/')[-1]
+            # Get all datagen folders
+            data_gen_folder_name = '.'.join(save_name.split('.')[1:-1])
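+            # e.g. save_name 'm-svm.binomial.dense.10k_100.0' yields the
+            # datagen folder prefix 'binomial.dense.10k_100'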
+            data_gen_path = join(datagen_dir, data_gen_folder_name)
+            data_gen_subdir = glob.glob(data_gen_path + "*")
+            data_gen_folder = list(filter(lambda x: os.path.isdir(x), data_gen_subdir))
+
+            if len(data_gen_folder) == 0:
+                print('data-gen folders not present for {}'.format(current_algo))
+                sys.exit()
+
+            # Ideally only one datagen directory should match; use the first
+            current_data_gen_dir = data_gen_folder[0]
+
+            algo_func = '_'.join([current_algo.lower().replace('-', '_')] + ['predict'])
+            conf_path = globals()[algo_func](save_name, current_data_gen_dir,
+                                             current_train_folder, predict_dir)
+
+            config_bundle[current_algo].append(conf_path)
+
+    return config_bundle

http://git-wip-us.apache.org/repos/asf/systemml/blob/e7cfcadc/scripts/perftest/python/run_perftest.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/run_perftest.py b/scripts/perftest/python/run_perftest.py
new file mode 100755
index 0000000..1421c2c
--- /dev/null
+++ b/scripts/perftest/python/run_perftest.py
@@ -0,0 +1,339 @@
+#!/usr/bin/env python3
+# -------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# -------------------------------------------------------------
+
+import sys
+import time
+import argparse
+from functools import reduce
+import os
+from os.path import join
+from utils import get_families, config_reader, create_dir,  get_existence, \
+    exec_dml_and_parse_time, exec_test_data, check_predict, get_folder_metrics
+import logging
+from datetime import datetime
+from datagen import config_packets_datagen
+from train import config_packets_train
+from predict import config_packets_predict
+
+# A packet is a dictionary with the algorithm as key and a list of
+# configuration json file paths as value
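+# e.g. {'Kmeans': ['<temp_dir>/train/Kmeans.clustering.dense.10k_100']}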
+
+
+ML_ALGO = {'binomial': ['MultiLogReg', 'l2-svm', 'm-svm'],
+           'clustering': ['Kmeans'],
+           'multinomial': ['naive-bayes', 'MultiLogReg', 'm-svm'],
+           'regression1': ['LinearRegDS', 'LinearRegCG'],
+           'regression2': ['GLM_poisson', 'GLM_gamma', 'GLM_binomial'],
+           'stats1': ['Univar-Stats', 'bivar-stats'],
+           'stats2': ['stratstats']}
+
+ML_GENDATA = {'binomial': 'genRandData4LogisticRegression',
+              'clustering': 'genRandData4Kmeans',
+              'multinomial': 'genRandData4Multinomial',
+              'regression1': 'genRandData4LogisticRegression',
+              'regression2': 'genRandData4LogisticRegression',
+              'stats1': 'genRandData4DescriptiveStats',
+              'stats2': 'genRandData4StratStats'}
+
+ML_TRAIN = {'GLM_poisson': 'GLM',
+            'GLM_gamma': 'GLM',
+            'GLM_binomial': 'GLM',
+            'LinearRegCG': 'LinearRegCG',
+            'LinearRegDS': 'LinearRegDS',
+            'stratstats': 'stratstats',
+            'Univar-Stats': 'Univar-Stats',
+            'bivar-stats': 'bivar-stats',
+            'Kmeans': 'Kmeans',
+            'm-svm': 'm-svm',
+            'l2-svm': 'l2-svm',
+            'MultiLogReg': 'MultiLogReg',
+            'naive-bayes': 'naive-bayes'}
+
+ML_PREDICT = {'Kmeans': 'Kmeans-predict',
+              'LinearRegCG': 'GLM-predict',
+              'LinearRegDS': 'GLM-predict',
+              'm-svm': 'm-svm-predict',
+              'l2-svm': 'l2-svm-predict',
+              'MultiLogReg': 'GLM-predict',
+              'naive-bayes': 'naive-bayes-predict',
+              'GLM_poisson': 'GLM-predict',
+              'GLM_gamma': 'GLM-predict',
+              'GLM_binomial': 'GLM-predict'}
+
+
+# Responsible for execution and metric logging
+def algorithm_workflow(algo, exec_type, config_path, file_name, action_mode):
+    """
+    This function is responsible for the overall workflow. It does the following:
+    check whether the input is a key-value argument or a list of positional args,
+    execute and time the DML script, and log the metrics.
+
+
+    algo : String
+    Input algorithm specified
+
+    exec_type : String
+    Contains the execution type singlenode / hybrid_spark
+
+    config_path : String
+    Path to read the json file from
+
+    file_name : String
+    DML file name to be used while processing the arguments given
+
+    action_mode : String
+    Type of action data-gen, train ...
+    """
+
+    config_data = config_reader(config_path + '.json')
+
+    if isinstance(config_data, dict):
+        dict_args = ' '.join([str(key) + '=' + str(val) for key, val in config_data.items()])
+        args = {'-nvargs': dict_args}
+
+    if isinstance(config_data, list):
+        list_args = ' '.join(config_data)
+        args = {'-args': list_args}
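+    # e.g. args == {'-nvargs': 'X=/path/X.data Y=/path/Y.data'} for dict configs,
+    # or {'-args': '10000 100 ...'} for positional list configs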
+
+    folder_name = config_path.split('/')[-1]
+    mat_type, mat_shape, intercept = get_folder_metrics(folder_name, action_mode)
+
+    exit_flag_success = get_existence(config_path, action_mode)
+
+    if exit_flag_success:
+        print('data already exists {}'.format(config_path))
+        time = 'data_exists'
+    else:
+        time = exec_dml_and_parse_time(exec_type, file_name, args)
+
+    # Write a _SUCCESS file only if time is found and in data-gen action_mode
+    if len(time.split('.')) == 2 and action_mode == 'data-gen':
+        full_path = join(config_path, '_SUCCESS')
+        open(full_path, 'w').close()
+
+    print('{},{},{},{},{},{}'.format(algo, action_mode, intercept, mat_type, mat_shape, time))
+    current_metrics = [algo, action_mode, intercept, mat_type, mat_shape, time]
+    logging.info(','.join(current_metrics))
+
+
+# Perf test entry point
+def perf_test_entry(family, algo, exec_type, mat_type, mat_shape, temp_dir, mode):
+    """
+    This function is the entry point for performance testing
+
+    family: List
+    A family may contain one or more algorithms, based on the data generation script used
+
+    algo: List
+    Input algorithms
+
+    exec_type: String
+    Contains the execution type singlenode / hybrid_spark
+
+    mat_type: List
+    Type of matrix to generate dense or sparse
+
+    mat_shape: List
+    Dimensions of the input matrix with rows and columns
+
+    temp_dir: String
+    Location to store all files created during perf test
+
+    mode: List
+    Type of workload to run. data-gen, train ...
+    """
+
+    # algos to run is a list of tuples with
+    # [(m-svm, binomial), (m-svm, multinomial)...]
+    # Basic block for execution of scripts
+    algos_to_run = []
+
+    # Sections below build algos_to_run in our performance test
+    # Handles algorithms like m-svm and MultiLogReg which have multiple
+    # data generation scripts (dual datagen)
+    # --family is taken into consideration only when there are multiple datagen scripts for an algo
+
+    if family is not None and algo is not None:
+        for current_algo in algo:
+            family_list = get_families(current_algo, ML_ALGO)
+            if len(family_list) == 1:
+                algos_to_run.append((current_algo, family_list[0]))
+            else:
+                intersection = set(family).intersection(family_list)
+                for valid_family in intersection:
+                    algos_to_run.append((current_algo, valid_family))
+
+    # When the user inputs just algorithms to run
+    elif algo is not None:
+        for current_algo in algo:
+            family_list = get_families(current_algo, ML_ALGO)
+            for f in family_list:
+                algos_to_run.append((current_algo, f))
+
+    # When the user specifies only families to run
+    elif family is not None:
+        for current_family in family:
+            algos = ML_ALGO[current_family]
+            for current_algo in algos:
+                algos_to_run.append((current_algo, current_family))
+
+    if 'data-gen' in mode:
+        data_gen_dir = join(temp_dir, 'data-gen')
+        create_dir(data_gen_dir)
+        conf_packet = config_packets_datagen(algos_to_run, mat_type, mat_shape, data_gen_dir)
+        for family_name, config_folders in conf_packet.items():
+            for config in config_folders:
+                file_name = ML_GENDATA[family_name]
+                algorithm_workflow(family_name, exec_type, config, file_name, 'data-gen')
+
+                # Statistics families do not require a test data split
+                if family_name not in ['stats1', 'stats2']:
+                    exec_test_data(exec_type, config)
+
+    if 'train' in mode:
+        data_gen_dir = join(temp_dir, 'data-gen')
+        train_dir = join(temp_dir, 'train')
+        create_dir(train_dir)
+        conf_packet = config_packets_train(algos_to_run, data_gen_dir, train_dir)
+        for algo_name, config_files in conf_packet.items():
+            for config in config_files:
+                file_name = ML_TRAIN[algo_name]
+                algorithm_workflow(algo_name, exec_type, config, file_name, 'train')
+
+    if 'predict' in mode:
+        data_gen_dir = join(temp_dir, 'data-gen')
+        train_dir = join(temp_dir, 'train')
+        predict_dir = join(temp_dir, 'predict')
+        create_dir(predict_dir)
+        algos_to_run_predict = list(filter(lambda algo: check_predict(algo[0], ML_PREDICT), algos_to_run))
+        conf_packet = config_packets_predict(algos_to_run_predict, data_gen_dir, train_dir, predict_dir)
+        for algo_name, config_files in conf_packet.items():
+            for config in config_files:
+                file_name = ML_PREDICT[algo_name]
+                algorithm_workflow(algo_name, exec_type, config, file_name, 'predict')
+
+if __name__ == '__main__':
+
+    # SystemML environment variable check and error handling
+    systemml_home = os.environ.get('SYSTEMML_HOME')
+    if systemml_home is None:
+        print('SYSTEMML_HOME not found')
+        sys.exit()
+
+    # Default Arguments
+    default_mat_type = ['dense', 'sparse']
+    default_workload = ['data-gen', 'train', 'predict']
+    default_mat_shape = ['10k_100']
+    default_execution_mode = ['hybrid_spark', 'singlenode']
+
+    # Default temp directory, contains everything generated in perftest
+    default_temp_dir = join(systemml_home, 'scripts', 'perftest', 'temp')
+    create_dir(default_temp_dir)
+
+    # Initialize time
+    start_time = time.time()
+
+    # Default Date Time
+    time_now = str(datetime.now())
+
+    # Remove duplicate algorithms; used as default inputs
+    all_algos = set(reduce(lambda x, y: x + y, ML_ALGO.values()))
+
+    # Argparse Module
+    cparser = argparse.ArgumentParser(description='SystemML Performance Test Script')
+    cparser.add_argument('--family', help='specify class of algorithms (e.g. regression, binomial)',
+                         metavar='', choices=ML_ALGO.keys(), nargs='+')
+    cparser.add_argument('--algo', help='specify the type of algorithm to run (Overrides --family)', metavar='',
+                         choices=all_algos, nargs='+')
+
+    cparser.add_argument('--exec-type', default='singlenode', help='SystemML backend '
+                         '(e.g. singlenode, hybrid_spark)', metavar='',
+                         choices=default_execution_mode)
+    cparser.add_argument('--mat-type', default=default_mat_type, help='type of matrix to generate '
+                         '(e.g. dense or sparse)', metavar='', choices=default_mat_type,
+                         nargs='+')
+    cparser.add_argument('--mat-shape', default=default_mat_shape, help='shape of matrix '
+                         'to generate (e.g. 10k_1k)', metavar='', nargs='+')
+    cparser.add_argument('--temp-dir', default=default_temp_dir, help='specify temporary directory',
+                         metavar='')
+    cparser.add_argument('--filename', default='perf_test', help='specify output file for the perf'
+                         ' metrics', metavar='')
+    cparser.add_argument('--mode', default=default_workload,
+                         help='specify type of workload to run (e.g. data-gen, train, predict)',
+                         metavar='', choices=default_workload, nargs='+')
+
+    # Args is a namespace
+    args = cparser.parse_args()
+    arg_dict = vars(args)
+
+    # Debug arguments
+    # print(arg_dict)
+
+    # Check for validity of input arguments
+    if args.family is not None:
+        for fam in args.family:
+            if fam not in ML_ALGO.keys():
+                print('{} family not present in the performance test suite'.format(fam))
+                sys.exit()
+
+    if args.algo is not None:
+        for algo in args.algo:
+            if algo not in all_algos:
+                print('{} algorithm not present in the performance test suite'.format(algo))
+                sys.exit()
+
+        # This section checks the validity of dual datagen algorithms like m-svm
+        algo_families = {}
+        for current_algo in args.algo:
+            algo_families[current_algo] = get_families(current_algo, ML_ALGO)
+
+            if len(algo_families[current_algo]) > 1:
+                if args.family is None:
+                    print('family should be present for {}'.format(current_algo))
+                    sys.exit()
+
+                valid_families = set(algo_families[current_algo])
+                input_families = set(args.family)
+                common_families = input_families.intersection(valid_families)
+                if len(common_families) == 0:
+                    print('Please specify a valid family for {} and the '
+                          'valid families are {}'.format(current_algo, ' '.join(valid_families)))
+                    sys.exit()
+
+    # Set level to 0 -> debug mode
+    # Set level to 20 -> Plain metrics
+    log_filename = args.filename + '_' + args.exec_type + '.out'
+    logging.basicConfig(filename=join(default_temp_dir, log_filename), level=20)
+    logging.info('New performance test started at {}'.format(time_now))
+    logging.info('algorithm,run_type,intercept,matrix_type,data_shape,time_sec')
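+    # e.g. a logged metric line: Kmeans,train,none,dense,10k_100,12.345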
+
+    # Remove the filename item from the dictionary as it is already used to create the log above
+    del arg_dict['filename']
+
+    perf_test_entry(**arg_dict)
+
+    total_time = (time.time() - start_time)
+    logging.info('Performance tests complete {0:.3f} secs \n'.format(total_time))

http://git-wip-us.apache.org/repos/asf/systemml/blob/e7cfcadc/scripts/perftest/python/train.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/train.py b/scripts/perftest/python/train.py
new file mode 100755
index 0000000..1ab2880
--- /dev/null
+++ b/scripts/perftest/python/train.py
@@ -0,0 +1,411 @@
+#!/usr/bin/env python3
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+import sys
+import glob
+import os
+from os.path import join
+from utils import config_writer
+from functools import reduce
+
+# Contains configuration settings for training
+DATA_FORMAT = 'csv'
+
+
+def binomial_m_svm_train(save_folder_name, datagen_dir, train_dir):
+
+    data_folders = []
+    for i in [0, 1]:
+        icpt = str(i)
+        reg = '0.01'
+        tol = '0.0001'
+        maxiter = 20
+        X = join(datagen_dir, 'X.data')
+        Y = join(datagen_dir, 'Y.data')
+
+        full_path_train = join(train_dir, save_folder_name + '.' + str(i))
+        data_folders.append(full_path_train)
+
+        model = join(full_path_train, 'model.data')
+        Log = join(full_path_train, 'Log.data')
+
+        config = dict(X=X, Y=Y, icpt=icpt, classes=2, reg=reg, tol=tol, maxiter=maxiter, model=model,
+                      Log=Log, fmt=DATA_FORMAT)
+        config_writer(full_path_train + '.json', config)
+
+    return data_folders
+
+
+def binomial_l2_svm_train(save_folder_name, datagen_dir, train_dir):
+
+    data_folders = []
+    for i in [0, 1]:
+        icpt = str(i)
+        reg = '0.01'
+        tol = '0.0001'
+        maxiter = '100'
+        X = join(datagen_dir, 'X.data')
+        Y = join(datagen_dir, 'Y.data')
+
+        full_path_train = join(train_dir, save_folder_name + '.' + str(i))
+        data_folders.append(full_path_train)
+
+        model = join(full_path_train, 'model.data')
+        Log = join(full_path_train, 'Log.data')
+
+        config = dict(X=X, Y=Y, icpt=icpt, reg=reg, tol=tol, maxiter=maxiter, model=model,
+                      Log=Log, fmt=DATA_FORMAT)
+        config_writer(full_path_train + '.json', config)
+
+    return data_folders
+
+
+def binomial_multilogreg_train(save_folder_name, datagen_dir, train_dir):
+    data_folders = []
+
+    for i in [0, 1, 2]:
+        icpt = str(i)
+        reg = '0.01'
+        tol = '0.0001'
+        moi = '100'
+        mii = '5'
+        X = join(datagen_dir, 'X.data')
+        Y = join(datagen_dir, 'Y.data')
+
+        full_path_train = join(train_dir, save_folder_name + '.' + str(i))
+        data_folders.append(full_path_train)
+
+        B = join(full_path_train, 'B.data')
+
+        config = dict(X=X, Y=Y, icpt=icpt, reg=reg, tol=tol, moi=moi, mii=mii,
+                      B=B)
+        config_writer(full_path_train + '.json', config)
+    return data_folders
+
+
+def multinomial_m_svm_train(save_folder_name, datagen_dir, train_dir):
+
+    data_folders = []
+    for i in [0, 1]:
+        icpt = str(i)
+        reg = '0.01'
+        tol = '0.0001'
+        maxiter = '20'
+        X = join(datagen_dir, 'X.data')
+        Y = join(datagen_dir, 'Y.data')
+
+        full_path_train = join(train_dir, save_folder_name + '.' + str(i))
+        model = join(full_path_train, 'model.data')
+        Log = join(full_path_train, 'Log.data')
+
+        config = dict(X=X, Y=Y, icpt=icpt, classes=150, reg=reg, tol=tol, maxiter=maxiter, model=model,
+                      Log=Log, fmt=DATA_FORMAT)
+        config_writer(full_path_train + '.json', config)
+        data_folders.append(full_path_train)
+
+    return data_folders
+
+
+def clustering_kmeans_train(save_folder_name, datagen_dir, train_dir):
+
+    X = join(datagen_dir, 'X.data')
+
+    full_path_train = join(train_dir, save_folder_name)
+    C = join(full_path_train, 'C.data')
+    k = '50'
+    maxi = '50'
+    tol = '0.0001'
+    config = dict(X=X, k=k, maxi=maxi, tol=tol, C=C)
+
+    config_writer(full_path_train + '.json', config)
+
+    return [full_path_train]
+
+
+def stats1_univar_stats_train(save_folder_name, datagen_dir, train_dir):
+
+    X = join(datagen_dir, 'X.data')
+    TYPES = join(datagen_dir, 'types')
+
+    full_path_train = join(train_dir, save_folder_name)
+    STATS = join(full_path_train, 'STATS.data')
+
+    config = dict(X=X, TYPES=TYPES, STATS=STATS)
+    config_writer(full_path_train + '.json', config)
+
+    return [full_path_train]
+
+
+def stats1_bivar_stats_train(save_folder_name, datagen_dir, train_dir):
+
+    X = join(datagen_dir, 'X.data')
+    index1 = join(datagen_dir, 'set1.indices')
+    index2 = join(datagen_dir, 'set2.indices')
+    types1 = join(datagen_dir, 'set1.types')
+    types2 = join(datagen_dir, 'set2.types')
+
+    full_path_train = join(train_dir, save_folder_name)
+    OUTDIR = full_path_train
+
+    config = dict(X=X, index1=index1, index2=index2, types1=types1, types2=types2, OUTDIR=OUTDIR)
+    return [full_path_train]
+
+
+def stats2_stratstats_train(save_folder_name, datagen_dir, train_dir):
+
+    X = join(datagen_dir, 'X.data')
+    Xcid = join(datagen_dir, 'Xcid.data')
+    Ycid = join(datagen_dir, 'Ycid.data')
+
+    full_path_train = join(train_dir, save_folder_name)
+    O = join(full_path_train, 'O.data')
+
+    config = dict(X=X, Xcid=Xcid, Ycid=Ycid, O=O, fmt=DATA_FORMAT)
+
+    config_writer(full_path_train + '.json', config)
+
+    return [full_path_train]
+
+
+def multinomial_naive_bayes_train(save_folder_name, datagen_dir, train_dir):
+
+    X = join(datagen_dir, 'X.data')
+    Y = join(datagen_dir, 'Y.data')
+    classes = '150'
+
+    full_path_train = join(train_dir, save_folder_name)
+    prior = join(full_path_train, 'prior')
+    conditionals = join(full_path_train, 'conditionals')
+    accuracy = join(full_path_train, 'accuracy')
+    fmt = DATA_FORMAT
+    probabilities = join(full_path_train, 'probabilities')
+
+    config = dict(X=X, Y=Y, classes=classes, prior=prior, conditionals=conditionals,
+                  accuracy=accuracy, fmt=fmt, probabilities=probabilities)
+
+    config_writer(full_path_train + '.json', config)
+
+    return [full_path_train]
+
+
+def multinomial_multilogreg_train(save_folder_name, datagen_dir, train_dir):
+
+    data_folders = []
+    for i in [0, 1, 2]:
+        icpt = str(i)
+        reg = '0.01'
+        tol = '0.0001'
+        moi = '100'
+        mii = '0'
+        X = join(datagen_dir, 'X.data')
+        Y = join(datagen_dir, 'Y.data')
+
+        full_path_train = join(train_dir, save_folder_name + '.' + str(i))
+        data_folders.append(full_path_train)
+        B = join(full_path_train, 'B.data')
+
+        config = dict(X=X, Y=Y, B=B, icpt=icpt, reg=reg, tol=tol, moi=moi, mii=mii, fmt=DATA_FORMAT)
+
+    return data_folders
+
+
+def regression1_linearregds_train(save_folder_name, datagen_dir, train_dir):
+
+    data_folders = []
+    for i in [0, 1, 2]:
+        icpt = str(i)
+        reg = '0.01'
+        X = join(datagen_dir, 'X.data')
+        Y = join(datagen_dir, 'Y.data')
+
+        full_path_train = join(train_dir, save_folder_name + '.' + str(i))
+        data_folders.append(full_path_train)
+        B = join(full_path_train, 'B.data')
+
+        config = dict(X=X, Y=Y, B=B, icpt=icpt, fmt=DATA_FORMAT, reg=reg)
+        config_writer(full_path_train + '.json', config)
+
+    return data_folders
+
+
+def regression1_linearregcg_train(save_folder_name, datagen_dir, train_dir):
+
+    data_folders = []
+    for i in [0, 1, 2]:
+        icpt = str(i)
+        reg = '0.01'
+        tol = '0.0001'
+        maxi = '20'
+        X = join(datagen_dir, 'X.data')
+        Y = join(datagen_dir, 'Y.data')
+
+        full_path_train = join(train_dir, save_folder_name + '.' + str(i))
+        data_folders.append(full_path_train)
+        B = join(full_path_train, 'B.data')
+
+        config = dict(X=X, Y=Y, B=B, icpt=icpt, fmt=DATA_FORMAT, maxi=maxi, tol=tol, reg=reg)
+
+    return data_folders
+
+
+def regression2_glm_gamma_train(save_folder_name, datagen_dir, train_dir):
+
+    data_folders = []
+
+    for i in [0, 1, 2]:
+        X = join(datagen_dir, 'X.data')
+        Y = join(datagen_dir, 'Y.data')
+
+        full_path_train = join(train_dir, save_folder_name + '.' + str(i))
+        data_folders.append(full_path_train)
+
+        B = join(full_path_train, 'B.data')
+        icpt = str(i)
+        fmt = DATA_FORMAT
+        moi = '200'
+        mii = '5'
+        dfam = '1'
+        vpow = '2.0'
+        link = '1'
+        lpow = '0.0'
+        tol = '0.0001'
+        reg = '0.01'
+        config = dict(X=X, Y=Y, B=B, icpt=icpt, fmt=fmt, moi=moi, mii=mii, dfam=dfam,
+                      vpow=vpow, link=link, lpow=lpow, tol=tol, reg=reg)
+
+        config_writer(full_path_train + '.json', config)
+
+    return data_folders
+
+
+def regression2_glm_binomial_train(save_folder_name, datagen_dir, train_dir):
+
+    data_folders = []
+
+    for i in [0, 1, 2]:
+        X = join(datagen_dir, 'X.data')
+        Y = join(datagen_dir, 'Y.data')
+
+        full_path_train = join(train_dir, save_folder_name + '.' + str(i))
+        data_folders.append(full_path_train)
+
+        B = join(full_path_train, 'B.data')
+        icpt = str(i)
+        fmt = DATA_FORMAT
+        moi = '200'
+        mii = '5'
+        dfam = '2'
+        link = '3'
+        yneg = '2'
+        tol = '0.0001'
+        reg = '0.01'
+        config = dict(X=X, Y=Y, B=B, icpt=icpt, fmt=fmt, moi=moi, mii=mii,
+                      dfam=dfam, link=link, yneg=yneg, tol=tol, reg=reg)
+
+        config_writer(full_path_train + '.json', config)
+
+    return data_folders
+
+
+def regression2_glm_poisson_train(save_folder_name, datagen_dir, train_dir):
+
+    data_folders = []
+
+    for i in [0, 1, 2]:
+        X = join(datagen_dir, 'X.data')
+        Y = join(datagen_dir, 'Y.data')
+
+        full_path_train = join(train_dir, save_folder_name + '.' + str(i))
+        data_folders.append(full_path_train)
+
+        B = join(full_path_train, 'B.data')
+        icpt = str(i)
+        fmt = DATA_FORMAT
+        moi = '200'
+        mii = '5'
+        dfam = '1'
+        vpow = '1'
+        link = '1'
+        lpow = '0'
+        tol = '0.0001'
+        reg = '0.01'
+        config = dict(X=X, Y=Y, B=B, icpt=icpt, fmt=fmt, moi=moi, mii=mii,
+                      dfam=dfam, vpow=vpow, link=link, lpow=lpow, tol=tol, reg=reg)
+        config_writer(full_path_train + '.json', config)
+
+    return data_folders
+
+
+def config_packets_train(algo_payload, datagen_dir, train_dir):
+    """
+    This function has two responsibilities. Generate the configuration files 
for
+    input training algorithms and return a dictionary that will be used for 
execution.
+
+    algo_payload : List of tuples
+    The first tuple index contains algorithm name and the second index contains
+    family type.
+
+    datagen_dir: String
+    Path of the data generation directory
+
+    train_dir: String
+    Path of the training directory
+
+    return: {string: list}
+    This dictionary contains algorithms to be executed as keys and the path of 
configuration
+    json files to be executed list of values.
+
+    """
+
+    config_bundle = {}
+
+    for k, v in algo_payload:
+        config_bundle[k] = []
+
+    for current_algo, current_family in algo_payload:
+        data_gen_path = join(datagen_dir, current_family)
+        data_gen_subdir = glob.glob(data_gen_path + "*")
+
+        # Filter for specific data gen
+        data_gen_folders = list(filter(lambda x: os.path.isdir(x), data_gen_subdir))
+        if len(data_gen_folders) == 0:
+            print('datagen folders not present for {}'.format(current_family))
+            sys.exit()
+
+        for current_folder in data_gen_folders:
+            file_path_last = current_folder.split('/')[-1]
+            save_name = '.'.join([current_algo] + [file_path_last])
+            algo_func = '_'.join([current_family] + [current_algo.lower().replace('-', '_')]
+                                 + ['train'])
+                                 + ['train'])
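+            # e.g. ('m-svm', 'binomial') dispatches to binomial_m_svm_train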
+            conf_path = globals()[algo_func](save_name, current_folder, train_dir)
+            config_bundle[current_algo].append(conf_path)
+
+    config_packets = {}
+
+    # Flatten
+    for current_algo, current_family in config_bundle.items():
+        config_packets[current_algo] = reduce(lambda x, y: x + y, current_family)
+
+    return config_packets

http://git-wip-us.apache.org/repos/asf/systemml/blob/e7cfcadc/scripts/perftest/python/utils.py
----------------------------------------------------------------------
diff --git a/scripts/perftest/python/utils.py b/scripts/perftest/python/utils.py
new file mode 100755
index 0000000..7ff3b54
--- /dev/null
+++ b/scripts/perftest/python/utils.py
@@ -0,0 +1,296 @@
+#!/usr/bin/env python3
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+from os.path import join
+import os
+import json
+import subprocess
+import shlex
+import re
+import logging
+
+# This file contains all the utility functions required for performance test module
+
+
+def get_families(current_algo, ML_ALGO):
+    """
+    Given the current algorithm, return its families.
+
+    current_algo  : String
+    Input algorithm specified
+
+    ML_ALGO : Dictionary
+    key, value dictionary with family as key and algorithms as list of values
+
+    return: List
+    List of families returned
+    """
+
+    family_list = []
+    for family, algos in ML_ALGO.items():
+        if current_algo in algos:
+            family_list.append(family)
+    return family_list
+
+
+def split_rowcol(matrix_dim):
+    """
+    Split the input matrix dimensions into rows and columns
+
+    matrix_dim: String
+    Input concatenated string with row and column
+
+    return: Tuple
+    Row and column split based on suffix
+    """
+
+    k = str(0) * 3
+    M = str(0) * 6
+    replace_M = matrix_dim.replace('M', str(M))
+    replace_k = replace_M.replace('k', str(k))
+    row, col = replace_k.split('_')
+    return row, col
+
+
+def config_writer(write_path, config_obj):
+    """
+    Writes the configuration object as a json file to the given path
+
+    write_path: String
+    Absolute path of file name to be written
+
+    config_obj: List or Dictionary
+    Can be a dictionary or a list based on the object passed
+    """
+
+    with open(write_path, 'w') as input_file:
+        json.dump(config_obj, input_file, indent=4)
+
+
+def config_reader(read_path):
+    """
+    Read json file given path
+
+    return: List or Dictionary
+    Reading the json file can give us a list if we have positional args or
+    key value for a dictionary
+    """
+
+    with open(read_path, 'r') as input_file:
+        conf_file = json.load(input_file)
+
+    return conf_file
+
+
+def create_dir(directory):
+    """
+    Create the directory at the given path if it does not already exist
+
+    directory: String
+    Input folder path
+    """
+
+    if not os.path.exists(directory):
+        os.makedirs(directory)
+
+
+def get_existence(path, action_mode):
+    """
+    Check whether the _SUCCESS file is present in the input path
+
+    path: String
+    Input folder path
+
+    action_mode : String
+    Type of action data-gen, train ...
+
+    return: Boolean; whether the file _SUCCESS exists
+    """
+
+    if action_mode == 'data-gen':
+        full_path = join(path, '_SUCCESS')
+        exist = os.path.isfile(full_path)
+    else:
+        # The file does not exist for other modes; return False to continue
+        # For e.g some predict algorithms do not generate an output folder
+        # hence checking for SUCCESS would fail
+        exist = False
+
+    return exist
+
+
+def exec_dml_and_parse_time(exec_type, file_name, args, Time=True):
+    """
+    This function is responsible for executing the input arguments via a python subprocess.
+    We also extract the time from the output of this subprocess
+
+    exec_type: String
+    Contains the execution type singlenode / hybrid_spark
+
+    file_name: String
+    DML file name to be used while processing the arguments given
+
+    args: Dictionary
+    Key values pairs depending on the arg type
+
+    Time: Boolean (default=True)
+    Boolean argument used to extract time from raw output logs.
+    """
+
+    algorithm = file_name + '.dml'
+    if exec_type == 'singlenode':
+        exec_script = join(os.environ.get('SYSTEMML_HOME'), 'bin', 'systemml-standalone.py')
+
+        args = ''.join(['{} {}'.format(k, v) for k, v in args.items()])
+        cmd = [exec_script, algorithm, args]
+        cmd_string = ' '.join(cmd)
+
+    if exec_type == 'hybrid_spark':
+        exec_script = join(os.environ.get('SYSTEMML_HOME'), 'bin', 'systemml-spark-submit.py')
+        args = ''.join(['{} {}'.format(k, v) for k, v in args.items()])
+        cmd = [exec_script, '-f', algorithm, args]
+        cmd_string = ' '.join(cmd)
+
+    # Debug
+    # print(cmd_string)
+
+    # Subprocess to execute input arguments
+    # proc1_log contains the shell output which is used for time parsing
+    proc1 = subprocess.Popen(shlex.split(cmd_string), stdout=subprocess.PIPE,
+                             stderr=subprocess.PIPE)
+
+    if Time:
+        proc1_log = []
+        while proc1.poll() is None:
+            raw_std_out = proc1.stdout.readline()
+            decode_raw = raw_std_out.decode('ascii').strip()
+            proc1_log.append(decode_raw)
+            logging.log(10, decode_raw)
+
+        out1, err1 = proc1.communicate()
+
+        if "Error" in str(err1):
+            print('Error Found in {}'.format(file_name))
+            total_time = 'failure'
+        else:
+            total_time = parse_time(proc1_log)
+
+    else:
+        total_time = 'not_specified'
+
+    return total_time
+
+
+def parse_time(raw_logs):
+    """
+    Parses raw input list and extracts time
+
+    raw_logs : List
+    Each line obtained from the standard output is in the list
+
+    return: String
+    Extracted time in seconds or time_not_found
+    """
+    # Debug
+    # print(raw_logs)
+
+    for line in raw_logs:
+        if line.startswith('Total execution time'):
+            extract_time = re.findall(r'\d+', line)
+            total_time = '.'.join(extract_time)
+
+            return total_time
+
+    return 'time_not_found'
+
+
+def exec_test_data(exec_type, path):
+    """
+    Creates the test data split from the given input path
+
+    exec_type : String
+    Contains the execution type singlenode / hybrid_spark
+
+    path : String
+    Location of the input folder to pick X and Y
+    """
+    systemml_home = os.environ.get('SYSTEMML_HOME')
+    test_split_script = join(systemml_home, 'scripts', 'perftest', 'extractTestData')
+    X = join(path, 'X.data')
+    Y = join(path, 'Y.data')
+    X_test = join(path, 'X_test.data')
+    Y_test = join(path, 'Y_test.data')
+    args = {'-args': ' '.join([X, Y, X_test, Y_test, 'csv'])}
+
+    # Call the exec script without time
+    exec_dml_and_parse_time(exec_type, test_split_script, args, False)
+
+
+def check_predict(current_algo, ML_PREDICT):
+    """
+    Check whether the current algorithm requires a predict step
+
+    current_algo: String
+    Algorithm being processed
+
+    ML_PREDICT: Dictionary
+    Key value pairs of algorithm and predict file to process
+    """
+    return current_algo in ML_PREDICT
+
+
+def get_folder_metrics(folder_name, action_mode):
+    """
+    Gets metrics from folder name
+
+    folder_name: String
+    Folder from which we want to grab details
+
+    return: List(3)
+    A list with mat_type, mat_shape, intercept
+    """
+
+    if action_mode == 'data-gen':
+        split_name = folder_name.split('.')
+        mat_type = split_name[1]
+        mat_shape = split_name[2]
+        intercept = 'none'
+
+    try:
+        if action_mode == 'train':
+            split_name = folder_name.split('.')
+            mat_type = split_name[2]
+            mat_shape = split_name[3]
+            intercept = split_name[4]
+
+        if action_mode == 'predict':
+            split_name = folder_name.split('.')
+            mat_type = split_name[2]
+            mat_shape = split_name[3]
+            intercept = split_name[4]
+    except IndexError:
+        intercept = 'none'
+
+    return mat_type, mat_shape, intercept
\ No newline at end of file
