(systemds) branch main updated: [SYSTEMDS-3847] Fix perftest refactoring (datagen scripts)

mboehm7 Sat, 05 Apr 2025 10:04:54 -0700

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git



The following commit(s) were added to refs/heads/main by this push:
     new 56c782384b [SYSTEMDS-3847] Fix perftest refactoring (datagen scripts)
56c782384b is described below

commit 56c782384b73b560a1cdee7ff8c04dacefa2ec76
Author: Matthias Boehm <mboe...@gmail.com>
AuthorDate: Sat Apr 5 19:04:19 2025 +0200

    [SYSTEMDS-3847] Fix perftest refactoring (datagen scripts)
---
 scripts/perftest/datagen/genALSData.sh             |  68 ++++++
 scripts/perftest/datagen/genBinomialData.sh        |  78 +++++++
 scripts/perftest/datagen/genClusteringData.sh      |  68 ++++++
 .../datagen/genDescriptiveStatisticsData.sh        |  60 ++++++
 .../perftest/datagen/genDimensionReductionData.sh  |  61 ++++++
 scripts/perftest/datagen/genIOData.sh              |  72 +++++++
 scripts/perftest/datagen/genL2SVMData.sh           |  38 ++++
 scripts/perftest/datagen/genMultinomialData.sh     |  78 +++++++
 scripts/perftest/datagen/genRandData4ALS.dml       |  47 +++++
 .../datagen/genRandData4ChisquaredTest.dml         |  87 ++++++++
 .../perftest/datagen/genRandData4DecisionTree.sh   |  58 +++++
 .../perftest/datagen/genRandData4DecisionTree1.dml |  40 ++++
 .../perftest/datagen/genRandData4DecisionTree2.dml |  41 ++++
 .../datagen/genRandData4DescriptiveStats.dml       | 149 +++++++++++++
 scripts/perftest/datagen/genRandData4FTest.dml     |  95 +++++++++
 scripts/perftest/datagen/genRandData4Kmeans.dml    | 120 +++++++++++
 .../datagen/genRandData4LinearReg_LTstats.dml      | 233 +++++++++++++++++++++
 .../datagen/genRandData4LinearRegression.dml       |  61 ++++++
 .../datagen/genRandData4LogReg_LTstats.dml         | 233 +++++++++++++++++++++
 .../datagen/genRandData4LogisticRegression.dml     |  72 +++++++
 .../perftest/datagen/genRandData4MultiClassSVM.dml |  68 ++++++
 .../perftest/datagen/genRandData4Multinomial.dml   |  66 ++++++
 scripts/perftest/datagen/genRandData4NMF.dml       | 129 ++++++++++++
 .../perftest/datagen/genRandData4NMFBlockwise.dml  | 138 ++++++++++++
 scripts/perftest/datagen/genRandData4PCA.dml       |  61 ++++++
 .../perftest/datagen/genRandData4StratStats.dml    | 155 ++++++++++++++
 .../perftest/datagen/genRandData4SurvAnalysis.dml  | 133 ++++++++++++
 scripts/perftest/datagen/genRandData4Transform.dml |  96 +++++++++
 .../perftest/datagen/genRandData4Univariate.dml    |  61 ++++++
 scripts/perftest/datagen/genStratStatisticsData.sh |  61 ++++++
 scripts/perftest/sparkDML2.sh                      |  24 ++-
 31 files changed, 2750 insertions(+), 1 deletion(-)

diff --git a/scripts/perftest/datagen/genALSData.sh 
b/scripts/perftest/datagen/genALSData.sh
new file mode 100644
index 0000000000..3d1a22a675
--- /dev/null
+++ b/scripts/perftest/datagen/genALSData.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+if [ "$(basename $PWD)" != "perftest" ];
+then
+  echo "Please execute scripts from directory 'perftest'"
+  exit 1;
+fi
+
+CMD=$1
+DATADIR=$2/als
+MAXMEM=$3
+
+FORMAT="text" # can be csv, mm, text, binary
+DENSE_SP=0.9
+SPARSE_SP=0.01
+
+echo "-- Generating ALS data." >> results/times.txt;
+
+#generate XS scenarios (80MB)
+if [ $MAXMEM -ge 80 ]; then
+  ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10k_1k_dense 
rows=10000 cols=1000 rank=10 nnz=`echo "scale=0; 10000 * 1000 * $DENSE_SP" | 
bc` sigma=0.01 fmt=$FORMAT &
+  ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs 
X=${DATADIR}/X10k_1k_sparse rows=10000 cols=1000 rank=10 nnz=`echo "scale=0; 
10000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT &
+fi
+
+#generate S scenarios (800MB)
+if [ $MAXMEM -ge 800 ]; then
+  ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs 
X=${DATADIR}/X100k_1k_dense rows=100000 cols=1000 rank=10 nnz=`echo "scale=0; 
100000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT &
+  ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs 
X=${DATADIR}/X100k_1k_sparse rows=100000 cols=1000 rank=10 nnz=`echo "scale=0; 
100000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT &
+fi
+
+#generate M scenarios (8GB)
+if [ $MAXMEM -ge 8000 ]; then
+  ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X1M_1k_dense 
rows=1000000 cols=1000 rank=10 nnz=`echo "scale=0; 1000000 * 1000 * $DENSE_SP" 
| bc` sigma=0.01 fmt=$FORMAT &
+  ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X1M_1k_sparse 
rows=1000000 cols=1000 rank=10 nnz=`echo "scale=0; 1000000 * 1000 * $SPARSE_SP" 
| bc` sigma=0.01 fmt=$FORMAT &
+fi
+
+#generate L scenarios (80GB)
+if [ $MAXMEM -ge 80000 ]; then
+  ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10M_1k_dense 
rows=10000000 cols=1000 rank=10 nnz=`echo "scale=0; 10000000 * 1000 * 
$DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT
+  ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs 
X=${DATADIR}/X10M_1k_sparse rows=10000000 cols=1000 rank=10 nnz=`echo "scale=0; 
10000000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT
+fi
+
+#generate XL scenarios (800GB)
+if [ $MAXMEM -ge 800000 ]; then
+  ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs 
X=${DATADIR}/X100M_1k_dense rows=100000000 cols=1000 rank=10 nnz=`echo 
"scale=0; 100000000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT
+  ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs 
X=${DATADIR}/X100M_1k_sparse rows=100000000 cols=1000 rank=10 nnz=`echo 
"scale=0; 100000000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT
+fi
+
+wait
\ No newline at end of file
diff --git a/scripts/perftest/datagen/genBinomialData.sh 
b/scripts/perftest/datagen/genBinomialData.sh
new file mode 100644
index 0000000000..c911175ace
--- /dev/null
+++ b/scripts/perftest/datagen/genBinomialData.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+if [ "$(basename $PWD)" != "perftest" ];
+then
+  echo "Please execute scripts from directory 'perftest'"
+  exit 1;
+fi
+
+CMD=$1
+BASE=$2/binomial
+MAXMEM=$3
+
+FORMAT="binary" # can be csv, mm, text, binary
+DENSE_SP=0.9
+SPARSE_SP=0.01
+
+echo -e "\n\n-- Generating binomial data..." >> results/times.txt;
+
+#generate XS scenarios (80MB)
+if [ $MAXMEM -ge 80 ]; then
+  ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 10000 1000 5 5 
${BASE}/w10k_1k_dense ${BASE}/X10k_1k_dense ${BASE}/y10k_1k_dense 1 0 $DENSE_SP 
$FORMAT 1       & pidDense80=$!
+  ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 10000 1000 5 5 
${BASE}/w10k_1k_sparse ${BASE}/X10k_1k_sparse ${BASE}/y10k_1k_sparse 1 0 
$SPARSE_SP $FORMAT 1   & pidSparse80=$!
+  wait $pidDense80;  ${CMD} -f scripts/extractTestData.dml --args 
${BASE}/X10k_1k_dense ${BASE}/y10k_1k_dense ${BASE}/X10k_1k_dense_test 
${BASE}/y10k_1k_dense_test $FORMAT     &
+  wait $pidSparse80; ${CMD} -f scripts/extractTestData.dml --args 
${BASE}/X10k_1k_sparse ${BASE}/y10k_1k_sparse ${BASE}/X10k_1k_sparse_test 
${BASE}/y10k_1k_sparse_test $FORMAT &
+fi
+
+##generate S scenarios (800MB)
+if [ $MAXMEM -ge 800 ]; then
+  ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 100000 1000 5 5 
${BASE}/w100k_1k_dense ${BASE}/X100k_1k_dense ${BASE}/y100k_1k_dense 1 0 
$DENSE_SP $FORMAT 1 & pidDense800=$!
+  ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 100000 1000 5 5 
${BASE}/w100k_1k_sparse ${BASE}/X100k_1k_sparse ${BASE}/y100k_1k_sparse 1 0 
$SPARSE_SP $FORMAT 1 & pidSparse800=$!
+  wait $pidDense800;  ${CMD} -f scripts/extractTestData.dml --args 
${BASE}/X100k_1k_dense ${BASE}/y100k_1k_dense ${BASE}/X100k_1k_dense_test 
${BASE}/y100k_1k_dense_test $FORMAT &
+  wait $pidSparse800; ${CMD} -f scripts/extractTestData.dml --args 
${BASE}/X100k_1k_sparse ${BASE}/y100k_1k_sparse ${BASE}/X100k_1k_sparse_test 
${BASE}/y100k_1k_sparse_test $FORMAT &
+fi
+
+#generate M scenarios (8GB)
+if [ $MAXMEM -ge 8000 ]; then
+  ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 1000000 1000 5 5 
${BASE}/w1M_1k_dense ${BASE}/X1M_1k_dense ${BASE}/y1M_1k_dense 1 0 $DENSE_SP 
$FORMAT 1  & pidDense8000=$!
+  ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 1000000 1000 5 5 
${BASE}/w1M_1k_sparse ${BASE}/X1M_1k_sparse ${BASE}/y1M_1k_sparse 1 0 
$SPARSE_SP $FORMAT 1  & pidSparse8000=$!
+  wait $pidDense8000;  ${CMD} -f scripts/extractTestData.dml --args 
${BASE}/X1M_1k_dense ${BASE}/y1M_1k_dense ${BASE}/X1M_1k_dense_test 
${BASE}/y1M_1k_dense_test $FORMAT &
+  wait $pidSparse8000; ${CMD} -f scripts/extractTestData.dml --args 
${BASE}/X1M_1k_sparse ${BASE}/y1M_1k_sparse ${BASE}/X1M_1k_sparse_test 
${BASE}/y1M_1k_sparse_test $FORMAT &
+fi
+
+#generate L scenarios (80GB)
+if [ $MAXMEM -ge 80000 ]; then
+  ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 10000000 1000 5 
5 ${BASE}/w10M_1k_dense ${BASE}/X10M_1k_dense ${BASE}/y10M_1k_dense 1 0 
$DENSE_SP $FORMAT 1
+  ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 10000000 1000 5 
5 ${BASE}/w10M_1k_sparse ${BASE}/X10M_1k_sparse ${BASE}/y10M_1k_sparse 1 0 
$SPARSE_SP $FORMAT 1
+  ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X10M_1k_dense 
${BASE}/y10M_1k_dense ${BASE}/X10M_1k_dense_test ${BASE}/y10M_1k_dense_test 
$FORMAT
+  ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X10M_1k_sparse 
${BASE}/y10M_1k_sparse ${BASE}/X10M_1k_sparse_test ${BASE}/y10M_1k_sparse_test 
$FORMAT
+fi
+
+##generate XL scenarios (800GB)
+if [ $MAXMEM -ge 800000 ]; then
+  ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 100000000 1000 5 
5 ${BASE}/w100M_1k_dense ${BASE}/X100M_1k_dense ${BASE}/y100M_1k_dense 1 0 
$DENSE_SP $FORMAT 1
+  ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 100000000 1000 5 
5 ${BASE}/w100M_1k_sparse ${BASE}/X100M_1k_sparse ${BASE}/y100M_1k_sparse 1 0 
$SPARSE_SP $FORMAT 1
+  ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X100M_1k_dense 
${BASE}/y100M_1k_dense ${BASE}/X100M_1k_dense_test ${BASE}/y100M_1k_dense_test 
$FORMAT
+  ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X100M_1k_sparse 
${BASE}/y100M_1k_sparse ${BASE}/X100M_1k_sparse_test 
${BASE}/y100M_1k_sparse_test $FORMAT
+fi
+
+wait
\ No newline at end of file
diff --git a/scripts/perftest/datagen/genClusteringData.sh 
b/scripts/perftest/datagen/genClusteringData.sh
new file mode 100644
index 0000000000..46adffb9e3
--- /dev/null
+++ b/scripts/perftest/datagen/genClusteringData.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+if [ "$(basename $PWD)" != "perftest" ];
+then
+  echo "Please execute scripts from directory 'perftest'"
+  exit 1;
+fi
+
+CMD=${1:-systemds}
+BASE=${2:-"temp"}/clustering
+MAXMEM=${3:-80}
+
+FORMAT="binary" 
+DENSE_SP=0.9
+SPARSE_SP=0.01
+
+echo "-- Generating clustering data..." >> results/times.txt;
+
+#generate XS scenarios (80MB)
+if [ $MAXMEM -ge 80 ]; then
+  ${CMD} -f datagen/genRandData4Kmeans.dml --nvargs nr=10000 nf=1000 nc=5 
dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X10k_1k_dense C=$BASE/C10k_1k_dense 
Y=$BASE/y10k_1k_dense YbyC=$BASE/YbyC10k_1k_dense fmt=$FORMAT & pidDense80=$!
+  wait $pidDense80; ${CMD} -f scripts/extractTestData.dml --args 
$BASE/X10k_1k_dense $BASE/y10k_1k_dense $BASE/X10k_1k_dense_test 
$BASE/y10k_1k_dense_test $FORMAT &
+fi
+
+#generate S scenarios (800MB)
+if [ $MAXMEM -ge 800 ]; then
+  ${CMD} -f datagen/genRandData4Kmeans.dml --nvargs nr=100000 nf=1000 nc=5 
dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X100k_1k_dense 
C=$BASE/C100k_1k_dense Y=$BASE/y100k_1k_dense YbyC=$BASE/YbyC100k_1k_dense 
fmt=$FORMAT & pidDense800=$!
+  wait $pidDense800; ${CMD} -f scripts/extractTestData.dml --args 
$BASE/X100k_1k_dense $BASE/y100k_1k_dense $BASE/X100k_1k_dense_test 
$BASE/y100k_1k_dense_test $FORMAT &
+fi
+
+#generate M scenarios (8GB)
+if [ $MAXMEM -ge 8000 ]; then
+  ${CMD} -f datagen/genRandData4Kmeans.dml --nvargs nr=1000000 nf=1000 nc=5 
dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X1M_1k_dense C=$BASE/C1M_1k_dense 
Y=$BASE/y1M_1k_dense YbyC=$BASE/YbyC1M_1k_dense fmt=$FORMAT & pidDense8000=$!
+  wait $pidDense8000; ${CMD} -f scripts/extractTestData.dml --args 
$BASE/X1M_1k_dense $BASE/y1M_1k_dense $BASE/X1M_1k_dense_test 
$BASE/y1M_1k_dense_test $FORMAT &
+fi
+
+#generate L scenarios (80GB)
+if [ $MAXMEM -ge 80000 ]; then
+  ${CMD} -f datagen/genRandData4Kmeans.dml --nvargs nr=10000000 nf=1000 nc=5 
dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X10M_1k_dense C=$BASE/C10M_1k_dense 
Y=$BASE/y10M_1k_dense YbyC=$BASE/YbyC10M_1k_dense fmt=$FORMAT
+  ${CMD} -f scripts/extractTestData.dml --args $BASE/X10M_1k_dense 
$BASE/y10M_1k_dense $BASE/X10M_1k_dense_test $BASE/y10M_1k_dense_test $FORMAT
+fi
+
+#generate XL scenarios (800GB)
+if [ $MAXMEM -ge 800000 ]; then
+  ${CMD} -f datagen/genRandData4Kmeans.dml --nvargs nr=100000000 nf=1000 nc=5 
dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X100M_1k_dense 
C=$BASE/C100M_1k_dense Y=$BASE/y100M_1k_dense YbyC=$BASE/YbyC100M_1k_dense 
fmt=$FORMAT
+  ${CMD} -f scripts/extractTestData.dml --args $BASE/X100M_1k_dense 
$BASE/y100M_1k_dense $BASE/X100M_1k_dense_test $BASE/y100M_1k_dense_test $FORMAT
+fi
+
+wait
\ No newline at end of file
diff --git a/scripts/perftest/datagen/genDescriptiveStatisticsData.sh 
b/scripts/perftest/datagen/genDescriptiveStatisticsData.sh
new file mode 100644
index 0000000000..c59fdc6a2a
--- /dev/null
+++ b/scripts/perftest/datagen/genDescriptiveStatisticsData.sh
@@ -0,0 +1,60 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+if [ "$(basename $PWD)" != "perftest" ];
+then
+  echo "Please execute scripts from directory 'perftest'"
+  exit 1;
+fi
+
+CMD=$1
+BASE=$2/bivar
+MAXMEM=$3
+
+FORMAT="binary"
+
+c=1000
+nc=100
+mdomain=1100
+set=20
+labelset=10
+
+#XS data 10K rows
+if [ $MAXMEM -ge 80 ]; then
+  ${CMD} -f datagen/genRandData4DescriptiveStats.dml --explain --stats 
--nvargs R=10000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_10k/data 
TYPES=${BASE}/A_10k/types SETSIZE=$set LABELSETSIZE=$labelset 
TYPES1=${BASE}/A_10k/set1.types TYPES2=${BASE}/A_10k/set2.types 
INDEX1=${BASE}/A_10k/set1.indices INDEX2=${BASE}/A_10k/set2.indices FMT=$FORMAT 
&
+fi
+
+#S data 100K rows
+if [ $MAXMEM -ge 800 ]; then
+  ${CMD} -f datagen/genRandData4DescriptiveStats.dml --explain --stats 
--nvargs R=100000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_100k/data 
TYPES=${BASE}/A_100k/types SETSIZE=$set LABELSETSIZE=$labelset 
TYPES1=${BASE}/A_100k/set1.types TYPES2=${BASE}/A_100k/set2.types 
INDEX1=${BASE}/A_100k/set1.indices INDEX2=${BASE}/A_100k/set2.indices 
FMT=$FORMAT &
+fi
+
+#M data 1M rows
+if [ $MAXMEM -ge 8000 ]; then
+  ${CMD} -f datagen/genRandData4DescriptiveStats.dml --explain --stats 
--nvargs R=1000000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_1M/data 
TYPES=${BASE}/A_1M/types SETSIZE=$set LABELSETSIZE=$labelset 
TYPES1=${BASE}/A_1M/set1.types TYPES2=${BASE}/A_1M/set2.types 
INDEX1=${BASE}/A_1M/set1.indices INDEX2=${BASE}/A_1M/set2.indices FMT=$FORMAT &
+fi
+
+#L data 10M rows
+if [ $MAXMEM -ge 80000 ]; then
+  ${CMD} -f datagen/genRandData4DescriptiveStats.dml --explain --stats 
--nvargs R=10000000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_10M/data 
TYPES=${BASE}/A_10M/types SETSIZE=$set LABELSETSIZE=$labelset 
TYPES1=${BASE}/A_10M/set1.types TYPES2=${BASE}/A_10M/set2.types 
INDEX1=${BASE}/A_10M/set1.indices INDEX2=${BASE}/A_10M/set2.indices FMT=$FORMAT
+fi
+
+wait
\ No newline at end of file
diff --git a/scripts/perftest/datagen/genDimensionReductionData.sh 
b/scripts/perftest/datagen/genDimensionReductionData.sh
new file mode 100644
index 0000000000..cd90aa1758
--- /dev/null
+++ b/scripts/perftest/datagen/genDimensionReductionData.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+if [ "$(basename $PWD)" != "perftest" ];
+then
+  echo "Please execute scripts from directory 'perftest'"
+  exit 1;
+fi
+
+CMD=${1:-systemds}
+BASE=${2:-"temp"}/dimensionreduction
+MAXMEM=${3:-80}
+
+FORMAT="binary"
+
+echo "-- Generating Dimension Reduction data." >> results/times.txt;
+
+#generate XS scenarios (80MB)
+if [ $MAXMEM -ge 80 ]; then
+  ${CMD} -f datagen/genRandData4PCA.dml --nvargs R=5000 C=2000 
OUT=$BASE/pcaData5k_2k_dense FMT=$FORMAT &
+fi
+
+#generate S scenarios (800MB)
+if [ $MAXMEM -ge 800 ]; then
+  ${CMD} -f datagen/genRandData4PCA.dml --nvargs R=50000 C=2000 
OUT=$BASE/pcaData50k_2k_dense FMT=$FORMAT &
+fi
+
+#generate M scenarios (8GB)
+if [ $MAXMEM -ge 8000 ]; then
+  ${CMD} -f datagen/genRandData4PCA.dml --nvargs R=500000 C=2000 
OUT=$BASE/pcaData500k_2k_dense FMT=$FORMAT &
+fi
+
+#generate L scenarios (80GB)
+if [ $MAXMEM -ge 80000 ]; then
+  ${CMD} -f datagen/genRandData4PCA.dml --nvargs R=5000000 C=2000 
OUT=$BASE/pcaData5M_2k_dense FMT=$FORMAT
+fi
+
+#generate XL scenarios (800GB)
+if [ $MAXMEM -ge 800000 ]; then
+  ${CMD} -f datagen/genRandData4PCA.dml --nvargs R=50000000 C=2000 
OUT=$BASE/pcaData50M_2k_dense FMT=$FORMAT
+fi
+
+wait
\ No newline at end of file
diff --git a/scripts/perftest/datagen/genIOData.sh 
b/scripts/perftest/datagen/genIOData.sh
new file mode 100644
index 0000000000..46154f8636
--- /dev/null
+++ b/scripts/perftest/datagen/genIOData.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+if [ "$(basename $PWD)" != "perftest" ];
+then
+  echo "Please execute scripts from directory 'perftest'"
+  exit 1;
+fi
+
+CMD=${1:-systemds}
+DATADIR=${2:-"temp"}/io
+MAXMEM=${3:-1}
+
+FORMAT="csv" # can be csv, mm, text, binary
+
+echo "-- Generating IO data." >> results/times.txt;
+
+
+#generate XXS scenarios (1MB)
+if [ $MAXMEM -ge 1 ]; then
+  ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X500_250_dense 
R=500 C=250 Fmt=$FORMAT &
+fi
+
+#generate XS scenarios (10MB)
+if [ $MAXMEM -ge 10 ]; then
+  ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X5k_250_dense 
R=5000 C=250 Fmt=$FORMAT &
+fi
+
+#generate XS scenarios (80MB)
+if [ $MAXMEM -ge 80 ]; then
+  ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X10k_1k_dense 
R=10000 C=1000 Fmt=$FORMAT &
+fi
+
+#generate S scenarios (800MB)
+if [ $MAXMEM -ge 800 ]; then
+  ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X100k_1k_dense 
R=100000 C=1000 Fmt=$FORMAT &
+fi
+
+#generate M scenarios (8GB)
+if [ $MAXMEM -ge 8000 ]; then
+  ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X1M_1k_dense 
R=1000000 C=1000 Fmt=$FORMAT &
+fi
+
+#generate L scenarios (80GB)
+if [ $MAXMEM -ge 80000 ]; then
+  ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X10M_1k_dense 
R=10000000 C=1000 Fmt=$FORMAT &
+fi
+
+#generate XL scenarios (800GB)
+if [ $MAXMEM -ge 800000 ]; then
+  ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X100M_1k_dense 
R=100000000 C=1000 Fmt=$FORMAT &
+fi
+
+wait
diff --git a/scripts/perftest/datagen/genL2SVMData.sh 
b/scripts/perftest/datagen/genL2SVMData.sh
new file mode 100644
index 0000000000..d25e433530
--- /dev/null
+++ b/scripts/perftest/datagen/genL2SVMData.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+if [ "$(basename $PWD)" != "perftest" ];
+then
+  echo "Please execute scripts from directory 'perftest'"
+  exit 1;
+fi
+
+CMD=$1
+DATADIR=$2
+
+FORMAT="binary" # can be csv, mm, text, binary
+DENSE_SP=0.9
+SPARSE_SP=0.01
+
+BASEPATH=$(dirname $0)
+
+#generate XS scenarios (80MB)
+${CMD} -f ${BASEPATH}/../datagen/genRandData4LogisticRegression.dml --args 
10000 1000 5 5 ${DATADIR}/w10k_1k_dense ${DATADIR}/X10k_1k_dense 
${DATADIR}/Y10k_1k_dense 1 0 $DENSE_SP $FORMAT 1
+${CMD} -f ${BASEPATH}/../datagen/genRandData4LogisticRegression.dml --args 
10000 1000 5 5 ${DATADIR}/w10k_1k_sparse ${DATADIR}/X10k_1k_sparse 
${DATADIR}/Y10k_1k_sparse 1 0 $SPARSE_SP $FORMAT 1
diff --git a/scripts/perftest/datagen/genMultinomialData.sh 
b/scripts/perftest/datagen/genMultinomialData.sh
new file mode 100644
index 0000000000..95c42f87dd
--- /dev/null
+++ b/scripts/perftest/datagen/genMultinomialData.sh
@@ -0,0 +1,78 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+if [ "$(basename $PWD)" != "perftest" ];
+then
+  echo "Please execute scripts from directory 'perftest'"
+  exit 1;
+fi
+
+CMD=$1
+BASE=$2/multinomial
+MAXMEM=$3
+
+FORMAT="binary" 
+DENSE_SP=0.9
+SPARSE_SP=0.01
+
+echo "-- Generating multinomial data..." >> results/times.txt;
+
+#generate XS scenarios (80MB)
+if [ $MAXMEM -ge 80 ]; then
+  ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 10000 1000 
$DENSE_SP 5 0 $BASE/X10k_1k_dense_k5 $BASE/y10k_1k_dense_k5 $FORMAT 1 & 
pidDense80=$!
+  ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 10000 1000 
$SPARSE_SP 5 0 $BASE/X10k_1k_sparse_k5 $BASE/y10k_1k_sparse_k5 $FORMAT 1 & 
pidSparse80=$!
+  wait $pidDense80;  ${CMD} -f scripts/extractTestData.dml $DASH-args 
$BASE/X10k_1k_dense_k5 $BASE/y10k_1k_dense_k5 $BASE/X10k_1k_dense_k5_test 
$BASE/y10k_1k_dense_k5_test $FORMAT &
+  wait $pidSparse80; ${CMD} -f scripts/extractTestData.dml $DASH-args 
$BASE/X10k_1k_sparse_k5 $BASE/y10k_1k_sparse_k5 $BASE/X10k_1k_sparse_k5_test 
$BASE/y10k_1k_sparse_k5_test $FORMAT &
+fi
+
+##generate S scenarios (800MB)
+if [ $MAXMEM -ge 800 ]; then
+  ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 100000 1000 
$DENSE_SP 5 0 $BASE/X100k_1k_dense_k5 $BASE/y100k_1k_dense_k5 $FORMAT 1 & 
pidDense800=$!
+  ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 100000 1000 
$SPARSE_SP 5 0 $BASE/X100k_1k_sparse_k5 $BASE/y100k_1k_sparse_k5 $FORMAT 1 & 
pidSparse800=$!
+  wait $pidDense800;  ${CMD} -f scripts/extractTestData.dml $DASH-args 
$BASE/X100k_1k_dense_k5 $BASE/y100k_1k_dense_k5 $BASE/X100k_1k_dense_k5_test 
$BASE/y100k_1k_dense_k5_test $FORMAT &
+  wait $pidSparse800; ${CMD} -f scripts/extractTestData.dml $DASH-args 
$BASE/X100k_1k_sparse_k5 $BASE/y100k_1k_sparse_k5 $BASE/X100k_1k_sparse_k5_test 
$BASE/y100k_1k_sparse_k5_test $FORMAT &
+fi
+
+##generate M scenarios (8GB)
+if [ $MAXMEM -ge 8000 ]; then
+  ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 1000000 1000 
$DENSE_SP 5 0 $BASE/X1M_1k_dense_k5 $BASE/y1M_1k_dense_k5 $FORMAT 1 & 
pidDense8000=$!
+  ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 1000000 1000 
$SPARSE_SP 5 0 $BASE/X1M_1k_sparse_k5 $BASE/y1M_1k_sparse_k5 $FORMAT 1 & 
pidSparse8000=$!
+  wait $pidDense8000;  ${CMD} -f scripts/extractTestData.dml $DASH-args 
$BASE/X1M_1k_dense_k5 $BASE/y1M_1k_dense_k5 $BASE/X1M_1k_dense_k5_test 
$BASE/y1M_1k_dense_k5_test $FORMAT &
+  wait $pidSparse8000; ${CMD} -f scripts/extractTestData.dml $DASH-args 
$BASE/X1M_1k_sparse_k5 $BASE/y1M_1k_sparse_k5 $BASE/X1M_1k_sparse_k5_test 
$BASE/y1M_1k_sparse_k5_test $FORMAT &
+fi
+
+##generate L scenarios (80GB)
+if [ $MAXMEM -ge 80000 ]; then
+  ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 10000000 1000 
$DENSE_SP 5 0 $BASE/X10M_1k_dense_k5 $BASE/y10M_1k_dense_k5 $FORMAT 1
+  ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 10000000 1000 
$SPARSE_SP 5 0 $BASE/X10M_1k_sparse_k5 $BASE/y10M_1k_sparse_k5 $FORMAT 1
+  ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X10M_1k_dense_k5 
$BASE/y10M_1k_dense_k5 $BASE/X10M_1k_dense_k5_test $BASE/y10M_1k_dense_k5_test 
$FORMAT
+  ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X10M_1k_sparse_k5 
$BASE/y10M_1k_sparse_k5 $BASE/X10M_1k_sparse_k5_test 
$BASE/y10M_1k_sparse_k5_test $FORMAT
+fi
+
+#generate XL scenarios (800GB)
+if [ $MAXMEM -ge 800000 ]; then
+  ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 100000000 1000 
$DENSE_SP 5 0 $BASE/X100M_1k_dense_k5 $BASE/y100M_1k_dense_k5 $FORMAT 1
+  ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 100000000 1000 
$SPARSE_SP 5 0 $BASE/X100M_1k_sparse_k5 $BASE/y100M_1k_sparse_k5 $FORMAT 1
+  ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X100M_1k_dense_k5 
$BASE/y100M_1k_dense_k5 $BASE/X100M_1k_dense_k5_test 
$BASE/y100M_1k_dense_k5_test $FORMAT
+  ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X100M_1k_sparse_k5 
$BASE/y100M_1k_sparse_k5 $BASE/X100M_1k_sparse_k5_test 
$BASE/y100M_1k_sparse_k5_test $FORMAT
+fi
+
+wait
\ No newline at end of file
diff --git a/scripts/perftest/datagen/genRandData4ALS.dml 
b/scripts/perftest/datagen/genRandData4ALS.dml
new file mode 100644
index 0000000000..f6c3562862
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4ALS.dml
@@ -0,0 +1,47 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# Generates a sparse m x n matrix X for ALS experiments: at nnz randomly
+# sampled cell positions X holds (U %*% t(V)) plus Gaussian noise, all other
+# cells are zero.
+#
+# Named arguments:
+#   X      output location of the generated matrix X (m x n)
+#   U      optional output location of the row factor U (m x r)
+#   V      optional output location of the column factor, written as r x n
+#   rows   m, number of rows of X
+#   cols   n, number of columns of X
+#   rank   r, rank of the factorization
+#   nnz    number of sampled cell positions (positions are drawn independently,
+#          so duplicates are possible)
+#   sigma  variance of the Gaussian noise (default 0.01)
+#   fmt    output format (default "binary")
+
+Xfile = $X; # input matrix X of size m x n
+Ufile = ifdef($U, " "); # original row factor of size m x r
+Vfile = ifdef($V, " "); # original col factor of size r x n
+m = $rows; # no. of rows of X
+n = $cols; # no. of cols of X
+r = $rank; # rank of factorization
+nnz = $nnz; # no. of nonzeros in X
+sigma = ifdef ($sigma, 0.01); # variance of Gaussian noise
+fmt = ifdef ($fmt, "binary"); # output format
+
+# generate original factors by sampling from a normal(0,1.0) distribution
+U = rand(rows = m, cols = r, pdf = "normal", seed = 123);
+V = rand(rows = n, cols = r, pdf = "normal", seed = 456);
+
+# sample nnz row/column indices from {1..m} x {1..n}
+I = floor(rand(rows = nnz, cols = 1, min = 1, max = m + 0.999999999));
+J = floor(rand(rows = nnz, cols = 1, min = 1, max = n + 0.999999999));
+# Gaussian noise with variance sigma for every sampled position
+X = rand(rows = nnz, cols = 1, pdf = "normal") * sqrt(sigma);
+# Pin the output dimensions to m x n. Without them table() sizes N by the
+# largest sampled index, and the cell-wise product with the m x n matrix
+# U %*% t(V) below fails whenever the last row or column is not sampled.
+N = table(I, J, X, m, n);
+# keep the low-rank signal only where a noise entry was placed
+X = (N != 0) * (U %*% t(V)) + N;
+write(X, Xfile, format = fmt);
+if( Ufile != " " )
+  write(U, Ufile, format = fmt);
+if( Vfile != " " ) {
+  # V is generated as n x r; it is written transposed (r x n)
+  V = t(V);
+  write(V, Vfile, format = fmt);
+}
diff --git a/scripts/perftest/datagen/genRandData4ChisquaredTest.dml 
b/scripts/perftest/datagen/genRandData4ChisquaredTest.dml
new file mode 100644
index 0000000000..8f2b945e01
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4ChisquaredTest.dml
@@ -0,0 +1,87 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# generates a two column matrix of categorical
+# variables
+# used to test systemds's chi-squared bivariate stat
+# computation
+
+# $1 is number of samples to generate
+# $2 is number of categories for 1st categorical variable
+# $3 is number of categories for 2nd categorical variable
+# $4 is the file to write out the chi-squared statistic to
+# $5 is the file to write out the generated data to
+
+numSamples = $1
+numCategories1 = $2
+numCategories2 = $3
+
+# o: random numCategories1 x numCategories2 table, normalized so that its
+# cells sum to 1 (joint probabilities of the two categorical variables)
+o = Rand(rows=numCategories1, cols=numCategories2, min=0.0, max=1.0, 
pdf="uniform", seed=0)
+o = o / sum(o)
+
+# e: outer product of the row and column marginals of o, i.e. the joint
+# probabilities expected if the two variables were independent
+probs1 = rowSums(o)
+probs1 = probs1 / sum(probs1)
+probs2 = colSums(o)
+probs2 = probs2 / sum(probs2)
+e = probs1 %*% probs2
+
+# statistic computed on the probability tables o and e (it is not scaled by
+# numSamples here)
+chisquared = sum((o-e)^2/e)
+write(chisquared, $4, format="binary")
+
+# oCDF: running sum of o in row-major order, so oCDF[i,j] is the total
+# probability of all cells up to and including (i,j).
+# The matrix starts out all-zero (min = max = 0).
+oCDF = Rand(rows=numCategories1, cols=numCategories2, min=0.0, max=0.0, 
pdf="uniform", seed=0)
+for(i in 1:numCategories1){
+       for(j in 1:numCategories2){
+               # very first cell: start of the running sum
+               if(i==1 & j==1){
+                       oCDF[i,j] = o[1,1]
+               }
+               # first cell of a later row: continue from the end of the previous row
+               if(i != 1 & j == 1){
+                       oCDF[i,j] = oCDF[i-1,numCategories2] + o[i,j]
+               }
+               # any other cell: continue from the left neighbour
+               if(j > 1){
+                       oCDF[i,j] = oCDF[i,j-1] + o[i,j]
+               }
+       }
+}
+
+# Sample numSamples (cat1, cat2) pairs from the joint distribution o by
+# inverse-CDF lookup in oCDF and write them as a numSamples x 2 matrix to $5.
+#
+# All uniform draws are generated up front in one seeded call. Calling Rand
+# with the fixed seed=0 inside the loop returns the same value in every
+# iteration, which made every generated row carry the same category pair.
+R = Rand(rows=numSamples, cols=1, min=0.0, max=1.0, pdf="uniform", seed=0)
+one = Rand(rows=1, cols=1, min=1.0, max=1.0, pdf="uniform", seed=0)
+data = Rand(rows=numSamples, cols=2, min=0.0, max=0.0, pdf="uniform", seed=0)
+parfor(s in 1:numSamples){
+       r = as.scalar(R[s,1])
+
+       # Default to the last cell: the running sum in oCDF can end marginally
+       # below 1.0 due to floating-point round-off, in which case no cell
+       # would match a draw very close to 1.0.
+       cat1 = numCategories1
+       cat2 = numCategories2
+       # continue == 1 until the first cell whose cumulative probability
+       # reaches r has been found
+       continue = 1
+       for(i in 1:numCategories1){
+               for(j in 1:numCategories2){
+                       cdf = as.scalar(oCDF[i,j])
+                       if(continue == 1 & r <= cdf){
+                               cat1 = i
+                               cat2 = j
+                               continue = 0
+                       }
+               }
+       }
+
+       # multiply by the 1x1 matrix "one" to assign a matrix value
+       data[s,1] = cat1*one
+       data[s,2] = cat2*one
+}
+write(data, $5, format="binary")
diff --git a/scripts/perftest/datagen/genRandData4DecisionTree.sh 
b/scripts/perftest/datagen/genRandData4DecisionTree.sh
new file mode 100644
index 0000000000..44978192fe
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4DecisionTree.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+if [ "$1" == "" -o "$2" == "" ]; then echo "Usage: $0 <hdfsDataDir> <MR | 
SPARK | ECHO>   e.g. $0 perftest SPARK" ; exit 1 ; fi
+if [ "$2" == "SPARK" ]; then CMD="./sparkDML.sh "; DASH="-"; elif [ "$2" == 
"MR" ]; then CMD="hadoop jar SystemDS.jar " ; else CMD="echo " ; fi
+
+BASE=$1/trees
+
+FORMAT="csv" 
+DENSE_SP=0.9
+SPARSE_SP=0.01
+
+PATH_LOCAL=/tmp/datagen
+PATH_HDFS=$BASE
+
+#### part 1: generating class labels and categorical features  
+${CMD} -f ../datagen/genRandData4DecisionTree1.dml $DASH-nvargs 
XCat=$BASE/XCat Y=$BASE/Y num_records=1000 num_cat=100 num_class=10 
num_distinct=100 sp=$DENSE_SP
+
+#### part 2: generating spec.json on HDFS
+NUM_FEATURES=100
+
+echo "{ \"ids\": true 
+       ,\"recode\": [1 " > $PATH_LOCAL/spec.json
+for i in $(seq 2 $NUM_FEATURES); do
+       echo " , "$i >> $PATH_LOCAL/spec.json
+done
+echo " ] , \"dummycode\": [ 1" >> $PATH_LOCAL/spec.json
+for i in $(seq 2 $NUM_FEATURES); do
+       echo " , "$i >> $PATH_LOCAL/spec.json
+done
+echo "] }" >> $PATH_LOCAL/spec.json
+
+hadoop fs -rm $PATH_HDFS/spec.json
+hadoop fs -copyFromLocal $PATH_LOCAL/spec.json $PATH_HDFS/spec.json  
+
+#### part 3: generating scale feature and transforming categorical features, 
finally combaning scale and categorical features
+${CMD} -f ../datagen/genRandData4DecisionTree2.dml $DASH-nvargs 
tPath=$BASE/metadata tSpec=$BASE/spec.json XCat=$BASE/XCat X=$BASE/X 
num_records=1000 num_scale=100 sp=$DENSE_SP fmt=$FORMAT
+
+
diff --git a/scripts/perftest/datagen/genRandData4DecisionTree1.dml 
b/scripts/perftest/datagen/genRandData4DecisionTree1.dml
new file mode 100644
index 0000000000..7d1dd50d6b
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4DecisionTree1.dml
@@ -0,0 +1,40 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+# Part 1 of the decision-tree data generator: writes one-hot encoded class
+# labels and a frame of categorical features.
+#
+# Named arguments:
+#   XCat          output location of the categorical feature frame (csv)
+#   Y             output location of the one-hot class label matrix
+#   num_records   number of rows
+#   num_cat       number of categorical feature columns
+#   num_class     number of classes
+#   num_distinct  number of distinct values per categorical feature
+#   sp            sparsity passed to rand() for the categorical features
+XCatFile = $XCat;
+YFile = $Y;
+num_records = $num_records;
+num_cat_features = $num_cat;
+num_class = $num_class;
+num_distinct = $num_distinct;
+sparsity = $sp;
+
+# generate class labels
+# (uniform integer labels in 1..num_class, then one row per record with a 1
+# in the column of its label)
+Y = floor (rand (rows = num_records, cols = 1, min = 1, max = num_class + 
0.99999999999999)); 
+Y_bin = table (seq (1, num_records), Y); 
+write (Y_bin, YFile);
+
+# generate categorical features
+# (integer codes up to num_distinct; converted to a frame and written as csv)
+X_cat = floor (rand (rows = num_records, cols = num_cat_features, min = 1, max 
= num_distinct + 0.99999999999999, sparsity = sparsity));
+fX_cat = as.frame(X_cat);
+write (fX_cat, XCatFile, format = "csv");
+
diff --git a/scripts/perftest/datagen/genRandData4DecisionTree2.dml 
b/scripts/perftest/datagen/genRandData4DecisionTree2.dml
new file mode 100644
index 0000000000..715924915c
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4DecisionTree2.dml
@@ -0,0 +1,41 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+# Part 2 of the decision-tree data generator: generates scale features,
+# transforms the categorical features written by part 1 and writes the
+# column-wise combination of both.
+#
+# Named arguments:
+#   tPath        transformPath handed to transform()
+#   tSpec        location of the transform specification (read as a string scalar)
+#   XCat         location of the categorical features to read
+#   X            output location of the combined feature matrix
+#   num_records  number of rows of the scale features
+#   num_scale    number of scale feature columns
+#   sp           sparsity passed to rand() for the scale features
+#   fmt          output format of X
+transformPath = $tPath;
+transformSpec = $tSpec;
+XCatFile = $XCat;
+XFile = $X;
+num_records = $num_records;
+num_scale_features = $num_scale;
+sparsity = $sp;
+fmt = $fmt;
+
+# generate scale features
+# (values drawn from [0, 10])
+X_scale = rand (rows = num_records, cols = num_scale_features, min = 0, max = 
10, sparsity = sparsity); 
+
+# transform categorical features
+# NOTE(review): read() is called without data_type/format here; presumably
+# this relies on metadata stored next to XCatFile - confirm.
+XCF = read (XCatFile);
+specJson = read(transformSpec, data_type="scalar", value_type="string");
+X_cat_transformed = transform (target = XCF, spec = specJson, transformPath = 
transformPath);
+
+# scale features first, transformed categorical features after them
+X = cbind (X_scale, X_cat_transformed);
+write (X, XFile, format = fmt);
diff --git a/scripts/perftest/datagen/genRandData4DescriptiveStats.dml 
b/scripts/perftest/datagen/genRandData4DescriptiveStats.dml
new file mode 100644
index 0000000000..6f96162074
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4DescriptiveStats.dml
@@ -0,0 +1,149 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+------------------------------------------------
+  Parameters
+------------------------------------------------
+$R          = #rows
+$C          = #columns
+$NC         = number of categorical attributes
+$MAXDOMAIN  = maximum domain size
+$DATA       = output file path on HDFS
+$SETSIZE    = Size of one bivariate set
+$LABELSETSIZE= Size of second bivariate set with labels
+$TYPES      = output attribute types
+$TYPES1     = Attribute types for Set1
+$TYPES2     = Attribute types for Set2
+$INDEX1     = Indices for Set1
+$INDEX2     = Indices for Set2
+$FMT        = output format (default: binary)
+------------------------------------------------
+hadoop jar SystemDS.jar -f genRandData4DescriptiveStats.dml -nvargs R=1000000
+C=1000 NC=50 MAXDOMAIN=1100 DATA=stats/data TYPES=stats/types SETSIZE=15
+LABELSETSIZE=10 TYPES1=... TYPES2=... INDEX1=.. INDEX2=.. FMT=csv
+------------------------------------------------
+*/
+
+
+FMT = ifdef($FMT,"binary"); # default output format
+
+# number of categorical attributes.. numC <= C
+# (the first half, rounded down, is ordinal; the rest is nominal)
+numC = $NC;
+numO = as.integer(numC/2);
+numNominal = numC - numO;
+print("Categorical Mix = (" + numC + "," + numO + "," + numNominal +")");
+
+# maximum domain size among all categorical attributes
+maxDomainSize = $MAXDOMAIN;
+
+# Divide $C attributes according to the following logic:
+#
+#   1     numO       numC               C
+#   |-------|---------|-----------------|
+#      ord    nominal    scale
+#
+# numC+1-$C: scale
+# 1-numC/2: ordinal
+# (numC/2+1)-numC: nominal
+
+# type codes written to $TYPES: 1 = scale (initial value), 2 = nominal, 3 = ordinal
+types = matrix(1, rows=1, cols=$C);
+ocutoff = numO;
+types[1,1:ocutoff] = matrix(1,rows=1,cols=ocutoff)*3;
+types[1, ocutoff+1:numC] = matrix(1,rows=1,cols=(numC-ocutoff))*2;
+
+# Generate data
+# A holds dense uniform values; B receives the final data
+A = rand(rows=$R, cols=$C, sparsity=1);
+B = matrix(0,rows=nrow(A), cols=ncol(A));
+parfor (i in 1:numC) {
+    Ai = A[,i];
+
+    # random domain size in [1, maxDomainSize] for this categorical attribute
+    tmp = round(rand(rows=1,cols=1, min=1, max=maxDomainSize));
+    domain = as.scalar(tmp[1,1]);
+
+    # for some attributes, choose the maxDomainSize
+    tmp = rand(rows=1,cols=1);
+    if (as.scalar(tmp[1,1]) < 0.5) {
+        domain = maxDomainSize;
+    }
+
+    # map the uniform values of column i to integer codes in 1..domain
+    B[,i] = round(1+(domain-1)*Ai);
+}
+# scale attributes are copied over unchanged
+B[ ,(numC+1):ncol(A)] = A[, (numC+1):ncol(A)];
+ 
+
+write(B, $DATA, format=FMT);
+write(types, $TYPES, format=FMT);
+
+# ----- Generator for Bivariate ---------
+# Builds two attribute sets for bivariate statistics. For each set a row
+# vector of attribute types (1 = scale, 2 = nominal, 3 = ordinal) and a row
+# vector of column indices into the generated data are written.
+
+settypes1 = matrix(1, rows=1, cols=$SETSIZE);
+index1   = matrix(0, rows=1, cols=$SETSIZE);
+
+# Set1 layout: first half (rounded down) categorical, rest scale; of the
+# categorical part the first half (rounded down) is ordinal, the rest nominal
+catSetSize = as.integer($SETSIZE/2);
+ocutoff = as.integer(catSetSize/2);
+print("Set Mix = (" + $SETSIZE + "," + catSetSize + "," + ocutoff + ")" );
+settypes1[1, 1:ocutoff] = matrix(1,rows=1,cols=ocutoff)*3;
+settypes1[1, ocutoff+1:catSetSize] = 
matrix(1,rows=1,cols=(catSetSize-ocutoff))*2;
+
+# select ordinal indices
+# (random columns in 1..numO; drawn independently, so repeats are possible)
+tmp = rand(rows=1, cols=ocutoff);
+index1[1, 1:ocutoff] = round(1 + (numO-1)*tmp);
+
+# select nominal indices
+# (random columns in numO+1..numC)
+nominalSetSize = catSetSize-ocutoff;
+tmp = rand(rows=1, cols=nominalSetSize);
+index1[1, ocutoff+1:catSetSize] = round(numO+1 + (numC-numO-1)*tmp);
+
+# select scale attributes
+# (random columns in numC+1..$C)
+scaleSetSize = $SETSIZE-catSetSize;
+tmp = rand(rows=1, cols=scaleSetSize);
+index1[1, catSetSize+1:$SETSIZE] = round(numC+1 + ($C-numC-1)*tmp);
+
+
+# --- select types and indices for LABELSET
+# all entries start as nominal (2)
+settypes2 = matrix(2, rows=1, cols=$LABELSETSIZE);
+index2   = matrix(0, rows=1, cols=$LABELSETSIZE);
+if($LABELSETSIZE > 1) {
+    # first entry: one scale attribute
+    settypes2[1,1] = 1;
+    r = as.scalar(rand(rows=1,cols=1));
+    index2[1,1] = round(numC+1 + ($C-numC-1)*r)
+}
+else {
+    # single-entry set: one nominal attribute
+    r = as.scalar(rand(rows=1,cols=1));
+    index2[1,1] = round( numO+1 + (numC-numO-1)*r )
+}
+
+# entries 2 .. LABELSETSIZE/2: ordinal attributes
+# NOTE(review): for $LABELSETSIZE < 4 the range 2:as.integer($LABELSETSIZE/2)
+# has an upper bound below 2 - confirm how the loop behaves in that case.
+for(i in 2:as.integer($LABELSETSIZE/2)) {
+    settypes2[1,i] = 3;
+    r = as.scalar(rand(rows=1,cols=1));
+    index2[1,i] = round( 1 + (numO-1)*r )
+}
+
+# remaining entries: nominal attributes
+# NOTE(review): presumably the range parses as
+# (as.integer($LABELSETSIZE/2)+1):$LABELSETSIZE - confirm operator precedence.
+for(i in as.integer($LABELSETSIZE/2)+1:$LABELSETSIZE) {
+    settypes2[1,i] = 2;
+    r = as.scalar(rand(rows=1,cols=1));
+    index2[1,i] = round( numO+1 + (numC-numO-1)*r )
+}
+
+write(settypes1, $TYPES1, format=FMT);
+write(settypes2, $TYPES2, format=FMT);
+write(index1, $INDEX1, format=FMT);
+write(index2, $INDEX2, format=FMT);
+
diff --git a/scripts/perftest/datagen/genRandData4FTest.dml 
b/scripts/perftest/datagen/genRandData4FTest.dml
new file mode 100644
index 0000000000..9f0e1d6c68
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4FTest.dml
@@ -0,0 +1,95 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# generates random data for F-test
+#
+# $1 is number of groups (some of 
+#              which may share a gaussian)
+# $2 is number of actual groups 
+# $3 is number of points
+# $4 is mean of the gaussian means
+# $5 is mean of the gaussian std. deviations
+# $6 is file to store computed f-statistic
+# $7 is file to store generated data
+
+numGroups = $1
+numActualGroups = $2
+numSamples = $3
+meanOfMeans = $4
+meanOfStddevs = $5
+
+# cntProbs: random group probabilities (normalized to sum to 1)
+# cntArr:   per-group sample counts; the last group's count is adjusted so
+#           that all counts add up to numSamples
+cntProbs = Rand(rows=numGroups, cols=1, min=0.0, max=1.0, pdf="uniform", 
seed=0)
+cntProbs = cntProbs/sum(cntProbs)
+cntArr = round(cntProbs * numSamples)
+last_cnt = cntArr[numGroups,1]
+cntArr[numGroups,1] = numSamples - (sum(cntArr) - last_cnt)
+
+# permut: numActualGroups x numGroups 0/1 matrix assigning every group to one
+# actual group; the first numActualGroups groups map to themselves (identity)
+permut = Rand(rows=numActualGroups, cols=numGroups, min=0.0, max=0.0, 
pdf="uniform")
+ones = Rand(rows=numActualGroups, cols=1, min=1.0, max=1.0, pdf="uniform")
+permut[,1:numActualGroups] = diag(ones)
+
+# every remaining group i is assigned to a randomly picked actual group j
+# NOTE(review): seed=0 is fixed inside the parfor, so presumably every
+# iteration draws the same r and all remaining groups share one actual
+# group - confirm whether that is intended.
+one = Rand(rows=1, cols=1, min=1.0, max=1.0, pdf="uniform")
+copy_start_index = numActualGroups+1
+parfor(i in copy_start_index:numGroups){
+       r = Rand(rows=1, cols=1, min=1.0, max=numActualGroups, pdf="uniform", 
seed=0)
+       j = as.scalar(round(r))
+       permut[j,i] = one
+}
+
+# per-actual-group means, expanded to one mean per group through permut
+means_std = Rand(rows=numActualGroups, cols=1, pdf="normal", seed=0)
+abs_means = means_std + meanOfMeans
+means = t(t(abs_means) %*% permut)
+
+# per-actual-group standard deviations, expanded the same way
+# NOTE(review): this uses the same seed as means_std, so the two draws are
+# presumably identical - confirm whether that is intended.
+stddevs_std = Rand(rows=numActualGroups, cols=1, pdf="normal", seed=0)
+abs_stddevs = stddevs_std + meanOfStddevs
+stddevs = t(t(abs_stddevs) %*% permut)
+
+# count-weighted mean over all groups
+overall_mean = sum(means*cntArr)/numSamples
+
+# F = between-group variance / within-group variance, computed from the
+# group parameters and counts (not from the sampled data)
+explained_variance = sum(cntArr * (means - overall_mean)^2) / (numGroups-1.0)
+unexplained_variance = sum(cntArr * stddevs^2) / (numSamples - numGroups)
+f = explained_variance / unexplained_variance
+write(f, $6, format="binary")
+
+# Sample numSamples points: pick a group by inverse-CDF lookup over the group
+# probabilities, then draw the point from that group's gaussian. The result is
+# written as a numSamples x 1 matrix to $7.
+
+# cumulative group probabilities
+cntCDFs = cntProbs
+for(i in 2:numGroups){
+       cntCDFs[i,1] = cntCDFs[i-1,1] + cntProbs[i,1]
+}
+
+# All random numbers are drawn up front in seeded calls. Calling Rand with the
+# fixed seed=0 inside the loop returns the same value in every iteration, so
+# every generated point was identical. Different seeds keep the group draw and
+# the gaussian draw independent of each other.
+R = Rand(rows=numSamples, cols=1, min=0.0, max=1.0, pdf="uniform", seed=0)
+P = Rand(rows=numSamples, cols=1, pdf="normal", seed=1)
+
+data = Rand(rows=numSamples, cols=1, min=0.0, max=0.0, pdf="uniform")
+parfor(i in 1:numSamples){
+       r1 = as.scalar(R[i,1])
+
+       # Default to the last group: the cumulative sum can end marginally
+       # below 1.0 due to floating-point round-off, and an unmatched draw
+       # would otherwise leave an invalid index for the lookups below.
+       g = numGroups
+       # continue == 1 until the first group whose cumulative probability
+       # reaches r1 has been found
+       continue = 1
+       for(k in 1:numGroups){
+               cdf = as.scalar(cntCDFs[k,1])
+               if(continue==1 & r1<=cdf){
+                       g = k
+                       continue=0
+               }
+       }
+
+       # scale and shift the standard-normal draw by the group's parameters
+       data[i,1] = P[i,1]*stddevs[g,1] + means[g,1]
+}
+write(data, $7, format="binary")
diff --git a/scripts/perftest/datagen/genRandData4Kmeans.dml 
b/scripts/perftest/datagen/genRandData4Kmeans.dml
new file mode 100644
index 0000000000..3098650b26
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4Kmeans.dml
@@ -0,0 +1,120 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# Generates random Gaussian-mixture data to test k-Means clustering algorithms
+#
+# INPUT PARAMETERS:
+# ----------------------------------------------------------------------------
+# NAME  TYPE   DEFAULT  MEANING
+# ----------------------------------------------------------------------------
+# nr    Int     ---     Number of records
+# nf    Int     ---     Number of features
+# nc    Int     ---     Number of clusters
+# dc    Double  ---     St.dev. of cluster "centroid" features from zero mean
+# dr    Double  ---     St.dev. of the 1-st feature in a record within cluster
+# fbf   Double  ---     Feature bias factor: Stdev(last) / Stdev(1-st) feature
+# cbf   Double  ---     Cluster bias factor: Prob[1-st clus] / Prob[k-th clus]
+# X     String  "X"     Location to write matrix X with generated data records
+# C     String  "C"     Location to write cluster "centroids" (Gaussian means)
+# Y     String  "Y"     Location to write assignment of records to cluster ids
+# YbyC  String  "YbyC"  Location to write rec-cluster assigns by min-dist to C
+# fmt   String  "text"  Output format of all written matrices
+# ----------------------------------------------------------------------------
+#
+# Example:
+# hadoop jar SystemDS.jar -f genRandData4Kmeans.dml -nvargs nr=100000 nf=100
+#     nc=10 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=X.mtx C=C.mtx Y=Y.mtx 
YbyC=YbyC.mtx
+
+print ("BEGIN K-MEANS GENERATOR SCRIPT");
+
+# required arguments
+num_records   = $nr;
+num_features  = $nf;
+num_centroids = $nc;
+dist_per_feature_centroids = $dc;
+dist_per_feature_first_record = $dr;
+feature_bias_factor = $fbf;
+cluster_bias_factor = $cbf;
+
+# optional arguments with defaults
+fileX    = ifdef ($X, "X");
+fileC    = ifdef ($C, "C");
+fileY    = ifdef ($Y, "Y");
+fileYbyC = ifdef ($YbyC, "YbyC");
+fmt      = ifdef ($fmt, "text");
+
+print ("Generating cluster distribution (mixture) centroids...");
+
+# centroid coordinates: standard-normal draws scaled by dc
+C = Rand (rows = num_centroids, cols = num_features, pdf = "normal");
+C = C * dist_per_feature_centroids;
+
+print ("Generating record-to-cluster assignments...");
+
+# Y is a multinomial in {1, ..., num_centroids} with 1 being more likely
+# than "num_centroids" by the factor of "cluster_bias_factor"
+
+rnd = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf = 
"uniform");
+if (cluster_bias_factor == 1.0) {
+    # no bias: uniform cluster ids
+    Y = round (0.5 + rnd * num_centroids);
+} else {
+    # biased: map the uniform draw through a logarithmic transform so that
+    # lower cluster ids are drawn more often
+    rnd_scaled = rnd * (1 - cluster_bias_factor ^ (- num_centroids / 
(num_centroids - 1)));
+    Y = round (0.5 - (num_centroids - 1) * log (1 - rnd_scaled) / log 
(cluster_bias_factor));
+}
+
+print ("Generating within-cluster random shifts...");
+
+# per-feature st.dev. grows geometrically from dr (1-st feature) to
+# dr * fbf (last feature): dr * fbf ^ ((j - 1) / (num_features - 1))
+X_shift = Rand (rows = num_records, cols = num_features, pdf = "normal");
+feature_factors = dist_per_feature_first_record * 
+    exp ((seq (1, num_features) - 1) / (num_features - 1) * log 
(feature_bias_factor));
+X_shift = X_shift %*% diag (feature_factors);
+
+print ("Generating records by shifting from centroids..."); 
+
+# one-hot encoding of Y; Y_bitmap_raw is copied into a matrix with exactly
+# num_centroids columns because it may have fewer columns than that
+Y_bitmap_raw = table (seq (1, num_records), Y);
+Y_bitmap = matrix (0, rows = num_records, cols = num_centroids);
+Y_bitmap [, 1 : ncol (Y_bitmap_raw)] = Y_bitmap_raw;
+# each record = centroid of its cluster + random shift
+X = Y_bitmap %*% C + X_shift;
+
+print ("Computing record-to-cluster assignments by minimum centroid 
distance...");
+
+# D[i,k] = -2 * <x_i, c_k> + ||c_k||^2, i.e. the squared distance between
+# record i and centroid k minus the row-constant ||x_i||^2
+D = t(t(-2 * (X %*% t(C))) + rowSums (C ^ 2));
+# P marks the minimal entries per row; the number of columns before the
+# first marked one, plus 1, is the id of the closest centroid
+P = (D <= rowMins (D));
+aggr_P = t(cumsum (t(P)));
+Y_by_C = rowSums (aggr_P == 0) + 1;
+
+print ("Computing useful statistics...");
+
+# default_wcss:  sum of squares around the overall mean (one cluster)
+# attained_wcss: sum of squared distances to the closest mixture centroid
+sumXsq = sum (X ^ 2);
+default_wcss  = sumXsq - sum (colSums (X) ^ 2) / num_records;
+attained_wcss = sumXsq + sum (rowMins (D));
+
+print ("Default (single-cluster) WCSS = " + default_wcss);
+print (num_centroids + "-cluster WCSS attained by the mixture centroids = " + 
attained_wcss);
+
+print ("Writing out the resulting dataset...");
+
+write (X, fileX, format = fmt);
+write (C, fileC, format = fmt);
+write (Y, fileY, format = fmt);
+write (Y_by_C, fileYbyC, format = fmt);
+
+print ("Please run the scoring script to compare " + fileY + " with " + 
fileYbyC); 
+
+print ("DONE: K-MEANS GENERATOR SCRIPT");
+
diff --git a/scripts/perftest/datagen/genRandData4LinearReg_LTstats.dml 
b/scripts/perftest/datagen/genRandData4LinearReg_LTstats.dml
new file mode 100644
index 0000000000..9bb1ca189e
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4LinearReg_LTstats.dml
@@ -0,0 +1,233 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# generates random data to test linear regression: the response is the first
+# linear term plus standard-normal noise (the label generation for bi- and
+# multinomial logistic regression is kept further down, commented out)
+
+# $N  = number of training samples
+# $Nt = number of test samples (or 0 if none)
+# $nf = number of features (independent variables)
+# $nc = number of categories; = 1 if "binomial" with +1/-1 labels
+# $Xmin  = minimum feature value
+# $Xmax  = maximum feature value
+# $spars = controls sparsity in the generated data
+# $avgLTmin = average linear term (X %*% beta + intercept), minimum value
+# $avgLTmax = average linear term (X %*% beta + intercept), maximum value
+# $stdLT = requested standard deviation for the linear terms
+# $iceptmin = intercept, minimum value (0.0 disables intercept)
+# $iceptmax = intercept, maximum value (0.0 disables intercept)
+# $B  = location to store generated regression parameters (default "B")
+# $X  = location to store generated training data (default "X")
+# $Y  = location to store generated training category labels (default "Y")
+# $Xt = location to store generated test data (default "Xt")
+# $Yt = location to store generated test category labels (default "Yt")
+# $fmt = format of the output (default "mm")
+#
+# Example:
+# hadoop jar SystemDS.jar -f genRandData4LinearReg_LTstats.dml -nvargs
+#     N=1000000 Nt=1000 nf=20 nc=3 Xmin=0.0 Xmax=1.0 spars=1.0 avgLTmin=3.0 
avgLTmax=5.0 stdLT=1.25
+#     iceptmin=1.0 iceptmax=1.0 B=./B123 X=./X123 Y=./Y123 Xt=./Xt123 
Yt=./Yt123 fmt=binary
+
+# required arguments
+numTrainingSamples = $N;
+numTestSamples = $Nt;
+numFeatures = $nf;
+numCategories = $nc;
+minIntercept = $iceptmin;
+maxIntercept = $iceptmax;
+minXentry = $Xmin;
+maxXentry = $Xmax;
+minAvgLT = $avgLTmin;
+maxAvgLT = $avgLTmax;
+sparsityLevel = $spars;
+stdevLT = $stdLT;
+# optional arguments with defaults
+fileB  = ifdef ($B,  "B");
+fileX  = ifdef ($X,  "X");
+fileY  = ifdef ($Y,  "Y");
+fileXt = ifdef ($Xt, "Xt");
+fileYt = ifdef ($Yt, "Yt");
+fmt = ifdef ($fmt, "mm");
+# training and test rows are generated together and split at the end
+numSamples = numTrainingSamples + numTestSamples;
+
+# nc == 1 means binomial with +1/-1 labels: handled as 2 categories.
+# isBinomialPMOne is only read inside the commented-out label generation below.
+isBinomialPMOne = FALSE;
+if (numCategories == 1) {
+    numCategories = 2;
+    isBinomialPMOne = TRUE;
+}
+# the intercept row is written only if at least one of its bounds is nonzero
+do_we_output_intercept = 1;
+if (minIntercept == 0 & maxIntercept == 0) {
+    do_we_output_intercept = 0;
+}
+
+X = Rand (rows = numSamples, cols = numFeatures, min = minXentry, max = 
maxXentry, pdf = "uniform", sparsity = sparsityLevel);
+
+# one target mean, target st.dev. and intercept per linear term
+# (numCategories - 1 linear terms)
+meanLT  = Rand (rows = 1, cols = numCategories - 1, min = minAvgLT, max = 
maxAvgLT, pdf = "uniform");
+sigmaLT = matrix (stdevLT, rows = 1, cols = numCategories - 1);
+b_intercept = Rand (rows = 1, cols = numCategories - 1, min = minIntercept, 
max = maxIntercept, pdf = "uniform");
+
+# the weights have to produce the target mean without the intercept's share
+meanLT_minus_intercept = meanLT - b_intercept;
+[B, new_sigmaLT] = generateWeights (X, meanLT_minus_intercept, sigmaLT);
+
+# linear terms and their actual column-wise mean / st.dev.
+ones = matrix (1.0, rows = numSamples, cols = 1);
+LT = X %*% B + ones %*% b_intercept;
+actual_meanLT  = colSums (LT) / numSamples;
+actual_sigmaLT = sqrt (colSums ((LT - ones %*% actual_meanLT)^2) / numSamples);
+
+# report wanted vs. actual statistics; mention when generateWeights had to
+# relax the requested st.dev.
+for (i in 1:(numCategories - 1)) {
+    if (as.scalar (new_sigmaLT [1, i]) == as.scalar (sigmaLT [1, i])) {
+        print ("Category " + i + ":  Intercept = " + as.scalar (b_intercept 
[1, i])); 
+    } else {
+        print ("Category " + i + ":  Intercept = " + as.scalar (b_intercept 
[1, i]) + ",  st.dev.(LT) relaxed from " + as.scalar (sigmaLT [1, i])); 
+    }
+    print ("    Wanted LT mean = " + as.scalar (meanLT [1, i])        + ",  
st.dev. = " + as.scalar (new_sigmaLT [1, i]));
+    print ("    Actual LT mean = " + as.scalar (actual_meanLT [1, i]) + ",  
st.dev. = " + as.scalar (actual_sigmaLT [1, i]));
+}
+
+
+/*
+ones = matrix (1.0, rows = 1, cols = numCategories - 1);
+Prob = exp (LT);
+Prob = Prob / ((1.0 + rowSums (Prob)) %*% ones);
+Prob = t(cumsum (t(Prob)));
+
+r = Rand (rows = numSamples, cols = 1, min = 0, max = 1, pdf = "uniform", seed 
= 0);
+R = r %*% ones;
+Y = 1 + rowSums (Prob < R);
+if (isBinomialPMOne) {
+    Y = 3 - 2 * Y;
+}
+*/
+
+/* USE FOR LINEAR REGRESSION */
+
+# response = first linear term + standard-normal noise
+# (only column 1 of LT is used)
+r = Rand (rows = numSamples, cols = 1, pdf = "normal");
+Y = LT [, 1] + r;
+
+
+# write the weights, with the intercepts appended as an extra last row when
+# the intercept is enabled
+if (do_we_output_intercept == 1) {
+    new_B = matrix (0.0, rows = nrow(B) + 1, cols = ncol(B));
+    new_B [1:nrow(B), 1:ncol(B)] = B;
+    new_B [nrow(B)+1, 1:ncol(B)] = b_intercept;
+    write (new_B, fileB, format=fmt);
+} else {
+    write (B, fileB, format=fmt);
+}
+
+# the first numTrainingSamples rows form the training set, the rest the test set
+if (numTestSamples > 0) {
+    X_train = X [1:numTrainingSamples,];
+    Y_train = Y [1:numTrainingSamples,];
+    X_test  = X [(numTrainingSamples+1):numSamples,];
+    Y_test  = Y [(numTrainingSamples+1):numSamples,];
+    write (X_train, fileX,  format=fmt);
+    write (Y_train, fileY,  format=fmt);
+    write (X_test,  fileXt, format=fmt);
+    write (Y_test,  fileYt, format=fmt);
+} else {
+    write (X, fileX, format=fmt);
+    write (Y, fileY, format=fmt);
+}
+
+
+
+
+
+
+# Generates weight vectors to ensure the desired statistics for Linear Terms = 
X %*% W
+# To be used for data generation in the testing of GLM, Logistic Regression, 
etc.
+# INPUT:  meanLT and sigmaLT are row vectors, meanLT[1, i] and sigmaLT[1, i] 
are
+#         the desired mean and standard deviation for X %*% W[, i]
+# OUTPUT: "W" is the matrix of generated (column) weight vectors W[, i]
+#         new_sigmaLT[1, i] == sigmaLT[1, i] if the std.dev is successfully 
enforced,
+#         new_sigmaLT[1, i]  > sigmaLT[1, i] if we had to relax this 
constraint.
+generateWeights = 
+    function (Matrix[double] X, Matrix[double] meanLT, Matrix[double] sigmaLT)
+    return   (Matrix[double] W, Matrix[double] new_sigmaLT)
+{
+    num_w = ncol (meanLT);  # Number of output weight vectors
+    dim_w = ncol (X);       # Number of features / dimensions in a weight 
vector
+    w_X = t(colSums(X));    # "Prohibited" weight shift direction that changes 
meanLT
+                            # (all orthogonal shift directions do not affect 
meanLT)
+
+    # Compute "w_1" with meanLT = 1 and with the smallest possible sigmaLT
+
+    # r_1 is the residual of X %*% w_1 against the all-ones target
+    w_1 = straightenX (X);
+    r_1 = (X %*% w_1) - 1.0;
+    norm_r_1_sq = sum (r_1 ^ 2);
+    
+    # For each W[, i] generate uniformly random directions to shift away from 
"w_1"
+    
+    # (the w_X component is projected out of every random direction)
+    DW_raw = Rand (rows = dim_w, cols = num_w, pdf = "normal");
+    DW = DW_raw - (w_X %*% t(w_X) %*% DW_raw) / sum (w_X ^ 2); # Orthogonal to 
w_X
+    XDW = X %*% DW;
+    
+    # Determine how far to shift in the chosen directions to satisfy the 
constraints
+    # Use the positive root of the quadratic equation; relax sigmaLT where 
needed
+    
+    # Per column i, with W[, i] = meanLT * w_1 + x * DW[, i], requiring
+    # sum ((X %*% W[, i] - meanLT)^2) = sigmaLT^2 * nrow(X) gives
+    # a_qe * x^2 + b_qe * x + c_qe = 0
+    a_qe = colSums (XDW ^ 2);
+    b_qe = 2.0 * meanLT * (t(r_1) %*% XDW);
+    c_qe = meanLT^2 * norm_r_1_sq - sigmaLT^2 * nrow(X);
+
+    # c_qe > 0 means w_1 alone already exceeds the requested st.dev.; there
+    # the st.dev. attained without any shift is reported and c_qe is set to 0
+    is_sigmaLT_OK = (c_qe <= 0);
+    new_sigmaLT = is_sigmaLT_OK * sigmaLT + (1 - is_sigmaLT_OK) * abs (meanLT) 
* sqrt (norm_r_1_sq / nrow(X));
+    c_qe = is_sigmaLT_OK * c_qe;
+    x_qe = (- b_qe + sqrt (b_qe * b_qe - 4.0 * a_qe * c_qe)) / (2.0 * a_qe);
+    
+    # Scale and shift "w_1" in the "DW" directions to produce the result:
+    
+    ones = matrix (1.0, rows = dim_w, cols = 1);
+    W = w_1 %*% meanLT + DW * (ones %*% x_qe);
+}
+
+# Computes vector w such that  ||X %*% w - 1|| -> MIN  given  avg(X %*% w) = 1
+# We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale
+# it to compute  w = c * z_LS  such that  sum(X %*% w) = nrow(X).
+# INPUT:  X is the feature matrix
+# OUTPUT: w is a column vector with ncol(X) entries
+straightenX =
+    function (Matrix[double] X)
+    return   (Matrix[double] w)
+{
+    w_X = t(colSums(X));
+    # small ridge term added to t(X) %*% X, and the stopping tolerance on
+    # the squared residual norm
+    lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X);
+    eps = 0.000000001 * nrow(X);
+
+    # BEGIN LEAST SQUARES
+    
+    # Conjugate-gradient iteration for
+    #   (t(X) %*% X + lambda_LS * I) %*% z_LS = w_X
+    # starting from z_LS = 0; r_LS is the residual, p_LS the search direction
+    r_LS = - w_X;
+    z_LS = matrix (0.0, rows = ncol(X), cols = 1);
+    p_LS = - r_LS;
+    norm_r2_LS = sum (r_LS ^ 2);
+    i_LS = 0;
+    # at most 50 and at most ncol(X) iterations; stops early once the squared
+    # residual norm drops below eps
+    while (i_LS < 50 & i_LS < ncol(X) & norm_r2_LS >= eps)
+    {
+        temp_LS = X %*% p_LS;
+        q_LS = (t(X) %*% temp_LS) + lambda_LS * p_LS;
+        alpha_LS = norm_r2_LS / sum (p_LS * q_LS);
+        z_LS = z_LS + alpha_LS * p_LS;
+        old_norm_r2_LS = norm_r2_LS;
+        r_LS = r_LS + alpha_LS * q_LS;
+        norm_r2_LS = sum (r_LS ^ 2);
+        p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS;
+        i_LS = i_LS + 1;
+    }
+    
+    # END LEAST SQUARES
+    
+    # rescale so that sum (X %*% w) = sum (w_X * w) = nrow(X)
+    w = (nrow(X) / sum (w_X * z_LS)) * z_LS;
+}
diff --git a/scripts/perftest/datagen/genRandData4LinearRegression.dml 
b/scripts/perftest/datagen/genRandData4LinearRegression.dml
new file mode 100644
index 0000000000..ebce4f30d1
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4LinearRegression.dml
@@ -0,0 +1,61 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# generates data to test linear regression
+
+# $1 is number of samples
+# $2 is number of features (independent variables)
+# $3 is maximum feature value (absolute value)
+# $4 is maximum weight (absolute value)
+# $5 is location to store generated weights
+# $6 is location to store generated data
+# $7 is location to store generated labels
+# $8 is 0/1. 0 suppresses noise, 1 will add noise to Y
+# $9 is b, 0 disables intercept
+# $10 controls sparsity in the generated data
+# $11 output format
+
+# positional arguments used for the computation
+nRows     = $1    # number of samples
+nCols     = $2    # number of features
+xScale    = $3    # maximum absolute feature value
+wScale    = $4    # maximum absolute weight
+noiseFlag = $8    # 0 suppresses noise, 1 adds noise to Y
+icpt      = $9    # intercept b, 0 disables it
+outFmt    = $11   # output format of all three files
+
+# features: uniform in [-1, 1] with the requested sparsity, then scaled
+X = xScale * Rand(rows=nRows, cols=nCols, min=-1, max=1,
+                  pdf="uniform", seed=0, sparsity=$10)
+
+# weights: uniform in [-1, 1], then scaled
+w = wScale * Rand(rows=nCols, cols=1, min=-1, max=1,
+                  pdf="uniform", seed=0)
+
+# noise-free labels
+Y = X %*% w
+
+# optional intercept: stored as the last weight and added to every label
+if( icpt != 0 ) {
+  w = rbind(w, Rand(rows=1, cols=1, min=icpt, max=icpt, pdf="uniform"))
+  Y = Y + icpt
+}
+
+# normal noise, multiplied by the 0/1 switch
+Y = Y + noiseFlag * Rand(rows=nRows, cols=1, pdf="normal", seed=0)
+
+write(w, $5, format=outFmt)
+write(X, $6, format=outFmt)
+write(Y, $7, format=outFmt)
diff --git a/scripts/perftest/datagen/genRandData4LogReg_LTstats.dml 
b/scripts/perftest/datagen/genRandData4LogReg_LTstats.dml
new file mode 100644
index 0000000000..f95342f708
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4LogReg_LTstats.dml
@@ -0,0 +1,233 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# generates random data to test bi- and multinomial logistic regression
+
+# $N  = number of training samples
+# $Nt = number of test samples (or 0 if none)
+# $nf = number of features (independent variables)
+# $nc = number of categories; = 1 if "binomial" with +1/-1 labels
+# $Xmin  = minimum feature value
+# $Xmax  = maximum feature value
+# $spars = controls sparsity in the generated data
+# $avgLTmin = average linear term (X %*% beta + intercept), minimum value
+# $avgLTmax = average linear term (X %*% beta + intercept), maximum value
+# $stdLT = requested standard deviation for the linear terms
+# $iceptmin = intercept, minimum value (0.0 disables intercept)
+# $iceptmax = intercept, maximum value (0.0 disables intercept)
+# $B  = location to store generated regression parameters
+# $X  = location to store generated training data
+# $Y  = location to store generated training category labels
+# $Xt = location to store generated test data
+# $Yt = location to store generated test category labels
+#
+# Example:
+# hadoop jar SystemDS.jar -f genRandData4LogReg_LTstats.dml -nvargs
+#     N=1000000 Nt=1000 nf=20 nc=3 Xmin=0.0 Xmax=1.0 spars=1.0 avgLTmin=3.0 
avgLTmax=5.0 stdLT=1.25
+#     iceptmin=1.0 iceptmax=1.0 B=./B123 X=./X123 Y=./Y123 Xt=./Xt123 
Yt=./Yt123
+
+# Read the named arguments; only the output locations have defaults.
+numTrainingSamples = $N;
+numTestSamples = $Nt;
+numFeatures = $nf;
+numCategories = $nc;
+minIntercept = $iceptmin;
+maxIntercept = $iceptmax;
+minXentry = $Xmin;
+maxXentry = $Xmax;
+minAvgLT = $avgLTmin;
+maxAvgLT = $avgLTmax;
+sparsityLevel = $spars;
+stdevLT = $stdLT;
+fileB  = ifdef ($B,  "B");
+fileX  = ifdef ($X,  "X");
+fileY  = ifdef ($Y,  "Y");
+fileXt = ifdef ($Xt, "Xt");
+fileYt = ifdef ($Yt, "Yt");
+
+
+# Training and test records are generated together and split at the end.
+numSamples = numTrainingSamples + numTestSamples;
+
+# nc = 1 requests two categories whose labels are later mapped to +1/-1.
+isBinomialPMOne = FALSE;
+if (numCategories == 1) {
+    numCategories = 2;
+    isBinomialPMOne = TRUE;
+}
+# The intercept row is written only if the intercept range is not {0}.
+do_we_output_intercept = 1;
+if (minIntercept == 0 & maxIntercept == 0) {
+    do_we_output_intercept = 0;
+}
+
+X = Rand (rows = numSamples, cols = numFeatures, min = minXentry, max = 
maxXentry, pdf = "uniform", sparsity = sparsityLevel);
+
+# One target mean, target st.dev. and intercept per non-baseline category.
+meanLT  = Rand (rows = 1, cols = numCategories - 1, min = minAvgLT, max = 
maxAvgLT, pdf = "uniform");
+sigmaLT = matrix (stdevLT, rows = 1, cols = numCategories - 1);
+b_intercept = Rand (rows = 1, cols = numCategories - 1, min = minIntercept, 
max = maxIntercept, pdf = "uniform");
+
+# The weights have to produce the target mean without the intercept part.
+meanLT_minus_intercept = meanLT - b_intercept;
+[B, new_sigmaLT] = generateWeights (X, meanLT_minus_intercept, sigmaLT);
+
+# Linear terms and their actual per-category mean and st.dev.
+ones = matrix (1.0, rows = numSamples, cols = 1);
+LT = X %*% B + ones %*% b_intercept;
+actual_meanLT  = colSums (LT) / numSamples;
+actual_sigmaLT = sqrt (colSums ((LT - ones %*% actual_meanLT)^2) / numSamples);
+
+# Report wanted vs. actual statistics; mention where st.dev. was relaxed.
+for (i in 1:(numCategories - 1)) {
+    if (as.scalar (new_sigmaLT [1, i]) == as.scalar (sigmaLT [1, i])) {
+        print ("Category " + i + ":  Intercept = " + as.scalar (b_intercept 
[1, i])); 
+    } else {
+        print ("Category " + i + ":  Intercept = " + as.scalar (b_intercept 
[1, i]) + ",  st.dev.(LT) relaxed from " + as.scalar (sigmaLT [1, i])); 
+    }
+    print ("    Wanted LT mean = " + as.scalar (meanLT [1, i])        + ",  
st.dev. = " + as.scalar (new_sigmaLT [1, i]));
+    print ("    Actual LT mean = " + as.scalar (actual_meanLT [1, i]) + ",  
st.dev. = " + as.scalar (actual_sigmaLT [1, i]));
+}
+
+
+# Category probabilities exp(LT) / (1 + rowSums(exp(LT))); the last category
+# is the baseline and gets the remaining mass. The transposed cumsum turns
+# each row into cumulative probabilities.
+ones = matrix (1.0, rows = 1, cols = numCategories - 1);
+Prob = exp (LT);
+Prob = Prob / ((1.0 + rowSums (Prob)) %*% ones);
+Prob = t(cumsum (t(Prob)));
+
+# Draw one uniform number per record; the label is 1 plus the number of
+# cumulative probabilities that lie below it.
+r = Rand (rows = numSamples, cols = 1, min = 0, max = 1, pdf = "uniform", seed 
= 0);
+R = r %*% ones;
+Y = 1 + rowSums (Prob < R);
+# Map labels 1/2 to +1/-1 for the binomial +1/-1 case.
+if (isBinomialPMOne) {
+    Y = 3 - 2 * Y;
+}
+
+
+/* USE FOR LINEAR REGRESSION
+
+r = Rand (rows = numSamples, cols = 1, pdf = "normal");
+Y = LT [, 1] + r;
+
+*/
+
+
+# Write the parameters; the intercepts become an extra last row of B.
+if (do_we_output_intercept == 1) {
+    new_B = matrix (0.0, rows = nrow(B) + 1, cols = ncol(B));
+    new_B [1:nrow(B), 1:ncol(B)] = B;
+    new_B [nrow(B)+1, 1:ncol(B)] = b_intercept;
+    write (new_B, fileB, format="mm");
+} else {
+    write (B, fileB, format="mm");
+}
+
+# The first numTrainingSamples rows are the training set, the rest is test.
+if (numTestSamples > 0) {
+    X_train = X [1:numTrainingSamples,];
+    Y_train = Y [1:numTrainingSamples,];
+    X_test  = X [(numTrainingSamples+1):numSamples,];
+    Y_test  = Y [(numTrainingSamples+1):numSamples,];
+    write (X_train, fileX,  format="mm");
+    write (Y_train, fileY,  format="mm");
+    write (X_test,  fileXt, format="mm");
+    write (Y_test,  fileYt, format="mm");
+} else {
+    write (X, fileX, format="mm");
+    write (Y, fileY, format="mm");
+}
+
+
+
+
+
+
+# Generates weight vectors to ensure the desired statistics for Linear Terms = 
X %*% W
+# To be used for data generation in the testing of GLM, Logistic Regression, 
etc.
+# INPUT:  meanLT and sigmaLT are row vectors, meanLT[1, i] and sigmaLT[1, i] 
are
+#         the desired mean and standard deviation for X %*% W[, i]
+# OUTPUT: "W" is the matrix of generated (column) weight vectors W[, i]
+#         new_sigmaLT[1, i] == sigmaLT[1, i] if the std.dev is successfully 
enforced,
+#         new_sigmaLT[1, i]  > sigmaLT[1, i] if we had to relax this 
constraint.
+generateWeights = 
+    function (Matrix[double] X, Matrix[double] meanLT, Matrix[double] sigmaLT)
+    return   (Matrix[double] W, Matrix[double] new_sigmaLT)
+{
+    num_w = ncol (meanLT);  # Number of output weight vectors
+    dim_w = ncol (X);       # Number of features / dimensions in a weight 
vector
+    w_X = t(colSums(X));    # "Prohibited" weight shift direction that changes 
meanLT
+                            # (all orthogonal shift directions do not affect 
meanLT)
+
+    # Compute "w_1" with meanLT = 1 and with the smallest possible sigmaLT
+
+    w_1 = straightenX (X);
+    # r_1 is the deviation of X %*% w_1 from the all-ones vector
+    r_1 = (X %*% w_1) - 1.0;
+    norm_r_1_sq = sum (r_1 ^ 2);
+    
+    # For each W[, i] generate uniformly random directions to shift away from 
"w_1"
+    
+    # Remove from every random column its component along w_X, so that
+    # t(w_X) %*% DW = 0 and therefore colSums (X %*% DW) = 0
+    DW_raw = Rand (rows = dim_w, cols = num_w, pdf = "normal");
+    DW = DW_raw - (w_X %*% t(w_X) %*% DW_raw) / sum (w_X ^ 2); # Orthogonal to 
w_X
+    XDW = X %*% DW;
+    
+    # Determine how far to shift in the chosen directions to satisfy the 
constraints
+    # Use the positive root of the quadratic equation; relax sigmaLT where 
needed
+    
+    # Per column i, with shift length x the centered linear terms are
+    #     meanLT[1,i] * r_1 + x * XDW[,i]
+    # Requiring their sum of squares to be sigmaLT[1,i]^2 * nrow(X) gives
+    #     a_qe * x^2 + b_qe * x + c_qe = 0
+    a_qe = colSums (XDW ^ 2);
+    b_qe = 2.0 * meanLT * (t(r_1) %*% XDW);
+    c_qe = meanLT^2 * norm_r_1_sq - sigmaLT^2 * nrow(X);
+
+    # c_qe > 0 means the requested st.dev. is below the st.dev. already
+    # present at x = 0; in that case that st.dev. is reported instead and
+    # c_qe is set to 0 to keep the square root argument non-negative
+    is_sigmaLT_OK = (c_qe <= 0);
+    new_sigmaLT = is_sigmaLT_OK * sigmaLT + (1 - is_sigmaLT_OK) * abs (meanLT) 
* sqrt (norm_r_1_sq / nrow(X));
+    c_qe = is_sigmaLT_OK * c_qe;
+    x_qe = (- b_qe + sqrt (b_qe * b_qe - 4.0 * a_qe * c_qe)) / (2.0 * a_qe);
+    
+    # Scale and shift "w_1" in the "DW" directions to produce the result:
+    
+    # ones %*% x_qe repeats the row of shift lengths for every feature row
+    ones = matrix (1.0, rows = dim_w, cols = 1);
+    W = w_1 %*% meanLT + DW * (ones %*% x_qe);
+}
+
+# Computes vector w such that  ||X %*% w - 1|| -> MIN  given  avg(X %*% w) = 1
+# We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale
+# it to compute  w = c * z_LS  such that  sum(X %*% w) = nrow(X).
+# INPUT:  X is the feature matrix (one row per record)
+# OUTPUT: w is a column vector with ncol(X) entries
+straightenX =
+    function (Matrix[double] X)
+    return   (Matrix[double] w)
+{
+    # w_X equals t(X) %*% 1, the right-hand side of the normal equations
+    w_X = t(colSums(X));
+    # small ridge term, proportional to the average squared column norm
+    lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X);
+    # stopping threshold for the squared residual norm
+    eps = 0.000000001 * nrow(X);
+
+    # BEGIN LEAST SQUARES
+    
+    # Conjugate-gradient iteration for
+    #     (t(X) %*% X + lambda_LS * I) %*% z_LS = w_X
+    # started at z_LS = 0, so the initial residual is -w_X.
+    r_LS = - w_X;
+    z_LS = matrix (0.0, rows = ncol(X), cols = 1);
+    p_LS = - r_LS;
+    norm_r2_LS = sum (r_LS ^ 2);
+    i_LS = 0;
+    # at most 50 steps, never more than ncol(X), stop early once converged
+    while (i_LS < 50 & i_LS < ncol(X) & norm_r2_LS >= eps)
+    {
+        # q_LS = (t(X) %*% X + lambda_LS * I) %*% p_LS, without forming
+        # t(X) %*% X explicitly
+        temp_LS = X %*% p_LS;
+        q_LS = (t(X) %*% temp_LS) + lambda_LS * p_LS;
+        # step length along the current search direction
+        alpha_LS = norm_r2_LS / sum (p_LS * q_LS);
+        z_LS = z_LS + alpha_LS * p_LS;
+        old_norm_r2_LS = norm_r2_LS;
+        r_LS = r_LS + alpha_LS * q_LS;
+        norm_r2_LS = sum (r_LS ^ 2);
+        # next direction: new residual plus a multiple of the old direction
+        p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS;
+        i_LS = i_LS + 1;
+    }
+    
+    # END LEAST SQUARES
+    
+    # sum (w_X * z_LS) equals sum (X %*% z_LS); rescale so that
+    # sum (X %*% w) = nrow(X)
+    w = (nrow(X) / sum (w_X * z_LS)) * z_LS;
+}
diff --git a/scripts/perftest/datagen/genRandData4LogisticRegression.dml 
b/scripts/perftest/datagen/genRandData4LogisticRegression.dml
new file mode 100644
index 0000000000..f0850938ad
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4LogisticRegression.dml
@@ -0,0 +1,72 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# generates random data to test linear logistic regression
+
+# $1 is number of samples
+# $2 is number of features (independent variables)
+# $3 is maximum feature value (absolute value)
+# $4 is maximum weight (absolute value)
+# $5 is location to store generated weights
+# $6 is location to store generated data
+# $7 is location to store generated labels
+# $8 addNoise. if 0 then no noise is added, to add noise set this to 1
+# $9 is b, 0 disables intercept
+# $10 controls sparsity in the generated data
+# $11 output format
+# $12 transform labels. if 1 then labels are 1/2; otherwise -1/1
+
+# positional arguments used for the computation
+nRows     = $1    # number of samples
+nCols     = $2    # number of features
+xScale    = $3    # maximum absolute feature value
+wScale    = $4    # maximum absolute weight
+noiseFlag = $8    # addNoise switch
+icpt      = $9    # intercept b, 0 disables it
+
+# features: uniform in [-1, 1] with the requested sparsity, then scaled
+X = xScale * Rand(rows=nRows, cols=nCols, min=-1, max=1,
+                  pdf="uniform", seed=0, sparsity=$10)
+
+# weights: uniform in [-1, 1], then scaled
+w = wScale * Rand(rows=nCols, cols=1, min=-1, max=1,
+                  pdf="uniform", seed=0)
+
+# linear term; the optional intercept is stored as the last weight
+linTerm = X %*% w
+if( icpt != 0 ) {
+  w = rbind(w, Rand(rows=1, cols=1, min=icpt, max=icpt, pdf="uniform"))
+  linTerm = linTerm + icpt
+}
+
+# logistic function of the linear term
+prob = 1 / (1 + exp(-linTerm))
+
+# the thresholds are drawn the same way for either value of the noise
+# switch; only the message differs
+if( noiseFlag != 1 ) {
+  print("this data generator generates the same dataset for both noise=0 
and noise=1")
+}
+r = Rand(rows=nRows, cols=1, min=0, max=1, pdf="uniform", seed=0)
+
+# label is -1 where prob < r and +1 elsewhere
+Y = 1 - 2 * (prob < r)
+if( $12 == 1 ) {
+  # map -1/+1 to 1/2
+  Y = (Y + 3) / 2
+}
+
+write(w, $5, format=$11)
+write(X, $6, format=$11)
+write(Y, $7, format=$11)
diff --git a/scripts/perftest/datagen/genRandData4MultiClassSVM.dml 
b/scripts/perftest/datagen/genRandData4MultiClassSVM.dml
new file mode 100644
index 0000000000..011b4dab18
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4MultiClassSVM.dml
@@ -0,0 +1,68 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# generates random two-class data (labels 1/2) to test multi-class SVM
+
+# $1 is number of samples
+# $2 is number of features (independent variables)
+# $3 is maximum feature value (absolute value)
+# $4 is maximum weight (absolute value)
+# $5 is location to store generated weights
+# $6 is location to store generated data
+# $7 is location to store generated labels
+# $8 addNoise. if 0 then no noise is added, to add noise set this to 1
+# $9 is b, 0 disables intercept
+# $10 controls sparsity in the generated data
+
+# positional arguments used for the computation
+nRows     = $1    # number of samples
+nCols     = $2    # number of features
+xScale    = $3    # maximum absolute feature value
+wScale    = $4    # maximum absolute weight
+noiseFlag = $8    # addNoise switch
+icpt      = $9    # intercept b, 0 disables it
+
+# features: uniform in [-1, 1] with the requested sparsity, then scaled
+X = xScale * Rand(rows=nRows, cols=nCols, min=-1, max=1,
+                  pdf="uniform", seed=0, sparsity=$10)
+
+# weights: uniform in [-1, 1], then scaled
+w = wScale * Rand(rows=nCols, cols=1, min=-1, max=1,
+                  pdf="uniform", seed=0)
+
+# linear term; the optional intercept is stored as the last weight
+linTerm = X %*% w
+if( icpt != 0 ) {
+  w = rbind(w, Rand(rows=1, cols=1, min=icpt, max=icpt, pdf="uniform"))
+  linTerm = linTerm + icpt
+}
+
+# logistic function of the linear term
+prob = 1 / (1 + exp(-linTerm))
+
+# the thresholds are drawn the same way for either value of the noise
+# switch; only the message differs
+if( noiseFlag != 1 ) {
+  print("this data generator generates the same dataset for both noise=0 
and noise=1")
+  #r = Rand(rows=numSamples, cols=1, min=0.5, max=0.5, pdf="uniform")
+}
+r = Rand(rows=nRows, cols=1, min=0, max=1, pdf="uniform", seed=0)
+
+# -1 where prob < r and +1 elsewhere, then mapped to the labels 1/2
+Y = ((1 - 2 * (prob < r)) + 3) / 2
+
+write(w, $5, format="binary")
+write(X, $6, format="binary")
+write(Y, $7, format="binary")
diff --git a/scripts/perftest/datagen/genRandData4Multinomial.dml 
b/scripts/perftest/datagen/genRandData4Multinomial.dml
new file mode 100644
index 0000000000..93666758b5
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4Multinomial.dml
@@ -0,0 +1,66 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# positional arguments
+nRecords  = $1;          # number of records
+nFeatures = $2;          # number of features
+spLevel   = $3;          # sparsity of X
+nClasses  = $4;          # number of classes
+useIcpt   = ($5 == 1);   # add an intercept to the linear terms?
+
+# coefficient range, shrinking with the expected number of non-zero features
+sigmaLT = 1.0;
+betaRange = 3.0 * sigmaLT / sqrt (nFeatures * spLevel);
+
+# one intercept per non-baseline class
+if (useIcpt) {
+    icpt = Rand (rows = 1, cols = nClasses - 1, min = -1.0, max = 1.0);
+}
+
+# features, uniform in [1, 5] with the requested sparsity
+X = Rand (rows = nRecords, cols = nFeatures, min = 1, max = 5,
+          pdf = "uniform", sparsity = spLevel);
+
+# dense coefficients, uniform in [-betaRange, betaRange]
+B = betaRange * Rand (rows = nFeatures, cols = nClasses - 1,
+          min = -1.0, max = 1.0, pdf = "uniform", sparsity = 1.0);
+
+# linear terms, one column per non-baseline class
+LT = X %*% B;
+if (useIcpt) {
+    LT = LT + matrix (1, rows = nRecords, cols = 1) %*% icpt;
+}
+
+# class probabilities with the last class as baseline, accumulated
+# along each row
+expLT = exp (LT);
+cumProb = t(cumsum (t(expLT / (1.0 + rowSums(expLT)))));
+
+# label = 1 + number of cumulative probabilities below a uniform draw
+u = Rand (rows = nRecords, cols = 1, min = 0, max = 1, pdf = "uniform");
+Y = 1 + rowSums (cumProb < u);
+
+# ensure all classes are represented: the last nClasses labels are 1..nClasses
+firstForced = nRecords - nClasses + 1;
+Y[firstForced:nRecords, 1] = seq (1, nClasses);
+
+write (X, $6, format = $8);
+write (Y, $7, format = $8);
\ No newline at end of file
diff --git a/scripts/perftest/datagen/genRandData4NMF.dml 
b/scripts/perftest/datagen/genRandData4NMF.dml
new file mode 100644
index 0000000000..a82ac4e0f1
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4NMF.dml
@@ -0,0 +1,129 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# generates random data for non-negative
+# matrix factorization
+#
+# follows lda's generative model
+# see Blei, Ng & Jordan, JMLR'03 paper
+# titled Latent Dirichlet Allocation
+#
+# $1 is number of samples
+# $2 is number of features
+# $3 is number of latent factors
+# $4 is number of features per sample
+#       (may overlap). use this to vary
+#       sparsity.      
+# $5 is file to store sample mixtures
+# $6 is file to store factors
+# $7 is file to store generated data
+
+numDocuments = $1
+numFeatures = $2
+numTopics = $3
+numWordsPerDoc = $4
+
+# Per-document topic weights, normalized so that every row sums to 1.
+# Rows whose sum is 0 get the denominator 0.1 to avoid a division by zero.
+docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0, 
pdf="uniform", seed=0, sparsity=0.75)
+denomsTM = rowSums(docTopicMixtures)
+zerosInDenomsTM = denomsTM == 0
+denomsTM = 0.1*zerosInDenomsTM + (1-zerosInDenomsTM)*denomsTM
+parfor(i in 1:numTopics){
+       docTopicMixtures[,i] = docTopicMixtures[,i]/denomsTM
+}
+write(docTopicMixtures, $5, format="binary")
+# After writing, turn each row into running sums across the topics; the
+# sampling loop below compares uniform draws against these running sums.
+for(j in 2:numTopics){
+       docTopicMixtures[,j] = docTopicMixtures[,j-1] + docTopicMixtures[,j]
+}
+
+# Per-topic feature weights, normalized row by row in the same way.
+topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0, 
pdf="uniform", seed=0, sparsity=0.75)
+parfor(i in 1:numTopics){
+       topicDist = topicDistributions[i,]
+       
+       denom2 = sum(topicDist)
+       if(denom2 == 0){
+               denom2 = denom2 + 0.1
+       }
+       
+       topicDistributions[i,] = topicDist / denom2
+}
+write(topicDistributions, $6, format="binary")
+# Running sums across the features, again used for sampling below.
+for(j in 2:numFeatures){
+       topicDistributions[,j] = topicDistributions[,j-1] + 
topicDistributions[,j]
+}
+
+# Output matrix of counts, initialized with zeros.
+data = Rand(rows=numDocuments, cols=numFeatures, min=0, max=0, pdf="uniform")
+
+# One iteration per document: draw numWordsPerDoc (topic, feature) pairs
+# and count how often each feature was drawn.
+parfor(i in 1:numDocuments){
+       docTopic = docTopicMixtures[i,]
+       
+    ldata = Rand(rows=1, cols=numFeatures, min=0, max=0, pdf="uniform");
+  
+       # NOTE(review): r_z and r_w are created with identical arguments,
+       # including seed=0 - presumably they hold the same values for every
+       # document; confirm that this is intended.
+       r_z = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", 
seed=0)
+       r_w = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", 
seed=0)
+       
+       for(j in 1:numWordsPerDoc){
+               rz = as.scalar(r_z[j,1])
+               continue = 1
+               
+               z = -1
+               #this is a workaround
+               #z=1    
+               
+               # topic z = first index whose running sum reaches rz
+               for(k1 in 1:numTopics){
+                       prob = as.scalar(docTopic[1,k1])
+                       if(continue==1 & rz <= prob){
+                               z=k1
+                               continue=0
+                       }
+               }
+               
+               # no running sum reached rz: fall back to the last topic
+               if(z==-1){
+                       print("z is unassigned: " + z)
+                       z = numTopics
+               }
+               
+               rw = as.scalar(r_w[j,1])
+               continue = 1
+               
+               w = -1
+               #this is a workaround
+               #w = 1
+               
+               # feature w = first index whose running sum in topic z
+               # reaches rw
+               for(k2 in 1:numFeatures){
+                       prob = as.scalar(topicDistributions[z,k2])
+                       if(continue == 1 & rw <= prob){
+                               w = k2
+                               continue = 0
+                       }
+               }
+               
+               # no running sum reached rw: fall back to the last feature
+               if(w==-1){
+                       print("w is unassigned: " + w)
+                       w = numFeatures
+               }
+               
+               ldata[1,w] = ldata[1,w] + 1
+       }
+  
+    data[i,] = ldata;
+}
+
+write(data, $7, format="binary")
diff --git a/scripts/perftest/datagen/genRandData4NMFBlockwise.dml 
b/scripts/perftest/datagen/genRandData4NMFBlockwise.dml
new file mode 100644
index 0000000000..0ad548ead2
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4NMFBlockwise.dml
@@ -0,0 +1,138 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# generates random data for non-negative
+# matrix factorization
+#
+# follows lda's generative model
+# see Blei, Ng & Jordan, JMLR'03 paper
+# titled Latent Dirichlet Allocation
+#
+# $1 is number of samples
+# $2 is number of features
+# $3 is number of latent factors
+# $4 is number of features per sample
+#       (may overlap). use this to vary
+#       sparsity.      
+# $5 is file to store sample mixtures
+# $6 is file to store factors
+# $7 is file to store generated data
+#
+# $8 is the blocksize, i.e., number of rows per block
+#    (should be set such that $8x$2 fits in mem budget)
+
+numDocuments = $1
+numFeatures = $2
+numTopics = $3
+numWordsPerDoc = $4
+blocksize = $8
+
+# Per-document topic weights, normalized so that every row sums to 1.
+# Rows whose sum is 0 get the denominator 0.1 to avoid a division by zero.
+docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0, 
pdf="uniform", seed=0, sparsity=0.75)
+denomsTM = rowSums(docTopicMixtures)
+zerosInDenomsTM = (denomsTM == 0)
+denomsTM = 0.1*zerosInDenomsTM + (1-zerosInDenomsTM)*denomsTM
+parfor(i in 1:numTopics){
+       docTopicMixtures[,i] = docTopicMixtures[,i]/denomsTM
+}
+write(docTopicMixtures, $5, format="binary")
+# After writing, turn each row into running sums across the topics; the
+# sampling loop below compares uniform draws against these running sums.
+for(j in 2:numTopics){
+       docTopicMixtures[,j] = docTopicMixtures[,j-1] + docTopicMixtures[,j]
+}
+
+# Per-topic feature weights, normalized row by row in the same way.
+topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0, 
pdf="uniform", seed=0, sparsity=0.75)
+parfor(i in 1:numTopics){
+       topicDist = topicDistributions[i,]
+       
+       denom2 = sum(topicDist)
+       if(denom2 == 0){
+               denom2 = denom2 + 0.1
+       }
+       
+       topicDistributions[i,] = topicDist / denom2
+}
+write(topicDistributions, $6, format="binary")
+# Running sums across the features, again used for sampling below.
+for(j in 2:numFeatures){
+       topicDistributions[,j] = topicDistributions[,j-1] + 
topicDistributions[,j]
+}
+
+# Output matrix of counts, initialized with zeros.
+data0 = Rand(rows=numDocuments, cols=numFeatures, min=0, max=0, pdf="uniform")
+
+#outer-loop for blockwise computation
+for( k in seq(1,numDocuments,blocksize) )  
+{
+  # The block covers rows k..(k+len-1). Both ends are inclusive, so
+  # numDocuments-k+1 rows remain; this keeps len >= 1 and lets the last
+  # block end exactly at numDocuments.
+  len = min(blocksize,numDocuments-k+1); #block length
+  data = data0[k:(k+len-1),];            #obtain block
+  
+  parfor(i in 1:len){
+       # i is the row inside the block; the document it belongs to is k+i-1
+       docTopic = docTopicMixtures[k+i-1,]
+       
+       # NOTE(review): r_z and r_w are created with identical arguments,
+       # including seed=0 - presumably they hold the same values for every
+       # document; confirm that this is intended.
+       r_z = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", 
seed=0)
+       r_w = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", 
seed=0)
+       
+       for(j in 1:numWordsPerDoc){
+               rz = as.scalar(r_z[j,1])
+               continue = 1
+               
+               z = -1
+               #this is a workaround
+               #z=1    
+               
+               # topic z = first index whose running sum reaches rz
+               for(k1 in 1:numTopics){
+                       prob = as.scalar(docTopic[1,k1])
+                       if(continue==1 & rz <= prob){
+                               z=k1
+                               continue=0
+                       }
+               }
+               
+               # no running sum reached rz: fall back to the last topic
+               if(z==-1){
+                       print("z is unassigned: " + z)
+                       z = numTopics
+               }
+               
+               rw = as.scalar(r_w[j,1])
+               continue = 1
+               
+               w = -1
+               #this is a workaround
+               #w = 1
+               
+               # feature w = first index whose running sum in topic z
+               # reaches rw
+               for(k2 in 1:numFeatures){
+                       prob = as.scalar(topicDistributions[z,k2])
+                       if(continue == 1 & rw <= prob){
+                               w = k2
+                               continue = 0
+                       }
+               }
+               
+               # no running sum reached rw: fall back to the last feature
+               if(w==-1){
+                       print("w is unassigned: " + w)
+                       w = numFeatures
+               }
+               
+               data[i,w] = data[i,w] + 1
+       }
+  }
+  
+  data0[k:(k+len-1),] = data; # write block back
+}
+
+write(data0, $7, format="binary")
diff --git a/scripts/perftest/datagen/genRandData4PCA.dml 
b/scripts/perftest/datagen/genRandData4PCA.dml
new file mode 100644
index 0000000000..413d5c458e
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4PCA.dml
@@ -0,0 +1,61 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# Synthetic data generator for PCA
+# 3 hidden dimensions (V1, V2, V3)
+# generates only "dense" data
+#
+# INPUT PARAMETERS:
+# 
--------------------------------------------------------------------------------------------
+# NAME   TYPE   DEFAULT  MEANING
+# 
--------------------------------------------------------------------------------------------
+# R      Int     10000   Number of rows
+# C      Int     1000    Number of columns (attributes)
+# OUT    String  ---     Location (on HDFS) to store the generated dataset
+# FMT    String  "csv"   Matrix output format, usually "text", "csv" or 
"binary"
+# 
--------------------------------------------------------------------------------------------
+#
+# Example:
+# hadoop jar SystemDS.jar -f genRandData4PCA.dml -nvargs R=1000000 C=1000 
OUT=/user/biuser/pcaData.mtx FMT=csv
+
+nRows  = ifdef ($R, 10000)
+nCols  = ifdef ($C, 1000)
+outFmt = ifdef ($FMT, "csv");
+
+# Modified version of the procedure from Zou et.al., "Sparse Principal
+# Component Analysis", 2006.
+
+# Hidden factors: V1 and V2 are normal draws scaled by 290 and 300;
+# V3 = -0.3*V1 + 0.925*V2 + e, where e is an unscaled normal draw.
+V1 = 0 + 290*rand(rows=nRows, cols=1, pdf="normal");
+V2 = 0 + 300*rand(rows=nRows, cols=1, pdf="normal");
+V3 = -0.3*V1 + 0.925*V2 + rand(rows=nRows, cols=1, pdf="normal");
+
+# Column groups: two groups of ceil(C/2.5) columns follow V1 and V2,
+# the remaining columns follow V3.
+n1 = ceil(nCols/2.5);
+n2 = ceil(nCols/2.5);
+n3 = nCols - n1 - n2;
+end1 = n1;
+end2 = n1 + n2;
+
+M = matrix(0, rows=nRows, cols=nCols)
+
+# every cell is its group's factor plus a fresh normal draw
+M[, 1:end1]         = rand(rows=nRows, cols=n1, pdf="normal") + V1;
+M[, (end1+1):end2]  = rand(rows=nRows, cols=n2, pdf="normal") + V2;
+M[, (end2+1):nCols] = rand(rows=nRows, cols=n3, pdf="normal") + V3;
+
+write(M, $OUT, format=outFmt);
diff --git a/scripts/perftest/datagen/genRandData4StratStats.dml 
b/scripts/perftest/datagen/genRandData4StratStats.dml
new file mode 100644
index 0000000000..6a4c07f734
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4StratStats.dml
@@ -0,0 +1,155 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# THIS SCRIPT GENERATES SYNTHETIC DATA FOR STRATSTATS (STRATIFIED STATISTICS) 
TESTING
+#
+# INPUT PARAMETERS:
+# 
--------------------------------------------------------------------------------------------
+# NAME   TYPE   DEFAULT  MEANING
+# 
--------------------------------------------------------------------------------------------
+# nr     Int    100000   Number of records in the generated dataset
+# nf     Int      10     Number of features in the X and the Y parts of the 
generated dataset
+# smin   Int     10000   Minimum stratum value, a positive integer
+# smax   Int     20000   Maximum stratum value, a positive integer
+# prs    Double  100.0   How many times more likely to have minimum vs. 
maximum stratum value
+# pxnan  Double    0.05  Probability of a NaN replacing a value in X
+# pynan  Double    0.05  Probability of a NaN replacing a value in Y
+# psnan  Double    0.05  Probability of a NaN replacing a value in the stratum 
column
+# 
--------------------------------------------------------------------------------------------
+# mxmin  Double   31.0   Baseline (mean) value for the first feature in X
+# mxmax  Double   40.0   Baseline (mean) value for the last feature in X
+# mymin  Double   11.0   Baseline (mean) value for the first feature in Y (before adding X)
+# mymax  Double   20.0   Baseline (mean) value for the last feature in Y (before adding X)
+# bmin   Double    3.0   "Beta" multiplied by X before adding to Y, for the 
first feature
+# bmax   Double    3.0   "Beta" multiplied by X before adding to Y, for the 
last feature
+# 
--------------------------------------------------------------------------------------------
+# sxbmin Double    3.0   Standard deviation for the first feature in X, 
stratum dependent
+# sxbmax Double    3.0   Standard deviation for the last feature in X, stratum 
dependent
+# sxwmin Double    4.0   Standard deviation for the first feature in X, 
residual
+# sxwmax Double    4.0   Standard deviation for the last feature in X, residual
+# sybmin Double sqrt(28) Standard deviation for the first feature in Y, 
stratum dependent
+# sybmax Double sqrt(28) Standard deviation for the last feature in Y, stratum 
dependent
+# sywmin Double    6.0   Standard deviation for the first feature in Y, 
residual
+# sywmax Double    6.0   Standard deviation for the last feature in Y, residual
+# 
--------------------------------------------------------------------------------------------
+# D      String  "Data"  Location (on HDFS) to store the generated dataset
+# Xcid   String  "Xcid"  Location (on HDFS) to store the column indices of X 
features
+# Ycid   String  "Ycid"  Location (on HDFS) to store the column indices of Y 
features
+# A      String  "Aux"   Location (on HDFS) to store the auxiliary parameter 
values, if any
+# fmt    String  "text"  Matrix output format, usually "text", "mm", or "csv"
+# 
--------------------------------------------------------------------------------------------
+# OUTPUT: Matrix with the generated dataset, Xcid and Ycid, and possibly other 
auxiliaries
+
+# Read the named arguments; every one of them has a default.
+num_records   = ifdef ($nr, 100000);
+num_features  = ifdef ($nf, 10);
+min_stratumID = ifdef ($smin, 10000);
+max_stratumID = ifdef ($smax, 20000);
+prob_ratio_min_to_max_stratumID = ifdef ($prs, 100);
+prob_NaN_in_X = ifdef ($pxnan, 0.05);
+prob_NaN_in_Y = ifdef ($pynan, 0.05);
+prob_NaN_in_stratum = ifdef ($psnan, 0.05);
+
+mean_X_min = ifdef ($mxmin, 31.0);
+mean_X_max = ifdef ($mxmax, 40.0);
+mean_Y_min = ifdef ($mymin, 11.0);
+mean_Y_max = ifdef ($mymax, 20.0);
+beta_min   = ifdef ($bmin,   3.0);
+beta_max   = ifdef ($bmax,   3.0);
+
+stdev_X_between_strata_min = ifdef ($sxbmin, 3.0);
+stdev_X_between_strata_max = ifdef ($sxbmax, 3.0);
+stdev_X_within_strata_min  = ifdef ($sxwmin, 4.0);
+stdev_X_within_strata_max  = ifdef ($sxwmax, 4.0);
+stdev_Y_between_strata_min = ifdef ($sybmin, sqrt(28.0));
+stdev_Y_between_strata_max = ifdef ($sybmax, sqrt(28.0));
+stdev_Y_within_strata_min  = ifdef ($sywmin, 6.0);
+stdev_Y_within_strata_max  = ifdef ($sywmax, 6.0);
+
+fileData = ifdef ($D,    "Data");
+fileXcid = ifdef ($Xcid, "Xcid");
+fileYcid = ifdef ($Ycid, "Ycid");
+fileAux  = ifdef ($A,    "Aux" );
+fmt      = ifdef ($fmt,  "text");
+
+# Generate the strata, from 1 to (max_stratumID - min_stratumID + 1), as multinomial
+# in which 1 is less likely than (max_stratumID - min_stratumID + 1) by a factor of
+# prob_ratio_min_to_max_stratumID
+
+r_power = (max_stratumID - min_stratumID) / log (prob_ratio_min_to_max_stratumID);
+r_bound = prob_ratio_min_to_max_stratumID ^ (1.0 + 1.0 / (max_stratumID - min_stratumID));
+
+# R_S is uniform between 1.0 and r_bound, whichever of the two is smaller first.
+if (r_bound < 1.0) {
+    R_S = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf = "uniform");
+    R_S = r_bound + R_S * (1.0-r_bound);
+} else {
+    R_S = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf = "uniform");
+    R_S = 1.0 + R_S * (r_bound-1);
+}
+
+# SID is the 1-based stratum index of each record; Smap is the
+# (stratum x record) indicator matrix used to broadcast stratum effects.
+SID = round (0.5 + log (R_S) * r_power);
+num_strata = max (SID);
+Smap = table (SID, seq (1, num_records, 1));
+
+# Compute baseline values and standard deviations of X, Y, and beta, at each feature
+# (linear interpolation from the "min" value at the first feature to the "max"
+# value at the last feature).
+
+# Number of interpolation steps; clamped to 1 so that a single feature
+# (num_features = 1) gets the "min" value instead of a 0/0 = NaN baseline.
+feature_steps = max (num_features - 1, 1);
+
+mean_X = mean_X_min + ((mean_X_max - mean_X_min) / feature_steps) * seq (0, num_features - 1, 1);
+mean_Y = mean_Y_min + ((mean_Y_max - mean_Y_min) / feature_steps) * seq (0, num_features - 1, 1);
+betas  =   beta_min + ((  beta_max -   beta_min) / feature_steps) * seq (0, num_features - 1, 1);
+
+stdev_X_within_strata  = stdev_X_within_strata_min  +
+    ((stdev_X_within_strata_max  - stdev_X_within_strata_min ) / feature_steps) * seq (0, num_features - 1, 1);
+stdev_X_between_strata = stdev_X_between_strata_min +
+    ((stdev_X_between_strata_max - stdev_X_between_strata_min) / feature_steps) * seq (0, num_features - 1, 1);
+stdev_Y_within_strata  = stdev_Y_within_strata_min  +
+    ((stdev_Y_within_strata_max  - stdev_Y_within_strata_min ) / feature_steps) * seq (0, num_features - 1, 1);
+stdev_Y_between_strata = stdev_Y_between_strata_min +
+    ((stdev_Y_between_strata_max - stdev_Y_between_strata_min) / feature_steps) * seq (0, num_features - 1, 1);
+
+# Generate X and Y matrices
+
+RX_strata  = Rand (rows = num_features, cols = num_strata,  pdf = "normal");  # transposed
+RY_strata  = Rand (rows = num_features, cols = num_strata,  pdf = "normal");  # to allow
+RX_records = Rand (rows = num_features, cols = num_records, pdf = "normal");  # matrix-vector
+RY_records = Rand (rows = num_features, cols = num_records, pdf = "normal");  # operations
+
+# value = within-stratum noise + (stratum effect + baseline) of the record's stratum;
+# Y additionally receives beta * X.
+t_X = RX_records * stdev_X_within_strata + (RX_strata * stdev_X_between_strata + mean_X) %*% Smap;
+t_Y = RY_records * stdev_Y_within_strata + (RY_strata * stdev_Y_between_strata + mean_Y) %*% Smap + (t_X * betas);
+# Column 1 is the stratum value (SID shifted so that index 1 maps to min_stratumID).
+Data = cbind (min_stratumID - 1 + SID, t(t_X), t(t_Y));
+
+# Set up the NaNs
+
+# Each RNaN* matrix holds 1.0 in a prob_NaN_* fraction of its cells and 0 elsewhere.
+RNaNS = Rand  (rows = num_records, cols = 1, min = 1.0, max = 1.0, sparsity = prob_NaN_in_stratum);
+RNaNX = Rand  (rows = num_records, cols = num_features, min = 1.0, max = 1.0, sparsity = prob_NaN_in_X);
+RNaNY = Rand  (rows = num_records, cols = num_features, min = 1.0, max = 1.0, sparsity = prob_NaN_in_Y);
+Mask = cbind (RNaNS, RNaNX, RNaNY) != 0;
+# (1 - Mask) / (1 - Mask) is 0/0 = NaN where Mask is set and 1/1 = 1 elsewhere;
+# subtracting 1.0 turns that into an offset of NaN or exactly 0, so unmasked
+# cells keep their generated value (previously every one of them was shifted by +1).
+Data = Data + ((1.0 - Mask) / (1.0 - Mask) - 1.0);
+
+# Output the dataset and the auxiliaries
+
+# Column 1 is the stratum, so X occupies columns 2..nf+1 and Y columns nf+2..2*nf+1.
+Xcid = t(seq (2, num_features + 1, 1));
+Ycid = t(seq (num_features + 2, 2 * num_features + 1, 1));
+Aux = cbind (mean_X, mean_Y, betas);
+
+write (Data, fileData, format=fmt);
+write (Xcid, fileXcid, format=fmt);
+write (Ycid, fileYcid, format=fmt);
+write (Aux,  fileAux,  format=fmt);
+
diff --git a/scripts/perftest/datagen/genRandData4SurvAnalysis.dml 
b/scripts/perftest/datagen/genRandData4SurvAnalysis.dml
new file mode 100644
index 0000000000..75117cf6d7
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4SurvAnalysis.dml
@@ -0,0 +1,133 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#  
+# THIS SCRIPT GENERATES RANDOM DATA FOR KAPLAN-MEIER AND COX PROPORTIONAL HAZARD MODELS
+# ASSUMPTION: BASELINE HAZARD HAS WEIBULL DISTRIBUTION WITH PARAMETERS LAMBDA 
AND V
+#
+# INPUT   PARAMETERS:
+# 
---------------------------------------------------------------------------------------------
+# NAME    TYPE     DEFAULT      MEANING
+# 
---------------------------------------------------------------------------------------------
+# type    String   ---          The type of model for which the data is being generated: "kaplan-meier" or "cox"
+# n       Int                   Number of records 
+# l       Double   2.0          Scale parameter (lambda) of the Weibull distribution used for generating timestamps
+# v       Double   1.5          Shape parameter of the Weibull distribution used for generating timestamps (default is 50 if type=cox)
+# p       Double   0.8          1 - probability of a record being censored
+# g       Int      2            If type=kaplan-meier the number of categorical 
features used for grouping 
+# s       Int      1            If type=kaplan-meier the number of categorical 
features used for stratifying
+# f       Int      10           If type=kaplan-meier maximum number of levels 
(i.e., distinct values) of g+s categorical features
+# m       Int      1000         If type=cox the number of features in the model
+# sp      Double   1.0          If type=cox the sparsity of the feature matrix 
+# O       String   ---          Location to write the output matrix containing 
random data for the kaplan-meier or the cox model 
+# B       String   ---          If type=cox location to write the output 
matrix containing the coefficients for the cox model 
+# TE     String   ---                  Location to store column indices of X 
corresponding to timestamp (first row) and event information (second row)
+# F       String   ---                 Location to store column indices of X 
which are to be used for fitting the Cox model
+# fmt     String   "text"       The output format of results of the 
kaplan-meier analysis, such as "text" or "csv"
+# 
---------------------------------------------------------------------------------------------
+# OUTPUTS: 
+# 1- If type=kaplan-meier an n x (2+g+s) matrix O with      
+#    - column 1 contains timestamps generated randomly from a Weibull 
distribution with parameters lambda and v
+#       - column 2 contains the information whether an event occurred (1) or 
data is censored (0)
+#       - columns 3:2+g contain categorical features used for grouping 
+#    - columns 3+g:2+g+s contain categorical features used for stratifying
+#   if type=cox an n x (2+m) matrix O with 
+#       - column 1 contains timestamps generated randomly from a Weibull 
distribution with parameters lambda and v
+#       - column 2 contains the information whether an event occurred (1) or 
data is censored (0)
+#       - columns 3:2+m contain scale features 
+# 2- If type=cox a coefficient matrix B
+# 3- A column matrix TE containing the column indices of X corresponding to 
timestamp (first row) and event information (second row)
+# 4- A column matrix F containing the column indices of X which are to be used 
for KM analysis or fitting the Cox model
+
+# Read the script arguments. type, n, O, B, TE and F are read without
+# ifdef, i.e. they have no default here.
+type = $type; # either "kaplan-meier" or "cox" 
+num_records = $n; 
+# Weibull scale parameter; note that it is read from the argument named "l"
+lambda = ifdef ($l, 2.0); 
+p_event = ifdef ($p, 0.8); # 1 - prob. of a record being censored
+# parameters related to the kaplan-meier model
+n_groups = ifdef ($g, 2);
+n_strata = ifdef ($s, 1);
+max_level = ifdef ($f, 10);
+# parameters related to the cox model
+num_features = ifdef ($m, 1000);  
+sparsity = ifdef ($sp, 1.0); 
+fileO = $O;
+fileB = $B; 
+fileTE = $TE;
+fileF = $F;
+fmtO = ifdef ($fmt, "text"); # $fmt="text" 
+p_censor = 1 - p_event; # prob. that record is censored
+
+# Build the feature matrix X and the raw timestamps T for the requested model.
+# The Weibull shape parameter v defaults to 1.5 for kaplan-meier and 50 for cox.
+if (type == "kaplan-meier") {
+       
+       v = ifdef ($v, 1.5);
+       # generate categorical features used for grouping and stratifying
+       # (ceil of a uniform draw from just above 0 to just below max_level)
+       X = ceil (rand (rows = num_records, cols = n_groups + n_strata, min = 
0.000000001, max = max_level - 0.000000001, pdf = "uniform"));
+       
+       # generate timestamps
+       # U is kept strictly positive so that log (U) is finite
+       U = rand (rows = num_records, cols = 1, min = 0.000000001, max = 1); 
+       T = (-log (U) / lambda) ^ (1/v);
+
+} else if (type == "cox") {
+
+       v = ifdef ($v, 50);
+       # generate feature matrix
+       X = rand (rows = num_records, cols = num_features, min = 1, max = 5, 
pdf = "uniform", sparsity = sparsity);
+
+       # generate coefficients
+       B = rand (rows = num_features, cols = 1, min = -1.0, max = 1.0, pdf = 
"uniform", sparsity = 1.0); # * beta_range;       
+
+       # generate timestamps
+       # same transform as above, with lambda scaled per record by exp (X %*% B)
+       U = rand (rows = num_records, cols = 1, min = 0.000000001, max = 1); 
+       T = (-log (U) / (lambda * exp (X %*% B)) ) ^ (1/v);
+
+} else {
+       stop ("Wrong model type!");
+}
+
+# Y holds the timestamp (column 1) and the event indicator (column 2).
+Y = matrix (0, rows = num_records, cols = 2);
+# floor of a uniform draw between p_event and 1 + p_event:
+# 1 (event) when the draw reaches 1, 0 (censored) otherwise
+event = floor (rand (rows = num_records, cols = 1, min = (1 - p_censor), max = 
(1 + p_event)));
+# number of records with an event
+n_time = sum (event);
+Y[,2] = event;
+       
+# binning of event times
+min_T = min (T);
+max_T = max (T);
+# T = T - min_T;
+len = max_T - min_T;
+# despite its name, num_bins is a bin width: the range of T divided by the
+# number of events; T is then replaced by its 1-based bin number.
+# NOTE(review): n_time = 0 (no events drawn) makes this a division by zero.
+num_bins = len / n_time;
+T = ceil (T / num_bins);
+
+# print ("min(T) " + min(T) + " max(T) " + max(T));
+Y[,1] = T;
+
+# Output matrix: timestamp, event indicator, then the features.
+O = cbind (Y, X);
+write (O, fileO, format = fmtO);
+
+# The coefficient vector B only exists for the cox model.
+if (type == "cox") {
+       write (B, fileB, format = fmtO);
+       
+}
+
+# TE is the 2 x 1 matrix [1; 2]: the positions of the timestamp and event columns.
+TE = matrix ("1 2", rows = 2, cols = 1);
+# NOTE(review): F uses num_features for both model types, although for
+# kaplan-meier X has n_groups + n_strata columns - confirm this is intended.
+F = seq (1, num_features);
+write (TE, fileTE, format = fmtO);
+write (F, fileF, format = fmtO);
+
diff --git a/scripts/perftest/datagen/genRandData4Transform.dml 
b/scripts/perftest/datagen/genRandData4Transform.dml
new file mode 100644
index 0000000000..edab7c2873
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4Transform.dml
@@ -0,0 +1,96 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# Generates random data to test transform with
+#
+# rows, cols: dimensions of the data matrix to be generated
+# prob_cat: probability of each generated col being categorical
+# min_domain, max_domain: provide a range for domain sizes of the generated 
categorical cols
+# prob_missing: percentage of the generated (scale) cols to have missing values
+# prob_missing_cell: probability of a cell to have a missing value
+# out_X, out_missing, out_categorical: output file names
+#
+
+#params for size of data
+num_rows = ifdef($rows, 1000)
+num_cols = ifdef($cols, 25)
+
+#params for kind of cols
+prob_categorical = ifdef($prob_cat, 0.1)
+min_domain_size = ifdef($min_domain, 1)
+max_domain_size = ifdef($max_domain, 10)
+
+#params for missing value cols
+prob_missing_col = ifdef($prob_missing, 0.1)
+prob_missing_val = ifdef($prob_missing_cell, 0.2)
+
+#start out with every col being a scale col
+num_scalar_cols = as.double(num_cols)
+num_categorical_cols = 0.0
+scalar_ind = matrix(1, rows=num_scalar_cols, cols=1)
+if(prob_categorical > 0){
+  #pick each col to be categorical with probability prob_categorical
+  categorical_ind = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform")
+  categorical_ind = categorical_ind < prob_categorical
+  categorical_col_ids = removeEmpty(target=seq(1, num_cols, 1)*categorical_ind, margin="rows")
+  num_categorical_cols = sum(categorical_ind)
+  write(categorical_col_ids, $out_categorical, format="csv")
+  
+  #one domain size per categorical col, between min_domain_size and max_domain_size
+  domain_sizes = Rand(rows=num_categorical_cols, cols=1, min=0, max=1, pdf="uniform")
+  domain_sizes = round(min_domain_size + (max_domain_size - min_domain_size)*domain_sizes)
+  
+  #categorical values are integers from 1 to the domain size of their col
+  categorical_X = Rand(rows=num_rows, cols=num_categorical_cols, min=0, max=1, pdf="uniform")
+  categorical_X = t(round(1 + t(categorical_X)*(domain_sizes - 1)))
+
+  scalar_ind = 1-categorical_ind
+}
+
+scalar_col_ids = removeEmpty(target=seq(1, num_cols, 1)*scalar_ind, margin="rows")
+num_scalar_cols = sum(scalar_ind)
+scalar_X = Rand(rows=num_rows, cols=num_scalar_cols, min=0, max=1, pdf="uniform")
+  
+if(num_categorical_cols > 0 & num_scalar_cols > 0){
+  #X is [scalar cols | categorical cols]; the permutation matrix moves every
+  #col to its selected position in the num_cols-wide output
+  X = cbind(scalar_X, categorical_X)
+  permut_mat = table(seq(1, num_scalar_cols, 1), scalar_col_ids, num_scalar_cols, num_cols)
+  #rows for the categorical cols: route them to categorical_col_ids
+  #(all-zero rows here would wipe the categorical values out of X)
+  fill_in = table(seq(1, num_categorical_cols, 1), categorical_col_ids, num_categorical_cols, num_cols)
+  permut_mat = t(cbind(t(permut_mat), t(fill_in)))
+  X = X %*% permut_mat
+}else{
+  if(num_categorical_cols > 0) X = categorical_X
+  else{
+    if(num_scalar_cols > 0) X = scalar_X
+    else print("somehow, we've managed to compute that precisely 0 cols should be categorical and 0 cols should be scale")
+  }
+}
+
+if(prob_missing_col > 0){
+  missing_col_ind = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform")
+  missing_col_ind = missing_col_ind < prob_missing_col
+  #currently only support missing value imputation for scale cols
+  missing_col_ind = missing_col_ind * scalar_ind
+  missing_col_ids = removeEmpty(target=seq(1, num_cols, 1)*missing_col_ind, margin="rows")
+  #0/1 indicator per cell of the selected cols, set with probability prob_missing_val
+  missing_values = Rand(rows=num_rows, cols=nrow(missing_col_ids), min=0, max=1, pdf="uniform")
+  missing_values = missing_values < prob_missing_val
+  #the indicators are appended as extra cols; the existing cols of X are not modified
+  X = cbind(X, missing_values)
+  
+  write(missing_col_ids, $out_missing, format="csv")
+}
+
+write(X, $out_X, format="csv")
diff --git a/scripts/perftest/datagen/genRandData4Univariate.dml 
b/scripts/perftest/datagen/genRandData4Univariate.dml
new file mode 100644
index 0000000000..bcbd528eb9
--- /dev/null
+++ b/scripts/perftest/datagen/genRandData4Univariate.dml
@@ -0,0 +1,61 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# generates random numbers from a distribution
+# with specified mean, standard deviation, 
+# skewness, kurtosis
+# mean and standard deviation are taken in as
+# arguments by this script
+# a,b,c,d are coefficients computed by some
+# equation solver determined from the specified
+# skewness and kurtosis using power method
+# polynomials
+#
+# for more details see:
+# Statistical Simulation: Power Method Polynomials
+# and Other Transformations
+# Author: Todd C. Headrick
+# Chapman & Hall/CRC, Boca Raton, FL, 2010.
+# ISBN 978-1-4200-6490-2
+
+# $1 is the number of random points to be sampled
+# $2 is specified mean
+# $3 is specified standard deviation
+# $4-$7 are a,b,c,d obtained by solving a system
+# of equations using specified kurtosis and skewness
+# $8 is the file to write out the generated data to
+
+# positional arguments: sample count, target mean and standard deviation,
+# then the four power-method polynomial coefficients
+n_points = $1
+target_mean = $2
+target_stdev = $3
+coef_a = $4
+coef_b = $5
+coef_c = $6
+coef_d = $7
+
+
+print("a=" + coef_a + " b=" + coef_b + " c=" + coef_c + " d=" + coef_d)
+
+# standard-normal draws with a fixed seed, pushed through the cubic polynomial
+N = Rand(rows=n_points, cols=1, pdf="normal", seed=0)
+P = coef_a + coef_b*N + coef_c*N^2 + coef_d*N^3
+
+# scale and shift to the requested standard deviation and mean, then store
+write(P*target_stdev + target_mean, $8, format="binary")
diff --git a/scripts/perftest/datagen/genStratStatisticsData.sh 
b/scripts/perftest/datagen/genStratStatisticsData.sh
new file mode 100644
index 0000000000..330247cce0
--- /dev/null
+++ b/scripts/perftest/datagen/genStratStatisticsData.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+# Generates the stratified-statistics datasets at several sizes.
+#
+# Arguments:
+#   $1 - command used to launch a DML script
+#   $2 - base directory; data is written below $2/stratstats
+#   $3 - numeric limit compared against the per-size thresholds below
+#
+# All paths (datagen/..., results/...) are relative, so the script must be
+# started from the 'perftest' directory.
+if [ "$(basename "$PWD")" != "perftest" ];
+then
+  echo "Please execute scripts from directory 'perftest'"
+  exit 1;
+fi
+
+CMD=$1
+BASE=$2/stratstats
+MAXMEM=$3
+
+FORMAT="binary"
+
+echo "-- Generating stats data..." >> results/times.txt;
+
+# NOTE: ${CMD} is deliberately left unquoted below so that a launcher passed
+# together with options in one argument is still split into words
+# (assumption about callers - verify before quoting it).
+# The three smaller datasets are generated in the background; the L and XL
+# datasets run in the foreground, one after the other.
+
+#XS data 10K rows
+if [ "$MAXMEM" -ge 80 ]; then
+  ${CMD} -f datagen/genRandData4StratStats.dml --explain --stats --nvargs \
+    nr=10000 nf=100 D="${BASE}/A_10k/data" Xcid="${BASE}/A_10k/Xcid" \
+    Ycid="${BASE}/A_10k/Ycid" A="${BASE}/A_10k/A" fmt="$FORMAT" &
+fi
+
+#S data 100K rows
+if [ "$MAXMEM" -ge 800 ]; then
+  ${CMD} -f datagen/genRandData4StratStats.dml --explain --stats --nvargs \
+    nr=100000 nf=100 D="${BASE}/A_100k/data" Xcid="${BASE}/A_100k/Xcid" \
+    Ycid="${BASE}/A_100k/Ycid" A="${BASE}/A_100k/A" fmt="$FORMAT" &
+fi
+
+#M data 1M rows
+if [ "$MAXMEM" -ge 8000 ]; then
+  ${CMD} -f datagen/genRandData4StratStats.dml --explain --stats --nvargs \
+    nr=1000000 nf=100 D="${BASE}/A_1M/data" Xcid="${BASE}/A_1M/Xcid" \
+    Ycid="${BASE}/A_1M/Ycid" A="${BASE}/A_1M/A" fmt="$FORMAT" &
+fi
+
+#L data 10M rows
+if [ "$MAXMEM" -ge 80000 ]; then
+  ${CMD} -f datagen/genRandData4StratStats.dml --explain --stats --nvargs \
+    nr=10000000 nf=100 D="${BASE}/A_10M/data" Xcid="${BASE}/A_10M/Xcid" \
+    Ycid="${BASE}/A_10M/Ycid" A="${BASE}/A_10M/A" fmt="$FORMAT"
+fi
+
+#XL data 100M rows
+# written to its own A_100M directory so it does not overwrite the L dataset
+if [ "$MAXMEM" -ge 800000 ]; then
+  ${CMD} -f datagen/genRandData4StratStats.dml --explain --stats --nvargs \
+    nr=100000000 nf=100 D="${BASE}/A_100M/data" Xcid="${BASE}/A_100M/Xcid" \
+    Ycid="${BASE}/A_100M/Ycid" A="${BASE}/A_100M/A" fmt="$FORMAT"
+fi
+
+# barrier for the background generators started above
+wait
\ No newline at end of file
diff --git a/scripts/perftest/sparkDML2.sh b/scripts/perftest/sparkDML2.sh
index dde9805719..6102fb3d8a 100644
--- a/scripts/perftest/sparkDML2.sh
+++ b/scripts/perftest/sparkDML2.sh
@@ -1,3 +1,25 @@
+#!/bin/bash
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
  #Client mode spark-submit script
 export SPARK_HOME=/home/hadoop/spark-3.3.1-bin-hadoop3
 export HADOOP_CONF_DIR=/home/hadoop/hadoop-3.3.1/etc/hadoop
@@ -13,4 +35,4 @@ $SPARK_HOME/bin/spark-submit \
      --conf spark.network.timeout=512s \
      --executor-memory 200g \
      --executor-cores 48 \
-      SystemDS.jar "$@" 
\ No newline at end of file
+      SystemDS.jar "$@"

(systemds) branch main updated: [SYSTEMDS-3847] Fix perftest refactoring (datagen scripts)

Reply via email to