This is an automated email from the ASF dual-hosted git repository. baunsgaard pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push: new 79be9e9 [SYSTEMDS-2832] Refactoring of old performance benchmarks 79be9e9 is described below commit 79be9e96b1891ef6be1e121b2fff91aed00dc4f0 Author: David Sandru <san...@student.tugraz.at> AuthorDate: Tue Nov 30 13:47:33 2021 +0100 [SYSTEMDS-2832] Refactoring of old performance benchmarks This commit extensively modifies the performance benchmarks to use the built-in functions. Also added are arguments to execute the entire benchmark within specific memory budgets. DIA project WS2021/22 Closes #1481 In detail: - Refactored old statistics benchmarks and changed them to use built-in functions. - Improved logging management for benchmark outputs. - ALS conjugate gradient and direct solve benchmark with prediction. - Added forced execution in specific folder. --- scripts/datagen/genRandData4PCA.dml | 4 +- scripts/perftest/CHANGES.md | 57 -------------- scripts/perftest/MatrixMult.sh | 47 ++++++----- scripts/perftest/MatrixTranspose.sh | 59 ++++++++------ .../{scripts/transpose.dml => conf/env-variables} | 11 +-- scripts/perftest/conf/log4j-off.properties | 10 +-- scripts/perftest/conf/log4j.properties | 56 ++++++------- scripts/perftest/fed/genALS_FedData.sh | 56 +++++++++++++ scripts/perftest/fed/runALSFed.sh | 36 ++++++--- .../perftest/{runALS.sh => fed/runALS_CG_Fed.sh} | 13 ++- scripts/perftest/fed/runAllFed.sh | 7 +- scripts/perftest/genALSData.sh | 58 +++++++++----- scripts/perftest/genBinomialData.sh | 66 ++++++++++------ scripts/perftest/genClusteringData.sh | 66 ++++++++++++++++ scripts/perftest/genDescriptiveStatisticsData.sh | 60 ++++++++++++++ .../{todo => }/genDimensionReductionData.sh | 38 ++++++--- scripts/perftest/genL2SVMData.sh | 6 ++ scripts/perftest/genMultinomialData.sh | 62 +++++++++------ scripts/perftest/genStratStatisticsData.sh | 59 ++++++++++++++ scripts/perftest/{runALS.sh => runALS_CG.sh} | 29 +++++-- scripts/perftest/{runALS.sh => runALS_DS.sh} | 31 ++++++-- 
scripts/perftest/runAll.sh | 75 +++++++++++------- .../{runAllMultinomial.sh => runAllALS.sh} | 44 +++++------ scripts/perftest/runAllBinomial.sh | 15 +++- scripts/perftest/{todo => }/runAllClustering.sh | 37 +++++---- .../{todo => }/runAllDimensionReduction.sh | 30 ++++--- scripts/perftest/runAllMultinomial.sh | 17 +++- scripts/perftest/runAllRegression.sh | 17 +++- scripts/perftest/{todo => }/runAllStats.sh | 43 +++++----- scripts/perftest/{todo => }/runBivarStats.sh | 21 +++-- scripts/perftest/runGLM_binomial_probit.sh | 7 +- scripts/perftest/runGLM_gamma_log.sh | 7 +- scripts/perftest/runGLM_poisson_log.sh | 7 +- .../perftest/{runNaiveBayes.sh => runKmeans.sh} | 32 ++++---- scripts/perftest/runL2SVM.sh | 6 ++ scripts/perftest/runLinearRegCG.sh | 7 +- scripts/perftest/runLinearRegDS.sh | 7 +- scripts/perftest/runMSVM.sh | 8 +- scripts/perftest/runMultiLogReg.sh | 7 +- scripts/perftest/runNaiveBayes.sh | 8 +- scripts/perftest/{todo => }/runPCA.sh | 21 +++-- scripts/perftest/{todo => }/runStratStats.sh | 22 ++++-- scripts/perftest/{todo => }/runUnivarStats.sh | 23 ++++-- .../scripts/{transpose.dml => Kmeans-predict.dml} | 11 +-- .../perftest/scripts/{transpose.dml => Kmeans.dml} | 15 ++-- scripts/perftest/scripts/MM.dml | 2 +- scripts/perftest/scripts/{alsCG.dml => PCA.dml} | 31 ++++---- .../scripts/{transpose.dml => Univar-Stats.dml} | 10 +-- .../scripts/{transpose.dml => als-predict.dml} | 23 +++++- scripts/perftest/scripts/alsCG.dml | 10 +-- scripts/perftest/scripts/{alsCG.dml => alsDS.dml} | 11 ++- .../scripts/{alsCG.dml => bivar-stats.dml} | 26 +++--- .../scripts/{transpose.dml => stratstats.dml} | 14 ++-- scripts/perftest/scripts/transpose.dml | 2 +- scripts/perftest/todo/genClusteringData.sh | 52 ------------ .../perftest/todo/genDescriptiveStatisticsData.sh | 46 ----------- scripts/perftest/todo/genRandLogRegData_LTStats.sh | 0 scripts/perftest/todo/genStratStatisticsData.sh | 41 ---------- scripts/perftest/todo/genTreeData.sh | 15 ++-- 
scripts/perftest/todo/runAllTrees.sh | 2 +- scripts/perftest/todo/runDecTree.sh | 7 +- scripts/perftest/todo/runKmeans.sh | 40 ---------- scripts/perftest/todo/runRandTree.sh | 7 +- scripts/perftest/todo/scripts/decision-tree.dml | 85 ++++++++++++++++++++ scripts/perftest/todo/scripts/random-forest.dml | 92 ++++++++++++++++++++++ 65 files changed, 1164 insertions(+), 670 deletions(-) diff --git a/scripts/datagen/genRandData4PCA.dml b/scripts/datagen/genRandData4PCA.dml index d9e18d8..413d5c4 100644 --- a/scripts/datagen/genRandData4PCA.dml +++ b/scripts/datagen/genRandData4PCA.dml @@ -37,11 +37,11 @@ # Example: # hadoop jar SystemDS.jar -f genRandData4PCA.dml -nvargs R=1000000 C=1000 OUT=/user/biuser/pcaData.mtx FMT=csv -R = ifdef ($R, 10000) +R = ifdef ($R, 10000) C = ifdef ($C, 1000) FMT = ifdef ($FMT, "csv"); -# Modofied version of the procedure from Zou et.al., "Sparse Principal Component Analysis", 2006. +# Modified version of the procedure from Zou et.al., "Sparse Principal Component Analysis", 2006. # V1 ~ N(0,290); V2~N(0,300); V3 = -0.3V1+0.925V2 + e, e ~ N(0,1) V1 = 0 + 290*rand(rows=R, cols=1, pdf="normal"); diff --git a/scripts/perftest/CHANGES.md b/scripts/perftest/CHANGES.md deleted file mode 100755 index a71c9db..0000000 --- a/scripts/perftest/CHANGES.md +++ /dev/null @@ -1,57 +0,0 @@ -<!-- -{% comment %} -Licensed to the Apache Software Foundation (ASF) under one or more -contributor license agreements. See the NOTICE file distributed with -this work for additional information regarding copyright ownership. -The ASF licenses this file to you under the Apache License, Version 2.0 -(the "License"); you may not use this file except in compliance with -the License. 
You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -{% end comment %} ---> - -# New additions to the performance test suite -Most of the new files were copied from the deprecated performance test suite (scripts/perftestDeprecated) and refactored to call SystemDS with additional configuration. -Most of the new DML scripts were copied from scripts/algorithms to scripts/perftest/scripts and then adapted to use built-in functions, if available. - -### General changes of perftest and the refactored files moved from perftestDeprecated to perftest -- Added line for intel oneapi MKL system variable initialization in the matrixmult script. The initialization is commented for now, as it would be executed by the runAll.sh. -- Added commented initialization for MKL system variables in the runAll.sh. -- By default, shell scripts can now be invoked without any additional parameters, but optional arguments can be given for output folder and the command to be ran (MR, SPARK, ECHO). -- Added SystemDS-config.xml in the perftest/conf folder, which is used by all refactored perftest scripts. -- times.txt was moved to the "results" folder in perftest. -- Time measurements appended to results/times.txt are now additionally measured in microseconds instead of just seconds, for the smaller data benchmarks. -- All DML scripts, that are ultimately called by the microbenchmarks, can be found in perftest/scripts. This excludes the original algorithmic scripts that are still in use, if there was no corresponding built-in function. -- Removed the -explain flag from all systemds calls. 
- -### Bash scripts that now call a new DML script that makes use of a built-in function, instead of a fully implemented algorithm -- perftest/runMultiLogReg.sh -> perftest/scripts/MultiLogReg.dml -- perftest/runL2SVM.sh -> perftest/scripts/l2-svm-predict.dml -- perftest/runMSVM.sh -> perftest/scripts/m-svm.dml -- perftest/runMSVM.sh -> perftest/scripts/m-svm-predict.dml -- perftest/runNaiveBayes.sh -> perftest/scripts/naive-bayes.dml -- perftest/runNaiveBayes.sh -> perftest/scripts/naive-bayes-predict.dml -- perftest/runLinearRegCG.sh -> perftest/scripts/LinearRegCG.dml -- perftest/runLinearRegDS.sh -> perftest/scripts/LinearRegDS.dml -- perftest/runGLM_poisson_log.sh -> perftest/scripts/GLM.dml -- perftest/runGLM_gamma_log.sh -> perftest/scripts/GLM.dml -- perftest/runGLM_binomial_probit.sh -> perftest/scripts/GLM.dml - - -### Bash scripts still calling old DML scripts, which fully implement algorithms -- perftest/runMultiLogReg.sh -> algorithms/GLM-predict.dml -- perftest/runLinearRegCG.sh -> algorithms/GLM-predict.dml -- perftest/runLinearRegDS.sh -> algorithms/GLM-predict.dml -- perftest/runGLM_poisson_log.sh -> algorithms/GLM-predict.dml -- perftest/runGLM_gamma_log.sh -> algorithms/GLM-predict.dml -- perftest/runGLM_binomial_probit.sh -> algorithms/GLM-predict.dml - -### Bash scripts that already did call a DML script with a single built-in functions (only needed some refactoring) -- perftest/runL2SVM.sh -> algorithms/l2-svm.dml (This already uses the built-in function l2svm!) \ No newline at end of file diff --git a/scripts/perftest/MatrixMult.sh b/scripts/perftest/MatrixMult.sh index 6bb5e33..ca13899 100755 --- a/scripts/perftest/MatrixMult.sh +++ b/scripts/perftest/MatrixMult.sh @@ -20,51 +20,56 @@ # #------------------------------------------------------------- -# Import MKL -#if [ -d ~/intel ] && [ -d ~/intel/bin ] && [ -f ~/intel/bin/compilervars.sh ]; then -# . 
~/intel/bin/compilervars.sh intel64 -#elif [ -d ~/intel ] && [ -d ~/intel/oneapi ] && [ -f ~/intel/oneapi/setvars.sh ]; then -# # For the new intel oneAPI -# . ~/intel/oneapi/setvars.sh intel64 -#else -# . /opt/intel/bin/compilervars.sh intel64 -#fi - -# Set properties -#export LOG4JPROP='scripts/perftest/conf/log4j-off.properties' -#export SYSDS_QUIET=1 -#export SYSTEMDS_ROOT=$(pwd) -#export PATH=$SYSTEMDS_ROOT/bin:$PATH +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi +if ! command -v perf &> /dev/null +then + echo "Perf stat not installed for matrix operation benchmarks, see README" + exit 0; +fi +CMD=$1 # Logging output -LogName='results/MM.log' -mkdir -p 'results' +LogName='logs/MM.log' rm -f $LogName +tstart=$(date +%s.%N) # Baseline perf stat -d -d -d -r 5 \ - systemds scripts/MM.dml \ + ${CMD} scripts/MM.dml \ -config conf/std.xml \ -stats \ -args 5000 5000 5000 1.0 1.0 3 \ >>$LogName 2>&1 +ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) +echo "Matrix mult 5000x5000 %*% 5000x5000 without mkl/openblas:" $ttrain >> results/times.txt + +tstart=$(date +%s.%N) # MKL perf stat -d -d -d -r 5 \ - systemds scripts/MM.dml \ + ${CMD} scripts/MM.dml \ -config conf/mkl.xml \ -stats \ -args 5000 5000 5000 1.0 1.0 3 \ >>$LogName 2>&1 +ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) +echo "Matrix mult 5000x5000 %*% 5000x5000 with mkl:" $ttrain >> results/times.txt +tstart=$(date +%s.%N) # Open Blas perf stat -d -d -d -r 5 \ - systemds scripts/MM.dml \ + ${CMD} scripts/MM.dml \ -config conf/openblas.xml \ -stats \ -args 5000 5000 5000 1.0 1.0 3 \ >>$LogName 2>&1 +ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) +echo "Matrix mult 5000x5000 %*% 5000x5000 with openblas:" $ttrain >> results/times.txt -cat $LogName | grep -E ' ba\+\* |Total elapsed time|-----------| instructions | cycles | CPUs utilized ' | tee $LogName.log \ No newline at end of file +cat $LogName | grep -E ' ba\+\* |Total 
elapsed time|-----------| instructions | cycles | CPUs utilized ' >> $LogName.log \ No newline at end of file diff --git a/scripts/perftest/MatrixTranspose.sh b/scripts/perftest/MatrixTranspose.sh index 90db557..50141bb 100755 --- a/scripts/perftest/MatrixTranspose.sh +++ b/scripts/perftest/MatrixTranspose.sh @@ -20,16 +20,19 @@ # #------------------------------------------------------------- -# Set properties -#export LOG4JPROP='scripts/perftest/conf/log4j-off.properties' -#export SYSDS_QUIET=1 -#export SYSTEMDS_ROOT=$(pwd) -#export PATH=$SYSTEMDS_ROOT/bin:$PATH +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi -# export SYSTEMDS_STANDALONE_OPTS="-Xmx20g -Xms20g -Xmn2000m" -export SYSTEMDS_STANDALONE_OPTS="-Xmx10g -Xms10g -Xmn2000m" +if ! command -v perf &> /dev/null +then + echo "Perf stat not installed for matrix operation benchmarks, see README" + exit 0; +fi -mkdir -p 'results' +CMD=$1 repeatScript=5 methodRepeat=5 @@ -37,60 +40,68 @@ sparsities=("1.0 0.1") for s in $sparsities; do - LogName="results/transpose-skinny-$s.log" + LogName="logs/transpose-skinny-$s.log" rm -f $LogName + tstart=$(date +%s.%N) # Baseline perf stat -d -d -d -r $repeatScript \ - systemds scripts/transpose.dml \ + ${CMD} scripts/transpose.dml \ -config conf/std.xml \ -stats \ -args 2500000 50 $s $methodRepeat \ >>$LogName 2>&1 + ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) + echo "Matrix transpose 2500000x50 matrix and sparsity "$s ": " $ttrain >> results/times.txt - echo $LogName - cat $LogName | grep -E ' r. |Total elapsed time|-----------| instructions | cycles | CPUs utilized ' | tee $LogName.log + cat $LogName | grep -E ' r. 
|Total elapsed time|-----------| instructions | cycles | CPUs utilized ' >> $LogName.log - LogName="results/transpose-wide-$s.log" + LogName="logs/transpose-wide-$s.log" rm -f $LogName + tstart=$(date +%s.%N) # Baseline perf stat -d -d -d -r $repeatScript \ - systemds scripts/transpose.dml \ + ${CMD} scripts/transpose.dml \ -config conf/std.xml \ -stats \ -args 50 2500000 $s $methodRepeat \ >>$LogName 2>&1 + ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) + echo "Matrix transpose 50x2500000 matrix and sparsity "$s ": "$ttrain >> results/times.txt - echo $LogName - cat $LogName | grep -E ' r. |Total elapsed time|-----------| instructions | cycles | CPUs utilized ' | tee $LogName.log + cat $LogName | grep -E ' r. |Total elapsed time|-----------| instructions | cycles | CPUs utilized ' >> $LogName.log - LogName="results/transpose-full-$s.log" + LogName="logs/transpose-full-$s.log" rm -f $LogName + tstart=$(date +%s.%N) # Baseline perf stat -d -d -d -r $repeatScript \ - systemds scripts/transpose.dml \ + ${CMD} scripts/transpose.dml \ -config conf/std.xml \ -stats \ -args 20000 5000 $s $methodRepeat \ >>$LogName 2>&1 + ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) + echo "Matrix transpose 20000x5000 matrix and sparsity "$s ": " $ttrain >> results/times.txt - echo $LogName - cat $LogName | grep -E ' r. |Total elapsed time|-----------| instructions | cycles | CPUs utilized ' | tee $LogName.log + cat $LogName | grep -E ' r. 
|Total elapsed time|-----------| instructions | cycles | CPUs utilized ' >> $LogName.log done -LogName="results/transpose-large.log" +LogName="logs/transpose-large.log" rm -f $LogName # Baseline +tstart=$(date +%s.%N) perf stat -d -d -d -r $repeatScript \ - systemds scripts/transpose.dml \ + ${CMD} scripts/transpose.dml \ -config conf/std.xml \ -stats \ -args 15000000 30 0.8 $methodRepeat \ >>$LogName 2>&1 +ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) +echo "Matrix transpose 15000000x30 matrix and sparsity 0.8: " $ttrain >> results/times.txt -echo $LogName -cat $LogName | grep -E ' r. |Total elapsed time|-----------| instructions | cycles | CPUs utilized ' | tee $LogName.log +cat $LogName | grep -E ' r. |Total elapsed time|-----------| instructions | cycles | CPUs utilized ' >> $LogName.log diff --git a/scripts/perftest/scripts/transpose.dml b/scripts/perftest/conf/env-variables old mode 100755 new mode 100644 similarity index 82% copy from scripts/perftest/scripts/transpose.dml copy to scripts/perftest/conf/env-variables index 2fb2f0d..1549aa1 --- a/scripts/perftest/scripts/transpose.dml +++ b/scripts/perftest/conf/env-variables @@ -1,3 +1,4 @@ +#!/bin/bash #------------------------------------------------------------- # # Licensed to the Apache Software Foundation (ASF) under one @@ -19,8 +20,8 @@ # #------------------------------------------------------------- -x = rand(rows=$1, cols=$2, min= 0.0, max= 1.0, sparsity=$3, seed= 12) -for(i in 1:$4) { - res = t(x) -} -print(sum(res)) \ No newline at end of file +export LOG4JPROP='conf/log4j-off.properties' +export SYSDS_QUIET=1 + +# stratstats needs a large heap for datasize of 800MB +# export SYSTEMDS_STANDALONE_OPTS="-Xmx10g -Xms10g -Xmn2000m" \ No newline at end of file diff --git a/scripts/perftest/conf/log4j-off.properties b/scripts/perftest/conf/log4j-off.properties index bbbee4d..39f2cd4 100755 --- a/scripts/perftest/conf/log4j-off.properties +++ b/scripts/perftest/conf/log4j-off.properties @@ -21,12 
+21,12 @@ log4j.rootLogger=ALL, console -log4j.logger.org.apache.sysds=INFO -log4j.logger.org.apache.spark=ERROR -log4j.logger.org.apache.hadoop=ERROR -log4j.logger.io.netty=INFO +log4j.logger.org.apache.sysds=OFF +log4j.logger.org.apache.spark=OFF +log4j.logger.org.apache.hadoop=OFF +log4j.logger.io.netty=OFF log4j.appender.console=org.apache.log4j.ConsoleAppender log4j.appender.console.target=System.err log4j.appender.console.layout=org.apache.log4j.PatternLayout -log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{5}: %m%n diff --git a/scripts/perftest/conf/log4j.properties b/scripts/perftest/conf/log4j.properties index fbfd465..7308334 100644 --- a/scripts/perftest/conf/log4j.properties +++ b/scripts/perftest/conf/log4j.properties @@ -1,40 +1,32 @@ +#------------------------------------------------------------- # -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. # +#------------------------------------------------------------- + +log4j.rootLogger=ALL, console + +log4j.logger.org.apache.sysds=ERROR +log4j.logger.org.apache.spark=ERROR +log4j.logger.org.apache.hadoop=ERROR +log4j.logger.io.netty=ERROR -# Set everything to be logged to the console -log4j.rootCategory=ERROR, console log4j.appender.console=org.apache.log4j.ConsoleAppender log4j.appender.console.target=System.err log4j.appender.console.layout=org.apache.log4j.PatternLayout -log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n - -# Set the default spark-shell log level to WARN. When running the spark-shell, the -# log level for this class is used to overwrite the root logger's log level, so that -# the user can have different defaults for the shell and regular Spark apps. 
-log4j.logger.org.apache.spark.repl.Main=WARN - -# Settings to quiet third party logs that are too verbose -log4j.logger.org.spark_project.jetty=WARN -log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR -log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO -log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO -log4j.logger.org.apache.parquet=ERROR -log4j.logger.parquet=ERROR - -# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support -log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL -log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR +log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{2}: %m%n diff --git a/scripts/perftest/fed/genALS_FedData.sh b/scripts/perftest/fed/genALS_FedData.sh new file mode 100755 index 0000000..af0ac6f --- /dev/null +++ b/scripts/perftest/fed/genALS_FedData.sh @@ -0,0 +1,56 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +CMD=$1 +DATADIR=$2 +MAXMEM=$3 + +FORMAT="binary" # can be csv, mm, text, binary +DENSE_SP=0.9 +SPARSE_SP=0.01 + +BASEPATH=$(dirname $0) + +#generate XS scenarios (80MB) +if [ $MAXMEM -lt 80 ]; then exit 0; fi +${CMD} -f ${BASEPATH}/../../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10k_1k_dense rows=10000 cols=1000 rank=10 nnz=`echo "scale=0; 10000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT +${CMD} -f ${BASEPATH}/../../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10k_1k_sparse rows=10000 cols=1000 rank=10 nnz=`echo "scale=0; 10000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT + +#generate S scenarios (800MB) +if [ $MAXMEM -lt 800 ]; then exit 0; fi +${CMD} -f ${BASEPATH}/../../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X100k_1k_dense rows=100000 cols=1000 rank=10 nnz=`echo "scale=0; 100000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT +${CMD} -f ${BASEPATH}/../../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X100k_1k_sparse rows=100000 cols=1000 rank=10 nnz=`echo "scale=0; 100000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT + +#generate M scenarios (8GB) +if [ $MAXMEM -lt 8000 ]; then exit 0; fi +${CMD} -f ${BASEPATH}/../../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X1M_1k_dense rows=1000000 cols=1000 rank=10 nnz=`echo "scale=0; 1000000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT +${CMD} -f ${BASEPATH}/../../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X1M_1k_sparse rows=1000000 cols=1000 rank=10 nnz=`echo "scale=0; 1000000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT + +#generate L scenarios (80GB) +if [ $MAXMEM -lt 80000 ]; then exit 0; fi +${CMD} -f ${BASEPATH}/../../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10M_1k_dense rows=10000000 cols=1000 rank=10 nnz=`echo "scale=0; 10000000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT +${CMD} -f ${BASEPATH}/../../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10M_1k_sparse 
rows=10000000 cols=1000 rank=10 nnz=`echo "scale=0; 10000000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT + +#generate XL scenarios (800GB) +if [ $MAXMEM -lt 800000 ]; then exit 0; fi +${CMD} -f ${BASEPATH}/../../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X100M_1k_dense rows=100000000 cols=1000 rank=10 nnz=`echo "scale=0; 100000000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT +${CMD} -f ${BASEPATH}/../../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X100M_1k_sparse rows=100000000 cols=1000 rank=10 nnz=`echo "scale=0; 100000000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT diff --git a/scripts/perftest/fed/runALSFed.sh b/scripts/perftest/fed/runALSFed.sh index 9204d50..e37d25e 100755 --- a/scripts/perftest/fed/runALSFed.sh +++ b/scripts/perftest/fed/runALSFed.sh @@ -22,8 +22,9 @@ CMD=${1:-"systemds"} DATADIR=${2:-"temp"}/als -NUMFED=${3:-4} -MAXITR=${4:-100} +MAXMEM=${3:-80} +NUMFED=${4:-4} +MAXITR=${5:-100} FILENAME=$0 err_report() { @@ -35,24 +36,41 @@ trap 'err_report $LINENO' ERR export SYSDS_QUIET=1 BASEPATH=$(dirname "$0") +TEMPFILENAME=$(basename -- "$FILENAME") +BASEFILENAME=${TEMPFILENAME%.*} -${BASEPATH}/../genALSData.sh systemds $DATADIR; # generate the data +${BASEPATH}/genALS_FedData.sh $CMD $DATADIR $MAXMEM &> ${BASEPATH}/../logs/genALS_FedData.out; # generate the data + +DATA=() +if [ $MAXMEM -ge 80 ]; then DATA+=("10k_1k_dense" "10k_1k_sparse"); fi +if [ $MAXMEM -ge 800 ]; then DATA+=("100k_1k_dense" "100k_1k_sparse"); fi +if [ $MAXMEM -ge 8000 ]; then DATA+=("1M_1k_dense" "1M_1k_sparse"); fi +if [ $MAXMEM -ge 80000 ]; then DATA+=("10M_1k_dense" "10M_1k_sparse"); fi +if [ $MAXMEM -ge 800000 ]; then DATA+=("100M_1k_dense" "100M_1k_sparse"); fi # start the federated workers on localhost -${BASEPATH}/utils/startFedWorkers.sh systemds $DATADIR $NUMFED "localhost"; +date &> ${BASEPATH}/../logs/runAllFed.out +${BASEPATH}/utils/startFedWorkers.sh $CMD $DATADIR $NUMFED "localhost" &>> ${BASEPATH}/../logs/runAllFed.out; + +echo "test 
1" -for d in "10k_1k_dense" "10k_1k_sparse" # "100k_1k_dense" "100k_1k_sparse" "1M_1k_dense" "1M_1k_sparse" "10M_1k_dense" "10M_1k_sparse" "100M_1k_dense" "100M_1k_sparse" +for d in ${DATA[@]} do - # split the generated data into paritions and create a federated object + # split the generated data into partitions and create a federated object ${CMD} -f ${BASEPATH}/data/splitAndMakeFederated.dml \ --config ${BASEPATH}/../conf/SystemDS-config.xml \ --nvargs data=${DATADIR}/X${d} nSplit=$NUMFED transposed=FALSE \ - target=${DATADIR}/X${d}_fed.json hosts=${DATADIR}/workers/hosts fmt="csv" + target=${DATADIR}/X${d}_fed.json hosts=${DATADIR}/workers/hosts fmt="csv" \ + &> ${BASEPATH}/../logs/${BASEFILENAME}_${d}.out; echo "-- Running ALS-CG with federated data ("$d") on "$NUMFED" federated workers" >> results/times.txt # run the als algorithm on the federated object - ${BASEPATH}/../runALS.sh ${DATADIR}/X${d}_fed.json $MAXITR $DATADIR systemds 0.001 FALSE; + ${BASEPATH}/runALS_CG_Fed.sh ${DATADIR}/X${d}_fed.json $MAXITR $DATADIR $CMD 0.001 FALSE &>> ${BASEPATH}/../logs/${BASEFILENAME}_${d}.out; done -${BASEPATH}/utils/killFedWorkers.sh $DATADIR; # kill the federated workers +echo "test 2" + +${BASEPATH}/utils/killFedWorkers.sh $DATADIR &>> ${BASEPATH}/../logs/runAllFed.out; # kill the federated workers + +echo "test 3" \ No newline at end of file diff --git a/scripts/perftest/runALS.sh b/scripts/perftest/fed/runALS_CG_Fed.sh similarity index 81% copy from scripts/perftest/runALS.sh copy to scripts/perftest/fed/runALS_CG_Fed.sh index 0cb3524..99a101a 100755 --- a/scripts/perftest/runALS.sh +++ b/scripts/perftest/fed/runALS_CG_Fed.sh @@ -22,7 +22,7 @@ X=$1 MAXITER=${2:-100} -DATADIR=${3:-"temp"}/als +DATADIR=${3:-"temp"} CMD=${4:-"systemds"} THRESHOLD=${5:-0.0001} VERBOSE=${6:-FALSE} @@ -33,16 +33,15 @@ err_report() { } trap 'err_report $LINENO' ERR -tstart=$(date +%s.%N) - BASEPATH=$(dirname "$0") tstart=$(date +%s.%N) -${CMD} -f ${BASEPATH}/scripts/alsCG.dml \ +${CMD} -f 
${BASEPATH}/../scripts/alsCG.dml \ --config ${BASEPATH}/conf/SystemDS-config.xml \ - --nvargs X=$X rank=15 reg="L2" lambda=0.000001 maxiter=$MAXITER thr=$THRESHOLD verbose=$VERBOSE modelB=${DATADIR}/B modelM=${DATADIR}/M fmt="csv" + --stats \ + --nvargs X=$X rank=15 reg="L2" lambda=0.000001 maxiter=$MAXITER thr=$THRESHOLD verbose=$VERBOSE modelU=${DATADIR}/U modelV=${DATADIR}/V fmt="csv" -tend=$(echo "$(date +%s.%N) - $tstart - .4" | bc) -echo "ALS-CG algorithm on "$X": "$tend >> results/times.txt +ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) +echo "ALS-CG algorithm on "$X": "$ttrain >> results/times.txt diff --git a/scripts/perftest/fed/runAllFed.sh b/scripts/perftest/fed/runAllFed.sh index d142af4..5c5f46e 100755 --- a/scripts/perftest/fed/runAllFed.sh +++ b/scripts/perftest/fed/runAllFed.sh @@ -22,9 +22,8 @@ COMMAND=${1:-"systemds"} TEMPFOLDER=${2:-"temp"} - +MAXMEM=$3 DATADIR=${TEMPFOLDER}/fed - NUMFED=5 FILENAME=$0 @@ -33,6 +32,8 @@ err_report() { } trap 'err_report $LINENO' ERR +if [ ! -d logs ]; then mkdir -p logs ; fi + BASEPATH=$(dirname "$0") # Set properties @@ -43,5 +44,5 @@ if [ ! -d results ]; then mkdir -p results ; fi echo "RUN FEDERATED EXPERIMENTS: "$(date) >> results/times.txt -${BASEPATH}/runALSFed.sh systemds $DATADIR $NUMFED +${BASEPATH}/runALSFed.sh $COMMAND $DATADIR $MAXMEM $NUMFED diff --git a/scripts/perftest/genALSData.sh b/scripts/perftest/genALSData.sh index 3c18783..fef1eb4 100755 --- a/scripts/perftest/genALSData.sh +++ b/scripts/perftest/genALSData.sh @@ -19,32 +19,48 @@ # under the License. 
# #------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi CMD=$1 -DATADIR=$2 +DATADIR=$2/als +MAXMEM=$3 -FORMAT="binary" # can be csv, mm, text, binary +FORMAT="text" # can be csv, mm, text, binary DENSE_SP=0.9 SPARSE_SP=0.01 -BASEPATH=$(dirname $0) - #generate XS scenarios (80MB) -${CMD} -f ${BASEPATH}/../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10k_1k_dense rows=10000 cols=1000 rank=10 nnz=`echo "scale=0; 10000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT -${CMD} -f ${BASEPATH}/../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10k_1k_sparse rows=10000 cols=1000 rank=10 nnz=`echo "scale=0; 10000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT +if [ $MAXMEM -ge 80 ]; then + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10k_1k_dense rows=10000 cols=1000 rank=10 nnz=`echo "scale=0; 10000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT & + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10k_1k_sparse rows=10000 cols=1000 rank=10 nnz=`echo "scale=0; 10000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT & +fi -# #generate S scenarios (800MB) -# ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X100k_1k_dense rows=100000 cols=1000 rank=10 nnz=`echo "scale=0; 100000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT -# ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X100k_1k_sparse rows=100000 cols=1000 rank=10 nnz=`echo "scale=0; 100000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT -# -# #generate M scenarios (8GB) -# ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X1M_1k_dense rows=1000000 cols=1000 rank=10 nnz=`echo "scale=0; 1000000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT -# ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X1M_1k_sparse rows=1000000 cols=1000 rank=10 nnz=`echo "scale=0; 1000000 * 1000 * $SPARSE_SP" | bc` 
sigma=0.01 fmt=$FORMAT -# -# #generate L scenarios (80GB) -# ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10M_1k_dense rows=10000000 cols=1000 rank=10 nnz=`echo "scale=0; 10000000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT -# ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10M_1k_sparse rows=10000000 cols=1000 rank=10 nnz=`echo "scale=0; 10000000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT -# -# #generate XL scenarios (800GB) -# ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X100M_1k_dense rows=100000000 cols=1000 rank=10 nnz=`echo "scale=0; 100000000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT -# ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X100M_1k_sparse rows=100000000 cols=1000 rank=10 nnz=`echo "scale=0; 100000000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT +#generate S scenarios (800MB) +if [ $MAXMEM -ge 800 ]; then + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X100k_1k_dense rows=100000 cols=1000 rank=10 nnz=`echo "scale=0; 100000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT & + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X100k_1k_sparse rows=100000 cols=1000 rank=10 nnz=`echo "scale=0; 100000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT & +fi + +#generate M scenarios (8GB) +if [ $MAXMEM -ge 8000 ]; then + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X1M_1k_dense rows=1000000 cols=1000 rank=10 nnz=`echo "scale=0; 1000000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT & + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X1M_1k_sparse rows=1000000 cols=1000 rank=10 nnz=`echo "scale=0; 1000000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT & +fi + +#generate L scenarios (80GB) +if [ $MAXMEM -ge 80000 ]; then + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10M_1k_dense rows=10000000 cols=1000 rank=10 nnz=`echo "scale=0; 10000000 * 1000 * $DENSE_SP" | bc` sigma=0.01 
fmt=$FORMAT + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10M_1k_sparse rows=10000000 cols=1000 rank=10 nnz=`echo "scale=0; 10000000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT +fi + +#generate XL scenarios (800GB) +if [ $MAXMEM -ge 800000 ]; then + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X100M_1k_dense rows=100000000 cols=1000 rank=10 nnz=`echo "scale=0; 100000000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X100M_1k_sparse rows=100000000 cols=1000 rank=10 nnz=`echo "scale=0; 100000000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT +fi + +wait \ No newline at end of file diff --git a/scripts/perftest/genBinomialData.sh b/scripts/perftest/genBinomialData.sh index 8fda720..a8027ae 100755 --- a/scripts/perftest/genBinomialData.sh +++ b/scripts/perftest/genBinomialData.sh @@ -19,40 +19,58 @@ # under the License. # #------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi CMD=$1 BASE=$2/binomial +MAXMEM=$3 FORMAT="binary" # can be csv, mm, text, binary DENSE_SP=0.9 SPARSE_SP=0.01 #generate XS scenarios (80MB) -${CMD} -f ./datagen/genRandData4LogisticRegression.dml --args 10000 1000 5 5 ${BASE}/w10k_1k_dense ${BASE}/X10k_1k_dense ${BASE}/y10k_1k_dense 1 0 $DENSE_SP $FORMAT 1 -${CMD} -f ./datagen/genRandData4LogisticRegression.dml --args 10000 1000 5 5 ${BASE}/w10k_1k_sparse ${BASE}/X10k_1k_sparse ${BASE}/y10k_1k_sparse 1 0 $SPARSE_SP $FORMAT 1 -${CMD} -f scripts/extractTestData.dml --args ${BASE}/X10k_1k_dense ${BASE}/y10k_1k_dense ${BASE}/X10k_1k_dense_test ${BASE}/y10k_1k_dense_test $FORMAT -${CMD} -f scripts/extractTestData.dml --args ${BASE}/X10k_1k_sparse ${BASE}/y10k_1k_sparse ${BASE}/X10k_1k_sparse_test ${BASE}/y10k_1k_sparse_test $FORMAT +if [ $MAXMEM -ge 80 ]; then + ${CMD} -f 
../datagen/genRandData4LogisticRegression.dml --args 10000 1000 5 5 ${BASE}/w10k_1k_dense ${BASE}/X10k_1k_dense ${BASE}/y10k_1k_dense 1 0 $DENSE_SP $FORMAT 1 & pidDense80=$! + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 10000 1000 5 5 ${BASE}/w10k_1k_sparse ${BASE}/X10k_1k_sparse ${BASE}/y10k_1k_sparse 1 0 $SPARSE_SP $FORMAT 1 & pidSparse80=$! + wait $pidDense80; ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X10k_1k_dense ${BASE}/y10k_1k_dense ${BASE}/X10k_1k_dense_test ${BASE}/y10k_1k_dense_test $FORMAT & + wait $pidSparse80; ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X10k_1k_sparse ${BASE}/y10k_1k_sparse ${BASE}/X10k_1k_sparse_test ${BASE}/y10k_1k_sparse_test $FORMAT & +fi ##generate S scenarios (800MB) -${CMD} -f ./datagen/genRandData4LogisticRegression.dml --args 100000 1000 5 5 ${BASE}/w100k_1k_dense ${BASE}/X100k_1k_dense ${BASE}/y100k_1k_dense 1 0 $DENSE_SP $FORMAT 1 -${CMD} -f ./datagen/genRandData4LogisticRegression.dml --args 100000 1000 5 5 ${BASE}/w100k_1k_sparse ${BASE}/X100k_1k_sparse ${BASE}/y100k_1k_sparse 1 0 $SPARSE_SP $FORMAT 1 -${CMD} -f scripts/extractTestData.dml --args ${BASE}/X100k_1k_dense ${BASE}/y100k_1k_dense ${BASE}/X100k_1k_dense_test ${BASE}/y100k_1k_dense_test $FORMAT -${CMD} -f scripts/extractTestData.dml --args ${BASE}/X100k_1k_sparse ${BASE}/y100k_1k_sparse ${BASE}/X100k_1k_sparse_test ${BASE}/y100k_1k_sparse_test $FORMAT - -##generate M scenarios (8GB) -${CMD} -f ./datagen/genRandData4LogisticRegression.dml --args 1000000 1000 5 5 ${BASE}/w1M_1k_dense ${BASE}/X1M_1k_dense ${BASE}/y1M_1k_dense 1 0 $DENSE_SP $FORMAT 1 -${CMD} -f ./datagen/genRandData4LogisticRegression.dml --args 1000000 1000 5 5 ${BASE}/w1M_1k_sparse ${BASE}/X1M_1k_sparse ${BASE}/y1M_1k_sparse 1 0 $SPARSE_SP $FORMAT 1 -${CMD} -f scripts/extractTestData.dml --args ${BASE}/X1M_1k_dense ${BASE}/y1M_1k_dense ${BASE}/X1M_1k_dense_test ${BASE}/y1M_1k_dense_test $FORMAT -${CMD} -f scripts/extractTestData.dml --args 
${BASE}/X1M_1k_sparse ${BASE}/y1M_1k_sparse ${BASE}/X1M_1k_sparse_test ${BASE}/y1M_1k_sparse_test $FORMAT - -##generate L scenarios (80GB) -${CMD} -f ./datagen/genRandData4LogisticRegression.dml --args 10000000 1000 5 5 ${BASE}/w10M_1k_dense ${BASE}/X10M_1k_dense ${BASE}/y10M_1k_dense 1 0 $DENSE_SP $FORMAT 1 -${CMD} -f ./datagen/genRandData4LogisticRegression.dml --args 10000000 1000 5 5 ${BASE}/w10M_1k_sparse ${BASE}/X10M_1k_sparse ${BASE}/y10M_1k_sparse 1 0 $SPARSE_SP $FORMAT 1 -${CMD} -f scripts/extractTestData.dml --args ${BASE}/X10M_1k_dense ${BASE}/y10M_1k_dense ${BASE}/X10M_1k_dense_test ${BASE}/y10M_1k_dense_test $FORMAT -${CMD} -f scripts/extractTestData.dml --args ${BASE}/X10M_1k_sparse ${BASE}/y10M_1k_sparse ${BASE}/X10M_1k_sparse_test ${BASE}/y10M_1k_sparse_test $FORMAT +if [ $MAXMEM -ge 800 ]; then + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 100000 1000 5 5 ${BASE}/w100k_1k_dense ${BASE}/X100k_1k_dense ${BASE}/y100k_1k_dense 1 0 $DENSE_SP $FORMAT 1 & pidDense800=$! + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 100000 1000 5 5 ${BASE}/w100k_1k_sparse ${BASE}/X100k_1k_sparse ${BASE}/y100k_1k_sparse 1 0 $SPARSE_SP $FORMAT 1 & pidSparse800=$! + wait $pidDense800; ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X100k_1k_dense ${BASE}/y100k_1k_dense ${BASE}/X100k_1k_dense_test ${BASE}/y100k_1k_dense_test $FORMAT & + wait $pidSparse800; ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X100k_1k_sparse ${BASE}/y100k_1k_sparse ${BASE}/X100k_1k_sparse_test ${BASE}/y100k_1k_sparse_test $FORMAT & +fi + +#generate M scenarios (8GB) +if [ $MAXMEM -ge 8000 ]; then + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 1000000 1000 5 5 ${BASE}/w1M_1k_dense ${BASE}/X1M_1k_dense ${BASE}/y1M_1k_dense 1 0 $DENSE_SP $FORMAT 1 & pidDense8000=$! 
+ ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 1000000 1000 5 5 ${BASE}/w1M_1k_sparse ${BASE}/X1M_1k_sparse ${BASE}/y1M_1k_sparse 1 0 $SPARSE_SP $FORMAT 1 & pidSparse8000=$! + wait $pidDense8000; ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X1M_1k_dense ${BASE}/y1M_1k_dense ${BASE}/X1M_1k_dense_test ${BASE}/y1M_1k_dense_test $FORMAT & + wait $pidSparse8000; ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X1M_1k_sparse ${BASE}/y1M_1k_sparse ${BASE}/X1M_1k_sparse_test ${BASE}/y1M_1k_sparse_test $FORMAT & +fi + +#generate L scenarios (80GB) +if [ $MAXMEM -ge 80000 ]; then + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 10000000 1000 5 5 ${BASE}/w10M_1k_dense ${BASE}/X10M_1k_dense ${BASE}/y10M_1k_dense 1 0 $DENSE_SP $FORMAT 1 + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 10000000 1000 5 5 ${BASE}/w10M_1k_sparse ${BASE}/X10M_1k_sparse ${BASE}/y10M_1k_sparse 1 0 $SPARSE_SP $FORMAT 1 + ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X10M_1k_dense ${BASE}/y10M_1k_dense ${BASE}/X10M_1k_dense_test ${BASE}/y10M_1k_dense_test $FORMAT + ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X10M_1k_sparse ${BASE}/y10M_1k_sparse ${BASE}/X10M_1k_sparse_test ${BASE}/y10M_1k_sparse_test $FORMAT +fi ##generate XL scenarios (800GB) -#${CMD} -f ./datagen/genRandData4LogisticRegression.dml --args 100000000 1000 5 5 ${BASE}/w100M_1k_dense ${BASE}/X100M_1k_dense ${BASE}/y100M_1k_dense 1 0 $DENSE_SP $FORMAT 1 -#${CMD} -f ./datagen/genRandData4LogisticRegression.dml --args 100000000 1000 5 5 ${BASE}/w100M_1k_sparse ${BASE}/X100M_1k_sparse ${BASE}/y100M_1k_sparse 1 0 $SPARSE_SP $FORMAT 1 -#${CMD} -f scripts/extractTestData.dml --args ${BASE}/X100M_1k_dense ${BASE}/y100M_1k_dense ${BASE}/X100M_1k_dense_test ${BASE}/y100M_1k_dense_test $FORMAT -#${CMD} -f scripts/extractTestData.dml --args ${BASE}/X100M_1k_sparse ${BASE}/y100M_1k_sparse ${BASE}/X100M_1k_sparse_test ${BASE}/y100M_1k_sparse_test $FORMAT \ No newline at end of 
file +if [ $MAXMEM -ge 800000 ]; then + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 100000000 1000 5 5 ${BASE}/w100M_1k_dense ${BASE}/X100M_1k_dense ${BASE}/y100M_1k_dense 1 0 $DENSE_SP $FORMAT 1 + ${CMD} -f ../datagen/genRandData4LogisticRegression.dml --args 100000000 1000 5 5 ${BASE}/w100M_1k_sparse ${BASE}/X100M_1k_sparse ${BASE}/y100M_1k_sparse 1 0 $SPARSE_SP $FORMAT 1 + ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X100M_1k_dense ${BASE}/y100M_1k_dense ${BASE}/X100M_1k_dense_test ${BASE}/y100M_1k_dense_test $FORMAT + ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X100M_1k_sparse ${BASE}/y100M_1k_sparse ${BASE}/X100M_1k_sparse_test ${BASE}/y100M_1k_sparse_test $FORMAT +fi + +wait \ No newline at end of file diff --git a/scripts/perftest/genClusteringData.sh b/scripts/perftest/genClusteringData.sh new file mode 100755 index 0000000..02df510 --- /dev/null +++ b/scripts/perftest/genClusteringData.sh @@ -0,0 +1,66 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi + +CMD=$1 +BASE=$2/clustering +MAXMEM=$3 + +FORMAT="binary" +DENSE_SP=0.9 +SPARSE_SP=0.01 + +#generate XS scenarios (80MB) +if [ $MAXMEM -ge 80 ]; then + ${CMD} -f ../datagen/genRandData4Kmeans.dml --nvargs nr=10000 nf=1000 nc=5 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X10k_1k_dense C=$BASE/C10k_1k_dense Y=$BASE/y10k_1k_dense YbyC=$BASE/YbyC10k_1k_dense fmt=$FORMAT & pidDense80=$! + wait $pidDense80; ${CMD} -f scripts/extractTestData.dml --args $BASE/X10k_1k_dense $BASE/y10k_1k_dense $BASE/X10k_1k_dense_test $BASE/y10k_1k_dense_test $FORMAT & +fi + +#generate S scenarios (800MB) +if [ $MAXMEM -ge 800 ]; then + ${CMD} -f ../datagen/genRandData4Kmeans.dml --nvargs nr=100000 nf=1000 nc=5 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X100k_1k_dense C=$BASE/C100k_1k_dense Y=$BASE/y100k_1k_dense YbyC=$BASE/YbyC100k_1k_dense fmt=$FORMAT & pidDense800=$! + wait $pidDense800; ${CMD} -f scripts/extractTestData.dml --args $BASE/X100k_1k_dense $BASE/y100k_1k_dense $BASE/X100k_1k_dense_test $BASE/y100k_1k_dense_test $FORMAT & +fi + +#generate M scenarios (8GB) +if [ $MAXMEM -ge 8000 ]; then + ${CMD} -f ../datagen/genRandData4Kmeans.dml --nvargs nr=1000000 nf=1000 nc=5 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X1M_1k_dense C=$BASE/C1M_1k_dense Y=$BASE/y1M_1k_dense YbyC=$BASE/YbyC1M_1k_dense fmt=$FORMAT & pidDense8000=$! 
+ wait $pidDense8000; ${CMD} -f scripts/extractTestData.dml --args $BASE/X1M_1k_dense $BASE/y1M_1k_dense $BASE/X1M_1k_dense_test $BASE/y1M_1k_dense_test $FORMAT & +fi + +#generate L scenarios (80GB) +if [ $MAXMEM -ge 80000 ]; then + ${CMD} -f ../datagen/genRandData4Kmeans.dml --nvargs nr=10000000 nf=1000 nc=5 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X10M_1k_dense C=$BASE/C10M_1k_dense Y=$BASE/y10M_1k_dense YbyC=$BASE/YbyC10M_1k_dense fmt=$FORMAT + ${CMD} -f scripts/extractTestData.dml --args $BASE/X10M_1k_dense $BASE/y10M_1k_dense $BASE/X10M_1k_dense_test $BASE/y10M_1k_dense_test $FORMAT +fi + +#generate LARGE scenarios (800GB) +if [ $MAXMEM -ge 800000 ]; then + ${CMD} -f ../datagen/genRandData4Kmeans.dml --nvargs nr=100000000 nf=1000 nc=5 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X100M_1k_dense C=$BASE/C100M_1k_dense Y=$BASE/y100M_1k_dense YbyC=$BASE/YbyC100M_1k_dense fmt=$FORMAT + ${CMD} -f scripts/extractTestData.dml --args $BASE/X100M_1k_dense $BASE/y100M_1k_dense $BASE/X100M_1k_dense_test $BASE/y100M_1k_dense_test $FORMAT +fi + +wait \ No newline at end of file diff --git a/scripts/perftest/genDescriptiveStatisticsData.sh b/scripts/perftest/genDescriptiveStatisticsData.sh new file mode 100755 index 0000000..55af5f1 --- /dev/null +++ b/scripts/perftest/genDescriptiveStatisticsData.sh @@ -0,0 +1,60 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi + +CMD=$1 +BASE=$2/bivar +MAXMEM=$3 + +FORMAT="binary" + +c=1000 +nc=100 +mdomain=1100 +set=20 +labelset=10 + +#XS data 10K rows +if [ $MAXMEM -ge 80 ]; then + ${CMD} -f ../datagen/genRandData4DescriptiveStats.dml --explain --stats --nvargs R=10000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_10k/data TYPES=${BASE}/A_10k/types SETSIZE=$set LABELSETSIZE=$labelset TYPES1=${BASE}/A_10k/set1.types TYPES2=${BASE}/A_10k/set2.types INDEX1=${BASE}/A_10k/set1.indices INDEX2=${BASE}/A_10k/set2.indices FMT=$FORMAT & +fi + +#S data 100K rows +if [ $MAXMEM -ge 800 ]; then + ${CMD} -f ../datagen/genRandData4DescriptiveStats.dml --explain --stats --nvargs R=100000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_100k/data TYPES=${BASE}/A_100k/types SETSIZE=$set LABELSETSIZE=$labelset TYPES1=${BASE}/A_100k/set1.types TYPES2=${BASE}/A_100k/set2.types INDEX1=${BASE}/A_100k/set1.indices INDEX2=${BASE}/A_100k/set2.indices FMT=$FORMAT & +fi + +#M data 1M rows +if [ $MAXMEM -ge 8000 ]; then + ${CMD} -f ../datagen/genRandData4DescriptiveStats.dml --explain --stats --nvargs R=1000000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_1M/data TYPES=${BASE}/A_1M/types SETSIZE=$set LABELSETSIZE=$labelset TYPES1=${BASE}/A_1M/set1.types TYPES2=${BASE}/A_1M/set2.types INDEX1=${BASE}/A_1M/set1.indices INDEX2=${BASE}/A_1M/set2.indices FMT=$FORMAT & +fi + +#L data 10M rows +if [ $MAXMEM -ge 80000 ]; then + ${CMD} 
-f ../datagen/genRandData4DescriptiveStats.dml --explain --stats --nvargs R=10000000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_10M/data TYPES=${BASE}/A_10M/types SETSIZE=$set LABELSETSIZE=$labelset TYPES1=${BASE}/A_10M/set1.types TYPES2=${BASE}/A_10M/set2.types INDEX1=${BASE}/A_10M/set1.indices INDEX2=${BASE}/A_10M/set2.indices FMT=$FORMAT +fi + +wait \ No newline at end of file diff --git a/scripts/perftest/todo/genDimensionReductionData.sh b/scripts/perftest/genDimensionReductionData.sh old mode 100644 new mode 100755 similarity index 54% rename from scripts/perftest/todo/genDimensionReductionData.sh rename to scripts/perftest/genDimensionReductionData.sh index 2589c28..5f14654 --- a/scripts/perftest/todo/genDimensionReductionData.sh +++ b/scripts/perftest/genDimensionReductionData.sh @@ -19,29 +19,41 @@ # under the License. # #------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi -if [ "$1" == "" -o "$2" == "" ]; then echo "Usage: $0 <hdfsDataDir> <MR | SPARK | ECHO> e.g. 
$0 perftest SPARK" ; exit 1 ; fi -if [ "$2" == "SPARK" ]; then CMD="./sparkDML.sh "; DASH="-"; elif [ "$2" == "MR" ]; then CMD="hadoop jar SystemDS.jar " ; else CMD="echo " ; fi - - -FORMAT="binary" -BASE=$1/dimensionreduction - -export HADOOP_CLIENT_OPTS="-Xmx2048m -Xms2048m -Xmn256m" +CMD=$1 +BASE=$2/dimensionreduction +MAXMEM=$3 +FORMAT="binary" #generate XS scenarios (80MB) -${CMD} -f ../datagen/genRandData4PCA.dml $DASH-nvargs 5000 2000 $BASE/pcaData5k_2k_dense $FORMAT +if [ $MAXMEM -ge 80 ]; then + ${CMD} -f ../datagen/genRandData4PCA.dml --nvargs R=5000 C=2000 OUT=$BASE/pcaData5k_2k_dense FMT=$FORMAT & +fi #generate S scenarios (800MB) -#${CMD} -f ../datagen/genRandData4PCA.dml $DASH-nvargs 50000 2000 $BASE/pcaData50k_2k_dense $FORMAT +if [ $MAXMEM -ge 800 ]; then + ${CMD} -f ../datagen/genRandData4PCA.dml --nvargs R=50000 C=2000 OUT=$BASE/pcaData50k_2k_dense FMT=$FORMAT & +fi #generate M scenarios (8GB) -#${CMD} -f ../datagen/genRandData4PCA.dml $DASH-nvargs 500000 2000 $BASE/pcaData500k_2k_dense $FORMAT +if [ $MAXMEM -ge 8000 ]; then + ${CMD} -f ../datagen/genRandData4PCA.dml --nvargs R=500000 C=2000 OUT=$BASE/pcaData500k_2k_dense FMT=$FORMAT & +fi #generate L scenarios (80GB) -#${CMD} -f ../datagen/genRandData4PCA.dml $DASH-nvargs 5000000 2000 $BASE/pcaData5M_2k_dense $FORMAT +if [ $MAXMEM -ge 80000 ]; then + ${CMD} -f ../datagen/genRandData4PCA.dml --nvargs R=5000000 C=2000 OUT=$BASE/pcaData5M_2k_dense FMT=$FORMAT +fi #generate XL scenarios (800GB) -#${CMD} -f ../datagen/genRandData4PCA.dml $DASH-nvargs 50000000 2000 $BASE/pcaData50M_2k_dense $FORMAT +if [ $MAXMEM -ge 800000 ]; then + ${CMD} -f ../datagen/genRandData4PCA.dml --nvargs R=50000000 C=2000 OUT=$BASE/pcaData50M_2k_dense FMT=$FORMAT +fi +wait \ No newline at end of file diff --git a/scripts/perftest/genL2SVMData.sh b/scripts/perftest/genL2SVMData.sh index 237de1d..d25e433 100755 --- a/scripts/perftest/genL2SVMData.sh +++ b/scripts/perftest/genL2SVMData.sh @@ -17,6 +17,12 @@ # specific
language governing permissions and limitations # under the License. # +#------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi CMD=$1 DATADIR=$2 diff --git a/scripts/perftest/genMultinomialData.sh b/scripts/perftest/genMultinomialData.sh index 7ea6cad..e7ef109 100755 --- a/scripts/perftest/genMultinomialData.sh +++ b/scripts/perftest/genMultinomialData.sh @@ -19,40 +19,58 @@ # under the License. # #------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi CMD=$1 BASE=$2/multinomial +MAXMEM=$3 FORMAT="binary" DENSE_SP=0.9 SPARSE_SP=0.01 #generate XS scenarios (80MB) -${CMD} -f ./datagen/genRandData4Multinomial.dml $DASH-args 10000 1000 $DENSE_SP 5 0 $BASE/X10k_1k_dense_k5 $BASE/y10k_1k_dense_k5 $FORMAT 1 -${CMD} -f ./datagen/genRandData4Multinomial.dml $DASH-args 10000 1000 $SPARSE_SP 5 0 $BASE/X10k_1k_sparse_k5 $BASE/y10k_1k_sparse_k5 $FORMAT 1 -${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X10k_1k_dense_k5 $BASE/y10k_1k_dense_k5 $BASE/X10k_1k_dense_k5_test $BASE/y10k_1k_dense_k5_test $FORMAT -${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X10k_1k_sparse_k5 $BASE/y10k_1k_sparse_k5 $BASE/X10k_1k_sparse_k5_test $BASE/y10k_1k_sparse_k5_test $FORMAT +if [ $MAXMEM -ge 80 ]; then + ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 10000 1000 $DENSE_SP 5 0 $BASE/X10k_1k_dense_k5 $BASE/y10k_1k_dense_k5 $FORMAT 1 & pidDense80=$! + ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 10000 1000 $SPARSE_SP 5 0 $BASE/X10k_1k_sparse_k5 $BASE/y10k_1k_sparse_k5 $FORMAT 1 & pidSparse80=$! 
+ wait $pidDense80; ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X10k_1k_dense_k5 $BASE/y10k_1k_dense_k5 $BASE/X10k_1k_dense_k5_test $BASE/y10k_1k_dense_k5_test $FORMAT & + wait $pidSparse80; ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X10k_1k_sparse_k5 $BASE/y10k_1k_sparse_k5 $BASE/X10k_1k_sparse_k5_test $BASE/y10k_1k_sparse_k5_test $FORMAT & +fi ##generate S scenarios (800MB) -${CMD} -f ./datagen/genRandData4Multinomial.dml $DASH-args 100000 1000 $DENSE_SP 5 0 $BASE/X100k_1k_dense_k5 $BASE/y100k_1k_dense_k5 $FORMAT 1 -${CMD} -f ./datagen/genRandData4Multinomial.dml $DASH-args 100000 1000 $SPARSE_SP 5 0 $BASE/X100k_1k_sparse_k5 $BASE/y100k_1k_sparse_k5 $FORMAT 1 -${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X100k_1k_dense_k5 $BASE/y100k_1k_dense_k5 $BASE/X100k_1k_dense_k5_test $BASE/y100k_1k_dense_k5_test $FORMAT -${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X100k_1k_sparse_k5 $BASE/y100k_1k_sparse_k5 $BASE/X100k_1k_sparse_k5_test $BASE/y100k_1k_sparse_k5_test $FORMAT +if [ $MAXMEM -ge 800 ]; then + ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 100000 1000 $DENSE_SP 5 0 $BASE/X100k_1k_dense_k5 $BASE/y100k_1k_dense_k5 $FORMAT 1 & pidDense800=$! + ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 100000 1000 $SPARSE_SP 5 0 $BASE/X100k_1k_sparse_k5 $BASE/y100k_1k_sparse_k5 $FORMAT 1 & pidSparse800=$! 
+ wait $pidDense800; ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X100k_1k_dense_k5 $BASE/y100k_1k_dense_k5 $BASE/X100k_1k_dense_k5_test $BASE/y100k_1k_dense_k5_test $FORMAT & + wait $pidSparse800; ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X100k_1k_sparse_k5 $BASE/y100k_1k_sparse_k5 $BASE/X100k_1k_sparse_k5_test $BASE/y100k_1k_sparse_k5_test $FORMAT & +fi ##generate M scenarios (8GB) -${CMD} -f ./datagen/genRandData4Multinomial.dml $DASH-args 1000000 1000 $DENSE_SP 5 0 $BASE/X1M_1k_dense_k5 $BASE/y1M_1k_dense_k5 $FORMAT 1 -${CMD} -f ./datagen/genRandData4Multinomial.dml $DASH-args 1000000 1000 $SPARSE_SP 5 0 $BASE/X1M_1k_sparse_k5 $BASE/y1M_1k_sparse_k5 $FORMAT 1 -${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X1M_1k_dense_k5 $BASE/y1M_1k_dense_k5 $BASE/X1M_1k_dense_k5_test $BASE/y1M_1k_dense_k5_test $FORMAT -${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X1M_1k_sparse_k5 $BASE/y1M_1k_sparse_k5 $BASE/X1M_1k_sparse_k5_test $BASE/y1M_1k_sparse_k5_test $FORMAT +if [ $MAXMEM -ge 8000 ]; then + ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 1000000 1000 $DENSE_SP 5 0 $BASE/X1M_1k_dense_k5 $BASE/y1M_1k_dense_k5 $FORMAT 1 & pidDense8000=$! + ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 1000000 1000 $SPARSE_SP 5 0 $BASE/X1M_1k_sparse_k5 $BASE/y1M_1k_sparse_k5 $FORMAT 1 & pidSparse8000=$! 
+ wait $pidDense8000; ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X1M_1k_dense_k5 $BASE/y1M_1k_dense_k5 $BASE/X1M_1k_dense_k5_test $BASE/y1M_1k_dense_k5_test $FORMAT & + wait $pidSparse8000; ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X1M_1k_sparse_k5 $BASE/y1M_1k_sparse_k5 $BASE/X1M_1k_sparse_k5_test $BASE/y1M_1k_sparse_k5_test $FORMAT & +fi ##generate L scenarios (80GB) -${CMD} -f ./datagen/genRandData4Multinomial.dml $DASH-args 10000000 1000 $DENSE_SP 5 0 $BASE/X10M_1k_dense_k5 $BASE/y10M_1k_dense_k5 $FORMAT 1 -${CMD} -f ./datagen/genRandData4Multinomial.dml $DASH-args 10000000 1000 $SPARSE_SP 5 0 $BASE/X10M_1k_sparse_k5 $BASE/y10M_1k_sparse_k5 $FORMAT 1 -${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X10M_1k_dense_k5 $BASE/y10M_1k_dense_k5 $BASE/X10M_1k_dense_k5_test $BASE/y10M_1k_dense_k5_test $FORMAT -${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X10M_1k_sparse_k5 $BASE/y10M_1k_sparse_k5 $BASE/X10M_1k_sparse_k5_test $BASE/y10M_1k_sparse_k5_test $FORMAT - -##generate LARGE scenarios (800GB) -#${CMD} -f ./datagen/genRandData4Multinomial.dml $DASH-args 100000000 1000 $DENSE_SP 5 0 $BASE/X100M_1k_dense_k5 $BASE/y100M_1k_dense_k5 $FORMAT 1 -#${CMD} -f ./datagen/genRandData4Multinomial.dml $DASH-args 100000000 1000 $SPARSE_SP 5 0 $BASE/X100M_1k_sparse_k5 $BASE/y100M_1k_sparse_k5 $FORMAT 1 -#${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X100M_1k_dense_k5 $BASE/y100M_1k_dense_k5 $BASE/X100M_1k_dense_k5_test $BASE/y100M_1k_dense_k5_test $FORMAT -#${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X100M_1k_sparse_k5 $BASE/y100M_1k_sparse_k5 $BASE/X100M_1k_sparse_k5_test $BASE/y100M_1k_sparse_k5_test $FORMAT +if [ $MAXMEM -ge 80000 ]; then + ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 10000000 1000 $DENSE_SP 5 0 $BASE/X10M_1k_dense_k5 $BASE/y10M_1k_dense_k5 $FORMAT 1 + ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 10000000 1000 $SPARSE_SP 5 0 $BASE/X10M_1k_sparse_k5 $BASE/y10M_1k_sparse_k5 
$FORMAT 1 + ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X10M_1k_dense_k5 $BASE/y10M_1k_dense_k5 $BASE/X10M_1k_dense_k5_test $BASE/y10M_1k_dense_k5_test $FORMAT + ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X10M_1k_sparse_k5 $BASE/y10M_1k_sparse_k5 $BASE/X10M_1k_sparse_k5_test $BASE/y10M_1k_sparse_k5_test $FORMAT +fi + +#generate LARGE scenarios (800GB) +if [ $MAXMEM -ge 800000 ]; then + ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 100000000 1000 $DENSE_SP 5 0 $BASE/X100M_1k_dense_k5 $BASE/y100M_1k_dense_k5 $FORMAT 1 + ${CMD} -f ../datagen/genRandData4Multinomial.dml $DASH-args 100000000 1000 $SPARSE_SP 5 0 $BASE/X100M_1k_sparse_k5 $BASE/y100M_1k_sparse_k5 $FORMAT 1 + ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X100M_1k_dense_k5 $BASE/y100M_1k_dense_k5 $BASE/X100M_1k_dense_k5_test $BASE/y100M_1k_dense_k5_test $FORMAT + ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X100M_1k_sparse_k5 $BASE/y100M_1k_sparse_k5 $BASE/X100M_1k_sparse_k5_test $BASE/y100M_1k_sparse_k5_test $FORMAT +fi + +wait \ No newline at end of file diff --git a/scripts/perftest/genStratStatisticsData.sh b/scripts/perftest/genStratStatisticsData.sh new file mode 100755 index 0000000..7aa18e3 --- /dev/null +++ b/scripts/perftest/genStratStatisticsData.sh @@ -0,0 +1,59 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi + +CMD=$1 +BASE=$2/stratstats +MAXMEM=$3 + +FORMAT="binary" + +#XS data 10K rows +if [ $MAXMEM -ge 80 ]; then + ${CMD} -f ../datagen/genRandData4StratStats.dml --explain --stats --nvargs nr=10000 nf=100 D=${BASE}/A_10k/data Xcid=${BASE}/A_10k/Xcid Ycid=${BASE}/A_10k/Ycid A=${BASE}/A_10k/A fmt=$FORMAT & +fi + +#S data 100K rows +if [ $MAXMEM -ge 800 ]; then + ${CMD} -f ../datagen/genRandData4StratStats.dml --explain --stats --nvargs nr=100000 nf=100 D=${BASE}/A_100k/data Xcid=${BASE}/A_100k/Xcid Ycid=${BASE}/A_100k/Ycid A=${BASE}/A_100k/A fmt=$FORMAT & +fi + +#M data 1M rows +if [ $MAXMEM -ge 8000 ]; then + ${CMD} -f ../datagen/genRandData4StratStats.dml --explain --stats --nvargs nr=1000000 nf=100 D=${BASE}/A_1M/data Xcid=${BASE}/A_1M/Xcid Ycid=${BASE}/A_1M/Ycid A=${BASE}/A_1M/A fmt=$FORMAT & +fi + +#L data 10M rows +if [ $MAXMEM -ge 80000 ]; then + ${CMD} -f ../datagen/genRandData4StratStats.dml --explain --stats --nvargs nr=10000000 nf=100 D=${BASE}/A_10M/data Xcid=${BASE}/A_10M/Xcid Ycid=${BASE}/A_10M/Ycid A=${BASE}/A_10M/A fmt=$FORMAT +fi + +#XL data 100M rows +if [ $MAXMEM -ge 800000 ]; then + ${CMD} -f ../datagen/genRandData4StratStats.dml --explain --stats --nvargs nr=100000000 nf=100 D=${BASE}/A_100M/data Xcid=${BASE}/A_100M/Xcid Ycid=${BASE}/A_100M/Ycid A=${BASE}/A_100M/A fmt=$FORMAT +fi + +wait \ No newline at end of file diff --git a/scripts/perftest/runALS.sh
b/scripts/perftest/runALS_CG.sh similarity index 67% copy from scripts/perftest/runALS.sh copy to scripts/perftest/runALS_CG.sh index 0cb3524..172b566 100755 --- a/scripts/perftest/runALS.sh +++ b/scripts/perftest/runALS_CG.sh @@ -19,10 +19,17 @@ # under the License. # #------------------------------------------------------------- +set -e + +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi X=$1 MAXITER=${2:-100} -DATADIR=${3:-"temp"}/als +DATADIR=${3:-"temp"} CMD=${4:-"systemds"} THRESHOLD=${5:-0.0001} VERBOSE=${6:-FALSE} @@ -33,16 +40,26 @@ err_report() { } trap 'err_report $LINENO' ERR -tstart=$(date +%s.%N) - BASEPATH=$(dirname "$0") tstart=$(date +%s.%N) ${CMD} -f ${BASEPATH}/scripts/alsCG.dml \ --config ${BASEPATH}/conf/SystemDS-config.xml \ - --nvargs X=$X rank=15 reg="L2" lambda=0.000001 maxiter=$MAXITER thr=$THRESHOLD verbose=$VERBOSE modelB=${DATADIR}/B modelM=${DATADIR}/M fmt="csv" + --stats \ + --nvargs X=$X rank=15 reg="L2" lambda=0.000001 maxiter=$MAXITER thr=$THRESHOLD verbose=$VERBOSE modelU=${DATADIR}/U modelV=${DATADIR}/V fmt="csv" + +ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) +echo "ALS-CG algorithm on "$X": "$ttrain >> results/times.txt + + +tstart=$(date +%s.%N) + +${CMD} -f ./scripts/als-predict.dml \ + --config ${BASEPATH}/conf/SystemDS-config.xml \ + --stats \ + --nvargs X=$X Y=${DATADIR}/Y L=${DATADIR}/U R=${DATADIR}/V fmt="csv" -tend=$(echo "$(date +%s.%N) - $tstart - .4" | bc) -echo "ALS-CG algorithm on "$X": "$tend >> results/times.txt +tpredict=$(echo "$(date +%s.%N) - $tstart - .4" | bc) +echo "ALS-CG predict on "$1": "$tpredict >> results/times.txt diff --git a/scripts/perftest/runALS.sh b/scripts/perftest/runALS_DS.sh similarity index 62% rename from scripts/perftest/runALS.sh rename to scripts/perftest/runALS_DS.sh index 0cb3524..0d3bfcf 100755 --- a/scripts/perftest/runALS.sh +++ b/scripts/perftest/runALS_DS.sh @@ -19,10 +19,17 @@ # under the
License. # #------------------------------------------------------------- +set -e + +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi X=$1 MAXITER=${2:-100} -DATADIR=${3:-"temp"}/als +DATADIR=${3:-"temp"} CMD=${4:-"systemds"} THRESHOLD=${5:-0.0001} VERBOSE=${6:-FALSE} @@ -33,16 +40,26 @@ err_report() { } trap 'err_report $LINENO' ERR -tstart=$(date +%s.%N) - BASEPATH=$(dirname "$0") tstart=$(date +%s.%N) -${CMD} -f ${BASEPATH}/scripts/alsCG.dml \ +${CMD} -f ${BASEPATH}/scripts/alsDS.dml \ --config ${BASEPATH}/conf/SystemDS-config.xml \ - --nvargs X=$X rank=15 reg="L2" lambda=0.000001 maxiter=$MAXITER thr=$THRESHOLD verbose=$VERBOSE modelB=${DATADIR}/B modelM=${DATADIR}/M fmt="csv" + --stats \ + --nvargs X=$X rank=15 lambda=0.000001 maxiter=$MAXITER thr=$THRESHOLD verbose=$VERBOSE modelU=${DATADIR}/U modelV=${DATADIR}/V fmt="csv" + +ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) +echo "ALS-DS algorithm on "$X": "$ttrain >> results/times.txt + + +tstart=$(date +%s.%N) + +${CMD} -f ./scripts/als-predict.dml \ + --config conf/SystemDS-config.xml \ + --stats \ + --nvargs X=$X Y=${DATADIR}/Y L=${DATADIR}/U R=${DATADIR}/V fmt="csv" -tend=$(echo "$(date +%s.%N) - $tstart - .4" | bc) -echo "ALS-CG algorithm on "$X": "$tend >> results/times.txt +tpredict=$(echo "$(date +%s.%N) - $tstart - .4" | bc) +echo "ALS-DS predict on "$1": "$tpredict >> results/times.txt diff --git a/scripts/perftest/runAll.sh b/scripts/perftest/runAll.sh index 6b70082..67701a0 100755 --- a/scripts/perftest/runAll.sh +++ b/scripts/perftest/runAll.sh @@ -20,56 +20,75 @@ # #------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi + # Optional argument that can be a folder name for where generated data is stored TEMPFOLDER=$1 if [ "$TEMPFOLDER" == "" ]; then TEMPFOLDER=temp ; fi -# Set properties
-export LOG4JPROP='conf/log4j-off.properties' -export SYSDS_QUIET=1 - # Command to be executed -#CMD="systemds" -CMD="./sparkDML.sh" +CMD="systemds" +# CMD="./sparkDML.sh" + +# Max memory of data to be benchmarked +MAXMEM=80 # Possible values: 80/80MB, 800/800MB, 8000/8000MB/8GB, 80000/80000MB/80GB, 800000/800000MB/800GB +MAXMEM=${MAXMEM%"MB"}; MAXMEM=${MAXMEM/GB/"000"} + +# Set properties +source ./conf/env-variables # Possible lines to initialize Intel MKL, depending on version and install location # . ~/intel/bin/compilervars.sh intel64 # . ~/intel/oneapi/setvars.sh intel64 # . /opt/intel/bin/compilervars.sh intel64 -### Micro Benchmarks: -#./MatrixMult.sh -#./MatrixTranspose.sh - # init time measurement if [ ! -d logs ]; then mkdir -p logs ; fi if [ ! -d results ]; then mkdir -p results ; fi -if [ ! -d results ]; then mkdir -p results ; fi +if [ ! -d temp ]; then mkdir -p temp ; fi date >> results/times.txt ### Data Generation -echo "-- Generating binomial data: " >> results/times.txt; -./genBinomialData.sh ${CMD} ${TEMPFOLDER} &>> logs/genBinomialData.out -echo "-- Generating multinomial data." >> results/times.txt; -./genMultinomialData.sh ${CMD} ${TEMPFOLDER} &>> logs/genMultinomialData.out +echo "-- Generating binomial data..." >> results/times.txt; +./genBinomialData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genBinomialData.out +echo "-- Generating multinomial data..." >> results/times.txt; +./genMultinomialData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genMultinomialData.out +echo "-- Generating stats data..." >> results/times.txt; +./genDescriptiveStatisticsData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genStatsData.out +./genStratStatisticsData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genStratStatsData.out +echo "-- Generating clustering data..." >> results/times.txt; +./genClusteringData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genClusteringData.out +echo "-- Generating Dimension Reduction data." 
>> results/times.txt; +./genDimensionReductionData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genDimensionReductionData.out +echo "-- Generating ALS data." >> results/times.txt; +./genALSData.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} &> logs/genALSData.out -### Algorithms Benchmarks: -./runAllBinomial.sh $CMD $TEMPFOLDER -./runAllMultinomial.sh $CMD $TEMPFOLDER -./runAllRegression.sh $CMD $TEMPFOLDER -./fed/runAllFed.sh $CMD $TEMPFOLDER +### Micro Benchmarks: +./MatrixMult.sh ${CMD} +./MatrixTranspose.sh ${CMD} + +# Federate benchmark +#./fed/runAllFed.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} -# TODO The following commented benchmarks have yet to be cleaned up and ported from perftestDeprecated to perftest -#./runAllStats.sh $CMD $TEMPFOLDER -#./runAllClustering.sh $CMD $TEMPFOLDER +### Algorithms Benchmarks: +./runAllBinomial.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} +./runAllMultinomial.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} +./runAllRegression.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} +./runAllStats.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} +./runAllClustering.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} +./runAllDimensionReduction.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} +./runAllALS.sh ${CMD} ${TEMPFOLDER} ${MAXMEM} +# TODO The following benchmarks have yet to be written. The decision tree algorithms additionally need to be fixed. 
# add stepwise Linear # add stepwise GLM -#./runAllTrees $CMD $TEMPFOLDER +#./runAllTrees.sh $CMD $TEMPFOLDER # add randomForest -#./runAllDimensionReduction $CMD $TEMPFOLDER -#./runAllMatrixFactorization $CMD $TEMPFOLDER -#ALS -#./runAllSurvival $CMD $TEMPFOLDER +#./runAllMatrixFactorization.sh $CMD $TEMPFOLDER +#./runAllSurvival.sh $CMD $TEMPFOLDER #KaplanMeier #Cox diff --git a/scripts/perftest/runAllMultinomial.sh b/scripts/perftest/runAllALS.sh similarity index 55% copy from scripts/perftest/runAllMultinomial.sh copy to scripts/perftest/runAllALS.sh index 4df9931..b0ac290 100755 --- a/scripts/perftest/runAllMultinomial.sh +++ b/scripts/perftest/runAllALS.sh @@ -8,9 +8,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -19,14 +19,16 @@ # under the License. 
# #------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi -COMMAND=$1 -TEMPFOLDER=$2 -if [ "$TEMPFOLDER" == "" ]; then TEMPFOLDER=temp ; fi - -BASE=${TEMPFOLDER}/multinomial -BASE0=${TEMPFOLDER}/binomial -MAXITR=20 +CMD=${1:-"systemds"} +DATADIR=${2:-"temp"}/als +MAXMEM=$3 +MAXITR=${4:-100} FILENAME=$0 err_report() { @@ -34,22 +36,20 @@ err_report() { } trap 'err_report $LINENO' ERR -echo " RUN MULTINOMIAL EXPERIMENTS: "$(date) >> results/times.txt; +DATA=() +if [ $MAXMEM -ge 80 ]; then DATA+=("10k_1k_dense" "10k_1k_sparse"); fi +if [ $MAXMEM -ge 800 ]; then DATA+=("100k_1k_dense" "100k_1k_sparse"); fi +if [ $MAXMEM -ge 8000 ]; then DATA+=("1M_1k_dense" "1M_1k_sparse"); fi +if [ $MAXMEM -ge 80000 ]; then DATA+=("10M_1k_dense" "10M_1k_sparse"); fi +if [ $MAXMEM -ge 800000 ]; then DATA+=("100M_1k_dense" "100M_1k_sparse"); fi -# run all classifiers with binomial labels on all datasets -# see genMultinomialData -for d in "10k_1k_dense" "10k_1k_sparse" "100k_1k_dense" "100k_1k_sparse" "1M_1k_dense" "1M_1k_sparse" "10M_1k_dense" "10M_1k_sparse" #"100M_1k_dense" "100M_1k_sparse" -do - for f in "runNaiveBayes" - do - echo "-- Running "$f" on "$d" (all configs)" >> results/times.txt; - ./${f}.sh ${BASE}/X${d}_k5 ${BASE}/y${d}_k5 5 ${BASE} ${COMMAND} &> logs/${f}_${d}_k5.out; - done +echo "RUN ALS EXPERIMENTS: " $(date) >> results/times.txt; - # run with the parameter setting maximum of iterations - for f in "runMultiLogReg" "runMSVM" +for d in ${DATA[@]} +do + for f in "runALS_CG" "runALS_DS" do echo "-- Running "$f" on "$d" (all configs)" >> results/times.txt; - ./${f}.sh ${BASE}/X${d}_k5 ${BASE}/y${d}_k5 5 ${BASE} ${MAXITR} ${COMMAND} &> logs/${f}_${d}_k5.out; + ./${f}.sh ${DATADIR}/X${d} $MAXITR $DATADIR ${CMD} 0.001 FALSE &> logs/${f}_${d}.out; done done diff --git a/scripts/perftest/runAllBinomial.sh b/scripts/perftest/runAllBinomial.sh index 
65f5734..a40c6a7 100755 --- a/scripts/perftest/runAllBinomial.sh +++ b/scripts/perftest/runAllBinomial.sh @@ -19,9 +19,15 @@ # under the License. # #------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi COMMAND=$1 TEMPFOLDER=$2 +MAXMEM=$3 BASE=${TEMPFOLDER}/binomial MAXITR=20 @@ -32,11 +38,18 @@ err_report() { } trap 'err_report $LINENO' ERR +DATA=() +if [ $MAXMEM -ge 80 ]; then DATA+=("10k_1k_dense" "10k_1k_sparse"); fi +if [ $MAXMEM -ge 800 ]; then DATA+=("100k_1k_dense" "100k_1k_sparse"); fi +if [ $MAXMEM -ge 8000 ]; then DATA+=("1M_1k_dense" "1M_1k_sparse"); fi +if [ $MAXMEM -ge 80000 ]; then DATA+=("10M_1k_dense" "10M_1k_sparse"); fi +if [ $MAXMEM -ge 800000 ]; then DATA+=("100M_1k_dense" "100M_1k_sparse"); fi + echo "RUN BINOMIAL EXPERIMENTS: "$(date) >> results/times.txt; # run all classifiers with binomial labels on all datasets # see genBinomialData -for d in "10k_1k_dense" "10k_1k_sparse" "100k_1k_dense" "100k_1k_sparse" "1M_1k_dense" "1M_1k_sparse" "10M_1k_dense" "10M_1k_sparse" #"_KDD" "100M_1k_dense" "100M_1k_sparse" +for d in ${DATA[@]} #"_KDD" do for f in "runMultiLogReg" "runL2SVM" "runMSVM" do diff --git a/scripts/perftest/todo/runAllClustering.sh b/scripts/perftest/runAllClustering.sh old mode 100644 new mode 100755 similarity index 60% rename from scripts/perftest/todo/runAllClustering.sh rename to scripts/perftest/runAllClustering.sh index 0d5a533..a5a5a22 --- a/scripts/perftest/todo/runAllClustering.sh +++ b/scripts/perftest/runAllClustering.sh @@ -19,8 +19,18 @@ # under the License. # #------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi -if [ "$1" == "" -o "$2" == "" ]; then echo "Usage: $0 <hdfsDataDir> <MR | SPARK | ECHO> e.g. 
$0 perftest SPARK" ; exit 1 ; fi +COMMAND=$1 +TEMPFOLDER=$2 +MAXMEM=$3 + +BASE=${TEMPFOLDER}/clustering +MAXITR=20 FILENAME=$0 err_report() { @@ -28,21 +38,18 @@ err_report() { } trap 'err_report $LINENO' ERR -BASE=$1/clustering - -echo $2" RUN CLUSTERING EXPERIMENTS: " $(date) >> times.txt; +DATA=() +if [ $MAXMEM -ge 80 ]; then DATA+=("10k_1k_dense"); fi +if [ $MAXMEM -ge 800 ]; then DATA+=("100k_1k_dense"); fi +if [ $MAXMEM -ge 8000 ]; then DATA+=("1M_1k_dense"); fi +if [ $MAXMEM -ge 80000 ]; then DATA+=("10M_1k_dense"); fi +if [ $MAXMEM -ge 800000 ]; then DATA+=("100M_1k_dense"); fi -if [ ! -d logs ]; then mkdir logs ; fi - -# data generation -echo "-- Using cluster data." >> times.txt; -./genClusteringData.sh $1 $2 &>> logs/genClusteringData.out +echo "RUN CLUSTERING EXPERIMENTS: " $(date) >> results/times.txt; # run all clustering algorithms on all datasets -MAXITR=20 -for d in "10k_1k_dense" #"100k_1k_dense" "1M_1k_dense" #"10M_1k_dense" #"100M_1k_dense" -do - echo "-- Running Kmeans on "$d >> times.txt; - ./runKmeans.sh ${BASE}/X${d} ${MAXITR} ${BASE} $2 &> logs/runKmeans_${d}.out; - +for d in ${DATA[@]} +do + echo "-- Running Kmeans on "$d >> results/times.txt; + ./runKmeans.sh ${BASE}/X${d} ${MAXITR} ${BASE} ${COMMAND} &> logs/runKmeans_${d}.out; done diff --git a/scripts/perftest/todo/runAllDimensionReduction.sh b/scripts/perftest/runAllDimensionReduction.sh old mode 100644 new mode 100755 similarity index 61% rename from scripts/perftest/todo/runAllDimensionReduction.sh rename to scripts/perftest/runAllDimensionReduction.sh index b845666..fb13e44 --- a/scripts/perftest/todo/runAllDimensionReduction.sh +++ b/scripts/perftest/runAllDimensionReduction.sh @@ -19,8 +19,15 @@ # under the License. 
# #------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi -if [ "$1" == "" -o "$2" == "" ]; then echo "Usage: $0 <hdfsDataDir> <MR | SPARK | ECHO> e.g. $0 perftest SPARK" ; exit 1 ; fi +COMMAND=$1 +BASE=$2/dimensionreduction +MAXMEM=$3 FILENAME=$0 err_report() { @@ -28,20 +35,19 @@ err_report() { } trap 'err_report $LINENO' ERR -BASE=$1/dimensionreduction +DATA=() +if [ $MAXMEM -ge 80 ]; then DATA+=("5k_2k_dense"); fi +if [ $MAXMEM -ge 800 ]; then DATA+=("50k_2k_dense"); fi +if [ $MAXMEM -ge 8000 ]; then DATA+=("500k_2k_dense"); fi +if [ $MAXMEM -ge 80000 ]; then DATA+=("5M_2k_dense"); fi +if [ $MAXMEM -ge 800000 ]; then DATA+=("50M_2k_dense"); fi -echo $2" RUN DIMENSION REDUCTION EXPERIMENTS: " $(date) >> times.txt; - -if [ ! -d logs ]; then mkdir logs ; fi - -# data generation -echo "-- Using Dimension Reduction data." >> times.txt; -./genDimensionReductionData.sh $1 $2 &>> logs/genDimensionReductionData.out +echo "RUN DIMENSION REDUCTION EXPERIMENTS: " $(date) >> results/times.txt; # run all dimension reduction algorithms on all datasets -for d in "5k_2k_dense" #"50k_2k_dense" "500k_2k_dense" "5M_2k_dense" "50M_2k_dense" +for d in ${DATA[@]} do - echo "-- Running Dimension Reduction on "$d >> times.txt; - ./runPCA.sh pcaData${d} ${BASE} $2 &> logs/runPCA_${d}.out; + echo "-- Running Dimension Reduction on "$d >> results/times.txt; + ./runPCA.sh ${BASE}/pcaData${d} ${BASE} ${COMMAND} &> logs/runPCA_${d}.out; done diff --git a/scripts/perftest/runAllMultinomial.sh b/scripts/perftest/runAllMultinomial.sh index 4df9931..d55a0b7 100755 --- a/scripts/perftest/runAllMultinomial.sh +++ b/scripts/perftest/runAllMultinomial.sh @@ -19,11 +19,17 @@ # under the License. 
# #------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi COMMAND=$1 TEMPFOLDER=$2 -if [ "$TEMPFOLDER" == "" ]; then TEMPFOLDER=temp ; fi +MAXMEM=$3 +if [ "$TEMPFOLDER" == "" ]; then TEMPFOLDER=temp ; fi BASE=${TEMPFOLDER}/multinomial BASE0=${TEMPFOLDER}/binomial MAXITR=20 @@ -34,11 +40,18 @@ err_report() { } trap 'err_report $LINENO' ERR +DATA=() +if [ $MAXMEM -ge 80 ]; then DATA+=("10k_1k_dense" "10k_1k_sparse"); fi +if [ $MAXMEM -ge 800 ]; then DATA+=("100k_1k_dense" "100k_1k_sparse"); fi +if [ $MAXMEM -ge 8000 ]; then DATA+=("1M_1k_dense" "1M_1k_sparse"); fi +if [ $MAXMEM -ge 80000 ]; then DATA+=("10M_1k_dense" "10M_1k_sparse"); fi +if [ $MAXMEM -ge 800000 ]; then DATA+=("100M_1k_dense" "100M_1k_sparse"); fi + echo " RUN MULTINOMIAL EXPERIMENTS: "$(date) >> results/times.txt; # run all classifiers with binomial labels on all datasets # see genMultinomialData -for d in "10k_1k_dense" "10k_1k_sparse" "100k_1k_dense" "100k_1k_sparse" "1M_1k_dense" "1M_1k_sparse" "10M_1k_dense" "10M_1k_sparse" #"100M_1k_dense" "100M_1k_sparse" +for d in ${DATA[@]} do for f in "runNaiveBayes" do diff --git a/scripts/perftest/runAllRegression.sh b/scripts/perftest/runAllRegression.sh index 1322560..73fe7da 100755 --- a/scripts/perftest/runAllRegression.sh +++ b/scripts/perftest/runAllRegression.sh @@ -19,11 +19,17 @@ # under the License. 
# #------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi COMMAND=$1 TEMPFOLDER=$2 -if [ "$TEMPFOLDER" == "" ]; then TEMPFOLDER=temp ; fi +MAXMEM=$3 +if [ "$TEMPFOLDER" == "" ]; then TEMPFOLDER=temp ; fi BASE=${TEMPFOLDER}/binomial MAXITR=20 @@ -33,11 +39,18 @@ err_report() { } trap 'err_report $LINENO' ERR +DATA=() +if [ $MAXMEM -ge 80 ]; then DATA+=("10k_1k_dense" "10k_1k_sparse"); fi +if [ $MAXMEM -ge 800 ]; then DATA+=("100k_1k_dense" "100k_1k_sparse"); fi +if [ $MAXMEM -ge 8000 ]; then DATA+=("1M_1k_dense" "1M_1k_sparse"); fi +if [ $MAXMEM -ge 80000 ]; then DATA+=("10M_1k_dense" "10M_1k_sparse"); fi +if [ $MAXMEM -ge 800000 ]; then DATA+=("100M_1k_dense" "100M_1k_sparse"); fi + echo "RUN REGRESSION EXPERIMENTS" $(date) >> results/times.txt; # run all regression algorithms with binomial labels on all datasets # see genBinomialData -for d in "10k_1k_dense" "10k_1k_sparse" "100k_1k_dense" "100k_1k_sparse" "1M_1k_dense" "1M_1k_sparse" "10M_1k_dense" "10M_1k_sparse" #"_KDD" "100M_1k_dense" "100M_1k_sparse" +for d in ${DATA[@]} #"_KDD" do # ------------------------------------------------------------------------------------------------------------------- diff --git a/scripts/perftest/todo/runAllStats.sh b/scripts/perftest/runAllStats.sh old mode 100644 new mode 100755 similarity index 59% rename from scripts/perftest/todo/runAllStats.sh rename to scripts/perftest/runAllStats.sh index 225316d..d8f1314 --- a/scripts/perftest/todo/runAllStats.sh +++ b/scripts/perftest/runAllStats.sh @@ -19,8 +19,18 @@ # under the License. # #------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi -if [ "$1" == "" -o "$2" == "" ]; then echo "Usage: $0 <hdfsDataDir> <MR | SPARK | ECHO> e.g. 
$0 perftest SPARK" ; exit 1 ; fi +COMMAND=$1 +TEMPFOLDER=$2 +MAXMEM=$3 + +BASE2=${TEMPFOLDER}/bivar +BASE3=${TEMPFOLDER}/stratstats FILENAME=$0 err_report() { @@ -28,29 +38,24 @@ err_report() { } trap 'err_report $LINENO' ERR -BASE2=$1/bivar -BASE3=$1/stratstats - -echo $2" RUN DESCRIPTIVE STATISTICS EXPERIMENTS: " $(date) >> times.txt; - -if [ ! -d logs ]; then mkdir logs ; fi +DATA=() +if [ $MAXMEM -ge 80 ]; then DATA+=("A_10k"); fi +if [ $MAXMEM -ge 800 ]; then DATA+=("A_100k"); fi +if [ $MAXMEM -ge 8000 ]; then DATA+=("A_1M"); fi +if [ $MAXMEM -ge 80000 ]; then DATA+=("A_10M"); fi -# data generation -echo "-- Generating stats data: " >> times.txt; -#OLD ./genStatsData.sh &>> logs/genStatsData.out -./genDescriptiveStatisticsData.sh $1 $2 &>> logs/genStatsData.out -./genStratStatisticsData.sh $1 $2 &>> logs/genStratStatsData.out +echo "RUN DESCRIPTIVE STATISTICS EXPERIMENTS: " $(date) >> results/times.txt; # run all descriptive statistics on all datasets -for d in "A_10k" # "A_100k" "A_1M" "A_10M" #"census" +for d in ${DATA[@]} #"census" do - echo "-- Running runUnivarStats on "$d"" >> times.txt; - ./runUnivarStats.sh ${BASE2}/${d}/data ${BASE2}/${d}/types ${BASE2} $2 &>> logs/runUnivar-Stats_${d}.out; + echo "-- Running runUnivarStats on "$d >> results/times.txt; + ./runUnivarStats.sh ${BASE2}/${d}/data ${BASE2}/${d}/types ${BASE2} ${COMMAND} &> logs/runUnivar-Stats_${d}.out; - echo "-- Running runBivarStats on "$d"" >> times.txt; - ./runBivarStats.sh ${BASE2}/${d}/data ${BASE2}/${d}/set1.indices ${BASE2}/${d}/set2.indices ${BASE2}/${d}/set1.types ${BASE2}/${d}/set2.types ${BASE2} $2 &>> logs/runbivar-stats_${d}.out; + echo "-- Running runBivarStats on "$d >> results/times.txt; + ./runBivarStats.sh ${BASE2}/${d}/data ${BASE2}/${d}/set1.indices ${BASE2}/${d}/set2.indices ${BASE2}/${d}/set1.types ${BASE2}/${d}/set2.types ${BASE2} ${COMMAND} &> logs/runBivar-stats_${d}.out; - echo "-- Running runStratStats on "$d"" >> times.txt; - ./runStratStats.sh 
${BASE3}/${d}/data ${BASE3}/${d}/Xcid ${BASE3}/${d}/Ycid ${BASE3} $2 &> logs/runstrats-stats_${d}.out; + echo "-- Running runStratStats on "$d >> results/times.txt; + ./runStratStats.sh ${BASE3}/${d}/data ${BASE3}/${d}/Xcid ${BASE3}/${d}/Ycid ${BASE3} ${COMMAND} &> logs/runStrats-stats_${d}.out; done diff --git a/scripts/perftest/todo/runBivarStats.sh b/scripts/perftest/runBivarStats.sh old mode 100644 new mode 100755 similarity index 68% rename from scripts/perftest/todo/runBivarStats.sh rename to scripts/perftest/runBivarStats.sh index 9761610..b4b8572 --- a/scripts/perftest/todo/runBivarStats.sh +++ b/scripts/perftest/runBivarStats.sh @@ -21,16 +21,25 @@ #------------------------------------------------------------- set -e -if [ "$7" == "SPARK" ]; then CMD="./sparkDML.sh "; DASH="-"; elif [ "$7" == "MR" ]; then CMD="hadoop jar SystemDS.jar " ; else CMD="echo " ; fi +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi +CMD=$7 BASE=$6 -export HADOOP_CLIENT_OPTS="-Xmx2048m -Xms2048m -Xmn256m" echo "running Bivar-Stats" -tstart=$SECONDS -${CMD} -f ../algorithms/bivar-stats.dml $DASH-explain $DASH-stats $DASH-nvargs X=$1 index1=$2 index2=$3 types1=$4 types2=$5 OUTDIR=${BASE}/stats/b -ttrain=$(($SECONDS - $tstart - 3)) -echo "BivariateStatistics on "$1": "$ttrain >> times.txt +tstart=$(date +%s.%N) + +${CMD} -f ./scripts/bivar-stats.dml \ + --config conf/SystemDS-config.xml \ + --stats \ + --nvargs X=$1 index1=$2 index2=$3 types1=$4 types2=$5 OUTDIR=${BASE}/stats/b + +ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) +echo "BivariateStatistics on "$1": "$ttrain >> results/times.txt diff --git a/scripts/perftest/runGLM_binomial_probit.sh b/scripts/perftest/runGLM_binomial_probit.sh index e37872a..f2affee 100755 --- a/scripts/perftest/runGLM_binomial_probit.sh +++ b/scripts/perftest/runGLM_binomial_probit.sh @@ -21,6 +21,12 @@ #------------------------------------------------------------- set -e +if [ 
"$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi + CMD=$5 BASE=$3 @@ -30,7 +36,6 @@ for i in 0 1 2; do #training tstart=$(date +%s.%N) - # ${CMD} -f ./algorithms/GLM.dml \ ${CMD} -f scripts/GLM.dml \ --config conf/SystemDS-config.xml \ --stats \ diff --git a/scripts/perftest/runGLM_gamma_log.sh b/scripts/perftest/runGLM_gamma_log.sh index 6308a50..09bb753 100755 --- a/scripts/perftest/runGLM_gamma_log.sh +++ b/scripts/perftest/runGLM_gamma_log.sh @@ -21,6 +21,12 @@ #------------------------------------------------------------- set -e +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi + CMD=$5 BASE=$3 @@ -30,7 +36,6 @@ for i in 0 1 2; do #training tstart=$(date +%s.%N) - #${CMD} -f ./algorithms/GLM.dml \ ${CMD} -f scripts/GLM.dml \ --config conf/SystemDS-config.xml \ --stats \ diff --git a/scripts/perftest/runGLM_poisson_log.sh b/scripts/perftest/runGLM_poisson_log.sh index 698ca65..adf2cdf 100755 --- a/scripts/perftest/runGLM_poisson_log.sh +++ b/scripts/perftest/runGLM_poisson_log.sh @@ -21,6 +21,12 @@ #------------------------------------------------------------- set -e +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi + CMD=$5 BASE=$3 @@ -30,7 +36,6 @@ for i in 0 1 2; do #training tstart=$(date +%s.%N) - #${CMD} -f ./algorithms/GLM.dml \ ${CMD} -f scripts/GLM.dml \ --config conf/SystemDS-config.xml \ --stats \ diff --git a/scripts/perftest/runNaiveBayes.sh b/scripts/perftest/runKmeans.sh similarity index 61% copy from scripts/perftest/runNaiveBayes.sh copy to scripts/perftest/runKmeans.sh index f4931db..853e664 100755 --- a/scripts/perftest/runNaiveBayes.sh +++ b/scripts/perftest/runKmeans.sh @@ -21,27 +21,31 @@ #------------------------------------------------------------- set -e -CMD=$5 -BASE=$4 +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please 
execute scripts from directory 'perftest'" + exit 1; +fi + +CMD=$4 +BASE=$3 #training tstart=$(date +%s.%N) -#${CMD} -f ./algorithms/naive-bayes.dml \ -${CMD} -f scripts/naive-bayes.dml \ - --config conf/SystemDS-config.xml \ - --stats \ - --nvargs X=$1 Y=$2 prior=${BASE}/prior conditionals=${BASE}/conditionals fmt="csv" +${CMD} -f ./scripts/Kmeans.dml \ + --config conf/SystemDS-config.xml \ + --stats \ + --nvargs X=$1 k=5 C=${BASE}/centroids.mtx maxi=$2 tol=0.0001 prY=${BASE}/prY_implicit.mtx ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) -echo "NaiveBayes train on "$1": "$ttrain >> results/times.txt +echo "Kmeans train on "$1": "$ttrain >> results/times.txt #predict tstart=$(date +%s.%N) -#${CMD} -f ./algorithms/naive-bayes-predict.dml \ -${CMD} -f scripts/naive-bayes-predict.dml \ - --config conf/SystemDS-config.xml \ - --stats \ - --nvargs X=$1_test Y=$2_test prior=${BASE}/prior conditionals=${BASE}/conditionals fmt="csv" probabilities=${BASE}/probabilities #accuracy=${BASE}/accuracy confusion=${BASE}/confusion +${CMD} -f ./scripts/Kmeans-predict.dml \ + --config conf/SystemDS-config.xml \ + --stats \ + --nvargs X=$1 C=${BASE}/centroids.mtx prY=${BASE}/prY.mtx tpredict=$(echo "$(date +%s.%N) - $tstart - .4" | bc) -echo "NaiveBayes predict on "$1": "$tpredict >> results/times.txt +echo "Kmeans predict on "$1": "$tpredict >> results/times.txt diff --git a/scripts/perftest/runL2SVM.sh b/scripts/perftest/runL2SVM.sh index 6c0ffd1..b7ddb64 100755 --- a/scripts/perftest/runL2SVM.sh +++ b/scripts/perftest/runL2SVM.sh @@ -21,6 +21,12 @@ #------------------------------------------------------------- set -e +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi + CMD=$6 BASE=$4 RUNPrediction=${7:-true} diff --git a/scripts/perftest/runLinearRegCG.sh b/scripts/perftest/runLinearRegCG.sh index e3c36b6..487bd09 100755 --- a/scripts/perftest/runLinearRegCG.sh +++ b/scripts/perftest/runLinearRegCG.sh @@ 
-21,6 +21,12 @@ #------------------------------------------------------------- set -e +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi + CMD=$5 BASE=$3 @@ -31,7 +37,6 @@ do #training tstart=$(date +%s.%N) - #${CMD} -f ./algorithms/LinearRegCG.dml \ ${CMD} -f scripts/LinearRegCG.dml \ --config conf/SystemDS-config.xml \ --stats \ diff --git a/scripts/perftest/runLinearRegDS.sh b/scripts/perftest/runLinearRegDS.sh index b285aff..c6d24fd 100755 --- a/scripts/perftest/runLinearRegDS.sh +++ b/scripts/perftest/runLinearRegDS.sh @@ -21,6 +21,12 @@ #------------------------------------------------------------- set -e +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi + CMD=$4 BASE=$3 @@ -31,7 +37,6 @@ do #training tstart=$(date +%s.%N) - #${CMD} -f ./algorithms/LinearRegDS.dml \ ${CMD} -f scripts/LinearRegDS.dml \ --config conf/SystemDS-config.xml \ --stats \ diff --git a/scripts/perftest/runMSVM.sh b/scripts/perftest/runMSVM.sh index 8cabc4d..97be13d 100755 --- a/scripts/perftest/runMSVM.sh +++ b/scripts/perftest/runMSVM.sh @@ -21,6 +21,12 @@ #------------------------------------------------------------- set -e +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi + CMD=$6 BASE=$4 @@ -28,7 +34,6 @@ BASE=$4 for i in 0 1; do #training tstart=$(date +%s.%N) - # ${CMD} -f ./algorithms/m-svm.dml \ ${CMD} -f scripts/m-svm.dml \ --config conf/SystemDS-config.xml \ --stats \ @@ -39,7 +44,6 @@ for i in 0 1; do #predict tstart=$(date +%s.%N) - #${CMD} -f ./algorithms/m-svm-predict.dml \ ${CMD} -f scripts/m-svm-predict.dml \ --config conf/SystemDS-config.xml \ --stats \ diff --git a/scripts/perftest/runMultiLogReg.sh b/scripts/perftest/runMultiLogReg.sh index b5503df..783e330 100755 --- a/scripts/perftest/runMultiLogReg.sh +++ b/scripts/perftest/runMultiLogReg.sh @@ -21,6 +21,12 
@@ #------------------------------------------------------------- set -e +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi + CMD=$6 BASE=$4 @@ -31,7 +37,6 @@ if [ $3 -gt 2 ]; then DFAM=3; fi for i in 0 1 2; do #training tstart=$(date +%s.%N) - # ${CMD} -f ./algorithms/MultiLogReg.dml \ ${CMD} -f scripts/MultiLogReg.dml \ --config conf/SystemDS-config.xml \ --stats \ diff --git a/scripts/perftest/runNaiveBayes.sh b/scripts/perftest/runNaiveBayes.sh index f4931db..6b3de28 100755 --- a/scripts/perftest/runNaiveBayes.sh +++ b/scripts/perftest/runNaiveBayes.sh @@ -21,12 +21,17 @@ #------------------------------------------------------------- set -e +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi + CMD=$5 BASE=$4 #training tstart=$(date +%s.%N) -#${CMD} -f ./algorithms/naive-bayes.dml \ ${CMD} -f scripts/naive-bayes.dml \ --config conf/SystemDS-config.xml \ --stats \ @@ -37,7 +42,6 @@ echo "NaiveBayes train on "$1": "$ttrain >> results/times.txt #predict tstart=$(date +%s.%N) -#${CMD} -f ./algorithms/naive-bayes-predict.dml \ ${CMD} -f scripts/naive-bayes-predict.dml \ --config conf/SystemDS-config.xml \ --stats \ diff --git a/scripts/perftest/todo/runPCA.sh b/scripts/perftest/runPCA.sh old mode 100644 new mode 100755 similarity index 69% rename from scripts/perftest/todo/runPCA.sh rename to scripts/perftest/runPCA.sh index e47050e..66fd356 --- a/scripts/perftest/todo/runPCA.sh +++ b/scripts/perftest/runPCA.sh @@ -21,14 +21,23 @@ #------------------------------------------------------------- set -e -if [ "$3" == "SPARK" ]; then CMD="./sparkDML.sh "; DASH="-"; elif [ "$3" == "MR" ]; then CMD="hadoop jar SystemDS.jar " ; else CMD="echo " ; fi +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi +CMD=$3 BASE=$2 -export HADOOP_CLIENT_OPTS="-Xmx2048m -Xms2048m -Xmn256m" 
+tstart=$(date +%s.%N) -tstart=$SECONDS -${CMD} -f ../algorithms/PCA.dml $DASH-explain $DASH-stats $DASH-nvargs INPUT=$1 SCALE=1 PROJDATA=1 OUTPUT=${BASE}/output -ttrain=$(($SECONDS - $tstart - 3)) -echo "PCA on "$1": "$ttrain >> times.txt +# ${CMD} -f ../algorithms/PCA.dml \ +${CMD} -f ./scripts/PCA.dml \ + --config conf/SystemDS-config.xml \ + --stats \ + --nvargs INPUT=$1 SCALE=1 PROJDATA=1 OUTPUT=${BASE}/output + +ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) +echo "PCA on "$1": "$ttrain >> results/times.txt diff --git a/scripts/perftest/todo/runStratStats.sh b/scripts/perftest/runStratStats.sh old mode 100644 new mode 100755 similarity index 68% rename from scripts/perftest/todo/runStratStats.sh rename to scripts/perftest/runStratStats.sh index fc05a56..2778d31 --- a/scripts/perftest/todo/runStratStats.sh +++ b/scripts/perftest/runStratStats.sh @@ -21,13 +21,23 @@ #------------------------------------------------------------- set -e -if [ "$5" == "SPARK" ]; then CMD="./sparkDML.sh "; DASH="-"; elif [ "$5" == "MR" ]; then CMD="hadoop jar SystemDS.jar " ; else CMD="echo " ; fi +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi +CMD=$5 BASE=$4 -export HADOOP_CLIENT_OPTS="-Xmx2048m -Xms2048m -Xmn256m" echo "running stratstats" -tstart=$SECONDS -${CMD} -f ../algorithms/stratstats.dml $DASH-explain $DASH-stats $DASH-nvargs X=$1 Xcid=$2 Ycid=$3 O=${BASE}/STATS/s fmt=csv -ttrain=$(($SECONDS - $tstart - 3)) -echo "StatifiedStatistics on "$1": "$ttrain >> times.txt +tstart=$(date +%s.%N) + +#${CMD} -f ../algorithms/stratstats.dml \ +${CMD} -f ./scripts/stratstats.dml \ + --config conf/SystemDS-config.xml \ + --stats \ + --nvargs X=$1 Xcid=$2 Ycid=$3 O=${BASE}/STATS/s fmt=csv + +ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) +echo "StratifiedStatistics on "$1": "$ttrain >> results/times.txt diff --git a/scripts/perftest/todo/runUnivarStats.sh b/scripts/perftest/runUnivarStats.sh old mode 
100644 new mode 100755 similarity index 68% rename from scripts/perftest/todo/runUnivarStats.sh rename to scripts/perftest/runUnivarStats.sh index 08fe395..3f0ec81 --- a/scripts/perftest/todo/runUnivarStats.sh +++ b/scripts/perftest/runUnivarStats.sh @@ -21,14 +21,23 @@ #------------------------------------------------------------- set -e -if [ "$4" == "SPARK" ]; then CMD="./sparkDML.sh "; DASH="-"; elif [ "$4" == "MR" ]; then CMD="hadoop jar SystemDS.jar " ; else CMD="echo " ; fi +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi +CMD=$4 BASE=$3 -export HADOOP_CLIENT_OPTS="-Xmx2048m -Xms2048m -Xmn256m" - echo "running Univar-Stats" -tstart=$SECONDS -${CMD} -f ../algorithms/Univar-Stats.dml $DASH-explain $DASH-stats $DASH-nvargs X=$1 TYPES=$2 STATS=${BASE}/stats/u -ttrain=$(($SECONDS - $tstart - 3)) -echo "UnivariateStatistics on "$1": "$ttrain >> times.txt +tstart=$(date +%s.%N) + +# ${CMD} -f ../algorithms/Univar-Stats.dml \ +${CMD} -f ./scripts/Univar-Stats.dml \ + --config conf/SystemDS-config.xml \ + --stats \ + --nvargs X=$1 TYPES=$2 STATS=${BASE}/stats/u + +ttrain=$(echo "$(date +%s.%N) - $tstart - .4" | bc) +echo "UnivariateStatistics on "$1": "$ttrain >> results/times.txt diff --git a/scripts/perftest/scripts/transpose.dml b/scripts/perftest/scripts/Kmeans-predict.dml similarity index 88% copy from scripts/perftest/scripts/transpose.dml copy to scripts/perftest/scripts/Kmeans-predict.dml index 2fb2f0d..ccfa901 100755 --- a/scripts/perftest/scripts/transpose.dml +++ b/scripts/perftest/scripts/Kmeans-predict.dml @@ -19,8 +19,9 @@ # #------------------------------------------------------------- -x = rand(rows=$1, cols=$2, min= 0.0, max= 1.0, sparsity=$3, seed= 12) -for(i in 1:$4) { - res = t(x) -} -print(sum(res)) \ No newline at end of file +X = read($X); +C = read($C); + +Y = kmeansPredict(X = X, C = C) + +write(Y, $prY, "text") diff --git a/scripts/perftest/scripts/transpose.dml 
b/scripts/perftest/scripts/Kmeans.dml similarity index 75% copy from scripts/perftest/scripts/transpose.dml copy to scripts/perftest/scripts/Kmeans.dml index 2fb2f0d..d818659 100755 --- a/scripts/perftest/scripts/transpose.dml +++ b/scripts/perftest/scripts/Kmeans.dml @@ -19,8 +19,13 @@ # #------------------------------------------------------------- -x = rand(rows=$1, cols=$2, min= 0.0, max= 1.0, sparsity=$3, seed= 12) -for(i in 1:$4) { - res = t(x) -} -print(sum(res)) \ No newline at end of file +X = read($X); +fileC = $C +num_centroids = $k; +max_iter = ifdef ($maxi, 1000); # $maxi=1000; +eps = ifdef ($tol, 0.000001); # $tol=0.000001; + +[C, Y] = kmeans(X = X, k = num_centroids, max_iter = max_iter, eps = eps) + +write (C, fileC, format="text"); +write (Y, $prY, format="text"); diff --git a/scripts/perftest/scripts/MM.dml b/scripts/perftest/scripts/MM.dml index 1620684..336e770 100755 --- a/scripts/perftest/scripts/MM.dml +++ b/scripts/perftest/scripts/MM.dml @@ -24,4 +24,4 @@ v = rand(rows=ncol(x), cols=$3, min=0.0, max=1.0, sparsity=$5, seed= 13) for(i in 1:$6) { res = x %*% v } -print(sum(res)) \ No newline at end of file +print(sum(res)) diff --git a/scripts/perftest/scripts/alsCG.dml b/scripts/perftest/scripts/PCA.dml old mode 100644 new mode 100755 similarity index 54% copy from scripts/perftest/scripts/alsCG.dml copy to scripts/perftest/scripts/PCA.dml index f409d40..e97cde8 --- a/scripts/perftest/scripts/alsCG.dml +++ b/scripts/perftest/scripts/PCA.dml @@ -19,20 +19,23 @@ # #------------------------------------------------------------- -rank = ifdef($rank, 10); -reg = ifdef($reg, "L2"); -lambda = ifdef($lambda, 0.000001); -maxiter = ifdef($maxiter, 50); -thr = ifdef($thr, 0.0001); -verbose = ifdef($verbose, TRUE); -modelB = ifdef($modelB, "B"); -modelM = ifdef($modelM, "M"); -fmt = ifdef($fmt, "text"); -check = ifdef($check, TRUE); +X = read($INPUT); +K = ifdef($K, ncol(X)); +ofmt = ifdef($OFMT, "CSV"); +projectData = ifdef($PROJDATA,0); +center = 
ifdef($CENTER,0); +scale = ifdef($SCALE,0); +output = ifdef($OUTPUT,"/"); -X = read($X); +[Xout, Mout, Centering, ScaleFactor] = pca(X = X, K = K, center = center, scale = scale) -[B, M] = alsCG(X=X, rank=rank, reg=reg, lambda=lambda, maxi=maxiter, check=check, thr=thr, verbose=verbose); +# These files can not be created, as the built-in PCA function does not return the eigenvalues. +# write(eval_stdev_dominant, output+"/dominant.eigen.standard.deviations", format=ofmt); +# write(eval_dominant, output+"/dominant.eigen.values", format=ofmt); -write(B, $modelB, format=fmt); -write(M, $modelM, format=fmt); +write(Mout, output+"/dominant.eigen.vectors", format=ofmt); + +if (projectData == 1){ + # Construct new data set by treating computed dominant eigenvectors as the basis vectors + write(Xout, output+"/projected.data", format=ofmt); +} diff --git a/scripts/perftest/scripts/transpose.dml b/scripts/perftest/scripts/Univar-Stats.dml similarity index 86% copy from scripts/perftest/scripts/transpose.dml copy to scripts/perftest/scripts/Univar-Stats.dml index 2fb2f0d..53686ee 100755 --- a/scripts/perftest/scripts/transpose.dml +++ b/scripts/perftest/scripts/Univar-Stats.dml @@ -18,9 +18,9 @@ # under the License. 
# #------------------------------------------------------------- +X = read($X); # data file +types = read($TYPES); # attribute kind file -x = rand(rows=$1, cols=$2, min= 0.0, max= 1.0, sparsity=$3, seed= 12) -for(i in 1:$4) { - res = t(x) -} -print(sum(res)) \ No newline at end of file +baseStats = univar(X, types) + +write(baseStats, $STATS); diff --git a/scripts/perftest/scripts/transpose.dml b/scripts/perftest/scripts/als-predict.dml similarity index 67% copy from scripts/perftest/scripts/transpose.dml copy to scripts/perftest/scripts/als-predict.dml index 2fb2f0d..f61a73f 100755 --- a/scripts/perftest/scripts/transpose.dml +++ b/scripts/perftest/scripts/als-predict.dml @@ -19,8 +19,23 @@ # #------------------------------------------------------------- -x = rand(rows=$1, cols=$2, min= 0.0, max= 1.0, sparsity=$3, seed= 12) -for(i in 1:$4) { - res = t(x) +X = read($X); +fileY = $Y; +L = read($L); +R = read($R); + +userIDs = seq(1, nrow(X)); +write(userIDs, "temp/als/userIDs", format = $fmt); + +I = matrix (0, rows=nrow(X), cols=ncol(X)) +parfor(i in 1:nrow(X)){ + parfor(j in 1:ncol(X)){ + if(as.integer(as.scalar(X[i,j])) != 0){ + I[i,j] = 1; + } + } } -print(sum(res)) \ No newline at end of file +write(I, "temp/als/I", format = $fmt); + +Y = alsPredict(userIDs = userIDs, I = I, L = L, R = R); +write(Y, fileY, format = $fmt); diff --git a/scripts/perftest/scripts/alsCG.dml b/scripts/perftest/scripts/alsCG.dml index f409d40..913fbbb 100644 --- a/scripts/perftest/scripts/alsCG.dml +++ b/scripts/perftest/scripts/alsCG.dml @@ -25,14 +25,14 @@ lambda = ifdef($lambda, 0.000001); maxiter = ifdef($maxiter, 50); thr = ifdef($thr, 0.0001); verbose = ifdef($verbose, TRUE); -modelB = ifdef($modelB, "B"); -modelM = ifdef($modelM, "M"); +modelU = ifdef($modelU, "U"); +modelV = ifdef($modelV, "V"); fmt = ifdef($fmt, "text"); check = ifdef($check, TRUE); X = read($X); -[B, M] = alsCG(X=X, rank=rank, reg=reg, lambda=lambda, maxi=maxiter, check=check, thr=thr, verbose=verbose); +[U, 
V] = alsCG(X=X, rank=rank, reg=reg, lambda=lambda, maxi=maxiter, check=check, thr=thr, verbose=verbose); -write(B, $modelB, format=fmt); -write(M, $modelM, format=fmt); +write(U, $modelU, format=fmt); +write(V, $modelV, format=fmt); diff --git a/scripts/perftest/scripts/alsCG.dml b/scripts/perftest/scripts/alsDS.dml old mode 100644 new mode 100755 similarity index 81% copy from scripts/perftest/scripts/alsCG.dml copy to scripts/perftest/scripts/alsDS.dml index f409d40..2c3380c --- a/scripts/perftest/scripts/alsCG.dml +++ b/scripts/perftest/scripts/alsDS.dml @@ -20,19 +20,18 @@ #------------------------------------------------------------- rank = ifdef($rank, 10); -reg = ifdef($reg, "L2"); lambda = ifdef($lambda, 0.000001); maxiter = ifdef($maxiter, 50); thr = ifdef($thr, 0.0001); verbose = ifdef($verbose, TRUE); -modelB = ifdef($modelB, "B"); -modelM = ifdef($modelM, "M"); +modelU = ifdef($modelU, "U"); +modelV = ifdef($modelV, "V"); fmt = ifdef($fmt, "text"); check = ifdef($check, TRUE); X = read($X); -[B, M] = alsCG(X=X, rank=rank, reg=reg, lambda=lambda, maxi=maxiter, check=check, thr=thr, verbose=verbose); +[U, V] = alsDS(X=X, rank=rank, lambda=lambda, maxi=maxiter, check=check, thr=thr, verbose=verbose); -write(B, $modelB, format=fmt); -write(M, $modelM, format=fmt); +write(U, $modelU, format=fmt); +write(V, $modelV, format=fmt); diff --git a/scripts/perftest/scripts/alsCG.dml b/scripts/perftest/scripts/bivar-stats.dml old mode 100644 new mode 100755 similarity index 57% copy from scripts/perftest/scripts/alsCG.dml copy to scripts/perftest/scripts/bivar-stats.dml index f409d40..149fa0f --- a/scripts/perftest/scripts/alsCG.dml +++ b/scripts/perftest/scripts/bivar-stats.dml @@ -19,20 +19,16 @@ # #------------------------------------------------------------- -rank = ifdef($rank, 10); -reg = ifdef($reg, "L2"); -lambda = ifdef($lambda, 0.000001); -maxiter = ifdef($maxiter, 50); -thr = ifdef($thr, 0.0001); -verbose = ifdef($verbose, TRUE); -modelB = ifdef($modelB, 
"B"); -modelM = ifdef($modelM, "M"); -fmt = ifdef($fmt, "text"); -check = ifdef($check, TRUE); +X = read($X); # input data set +S1 = read($index1); # attribute set 1 +S2 = read($index2); # attribute set 2 +T1 = read($types1); # kind for attributes in S1 +T2 = read($types2); # kind for attributes in S2 -X = read($X); +[basestats_scale_scale, basestats_nominal_scale, basestats_nominal_nominal, basestats_ordinal_ordinal] = +bivar(X = X, S1 = S1, S2 = S2, T1 = T1, T2 = T2, verbose=FALSE) -[B, M] = alsCG(X=X, rank=rank, reg=reg, lambda=lambda, maxi=maxiter, check=check, thr=thr, verbose=verbose); - -write(B, $modelB, format=fmt); -write(M, $modelM, format=fmt); +write(basestats_scale_scale, $OUTDIR + "/bivar.scale.scale.stats"); +write(basestats_nominal_scale, $OUTDIR + "/bivar.nominal.scale.stats"); +write(basestats_nominal_nominal, $OUTDIR + "/bivar.nominal.nominal.stats"); +write(basestats_ordinal_ordinal, $OUTDIR + "/bivar.ordinal.ordinal.stats"); diff --git a/scripts/perftest/scripts/transpose.dml b/scripts/perftest/scripts/stratstats.dml similarity index 84% copy from scripts/perftest/scripts/transpose.dml copy to scripts/perftest/scripts/stratstats.dml index 2fb2f0d..833e481 100755 --- a/scripts/perftest/scripts/transpose.dml +++ b/scripts/perftest/scripts/stratstats.dml @@ -19,8 +19,12 @@ # #------------------------------------------------------------- -x = rand(rows=$1, cols=$2, min= 0.0, max= 1.0, sparsity=$3, seed= 12) -for(i in 1:$4) { - res = t(x) -} -print(sum(res)) \ No newline at end of file +X = read($X); +fileO = $O; +fmtO = $fmt; + +Xcid = read($Xcid); +Ycid = read($Ycid); + +OutMtx = stratstats(X = X, Xcid = Xcid, Ycid = Ycid); +write (OutMtx, fileO, format=fmtO); diff --git a/scripts/perftest/scripts/transpose.dml b/scripts/perftest/scripts/transpose.dml index 2fb2f0d..4992a90 100755 --- a/scripts/perftest/scripts/transpose.dml +++ b/scripts/perftest/scripts/transpose.dml @@ -23,4 +23,4 @@ x = rand(rows=$1, cols=$2, min= 0.0, max= 1.0, sparsity=$3, 
seed= 12) for(i in 1:$4) { res = t(x) } -print(sum(res)) \ No newline at end of file +print(sum(res)) diff --git a/scripts/perftest/todo/genClusteringData.sh b/scripts/perftest/todo/genClusteringData.sh deleted file mode 100644 index 5794e64..0000000 --- a/scripts/perftest/todo/genClusteringData.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/bin/bash -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -if [ "$2" == "SPARK" ]; then CMD="./sparkDML.sh "; DASH="-"; elif [ "$2" == "MR" ]; then CMD="hadoop jar SystemDS.jar " ; else CMD="echo " ; fi - -BASE=$1/clustering - -FORMAT="binary" -DENSE_SP=0.9 -SPARSE_SP=0.01 - -export HADOOP_CLIENT_OPTS="-Xmx2048m -Xms2048m -Xmn256m" - -#generate XS scenarios (80MB) -${CMD} -f ../datagen/genRandData4Kmeans.dml $DASH-nvargs nr=10000 nf=1000 nc=5 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X10k_1k_dense C=$BASE/C10k_1k_dense Y=$BASE/y10k_1k_dense YbyC=$BASE/YbyC10k_1k_dense fmt=$FORMAT -${CMD} -f extractTestData.dml $DASH-args $BASE/X10k_1k_dense $BASE/y10k_1k_dense $BASE/X10k_1k_dense_test $BASE/y10k_1k_dense_test $FORMAT - -#generate S scenarios (800MB) -#${CMD} -f ../datagen/genRandData4Kmeans.dml $DASH-nvargs nr=100000 nf=1000 nc=5 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X100k_1k_dense C=$BASE/C100k_1k_dense Y=$BASE/y100k_1k_dense YbyC=$BASE/YbyC100k_1k_dense fmt=$FORMAT -#${CMD} -f extractTestData.dml $DASH-args $BASE/X100k_1k_dense $BASE/y100k_1k_dense $BASE/X100k_1k_dense_test $BASE/y100k_1k_dense_test $FORMAT - -#generate M scenarios (8GB) -#${CMD} -f ../datagen/genRandData4Kmeans.dml $DASH-nvargs nr=1000000 nf=1000 nc=5 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X1M_1k_dense C=$BASE/C1M_1k_dense Y=$BASE/y1M_1k_dense YbyC=$BASE/YbyC1M_1k_dense fmt=$FORMAT -#${CMD} -f extractTestData.dml $DASH-args $BASE/X1M_1k_dense $BASE/y1M_1k_dense $BASE/X1M_1k_dense_test $BASE/y1M_1k_dense_test $FORMAT - -#generate L scenarios (80GB) -#${CMD} -f ../datagen/genRandData4Kmeans.dml $DASH-nvargs nr=10000000 nf=1000 nc=5 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X10M_1k_dense C=$BASE/C10M_1k_dense Y=$BASE/y10M_1k_dense YbyC=$BASE/YbyC10M_1k_dense fmt=$FORMAT -#${CMD} -f extractTestData.dml $DASH-args $BASE/X10M_1k_dense $BASE/y10M_1k_dense $BASE/X10M_1k_dense_test $BASE/y10M_1k_dense_test $FORMAT - -#generate LARGE scenarios (800GB) -#${CMD} -f 
../datagen/genRandData4Kmeans.dml $DASH-nvargs nr=100000000 nf=1000 nc=5 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X100M_1k_dense C=$BASE/C100M_1k_dense Y=$BASE/y100M_1k_dense YbyC=$BASE/YbyC100M_1k_dense fmt=$FORMAT -#${CMD} -f extractTestData.dml $DASH-args $BASE/X100M_1k_dense $BASE/y100M_1k_dense $BASE/X100M_1k_dense_test $BASE/y100M_1k_dense_test $FORMAT - diff --git a/scripts/perftest/todo/genDescriptiveStatisticsData.sh b/scripts/perftest/todo/genDescriptiveStatisticsData.sh deleted file mode 100644 index e223114..0000000 --- a/scripts/perftest/todo/genDescriptiveStatisticsData.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -if [ "$2" == "SPARK" ]; then CMD="./sparkDML.sh "; DASH="-"; elif [ "$2" == "MR" ]; then CMD="hadoop jar SystemDS.jar " ; else CMD="echo " ; fi - -FORMAT="binary" -BASE=$1/bivar - -export HADOOP_CLIENT_OPTS="-Xmx2048m -Xms2048m -Xmn256m" - -c=1000 -nc=100 -mdomain=1100 -set=20 -labelset=10 - -#XS data 10K rows -${CMD} -f ../datagen/genRandData4DescriptiveStats.dml $DASH-explain $DASH-stats $DASH-nvargs R=10000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_10k/data TYPES=${BASE}/A_10k/types SETSIZE=$set LABELSETSIZE=$labelset TYPES1=${BASE}/A_10k/set1.types TYPES2=${BASE}/A_10k/set2.types INDEX1=${BASE}/A_10k/set1.indices INDEX2=${BASE}/A_10k/set2.indices FMT=$FORMAT - -#S data 100K rows -#${CMD} -f ../datagen/genRandData4DescriptiveStats.dml $DASH-explain $DASH-stats $DASH-nvargs R=100000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_100k/data TYPES=${BASE}/A_100k/types SETSIZE=$set LABELSETSIZE=$labelset TYPES1=${BASE}/A_100k/set1.types TYPES2=${BASE}/A_100k/set2.types INDEX1=${BASE}/A_100k/set1.indices INDEX2=${BASE}/A_100k/set2.indices FMT=$FORMAT - -#M data 1M rows -#${CMD} -f ../datagen/genRandData4DescriptiveStats.dml $DASH-explain $DASH-stats $DASH-nvargs R=1000000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_1M/data TYPES=${BASE}/A_1M/types SETSIZE=$set LABELSETSIZE=$labelset TYPES1=${BASE}/A_1M/set1.types TYPES2=${BASE}/A_1M/set2.types INDEX1=${BASE}/A_1M/set1.indices INDEX2=${BASE}/A_1M/set2.indices FMT=$FORMAT - -#L data 10M rows -#${CMD} -f ../datagen/genRandData4DescriptiveStats.dml $DASH-explain $DASH-stats $DASH-nvargs R=10000000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_10M/data TYPES=${BASE}/A_10M/types SETSIZE=$set LABELSETSIZE=$labelset TYPES1=${BASE}/A_10M/set1.types TYPES2=${BASE}/A_10M/set2.types INDEX1=${BASE}/A_10M/set1.indices INDEX2=${BASE}/A_10M/set2.indices FMT=$FORMAT diff --git a/scripts/perftest/todo/genRandLogRegData_LTStats.sh 
b/scripts/perftest/todo/genRandLogRegData_LTStats.sh old mode 100644 new mode 100755 diff --git a/scripts/perftest/todo/genStratStatisticsData.sh b/scripts/perftest/todo/genStratStatisticsData.sh deleted file mode 100644 index d5f0c17..0000000 --- a/scripts/perftest/todo/genStratStatisticsData.sh +++ /dev/null @@ -1,41 +0,0 @@ -#!/bin/bash -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -if [ "$2" == "SPARK" ]; then CMD="./sparkDML.sh "; DASH="-"; elif [ "$2" == "MR" ]; then CMD="hadoop jar SystemDS.jar " ; else CMD="echo " ; fi - -FORMAT="binary" -BASE=$1/stratstats - -export HADOOP_CLIENT_OPTS="-Xmx2048m -Xms2048m -Xmn256m" - - -#XS data 10K rows -${CMD} -f ../datagen/genRandData4StratStats.dml $DASH-explain $DASH-stats $DASH-nvargs nr=10000 nf=100 D=${BASE}/A_10k/data Xcid=${BASE}/A_10k/Xcid Ycid=${BASE}/A_10k/Ycid A=${BASE}/A_10k/A fmt=$FORMAT - -#S data 100K rows -#${CMD} -f ../datagen/genRandData4StratStats.dml $DASH-explain $DASH-stats $DASH-nvargs nr=100000 nf=100 D=${BASE}/A_100k/data Xcid=${BASE}/A_100k/Xcid Ycid=${BASE}/A_100k/Ycid A=${BASE}/A_100k/A fmt=$FORMAT - -#M data 1M rows -#${CMD} -f ../datagen/genRandData4StratStats.dml $DASH-explain $DASH-stats $DASH-nvargs nr=1000000 nf=100 D=${BASE}/A_1M/data Xcid=${BASE}/A_1M/Xcid Ycid=${BASE}/A_1M/Ycid A=${BASE}/A_1M/A fmt=$FORMAT - -#L data 10M rows -#${CMD} -f ../datagen/genRandData4StratStats.dml $DASH-explain $DASH-stats $DASH-nvargs nr=10000000 nf=100 D=${BASE}/A_10M/data Xcid=${BASE}/A_10M/Xcid Ycid=${BASE}/A_10M/Ycid A=${BASE}/A_10M/A fmt=$FORMAT diff --git a/scripts/perftest/todo/genTreeData.sh b/scripts/perftest/todo/genTreeData.sh old mode 100644 new mode 100755 index af9cab2..4c22e19 --- a/scripts/perftest/todo/genTreeData.sh +++ b/scripts/perftest/todo/genTreeData.sh @@ -21,25 +21,24 @@ #------------------------------------------------------------- if [ "$1" == "" -o "$2" == "" ]; then echo "Usage: $0 <hdfsDataDir> <MR | SPARK | ECHO> e.g. $0 perftest SPARK" ; exit 1 ; fi -if [ "$2" == "SPARK" ]; then CMD="./sparkDML.sh "; DASH="-"; elif [ "$2" == "MR" ]; then CMD="hadoop jar SystemDS.jar " ; else CMD="echo " ; fi - +CMD=systemds BASE=$1/trees -FORMAT="binary" +FORMAT="text" DENSE_SP=0.9 SPARSE_SP=0.01 export HADOOP_CLIENT_OPTS="-Xmx2048m -Xms2048m -Xmn256m" -echo "NOT DONE YET. 
WAITING FOR DML SCRIPT FROM FARAZ" ; exit 1 +# echo "NOT DONE YET. WAITING FOR DML SCRIPT FROM FARAZ" ; exit 1 #generate XS scenarios (80MB) -${CMD} -f ../datagen/genRandData4LogisticRegression.dml $DASH-args 10000 1000 5 5 $BASE/w10k_1k_dense $BASE/X10k_1k_dense $BASE/y10k_1k_dense 1 0 $DENSE_SP $FORMAT -${CMD} -f ../datagen/genRandData4LogisticRegression.dml $DASH-args 10000 1000 5 5 $BASE/w10k_1k_sparse $BASE/X10k_1k_sparse $BASE/y10k_1k_sparse 1 0 $SPARSE_SP $FORMAT -${CMD} -f extractTestData.dml $DASH-args $BASE/X10k_1k_dense $BASE/y10k_1k_dense $BASE/X10k_1k_dense_test $BASE/y10k_1k_dense_test $FORMAT -${CMD} -f extractTestData.dml $DASH-args $BASE/X10k_1k_sparse $BASE/y10k_1k_sparse $BASE/X10k_1k_sparse_test $BASE/y10k_1k_sparse_test $FORMAT +${CMD} -f ../../datagen/genRandData4LogisticRegression.dml $DASH-args 10000 1000 5 5 $BASE/w10k_1k_dense $BASE/X10k_1k_dense $BASE/y10k_1k_dense 1 0 $DENSE_SP $FORMAT 0 +${CMD} -f ../../datagen/genRandData4LogisticRegression.dml $DASH-args 10000 1000 5 5 $BASE/w10k_1k_sparse $BASE/X10k_1k_sparse $BASE/y10k_1k_sparse 1 0 $SPARSE_SP $FORMAT 0 +${CMD} -f ../scripts/extractTestData.dml $DASH-args $BASE/X10k_1k_dense $BASE/y10k_1k_dense $BASE/X10k_1k_dense_test $BASE/y10k_1k_dense_test $FORMAT +${CMD} -f ../scripts/extractTestData.dml $DASH-args $BASE/X10k_1k_sparse $BASE/y10k_1k_sparse $BASE/X10k_1k_sparse_test $BASE/y10k_1k_sparse_test $FORMAT ##generate S scenarios (800MB) #${CMD} -f ../datagen/genRandData4LogisticRegression.dml $DASH-args 100000 1000 5 5 $BASE/w100k_1k_dense $BASE/X100k_1k_dense $BASE/y100k_1k_dense 1 0 $DENSE_SP $FORMAT diff --git a/scripts/perftest/todo/runAllTrees.sh b/scripts/perftest/todo/runAllTrees.sh old mode 100644 new mode 100755 index 3437dfa..1671d26 --- a/scripts/perftest/todo/runAllTrees.sh +++ b/scripts/perftest/todo/runAllTrees.sh @@ -36,7 +36,7 @@ if [ ! 
-d logs ]; then mkdir logs ; fi # data generation echo $2"-- Generating Tree data: " >> times.txt; -./genTreeData.sh $1 $2 &>> logs/genTreeData.out +./genTreeData.sh $1 $2 &> logs/genTreeData.out # run all trees with on all datasets for d in "10k_1k_dense" "10k_1k_sparse" # "100k_1k_dense" "100k_1k_sparse" "1M_1k_dense" "1M_1k_sparse" "10M_1k_dense" "10M_1k_sparse" #"_KDD" "100M_1k_dense" "100M_1k_sparse" diff --git a/scripts/perftest/todo/runDecTree.sh b/scripts/perftest/todo/runDecTree.sh old mode 100644 new mode 100755 index d1841c2..798a1ad --- a/scripts/perftest/todo/runDecTree.sh +++ b/scripts/perftest/todo/runDecTree.sh @@ -21,8 +21,7 @@ #------------------------------------------------------------- set -e -if [ "$4" == "SPARK" ]; then CMD="./sparkDML.sh "; DASH="-"; elif [ "$4" == "MR" ]; then CMD="hadoop jar SystemDS.jar " ; else CMD="echo " ; fi - +CMD=systemds BASE=$3 export HADOOP_CLIENT_OPTS="-Xmx2048m -Xms2048m -Xmn256m" @@ -31,13 +30,13 @@ echo "running decision tree" #training tstart=$SECONDS -${CMD} -f ../algorithms/decision-tree.dml $DASH-explain $DASH-stats $DASH-nvargs X=$1 Y=$2 fmt=csv M=${BASE}/M +${CMD} -f scripts/decision-tree.dml --explain --stats --nvargs X=$1 Y=$2 fmt=csv M=${BASE}/M ttrain=$(($SECONDS - $tstart - 3)) echo "DecisionTree train on "$1": "$ttrain >> times.txt #predict tstart=$SECONDS -${CMD} -f ../algorithms/decision-tree-predict.dml $DASH-explain $DASH-stats $DASH-nvargs M=${BASE}/M X=$1_test Y=$2_test P=${BASE}/P +${CMD} -f ../../algorithms/decision-tree-predict.dml --explain --stats --nvargs M=${BASE}/M X=$1_test Y=$2_test P=${BASE}/P tpredict=$(($SECONDS - $tstart - 3)) echo "DecisionTree predict on "$1": "$tpredict >> times.txt diff --git a/scripts/perftest/todo/runKmeans.sh b/scripts/perftest/todo/runKmeans.sh deleted file mode 100644 index cdeae94..0000000 --- a/scripts/perftest/todo/runKmeans.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/bin/bash -#------------------------------------------------------------- -# -# Licensed 
to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- -set -e - -if [ "$4" == "SPARK" ]; then CMD="./sparkDML.sh "; DASH="-"; elif [ "$4" == "MR" ]; then CMD="hadoop jar SystemDS.jar " ; else CMD="echo " ; fi - -BASE=$3 - -export HADOOP_CLIENT_OPTS="-Xmx2048m -Xms2048m -Xmn256m" - -#training -tstart=$SECONDS -${CMD} -f ../algorithms/Kmeans.dml $DASH-explain $DASH-stats $DASH-nvargs X=$1 k=5 C=${BASE}/centroids.mtx maxi=$2 tol=0.0001 -ttrain=$(($SECONDS - $tstart - 3)) -echo "Kmeans train on "$1": "$ttrain >> times.txt - -#predict -tstart=$SECONDS -${CMD} -f ../algorithms/Kmeans-predict.dml $DASH-explain $DASH-stats $DASH-nvargs X=$1 C=${BASE}/centroids.mtx prY=${BASE}/prY.mtx -tpredict=$(($SECONDS - $tstart - 3)) -echo "Kmeans predict on "$1": "$tpredict >> times.txt diff --git a/scripts/perftest/todo/runRandTree.sh b/scripts/perftest/todo/runRandTree.sh old mode 100644 new mode 100755 index a13aa12..3c4b793 --- a/scripts/perftest/todo/runRandTree.sh +++ b/scripts/perftest/todo/runRandTree.sh @@ -21,8 +21,7 @@ #------------------------------------------------------------- set -e -if [ "$4" == "SPARK" ]; then CMD="./sparkDML.sh "; DASH="-"; elif [ "$4" == "MR" ]; then CMD="hadoop jar 
SystemDS.jar " ; else CMD="echo " ; fi - +CMD=systemds BASE=$3 export HADOOP_CLIENT_OPTS="-Xmx2048m -Xms2048m -Xmn256m" @@ -31,13 +30,13 @@ echo "running random forest" #training tstart=$SECONDS -${CMD} -f ../algorithms/random-forest.dml $DASH-explain $DASH-stats $DASH-nvargs X=$1 Y=$2 fmt=csv M=${BASE}/M +${CMD} -f scripts/random-forest.dml --explain --stats --nvargs X=$1 Y=$2 fmt=csv M=${BASE}/M ttrain=$(($SECONDS - $tstart - 3)) echo "RandomForest train on "$1": "$ttrain >> times.txt #predict tstart=$SECONDS -${CMD} -f ../algorithms/random-forest-predict.dml $DASH-explain $DASH-stats $DASH-nvargs M=${BASE}/M X=$1_test Y=$2_test P=${BASE}/P +${CMD} -f ../../algorithms/random-forest-predict.dml --explain --stats --nvargs M=${BASE}/M X=$1_test Y=$2_test P=${BASE}/P tpredict=$(($SECONDS - $tstart - 3)) echo "Randomforest predict on "$1": "$tpredict >> times.txt diff --git a/scripts/perftest/todo/scripts/decision-tree.dml b/scripts/perftest/todo/scripts/decision-tree.dml new file mode 100644 index 0000000..d887532 --- /dev/null +++ b/scripts/perftest/todo/scripts/decision-tree.dml @@ -0,0 +1,85 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# +# THIS SCRIPT IMPLEMENTS CLASSIFICATION TREES WITH BOTH SCALE AND CATEGORICAL FEATURES +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X String --- Location to read feature matrix X; note that X needs to be both recoded and dummy coded +# Y String --- Location to read label matrix Y; note that Y needs to be both recoded and dummy coded +# R String " " Location to read the matrix R which for each feature in X contains the following information +# - R[,1]: column ids +# - R[,2]: start indices +# - R[,3]: end indices +# If R is not provided by default all variables are assumed to be scale +# bins Int 20 Number of equiheight bins per scale feature to choose thresholds +# depth Int 25 Maximum depth of the learned tree +# num_leaf Int 10 Number of samples when splitting stops and a leaf node is added +# num_samples Int 3000 Number of samples at which point we switch to in-memory subtree building +# impurity String "Gini" Impurity measure: entropy or Gini (the default) +# M String --- Location to write matrix M containing the learned tree +# O String " " Location to write the training accuracy; by default is standard output +# S_map String " " Location to write the mappings from scale feature ids to global feature ids +# C_map String " " Location to write the mappings from categorical feature ids to global feature ids +# fmt String "text" The output format of the model (matrix M), such as "text" or "csv" +# --------------------------------------------------------------------------------------------- +# OUTPUT: +# Matrix M where each column corresponds to a node in the learned tree and each row contains the following information: +# M[1,j]: id of node j (in a complete binary tree) +# M[2,j]: Offset (no. 
of columns) to left child of j if j is an internal node, otherwise 0 +# M[3,j]: Feature index of the feature (scale feature id if the feature is scale or categorical feature id if the feature is categorical) +# that node j looks at if j is an internal node, otherwise 0 +# M[4,j]: Type of the feature that node j looks at if j is an internal node: 1 for scale and 2 for categorical features, +# otherwise the label that leaf node j is supposed to predict +# M[5,j]: If j is an internal node: 1 if the feature chosen for j is scale, otherwise the size of the subset of values +# stored in rows 6,7,... if j is categorical +# If j is a leaf node: number of misclassified samples reaching at node j +# M[6:,j]: If j is an internal node: Threshold the example's feature value is compared to is stored at M[6,j] if the feature chosen for j is scale, +# otherwise if the feature chosen for j is categorical rows 6,7,... depict the value subset chosen for j +# If j is a leaf node 1 if j is impure and the number of samples at j > threshold, otherwise 0 +# ------------------------------------------------------------------------------------------- +# HOW TO INVOKE THIS SCRIPT - EXAMPLE: +# hadoop jar SystemDS.jar -f decision-tree.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y R=INPUT_DIR/R M=OUTPUT_DIR/model +# bins=20 depth=25 num_leaf=10 num_samples=3000 impurity=Gini fmt=csv + +# Default values of some parameters +fileR = ifdef ($R, " "); +fileO = ifdef ($O, " "); +fileS_map = ifdef ($S_map, " "); +fileC_map = ifdef ($C_map, " "); +fileM = $M; +num_bins = ifdef($bins, 20); +depth = ifdef($depth, 25); +num_leaf = ifdef($num_leaf, 10); +threshold = ifdef ($num_samples, 3000); +imp = ifdef($impurity, "Gini"); +fmtO = ifdef($fmt, "text"); + +X = read($X); +Y_bin = read($Y); +R = matrix(1, rows=1, cols=ncol(X)); + +M = decisionTree(X = X, Y = Y_bin, R = R, bins = num_bins, depth = depth); + +write (M, fileM, format = fmtO); diff --git a/scripts/perftest/todo/scripts/random-forest.dml 
b/scripts/perftest/todo/scripts/random-forest.dml new file mode 100644 index 0000000..c01ecd7 --- /dev/null +++ b/scripts/perftest/todo/scripts/random-forest.dml @@ -0,0 +1,92 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# +# THIS SCRIPT IMPLEMENTS CLASSIFICATION RANDOM FOREST WITH BOTH SCALE AND CATEGORICAL FEATURES +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X String --- Location to read feature matrix X; note that X needs to be both recoded and dummy coded +# Y String --- Location to read label matrix Y; note that Y needs to be both recoded and dummy coded +# R String " " Location to read the matrix R which for each feature in X contains the following information +# - R[,1]: column ids +# - R[,2]: start indices +# - R[,3]: end indices +# If R is not provided by default all variables are assumed to be scale +# bins Int 20 Number of equiheight bins per scale feature to choose thresholds +# depth Int 25 Maximum depth of the learned tree +# num_leaf Int 10 Number of samples when splitting stops and a leaf node is added +# num_samples Int 3000 Number of samples at which point we switch to in-memory subtree building +# num_trees Int 10 Number of trees to be learned in the random forest model +# subsamp_rate Double 1.0 Parameter controlling the size of each tree in the forest; samples are selected from a +# Poisson distribution with parameter subsamp_rate (the default value is 1.0) +# feature_subset Double 0.5 Parameter that controls the number of feature used as candidates for splitting at each tree node +# as a power of number of features in the dataset; +# by default square root of features (i.e., feature_subset = 0.5) are used at each tree node +# impurity String "Gini" Impurity measure: entropy or Gini (the default) +# M String --- Location to write matrix M containing the learned tree +# C String " " Location to write matrix C containing the number of times samples are chosen in each tree of the random forest 
+# S_map String " " Location to write the mappings from scale feature ids to global feature ids +# C_map String " " Location to write the mappings from categorical feature ids to global feature ids +# fmt String "text" The output format of the model (matrix M), such as "text" or "csv" +# --------------------------------------------------------------------------------------------- +# OUTPUT: +# Matrix M where each column corresponds to a node in the learned tree and each row contains the following information: +# M[1,j]: id of node j (in a complete binary tree) +# M[2,j]: tree id to which node j belongs +# M[3,j]: Offset (no. of columns) to left child of j +# M[4,j]: Feature index of the feature that node j looks at if j is an internal node, otherwise 0 +# M[5,j]: Type of the feature that node j looks at if j is an internal node: 1 for scale and 2 for categorical features, +# otherwise the label that leaf node j is supposed to predict +# M[6,j]: 1 if j is an internal node and the feature chosen for j is scale, otherwise the size of the subset of values +# stored in rows 7,8,... if j is categorical +# M[7:,j]: Only applicable for internal nodes. Threshold the example's feature value is compared to is stored at M[7,j] if the feature chosen for j is scale; +# If the feature chosen for j is categorical rows 7,8,... 
depict the value subset chosen for j +# ------------------------------------------------------------------------------------------- +# HOW TO INVOKE THIS SCRIPT - EXAMPLE: +# hadoop jar SystemDS.jar -f random-forest.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y R=INPUT_DIR/R M=OUTPUT_DIR/model +# bins=20 depth=25 num_leaf=10 num_samples=3000 num_trees=10 impurity=Gini fmt=csv + + +# Default values of some parameters +fileR = ifdef ($R, " "); +fileM = $M; +num_bins = ifdef($bins, 20); +depth = ifdef($depth, 25); +num_leaf = ifdef($num_leaf, 10); +num_trees = ifdef($num_trees, 1); +threshold = ifdef ($num_samples, 3000); +imp = ifdef($impurity, "Gini"); +rate = ifdef ($subsamp_rate, 1); +fpow = ifdef ($feature_subset, 0.5); +fmtO = ifdef($fmt, "text"); + +X = read($X); +Y_bin = read($Y); +R = matrix(0, cols=0, rows=0); + +[M, C, S_map, C_map] = randomForest(X = X, Y = Y_bin, R = R, + bins = num_bins, depth = depth, num_leaf = num_leaf, num_samples = threshold, + num_trees = num_trees, subsamp_rate = rate, feature_subset = fpow, impurity = imp); + +write (M, fileM, format = fmtO);