This is an automated email from the ASF dual-hosted git repository. mboehm7 pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push: new 56c782384b [SYSTEMDS-3847] Fix perftest refactoring (datagen scripts) 56c782384b is described below commit 56c782384b73b560a1cdee7ff8c04dacefa2ec76 Author: Matthias Boehm <mboe...@gmail.com> AuthorDate: Sat Apr 5 19:04:19 2025 +0200 [SYSTEMDS-3847] Fix perftest refactoring (datagen scripts) --- scripts/perftest/datagen/genALSData.sh | 68 ++++++ scripts/perftest/datagen/genBinomialData.sh | 78 +++++++ scripts/perftest/datagen/genClusteringData.sh | 68 ++++++ .../datagen/genDescriptiveStatisticsData.sh | 60 ++++++ .../perftest/datagen/genDimensionReductionData.sh | 61 ++++++ scripts/perftest/datagen/genIOData.sh | 72 +++++++ scripts/perftest/datagen/genL2SVMData.sh | 38 ++++ scripts/perftest/datagen/genMultinomialData.sh | 78 +++++++ scripts/perftest/datagen/genRandData4ALS.dml | 47 +++++ .../datagen/genRandData4ChisquaredTest.dml | 87 ++++++++ .../perftest/datagen/genRandData4DecisionTree.sh | 58 +++++ .../perftest/datagen/genRandData4DecisionTree1.dml | 40 ++++ .../perftest/datagen/genRandData4DecisionTree2.dml | 41 ++++ .../datagen/genRandData4DescriptiveStats.dml | 149 +++++++++++++ scripts/perftest/datagen/genRandData4FTest.dml | 95 +++++++++ scripts/perftest/datagen/genRandData4Kmeans.dml | 120 +++++++++++ .../datagen/genRandData4LinearReg_LTstats.dml | 233 +++++++++++++++++++++ .../datagen/genRandData4LinearRegression.dml | 61 ++++++ .../datagen/genRandData4LogReg_LTstats.dml | 233 +++++++++++++++++++++ .../datagen/genRandData4LogisticRegression.dml | 72 +++++++ .../perftest/datagen/genRandData4MultiClassSVM.dml | 68 ++++++ .../perftest/datagen/genRandData4Multinomial.dml | 66 ++++++ scripts/perftest/datagen/genRandData4NMF.dml | 129 ++++++++++++ .../perftest/datagen/genRandData4NMFBlockwise.dml | 138 ++++++++++++ scripts/perftest/datagen/genRandData4PCA.dml | 61 ++++++ .../perftest/datagen/genRandData4StratStats.dml | 155 ++++++++++++++ .../perftest/datagen/genRandData4SurvAnalysis.dml | 
133 ++++++++++++ scripts/perftest/datagen/genRandData4Transform.dml | 96 +++++++++ .../perftest/datagen/genRandData4Univariate.dml | 61 ++++++ scripts/perftest/datagen/genStratStatisticsData.sh | 61 ++++++ scripts/perftest/sparkDML2.sh | 24 ++- 31 files changed, 2750 insertions(+), 1 deletion(-) diff --git a/scripts/perftest/datagen/genALSData.sh b/scripts/perftest/datagen/genALSData.sh new file mode 100644 index 0000000000..3d1a22a675 --- /dev/null +++ b/scripts/perftest/datagen/genALSData.sh @@ -0,0 +1,68 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi + +CMD=$1 +DATADIR=$2/als +MAXMEM=$3 + +FORMAT="text" # can be csv, mm, text, binary +DENSE_SP=0.9 +SPARSE_SP=0.01 + +echo "-- Generating ALS data." 
>> results/times.txt; + +#generate XS scenarios (80MB) +if [ $MAXMEM -ge 80 ]; then + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10k_1k_dense rows=10000 cols=1000 rank=10 nnz=`echo "scale=0; 10000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT & + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10k_1k_sparse rows=10000 cols=1000 rank=10 nnz=`echo "scale=0; 10000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT & +fi + +#generate S scenarios (800MB) +if [ $MAXMEM -ge 800 ]; then + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X100k_1k_dense rows=100000 cols=1000 rank=10 nnz=`echo "scale=0; 100000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT & + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X100k_1k_sparse rows=100000 cols=1000 rank=10 nnz=`echo "scale=0; 100000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT & +fi + +#generate M scenarios (8GB) +if [ $MAXMEM -ge 8000 ]; then + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X1M_1k_dense rows=1000000 cols=1000 rank=10 nnz=`echo "scale=0; 1000000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT & + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X1M_1k_sparse rows=1000000 cols=1000 rank=10 nnz=`echo "scale=0; 1000000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT & +fi + +#generate L scenarios (80GB) +if [ $MAXMEM -ge 80000 ]; then + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10M_1k_dense rows=10000000 cols=1000 rank=10 nnz=`echo "scale=0; 10000000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X10M_1k_sparse rows=10000000 cols=1000 rank=10 nnz=`echo "scale=0; 10000000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT +fi + +#generate XL scenarios (800GB) +if [ $MAXMEM -ge 800000 ]; then + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X100M_1k_dense rows=100000000 cols=1000 rank=10 nnz=`echo "scale=0; 
100000000 * 1000 * $DENSE_SP" | bc` sigma=0.01 fmt=$FORMAT + ${CMD} -f ../datagen/genRandData4ALS.dml --nvargs X=${DATADIR}/X100M_1k_sparse rows=100000000 cols=1000 rank=10 nnz=`echo "scale=0; 100000000 * 1000 * $SPARSE_SP" | bc` sigma=0.01 fmt=$FORMAT +fi + +wait \ No newline at end of file diff --git a/scripts/perftest/datagen/genBinomialData.sh b/scripts/perftest/datagen/genBinomialData.sh new file mode 100644 index 0000000000..c911175ace --- /dev/null +++ b/scripts/perftest/datagen/genBinomialData.sh @@ -0,0 +1,78 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi + +CMD=$1 +BASE=$2/binomial +MAXMEM=$3 + +FORMAT="binary" # can be csv, mm, text, binary +DENSE_SP=0.9 +SPARSE_SP=0.01 + +echo -e "\n\n-- Generating binomial data..." 
>> results/times.txt; + +#generate XS scenarios (80MB) +if [ $MAXMEM -ge 80 ]; then + ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 10000 1000 5 5 ${BASE}/w10k_1k_dense ${BASE}/X10k_1k_dense ${BASE}/y10k_1k_dense 1 0 $DENSE_SP $FORMAT 1 & pidDense80=$! + ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 10000 1000 5 5 ${BASE}/w10k_1k_sparse ${BASE}/X10k_1k_sparse ${BASE}/y10k_1k_sparse 1 0 $SPARSE_SP $FORMAT 1 & pidSparse80=$! + wait $pidDense80; ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X10k_1k_dense ${BASE}/y10k_1k_dense ${BASE}/X10k_1k_dense_test ${BASE}/y10k_1k_dense_test $FORMAT & + wait $pidSparse80; ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X10k_1k_sparse ${BASE}/y10k_1k_sparse ${BASE}/X10k_1k_sparse_test ${BASE}/y10k_1k_sparse_test $FORMAT & +fi + +##generate S scenarios (800MB) +if [ $MAXMEM -ge 800 ]; then + ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 100000 1000 5 5 ${BASE}/w100k_1k_dense ${BASE}/X100k_1k_dense ${BASE}/y100k_1k_dense 1 0 $DENSE_SP $FORMAT 1 & pidDense800=$! + ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 100000 1000 5 5 ${BASE}/w100k_1k_sparse ${BASE}/X100k_1k_sparse ${BASE}/y100k_1k_sparse 1 0 $SPARSE_SP $FORMAT 1 & pidSparse800=$! + wait $pidDense800; ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X100k_1k_dense ${BASE}/y100k_1k_dense ${BASE}/X100k_1k_dense_test ${BASE}/y100k_1k_dense_test $FORMAT & + wait $pidSparse800; ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X100k_1k_sparse ${BASE}/y100k_1k_sparse ${BASE}/X100k_1k_sparse_test ${BASE}/y100k_1k_sparse_test $FORMAT & +fi + +#generate M scenarios (8GB) +if [ $MAXMEM -ge 8000 ]; then + ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 1000000 1000 5 5 ${BASE}/w1M_1k_dense ${BASE}/X1M_1k_dense ${BASE}/y1M_1k_dense 1 0 $DENSE_SP $FORMAT 1 & pidDense8000=$! 
+ ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 1000000 1000 5 5 ${BASE}/w1M_1k_sparse ${BASE}/X1M_1k_sparse ${BASE}/y1M_1k_sparse 1 0 $SPARSE_SP $FORMAT 1 & pidSparse8000=$! + wait $pidDense8000; ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X1M_1k_dense ${BASE}/y1M_1k_dense ${BASE}/X1M_1k_dense_test ${BASE}/y1M_1k_dense_test $FORMAT & + wait $pidSparse8000; ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X1M_1k_sparse ${BASE}/y1M_1k_sparse ${BASE}/X1M_1k_sparse_test ${BASE}/y1M_1k_sparse_test $FORMAT & +fi + +#generate L scenarios (80GB) +if [ $MAXMEM -ge 80000 ]; then + ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 10000000 1000 5 5 ${BASE}/w10M_1k_dense ${BASE}/X10M_1k_dense ${BASE}/y10M_1k_dense 1 0 $DENSE_SP $FORMAT 1 + ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 10000000 1000 5 5 ${BASE}/w10M_1k_sparse ${BASE}/X10M_1k_sparse ${BASE}/y10M_1k_sparse 1 0 $SPARSE_SP $FORMAT 1 + ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X10M_1k_dense ${BASE}/y10M_1k_dense ${BASE}/X10M_1k_dense_test ${BASE}/y10M_1k_dense_test $FORMAT + ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X10M_1k_sparse ${BASE}/y10M_1k_sparse ${BASE}/X10M_1k_sparse_test ${BASE}/y10M_1k_sparse_test $FORMAT +fi + +##generate XL scenarios (800GB) +if [ $MAXMEM -ge 800000 ]; then + ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 100000000 1000 5 5 ${BASE}/w100M_1k_dense ${BASE}/X100M_1k_dense ${BASE}/y100M_1k_dense 1 0 $DENSE_SP $FORMAT 1 + ${CMD} -f datagen/genRandData4LogisticRegression.dml --args 100000000 1000 5 5 ${BASE}/w100M_1k_sparse ${BASE}/X100M_1k_sparse ${BASE}/y100M_1k_sparse 1 0 $SPARSE_SP $FORMAT 1 + ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X100M_1k_dense ${BASE}/y100M_1k_dense ${BASE}/X100M_1k_dense_test ${BASE}/y100M_1k_dense_test $FORMAT + ${CMD} -f scripts/extractTestData.dml --args ${BASE}/X100M_1k_sparse ${BASE}/y100M_1k_sparse ${BASE}/X100M_1k_sparse_test ${BASE}/y100M_1k_sparse_test $FORMAT 
+fi + +wait \ No newline at end of file diff --git a/scripts/perftest/datagen/genClusteringData.sh b/scripts/perftest/datagen/genClusteringData.sh new file mode 100644 index 0000000000..46adffb9e3 --- /dev/null +++ b/scripts/perftest/datagen/genClusteringData.sh @@ -0,0 +1,68 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi + +CMD=${1:-systemds} +BASE=${2:-"temp"}/clustering +MAXMEM=${3:-80} + +FORMAT="binary" +DENSE_SP=0.9 +SPARSE_SP=0.01 + +echo "-- Generating clustering data..." >> results/times.txt; + +#generate XS scenarios (80MB) +if [ $MAXMEM -ge 80 ]; then + ${CMD} -f datagen/genRandData4Kmeans.dml --nvargs nr=10000 nf=1000 nc=5 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X10k_1k_dense C=$BASE/C10k_1k_dense Y=$BASE/y10k_1k_dense YbyC=$BASE/YbyC10k_1k_dense fmt=$FORMAT & pidDense80=$! 
+ wait $pidDense80; ${CMD} -f scripts/extractTestData.dml --args $BASE/X10k_1k_dense $BASE/y10k_1k_dense $BASE/X10k_1k_dense_test $BASE/y10k_1k_dense_test $FORMAT & +fi + +#generate S scenarios (800MB) +if [ $MAXMEM -ge 800 ]; then + ${CMD} -f datagen/genRandData4Kmeans.dml --nvargs nr=100000 nf=1000 nc=5 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X100k_1k_dense C=$BASE/C100k_1k_dense Y=$BASE/y100k_1k_dense YbyC=$BASE/YbyC100k_1k_dense fmt=$FORMAT & pidDense800=$! + wait $pidDense800; ${CMD} -f scripts/extractTestData.dml --args $BASE/X100k_1k_dense $BASE/y100k_1k_dense $BASE/X100k_1k_dense_test $BASE/y100k_1k_dense_test $FORMAT & +fi + +#generate M scenarios (8GB) +if [ $MAXMEM -ge 8000 ]; then + ${CMD} -f datagen/genRandData4Kmeans.dml --nvargs nr=1000000 nf=1000 nc=5 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X1M_1k_dense C=$BASE/C1M_1k_dense Y=$BASE/y1M_1k_dense YbyC=$BASE/YbyC1M_1k_dense fmt=$FORMAT & pidDense8000=$! + wait $pidDense8000; ${CMD} -f scripts/extractTestData.dml --args $BASE/X1M_1k_dense $BASE/y1M_1k_dense $BASE/X1M_1k_dense_test $BASE/y1M_1k_dense_test $FORMAT & +fi + +#generate L scenarios (80GB) +if [ $MAXMEM -ge 80000 ]; then + ${CMD} -f datagen/genRandData4Kmeans.dml --nvargs nr=10000000 nf=1000 nc=5 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X10M_1k_dense C=$BASE/C10M_1k_dense Y=$BASE/y10M_1k_dense YbyC=$BASE/YbyC10M_1k_dense fmt=$FORMAT + ${CMD} -f scripts/extractTestData.dml --args $BASE/X10M_1k_dense $BASE/y10M_1k_dense $BASE/X10M_1k_dense_test $BASE/y10M_1k_dense_test $FORMAT +fi + +#generate LARGE scenarios (800GB) +if [ $MAXMEM -ge 800000 ]; then + ${CMD} -f datagen/genRandData4Kmeans.dml --nvargs nr=100000000 nf=1000 nc=5 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=$BASE/X100M_1k_dense C=$BASE/C100M_1k_dense Y=$BASE/y100M_1k_dense YbyC=$BASE/YbyC100M_1k_dense fmt=$FORMAT + ${CMD} -f scripts/extractTestData.dml --args $BASE/X100M_1k_dense $BASE/y100M_1k_dense $BASE/X100M_1k_dense_test $BASE/y100M_1k_dense_test $FORMAT +fi + +wait \ No 
newline at end of file diff --git a/scripts/perftest/datagen/genDescriptiveStatisticsData.sh b/scripts/perftest/datagen/genDescriptiveStatisticsData.sh new file mode 100644 index 0000000000..c59fdc6a2a --- /dev/null +++ b/scripts/perftest/datagen/genDescriptiveStatisticsData.sh @@ -0,0 +1,60 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi + +CMD=$1 +BASE=$2/bivar +MAXMEM=$3 + +FORMAT="binary" + +c=1000 +nc=100 +mdomain=1100 +set=20 +labelset=10 + +#XS data 10K rows +if [ $MAXMEM -ge 80 ]; then + ${CMD} -f datagen/genRandData4DescriptiveStats.dml --explain --stats --nvargs R=10000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_10k/data TYPES=${BASE}/A_10k/types SETSIZE=$set LABELSETSIZE=$labelset TYPES1=${BASE}/A_10k/set1.types TYPES2=${BASE}/A_10k/set2.types INDEX1=${BASE}/A_10k/set1.indices INDEX2=${BASE}/A_10k/set2.indices FMT=$FORMAT & +fi + +#S data 100K rows +if [ $MAXMEM -ge 800 ]; then + ${CMD} -f datagen/genRandData4DescriptiveStats.dml --explain --stats --nvargs R=100000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_100k/data TYPES=${BASE}/A_100k/types SETSIZE=$set LABELSETSIZE=$labelset TYPES1=${BASE}/A_100k/set1.types TYPES2=${BASE}/A_100k/set2.types INDEX1=${BASE}/A_100k/set1.indices INDEX2=${BASE}/A_100k/set2.indices FMT=$FORMAT & +fi + +#M data 1M rows +if [ $MAXMEM -ge 8000 ]; then + ${CMD} -f datagen/genRandData4DescriptiveStats.dml --explain --stats --nvargs R=1000000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_1M/data TYPES=${BASE}/A_1M/types SETSIZE=$set LABELSETSIZE=$labelset TYPES1=${BASE}/A_1M/set1.types TYPES2=${BASE}/A_1M/set2.types INDEX1=${BASE}/A_1M/set1.indices INDEX2=${BASE}/A_1M/set2.indices FMT=$FORMAT & +fi + +#L data 10M rows +if [ $MAXMEM -ge 80000 ]; then + ${CMD} -f datagen/genRandData4DescriptiveStats.dml --explain --stats --nvargs R=10000000 C=$c NC=$nc MAXDOMAIN=$mdomain DATA=${BASE}/A_10M/data TYPES=${BASE}/A_10M/types SETSIZE=$set LABELSETSIZE=$labelset TYPES1=${BASE}/A_10M/set1.types TYPES2=${BASE}/A_10M/set2.types INDEX1=${BASE}/A_10M/set1.indices INDEX2=${BASE}/A_10M/set2.indices FMT=$FORMAT +fi + +wait \ No newline at end of file diff --git 
a/scripts/perftest/datagen/genDimensionReductionData.sh b/scripts/perftest/datagen/genDimensionReductionData.sh new file mode 100644 index 0000000000..cd90aa1758 --- /dev/null +++ b/scripts/perftest/datagen/genDimensionReductionData.sh @@ -0,0 +1,61 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi + +CMD=${1:-systemds} +BASE=${2:-"temp"}/dimensionreduction +MAXMEM=${3:-80} + +FORMAT="binary" + +echo "-- Generating Dimension Reduction data." 
>> results/times.txt; + +#generate XS scenarios (80MB) +if [ $MAXMEM -ge 80 ]; then + ${CMD} -f datagen/genRandData4PCA.dml --nvargs R=5000 C=2000 OUT=$BASE/pcaData5k_2k_dense FMT=$FORMAT & +fi + +#generate S scenarios (800MB) +if [ $MAXMEM -ge 800 ]; then + ${CMD} -f datagen/genRandData4PCA.dml --nvargs R=50000 C=2000 OUT=$BASE/pcaData50k_2k_dense FMT=$FORMAT & +fi + +#generate M scenarios (8GB) +if [ $MAXMEM -ge 8000 ]; then + ${CMD} -f datagen/genRandData4PCA.dml --nvargs R=500000 C=2000 OUT=$BASE/pcaData500k_2k_dense FMT=$FORMAT & +fi + +#generate L scenarios (80GB) +if [ $MAXMEM -ge 80000 ]; then + ${CMD} -f datagen/genRandData4PCA.dml --nvargs R=5000000 C=2000 OUT=$BASE/pcaData5M_2k_dense FMT=$FORMAT +fi + +#generate XL scenarios (800GB) +if [ $MAXMEM -ge 800000 ]; then + ${CMD} -f datagen/genRandData4PCA.dml --nvargs R=50000000 C=2000 OUT=$BASE/pcaData50M_2k_dense FMT=$FORMAT +fi + +wait \ No newline at end of file diff --git a/scripts/perftest/datagen/genIOData.sh b/scripts/perftest/datagen/genIOData.sh new file mode 100644 index 0000000000..46154f8636 --- /dev/null +++ b/scripts/perftest/datagen/genIOData.sh @@ -0,0 +1,72 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi + +CMD=${1:-systemds} +DATADIR=${2:-"temp"}/io +MAXMEM=${3:-1} + +FORMAT="csv" # can be csv, mm, text, binary + +echo "-- Generating IO data." >> results/times.txt; + + +#generate XS scenarios (10MB) +if [ $MAXMEM -ge 1 ]; then + ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X500_250_dense R=500 C=250 Fmt=$FORMAT & +fi + +#generate XS scenarios (10MB) +if [ $MAXMEM -ge 10 ]; then + ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X5k_250_dense R=5000 C=250 Fmt=$FORMAT & +fi + +#generate XS scenarios (80MB) +if [ $MAXMEM -ge 80 ]; then + ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X10k_1k_dense R=10000 C=1000 Fmt=$FORMAT & +fi + +#generate S scenarios (800MB) +if [ $MAXMEM -ge 800 ]; then + ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X100k_1k_dense R=100000 C=1000 Fmt=$FORMAT & +fi + +#generate M scenarios (8GB) +if [ $MAXMEM -ge 8000 ]; then + ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X1M_1k_dense R=1000000 C=1000 Fmt=$FORMAT & +fi + +#generate L scenarios (80GB) +if [ $MAXMEM -ge 80000 ]; then + ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X10M_1k_dense R=10000000 C=1000 Fmt=$FORMAT & +fi + +#generate XL scenarios (800GB) +if [ $MAXMEM -ge 800000 ]; then + ${CMD} -f ../utils/generateData.dml --nvargs Path=${DATADIR}/X100M_1k_dense R=100000000 C=1000 Fmt=$FORMAT & +fi + +wait diff --git a/scripts/perftest/datagen/genL2SVMData.sh b/scripts/perftest/datagen/genL2SVMData.sh new file mode 100644 index 0000000000..d25e433530 --- /dev/null +++ b/scripts/perftest/datagen/genL2SVMData.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more 
contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi + +CMD=$1 +DATADIR=$2 + +FORMAT="binary" # can be csv, mm, text, binary +DENSE_SP=0.9 +SPARSE_SP=0.01 + +BASEPATH=$(dirname $0) + +#generate XS scenarios (80MB) +${CMD} -f ${BASEPATH}/../datagen/genRandData4LogisticRegression.dml --args 10000 1000 5 5 ${DATADIR}/w10k_1k_dense ${DATADIR}/X10k_1k_dense ${DATADIR}/Y10k_1k_dense 1 0 $DENSE_SP $FORMAT 1 +${CMD} -f ${BASEPATH}/../datagen/genRandData4LogisticRegression.dml --args 10000 1000 5 5 ${DATADIR}/w10k_1k_sparse ${DATADIR}/X10k_1k_sparse ${DATADIR}/Y10k_1k_sparse 1 0 $SPARSE_SP $FORMAT 1 diff --git a/scripts/perftest/datagen/genMultinomialData.sh b/scripts/perftest/datagen/genMultinomialData.sh new file mode 100644 index 0000000000..95c42f87dd --- /dev/null +++ b/scripts/perftest/datagen/genMultinomialData.sh @@ -0,0 +1,78 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi + +CMD=$1 +BASE=$2/multinomial +MAXMEM=$3 + +FORMAT="binary" +DENSE_SP=0.9 +SPARSE_SP=0.01 + +echo "-- Generating multinomial data..." >> results/times.txt; + +#generate XS scenarios (80MB) +if [ $MAXMEM -ge 80 ]; then + ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 10000 1000 $DENSE_SP 5 0 $BASE/X10k_1k_dense_k5 $BASE/y10k_1k_dense_k5 $FORMAT 1 & pidDense80=$! + ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 10000 1000 $SPARSE_SP 5 0 $BASE/X10k_1k_sparse_k5 $BASE/y10k_1k_sparse_k5 $FORMAT 1 & pidSparse80=$! + wait $pidDense80; ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X10k_1k_dense_k5 $BASE/y10k_1k_dense_k5 $BASE/X10k_1k_dense_k5_test $BASE/y10k_1k_dense_k5_test $FORMAT & + wait $pidSparse80; ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X10k_1k_sparse_k5 $BASE/y10k_1k_sparse_k5 $BASE/X10k_1k_sparse_k5_test $BASE/y10k_1k_sparse_k5_test $FORMAT & +fi + +##generate S scenarios (800MB) +if [ $MAXMEM -ge 800 ]; then + ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 100000 1000 $DENSE_SP 5 0 $BASE/X100k_1k_dense_k5 $BASE/y100k_1k_dense_k5 $FORMAT 1 & pidDense800=$! 
+ ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 100000 1000 $SPARSE_SP 5 0 $BASE/X100k_1k_sparse_k5 $BASE/y100k_1k_sparse_k5 $FORMAT 1 & pidSparse800=$! + wait $pidDense800; ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X100k_1k_dense_k5 $BASE/y100k_1k_dense_k5 $BASE/X100k_1k_dense_k5_test $BASE/y100k_1k_dense_k5_test $FORMAT & + wait $pidSparse800; ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X100k_1k_sparse_k5 $BASE/y100k_1k_sparse_k5 $BASE/X100k_1k_sparse_k5_test $BASE/y100k_1k_sparse_k5_test $FORMAT & +fi + +##generate M scenarios (8GB) +if [ $MAXMEM -ge 8000 ]; then + ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 1000000 1000 $DENSE_SP 5 0 $BASE/X1M_1k_dense_k5 $BASE/y1M_1k_dense_k5 $FORMAT 1 & pidDense8000=$! + ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 1000000 1000 $SPARSE_SP 5 0 $BASE/X1M_1k_sparse_k5 $BASE/y1M_1k_sparse_k5 $FORMAT 1 & pidSparse8000=$! + wait $pidDense8000; ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X1M_1k_dense_k5 $BASE/y1M_1k_dense_k5 $BASE/X1M_1k_dense_k5_test $BASE/y1M_1k_dense_k5_test $FORMAT & + wait $pidSparse8000; ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X1M_1k_sparse_k5 $BASE/y1M_1k_sparse_k5 $BASE/X1M_1k_sparse_k5_test $BASE/y1M_1k_sparse_k5_test $FORMAT & +fi + +##generate L scenarios (80GB) +if [ $MAXMEM -ge 80000 ]; then + ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 10000000 1000 $DENSE_SP 5 0 $BASE/X10M_1k_dense_k5 $BASE/y10M_1k_dense_k5 $FORMAT 1 + ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 10000000 1000 $SPARSE_SP 5 0 $BASE/X10M_1k_sparse_k5 $BASE/y10M_1k_sparse_k5 $FORMAT 1 + ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X10M_1k_dense_k5 $BASE/y10M_1k_dense_k5 $BASE/X10M_1k_dense_k5_test $BASE/y10M_1k_dense_k5_test $FORMAT + ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X10M_1k_sparse_k5 $BASE/y10M_1k_sparse_k5 $BASE/X10M_1k_sparse_k5_test $BASE/y10M_1k_sparse_k5_test $FORMAT +fi + +#generate LARGE 
scenarios (800GB) +if [ $MAXMEM -ge 800000 ]; then + ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 100000000 1000 $DENSE_SP 5 0 $BASE/X100M_1k_dense_k5 $BASE/y100M_1k_dense_k5 $FORMAT 1 + ${CMD} -f datagen/genRandData4Multinomial.dml $DASH-args 100000000 1000 $SPARSE_SP 5 0 $BASE/X100M_1k_sparse_k5 $BASE/y100M_1k_sparse_k5 $FORMAT 1 + ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X100M_1k_dense_k5 $BASE/y100M_1k_dense_k5 $BASE/X100M_1k_dense_k5_test $BASE/y100M_1k_dense_k5_test $FORMAT + ${CMD} -f scripts/extractTestData.dml $DASH-args $BASE/X100M_1k_sparse_k5 $BASE/y100M_1k_sparse_k5 $BASE/X100M_1k_sparse_k5_test $BASE/y100M_1k_sparse_k5_test $FORMAT +fi + +wait \ No newline at end of file diff --git a/scripts/perftest/datagen/genRandData4ALS.dml b/scripts/perftest/datagen/genRandData4ALS.dml new file mode 100644 index 0000000000..f6c3562862 --- /dev/null +++ b/scripts/perftest/datagen/genRandData4ALS.dml @@ -0,0 +1,47 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +Xfile = $X; # input matrix X of size m x n +Ufile = ifdef($U, " "); # original row factor of size m x r +Vfile = ifdef($V, " "); # original col factor of size r x n +m = $rows; # no. of rows of X +n = $cols; # no. of cols of X +r = $rank; # rank of factorization +nnz = $nnz; # no. of nonzeros in X +sigma = ifdef ($sigma, 0.01); # variance of Gaussian noise +fmt = ifdef ($fmt, "binary"); # output format + +# generate original factors by sampling from a normal(0,1.0) distribution +U = rand(rows = m, cols = r, pdf = "normal", seed = 123); +V = rand(rows = n, cols = r, pdf = "normal", seed = 456); + +I = floor(rand(rows = nnz, cols = 1, min = 1, max = m + 0.999999999)); +J = floor(rand(rows = nnz, cols = 1, min = 1, max = n + 0.999999999)); +X = rand(rows = nnz, cols = 1, pdf = "normal") * sqrt(sigma); +N = table(I, J, X); +X = (N != 0) * (U %*% t(V)) + N; +write(X, Xfile, format = fmt); +if( Ufile != " " ) + write(U, Ufile, format = fmt); +if( Vfile != " " ) { + V = t(V); + write(V, Vfile, format = fmt); +} diff --git a/scripts/perftest/datagen/genRandData4ChisquaredTest.dml b/scripts/perftest/datagen/genRandData4ChisquaredTest.dml new file mode 100644 index 0000000000..8f2b945e01 --- /dev/null +++ b/scripts/perftest/datagen/genRandData4ChisquaredTest.dml @@ -0,0 +1,87 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# generates a two column matrix of categorical +# variables +# used to test systemds's chi-squared bivariate stat +# computation + +# $1 is number of samples to generate +# $2 is number of categories for 1st categorical variable +# $3 is number of categories for 2nd categorical variable +# $4 is the file to write out the chi-squared statistic to +# $5 is the file to write out the generated data to + +numSamples = $1 +numCategories1 = $2 +numCategories2 = $3 + +o = Rand(rows=numCategories1, cols=numCategories2, min=0.0, max=1.0, pdf="uniform", seed=0) +o = o / sum(o) + +probs1 = rowSums(o) +probs1 = probs1 / sum(probs1) +probs2 = colSums(o) +probs2 = probs2 / sum(probs2) +e = probs1 %*% probs2 + +chisquared = sum((o-e)^2/e) +write(chisquared, $4, format="binary") + +oCDF = Rand(rows=numCategories1, cols=numCategories2, min=0.0, max=0.0, pdf="uniform", seed=0) +for(i in 1:numCategories1){ + for(j in 1:numCategories2){ + if(i==1 & j==1){ + oCDF[i,j] = o[1,1] + } + if(i != 1 & j == 1){ + oCDF[i,j] = oCDF[i-1,numCategories2] + o[i,j] + } + if(j > 1){ + oCDF[i,j] = oCDF[i,j-1] + o[i,j] + } + } +} + +one = Rand(rows=1, cols=1, min=1.0, max=1.0, pdf="uniform", seed=0) +data = Rand(rows=numSamples, cols=2, min=0.0, max=0.0, pdf="uniform", seed=0) +parfor(s in 1:numSamples){ + r_mat = Rand(rows=1, cols=1, min=0.0, max=1.0, pdf="uniform", seed=0) + r = as.scalar(r_mat) + + cat1 = -1 + cat2 = -1 + continue = 1 + for(i in 1:numCategories1){ + for(j in 1:numCategories2){ + cdf = as.scalar(oCDF[i,j]) + 
if(continue == 1 & r <= cdf){ + cat1 = i + cat2 = j + continue = 0 + } + } + } + + data[s,1] = cat1*one + data[s,2] = cat2*one +} +write(data, $5, format="binary") diff --git a/scripts/perftest/datagen/genRandData4DecisionTree.sh b/scripts/perftest/datagen/genRandData4DecisionTree.sh new file mode 100644 index 0000000000..44978192fe --- /dev/null +++ b/scripts/perftest/datagen/genRandData4DecisionTree.sh @@ -0,0 +1,58 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +if [ "$1" == "" -o "$2" == "" ]; then echo "Usage: $0 <hdfsDataDir> <MR | SPARK | ECHO> e.g. 
$0 perftest SPARK" ; exit 1 ; fi +if [ "$2" == "SPARK" ]; then CMD="./sparkDML.sh "; DASH="-"; elif [ "$2" == "MR" ]; then CMD="hadoop jar SystemDS.jar " ; else CMD="echo " ; fi + +BASE=$1/trees + +FORMAT="csv" +DENSE_SP=0.9 +SPARSE_SP=0.01 + +PATH_LOCAL=/tmp/datagen +PATH_HDFS=$BASE + +#### part 1: generating class labels and categorical features +${CMD} -f ../datagen/genRandData4DecisionTree1.dml $DASH-nvargs XCat=$BASE/XCat Y=$BASE/Y num_records=1000 num_cat=100 num_class=10 num_distinct=100 sp=$DENSE_SP + +#### part 2: generating spec.json on HDFS +NUM_FEATURES=100 + +echo "{ \"ids\": true + ,\"recode\": [1 " > $PATH_LOCAL/spec.json +for i in $(seq 2 $NUM_FEATURES); do + echo " , "$i >> $PATH_LOCAL/spec.json +done +echo " ] , \"dummycode\": [ 1" >> $PATH_LOCAL/spec.json +for i in $(seq 2 $NUM_FEATURES); do + echo " , "$i >> $PATH_LOCAL/spec.json +done +echo "] }" >> $PATH_LOCAL/spec.json + +hadoop fs -rm $PATH_HDFS/spec.json +hadoop fs -copyFromLocal $PATH_LOCAL/spec.json $PATH_HDFS/spec.json + +#### part 3: generating scale feature and transforming categorical features, finally combining scale and categorical features +${CMD} -f ../datagen/genRandData4DecisionTree2.dml $DASH-nvargs tPath=$BASE/metadata tSpec=$BASE/spec.json XCat=$BASE/XCat X=$BASE/X num_records=1000 num_scale=100 sp=$DENSE_SP fmt=$FORMAT + + diff --git a/scripts/perftest/datagen/genRandData4DecisionTree1.dml b/scripts/perftest/datagen/genRandData4DecisionTree1.dml new file mode 100644 index 0000000000..7d1dd50d6b --- /dev/null +++ b/scripts/perftest/datagen/genRandData4DecisionTree1.dml @@ -0,0 +1,40 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + + +XCatFile = $XCat; +YFile = $Y; +num_records = $num_records; +num_cat_features = $num_cat; +num_class = $num_class; +num_distinct = $num_distinct; +sparsity = $sp; + +# generate class labels +Y = floor (rand (rows = num_records, cols = 1, min = 1, max = num_class + 0.99999999999999)); +Y_bin = table (seq (1, num_records), Y); +write (Y_bin, YFile); + +# generate categorical features +X_cat = floor (rand (rows = num_records, cols = num_cat_features, min = 1, max = num_distinct + 0.99999999999999, sparsity = sparsity)); +fX_cat = as.frame(X_cat); +write (fX_cat, XCatFile, format = "csv"); + diff --git a/scripts/perftest/datagen/genRandData4DecisionTree2.dml b/scripts/perftest/datagen/genRandData4DecisionTree2.dml new file mode 100644 index 0000000000..715924915c --- /dev/null +++ b/scripts/perftest/datagen/genRandData4DecisionTree2.dml @@ -0,0 +1,41 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + + +transformPath = $tPath; +transformSpec = $tSpec; +XCatFile = $XCat; +XFile = $X; +num_records = $num_records; +num_scale_features = $num_scale; +sparsity = $sp; +fmt = $fmt; + +# generate scale features +X_scale = rand (rows = num_records, cols = num_scale_features, min = 0, max = 10, sparsity = sparsity); + +# transform categorical features +XCF = read (XCatFile); +specJson = read(transformSpec, data_type="scalar", value_type="string"); +X_cat_transformed = transform (target = XCF, spec = specJson, transformPath = transformPath); + +X = cbind (X_scale, X_cat_transformed); +write (X, XFile, format = fmt); diff --git a/scripts/perftest/datagen/genRandData4DescriptiveStats.dml b/scripts/perftest/datagen/genRandData4DescriptiveStats.dml new file mode 100644 index 0000000000..6f96162074 --- /dev/null +++ b/scripts/perftest/datagen/genRandData4DescriptiveStats.dml @@ -0,0 +1,149 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* +------------------------------------------------ + Parameters +------------------------------------------------ +$R = #rows +$C = #columns +$NC = number of categorical attributes +$MAXDOMAIN = maximum domain size +$DATA = output file path on HDFS +$SETSIZE = Size of one bivariate set +$LABELSETSIZE= Size of second bivariate set with labels +$TYPES = output attribute types +$TYPES1 = Attribute types for Set1 +$TYPES2 = Attribute types for Set2 +$INDEX1 = Indices for Set1 +$INDEX2 = Indices for Set2 +$FMT = output format +------------------------------------------------ +hadoop jar SystemDS.jar -f genRandData4DescriptiveStats.dml -nvargs R=1000000 C=1000 NC=50 MAXDOMAIN=1100 DATA=stats/data TYPES=stats/types SETSIZE=15 LABELSETSIZE=10 TYPES1=... TYPES2=... INDEX1=... INDEX2=... FMT=csv +------------------------------------------------ +*/ + + +FMT = ifdef($FMT,"binary"); # default output format + +# number of categorical attributes.. 
numC <= C +numC = $NC; +numO = as.integer(numC/2); +numNominal = numC - numO; +print("Categorical Mix = (" + numC + "," + numO + "," + numNominal +")"); + +# maximum domain size among all categorical attributes +maxDomainSize = $MAXDOMAIN; + +# Divide $C attributes according to the following logic: +# +# 1 numO numC C +# |-------|---------|-----------------| +# ord nominal scale +# +# numC+1-$C: scale +# 1-numC/2: ordinal +# (numC/2+1)-numC: nominal + +types = matrix(1, rows=1, cols=$C); +ocutoff = numO; +types[1,1:ocutoff] = matrix(1,rows=1,cols=ocutoff)*3; +types[1, ocutoff+1:numC] = matrix(1,rows=1,cols=(numC-ocutoff))*2; + +# Generate data +A = rand(rows=$R, cols=$C, sparsity=1); +B = matrix(0,rows=nrow(A), cols=ncol(A)); +parfor (i in 1:numC) { + Ai = A[,i]; + + tmp = round(rand(rows=1,cols=1, min=1, max=maxDomainSize)); + domain = as.scalar(tmp[1,1]); + + # for some attributes, choose the maxDomainSize + tmp = rand(rows=1,cols=1); + if (as.scalar(tmp[1,1]) < 0.5) { + domain = maxDomainSize; + } + + B[,i] = round(1+(domain-1)*Ai); +} +B[ ,(numC+1):ncol(A)] = A[, (numC+1):ncol(A)]; + + +write(B, $DATA, format=FMT); +write(types, $TYPES, format=FMT); + +# ----- Generator for Bivariate --------- + +settypes1 = matrix(1, rows=1, cols=$SETSIZE); +index1 = matrix(0, rows=1, cols=$SETSIZE); + +catSetSize = as.integer($SETSIZE/2); +ocutoff = as.integer(catSetSize/2); +print("Set Mix = (" + $SETSIZE + "," + catSetSize + "," + ocutoff + ")" ); +settypes1[1, 1:ocutoff] = matrix(1,rows=1,cols=ocutoff)*3; +settypes1[1, ocutoff+1:catSetSize] = matrix(1,rows=1,cols=(catSetSize-ocutoff))*2; + +# select ordinal indices +tmp = rand(rows=1, cols=ocutoff); +index1[1, 1:ocutoff] = round(1 + (numO-1)*tmp); + +# select nominal indices +nominalSetSize = catSetSize-ocutoff; +tmp = rand(rows=1, cols=nominalSetSize); +index1[1, ocutoff+1:catSetSize] = round(numO+1 + (numC-numO-1)*tmp); + +# select scale attributes +scaleSetSize = $SETSIZE-catSetSize; +tmp = rand(rows=1, 
cols=scaleSetSize); +index1[1, catSetSize+1:$SETSIZE] = round(numC+1 + ($C-numC-1)*tmp); + + +# --- select types and indices for LABELSET +settypes2 = matrix(2, rows=1, cols=$LABELSETSIZE); +index2 = matrix(0, rows=1, cols=$LABELSETSIZE); +if($LABELSETSIZE > 1) { + settypes2[1,1] = 1; + r = as.scalar(rand(rows=1,cols=1)); + index2[1,1] = round(numC+1 + ($C-numC-1)*r) +} +else { + r = as.scalar(rand(rows=1,cols=1)); + index2[1,1] = round( numO+1 + (numC-numO-1)*r ) +} + +for(i in 2:as.integer($LABELSETSIZE/2)) { + settypes2[1,i] = 3; + r = as.scalar(rand(rows=1,cols=1)); + index2[1,i] = round( 1 + (numO-1)*r ) +} + +for(i in as.integer($LABELSETSIZE/2)+1:$LABELSETSIZE) { + settypes2[1,i] = 2; + r = as.scalar(rand(rows=1,cols=1)); + index2[1,i] = round( numO+1 + (numC-numO-1)*r ) +} + +write(settypes1, $TYPES1, format=FMT); +write(settypes2, $TYPES2, format=FMT); +write(index1, $INDEX1, format=FMT); +write(index2, $INDEX2, format=FMT); + diff --git a/scripts/perftest/datagen/genRandData4FTest.dml b/scripts/perftest/datagen/genRandData4FTest.dml new file mode 100644 index 0000000000..9f0e1d6c68 --- /dev/null +++ b/scripts/perftest/datagen/genRandData4FTest.dml @@ -0,0 +1,95 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# generates random data for F-test +# +# $1 is number of groups (some of +# which may share a gaussian) +# $2 is number of actual groups +# $3 is number of points +# $4 is mean of the gaussian means +# $5 is mean of the gaussian std. deviations +# $6 is file to store computed f-statistic +# $7 is file to store generated data + +numGroups = $1 +numActualGroups = $2 +numSamples = $3 +meanOfMeans = $4 +meanOfStddevs = $5 + +cntProbs = Rand(rows=numGroups, cols=1, min=0.0, max=1.0, pdf="uniform", seed=0) +cntProbs = cntProbs/sum(cntProbs) +cntArr = round(cntProbs * numSamples) +last_cnt = cntArr[numGroups,1] +cntArr[numGroups,1] = numSamples - (sum(cntArr) - last_cnt) + +permut = Rand(rows=numActualGroups, cols=numGroups, min=0.0, max=0.0, pdf="uniform") +ones = Rand(rows=numActualGroups, cols=1, min=1.0, max=1.0, pdf="uniform") +permut[,1:numActualGroups] = diag(ones) + +one = Rand(rows=1, cols=1, min=1.0, max=1.0, pdf="uniform") +copy_start_index = numActualGroups+1 +parfor(i in copy_start_index:numGroups){ + r = Rand(rows=1, cols=1, min=1.0, max=numActualGroups, pdf="uniform", seed=0) + j = as.scalar(round(r)) + permut[j,i] = one +} + +means_std = Rand(rows=numActualGroups, cols=1, pdf="normal", seed=0) +abs_means = means_std + meanOfMeans +means = t(t(abs_means) %*% permut) + +stddevs_std = Rand(rows=numActualGroups, cols=1, pdf="normal", seed=0) +abs_stddevs = stddevs_std + meanOfStddevs +stddevs = t(t(abs_stddevs) %*% permut) + +overall_mean = sum(means*cntArr)/numSamples + +explained_variance = sum(cntArr * (means - overall_mean)^2) / (numGroups-1.0) +unexplained_variance = sum(cntArr * stddevs^2) / (numSamples - numGroups) +f = explained_variance / unexplained_variance +write(f, $6, format="binary") + +cntCDFs = cntProbs +for(i in 2:numGroups){ + cntCDFs[i,1] = cntCDFs[i-1,1] + 
cntProbs[i,1] +} + +data = Rand(rows=numSamples, cols=1, min=0.0, max=0.0, pdf="uniform") +parfor(i in 1:numSamples){ + r_mat = Rand(rows=1, cols=1, min=0.0, max=1.0, pdf="uniform", seed=0) + r1 = as.scalar(r_mat) + + g = -1 + continue = 1 + for(k in 1:numGroups){ + cdf = as.scalar(cntCDFs[k,1]) + if(continue==1 & r1<=cdf){ + g = k + continue=0 + } + } + + point = Rand(rows=1, cols=1, pdf="normal", seed=0) + data[i,1] = point*stddevs[g,1] + means[g,1] +} +write(data, $7, format="binary") diff --git a/scripts/perftest/datagen/genRandData4Kmeans.dml b/scripts/perftest/datagen/genRandData4Kmeans.dml new file mode 100644 index 0000000000..3098650b26 --- /dev/null +++ b/scripts/perftest/datagen/genRandData4Kmeans.dml @@ -0,0 +1,120 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# +# Generates random Gaussian-mixture data to test k-Means clustering algorithms +# +# INPUT PARAMETERS: +# ---------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# ---------------------------------------------------------------------------- +# nr Int --- Number of records +# nf Int --- Number of features +# nc Int --- Number of clusters +# dc Double --- St.dev. of cluster "centroid" features from zero mean +# dr Double --- St.dev. of the 1-st feature in a record within cluster +# fbf Double --- Feature bias factor: Stdev(last) / Stdev(1-st) feature +# cbf Double --- Cluster bias factor: Prob[1-st clus] / Prob[k-th clus] +# X String --- Location to write matrix X with generated data records +# C String --- Location to write cluster "centroids" (Gaussian means) +# Y String --- Location to write assignment of records to cluster ids +# YbyC String --- Location to write rec-cluster assigns by min-dist to C +# ---------------------------------------------------------------------------- +# +# Example: +# hadoop jar SystemDS.jar -f genRandData4Kmeans.dml -nvargs nr=100000 nf=100 +# nc=10 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=X.mtx C=C.mtx Y=Y.mtx YbyC=YbyC.mtx + +print ("BEGIN K-MEANS GENERATOR SCRIPT"); + +num_records = $nr; +num_features = $nf; +num_centroids = $nc; +dist_per_feature_centroids = $dc; +dist_per_feature_first_record = $dr; +feature_bias_factor = $fbf; +cluster_bias_factor = $cbf; + +fileX = ifdef ($X, "X"); +fileC = ifdef ($C, "C"); +fileY = ifdef ($Y, "Y"); +fileYbyC = ifdef ($YbyC, "YbyC"); +fmt = ifdef ($fmt, "text"); + +print ("Generating cluster distribution (mixture) centroids..."); + +C = Rand (rows = num_centroids, cols = num_features, pdf = "normal"); +C = C * dist_per_feature_centroids; + +print ("Generating record-to-cluster assignments..."); + +# Y is a multinomial in {1, ..., num_centroids} with 1 being more likely +# than 
"num_centroids" by the factor of "cluster_bias_factor" + +rnd = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf = "uniform"); +if (cluster_bias_factor == 1.0) { + Y = round (0.5 + rnd * num_centroids); +} else { + rnd_scaled = rnd * (1 - cluster_bias_factor ^ (- num_centroids / (num_centroids - 1))); + Y = round (0.5 - (num_centroids - 1) * log (1 - rnd_scaled) / log (cluster_bias_factor)); +} + +print ("Generating within-cluster random shifts..."); + +X_shift = Rand (rows = num_records, cols = num_features, pdf = "normal"); +feature_factors = dist_per_feature_first_record * + exp ((seq (1, num_features) - 1) / (num_features - 1) * log (feature_bias_factor)); +X_shift = X_shift %*% diag (feature_factors); + +print ("Generating records by shifting from centroids..."); + +Y_bitmap_raw = table (seq (1, num_records), Y); +Y_bitmap = matrix (0, rows = num_records, cols = num_centroids); +Y_bitmap [, 1 : ncol (Y_bitmap_raw)] = Y_bitmap_raw; +X = Y_bitmap %*% C + X_shift; + +print ("Computing record-to-cluster assignments by minimum centroid distance..."); + +D = t(t(-2 * (X %*% t(C))) + rowSums (C ^ 2)); +P = (D <= rowMins (D)); +aggr_P = t(cumsum (t(P))); +Y_by_C = rowSums (aggr_P == 0) + 1; + +print ("Computing useful statistics..."); + +sumXsq = sum (X ^ 2); +default_wcss = sumXsq - sum (colSums (X) ^ 2) / num_records; +attained_wcss = sumXsq + sum (rowMins (D)); + +print ("Default (single-cluster) WCSS = " + default_wcss); +print (num_centroids + "-cluster WCSS attained by the mixture centroids = " + attained_wcss); + +print ("Writing out the resulting dataset..."); + +write (X, fileX, format = fmt); +write (C, fileC, format = fmt); +write (Y, fileY, format = fmt); +write (Y_by_C, fileYbyC, format = fmt); + +print ("Please run the scoring script to compare " + fileY + " with " + fileYbyC); + +print ("DONE: K-MEANS GENERATOR SCRIPT"); + diff --git a/scripts/perftest/datagen/genRandData4LinearReg_LTstats.dml 
b/scripts/perftest/datagen/genRandData4LinearReg_LTstats.dml new file mode 100644 index 0000000000..9bb1ca189e --- /dev/null +++ b/scripts/perftest/datagen/genRandData4LinearReg_LTstats.dml @@ -0,0 +1,233 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# +# generates random data to test linear regression + +# $N = number of training samples +# $Nt = number of test samples (or 0 if none) +# $nf = number of features (independent variables) +# $nc = number of categories; = 1 if "binomial" with +1/-1 labels +# $Xmin = minimum feature value +# $Xmax = maximum feature value +# $spars = controls sparsity in the generated data +# $avgLTmin = average linear term (X %*% beta + intercept), minimum value +# $avgLTmax = average linear term (X %*% beta + intercept), maximum value +# $stdLT = requested standard deviation for the linear terms +# $iceptmin = intercept, minimum value (0.0 disables intercept) +# $iceptmax = intercept, maximum value (0.0 disables intercept) +# $B = location to store generated regression parameters +# $X = location to store generated training data +# $Y = location to store generated training responses +# $Xt = location to store generated test data +# $Yt = location to store generated test responses +# $fmt = format of the output +# +# Example: +# hadoop jar SystemDS.jar -f genRandData4LinearReg_LTstats.dml -nvargs +# N=1000000 Nt=1000 nf=20 nc=3 Xmin=0.0 Xmax=1.0 spars=1.0 avgLTmin=3.0 avgLTmax=5.0 stdLT=1.25 +# iceptmin=1.0 iceptmax=1.0 B=./B123 X=./X123 Y=./Y123 Xt=./Xt123 Yt=./Yt123 fmt=binary + +numTrainingSamples = $N; +numTestSamples = $Nt; +numFeatures = $nf; +numCategories = $nc; +minIntercept = $iceptmin; +maxIntercept = $iceptmax; +minXentry = $Xmin; +maxXentry = $Xmax; +minAvgLT = $avgLTmin; +maxAvgLT = $avgLTmax; +sparsityLevel = $spars; +stdevLT = $stdLT; +fileB = ifdef ($B, "B"); +fileX = ifdef ($X, "X"); +fileY = ifdef ($Y, "Y"); +fileXt = ifdef ($Xt, "Xt"); +fileYt = ifdef ($Yt, "Yt"); +fmt = ifdef ($fmt, "mm"); + +numSamples = numTrainingSamples + numTestSamples; + +isBinomialPMOne = FALSE; +if (numCategories == 1) { + numCategories = 2; + isBinomialPMOne = TRUE; +} 
+do_we_output_intercept = 1; +if (minIntercept == 0 & maxIntercept == 0) { + do_we_output_intercept = 0; +} + +X = Rand (rows = numSamples, cols = numFeatures, min = minXentry, max = maxXentry, pdf = "uniform", sparsity = sparsityLevel); + +meanLT = Rand (rows = 1, cols = numCategories - 1, min = minAvgLT, max = maxAvgLT, pdf = "uniform"); +sigmaLT = matrix (stdevLT, rows = 1, cols = numCategories - 1); +b_intercept = Rand (rows = 1, cols = numCategories - 1, min = minIntercept, max = maxIntercept, pdf = "uniform"); + +meanLT_minus_intercept = meanLT - b_intercept; +[B, new_sigmaLT] = generateWeights (X, meanLT_minus_intercept, sigmaLT); + +ones = matrix (1.0, rows = numSamples, cols = 1); +LT = X %*% B + ones %*% b_intercept; +actual_meanLT = colSums (LT) / numSamples; +actual_sigmaLT = sqrt (colSums ((LT - ones %*% actual_meanLT)^2) / numSamples); + +for (i in 1:(numCategories - 1)) { + if (as.scalar (new_sigmaLT [1, i]) == as.scalar (sigmaLT [1, i])) { + print ("Category " + i + ": Intercept = " + as.scalar (b_intercept [1, i])); + } else { + print ("Category " + i + ": Intercept = " + as.scalar (b_intercept [1, i]) + ", st.dev.(LT) relaxed from " + as.scalar (sigmaLT [1, i])); + } + print (" Wanted LT mean = " + as.scalar (meanLT [1, i]) + ", st.dev. = " + as.scalar (new_sigmaLT [1, i])); + print (" Actual LT mean = " + as.scalar (actual_meanLT [1, i]) + ", st.dev. 
= " + as.scalar (actual_sigmaLT [1, i])); +} + + +/* +ones = matrix (1.0, rows = 1, cols = numCategories - 1); +Prob = exp (LT); +Prob = Prob / ((1.0 + rowSums (Prob)) %*% ones); +Prob = t(cumsum (t(Prob))); + +r = Rand (rows = numSamples, cols = 1, min = 0, max = 1, pdf = "uniform", seed = 0); +R = r %*% ones; +Y = 1 + rowSums (Prob < R); +if (isBinomialPMOne) { + Y = 3 - 2 * Y; +} +*/ + +/* USE FOR LINEAR REGRESSION */ + +r = Rand (rows = numSamples, cols = 1, pdf = "normal"); +Y = LT [, 1] + r; + + +if (do_we_output_intercept == 1) { + new_B = matrix (0.0, rows = nrow(B) + 1, cols = ncol(B)); + new_B [1:nrow(B), 1:ncol(B)] = B; + new_B [nrow(B)+1, 1:ncol(B)] = b_intercept; + write (new_B, fileB, format=fmt); +} else { + write (B, fileB, format=fmt); +} + +if (numTestSamples > 0) { + X_train = X [1:numTrainingSamples,]; + Y_train = Y [1:numTrainingSamples,]; + X_test = X [(numTrainingSamples+1):numSamples,]; + Y_test = Y [(numTrainingSamples+1):numSamples,]; + write (X_train, fileX, format=fmt); + write (Y_train, fileY, format=fmt); + write (X_test, fileXt, format=fmt); + write (Y_test, fileYt, format=fmt); +} else { + write (X, fileX, format=fmt); + write (Y, fileY, format=fmt); +} + + + + + + +# Generates weight vectors to ensure the desired statistics for Linear Terms = X %*% W +# To be used for data generation in the testing of GLM, Logistic Regression, etc. +# INPUT: meanLT and sigmaLT are row vectors, meanLT[1, i] and sigmaLT[1, i] are +# the desired mean and standard deviation for X %*% W[, i] +# OUTPUT: "W" is the matrix of generated (column) weight vectors W[, i] +# new_sigmaLT[1, i] == sigmaLT[1, i] if the std.dev is successfully enforced, +# new_sigmaLT[1, i] > sigmaLT[1, i] if we had to relax this constraint. 
+generateWeights = + function (Matrix[double] X, Matrix[double] meanLT, Matrix[double] sigmaLT) + return (Matrix[double] W, Matrix[double] new_sigmaLT) +{ + num_w = ncol (meanLT); # Number of output weight vectors + dim_w = ncol (X); # Number of features / dimensions in a weight vector + w_X = t(colSums(X)); # "Prohibited" weight shift direction that changes meanLT + # (all orthogonal shift directions do not affect meanLT) + + # Compute "w_1" with meanLT = 1 and with the smallest possible sigmaLT + + w_1 = straightenX (X); + r_1 = (X %*% w_1) - 1.0; + norm_r_1_sq = sum (r_1 ^ 2); + + # For each W[, i] generate uniformly random directions to shift away from "w_1" + + DW_raw = Rand (rows = dim_w, cols = num_w, pdf = "normal"); + DW = DW_raw - (w_X %*% t(w_X) %*% DW_raw) / sum (w_X ^ 2); # Orthogonal to w_X + XDW = X %*% DW; + + # Determine how far to shift in the chosen directions to satisfy the constraints + # Use the positive root of the quadratic equation; relax sigmaLT where needed + + a_qe = colSums (XDW ^ 2); + b_qe = 2.0 * meanLT * (t(r_1) %*% XDW); + c_qe = meanLT^2 * norm_r_1_sq - sigmaLT^2 * nrow(X); + + is_sigmaLT_OK = (c_qe <= 0); + new_sigmaLT = is_sigmaLT_OK * sigmaLT + (1 - is_sigmaLT_OK) * abs (meanLT) * sqrt (norm_r_1_sq / nrow(X)); + c_qe = is_sigmaLT_OK * c_qe; + x_qe = (- b_qe + sqrt (b_qe * b_qe - 4.0 * a_qe * c_qe)) / (2.0 * a_qe); + + # Scale and shift "w_1" in the "DW" directions to produce the result: + + ones = matrix (1.0, rows = dim_w, cols = 1); + W = w_1 %*% meanLT + DW * (ones %*% x_qe); +} + +# Computes vector w such that ||X %*% w - 1|| -> MIN given avg(X %*% w) = 1 +# We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale +# it to compute w = c * z_LS such that sum(X %*% w) = nrow(X). 
+straightenX = + function (Matrix[double] X) + return (Matrix[double] w) +{ + w_X = t(colSums(X)); + lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X); + eps = 0.000000001 * nrow(X); + + # BEGIN LEAST SQUARES + + r_LS = - w_X; + z_LS = matrix (0.0, rows = ncol(X), cols = 1); + p_LS = - r_LS; + norm_r2_LS = sum (r_LS ^ 2); + i_LS = 0; + while (i_LS < 50 & i_LS < ncol(X) & norm_r2_LS >= eps) + { + temp_LS = X %*% p_LS; + q_LS = (t(X) %*% temp_LS) + lambda_LS * p_LS; + alpha_LS = norm_r2_LS / sum (p_LS * q_LS); + z_LS = z_LS + alpha_LS * p_LS; + old_norm_r2_LS = norm_r2_LS; + r_LS = r_LS + alpha_LS * q_LS; + norm_r2_LS = sum (r_LS ^ 2); + p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS; + i_LS = i_LS + 1; + } + + # END LEAST SQUARES + + w = (nrow(X) / sum (w_X * z_LS)) * z_LS; +} diff --git a/scripts/perftest/datagen/genRandData4LinearRegression.dml b/scripts/perftest/datagen/genRandData4LinearRegression.dml new file mode 100644 index 0000000000..ebce4f30d1 --- /dev/null +++ b/scripts/perftest/datagen/genRandData4LinearRegression.dml @@ -0,0 +1,61 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# generates data to test linear regression + +# $1 is number of samples +# $2 is number of features (independent variables) +# $3 is maximum feature value (absolute value) +# $4 is maximum weight (absolute value) +# $5 is location to store generated weights +# $6 is location to store generated data +# $7 is location to store generated labels +# $8 is 0/1. 0 suppresses noise, 1 will add noise to Y +# $9 is b, 0 disables intercept +# $10 controls sparsity in the generated data +# $11 output format + +numSamples = $1 +numFeatures = $2 +maxFeatureValue = $3 +maxWeight = $4 +addNoise = $8 +b = $9 +fmt = $11 + +X = Rand(rows=numSamples, cols=numFeatures, min=-1, max=1, pdf="uniform", seed=0, sparsity=$10) +w = Rand(rows=numFeatures, cols=1, min=-1, max=1, pdf="uniform", seed=0) +X = X * maxFeatureValue +w = w * maxWeight +Y = X %*% w + +if( b != 0 ) { + b_mat = Rand(rows=1, cols=1, min=b, max=b, pdf="uniform") + w = rbind(w, t(b_mat)) + Y = Y + b +} + +noise = Rand(rows=numSamples, cols=1, pdf="normal", seed=0) +Y = Y + addNoise*noise + +write(w, $5, format=fmt) +write(X, $6, format=fmt) +write(Y, $7, format=fmt) diff --git a/scripts/perftest/datagen/genRandData4LogReg_LTstats.dml b/scripts/perftest/datagen/genRandData4LogReg_LTstats.dml new file mode 100644 index 0000000000..f95342f708 --- /dev/null +++ b/scripts/perftest/datagen/genRandData4LogReg_LTstats.dml @@ -0,0 +1,233 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# generates random data to test bi- and multinomial logistic regression + +# $N = number of training samples +# $Nt = number of test samples (or 0 if none) +# $nf = number of features (independent variables) +# $nc = number of categories; = 1 if "binomial" with +1/-1 labels +# $Xmin = minimum feature value +# $Xmax = maximum feature value +# $spars = controls sparsity in the generated data +# $avgLTmin = average linear term (X %*% beta + intercept), minimum value +# $avgLTmax = average linear term (X %*% beta + intercept), maximum value +# $stdLT = requested standard deviation for the linear terms +# $iceptmin = intercept, minimum value (0.0 disables intercept) +# $iceptmax = intercept, maximum value (0.0 disables intercept) +# $B = location to store generated regression parameters +# $X = location to store generated training data +# $Y = location to store generated training category labels +# $Xt = location to store generated test data +# $Yt = location to store generated test category labels +# +# Example: +# hadoop jar SystemDS.jar -f genRandData4LogReg_LTstats.dml -nvargs +# N=1000000 Nt=1000 nf=20 nc=3 Xmin=0.0 Xmax=1.0 spars=1.0 avgLTmin=3.0 avgLTmax=5.0 stdLT=1.25 +# iceptmin=1.0 iceptmax=1.0 B=./B123 X=./X123 Y=./Y123 Xt=./Xt123 Yt=./Yt123 + +numTrainingSamples = $N; +numTestSamples = $Nt; +numFeatures = $nf; +numCategories = $nc; +minIntercept = $iceptmin; +maxIntercept = $iceptmax; +minXentry = $Xmin; +maxXentry = $Xmax; +minAvgLT = $avgLTmin; +maxAvgLT = $avgLTmax; 
+sparsityLevel = $spars; +stdevLT = $stdLT; +fileB = ifdef ($B, "B"); +fileX = ifdef ($X, "X"); +fileY = ifdef ($Y, "Y"); +fileXt = ifdef ($Xt, "Xt"); +fileYt = ifdef ($Yt, "Yt"); + + +numSamples = numTrainingSamples + numTestSamples; + +isBinomialPMOne = FALSE; +if (numCategories == 1) { + numCategories = 2; + isBinomialPMOne = TRUE; +} +do_we_output_intercept = 1; +if (minIntercept == 0 & maxIntercept == 0) { + do_we_output_intercept = 0; +} + +X = Rand (rows = numSamples, cols = numFeatures, min = minXentry, max = maxXentry, pdf = "uniform", sparsity = sparsityLevel); + +meanLT = Rand (rows = 1, cols = numCategories - 1, min = minAvgLT, max = maxAvgLT, pdf = "uniform"); +sigmaLT = matrix (stdevLT, rows = 1, cols = numCategories - 1); +b_intercept = Rand (rows = 1, cols = numCategories - 1, min = minIntercept, max = maxIntercept, pdf = "uniform"); + +meanLT_minus_intercept = meanLT - b_intercept; +[B, new_sigmaLT] = generateWeights (X, meanLT_minus_intercept, sigmaLT); + +ones = matrix (1.0, rows = numSamples, cols = 1); +LT = X %*% B + ones %*% b_intercept; +actual_meanLT = colSums (LT) / numSamples; +actual_sigmaLT = sqrt (colSums ((LT - ones %*% actual_meanLT)^2) / numSamples); + +for (i in 1:(numCategories - 1)) { + if (as.scalar (new_sigmaLT [1, i]) == as.scalar (sigmaLT [1, i])) { + print ("Category " + i + ": Intercept = " + as.scalar (b_intercept [1, i])); + } else { + print ("Category " + i + ": Intercept = " + as.scalar (b_intercept [1, i]) + ", st.dev.(LT) relaxed from " + as.scalar (sigmaLT [1, i])); + } + print (" Wanted LT mean = " + as.scalar (meanLT [1, i]) + ", st.dev. = " + as.scalar (new_sigmaLT [1, i])); + print (" Actual LT mean = " + as.scalar (actual_meanLT [1, i]) + ", st.dev. 
= " + as.scalar (actual_sigmaLT [1, i])); +} + + +ones = matrix (1.0, rows = 1, cols = numCategories - 1); +Prob = exp (LT); +Prob = Prob / ((1.0 + rowSums (Prob)) %*% ones); +Prob = t(cumsum (t(Prob))); + +r = Rand (rows = numSamples, cols = 1, min = 0, max = 1, pdf = "uniform", seed = 0); +R = r %*% ones; +Y = 1 + rowSums (Prob < R); +if (isBinomialPMOne) { + Y = 3 - 2 * Y; +} + + +/* USE FOR LINEAR REGRESSION + +r = Rand (rows = numSamples, cols = 1, pdf = "normal"); +Y = LT [, 1] + r; + +*/ + + +if (do_we_output_intercept == 1) { + new_B = matrix (0.0, rows = nrow(B) + 1, cols = ncol(B)); + new_B [1:nrow(B), 1:ncol(B)] = B; + new_B [nrow(B)+1, 1:ncol(B)] = b_intercept; + write (new_B, fileB, format="mm"); +} else { + write (B, fileB, format="mm"); +} + +if (numTestSamples > 0) { + X_train = X [1:numTrainingSamples,]; + Y_train = Y [1:numTrainingSamples,]; + X_test = X [(numTrainingSamples+1):numSamples,]; + Y_test = Y [(numTrainingSamples+1):numSamples,]; + write (X_train, fileX, format="mm"); + write (Y_train, fileY, format="mm"); + write (X_test, fileXt, format="mm"); + write (Y_test, fileYt, format="mm"); +} else { + write (X, fileX, format="mm"); + write (Y, fileY, format="mm"); +} + + + + + + +# Generates weight vectors to ensure the desired statistics for Linear Terms = X %*% W +# To be used for data generation in the testing of GLM, Logistic Regression, etc. +# INPUT: meanLT and sigmaLT are row vectors, meanLT[1, i] and sigmaLT[1, i] are +# the desired mean and standard deviation for X %*% W[, i] +# OUTPUT: "W" is the matrix of generated (column) weight vectors W[, i] +# new_sigmaLT[1, i] == sigmaLT[1, i] if the std.dev is successfully enforced, +# new_sigmaLT[1, i] > sigmaLT[1, i] if we had to relax this constraint. 
+generateWeights = + function (Matrix[double] X, Matrix[double] meanLT, Matrix[double] sigmaLT) + return (Matrix[double] W, Matrix[double] new_sigmaLT) +{ + num_w = ncol (meanLT); # Number of output weight vectors + dim_w = ncol (X); # Number of features / dimensions in a weight vector + w_X = t(colSums(X)); # "Prohibited" weight shift direction that changes meanLT + # (all orthogonal shift directions do not affect meanLT) + + # Compute "w_1" with meanLT = 1 and with the smallest possible sigmaLT + + w_1 = straightenX (X); + r_1 = (X %*% w_1) - 1.0; + norm_r_1_sq = sum (r_1 ^ 2); + + # For each W[, i] generate uniformly random directions to shift away from "w_1" + + DW_raw = Rand (rows = dim_w, cols = num_w, pdf = "normal"); + DW = DW_raw - (w_X %*% t(w_X) %*% DW_raw) / sum (w_X ^ 2); # Orthogonal to w_X + XDW = X %*% DW; + + # Determine how far to shift in the chosen directions to satisfy the constraints + # Use the positive root of the quadratic equation; relax sigmaLT where needed + + a_qe = colSums (XDW ^ 2); + b_qe = 2.0 * meanLT * (t(r_1) %*% XDW); + c_qe = meanLT^2 * norm_r_1_sq - sigmaLT^2 * nrow(X); + + is_sigmaLT_OK = (c_qe <= 0); + new_sigmaLT = is_sigmaLT_OK * sigmaLT + (1 - is_sigmaLT_OK) * abs (meanLT) * sqrt (norm_r_1_sq / nrow(X)); + c_qe = is_sigmaLT_OK * c_qe; + x_qe = (- b_qe + sqrt (b_qe * b_qe - 4.0 * a_qe * c_qe)) / (2.0 * a_qe); + + # Scale and shift "w_1" in the "DW" directions to produce the result: + + ones = matrix (1.0, rows = dim_w, cols = 1); + W = w_1 %*% meanLT + DW * (ones %*% x_qe); +} + +# Computes vector w such that ||X %*% w - 1|| -> MIN given avg(X %*% w) = 1 +# We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale +# it to compute w = c * z_LS such that sum(X %*% w) = nrow(X). 
+straightenX = + function (Matrix[double] X) + return (Matrix[double] w) +{ + w_X = t(colSums(X)); + lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X); + eps = 0.000000001 * nrow(X); + + # BEGIN LEAST SQUARES + + r_LS = - w_X; + z_LS = matrix (0.0, rows = ncol(X), cols = 1); + p_LS = - r_LS; + norm_r2_LS = sum (r_LS ^ 2); + i_LS = 0; + while (i_LS < 50 & i_LS < ncol(X) & norm_r2_LS >= eps) + { + temp_LS = X %*% p_LS; + q_LS = (t(X) %*% temp_LS) + lambda_LS * p_LS; + alpha_LS = norm_r2_LS / sum (p_LS * q_LS); + z_LS = z_LS + alpha_LS * p_LS; + old_norm_r2_LS = norm_r2_LS; + r_LS = r_LS + alpha_LS * q_LS; + norm_r2_LS = sum (r_LS ^ 2); + p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS; + i_LS = i_LS + 1; + } + + # END LEAST SQUARES + + w = (nrow(X) / sum (w_X * z_LS)) * z_LS; +} diff --git a/scripts/perftest/datagen/genRandData4LogisticRegression.dml b/scripts/perftest/datagen/genRandData4LogisticRegression.dml new file mode 100644 index 0000000000..f0850938ad --- /dev/null +++ b/scripts/perftest/datagen/genRandData4LogisticRegression.dml @@ -0,0 +1,72 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# generates random data to test linear logistic regression + +# $1 is number of samples +# $2 is number of features (independent variables) +# $3 is maximum feature value (absolute value) +# $4 is maximum weight (absolute value) +# $5 is location to store generated weights +# $6 is location to store generated data +# $7 is location to store generated labels +# $8 addNoise. if 0 then no noise is added, to add noise set this to 1 +# $9 is b, 0 disables intercept +# $10 controls sparsity in the generated data +# $11 output format +# $12 transform labels. if 0 then -1/1; otherwise 1/2 + +numSamples = $1 +numFeatures = $2 +maxFeatureValue = $3 +maxWeight = $4 +addNoise = $8 +b = $9 + +X = Rand(rows=numSamples, cols=numFeatures, min=-1, max=1, pdf="uniform", seed=0, sparsity=$10) +X = X * maxFeatureValue + +w = Rand(rows=numFeatures, cols=1, min=-1, max=1, pdf="uniform", seed=0) +w = w * maxWeight + +ot = X %*% w +if( b != 0) { + b_mat = Rand(rows=1, cols=1, min=b, max=b, pdf="uniform") + w = rbind(w, t(b_mat)) + ot = ot + b +} + +prob = 1 / (1 + exp(-ot)) +if( addNoise == 1 ){ + r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0) +} +else { + print("this data generator generates the same dataset for both noise=0 and noise=1") + r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0) +} + +Y = 1 - 2 * (prob < r) +if( $12 == 1 ) + Y = (Y + 3) / 2 + +write(w, $5, format=$11) +write(X, $6, format=$11) +write(Y, $7, format=$11) diff --git a/scripts/perftest/datagen/genRandData4MultiClassSVM.dml b/scripts/perftest/datagen/genRandData4MultiClassSVM.dml new file mode 100644 index 0000000000..011b4dab18 --- /dev/null +++ b/scripts/perftest/datagen/genRandData4MultiClassSVM.dml @@ -0,0 +1,68 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# generates random data to test multi-class SVM + +# $1 is number of samples +# $2 is number of features (independent variables) +# $3 is maximum feature value (absolute value) +# $4 is maximum weight (absolute value) +# $5 is location to store generated weights +# $6 is location to store generated data +# $7 is location to store generated labels +# $8 addNoise. 
if 0 then no noise is added, to add noise set this to 1 +# $9 is b, 0 disables intercept +# $10 controls sparsity in the generated data + +numSamples = $1 +numFeatures = $2 +maxFeatureValue = $3 +maxWeight = $4 +addNoise = $8 +b = $9 + +X = Rand(rows=numSamples, cols=numFeatures, min=-1, max=1, pdf="uniform", seed=0, sparsity=$10) +X = X * maxFeatureValue + +w = Rand(rows=numFeatures, cols=1, min=-1, max=1, pdf="uniform", seed=0) +w = w * maxWeight + +ot = X%*%w +if(b!=0) { + b_mat = Rand(rows=1, cols=1, min=b, max=b, pdf="uniform") + w = t(cbind(t(w), b_mat)) + ot = ot + b +} + +prob = 1/(1+exp(-ot)) +if(addNoise == 1){ + r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0) +}else{ + print("this data generator generates the same dataset for both noise=0 and noise=1") + r = Rand(rows=numSamples, cols=1, min=0, max=1, pdf="uniform", seed=0) + #r = Rand(rows=numSamples, cols=1, min=0.5, max=0.5, pdf="uniform") +} +Y = 1 - 2 * (prob < r) +Y = (Y+3)/2 + +write(w, $5, format="binary") +write(X, $6, format="binary") +write(Y, $7, format="binary") diff --git a/scripts/perftest/datagen/genRandData4Multinomial.dml b/scripts/perftest/datagen/genRandData4Multinomial.dml new file mode 100644 index 0000000000..93666758b5 --- /dev/null +++ b/scripts/perftest/datagen/genRandData4Multinomial.dml @@ -0,0 +1,66 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +num_records = $1; +num_features = $2; + +p = $3; #sparsity +num_categories = $4; #num classes +is_intercept = $5==1; + +stdevLT = 1.0; +beta_range = 3.0 * stdevLT / sqrt (num_features * p); + +if (is_intercept) { + intercept = Rand (rows = 1, cols = num_categories - 1, min = -1.0, max = 1.0); +} + +X = Rand( rows = num_records, + cols = num_features, + min = 1, + max = 5, + pdf = "uniform", + sparsity = p ); + +B = Rand (rows = num_features, + cols = num_categories - 1, + min = -1.0, + max = 1.0, + pdf = "uniform", + sparsity = 1.0) * beta_range; + +LT = X %*% B; +if (is_intercept) { + LT = LT + matrix (1, rows = num_records, cols = 1) %*% intercept; +} + +Prob = exp (LT); +Prob = Prob / (1.0 + rowSums(Prob)); +Prob = t(cumsum (t(Prob))); + +r = Rand (rows = num_records, cols = 1, min = 0, max = 1, pdf = "uniform"); +Y = 1 + rowSums (Prob < r); + +# ensure all classes are represented +Y[(num_records-num_categories+1):num_records,1] = seq(1,num_categories); + +write(X, $6, format=$8) +write(Y, $7, format=$8); \ No newline at end of file diff --git a/scripts/perftest/datagen/genRandData4NMF.dml b/scripts/perftest/datagen/genRandData4NMF.dml new file mode 100644 index 0000000000..a82ac4e0f1 --- /dev/null +++ b/scripts/perftest/datagen/genRandData4NMF.dml @@ -0,0 +1,129 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# generates random data for non-negative +# matrix factorization +# +# follows lda's generative model +# see Blei, Ng & Jordan, JMLR'03 paper +# titled Latent Dirichlet Allocation +# +# $1 is number of samples +# $2 is number of features +# $3 is number of latent factors +# $4 is number of features per sample +# (may overlap). use this to vary +# sparsity. 
+# $5 is file to store sample mixtures +# $6 is file to store factors +# $7 is file to store generated data + +numDocuments = $1 +numFeatures = $2 +numTopics = $3 +numWordsPerDoc = $4 + +docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75) +denomsTM = rowSums(docTopicMixtures) +zerosInDenomsTM = denomsTM == 0 +denomsTM = 0.1*zerosInDenomsTM + (1-zerosInDenomsTM)*denomsTM +parfor(i in 1:numTopics){ + docTopicMixtures[,i] = docTopicMixtures[,i]/denomsTM +} +write(docTopicMixtures, $5, format="binary") +for(j in 2:numTopics){ + docTopicMixtures[,j] = docTopicMixtures[,j-1] + docTopicMixtures[,j] +} + +topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75) +parfor(i in 1:numTopics){ + topicDist = topicDistributions[i,] + + denom2 = sum(topicDist) + if(denom2 == 0){ + denom2 = denom2 + 0.1 + } + + topicDistributions[i,] = topicDist / denom2 +} +write(topicDistributions, $6, format="binary") +for(j in 2:numFeatures){ + topicDistributions[,j] = topicDistributions[,j-1] + topicDistributions[,j] +} + +data = Rand(rows=numDocuments, cols=numFeatures, min=0, max=0, pdf="uniform") + +parfor(i in 1:numDocuments){ + docTopic = docTopicMixtures[i,] + + ldata = Rand(rows=1, cols=numFeatures, min=0, max=0, pdf="uniform"); + + r_z = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0) + r_w = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0) + + for(j in 1:numWordsPerDoc){ + rz = as.scalar(r_z[j,1]) + continue = 1 + + z = -1 + #this is a workaround + #z=1 + + for(k1 in 1:numTopics){ + prob = as.scalar(docTopic[1,k1]) + if(continue==1 & rz <= prob){ + z=k1 + continue=0 + } + } + + if(z==-1){ + print("z is unassigned: " + z) + z = numTopics + } + + rw = as.scalar(r_w[j,1]) + continue = 1 + + w = -1 + #this is a workaround + #w = 1 + + for(k2 in 1:numFeatures){ + prob = as.scalar(topicDistributions[z,k2]) + if(continue == 1 & 
rw <= prob){ + w = k2 + continue = 0 + } + } + + if(w==-1){ + print("w is unassigned: " + w) + w = numFeatures + } + + ldata[1,w] = ldata[1,w] + 1 + } + + data[i,] = ldata; +} + +write(data, $7, format="binary") diff --git a/scripts/perftest/datagen/genRandData4NMFBlockwise.dml b/scripts/perftest/datagen/genRandData4NMFBlockwise.dml new file mode 100644 index 0000000000..0ad548ead2 --- /dev/null +++ b/scripts/perftest/datagen/genRandData4NMFBlockwise.dml @@ -0,0 +1,138 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# generates random data for non-negative +# matrix factorization +# +# follows lda's generative model +# see Blei, Ng & Jordan, JMLR'03 paper +# titled Latent Dirichlet Allocation +# +# $1 is number of samples +# $2 is number of features +# $3 is number of latent factors +# $4 is number of features per sample +# (may overlap). use this to vary +# sparsity. 
+# $5 is file to store sample mixtures +# $6 is file to store factors +# $7 is file to store generated data +# +# $8 is the blocksize, i.e., number of rows per block +# (should be set such that $8x$2 fits in mem budget) + +numDocuments = $1 +numFeatures = $2 +numTopics = $3 +numWordsPerDoc = $4 +blocksize = $8 + +docTopicMixtures = Rand(rows=numDocuments, cols=numTopics, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75) +denomsTM = rowSums(docTopicMixtures) +zerosInDenomsTM = (denomsTM == 0) +denomsTM = 0.1*zerosInDenomsTM + (1-zerosInDenomsTM)*denomsTM +parfor(i in 1:numTopics){ + docTopicMixtures[,i] = docTopicMixtures[,i]/denomsTM +} +write(docTopicMixtures, $5, format="binary") +for(j in 2:numTopics){ + docTopicMixtures[,j] = docTopicMixtures[,j-1] + docTopicMixtures[,j] +} + +topicDistributions = Rand(rows=numTopics, cols=numFeatures, min=0.0, max=1.0, pdf="uniform", seed=0, sparsity=0.75) +parfor(i in 1:numTopics){ + topicDist = topicDistributions[i,] + + denom2 = sum(topicDist) + if(denom2 == 0){ + denom2 = denom2 + 0.1 + } + + topicDistributions[i,] = topicDist / denom2 +} +write(topicDistributions, $6, format="binary") +for(j in 2:numFeatures){ + topicDistributions[,j] = topicDistributions[,j-1] + topicDistributions[,j] +} + +data0 = Rand(rows=numDocuments, cols=numFeatures, min=0, max=0, pdf="uniform") + +#outer-loop for blockwise computation +#(block k covers the global rows k..k+len-1; blocks must not overlap) +for( k in seq(1,numDocuments,blocksize) ) +{ + len = min(blocksize,numDocuments-k+1); #block length (last block may be shorter) + data = data0[k:(k+len-1),]; #obtain block of exactly len rows + + parfor(i in 1:len){ + docTopic = docTopicMixtures[k+i-1,] #mixture of the i-th document of this block (global row k+i-1) + + r_z = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0) + r_w = Rand(rows=numWordsPerDoc, cols=1, min=0, max=1, pdf="uniform", seed=0) + + for(j in 1:numWordsPerDoc){ + rz = as.scalar(r_z[j,1]) + continue = 1 + + z = -1 + #this is a workaround + #z=1 + + for(k1 in 1:numTopics){ + prob = as.scalar(docTopic[1,k1]) + if(continue==1 & rz <= prob){ + z=k1 + continue=0 + } + } + + if(z==-1){ + 
print("z is unassigned: " + z) + z = numTopics + } + + rw = as.scalar(r_w[j,1]) + continue = 1 + + w = -1 + #this is a workaround + #w = 1 + + for(k2 in 1:numFeatures){ + prob = as.scalar(topicDistributions[z,k2]) + if(continue == 1 & rw <= prob){ + w = k2 + continue = 0 + } + } + + if(w==-1){ + print("w is unassigned: " + w) + w = numFeatures + } + + data[i,w] = data[i,w] + 1 + } + } + + data0[k:(k+len-1),] = data; # write block back into the same len rows +} + +write(data0, $7, format="binary") diff --git a/scripts/perftest/datagen/genRandData4PCA.dml b/scripts/perftest/datagen/genRandData4PCA.dml new file mode 100644 index 0000000000..413d5c458e --- /dev/null +++ b/scripts/perftest/datagen/genRandData4PCA.dml @@ -0,0 +1,61 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# +# Synthetic data generator for PCA +# 3 hidden dimensions (V1, V2, V3) +# generates only "dense" data +# +# INPUT PARAMETERS: +# -------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# -------------------------------------------------------------------------------------------- +# R Int 10000 Number of rows +# C Int 1000 Number of columns (numeric attributes) +# OUT String --- Location (on HDFS) to store the generated dataset +# FMT String "csv" Matrix output format, usually "text", "csv" or "binary" +# -------------------------------------------------------------------------------------------- +# +# Example: +# hadoop jar SystemDS.jar -f genRandData4PCA.dml -nvargs R=1000000 C=1000 OUT=/user/biuser/pcaData.mtx FMT=csv + +R = ifdef ($R, 10000) +C = ifdef ($C, 1000) +FMT = ifdef ($FMT, "csv"); + +# Modified version of the procedure from Zou et al., "Sparse Principal Component Analysis", 2006. 
+ +# V1 ~ N(0,290); V2~N(0,300); V3 = -0.3V1+0.925V2 + e, e ~ N(0,1) +V1 = 0 + 290*rand(rows=R, cols=1, pdf="normal"); +V2 = 0 + 300*rand(rows=R, cols=1, pdf="normal"); +V3 = -0.3*V1 + 0.925*V2 + rand(rows=R, cols=1, pdf="normal"); + +C1 = ceil(C/2.5); +C2 = ceil(C/2.5); +C3 = C - C1 - C2; + +M = matrix(0, rows=R, cols=C) + +M[,1:C1] = rand(rows=R, cols=C1, pdf="normal") + V1; +M[,C1+1:C1+C2] = rand(rows=R, cols=C2, pdf="normal") + V2; +M[,C1+C2+1:C] = rand(rows=R, cols=C3, pdf="normal") + V3; + +write(M, $OUT, format=FMT); diff --git a/scripts/perftest/datagen/genRandData4StratStats.dml b/scripts/perftest/datagen/genRandData4StratStats.dml new file mode 100644 index 0000000000..6a4c07f734 --- /dev/null +++ b/scripts/perftest/datagen/genRandData4StratStats.dml @@ -0,0 +1,155 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# THIS SCRIPT GENERATES SYNTHETIC DATA FOR STRATSTATS (STRATIFIED STATISTICS) TESTING +# +# INPUT PARAMETERS: +# -------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# -------------------------------------------------------------------------------------------- +# nr Int 100000 Number of records in the generated dataset +# nf Int 10 Number of features in the X and the Y parts of the generated dataset +# smin Int 10000 Minimum stratum value, a positive integer +# smax Int 20000 Maximum stratum value, a positive integer +# prs Double 100.0 How many times more likely to have maximum vs. minimum stratum value +# pxnan Double 0.05 Probability of a NaN replacing a value in X +# pynan Double 0.05 Probability of a NaN replacing a value in Y +# psnan Double 0.05 Probability of a NaN replacing a value in the stratum column +# -------------------------------------------------------------------------------------------- +# mxmin Double 31.0 Baseline (mean) value for the first feature in X +# mxmax Double 40.0 Baseline (mean) value for the last feature in X +# mymin Double 11.0 Baseline (mean) value for the first feature in Y (before adding X) +# mymax Double 20.0 Baseline (mean) value for the last feature in Y (before adding X) +# bmin Double 3.0 "Beta" multiplied by X before adding to Y, for the first feature +# bmax Double 3.0 "Beta" multiplied by X before adding to Y, for the last feature +# -------------------------------------------------------------------------------------------- +# sxbmin Double 3.0 Standard deviation for the first feature in X, stratum dependent +# sxbmax Double 3.0 Standard deviation for the last feature in X, stratum dependent +# sxwmin Double 4.0 Standard deviation for the first feature in X, residual +# sxwmax Double 4.0 Standard deviation for the last feature in X, residual +# sybmin Double sqrt(28) Standard 
deviation for the first feature in Y, stratum dependent +# sybmax Double sqrt(28) Standard deviation for the last feature in Y, stratum dependent +# sywmin Double 6.0 Standard deviation for the first feature in Y, residual +# sywmax Double 6.0 Standard deviation for the last feature in Y, residual +# -------------------------------------------------------------------------------------------- +# D String "Data" Location (on HDFS) to store the generated dataset +# Xcid String "Xcid" Location (on HDFS) to store the column indices of X features +# Ycid String "Ycid" Location (on HDFS) to store the column indices of Y features +# A String "Aux" Location (on HDFS) to store the auxiliary parameter values, if any +# fmt String "text" Matrix output format, usually "text", "mm", or "csv" +# -------------------------------------------------------------------------------------------- +# OUTPUT: Matrix with the generated dataset, Xcid and Ycid, and possibly other auxiliaries + +num_records = ifdef ($nr, 100000); +num_features = ifdef ($nf, 10); +min_stratumID = ifdef ($smin, 10000); +max_stratumID = ifdef ($smax, 20000); +prob_ratio_min_to_max_stratumID = ifdef ($prs, 100); +prob_NaN_in_X = ifdef ($pxnan, 0.05); +prob_NaN_in_Y = ifdef ($pynan, 0.05); +prob_NaN_in_stratum = ifdef ($psnan, 0.05); + +mean_X_min = ifdef ($mxmin, 31.0); +mean_X_max = ifdef ($mxmax, 40.0); +mean_Y_min = ifdef ($mymin, 11.0); +mean_Y_max = ifdef ($mymax, 20.0); +beta_min = ifdef ($bmin, 3.0); +beta_max = ifdef ($bmax, 3.0); + +stdev_X_between_strata_min = ifdef ($sxbmin, 3.0); +stdev_X_between_strata_max = ifdef ($sxbmax, 3.0); +stdev_X_within_strata_min = ifdef ($sxwmin, 4.0); +stdev_X_within_strata_max = ifdef ($sxwmax, 4.0); +stdev_Y_between_strata_min = ifdef ($sybmin, sqrt(28.0)); +stdev_Y_between_strata_max = ifdef ($sybmax, sqrt(28.0)); +stdev_Y_within_strata_min = ifdef ($sywmin, 6.0); +stdev_Y_within_strata_max = ifdef ($sywmax, 6.0); + +fileData = ifdef ($D, "Data"); +fileXcid = ifdef 
($Xcid, "Xcid"); +fileYcid = ifdef ($Ycid, "Ycid"); +fileAux = ifdef ($A, "Aux" ); +fmt = ifdef ($fmt, "text"); + +# Generate the strata, from 1 to (max_stratumID - min_stratumID + 1), as multinomial +# in which 1 is less likely than (max_stratumID - min_stratumID + 1) by a factor of +# prob_ratio_min_to_max_stratumID + +r_power = (max_stratumID - min_stratumID) / log (prob_ratio_min_to_max_stratumID); +r_bound = prob_ratio_min_to_max_stratumID ^ (1.0 + 1.0 / (max_stratumID - min_stratumID)); + +if (r_bound < 1.0) { + R_S = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf = "uniform"); + R_S = r_bound + R_S * (1.0-r_bound); +} else { + R_S = Rand (rows = num_records, cols = 1, min = 0.0, max = 1.0, pdf = "uniform"); + R_S = 1.0 + R_S * (r_bound-1); +} + +SID = round (0.5 + log (R_S) * r_power); +num_strata = max (SID); +Smap = table (SID, seq (1, num_records, 1)); + +# Compute baseline values and standard deviations of X, Y, and beta, at each feature + +mean_X = mean_X_min + ((mean_X_max - mean_X_min) / (num_features - 1)) * seq (0, num_features - 1, 1); +mean_Y = mean_Y_min + ((mean_Y_max - mean_Y_min) / (num_features - 1)) * seq (0, num_features - 1, 1); +betas = beta_min + (( beta_max - beta_min) / (num_features - 1)) * seq (0, num_features - 1, 1); + +stdev_X_within_strata = stdev_X_within_strata_min + + ((stdev_X_within_strata_max - stdev_X_within_strata_min ) / (num_features - 1)) * seq (0, num_features - 1, 1); +stdev_X_between_strata = stdev_X_between_strata_min + + ((stdev_X_between_strata_max - stdev_X_between_strata_min) / (num_features - 1)) * seq (0, num_features - 1, 1); +stdev_Y_within_strata = stdev_Y_within_strata_min + + ((stdev_Y_within_strata_max - stdev_Y_within_strata_min ) / (num_features - 1)) * seq (0, num_features - 1, 1); +stdev_Y_between_strata = stdev_Y_between_strata_min + + ((stdev_Y_between_strata_max - stdev_Y_between_strata_min) / (num_features - 1)) * seq (0, num_features - 1, 1); + +# Generate X and Y matrices + 
+RX_strata = Rand (rows = num_features, cols = num_strata, pdf = "normal"); # transposed +RY_strata = Rand (rows = num_features, cols = num_strata, pdf = "normal"); # to allow +RX_records = Rand (rows = num_features, cols = num_records, pdf = "normal"); # matrix-vector +RY_records = Rand (rows = num_features, cols = num_records, pdf = "normal"); # operations + +t_X = RX_records * stdev_X_within_strata + (RX_strata * stdev_X_between_strata + mean_X) %*% Smap; +t_Y = RY_records * stdev_Y_within_strata + (RY_strata * stdev_Y_between_strata + mean_Y) %*% Smap + (t_X * betas); +Data = cbind (min_stratumID - 1 + SID, t(t_X), t(t_Y)); + +# Set up the NaNs + +RNaNS = Rand (rows = num_records, cols = 1, min = 1.0, max = 1.0, sparsity = prob_NaN_in_stratum); +RNaNX = Rand (rows = num_records, cols = num_features, min = 1.0, max = 1.0, sparsity = prob_NaN_in_X); +RNaNY = Rand (rows = num_records, cols = num_features, min = 1.0, max = 1.0, sparsity = prob_NaN_in_Y); +Mask = cbind (RNaNS, RNaNX, RNaNY) != 0; +Data = Data + (1.0 - Mask) / (1.0 - Mask); + +# Output the dataset and the auxiliaries + +Xcid = t(seq (2, num_features + 1, 1)); +Ycid = t(seq (num_features + 2, 2 * num_features + 1, 1)); +Aux = cbind (mean_X, mean_Y, betas); + +write (Data, fileData, format=fmt); +write (Xcid, fileXcid, format=fmt); +write (Ycid, fileYcid, format=fmt); +write (Aux, fileAux, format=fmt); + diff --git a/scripts/perftest/datagen/genRandData4SurvAnalysis.dml b/scripts/perftest/datagen/genRandData4SurvAnalysis.dml new file mode 100644 index 0000000000..75117cf6d7 --- /dev/null +++ b/scripts/perftest/datagen/genRandData4SurvAnalysis.dml @@ -0,0 +1,133 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# THIS SCRIPT GENERATES RANDOM DATA FOR KAPLAN-MEIER AND COX PROPORTIONAL HAZARD MODELS +# ASSUMPTION: BASELINE HAZARD HAS WEIBULL DISTRIBUTION WITH PARAMETERS LAMBDA AND V +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# type String --- The type of model for which the data is being generated: "kaplan-meier" or "cox" +# n Int --- Number of records +# l Double 2.0 Scale parameter (lambda) of the Weibull distribution used for generating timestamps +# v Double 1.5 Shape parameter of the Weibull distribution used for generating timestamps; the default is 1.5 if type=kaplan-meier and 50 if type=cox +# p Double 0.8 1 - probability of a record being censored +# g Int 2 If type=kaplan-meier the number of categorical features used for grouping +# s Int 1 If type=kaplan-meier the number of categorical features used for stratifying +# f Int 10 If type=kaplan-meier maximum number of levels (i.e., distinct values) of g+s categorical features +# m Int 1000 If type=cox the number of features in the model +# sp Double 1.0 If type=cox the sparsity of the feature matrix +# O String --- Location to write the output matrix containing random data for the kaplan-meier or the cox model +# B String --- If type=cox
location to write the output matrix containing the coefficients for the cox model +# TE String --- Location to store column indices of X corresponding to timestamp (first row) and event information (second row) +# F String --- Location to store column indices of X which are to be used for fitting the Cox model +# fmt String "text" The output format of all matrices written by this script (O, B, TE, F), such as "text" or "csv" +# --------------------------------------------------------------------------------------------- +# OUTPUTS: +# 1- If type=kaplan-meier an n x (2+g+s) matrix O with +# - column 1 contains timestamps generated randomly from a Weibull distribution with parameters lambda and v (binned before writing) +# - column 2 contains the information whether an event occurred (1) or data is censored (0) +# - columns 3:2+g contain categorical features used for grouping +# - columns 3+g:2+g+s contain categorical features used for stratifying +# if type=cox an n x (2+m) matrix O with +# - column 1 contains timestamps generated randomly from a Weibull distribution with parameters lambda and v (binned before writing) +# - column 2 contains the information whether an event occurred (1) or data is censored (0) +# - columns 3:2+m contain scale features +# 2- If type=cox a coefficient matrix B +# 3- A column matrix TE containing the column indices of X corresponding to timestamp (first row) and event information (second row) +# 4- A column matrix F containing the column indices of X which are to be used for KM analysis or fitting the Cox model (the script always writes 1..m here; NOTE(review): for type=kaplan-meier X has only g+s columns, confirm this is intended) + +type = $type; # either "kaplan-meier" or "cox" +num_records = $n; +lambda = ifdef ($l, 2.0); +p_event = ifdef ($p, 0.8); # 1 - prob.
of a record being censored +# parameters related to the kaplan-meier model +n_groups = ifdef ($g, 2); +n_strata = ifdef ($s, 1); +max_level = ifdef ($f, 10); +# parameters related to the cox model +num_features = ifdef ($m, 1000); +sparsity = ifdef ($sp, 1.0); +fileO = $O; +fileB = $B; +fileTE = $TE; +fileF = $F; +fmtO = ifdef ($fmt, "text"); # $fmt="text" +p_censor = 1 - p_event; # prob. that record is censored + +if (type == "kaplan-meier") { + + v = ifdef ($v, 1.5); + # generate categorical features used for grouping and stratifying (ceil of a uniform draw below max_level) + X = ceil (rand (rows = num_records, cols = n_groups + n_strata, min = 0.000000001, max = max_level - 0.000000001, pdf = "uniform")); + + # generate timestamps as T = (-log (U) / lambda) ^ (1/v) from random U + U = rand (rows = num_records, cols = 1, min = 0.000000001, max = 1); + T = (-log (U) / lambda) ^ (1/v); + +} else if (type == "cox") { + + v = ifdef ($v, 50); + # generate feature matrix + X = rand (rows = num_records, cols = num_features, min = 1, max = 5, pdf = "uniform", sparsity = sparsity); + + # generate coefficients + B = rand (rows = num_features, cols = 1, min = -1.0, max = 1.0, pdf = "uniform", sparsity = 1.0); # * beta_range; + + # generate timestamps as in the kaplan-meier branch, with lambda scaled per record by exp (X %*% B) + U = rand (rows = num_records, cols = 1, min = 0.000000001, max = 1); + T = (-log (U) / (lambda * exp (X %*% B)) ) ^ (1/v); + +} else { + stop ("Wrong model type!"); +} + +Y = matrix (0, rows = num_records, cols = 2); +event = floor (rand (rows = num_records, cols = 1, min = (1 - p_censor), max = (1 + p_event))); # 1 = event occurred, 0 = censored +n_time = sum (event); +Y[,2] = event; + +# binning of event times: num_bins holds the bin width (range of T divided by the number of events) and T is replaced by ceil (T / width) +min_T = min (T); +max_T = max (T); +# T = T - min_T; +len = max_T - min_T; +num_bins = len / n_time; +T = ceil (T / num_bins); + +# print ("min(T) " + min(T) + " max(T) " + max(T)); +Y[,1] = T; + +O = cbind (Y, X); +write (O, fileO, format = fmtO); + +if (type == "cox") { + write (B, fileB, format = fmtO); + +} + +TE = matrix ("1 2", rows = 2, cols = 1); +F = seq (1, num_features); +write (TE, fileTE, format = fmtO); +write (F, fileF,
format = fmtO); + diff --git a/scripts/perftest/datagen/genRandData4Transform.dml b/scripts/perftest/datagen/genRandData4Transform.dml new file mode 100644 index 0000000000..edab7c2873 --- /dev/null +++ b/scripts/perftest/datagen/genRandData4Transform.dml @@ -0,0 +1,96 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
+# +#------------------------------------------------------------- + +# +# Generates random data to test transform with +# +# rows, cols: dimensions of the data matrix to be generated (defaults: 1000, 25) +# prob_cat: probability of each generated col being categorical (default: 0.1) +# min_domain, max_domain: provide a range for domain sizes of the generated categorical cols (defaults: 1, 10) +# prob_missing: probability of each generated (scale) col to have missing values (default: 0.1) +# prob_missing_cell: probability of a cell to be flagged in the 0/1 missing-value indicator column that is appended to X for each selected col (default: 0.2) +# out_X, out_missing, out_categorical: output file names (each written as csv) +# + +#params for size of data +num_rows = ifdef($rows, 1000) +num_cols = ifdef($cols, 25) + +#params for kind of cols +prob_categorical = ifdef($prob_cat, 0.1) +min_domain_size = ifdef($min_domain, 1) +max_domain_size = ifdef($max_domain, 10) + +#params for missing value cols +prob_missing_col = ifdef($prob_missing, 0.1) +prob_missing_val = ifdef($prob_missing_cell, 0.2) + +num_scalar_cols = as.double(num_cols) +num_categorical_cols = 0.0 +scalar_ind = matrix(1, rows=num_scalar_cols, cols=1) +if(prob_categorical > 0){ + categorical_ind = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform") + categorical_ind = categorical_ind < prob_categorical + categorical_col_ids = removeEmpty(target=seq(1, num_cols, 1)*categorical_ind, margin="rows") + num_categorical_cols = sum(categorical_ind) + write(categorical_col_ids, $out_categorical, format="csv") + + domain_sizes = Rand(rows=num_categorical_cols, cols=1, min=0, max=1, pdf="uniform") + domain_sizes = round(min_domain_size + (max_domain_size - min_domain_size)*domain_sizes) + + categorical_X = Rand(rows=num_rows, cols=num_categorical_cols, min=0, max=1, pdf="uniform") + categorical_X = t(round(1 + t(categorical_X)*(domain_sizes - 1))) + + scalar_ind = 1-categorical_ind +} + +scalar_col_ids = removeEmpty(target=seq(1, num_cols, 1)*scalar_ind, margin="rows") +num_scalar_cols = sum(scalar_ind) +scalar_X = Rand(rows=num_rows, cols=num_scalar_cols, min=0, max=1,
pdf="uniform") + +if(num_categorical_cols > 0 & num_scalar_cols > 0){ + X = cbind(scalar_X, categorical_X) + permut_mat = table(seq(1, num_scalar_cols, 1), scalar_col_ids, num_scalar_cols, num_cols) # row i routes scalar column i to column scalar_col_ids[i] + fill_in = table(seq(1, num_categorical_cols, 1), categorical_col_ids, num_categorical_cols, num_cols) # row j routes categorical column j to column categorical_col_ids[j] (an all-zero block here would wipe the categorical values) + permut_mat = t(cbind(t(permut_mat), t(fill_in))) + X = X %*% permut_mat +}else{ + if(num_categorical_cols > 0) X = categorical_X + else{ + if(num_scalar_cols > 0) X = scalar_X + else print("somehow, we've managed to compute that precisely 0 cols should be categorical and 0 cols should be scale") + } +} + +if(prob_missing_col > 0){ + missing_col_ind = Rand(rows=num_cols, cols=1, min=0, max=1, pdf="uniform") + missing_col_ind = missing_col_ind < prob_missing_col + #currently only support missing value imputation for scale cols + missing_col_ind = missing_col_ind * scalar_ind + missing_col_ids = removeEmpty(target=seq(1, num_cols, 1)*missing_col_ind, margin="rows") + missing_values = Rand(rows=num_rows, cols=nrow(missing_col_ids), min=0, max=1, pdf="uniform") + missing_values = missing_values < prob_missing_val + X = cbind(X, missing_values) + + write(missing_col_ids, $out_missing, format="csv") +} + +write(X, $out_X, format="csv") diff --git a/scripts/perftest/datagen/genRandData4Univariate.dml b/scripts/perftest/datagen/genRandData4Univariate.dml new file mode 100644 index 0000000000..bcbd528eb9 --- /dev/null +++ b/scripts/perftest/datagen/genRandData4Univariate.dml @@ -0,0 +1,61 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# generates random numbers from a distribution +# with specified mean, standard deviation, +# skewness, kurtosis +# mean and standard deviation are taken in as +# arguments by this script +# a,b,c,d are coefficients computed by some +# equation solver determined from the specified +# skewness and kurtosis using power method +# polynomials (applied below as a + b*X + c*X^2 + d*X^3) +# +# for more details see: +# Statistical Simulation: Power Method Polynomials +# and Other Transformations +# Author: Todd C. Headrick +# Chapman & Hall/CRC, Boca Raton, FL, 2010.
+# ISBN 978-1-4200-6490-2 + +# $1 is the number of random points to be sampled (rows of the one-column output) +# $2 is specified mean +# $3 is specified standard deviation +# $4-$7 are a,b,c,d obtained by solving a system +# of equations using specified kurtosis and skewness +# $8 is the file to write out the generated data to (written in binary format) + +numSamples = $1 +mu = $2 +sigma = $3 +a = $4 +b = $5 +c = $6 +d = $7 + + +print("a=" + a + " b=" + b + " c=" + c + " d=" + d) + +X = Rand(rows=numSamples, cols=1, pdf="normal", seed=0) # seed is fixed to 0 +Y = a + b*X + c*X^2 + d*X^3 # cubic polynomial in X with coefficients a,b,c,d + +Z = Y*sigma + mu # scale by sigma, then shift by mu +write(Z, $8, format="binary") diff --git a/scripts/perftest/datagen/genStratStatisticsData.sh b/scripts/perftest/datagen/genStratStatisticsData.sh new file mode 100644 index 0000000000..330247cce0 --- /dev/null +++ b/scripts/perftest/datagen/genStratStatisticsData.sh @@ -0,0 +1,61 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- +if [ "$(basename $PWD)" != "perftest" ]; +then + echo "Please execute scripts from directory 'perftest'" + exit 1; +fi + +CMD=$1 +BASE=$2/stratstats +MAXMEM=$3 + +FORMAT="binary" + +echo "-- Generating stats data..."
>> results/times.txt; + +#XS data 10K rows +if [ $MAXMEM -ge 80 ]; then + ${CMD} -f datagen/genRandData4StratStats.dml --explain --stats --nvargs nr=10000 nf=100 D=${BASE}/A_10k/data Xcid=${BASE}/A_10k/Xcid Ycid=${BASE}/A_10k/Ycid A=${BASE}/A_10k/A fmt=$FORMAT & +fi + +#S data 100K rows +if [ $MAXMEM -ge 800 ]; then + ${CMD} -f datagen/genRandData4StratStats.dml --explain --stats --nvargs nr=100000 nf=100 D=${BASE}/A_100k/data Xcid=${BASE}/A_100k/Xcid Ycid=${BASE}/A_100k/Ycid A=${BASE}/A_100k/A fmt=$FORMAT & +fi + +#M data 1M rows +if [ $MAXMEM -ge 8000 ]; then + ${CMD} -f datagen/genRandData4StratStats.dml --explain --stats --nvargs nr=1000000 nf=100 D=${BASE}/A_1M/data Xcid=${BASE}/A_1M/Xcid Ycid=${BASE}/A_1M/Ycid A=${BASE}/A_1M/A fmt=$FORMAT & +fi + +#L data 10M rows +if [ $MAXMEM -ge 80000 ]; then + ${CMD} -f datagen/genRandData4StratStats.dml --explain --stats --nvargs nr=10000000 nf=100 D=${BASE}/A_10M/data Xcid=${BASE}/A_10M/Xcid Ycid=${BASE}/A_10M/Ycid A=${BASE}/A_10M/A fmt=$FORMAT +fi + +#XL data 100M rows +if [ $MAXMEM -ge 800000 ]; then + ${CMD} -f datagen/genRandData4StratStats.dml --explain --stats --nvargs nr=100000000 nf=100 D=${BASE}/A_10M/data Xcid=${BASE}/A_10M/Xcid Ycid=${BASE}/A_10M/Ycid A=${BASE}/A_10M/A fmt=$FORMAT +fi + +wait \ No newline at end of file diff --git a/scripts/perftest/sparkDML2.sh b/scripts/perftest/sparkDML2.sh index dde9805719..6102fb3d8a 100644 --- a/scripts/perftest/sparkDML2.sh +++ b/scripts/perftest/sparkDML2.sh @@ -1,3 +1,25 @@ +#!/bin/bash +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + #Client mode spark-submit script export SPARK_HOME=/home/hadoop/spark-3.3.1-bin-hadoop3 export HADOOP_CONF_DIR=/home/hadoop/hadoop-3.3.1/etc/hadoop @@ -13,4 +35,4 @@ $SPARK_HOME/bin/spark-submit \ --conf spark.network.timeout=512s \ --executor-memory 200g \ --executor-cores 48 \ - SystemDS.jar "$@" \ No newline at end of file + SystemDS.jar "$@"