Repository: incubator-systemml
Updated Branches:
  refs/heads/master 588bafac3 -> 77363c0c6
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/77363c0c/docs/spark-mlcontext-programming-guide.md ---------------------------------------------------------------------- diff --git a/docs/spark-mlcontext-programming-guide.md b/docs/spark-mlcontext-programming-guide.md index 2eaf1be..6c2d2af 100644 --- a/docs/spark-mlcontext-programming-guide.md +++ b/docs/spark-mlcontext-programming-guide.md @@ -108,7 +108,7 @@ ml.execute(helloScript) <div data-lang="Spark Shell" markdown="1"> {% highlight scala %} scala> val helloScript = dml("print('hello world')") -helloScript: org.apache.sysml.api.mlcontext.Script = +helloScript: org.apache.sysml.api.mlcontext.Script = Inputs: None @@ -117,7 +117,7 @@ None scala> ml.execute(helloScript) hello world -res0: org.apache.sysml.api.mlcontext.MLResults = +res0: org.apache.sysml.api.mlcontext.MLResults = None {% endhighlight %} @@ -214,7 +214,7 @@ scala> val minMaxMean = | maxOut = max(Xin) | meanOut = mean(Xin) | """ -minMaxMean: String = +minMaxMean: String = " minOut = min(Xin) maxOut = max(Xin) @@ -307,7 +307,7 @@ scala> val sums = """ | message = "s1 and s2 are equal" | } | """ -sums: String = +sums: String = " s1 = sum(m1); s2 = sum(m2); @@ -323,7 +323,7 @@ if (s1 > s2) { scala> scala.tools.nsc.io.File("sums.dml").writeAll(sums) scala> val sumScript = dmlFromFile("sums.dml").in(Map("m1"-> rdd1, "m2"-> rdd2)).out("s1", "s2", "message") -sumScript: org.apache.sysml.api.mlcontext.Script = +sumScript: org.apache.sysml.api.mlcontext.Script = Inputs: [1] (RDD) m1: ParallelCollectionRDD[42] at parallelize at <console>:38 [2] (RDD) m2: ParallelCollectionRDD[43] at parallelize at <console>:38 @@ -334,7 +334,7 @@ Outputs: [3] message scala> val sumResults = ml.execute(sumScript) -sumResults: org.apache.sysml.api.mlcontext.MLResults = +sumResults: org.apache.sysml.api.mlcontext.MLResults = [1] (Double) s1: 10.0 [2] (Double) s2: 26.0 [3] (String) message: s2 is greater @@ -378,7 +378,7 @@ scala> val rdd2Metadata = new MatrixMetadata(2, 2) rdd2Metadata: org.apache.sysml.api.mlcontext.MatrixMetadata = rows: 2, columns: 2, non-zeros: None, rows per block: None, columns per block: None scala> val sumScript = dmlFromFile("sums.dml").in(Seq(("m1", rdd1, rdd1Metadata), ("m2", rdd2, rdd2Metadata))).out("s1", "s2", "message") -sumScript: org.apache.sysml.api.mlcontext.Script = +sumScript: org.apache.sysml.api.mlcontext.Script = Inputs: [1] (RDD) m1: ParallelCollectionRDD[42] at parallelize at <console>:38 [2] (RDD) m2: ParallelCollectionRDD[43] at parallelize at <console>:38 @@ -416,7 +416,7 @@ val (firstSum, secondSum, sumMessage) = ml.execute(sumScript).getTuple[Double, D <div data-lang="Spark Shell" markdown="1"> {% highlight scala %} scala> val sumScript = dmlFromFile("sums.dml").in("m1", rdd1, rdd1Metadata).in("m2", rdd2, rdd2Metadata).out("s1").out("s2").out("message") -sumScript: org.apache.sysml.api.mlcontext.Script = +sumScript: org.apache.sysml.api.mlcontext.Script = Inputs: [1] (RDD) m1: ParallelCollectionRDD[42] at parallelize at <console>:38 [2] (RDD) m2: ParallelCollectionRDD[43] at parallelize at <console>:38 @@ -445,7 +445,7 @@ Let's look at an example of reading a matrix out of SystemML. We'll create a DML in which we create a 2x2 matrix `m`. We'll set the variable `n` to be the sum of the cells in the matrix. We create a script object using String `s`, and we set `m` and `n` as the outputs. We execute the script, and in -the results we see we have Matrix `m` and Double `n`. The `n` output variable has a value of `110.0`. 
+the results we see we have Matrix `m` and Double `n`. The `n` output variable has a value of `110.0`. We get Matrix `m` and Double `n` as a Tuple of values `x` and `y`. We then convert Matrix `m` to an RDD of IJV values, an RDD of CSV values, a DataFrame, and a two-dimensional Double Array, and we display @@ -478,14 +478,14 @@ scala> val s = | m = matrix("11 22 33 44", rows=2, cols=2) | n = sum(m) | """ -s: String = +s: String = " m = matrix("11 22 33 44", rows=2, cols=2) n = sum(m) " scala> val scr = dml(s).out("m", "n"); -scr: org.apache.sysml.api.mlcontext.Script = +scr: org.apache.sysml.api.mlcontext.Script = Inputs: None @@ -495,7 +495,7 @@ Outputs: scala> val res = ml.execute(scr) -res: org.apache.sysml.api.mlcontext.MLResults = +res: org.apache.sysml.api.mlcontext.MLResults = [1] (Matrix) m: Matrix: scratch_space//_p12059_9.31.117.12//_t0/temp26_14, [2 x 2, nnz=4, blocks (1000 x 1000)], binaryblock, dirty [2] (Double) n: 110.0 @@ -588,7 +588,7 @@ scala> val scriptUrl = "https://raw.githubusercontent.com/apache/incubator-syste scriptUrl: String = https://raw.githubusercontent.com/apache/incubator-systemml/master/scripts/algorithms/Univar-Stats.dml scala> val uni = dmlFromUrl(scriptUrl).in("A", habermanRDD, habermanMetadata).in("K", typesRDD, typesMetadata).in("$CONSOLE_OUTPUT", true) -uni: org.apache.sysml.api.mlcontext.Script = +uni: org.apache.sysml.api.mlcontext.Script = Inputs: [1] (RDD) A: ParallelCollectionRDD[159] at parallelize at <console>:43 [2] (RDD) K: ParallelCollectionRDD[160] at parallelize at <console>:39 @@ -653,7 +653,7 @@ Feature [4]: Categorical (Nominal) (15) Num of categories | 2 (16) Mode | 1 (17) Num of modes | 1 -res23: org.apache.sysml.api.mlcontext.MLResults = +res23: org.apache.sysml.api.mlcontext.MLResults = None {% endhighlight %} @@ -723,7 +723,7 @@ baseStats.asRDDStringIJV.collect.slice(0,9).foreach(println) <div data-lang="Spark Shell" markdown="1"> {% highlight scala %} scala> val uni = dmlFromUrl(scriptUrl).in("A", habermanRDD, habermanMetadata).in("K", typesRDD, typesMetadata).out("baseStats") -uni: org.apache.sysml.api.mlcontext.Script = +uni: org.apache.sysml.api.mlcontext.Script = Inputs: [1] (RDD) A: ParallelCollectionRDD[159] at parallelize at <console>:43 [2] (RDD) K: ParallelCollectionRDD[160] at parallelize at <console>:39 @@ -783,7 +783,7 @@ scala> val minMaxMean = | maxOut = max(Xin) | meanOut = mean(Xin) | """ -minMaxMean: String = +minMaxMean: String = " minOut = min(Xin) maxOut = max(Xin) @@ -937,7 +937,7 @@ scala> val minMaxMean = | maxOut = max(Xin) | meanOut = mean(Xin) | """ -minMaxMean: String = +minMaxMean: String = " minOut = min(Xin) maxOut = max(Xin) @@ -1023,7 +1023,7 @@ scala> val minMaxMean = | maxOut = max(Xin) | meanOut = mean(Xin) | """ -minMaxMean: String = +minMaxMean: String = " minOut = min(Xin) maxOut = max(Xin) @@ -1186,7 +1186,7 @@ scala> class MyScriptExecutor extends org.apache.sysml.api.mlcontext.ScriptExecu defined class MyScriptExecutor scala> val helloScript = dml("print('hello world')") -helloScript: org.apache.sysml.api.mlcontext.Script = +helloScript: org.apache.sysml.api.mlcontext.Script = Inputs: None @@ -1197,7 +1197,7 @@ scala> ml.execute(helloScript, new MyScriptExecutor) Parsing script Validating script hello world -res63: org.apache.sysml.api.mlcontext.MLResults = +res63: org.apache.sysml.api.mlcontext.MLResults = None {% endhighlight %} @@ -1242,7 +1242,7 @@ scala> val rddCSV = sc.parallelize(Array("1.0,2.0", "3.0,4.0")) rddCSV: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[190] at 
parallelize at <console>:38 scala> val sumAndMean = dml("sum = sum(m); mean = mean(m)").in("m", rddCSV).out("sum", "mean") -sumAndMean: org.apache.sysml.api.mlcontext.Script = +sumAndMean: org.apache.sysml.api.mlcontext.Script = Inputs: [1] (RDD) m: ParallelCollectionRDD[190] at parallelize at <console>:38 @@ -1251,7 +1251,7 @@ Outputs: [2] mean scala> ml.execute(sumAndMean) -res20: org.apache.sysml.api.mlcontext.MLResults = +res20: org.apache.sysml.api.mlcontext.MLResults = [1] (Double) sum: 10.0 [2] (Double) mean: 2.5 @@ -1291,7 +1291,7 @@ scala> val mm3x3 = new MatrixMetadata(MatrixFormat.IJV, 3, 3) mm3x3: org.apache.sysml.api.mlcontext.MatrixMetadata = rows: 3, columns: 3, non-zeros: None, rows per block: None, columns per block: None scala> val sumAndMean = dml("sum = sum(m); mean = mean(m)").in("m", rddIJV, mm3x3).out("sum", "mean") -sumAndMean: org.apache.sysml.api.mlcontext.Script = +sumAndMean: org.apache.sysml.api.mlcontext.Script = Inputs: [1] (RDD) m: ParallelCollectionRDD[202] at parallelize at <console>:38 @@ -1300,7 +1300,7 @@ Outputs: [2] mean scala> ml.execute(sumAndMean) -res21: org.apache.sysml.api.mlcontext.MLResults = +res21: org.apache.sysml.api.mlcontext.MLResults = [1] (Double) sum: 10.0 [2] (Double) mean: 1.1111111111111112 @@ -1333,7 +1333,7 @@ scala> val mm4x4 = new MatrixMetadata(MatrixFormat.IJV, 4, 4) mm4x4: org.apache.sysml.api.mlcontext.MatrixMetadata = rows: 4, columns: 4, non-zeros: None, rows per block: None, columns per block: None scala> val sumAndMean = dml("sum = sum(m); mean = mean(m)").in("m", rddIJV, mm4x4).out("sum", "mean") -sumAndMean: org.apache.sysml.api.mlcontext.Script = +sumAndMean: org.apache.sysml.api.mlcontext.Script = Inputs: [1] (RDD) m: ParallelCollectionRDD[210] at parallelize at <console>:38 @@ -1342,7 +1342,7 @@ Outputs: [2] mean scala> ml.execute(sumAndMean) -res22: org.apache.sysml.api.mlcontext.MLResults = +res22: org.apache.sysml.api.mlcontext.MLResults = [1] (Double) sum: 10.0 [2] (Double) mean: 0.625 @@ -1445,7 +1445,7 @@ scala> val rddCSV = sc.parallelize(Array("1.0,2.0", "3.0,4.0")) rddCSV: org.apache.spark.rdd.RDD[String] = ParallelCollectionRDD[341] at parallelize at <console>:53 scala> val add = dml("y = x + 1").in("x", rddCSV).out("y") -add: org.apache.sysml.api.mlcontext.Script = +add: org.apache.sysml.api.mlcontext.Script = Inputs: [1] (RDD) x: ParallelCollectionRDD[341] at parallelize at <console>:53 @@ -2100,3 +2100,8 @@ plt.title('PNMF Training Loss') {% endhighlight %}  + +# Recommended Spark Configuration Settings + +For best performance, we recommend setting the following flags when running SystemML with Spark: +`--conf spark.driver.maxResultSize=0 --conf spark.akka.frameSize=128`. http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/77363c0c/docs/standalone-guide.md ---------------------------------------------------------------------- diff --git a/docs/standalone-guide.md b/docs/standalone-guide.md new file mode 100644 index 0000000..38b6497 --- /dev/null +++ b/docs/standalone-guide.md @@ -0,0 +1,582 @@ +--- +layout: global +title: SystemML Standalone Guide +description: SystemML Standalone Guide +displayTitle: SystemML Standalone Guide +--- +<!-- +{% comment %} +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. See the NOTICE file distributed with +this work for additional information regarding copyright ownership. 
+The ASF licenses this file to you under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+{% endcomment %}
+-->
+
+* This will become a table of contents (this text will be scraped).
+{:toc}
+
+<br/>
+
+This tutorial provides a quick introduction to using SystemML by
+running existing SystemML algorithms in standalone mode.
+
+
+# What is SystemML
+
+SystemML enables large-scale machine learning (ML) via high-level declarative
+languages with R-like syntax ([DML](dml-language-reference.html)) and
+Python-like syntax (PyDML). DML and PyDML allow data scientists to
+express their ML algorithms with full flexibility but without the need to fine-tune
+distributed runtime execution plans and system configurations.
+These ML programs are dynamically compiled and optimized based on data
+and cluster characteristics using rule-based and cost-based optimization techniques.
+The compiler automatically generates hybrid runtime execution plans ranging
+from in-memory, single-node execution to distributed computation on Hadoop
+or Spark.
+SystemML features a suite of algorithms for Descriptive Statistics, Classification,
+Clustering, Regression, Matrix Factorization, and Survival Analysis. Detailed descriptions of these
+algorithms can be found in the [Algorithms Reference](algorithms-reference.html).
+
+# Download SystemML
+
+Apache incubator releases of SystemML are available from the [downloads](http://systemml.apache.org/download.html) page.
+
+The SystemML project is available on GitHub at [https://github.com/apache/incubator-systemml](https://github.com/apache/incubator-systemml).
+SystemML can be downloaded from GitHub and built with Maven. Instructions to build and
+test SystemML can be found in the [SystemML GitHub README](https://github.com/apache/incubator-systemml).
+
+# Standalone vs Distributed Execution Mode
+
+SystemML's standalone mode is designed to allow data scientists to rapidly prototype algorithms
+on a single machine. In standalone mode, all operations occur on a single node in a non-Hadoop
+environment. Standalone mode is not appropriate for large datasets.
+
+For large-scale production environments, SystemML algorithm execution can be
+distributed across multi-node clusters using [Apache Hadoop](https://hadoop.apache.org/)
+or [Apache Spark](http://spark.apache.org/).
+We will make use of standalone mode throughout this tutorial.
+
+# Choosing Test Data
+
+In this tutorial we will use the [Haberman's Survival Data Set](http://archive.ics.uci.edu/ml/datasets/Haberman%27s+Survival),
+which can be downloaded in CSV format from the [Center for Machine Learning and Intelligent Systems](http://cml.ics.uci.edu/):
+
+    $ wget -P data/ http://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data
+
+The [Haberman Data Set](http://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.names)
+has 306 instances and 4 attributes (including the class attribute):
+
+ 1. Age of patient at time of operation (numerical)
+ 2. Patient's year of operation (year - 1900, numerical)
+ 3. Number of positive axillary nodes detected (numerical)
+ 4. Survival status (class attribute)
+    * `1` = the patient survived 5 years or longer
+    * `2` = the patient died within 5 years
+
+
+We will need to create a metadata file (MTD) that describes the content of the
+data file. The name of the MTD file associated with the
+data file `<filename>` must be `<filename>.mtd`.
+
+    $ echo '{"rows": 306, "cols": 4, "format": "csv"}' > data/haberman.data.mtd
+
+<br/>
+
+# Example 1 - Univariate Statistics
+
+Let's start with a simple example, computing certain [univariate statistics](algorithms-descriptive-statistics.html#univariate-statistics)
+for each feature column using the algorithm `Univar-Stats.dml`, which requires 3
+[arguments](algorithms-descriptive-statistics.html#arguments):
+
+* `X`: location of the input data file to analyze
+* `TYPES`: location of the file that contains the feature column types encoded by integer numbers: `1` = scale, `2` = nominal, `3` = ordinal
+* `STATS`: location where the output matrix of computed statistics is to be stored
+
+We need to create a file `types.csv` that describes the type of each column in
+the data, along with its metadata file `types.csv.mtd`.
+
+    $ echo '1,1,1,2' > data/types.csv
+    $ echo '{"rows": 1, "cols": 4, "format": "csv"}' > data/types.csv.mtd
+
+
+To run the `Univar-Stats.dml` algorithm, issue the following command (we set the optional argument `CONSOLE_OUTPUT` to `TRUE` to print the statistics to the console):
+
+    $ ./runStandaloneSystemML.sh scripts/algorithms/Univar-Stats.dml -nvargs X=data/haberman.data TYPES=data/types.csv STATS=data/univarOut.mtx CONSOLE_OUTPUT=TRUE
+
+    [...]
+    -------------------------------------------------
+    Feature [1]: Scale
+    (01) Minimum | 30.0
+    (02) Maximum | 83.0
+    (03) Range | 53.0
+    (04) Mean | 52.45751633986928
+    (05) Variance | 116.71458266366658
+    (06) Std deviation | 10.803452349303281
+    (07) Std err of mean | 0.6175922641866753
+    (08) Coeff of variation | 0.20594669940735139
+    (09) Skewness | 0.1450718616532357
+    (10) Kurtosis | -0.6150152487211726
+    (11) Std err of skewness | 0.13934809593495995
+    (12) Std err of kurtosis | 0.277810485320835
+    (13) Median | 52.0
+    (14) Interquartile mean | 52.16013071895425
+    -------------------------------------------------
+    Feature [2]: Scale
+    (01) Minimum | 58.0
+    (02) Maximum | 69.0
+    (03) Range | 11.0
+    (04) Mean | 62.85294117647059
+    (05) Variance | 10.558630665380907
+    (06) Std deviation | 3.2494046632238507
+    (07) Std err of mean | 0.18575610076612029
+    (08) Coeff of variation | 0.051698529971741194
+    (09) Skewness | 0.07798443581479181
+    (10) Kurtosis | -1.1324380182967442
+    (11) Std err of skewness | 0.13934809593495995
+    (12) Std err of kurtosis | 0.277810485320835
+    (13) Median | 63.0
+    (14) Interquartile mean | 62.80392156862745
+    -------------------------------------------------
+    Feature [3]: Scale
+    (01) Minimum | 0.0
+    (02) Maximum | 52.0
+    (03) Range | 52.0
+    (04) Mean | 4.026143790849673
+    (05) Variance | 51.691117539912135
+    (06) Std deviation | 7.189653506248555
+    (07) Std err of mean | 0.41100513466216837
+    (08) Coeff of variation | 1.7857418611299172
+    (09) Skewness | 2.954633471088322
+    (10) Kurtosis | 11.425776549251449
+    (11) Std err of skewness | 0.13934809593495995
+    (12) Std err of kurtosis | 0.277810485320835
+    (13) Median | 1.0
+    (14) Interquartile mean | 1.2483660130718954
+    -------------------------------------------------
+    Feature [4]: Categorical (Nominal)
+    (15) Num of categories | 2
+    (16) Mode | 1
+    (17) Num of modes | 1
+
+
+The `Univar-Stats.dml` script writes the computed statistics to the `univarOut.mtx`
+file. Logically, this output matrix has one row per univariate statistic and one
+column per input feature; it is written in sparse text IJV format, where each line
+holds a (row, column, value) triple. The first column gives the number of the
+statistic (matching the numbering in the console output above), the second column gives the number of the feature column in
+the input data, and the third column gives the value of the univariate statistic.
+(Cells with value zero, such as the minimum of feature 3 or the scale statistics
+of the categorical feature 4, are not stored in the sparse IJV representation.)
+
+    1 1 30.0
+    1 2 58.0
+    2 1 83.0
+    2 2 69.0
+    2 3 52.0
+    3 1 53.0
+    3 2 11.0
+    3 3 52.0
+    4 1 52.45751633986928
+    4 2 62.85294117647059
+    4 3 4.026143790849673
+    5 1 116.71458266366658
+    5 2 10.558630665380907
+    5 3 51.691117539912135
+    6 1 10.803452349303281
+    6 2 3.2494046632238507
+    6 3 7.189653506248555
+    7 1 0.6175922641866753
+    7 2 0.18575610076612029
+    7 3 0.41100513466216837
+    8 1 0.20594669940735139
+    8 2 0.051698529971741194
+    8 3 1.7857418611299172
+    9 1 0.1450718616532357
+    9 2 0.07798443581479181
+    9 3 2.954633471088322
+    10 1 -0.6150152487211726
+    10 2 -1.1324380182967442
+    10 3 11.425776549251449
+    11 1 0.13934809593495995
+    11 2 0.13934809593495995
+    11 3 0.13934809593495995
+    12 1 0.277810485320835
+    12 2 0.277810485320835
+    12 3 0.277810485320835
+    13 1 52.0
+    13 2 63.0
+    13 3 1.0
+    14 1 52.16013071895425
+    14 2 62.80392156862745
+    14 3 1.2483660130718954
+    15 4 2.0
+    16 4 1.0
+    17 4 1.0
+
+
+<br/>
+<br/>
+
+# Example 2 - Binary-class Support Vector Machines
+
+Let's take the same `haberman.data` to explore the
+[binary-class support vector machines](algorithms-classification.html#binary-class-support-vector-machines) algorithm `l2-svm.dml`.
+This example also illustrates the use of the sampling algorithm `sample.dml`
+and the data split algorithm `splitXY.dml`.
+
+## Sampling the Test Data
+
+First we need to use the `sample.dml` algorithm to separate the input into one
+training data set and one data set for model prediction.
+
+Parameters:
+
+ * `X` : (input) input data set: filename of input data set
+ * `sv` : (input) sampling vector: filename of 1-column vector w/ percentages. sum(sv) must be 1.
+ * `O` : (output) folder name w/ samples generated
+ * `ofmt` : (output) format of O: "csv", "binary" (default)
+
+
+We will create the files `perc.csv` and `perc.csv.mtd` to define a sampling vector
+with a sampling rate of 50%, generating 2 data sets:
+
+    $ printf "0.5\n0.5" > data/perc.csv
+    $ echo '{"rows": 2, "cols": 1, "format": "csv"}' > data/perc.csv.mtd
+
+Let's run the sampling algorithm to create the two data samples:
+
+    $ ./runStandaloneSystemML.sh scripts/utils/sample.dml -nvargs X=data/haberman.data sv=data/perc.csv O=data/haberman.part ofmt="csv"
+
+
+## Splitting Labels from Features
+
+Next we use the `splitXY.dml` algorithm to separate the feature columns from
+the label column(s).
+
+Parameters:
+
+ * `X` : (input) filename of data matrix
+ * `y` : (input) colIndex: starting index is 1
+ * `OX` : (output) filename of output matrix with all columns except y
+ * `OY` : (output) filename of output matrix with y column
+ * `ofmt` : (output) format of OX and OY output matrix: "csv", "binary" (default)
+
+We specify `y=4` since the 4th column contains the labels to be predicted, and run
+the `splitXY.dml` algorithm on our training and test data sets, as shown after the
+sketch below.
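+
+Conceptually, the core of these two utilities amounts to only a few lines of DML.
+The following is a simplified, hypothetical sketch (assuming a 50/50 split and the
+label in the last column), not the actual `sample.dml`/`splitXY.dml` sources, which
+also handle arbitrary split percentages, label positions, and output formats:
+
+    # sketch.dml (hypothetical, simplified): random 50/50 row split of X,
+    # then separation of the last column (labels) from the feature columns
+    X = read($X)                                  # e.g., X=data/haberman.data
+    r = rand(rows=nrow(X), cols=1, min=0, max=1)  # one uniform draw per row
+    s = ppred(r, 0.5, "<=")                       # 0/1 row-selection vector
+    train = removeEmpty(target=X, margin="rows", select=s)
+    test = removeEmpty(target=X, margin="rows", select=(1 - s))
+    n = ncol(train)
+    trainX = train[, 1:(n - 1)]                   # feature columns 1..n-1
+    trainY = train[, n]                           # label column n
+    write(trainX, $OX, format="csv")
+    write(trainY, $OY, format="csv")
+
+We now run the actual `splitXY.dml` script on the two partitions generated by
+`sample.dml` above: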
+
+    $ ./runStandaloneSystemML.sh scripts/utils/splitXY.dml -nvargs X=data/haberman.part/1 y=4 OX=data/haberman.train.data.csv OY=data/haberman.train.labels.csv ofmt="csv"
+
+    $ ./runStandaloneSystemML.sh scripts/utils/splitXY.dml -nvargs X=data/haberman.part/2 y=4 OX=data/haberman.test.data.csv OY=data/haberman.test.labels.csv ofmt="csv"
+
+## Training and Testing the Model
+
+Now we need to train our model using the `l2-svm.dml` algorithm.
+
+[Parameters](algorithms-classification.html#arguments-1):
+
+ * `X` : (input) filename of training data features
+ * `Y` : (input) filename of training data labels
+ * `model` : (output) filename of model that contains the learnt weights
+ * `fmt` : (output) format of model: "csv", "text" (sparse-matrix)
+ * `Log` : (output) log file for metrics and progress while training
+ * `confusion` : (output) filename of confusion matrix computed using a held-out test set (optional)
+
+The `l2-svm.dml` algorithm is used on our training data sample to train the model.
+
+    $ ./runStandaloneSystemML.sh scripts/algorithms/l2-svm.dml -nvargs X=data/haberman.train.data.csv Y=data/haberman.train.labels.csv model=data/l2-svm-model.csv fmt="csv" Log=data/l2-svm-log.csv
+
+The `l2-svm-predict.dml` algorithm is used on our test data sample to predict the labels based on the trained model.
+
+    $ ./runStandaloneSystemML.sh scripts/algorithms/l2-svm-predict.dml -nvargs X=data/haberman.test.data.csv Y=data/haberman.test.labels.csv model=data/l2-svm-model.csv fmt="csv" confusion=data/l2-svm-confusion.csv
+
+The console output should show the accuracy of the trained model in percent, e.g.:
+
+    15/09/01 01:32:51 INFO api.DMLScript: BEGIN DML run 09/01/2015 01:32:51
+    15/09/01 01:32:51 INFO conf.DMLConfig: Updating localtmpdir with value /tmp/systemml
+    15/09/01 01:32:51 INFO conf.DMLConfig: Updating scratch with value scratch_space
+    15/09/01 01:32:51 INFO conf.DMLConfig: Updating optlevel with value 2
+    15/09/01 01:32:51 INFO conf.DMLConfig: Updating numreducers with value 10
+    15/09/01 01:32:51 INFO conf.DMLConfig: Updating jvmreuse with value false
+    15/09/01 01:32:51 INFO conf.DMLConfig: Updating defaultblocksize with value 1000
+    15/09/01 01:32:51 INFO conf.DMLConfig: Updating dml.yarn.appmaster with value false
+    15/09/01 01:32:51 INFO conf.DMLConfig: Updating dml.yarn.appmaster.mem with value 2048
+    15/09/01 01:32:51 INFO conf.DMLConfig: Updating dml.yarn.mapreduce.mem with value 2048
+    15/09/01 01:32:51 INFO conf.DMLConfig: Updating dml.yarn.app.queue with value default
+    15/09/01 01:32:51 INFO conf.DMLConfig: Updating cp.parallel.matrixmult with value true
+    15/09/01 01:32:51 INFO conf.DMLConfig: Updating cp.parallel.textio with value true
+    Accuracy (%): 74.14965986394557
+    15/09/01 01:32:52 INFO api.DMLScript: SystemML Statistics:
+    Total execution time: 0.130 sec.
+    Number of executed MR Jobs: 0.
+
+The generated file `l2-svm-confusion.csv` should contain a confusion matrix of the following form:
+
+    |0   1.0 2.0|
+    |1.0 t1  t2 |
+    |2.0 t3  t4 |
+
+ * The model correctly predicted label 1 `t1` times.
+ * The model incorrectly predicted label 1 instead of the correct label 2 `t2` times.
+ * The model incorrectly predicted label 2 instead of the correct label 1 `t3` times.
+ * The model correctly predicted label 2 `t4` times.
+
+If the confusion matrix looks like this ...
+
+    0,1.0,2.0
+    1.0,107.0,38.0
+    2.0,0.0,2.0
+
+... then the accuracy of the model is
+(t1+t4)/(t1+t2+t3+t4) = (107+2)/(107+38+0+2) = 0.741496599, matching the accuracy
+reported on the console.
+
+<br/>
+
+Refer to the [Algorithms Reference](algorithms-reference.html) for more details.
+
+<br/>
+
+# Example 3 - Linear Regression
+
+For this example, we'll use the standalone wrapper executable `bin/systemml`, which
+can be run directly within the project's source directory after building locally.
+
+After you build SystemML from source (`mvn clean package`), the standalone mode can be executed
+either on Linux or OS X using the `./bin/systemml` script, or on Windows using the
+`.\bin\systemml.bat` batch file.
+
+If you run the script from the project root folder `./` or from the `./bin` folder, the
+output files from running SystemML will be created inside the `./temp` folder to keep them separate
+from the SystemML source files managed by Git.
+
+The runtime and logging behavior of SystemML can be customized by editing the files
+`./conf/SystemML-config.xml` and `./conf/log4j.properties`. Both files will be created from their
+corresponding `*.template` files during the first execution of the SystemML executable script.
+
+When invoking `./bin/systemml` or `.\bin\systemml.bat` with any of the prepackaged DML scripts,
+you can omit the relative path to the DML script file. The following two commands are equivalent:
+
+    ./bin/systemml ./scripts/datagen/genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5
+
+    ./bin/systemml genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5
+
+In this guide we invoke the command with the relative path to make it easier to look up the source
+of the DML scripts.
+
+## Linear Regression Example
+
+As an example of the capabilities and power of SystemML and DML, let's consider the Linear Regression algorithm.
+We require sets of data to train and test our model. To obtain this data, we can either use real data or
+generate data for our algorithm. The
+[UCI Machine Learning Repository Datasets](https://archive.ics.uci.edu/ml/datasets.html) is one location for real data.
+Use of real data typically involves some degree of data wrangling. In the following example, we will use SystemML to
+generate random data to train and test our model.
+
+This example consists of the following parts:
+
+ * [Run DML Script to Generate Random Data](#run-dml-script-to-generate-random-data)
+ * [Divide Generated Data into Two Sample Groups](#divide-generated-data-into-two-sample-groups)
+ * [Split Label Column from First Sample](#split-label-column-from-first-sample)
+ * [Split Label Column from Second Sample](#split-label-column-from-second-sample)
+ * [Train Model on First Sample](#train-model-on-first-sample)
+ * [Test Model on Second Sample](#test-model-on-second-sample)
+
+SystemML is distributed in several packages, including a standalone package. We'll operate in Standalone mode in this
+example.
+
+<a name="run-dml-script-to-generate-random-data" />
+
+### Run DML Script to Generate Random Data
+
+We can execute the `genLinearRegressionData.dml` script in Standalone mode using either the `systemml` or `systemml.bat`
+file.
+In this example, we'll generate a matrix of 1000 rows and 50 columns of test data, with sparsity 0.7.
In addition to +this, a 51<sup>st</sup> column consisting of labels will +be appended to the matrix. + + ./bin/systemml ./scripts/datagen/genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5 + +This generates the following files inside the `./temp` folder: + + linRegData.csv # 1000 rows of 51 columns of doubles (50 data columns and 1 label column), csv format + linRegData.csv.mtd # Metadata file + perc.csv # Used to generate two subsets of the data (for training and testing) + perc.csv.mtd # Metadata file + scratch_space # SystemML scratch_space directory + +<a name="divide-generated-data-into-two-sample-groups" /> + +### Divide Generated Data into Two Sample Groups + +Next, we'll create two subsets of the generated data, each of size ~50%. We can accomplish this using the `sample.dml` +script with the `perc.csv` file created in the previous step: + + 0.5 + 0.5 + + +The `sample.dml` script will randomly sample rows from the `linRegData.csv` file and place them into 2 files based +on the percentages specified in `perc.csv`. This will create two sample groups of roughly 50 percent each. + + ./bin/systemml ./scripts/utils/sample.dml -nvargs X=linRegData.csv sv=perc.csv O=linRegDataParts ofmt=csv + + +This script creates two partitions of the original data and places them in a `linRegDataParts` folder. The files created +are as follows: + + linRegDataParts/1 # first partition of data, ~50% of rows of linRegData.csv, csv format + linRegDataParts/1.mtd # metadata + linRegDataParts/2 # second partition of data, ~50% of rows of linRegData.csv, csv format + linRegDataParts/2.mtd # metadata + + +The `1` file contains the first partition of data, and the `2` file contains the second partition of data. +An associated metadata file describes +the nature of each partition of data. If we open `1` and `2` and look at the number of rows, we can see that typically +the partitions are not exactly 50% but instead are close to 50%. However, we find that the total number of rows in the +original data file equals the sum of the number of rows in `1` and `2`. + + +<a name="split-label-column-from-first-sample" /> + +### Split Label Column from First Sample + +The next task is to split the label column from the first sample. We can do this using the `splitXY.dml` script. + + ./bin/systemml ./scripts/utils/splitXY.dml -nvargs X=linRegDataParts/1 y=51 OX=linRegData.train.data.csv OY=linRegData.train.labels.csv ofmt=csv + +This splits column 51, the label column, off from the data. When done, the following files have been created. + + linRegData.train.data.csv # training data of 50 columns, csv format + linRegData.train.data.csv.mtd # metadata + linRegData.train.labels.csv # training labels of 1 column, csv format + linRegData.train.labels.csv.mtd # metadata + + +<a name="split-label-column-from-second-sample" /> + +### Split Label Column from Second Sample + +We also need to split the label column from the second sample. 
+ + ./bin/systemml ./scripts/utils/splitXY.dml -nvargs X=linRegDataParts/2 y=51 OX=linRegData.test.data.csv OY=linRegData.test.labels.csv ofmt=csv + +This splits column 51 off the data, resulting in the following files: + + linRegData.test.data.csv # test data of 50 columns, csv format + linRegData.test.data.csv.mtd # metadata + linRegData.test.labels.csv # test labels of 1 column, csv format + linRegData.test.labels.csv.mtd # metadata + + +<a name="train-model-on-first-sample" /> + +### Train Model on First Sample + +Now, we can train our model based on the first sample. To do this, we utilize the `LinearRegDS.dml` (Linear Regression +Direct Solve) script. Note that SystemML also includes a `LinearRegCG.dml` (Linear Regression Conjugate Gradient) +algorithm for situations where the number of features is large. + + ./bin/systemml ./scripts/algorithms/LinearRegDS.dml -nvargs X=linRegData.train.data.csv Y=linRegData.train.labels.csv B=betas.csv fmt=csv + +This will generate the following files: + + betas.csv # betas, 50 rows of 1 column, csv format + betas.csv.mtd # metadata + +The LinearRegDS.dml script generates statistics to standard output similar to the following. + + BEGIN LINEAR REGRESSION SCRIPT + Reading X and Y... + Calling the Direct Solver... + Computing the statistics... + AVG_TOT_Y,-2.160284487670675 + STDEV_TOT_Y,66.86434576808432 + AVG_RES_Y,-3.3127468704080085E-10 + STDEV_RES_Y,1.7231785003947183E-8 + DISPERSION,2.963950542926297E-16 + PLAIN_R2,1.0 + ADJUSTED_R2,1.0 + PLAIN_R2_NOBIAS,1.0 + ADJUSTED_R2_NOBIAS,1.0 + PLAIN_R2_VS_0,1.0 + ADJUSTED_R2_VS_0,1.0 + Writing the output matrix... + END LINEAR REGRESSION SCRIPT + +Now that we have our `betas.csv`, we can test our model with our second set of data. + + +<a name="test-model-on-second-sample" /> + +### Test Model on Second Sample + +To test our model on the second sample, we can use the `GLM-predict.dml` script. This script can be used for both +prediction and scoring. Here, we're using it for scoring since we include the `Y` named argument. Our `betas.csv` +file is specified as the `B` named argument. + + ./bin/systemml ./scripts/algorithms/GLM-predict.dml -nvargs X=linRegData.test.data.csv Y=linRegData.test.labels.csv B=betas.csv fmt=csv + +This generates statistics similar to the following to standard output. + + LOGLHOOD_Z,,FALSE,NaN + LOGLHOOD_Z_PVAL,,FALSE,NaN + PEARSON_X2,,FALSE,1.895530994504798E-13 + PEARSON_X2_BY_DF,,FALSE,4.202951207327712E-16 + PEARSON_X2_PVAL,,FALSE,1.0 + DEVIANCE_G2,,FALSE,0.0 + DEVIANCE_G2_BY_DF,,FALSE,0.0 + DEVIANCE_G2_PVAL,,FALSE,1.0 + LOGLHOOD_Z,,TRUE,NaN + LOGLHOOD_Z_PVAL,,TRUE,NaN + PEARSON_X2,,TRUE,1.895530994504798E-13 + PEARSON_X2_BY_DF,,TRUE,4.202951207327712E-16 + PEARSON_X2_PVAL,,TRUE,1.0 + DEVIANCE_G2,,TRUE,0.0 + DEVIANCE_G2_BY_DF,,TRUE,0.0 + DEVIANCE_G2_PVAL,,TRUE,1.0 + AVG_TOT_Y,1,,1.0069397725436522 + STDEV_TOT_Y,1,,68.29092137526905 + AVG_RES_Y,1,,-4.1450397073455047E-10 + STDEV_RES_Y,1,,2.0519206226041048E-8 + PRED_STDEV_RES,1,TRUE,1.0 + PLAIN_R2,1,,1.0 + ADJUSTED_R2,1,,1.0 + PLAIN_R2_NOBIAS,1,,1.0 + ADJUSTED_R2_NOBIAS,1,,1.0 + + +We see that the STDEV_RES_Y value of the testing phase is of similar magnitude +to the value obtained from the model training phase. 
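+
+To make the two steps concrete, the essence of direct-solve training and of the
+scoring step can be written in a few lines of DML. This is a simplified,
+hypothetical sketch (ordinary least squares via the normal equations, with no
+intercept or regularization); the actual `LinearRegDS.dml` and `GLM-predict.dml`
+scripts handle those options and compute the full set of statistics shown above:
+
+    # ols.dml (hypothetical): direct-solve training, then simple scoring
+    X = read($X)                        # training features
+    y = read($Y)                        # training labels
+    b = solve(t(X) %*% X, t(X) %*% y)   # normal equations: (X'X) b = X'y
+    write(b, $B, format="csv")
+
+    Xt = read($Xt)                      # held-out test features
+    yt = read($Yt)                      # held-out test labels
+    e = yt - Xt %*% b                   # residuals on the test data
+    print("Residual std dev: " + sqrt(sum(e ^ 2) / (nrow(Xt) - 1)))
+
+The `solve()` built-in performs the direct solve that gives `LinearRegDS.dml` its
+name; for large numbers of features, an iterative method such as the conjugate
+gradient approach of `LinearRegCG.dml` avoids forming `t(X) %*% X` explicitly.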
+
+For convenience, we can encapsulate our DML invocations in a single script:
+
+    #!/bin/bash
+
+    ./bin/systemml ./scripts/datagen/genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5
+
+    ./bin/systemml ./scripts/utils/sample.dml -nvargs X=linRegData.csv sv=perc.csv O=linRegDataParts ofmt=csv
+
+    ./bin/systemml ./scripts/utils/splitXY.dml -nvargs X=linRegDataParts/1 y=51 OX=linRegData.train.data.csv OY=linRegData.train.labels.csv ofmt=csv
+
+    ./bin/systemml ./scripts/utils/splitXY.dml -nvargs X=linRegDataParts/2 y=51 OX=linRegData.test.data.csv OY=linRegData.test.labels.csv ofmt=csv
+
+    ./bin/systemml ./scripts/algorithms/LinearRegDS.dml -nvargs X=linRegData.train.data.csv Y=linRegData.train.labels.csv B=betas.csv fmt=csv
+
+    ./bin/systemml ./scripts/algorithms/GLM-predict.dml -nvargs X=linRegData.test.data.csv Y=linRegData.test.labels.csv B=betas.csv fmt=csv
+
+
+# Troubleshooting
+
+If you encounter a `java.lang.OutOfMemoryError`, you can edit the invocation
+script (`runStandaloneSystemML.sh` or `runStandaloneSystemML.bat`) to increase
+the memory available to the JVM, e.g.:
+
+    java -Xmx16g -Xms4g -Xmn1g -cp ${CLASSPATH} org.apache.sysml.api.DMLScript \
+    -f ${SCRIPT_FILE} -exec singlenode -config=SystemML-config.xml \
+    $@
