Added: systemml/site/docs/1.1.0/standalone-guide.html URL: http://svn.apache.org/viewvc/systemml/site/docs/1.1.0/standalone-guide.html?rev=1828046&view=auto ============================================================================== --- systemml/site/docs/1.1.0/standalone-guide.html (added) +++ systemml/site/docs/1.1.0/standalone-guide.html Fri Mar 30 04:31:05 2018 @@ -0,0 +1,892 @@ +<!DOCTYPE html> +<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]--> +<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]--> +<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]--> +<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]--> + <head> + <title>SystemML Standalone Guide - SystemML 1.1.0</title> + <meta charset="utf-8"> + <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> + + <meta name="description" content="SystemML Standalone Guide"> + + <meta name="viewport" content="width=device-width"> + <link rel="stylesheet" href="css/bootstrap.min.css"> + <link rel="stylesheet" href="css/main.css"> + <link rel="stylesheet" href="css/pygments-default.css"> + <link rel="shortcut icon" href="img/favicon.png"> + </head> + <body> + <!--[if lt IE 7]> + <p class="chromeframe">You are using an outdated browser. 
<a href="http://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p> + <![endif]--> + + <header class="navbar navbar-default navbar-fixed-top" id="topbar"> + <div class="container"> + <div class="navbar-header"> + <div class="navbar-brand brand projectlogo"> + <a href="http://systemml.apache.org/"><img class="logo" src="img/systemml-logo.png" alt="Apache SystemML" title="Apache SystemML"/></a> + </div> + <div class="navbar-brand brand projecttitle"> + <a href="http://systemml.apache.org/">Apache SystemML<sup id="trademark">™</sup></a><br/> + <span class="version">1.1.0</span> + </div> + <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target=".navbar-collapse"> + <span class="sr-only">Toggle navigation</span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + </button> + </div> + <nav class="navbar-collapse collapse"> + <ul class="nav navbar-nav navbar-right"> + <li><a href="index.html">Overview</a></li> + <li><a href="https://github.com/apache/systemml">GitHub</a></li> + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">Documentation<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><b>Running SystemML:</b></li> + <li><a href="https://github.com/apache/systemml">SystemML GitHub README</a></li> + <li><a href="spark-mlcontext-programming-guide.html">Spark MLContext</a></li> + <li><a href="spark-batch-mode.html">Spark Batch Mode</a> + <li><a href="hadoop-batch-mode.html">Hadoop Batch Mode</a> + <li><a href="standalone-guide.html">Standalone Guide</a></li> + <li><a href="jmlc.html">Java Machine Learning Connector (JMLC)</a> + <li class="divider"></li> + <li><b>Language Guides:</b></li> + <li><a href="dml-language-reference.html">DML Language Reference</a></li> + <li><a
href="beginners-guide-to-dml-and-pydml.html">Beginner's Guide to DML and PyDML</a></li> + <li><a href="beginners-guide-python.html">Beginner's Guide for Python Users</a></li> + <li><a href="python-reference.html">Reference Guide for Python Users</a></li> + <li class="divider"></li> + <li><b>ML Algorithms:</b></li> + <li><a href="algorithms-reference.html">Algorithms Reference</a></li> + <li class="divider"></li> + <li><b>Tools:</b></li> + <li><a href="debugger-guide.html">Debugger Guide</a></li> + <li><a href="developer-tools-systemml.html">IDE Guide</a></li> + <li class="divider"></li> + <li><b>Other:</b></li> + <li><a href="contributing-to-systemml.html">Contributing to SystemML</a></li> + <li><a href="engine-dev-guide.html">Engine Developer Guide</a></li> + <li><a href="troubleshooting-guide.html">Troubleshooting Guide</a></li> + <li><a href="release-process.html">Release Process</a></li> + </ul> + </li> + + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><a href="./api/java/index.html">Java</a></li> + <li><a href="./api/python/index.html">Python</a></li> + </ul> + </li> + + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">Issues<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><b>JIRA:</b></li> + <li><a href="https://issues.apache.org/jira/browse/SYSTEMML">SystemML JIRA</a></li> + + </ul> + </li> + </ul> + </nav> + </div> + </header> + + <div class="container" id="content"> + + <h1 class="title">SystemML Standalone Guide</h1> + + + <!-- + +--> + +<ul id="markdown-toc"> + <li><a href="#what-is-systemml" id="markdown-toc-what-is-systemml">What is SystemML</a></li> + <li><a href="#download-systemml" id="markdown-toc-download-systemml">Download SystemML</a></li> + <li><a href="#standalone-vs-distributed-execution-mode" id="markdown-toc-standalone-vs-distributed-execution-mode">Standalone vs 
Distributed Execution Mode</a></li> + <li><a href="#choosing-test-data" id="markdown-toc-choosing-test-data">Choosing Test Data</a></li> + <li><a href="#example-1---univariate-statistics" id="markdown-toc-example-1---univariate-statistics">Example 1 - Univariate Statistics</a></li> + <li><a href="#example-2---binary-class-support-vector-machines" id="markdown-toc-example-2---binary-class-support-vector-machines">Example 2 - Binary-class Support Vector Machines</a> <ul> + <li><a href="#sampling-the-test-data" id="markdown-toc-sampling-the-test-data">Sampling the Test Data</a></li> + <li><a href="#splitting-labels-from-features" id="markdown-toc-splitting-labels-from-features">Splitting Labels from Features</a></li> + <li><a href="#training-and-testing-the-model" id="markdown-toc-training-and-testing-the-model">Training and Testing the Model</a></li> + </ul> + </li> + <li><a href="#example-3---linear-regression" id="markdown-toc-example-3---linear-regression">Example 3 - Linear Regression</a> <ul> + <li><a href="#linear-regression-example" id="markdown-toc-linear-regression-example">Linear Regression Example</a> <ul> + <li><a href="#run-dml-script-to-generate-random-data" id="markdown-toc-run-dml-script-to-generate-random-data">Run DML Script to Generate Random Data</a></li> + <li><a href="#divide-generated-data-into-two-sample-groups" id="markdown-toc-divide-generated-data-into-two-sample-groups">Divide Generated Data into Two Sample Groups</a></li> + <li><a href="#split-label-column-from-first-sample" id="markdown-toc-split-label-column-from-first-sample">Split Label Column from First Sample</a></li> + <li><a href="#split-label-column-from-second-sample" id="markdown-toc-split-label-column-from-second-sample">Split Label Column from Second Sample</a></li> + <li><a href="#train-model-on-first-sample" id="markdown-toc-train-model-on-first-sample">Train Model on First Sample</a></li> + <li><a href="#test-model-on-second-sample" 
id="markdown-toc-test-model-on-second-sample">Test Model on Second Sample</a></li> + </ul> + </li> + </ul> + </li> + <li><a href="#troubleshooting" id="markdown-toc-troubleshooting">Troubleshooting</a></li> +</ul> + +<p><br /></p> + +<p>This tutorial provides a quick introduction to using SystemML by +running existing SystemML algorithms in standalone mode.</p> + +<h1 id="what-is-systemml">What is SystemML</h1> + +<p>SystemML enables large-scale machine learning (ML) via a high-level declarative +language with R-like syntax called <a href="dml-language-reference.html">DML</a> and +Python-like syntax called PyDML. DML and PyDML allow data scientists to +express their ML algorithms with full flexibility but without the need to fine-tune +distributed runtime execution plans and system configurations. +These ML programs are dynamically compiled and optimized based on data +and cluster characteristics using rule-based and cost-based optimization techniques. +The compiler automatically generates hybrid runtime execution plans ranging +from in-memory, single node execution to distributed computation for Hadoop +or Spark Batch execution. +SystemML features a suite of algorithms for Descriptive Statistics, Classification, +Clustering, Regression, Matrix Factorization, and Survival Analysis. Detailed descriptions of these +algorithms can be found in the <a href="algorithms-reference.html">Algorithms Reference</a>.</p> + +<h1 id="download-systemml">Download SystemML</h1> + +<p>Apache SystemML releases are available from the <a href="http://systemml.apache.org/download.html">Downloads</a> page.</p> + +<p>SystemML can also be downloaded from GitHub and built with Maven. +The SystemML project is available on GitHub at <a href="https://github.com/apache/systemml">https://github.com/apache/systemml</a>. 
+Instructions to build SystemML can be found in the <a href="engine-dev-guide.html">Engine Developer Guide</a>.</p> + +<h1 id="standalone-vs-distributed-execution-mode">Standalone vs Distributed Execution Mode</h1> + +<p>SystemML’s standalone mode is designed to allow data scientists to rapidly prototype algorithms +on a single machine. In standalone mode, all operations occur on a single node in a non-Hadoop +environment. Standalone mode is not appropriate for large datasets.</p> + +<p>For large-scale production environments, SystemML algorithm execution can be +distributed across multi-node clusters using <a href="https://hadoop.apache.org/">Apache Hadoop</a> +or <a href="http://spark.apache.org/">Apache Spark</a>. +We will make use of standalone mode throughout this tutorial.</p> + +<h1 id="choosing-test-data">Choosing Test Data</h1> + +<p>In this tutorial we will use the <a href="http://archive.ics.uci.edu/ml/datasets/Haberman%27s+Survival">Haberman’s Survival Data Set</a> +which can be downloaded in CSV format from the <a href="http://cml.ics.uci.edu/">Center for Machine Learning and Intelligent Systems</a>.</p> + +<pre><code>$ wget -P data/ http://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data +</code></pre> + +<p>The <a href="http://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.names">Haberman Data Set</a> +has 306 instances and 4 attributes (including the class attribute):</p> + +<ol> + <li>Age of patient at time of operation (numerical)</li> + <li>Patient’s year of operation (year - 1900, numerical)</li> + <li>Number of positive axillary nodes detected (numerical)</li> + <li>Survival status (class attribute) + * <code>1</code> = the patient survived 5 years or longer + * <code>2</code> = the patient died within 5 years</li> +</ol> + +<p>We will need to create a metadata file (MTD) which stores metadata information +about the content of the data file.
The name of the MTD file associated with the +data file <code><filename></code> must be <code><filename>.mtd</code>.</p> + +<pre><code>$ echo '{"rows": 306, "cols": 4, "format": "csv"}' > data/haberman.data.mtd +</code></pre> + +<hr /> + +<h1 id="example-1---univariate-statistics">Example 1 - Univariate Statistics</h1> + +<p>Let’s start with a simple example, computing certain <a href="algorithms-descriptive-statistics.html#univariate-statistics">univariate statistics</a> +for each feature column using the algorithm <code>Univar-Stats.dml</code> which requires 3 +<a href="algorithms-descriptive-statistics.html#arguments">arguments</a>:</p> + +<ul> + <li><code>X</code>: location of the input data file to analyze</li> + <li><code>TYPES</code>: location of the file that contains the feature column types encoded by integer numbers: <code>1</code> = scale, <code>2</code> = nominal, <code>3</code> = ordinal</li> + <li><code>STATS</code>: location where the output matrix of computed statistics is to be stored</li> +</ul> + +<p>We need to create a file <code>types.csv</code> that describes the type of each column in +the data along with its metadata file <code>types.csv.mtd</code>.</p> + +<pre><code>$ echo '1,1,1,2' > data/types.csv +$ echo '{"rows": 1, "cols": 4, "format": "csv"}' > data/types.csv.mtd +</code></pre> + +<p>To run the <code>Univar-Stats.dml</code> algorithm, issue the following command (we set the optional argument <code>CONSOLE_OUTPUT</code> to <code>TRUE</code> to print the statistics to the console):</p> + +<pre><code>$ ./runStandaloneSystemML.sh scripts/algorithms/Univar-Stats.dml -nvargs X=data/haberman.data TYPES=data/types.csv STATS=data/univarOut.mtx CONSOLE_OUTPUT=TRUE + +[...] 
+------------------------------------------------- +Feature [1]: Scale + (01) Minimum | 30.0 + (02) Maximum | 83.0 + (03) Range | 53.0 + (04) Mean | 52.45751633986928 + (05) Variance | 116.71458266366658 + (06) Std deviation | 10.803452349303281 + (07) Std err of mean | 0.6175922641866753 + (08) Coeff of variation | 0.20594669940735139 + (09) Skewness | 0.1450718616532357 + (10) Kurtosis | -0.6150152487211726 + (11) Std err of skewness | 0.13934809593495995 + (12) Std err of kurtosis | 0.277810485320835 + (13) Median | 52.0 + (14) Interquartile mean | 52.16013071895425 +------------------------------------------------- +Feature [2]: Scale + (01) Minimum | 58.0 + (02) Maximum | 69.0 + (03) Range | 11.0 + (04) Mean | 62.85294117647059 + (05) Variance | 10.558630665380907 + (06) Std deviation | 3.2494046632238507 + (07) Std err of mean | 0.18575610076612029 + (08) Coeff of variation | 0.051698529971741194 + (09) Skewness | 0.07798443581479181 + (10) Kurtosis | -1.1324380182967442 + (11) Std err of skewness | 0.13934809593495995 + (12) Std err of kurtosis | 0.277810485320835 + (13) Median | 63.0 + (14) Interquartile mean | 62.80392156862745 +------------------------------------------------- +Feature [3]: Scale + (01) Minimum | 0.0 + (02) Maximum | 52.0 + (03) Range | 52.0 + (04) Mean | 4.026143790849673 + (05) Variance | 51.691117539912135 + (06) Std deviation | 7.189653506248555 + (07) Std err of mean | 0.41100513466216837 + (08) Coeff of variation | 1.7857418611299172 + (09) Skewness | 2.954633471088322 + (10) Kurtosis | 11.425776549251449 + (11) Std err of skewness | 0.13934809593495995 + (12) Std err of kurtosis | 0.277810485320835 + (13) Median | 1.0 + (14) Interquartile mean | 1.2483660130718954 +------------------------------------------------- +Feature [4]: Categorical (Nominal) + (15) Num of categories | 2 + (16) Mode | 1 + (17) Num of modes | 1 +</code></pre> + +<p>In addition to writing statistics to the console, the <code>Univar-Stats.dml</code> script 
writes the computed statistics +to the <code>data/univarOut.mtx</code> file specified by the STATS input parameter.</p> + +<p><strong>univarOut.mtx file</strong></p> + +<pre><code>1 1 30.0 +1 2 58.0 +2 1 83.0 +2 2 69.0 +2 3 52.0 +3 1 53.0 +3 2 11.0 +3 3 52.0 +4 1 52.45751633986928 +4 2 62.85294117647059 +4 3 4.026143790849673 +5 1 116.71458266366658 +5 2 10.558630665380907 +5 3 51.691117539912135 +6 1 10.803452349303281 +6 2 3.2494046632238507 +6 3 7.189653506248555 +7 1 0.6175922641866753 +7 2 0.18575610076612029 +7 3 0.41100513466216837 +8 1 0.20594669940735139 +8 2 0.051698529971741194 +8 3 1.7857418611299172 +9 1 0.1450718616532357 +9 2 0.07798443581479181 +9 3 2.954633471088322 +10 1 -0.6150152487211726 +10 2 -1.1324380182967442 +10 3 11.425776549251449 +11 1 0.13934809593495995 +11 2 0.13934809593495995 +11 3 0.13934809593495995 +12 1 0.277810485320835 +12 2 0.277810485320835 +12 3 0.277810485320835 +13 1 52.0 +13 2 63.0 +13 3 1.0 +14 1 52.16013071895425 +14 2 62.80392156862745 +14 3 1.2483660130718954 +15 4 2.0 +16 4 1.0 +17 4 1.0 +</code></pre> + +<p>The following table lists the number and name of each univariate statistic. The row +numbers below correspond to the elements of the first column in the output +matrix above. 
The signs “+” show applicability to scale or/and to categorical +features.</p> + +<table> + <thead> + <tr> + <th style="text-align: center">Row</th> + <th style="text-align: left">Name of Statistic</th> + <th style="text-align: center">Scale</th> + <th style="text-align: center">Categ.</th> + </tr> + </thead> + <tbody> + <tr> + <td style="text-align: center">1</td> + <td style="text-align: left">Minimum</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td style="text-align: center">2</td> + <td style="text-align: left">Maximum</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td style="text-align: center">3</td> + <td style="text-align: left">Range</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td style="text-align: center">4</td> + <td style="text-align: left">Mean</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td style="text-align: center">5</td> + <td style="text-align: left">Variance</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td style="text-align: center">6</td> + <td style="text-align: left">Standard deviation</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td style="text-align: center">7</td> + <td style="text-align: left">Standard error of mean</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td style="text-align: center">8</td> + <td style="text-align: left">Coefficient of variation</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td style="text-align: center">9</td> + <td style="text-align: left">Skewness</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td 
style="text-align: center">10</td> + <td style="text-align: left">Kurtosis</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td style="text-align: center">11</td> + <td style="text-align: left">Standard error of skewness</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td style="text-align: center">12</td> + <td style="text-align: left">Standard error of kurtosis</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td style="text-align: center">13</td> + <td style="text-align: left">Median</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td style="text-align: center">14</td> + <td style="text-align: left">Inter quartile mean</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td style="text-align: center">15</td> + <td style="text-align: left">Number of categories</td> + <td style="text-align: center"> </td> + <td style="text-align: center">+</td> + </tr> + <tr> + <td style="text-align: center">16</td> + <td style="text-align: left">Mode</td> + <td style="text-align: center"> </td> + <td style="text-align: center">+</td> + </tr> + <tr> + <td style="text-align: center">17</td> + <td style="text-align: left">Number of modes</td> + <td style="text-align: center"> </td> + <td style="text-align: center">+</td> + </tr> + </tbody> +</table> + +<hr /> + +<h1 id="example-2---binary-class-support-vector-machines">Example 2 - Binary-class Support Vector Machines</h1> + +<p>Let’s take the same <code>haberman.data</code> to explore the +<a href="algorithms-classification.html#binary-class-support-vector-machines">binary-class support vector machines</a> algorithm <code>l2-svm.dml</code>. 
+This example also illustrates how to use the sampling algorithm <code>sample.dml</code> +and the data split algorithm <code>splitXY.dml</code>.</p> + +<h2 id="sampling-the-test-data">Sampling the Test Data</h2> + +<p>First we need to use the <code>sample.dml</code> algorithm to separate the input into one +training data set and one data set for model prediction.</p> + +<p>Parameters:</p> + +<ul> + <li><code>X</code> : (input) input data set: filename of input data set</li> + <li><code>sv</code> : (input) sampling vector: filename of 1-column vector w/ percentages. sum(sv) must be 1.</li> + <li><code>O</code> : (output) folder name w/ samples generated</li> + <li><code>ofmt</code> : (output) format of O: “csv”, “binary” (default)</li> +</ul> + +<p>We will create the file <code>perc.csv</code> and <code>perc.csv.mtd</code> to define the sampling vector with a sampling rate of +50% to generate 2 data sets:</p> + +<pre><code>$ printf "0.5\n0.5" > data/perc.csv +$ echo '{"rows": 2, "cols": 1, "format": "csv"}' > data/perc.csv.mtd +</code></pre> + +<p>Let’s run the sampling algorithm to create the two data samples:</p> + +<pre><code>$ ./runStandaloneSystemML.sh scripts/utils/sample.dml -nvargs X=data/haberman.data sv=data/perc.csv O=data/haberman.part ofmt="csv" +</code></pre> + +<h2 id="splitting-labels-from-features">Splitting Labels from Features</h2> + +<p>Next we use the <code>splitXY.dml</code> algorithm to separate the feature columns from +the label column(s).</p> + +<p>Parameters:</p> + +<ul> + <li><code>X</code> : (input) filename of data matrix</li> + <li><code>y</code> : (input) colIndex: starting index is 1</li> + <li><code>OX</code> : (output) filename of output matrix with all columns except y</li> + <li><code>OY</code> : (output) filename of output matrix with y column</li> + <li><code>ofmt</code> : (output) format of OX and OY output matrix: “csv”, “binary” (default)</li> +</ul> + +<p>We specify <code>y=4</code> as the 4th column contains the labels
to be predicted and run +the <code>splitXY.dml</code> algorithm on our training and test data sets.</p> + +<pre><code>$ ./runStandaloneSystemML.sh scripts/utils/splitXY.dml -nvargs X=data/haberman.part/1 y=4 OX=data/haberman.train.data.csv OY=data/haberman.train.labels.csv ofmt="csv" + +$ ./runStandaloneSystemML.sh scripts/utils/splitXY.dml -nvargs X=data/haberman.part/2 y=4 OX=data/haberman.test.data.csv OY=data/haberman.test.labels.csv ofmt="csv" +</code></pre> + +<h2 id="training-and-testing-the-model">Training and Testing the Model</h2> + +<p>Now we need to train our model using the <code>l2-svm.dml</code> algorithm.</p> + +<p><a href="algorithms-classification.html#arguments-1">Parameters</a>:</p> + +<ul> + <li><code>X</code> : (input) filename of training data features</li> + <li><code>Y</code> : (input) filename of training data labels</li> + <li><code>model</code> : (output) filename of model that contains the learnt weights</li> + <li><code>fmt</code> : (output) format of model: “csv”, “text” (sparse-matrix)</li> + <li><code>Log</code> : (output) log file for metrics and progress while training</li> + <li><code>confusion</code> : (output) filename of confusion matrix computed using a held-out test set (optional)</li> +</ul> + +<p>The <code>l2-svm.dml</code> algorithm is used on our training data sample to train the model.</p> + +<pre><code>$ ./runStandaloneSystemML.sh scripts/algorithms/l2-svm.dml -nvargs X=data/haberman.train.data.csv Y=data/haberman.train.labels.csv model=data/l2-svm-model.csv fmt="csv" Log=data/l2-svm-log.csv +</code></pre> + +<p>The <code>l2-svm-predict.dml</code> algorithm is used on our test data sample to predict the labels based on the trained model.</p> + +<pre><code>$ ./runStandaloneSystemML.sh scripts/algorithms/l2-svm-predict.dml -nvargs X=data/haberman.test.data.csv Y=data/haberman.test.labels.csv model=data/l2-svm-model.csv fmt="csv" confusion=data/l2-svm-confusion.csv +</code></pre> + +<p>The console output should show the 
accuracy of the trained model in percent, i.e.:</p> + +<pre><code>15/09/01 01:32:51 INFO api.DMLScript: BEGIN DML run 09/01/2015 01:32:51 +15/09/01 01:32:51 INFO conf.DMLConfig: Updating sysml.localtmpdir with value /tmp/systemml +15/09/01 01:32:51 INFO conf.DMLConfig: Updating sysml.scratch with value scratch_space +15/09/01 01:32:51 INFO conf.DMLConfig: Updating sysml.optlevel with value 2 +15/09/01 01:32:51 INFO conf.DMLConfig: Updating sysml.numreducers with value 10 +15/09/01 01:32:51 INFO conf.DMLConfig: Updating sysml.jvmreuse with value false +15/09/01 01:32:51 INFO conf.DMLConfig: Updating sysml.defaultblocksize with value 1000 +15/09/01 01:32:51 INFO conf.DMLConfig: Updating sysml.yarn.appmaster with value false +15/09/01 01:32:51 INFO conf.DMLConfig: Updating sysml.yarn.appmaster.mem with value 2048 +15/09/01 01:32:51 INFO conf.DMLConfig: Updating sysml.yarn.mapreduce.mem with value 2048 +15/09/01 01:32:51 INFO conf.DMLConfig: Updating sysml.yarn.app.queue with value default +15/09/01 01:32:51 INFO conf.DMLConfig: Updating sysml.parallel.ops with value true +15/09/01 01:32:51 INFO conf.DMLConfig: Updating sysml.parallel.io with value true +Accuracy (%): 74.14965986394557 +15/09/01 01:32:52 INFO api.DMLScript: SystemML Statistics: +Total execution time: 0.130 sec. +Number of executed MR Jobs: 0. 
+</code></pre> + +<p>The generated file <code>l2-svm-confusion.csv</code> should contain the following confusion matrix of this form:</p> + +<pre><code>|0 1.0 2.0| +|1.0 t1 t2 | +|2.0 t3 t4 | +</code></pre> + +<ul> + <li>The model correctly predicted label 1 <code>t1</code> times</li> + <li>The model incorrectly predicted label 1 as opposed to label 2 <code>t2</code> times</li> + <li>The model incorrectly predicted label 2 as opposed to label 1 <code>t3</code> times</li> + <li>The model correctly predicted label 2 <code>t4</code> times.</li> +</ul> + +<p>If the confusion matrix looks like this …</p> + +<pre><code>0,1.0,2.0 +1.0,107.0,38.0 +2.0,0.0,2.0 +</code></pre> + +<p>… then the accuracy of the model is (t1+t4)/(t1+t2+t3+t4) = (107+2)/(107+38+0+2) = 0.741496599</p> + +<p><br /></p> + +<p>Refer to the <a href="algorithms-reference.html">Algorithms Reference</a> for more details.</p> + +<hr /> + +<h1 id="example-3---linear-regression">Example 3 - Linear Regression</h1> + +<p>For this example, we’ll use a standalone wrapper executable, <code>bin/systemml</code>, that is available to +be run directly within the project’s source directory when built locally.</p> + +<p>After you build SystemML from source (<code>mvn clean package</code>), the standalone mode can be executed +either on Linux or OS X using the <code>./bin/systemml</code> script, or on Windows using the +<code>.\bin\systemml.bat</code> batch file.</p> + +<p>If you run the script from the project root folder <code>./</code> or from the <code>./bin</code> folder, then the +output files from running SystemML will be created inside the <code>./temp</code> folder to keep them separate +from the SystemML source files managed by Git. The output files for this example will be created +under the <code>./temp</code> folder.</p> + +<p>The runtime behavior and logging behavior of SystemML can be customized by editing the files +<code>./conf/SystemML-config.xml</code> and <code>./conf/log4j.properties</code>.
Both files will be created from their +corresponding <code>*.template</code> files during the first execution of the SystemML executable script.</p> + +<p>When invoking the <code>./bin/systemml</code> or <code>.\bin\systemml.bat</code> with any of the prepackaged DML scripts +you can omit the relative path to the DML script file. The following two commands are equivalent:</p> + +<pre><code>./bin/systemml ./scripts/datagen/genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5 + +./bin/systemml genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5 +</code></pre> + +<p>In this guide we invoke the command with the relative folder to make it easier to look up the source +of the DML scripts.</p> + +<h2 id="linear-regression-example">Linear Regression Example</h2> + +<p>As an example of the capabilities and power of SystemML and DML, let’s consider the Linear Regression algorithm. +We require sets of data to train and test our model. To obtain this data, we can either use real data or +generate data for our algorithm. The +<a href="https://archive.ics.uci.edu/ml/datasets.html">UCI Machine Learning Repository Datasets</a> is one location for real data. +Use of real data typically involves some degree of data wrangling. 
In the following example, we will use SystemML to +generate random data to train and test our model.</p> + +<p>This example consists of the following parts:</p> + +<ul> + <li><a href="#run-dml-script-to-generate-random-data">Run DML Script to Generate Random Data</a></li> + <li><a href="#divide-generated-data-into-two-sample-groups">Divide Generated Data into Two Sample Groups</a></li> + <li><a href="#split-label-column-from-first-sample">Split Label Column from First Sample</a></li> + <li><a href="#split-label-column-from-second-sample">Split Label Column from Second Sample</a></li> + <li><a href="#train-model-on-first-sample">Train Model on First Sample</a></li> + <li><a href="#test-model-on-second-sample">Test Model on Second Sample</a></li> +</ul> + +<p>SystemML is distributed in several packages, including a standalone package. We’ll operate in Standalone mode in this +example.</p> + +<p><a name="run-dml-script-to-generate-random-data"></a></p> + +<h3 id="run-dml-script-to-generate-random-data">Run DML Script to Generate Random Data</h3> + +<p>We can execute the <code>genLinearRegressionData.dml</code> script in Standalone mode using either the <code>systemml</code> or <code>systemml.bat</code> +file. +In this example, we’ll generate a matrix of 1000 rows of 50 columns of test data, with sparsity 0.7. 
In addition to +this, a 51<sup>st</sup> column consisting of labels will +be appended to the matrix.</p> + +<pre><code>./bin/systemml ./scripts/datagen/genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5 +</code></pre> + +<p>This generates the following files inside the <code>./temp</code> folder:</p> + +<pre><code>linRegData.csv # 1000 rows of 51 columns of doubles (50 data columns and 1 label column), csv format +linRegData.csv.mtd # Metadata file +perc.csv # Used to generate two subsets of the data (for training and testing) +perc.csv.mtd # Metadata file +scratch_space # SystemML scratch_space directory +</code></pre> + +<p><a name="divide-generated-data-into-two-sample-groups"></a></p> + +<h3 id="divide-generated-data-into-two-sample-groups">Divide Generated Data into Two Sample Groups</h3> + +<p>Next, we’ll create two subsets of the generated data, each of size ~50%. We can accomplish this using the <code>sample.dml</code> +script with the <code>perc.csv</code> file created in the previous step:</p> + +<pre><code>0.5 +0.5 +</code></pre> + +<p>The <code>sample.dml</code> script will randomly sample rows from the <code>linRegData.csv</code> file and place them into 2 files based +on the percentages specified in <code>perc.csv</code>. This will create two sample groups of roughly 50 percent each.</p> + +<pre><code>./bin/systemml ./scripts/utils/sample.dml -nvargs X=linRegData.csv sv=perc.csv O=linRegDataParts ofmt=csv +</code></pre> + +<p>This script creates two partitions of the original data and places them in a <code>linRegDataParts</code> folder. 
The files created +are as follows:</p> + +<pre><code>linRegDataParts/1 # first partition of data, ~50% of rows of linRegData.csv, csv format +linRegDataParts/1.mtd # metadata +linRegDataParts/2 # second partition of data, ~50% of rows of linRegData.csv, csv format +linRegDataParts/2.mtd # metadata +</code></pre> + +<p>The <code>1</code> file contains the first partition of data, and the <code>2</code> file contains the second partition of data. +An associated metadata file describes +the nature of each partition of data. If we open <code>1</code> and <code>2</code> and look at the number of rows, we can see that typically +the partitions are not exactly 50% but instead are close to 50%. However, we find that the total number of rows in the +original data file equals the sum of the number of rows in <code>1</code> and <code>2</code>.</p> + +<p><a name="split-label-column-from-first-sample"></a></p> + +<h3 id="split-label-column-from-first-sample">Split Label Column from First Sample</h3> + +<p>The next task is to split the label column from the first sample. We can do this using the <code>splitXY.dml</code> script.</p> + +<pre><code>./bin/systemml ./scripts/utils/splitXY.dml -nvargs X=linRegDataParts/1 y=51 OX=linRegData.train.data.csv OY=linRegData.train.labels.csv ofmt=csv +</code></pre> + +<p>This splits column 51, the label column, off from the data. 
When done, the following files have been created.</p> + +<pre><code>linRegData.train.data.csv # training data of 50 columns, csv format +linRegData.train.data.csv.mtd # metadata +linRegData.train.labels.csv # training labels of 1 column, csv format +linRegData.train.labels.csv.mtd # metadata +</code></pre> + +<p><a name="split-label-column-from-second-sample"></a></p> + +<h3 id="split-label-column-from-second-sample">Split Label Column from Second Sample</h3> + +<p>We also need to split the label column from the second sample.</p> + +<pre><code>./bin/systemml ./scripts/utils/splitXY.dml -nvargs X=linRegDataParts/2 y=51 OX=linRegData.test.data.csv OY=linRegData.test.labels.csv ofmt=csv +</code></pre> + +<p>This splits column 51 off the data, resulting in the following files:</p> + +<pre><code>linRegData.test.data.csv # test data of 50 columns, csv format +linRegData.test.data.csv.mtd # metadata +linRegData.test.labels.csv # test labels of 1 column, csv format +linRegData.test.labels.csv.mtd # metadata +</code></pre> + +<p><a name="train-model-on-first-sample"></a></p> + +<h3 id="train-model-on-first-sample">Train Model on First Sample</h3> + +<p>Now, we can train our model based on the first sample. To do this, we utilize the <code>LinearRegDS.dml</code> (Linear Regression +Direct Solve) script. Note that SystemML also includes a <code>LinearRegCG.dml</code> (Linear Regression Conjugate Gradient) +algorithm for situations where the number of features is large.</p> + +<pre><code>./bin/systemml ./scripts/algorithms/LinearRegDS.dml -nvargs X=linRegData.train.data.csv Y=linRegData.train.labels.csv B=betas.csv fmt=csv +</code></pre> + +<p>This will generate the following files:</p> + +<pre><code>betas.csv # betas, 50 rows of 1 column, csv format +betas.csv.mtd # metadata +</code></pre> + +<p>The LinearRegDS.dml script generates statistics to standard output similar to the following.</p> + +<pre><code>BEGIN LINEAR REGRESSION SCRIPT +Reading X and Y... 
+Calling the Direct Solver... +Computing the statistics... +AVG_TOT_Y,-2.160284487670675 +STDEV_TOT_Y,66.86434576808432 +AVG_RES_Y,-3.3127468704080085E-10 +STDEV_RES_Y,1.7231785003947183E-8 +DISPERSION,2.963950542926297E-16 +R2,1.0 +ADJUSTED_R2,1.0 +R2_NOBIAS,1.0 +ADJUSTED_R2_NOBIAS,1.0 +R2_VS_0,1.0 +ADJUSTED_R2_VS_0,1.0 +Writing the output matrix... +END LINEAR REGRESSION SCRIPT +</code></pre> + +<p>Now that we have our <code>betas.csv</code>, we can test our model with our second set of data.</p> + +<p><a name="test-model-on-second-sample"></a></p> + +<h3 id="test-model-on-second-sample">Test Model on Second Sample</h3> + +<p>To test our model on the second sample, we can use the <code>GLM-predict.dml</code> script. This script can be used for both +prediction and scoring. Here, we’re using it for scoring since we include the <code>Y</code> named argument. Our <code>betas.csv</code> +file is specified as the <code>B</code> named argument.</p> + +<pre><code>./bin/systemml ./scripts/algorithms/GLM-predict.dml -nvargs X=linRegData.test.data.csv Y=linRegData.test.labels.csv B=betas.csv fmt=csv +</code></pre> + +<p>This generates statistics similar to the following to standard output.</p> + +<pre><code>LOGLHOOD_Z,,FALSE,NaN +LOGLHOOD_Z_PVAL,,FALSE,NaN +PEARSON_X2,,FALSE,1.895530994504798E-13 +PEARSON_X2_BY_DF,,FALSE,4.202951207327712E-16 +PEARSON_X2_PVAL,,FALSE,1.0 +DEVIANCE_G2,,FALSE,0.0 +DEVIANCE_G2_BY_DF,,FALSE,0.0 +DEVIANCE_G2_PVAL,,FALSE,1.0 +LOGLHOOD_Z,,TRUE,NaN +LOGLHOOD_Z_PVAL,,TRUE,NaN +PEARSON_X2,,TRUE,1.895530994504798E-13 +PEARSON_X2_BY_DF,,TRUE,4.202951207327712E-16 +PEARSON_X2_PVAL,,TRUE,1.0 +DEVIANCE_G2,,TRUE,0.0 +DEVIANCE_G2_BY_DF,,TRUE,0.0 +DEVIANCE_G2_PVAL,,TRUE,1.0 +AVG_TOT_Y,1,,1.0069397725436522 +STDEV_TOT_Y,1,,68.29092137526905 +AVG_RES_Y,1,,-4.1450397073455047E-10 +STDEV_RES_Y,1,,2.0519206226041048E-8 +PRED_STDEV_RES,1,TRUE,1.0 +R2,1,,1.0 +ADJUSTED_R2,1,,1.0 +R2_NOBIAS,1,,1.0 +ADJUSTED_R2_NOBIAS,1,,1.0 +</code></pre> + +<p>We see that the 
STDEV_RES_Y value of the testing phase is of similar magnitude +to the value obtained from the model training phase.</p> + +<p>For convenience, we can encapsulate our DML invocations in a single script:</p> + +<pre><code>#!/bin/bash + +./bin/systemml ./scripts/datagen/genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5 + +./bin/systemml ./scripts/utils/sample.dml -nvargs X=linRegData.csv sv=perc.csv O=linRegDataParts ofmt=csv + +./bin/systemml ./scripts/utils/splitXY.dml -nvargs X=linRegDataParts/1 y=51 OX=linRegData.train.data.csv OY=linRegData.train.labels.csv ofmt=csv + +./bin/systemml ./scripts/utils/splitXY.dml -nvargs X=linRegDataParts/2 y=51 OX=linRegData.test.data.csv OY=linRegData.test.labels.csv ofmt=csv + +./bin/systemml ./scripts/algorithms/LinearRegDS.dml -nvargs X=linRegData.train.data.csv Y=linRegData.train.labels.csv B=betas.csv fmt=csv + +./bin/systemml ./scripts/algorithms/GLM-predict.dml -nvargs X=linRegData.test.data.csv Y=linRegData.test.labels.csv B=betas.csv fmt=csv +</code></pre> + +<h1 id="troubleshooting">Troubleshooting</h1> + +<p>If you encounter a <code>"java.lang.OutOfMemoryError"</code> you can edit the invocation +script (<code>runStandaloneSystemML.sh</code> or <code>runStandaloneSystemML.bat</code>) to increase +the memory available to the JVM, i.e:</p> + +<pre><code>java -Xmx16g -Xms4g -Xmn1g -cp ${CLASSPATH} org.apache.sysml.api.DMLScript \ + -f ${SCRIPT_FILE} -exec singlenode -config SystemML-config.xml \ + $@ +</code></pre> + + + </div> <!-- /container --> + + + + <script src="js/vendor/jquery-1.12.0.min.js"></script> + <script src="js/vendor/bootstrap.min.js"></script> + <script src="js/vendor/anchor.min.js"></script> + <script src="js/main.js"></script> + + + + + + <!-- Analytics --> + <script> + (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ + 
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), + m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) + })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); + ga('create', 'UA-71553733-1', 'auto'); + ga('send', 'pageview'); + </script> + + + + <!-- MathJax Section --> + <script type="text/x-mathjax-config"> + MathJax.Hub.Config({ + TeX: { equationNumbers: { autoNumber: "AMS" } } + }); + </script> + <script> + // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS. + // We could use "//cdn.mathjax...", but that won't support "file://". + (function(d, script) { + script = d.createElement('script'); + script.type = 'text/javascript'; + script.async = true; + script.onload = function(){ + MathJax.Hub.Config({ + tex2jax: { + inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ], + displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], + processEscapes: true, + skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] + } + }); + }; + script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') + + 'cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'; + d.getElementsByTagName('head')[0].appendChild(script); + }(document)); + </script> + </body> +</html>
Added: systemml/site/docs/1.1.0/troubleshooting-guide.html URL: http://svn.apache.org/viewvc/systemml/site/docs/1.1.0/troubleshooting-guide.html?rev=1828046&view=auto ============================================================================== --- systemml/site/docs/1.1.0/troubleshooting-guide.html (added) +++ systemml/site/docs/1.1.0/troubleshooting-guide.html Fri Mar 30 04:31:05 2018 @@ -0,0 +1,300 @@ +<!DOCTYPE html> +<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]--> +<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]--> +<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]--> +<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]--> + <head> + <title>Troubleshooting Guide - SystemML 1.1.0</title> + <meta charset="utf-8"> + <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> + + <meta name="description" content="Troubleshooting Guide"> + + <meta name="viewport" content="width=device-width"> + <link rel="stylesheet" href="css/bootstrap.min.css"> + <link rel="stylesheet" href="css/main.css"> + <link rel="stylesheet" href="css/pygments-default.css"> + <link rel="shortcut icon" href="img/favicon.png"> + </head> + <body> + <!--[if lt IE 7]> + <p class="chromeframe">You are using an outdated browser. 
<a href="http://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p> + <![endif]--> + + <header class="navbar navbar-default navbar-fixed-top" id="topbar"> + <div class="container"> + <div class="navbar-header"> + <div class="navbar-brand brand projectlogo"> + <a href="http://systemml.apache.org/"><img class="logo" src="img/systemml-logo.png" alt="Apache SystemML" title="Apache SystemML"/></a> + </div> + <div class="navbar-brand brand projecttitle"> + <a href="http://systemml.apache.org/">Apache SystemML<sup id="trademark">â¢</sup></a><br/> + <span class="version">1.1.0</span> + </div> + <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target=".navbar-collapse"> + <span class="sr-only">Toggle navigation</span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + </button> + </div> + <nav class="navbar-collapse collapse"> + <ul class="nav navbar-nav navbar-right"> + <li><a href="index.html">Overview</a></li> + <li><a href="https://github.com/apache/systemml">GitHub</a></li> + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">Documentation<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><b>Running SystemML:</b></li> + <li><a href="https://github.com/apache/systemml">SystemML GitHub README</a></li> + <li><a href="spark-mlcontext-programming-guide.html">Spark MLContext</a></li> + <li><a href="spark-batch-mode.html">Spark Batch Mode</a> + <li><a href="hadoop-batch-mode.html">Hadoop Batch Mode</a> + <li><a href="standalone-guide.html">Standalone Guide</a></li> + <li><a href="jmlc.html">Java Machine Learning Connector (JMLC)</a> + <li class="divider"></li> + <li><b>Language Guides:</b></li> + <li><a href="dml-language-reference.html">DML Language Reference</a></li> + <li><a 
href="beginners-guide-to-dml-and-pydml.html">Beginner's Guide to DML and PyDML</a></li> + <li><a href="beginners-guide-python.html">Beginner's Guide for Python Users</a></li> + <li><a href="python-reference.html">Reference Guide for Python Users</a></li> + <li class="divider"></li> + <li><b>ML Algorithms:</b></li> + <li><a href="algorithms-reference.html">Algorithms Reference</a></li> + <li class="divider"></li> + <li><b>Tools:</b></li> + <li><a href="debugger-guide.html">Debugger Guide</a></li> + <li><a href="developer-tools-systemml.html">IDE Guide</a></li> + <li class="divider"></li> + <li><b>Other:</b></li> + <li><a href="contributing-to-systemml.html">Contributing to SystemML</a></li> + <li><a href="engine-dev-guide.html">Engine Developer Guide</a></li> + <li><a href="troubleshooting-guide.html">Troubleshooting Guide</a></li> + <li><a href="release-process.html">Release Process</a></li> + </ul> + </li> + + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><a href="./api/java/index.html">Java</a></li> + <li><a href="./api/python/index.html">Python</a></li> + </ul> + </li> + + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">Issues<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><b>JIRA:</b></li> + <li><a href="https://issues.apache.org/jira/browse/SYSTEMML">SystemML JIRA</a></li> + + </ul> + </li> + </ul> + </nav> + </div> + </header> + + <div class="container" id="content"> + + <h1 class="title">Troubleshooting Guide</h1> + + + <!-- + +--> + +<ul id="markdown-toc"> + <li><a href="#classnotfoundexception-for-commons-math3" id="markdown-toc-classnotfoundexception-for-commons-math3">ClassNotFoundException for commons-math3</a></li> + <li><a href="#outofmemoryerror-in-hadoop-reduce-phase" id="markdown-toc-outofmemoryerror-in-hadoop-reduce-phase">OutOfMemoryError in Hadoop Reduce Phase</a></li> + 
<li><a href="#total-size-of-serialized-results-is-bigger-than-sparkdrivermaxresultsize" id="markdown-toc-total-size-of-serialized-results-is-bigger-than-sparkdrivermaxresultsize">Total size of serialized results is bigger than spark.driver.maxResultSize</a></li> + <li><a href="#file-does-not-exist-on-hdfslfs-error-from-remote-parfor" id="markdown-toc-file-does-not-exist-on-hdfslfs-error-from-remote-parfor">File does not exist on HDFS/LFS error from remote parfor</a></li> + <li><a href="#jvm-garbage-collection-related-flags" id="markdown-toc-jvm-garbage-collection-related-flags">JVM Garbage Collection related flags</a></li> + <li><a href="#memory-overhead" id="markdown-toc-memory-overhead">Memory overhead</a></li> + <li><a href="#network-timeout" id="markdown-toc-network-timeout">Network timeout</a></li> + <li><a href="#advanced-developer-statistics" id="markdown-toc-advanced-developer-statistics">Advanced developer statistics</a></li> + <li><a href="#out-of-memory-on-executors" id="markdown-toc-out-of-memory-on-executors">Out-Of-Memory on executors</a></li> + <li><a href="#native-blas-errors" id="markdown-toc-native-blas-errors">Native BLAS errors</a></li> +</ul> + +<p><br /></p> + +<h2 id="classnotfoundexception-for-commons-math3">ClassNotFoundException for commons-math3</h2> + +<p>The Apache Commons Math library is utilized by SystemML. The commons-math3 +dependency is included with Spark and with newer versions of Hadoop. 
Running +SystemML on an older Hadoop cluster can potentially generate an error such +as the following due to the missing commons-math3 dependency:</p> + +<pre><code>java.lang.ClassNotFoundException: org.apache.commons.math3.linear.RealMatrix +</code></pre> + +<p>This issue can be fixed by changing the commons-math3 <code>scope</code> in the pom.xml file +from <code>provided</code> to <code>compile</code>.</p> + +<pre><code><dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-math3</artifactId> + <version>3.1.1</version> + <scope>compile</scope> +</dependency> +</code></pre> + +<p>SystemML can then be rebuilt with the <code>commons-math3</code> dependency using +Maven (<code>mvn clean package -P distribution</code>).</p> + +<h2 id="outofmemoryerror-in-hadoop-reduce-phase">OutOfMemoryError in Hadoop Reduce Phase</h2> +<p>In Hadoop MapReduce, outputs from mapper nodes are copied to reducer nodes and then sorted (known as the <em>shuffle</em> phase) before being consumed by reducers. The shuffle phase utilizes several buffers that share memory space with other MapReduce tasks, which will throw an <code>OutOfMemoryError</code> if the shuffle buffers take too much space:</p> + +<pre><code>Error: java.lang.OutOfMemoryError: Java heap space + at org.apache.hadoop.mapred.IFile$Reader.readNextBlock(IFile.java:357) + at org.apache.hadoop.mapred.IFile$Reader.next(IFile.java:419) + at org.apache.hadoop.mapred.Merger$Segment.next(Merger.java:238) + at org.apache.hadoop.mapred.Merger$MergeQueue.adjustPriorityQueue(Merger.java:348) + at org.apache.hadoop.mapred.Merger$MergeQueue.next(Merger.java:368) + at org.apache.hadoop.mapred.Merger.writeFile(Merger.java:156) + ... 
+</code></pre> + +<p>One way to fix this issue is lowering the following buffer thresholds.</p> + +<pre><code>mapred.job.shuffle.input.buffer.percent # default 0.70; try 0.20 +mapred.job.shuffle.merge.percent # default 0.66; try 0.20 +mapred.job.reduce.input.buffer.percent # default 0.0; keep 0.0 +</code></pre> + +<p>These configurations can be modified <strong>globally</strong> by inserting/modifying the following in <code>mapred-site.xml</code>.</p> + +<pre><code><property> + <name>mapred.job.shuffle.input.buffer.percent</name> + <value>0.2</value> +</property> +<property> + <name>mapred.job.shuffle.merge.percent</name> + <value>0.2</value> +</property> +<property> + <name>mapred.job.reduce.input.buffer.percent</name> + <value>0.0</value> +</property> +</code></pre> + +<p>They can also be configured on a <strong>per SystemML-task basis</strong> by inserting the following in <code>SystemML-config.xml</code>.</p> + +<pre><code><mapred.job.shuffle.merge.percent>0.2</mapred.job.shuffle.merge.percent> +<mapred.job.shuffle.input.buffer.percent>0.2</mapred.job.shuffle.input.buffer.percent> +<mapred.job.reduce.input.buffer.percent>0</mapred.job.reduce.input.buffer.percent> +</code></pre> + +<p>Note: The default <code>SystemML-config.xml</code> is located in <code><path to SystemML root>/conf/</code>. It is passed to SystemML using the <code>-config</code> argument:</p> + +<pre><code>hadoop jar SystemML.jar [-? | -help | -f <filename>] (-config <config_filename>) ([-args | -nvargs] <args-list>) +</code></pre> + +<p>See <a href="hadoop-batch-mode.html">Invoking SystemML in Hadoop Batch Mode</a> for details of the syntax.</p> + +<h2 id="total-size-of-serialized-results-is-bigger-than-sparkdrivermaxresultsize">Total size of serialized results is bigger than spark.driver.maxResultSize</h2> + +<p>Spark aborts a job if the estimated result size of collect is greater than maxResultSize to avoid out-of-memory errors in driver. 
+However, SystemML’s optimizer estimates the memory required for each operator and provides guards against these out-of-memory errors in the driver. +So, we recommend setting the configuration <code>--conf spark.driver.maxResultSize=0</code>.</p> + +<h2 id="file-does-not-exist-on-hdfslfs-error-from-remote-parfor">File does not exist on HDFS/LFS error from remote parfor</h2> + +<p>This error usually comes from incorrect HDFS configuration on the worker nodes. To investigate this, we recommend</p> + +<ul> + <li>Test if HDFS is accessible from the worker node: <code>hadoop fs -ls <file path></code></li> + <li>Synchronize hadoop configuration across the worker nodes.</li> + <li>Set the environment variable <code>HADOOP_CONF_DIR</code>. You may have to restart the cluster-manager to get the hadoop configuration.</li> +</ul> + +<h2 id="jvm-garbage-collection-related-flags">JVM Garbage Collection related flags</h2> + +<p>We recommend providing 10% of the maximum memory to the young generation and using the <code>-server</code> flag for a robust garbage collection policy. +For example: if you intend to use a 20G driver and a 60G executor, then please add the following to your configuration:</p> + +<pre><code> spark-submit --driver-memory 20G --executor-memory 60G --conf "spark.executor.extraJavaOptions=-Xmn6G -server" --conf "spark.driver.extraJavaOptions=-Xmn2G -server" ... +</code></pre> + +<h2 id="memory-overhead">Memory overhead</h2> + +<p>Spark sets <code>spark.yarn.executor.memoryOverhead</code>, <code>spark.yarn.driver.memoryOverhead</code> and <code>spark.yarn.am.memoryOverhead</code> to be 10% of the memory provided +to the executor, driver and YARN Application Master respectively (with a minimum of 384 MB). 
For certain workloads, the user may have to increase this +overhead to 12-15% of the memory budget.</p> + +<h2 id="network-timeout">Network timeout</h2> + +<p>To avoid false-positive errors due to network failures in the case of compute-bound scripts, the user may have to increase the timeout <code>spark.network.timeout</code> (default: 120s).</p> + +<h2 id="advanced-developer-statistics">Advanced developer statistics</h2> + +<p>A few of our operators (for example: convolution-related operators) and the GPU backend allow an expert user to get advanced statistics +by setting the configuration <code>systemml.stats.extraGPU</code> and <code>systemml.stats.extraDNN</code> in the file SystemML-config.xml.</p> + +<h2 id="out-of-memory-on-executors">Out-Of-Memory on executors</h2> + +<p>Out-Of-Memory on executors is often caused by side-effects of lazy evaluation and in-memory input data of Spark for large-scale problems. +Though we are constantly improving our optimizer to address this scenario, a quick hack to resolve this is reducing the number of cores allocated to the executor. 
+We would highly appreciate if you file a bug report on our <a href="https://issues.apache.org/jira/browse/SYSTEMML">issue tracker</a> if and when you encounter OOM.</p> + +<h2 id="native-blas-errors">Native BLAS errors</h2> + +<p>Please see <a href="http://apache.github.io/systemml/native-backend">the user guide of native backend</a>.</p> + + + </div> <!-- /container --> + + + + <script src="js/vendor/jquery-1.12.0.min.js"></script> + <script src="js/vendor/bootstrap.min.js"></script> + <script src="js/vendor/anchor.min.js"></script> + <script src="js/main.js"></script> + + + + + + <!-- Analytics --> + <script> + (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ + (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), + m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) + })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); + ga('create', 'UA-71553733-1', 'auto'); + ga('send', 'pageview'); + </script> + + + + <!-- MathJax Section --> + <script type="text/x-mathjax-config"> + MathJax.Hub.Config({ + TeX: { equationNumbers: { autoNumber: "AMS" } } + }); + </script> + <script> + // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS. + // We could use "//cdn.mathjax...", but that won't support "file://". + (function(d, script) { + script = d.createElement('script'); + script.type = 'text/javascript'; + script.async = true; + script.onload = function(){ + MathJax.Hub.Config({ + tex2jax: { + inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ], + displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], + processEscapes: true, + skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] + } + }); + }; + script.src = ('https:' == document.location.protocol ? 
'https://' : 'http://') + + 'cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'; + d.getElementsByTagName('head')[0].appendChild(script); + }(document)); + </script> + </body> +</html>
