Added: systemml/site/docs/1.1.0/standalone-guide.html URL: http://svn.apache.org/viewvc/systemml/site/docs/1.1.0/standalone-guide.html?rev=1828046&view=auto ============================================================================== --- systemml/site/docs/1.1.0/standalone-guide.html (added) +++ systemml/site/docs/1.1.0/standalone-guide.html Fri Mar 30 04:31:05 2018 @@ -0,0 +1,892 @@ +<!DOCTYPE html> +<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]--> +<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]--> +<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]--> +<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]--> + <head> + <title>SystemML Standalone Guide - SystemML 1.1.0</title> + <meta charset="utf-8"> + <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> + + <meta name="description" content="SystemML Standalone Guide"> + + <meta name="viewport" content="width=device-width"> + <link rel="stylesheet" href="css/bootstrap.min.css"> + <link rel="stylesheet" href="css/main.css"> + <link rel="stylesheet" href="css/pygments-default.css"> + <link rel="shortcut icon" href="img/favicon.png"> + </head> + <body> + <!--[if lt IE 7]> + <p class="chromeframe">You are using an outdated browser. 
<a href="http://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p> + <![endif]--> + + <header class="navbar navbar-default navbar-fixed-top" id="topbar"> + <div class="container"> + <div class="navbar-header"> + <div class="navbar-brand brand projectlogo"> + <a href="http://systemml.apache.org/"><img class="logo" src="img/systemml-logo.png" alt="Apache SystemML" title="Apache SystemML"/></a> + </div> + <div class="navbar-brand brand projecttitle"> + <a href="http://systemml.apache.org/">Apache SystemML<sup id="trademark">™</sup></a><br/> + <span class="version">1.1.0</span> + </div> + <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target=".navbar-collapse"> + <span class="sr-only">Toggle navigation</span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + </button> + </div> + <nav class="navbar-collapse collapse"> + <ul class="nav navbar-nav navbar-right"> + <li><a href="index.html">Overview</a></li> + <li><a href="https://github.com/apache/systemml">GitHub</a></li> + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">Documentation<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><b>Running SystemML:</b></li> + <li><a href="https://github.com/apache/systemml">SystemML GitHub README</a></li> + <li><a href="spark-mlcontext-programming-guide.html">Spark MLContext</a></li> + <li><a href="spark-batch-mode.html">Spark Batch Mode</a> + <li><a href="hadoop-batch-mode.html">Hadoop Batch Mode</a> + <li><a href="standalone-guide.html">Standalone Guide</a></li> + <li><a href="jmlc.html">Java Machine Learning Connector (JMLC)</a> + <li class="divider"></li> + <li><b>Language Guides:</b></li> + <li><a href="dml-language-reference.html">DML Language Reference</a></li> + <li><a
href="beginners-guide-to-dml-and-pydml.html">Beginner's Guide to DML and PyDML</a></li> + <li><a href="beginners-guide-python.html">Beginner's Guide for Python Users</a></li> + <li><a href="python-reference.html">Reference Guide for Python Users</a></li> + <li class="divider"></li> + <li><b>ML Algorithms:</b></li> + <li><a href="algorithms-reference.html">Algorithms Reference</a></li> + <li class="divider"></li> + <li><b>Tools:</b></li> + <li><a href="debugger-guide.html">Debugger Guide</a></li> + <li><a href="developer-tools-systemml.html">IDE Guide</a></li> + <li class="divider"></li> + <li><b>Other:</b></li> + <li><a href="contributing-to-systemml.html">Contributing to SystemML</a></li> + <li><a href="engine-dev-guide.html">Engine Developer Guide</a></li> + <li><a href="troubleshooting-guide.html">Troubleshooting Guide</a></li> + <li><a href="release-process.html">Release Process</a></li> + </ul> + </li> + + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><a href="./api/java/index.html">Java</a></li> + <li><a href="./api/python/index.html">Python</a></li> + </ul> + </li> + + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">Issues<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><b>JIRA:</b></li> + <li><a href="https://issues.apache.org/jira/browse/SYSTEMML">SystemML JIRA</a></li> + + </ul> + </li> + </ul> + </nav> + </div> + </header> + + <div class="container" id="content"> + + <h1 class="title">SystemML Standalone Guide</h1> + + + <!-- + +--> + +<ul id="markdown-toc"> + <li><a href="#what-is-systemml" id="markdown-toc-what-is-systemml">What is SystemML</a></li> + <li><a href="#download-systemml" id="markdown-toc-download-systemml">Download SystemML</a></li> + <li><a href="#standalone-vs-distributed-execution-mode" id="markdown-toc-standalone-vs-distributed-execution-mode">Standalone vs 
Distributed Execution Mode</a></li> + <li><a href="#choosing-test-data" id="markdown-toc-choosing-test-data">Choosing Test Data</a></li> + <li><a href="#example-1---univariate-statistics" id="markdown-toc-example-1---univariate-statistics">Example 1 - Univariate Statistics</a></li> + <li><a href="#example-2---binary-class-support-vector-machines" id="markdown-toc-example-2---binary-class-support-vector-machines">Example 2 - Binary-class Support Vector Machines</a> <ul> + <li><a href="#sampling-the-test-data" id="markdown-toc-sampling-the-test-data">Sampling the Test Data</a></li> + <li><a href="#splitting-labels-from-features" id="markdown-toc-splitting-labels-from-features">Splitting Labels from Features</a></li> + <li><a href="#training-and-testing-the-model" id="markdown-toc-training-and-testing-the-model">Training and Testing the Model</a></li> + </ul> + </li> + <li><a href="#example-3---linear-regression" id="markdown-toc-example-3---linear-regression">Example 3 - Linear Regression</a> <ul> + <li><a href="#linear-regression-example" id="markdown-toc-linear-regression-example">Linear Regression Example</a> <ul> + <li><a href="#run-dml-script-to-generate-random-data" id="markdown-toc-run-dml-script-to-generate-random-data">Run DML Script to Generate Random Data</a></li> + <li><a href="#divide-generated-data-into-two-sample-groups" id="markdown-toc-divide-generated-data-into-two-sample-groups">Divide Generated Data into Two Sample Groups</a></li> + <li><a href="#split-label-column-from-first-sample" id="markdown-toc-split-label-column-from-first-sample">Split Label Column from First Sample</a></li> + <li><a href="#split-label-column-from-second-sample" id="markdown-toc-split-label-column-from-second-sample">Split Label Column from Second Sample</a></li> + <li><a href="#train-model-on-first-sample" id="markdown-toc-train-model-on-first-sample">Train Model on First Sample</a></li> + <li><a href="#test-model-on-second-sample" 
id="markdown-toc-test-model-on-second-sample">Test Model on Second Sample</a></li> + </ul> + </li> + </ul> + </li> + <li><a href="#troubleshooting" id="markdown-toc-troubleshooting">Troubleshooting</a></li> +</ul> + +<p><br /></p> + +<p>This tutorial provides a quick introduction to using SystemML by +running existing SystemML algorithms in standalone mode.</p> + +<h1 id="what-is-systemml">What is SystemML</h1> + +<p>SystemML enables large-scale machine learning (ML) via a high-level declarative +language with R-like syntax called <a href="dml-language-reference.html">DML</a> and +Python-like syntax called PyDML. DML and PyDML allow data scientists to +express their ML algorithms with full flexibility but without the need to fine-tune +distributed runtime execution plans and system configurations. +These ML programs are dynamically compiled and optimized based on data +and cluster characteristics using rule-based and cost-based optimization techniques. +The compiler automatically generates hybrid runtime execution plans ranging +from in-memory, single node execution to distributed computation for Hadoop +or Spark Batch execution. +SystemML features a suite of algorithms for Descriptive Statistics, Classification, +Clustering, Regression, Matrix Factorization, and Survival Analysis. Detailed descriptions of these +algorithms can be found in the <a href="algorithms-reference.html">Algorithms Reference</a>.</p> + +<h1 id="download-systemml">Download SystemML</h1> + +<p>Apache SystemML releases are available from the <a href="http://systemml.apache.org/download.html">Downloads</a> page.</p> + +<p>SystemML can also be downloaded from GitHub and built with Maven. +The SystemML project is available on GitHub at <a href="https://github.com/apache/systemml">https://github.com/apache/systemml</a>. 
+Instructions to build SystemML can be found in the <a href="engine-dev-guide.html">Engine Developer Guide</a>.</p> + +<h1 id="standalone-vs-distributed-execution-mode">Standalone vs Distributed Execution Mode</h1> + +<p>SystemML’s standalone mode is designed to allow data scientists to rapidly prototype algorithms +on a single machine. In standalone mode, all operations occur on a single node in a non-Hadoop +environment. Standalone mode is not appropriate for large datasets.</p> + +<p>For large-scale production environments, SystemML algorithm execution can be +distributed across multi-node clusters using <a href="https://hadoop.apache.org/">Apache Hadoop</a> +or <a href="http://spark.apache.org/">Apache Spark</a>. +We will make use of standalone mode throughout this tutorial.</p> + +<h1 id="choosing-test-data">Choosing Test Data</h1> + +<p>In this tutorial we will use the <a href="http://archive.ics.uci.edu/ml/datasets/Haberman%27s+Survival">Haberman’s Survival Data Set</a> +which can be downloaded in CSV format from the <a href="http://cml.ics.uci.edu/">Center for Machine Learning and Intelligent Systems</a>.</p> + +<pre><code>$ wget -P data/ http://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data +</code></pre> + +<p>The <a href="http://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.names">Haberman Data Set</a> +has 306 instances and 4 attributes (including the class attribute):</p> + +<ol> + <li>Age of patient at time of operation (numerical)</li> + <li>Patient’s year of operation (year - 1900, numerical)</li> + <li>Number of positive axillary nodes detected (numerical)</li> + <li>Survival status (class attribute) + * <code>1</code> = the patient survived 5 years or longer + * <code>2</code> = the patient died within 5 years</li> +</ol> + +<p>We will need to create a metadata file (MTD) which stores metadata information +about the content of the data file.
The name of the MTD file associated with the +data file <code><filename></code> must be <code><filename>.mtd</code>.</p> + +<pre><code>$ echo '{"rows": 306, "cols": 4, "format": "csv"}' > data/haberman.data.mtd +</code></pre> + +<hr /> + +<h1 id="example-1---univariate-statistics">Example 1 - Univariate Statistics</h1> + +<p>Let’s start with a simple example, computing certain <a href="algorithms-descriptive-statistics.html#univariate-statistics">univariate statistics</a> +for each feature column using the algorithm <code>Univar-Stats.dml</code> which requires 3 +<a href="algorithms-descriptive-statistics.html#arguments">arguments</a>:</p> + +<ul> + <li><code>X</code>: location of the input data file to analyze</li> + <li><code>TYPES</code>: location of the file that contains the feature column types encoded by integer numbers: <code>1</code> = scale, <code>2</code> = nominal, <code>3</code> = ordinal</li> + <li><code>STATS</code>: location where the output matrix of computed statistics is to be stored</li> +</ul> + +<p>We need to create a file <code>types.csv</code> that describes the type of each column in +the data along with its metadata file <code>types.csv.mtd</code>.</p> + +<pre><code>$ echo '1,1,1,2' > data/types.csv +$ echo '{"rows": 1, "cols": 4, "format": "csv"}' > data/types.csv.mtd +</code></pre> + +<p>To run the <code>Univar-Stats.dml</code> algorithm, issue the following command (we set the optional argument <code>CONSOLE_OUTPUT</code> to <code>TRUE</code> to print the statistics to the console):</p> + +<pre><code>$ ./runStandaloneSystemML.sh scripts/algorithms/Univar-Stats.dml -nvargs X=data/haberman.data TYPES=data/types.csv STATS=data/univarOut.mtx CONSOLE_OUTPUT=TRUE + +[...] 
+------------------------------------------------- +Feature [1]: Scale + (01) Minimum | 30.0 + (02) Maximum | 83.0 + (03) Range | 53.0 + (04) Mean | 52.45751633986928 + (05) Variance | 116.71458266366658 + (06) Std deviation | 10.803452349303281 + (07) Std err of mean | 0.6175922641866753 + (08) Coeff of variation | 0.20594669940735139 + (09) Skewness | 0.1450718616532357 + (10) Kurtosis | -0.6150152487211726 + (11) Std err of skewness | 0.13934809593495995 + (12) Std err of kurtosis | 0.277810485320835 + (13) Median | 52.0 + (14) Interquartile mean | 52.16013071895425 +------------------------------------------------- +Feature [2]: Scale + (01) Minimum | 58.0 + (02) Maximum | 69.0 + (03) Range | 11.0 + (04) Mean | 62.85294117647059 + (05) Variance | 10.558630665380907 + (06) Std deviation | 3.2494046632238507 + (07) Std err of mean | 0.18575610076612029 + (08) Coeff of variation | 0.051698529971741194 + (09) Skewness | 0.07798443581479181 + (10) Kurtosis | -1.1324380182967442 + (11) Std err of skewness | 0.13934809593495995 + (12) Std err of kurtosis | 0.277810485320835 + (13) Median | 63.0 + (14) Interquartile mean | 62.80392156862745 +------------------------------------------------- +Feature [3]: Scale + (01) Minimum | 0.0 + (02) Maximum | 52.0 + (03) Range | 52.0 + (04) Mean | 4.026143790849673 + (05) Variance | 51.691117539912135 + (06) Std deviation | 7.189653506248555 + (07) Std err of mean | 0.41100513466216837 + (08) Coeff of variation | 1.7857418611299172 + (09) Skewness | 2.954633471088322 + (10) Kurtosis | 11.425776549251449 + (11) Std err of skewness | 0.13934809593495995 + (12) Std err of kurtosis | 0.277810485320835 + (13) Median | 1.0 + (14) Interquartile mean | 1.2483660130718954 +------------------------------------------------- +Feature [4]: Categorical (Nominal) + (15) Num of categories | 2 + (16) Mode | 1 + (17) Num of modes | 1 +</code></pre> + +<p>In addition to writing statistics to the console, the <code>Univar-Stats.dml</code> script 
writes the computed statistics +to the <code>data/univarOut.mtx</code> file specified by the STATS input parameter.</p> + +<p><strong>univarOut.mtx file</strong></p> + +<pre><code>1 1 30.0 +1 2 58.0 +2 1 83.0 +2 2 69.0 +2 3 52.0 +3 1 53.0 +3 2 11.0 +3 3 52.0 +4 1 52.45751633986928 +4 2 62.85294117647059 +4 3 4.026143790849673 +5 1 116.71458266366658 +5 2 10.558630665380907 +5 3 51.691117539912135 +6 1 10.803452349303281 +6 2 3.2494046632238507 +6 3 7.189653506248555 +7 1 0.6175922641866753 +7 2 0.18575610076612029 +7 3 0.41100513466216837 +8 1 0.20594669940735139 +8 2 0.051698529971741194 +8 3 1.7857418611299172 +9 1 0.1450718616532357 +9 2 0.07798443581479181 +9 3 2.954633471088322 +10 1 -0.6150152487211726 +10 2 -1.1324380182967442 +10 3 11.425776549251449 +11 1 0.13934809593495995 +11 2 0.13934809593495995 +11 3 0.13934809593495995 +12 1 0.277810485320835 +12 2 0.277810485320835 +12 3 0.277810485320835 +13 1 52.0 +13 2 63.0 +13 3 1.0 +14 1 52.16013071895425 +14 2 62.80392156862745 +14 3 1.2483660130718954 +15 4 2.0 +16 4 1.0 +17 4 1.0 +</code></pre> + +<p>The following table lists the number and name of each univariate statistic. The row +numbers below correspond to the elements of the first column in the output +matrix above. 
The signs “+” show applicability to scale or/and to categorical +features.</p> + +<table> + <thead> + <tr> + <th style="text-align: center">Row</th> + <th style="text-align: left">Name of Statistic</th> + <th style="text-align: center">Scale</th> + <th style="text-align: center">Categ.</th> + </tr> + </thead> + <tbody> + <tr> + <td style="text-align: center">1</td> + <td style="text-align: left">Minimum</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td style="text-align: center">2</td> + <td style="text-align: left">Maximum</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td style="text-align: center">3</td> + <td style="text-align: left">Range</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td style="text-align: center">4</td> + <td style="text-align: left">Mean</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td style="text-align: center">5</td> + <td style="text-align: left">Variance</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td style="text-align: center">6</td> + <td style="text-align: left">Standard deviation</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td style="text-align: center">7</td> + <td style="text-align: left">Standard error of mean</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td style="text-align: center">8</td> + <td style="text-align: left">Coefficient of variation</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td style="text-align: center">9</td> + <td style="text-align: left">Skewness</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td 
style="text-align: center">10</td> + <td style="text-align: left">Kurtosis</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td style="text-align: center">11</td> + <td style="text-align: left">Standard error of skewness</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td style="text-align: center">12</td> + <td style="text-align: left">Standard error of kurtosis</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td style="text-align: center">13</td> + <td style="text-align: left">Median</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td style="text-align: center">14</td> + <td style="text-align: left">Inter quartile mean</td> + <td style="text-align: center">+</td> + <td style="text-align: center"> </td> + </tr> + <tr> + <td style="text-align: center">15</td> + <td style="text-align: left">Number of categories</td> + <td style="text-align: center"> </td> + <td style="text-align: center">+</td> + </tr> + <tr> + <td style="text-align: center">16</td> + <td style="text-align: left">Mode</td> + <td style="text-align: center"> </td> + <td style="text-align: center">+</td> + </tr> + <tr> + <td style="text-align: center">17</td> + <td style="text-align: left">Number of modes</td> + <td style="text-align: center"> </td> + <td style="text-align: center">+</td> + </tr> + </tbody> +</table> + +<hr /> + +<h1 id="example-2---binary-class-support-vector-machines">Example 2 - Binary-class Support Vector Machines</h1> + +<p>Let’s take the same <code>haberman.data</code> to explore the +<a href="algorithms-classification.html#binary-class-support-vector-machines">binary-class support vector machines</a> algorithm <code>l2-svm.dml</code>. 
+This example also illustrates how to use the sampling algorithm <code>sample.dml</code> +and the data split algorithm <code>splitXY.dml</code>.</p> + +<h2 id="sampling-the-test-data">Sampling the Test Data</h2> + +<p>First we need to use the <code>sample.dml</code> algorithm to separate the input into one +training data set and one data set for model prediction.</p> + +<p>Parameters:</p> + +<ul> + <li><code>X</code> : (input) input data set: filename of input data set</li> + <li><code>sv</code> : (input) sampling vector: filename of 1-column vector w/ percentages. sum(sv) must be 1.</li> + <li><code>O</code> : (output) folder name w/ samples generated</li> + <li><code>ofmt</code> : (output) format of O: “csv”, “binary” (default)</li> +</ul> + +<p>We will create the file <code>perc.csv</code> and <code>perc.csv.mtd</code> to define the sampling vector with a sampling rate of +50% to generate 2 data sets:</p> + +<pre><code>$ printf "0.5\n0.5" > data/perc.csv +$ echo '{"rows": 2, "cols": 1, "format": "csv"}' > data/perc.csv.mtd +</code></pre> + +<p>Let’s run the sampling algorithm to create the two data samples:</p> + +<pre><code>$ ./runStandaloneSystemML.sh scripts/utils/sample.dml -nvargs X=data/haberman.data sv=data/perc.csv O=data/haberman.part ofmt="csv" +</code></pre> + +<h2 id="splitting-labels-from-features">Splitting Labels from Features</h2> + +<p>Next we use the <code>splitXY.dml</code> algorithm to separate the feature columns from +the label column(s).</p> + +<p>Parameters:</p> + +<ul> + <li><code>X</code> : (input) filename of data matrix</li> + <li><code>y</code> : (input) colIndex: starting index is 1</li> + <li><code>OX</code> : (output) filename of output matrix with all columns except y</li> + <li><code>OY</code> : (output) filename of output matrix with y column</li> + <li><code>ofmt</code> : (output) format of OX and OY output matrix: “csv”, “binary” (default)</li> +</ul> + +<p>We specify <code>y=4</code> as the 4th column contains the labels
to be predicted and run +the <code>splitXY.dml</code> algorithm on our training and test data sets.</p> + +<pre><code>$ ./runStandaloneSystemML.sh scripts/utils/splitXY.dml -nvargs X=data/haberman.part/1 y=4 OX=data/haberman.train.data.csv OY=data/haberman.train.labels.csv ofmt="csv" + +$ ./runStandaloneSystemML.sh scripts/utils/splitXY.dml -nvargs X=data/haberman.part/2 y=4 OX=data/haberman.test.data.csv OY=data/haberman.test.labels.csv ofmt="csv" +</code></pre> + +<h2 id="training-and-testing-the-model">Training and Testing the Model</h2> + +<p>Now we need to train our model using the <code>l2-svm.dml</code> algorithm.</p> + +<p><a href="algorithms-classification.html#arguments-1">Parameters</a>:</p> + +<ul> + <li><code>X</code> : (input) filename of training data features</li> + <li><code>Y</code> : (input) filename of training data labels</li> + <li><code>model</code> : (output) filename of model that contains the learnt weights</li> + <li><code>fmt</code> : (output) format of model: “csv”, “text” (sparse-matrix)</li> + <li><code>Log</code> : (output) log file for metrics and progress while training</li> + <li><code>confusion</code> : (output) filename of confusion matrix computed using a held-out test set (optional)</li> +</ul> + +<p>The <code>l2-svm.dml</code> algorithm is used on our training data sample to train the model.</p> + +<pre><code>$ ./runStandaloneSystemML.sh scripts/algorithms/l2-svm.dml -nvargs X=data/haberman.train.data.csv Y=data/haberman.train.labels.csv model=data/l2-svm-model.csv fmt="csv" Log=data/l2-svm-log.csv +</code></pre> + +<p>The <code>l2-svm-predict.dml</code> algorithm is used on our test data sample to predict the labels based on the trained model.</p> + +<pre><code>$ ./runStandaloneSystemML.sh scripts/algorithms/l2-svm-predict.dml -nvargs X=data/haberman.test.data.csv Y=data/haberman.test.labels.csv model=data/l2-svm-model.csv fmt="csv" confusion=data/l2-svm-confusion.csv +</code></pre> + +<p>The console output should show the 
accuracy of the trained model in percent, i.e.:</p> + +<pre><code>15/09/01 01:32:51 INFO api.DMLScript: BEGIN DML run 09/01/2015 01:32:51 +15/09/01 01:32:51 INFO conf.DMLConfig: Updating sysml.localtmpdir with value /tmp/systemml +15/09/01 01:32:51 INFO conf.DMLConfig: Updating sysml.scratch with value scratch_space +15/09/01 01:32:51 INFO conf.DMLConfig: Updating sysml.optlevel with value 2 +15/09/01 01:32:51 INFO conf.DMLConfig: Updating sysml.numreducers with value 10 +15/09/01 01:32:51 INFO conf.DMLConfig: Updating sysml.jvmreuse with value false +15/09/01 01:32:51 INFO conf.DMLConfig: Updating sysml.defaultblocksize with value 1000 +15/09/01 01:32:51 INFO conf.DMLConfig: Updating sysml.yarn.appmaster with value false +15/09/01 01:32:51 INFO conf.DMLConfig: Updating sysml.yarn.appmaster.mem with value 2048 +15/09/01 01:32:51 INFO conf.DMLConfig: Updating sysml.yarn.mapreduce.mem with value 2048 +15/09/01 01:32:51 INFO conf.DMLConfig: Updating sysml.yarn.app.queue with value default +15/09/01 01:32:51 INFO conf.DMLConfig: Updating sysml.parallel.ops with value true +15/09/01 01:32:51 INFO conf.DMLConfig: Updating sysml.parallel.io with value true +Accuracy (%): 74.14965986394557 +15/09/01 01:32:52 INFO api.DMLScript: SystemML Statistics: +Total execution time: 0.130 sec. +Number of executed MR Jobs: 0. 
+</code></pre> + +<p>The generated file <code>l2-svm-confusion.csv</code> should contain the following confusion matrix of this form:</p> + +<pre><code>|0 1.0 2.0| +|1.0 t1 t2 | +|2.0 t3 t4 | +</code></pre> + +<ul> + <li>The model correctly predicted label 1 <code>t1</code> times</li> + <li>The model incorrectly predicted label 1 as opposed to label 2 <code>t2</code> times</li> + <li>The model incorrectly predicted label 2 as opposed to label 1 <code>t3</code> times</li> + <li>The model correctly predicted label 2 <code>t4</code> times.</li> +</ul> + +<p>If the confusion matrix looks like this …</p> + +<pre><code>0,1.0,2.0 +1.0,107.0,38.0 +2.0,0.0,2.0 +</code></pre> + +<p>… then the accuracy of the model is (t1+t4)/(t1+t2+t3+t4) = (107+2)/(107+38+0+2) = 0.741496599</p> + +<p><br /></p> + +<p>Refer to the <a href="algorithms-reference.html">Algorithms Reference</a> for more details.</p> + +<hr /> + +<h1 id="example-3---linear-regression">Example 3 - Linear Regression</h1> + +<p>For this example, we’ll use a standalone wrapper executable, <code>bin/systemml</code>, that is available to +be run directly within the project’s source directory when built locally.</p> + +<p>After you build SystemML from source (<code>mvn clean package</code>), the standalone mode can be executed +either on Linux or OS X using the <code>./bin/systemml</code> script, or on Windows using the +<code>.\bin\systemml.bat</code> batch file.</p> + +<p>If you run the script from the project root folder <code>./</code> or from the <code>./bin</code> folder, then the +output files from running SystemML will be created inside the <code>./temp</code> folder to keep them separate +from the SystemML source files managed by Git. The output files for this example will be created +under the <code>./temp</code> folder.</p> + +<p>The runtime behavior and logging behavior of SystemML can be customized by editing the files +<code>./conf/SystemML-config.xml</code> and <code>./conf/log4j.properties</code>.
Both files will be created from their +corresponding <code>*.template</code> files during the first execution of the SystemML executable script.</p> + +<p>When invoking the <code>./bin/systemml</code> or <code>.\bin\systemml.bat</code> with any of the prepackaged DML scripts +you can omit the relative path to the DML script file. The following two commands are equivalent:</p> + +<pre><code>./bin/systemml ./scripts/datagen/genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5 + +./bin/systemml genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5 +</code></pre> + +<p>In this guide we invoke the command with the relative folder to make it easier to look up the source +of the DML scripts.</p> + +<h2 id="linear-regression-example">Linear Regression Example</h2> + +<p>As an example of the capabilities and power of SystemML and DML, let’s consider the Linear Regression algorithm. +We require sets of data to train and test our model. To obtain this data, we can either use real data or +generate data for our algorithm. The +<a href="https://archive.ics.uci.edu/ml/datasets.html">UCI Machine Learning Repository Datasets</a> is one location for real data. +Use of real data typically involves some degree of data wrangling. 
In the following example, we will use SystemML to +generate random data to train and test our model.</p> + +<p>This example consists of the following parts:</p> + +<ul> + <li><a href="#run-dml-script-to-generate-random-data">Run DML Script to Generate Random Data</a></li> + <li><a href="#divide-generated-data-into-two-sample-groups">Divide Generated Data into Two Sample Groups</a></li> + <li><a href="#split-label-column-from-first-sample">Split Label Column from First Sample</a></li> + <li><a href="#split-label-column-from-second-sample">Split Label Column from Second Sample</a></li> + <li><a href="#train-model-on-first-sample">Train Model on First Sample</a></li> + <li><a href="#test-model-on-second-sample">Test Model on Second Sample</a></li> +</ul> + +<p>SystemML is distributed in several packages, including a standalone package. We’ll operate in Standalone mode in this +example.</p> + +<p><a name="run-dml-script-to-generate-random-data"></a></p> + +<h3 id="run-dml-script-to-generate-random-data">Run DML Script to Generate Random Data</h3> + +<p>We can execute the <code>genLinearRegressionData.dml</code> script in Standalone mode using either the <code>systemml</code> or <code>systemml.bat</code> +file. +In this example, we’ll generate a matrix of 1000 rows of 50 columns of test data, with sparsity 0.7. 
In addition to +this, a 51<sup>st</sup> column consisting of labels will +be appended to the matrix.</p> + +<pre><code>./bin/systemml ./scripts/datagen/genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5 +</code></pre> + +<p>This generates the following files inside the <code>./temp</code> folder:</p> + +<pre><code>linRegData.csv # 1000 rows of 51 columns of doubles (50 data columns and 1 label column), csv format +linRegData.csv.mtd # Metadata file +perc.csv # Used to generate two subsets of the data (for training and testing) +perc.csv.mtd # Metadata file +scratch_space # SystemML scratch_space directory +</code></pre> + +<p><a name="divide-generated-data-into-two-sample-groups"></a></p> + +<h3 id="divide-generated-data-into-two-sample-groups">Divide Generated Data into Two Sample Groups</h3> + +<p>Next, we’ll create two subsets of the generated data, each of size ~50%. We can accomplish this using the <code>sample.dml</code> +script with the <code>perc.csv</code> file created in the previous step:</p> + +<pre><code>0.5 +0.5 +</code></pre> + +<p>The <code>sample.dml</code> script will randomly sample rows from the <code>linRegData.csv</code> file and place them into 2 files based +on the percentages specified in <code>perc.csv</code>. This will create two sample groups of roughly 50 percent each.</p> + +<pre><code>./bin/systemml ./scripts/utils/sample.dml -nvargs X=linRegData.csv sv=perc.csv O=linRegDataParts ofmt=csv +</code></pre> + +<p>This script creates two partitions of the original data and places them in a <code>linRegDataParts</code> folder. 
The files created +are as follows:</p> + +<pre><code>linRegDataParts/1 # first partition of data, ~50% of rows of linRegData.csv, csv format +linRegDataParts/1.mtd # metadata +linRegDataParts/2 # second partition of data, ~50% of rows of linRegData.csv, csv format +linRegDataParts/2.mtd # metadata +</code></pre> + +<p>The <code>1</code> file contains the first partition of data, and the <code>2</code> file contains the second partition of data. +An associated metadata file describes +the nature of each partition of data. If we open <code>1</code> and <code>2</code> and look at the number of rows, we can see that typically +the partitions are not exactly 50% but instead are close to 50%. However, we find that the total number of rows in the +original data file equals the sum of the number of rows in <code>1</code> and <code>2</code>.</p> + +<p><a name="split-label-column-from-first-sample"></a></p> + +<h3 id="split-label-column-from-first-sample">Split Label Column from First Sample</h3> + +<p>The next task is to split the label column from the first sample. We can do this using the <code>splitXY.dml</code> script.</p> + +<pre><code>./bin/systemml ./scripts/utils/splitXY.dml -nvargs X=linRegDataParts/1 y=51 OX=linRegData.train.data.csv OY=linRegData.train.labels.csv ofmt=csv +</code></pre> + +<p>This splits column 51, the label column, off from the data. 
When done, the following files have been created.</p> + +<pre><code>linRegData.train.data.csv # training data of 50 columns, csv format +linRegData.train.data.csv.mtd # metadata +linRegData.train.labels.csv # training labels of 1 column, csv format +linRegData.train.labels.csv.mtd # metadata +</code></pre> + +<p><a name="split-label-column-from-second-sample"></a></p> + +<h3 id="split-label-column-from-second-sample">Split Label Column from Second Sample</h3> + +<p>We also need to split the label column from the second sample.</p> + +<pre><code>./bin/systemml ./scripts/utils/splitXY.dml -nvargs X=linRegDataParts/2 y=51 OX=linRegData.test.data.csv OY=linRegData.test.labels.csv ofmt=csv +</code></pre> + +<p>This splits column 51 off the data, resulting in the following files:</p> + +<pre><code>linRegData.test.data.csv # test data of 50 columns, csv format +linRegData.test.data.csv.mtd # metadata +linRegData.test.labels.csv # test labels of 1 column, csv format +linRegData.test.labels.csv.mtd # metadata +</code></pre> + +<p><a name="train-model-on-first-sample"></a></p> + +<h3 id="train-model-on-first-sample">Train Model on First Sample</h3> + +<p>Now, we can train our model based on the first sample. To do this, we utilize the <code>LinearRegDS.dml</code> (Linear Regression +Direct Solve) script. Note that SystemML also includes a <code>LinearRegCG.dml</code> (Linear Regression Conjugate Gradient) +algorithm for situations where the number of features is large.</p> + +<pre><code>./bin/systemml ./scripts/algorithms/LinearRegDS.dml -nvargs X=linRegData.train.data.csv Y=linRegData.train.labels.csv B=betas.csv fmt=csv +</code></pre> + +<p>This will generate the following files:</p> + +<pre><code>betas.csv # betas, 50 rows of 1 column, csv format +betas.csv.mtd # metadata +</code></pre> + +<p>The LinearRegDS.dml script generates statistics to standard output similar to the following.</p> + +<pre><code>BEGIN LINEAR REGRESSION SCRIPT +Reading X and Y... 
+Calling the Direct Solver... +Computing the statistics... +AVG_TOT_Y,-2.160284487670675 +STDEV_TOT_Y,66.86434576808432 +AVG_RES_Y,-3.3127468704080085E-10 +STDEV_RES_Y,1.7231785003947183E-8 +DISPERSION,2.963950542926297E-16 +R2,1.0 +ADJUSTED_R2,1.0 +R2_NOBIAS,1.0 +ADJUSTED_R2_NOBIAS,1.0 +R2_VS_0,1.0 +ADJUSTED_R2_VS_0,1.0 +Writing the output matrix... +END LINEAR REGRESSION SCRIPT +</code></pre> + +<p>Now that we have our <code>betas.csv</code>, we can test our model with our second set of data.</p> + +<p><a name="test-model-on-second-sample"></a></p> + +<h3 id="test-model-on-second-sample">Test Model on Second Sample</h3> + +<p>To test our model on the second sample, we can use the <code>GLM-predict.dml</code> script. This script can be used for both +prediction and scoring. Here, we’re using it for scoring since we include the <code>Y</code> named argument. Our <code>betas.csv</code> +file is specified as the <code>B</code> named argument.</p> + +<pre><code>./bin/systemml ./scripts/algorithms/GLM-predict.dml -nvargs X=linRegData.test.data.csv Y=linRegData.test.labels.csv B=betas.csv fmt=csv +</code></pre> + +<p>This generates statistics similar to the following to standard output.</p> + +<pre><code>LOGLHOOD_Z,,FALSE,NaN +LOGLHOOD_Z_PVAL,,FALSE,NaN +PEARSON_X2,,FALSE,1.895530994504798E-13 +PEARSON_X2_BY_DF,,FALSE,4.202951207327712E-16 +PEARSON_X2_PVAL,,FALSE,1.0 +DEVIANCE_G2,,FALSE,0.0 +DEVIANCE_G2_BY_DF,,FALSE,0.0 +DEVIANCE_G2_PVAL,,FALSE,1.0 +LOGLHOOD_Z,,TRUE,NaN +LOGLHOOD_Z_PVAL,,TRUE,NaN +PEARSON_X2,,TRUE,1.895530994504798E-13 +PEARSON_X2_BY_DF,,TRUE,4.202951207327712E-16 +PEARSON_X2_PVAL,,TRUE,1.0 +DEVIANCE_G2,,TRUE,0.0 +DEVIANCE_G2_BY_DF,,TRUE,0.0 +DEVIANCE_G2_PVAL,,TRUE,1.0 +AVG_TOT_Y,1,,1.0069397725436522 +STDEV_TOT_Y,1,,68.29092137526905 +AVG_RES_Y,1,,-4.1450397073455047E-10 +STDEV_RES_Y,1,,2.0519206226041048E-8 +PRED_STDEV_RES,1,TRUE,1.0 +R2,1,,1.0 +ADJUSTED_R2,1,,1.0 +R2_NOBIAS,1,,1.0 +ADJUSTED_R2_NOBIAS,1,,1.0 +</code></pre> + +<p>We see that the 
STDEV_RES_Y value of the testing phase is of similar magnitude +to the value obtained from the model training phase.</p> + +<p>For convenience, we can encapsulate our DML invocations in a single script:</p> + +<pre><code>#!/bin/bash + +./bin/systemml ./scripts/datagen/genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5 + +./bin/systemml ./scripts/utils/sample.dml -nvargs X=linRegData.csv sv=perc.csv O=linRegDataParts ofmt=csv + +./bin/systemml ./scripts/utils/splitXY.dml -nvargs X=linRegDataParts/1 y=51 OX=linRegData.train.data.csv OY=linRegData.train.labels.csv ofmt=csv + +./bin/systemml ./scripts/utils/splitXY.dml -nvargs X=linRegDataParts/2 y=51 OX=linRegData.test.data.csv OY=linRegData.test.labels.csv ofmt=csv + +./bin/systemml ./scripts/algorithms/LinearRegDS.dml -nvargs X=linRegData.train.data.csv Y=linRegData.train.labels.csv B=betas.csv fmt=csv + +./bin/systemml ./scripts/algorithms/GLM-predict.dml -nvargs X=linRegData.test.data.csv Y=linRegData.test.labels.csv B=betas.csv fmt=csv +</code></pre> + +<h1 id="troubleshooting">Troubleshooting</h1> + +<p>If you encounter a <code>"java.lang.OutOfMemoryError"</code> you can edit the invocation +script (<code>runStandaloneSystemML.sh</code> or <code>runStandaloneSystemML.bat</code>) to increase +the memory available to the JVM, i.e:</p> + +<pre><code>java -Xmx16g -Xms4g -Xmn1g -cp ${CLASSPATH} org.apache.sysml.api.DMLScript \ + -f ${SCRIPT_FILE} -exec singlenode -config SystemML-config.xml \ + $@ +</code></pre> + + + </div> <!-- /container --> + + + + <script src="js/vendor/jquery-1.12.0.min.js"></script> + <script src="js/vendor/bootstrap.min.js"></script> + <script src="js/vendor/anchor.min.js"></script> + <script src="js/main.js"></script> + + + + + + <!-- Analytics --> + <script> + (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ + 
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), + m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) + })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); + ga('create', 'UA-71553733-1', 'auto'); + ga('send', 'pageview'); + </script> + + + + <!-- MathJax Section --> + <script type="text/x-mathjax-config"> + MathJax.Hub.Config({ + TeX: { equationNumbers: { autoNumber: "AMS" } } + }); + </script> + <script> + // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS. + // We could use "//cdn.mathjax...", but that won't support "file://". + (function(d, script) { + script = d.createElement('script'); + script.type = 'text/javascript'; + script.async = true; + script.onload = function(){ + MathJax.Hub.Config({ + tex2jax: { + inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ], + displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], + processEscapes: true, + skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] + } + }); + }; + script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') + + 'cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'; + d.getElementsByTagName('head')[0].appendChild(script); + }(document)); + </script> + </body> +</html>
Added: systemml/site/docs/1.1.0/troubleshooting-guide.html URL: http://svn.apache.org/viewvc/systemml/site/docs/1.1.0/troubleshooting-guide.html?rev=1828046&view=auto ============================================================================== --- systemml/site/docs/1.1.0/troubleshooting-guide.html (added) +++ systemml/site/docs/1.1.0/troubleshooting-guide.html Fri Mar 30 04:31:05 2018 @@ -0,0 +1,300 @@ +<!DOCTYPE html> +<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]--> +<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]--> +<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]--> +<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]--> + <head> + <title>Troubleshooting Guide - SystemML 1.1.0</title> + <meta charset="utf-8"> + <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> + + <meta name="description" content="Troubleshooting Guide"> + + <meta name="viewport" content="width=device-width"> + <link rel="stylesheet" href="css/bootstrap.min.css"> + <link rel="stylesheet" href="css/main.css"> + <link rel="stylesheet" href="css/pygments-default.css"> + <link rel="shortcut icon" href="img/favicon.png"> + </head> + <body> + <!--[if lt IE 7]> + <p class="chromeframe">You are using an outdated browser. 
<a href="http://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p> + <![endif]--> + + <header class="navbar navbar-default navbar-fixed-top" id="topbar"> + <div class="container"> + <div class="navbar-header"> + <div class="navbar-brand brand projectlogo"> + <a href="http://systemml.apache.org/"><img class="logo" src="img/systemml-logo.png" alt="Apache SystemML" title="Apache SystemML"/></a> + </div> + <div class="navbar-brand brand projecttitle"> + <a href="http://systemml.apache.org/">Apache SystemML<sup id="trademark">â¢</sup></a><br/> + <span class="version">1.1.0</span> + </div> + <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target=".navbar-collapse"> + <span class="sr-only">Toggle navigation</span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + </button> + </div> + <nav class="navbar-collapse collapse"> + <ul class="nav navbar-nav navbar-right"> + <li><a href="index.html">Overview</a></li> + <li><a href="https://github.com/apache/systemml">GitHub</a></li> + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">Documentation<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><b>Running SystemML:</b></li> + <li><a href="https://github.com/apache/systemml">SystemML GitHub README</a></li> + <li><a href="spark-mlcontext-programming-guide.html">Spark MLContext</a></li> + <li><a href="spark-batch-mode.html">Spark Batch Mode</a> + <li><a href="hadoop-batch-mode.html">Hadoop Batch Mode</a> + <li><a href="standalone-guide.html">Standalone Guide</a></li> + <li><a href="jmlc.html">Java Machine Learning Connector (JMLC)</a> + <li class="divider"></li> + <li><b>Language Guides:</b></li> + <li><a href="dml-language-reference.html">DML Language Reference</a></li> + <li><a 
href="beginners-guide-to-dml-and-pydml.html">Beginner's Guide to DML and PyDML</a></li> + <li><a href="beginners-guide-python.html">Beginner's Guide for Python Users</a></li> + <li><a href="python-reference.html">Reference Guide for Python Users</a></li> + <li class="divider"></li> + <li><b>ML Algorithms:</b></li> + <li><a href="algorithms-reference.html">Algorithms Reference</a></li> + <li class="divider"></li> + <li><b>Tools:</b></li> + <li><a href="debugger-guide.html">Debugger Guide</a></li> + <li><a href="developer-tools-systemml.html">IDE Guide</a></li> + <li class="divider"></li> + <li><b>Other:</b></li> + <li><a href="contributing-to-systemml.html">Contributing to SystemML</a></li> + <li><a href="engine-dev-guide.html">Engine Developer Guide</a></li> + <li><a href="troubleshooting-guide.html">Troubleshooting Guide</a></li> + <li><a href="release-process.html">Release Process</a></li> + </ul> + </li> + + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><a href="./api/java/index.html">Java</a></li> + <li><a href="./api/python/index.html">Python</a></li> + </ul> + </li> + + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">Issues<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><b>JIRA:</b></li> + <li><a href="https://issues.apache.org/jira/browse/SYSTEMML">SystemML JIRA</a></li> + + </ul> + </li> + </ul> + </nav> + </div> + </header> + + <div class="container" id="content"> + + <h1 class="title">Troubleshooting Guide</h1> + + + <!-- + +--> + +<ul id="markdown-toc"> + <li><a href="#classnotfoundexception-for-commons-math3" id="markdown-toc-classnotfoundexception-for-commons-math3">ClassNotFoundException for commons-math3</a></li> + <li><a href="#outofmemoryerror-in-hadoop-reduce-phase" id="markdown-toc-outofmemoryerror-in-hadoop-reduce-phase">OutOfMemoryError in Hadoop Reduce Phase</a></li> + 
<li><a href="#total-size-of-serialized-results-is-bigger-than-sparkdrivermaxresultsize" id="markdown-toc-total-size-of-serialized-results-is-bigger-than-sparkdrivermaxresultsize">Total size of serialized results is bigger than spark.driver.maxResultSize</a></li> + <li><a href="#file-does-not-exist-on-hdfslfs-error-from-remote-parfor" id="markdown-toc-file-does-not-exist-on-hdfslfs-error-from-remote-parfor">File does not exist on HDFS/LFS error from remote parfor</a></li> + <li><a href="#jvm-garbage-collection-related-flags" id="markdown-toc-jvm-garbage-collection-related-flags">JVM Garbage Collection related flags</a></li> + <li><a href="#memory-overhead" id="markdown-toc-memory-overhead">Memory overhead</a></li> + <li><a href="#network-timeout" id="markdown-toc-network-timeout">Network timeout</a></li> + <li><a href="#advanced-developer-statistics" id="markdown-toc-advanced-developer-statistics">Advanced developer statistics</a></li> + <li><a href="#out-of-memory-on-executors" id="markdown-toc-out-of-memory-on-executors">Out-Of-Memory on executors</a></li> + <li><a href="#native-blas-errors" id="markdown-toc-native-blas-errors">Native BLAS errors</a></li> +</ul> + +<p><br /></p> + +<h2 id="classnotfoundexception-for-commons-math3">ClassNotFoundException for commons-math3</h2> + +<p>The Apache Commons Math library is utilized by SystemML. The commons-math3 +dependency is included with Spark and with newer versions of Hadoop. 
Running +SystemML on an older Hadoop cluster can potentially generate an error such +as the following due to the missing commons-math3 dependency:</p> + +<pre><code>java.lang.ClassNotFoundException: org.apache.commons.math3.linear.RealMatrix +</code></pre> + +<p>This issue can be fixed by changing the commons-math3 <code>scope</code> in the pom.xml file +from <code>provided</code> to <code>compile</code>.</p> + +<pre><code><dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-math3</artifactId> + <version>3.1.1</version> + <scope>compile</scope> +</dependency> +</code></pre> + +<p>SystemML can then be rebuilt with the <code>commons-math3</code> dependency using +Maven (<code>mvn clean package -P distribution</code>).</p> + +<h2 id="outofmemoryerror-in-hadoop-reduce-phase">OutOfMemoryError in Hadoop Reduce Phase</h2> +<p>In Hadoop MapReduce, outputs from mapper nodes are copied to reducer nodes and then sorted (known as the <em>shuffle</em> phase) before being consumed by reducers. The shuffle phase utilizes several buffers that share memory space with other MapReduce tasks, which will throw an <code>OutOfMemoryError</code> if the shuffle buffers take too much space:</p> + +<pre><code>Error: java.lang.OutOfMemoryError: Java heap space + at org.apache.hadoop.mapred.IFile$Reader.readNextBlock(IFile.java:357) + at org.apache.hadoop.mapred.IFile$Reader.next(IFile.java:419) + at org.apache.hadoop.mapred.Merger$Segment.next(Merger.java:238) + at org.apache.hadoop.mapred.Merger$MergeQueue.adjustPriorityQueue(Merger.java:348) + at org.apache.hadoop.mapred.Merger$MergeQueue.next(Merger.java:368) + at org.apache.hadoop.mapred.Merger.writeFile(Merger.java:156) + ... 
+</code></pre> + +<p>One way to fix this issue is lowering the following buffer thresholds.</p> + +<pre><code>mapred.job.shuffle.input.buffer.percent # default 0.70; try 0.20 +mapred.job.shuffle.merge.percent # default 0.66; try 0.20 +mapred.job.reduce.input.buffer.percent # default 0.0; keep 0.0 +</code></pre> + +<p>These configurations can be modified <strong>globally</strong> by inserting/modifying the following in <code>mapred-site.xml</code>.</p> + +<pre><code><property> + <name>mapred.job.shuffle.input.buffer.percent</name> + <value>0.2</value> +</property> +<property> + <name>mapred.job.shuffle.merge.percent</name> + <value>0.2</value> +</property> +<property> + <name>mapred.job.reduce.input.buffer.percent</name> + <value>0.0</value> +</property> +</code></pre> + +<p>They can also be configured on a <strong>per SystemML-task basis</strong> by inserting the following in <code>SystemML-config.xml</code>.</p> + +<pre><code><mapred.job.shuffle.merge.percent>0.2</mapred.job.shuffle.merge.percent> +<mapred.job.shuffle.input.buffer.percent>0.2</mapred.job.shuffle.input.buffer.percent> +<mapred.job.reduce.input.buffer.percent>0</mapred.job.reduce.input.buffer.percent> +</code></pre> + +<p>Note: The default <code>SystemML-config.xml</code> is located in <code><path to SystemML root>/conf/</code>. It is passed to SystemML using the <code>-config</code> argument:</p> + +<pre><code>hadoop jar SystemML.jar [-? | -help | -f <filename>] (-config <config_filename>) ([-args | -nvargs] <args-list>) +</code></pre> + +<p>See <a href="hadoop-batch-mode.html">Invoking SystemML in Hadoop Batch Mode</a> for details of the syntax.</p> + +<h2 id="total-size-of-serialized-results-is-bigger-than-sparkdrivermaxresultsize">Total size of serialized results is bigger than spark.driver.maxResultSize</h2> + +<p>Spark aborts a job if the estimated result size of collect is greater than maxResultSize to avoid out-of-memory errors in driver. 
+However, SystemML’s optimizer estimates the memory required for each operator and provides guards against these out-of-memory errors in the driver. +So, we recommend setting the configuration <code>--conf spark.driver.maxResultSize=0</code>.</p> + +<h2 id="file-does-not-exist-on-hdfslfs-error-from-remote-parfor">File does not exist on HDFS/LFS error from remote parfor</h2> + +<p>This error usually comes from incorrect HDFS configuration on the worker nodes. To investigate this, we recommend</p> + +<ul> + <li>Test if HDFS is accessible from the worker node: <code>hadoop fs -ls <file path></code></li> + <li>Synchronize hadoop configuration across the worker nodes.</li> + <li>Set the environment variable <code>HADOOP_CONF_DIR</code>. You may have to restart the cluster-manager to get the hadoop configuration.</li> +</ul> + +<h2 id="jvm-garbage-collection-related-flags">JVM Garbage Collection related flags</h2> + +<p>We recommend providing 10% of the maximum memory to the young generation and using the <code>-server</code> flag for a robust garbage collection policy. +For example: if you intend to use a 20G driver and a 60G executor, then please add the following to your configuration:</p> + +<pre><code> spark-submit --driver-memory 20G --executor-memory 60G --conf "spark.executor.extraJavaOptions=-Xmn6G -server" --conf "spark.driver.extraJavaOptions=-Xmn2G -server" ... +</code></pre> + +<h2 id="memory-overhead">Memory overhead</h2> + +<p>Spark sets <code>spark.yarn.executor.memoryOverhead</code>, <code>spark.yarn.driver.memoryOverhead</code> and <code>spark.yarn.am.memoryOverhead</code> to be 10% of the memory provided +to the executor, driver and YARN Application Master respectively (with a minimum of 384 MB). 
For certain workloads, the user may have to increase this +overhead to 12-15% of the memory budget.</p> + +<h2 id="network-timeout">Network timeout</h2> + +<p>To avoid false-positive errors due to network failures in the case of compute-bound scripts, the user may have to increase the timeout <code>spark.network.timeout</code> (default: 120s).</p> + +<h2 id="advanced-developer-statistics">Advanced developer statistics</h2> + +<p>A few of our operators (for example: convolution-related operators) and the GPU backend allow an expert user to get advanced statistics +by setting the configuration <code>systemml.stats.extraGPU</code> and <code>systemml.stats.extraDNN</code> in the file SystemML-config.xml.</p> + +<h2 id="out-of-memory-on-executors">Out-Of-Memory on executors</h2> + +<p>Out-Of-Memory on executors is often caused by side-effects of lazy evaluation and in-memory input data of Spark for large-scale problems. +Though we are constantly improving our optimizer to address this scenario, a quick hack to resolve this is reducing the number of cores allocated to the executor. 
+We would highly appreciate if you file a bug report on our <a href="https://issues.apache.org/jira/browse/SYSTEMML">issue tracker</a> if and when you encounter OOM.</p> + +<h2 id="native-blas-errors">Native BLAS errors</h2> + +<p>Please see <a href="http://apache.github.io/systemml/native-backend">the user guide of native backend</a>.</p> + + + </div> <!-- /container --> + + + + <script src="js/vendor/jquery-1.12.0.min.js"></script> + <script src="js/vendor/bootstrap.min.js"></script> + <script src="js/vendor/anchor.min.js"></script> + <script src="js/main.js"></script> + + + + + + <!-- Analytics --> + <script> + (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ + (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), + m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) + })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); + ga('create', 'UA-71553733-1', 'auto'); + ga('send', 'pageview'); + </script> + + + + <!-- MathJax Section --> + <script type="text/x-mathjax-config"> + MathJax.Hub.Config({ + TeX: { equationNumbers: { autoNumber: "AMS" } } + }); + </script> + <script> + // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS. + // We could use "//cdn.mathjax...", but that won't support "file://". + (function(d, script) { + script = d.createElement('script'); + script.type = 'text/javascript'; + script.async = true; + script.onload = function(){ + MathJax.Hub.Config({ + tex2jax: { + inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ], + displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], + processEscapes: true, + skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] + } + }); + }; + script.src = ('https:' == document.location.protocol ? 
'https://' : 'http://') + + 'cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'; + d.getElementsByTagName('head')[0].appendChild(script); + }(document)); + </script> + </body> +</html>
