Added: systemml/site/docs/1.1.0/hadoop-batch-mode.html URL: http://svn.apache.org/viewvc/systemml/site/docs/1.1.0/hadoop-batch-mode.html?rev=1828046&view=auto ============================================================================== --- systemml/site/docs/1.1.0/hadoop-batch-mode.html (added) +++ systemml/site/docs/1.1.0/hadoop-batch-mode.html Fri Mar 30 04:31:05 2018 @@ -0,0 +1,1224 @@ +<!DOCTYPE html> +<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]--> +<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]--> +<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]--> +<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]--> + <head> + <title>Invoking SystemML in Hadoop Batch Mode - SystemML 1.1.0</title> + <meta charset="utf-8"> + <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> + + <meta name="description" content="Invoking SystemML in Hadoop Batch Mode"> + + <meta name="viewport" content="width=device-width"> + <link rel="stylesheet" href="css/bootstrap.min.css"> + <link rel="stylesheet" href="css/main.css"> + <link rel="stylesheet" href="css/pygments-default.css"> + <link rel="shortcut icon" href="img/favicon.png"> + </head> + <body> + <!--[if lt IE 7]> + <p class="chromeframe">You are using an outdated browser. <a href="http://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p> + <![endif]--> + + <header class="navbar navbar-default navbar-fixed-top" id="topbar"> + <div class="container"> + <div class="navbar-header"> + <div class="navbar-brand brand projectlogo"> + <a href="http://systemml.apache.org/"><img class="logo" src="img/systemml-logo.png" alt="Apache SystemML" title="Apache SystemML"/></a> + </div> + <div class="navbar-brand brand projecttitle"> + <a href="http://systemml.apache.org/">Apache SystemML<sup id="trademark">™</sup></a><br/> + <span class="version">1.1.0</span> + </div> + <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target=".navbar-collapse"> + <span class="sr-only">Toggle navigation</span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + </button> + </div> + <nav class="navbar-collapse collapse"> + <ul class="nav navbar-nav navbar-right"> + <li><a href="index.html">Overview</a></li> + <li><a href="https://github.com/apache/systemml">GitHub</a></li> + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">Documentation<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><b>Running SystemML:</b></li> + <li><a href="https://github.com/apache/systemml">SystemML GitHub README</a></li> + <li><a href="spark-mlcontext-programming-guide.html">Spark MLContext</a></li> + <li><a href="spark-batch-mode.html">Spark Batch Mode</a></li> + <li><a href="hadoop-batch-mode.html">Hadoop Batch Mode</a></li> + <li><a href="standalone-guide.html">Standalone Guide</a></li> + <li><a href="jmlc.html">Java Machine Learning Connector (JMLC)</a></li> + <li class="divider"></li> + <li><b>Language Guides:</b></li> + <li><a href="dml-language-reference.html">DML Language Reference</a></li> + <li><a href="beginners-guide-to-dml-and-pydml.html">Beginner's Guide to DML and PyDML</a></li> + <li><a href="beginners-guide-python.html">Beginner's Guide for Python Users</a></li> + <li><a href="python-reference.html">Reference Guide for Python Users</a></li> + <li class="divider"></li> + <li><b>ML
Algorithms:</b></li> + <li><a href="algorithms-reference.html">Algorithms Reference</a></li> + <li class="divider"></li> + <li><b>Tools:</b></li> + <li><a href="debugger-guide.html">Debugger Guide</a></li> + <li><a href="developer-tools-systemml.html">IDE Guide</a></li> + <li class="divider"></li> + <li><b>Other:</b></li> + <li><a href="contributing-to-systemml.html">Contributing to SystemML</a></li> + <li><a href="engine-dev-guide.html">Engine Developer Guide</a></li> + <li><a href="troubleshooting-guide.html">Troubleshooting Guide</a></li> + <li><a href="release-process.html">Release Process</a></li> + </ul> + </li> + + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><a href="./api/java/index.html">Java</a></li> + <li><a href="./api/python/index.html">Python</a></li> + </ul> + </li> + + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">Issues<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><b>JIRA:</b></li> + <li><a href="https://issues.apache.org/jira/browse/SYSTEMML">SystemML JIRA</a></li> + + </ul> + </li> + </ul> + </nav> + </div> + </header> + + <div class="container" id="content"> + + <h1 class="title">Invoking SystemML in Hadoop Batch Mode</h1> + + + <!-- + +--> + +<ul id="markdown-toc"> + <li><a href="#overview" id="markdown-toc-overview">Overview</a></li> + <li><a href="#hadoop-batch-mode-invocation-syntax" id="markdown-toc-hadoop-batch-mode-invocation-syntax">Hadoop Batch Mode Invocation Syntax</a></li> + <li><a href="#systemml-with-standalone-hadoop" id="markdown-toc-systemml-with-standalone-hadoop">SystemML with Standalone Hadoop</a></li> + <li><a href="#systemml-with-pseudo-distributed-hadoop" id="markdown-toc-systemml-with-pseudo-distributed-hadoop">SystemML with Pseudo-Distributed Hadoop</a></li> + <li><a href="#systemml-with-pseudo-distributed-hadoop-and-yarn" id="markdown-toc-systemml-with-pseudo-distributed-hadoop-and-yarn">SystemML with Pseudo-Distributed Hadoop and YARN</a></li> + <li><a href="#systemml-with-distributed-hadoop-and-yarn" id="markdown-toc-systemml-with-distributed-hadoop-and-yarn">SystemML with Distributed Hadoop and YARN</a> <ul> + <li><a href="#systemml-with-distributed-hadoop-and-yarn-linear-regression-example" id="markdown-toc-systemml-with-distributed-hadoop-and-yarn-linear-regression-example">SystemML with Distributed Hadoop and YARN: Linear Regression Example</a></li> + <li><a href="#systemml-with-distributed-hadoop-and-yarn-k-means-clustering-example" id="markdown-toc-systemml-with-distributed-hadoop-and-yarn-k-means-clustering-example">SystemML with Distributed Hadoop and YARN: K-Means Clustering Example</a></li> + </ul> + </li> + <li><a href="#recommended-hadoop-cluster-configuration-settings" id="markdown-toc-recommended-hadoop-cluster-configuration-settings">Recommended Hadoop Cluster Configuration Settings</a></li> +</ul> + +<p><br /></p> + +<h1 id="overview">Overview</h1> + +<p>Given that a primary purpose of SystemML is to perform machine learning on large distributed data sets, +two of the most important ways to invoke SystemML are Hadoop Batch and Spark Batch modes. +Here, we will look at SystemML’s Hadoop Batch mode in more depth.</p> + +<p>We will look at running SystemML with Standalone Hadoop, Pseudo-Distributed Hadoop, and Distributed Hadoop. +We will first run SystemML on a single machine with Hadoop running in Standalone mode. 
Next, we’ll run SystemML on HDFS +in Hadoop’s Pseudo-Distributed mode on a single machine, followed by Pseudo-Distributed mode with YARN. +After that, we’ll set up a 4-node Hadoop cluster and run SystemML on Distributed Hadoop with YARN.</p> + +<p>Note that this tutorial does not address security. For security considerations with regard to Hadoop, please +refer to the Hadoop documentation.</p> + +<hr /> + +<h1 id="hadoop-batch-mode-invocation-syntax">Hadoop Batch Mode Invocation Syntax</h1> + +<p>SystemML can be invoked in Hadoop Batch mode using the following syntax:</p> + +<pre><code>hadoop jar SystemML.jar [-? | -help | -f <filename>] (-config <config_filename>) ([-args | -nvargs] <args-list>) +</code></pre> + +<p>The <code>SystemML.jar</code> file is specified to Hadoop using the <code>jar</code> option. +The DML script to invoke is specified after the <code>-f</code> argument. Configuration settings can be passed to SystemML +using the optional <code>-config </code> argument. DML scripts can optionally take named arguments (<code>-nvargs</code>) or positional +arguments (<code>-args</code>). Named arguments are preferred over positional arguments, which are considered deprecated. +All the primary algorithm scripts included with SystemML use named arguments.</p> + +<p><strong>Example #1: DML Invocation with Named Arguments</strong></p> + +<pre><code>hadoop jar systemml/SystemML.jar -f systemml/algorithms/Kmeans.dml -nvargs X=X.mtx k=5 +</code></pre> + +<p><strong>Example #2: DML Invocation with Positional Arguments</strong></p> + +<pre><code>hadoop jar systemml/SystemML.jar -f example/test/LinearRegression.dml -args "v" "y" 0.00000001 "w" +</code></pre> + +<p>In a clustered environment, it is <em>highly</em> recommended that SystemML configuration settings are specified +in a <code>SystemML-config.xml</code> file. By default, SystemML will look for this file in the current working +directory (<code>./SystemML-config.xml</code>). This location can be overridden by the <code>-config </code> argument.</p> + +<p><strong>Example #3: DML Invocation with Configuration File Explicitly Specified and Named Arguments</strong></p> + +<pre><code>hadoop jar systemml/SystemML.jar -f systemml/algorithms/Kmeans.dml -config /conf/SystemML-config.xml -nvargs X=X.mtx k=5 +</code></pre> + +<p>For recommended SystemML configuration settings in a clustered environment, please see +<a href="hadoop-batch-mode.html#recommended-hadoop-cluster-configuration-settings">Recommended Hadoop Cluster Configuration Settings</a>.</p> + +<hr /> + +<h1 id="systemml-with-standalone-hadoop">SystemML with Standalone Hadoop</h1> + +<p>In Standalone mode, Hadoop runs on a single machine as a single Java process.</p> + +<p>To begin, I connected to my Linux server as root and created a hadoop user.</p> + +<pre><code>$ ssh [email protected] +[root@host1 ~]# useradd hadoop +[root@host1 ~]# passwd hadoop +</code></pre> + +<p>Next, I logged on as the hadoop user. I downloaded the version of Hadoop that I wanted to use from an Apache mirror. +A list of Hadoop releases can be found at the <a href="http://hadoop.apache.org/releases.html">Apache Hadoop Releases</a> website. +After downloading the Hadoop binary release, I unpacked it.</p> + +<pre><code>$ ssh [email protected] +[hadoop@host1 ~]$ wget http://mirror.sdunix.com/apache/hadoop/common/hadoop-2.6.2/hadoop-2.6.2.tar.gz +[hadoop@host1 ~]$ tar -xvzf hadoop-2.6.2.tar.gz +</code></pre> + +<p>My Linux server already had a JDK (Java Development Kit) installed.
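If you haven’t done so already, you will need Java
+installed in order to use Hadoop.</p>
+
+<p>(If a JDK is not already present, it can typically be installed with the system package manager. The command below is a sketch for a Red Hat-style system such as this one; the exact package name is an assumption and varies by distribution and Java version.)</p>
+
+<pre><code>[root@host1 ~]# yum install java-1.7.0-openjdk-devel
+</code></pre>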
+ +<p>I updated my <code>.bash_profile</code> file to export a <code>JAVA_HOME</code> environment variable, which I pointed to my JDK installation +directory. I also exported a <code>HADOOP_HOME</code> environment variable, which points to the root directory of the Hadoop release +that I unpacked. I updated the <code>PATH</code> variable to include the <code>JAVA_HOME</code> <code>bin</code> directory, the <code>HADOOP_HOME</code> <code>bin</code> directory, +and the <code>HADOOP_HOME</code> <code>sbin</code> directory.</p> + +<pre><code>[hadoop@host1 ~]$ vi .bash_profile + +... +export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk.x86_64 +export HADOOP_HOME=/home/hadoop/hadoop-2.6.2 +PATH=$JAVA_HOME/bin:$PATH:$HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin +export PATH +... + +[hadoop@host1 ~]$ source ~/.bash_profile +</code></pre> + +<p>To verify that Java and Hadoop were on the path, I used the <code>java -version</code> and <code>hadoop version</code> commands.</p> + +<pre><code>[hadoop@host1 ~]$ java -version +java version "1.7.0_79" +OpenJDK Runtime Environment (rhel-2.5.5.1.el6_6-x86_64 u79-b14) +OpenJDK 64-Bit Server VM (build 24.79-b02, mixed mode) +[hadoop@host1 ~]$ hadoop version +Hadoop 2.6.2 +Subversion https://git-wip-us.apache.org/repos/asf/hadoop.git -r 0cfd050febe4a30b1ee1551dcc527589509fb681 +Compiled by jenkins on 2015-10-22T00:42Z +Compiled with protoc 2.5.0 +From source with checksum f9ebb94bf5bf9bec892825ede28baca +This command was run using /home/hadoop/hadoop-2.6.2/share/hadoop/common/hadoop-common-2.6.2.jar +</code></pre> + +<p>Next, I downloaded a SystemML release from the <a href="http://systemml.apache.org/download.html">downloads</a> page. +Following this, I unpacked it.</p> + +<pre><code>[hadoop@host1 ~]$ tar -xvzf systemml-1.1.0.tar.gz +</code></pre> + +<p><strong>Alternatively</strong>, we could have built the SystemML distributed release using <a href="http://maven.apache.org">Apache Maven</a> and unpacked it.</p> + +<pre><code>[hadoop@host1 ~]$ git clone https://github.com/apache/systemml.git +[hadoop@host1 ~]$ cd systemml +[hadoop@host1 systemml]$ mvn clean package -P distribution +[hadoop@host1 systemml]$ tar -xvzf target/systemml-1.1.0.tar.gz -C .. +[hadoop@host1 systemml]$ cd .. +</code></pre> + +<p>I downloaded the <code>genLinearRegressionData.dml</code> script that is used in the SystemML README example.</p> + +<pre><code>[hadoop@host1 ~]$ wget https://raw.githubusercontent.com/apache/systemml/master/scripts/datagen/genLinearRegressionData.dml +</code></pre> + +<p>Next, I invoked the <code>genLinearRegressionData.dml</code> DML script in Hadoop Batch mode. +Hadoop was executed with the <code>SystemML.jar</code> file specified by the hadoop <code>jar</code> option. +The <code>genLinearRegressionData.dml</code> script was specified using the <code>-f</code> option.
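</p>
+
+<p>(For reference, the file passed to <code>-f</code> is a plain-text DML script. The following trivial script is a hypothetical example — not one shipped with SystemML — that could be invoked the same way.)</p>
+
+<pre><code>X = rand(rows=100, cols=10)
+print("mean of X: " + mean(X))
+write(X, "X.csv", format="csv")
+</code></pre>
+
+<p>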
Named input +arguments to the DML script were specified following the <code>-nvargs</code> option.</p> + +<pre><code>[hadoop@host1 ~]$ hadoop jar systemml-1.1.0/SystemML.jar -f genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5 +15/11/11 15:56:21 INFO api.DMLScript: BEGIN DML run 11/11/2015 15:56:21 +15/11/11 15:56:21 INFO api.DMLScript: HADOOP_HOME: /home/hadoop/hadoop-2.6.2 +15/11/11 15:56:21 WARN conf.DMLConfig: No default SystemML config file (./SystemML-config.xml) found +15/11/11 15:56:21 WARN conf.DMLConfig: Using default settings in DMLConfig +15/11/11 15:56:22 INFO jvm.JvmMetrics: Initializing JVM Metrics with processName=JobTracker, sessionId= +15/11/11 15:56:22 WARN hops.OptimizerUtils: Auto-disable multi-threaded text read for 'text' and 'csv' due to thread contention on JRE < 1.8 (java.version=1.7.0_79). +15/11/11 15:56:22 INFO api.DMLScript: SystemML Statistics: +Total execution time: 0.288 sec. +Number of executed MR Jobs: 0. + +15/11/11 15:56:22 INFO api.DMLScript: END DML run 11/11/2015 15:56:22 +</code></pre> + +<p>In the console output, we see a warning that no default SystemML config file was found in the current working directory. +In a distributed environment on a large data set, it is highly advisable to specify configuration settings in a SystemML config file for +optimal performance. The location of the SystemML config file can be explicitly specified using the <code>-config </code> argument.</p> + +<p>The OptimizerUtils warning occurs because parallel multi-threaded text reads in Java versions less than 1.8 result +in thread contention issues, so only a single thread reads matrix data in text formats.</p> + +<p>If we examine the contents of the directory, we see that <code>linRegData.csv</code> and <code>perc.csv</code> were written to the file system, +along with their corresponding metadata files. The <code>scratch_space</code> directory is used to write temporary matrix files.</p> + +<pre><code>[hadoop@host1 ~]$ ls -l +total 197500 +-rw-rw-r-- 1 hadoop hadoop 2208 Nov 11 15:45 genLinearRegressionData.dml +drwxr-xr-x 9 hadoop hadoop 4096 Oct 21 17:53 hadoop-2.6.2 +-rw-rw-r-- 1 hadoop hadoop 195515434 Oct 30 14:04 hadoop-2.6.2.tar.gz +drwxrwxrwx 2 hadoop hadoop 4096 Nov 11 15:56 linRegData.csv +-rw-r--r-- 1 hadoop hadoop 214 Nov 11 15:56 linRegData.csv.mtd +drwxrwxrwx 2 hadoop hadoop 4096 Nov 11 15:56 perc.csv +-rw-r--r-- 1 hadoop hadoop 206 Nov 11 15:56 perc.csv.mtd +drwxrwxrwx 2 hadoop hadoop 4096 Nov 11 15:56 scratch_space +drwxrwxr-x 4 hadoop hadoop 4096 Nov 11 15:42 systemml-1.1.0 +-rw-rw-r-- 1 hadoop hadoop 6683281 Oct 27 21:13 systemml-1.1.0.tar.gz +</code></pre> + +<p>To clean things up, I’ll delete the files that were generated.</p> + +<pre><code>[hadoop@host1 ~]$ rm -r *.csv +[hadoop@host1 ~]$ rm *.csv.mtd +[hadoop@host1 ~]$ rmdir scratch_space/ +</code></pre> + +<hr /> + +<h1 id="systemml-with-pseudo-distributed-hadoop">SystemML with Pseudo-Distributed Hadoop</h1> + +<p>Next, we’ll look at running SystemML with Hadoop in Pseudo-Distributed mode. 
In Pseudo-Distributed mode, each Hadoop daemon +(such as NameNode and DataNode) runs in a separate Java process on a single machine.</p> + +<p>In the previous section about Hadoop Standalone mode, we set up the <code>JAVA_HOME</code> and <code>HADOOP_HOME</code> environment variables +and added <code>JAVA_HOME/bin</code>, <code>HADOOP_HOME/bin</code>, and <code>HADOOP_HOME/sbin</code> to the <code>PATH</code> in <code>.bash_profile</code>.</p> + +<p>We also need to set the <code>JAVA_HOME</code> value in the <code>hadoop-env.sh</code> file in the Hadoop configuration directory (<code>etc/hadoop</code>).</p> + +<pre><code>[hadoop@host1 hadoop]$ pwd +/home/hadoop/hadoop-2.6.2/etc/hadoop +[hadoop@host1 hadoop]$ vi hadoop-env.sh + +... +export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk.x86_64 +... +</code></pre> + +<p>We need to be able to passwordlessly <code>ssh</code> to localhost. To do so, I’ll generate a public key/private key pair and add +the public key to the hadoop user’s <code>authorized_keys</code>. We can <code>ssh</code> to localhost to verify that we can connect without +a password.</p> + +<pre><code>[hadoop@host1 ~]$ ssh-keygen -t rsa -b 4096 -C "hadoop example" +Your identification has been saved in /home/hadoop/.ssh/id_rsa. +Your public key has been saved in /home/hadoop/.ssh/id_rsa.pub. +[hadoop@host1 ~]$ cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys +[hadoop@host1 ~]$ chmod 600 ~/.ssh/authorized_keys +[hadoop@host1 ~]$ ssh localhost +The authenticity of host 'localhost (::1)' can't be established. +RSA key fingerprint is 6b:86:78:86:13:0a:49:d4:c7:a7:15:10:d1:27:88:9e. +Are you sure you want to continue connecting (yes/no)? yes +Warning: Permanently added 'localhost' (RSA) to the list of known hosts. +[hadoop@host1 ~]$ exit +logout +Connection to localhost closed. +[hadoop@host1 ~]$ ls -l .ssh +total 16 +-rw------- 1 hadoop hadoop 736 Nov 11 16:44 authorized_keys +-rw------- 1 hadoop hadoop 3243 Nov 11 16:41 id_rsa +-rw-r--r-- 1 hadoop hadoop 736 Nov 11 16:41 id_rsa.pub +-rw-r--r-- 1 hadoop hadoop 391 Nov 11 16:46 known_hosts +</code></pre> + +<p>In the Hadoop configuration directory (<code>etc/hadoop</code>), in the <code>core-site.xml</code> file, we specify the <code>fs.defaultFS</code> +property to be <code>localhost</code> with port <code>9000</code>.</p> + +<pre><code>[hadoop@host1 hadoop]$ vi core-site.xml + +... +<configuration> + <property> + <name>fs.defaultFS</name> + <value>hdfs://localhost:9000</value> + </property> +</configuration> +... +</code></pre> + +<p>By default, HDFS replicates data on three nodes. Since we’re running on a single machine, we’ll change this to one. +We’ll add a <code>dfs.replication</code> property to <code>hdfs-site.xml</code> and set its value to <code>1</code>.</p> + +<pre><code>[hadoop@host1 hadoop]$ vi hdfs-site.xml + +... +<configuration> + <property> + <name>dfs.replication</name> + <value>1</value> + </property> +</configuration> +... +</code></pre> + +<p>Next, we’ll format HDFS.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs namenode -format +15/11/11 17:23:33 INFO namenode.NameNode: STARTUP_MSG: +/************************************************************ +STARTUP_MSG: Starting NameNode +STARTUP_MSG: host = host1.example.com/9.30.252.15 +STARTUP_MSG: args = [-format] +STARTUP_MSG: version = 2.6.2 +... +STARTUP_MSG: java = 1.7.0_79 +************************************************************/ +... +15/11/11 17:23:34 INFO common.Storage: Storage directory /tmp/hadoop-hadoop/dfs/name has been successfully formatted. +... 
+/************************************************************ +SHUTDOWN_MSG: Shutting down NameNode at host1.example.com/9.30.252.15 +************************************************************/ +</code></pre> + +<p>We’ll start up HDFS using the <code>start-dfs.sh</code> script. This starts the NameNode, DataNode, and SecondaryNameNode daemons +on the single machine.</p> + +<pre><code>[hadoop@host1 ~]$ start-dfs.sh +Starting namenodes on [localhost] +localhost: starting namenode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-namenode-host1.out +localhost: starting datanode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-datanode-host1.out +Starting secondary namenodes [0.0.0.0] +The authenticity of host '0.0.0.0 (0.0.0.0)' can't be established. +RSA key fingerprint is 6b:86:78:86:13:0a:49:d4:c7:a7:15:10:d1:27:88:9e. +Are you sure you want to continue connecting (yes/no)? yes +0.0.0.0: Warning: Permanently added '0.0.0.0' (RSA) to the list of known hosts. +0.0.0.0: starting secondarynamenode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-secondarynamenode-host1.out +</code></pre> + +<p>We can see the running Java processes using the <code>jps</code> command.</p> + +<pre><code>[hadoop@host1 ~]$ jps +36128 Jps +35844 DataNode +36007 SecondaryNameNode +35722 NameNode +</code></pre> + +<p>Here, we can see detailed information about the Java processes that were started.</p> + +<pre><code>[hadoop@host1 ~]$ ps -C java -f -ww +UID PID PPID C STIME TTY TIME CMD +hadoop 35722 1 5 17:38 ? 00:00:05 /usr/lib/jvm/java-1.7.0-openjdk.x86_64/bin/java -Dproc_namenode -Xmx1000m -Djava.net.preferIPv4Stack=true -Dhadoop.log.dir=/home/hadoop/hadoop-2.6.2/logs -Dhadoop.log.file=hadoop.log -Dhadoop.home.dir=/home/hadoop/hadoop-2.6.2 -Dhadoop.id.str=hadoop -Dhadoop.root.logger=INFO,console -Djava.library.path=/home/hadoop/hadoop-2.6.2/lib/native -Dhadoop.policy.file=hadoop-policy.xml -Djava.net.preferIPv4Stack=true -Djava.net.preferIPv4Stack=true -Djava.net.preferIPv4Stack=true -Dhadoop.log.dir=/home/hadoop/hadoop-2.6.2/logs -Dhadoop.log.file=hadoop-hadoop-namenode-host1.log -Dhadoop.home.dir=/home/hadoop/hadoop-2.6.2 -Dhadoop.id.str=hadoop -Dhadoop.root.logger=INFO,RFA -Djava.library.path=/home/hadoop/hadoop-2.6.2/lib/native -Dhadoop.policy.file=hadoop-policy.xml -Djava.net.preferIPv4Stack=true -Dhadoop.security.logger=INFO,RFAS -Dhdfs.audit.logger=INFO,NullAppender -Dhadoop.security.logger=INFO,RFAS -Dhdfs.audit.logger=INFO,NullAppender -Dhadoop.security.logger=INFO,RFAS -Dhdfs.audit.logger=INFO,NullAppender -Dhadoop.security.logger=INFO,RFAS org.apache.hadoop.hdfs.server.namenode.NameNode +hadoop 35844 1 4 17:38 ?
00:00:04 /usr/lib/jvm/java-1.7.0-openjdk.x86_64/bin/java -Dproc_datanode -Xmx1000m -Djava.net.preferIPv4Stack=true -Dhadoop.log.dir=/home/hadoop/hadoop-2.6.2/logs -Dhadoop.log.file=hadoop.log -Dhadoop.home.dir=/home/hadoop/hadoop-2.6.2 -Dhadoop.id.str=hadoop -Dhadoop.root.logger=INFO,console -Djava.library.path=/home/hadoop/hadoop-2.6.2/lib/native -Dhadoop.policy.file=hadoop-policy.xml -Djava.net.preferIPv4Stack=true -Djava.net.preferIPv4Stack=true -Djava.net.preferIPv4Stack=true -Dhadoop.log.dir=/home/hadoop/hadoop-2.6.2/logs -Dhadoop.log.file=hadoop-hadoop-datanode-host1.log -Dhadoop.home.dir=/home/hadoop/hadoop-2.6.2 -Dhadoop.id.str=hadoop -Dhadoop.root.logger=INFO,RFA -Djava.library.path=/home/hadoop/hadoop-2.6.2/lib/native -Dhadoop.policy.file=hadoop-policy.xml -Djava.net.preferIPv4Stack=true -server -Dhadoop.security.logger=ERROR,RFAS -Dhadoop.security.logger=ERROR,RFAS -Dhadoop.security.logger=ERROR,RFAS -Dhadoop.security.logger=INFO,RFAS org.apache.hadoop.hdfs.server.datanode.DataNode +hadoop 36007 1 5 17:38 ? 00:00:04 /usr/lib/jvm/java-1.7.0-openjdk.x86_64/bin/java -Dproc_secondarynamenode -Xmx1000m -Djava.net.preferIPv4Stack=true -Dhadoop.log.dir=/home/hadoop/hadoop-2.6.2/logs -Dhadoop.log.file=hadoop.log -Dhadoop.home.dir=/home/hadoop/hadoop-2.6.2 -Dhadoop.id.str=hadoop -Dhadoop.root.logger=INFO,console -Djava.library.path=/home/hadoop/hadoop-2.6.2/lib/native -Dhadoop.policy.file=hadoop-policy.xml -Djava.net.preferIPv4Stack=true -Djava.net.preferIPv4Stack=true -Djava.net.preferIPv4Stack=true -Dhadoop.log.dir=/home/hadoop/hadoop-2.6.2/logs -Dhadoop.log.file=hadoop-hadoop-secondarynamenode-host1.log -Dhadoop.home.dir=/home/hadoop/hadoop-2.6.2 -Dhadoop.id.str=hadoop -Dhadoop.root.logger=INFO,RFA -Djava.library.path=/home/hadoop/hadoop-2.6.2/lib/native -Dhadoop.policy.file=hadoop-policy.xml -Djava.net.preferIPv4Stack=true -Dhadoop.security.logger=INFO,RFAS -Dhdfs.audit.logger=INFO,NullAppender -Dhadoop.security.logger=INFO,RFAS -Dhdfs.audit.logger=INFO,NullAppender -Dhadoop.security.logger=INFO,RFAS -Dhdfs.audit.logger=INFO,NullAppender -Dhadoop.security.logger=INFO,RFAS org.apache.hadoop.hdfs.server.namenode.SecondaryNameNode +</code></pre> + +<p>Useful log information is created by default in the hadoop <code>logs</code> directory.</p> + +<p>If everything worked correctly, we can hit port 50070 in a browser (http://host1.example.com:50070) to see Hadoop information.</p> + +<p>If we look at our HDFS file system, we see that it currently doesn’t contain any files.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs dfs -ls +ls: `.': No such file or directory +</code></pre> + +<p>Let’s go ahead and execute the <code>genLinearRegressionData.dml</code> script in Hadoop Pseudo-Distributed mode.</p> + +<pre><code>[hadoop@host1 ~]$ hadoop jar systemml-1.1.0/SystemML.jar -f genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5 +15/11/11 18:16:33 INFO api.DMLScript: BEGIN DML run 11/11/2015 18:16:33 +15/11/11 18:16:33 INFO api.DMLScript: HADOOP_HOME: /home/hadoop/hadoop-2.6.2 +15/11/11 18:16:33 WARN conf.DMLConfig: No default SystemML config file (./SystemML-config.xml) found +15/11/11 18:16:33 WARN conf.DMLConfig: Using default settings in DMLConfig +15/11/11 18:16:33 INFO jvm.JvmMetrics: Initializing JVM Metrics with processName=JobTracker, sessionId= +15/11/11 18:16:33 WARN hops.OptimizerUtils: Auto-disable multi-threaded text read for 'text' and 'csv' due to thread contention on
JRE < 1.8 (java.version=1.7.0_79). +15/11/11 18:16:35 INFO api.DMLScript: SystemML Statistics: +Total execution time: 1.484 sec. +Number of executed MR Jobs: 0. + +15/11/11 18:16:35 INFO api.DMLScript: END DML run 11/11/2015 18:16:35 +</code></pre> + +<p>If we list the contents of the current directory in our regular file system, we see that no files have been written +to the regular file system.</p> + +<pre><code>[hadoop@host1 ~]$ ls +genLinearRegressionData.dml hadoop-2.6.2 hadoop-2.6.2.tar.gz systemml-1.1.0 systemml-1.1.0.tar.gz +</code></pre> + +<p>If we list the contents of the HDFS file system, we see that HDFS contains our data files and the corresponding metadata files.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs dfs -ls +Found 5 items +drwxr-xr-x - hadoop supergroup 0 2015-11-11 18:16 linRegData.csv +-rw-r--r-- 1 hadoop supergroup 214 2015-11-11 18:16 linRegData.csv.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-11 18:16 perc.csv +-rw-r--r-- 1 hadoop supergroup 206 2015-11-11 18:16 perc.csv.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-11 18:16 scratch_space +</code></pre> + +<p>If we examine the Hadoop web interface mentioned previously, we see that the files, directories, and blocks in HDFS have +increased in number.</p> + +<p>Now that we’re done with this example, I’ll clean things up and delete the generated files from HDFS.</p> + +<pre><code>[hadoop@host1 hadoop]$ hdfs dfs -rm -r *.csv +[hadoop@host1 hadoop]$ hdfs dfs -rm *.mtd +[hadoop@host1 hadoop]$ hdfs dfs -rmdir scratch_space +</code></pre> + +<p>I’ll stop HDFS using the <code>stop-dfs.sh</code> script and then verify that the Java processes have stopped.</p> + +<pre><code>[hadoop@host1 ~]$ stop-dfs.sh +Stopping namenodes on [localhost] +localhost: stopping namenode +localhost: stopping datanode +Stopping secondary namenodes [0.0.0.0] +0.0.0.0: stopping secondarynamenode + +[hadoop@host1 ~]$ jps +37337 Jps +</code></pre> + +<hr /> + +<h1 id="systemml-with-pseudo-distributed-hadoop-and-yarn">SystemML with Pseudo-Distributed Hadoop and YARN</h1> + +<p>To add YARN to Pseudo-Distributed Hadoop on the single machine, we need to take our setup from the +previous example and update two configuration +files and start the ResourceManager and NodeManager daemons.</p> + +<p>In the <code>mapred-site.xml</code> configuration file, we specify the +<code>mapreduce.framework.name</code> property as <code>yarn</code>.</p> + +<pre><code>[hadoop@host1 hadoop]$ pwd +/home/hadoop/hadoop-2.6.2/etc/hadoop +[hadoop@host1 hadoop]$ cp mapred-site.xml.template mapred-site.xml +[hadoop@host1 hadoop]$ vi mapred-site.xml + +... +<configuration> + <property> + <name>mapreduce.framework.name</name> + <value>yarn</value> + </property> +</configuration> +... +</code></pre> + +<p>In the <code>yarn-site.xml</code> configuration file, we specify the <code>yarn.nodemanager.aux-services</code> property +to be <code>mapreduce_shuffle</code>.</p> + +<pre><code>[hadoop@host1 hadoop]$ vi yarn-site.xml + +... +<configuration> + <property> + <name>yarn.nodemanager.aux-services</name> + <value>mapreduce_shuffle</value> + </property> +</configuration> +... 
+</code></pre> + +<p>Next, we’ll start HDFS using the <code>start-dfs.sh</code> script.</p> + +<pre><code>[hadoop@host1 hadoop]$ start-dfs.sh +Starting namenodes on [localhost] +localhost: starting namenode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-namenode-host1.out +localhost: starting datanode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-datanode-host1.out +Starting secondary namenodes [0.0.0.0] +0.0.0.0: starting secondarynamenode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-secondarynamenode-host1.out +</code></pre> + +<p>After that, we’ll start YARN using the <code>start-yarn.sh</code> script.</p> + +<pre><code>[hadoop@host1 hadoop]$ start-yarn.sh +starting yarn daemons +starting resourcemanager, logging to /home/hadoop/hadoop-2.6.2/logs/yarn-hadoop-resourcemanager-host1.out +localhost: starting nodemanager, logging to /home/hadoop/hadoop-2.6.2/logs/yarn-hadoop-nodemanager-host1.out +</code></pre> + +<p>We can use the <code>jps</code> command to verify that the HDFS daemons (NameNode, DataNode, and SecondaryNameNode) and YARN +daemons (ResourceManager and NodeManager) are running.</p> + +<pre><code>[hadoop@host1 hadoop]$ jps +52046 ResourceManager +52482 Jps +52149 NodeManager +51582 NameNode +51712 DataNode +51880 SecondaryNameNode +</code></pre> + +<p>We can now view YARN information via the web interface on port 8088 (http://host1.example.com:8088).</p> + +<p>I’ll execute the <code>genLinearRegressionData.dml</code> example that we’ve previously considered.</p> + +<pre><code>[hadoop@host1 hadoop]$ cd ~ +[hadoop@host1 ~]$ hadoop jar systemml-1.1.0/SystemML.jar -f genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5 +15/11/12 11:57:04 INFO api.DMLScript: BEGIN DML run 11/12/2015 11:57:04 +15/11/12 11:57:04 INFO api.DMLScript: HADOOP_HOME: /home/hadoop/hadoop-2.6.2 +15/11/12 11:57:04 WARN conf.DMLConfig: No default SystemML config file (./SystemML-config.xml) found +15/11/12 11:57:04 WARN conf.DMLConfig: Using default settings in DMLConfig +15/11/12 11:57:05 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032 +15/11/12 11:57:06 WARN hops.OptimizerUtils: Auto-disable multi-threaded text read for 'text' and 'csv' due to thread contention on JRE < 1.8 (java.version=1.7.0_79). +15/11/12 11:57:07 INFO api.DMLScript: SystemML Statistics: +Total execution time: 1.265 sec. +Number of executed MR Jobs: 0. 
+ +15/11/12 11:57:07 INFO api.DMLScript: END DML run 11/12/2015 11:57:07 +</code></pre> + +<p>If we examine the HDFS file system, we see the files generated by the execution of the DML script by SystemML on Hadoop.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs dfs -ls +Found 5 items +drwxr-xr-x - hadoop supergroup 0 2015-11-12 11:57 linRegData.csv +-rw-r--r-- 1 hadoop supergroup 214 2015-11-12 11:57 linRegData.csv.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-12 11:57 perc.csv +-rw-r--r-- 1 hadoop supergroup 206 2015-11-12 11:57 perc.csv.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-12 11:57 scratch_space +</code></pre> + +<p>I’ll go ahead and delete the generated example files from HDFS.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs dfs -rm -r *.csv +[hadoop@host1 ~]$ hdfs dfs -rm *.mtd +[hadoop@host1 ~]$ hdfs dfs -rmdir scratch_space +</code></pre> + +<p>We’ll stop the YARN daemons using the <code>stop-yarn.sh</code> script.</p> + +<pre><code>[hadoop@host1 ~]$ stop-yarn.sh +stopping yarn daemons +stopping resourcemanager +localhost: stopping nodemanager +no proxyserver to stop +</code></pre> + +<p>We can stop HDFS with the <code>stop-dfs.sh</code> script.</p> + +<pre><code>[hadoop@host1 ~]$ stop-dfs.sh +Stopping namenodes on [localhost] +localhost: stopping namenode +localhost: stopping datanode +Stopping secondary namenodes [0.0.0.0] +0.0.0.0: stopping secondarynamenode +</code></pre> + +<p>If we list the running Java processes, we see all the YARN daemons and HDFS daemons have stopped.</p> + +<pre><code>[hadoop@host1 ~]$ jps +53459 Jps +</code></pre> + +<p>For cleanliness, I’ll also delete the <code>/tmp/hadoop-hadoop</code> files created by Hadoop before proceeding to +the next example.</p> + +<hr /> + +<h1 id="systemml-with-distributed-hadoop-and-yarn">SystemML with Distributed Hadoop and YARN</h1> + +<p>In our previous example, we ran SystemML on Hadoop in Pseudo-Distributed mode with YARN on a single machine. +This example will look at Distributed Hadoop with YARN on a 4-node cluster. Each server is running +Red Hat Enterprise Linux Server, release 6.6.</p> + +<p>I have 4 nodes: host1, host2, host3, and host4. The host1 node +that we previously set up will act as the master for both HDFS and YARN, +and host2, host3, and host4 will be slaves. For more information regarding +network configurations, please see the Hadoop documentation.</p> + +<p>First, I created a hadoop user on each slave node.</p> + +<pre><code>[root@host1 ~]# ssh [email protected] +[root@host2 ~]# useradd hadoop +[root@host2 ~]# passwd hadoop +[root@host2 ~]# exit + +[root@host1 ~]# ssh [email protected] +[root@host3 ~]# useradd hadoop +[root@host3 ~]# passwd hadoop +[root@host3 ~]# exit + +[root@host1 ~]# ssh [email protected] +[root@host4 ~]# useradd hadoop +[root@host4 ~]# passwd hadoop +[root@host4 ~]# exit +</code></pre> + +<p>Next, I set up passwordless login from the hadoop user on the master node (host1) +to each of the slave nodes. The <code>ssh-copy-id</code> command copied the master node’s hadoop user’s +public key value to the ~/.ssh/authorized_keys file of each of the slave nodes.
I +tested the passwordless login from the master node to each of the slave nodes for the hadoop +user.</p> + +<pre><code>$ ssh [email protected] + +[hadoop@host1 ~]$ ssh-copy-id host2.example.com +[hadoop@host1 ~]$ ssh [email protected] +Last login: Thu Nov 12 14:16:21 2015 +[hadoop@host2 ~]$ exit + +[hadoop@host1 ~]$ ssh-copy-id host3.example.com +[hadoop@host1 ~]$ ssh [email protected] +Last login: Thu Nov 12 14:16:40 2015 +[hadoop@host3 ~]$ exit + +[hadoop@host1 ~]$ ssh-copy-id host4.example.com +[hadoop@host1 ~]$ ssh [email protected] +Last login: Thu Nov 12 14:17:10 2015 +[hadoop@host4 ~]$ exit +</code></pre> + +<p>On the master node, I specified the slave nodes in the Hadoop <code>slaves</code> configuration file.</p> + +<pre><code>[hadoop@host1 hadoop]$ pwd +/home/hadoop/hadoop-2.6.2/etc/hadoop +[hadoop@host1 hadoop]$ more slaves +host2.example.com +host3.example.com +host4.example.com +</code></pre> + +<p>In the <code>core-site.xml</code> file, I specified the <code>fs.defaultFS</code> property to reference the master node.</p> + +<pre><code>[hadoop@host1 hadoop]$ more core-site.xml + +... +<configuration> + <property> + <name>fs.defaultFS</name> + <value>hdfs://host1.example.com:9000</value> + </property> +</configuration> +... +</code></pre> + +<p>In the <code>hdfs-site.xml</code> configuration file, I removed the previous <code>dfs.replication</code> property, since we +will use the default replication value (of 3).</p> + +<pre><code>[hadoop@host1 hadoop]$ more hdfs-site.xml + +... +<configuration> +</configuration> +... +</code></pre> + +<p>We’ll be using YARN, so our <code>mapred-site.xml</code> will have the <code>mapreduce.framework.name</code> +property set to <code>yarn</code>, as in the previous example. Additionally, we’ll set the <code>mapreduce.map.java.opts</code> and +<code>mapreduce.reduce.java.opts</code> properties to <code>-Xmx2g -Xms2g -Xmn200m</code>. The <code>-Xmn</code> parameter fixes the +size of the young generation and typically is set to 10% of the maximum heap, which we have set to 2g. +Furthermore, we’ll set <code>mapreduce.map.memory.mb</code> and <code>mapreduce.reduce.memory.mb</code> to <code>3072</code>. Typically these +values are set to at least 1.5 times the value of the maximum heap size.</p> + +<pre><code>[hadoop@host1 hadoop]$ more mapred-site.xml + +... +<configuration> + <property> + <name>mapreduce.framework.name</name> + <value>yarn</value> + </property> + <property> + <name>mapreduce.map.java.opts</name> + <value>-Xmx2g -Xms2g -Xmn200m</value> + </property> + <property> + <name>mapreduce.reduce.java.opts</name> + <value>-Xmx2g -Xms2g -Xmn200m</value> + </property> + <property> + <name>mapreduce.map.memory.mb</name> + <value>3072</value> + </property> + <property> + <name>mapreduce.reduce.memory.mb</name> + <value>3072</value> + </property> +</configuration> +... +</code></pre> + +<p>In the <code>yarn-site.xml</code> configuration file, I added a <code>yarn.resourcemanager.hostname</code> property and specified +the master node as the host.</p> + +<pre><code>[hadoop@host1 hadoop]$ more yarn-site.xml + +... +<configuration> + <property> + <name>yarn.nodemanager.aux-services</name> + <value>mapreduce_shuffle</value> + </property> + <property> + <name>yarn.resourcemanager.hostname</name> + <value>host1.example.com</value> + </property> +</configuration> +... +</code></pre> + +<p>In the previous example, we specified the <code>JAVA_HOME</code> in the <code>hadoop-env.sh</code> configuration script. 
+We will use that same value.</p> + +<pre><code>[hadoop@host1 hadoop]$ more hadoop-env.sh + +... +export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk.x86_64 +... +</code></pre> + +<p>Next, I copied my hadoop installation (which includes all of the mentioned configuration settings) +to each slave node.</p> + +<pre><code>[hadoop@host1 ~]$ pwd +/home/hadoop +[hadoop@host1 ~]$ scp -r hadoop-2.6.2 [email protected]:~/ +[hadoop@host1 ~]$ scp -r hadoop-2.6.2 [email protected]:~/ +[hadoop@host1 ~]$ scp -r hadoop-2.6.2 [email protected]:~/ +</code></pre> + +<p>My master node <code>.bash_profile</code> contains <code>JAVA_HOME</code> and <code>HADOOP_HOME</code> environment variables +and adds <code>JAVA_HOME/bin</code>, <code>HADOOP_HOME/bin</code> and <code>HADOOP_HOME/sbin</code> to the <code>PATH</code>.</p> + +<pre><code>... +export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk.x86_64 +export HADOOP_HOME=/home/hadoop/hadoop-2.6.2 +PATH=$JAVA_HOME/bin:$PATH:$HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin +export PATH +... +</code></pre> + +<p>I copied the <code>.bash_profile</code> file to the slave nodes.</p> + +<pre><code>[hadoop@host1 ~]$ pwd +/home/hadoop +[hadoop@host1 ~]$ scp .bash_profile [email protected]:~/.bash_profile +[hadoop@host1 ~]$ scp .bash_profile [email protected]:~/.bash_profile +[hadoop@host1 ~]$ scp .bash_profile [email protected]:~/.bash_profile +</code></pre> + +<p>On the master, I formatted HDFS.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs namenode -format +</code></pre> + +<p>Next, on the master, I started HDFS using <code>start-dfs.sh</code>. We can see that the master NameNode +and the slave DataNodes started up.</p> + +<pre><code>[hadoop@host1 ~]$ start-dfs.sh +Starting namenodes on [host1.example.com] +host1.example.com: starting namenode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-namenode-host1.out +host4.example.com: starting datanode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-datanode-host4.out +host2.example.com: starting datanode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-datanode-host2.out +host3.example.com: starting datanode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-datanode-host3.out +Starting secondary namenodes [0.0.0.0] +0.0.0.0: starting secondarynamenode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-secondarynamenode-host1.out +</code></pre> + +<p>Next I started YARN using the <code>start-yarn.sh</code> script. 
We see the master ResourceManager and the +slave NodeManagers started up.</p> + +<pre><code>[hadoop@host1 ~]$ start-yarn.sh +starting yarn daemons +starting resourcemanager, logging to /home/hadoop/hadoop-2.6.2/logs/yarn-hadoop-resourcemanager-host1.out +host3.example.com: starting nodemanager, logging to /home/hadoop/hadoop-2.6.2/logs/yarn-hadoop-nodemanager-host3.out +host2.example.com: starting nodemanager, logging to /home/hadoop/hadoop-2.6.2/logs/yarn-hadoop-nodemanager-host2.out +host4.example.com: starting nodemanager, logging to /home/hadoop/hadoop-2.6.2/logs/yarn-hadoop-nodemanager-host4.out +</code></pre> + +<p>On the master, we see that the NameNode, SecondaryNameNode, and ResourceManager daemons are running.</p> + +<pre><code>[hadoop@host1 ~]$ jps +1563 NameNode +1775 SecondaryNameNode +2240 Jps +1978 ResourceManager +</code></pre> + +<p>On the slaves, we see that the DataNode and NodeManager daemons are running.</p> + +<pre><code>[hadoop@host2 ~]$ jps +29096 Jps +28974 NodeManager +28821 DataNode + +[hadoop@host3 ~]$ jps +5950 Jps +5706 DataNode +5819 NodeManager + +[hadoop@host4 ~]$ jps +16388 Jps +16153 DataNode +16266 NodeManager +</code></pre> + +<p>If we look at the Hadoop (on port 50070) and YARN (on port 8088) web interfaces, we can see information about our running cluster.</p> + +<hr /> + +<h2 id="systemml-with-distributed-hadoop-and-yarn-linear-regression-example">SystemML with Distributed Hadoop and YARN: Linear Regression Example</h2> + +<p>Let’s go ahead and run the SystemML example from the GitHub README.</p> + +<pre><code>[hadoop@host1 ~]$ hadoop jar systemml-1.1.0/SystemML.jar -f genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5 + +[hadoop@host1 ~]$ hadoop jar systemml-1.1.0/SystemML.jar -f systemml-1.1.0/algorithms/utils/sample.dml -nvargs X=linRegData.csv sv=perc.csv O=linRegDataParts ofmt=csv + +[hadoop@host1 ~]$ hadoop jar systemml-1.1.0/SystemML.jar -f systemml-1.1.0/algorithms/utils/splitXY.dml -nvargs X=linRegDataParts/1 y=51 OX=linRegData.train.data.csv OY=linRegData.train.labels.csv ofmt=csv + +[hadoop@host1 ~]$ hadoop jar systemml-1.1.0/SystemML.jar -f systemml-1.1.0/algorithms/utils/splitXY.dml -nvargs X=linRegDataParts/2 y=51 OX=linRegData.test.data.csv OY=linRegData.test.labels.csv ofmt=csv + +[hadoop@host1 ~]$ hadoop jar systemml-1.1.0/SystemML.jar -f systemml-1.1.0/algorithms/LinearRegDS.dml -nvargs X=linRegData.train.data.csv Y=linRegData.train.labels.csv B=betas.csv fmt=csv +... +BEGIN LINEAR REGRESSION SCRIPT +Reading X and Y... +Calling the Direct Solver... +Computing the statistics... +AVG_TOT_Y,-0.051722694902638956 +STDEV_TOT_Y,54.132787822718356 +AVG_RES_Y,1.5905895170230406E-10 +STDEV_RES_Y,2.0668015575844624E-8 +DISPERSION,4.262683023432828E-16 +R2,1.0 +ADJUSTED_R2,1.0 +R2_NOBIAS,1.0 +ADJUSTED_R2_NOBIAS,1.0 +R2_VS_0,1.0 +ADJUSTED_R2_VS_0,1.0 +Writing the output matrix... +END LINEAR REGRESSION SCRIPT +15/11/17 15:50:34 INFO api.DMLScript: SystemML Statistics: +Total execution time: 0.480 sec. +... + +[hadoop@host1 ~]$ hadoop jar systemml-1.1.0/SystemML.jar -f systemml-1.1.0/algorithms/GLM-predict.dml -nvargs X=linRegData.test.data.csv Y=linRegData.test.labels.csv B=betas.csv fmt=csv +... 
+LOGLHOOD_Z,,FALSE,NaN +LOGLHOOD_Z_PVAL,,FALSE,NaN +PEARSON_X2,,FALSE,2.5039962709907123E-13 +PEARSON_X2_BY_DF,,FALSE,5.703863943031236E-16 +PEARSON_X2_PVAL,,FALSE,1.0 +DEVIANCE_G2,,FALSE,0.0 +DEVIANCE_G2_BY_DF,,FALSE,0.0 +DEVIANCE_G2_PVAL,,FALSE,1.0 +LOGLHOOD_Z,,TRUE,NaN +LOGLHOOD_Z_PVAL,,TRUE,NaN +PEARSON_X2,,TRUE,2.5039962709907123E-13 +PEARSON_X2_BY_DF,,TRUE,5.703863943031236E-16 +PEARSON_X2_PVAL,,TRUE,1.0 +DEVIANCE_G2,,TRUE,0.0 +DEVIANCE_G2_BY_DF,,TRUE,0.0 +DEVIANCE_G2_PVAL,,TRUE,1.0 +AVG_TOT_Y,1,,0.9381218622147646 +STDEV_TOT_Y,1,,55.6116696631821 +AVG_RES_Y,1,,2.5577864570734575E-10 +STDEV_RES_Y,1,,2.390848397359923E-8 +PRED_STDEV_RES,1,TRUE,1.0 +R2,1,,1.0 +ADJUSTED_R2,1,,1.0 +R2_NOBIAS,1,,1.0 +ADJUSTED_R2_NOBIAS,1,,1.0 +15/11/17 15:51:17 INFO api.DMLScript: SystemML Statistics: +Total execution time: 0.269 sec. +... +</code></pre> + +<p>If we look at HDFS, we can see the files that were generated by the SystemML DML script executions.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs dfs -ls +Found 16 items +drwxr-xr-x - hadoop supergroup 0 2015-11-17 15:50 betas.csv +-rw-r--r-- 3 hadoop supergroup 208 2015-11-17 15:50 betas.csv.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-17 15:21 linRegData.csv +-rw-r--r-- 3 hadoop supergroup 214 2015-11-17 15:21 linRegData.csv.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-17 15:50 linRegData.test.data.csv +-rw-r--r-- 3 hadoop supergroup 213 2015-11-17 15:50 linRegData.test.data.csv.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-17 15:50 linRegData.test.labels.csv +-rw-r--r-- 3 hadoop supergroup 210 2015-11-17 15:50 linRegData.test.labels.csv.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-17 15:49 linRegData.train.data.csv +-rw-r--r-- 3 hadoop supergroup 213 2015-11-17 15:49 linRegData.train.data.csv.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-17 15:49 linRegData.train.labels.csv +-rw-r--r-- 3 hadoop supergroup 210 2015-11-17 15:49 linRegData.train.labels.csv.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-17 15:49 linRegDataParts +drwxr-xr-x - hadoop supergroup 0 2015-11-17 15:21 perc.csv +-rw-r--r-- 3 hadoop supergroup 206 2015-11-17 15:21 perc.csv.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-17 15:21 scratch_space +</code></pre> + +<p>Before the next example, I’ll delete the files created in HDFS by this example.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs dfs -rm -r linRegData* +[hadoop@host1 ~]$ hdfs dfs -rm -r *.csv +[hadoop@host1 ~]$ hdfs dfs -rm -r *.mtd +</code></pre> + +<hr /> + +<h2 id="systemml-with-distributed-hadoop-and-yarn-k-means-clustering-example">SystemML with Distributed Hadoop and YARN: K-Means Clustering Example</h2> + +<p>Our previous example showed SystemML running in Hadoop Batch mode on a 4-node cluster with YARN. +However, the size of the data used was trivial. In this example, we’ll generate a slightly larger set +of data and then analyze that data with the <code>Kmeans.dml</code> and <code>Kmeans-predict.dml</code> scripts. +Information about the SystemML K-means clustering algorithm can be found in the +<a href="algorithms-clustering.html#k-means-clustering">K-Means Clustering</a> section of the <a href="algorithms-reference.html">SystemML +Algorithms Reference</a>.</p> + +<p>I’m going to modify my <code>SystemML-config.xml</code> file. +I updated the <code>numreducers</code> property to be 6, which is twice my number of data nodes. 
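</p>
+
+<p>(The edit itself is sketched below; it assumes the <code>SystemML-config.xml</code> file shipped at the top level of the unpacked release, which is the path used for the <code>-config</code> argument later in this section.)</p>
+
+<pre><code>[hadoop@host1 ~]$ vi systemml-1.1.0/SystemML-config.xml
+</code></pre>
+
+<p>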
+The <code>numreducers</code> property specifies the number of reduce tasks per MR job.</p> + +<pre><code><numreducers>6</numreducers> +</code></pre> + +<p>To begin, I’ll download the <code>genRandData4Kmeans.dml</code> script that I’ll use to generate a set of data.</p> + +<pre><code>[hadoop@host1 ~]$ wget https://raw.githubusercontent.com/apache/systemml/master/scripts/datagen/genRandData4Kmeans.dml +</code></pre> + +<p>A description of the named arguments that can be passed in to this script can be found in the comment section at the top of the +<code>genRandData4Kmeans.dml</code> file. For data, I’ll generate a matrix <code>X.mtx</code> consisting of 1 million rows and 100 features. I’ll explicitly reference my <code>SystemML-config.xml</code> file, since I’m +executing SystemML in Hadoop from my home directory rather than from the SystemML project root directory.</p> + +<pre><code>[hadoop@host1 ~]$ hadoop jar systemml-1.1.0/SystemML.jar -f genRandData4Kmeans.dml -config systemml-1.1.0/SystemML-config.xml -nvargs nr=1000000 nf=100 nc=10 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=X.mtx C=C.mtx Y=Y.mtx YbyC=YbyC.mtx +</code></pre> + +<p>After the data generation has finished, I’ll check HDFS for the amount of space used. The 1M-row matrix <code>X.mtx</code> +requires about 2.8GB of space.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs dfs -df -h +Filesystem Size Used Available Use% +hdfs://host1.example.com:9000 400.7 G 2.8 G 318.7 G 1% +</code></pre> + +<p>Here we can see the data files that were generated.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs dfs -ls +Found 9 items +drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:53 C.mtx +-rw-r--r-- 3 hadoop supergroup 176 2015-11-19 11:53 C.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:56 X.mtx +-rw-r--r-- 3 hadoop supergroup 186 2015-11-19 11:56 X.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:53 Y.mtx +-rw-r--r-- 3 hadoop supergroup 182 2015-11-19 11:53 Y.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:56 YbyC.mtx +-rw-r--r-- 3 hadoop supergroup 182 2015-11-19 11:56 YbyC.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:56 scratch_space +</code></pre> + +<p>Here we can see the <code>X.mtx</code> data files.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs dfs -ls X.mtx +Found 6 items +-rw-r--r-- 1 hadoop supergroup 484418384 2015-11-19 11:56 X.mtx/2-r-00000 +-rw-r--r-- 1 hadoop supergroup 481626112 2015-11-19 11:56 X.mtx/2-r-00001 +-rw-r--r-- 1 hadoop supergroup 475834931 2015-11-19 11:56 X.mtx/2-r-00002 +-rw-r--r-- 1 hadoop supergroup 478519922 2015-11-19 11:56 X.mtx/2-r-00003 +-rw-r--r-- 1 hadoop supergroup 481624723 2015-11-19 11:56 X.mtx/2-r-00004 +-rw-r--r-- 1 hadoop supergroup 481624048 2015-11-19 11:56 X.mtx/2-r-00005 +</code></pre> + +<p>Next, I’ll run the <code>Kmeans.dml</code> algorithm on the 1M-row matrix <code>X.mtx</code>.</p> + +<pre><code>[hadoop@host1 ~]$ hadoop jar systemml-1.1.0/SystemML.jar -f systemml-1.1.0/algorithms/Kmeans.dml -config systemml-1.1.0/SystemML-config.xml -nvargs X=X.mtx k=5 C=Centroids.mtx +</code></pre> + +<p>We can see the <code>Centroids.mtx</code> data file has been written to HDFS.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs dfs -ls +Found 11 items +drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:53 C.mtx +-rw-r--r-- 3 hadoop supergroup 176 2015-11-19 11:53 C.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 12:10 Centroids.mtx +-rw-r--r-- 3 hadoop supergroup 174 2015-11-19 12:10 Centroids.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:56 X.mtx +-rw-r--r-- 3 hadoop supergroup 186
2015-11-19 11:56 X.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:53 Y.mtx +-rw-r--r-- 3 hadoop supergroup 182 2015-11-19 11:53 Y.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:56 YbyC.mtx +-rw-r--r-- 3 hadoop supergroup 182 2015-11-19 11:56 YbyC.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 12:10 scratch_space +</code></pre> + +<p>Now that we have trained our model, we can test it with +the <code>Kmeans-predict.dml</code> script.</p> + +<pre><code>[hadoop@host1 ~]$ hadoop jar systemml-1.1.0/SystemML.jar -f systemml-1.1.0/algorithms/Kmeans-predict.dml -config systemml-1.1.0/SystemML-config.xml -nvargs X=X.mtx C=Centroids.mtx prY=PredY.mtx O=stats.txt +</code></pre> + +<p>In the file system, we can see that the <code>PredY.mtx</code> matrix was created. +The <code>stats.txt</code> file lists statistics about the results.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs dfs -ls +Found 15 items +drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:53 C.mtx +-rw-r--r-- 3 hadoop supergroup 176 2015-11-19 11:53 C.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 12:10 Centroids.mtx +-rw-r--r-- 3 hadoop supergroup 174 2015-11-19 12:10 Centroids.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 13:20 PredY.mtx +-rw-r--r-- 3 hadoop supergroup 182 2015-11-19 13:20 PredY.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:56 X.mtx +-rw-r--r-- 3 hadoop supergroup 186 2015-11-19 11:56 X.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:53 Y.mtx +-rw-r--r-- 3 hadoop supergroup 182 2015-11-19 11:53 Y.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:56 YbyC.mtx +-rw-r--r-- 3 hadoop supergroup 182 2015-11-19 11:56 YbyC.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 13:21 scratch_space +-rw-r--r-- 3 hadoop supergroup 261 2015-11-19 13:21 stats.txt +-rw-r--r-- 3 hadoop supergroup 127 2015-11-19 13:21 stats.txt.mtd +</code></pre> + +<p>The <code>PredY.mtx</code> matrix consists of a single column of a million rows of doubles, as we can +see in the resulting metadata file.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs dfs -cat PredY.mtx.mtd +{ + "data_type": "matrix" + ,"value_type": "double" + ,"rows": 1000000 + ,"cols": 1 + ,"nnz": 1000000 + ,"format": "text" + ,"description": { "author": "SystemML" } +} +</code></pre> + +<p>The statistics generated from testing the method are displayed below.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs dfs -cat stats.txt +TSS,,1.1262427174414966E11 +WCSS_M,,9.77022617396343E10 +WCSS_M_PC,,86.75062686450579 +BCSS_M,,1.4922010004515366E10 +BCSS_M_PC,,13.249373135494215 +WCSS_C,,9.770230517014426E10 +WCSS_C_PC,,86.75066542680617 +BCSS_C,,1.4921964103415842E10 +BCSS_C_PC,,13.249332379537428 +</code></pre> + +<hr /> + +<h1 id="recommended-hadoop-cluster-configuration-settings">Recommended Hadoop Cluster Configuration Settings</h1> + +<p>Below are some recommended Hadoop configuration file settings that may be of assistance when running SystemML on Hadoop +in a clustered environment.</p> + +<table> + <thead> + <tr> + <th>Configuration File</th> + <th>Setting</th> + <th>Value</th> + <th>Description</th> + </tr> + </thead> + <tbody> + <tr> + <td nowrap="" rowspan="5" style="vertical-align: top; padding-top: 22px;"><code>mapred-site.xml</code></td> + <td nowrap=""><code>mapreduce.map.java.opts</code></td> + <td nowrap=""><code>-Xmx2g -Xms2g -Xmn200m</code></td> + <td>Increase memory of child JVMs of Maps, max/min heap size of 2GB, -Xmn specifies young generation which is typically 10% of maximum heap size</td> + </tr> +
<tr> + <td nowrap=""><code>mapreduce.reduce.java.opts</code></td> + <td nowrap=""><code>-Xmx2g -Xms2g -Xmn200m</code></td> + <td>Increase memory of child JVMs of Reduces, max/min heap size of 2GB, -Xmn specifies young generation which is typically 10% of maximum heap size</td> + </tr> + <tr> + <td nowrap=""><code>mapreduce.map.memory.mb</code></td> + <td nowrap=""><code>3072</code></td> + <td>Set to at least 1.5 times the value of the Map max heap size</td> + </tr> + <tr> + <td nowrap=""><code>mapreduce.reduce.memory.mb</code></td> + <td nowrap=""><code>3072</code></td> + <td>Set to at least 1.5 times the value of the Reduce max heap size</td> + </tr> + <tr> + <td nowrap=""><code>io.sort.mb</code> (deprecated) /<br /> <code>mapreduce.task.io.sort.mb</code></td> + <td nowrap=""><code>384</code></td> + <td>Memory limit while sorting data</td> + </tr> + <tr> + <td nowrap=""><code>yarn-site.xml</code></td> + <td nowrap=""><code>yarn.nodemanager.vmem-pmem-ratio</code></td> + <td nowrap=""><code>2</code> to <code>5</code></td> + <td>Maximum ratio of virtual memory to physical memory</td> + </tr> + </tbody> +</table> + + + </div> <!-- /container --> + + + + <script src="js/vendor/jquery-1.12.0.min.js"></script> + <script src="js/vendor/bootstrap.min.js"></script> + <script src="js/vendor/anchor.min.js"></script> + <script src="js/main.js"></script> + + + + + + <!-- Analytics --> + <script> + (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ + (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), + m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) + })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); + ga('create', 'UA-71553733-1', 'auto'); + ga('send', 'pageview'); + </script> + + + + <!-- MathJax Section --> + <script type="text/x-mathjax-config"> + MathJax.Hub.Config({ + TeX: { equationNumbers: { autoNumber: "AMS" } } + }); + </script> + <script> + // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS. + // We could use "//cdn.mathjax...", but that won't support "file://". + (function(d, script) { + script = d.createElement('script'); + script.type = 'text/javascript'; + script.async = true; + script.onload = function(){ + MathJax.Hub.Config({ + tex2jax: { + inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ], + displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], + processEscapes: true, + skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] + } + }); + }; + script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') + + 'cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'; + d.getElementsByTagName('head')[0].appendChild(script); + }(document)); + </script> + </body> +</html>
+ + + </div> <!-- /container --> + + + + <script src="js/vendor/jquery-1.12.0.min.js"></script> + <script src="js/vendor/bootstrap.min.js"></script> + <script src="js/vendor/anchor.min.js"></script> + <script src="js/main.js"></script> + + + + + + <!-- Analytics --> + <script> + (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ + (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), + m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) + })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); + ga('create', 'UA-71553733-1', 'auto'); + ga('send', 'pageview'); + </script> + + + + <!-- MathJax Section --> + <script type="text/x-mathjax-config"> + MathJax.Hub.Config({ + TeX: { equationNumbers: { autoNumber: "AMS" } } + }); + </script> + <script> + // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS. + // We could use "//cdn.mathjax...", but that won't support "file://". + (function(d, script) { + script = d.createElement('script'); + script.type = 'text/javascript'; + script.async = true; + script.onload = function(){ + MathJax.Hub.Config({ + tex2jax: { + inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ], + displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], + processEscapes: true, + skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] + } + }); + }; + script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') + + 'cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'; + d.getElementsByTagName('head')[0].appendChild(script); + }(document)); + </script> + </body> +</html>
Added: systemml/site/docs/1.1.0/index.html URL: http://svn.apache.org/viewvc/systemml/site/docs/1.1.0/index.html?rev=1828046&view=auto ============================================================================== --- systemml/site/docs/1.1.0/index.html (added) +++ systemml/site/docs/1.1.0/index.html Fri Mar 30 04:31:05 2018 @@ -0,0 +1,240 @@ +<!DOCTYPE html> +<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]--> +<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]--> +<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]--> +<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]--> + <head> + <title>SystemML Documentation - SystemML 1.1.0</title> + <meta charset="utf-8"> + <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> + + <meta name="description" content="SystemML Documentation"> + + <meta name="viewport" content="width=device-width"> + <link rel="stylesheet" href="css/bootstrap.min.css"> + <link rel="stylesheet" href="css/main.css"> + <link rel="stylesheet" href="css/pygments-default.css"> + <link rel="shortcut icon" href="img/favicon.png"> + </head> + <body> + <!--[if lt IE 7]> + <p class="chromeframe">You are using an outdated browser. <a href="http://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p> + <![endif]--> + + <header class="navbar navbar-default navbar-fixed-top" id="topbar"> + <div class="container"> + <div class="navbar-header"> + <div class="navbar-brand brand projectlogo"> + <a href="http://systemml.apache.org/"><img class="logo" src="img/systemml-logo.png" alt="Apache SystemML" title="Apache SystemML"/></a> + </div> + <div class="navbar-brand brand projecttitle"> + <a href="http://systemml.apache.org/">Apache SystemML<sup id="trademark">&trade;</sup></a><br/> + <span class="version">1.1.0</span> + </div> + <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target=".navbar-collapse"> + <span class="sr-only">Toggle navigation</span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + </button> + </div> + <nav class="navbar-collapse collapse"> + <ul class="nav navbar-nav navbar-right"> + <li><a href="index.html">Overview</a></li> + <li><a href="https://github.com/apache/systemml">GitHub</a></li> + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">Documentation<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><b>Running SystemML:</b></li> + <li><a href="https://github.com/apache/systemml">SystemML GitHub README</a></li> + <li><a href="spark-mlcontext-programming-guide.html">Spark MLContext</a></li> + <li><a href="spark-batch-mode.html">Spark Batch Mode</a></li> + <li><a href="hadoop-batch-mode.html">Hadoop Batch Mode</a></li> + <li><a href="standalone-guide.html">Standalone Guide</a></li> + <li><a href="jmlc.html">Java Machine Learning Connector (JMLC)</a></li> + <li class="divider"></li> + <li><b>Language Guides:</b></li> + <li><a href="dml-language-reference.html">DML Language Reference</a></li> + <li><a href="beginners-guide-to-dml-and-pydml.html">Beginner's Guide to DML and PyDML</a></li> + <li><a href="beginners-guide-python.html">Beginner's Guide for Python Users</a></li> + <li><a href="python-reference.html">Reference Guide for Python Users</a></li> + <li class="divider"></li> + <li><b>ML Algorithms:</b></li> + <li><a href="algorithms-reference.html">Algorithms Reference</a></li> + 
<li class="divider"></li> + <li><b>Tools:</b></li> + <li><a href="debugger-guide.html">Debugger Guide</a></li> + <li><a href="developer-tools-systemml.html">IDE Guide</a></li> + <li class="divider"></li> + <li><b>Other:</b></li> + <li><a href="contributing-to-systemml.html">Contributing to SystemML</a></li> + <li><a href="engine-dev-guide.html">Engine Developer Guide</a></li> + <li><a href="troubleshooting-guide.html">Troubleshooting Guide</a></li> + <li><a href="release-process.html">Release Process</a></li> + </ul> + </li> + + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><a href="./api/java/index.html">Java</a></li> + <li><a href="./api/python/index.html">Python</a></li> + </ul> + </li> + + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">Issues<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><b>JIRA:</b></li> + <li><a href="https://issues.apache.org/jira/browse/SYSTEMML">SystemML JIRA</a></li> + + </ul> + </li> + </ul> + </nav> + </div> + </header> + + <div class="container" id="content"> + + <h1 class="title">SystemML Documentation</h1> + + + <!-- + +--> + +<p>SystemML is a flexible, scalable machine learning system. +SystemML’s distinguishing characteristics are:</p> + +<ol> + <li><strong>Algorithm customizability via R-like and Python-like languages</strong>.</li> + <li><strong>Multiple execution modes</strong>, including Spark MLContext, Spark Batch, Hadoop Batch, Standalone, and JMLC.</li> + <li><strong>Automatic optimization</strong> based on data and cluster characteristics to ensure both efficiency and scalability.</li> +</ol> + +<p>The <a href="https://github.com/apache/systemml">SystemML GitHub README</a> describes +building, testing, and running SystemML. Please read <a href="contributing-to-systemml">Contributing to SystemML</a> +to find out how to help make SystemML even better!</p> + +<p>To download SystemML, visit the <a href="http://systemml.apache.org/download">downloads</a> page.</p> + +<p>This version of SystemML supports: Java 8+, Scala 2.11+, Python 2.7/3.5+, Hadoop 2.6+, and Spark 2.1+.</p> + +<h2 id="running-systemml">Running SystemML</h2> + +<ul> + <li><a href="beginners-guide-python">Beginner’s Guide For Python Users</a> - An introductory guide for Python users.</li> + <li><a href="spark-mlcontext-programming-guide">Spark MLContext</a> - Spark MLContext is a programmatic API +for running SystemML from Spark via Scala, Python, or Java. 
+ <ul> + <li><a href="spark-mlcontext-programming-guide#spark-shell-example">Spark Shell Example (Scala)</a></li> + <li><a href="spark-mlcontext-programming-guide#jupyter-pyspark-notebook-example---poisson-nonnegative-matrix-factorization">Jupyter Notebook Example (PySpark)</a></li> + </ul> + </li> + <li><a href="spark-batch-mode">Spark Batch</a> - Algorithms are automatically optimized to run across Spark clusters.</li> + <li><a href="hadoop-batch-mode">Hadoop Batch</a> - Algorithms are automatically optimized when distributed across Hadoop clusters.</li> + <li><a href="standalone-guide">Standalone</a> - Standalone mode allows data scientists to rapidly prototype algorithms on a single +machine in R-like and Python-like declarative languages.</li> + <li><a href="jmlc">JMLC</a> - Java Machine Learning Connector, a programmatic API for embedding SystemML in Java applications.</li> + <li><a href="deep-learning">Deep Learning with SystemML</a> + <ul> + <li><em>Experimental</em> Caffe2DML API for Deep Learning (<a href="beginners-guide-caffe2dml">beginner’s guide</a>, <a href="reference-guide-caffe2dml">reference guide</a>) - Converts a Caffe specification to DML.</li> + <li><em>Experimental</em> <a href="beginners-guide-keras2dml">Keras2DML API</a> for Deep Learning.</li> + </ul> + </li> +</ul> + +<h2 id="language-guides">Language Guides</h2> + +<ul> + <li><a href="python-reference">Python API Reference</a> - Reference guide for the SystemML Python API.</li> + <li><a href="dml-language-reference">DML Language Reference</a> - +DML is a high-level R-like declarative language for machine learning.</li> + <li><strong>PyDML Language Reference</strong> - +PyDML is a high-level Python-like declarative language for machine learning.</li> + <li><a href="beginners-guide-to-dml-and-pydml">Beginner’s Guide to DML and PyDML</a> - +An introduction to the basics of DML and PyDML.</li> +</ul> + +<h2 id="ml-algorithms">ML Algorithms</h2> + +<ul> + <li><a href="algorithms-reference">Algorithms Reference</a> - The Algorithms Reference describes the +machine learning algorithms included with SystemML in detail.</li> +</ul> + +<h2 id="tools">Tools</h2> + +<ul> + <li><a href="debugger-guide">Debugger Guide</a> - SystemML supports DML script-level debugging through a +command-line interface.</li> + <li><a href="developer-tools-systemml">IDE Guide</a> - A guide to developing SystemML in an IDE.</li> +</ul> + +<h2 id="other">Other</h2> + +<ul> + <li><a href="contributing-to-systemml">Contributing to SystemML</a> - Describes ways to contribute to SystemML.</li> + <li><a href="engine-dev-guide">Engine Developer Guide</a> - Guide for internal SystemML engine development.</li> + <li><a href="troubleshooting-guide">Troubleshooting Guide</a> - Troubleshoot various issues related to SystemML.</li> + <li><a href="release-process">Release Process</a> - Description of the SystemML release process.</li> + <li><a href="native-backend">Using Native BLAS</a> in SystemML.</li> +</ul> + + + </div> <!-- /container --> + + + + <script src="js/vendor/jquery-1.12.0.min.js"></script> + <script src="js/vendor/bootstrap.min.js"></script> + <script src="js/vendor/anchor.min.js"></script> + <script src="js/main.js"></script> + + + + + + <!-- Analytics --> + <script> + (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ + (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), + m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) + })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); + ga('create', 
'UA-71553733-1', 'auto'); + ga('send', 'pageview'); + </script> + + + + <!-- MathJax Section --> + <script type="text/x-mathjax-config"> + MathJax.Hub.Config({ + TeX: { equationNumbers: { autoNumber: "AMS" } } + }); + </script> + <script> + // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS. + // We could use "//cdn.mathjax...", but that won't support "file://". + (function(d, script) { + script = d.createElement('script'); + script.type = 'text/javascript'; + script.async = true; + script.onload = function(){ + MathJax.Hub.Config({ + tex2jax: { + inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ], + displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], + processEscapes: true, + skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] + } + }); + }; + script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') + + 'cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'; + d.getElementsByTagName('head')[0].appendChild(script); + }(document)); + </script> + </body> +</html>
