Added: systemml/site/docs/1.1.0/hadoop-batch-mode.html URL: http://svn.apache.org/viewvc/systemml/site/docs/1.1.0/hadoop-batch-mode.html?rev=1828046&view=auto ============================================================================== --- systemml/site/docs/1.1.0/hadoop-batch-mode.html (added) +++ systemml/site/docs/1.1.0/hadoop-batch-mode.html Fri Mar 30 04:31:05 2018 @@ -0,0 +1,1224 @@ +<!DOCTYPE html> +<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]--> +<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]--> +<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]--> +<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]--> + <head> + <title>Invoking SystemML in Hadoop Batch Mode - SystemML 1.1.0</title> + <meta charset="utf-8"> + <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> + + <meta name="description" content="Invoking SystemML in Hadoop Batch Mode"> + + <meta name="viewport" content="width=device-width"> + <link rel="stylesheet" href="css/bootstrap.min.css"> + <link rel="stylesheet" href="css/main.css"> + <link rel="stylesheet" href="css/pygments-default.css"> + <link rel="shortcut icon" href="img/favicon.png"> + </head> + <body> + <!--[if lt IE 7]> + <p class="chromeframe">You are using an outdated browser. <a href="http://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p> + <![endif]--> + + <header class="navbar navbar-default navbar-fixed-top" id="topbar"> + <div class="container"> + <div class="navbar-header"> + <div class="navbar-brand brand projectlogo"> + <a href="http://systemml.apache.org/"><img class="logo" src="img/systemml-logo.png" alt="Apache SystemML" title="Apache SystemML"/></a> + </div> + <div class="navbar-brand brand projecttitle"> + <a href="http://systemml.apache.org/">Apache SystemML<sup id="trademark">™</sup></a><br/> + <span class="version">1.1.0</span> + </div> + <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target=".navbar-collapse"> + <span class="sr-only">Toggle navigation</span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + </button> + </div> + <nav class="navbar-collapse collapse"> + <ul class="nav navbar-nav navbar-right"> + <li><a href="index.html">Overview</a></li> + <li><a href="https://github.com/apache/systemml">GitHub</a></li> + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">Documentation<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><b>Running SystemML:</b></li> + <li><a href="https://github.com/apache/systemml">SystemML GitHub README</a></li> + <li><a href="spark-mlcontext-programming-guide.html">Spark MLContext</a></li> + <li><a href="spark-batch-mode.html">Spark Batch Mode</a></li> + <li><a href="hadoop-batch-mode.html">Hadoop Batch Mode</a></li> + <li><a href="standalone-guide.html">Standalone Guide</a></li> + <li><a href="jmlc.html">Java Machine Learning Connector (JMLC)</a></li> + <li class="divider"></li> + <li><b>Language Guides:</b></li> + <li><a href="dml-language-reference.html">DML Language Reference</a></li> + <li><a href="beginners-guide-to-dml-and-pydml.html">Beginner's Guide to DML and PyDML</a></li> + <li><a href="beginners-guide-python.html">Beginner's Guide for Python Users</a></li> + <li><a href="python-reference.html">Reference Guide for Python Users</a></li> + <li class="divider"></li> + <li><b>ML
Algorithms:</b></li> + <li><a href="algorithms-reference.html">Algorithms Reference</a></li> + <li class="divider"></li> + <li><b>Tools:</b></li> + <li><a href="debugger-guide.html">Debugger Guide</a></li> + <li><a href="developer-tools-systemml.html">IDE Guide</a></li> + <li class="divider"></li> + <li><b>Other:</b></li> + <li><a href="contributing-to-systemml.html">Contributing to SystemML</a></li> + <li><a href="engine-dev-guide.html">Engine Developer Guide</a></li> + <li><a href="troubleshooting-guide.html">Troubleshooting Guide</a></li> + <li><a href="release-process.html">Release Process</a></li> + </ul> + </li> + + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><a href="./api/java/index.html">Java</a></li> + <li><a href="./api/python/index.html">Python</a></li> + </ul> + </li> + + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">Issues<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><b>JIRA:</b></li> + <li><a href="https://issues.apache.org/jira/browse/SYSTEMML">SystemML JIRA</a></li> + + </ul> + </li> + </ul> + </nav> + </div> + </header> + + <div class="container" id="content"> + + <h1 class="title">Invoking SystemML in Hadoop Batch Mode</h1> + + + <!-- + +--> + +<ul id="markdown-toc"> + <li><a href="#overview" id="markdown-toc-overview">Overview</a></li> + <li><a href="#hadoop-batch-mode-invocation-syntax" id="markdown-toc-hadoop-batch-mode-invocation-syntax">Hadoop Batch Mode Invocation Syntax</a></li> + <li><a href="#systemml-with-standalone-hadoop" id="markdown-toc-systemml-with-standalone-hadoop">SystemML with Standalone Hadoop</a></li> + <li><a href="#systemml-with-pseudo-distributed-hadoop" id="markdown-toc-systemml-with-pseudo-distributed-hadoop">SystemML with Pseudo-Distributed Hadoop</a></li> + <li><a href="#systemml-with-pseudo-distributed-hadoop-and-yarn" id="markdown-toc-systemml-with-pseudo-distributed-hadoop-and-yarn">SystemML with Pseudo-Distributed Hadoop and YARN</a></li> + <li><a href="#systemml-with-distributed-hadoop-and-yarn" id="markdown-toc-systemml-with-distributed-hadoop-and-yarn">SystemML with Distributed Hadoop and YARN</a> <ul> + <li><a href="#systemml-with-distributed-hadoop-and-yarn-linear-regression-example" id="markdown-toc-systemml-with-distributed-hadoop-and-yarn-linear-regression-example">SystemML with Distributed Hadoop and YARN: Linear Regression Example</a></li> + <li><a href="#systemml-with-distributed-hadoop-and-yarn-k-means-clustering-example" id="markdown-toc-systemml-with-distributed-hadoop-and-yarn-k-means-clustering-example">SystemML with Distributed Hadoop and YARN: K-Means Clustering Example</a></li> + </ul> + </li> + <li><a href="#recommended-hadoop-cluster-configuration-settings" id="markdown-toc-recommended-hadoop-cluster-configuration-settings">Recommended Hadoop Cluster Configuration Settings</a></li> +</ul> + +<p><br /></p> + +<h1 id="overview">Overview</h1> + +<p>Given that a primary purpose of SystemML is to perform machine learning on large distributed data sets, +two of the most important ways to invoke SystemML are Hadoop Batch and Spark Batch modes. +Here, we will look at SystemML’s Hadoop Batch mode in more depth.</p> + +<p>We will look at running SystemML with Standalone Hadoop, Pseudo-Distributed Hadoop, and Distributed Hadoop. +We will first run SystemML on a single machine with Hadoop running in Standalone mode. 
Next, we’ll run SystemML on HDFS +in Hadoop’s Pseudo-Distributed mode on a single machine, followed by Pseudo-Distributed mode with YARN. +After that, we’ll set up a 4-node Hadoop cluster and run SystemML on Distributed Hadoop with YARN.</p> + +<p>Note that this tutorial does not address security. For security considerations with regard to Hadoop, please +refer to the Hadoop documentation.</p> + +<hr /> + +<h1 id="hadoop-batch-mode-invocation-syntax">Hadoop Batch Mode Invocation Syntax</h1> + +<p>SystemML can be invoked in Hadoop Batch mode using the following syntax:</p> + +<pre><code>hadoop jar SystemML.jar [-? | -help | -f <filename>] (-config <config_filename>) ([-args | -nvargs] <args-list>) +</code></pre> + +<p>The <code>SystemML.jar</code> file is specified to Hadoop using the <code>jar</code> option. +The DML script to invoke is specified after the <code>-f</code> argument. Configuration settings can be passed to SystemML +using the optional <code>-config </code> argument. DML scripts can optionally take named arguments (<code>-nvargs</code>) or positional +arguments (<code>-args</code>). Named arguments are preferred over positional arguments, which are considered deprecated. +All the primary algorithm scripts included with SystemML use named arguments.</p> + +<p><strong>Example #1: DML Invocation with Named Arguments</strong></p> + +<pre><code>hadoop jar systemml/SystemML.jar -f systemml/algorithms/Kmeans.dml -nvargs X=X.mtx k=5 +</code></pre> + +<p><strong>Example #2: DML Invocation with Positional Arguments</strong></p> + +<pre><code>hadoop jar systemml/SystemML.jar -f example/test/LinearRegression.dml -args "v" "y" 0.00000001 "w" +</code></pre> + +<p>In a clustered environment, it is <em>highly</em> recommended that SystemML configuration settings are specified +in a <code>SystemML-config.xml</code> file. By default, SystemML will look for this file in the current working +directory (<code>./SystemML-config.xml</code>). This location can be overridden by the <code>-config </code> argument.</p> + +<p><strong>Example #3: DML Invocation with Configuration File Explicitly Specified and Named Arguments</strong></p> + +<pre><code>hadoop jar systemml/SystemML.jar -f systemml/algorithms/Kmeans.dml -config /conf/SystemML-config.xml -nvargs X=X.mtx k=5 +</code></pre> + +<p>For recommended SystemML configuration settings in a clustered environment, please see +<a href="hadoop-batch-mode.html#recommended-hadoop-cluster-configuration-settings">Recommended Hadoop Cluster Configuration Settings</a>.</p> + +<hr /> + +<h1 id="systemml-with-standalone-hadoop">SystemML with Standalone Hadoop</h1> + +<p>In Standalone mode, Hadoop runs on a single machine as a single Java process.</p> + +<p>To begin, I connected to my Linux server as root and created a hadoop user.</p> + +<pre><code>$ ssh [email protected] +[root@host1 ~]# useradd hadoop +[root@host1 ~]# passwd hadoop +</code></pre> + +<p>Next, I logged on as the hadoop user. I downloaded the version of Hadoop that I wanted to use from an Apache mirror. +A list of Hadoop releases can be found at the <a href="http://hadoop.apache.org/releases.html">Apache Hadoop Releases</a> website. +After downloading the Hadoop binary release, I unpacked it.</p> + +<pre><code>$ ssh [email protected] +[hadoop@host1 ~]$ wget http://mirror.sdunix.com/apache/hadoop/common/hadoop-2.6.2/hadoop-2.6.2.tar.gz +[hadoop@host1 ~]$ tar -xvzf hadoop-2.6.2.tar.gz +</code></pre> + +<p>My Linux server already had a JDK (Java Development Kit) installed.
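If you haven’t done so already, you will need Java
+installed in order to use Hadoop.</p>
+
+<p>(If a JDK is not already present, it can typically be installed with the system package manager. The command below is a sketch for a Red Hat-style system such as this one; the exact package name is an assumption and varies by distribution and Java version.)</p>
+
+<pre><code>[root@host1 ~]# yum install java-1.7.0-openjdk-devel
+</code></pre>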
+ +<p>I updated my <code>.bash_profile</code> file to export a <code>JAVA_HOME</code> environment variable, which I pointed to my JDK installation +directory. I also exported a <code>HADOOP_HOME</code> environment variable, which points to the root directory of the Hadoop release +that I unpacked. I updated the <code>PATH</code> variable to include the <code>JAVA_HOME</code> <code>bin</code> directory, the <code>HADOOP_HOME</code> <code>bin</code> directory, +and the <code>HADOOP_HOME</code> <code>sbin</code> directory.</p> + +<pre><code>[hadoop@host1 ~]$ vi .bash_profile + +... +export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk.x86_64 +export HADOOP_HOME=/home/hadoop/hadoop-2.6.2 +PATH=$JAVA_HOME/bin:$PATH:$HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin +export PATH +... + +[hadoop@host1 ~]$ source ~/.bash_profile +</code></pre> + +<p>To verify that Java and Hadoop were on the path, I used the <code>java -version</code> and <code>hadoop version</code> commands.</p> + +<pre><code>[hadoop@host1 ~]$ java -version +java version "1.7.0_79" +OpenJDK Runtime Environment (rhel-2.5.5.1.el6_6-x86_64 u79-b14) +OpenJDK 64-Bit Server VM (build 24.79-b02, mixed mode) +[hadoop@host1 ~]$ hadoop version +Hadoop 2.6.2 +Subversion https://git-wip-us.apache.org/repos/asf/hadoop.git -r 0cfd050febe4a30b1ee1551dcc527589509fb681 +Compiled by jenkins on 2015-10-22T00:42Z +Compiled with protoc 2.5.0 +From source with checksum f9ebb94bf5bf9bec892825ede28baca +This command was run using /home/hadoop/hadoop-2.6.2/share/hadoop/common/hadoop-common-2.6.2.jar +</code></pre> + +<p>Next, I downloaded a SystemML release from the <a href="http://systemml.apache.org/download.html">downloads</a> page. +Following this, I unpacked it.</p> + +<pre><code>[hadoop@host1 ~]$ tar -xvzf systemml-1.1.0.tar.gz +</code></pre> + +<p><strong>Alternatively</strong>, we could have built the SystemML distributed release using <a href="http://maven.apache.org">Apache Maven</a> and unpacked it.</p> + +<pre><code>[hadoop@host1 ~]$ git clone https://github.com/apache/systemml.git +[hadoop@host1 ~]$ cd systemml +[hadoop@host1 systemml]$ mvn clean package -P distribution +[hadoop@host1 systemml]$ tar -xvzf target/systemml-1.1.0.tar.gz -C .. +[hadoop@host1 systemml]$ cd .. +</code></pre> + +<p>I downloaded the <code>genLinearRegressionData.dml</code> script that is used in the SystemML README example.</p> + +<pre><code>[hadoop@host1 ~]$ wget https://raw.githubusercontent.com/apache/systemml/master/scripts/datagen/genLinearRegressionData.dml +</code></pre> + +<p>Next, I invoked the <code>genLinearRegressionData.dml</code> DML script in Hadoop Batch mode. +Hadoop was executed with the <code>SystemML.jar</code> file specified by the hadoop <code>jar</code> option. +The <code>genLinearRegressionData.dml</code> script was specified using the <code>-f</code> option.
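</p>
+
+<p>(For reference, the file passed to <code>-f</code> is a plain-text DML script. The following trivial script is a hypothetical example — not one shipped with SystemML — that could be invoked the same way.)</p>
+
+<pre><code>X = rand(rows=100, cols=10)
+print("mean of X: " + mean(X))
+write(X, "X.csv", format="csv")
+</code></pre>
+
+<p>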
Named input +arguments to the DML script were specified following the <code>-nvargs</code> option.</p> + +<pre><code>[hadoop@host1 ~]$ hadoop jar systemml-1.1.0/SystemML.jar -f genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5 +15/11/11 15:56:21 INFO api.DMLScript: BEGIN DML run 11/11/2015 15:56:21 +15/11/11 15:56:21 INFO api.DMLScript: HADOOP_HOME: /home/hadoop/hadoop-2.6.2 +15/11/11 15:56:21 WARN conf.DMLConfig: No default SystemML config file (./SystemML-config.xml) found +15/11/11 15:56:21 WARN conf.DMLConfig: Using default settings in DMLConfig +15/11/11 15:56:22 INFO jvm.JvmMetrics: Initializing JVM Metrics with processName=JobTracker, sessionId= +15/11/11 15:56:22 WARN hops.OptimizerUtils: Auto-disable multi-threaded text read for 'text' and 'csv' due to thread contention on JRE < 1.8 (java.version=1.7.0_79). +15/11/11 15:56:22 INFO api.DMLScript: SystemML Statistics: +Total execution time: 0.288 sec. +Number of executed MR Jobs: 0. + +15/11/11 15:56:22 INFO api.DMLScript: END DML run 11/11/2015 15:56:22 +</code></pre> + +<p>In the console output, we see a warning that no default SystemML config file was found in the current working directory. +In a distributed environment on a large data set, it is highly advisable to specify configuration settings in a SystemML config file for +optimal performance. The location of the SystemML config file can be explicitly specified using the <code>-config </code> argument.</p> + +<p>The OptimizerUtils warning occurs because parallel multi-threaded text reads in Java versions less than 1.8 result +in thread contention issues, so only a single thread reads matrix data in text formats.</p> + +<p>If we examine the contents of the directory, we see that <code>linRegData.csv</code> and <code>perc.csv</code> were written to the file system, +along with their corresponding metadata files. The <code>scratch_space</code> directory is used to write temporary matrix files.</p> + +<pre><code>[hadoop@host1 ~]$ ls -l +total 197500 +-rw-rw-r-- 1 hadoop hadoop 2208 Nov 11 15:45 genLinearRegressionData.dml +drwxr-xr-x 9 hadoop hadoop 4096 Oct 21 17:53 hadoop-2.6.2 +-rw-rw-r-- 1 hadoop hadoop 195515434 Oct 30 14:04 hadoop-2.6.2.tar.gz +drwxrwxrwx 2 hadoop hadoop 4096 Nov 11 15:56 linRegData.csv +-rw-r--r-- 1 hadoop hadoop 214 Nov 11 15:56 linRegData.csv.mtd +drwxrwxrwx 2 hadoop hadoop 4096 Nov 11 15:56 perc.csv +-rw-r--r-- 1 hadoop hadoop 206 Nov 11 15:56 perc.csv.mtd +drwxrwxrwx 2 hadoop hadoop 4096 Nov 11 15:56 scratch_space +drwxrwxr-x 4 hadoop hadoop 4096 Nov 11 15:42 systemml-1.1.0 +-rw-rw-r-- 1 hadoop hadoop 6683281 Oct 27 21:13 systemml-1.1.0.tar.gz +</code></pre> + +<p>To clean things up, I’ll delete the files that were generated.</p> + +<pre><code>[hadoop@host1 ~]$ rm -r *.csv +[hadoop@host1 ~]$ rm *.csv.mtd +[hadoop@host1 ~]$ rmdir scratch_space/ +</code></pre> + +<hr /> + +<h1 id="systemml-with-pseudo-distributed-hadoop">SystemML with Pseudo-Distributed Hadoop</h1> + +<p>Next, we’ll look at running SystemML with Hadoop in Pseudo-Distributed mode. 
In Pseudo-Distributed mode, each Hadoop daemon +(such as NameNode and DataNode) runs in a separate Java process on a single machine.</p> + +<p>In the previous section about Hadoop Standalone mode, we set up the <code>JAVA_HOME</code> and <code>HADOOP_HOME</code> environment variables +and added <code>JAVA_HOME/bin</code>, <code>HADOOP_HOME/bin</code>, and <code>HADOOP_HOME/sbin</code> to the <code>PATH</code> in <code>.bash_profile</code>.</p> + +<p>We also need to set the <code>JAVA_HOME</code> value in the <code>hadoop-env.sh</code> file in the Hadoop configuration directory (<code>etc/hadoop</code>).</p> + +<pre><code>[hadoop@host1 hadoop]$ pwd +/home/hadoop/hadoop-2.6.2/etc/hadoop +[hadoop@host1 hadoop]$ vi hadoop-env.sh + +... +export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk.x86_64 +... +</code></pre> + +<p>We need to be able to passwordlessly <code>ssh</code> to localhost. To do so, I’ll generate a public key/private key pair and add +the public key to the hadoop user’s <code>authorized_keys</code>. We can <code>ssh</code> to localhost to verify that we can connect without +a password.</p> + +<pre><code>[hadoop@host1 ~]$ ssh-keygen -t rsa -b 4096 -C "hadoop example" +Your identification has been saved in /home/hadoop/.ssh/id_rsa. +Your public key has been saved in /home/hadoop/.ssh/id_rsa.pub. +[hadoop@host1 ~]$ cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys +[hadoop@host1 ~]$ chmod 600 ~/.ssh/authorized_keys +[hadoop@host1 ~]$ ssh localhost +The authenticity of host 'localhost (::1)' can't be established. +RSA key fingerprint is 6b:86:78:86:13:0a:49:d4:c7:a7:15:10:d1:27:88:9e. +Are you sure you want to continue connecting (yes/no)? yes +Warning: Permanently added 'localhost' (RSA) to the list of known hosts. +[hadoop@host1 ~]$ exit +logout +Connection to localhost closed. +[hadoop@host1 ~]$ ls -l .ssh +total 16 +-rw------- 1 hadoop hadoop 736 Nov 11 16:44 authorized_keys +-rw------- 1 hadoop hadoop 3243 Nov 11 16:41 id_rsa +-rw-r--r-- 1 hadoop hadoop 736 Nov 11 16:41 id_rsa.pub +-rw-r--r-- 1 hadoop hadoop 391 Nov 11 16:46 known_hosts +</code></pre> + +<p>In the Hadoop configuration directory (<code>etc/hadoop</code>), in the <code>core-site.xml</code> file, we specify the <code>fs.defaultFS</code> +property to be <code>localhost</code> with port <code>9000</code>.</p> + +<pre><code>[hadoop@host1 hadoop]$ vi core-site.xml + +... +<configuration> + <property> + <name>fs.defaultFS</name> + <value>hdfs://localhost:9000</value> + </property> +</configuration> +... +</code></pre> + +<p>By default, HDFS replicates data on three nodes. Since we’re running on a single machine, we’ll change this to one. +We’ll add a <code>dfs.replication</code> property to <code>hdfs-site.xml</code> and set its value to <code>1</code>.</p> + +<pre><code>[hadoop@host1 hadoop]$ vi hdfs-site.xml + +... +<configuration> + <property> + <name>dfs.replication</name> + <value>1</value> + </property> +</configuration> +... +</code></pre> + +<p>Next, we’ll format HDFS.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs namenode -format +15/11/11 17:23:33 INFO namenode.NameNode: STARTUP_MSG: +/************************************************************ +STARTUP_MSG: Starting NameNode +STARTUP_MSG: host = host1.example.com/9.30.252.15 +STARTUP_MSG: args = [-format] +STARTUP_MSG: version = 2.6.2 +... +STARTUP_MSG: java = 1.7.0_79 +************************************************************/ +... +15/11/11 17:23:34 INFO common.Storage: Storage directory /tmp/hadoop-hadoop/dfs/name has been successfully formatted. +... 
+/************************************************************ +SHUTDOWN_MSG: Shutting down NameNode at host1.example.com/9.30.252.15 +************************************************************/ +</code></pre> + +<p>We’ll start up HDFS using the <code>start-dfs.sh</code> script. This starts the NameNode, DataNode, and SecondaryNameNode daemons +on the single machine.</p> + +<pre><code>[hadoop@host1 ~]$ start-dfs.sh +Starting namenodes on [localhost] +localhost: starting namenode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-namenode-host1.out +localhost: starting datanode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-datanode-host1.out +Starting secondary namenodes [0.0.0.0] +The authenticity of host '0.0.0.0 (0.0.0.0)' can't be established. +RSA key fingerprint is 6b:86:78:86:13:0a:49:d4:c7:a7:15:10:d1:27:88:9e. +Are you sure you want to continue connecting (yes/no)? yes +0.0.0.0: Warning: Permanently added '0.0.0.0' (RSA) to the list of known hosts. +0.0.0.0: starting secondarynamenode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-secondarynamenode-host1.out +</code></pre> + +<p>We can see the running Java processes using the <code>jps</code> command.</p> + +<pre><code>[hadoop@host1 ~]$ jps +36128 Jps +35844 DataNode +36007 SecondaryNameNode +35722 NameNode +</code></pre> + +<p>Here, we can see detailed information about the Java processes that were started.</p> + +<pre><code>[hadoop@host1 ~]$ ps -C java -f -ww +UID PID PPID C STIME TTY TIME CMD +hadoop 35722 1 5 17:38 ? 00:00:05 /usr/lib/jvm/java-1.7.0-openjdk.x86_64/bin/java -Dproc_namenode -Xmx1000m -Djava.net.preferIPv4Stack=true -Dhadoop.log.dir=/home/hadoop/hadoop-2.6.2/logs -Dhadoop.log.file=hadoop.log -Dhadoop.home.dir=/home/hadoop/hadoop-2.6.2 -Dhadoop.id.str=hadoop -Dhadoop.root.logger=INFO,console -Djava.library.path=/home/hadoop/hadoop-2.6.2/lib/native -Dhadoop.policy.file=hadoop-policy.xml -Djava.net.preferIPv4Stack=true -Djava.net.preferIPv4Stack=true -Djava.net.preferIPv4Stack=true -Dhadoop.log.dir=/home/hadoop/hadoop-2.6.2/logs -Dhadoop.log.file=hadoop-hadoop-namenode-host1.log -Dhadoop.home.dir=/home/hadoop/hadoop-2.6.2 -Dhadoop.id.str=hadoop -Dhadoop.root.logger=INFO,RFA -Djava.library.path=/home/hadoop/hadoop-2.6.2/lib/native -Dhadoop.policy.file=hadoop-policy.xml -Djava.net.preferIPv4Stack=true -Dhadoop.security.logger=INFO,RFAS -Dhdfs.audit.logger=INFO,NullAppender -Dhadoop.security.logger=INFO,RFAS -Dhdfs.audit.logger=INFO,NullAppender -Dhadoop.security.logger=INFO,RFAS -Dhdfs.audit.logger=INFO,NullAppender -Dhadoop.security.logger=INFO,RFAS org.apache.hadoop.hdfs.server.namenode.NameNode +hadoop 35844 1 4 17:38 ?
00:00:04 /usr/lib/jvm/java-1.7.0-openjdk.x86_64/bin/java -Dproc_datanode -Xmx1000m -Djava.net.preferIPv4Stack=true -Dhadoop.log.dir=/home/hadoop/hadoop-2.6.2/logs -Dhadoop.log.file=hadoop.log -Dhadoop.home.dir=/home/hadoop/hadoop-2.6.2 -Dhadoop.id.str=hadoop -Dhadoop.root.logger=INFO,console -Djava.library.path=/home/hadoop/hadoop-2.6.2/lib/native -Dhadoop.policy.file=hadoop-policy.xml -Djava.net.preferIPv4Stack=true -Djava.net.preferIPv4Stack=true -Djava.net.preferIPv4Stack=true -Dhadoop.log.dir=/home/hadoop/hadoop-2.6.2/logs -Dhadoop.log.file=hadoop-hadoop-datanode-host1.log -Dhadoop.home.dir=/home/hadoop/hadoop-2.6.2 -Dhadoop.id.str=hadoop -Dhadoop.root.logger=INFO,RFA -Djava.library.path=/home/hadoop/hadoop-2.6.2/lib/native -Dhadoop.policy.file=hadoop-policy.xml -Djava.net.preferIPv4Stack=true -server -Dhadoop.security.logger=ERROR,RFAS -Dhadoop.security.logger=ERROR,RFAS -Dhadoop.security.logger=ERROR,RFAS -Dhadoop.security.logger=INFO,RFAS org.apache.hadoop.hdfs.server.datanode.DataNode +hadoop 36007 1 5 17:38 ? 00:00:04 /usr/lib/jvm/java-1.7.0-openjdk.x86_64/bin/java -Dproc_secondarynamenode -Xmx1000m -Djava.net.preferIPv4Stack=true -Dhadoop.log.dir=/home/hadoop/hadoop-2.6.2/logs -Dhadoop.log.file=hadoop.log -Dhadoop.home.dir=/home/hadoop/hadoop-2.6.2 -Dhadoop.id.str=hadoop -Dhadoop.root.logger=INFO,console -Djava.library.path=/home/hadoop/hadoop-2.6.2/lib/native -Dhadoop.policy.file=hadoop-policy.xml -Djava.net.preferIPv4Stack=true -Djava.net.preferIPv4Stack=true -Djava.net.preferIPv4Stack=true -Dhadoop.log.dir=/home/hadoop/hadoop-2.6.2/logs -Dhadoop.log.file=hadoop-hadoop-secondarynamenode-host1.log -Dhadoop.home.dir=/home/hadoop/hadoop-2.6.2 -Dhadoop.id.str=hadoop -Dhadoop.root.logger=INFO,RFA -Djava.library.path=/home/hadoop/hadoop-2.6.2/lib/native -Dhadoop.policy.file=hadoop-policy.xml -Djava.net.preferIPv4Stack=true -Dhadoop.security.logger=INFO,RFAS -Dhdfs.audit.logger=INFO,NullAppender -Dhadoop.security.logger=INFO,RFAS -Dhdfs.audit.logger=INFO,NullAppender -Dhadoop.security.logger=INFO,RFAS -Dhdfs.audit.logger=INFO,NullAppender -Dhadoop.security.logger=INFO,RFAS org.apache.hadoop.hdfs.server.namenode.SecondaryNameNode +</code></pre> + +<p>Useful log information is created by default in the hadoop <code>logs</code> directory.</p> + +<p>If everything worked correctly, we can hit port 50070 in a browser (http://host1.example.com:50070) to see Hadoop information.</p> + +<p>If we look at our HDFS file system, we see that it currently doesn’t contain any files.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs dfs -ls +ls: `.': No such file or directory +</code></pre> + +<p>Let’s go ahead and execute the <code>genLinearRegressionData.dml</code> script in Hadoop Pseudo-Distributed mode.</p> + +<pre><code>[hadoop@host1 ~]$ hadoop jar systemml-1.1.0/SystemML.jar -f genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5 +15/11/11 18:16:33 INFO api.DMLScript: BEGIN DML run 11/11/2015 18:16:33 +15/11/11 18:16:33 INFO api.DMLScript: HADOOP_HOME: /home/hadoop/hadoop-2.6.2 +15/11/11 18:16:33 WARN conf.DMLConfig: No default SystemML config file (./SystemML-config.xml) found +15/11/11 18:16:33 WARN conf.DMLConfig: Using default settings in DMLConfig +15/11/11 18:16:33 INFO jvm.JvmMetrics: Initializing JVM Metrics with processName=JobTracker, sessionId= +15/11/11 18:16:33 WARN hops.OptimizerUtils: Auto-disable multi-threaded text read for 'text' and 'csv' due to thread contention on
JRE < 1.8 (java.version=1.7.0_79). +15/11/11 18:16:35 INFO api.DMLScript: SystemML Statistics: +Total execution time: 1.484 sec. +Number of executed MR Jobs: 0. + +15/11/11 18:16:35 INFO api.DMLScript: END DML run 11/11/2015 18:16:35 +</code></pre> + +<p>If we list the contents of the current directory in our regular file system, we see that no files have been written +to the regular file system.</p> + +<pre><code>[hadoop@host1 ~]$ ls +genLinearRegressionData.dml hadoop-2.6.2 hadoop-2.6.2.tar.gz systemml-1.1.0 systemml-1.1.0.tar.gz +</code></pre> + +<p>If we list the contents of the HDFS file system, we see that HDFS contains our data files and the corresponding metadata files.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs dfs -ls +Found 5 items +drwxr-xr-x - hadoop supergroup 0 2015-11-11 18:16 linRegData.csv +-rw-r--r-- 1 hadoop supergroup 214 2015-11-11 18:16 linRegData.csv.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-11 18:16 perc.csv +-rw-r--r-- 1 hadoop supergroup 206 2015-11-11 18:16 perc.csv.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-11 18:16 scratch_space +</code></pre> + +<p>If we examine the Hadoop web interface mentioned previously, we see that the files, directories, and blocks in HDFS have +increased in number.</p> + +<p>Now that we’re done with this example, I’ll clean things up and delete the generated files from HDFS.</p> + +<pre><code>[hadoop@host1 hadoop]$ hdfs dfs -rm -r *.csv +[hadoop@host1 hadoop]$ hdfs dfs -rm *.mtd +[hadoop@host1 hadoop]$ hdfs dfs -rmdir scratch_space +</code></pre> + +<p>I’ll stop HDFS using the <code>stop-dfs.sh</code> script and then verify that the Java processes have stopped.</p> + +<pre><code>[hadoop@host1 ~]$ stop-dfs.sh +Stopping namenodes on [localhost] +localhost: stopping namenode +localhost: stopping datanode +Stopping secondary namenodes [0.0.0.0] +0.0.0.0: stopping secondarynamenode + +[hadoop@host1 ~]$ jps +37337 Jps +</code></pre> + +<hr /> + +<h1 id="systemml-with-pseudo-distributed-hadoop-and-yarn">SystemML with Pseudo-Distributed Hadoop and YARN</h1> + +<p>To add YARN to Pseudo-Distributed Hadoop on the single machine, we need to take our setup from the +previous example and update two configuration +files and start the ResourceManager and NodeManager daemons.</p> + +<p>In the <code>mapred-site.xml</code> configuration file, we specify the +<code>mapreduce.framework.name</code> property as <code>yarn</code>.</p> + +<pre><code>[hadoop@host1 hadoop]$ pwd +/home/hadoop/hadoop-2.6.2/etc/hadoop +[hadoop@host1 hadoop]$ cp mapred-site.xml.template mapred-site.xml +[hadoop@host1 hadoop]$ vi mapred-site.xml + +... +<configuration> + <property> + <name>mapreduce.framework.name</name> + <value>yarn</value> + </property> +</configuration> +... +</code></pre> + +<p>In the <code>yarn-site.xml</code> configuration file, we specify the <code>yarn.nodemanager.aux-services</code> property +to be <code>mapreduce_shuffle</code>.</p> + +<pre><code>[hadoop@host1 hadoop]$ vi yarn-site.xml + +... +<configuration> + <property> + <name>yarn.nodemanager.aux-services</name> + <value>mapreduce_shuffle</value> + </property> +</configuration> +... 
+</code></pre> + +<p>Next, we’ll start HDFS using the <code>start-dfs.sh</code> script.</p> + +<pre><code>[hadoop@host1 hadoop]$ start-dfs.sh +Starting namenodes on [localhost] +localhost: starting namenode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-namenode-host1.out +localhost: starting datanode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-datanode-host1.out +Starting secondary namenodes [0.0.0.0] +0.0.0.0: starting secondarynamenode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-secondarynamenode-host1.out +</code></pre> + +<p>After that, we’ll start YARN using the <code>start-yarn.sh</code> script.</p> + +<pre><code>[hadoop@host1 hadoop]$ start-yarn.sh +starting yarn daemons +starting resourcemanager, logging to /home/hadoop/hadoop-2.6.2/logs/yarn-hadoop-resourcemanager-host1.out +localhost: starting nodemanager, logging to /home/hadoop/hadoop-2.6.2/logs/yarn-hadoop-nodemanager-host1.out +</code></pre> + +<p>We can use the <code>jps</code> command to verify that the HDFS daemons (NameNode, DataNode, and SecondaryNameNode) and YARN +daemons (ResourceManager and NodeManager) are running.</p> + +<pre><code>[hadoop@host1 hadoop]$ jps +52046 ResourceManager +52482 Jps +52149 NodeManager +51582 NameNode +51712 DataNode +51880 SecondaryNameNode +</code></pre> + +<p>We can now view YARN information via the web interface on port 8088 (http://host1.example.com:8088).</p> + +<p>I’ll execute the <code>genLinearRegressionData.dml</code> example that we’ve previously considered.</p> + +<pre><code>[hadoop@host1 hadoop]$ cd ~ +[hadoop@host1 ~]$ hadoop jar systemml-1.1.0/SystemML.jar -f genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5 +15/11/12 11:57:04 INFO api.DMLScript: BEGIN DML run 11/12/2015 11:57:04 +15/11/12 11:57:04 INFO api.DMLScript: HADOOP_HOME: /home/hadoop/hadoop-2.6.2 +15/11/12 11:57:04 WARN conf.DMLConfig: No default SystemML config file (./SystemML-config.xml) found +15/11/12 11:57:04 WARN conf.DMLConfig: Using default settings in DMLConfig +15/11/12 11:57:05 INFO client.RMProxy: Connecting to ResourceManager at /0.0.0.0:8032 +15/11/12 11:57:06 WARN hops.OptimizerUtils: Auto-disable multi-threaded text read for 'text' and 'csv' due to thread contention on JRE < 1.8 (java.version=1.7.0_79). +15/11/12 11:57:07 INFO api.DMLScript: SystemML Statistics: +Total execution time: 1.265 sec. +Number of executed MR Jobs: 0. 
+ +15/11/12 11:57:07 INFO api.DMLScript: END DML run 11/12/2015 11:57:07 +</code></pre> + +<p>If we examine the HDFS file system, we see the files generated by the execution of the DML script by SystemML on Hadoop.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs dfs -ls +Found 5 items +drwxr-xr-x - hadoop supergroup 0 2015-11-12 11:57 linRegData.csv +-rw-r--r-- 1 hadoop supergroup 214 2015-11-12 11:57 linRegData.csv.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-12 11:57 perc.csv +-rw-r--r-- 1 hadoop supergroup 206 2015-11-12 11:57 perc.csv.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-12 11:57 scratch_space +</code></pre> + +<p>I’ll go ahead and delete the generated example files from HDFS.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs dfs -rm -r *.csv +[hadoop@host1 ~]$ hdfs dfs -rm *.mtd +[hadoop@host1 ~]$ hdfs dfs -rmdir scratch_space +</code></pre> + +<p>We’ll stop the YARN daemons using the <code>stop-yarn.sh</code> script.</p> + +<pre><code>[hadoop@host1 ~]$ stop-yarn.sh +stopping yarn daemons +stopping resourcemanager +localhost: stopping nodemanager +no proxyserver to stop +</code></pre> + +<p>We can stop HDFS with the <code>stop-dfs.sh</code> script.</p> + +<pre><code>[hadoop@host1 ~]$ stop-dfs.sh +Stopping namenodes on [localhost] +localhost: stopping namenode +localhost: stopping datanode +Stopping secondary namenodes [0.0.0.0] +0.0.0.0: stopping secondarynamenode +</code></pre> + +<p>If we list the running Java processes, we see all the YARN daemons and HDFS daemons have stopped.</p> + +<pre><code>[hadoop@host1 ~]$ jps +53459 Jps +</code></pre> + +<p>For cleanliness, I’ll also delete the <code>/tmp/hadoop-hadoop</code> files created by Hadoop before proceeding to +the next example.</p> + +<hr /> + +<h1 id="systemml-with-distributed-hadoop-and-yarn">SystemML with Distributed Hadoop and YARN</h1> + +<p>In our previous example, we ran SystemML on Hadoop in Pseudo-Distributed mode with YARN on a single machine. +This example will look at Distributed Hadoop with YARN on a 4-node cluster. Each server is running +Red Hat Enterprise Linux Server, release 6.6.</p> + +<p>I have 4 nodes: host1, host2, host3, and host4. The host1 node +that we previously set up will act as the master for both HDFS and YARN, +and host2, host3, and host4 will be slaves. For more information regarding +network configurations, please see the Hadoop documentation.</p> + +<p>First, I created a hadoop user on each slave node.</p> + +<pre><code>[root@host1 ~]# ssh [email protected] +[root@host2 ~]# useradd hadoop +[root@host2 ~]# passwd hadoop +[root@host2 ~]# exit + +[root@host1 ~]# ssh [email protected] +[root@host3 ~]# useradd hadoop +[root@host3 ~]# passwd hadoop +[root@host3 ~]# exit + +[root@host1 ~]# ssh [email protected] +[root@host4 ~]# useradd hadoop +[root@host4 ~]# passwd hadoop +[root@host4 ~]# exit +</code></pre> + +<p>Next, I set up passwordless login from the hadoop user on the master node (host1) +to each of the slave nodes. The <code>ssh-copy-id</code> command copied the master node’s hadoop user’s +public key value to the ~/.ssh/authorized_keys file of each of the slave nodes.
I +tested the passwordless login from the master node to each of the slave nodes for the hadoop +user.</p> + +<pre><code>$ ssh [email protected] + +[hadoop@host1 ~]$ ssh-copy-id host2.example.com +[hadoop@host1 ~]$ ssh [email protected] +Last login: Thu Nov 12 14:16:21 2015 +[hadoop@host2 ~]$ exit + +[hadoop@host1 ~]$ ssh-copy-id host3.example.com +[hadoop@host1 ~]$ ssh [email protected] +Last login: Thu Nov 12 14:16:40 2015 +[hadoop@host3 ~]$ exit + +[hadoop@host1 ~]$ ssh-copy-id host4.example.com +[hadoop@host1 ~]$ ssh [email protected] +Last login: Thu Nov 12 14:17:10 2015 +[hadoop@host4 ~]$ exit +</code></pre> + +<p>On the master node, I specified the slave nodes in the Hadoop <code>slaves</code> configuration file.</p> + +<pre><code>[hadoop@host1 hadoop]$ pwd +/home/hadoop/hadoop-2.6.2/etc/hadoop +[hadoop@host1 hadoop]$ more slaves +host2.example.com +host3.example.com +host4.example.com +</code></pre> + +<p>In the <code>core-site.xml</code> file, I specified the <code>fs.defaultFS</code> property to reference the master node.</p> + +<pre><code>[hadoop@host1 hadoop]$ more core-site.xml + +... +<configuration> + <property> + <name>fs.defaultFS</name> + <value>hdfs://host1.example.com:9000</value> + </property> +</configuration> +... +</code></pre> + +<p>In the <code>hdfs-site.xml</code> configuration file, I removed the previous <code>dfs.replication</code> property, since we +will use the default replication value (of 3).</p> + +<pre><code>[hadoop@host1 hadoop]$ more hdfs-site.xml + +... +<configuration> +</configuration> +... +</code></pre> + +<p>We’ll be using YARN, so our <code>mapred-site.xml</code> will have the <code>mapreduce.framework.name</code> +property set to <code>yarn</code>, as in the previous example. Additionally, we’ll set the <code>mapreduce.map.java.opts</code> and +<code>mapreduce.reduce.java.opts</code> properties to <code>-Xmx2g -Xms2g -Xmn200m</code>. The <code>-Xmn</code> parameter fixes the +size of the young generation and typically is set to 10% of the maximum heap, which we have set to 2g. +Furthermore, we’ll set <code>mapreduce.map.memory.mb</code> and <code>mapreduce.reduce.memory.mb</code> to <code>3072</code>. Typically these +values are set to at least 1.5 times the value of the maximum heap size.</p> + +<pre><code>[hadoop@host1 hadoop]$ more mapred-site.xml + +... +<configuration> + <property> + <name>mapreduce.framework.name</name> + <value>yarn</value> + </property> + <property> + <name>mapreduce.map.java.opts</name> + <value>-Xmx2g -Xms2g -Xmn200m</value> + </property> + <property> + <name>mapreduce.reduce.java.opts</name> + <value>-Xmx2g -Xms2g -Xmn200m</value> + </property> + <property> + <name>mapreduce.map.memory.mb</name> + <value>3072</value> + </property> + <property> + <name>mapreduce.reduce.memory.mb</name> + <value>3072</value> + </property> +</configuration> +... +</code></pre> + +<p>In the <code>yarn-site.xml</code> configuration file, I added a <code>yarn.resourcemanager.hostname</code> property and specified +the master node as the host.</p> + +<pre><code>[hadoop@host1 hadoop]$ more yarn-site.xml + +... +<configuration> + <property> + <name>yarn.nodemanager.aux-services</name> + <value>mapreduce_shuffle</value> + </property> + <property> + <name>yarn.resourcemanager.hostname</name> + <value>host1.example.com</value> + </property> +</configuration> +... +</code></pre> + +<p>In the previous example, we specified the <code>JAVA_HOME</code> in the <code>hadoop-env.sh</code> configuration script. 
+We will use that same value.</p> + +<pre><code>[hadoop@host1 hadoop]$ more hadoop-env.sh + +... +export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk.x86_64 +... +</code></pre> + +<p>Next, I copied my hadoop installation (which includes all of the mentioned configuration settings) +to each slave node.</p> + +<pre><code>[hadoop@host1 ~]$ pwd +/home/hadoop +[hadoop@host1 ~]$ scp -r hadoop-2.6.2 [email protected]:~/ +[hadoop@host1 ~]$ scp -r hadoop-2.6.2 [email protected]:~/ +[hadoop@host1 ~]$ scp -r hadoop-2.6.2 [email protected]:~/ +</code></pre> + +<p>My master node <code>.bash_profile</code> contains <code>JAVA_HOME</code> and <code>HADOOP_HOME</code> environment variables +and adds <code>JAVA_HOME/bin</code>, <code>HADOOP_HOME/bin</code> and <code>HADOOP_HOME/sbin</code> to the <code>PATH</code>.</p> + +<pre><code>... +export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk.x86_64 +export HADOOP_HOME=/home/hadoop/hadoop-2.6.2 +PATH=$JAVA_HOME/bin:$PATH:$HOME/bin:$HADOOP_HOME/bin:$HADOOP_HOME/sbin +export PATH +... +</code></pre> + +<p>I copied the <code>.bash_profile</code> file to the slave nodes.</p> + +<pre><code>[hadoop@host1 ~]$ pwd +/home/hadoop +[hadoop@host1 ~]$ scp .bash_profile [email protected]:~/.bash_profile +[hadoop@host1 ~]$ scp .bash_profile [email protected]:~/.bash_profile +[hadoop@host1 ~]$ scp .bash_profile [email protected]:~/.bash_profile +</code></pre> + +<p>On the master, I formatted HDFS.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs namenode -format +</code></pre> + +<p>Next, on the master, I started HDFS using <code>start-dfs.sh</code>. We can see that the master NameNode +and the slave DataNodes started up.</p> + +<pre><code>[hadoop@host1 ~]$ start-dfs.sh +Starting namenodes on [host1.example.com] +host1.example.com: starting namenode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-namenode-host1.out +host4.example.com: starting datanode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-datanode-host4.out +host2.example.com: starting datanode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-datanode-host2.out +host3.example.com: starting datanode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-datanode-host3.out +Starting secondary namenodes [0.0.0.0] +0.0.0.0: starting secondarynamenode, logging to /home/hadoop/hadoop-2.6.2/logs/hadoop-hadoop-secondarynamenode-host1.out +</code></pre> + +<p>Next I started YARN using the <code>start-yarn.sh</code> script. 
We see the master ResourceManager and the +slave NodeManagers started up.</p> + +<pre><code>[hadoop@host1 ~]$ start-yarn.sh +starting yarn daemons +starting resourcemanager, logging to /home/hadoop/hadoop-2.6.2/logs/yarn-hadoop-resourcemanager-host1.out +host3.example.com: starting nodemanager, logging to /home/hadoop/hadoop-2.6.2/logs/yarn-hadoop-nodemanager-host3.out +host2.example.com: starting nodemanager, logging to /home/hadoop/hadoop-2.6.2/logs/yarn-hadoop-nodemanager-host2.out +host4.example.com: starting nodemanager, logging to /home/hadoop/hadoop-2.6.2/logs/yarn-hadoop-nodemanager-host4.out +</code></pre> + +<p>On the master, we see that the NameNode, SecondaryNameNode, and ResourceManager daemons are running.</p> + +<pre><code>[hadoop@host1 ~]$ jps +1563 NameNode +1775 SecondaryNameNode +2240 Jps +1978 ResourceManager +</code></pre> + +<p>On the slaves, we see that the DataNode and NodeManager daemons are running.</p> + +<pre><code>[hadoop@host2 ~]$ jps +29096 Jps +28974 NodeManager +28821 DataNode + +[hadoop@host3 ~]$ jps +5950 Jps +5706 DataNode +5819 NodeManager + +[hadoop@host4 ~]$ jps +16388 Jps +16153 DataNode +16266 NodeManager +</code></pre> + +<p>If we look at the Hadoop (on port 50070) and YARN (on port 8088) web interfaces, we can see information about our running cluster.</p> + +<hr /> + +<h2 id="systemml-with-distributed-hadoop-and-yarn-linear-regression-example">SystemML with Distributed Hadoop and YARN: Linear Regression Example</h2> + +<p>Let’s go ahead and run the SystemML example from the GitHub README.</p> + +<pre><code>[hadoop@host1 ~]$ hadoop jar systemml-1.1.0/SystemML.jar -f genLinearRegressionData.dml -nvargs numSamples=1000 numFeatures=50 maxFeatureValue=5 maxWeight=5 addNoise=FALSE b=0 sparsity=0.7 output=linRegData.csv format=csv perc=0.5 + +[hadoop@host1 ~]$ hadoop jar systemml-1.1.0/SystemML.jar -f systemml-1.1.0/algorithms/utils/sample.dml -nvargs X=linRegData.csv sv=perc.csv O=linRegDataParts ofmt=csv + +[hadoop@host1 ~]$ hadoop jar systemml-1.1.0/SystemML.jar -f systemml-1.1.0/algorithms/utils/splitXY.dml -nvargs X=linRegDataParts/1 y=51 OX=linRegData.train.data.csv OY=linRegData.train.labels.csv ofmt=csv + +[hadoop@host1 ~]$ hadoop jar systemml-1.1.0/SystemML.jar -f systemml-1.1.0/algorithms/utils/splitXY.dml -nvargs X=linRegDataParts/2 y=51 OX=linRegData.test.data.csv OY=linRegData.test.labels.csv ofmt=csv + +[hadoop@host1 ~]$ hadoop jar systemml-1.1.0/SystemML.jar -f systemml-1.1.0/algorithms/LinearRegDS.dml -nvargs X=linRegData.train.data.csv Y=linRegData.train.labels.csv B=betas.csv fmt=csv +... +BEGIN LINEAR REGRESSION SCRIPT +Reading X and Y... +Calling the Direct Solver... +Computing the statistics... +AVG_TOT_Y,-0.051722694902638956 +STDEV_TOT_Y,54.132787822718356 +AVG_RES_Y,1.5905895170230406E-10 +STDEV_RES_Y,2.0668015575844624E-8 +DISPERSION,4.262683023432828E-16 +R2,1.0 +ADJUSTED_R2,1.0 +R2_NOBIAS,1.0 +ADJUSTED_R2_NOBIAS,1.0 +R2_VS_0,1.0 +ADJUSTED_R2_VS_0,1.0 +Writing the output matrix... +END LINEAR REGRESSION SCRIPT +15/11/17 15:50:34 INFO api.DMLScript: SystemML Statistics: +Total execution time: 0.480 sec. +... + +[hadoop@host1 ~]$ hadoop jar systemml-1.1.0/SystemML.jar -f systemml-1.1.0/algorithms/GLM-predict.dml -nvargs X=linRegData.test.data.csv Y=linRegData.test.labels.csv B=betas.csv fmt=csv +... 
+LOGLHOOD_Z,,FALSE,NaN +LOGLHOOD_Z_PVAL,,FALSE,NaN +PEARSON_X2,,FALSE,2.5039962709907123E-13 +PEARSON_X2_BY_DF,,FALSE,5.703863943031236E-16 +PEARSON_X2_PVAL,,FALSE,1.0 +DEVIANCE_G2,,FALSE,0.0 +DEVIANCE_G2_BY_DF,,FALSE,0.0 +DEVIANCE_G2_PVAL,,FALSE,1.0 +LOGLHOOD_Z,,TRUE,NaN +LOGLHOOD_Z_PVAL,,TRUE,NaN +PEARSON_X2,,TRUE,2.5039962709907123E-13 +PEARSON_X2_BY_DF,,TRUE,5.703863943031236E-16 +PEARSON_X2_PVAL,,TRUE,1.0 +DEVIANCE_G2,,TRUE,0.0 +DEVIANCE_G2_BY_DF,,TRUE,0.0 +DEVIANCE_G2_PVAL,,TRUE,1.0 +AVG_TOT_Y,1,,0.9381218622147646 +STDEV_TOT_Y,1,,55.6116696631821 +AVG_RES_Y,1,,2.5577864570734575E-10 +STDEV_RES_Y,1,,2.390848397359923E-8 +PRED_STDEV_RES,1,TRUE,1.0 +R2,1,,1.0 +ADJUSTED_R2,1,,1.0 +R2_NOBIAS,1,,1.0 +ADJUSTED_R2_NOBIAS,1,,1.0 +15/11/17 15:51:17 INFO api.DMLScript: SystemML Statistics: +Total execution time: 0.269 sec. +... +</code></pre> + +<p>If we look at HDFS, we can see the files that were generated by the SystemML DML script executions.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs dfs -ls +Found 16 items +drwxr-xr-x - hadoop supergroup 0 2015-11-17 15:50 betas.csv +-rw-r--r-- 3 hadoop supergroup 208 2015-11-17 15:50 betas.csv.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-17 15:21 linRegData.csv +-rw-r--r-- 3 hadoop supergroup 214 2015-11-17 15:21 linRegData.csv.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-17 15:50 linRegData.test.data.csv +-rw-r--r-- 3 hadoop supergroup 213 2015-11-17 15:50 linRegData.test.data.csv.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-17 15:50 linRegData.test.labels.csv +-rw-r--r-- 3 hadoop supergroup 210 2015-11-17 15:50 linRegData.test.labels.csv.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-17 15:49 linRegData.train.data.csv +-rw-r--r-- 3 hadoop supergroup 213 2015-11-17 15:49 linRegData.train.data.csv.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-17 15:49 linRegData.train.labels.csv +-rw-r--r-- 3 hadoop supergroup 210 2015-11-17 15:49 linRegData.train.labels.csv.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-17 15:49 linRegDataParts +drwxr-xr-x - hadoop supergroup 0 2015-11-17 15:21 perc.csv +-rw-r--r-- 3 hadoop supergroup 206 2015-11-17 15:21 perc.csv.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-17 15:21 scratch_space +</code></pre> + +<p>Before the next example, I’ll delete the files created in HDFS by this example.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs dfs -rm -r linRegData* +[hadoop@host1 ~]$ hdfs dfs -rm -r *.csv +[hadoop@host1 ~]$ hdfs dfs -rm -r *.mtd +</code></pre> + +<hr /> + +<h2 id="systemml-with-distributed-hadoop-and-yarn-k-means-clustering-example">SystemML with Distributed Hadoop and YARN: K-Means Clustering Example</h2> + +<p>Our previous example showed SystemML running in Hadoop Batch mode on a 4-node cluster with YARN. +However, the size of the data used was trivial. In this example, we’ll generate a slightly larger set +of data and then analyze that data with the <code>Kmeans.dml</code> and <code>Kmeans-predict.dml</code> scripts. +Information about the SystemML K-means clustering algorithm can be found in the +<a href="algorithms-clustering.html#k-means-clustering">K-Means Clustering</a> section of the <a href="algorithms-reference.html">SystemML +Algorithms Reference</a>.</p> + +<p>I’m going to modify my <code>SystemML-config.xml</code> file. +I updated the <code>numreducers</code> property to be 6, which is twice my number of data nodes. 
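</p>
+
+<p>(The edit itself is sketched below; it assumes the <code>SystemML-config.xml</code> file shipped at the top level of the unpacked release, which is the path used for the <code>-config</code> argument later in this section.)</p>
+
+<pre><code>[hadoop@host1 ~]$ vi systemml-1.1.0/SystemML-config.xml
+</code></pre>
+
+<p>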
+The <code>numreducers</code> property specifies the number of reduce tasks per MR job.</p> + +<pre><code><numreducers>6</numreducers> +</code></pre> + +<p>To begin, I’ll download the <code>genRandData4Kmeans.dml</code> script that I’ll use to generate a set of data.</p> + +<pre><code>[hadoop@host1 ~]$ wget https://raw.githubusercontent.com/apache/systemml/master/scripts/datagen/genRandData4Kmeans.dml +</code></pre> + +<p>A description of the named arguments that can be passed in to this script can be found in the comment section at the top of the +<code>genRandData4Kmeans.dml</code> file. For data, I’ll generate a matrix <code>X.mtx</code> consisting of 1 million rows and 100 features. I’ll explicitly reference my <code>SystemML-config.xml</code> file, since I’m +executing SystemML in Hadoop from my home directory rather than from the SystemML project root directory.</p> + +<pre><code>[hadoop@host1 ~]$ hadoop jar systemml-1.1.0/SystemML.jar -f genRandData4Kmeans.dml -config systemml-1.1.0/SystemML-config.xml -nvargs nr=1000000 nf=100 nc=10 dc=10.0 dr=1.0 fbf=100.0 cbf=100.0 X=X.mtx C=C.mtx Y=Y.mtx YbyC=YbyC.mtx +</code></pre> + +<p>After the data generation has finished, I’ll check HDFS for the amount of space used. The 1M-row matrix <code>X.mtx</code> +requires about 2.8GB of space.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs dfs -df -h +Filesystem Size Used Available Use% +hdfs://host1.example.com:9000 400.7 G 2.8 G 318.7 G 1% +</code></pre> + +<p>Here we can see the data files that were generated.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs dfs -ls +Found 9 items +drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:53 C.mtx +-rw-r--r-- 3 hadoop supergroup 176 2015-11-19 11:53 C.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:56 X.mtx +-rw-r--r-- 3 hadoop supergroup 186 2015-11-19 11:56 X.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:53 Y.mtx +-rw-r--r-- 3 hadoop supergroup 182 2015-11-19 11:53 Y.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:56 YbyC.mtx +-rw-r--r-- 3 hadoop supergroup 182 2015-11-19 11:56 YbyC.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:56 scratch_space +</code></pre> + +<p>Here we can see the <code>X.mtx</code> data files.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs dfs -ls X.mtx +Found 6 items +-rw-r--r-- 1 hadoop supergroup 484418384 2015-11-19 11:56 X.mtx/2-r-00000 +-rw-r--r-- 1 hadoop supergroup 481626112 2015-11-19 11:56 X.mtx/2-r-00001 +-rw-r--r-- 1 hadoop supergroup 475834931 2015-11-19 11:56 X.mtx/2-r-00002 +-rw-r--r-- 1 hadoop supergroup 478519922 2015-11-19 11:56 X.mtx/2-r-00003 +-rw-r--r-- 1 hadoop supergroup 481624723 2015-11-19 11:56 X.mtx/2-r-00004 +-rw-r--r-- 1 hadoop supergroup 481624048 2015-11-19 11:56 X.mtx/2-r-00005 +</code></pre> + +<p>Next, I’ll run the <code>Kmeans.dml</code> algorithm on the 1M-row matrix <code>X.mtx</code>.</p> + +<pre><code>[hadoop@host1 ~]$ hadoop jar systemml-1.1.0/SystemML.jar -f systemml-1.1.0/algorithms/Kmeans.dml -config systemml-1.1.0/SystemML-config.xml -nvargs X=X.mtx k=5 C=Centroids.mtx +</code></pre> + +<p>We can see the <code>Centroids.mtx</code> data file has been written to HDFS.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs dfs -ls +Found 11 items +drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:53 C.mtx +-rw-r--r-- 3 hadoop supergroup 176 2015-11-19 11:53 C.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 12:10 Centroids.mtx +-rw-r--r-- 3 hadoop supergroup 174 2015-11-19 12:10 Centroids.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:56 X.mtx +-rw-r--r-- 3 hadoop supergroup 186
2015-11-19 11:56 X.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:53 Y.mtx +-rw-r--r-- 3 hadoop supergroup 182 2015-11-19 11:53 Y.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:56 YbyC.mtx +-rw-r--r-- 3 hadoop supergroup 182 2015-11-19 11:56 YbyC.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 12:10 scratch_space +</code></pre> + +<p>Now that we have trained our model, we can test it with +the <code>Kmeans-predict.dml</code> script.</p> + +<pre><code>[hadoop@host1 ~]$ hadoop jar systemml-1.1.0/SystemML.jar -f systemml-1.1.0/algorithms/Kmeans-predict.dml -config systemml-1.1.0/SystemML-config.xml -nvargs X=X.mtx C=Centroids.mtx prY=PredY.mtx O=stats.txt +</code></pre> + +<p>In the file system, we can see that the <code>PredY.mtx</code> matrix was created. +The <code>stats.txt</code> file lists statistics about the results.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs dfs -ls +Found 15 items +drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:53 C.mtx +-rw-r--r-- 3 hadoop supergroup 176 2015-11-19 11:53 C.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 12:10 Centroids.mtx +-rw-r--r-- 3 hadoop supergroup 174 2015-11-19 12:10 Centroids.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 13:20 PredY.mtx +-rw-r--r-- 3 hadoop supergroup 182 2015-11-19 13:20 PredY.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:56 X.mtx +-rw-r--r-- 3 hadoop supergroup 186 2015-11-19 11:56 X.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:53 Y.mtx +-rw-r--r-- 3 hadoop supergroup 182 2015-11-19 11:53 Y.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 11:56 YbyC.mtx +-rw-r--r-- 3 hadoop supergroup 182 2015-11-19 11:56 YbyC.mtx.mtd +drwxr-xr-x - hadoop supergroup 0 2015-11-19 13:21 scratch_space +-rw-r--r-- 3 hadoop supergroup 261 2015-11-19 13:21 stats.txt +-rw-r--r-- 3 hadoop supergroup 127 2015-11-19 13:21 stats.txt.mtd +</code></pre> + +<p>The <code>PredY.mtx</code> matrix consists of a single column of a million rows of doubles, as we can +see in the resulting metadata file.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs dfs -cat PredY.mtx.mtd +{ + "data_type": "matrix" + ,"value_type": "double" + ,"rows": 1000000 + ,"cols": 1 + ,"nnz": 1000000 + ,"format": "text" + ,"description": { "author": "SystemML" } +} +</code></pre> + +<p>The statistics generated from testing the method are displayed below.</p> + +<pre><code>[hadoop@host1 ~]$ hdfs dfs -cat stats.txt +TSS,,1.1262427174414966E11 +WCSS_M,,9.77022617396343E10 +WCSS_M_PC,,86.75062686450579 +BCSS_M,,1.4922010004515366E10 +BCSS_M_PC,,13.249373135494215 +WCSS_C,,9.770230517014426E10 +WCSS_C_PC,,86.75066542680617 +BCSS_C,,1.4921964103415842E10 +BCSS_C_PC,,13.249332379537428 +</code></pre> + +<hr /> + +<h1 id="recommended-hadoop-cluster-configuration-settings">Recommended Hadoop Cluster Configuration Settings</h1> + +<p>Below are some recommended Hadoop configuration file settings that may be of assistance when running SystemML on Hadoop +in a clustered environment.</p> + +<table> + <thead> + <tr> + <th>Configuration File</th> + <th>Setting</th> + <th>Value</th> + <th>Description</th> + </tr> + </thead> + <tbody> + <tr> + <td nowrap="" rowspan="5" style="vertical-align: top; padding-top: 22px;"><code>mapred-site.xml</code></td> + <td nowrap=""><code>mapreduce.map.java.opts</code></td> + <td nowrap=""><code>-Xmx2g -Xms2g -Xmn200m</code></td> + <td>Increase memory of child JVMs of Maps, max/min heap size of 2GB, -Xmn specifies young generation which is typically 10% of maximum heap size</td> + </tr> +
<tr> + <td nowrap=""><code>mapreduce.reduce.java.opts</code></td> + <td nowrap=""><code>-Xmx2g -Xms2g -Xmn200m</code></td> + <td>Increase memory of child JVMs of Reduces, max/min heap size of 2GB, -Xmn specifies young generation which is typically 10% of maximum heap size</td> + </tr> + <tr> + <td nowrap=""><code>mapreduce.map.memory.mb</code></td> + <td nowrap=""><code>3072</code></td> + <td>Set to at least 1.5 times the value of the Map max heap size</td> + </tr> + <tr> + <td nowrap=""><code>mapreduce.reduce.memory.mb</code></td> + <td nowrap=""><code>3072</code></td> + <td>Set to at least 1.5 times the value of the Reduce max heap size</td> + </tr> + <tr> + <td nowrap=""><code>io.sort.mb</code> (deprecated) /<br /> <code>mapreduce.task.io.sort.mb</code></td> + <td nowrap=""><code>384</code></td> + <td>Memory limit while sorting data</td> + </tr> + <tr> + <td nowrap=""><code>yarn-site.xml</code></td> + <td nowrap=""><code>yarn.nodemanager.vmem-pmem-ratio</code></td> + <td nowrap=""><code>2</code> to <code>5</code></td> + <td>Maximum ratio of virtual memory to physical memory</td> + </tr> + </tbody> +</table> + + + </div> <!-- /container --> + + + + <script src="js/vendor/jquery-1.12.0.min.js"></script> + <script src="js/vendor/bootstrap.min.js"></script> + <script src="js/vendor/anchor.min.js"></script> + <script src="js/main.js"></script> + + + + + + <!-- Analytics --> + <script> + (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ + (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), + m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) + })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); + ga('create', 'UA-71553733-1', 'auto'); + ga('send', 'pageview'); + </script> + + + + <!-- MathJax Section --> + <script type="text/x-mathjax-config"> + MathJax.Hub.Config({ + TeX: { equationNumbers: { autoNumber: "AMS" } } + }); + </script> + <script> + // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS. + // We could use "//cdn.mathjax...", but that won't support "file://". + (function(d, script) { + script = d.createElement('script'); + script.type = 'text/javascript'; + script.async = true; + script.onload = function(){ + MathJax.Hub.Config({ + tex2jax: { + inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ], + displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], + processEscapes: true, + skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] + } + }); + }; + script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') + + 'cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'; + d.getElementsByTagName('head')[0].appendChild(script); + }(document)); + </script> + </body> +</html>
+ + + </div> <!-- /container --> + + + + <script src="js/vendor/jquery-1.12.0.min.js"></script> + <script src="js/vendor/bootstrap.min.js"></script> + <script src="js/vendor/anchor.min.js"></script> + <script src="js/main.js"></script> + + + + + + <!-- Analytics --> + <script> + (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ + (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), + m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) + })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); + ga('create', 'UA-71553733-1', 'auto'); + ga('send', 'pageview'); + </script> + + + + <!-- MathJax Section --> + <script type="text/x-mathjax-config"> + MathJax.Hub.Config({ + TeX: { equationNumbers: { autoNumber: "AMS" } } + }); + </script> + <script> + // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS. + // We could use "//cdn.mathjax...", but that won't support "file://". + (function(d, script) { + script = d.createElement('script'); + script.type = 'text/javascript'; + script.async = true; + script.onload = function(){ + MathJax.Hub.Config({ + tex2jax: { + inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ], + displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], + processEscapes: true, + skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] + } + }); + }; + script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') + + 'cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'; + d.getElementsByTagName('head')[0].appendChild(script); + }(document)); + </script> + </body> +</html>
Added: systemml/site/docs/1.1.0/index.html URL: http://svn.apache.org/viewvc/systemml/site/docs/1.1.0/index.html?rev=1828046&view=auto ============================================================================== --- systemml/site/docs/1.1.0/index.html (added) +++ systemml/site/docs/1.1.0/index.html Fri Mar 30 04:31:05 2018 @@ -0,0 +1,240 @@ +<!DOCTYPE html> +<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]--> +<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]--> +<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]--> +<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]--> + <head> + <title>SystemML Documentation - SystemML 1.1.0</title> + <meta charset="utf-8"> + <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> + + <meta name="description" content="SystemML Documentation"> + + <meta name="viewport" content="width=device-width"> + <link rel="stylesheet" href="css/bootstrap.min.css"> + <link rel="stylesheet" href="css/main.css"> + <link rel="stylesheet" href="css/pygments-default.css"> + <link rel="shortcut icon" href="img/favicon.png"> + </head> + <body> + <!--[if lt IE 7]> + <p class="chromeframe">You are using an outdated browser. <a href="http://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p> + <![endif]--> + + <header class="navbar navbar-default navbar-fixed-top" id="topbar"> + <div class="container"> + <div class="navbar-header"> + <div class="navbar-brand brand projectlogo"> + <a href="http://systemml.apache.org/"><img class="logo" src="img/systemml-logo.png" alt="Apache SystemML" title="Apache SystemML"/></a> + </div> + <div class="navbar-brand brand projecttitle"> + <a href="http://systemml.apache.org/">Apache SystemML<sup id="trademark">&trade;</sup></a><br/> + <span class="version">1.1.0</span> + </div> + <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target=".navbar-collapse"> + <span class="sr-only">Toggle navigation</span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + </button> + </div> + <nav class="navbar-collapse collapse"> + <ul class="nav navbar-nav navbar-right"> + <li><a href="index.html">Overview</a></li> + <li><a href="https://github.com/apache/systemml">GitHub</a></li> + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">Documentation<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><b>Running SystemML:</b></li> + <li><a href="https://github.com/apache/systemml">SystemML GitHub README</a></li> + <li><a href="spark-mlcontext-programming-guide.html">Spark MLContext</a></li> + <li><a href="spark-batch-mode.html">Spark Batch Mode</a></li> + <li><a href="hadoop-batch-mode.html">Hadoop Batch Mode</a></li> + <li><a href="standalone-guide.html">Standalone Guide</a></li> + <li><a href="jmlc.html">Java Machine Learning Connector (JMLC)</a></li> + <li class="divider"></li> + <li><b>Language Guides:</b></li> + <li><a href="dml-language-reference.html">DML Language Reference</a></li> + <li><a href="beginners-guide-to-dml-and-pydml.html">Beginner's Guide to DML and PyDML</a></li> + <li><a href="beginners-guide-python.html">Beginner's Guide for Python Users</a></li> + <li><a href="python-reference.html">Reference Guide for Python Users</a></li> + <li class="divider"></li> + <li><b>ML Algorithms:</b></li> + <li><a href="algorithms-reference.html">Algorithms Reference</a></li> + 
<li class="divider"></li> + <li><b>Tools:</b></li> + <li><a href="debugger-guide.html">Debugger Guide</a></li> + <li><a href="developer-tools-systemml.html">IDE Guide</a></li> + <li class="divider"></li> + <li><b>Other:</b></li> + <li><a href="contributing-to-systemml.html">Contributing to SystemML</a></li> + <li><a href="engine-dev-guide.html">Engine Developer Guide</a></li> + <li><a href="troubleshooting-guide.html">Troubleshooting Guide</a></li> + <li><a href="release-process.html">Release Process</a></li> + </ul> + </li> + + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><a href="./api/java/index.html">Java</a></li> + <li><a href="./api/python/index.html">Python</a></li> + </ul> + </li> + + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">Issues<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><b>JIRA:</b></li> + <li><a href="https://issues.apache.org/jira/browse/SYSTEMML">SystemML JIRA</a></li> + + </ul> + </li> + </ul> + </nav> + </div> + </header> + + <div class="container" id="content"> + + <h1 class="title">SystemML Documentation</h1> + + + <!-- + +--> + +<p>SystemML is a flexible, scalable machine learning system. +SystemML’s distinguishing characteristics are:</p> + +<ol> + <li><strong>Algorithm customizability via R-like and Python-like languages</strong>.</li> + <li><strong>Multiple execution modes</strong>, including Spark MLContext, Spark Batch, Hadoop Batch, Standalone, and JMLC.</li> + <li><strong>Automatic optimization</strong> based on data and cluster characteristics to ensure both efficiency and scalability.</li> +</ol> + +<p>The <a href="https://github.com/apache/systemml">SystemML GitHub README</a> describes +building, testing, and running SystemML. Please read <a href="contributing-to-systemml">Contributing to SystemML</a> +to find out how to help make SystemML even better!</p> + +<p>To download SystemML, visit the <a href="http://systemml.apache.org/download">downloads</a> page.</p> + +<p>This version of SystemML supports: Java 8+, Scala 2.11+, Python 2.7/3.5+, Hadoop 2.6+, and Spark 2.1+.</p> + +<h2 id="running-systemml">Running SystemML</h2> + +<ul> + <li><a href="beginners-guide-python">Beginner’s Guide For Python Users</a> - An introductory guide for Python users.</li> + <li><a href="spark-mlcontext-programming-guide">Spark MLContext</a> - Spark MLContext is a programmatic API +for running SystemML from Spark via Scala, Python, or Java. 
+ <ul> + <li><a href="spark-mlcontext-programming-guide#spark-shell-example">Spark Shell Example (Scala)</a></li> + <li><a href="spark-mlcontext-programming-guide#jupyter-pyspark-notebook-example---poisson-nonnegative-matrix-factorization">Jupyter Notebook Example (PySpark)</a></li> + </ul> + </li> + <li><a href="spark-batch-mode">Spark Batch</a> - Algorithms are automatically optimized to run across Spark clusters.</li> + <li><a href="hadoop-batch-mode">Hadoop Batch</a> - Algorithms are automatically optimized when distributed across Hadoop clusters.</li> + <li><a href="standalone-guide">Standalone</a> - Standalone mode allows data scientists to rapidly prototype algorithms on a single +machine in R-like and Python-like declarative languages.</li> + <li><a href="jmlc">JMLC</a> - Java Machine Learning Connector, a programmatic API for embedding SystemML in Java applications.</li> + <li><a href="deep-learning">Deep Learning with SystemML</a> + <ul> + <li><em>Experimental</em> Caffe2DML API for Deep Learning (<a href="beginners-guide-caffe2dml">beginner’s guide</a>, <a href="reference-guide-caffe2dml">reference guide</a>) - Converts a Caffe specification to DML.</li> + <li><em>Experimental</em> <a href="beginners-guide-keras2dml">Keras2DML API</a> for Deep Learning.</li> + </ul> + </li> +</ul> + +<h2 id="language-guides">Language Guides</h2> + +<ul> + <li><a href="python-reference">Python API Reference</a> - Reference guide for the SystemML Python API.</li> + <li><a href="dml-language-reference">DML Language Reference</a> - +DML is a high-level R-like declarative language for machine learning.</li> + <li><strong>PyDML Language Reference</strong> - +PyDML is a high-level Python-like declarative language for machine learning.</li> + <li><a href="beginners-guide-to-dml-and-pydml">Beginner’s Guide to DML and PyDML</a> - +An introduction to the basics of DML and PyDML.</li> +</ul> + +<h2 id="ml-algorithms">ML Algorithms</h2> + +<ul> + <li><a href="algorithms-reference">Algorithms Reference</a> - The Algorithms Reference describes the +machine learning algorithms included with SystemML in detail.</li> +</ul> + +<h2 id="tools">Tools</h2> + +<ul> + <li><a href="debugger-guide">Debugger Guide</a> - SystemML supports DML script-level debugging through a +command-line interface.</li> + <li><a href="developer-tools-systemml">IDE Guide</a> - A guide to developing SystemML in an IDE.</li> +</ul> + +<h2 id="other">Other</h2> + +<ul> + <li><a href="contributing-to-systemml">Contributing to SystemML</a> - Describes ways to contribute to SystemML.</li> + <li><a href="engine-dev-guide">Engine Developer Guide</a> - Guide for internal SystemML engine development.</li> + <li><a href="troubleshooting-guide">Troubleshooting Guide</a> - Troubleshoot various issues related to SystemML.</li> + <li><a href="release-process">Release Process</a> - Description of the SystemML release process.</li> + <li><a href="native-backend">Using Native BLAS</a> in SystemML.</li> +</ul> + + + </div> <!-- /container --> + + + + <script src="js/vendor/jquery-1.12.0.min.js"></script> + <script src="js/vendor/bootstrap.min.js"></script> + <script src="js/vendor/anchor.min.js"></script> + <script src="js/main.js"></script> + + + + + + <!-- Analytics --> + <script> + (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ + (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), + m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) + })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); + ga('create', 
'UA-71553733-1', 'auto'); + ga('send', 'pageview'); + </script> + + + + <!-- MathJax Section --> + <script type="text/x-mathjax-config"> + MathJax.Hub.Config({ + TeX: { equationNumbers: { autoNumber: "AMS" } } + }); + </script> + <script> + // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS. + // We could use "//cdn.mathjax...", but that won't support "file://". + (function(d, script) { + script = d.createElement('script'); + script.type = 'text/javascript'; + script.async = true; + script.onload = function(){ + MathJax.Hub.Config({ + tex2jax: { + inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ], + displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], + processEscapes: true, + skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] + } + }); + }; + script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') + + 'cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'; + d.getElementsByTagName('head')[0].appendChild(script); + }(document)); + </script> + </body> +</html>
