Added: systemml/site/docs/1.1.0/beginners-guide-keras2dml.html URL: http://svn.apache.org/viewvc/systemml/site/docs/1.1.0/beginners-guide-keras2dml.html?rev=1828046&view=auto ============================================================================== --- systemml/site/docs/1.1.0/beginners-guide-keras2dml.html (added) +++ systemml/site/docs/1.1.0/beginners-guide-keras2dml.html Fri Mar 30 04:31:05 2018 @@ -0,0 +1,354 @@ +<!DOCTYPE html> +<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]--> +<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]--> +<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]--> +<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]--> + <head> + <title>Beginner's Guide for Keras2DML users - SystemML 1.1.0</title> + <meta charset="utf-8"> + <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> + + <meta name="description" content="Beginner's Guide for Keras2DML users"> + + <meta name="viewport" content="width=device-width"> + <link rel="stylesheet" href="css/bootstrap.min.css"> + <link rel="stylesheet" href="css/main.css"> + <link rel="stylesheet" href="css/pygments-default.css"> + <link rel="shortcut icon" href="img/favicon.png"> + </head> + <body> + <!--[if lt IE 7]> + <p class="chromeframe">You are using an outdated browser. 
<a href="http://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p> + <![endif]--> + + <header class="navbar navbar-default navbar-fixed-top" id="topbar"> + <div class="container"> + <div class="navbar-header"> + <div class="navbar-brand brand projectlogo"> + <a href="http://systemml.apache.org/"><img class="logo" src="img/systemml-logo.png" alt="Apache SystemML" title="Apache SystemML"/></a> + </div> + <div class="navbar-brand brand projecttitle"> + <a href="http://systemml.apache.org/">Apache SystemML<sup id="trademark">™</sup></a><br/> + <span class="version">1.1.0</span> + </div> + <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target=".navbar-collapse"> + <span class="sr-only">Toggle navigation</span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + </button> + </div> + <nav class="navbar-collapse collapse"> + <ul class="nav navbar-nav navbar-right"> + <li><a href="index.html">Overview</a></li> + <li><a href="https://github.com/apache/systemml">GitHub</a></li> + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">Documentation<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><b>Running SystemML:</b></li> + <li><a href="https://github.com/apache/systemml">SystemML GitHub README</a></li> + <li><a href="spark-mlcontext-programming-guide.html">Spark MLContext</a></li> + <li><a href="spark-batch-mode.html">Spark Batch Mode</a> + <li><a href="hadoop-batch-mode.html">Hadoop Batch Mode</a> + <li><a href="standalone-guide.html">Standalone Guide</a></li> + <li><a href="jmlc.html">Java Machine Learning Connector (JMLC)</a> + <li class="divider"></li> + <li><b>Language Guides:</b></li> + <li><a href="dml-language-reference.html">DML Language Reference</a></li> + <li><a
href="beginners-guide-to-dml-and-pydml.html">Beginner's Guide to DML and PyDML</a></li> + <li><a href="beginners-guide-python.html">Beginner's Guide for Python Users</a></li> + <li><a href="python-reference.html">Reference Guide for Python Users</a></li> + <li class="divider"></li> + <li><b>ML Algorithms:</b></li> + <li><a href="algorithms-reference.html">Algorithms Reference</a></li> + <li class="divider"></li> + <li><b>Tools:</b></li> + <li><a href="debugger-guide.html">Debugger Guide</a></li> + <li><a href="developer-tools-systemml.html">IDE Guide</a></li> + <li class="divider"></li> + <li><b>Other:</b></li> + <li><a href="contributing-to-systemml.html">Contributing to SystemML</a></li> + <li><a href="engine-dev-guide.html">Engine Developer Guide</a></li> + <li><a href="troubleshooting-guide.html">Troubleshooting Guide</a></li> + <li><a href="release-process.html">Release Process</a></li> + </ul> + </li> + + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><a href="./api/java/index.html">Java</a></li> + <li><a href="./api/python/index.html">Python</a></li> + </ul> + </li> + + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">Issues<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><b>JIRA:</b></li> + <li><a href="https://issues.apache.org/jira/browse/SYSTEMML">SystemML JIRA</a></li> + + </ul> + </li> + </ul> + </nav> + </div> + </header> + + <div class="container" id="content"> + + <h1 class="title">Beginner's Guide for Keras2DML users</h1> + + + <!-- + +--> + +<ul id="markdown-toc"> + <li><a href="#introduction" id="markdown-toc-introduction">Introduction</a> <ul> + <li><a href="#getting-started" id="markdown-toc-getting-started">Getting Started</a></li> + <li><a href="#model-conversion" id="markdown-toc-model-conversion">Model Conversion</a></li> + </ul> + </li> + <li><a 
href="#frequently-asked-questions" id="markdown-toc-frequently-asked-questions">Frequently asked questions</a> <ul> + <li><a href="#what-is-the-mapping-between-keras-parameters-and-caffes-solver-specification-" id="markdown-toc-what-is-the-mapping-between-keras-parameters-and-caffes-solver-specification-">What is the mapping between Keras’ parameters and Caffe’s solver specification ?</a></li> + <li><a href="#how-do-i-specify-the-batch-size-and-the-number-of-epochs-" id="markdown-toc-how-do-i-specify-the-batch-size-and-the-number-of-epochs-">How do I specify the batch size and the number of epochs ?</a></li> + <li><a href="#what-optimizer-and-loss-does-keras2dml-use-by-default-if-kerasmodel-is-not-compiled-" id="markdown-toc-what-optimizer-and-loss-does-keras2dml-use-by-default-if-kerasmodel-is-not-compiled-">What optimizer and loss does Keras2DML use by default if <code>keras_model</code> is not compiled ?</a></li> + <li><a href="#what-is-the-learning-rate-schedule-used-" id="markdown-toc-what-is-the-learning-rate-schedule-used-">What is the learning rate schedule used ?</a></li> + <li><a href="#how-to-set-the-size-of-the-validation-dataset-" id="markdown-toc-how-to-set-the-size-of-the-validation-dataset-">How to set the size of the validation dataset ?</a></li> + <li><a href="#how-to-monitor-loss-via-command-line-" id="markdown-toc-how-to-monitor-loss-via-command-line-">How to monitor loss via command-line ?</a></li> + </ul> + </li> +</ul> + +<p><br /></p> + +<h2 id="introduction">Introduction</h2> + +<p>Keras2DML is an <strong>experimental API</strong> that converts a Keras specification to DML through the intermediate Caffe2DML module. +It is designed to fit well into the mllearn framework and hence supports NumPy, Pandas as well as PySpark DataFrame.</p> + +<h3 id="getting-started">Getting Started</h3> + +<p>To create a Keras2DML object, one needs to create a Keras model through the Functional API. 
Please see the <a href="https://keras.io/models/model/">Functional API.</a> +This module utilizes the existing <a href="beginners-guide-caffe2dml.html">Caffe2DML</a> backend to convert Keras models into DML. Keras models are +parsed and translated into Caffe prototxt and caffemodel files which are then piped into Caffe2DML. Thus one can follow the Caffe2DML +documentation for further information.</p> + +<h3 id="model-conversion">Model Conversion</h3> + +<p>Keras models are parsed based on their layer structure and corresponding weights and translated into the equivalent Caffe layer and weight +configuration. Be aware that currently this is a translation into Caffe and there will be loss of information from Keras models such as +initializer information, and other layers which do not exist in Caffe.</p> + +<p>To create a Keras2DML object, simply pass the Keras model to the Keras2DML constructor. It’s also important to note that your models +should be compiled so that the loss can be accessed for Caffe2DML.</p> + +<p>```python +from systemml.mllearn import Keras2DML +import keras +from keras.applications.resnet50 import preprocess_input, decode_predictions, ResNet50</p> + +<p>keras_model = ResNet50(weights='imagenet',include_top=True,pooling='None',input_shape=(224,224,3)) +keras_model.compile(optimizer='sgd', loss='categorical_crossentropy')</p> + +<p>sysml_model = Keras2DML(spark, keras_model,input_shape=(3,224,224)) +sysml_model.summary() +```</p> + +<h1 id="frequently-asked-questions">Frequently asked questions</h1> + +<h4 id="what-is-the-mapping-between-keras-parameters-and-caffes-solver-specification-">What is the mapping between Keras’ parameters and Caffe’s solver specification ?</h4> + +<table> + <thead> + <tr> + <th> </th> + <th>Specified via the given parameter in the Keras2DML constructor</th> + <th>From input Keras’ model</th> + <th>Corresponding parameter in the Caffe solver file</th> + </tr> + </thead> + <tbody> + <tr> + <td>Solver type</td> + <td> </td> +
<td><code>type(keras_model.optimizer)</code>. Supported types: <code>keras.optimizers.{SGD, Adagrad, Adam}</code></td> + <td><code>type</code></td> + </tr> + <tr> + <td>Maximum number of iterations</td> + <td><code>max_iter</code></td> + <td>The <code>epochs</code> parameter in the <code>fit</code> method is not supported.</td> + <td><code>max_iter</code></td> + </tr> + <tr> + <td>Validation dataset</td> + <td><code>test_iter</code> (explained in the below section)</td> + <td>The <code>validation_data</code> parameter in the <code>fit</code> method is not supported.</td> + <td><code>test_iter</code></td> + </tr> + <tr> + <td>Monitoring the loss</td> + <td><code>display, test_interval</code> (explained in the below section)</td> + <td>The <code>LossHistory</code> callback in the <code>fit</code> method is not supported.</td> + <td><code>display, test_interval</code></td> + </tr> + <tr> + <td>Learning rate schedule</td> + <td><code>lr_policy</code></td> + <td>The <code>LearningRateScheduler</code> callback in the <code>fit</code> method is not supported.</td> + <td><code>lr_policy</code> (default: step)</td> + </tr> + <tr> + <td>Base learning rate</td> + <td> </td> + <td><code>keras_model.optimizer.lr</code></td> + <td><code>base_lr</code></td> + </tr> + <tr> + <td>Learning rate decay over each update</td> + <td> </td> + <td><code>keras_model.optimizer.decay</code></td> + <td><code>gamma</code></td> + </tr> + <tr> + <td>Global regularizer to use for all layers</td> + <td><code>regularization_type,weight_decay</code></td> + <td>The current version of Keras2DML does not support custom regularizers per layer.</td> + <td><code>regularization_type,weight_decay</code></td> + </tr> + <tr> + <td>If type of the optimizer is <code>keras.optimizers.SGD</code></td> + <td> </td> + <td><code>momentum, nesterov</code></td> + <td><code>momentum, type</code></td> + </tr> + <tr> + <td>If type of the optimizer is <code>keras.optimizers.Adam</code></td> + <td> </td> + <td><code>beta_1, 
beta_2, epsilon</code>. The parameter <code>amsgrad</code> is not supported.</td> + <td><code>momentum, momentum2, delta</code></td> + </tr> + <tr> + <td>If type of the optimizer is <code>keras.optimizers.Adagrad</code></td> + <td> </td> + <td><code>epsilon</code></td> + <td><code>delta</code></td> + </tr> + </tbody> +</table> + +<h4 id="how-do-i-specify-the-batch-size-and-the-number-of-epochs-">How do I specify the batch size and the number of epochs ?</h4> + +<p>Since Keras2DML is a mllearn API, it does not accept the batch size and the number of epochs as parameters of the <code>fit</code> method. +Instead, these parameters are passed via the <code>batch_size</code> and <code>max_iter</code> parameters of the Keras2DML constructor. +For example, the equivalent Python code for <code>keras_model.fit(features, labels, epochs=10, batch_size=64)</code> is as follows:</p> + +<p><code>python +import math +from systemml.mllearn import Keras2DML +epochs = 10 +batch_size = 64 +num_samples = features.shape[0] +max_iter = int(epochs*math.ceil(float(num_samples)/batch_size)) +sysml_model = Keras2DML(spark, keras_model, batch_size=batch_size, max_iter=max_iter, ...) +sysml_model.fit(features, labels) +</code></p> + +<h4 id="what-optimizer-and-loss-does-keras2dml-use-by-default-if-kerasmodel-is-not-compiled-">What optimizer and loss does Keras2DML use by default if <code>keras_model</code> is not compiled ?</h4> + +<p>If the user does not <code>compile</code> the Keras model, then we use cross entropy loss and SGD optimizer with Nesterov momentum:</p> + +<p><code>python +keras_model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.SGD(lr=0.01, momentum=0.95, decay=5e-4, nesterov=True)) +</code></p> + +<h4 id="what-is-the-learning-rate-schedule-used-">What is the learning rate schedule used ?</h4> + +<p>Keras2DML does not support the <code>LearningRateScheduler</code> callback. 
+Instead, one can set the learning rate schedule to one of the following schedules by using the <code>lr_policy</code> parameter of the constructor: +- <code>step</code>: return <code>base_lr * gamma ^ (floor(iter / step))</code> (default schedule) +- <code>fixed</code>: always return <code>base_lr</code>. +- <code>exp</code>: return <code>base_lr * gamma ^ iter</code> +- <code>inv</code>: return <code>base_lr * (1 + gamma * iter) ^ (- power)</code> +- <code>poly</code>: the effective learning rate follows a polynomial decay, reaching zero at max_iter; return <code>base_lr * (1 - iter/max_iter) ^ (power)</code> +- <code>sigmoid</code>: the effective learning rate follows a sigmoid decay; return <code>base_lr * ( 1/(1 + exp(-gamma * (iter - stepsize))))</code></p> + +<h4 id="how-to-set-the-size-of-the-validation-dataset-">How to set the size of the validation dataset ?</h4> + +<p>The size of the validation dataset is determined by the parameters <code>test_iter</code> and the batch size. For example: If the batch size is 64 and +<code>test_iter</code> is set to 10 in the <code>Keras2DML</code>’s constructor, then the validation size is 640. 
This setting generates following DML code internally:</p> + +<p><code>python +num_images = nrow(y_full) +BATCH_SIZE = 64 +num_validation = 10 * BATCH_SIZE +X = X_full[(num_validation+1):num_images,]; y = y_full[(num_validation+1):num_images,] +X_val = X_full[1:num_validation,]; y_val = y_full[1:num_validation,] +num_images = nrow(y) +</code></p> + +<h4 id="how-to-monitor-loss-via-command-line-">How to monitor loss via command-line ?</h4> + +<p>To monitor loss, please set the parameters <code>display</code>, <code>test_iter</code> and <code>test_interval</code> in the <code>Keras2DML</code>’s constructor.<br /> +For example: for the expression <code>Keras2DML(..., display=100, test_iter=10, test_interval=500)</code>, we +- display the training loss and accuracy every 100 iterations and +- carry out validation every 500 training iterations and display validation loss and accuracy.</p> + + + + </div> <!-- /container --> + + + + <script src="js/vendor/jquery-1.12.0.min.js"></script> + <script src="js/vendor/bootstrap.min.js"></script> + <script src="js/vendor/anchor.min.js"></script> + <script src="js/main.js"></script> + + + + + + <!-- Analytics --> + <script> + (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ + (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), + m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) + })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); + ga('create', 'UA-71553733-1', 'auto'); + ga('send', 'pageview'); + </script> + + + + <!-- MathJax Section --> + <script type="text/x-mathjax-config"> + MathJax.Hub.Config({ + TeX: { equationNumbers: { autoNumber: "AMS" } } + }); + </script> + <script> + // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS. + // We could use "//cdn.mathjax...", but that won't support "file://". 
+ (function(d, script) { + script = d.createElement('script'); + script.type = 'text/javascript'; + script.async = true; + script.onload = function(){ + MathJax.Hub.Config({ + tex2jax: { + inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ], + displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], + processEscapes: true, + skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] + } + }); + }; + script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') + + 'cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'; + d.getElementsByTagName('head')[0].appendChild(script); + }(document)); + </script> + </body> +</html>
Added: systemml/site/docs/1.1.0/beginners-guide-python.html URL: http://svn.apache.org/viewvc/systemml/site/docs/1.1.0/beginners-guide-python.html?rev=1828046&view=auto ============================================================================== --- systemml/site/docs/1.1.0/beginners-guide-python.html (added) +++ systemml/site/docs/1.1.0/beginners-guide-python.html Fri Mar 30 04:31:05 2018 @@ -0,0 +1,540 @@ +<!DOCTYPE html> +<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]--> +<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]--> +<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]--> +<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]--> + <head> + <title>Beginner's Guide for Python Users - SystemML 1.1.0</title> + <meta charset="utf-8"> + <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> + + <meta name="description" content="Beginner's Guide for Python Users"> + + <meta name="viewport" content="width=device-width"> + <link rel="stylesheet" href="css/bootstrap.min.css"> + <link rel="stylesheet" href="css/main.css"> + <link rel="stylesheet" href="css/pygments-default.css"> + <link rel="shortcut icon" href="img/favicon.png"> + </head> + <body> + <!--[if lt IE 7]> + <p class="chromeframe">You are using an outdated browser. 
<a href="http://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p> + <![endif]--> + + <header class="navbar navbar-default navbar-fixed-top" id="topbar"> + <div class="container"> + <div class="navbar-header"> + <div class="navbar-brand brand projectlogo"> + <a href="http://systemml.apache.org/"><img class="logo" src="img/systemml-logo.png" alt="Apache SystemML" title="Apache SystemML"/></a> + </div> + <div class="navbar-brand brand projecttitle"> + <a href="http://systemml.apache.org/">Apache SystemML<sup id="trademark">™</sup></a><br/> + <span class="version">1.1.0</span> + </div> + <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target=".navbar-collapse"> + <span class="sr-only">Toggle navigation</span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + <span class="icon-bar"></span> + </button> + </div> + <nav class="navbar-collapse collapse"> + <ul class="nav navbar-nav navbar-right"> + <li><a href="index.html">Overview</a></li> + <li><a href="https://github.com/apache/systemml">GitHub</a></li> + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">Documentation<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><b>Running SystemML:</b></li> + <li><a href="https://github.com/apache/systemml">SystemML GitHub README</a></li> + <li><a href="spark-mlcontext-programming-guide.html">Spark MLContext</a></li> + <li><a href="spark-batch-mode.html">Spark Batch Mode</a> + <li><a href="hadoop-batch-mode.html">Hadoop Batch Mode</a> + <li><a href="standalone-guide.html">Standalone Guide</a></li> + <li><a href="jmlc.html">Java Machine Learning Connector (JMLC)</a> + <li class="divider"></li> + <li><b>Language Guides:</b></li> + <li><a href="dml-language-reference.html">DML Language Reference</a></li> + <li><a
href="beginners-guide-to-dml-and-pydml.html">Beginner's Guide to DML and PyDML</a></li> + <li><a href="beginners-guide-python.html">Beginner's Guide for Python Users</a></li> + <li><a href="python-reference.html">Reference Guide for Python Users</a></li> + <li class="divider"></li> + <li><b>ML Algorithms:</b></li> + <li><a href="algorithms-reference.html">Algorithms Reference</a></li> + <li class="divider"></li> + <li><b>Tools:</b></li> + <li><a href="debugger-guide.html">Debugger Guide</a></li> + <li><a href="developer-tools-systemml.html">IDE Guide</a></li> + <li class="divider"></li> + <li><b>Other:</b></li> + <li><a href="contributing-to-systemml.html">Contributing to SystemML</a></li> + <li><a href="engine-dev-guide.html">Engine Developer Guide</a></li> + <li><a href="troubleshooting-guide.html">Troubleshooting Guide</a></li> + <li><a href="release-process.html">Release Process</a></li> + </ul> + </li> + + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><a href="./api/java/index.html">Java</a></li> + <li><a href="./api/python/index.html">Python</a></li> + </ul> + </li> + + <li class="dropdown"> + <a href="#" class="dropdown-toggle" data-toggle="dropdown">Issues<b class="caret"></b></a> + <ul class="dropdown-menu" role="menu"> + <li><b>JIRA:</b></li> + <li><a href="https://issues.apache.org/jira/browse/SYSTEMML">SystemML JIRA</a></li> + + </ul> + </li> + </ul> + </nav> + </div> + </header> + + <div class="container" id="content"> + + <h1 class="title">Beginner's Guide for Python Users</h1> + + + <!-- + +--> + +<ul id="markdown-toc"> + <li><a href="#introduction" id="markdown-toc-introduction">Introduction</a></li> + <li><a href="#download--setup" id="markdown-toc-download--setup">Download & Setup</a> <ul> + <li><a href="#install-java-need-java-8-and-apache-spark" id="markdown-toc-install-java-need-java-8-and-apache-spark">Install Java (need Java 
8) and Apache Spark</a></li> + <li><a href="#install-systemml" id="markdown-toc-install-systemml">Install SystemML</a></li> + <li><a href="#uninstall-systemml" id="markdown-toc-uninstall-systemml">Uninstall SystemML</a></li> + <li><a href="#start-pyspark-shell" id="markdown-toc-start-pyspark-shell">Start Pyspark shell</a></li> + </ul> + </li> + <li><a href="#matrix-operations" id="markdown-toc-matrix-operations">Matrix operations</a></li> + <li><a href="#invoke-systemmls-algorithms" id="markdown-toc-invoke-systemmls-algorithms">Invoke SystemML’s algorithms</a> <ul> + <li><a href="#scikit-learn-interface" id="markdown-toc-scikit-learn-interface">Scikit-learn interface</a></li> + <li><a href="#passing-pyspark-dataframe" id="markdown-toc-passing-pyspark-dataframe">Passing PySpark DataFrame</a></li> + <li><a href="#mlpipeline-interface" id="markdown-toc-mlpipeline-interface">MLPipeline interface</a></li> + </ul> + </li> + <li><a href="#invoking-dmlpydml-scripts-using-mlcontext" id="markdown-toc-invoking-dmlpydml-scripts-using-mlcontext">Invoking DML/PyDML scripts using MLContext</a></li> +</ul> + +<p><br /></p> + +<h2 id="introduction">Introduction</h2> + +<p>SystemML enables flexible, scalable machine learning. This flexibility is achieved through the specification of a high-level declarative machine learning language that comes in two flavors, +one with an R-like syntax (DML) and one with a Python-like syntax (PyDML).</p> + +<p>Algorithm scripts written in DML and PyDML can be run on Hadoop, on Spark, or in Standalone mode. +No script modifications are required to change between modes. SystemML automatically performs advanced optimizations +based on data and cluster characteristics, so much of the need to manually tweak algorithms is largely reduced or eliminated. 
+To understand more about DML and PyDML, we recommend that you read <a href="https://apache.github.io/systemml/beginners-guide-to-dml-and-pydml.html">Beginner’s Guide to DML and PyDML</a>.</p> + +<p>For convenience of Python users, SystemML exposes several language-level APIs that allow Python users to use SystemML +and its algorithms without the need to know DML or PyDML. We explain these APIs in the below sections with example use cases.</p> + +<h2 id="download--setup">Download & Setup</h2> + +<p>Before you get started on SystemML, make sure that your environment is set up and ready to go.</p> + +<h3 id="install-java-need-java-8-and-apache-spark">Install Java (need Java 8) and Apache Spark</h3> + +<p>If you already have an Apache Spark installation, you can skip this step.</p> + +<div class="codetabs"> +<div data-lang="OSX"> + <p><code>bash +/usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" +brew tap caskroom/cask +brew install Caskroom/cask/java +brew tap homebrew/versions +brew install apache-spark16 +</code></p> + </div> +<div data-lang="Linux"> + <p><code>bash +ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Linuxbrew/install/master/install)" +brew tap caskroom/cask +brew install Caskroom/cask/java +brew tap homebrew/versions +brew install apache-spark16 +</code></p> + </div> +</div> + +<h3 id="install-systemml">Install SystemML</h3> + +<p>To install released SystemML, please use the following commands:</p> + +<div class="codetabs"> +<div data-lang="Python 2"> + <p><code>bash +pip install systemml +</code></p> + </div> +<div data-lang="Python 3"> + <p><code>bash +pip3 install systemml +</code></p> + </div> +</div> + +<p>If you want to try out the bleeding edge version, please use the following commands:</p> + +<div class="codetabs"> +<div data-lang="Python 2"> + <p><code>bash +git clone https://github.com/apache/systemml.git +cd systemml +mvn clean package -P distribution +pip install 
target/systemml-1.0.0-SNAPSHOT-python.tar.gz +</code></p> + </div> +<div data-lang="Python 3"> + <p><code>bash +git clone https://github.com/apache/systemml.git +cd systemml +mvn clean package -P distribution +pip3 install target/systemml-1.0.0-SNAPSHOT-python.tar.gz +</code></p> + </div> +</div> + +<h3 id="uninstall-systemml">Uninstall SystemML</h3> +<p>To uninstall SystemML, please use the following command:</p> + +<div class="codetabs"> +<div data-lang="Python 2"> + <p><code>bash +pip uninstall systemml +</code></p> + </div> +<div data-lang="Python 3"> + <p><code>bash +pip3 uninstall systemml +</code></p> + </div> +</div> + +<h3 id="start-pyspark-shell">Start Pyspark shell</h3> + +<div class="codetabs"> +<div data-lang="Python 2"> + <p><code>bash +pyspark +</code></p> + </div> +<div data-lang="Python 3"> + <p><code>bash +PYSPARK_PYTHON=python3 pyspark +</code></p> + </div> +</div> + +<hr /> + +<h2 id="matrix-operations">Matrix operations</h2> + +<p>To get started with SystemML, let’s try a few elementary matrix operations:</p> + +<p><code>python +import systemml as sml +import numpy as np +m1 = sml.matrix(np.ones((3,3)) + 2) +m2 = sml.matrix(np.ones((3,3)) + 3) +m2 = m1 * (m2 + m1) +m4 = 1.0 - m2 +m4.sum(axis=1).toNumPy() +</code></p> + +<p>Output:</p> + +<p><code>python +array([[-60.], + [-60.], + [-60.]]) +</code></p> + +<p>Let us now write a simple script to train a <a href="https://apache.github.io/systemml/algorithms-regression.html#linear-regression">linear regression</a> +model: $ \beta = solve(X^T X, X^T y) $. 
For simplicity, we will use direct-solve method and ignore +regularization parameter as well as intercept.</p> + +<p><code>python +import numpy as np +from sklearn import datasets +import systemml as sml +# Load the diabetes dataset +diabetes = datasets.load_diabetes() +# Use only one feature +diabetes_X = diabetes.data[:, np.newaxis, 2] +# Split the data into training/testing sets +X_train = diabetes_X[:-20] +X_test = diabetes_X[-20:] +# Split the targets into training/testing sets +y_train = diabetes.target[:-20] +y_test = diabetes.target[-20:] +# Train Linear Regression model +X = sml.matrix(X_train) +y = sml.matrix(np.matrix(y_train).T) +A = X.transpose().dot(X) +b = X.transpose().dot(y) +beta = sml.solve(A, b).toNumPy() +y_predicted = X_test.dot(beta) +print('Residual sum of squares: %.2f' % np.mean((y_predicted - y_test) ** 2)) +</code></p> + +<p>Output:</p> + +<p><code>bash +Residual sum of squares: 25282.12 +</code></p> + +<p>We can improve the residual error by adding an intercept and regularization parameter. To do so, we +will use <code>mllearn</code> API described in the next section.</p> + +<hr /> + +<h2 id="invoke-systemmls-algorithms">Invoke SystemML’s algorithms</h2> + +<p>SystemML also exposes a subpackage <a href="https://apache.github.io/systemml/python-reference#mllearn-api">mllearn</a>. 
This subpackage allows Python users to invoke SystemML algorithms +using the Scikit-learn or MLPipeline API.</p> + +<h3 id="scikit-learn-interface">Scikit-learn interface</h3> + +<p>In the below example, we invoke SystemML’s <a href="https://apache.github.io/systemml/algorithms-regression.html#linear-regression">Linear Regression</a> +algorithm.</p> + +<p><code>python +import numpy as np +from sklearn import datasets +from systemml.mllearn import LinearRegression +# Load the diabetes dataset +diabetes = datasets.load_diabetes() +# Use only one feature +diabetes_X = diabetes.data[:, np.newaxis, 2] +# Split the data into training/testing sets +X_train = diabetes_X[:-20] +X_test = diabetes_X[-20:] +# Split the targets into training/testing sets +y_train = diabetes.target[:-20] +y_test = diabetes.target[-20:] +# Create linear regression object +regr = LinearRegression(spark, fit_intercept=True, C=float("inf"), solver='direct-solve') +# Train the model using the training sets +regr.fit(X_train, y_train) +y_predicted = regr.predict(X_test) +print('Residual sum of squares: %.2f' % np.mean((y_predicted - y_test) ** 2)) +</code></p> + +<p>Output:</p> + +<p><code>bash +Residual sum of squares: 6991.17 +</code></p> + +<p>As expected, by adding an intercept and a regularizer, the residual error drops significantly.</p> + +<p>Here is another example where we invoke SystemML’s <a href="https://apache.github.io/systemml/algorithms-classification.html#multinomial-logistic-regression">Logistic Regression</a> +algorithm on the digits dataset.</p> + +<p><code>python +# Scikit-learn way +from sklearn import datasets, neighbors +from systemml.mllearn import LogisticRegression +digits = datasets.load_digits() +X_digits = digits.data +y_digits = digits.target +n_samples = len(X_digits) +X_train = X_digits[:int(.9 * n_samples)] +y_train = y_digits[:int(.9 * n_samples)] +X_test = X_digits[int(.9 * n_samples):] +y_test = y_digits[int(.9 * n_samples):] +logistic = LogisticRegression(spark) 
+print('LogisticRegression score: %f' % logistic.fit(X_train, y_train).score(X_test, y_test)) +</code></p> + +<p>Output:</p> + +<p><code>bash +LogisticRegression score: 0.927778 +</code></p> + +<p>You can also save the trained model and load it later for prediction:</p> + +<p><code>python +# Assuming logistic.fit(X_train, y_train) is already invoked +logistic.save('logistic_model') +new_logistic = LogisticRegression(spark) +new_logistic.load('logistic_model') +print('LogisticRegression score: %f' % new_logistic.score(X_test, y_test)) +</code></p> + +<h3 id="passing-pyspark-dataframe">Passing PySpark DataFrame</h3> + +<p>To train the above algorithm on larger dataset, we can load the dataset into DataFrame and pass it to the <code>fit</code> method:</p> + +<p><code>python +from sklearn import datasets +from systemml.mllearn import LogisticRegression +import pandas as pd +from sklearn.metrics import accuracy_score +import systemml as sml +digits = datasets.load_digits() +X_digits = digits.data +y_digits = digits.target +n_samples = len(X_digits) +# Split the data into training/testing sets and convert to PySpark DataFrame +df_train = sml.convertToLabeledDF(sqlCtx, X_digits[:int(.9 * n_samples)], y_digits[:int(.9 * n_samples)]) +X_test = spark.createDataFrame(pd.DataFrame(X_digits[int(.9 * n_samples):])) +logistic = LogisticRegression(spark) +logistic.fit(df_train) +y_predicted = logistic.predict(X_test) +y_predicted = y_predicted.select('prediction').toPandas().as_matrix().flatten() +y_test = y_digits[int(.9 * n_samples):] +print('LogisticRegression score: %f' % accuracy_score(y_test, y_predicted)) +</code></p> + +<p>Output:</p> + +<p><code>bash +LogisticRegression score: 0.922222 +</code></p> + +<h3 id="mlpipeline-interface">MLPipeline interface</h3> + +<p>In the below example, we demonstrate how the same <code>LogisticRegression</code> class can allow SystemML to fit seamlessly into +large data pipelines.</p> + +<p><code>python +# MLPipeline way +from pyspark.ml 
import Pipeline +from systemml.mllearn import LogisticRegression +from pyspark.ml.feature import HashingTF, Tokenizer +training = spark.createDataFrame([ + (0, "a b c d e spark", 1.0), + (1, "b d", 2.0), + (2, "spark f g h", 1.0), + (3, "hadoop mapreduce", 2.0), + (4, "b spark who", 1.0), + (5, "g d a y", 2.0), + (6, "spark fly", 1.0), + (7, "was mapreduce", 2.0), + (8, "e spark program", 1.0), + (9, "a e c l", 2.0), + (10, "spark compile", 1.0), + (11, "hadoop software", 2.0) +], ["id", "text", "label"]) +tokenizer = Tokenizer(inputCol="text", outputCol="words") +hashingTF = HashingTF(inputCol="words", outputCol="features", numFeatures=20) +lr = LogisticRegression(sqlCtx) +pipeline = Pipeline(stages=[tokenizer, hashingTF, lr]) +model = pipeline.fit(training) +test = spark.createDataFrame([ + (12, "spark i j k"), + (13, "l m n"), + (14, "mapreduce spark"), + (15, "apache hadoop")], ["id", "text"]) +prediction = model.transform(test) +prediction.show() +</code></p> + +<p>Output:</p> + +<p><code>bash ++-------+---+---------------+------------------+--------------------+--------------------+----------+ +|__INDEX| id| text| words| features| probability|prediction| ++-------+---+---------------+------------------+--------------------+--------------------+----------+ +| 1.0| 12| spark i j k| [spark, i, j, k]|(20,[5,6,7],[2.0,...|[0.99999999999975...| 1.0| +| 2.0| 13| l m n| [l, m, n]|(20,[8,9,10],[1.0...|[1.37552128844736...| 2.0| +| 3.0| 14|mapreduce spark|[mapreduce, spark]|(20,[5,10],[1.0,1...|[0.99860290938153...| 1.0| +| 4.0| 15| apache hadoop| [apache, hadoop]|(20,[9,14],[1.0,1...|[5.41688748236143...| 2.0| ++-------+---+---------------+------------------+--------------------+--------------------+----------+ +</code></p> + +<hr /> + +<h2 id="invoking-dmlpydml-scripts-using-mlcontext">Invoking DML/PyDML scripts using MLContext</h2> + +<p>The below example demonstrates how to invoke the algorithm <a 
href="https://github.com/apache/systemml/blob/master/scripts/algorithms/MultiLogReg.dml">scripts/algorithms/MultiLogReg.dml</a> +using Python <a href="https://apache.github.io/systemml/spark-mlcontext-programming-guide">MLContext API</a>.</p> + +<p><code>python +from sklearn import datasets +from pyspark.sql import SQLContext +import systemml as sml +import pandas as pd +digits = datasets.load_digits() +X_digits = digits.data +y_digits = digits.target + 1 +n_samples = len(X_digits) +# Split the data into training/testing sets and convert to PySpark DataFrame +X_df = sqlCtx.createDataFrame(pd.DataFrame(X_digits[:int(.9 * n_samples)])) +y_df = sqlCtx.createDataFrame(pd.DataFrame(y_digits[:int(.9 * n_samples)])) +ml = sml.MLContext(sc) +# Run the MultiLogReg.dml script at the given URL +scriptUrl = "https://raw.githubusercontent.com/apache/systemml/master/scripts/algorithms/MultiLogReg.dml" +script = sml.dml(scriptUrl).input(X=X_df, Y_vec=y_df).output("B_out") +beta = ml.execute(script).get('B_out').toNumPy() +</code></p> + + + </div> <!-- /container --> + + + + <script src="js/vendor/jquery-1.12.0.min.js"></script> + <script src="js/vendor/bootstrap.min.js"></script> + <script src="js/vendor/anchor.min.js"></script> + <script src="js/main.js"></script> + + + + + + <!-- Analytics --> + <script> + (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ + (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), + m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) + })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); + ga('create', 'UA-71553733-1', 'auto'); + ga('send', 'pageview'); + </script> + + + + <!-- MathJax Section --> + <script type="text/x-mathjax-config"> + MathJax.Hub.Config({ + TeX: { equationNumbers: { autoNumber: "AMS" } } + }); + </script> + <script> + // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS. 
+ // We could use "//cdn.mathjax...", but that won't support "file://". + (function(d, script) { + script = d.createElement('script'); + script.type = 'text/javascript'; + script.async = true; + script.onload = function(){ + MathJax.Hub.Config({ + tex2jax: { + inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ], + displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], + processEscapes: true, + skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] + } + }); + }; + script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') + + 'cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML'; + d.getElementsByTagName('head')[0].appendChild(script); + }(document)); + </script> + </body> +</html>
