http://git-wip-us.apache.org/repos/asf/flink-web/blob/f0ac0cdb/content/docs/0.9/libs/ml/quickstart.html ---------------------------------------------------------------------- diff --git a/content/docs/0.9/libs/ml/quickstart.html b/content/docs/0.9/libs/ml/quickstart.html deleted file mode 100644 index 188864b..0000000 --- a/content/docs/0.9/libs/ml/quickstart.html +++ /dev/null @@ -1,438 +0,0 @@ -<!-- -Licensed to the Apache Software Foundation (ASF) under one -or more contributor license agreements. See the NOTICE file -distributed with this work for additional information -regarding copyright ownership. The ASF licenses this file -to you under the Apache License, Version 2.0 (the -"License"); you may not use this file except in compliance -with the License. You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, -software distributed under the License is distributed on an -"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -KIND, either express or implied. See the License for the -specific language governing permissions and limitations -under the License. 
---> -<!DOCTYPE html> - -<html lang="en"> - <head> - <meta charset="utf-8"> - <meta http-equiv="X-UA-Compatible" content="IE=edge"> - <meta name="viewport" content="width=device-width, initial-scale=1"> - <!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags --> - - <title>Apache Flink 0.9.0 Documentation: FlinkML - Quickstart Guide</title> - - <link rel="shortcut icon" href="http://flink.apache.org/docs/0.9/page/favicon.ico" type="image/x-icon"> - <link rel="icon" href="http://flink.apache.org/docs/0.9/page/favicon.ico" type="image/x-icon"> - - <!-- Bootstrap --> - <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/css/bootstrap.min.css"> - <link rel="stylesheet" href="http://flink.apache.org/docs/0.9/page/css/flink.css"> - <link rel="stylesheet" href="http://flink.apache.org/docs/0.9/page/css/syntax.css"> - <link rel="stylesheet" href="http://flink.apache.org/docs/0.9/page/css/codetabs.css"> - - <script type="text/x-mathjax-config"> - MathJax.Hub.Config({ - tex2jax: { - inlineMath: [['$','$'], ['\\(','\\)']] }, - TeX: { - equationNumbers: { autoNumber: "AMS" } } - }); - </script> - <script type="text/javascript" - src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"> - </script> - - <!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries --> - <!-- WARNING: Respond.js doesn't work if you view the page via file:// --> - <!--[if lt IE 9]> - <script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script> - <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script> - <![endif]--> - </head> - <body> - - - - - - - <!-- Top navbar. --> - <nav class="navbar navbar-default navbar-fixed-top"> - <div class="container"> - <!-- The logo. 
--> - <div class="navbar-header"> - <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse-1"> - <span class="icon-bar"></span> - <span class="icon-bar"></span> - <span class="icon-bar"></span> - </button> - <div class="navbar-logo"> - <a href="http://flink.apache.org"><img alt="Apache Flink" src="http://flink.apache.org/docs/0.9/page/img/navbar-brand-logo.jpg"></a> - </div> - </div><!-- /.navbar-header --> - - <!-- The navigation links. --> - <div class="collapse navbar-collapse" id="bs-example-navbar-collapse-1"> - <ul class="nav navbar-nav"> - <li><a href="http://flink.apache.org/docs/0.9/index.html">Overview<span class="hidden-sm hidden-xs"> 0.9.0</span></a></li> - - <!-- Setup --> - <li class="dropdown"> - <a href="http://flink.apache.org/docs/0.9/setup" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Setup <span class="caret"></span></a> - <ul class="dropdown-menu" role="menu"> - <li><a href="http://flink.apache.org/docs/0.9/setup/building.html">Get Flink 0.9-SNAPSHOT</a></li> - - <li class="divider"></li> - <li role="presentation" class="dropdown-header"><strong>Deployment</strong></li> - <li><a href="http://flink.apache.org/docs/0.9/setup/local_setup.html" class="active">Local</a></li> - <li><a href="http://flink.apache.org/docs/0.9/setup/cluster_setup.html">Cluster (Standalone)</a></li> - <li><a href="http://flink.apache.org/docs/0.9/setup/yarn_setup.html">YARN</a></li> - <li><a href="http://flink.apache.org/docs/0.9/setup/gce_setup.html">GCloud</a></li> - <li><a href="http://flink.apache.org/docs/0.9/setup/flink_on_tez.html">Flink on Tez <span class="badge">Beta</span></a></li> - - <li class="divider"></li> - <li><a href="http://flink.apache.org/docs/0.9/setup/config.html">Configuration</a></li> - </ul> - </li> - - <!-- Programming Guides --> - <li class="dropdown"> - <a href="http://flink.apache.org/docs/0.9/apis" class="dropdown-toggle" 
data-toggle="dropdown" role="button" aria-expanded="false">Programming Guides <span class="caret"></span></a> - <ul class="dropdown-menu" role="menu"> - <li><a href="http://flink.apache.org/docs/0.9/apis/programming_guide.html"><strong>Batch: DataSet API</strong></a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/streaming_guide.html"><strong>Streaming: DataStream API</strong> <span class="badge">Beta</span></a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/python.html">Python API <span class="badge">Beta</span></a></li> - - <li class="divider"></li> - <li><a href="scala_shell.html">Interactive Scala Shell</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/dataset_transformations.html">Dataset Transformations</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/best_practices.html">Best Practices</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/example_connectors.html">Connectors</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/examples.html">Examples</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/local_execution.html">Local Execution</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/cluster_execution.html">Cluster Execution</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/cli.html">Command Line Interface</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/web_client.html">Web Client</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/iterations.html">Iterations</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/java8.html">Java 8</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/hadoop_compatibility.html">Hadoop Compatibility <span class="badge">Beta</span></a></li> - </ul> - </li> - - <!-- Libraries --> - <li class="dropdown"> - <a href="http://flink.apache.org/docs/0.9/libs" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Libraries <span class="caret"></span></a> - <ul 
class="dropdown-menu" role="menu"> - <li><a href="http://flink.apache.org/docs/0.9/libs/spargel_guide.html">Graphs: Spargel</a></li> - <li><a href="http://flink.apache.org/docs/0.9/libs/gelly_guide.html">Graphs: Gelly <span class="badge">Beta</span></a></li> - <li><a href="http://flink.apache.org/docs/0.9/libs/ml/">Machine Learning <span class="badge">Beta</span></a></li> - <li><a href="http://flink.apache.org/docs/0.9/libs/table.html">Relational: Table <span class="badge">Beta</span></a></li> - </ul> - </li> - - <!-- Internals --> - <li class="dropdown"> - <a href="http://flink.apache.org/docs/0.9/internals" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Internals <span class="caret"></span></a> - <ul class="dropdown-menu" role="menu"> - <li role="presentation" class="dropdown-header"><strong>Contribute</strong></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/how_to_contribute.html">How to Contribute</a></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/coding_guidelines.html">Coding Guidelines</a></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/ide_setup.html">IDE Setup</a></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/logging.html">Logging</a></li> - <li class="divider"></li> - <li role="presentation" class="dropdown-header"><strong>Internals</strong></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/general_arch.html">Architecture & Process Model</a></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/types_serialization.html">Type Extraction & Serialization</a></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/job_scheduling.html">Jobs & Scheduling</a></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/add_operator.html">How-To: Add an Operator</a></li> - </ul> - </li> - </ul> - <form class="navbar-form navbar-right hidden-sm hidden-md" role="search" action="http://flink.apache.org/docs/0.9/search-results.html"> - 
<div class="form-group"> - <input type="text" class="form-control" name="q" placeholder="Search all pages"> - </div> - <button type="submit" class="btn btn-default">Search</button> - </form> - </div><!-- /.navbar-collapse --> - </div><!-- /.container --> - </nav> - - - - - -<!--Some of the Latex math notation has been adapted from Apache Spark MLlib's documentation--> -$$ -\newcommand{\R}{\mathbb{R}} -\newcommand{\E}{\mathbb{E}} -\newcommand{\x}{\mathbf{x}} -\newcommand{\y}{\mathbf{y}} -\newcommand{\wv}{\mathbf{w}} -\newcommand{\av}{\mathbf{\alpha}} -\newcommand{\bv}{\mathbf{b}} -\newcommand{\N}{\mathbb{N}} -\newcommand{\id}{\mathbf{I}} -\newcommand{\ind}{\mathbf{1}} -\newcommand{\0}{\mathbf{0}} -\newcommand{\unit}{\mathbf{e}} -\newcommand{\one}{\mathbf{1}} -\newcommand{\zero}{\mathbf{0}} -\newcommand\rfrac[2]{^{#1}\!/_{#2}} -\newcommand{\norm}[1]{\left\lVert#1\right\rVert} -$$ - - - <!-- Main content. --> - <div class="container"> - - -<div class="row"> - <div class="col-sm-10 col-sm-offset-1"> - <h1><a href="../ml">FlinkML</a> - Quickstart Guide</h1> - - - -<ul id="markdown-toc"> - <li><a href="#introduction" id="markdown-toc-introduction">Introduction</a></li> - <li><a href="#linking-with-flinkml" id="markdown-toc-linking-with-flinkml">Linking with FlinkML</a></li> - <li><a href="#loading-data" id="markdown-toc-loading-data">Loading data</a></li> - <li><a href="#classification" id="markdown-toc-classification">Classification</a></li> - <li><a href="#data-pre-processing-and-pipelines" id="markdown-toc-data-pre-processing-and-pipelines">Data pre-processing and pipelines</a></li> - <li><a href="#where-to-go-from-here" id="markdown-toc-where-to-go-from-here">Where to go from here</a></li> -</ul> - -<h2 id="introduction">Introduction</h2> - -<p>FlinkML is designed to make learning from your data a straight-forward process, abstracting away -the complexities that usually come with big data learning tasks. 
In this -quick-start guide we will show just how easy it is to solve a simple supervised learning problem -using FlinkML. But first, some basics; feel free to skip the next few lines if you’re already -familiar with Machine Learning (ML).</p> - -<p>As defined by Murphy <a href="#murphy">[1]</a> ML deals with detecting patterns in data, and using those -learned patterns to make predictions about the future. We can categorize most ML algorithms into -two major categories: Supervised and Unsupervised Learning.</p> - -<ul> - <li> - <p><strong>Supervised Learning</strong> deals with learning a function (mapping) from a set of inputs -(features) to a set of outputs. The learning is done using a <em>training set</em> of (input, -output) pairs that we use to approximate the mapping function. Supervised learning problems are -further divided into classification and regression problems. In classification problems we try to -predict the <em>class</em> that an example belongs to, for example whether a user is going to click on -an ad or not. Regression problems, on the other hand, are about predicting (real) numerical -values, often called the dependent variable, for example what the temperature will be tomorrow.</p> - </li> - <li> - <p><strong>Unsupervised Learning</strong> deals with discovering patterns and regularities in the data. An example -of this would be <em>clustering</em>, where we try to discover groupings of the data from the -descriptive features. Unsupervised learning can also be used for feature selection, for example -through <a href="https://en.wikipedia.org/wiki/Principal_component_analysis">principal components analysis</a>.</p> - </li> -</ul> - -<h2 id="linking-with-flinkml">Linking with FlinkML</h2> - -<p>In order to use FlinkML in your project, first you have to -<a href="http://ci.apache.org/projects/flink/flink-docs-master/apis/programming_guide.html#linking-with-flink">set up a Flink program</a>. 
-Next, you have to add the FlinkML dependency to the <code>pom.xml</code> of your project:</p> - -<div class="highlight"><pre><code class="language-xml" data-lang="xml"><span class="nt">&lt;dependency&gt;</span> - <span class="nt">&lt;groupId&gt;</span>org.apache.flink<span class="nt">&lt;/groupId&gt;</span> - <span class="nt">&lt;artifactId&gt;</span>flink-ml<span class="nt">&lt;/artifactId&gt;</span> - <span class="nt">&lt;version&gt;</span>0.9.0<span class="nt">&lt;/version&gt;</span> -<span class="nt">&lt;/dependency&gt;</span></code></pre></div> - -<h2 id="loading-data">Loading data</h2> - -<p>To load data to be used with FlinkML we can use the ETL capabilities of Flink, or specialized -functions for formatted data, such as the LibSVM format. For supervised learning problems it is -common to use the <code>LabeledVector</code> class to represent the <code>(label, features)</code> examples. A <code>LabeledVector</code> -object will have a FlinkML <code>Vector</code> member representing the features of the example and a <code>Double</code> -member which represents the label, which could be the class in a classification problem, or the dependent -variable for a regression problem.</p> - -<p>As an example, we can use Haberman’s Survival Data Set, which you can -<a href="http://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data">download from the UCI ML repository</a>. -This dataset <em>“contains cases from a study conducted on the survival of patients who had undergone -surgery for breast cancer”</em>. The data comes in a comma-separated file, where the first 3 columns -are the features and the last column is the class: this 4th column indicates whether the patient -survived 5 years or longer (label 1), or died within 5 years (label 2). 
You can check the <a href="https://archive.ics.uci.edu/ml/datasets/Haberman%27s+Survival">UCI -page</a> for more information on the data.</p> - -<p>We can load the data as a <code>DataSet[String]</code> first:</p> - -<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.flink.api.scala.ExecutionEnvironment</span> - -<span class="k">val</span> <span class="n">env</span> <span class="k">=</span> <span class="nc">ExecutionEnvironment</span><span class="o">.</span><span class="n">getExecutionEnvironment</span> - -<span class="k">val</span> <span class="n">survival</span> <span class="k">=</span> <span class="n">env</span><span class="o">.</span><span class="n">readCsvFile</span><span class="o">[(</span><span class="kt">String</span>, <span class="kt">String</span>, <span class="kt">String</span>, <span class="kt">String</span><span class="o">)](</span><span class="s">"/path/to/haberman.data"</span><span class="o">)</span></code></pre></div> - -<p>We can now transform the data into a <code>DataSet[LabeledVector]</code>. This will allow us to use the -dataset with the FlinkML classification algorithms. 
We know that the 4th element of the dataset -is the class label, and the rest are features, so we can build <code>LabeledVector</code> elements like this:</p> - -<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.flink.ml.common.LabeledVector</span> -<span class="k">import</span> <span class="nn">org.apache.flink.ml.math.DenseVector</span> - -<span class="k">val</span> <span class="n">survivalLV</span> <span class="k">=</span> <span class="n">survival</span> - <span class="o">.</span><span class="n">map</span><span class="o">{</span><span class="n">tuple</span> <span class="k">=></span> - <span class="k">val</span> <span class="n">list</span> <span class="k">=</span> <span class="n">tuple</span><span class="o">.</span><span class="n">productIterator</span><span class="o">.</span><span class="n">toList</span> - <span class="k">val</span> <span class="n">numList</span> <span class="k">=</span> <span class="n">list</span><span class="o">.</span><span class="n">map</span><span class="o">(</span><span class="k">_</span><span class="o">.</span><span class="n">asInstanceOf</span><span class="o">[</span><span class="kt">String</span><span class="o">].</span><span class="n">toDouble</span><span class="o">)</span> - <span class="nc">LabeledVector</span><span class="o">(</span><span class="n">numList</span><span class="o">(</span><span class="mi">3</span><span class="o">),</span> <span class="nc">DenseVector</span><span class="o">(</span><span class="n">numList</span><span class="o">.</span><span class="n">take</span><span class="o">(</span><span class="mi">3</span><span class="o">).</span><span class="n">toArray</span><span class="o">))</span> - <span class="o">}</span></code></pre></div> - -<p>We can then use this data to train a learner. 
We will, however, use another dataset to exemplify -building a learner; that will allow us to show how we can import other dataset formats.</p> - -<p><strong>LibSVM files</strong></p> - -<p>A common format for ML datasets is the LibSVM format and a number of datasets using that format can be -found <a href="http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/">on the LibSVM datasets website</a>. FlinkML provides utilities for loading -datasets using the LibSVM format through the <code>readLibSVM</code> function available through the <code>MLUtils</code> -object. -You can also save datasets in the LibSVM format using the <code>writeLibSVM</code> function. -Let’s import the svmguide1 dataset. You can download the -<a href="http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/svmguide1">training set here</a> -and the <a href="http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary/svmguide1.t">test set here</a>. -This is an astroparticle binary classification dataset, used by Hsu et al. <a href="#hsu">[3]</a> in their -practical Support Vector Machine (SVM) guide. 
It contains 4 numerical features, and the class label.</p> - -<p>We can then simply import the dataset using:</p> - -<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.flink.ml.MLUtils</span> - -<span class="k">val</span> <span class="n">astroTrain</span><span class="k">:</span> <span class="kt">DataSet</span><span class="o">[</span><span class="kt">LabeledVector</span><span class="o">]</span> <span class="k">=</span> <span class="nc">MLUtils</span><span class="o">.</span><span class="n">readLibSVM</span><span class="o">(</span><span class="s">"/path/to/svmguide1"</span><span class="o">)</span> -<span class="k">val</span> <span class="n">astroTest</span><span class="k">:</span> <span class="kt">DataSet</span><span class="o">[</span><span class="kt">LabeledVector</span><span class="o">]</span> <span class="k">=</span> <span class="nc">MLUtils</span><span class="o">.</span><span class="n">readLibSVM</span><span class="o">(</span><span class="s">"/path/to/svmguide1.t"</span><span class="o">)</span></code></pre></div> - -<p>This gives us two <code>DataSet[LabeledVector]</code> objects that we will use in the following section to -create a classifier.</p> - -<h2 id="classification">Classification</h2> - -<p>Once we have imported the dataset we can train a <code>Predictor</code> such as a linear SVM classifier. -We can set a number of parameters for the classifier. Here we set the <code>Blocks</code> parameter, -which is used by the underlying CoCoA algorithm <a href="#jaggi">[2]</a> to split the input. The -regularization parameter determines the amount of $l_2$ regularization applied, which is used -to avoid overfitting. The step size determines the contribution of the weight vector updates to -the next weight vector value. 
This parameter sets the initial step size.</p> - -<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.flink.ml.classification.SVM</span> - -<span class="k">val</span> <span class="n">svm</span> <span class="k">=</span> <span class="nc">SVM</span><span class="o">()</span> - <span class="o">.</span><span class="n">setBlocks</span><span class="o">(</span><span class="n">env</span><span class="o">.</span><span class="n">getParallelism</span><span class="o">)</span> - <span class="o">.</span><span class="n">setIterations</span><span class="o">(</span><span class="mi">100</span><span class="o">)</span> - <span class="o">.</span><span class="n">setRegularization</span><span class="o">(</span><span class="mf">0.001</span><span class="o">)</span> - <span class="o">.</span><span class="n">setStepsize</span><span class="o">(</span><span class="mf">0.1</span><span class="o">)</span> - <span class="o">.</span><span class="n">setSeed</span><span class="o">(</span><span class="mi">42</span><span class="o">)</span> - -<span class="n">svm</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">astroTrain</span><span class="o">)</span></code></pre></div> - -<p>We can now make predictions on the test set.</p> - -<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">val</span> <span class="n">predictionPairs</span> <span class="k">=</span> <span class="n">svm</span><span class="o">.</span><span class="n">predict</span><span class="o">(</span><span class="n">astroTest</span><span class="o">)</span></code></pre></div> - -<p>Next we will see how we can pre-process our data, and use the ML pipelines capabilities of FlinkML.</p> - -<h2 id="data-pre-processing-and-pipelines">Data pre-processing and pipelines</h2> - -<p>A pre-processing step that is often encouraged <a href="#hsu">[3]</a> when using SVM classification is scaling -the 
input features to the [0, 1] range, in order to avoid features with extreme values -dominating the rest. -FlinkML has a number of <code>Transformers</code> such as <code>MinMaxScaler</code> that are used to pre-process data, -and a key feature is the ability to chain <code>Transformers</code> and <code>Predictors</code> together. This allows -us to run the same pipeline of transformations and make predictions on the train and test data in -a straight-forward and type-safe manner. You can read more on the pipeline system of FlinkML -<a href="pipelines.html">in the pipelines documentation</a>.</p> - -<p>Let us first create a normalizing transformer for the features in our dataset, and chain it to a -new SVM classifier.</p> - -<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="k">import</span> <span class="nn">org.apache.flink.ml.preprocessing.MinMaxScaler</span> - -<span class="k">val</span> <span class="n">scaler</span> <span class="k">=</span> <span class="nc">MinMaxScaler</span><span class="o">()</span> - -<span class="k">val</span> <span class="n">scaledSVM</span> <span class="k">=</span> <span class="n">scaler</span><span class="o">.</span><span class="n">chainPredictor</span><span class="o">(</span><span class="n">svm</span><span class="o">)</span></code></pre></div> - -<p>We can now use our newly created pipeline to make predictions on the test set. -First we call fit again, to train the scaler and the SVM classifier. 
-The data of the test set will then be automatically scaled before being passed on to the SVM to -make predictions.</p> - -<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="n">scaledSVM</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">astroTrain</span><span class="o">)</span> - -<span class="k">val</span> <span class="n">predictionPairsScaled</span><span class="k">:</span> <span class="kt">DataSet</span><span class="o">[(</span><span class="kt">Double</span>, <span class="kt">Double</span><span class="o">)]</span> <span class="k">=</span> <span class="n">scaledSVM</span><span class="o">.</span><span class="n">predict</span><span class="o">(</span><span class="n">astroTest</span><span class="o">)</span></code></pre></div> - -<p>The scaled inputs should give us better prediction performance. -The result of the prediction on <code>LabeledVector</code>s is a data set of tuples where the first entry denotes the true label value and the second entry is the predicted label value.</p> - -<h2 id="where-to-go-from-here">Where to go from here</h2> - -<p>This quickstart guide can act as an introduction to the basic concepts of FlinkML, but there’s a lot -more you can do. -We recommend going through the <a href="index.html">FlinkML documentation</a>, and trying out the different -algorithms. -A very good way to get started is to play around with interesting datasets from the UCI ML -repository and the LibSVM datasets. -Tackling an interesting problem from a website like <a href="https://www.kaggle.com">Kaggle</a> or -<a href="http://www.drivendata.org/">DrivenData</a> is also a great way to learn by competing with other -data scientists. -If you would like to contribute some new algorithms, take a look at our -<a href="contribution_guide.html">contribution guide</a>.</p> - -<p><strong>References</strong></p> - -<p><a name="murphy"></a>[1] Murphy, Kevin P. 
<em>Machine learning: a probabilistic perspective.</em> MIT -press, 2012.</p> - -<p><a name="jaggi"></a>[2] Jaggi, Martin, et al. <em>Communication-efficient distributed dual -coordinate ascent.</em> Advances in Neural Information Processing Systems. 2014.</p> - -<p><a name="hsu"></a>[3] Hsu, Chih-Wei, Chih-Chung Chang, and Chih-Jen Lin. - <em>A practical guide to support vector classification.</em> 2003.</p> - - </div> - - <div class="col-sm-10 col-sm-offset-1"> - <!-- Disqus thread and some vertical offset --> - <div style="margin-top: 75px; margin-bottom: 50px" id="disqus_thread"></div> - </div> -</div> - - </div><!-- /.container --> - - <!-- jQuery (necessary for Bootstrap's JavaScript plugins) --> - <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.2/jquery.min.js"></script> - <!-- Include all compiled plugins (below), or include individual files as needed --> - <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/js/bootstrap.min.js"></script> - <script src="http://flink.apache.org/docs/0.9/page/js/codetabs.js"></script> - - <!-- Google Analytics --> - <script> - (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ - (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), - m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) - })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); - - ga('create', 'UA-52545728-1', 'auto'); - ga('send', 'pageview'); - </script> - - <!-- Disqus --> - <script type="text/javascript"> - var disqus_shortname = 'stratosphere-eu'; - (function() { - var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true; - dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js'; - (document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq); - })(); -</script> - </body> -</html>
http://git-wip-us.apache.org/repos/asf/flink-web/blob/f0ac0cdb/content/docs/0.9/libs/ml/standard_scaler.html ---------------------------------------------------------------------- diff --git a/content/docs/0.9/libs/ml/standard_scaler.html b/content/docs/0.9/libs/ml/standard_scaler.html deleted file mode 100644 index 9f84bfa..0000000 --- a/content/docs/0.9/libs/ml/standard_scaler.html +++ /dev/null @@ -1,340 +0,0 @@ -<!-- -Licensed to the Apache Software Foundation (ASF) under one -or more contributor license agreements. See the NOTICE file -distributed with this work for additional information -regarding copyright ownership. The ASF licenses this file -to you under the Apache License, Version 2.0 (the -"License"); you may not use this file except in compliance -with the License. You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, -software distributed under the License is distributed on an -"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -KIND, either express or implied. See the License for the -specific language governing permissions and limitations -under the License. 
---> -<!DOCTYPE html> - -<html lang="en"> - <head> - <meta charset="utf-8"> - <meta http-equiv="X-UA-Compatible" content="IE=edge"> - <meta name="viewport" content="width=device-width, initial-scale=1"> - <!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags --> - - <title>Apache Flink 0.9.0 Documentation: FlinkML - Standard Scaler</title> - - <link rel="shortcut icon" href="http://flink.apache.org/docs/0.9/page/favicon.ico" type="image/x-icon"> - <link rel="icon" href="http://flink.apache.org/docs/0.9/page/favicon.ico" type="image/x-icon"> - - <!-- Bootstrap --> - <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/css/bootstrap.min.css"> - <link rel="stylesheet" href="http://flink.apache.org/docs/0.9/page/css/flink.css"> - <link rel="stylesheet" href="http://flink.apache.org/docs/0.9/page/css/syntax.css"> - <link rel="stylesheet" href="http://flink.apache.org/docs/0.9/page/css/codetabs.css"> - - <script type="text/x-mathjax-config"> - MathJax.Hub.Config({ - tex2jax: { - inlineMath: [['$','$'], ['\\(','\\)']] }, - TeX: { - equationNumbers: { autoNumber: "AMS" } } - }); - </script> - <script type="text/javascript" - src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"> - </script> - - <!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries --> - <!-- WARNING: Respond.js doesn't work if you view the page via file:// --> - <!--[if lt IE 9]> - <script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script> - <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script> - <![endif]--> - </head> - <body> - - - - - - - <!-- Top navbar. --> - <nav class="navbar navbar-default navbar-fixed-top"> - <div class="container"> - <!-- The logo. 
--> - <div class="navbar-header"> - <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse-1"> - <span class="icon-bar"></span> - <span class="icon-bar"></span> - <span class="icon-bar"></span> - </button> - <div class="navbar-logo"> - <a href="http://flink.apache.org"><img alt="Apache Flink" src="http://flink.apache.org/docs/0.9/page/img/navbar-brand-logo.jpg"></a> - </div> - </div><!-- /.navbar-header --> - - <!-- The navigation links. --> - <div class="collapse navbar-collapse" id="bs-example-navbar-collapse-1"> - <ul class="nav navbar-nav"> - <li><a href="http://flink.apache.org/docs/0.9/index.html">Overview<span class="hidden-sm hidden-xs"> 0.9.0</span></a></li> - - <!-- Setup --> - <li class="dropdown"> - <a href="http://flink.apache.org/docs/0.9/setup" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Setup <span class="caret"></span></a> - <ul class="dropdown-menu" role="menu"> - <li><a href="http://flink.apache.org/docs/0.9/setup/building.html">Get Flink 0.9-SNAPSHOT</a></li> - - <li class="divider"></li> - <li role="presentation" class="dropdown-header"><strong>Deployment</strong></li> - <li><a href="http://flink.apache.org/docs/0.9/setup/local_setup.html" class="active">Local</a></li> - <li><a href="http://flink.apache.org/docs/0.9/setup/cluster_setup.html">Cluster (Standalone)</a></li> - <li><a href="http://flink.apache.org/docs/0.9/setup/yarn_setup.html">YARN</a></li> - <li><a href="http://flink.apache.org/docs/0.9/setup/gce_setup.html">GCloud</a></li> - <li><a href="http://flink.apache.org/docs/0.9/setup/flink_on_tez.html">Flink on Tez <span class="badge">Beta</span></a></li> - - <li class="divider"></li> - <li><a href="http://flink.apache.org/docs/0.9/setup/config.html">Configuration</a></li> - </ul> - </li> - - <!-- Programming Guides --> - <li class="dropdown"> - <a href="http://flink.apache.org/docs/0.9/apis" class="dropdown-toggle" 
data-toggle="dropdown" role="button" aria-expanded="false">Programming Guides <span class="caret"></span></a> - <ul class="dropdown-menu" role="menu"> - <li><a href="http://flink.apache.org/docs/0.9/apis/programming_guide.html"><strong>Batch: DataSet API</strong></a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/streaming_guide.html"><strong>Streaming: DataStream API</strong> <span class="badge">Beta</span></a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/python.html">Python API <span class="badge">Beta</span></a></li> - - <li class="divider"></li> - <li><a href="scala_shell.html">Interactive Scala Shell</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/dataset_transformations.html">Dataset Transformations</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/best_practices.html">Best Practices</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/example_connectors.html">Connectors</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/examples.html">Examples</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/local_execution.html">Local Execution</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/cluster_execution.html">Cluster Execution</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/cli.html">Command Line Interface</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/web_client.html">Web Client</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/iterations.html">Iterations</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/java8.html">Java 8</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/hadoop_compatibility.html">Hadoop Compatibility <span class="badge">Beta</span></a></li> - </ul> - </li> - - <!-- Libraries --> - <li class="dropdown"> - <a href="http://flink.apache.org/docs/0.9/libs" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Libraries <span class="caret"></span></a> - <ul
class="dropdown-menu" role="menu"> - <li><a href="http://flink.apache.org/docs/0.9/libs/spargel_guide.html">Graphs: Spargel</a></li> - <li><a href="http://flink.apache.org/docs/0.9/libs/gelly_guide.html">Graphs: Gelly <span class="badge">Beta</span></a></li> - <li><a href="http://flink.apache.org/docs/0.9/libs/ml/">Machine Learning <span class="badge">Beta</span></a></li> - <li><a href="http://flink.apache.org/docs/0.9/libs/table.html">Relational: Table <span class="badge">Beta</span></a></li> - </ul> - </li> - - <!-- Internals --> - <li class="dropdown"> - <a href="http://flink.apache.org/docs/0.9/internals" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Internals <span class="caret"></span></a> - <ul class="dropdown-menu" role="menu"> - <li role="presentation" class="dropdown-header"><strong>Contribute</strong></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/how_to_contribute.html">How to Contribute</a></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/coding_guidelines.html">Coding Guidelines</a></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/ide_setup.html">IDE Setup</a></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/logging.html">Logging</a></li> - <li class="divider"></li> - <li role="presentation" class="dropdown-header"><strong>Internals</strong></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/general_arch.html">Architecture & Process Model</a></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/types_serialization.html">Type Extraction & Serialization</a></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/job_scheduling.html">Jobs & Scheduling</a></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/add_operator.html">How-To: Add an Operator</a></li> - </ul> - </li> - </ul> - <form class="navbar-form navbar-right hidden-sm hidden-md" role="search" action="http://flink.apache.org/docs/0.9/search-results.html"> - 
<div class="form-group"> - <input type="text" class="form-control" name="q" placeholder="Search all pages"> - </div> - <button type="submit" class="btn btn-default">Search</button> - </form> - </div><!-- /.navbar-collapse --> - </div><!-- /.container --> - </nav> - - - - - -<!--Some of the Latex math notation has been adapted from Apache Spark MLlib's documentation--> -$$ -\newcommand{\R}{\mathbb{R}} -\newcommand{\E}{\mathbb{E}} -\newcommand{\x}{\mathbf{x}} -\newcommand{\y}{\mathbf{y}} -\newcommand{\wv}{\mathbf{w}} -\newcommand{\av}{\mathbf{\alpha}} -\newcommand{\bv}{\mathbf{b}} -\newcommand{\N}{\mathbb{N}} -\newcommand{\id}{\mathbf{I}} -\newcommand{\ind}{\mathbf{1}} -\newcommand{\0}{\mathbf{0}} -\newcommand{\unit}{\mathbf{e}} -\newcommand{\one}{\mathbf{1}} -\newcommand{\zero}{\mathbf{0}} -\newcommand\rfrac[2]{^{#1}\!/_{#2}} -\newcommand{\norm}[1]{\left\lVert#1\right\rVert} -$$ - - - <!-- Main content. --> - <div class="container"> - - -<div class="row"> - <div class="col-sm-10 col-sm-offset-1"> - <h1><a href="../ml">FlinkML</a> - Standard Scaler</h1> - - - -<ul id="markdown-toc"> - <li><a href="#description" id="markdown-toc-description">Description</a></li> - <li><a href="#operations" id="markdown-toc-operations">Operations</a> <ul> - <li><a href="#fit" id="markdown-toc-fit">Fit</a></li> - <li><a href="#transform" id="markdown-toc-transform">Transform</a></li> - </ul> - </li> - <li><a href="#parameters" id="markdown-toc-parameters">Parameters</a></li> - <li><a href="#examples" id="markdown-toc-examples">Examples</a></li> -</ul> - -<h2 id="description">Description</h2> - -<p>The standard scaler scales the given data set, so that all features will have a user specified mean and variance. - In case the user does not provide a specific mean and standard deviation, the standard scaler transforms the features of the input data set to have mean equal to 0 and standard deviation equal to 1. 
- Given a set of input data $x_1, x_2, \ldots, x_n$, with mean:</p> - -<script type="math/tex; mode=display">\bar{x} = \frac{1}{n}\sum_{i=1}^{n}x_{i}</script> - -<p>and standard deviation:</p> - -<script type="math/tex; mode=display">\sigma_{x}=\sqrt{ \frac{1}{n} \sum_{i=1}^{n}(x_{i}-\bar{x})^{2}}</script> - -<p>The scaled data set $z_1, z_2, \ldots, z_n$ will be:</p> - -<script type="math/tex; mode=display">z_{i}= std \left (\frac{x_{i} - \bar{x} }{\sigma_{x}}\right ) + mean</script> - -<p>where $\textit{std}$ and $\textit{mean}$ are the user-specified values for the standard deviation and mean.</p> - -<h2 id="operations">Operations</h2> - -<p><code>StandardScaler</code> is a <code>Transformer</code>. -As such, it supports the <code>fit</code> and <code>transform</code> operations.</p> - -<h3 id="fit">Fit</h3> - -<p>StandardScaler is trained on all subtypes of <code>Vector</code> or <code>LabeledVector</code>:</p> - -<ul> - <li><code>fit[T <: Vector]: DataSet[T] => Unit</code></li> - <li><code>fit: DataSet[LabeledVector] => Unit</code></li> -</ul> - -<h3 id="transform">Transform</h3> - -<p>StandardScaler transforms all subtypes of <code>Vector</code> or <code>LabeledVector</code> into the respective type:</p> - -<ul> - <li><code>transform[T <: Vector]: DataSet[T] => DataSet[T]</code></li> - <li><code>transform: DataSet[LabeledVector] => DataSet[LabeledVector]</code></li> -</ul> - -<h2 id="parameters">Parameters</h2> - -<p>The standard scaler implementation can be controlled by the following two parameters:</p> - -<table class="table table-bordered"> - <thead> - <tr> - <th class="text-left" style="width: 20%">Parameters</th> - <th class="text-center">Description</th> - </tr> - </thead> - - <tbody> - <tr> - <td><strong>Mean</strong></td> - <td> - <p> - The mean of the scaled data set. (Default value: <strong>0.0</strong>) - </p> - </td> - </tr> - <tr> - <td><strong>Std</strong></td> - <td> - <p> - The standard deviation of the scaled data set.
(Default value: <strong>1.0</strong>) - </p> - </td> - </tr> - </tbody> -</table> - -<h2 id="examples">Examples</h2> - -<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="c1">// Create standard scaler transformer</span> -<span class="k">val</span> <span class="n">scaler</span> <span class="k">=</span> <span class="nc">StandardScaler</span><span class="o">()</span> -<span class="o">.</span><span class="n">setMean</span><span class="o">(</span><span class="mf">10.0</span><span class="o">)</span> -<span class="o">.</span><span class="n">setStd</span><span class="o">(</span><span class="mf">2.0</span><span class="o">)</span> - -<span class="c1">// Obtain data set to be scaled</span> -<span class="k">val</span> <span class="n">dataSet</span><span class="k">:</span> <span class="kt">DataSet</span><span class="o">[</span><span class="kt">Vector</span><span class="o">]</span> <span class="k">=</span> <span class="o">...</span> - -<span class="c1">// Learn the mean and standard deviation of the training data</span> -<span class="n">scaler</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">dataSet</span><span class="o">)</span> - -<span class="c1">// Scale the provided data set to have mean=10.0 and std=2.0</span> -<span class="k">val</span> <span class="n">scaledDS</span> <span class="k">=</span> <span class="n">scaler</span><span class="o">.</span><span class="n">transform</span><span class="o">(</span><span class="n">dataSet</span><span class="o">)</span></code></pre></div> - - - </div> - - <div class="col-sm-10 col-sm-offset-1"> - <!-- Disqus thread and some vertical offset --> - <div style="margin-top: 75px; margin-bottom: 50px" id="disqus_thread"></div> - </div> -</div> - - </div><!-- /.container --> - - <!-- jQuery (necessary for Bootstrap's JavaScript plugins) --> - <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.2/jquery.min.js"></script> - <!-- Include all compiled 
plugins (below), or include individual files as needed --> - <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/js/bootstrap.min.js"></script> - <script src="http://flink.apache.org/docs/0.9/page/js/codetabs.js"></script> - - <!-- Google Analytics --> - <script> - (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ - (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), - m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) - })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); - - ga('create', 'UA-52545728-1', 'auto'); - ga('send', 'pageview'); - </script> - - <!-- Disqus --> - <script type="text/javascript"> - var disqus_shortname = 'stratosphere-eu'; - (function() { - var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true; - dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js'; - (document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq); - })(); -</script> - </body> -</html> http://git-wip-us.apache.org/repos/asf/flink-web/blob/f0ac0cdb/content/docs/0.9/libs/ml/svm.html ---------------------------------------------------------------------- diff --git a/content/docs/0.9/libs/ml/svm.html b/content/docs/0.9/libs/ml/svm.html deleted file mode 100644 index 00184c4..0000000 --- a/content/docs/0.9/libs/ml/svm.html +++ /dev/null @@ -1,419 +0,0 @@ -<!-- -Licensed to the Apache Software Foundation (ASF) under one -or more contributor license agreements. See the NOTICE file -distributed with this work for additional information -regarding copyright ownership. The ASF licenses this file -to you under the Apache License, Version 2.0 (the -"License"); you may not use this file except in compliance -with the License. 
You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, -software distributed under the License is distributed on an -"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -KIND, either express or implied. See the License for the -specific language governing permissions and limitations -under the License. ---> -<!DOCTYPE html> - -<html lang="en"> - <head> - <meta charset="utf-8"> - <meta http-equiv="X-UA-Compatible" content="IE=edge"> - <meta name="viewport" content="width=device-width, initial-scale=1"> - <!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags --> - - <title>Apache Flink 0.9.0 Documentation: FlinkML - SVM using CoCoA</title> - - <link rel="shortcut icon" href="http://flink.apache.org/docs/0.9/page/favicon.ico" type="image/x-icon"> - <link rel="icon" href="http://flink.apache.org/docs/0.9/page/favicon.ico" type="image/x-icon"> - - <!-- Bootstrap --> - <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/css/bootstrap.min.css"> - <link rel="stylesheet" href="http://flink.apache.org/docs/0.9/page/css/flink.css"> - <link rel="stylesheet" href="http://flink.apache.org/docs/0.9/page/css/syntax.css"> - <link rel="stylesheet" href="http://flink.apache.org/docs/0.9/page/css/codetabs.css"> - - <script type="text/x-mathjax-config"> - MathJax.Hub.Config({ - tex2jax: { - inlineMath: [['$','$'], ['\\(','\\)']] }, - TeX: { - equationNumbers: { autoNumber: "AMS" } } - }); - </script> - <script type="text/javascript" - src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"> - </script> - - <!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries --> - <!-- WARNING: Respond.js doesn't work if you view the page via file:// --> - <!--[if lt IE 9]> - <script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script> - <script 
src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script> - <![endif]--> - </head> - <body> - - - - - - - <!-- Top navbar. --> - <nav class="navbar navbar-default navbar-fixed-top"> - <div class="container"> - <!-- The logo. --> - <div class="navbar-header"> - <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse-1"> - <span class="icon-bar"></span> - <span class="icon-bar"></span> - <span class="icon-bar"></span> - </button> - <div class="navbar-logo"> - <a href="http://flink.apache.org"><img alt="Apache Flink" src="http://flink.apache.org/docs/0.9/page/img/navbar-brand-logo.jpg"></a> - </div> - </div><!-- /.navbar-header --> - - <!-- The navigation links. --> - <div class="collapse navbar-collapse" id="bs-example-navbar-collapse-1"> - <ul class="nav navbar-nav"> - <li><a href="http://flink.apache.org/docs/0.9/index.html">Overview<span class="hidden-sm hidden-xs"> 0.9.0</span></a></li> - - <!-- Setup --> - <li class="dropdown"> - <a href="http://flink.apache.org/docs/0.9/setup" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Setup <span class="caret"></span></a> - <ul class="dropdown-menu" role="menu"> - <li><a href="http://flink.apache.org/docs/0.9/setup/building.html">Get Flink 0.9-SNAPSHOT</a></li> - - <li class="divider"></li> - <li role="presentation" class="dropdown-header"><strong>Deployment</strong></li> - <li><a href="http://flink.apache.org/docs/0.9/setup/local_setup.html" class="active">Local</a></li> - <li><a href="http://flink.apache.org/docs/0.9/setup/cluster_setup.html">Cluster (Standalone)</a></li> - <li><a href="http://flink.apache.org/docs/0.9/setup/yarn_setup.html">YARN</a></li> - <li><a href="http://flink.apache.org/docs/0.9/setup/gce_setup.html">GCloud</a></li> - <li><a href="http://flink.apache.org/docs/0.9/setup/flink_on_tez.html">Flink on Tez <span class="badge">Beta</span></a></li> - - <li class="divider"></li> - <li><a 
href="http://flink.apache.org/docs/0.9/setup/config.html">Configuration</a></li> - </ul> - </li> - - <!-- Programming Guides --> - <li class="dropdown"> - <a href="http://flink.apache.org/docs/0.9/apis" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Programming Guides <span class="caret"></span></a> - <ul class="dropdown-menu" role="menu"> - <li><a href="http://flink.apache.org/docs/0.9/apis/programming_guide.html"><strong>Batch: DataSet API</strong></a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/streaming_guide.html"><strong>Streaming: DataStream API</strong> <span class="badge">Beta</span></a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/python.html">Python API <span class="badge">Beta</span></a></li> - - <li class="divider"></li> - <li><a href="scala_shell.html">Interactive Scala Shell</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/dataset_transformations.html">Dataset Transformations</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/best_practices.html">Best Practices</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/example_connectors.html">Connectors</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/examples.html">Examples</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/local_execution.html">Local Execution</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/cluster_execution.html">Cluster Execution</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/cli.html">Command Line Interface</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/web_client.html">Web Client</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/iterations.html">Iterations</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/java8.html">Java 8</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/hadoop_compatibility.html">Hadoop Compatibility <span class="badge">Beta</span></a></li> - </ul> - </li> - - <!--
Libraries --> - <li class="dropdown"> - <a href="http://flink.apache.org/docs/0.9/libs" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Libraries <span class="caret"></span></a> - <ul class="dropdown-menu" role="menu"> - <li><a href="http://flink.apache.org/docs/0.9/libs/spargel_guide.html">Graphs: Spargel</a></li> - <li><a href="http://flink.apache.org/docs/0.9/libs/gelly_guide.html">Graphs: Gelly <span class="badge">Beta</span></a></li> - <li><a href="http://flink.apache.org/docs/0.9/libs/ml/">Machine Learning <span class="badge">Beta</span></a></li> - <li><a href="http://flink.apache.org/docs/0.9/libs/table.html">Relational: Table <span class="badge">Beta</span></a></li> - </ul> - </li> - - <!-- Internals --> - <li class="dropdown"> - <a href="http://flink.apache.org/docs/0.9/internals" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Internals <span class="caret"></span></a> - <ul class="dropdown-menu" role="menu"> - <li role="presentation" class="dropdown-header"><strong>Contribute</strong></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/how_to_contribute.html">How to Contribute</a></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/coding_guidelines.html">Coding Guidelines</a></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/ide_setup.html">IDE Setup</a></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/logging.html">Logging</a></li> - <li class="divider"></li> - <li role="presentation" class="dropdown-header"><strong>Internals</strong></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/general_arch.html">Architecture & Process Model</a></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/types_serialization.html">Type Extraction & Serialization</a></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/job_scheduling.html">Jobs & Scheduling</a></li> - <li><a 
href="http://flink.apache.org/docs/0.9/internals/add_operator.html">How-To: Add an Operator</a></li> - </ul> - </li> - </ul> - <form class="navbar-form navbar-right hidden-sm hidden-md" role="search" action="http://flink.apache.org/docs/0.9/search-results.html"> - <div class="form-group"> - <input type="text" class="form-control" name="q" placeholder="Search all pages"> - </div> - <button type="submit" class="btn btn-default">Search</button> - </form> - </div><!-- /.navbar-collapse --> - </div><!-- /.container --> - </nav> - - - - - -<!--Some of the Latex math notation has been adapted from Apache Spark MLlib's documentation--> -$$ -\newcommand{\R}{\mathbb{R}} -\newcommand{\E}{\mathbb{E}} -\newcommand{\x}{\mathbf{x}} -\newcommand{\y}{\mathbf{y}} -\newcommand{\wv}{\mathbf{w}} -\newcommand{\av}{\mathbf{\alpha}} -\newcommand{\bv}{\mathbf{b}} -\newcommand{\N}{\mathbb{N}} -\newcommand{\id}{\mathbf{I}} -\newcommand{\ind}{\mathbf{1}} -\newcommand{\0}{\mathbf{0}} -\newcommand{\unit}{\mathbf{e}} -\newcommand{\one}{\mathbf{1}} -\newcommand{\zero}{\mathbf{0}} -\newcommand\rfrac[2]{^{#1}\!/_{#2}} -\newcommand{\norm}[1]{\left\lVert#1\right\rVert} -$$ - - - <!-- Main content. 
--> - <div class="container"> - - -<div class="row"> - <div class="col-sm-10 col-sm-offset-1"> - <h1><a href="../ml">FlinkML</a> - SVM using CoCoA</h1> - - - -<ul id="markdown-toc"> - <li><a href="#description" id="markdown-toc-description">Description</a></li> - <li><a href="#operations" id="markdown-toc-operations">Operations</a> <ul> - <li><a href="#fit" id="markdown-toc-fit">Fit</a></li> - <li><a href="#predict" id="markdown-toc-predict">Predict</a></li> - </ul> - </li> - <li><a href="#parameters" id="markdown-toc-parameters">Parameters</a></li> - <li><a href="#examples" id="markdown-toc-examples">Examples</a></li> -</ul> - -<h2 id="description">Description</h2> - -<p>Implements an SVM with soft-margin using the communication-efficient distributed dual coordinate -ascent algorithm with hinge-loss function. -The algorithm solves the following minimization problem:</p> - -<script type="math/tex; mode=display">\min_{\mathbf{w} \in \mathbb{R}^d} \frac{\lambda}{2} \left\lVert \mathbf{w} \right\rVert^2 + \frac{1}{n} \sum_{i=1}^n l_{i}\left(\mathbf{w}^T\mathbf{x}_i\right)</script> - -<p>with $\mathbf{w}$ being the weight vector, $\lambda$ being the regularization constant, -<script type="math/tex">\mathbf{x}_i \in \mathbb{R}^d</script> being the data points and <script type="math/tex">l_{i}</script> being the convex loss -functions, which can also depend on the labels <script type="math/tex">y_{i} \in \mathbb{R}</script>. -In the current implementation the regularizer is the $\ell_2$-norm and the loss functions are the hinge-loss functions:</p> - -<script type="math/tex; mode=display">l_{i} = \max\left(0, 1 - y_{i} \mathbf{w}^T\mathbf{x}_i \right)</script> - -<p>With these choices, the problem definition is equivalent to a SVM with soft-margin. -Thus, the algorithm allows us to train a SVM with soft-margin.</p> - -<p>The minimization problem is solved by applying stochastic dual coordinate ascent (SDCA). 
-In order to make the algorithm efficient in a distributed setting, the CoCoA algorithm calculates -several iterations of SDCA locally on a data block before merging the local updates into a -valid global state. -This state is redistributed to the different data partitions where the next round of local SDCA -iterations is then executed. -The number of outer iterations and local SDCA iterations control the overall network costs, because -there is only network communication required for each outer iteration. -The local SDCA iterations are embarrassingly parallel once the individual data partitions have been -distributed across the cluster.</p> - -<p>The implementation of this algorithm is based on the work of -<a href="http://arxiv.org/abs/1409.1458">Jaggi et al.</a></p> - -<h2 id="operations">Operations</h2> - -<p><code>SVM</code> is a <code>Predictor</code>. -As such, it supports the <code>fit</code> and <code>predict</code> operation.</p> - -<h3 id="fit">Fit</h3> - -<p>SVM is trained given a set of <code>LabeledVector</code>:</p> - -<ul> - <li><code>fit: DataSet[LabeledVector] => Unit</code></li> -</ul> - -<h3 id="predict">Predict</h3> - -<p>SVM predicts for all subtypes of <code>Vector</code> the corresponding class label:</p> - -<ul> - <li><code>predict[T <: Vector]: DataSet[T] => DataSet[LabeledVector]</code></li> -</ul> - -<p>If we call predict with a <code>DataSet[LabeledVector]</code>, we make a prediction on the class label -for each example, and return a <code>DataSet[(Double, Double)]</code>. In each tuple the first element -is the true value, as was provided from the input <code>DataSet[LabeledVector]</code> and the second element -is the predicted value. 
You can then use these <code>(truth, prediction)</code> tuples to evaluate -the algorithm's performance.</p> - -<ul> - <li><code>predict: DataSet[LabeledVector] => DataSet[(Double, Double)]</code></li> -</ul> - -<h2 id="parameters">Parameters</h2> - -<p>The SVM implementation can be controlled by the following parameters:</p> - -<table class="table table-bordered"> - <thead> - <tr> - <th class="text-left" style="width: 20%">Parameters</th> - <th class="text-center">Description</th> - </tr> - </thead> - - <tbody> - <tr> - <td><strong>Blocks</strong></td> - <td> - <p> - Sets the number of blocks into which the input data will be split. - On each block the local stochastic dual coordinate ascent method is executed. - This number should be set to at least the degree of parallelism. - If no value is specified, then the parallelism of the input DataSet is used as the number of blocks. - (Default value: <strong>None</strong>) - </p> - </td> - </tr> - <tr> - <td><strong>Iterations</strong></td> - <td> - <p> - Defines the maximum number of iterations of the outer loop method. - In other words, it defines how often the SDCA method is applied to the blocked data. - After each iteration, the locally computed weight vector updates have to be reduced to update the global weight vector value. - The new weight vector is broadcast to all SDCA tasks at the beginning of each iteration. - (Default value: <strong>10</strong>) - </p> - </td> - </tr> - <tr> - <td><strong>LocalIterations</strong></td> - <td> - <p> - Defines the maximum number of SDCA iterations. - In other words, it defines how many data points are drawn from each local data block to calculate the stochastic dual coordinate ascent. - (Default value: <strong>10</strong>) - </p> - </td> - </tr> - <tr> - <td><strong>Regularization</strong></td> - <td> - <p> - Defines the regularization constant of the SVM algorithm. - The higher the value, the smaller the 2-norm of the weight vector will be.
- In case of a SVM with hinge loss this means that the SVM margin will be wider even though it might contain some false classifications. - (Default value: <strong>1.0</strong>) - </p> - </td> - </tr> - <tr> - <td><strong>Stepsize</strong></td> - <td> - <p> - Defines the initial step size for the updates of the weight vector. - The larger the step size is, the larger will be the contribution of the weight vector updates to the next weight vector value. - The effective scaling of the updates is $\frac{stepsize}{blocks}$. - This value has to be tuned in case that the algorithm becomes unstable. - (Default value: <strong>1.0</strong>) - </p> - </td> - </tr> - <tr> - <td><strong>Seed</strong></td> - <td> - <p> - Defines the seed to initialize the random number generator. - The seed directly controls which data points are chosen for the SDCA method. - (Default value: <strong>0</strong>) - </p> - </td> - </tr> - </tbody> - </table> - -<h2 id="examples">Examples</h2> - -<div class="highlight"><pre><code class="language-scala" data-lang="scala"><span class="c1">// Read the training data set, from a LibSVM formatted file</span> -<span class="k">val</span> <span class="n">trainingDS</span><span class="k">:</span> <span class="kt">DataSet</span><span class="o">[</span><span class="kt">LabeledVector</span><span class="o">]</span> <span class="k">=</span> <span class="n">env</span><span class="o">.</span><span class="n">readLibSVM</span><span class="o">(</span><span class="n">pathToTrainingFile</span><span class="o">)</span> - -<span class="c1">// Create the SVM learner</span> -<span class="k">val</span> <span class="n">svm</span> <span class="k">=</span> <span class="nc">SVM</span><span class="o">()</span> -<span class="o">.</span><span class="n">setBlocks</span><span class="o">(</span><span class="mi">10</span><span class="o">)</span> -<span class="o">.</span><span class="n">setIterations</span><span class="o">(</span><span class="mi">10</span><span class="o">)</span> -<span 
class="o">.</span><span class="n">setLocalIterations</span><span class="o">(</span><span class="mi">10</span><span class="o">)</span> -<span class="o">.</span><span class="n">setRegularization</span><span class="o">(</span><span class="mf">0.5</span><span class="o">)</span> -<span class="o">.</span><span class="n">setStepsize</span><span class="o">(</span><span class="mf">0.5</span><span class="o">)</span> - -<span class="c1">// Learn the SVM model</span> -<span class="n">svm</span><span class="o">.</span><span class="n">fit</span><span class="o">(</span><span class="n">trainingDS</span><span class="o">)</span> - -<span class="c1">// Read the testing data set</span> -<span class="k">val</span> <span class="n">testingDS</span><span class="k">:</span> <span class="kt">DataSet</span><span class="o">[</span><span class="kt">Vector</span><span class="o">]</span> <span class="k">=</span> <span class="n">env</span><span class="o">.</span><span class="n">readVectorFile</span><span class="o">(</span><span class="n">pathToTestingFile</span><span class="o">)</span> - -<span class="c1">// Calculate the predictions for the testing data set</span> -<span class="k">val</span> <span class="n">predictionDS</span><span class="k">:</span> <span class="kt">DataSet</span><span class="o">[</span><span class="kt">LabeledVector</span><span class="o">]</span> <span class="k">=</span> <span class="n">svm</span><span class="o">.</span><span class="n">predict</span><span class="o">(</span><span class="n">testingDS</span><span class="o">)</span></code></pre></div> - - - </div> - - <div class="col-sm-10 col-sm-offset-1"> - <!-- Disqus thread and some vertical offset --> - <div style="margin-top: 75px; margin-bottom: 50px" id="disqus_thread"></div> - </div> -</div> - - </div><!-- /.container --> - - <!-- jQuery (necessary for Bootstrap's JavaScript plugins) --> - <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.2/jquery.min.js"></script> - <!-- Include all compiled plugins (below), 
or include individual files as needed --> - <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/js/bootstrap.min.js"></script> - <script src="http://flink.apache.org/docs/0.9/page/js/codetabs.js"></script> - - <!-- Google Analytics --> - <script> - (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ - (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), - m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) - })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); - - ga('create', 'UA-52545728-1', 'auto'); - ga('send', 'pageview'); - </script> - - <!-- Disqus --> - <script type="text/javascript"> - var disqus_shortname = 'stratosphere-eu'; - (function() { - var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true; - dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js'; - (document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq); - })(); -</script> - </body> -</html> http://git-wip-us.apache.org/repos/asf/flink-web/blob/f0ac0cdb/content/docs/0.9/libs/ml/vision_roadmap.html ---------------------------------------------------------------------- diff --git a/content/docs/0.9/libs/ml/vision_roadmap.html b/content/docs/0.9/libs/ml/vision_roadmap.html deleted file mode 100644 index 5793d68..0000000 --- a/content/docs/0.9/libs/ml/vision_roadmap.html +++ /dev/null @@ -1,313 +0,0 @@ -<!-- -Licensed to the Apache Software Foundation (ASF) under one -or more contributor license agreements. See the NOTICE file -distributed with this work for additional information -regarding copyright ownership. The ASF licenses this file -to you under the Apache License, Version 2.0 (the -"License"); you may not use this file except in compliance -with the License. 
You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, -software distributed under the License is distributed on an -"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -KIND, either express or implied. See the License for the -specific language governing permissions and limitations -under the License. ---> -<!DOCTYPE html> - -<html lang="en"> - <head> - <meta charset="utf-8"> - <meta http-equiv="X-UA-Compatible" content="IE=edge"> - <meta name="viewport" content="width=device-width, initial-scale=1"> - <!-- The above 3 meta tags *must* come first in the head; any other head content must come *after* these tags --> - - <title>Apache Flink 0.9.0 Documentation: FlinkML - Vision and Roadmap</title> - - <link rel="shortcut icon" href="http://flink.apache.org/docs/0.9/page/favicon.ico" type="image/x-icon"> - <link rel="icon" href="http://flink.apache.org/docs/0.9/page/favicon.ico" type="image/x-icon"> - - <!-- Bootstrap --> - <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/css/bootstrap.min.css"> - <link rel="stylesheet" href="http://flink.apache.org/docs/0.9/page/css/flink.css"> - <link rel="stylesheet" href="http://flink.apache.org/docs/0.9/page/css/syntax.css"> - <link rel="stylesheet" href="http://flink.apache.org/docs/0.9/page/css/codetabs.css"> - - <!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media queries --> - <!-- WARNING: Respond.js doesn't work if you view the page via file:// --> - <!--[if lt IE 9]> - <script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script> - <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script> - <![endif]--> - </head> - <body> - - - - - - - <!-- Top navbar. --> - <nav class="navbar navbar-default navbar-fixed-top"> - <div class="container"> - <!-- The logo. 
--> - <div class="navbar-header"> - <button type="button" class="navbar-toggle collapsed" data-toggle="collapse" data-target="#bs-example-navbar-collapse-1"> - <span class="icon-bar"></span> - <span class="icon-bar"></span> - <span class="icon-bar"></span> - </button> - <div class="navbar-logo"> - <a href="http://flink.apache.org"><img alt="Apache Flink" src="http://flink.apache.org/docs/0.9/page/img/navbar-brand-logo.jpg"></a> - </div> - </div><!-- /.navbar-header --> - - <!-- The navigation links. --> - <div class="collapse navbar-collapse" id="bs-example-navbar-collapse-1"> - <ul class="nav navbar-nav"> - <li><a href="http://flink.apache.org/docs/0.9/index.html">Overview<span class="hidden-sm hidden-xs"> 0.9.0</span></a></li> - - <!-- Setup --> - <li class="dropdown"> - <a href="http://flink.apache.org/docs/0.9/setup" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Setup <span class="caret"></span></a> - <ul class="dropdown-menu" role="menu"> - <li><a href="http://flink.apache.org/docs/0.9/setup/building.html">Get Flink 0.9-SNAPSHOT</a></li> - - <li class="divider"></li> - <li role="presentation" class="dropdown-header"><strong>Deployment</strong></li> - <li><a href="http://flink.apache.org/docs/0.9/setup/local_setup.html" class="active">Local</a></li> - <li><a href="http://flink.apache.org/docs/0.9/setup/cluster_setup.html">Cluster (Standalone)</a></li> - <li><a href="http://flink.apache.org/docs/0.9/setup/yarn_setup.html">YARN</a></li> - <li><a href="http://flink.apache.org/docs/0.9/setup/gce_setup.html">GCloud</a></li> - <li><a href="http://flink.apache.org/docs/0.9/setup/flink_on_tez.html">Flink on Tez <span class="badge">Beta</span></a></li> - - <li class="divider"></li> - <li><a href="http://flink.apache.org/docs/0.9/setup/config.html">Configuration</a></li> - </ul> - </li> - - <!-- Programming Guides --> - <li class="dropdown"> - <a href="http://flink.apache.org/docs/0.9/apis" class="dropdown-toggle" 
data-toggle="dropdown" role="button" aria-expanded="false">Programming Guides <span class="caret"></span></a> - <ul class="dropdown-menu" role="menu"> - <li><a href="http://flink.apache.org/docs/0.9/apis/programming_guide.html"><strong>Batch: DataSet API</strong></a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/streaming_guide.html"><strong>Streaming: DataStream API</strong> <span class="badge">Beta</span></a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/python.html">Python API <span class="badge">Beta</span></a></li> - - <li class="divider"></li> - <li><a href="scala_shell.html">Interactive Scala Shell</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/dataset_transformations.html">Dataset Transformations</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/best_practices.html">Best Practices</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/example_connectors.html">Connectors</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/examples.html">Examples</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/local_execution.html">Local Execution</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/cluster_execution.html">Cluster Execution</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/cli.html">Command Line Interface</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/web_client.html">Web Client</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/iterations.html">Iterations</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/java8.html">Java 8</a></li> - <li><a href="http://flink.apache.org/docs/0.9/apis/hadoop_compatibility.html">Hadoop Compatibility <span class="badge">Beta</span></a></li> - </ul> - </li> - - <!-- Libraries --> - <li class="dropdown"> - <a href="http://flink.apache.org/docs/0.9/libs" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Libraries <span class="caret"></span></a> - <ul 
class="dropdown-menu" role="menu"> - <li><a href="http://flink.apache.org/docs/0.9/libs/spargel_guide.html">Graphs: Spargel</a></li> - <li><a href="http://flink.apache.org/docs/0.9/libs/gelly_guide.html">Graphs: Gelly <span class="badge">Beta</span></a></li> - <li><a href="http://flink.apache.org/docs/0.9/libs/ml/">Machine Learning <span class="badge">Beta</span></a></li> - <li><a href="http://flink.apache.org/docs/0.9/libs/table.html">Relational: Table <span class="badge">Beta</span></a></li> - </ul> - </li> - - <!-- Internals --> - <li class="dropdown"> - <a href="http://flink.apache.org/docs/0.9/internals" class="dropdown-toggle" data-toggle="dropdown" role="button" aria-expanded="false">Internals <span class="caret"></span></a> - <ul class="dropdown-menu" role="menu"> - <li role="presentation" class="dropdown-header"><strong>Contribute</strong></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/how_to_contribute.html">How to Contribute</a></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/coding_guidelines.html">Coding Guidelines</a></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/ide_setup.html">IDE Setup</a></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/logging.html">Logging</a></li> - <li class="divider"></li> - <li role="presentation" class="dropdown-header"><strong>Internals</strong></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/general_arch.html">Architecture & Process Model</a></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/types_serialization.html">Type Extraction & Serialization</a></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/job_scheduling.html">Jobs & Scheduling</a></li> - <li><a href="http://flink.apache.org/docs/0.9/internals/add_operator.html">How-To: Add an Operator</a></li> - </ul> - </li> - </ul> - <form class="navbar-form navbar-right hidden-sm hidden-md" role="search" action="http://flink.apache.org/docs/0.9/search-results.html"> - 
<div class="form-group"> - <input type="text" class="form-control" name="q" placeholder="Search all pages"> - </div> - <button type="submit" class="btn btn-default">Search</button> - </form> - </div><!-- /.navbar-collapse --> - </div><!-- /.container --> - </nav> - - - - - <!-- Main content. --> - <div class="container"> - - -<div class="row"> - <div class="col-sm-10 col-sm-offset-1"> - <h1><a href="../ml">FlinkML</a> - Vision and Roadmap</h1> - - - -<ul id="markdown-toc"> - <li><a href="#vision" id="markdown-toc-vision">Vision</a></li> - <li><a href="#roadmap" id="markdown-toc-roadmap">Roadmap</a></li> -</ul> - -<h2 id="vision">Vision</h2> - -<p>The Machine Learning (ML) library for Flink is a new effort to bring scalable ML tools to the Flink -community. Our goal is to design and implement a system that is scalable and can deal with -problems of various sizes, whether your data size is measured in megabytes or terabytes and beyond. -We call this library FlinkML.</p> - -<p>An important concern for developers of ML systems is the amount of glue code that developers are -forced to write [1] in the process of implementing an end-to-end ML system. Our goal with FlinkML -is to help developers keep glue code to a minimum. The Flink ecosystem provides a great setting to -tackle this problem, with its scalable ETL capabilities that can be easily combined inside the same -program with FlinkML, allowing the development of robust pipelines without the need to use yet -another technology for data ingestion and data munging.</p> - -<p>Another goal for FlinkML is to make the library easy to use. To that end we will be providing -detailed documentation along with examples for every part of the system. 
Our aim is that developers -will be able to get started with writing their ML pipelines quickly, using familiar programming -concepts and terminology.</p> - -<p>Contrary to other data-processing systems, Flink exploits in-memory data streaming, and natively -executes iterative processing algorithms which are common in ML. We plan to exploit the streaming -nature of Flink, and provide functionality designed specifically for data streams.</p> - -<p>FlinkML will allow data scientists to test their models locally and using subsets of data, and then -use the same code to run their algorithms at a much larger scale in a cluster setting.</p> - -<p>We are inspired by other open source efforts to provide ML systems, in particular -<a href="http://scikit-learn.org/">scikit-learn</a> for cleanly specifying ML pipelines, and Spark's -<a href="https://spark.apache.org/mllib/">MLLib</a> for providing ML algorithms that scale with problem and -cluster sizes.</p> - -<h2 id="roadmap">Roadmap</h2> - -<p>The roadmap below can provide an indication of the algorithms we aim to implement in the coming -months. If you are interested in helping out, please check our <a href="contribution_guide.html">contribution guide</a>. 
-Items in <strong>bold</strong> have already been implemented:</p> - -<ul> - <li>Pipelines of transformers and learners</li> - <li>Data pre-processing - <ul> - <li><strong>Feature scaling</strong></li> - <li><strong>Polynomial feature base mapper</strong></li> - <li>Feature hashing</li> - <li>Feature extraction for text</li> - <li>Dimensionality reduction</li> - </ul> - </li> - <li>Model selection and performance evaluation - <ul> - <li>Cross-validation for model selection and evaluation</li> - </ul> - </li> - <li>Supervised learning - <ul> - <li>Optimization framework - <ul> - <li><strong>Stochastic Gradient Descent</strong></li> - <li>L-BFGS</li> - </ul> - </li> - <li>Generalized Linear Models - <ul> - <li><strong>Multiple linear regression</strong></li> - <li>LASSO, Ridge regression</li> - <li>Multi-class Logistic regression</li> - </ul> - </li> - <li>Random forests</li> - <li><strong>Support Vector Machines</strong></li> - </ul> - </li> - <li>Unsupervised learning - <ul> - <li>Clustering - <ul> - <li>K-means clustering</li> - </ul> - </li> - <li>PCA</li> - </ul> - </li> - <li>Recommendation - <ul> - <li><strong>ALS</strong></li> - </ul> - </li> - <li>Text analytics - <ul> - <li>LDA</li> - </ul> - </li> - <li>Statistical estimation tools</li> - <li>Distributed linear algebra</li> - <li>Streaming ML</li> -</ul> - -<p><strong>References:</strong></p> - -<p>[1] D. Sculley, G. Holt, D. Golovin, E. Davydov, T. Phillips, D. Ebner, V. Chaudhary, -and M. Young. 
<em>Machine learning: The high interest credit card of technical debt.</em> In SE4ML: -Software Engineering for Machine Learning (NIPS 2014 Workshop), 2014.</p> - - </div> - - <div class="col-sm-10 col-sm-offset-1"> - <!-- Disqus thread and some vertical offset --> - <div style="margin-top: 75px; margin-bottom: 50px" id="disqus_thread"></div> - </div> -</div> - - </div><!-- /.container --> - - <!-- jQuery (necessary for Bootstrap's JavaScript plugins) --> - <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.2/jquery.min.js"></script> - <!-- Include all compiled plugins (below), or include individual files as needed --> - <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/js/bootstrap.min.js"></script> - <script src="http://flink.apache.org/docs/0.9/page/js/codetabs.js"></script> - - <!-- Google Analytics --> - <script> - (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ - (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), - m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) - })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); - - ga('create', 'UA-52545728-1', 'auto'); - ga('send', 'pageview'); - </script> - - <!-- Disqus --> - <script type="text/javascript"> - var disqus_shortname = 'stratosphere-eu'; - (function() { - var dsq = document.createElement('script'); dsq.type = 'text/javascript'; dsq.async = true; - dsq.src = '//' + disqus_shortname + '.disqus.com/embed.js'; - (document.getElementsByTagName('head')[0] || document.getElementsByTagName('body')[0]).appendChild(dsq); - })(); -</script> - </body> -</html>