http://git-wip-us.apache.org/repos/asf/flink-web/blob/f0ac0cdb/content/docs/0.9/apis/programming_guide.html
----------------------------------------------------------------------
diff --git a/content/docs/0.9/apis/programming_guide.html 
b/content/docs/0.9/apis/programming_guide.html
deleted file mode 100644
index 91c5205..0000000
--- a/content/docs/0.9/apis/programming_guide.html
+++ /dev/null
@@ -1,3433 +0,0 @@
-<!--
-Licensed to the Apache Software Foundation (ASF) under one
-or more contributor license agreements.  See the NOTICE file
-distributed with this work for additional information
-regarding copyright ownership.  The ASF licenses this file
-to you under the Apache License, Version 2.0 (the
-"License"); you may not use this file except in compliance
-with the License.  You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing,
-software distributed under the License is distributed on an
-"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-KIND, either express or implied.  See the License for the
-specific language governing permissions and limitations
-under the License.
--->
-<!DOCTYPE html>
-
-<html lang="en">
-  <head>
-    <meta charset="utf-8">
-    <meta http-equiv="X-UA-Compatible" content="IE=edge">
-    <meta name="viewport" content="width=device-width, initial-scale=1">
-    <!-- The above 3 meta tags *must* come first in the head; any other head 
content must come *after* these tags -->
-    
-    <title>Apache Flink 0.9.0 Documentation: Flink Programming Guide</title>
-    
-    <link rel="shortcut icon" 
href="http://flink.apache.org/docs/0.9/page/favicon.ico" type="image/x-icon">
-    <link rel="icon" href="http://flink.apache.org/docs/0.9/page/favicon.ico" 
type="image/x-icon">
-
-    <!-- Bootstrap -->
-    <link rel="stylesheet" 
href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/css/bootstrap.min.css">
-    <link rel="stylesheet" 
href="http://flink.apache.org/docs/0.9/page/css/flink.css">
-    <link rel="stylesheet" 
href="http://flink.apache.org/docs/0.9/page/css/syntax.css">
-    <link rel="stylesheet" 
href="http://flink.apache.org/docs/0.9/page/css/codetabs.css">
-    
-    <!-- HTML5 shim and Respond.js for IE8 support of HTML5 elements and media 
queries -->
-    <!-- WARNING: Respond.js doesn't work if you view the page via file:// -->
-    <!--[if lt IE 9]>
-      <script 
src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
-      <script 
src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
-    <![endif]-->
-  </head>
-  <body>
-    
-    
-
-
-
-
-    <!-- Top navbar. -->
-    <nav class="navbar navbar-default navbar-fixed-top">
-      <div class="container">
-        <!-- The logo. -->
-        <div class="navbar-header">
-          <button type="button" class="navbar-toggle collapsed" 
data-toggle="collapse" data-target="#bs-example-navbar-collapse-1">
-            <span class="icon-bar"></span>
-            <span class="icon-bar"></span>
-            <span class="icon-bar"></span>
-          </button>
-          <div class="navbar-logo">
-            <a href="http://flink.apache.org";><img alt="Apache Flink" 
src="http://flink.apache.org/docs/0.9/page/img/navbar-brand-logo.jpg";></a>
-          </div>
-        </div><!-- /.navbar-header -->
-
-        <!-- The navigation links. -->
-        <div class="collapse navbar-collapse" 
id="bs-example-navbar-collapse-1">
-          <ul class="nav navbar-nav">
-            <li><a 
href="http://flink.apache.org/docs/0.9/index.html";>Overview<span 
class="hidden-sm hidden-xs"> 0.9.0</span></a></li>
-
-            <!-- Setup -->
-            <li class="dropdown">
-              <a href="http://flink.apache.org/docs/0.9/setup"; 
class="dropdown-toggle" data-toggle="dropdown" role="button" 
aria-expanded="false">Setup <span class="caret"></span></a>
-              <ul class="dropdown-menu" role="menu">
-                <li><a 
href="http://flink.apache.org/docs/0.9/setup/building.html";>Get Flink 
0.9-SNAPSHOT</a></li>
-
-                <li class="divider"></li>
-                <li role="presentation" 
class="dropdown-header"><strong>Deployment</strong></li>
-                <li><a 
href="http://flink.apache.org/docs/0.9/setup/local_setup.html"; 
class="active">Local</a></li>
-                <li><a 
href="http://flink.apache.org/docs/0.9/setup/cluster_setup.html";>Cluster 
(Standalone)</a></li>
-                <li><a 
href="http://flink.apache.org/docs/0.9/setup/yarn_setup.html";>YARN</a></li>
-                <li><a 
href="http://flink.apache.org/docs/0.9/setup/gce_setup.html";>GCloud</a></li>
-                <li><a 
href="http://flink.apache.org/docs/0.9/setup/flink_on_tez.html";>Flink on Tez 
<span class="badge">Beta</span></a></li>
-
-                <li class="divider"></li>
-                <li><a 
href="http://flink.apache.org/docs/0.9/setup/config.html";>Configuration</a></li>
-              </ul>
-            </li>
-
-            <!-- Programming Guides -->
-            <li class="dropdown">
-              <a href="http://flink.apache.org/docs/0.9/apis"; 
class="dropdown-toggle" data-toggle="dropdown" role="button" 
aria-expanded="false">Programming Guides <span class="caret"></span></a>
-              <ul class="dropdown-menu" role="menu">
-                <li><a 
href="http://flink.apache.org/docs/0.9/apis/programming_guide.html";><strong>Batch:
 DataSet API</strong></a></li>
-                <li><a 
href="http://flink.apache.org/docs/0.9/apis/streaming_guide.html";><strong>Streaming:
 DataStream API</strong> <span class="badge">Beta</span></a></li>
-                <li><a 
href="http://flink.apache.org/docs/0.9/apis/python.html";>Python API <span 
class="badge">Beta</span></a></li>
-
-                <li class="divider"></li>
-                <li><a href="scala_shell.html">Interactive Scala Shell</a></li>
-                <li><a 
href="http://flink.apache.org/docs/0.9/apis/dataset_transformations.html";>Dataset
 Transformations</a></li>
-                <li><a 
href="http://flink.apache.org/docs/0.9/apis/best_practices.html";>Best 
Practices</a></li>
-                <li><a 
href="http://flink.apache.org/docs/0.9/apis/example_connectors.html";>Connectors</a></li>
-                <li><a 
href="http://flink.apache.org/docs/0.9/apis/examples.html";>Examples</a></li>
-                <li><a 
href="http://flink.apache.org/docs/0.9/apis/local_execution.html";>Local 
Execution</a></li>
-                <li><a 
href="http://flink.apache.org/docs/0.9/apis/cluster_execution.html";>Cluster 
Execution</a></li>
-                <li><a 
href="http://flink.apache.org/docs/0.9/apis/cli.html";>Command Line 
Interface</a></li>
-                <li><a 
href="http://flink.apache.org/docs/0.9/apis/web_client.html";>Web Client</a></li>
-                <li><a 
href="http://flink.apache.org/docs/0.9/apis/iterations.html";>Iterations</a></li>
-                <li><a 
href="http://flink.apache.org/docs/0.9/apis/java8.html";>Java 8</a></li>
-                <li><a 
href="http://flink.apache.org/docs/0.9/apis/hadoop_compatibility.html";>Hadoop 
Compatibility <span class="badge">Beta</span></a></li>
-              </ul>
-            </li>
-
-            <!-- Libraries -->
-            <li class="dropdown">
-              <a href="http://flink.apache.org/docs/0.9/libs"; 
class="dropdown-toggle" data-toggle="dropdown" role="button" 
aria-expanded="false">Libraries <span class="caret"></span></a>
-                <ul class="dropdown-menu" role="menu">
-                  <li><a 
href="http://flink.apache.org/docs/0.9/libs/spargel_guide.html";>Graphs: 
Spargel</a></li>
-                  <li><a 
href="http://flink.apache.org/docs/0.9/libs/gelly_guide.html";>Graphs: Gelly 
<span class="badge">Beta</span></a></li>
-                  <li><a 
href="http://flink.apache.org/docs/0.9/libs/ml/";>Machine Learning <span 
class="badge">Beta</span></a></li>
-                  <li><a 
href="http://flink.apache.org/docs/0.9/libs/table.html";>Relational: Table <span 
class="badge">Beta</span></a></li>
-              </ul>
-            </li>
-
-            <!-- Internals -->
-            <li class="dropdown">
-              <a href="http://flink.apache.org/docs/0.9/internals"; 
class="dropdown-toggle" data-toggle="dropdown" role="button" 
aria-expanded="false">Internals <span class="caret"></span></a>
-              <ul class="dropdown-menu" role="menu">
-                <li role="presentation" 
class="dropdown-header"><strong>Contribute</strong></li>
-                <li><a 
href="http://flink.apache.org/docs/0.9/internals/how_to_contribute.html";>How to 
Contribute</a></li>
-                <li><a 
href="http://flink.apache.org/docs/0.9/internals/coding_guidelines.html";>Coding 
Guidelines</a></li>
-                <li><a 
href="http://flink.apache.org/docs/0.9/internals/ide_setup.html";>IDE 
Setup</a></li>
-                <li><a 
href="http://flink.apache.org/docs/0.9/internals/logging.html";>Logging</a></li>
-                <li class="divider"></li>
-                <li role="presentation" 
class="dropdown-header"><strong>Internals</strong></li>
-                <li><a 
href="http://flink.apache.org/docs/0.9/internals/general_arch.html";>Architecture
 &amp; Process Model</a></li>
-                <li><a 
href="http://flink.apache.org/docs/0.9/internals/types_serialization.html";>Type 
Extraction &amp; Serialization</a></li>
-                <li><a 
href="http://flink.apache.org/docs/0.9/internals/job_scheduling.html";>Jobs 
&amp; Scheduling</a></li>
-                <li><a 
href="http://flink.apache.org/docs/0.9/internals/add_operator.html";>How-To: Add 
an Operator</a></li>
-              </ul>
-            </li>
-          </ul>
-          <form class="navbar-form navbar-right hidden-sm hidden-md" 
role="search" action="http://flink.apache.org/docs/0.9/search-results.html";>
-            <div class="form-group">
-              <input type="text" class="form-control" name="q" 
placeholder="Search all pages">
-            </div>
-            <button type="submit" class="btn btn-default">Search</button>
-          </form>
-        </div><!-- /.navbar-collapse -->
-      </div><!-- /.container -->
-    </nav>
-
-
-    
-
-    <!-- Main content. -->
-    <div class="container">
-      
-      
-<div class="row">
-  <div class="col-sm-10 col-sm-offset-1">
-    <h1>Flink Programming Guide</h1>
-
-
-
-<p><a href="#top"></a></p>
-
-<p>Analysis programs in Flink are regular programs that implement 
transformations on data sets
-(e.g., filtering, mapping, joining, grouping). The data sets are initially 
created from certain
-sources (e.g., by reading files, or from local collections). Results are 
returned via sinks, which may for
-example write the data to (distributed) files, or to standard output (for 
example the command line
-terminal). Flink programs run in a variety of contexts, standalone, or 
embedded in other programs.
-The execution can happen in a local JVM, or on clusters of many machines.</p>
-
-<p>In order to create your own Flink program, we encourage you to start with 
the
-<a href="#program-skeleton">program skeleton</a> and gradually add your own
-<a href="#transformations">transformations</a>. The remaining sections act as 
references for additional
-operations and advanced features.</p>
-
-<ul id="markdown-toc">
-  <li><a href="#example-program" id="markdown-toc-example-program">Example 
Program</a></li>
-  <li><a href="#linking-with-flink" 
id="markdown-toc-linking-with-flink">Linking with Flink</a></li>
-  <li><a href="#program-skeleton" id="markdown-toc-program-skeleton">Program 
Skeleton</a></li>
-  <li><a href="#dataset-abstraction" 
id="markdown-toc-dataset-abstraction">DataSet abstraction</a></li>
-  <li><a href="#lazy-evaluation" id="markdown-toc-lazy-evaluation">Lazy 
Evaluation</a></li>
-  <li><a href="#transformations" 
id="markdown-toc-transformations">Transformations</a></li>
-  <li><a href="#specifying-keys" id="markdown-toc-specifying-keys">Specifying 
Keys</a></li>
-  <li><a href="#passing-functions-to-flink" 
id="markdown-toc-passing-functions-to-flink">Passing Functions to Flink</a></li>
-  <li><a href="#data-types" id="markdown-toc-data-types">Data Types</a></li>
-  <li><a href="#data-sources" id="markdown-toc-data-sources">Data Sources</a>  
  <ul>
-      <li><a href="#read-compressed-files" 
id="markdown-toc-read-compressed-files">Read Compressed Files</a></li>
-    </ul>
-  </li>
-  <li><a href="#execution-configuration" 
id="markdown-toc-execution-configuration">Execution Configuration</a></li>
-  <li><a href="#data-sinks" id="markdown-toc-data-sinks">Data Sinks</a></li>
-  <li><a href="#debugging" id="markdown-toc-debugging">Debugging</a>    <ul>
-      <li><a href="#local-execution-environment" 
id="markdown-toc-local-execution-environment">Local Execution 
Environment</a></li>
-      <li><a href="#collection-data-sources-and-sinks" 
id="markdown-toc-collection-data-sources-and-sinks">Collection Data Sources and 
Sinks</a></li>
-    </ul>
-  </li>
-  <li><a href="#iteration-operators" 
id="markdown-toc-iteration-operators">Iteration Operators</a></li>
-  <li><a href="#semantic-annotations" 
id="markdown-toc-semantic-annotations">Semantic Annotations</a></li>
-  <li><a href="#broadcast-variables" 
id="markdown-toc-broadcast-variables">Broadcast Variables</a></li>
-  <li><a href="#passing-parameters-to-functions" 
id="markdown-toc-passing-parameters-to-functions">Passing Parameters to 
Functions</a></li>
-  <li><a href="#program-packaging--distributed-execution" 
id="markdown-toc-program-packaging--distributed-execution">Program Packaging 
&amp; Distributed Execution</a></li>
-  <li><a href="#accumulators--counters" 
id="markdown-toc-accumulators--counters">Accumulators &amp; Counters</a></li>
-  <li><a href="#parallel-execution" 
id="markdown-toc-parallel-execution">Parallel Execution</a>    <ul>
-      <li><a href="#operator-level" id="markdown-toc-operator-level">Operator 
Level</a></li>
-      <li><a href="#execution-environment-level" 
id="markdown-toc-execution-environment-level">Execution Environment 
Level</a></li>
-      <li><a href="#client-level" id="markdown-toc-client-level">Client 
Level</a></li>
-      <li><a href="#system-level" id="markdown-toc-system-level">System 
Level</a></li>
-    </ul>
-  </li>
-  <li><a href="#execution-plans" id="markdown-toc-execution-plans">Execution 
Plans</a></li>
-</ul>
-
-<h2 id="example-program">Example Program</h2>
-
-<p>The following program is a complete, working example of WordCount. You can 
copy &amp; paste the code
-to run it locally. You only have to include the correct Flink library in 
your project
-(see Section <a href="#linking-with-flink">Linking with Flink</a>) and specify 
the imports. Then you are ready
-to go!</p>
-
-<div class="codetabs">
-  <div data-lang="java">
-
-    <div class="highlight"><pre><code class="language-java" 
data-lang="java"><span class="kd">public</span> <span class="kd">class</span> 
<span class="nc">WordCountExample</span> <span class="o">{</span>
-    <span class="kd">public</span> <span class="kd">static</span> <span 
class="kt">void</span> <span class="nf">main</span><span 
class="o">(</span><span class="n">String</span><span class="o">[]</span> <span 
class="n">args</span><span class="o">)</span> <span class="kd">throws</span> 
<span class="n">Exception</span> <span class="o">{</span>
-        <span class="kd">final</span> <span 
class="n">ExecutionEnvironment</span> <span class="n">env</span> <span 
class="o">=</span> <span class="n">ExecutionEnvironment</span><span 
class="o">.</span><span class="na">getExecutionEnvironment</span><span 
class="o">();</span>
-
-        <span class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">&gt;</span> <span class="n">text</span> 
<span class="o">=</span> <span class="n">env</span><span 
class="o">.</span><span class="na">fromElements</span><span class="o">(</span>
-            <span class="s">&quot;Who&#39;s there?&quot;</span><span 
class="o">,</span>
-            <span class="s">&quot;I think I hear them. Stand, ho! Who&#39;s 
there?&quot;</span><span class="o">);</span>
-
-        <span class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">Tuple2</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">,</span> <span 
class="n">Integer</span><span class="o">&gt;&gt;</span> <span 
class="n">wordCounts</span> <span class="o">=</span> <span class="n">text</span>
-            <span class="o">.</span><span class="na">flatMap</span><span 
class="o">(</span><span class="k">new</span> <span 
class="nf">LineSplitter</span><span class="o">())</span>
-            <span class="o">.</span><span class="na">groupBy</span><span 
class="o">(</span><span class="mi">0</span><span class="o">)</span>
-            <span class="o">.</span><span class="na">sum</span><span 
class="o">(</span><span class="mi">1</span><span class="o">);</span>
-
-        <span class="n">wordCounts</span><span class="o">.</span><span 
class="na">print</span><span class="o">();</span>
-    <span class="o">}</span>
-
-    <span class="kd">public</span> <span class="kd">static</span> <span 
class="kd">class</span> <span class="nc">LineSplitter</span> <span 
class="kd">implements</span> <span class="n">FlatMapFunction</span><span 
class="o">&lt;</span><span class="n">String</span><span class="o">,</span> 
<span class="n">Tuple2</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">,</span> <span 
class="n">Integer</span><span class="o">&gt;&gt;</span> <span class="o">{</span>
-        <span class="nd">@Override</span>
-        <span class="kd">public</span> <span class="kt">void</span> <span 
class="nf">flatMap</span><span class="o">(</span><span class="n">String</span> 
<span class="n">line</span><span class="o">,</span> <span 
class="n">Collector</span><span class="o">&lt;</span><span 
class="n">Tuple2</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">,</span> <span 
class="n">Integer</span><span class="o">&gt;&gt;</span> <span 
class="n">out</span><span class="o">)</span> <span class="o">{</span>
-            <span class="k">for</span> <span class="o">(</span><span 
class="n">String</span> <span class="n">word</span> <span class="o">:</span> 
<span class="n">line</span><span class="o">.</span><span 
class="na">split</span><span class="o">(</span><span class="s">&quot; 
&quot;</span><span class="o">))</span> <span class="o">{</span>
-                <span class="n">out</span><span class="o">.</span><span 
class="na">collect</span><span class="o">(</span><span class="k">new</span> 
<span class="n">Tuple2</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">,</span> <span 
class="n">Integer</span><span class="o">&gt;(</span><span 
class="n">word</span><span class="o">,</span> <span class="mi">1</span><span 
class="o">));</span>
-            <span class="o">}</span>
-        <span class="o">}</span>
-    <span class="o">}</span>
-<span class="o">}</span></code></pre></div>
-
-  </div>
-
-  <div data-lang="scala">
-
-    <div class="highlight"><pre><code class="language-scala" 
data-lang="scala"><span class="k">import</span> <span 
class="nn">org.apache.flink.api.scala._</span>
-
-<span class="k">object</span> <span class="nc">WordCount</span> <span 
class="o">{</span>
-  <span class="k">def</span> <span class="n">main</span><span 
class="o">(</span><span class="n">args</span><span class="k">:</span> <span 
class="kt">Array</span><span class="o">[</span><span 
class="kt">String</span><span class="o">])</span> <span class="o">{</span>
-
-    <span class="k">val</span> <span class="n">env</span> <span 
class="k">=</span> <span class="nc">ExecutionEnvironment</span><span 
class="o">.</span><span class="n">getExecutionEnvironment</span>
-    <span class="k">val</span> <span class="n">text</span> <span 
class="k">=</span> <span class="n">env</span><span class="o">.</span><span 
class="n">fromElements</span><span class="o">(</span>
-      <span class="s">&quot;Who&#39;s there?&quot;</span><span 
class="o">,</span>
-      <span class="s">&quot;I think I hear them. Stand, ho! Who&#39;s 
there?&quot;</span><span class="o">)</span>
-
-    <span class="k">val</span> <span class="n">counts</span> <span 
class="k">=</span> <span class="n">text</span><span class="o">.</span><span 
class="n">flatMap</span> <span class="o">{</span> <span class="k">_</span><span 
class="o">.</span><span class="n">toLowerCase</span><span 
class="o">.</span><span class="n">split</span><span class="o">(</span><span 
class="s">&quot;\\W+&quot;</span><span class="o">)</span> <span 
class="n">filter</span> <span class="o">{</span> <span class="k">_</span><span 
class="o">.</span><span class="n">nonEmpty</span> <span class="o">}</span> 
<span class="o">}</span>
-      <span class="o">.</span><span class="n">map</span> <span 
class="o">{</span> <span class="o">(</span><span class="k">_</span><span 
class="o">,</span> <span class="mi">1</span><span class="o">)</span> <span 
class="o">}</span>
-      <span class="o">.</span><span class="n">groupBy</span><span 
class="o">(</span><span class="mi">0</span><span class="o">)</span>
-      <span class="o">.</span><span class="n">sum</span><span 
class="o">(</span><span class="mi">1</span><span class="o">)</span>
-
-    <span class="n">counts</span><span class="o">.</span><span 
class="n">print</span><span class="o">()</span>
-  <span class="o">}</span>
-<span class="o">}</span></code></pre></div>
-
-  </div>
-
-</div>
-
-<p><a href="#top">Back to top</a></p>
-
-<h2 id="linking-with-flink">Linking with Flink</h2>
-
-<p>To write programs with Flink, you need to include the Flink library 
corresponding to
-your programming language in your project.</p>
-
-<p>The simplest way to do this is to use one of the quickstart scripts: either 
for
-<a 
href="http://flink.apache.org/docs/0.9/quickstart/java_api_quickstart.html";>Java</a>
 or for <a 
href="http://flink.apache.org/docs/0.9/quickstart/scala_api_quickstart.html";>Scala</a>.
 They
-create a blank project from a template (a Maven Archetype), which sets up 
everything for you. To
-manually create the project, you can use the archetype and create a project by 
calling:</p>
-
-<div class="codetabs">
-  <div data-lang="java">
-
-    <div class="highlight"><pre><code class="language-bash" 
data-lang="bash">mvn archetype:generate \
-    -DarchetypeGroupId<span class="o">=</span>org.apache.flink \
-    -DarchetypeArtifactId<span class="o">=</span>flink-quickstart-java \
-    -DarchetypeVersion<span class="o">=</span>0.9.0</code></pre></div>
-
-  </div>
-  <div data-lang="scala">
-
-    <div class="highlight"><pre><code class="language-bash" 
data-lang="bash">mvn archetype:generate \
-    -DarchetypeGroupId<span class="o">=</span>org.apache.flink \
-    -DarchetypeArtifactId<span class="o">=</span>flink-quickstart-scala \
-    -DarchetypeVersion<span class="o">=</span>0.9.0</code></pre></div>
-
-  </div>
-</div>
-
-<p>The archetypes work for stable releases and preview versions 
(<code>-SNAPSHOT</code>).</p>
-
-<p>If you want to add Flink to an existing Maven project, add the following 
entry to your
-<em>dependencies</em> section in the <em>pom.xml</em> file of your project:</p>
-
-<div class="codetabs">
-  <div data-lang="java">
-
-    <div class="highlight"><pre><code class="language-xml" 
data-lang="xml"><span class="nt">&lt;dependency&gt;</span>
-  <span class="nt">&lt;groupId&gt;</span>org.apache.flink<span 
class="nt">&lt;/groupId&gt;</span>
-  <span class="nt">&lt;artifactId&gt;</span>flink-java<span 
class="nt">&lt;/artifactId&gt;</span>
-  <span class="nt">&lt;version&gt;</span>0.9.0<span 
class="nt">&lt;/version&gt;</span>
-<span class="nt">&lt;/dependency&gt;</span>
-<span class="nt">&lt;dependency&gt;</span>
-  <span class="nt">&lt;groupId&gt;</span>org.apache.flink<span 
class="nt">&lt;/groupId&gt;</span>
-  <span class="nt">&lt;artifactId&gt;</span>flink-clients<span 
class="nt">&lt;/artifactId&gt;</span>
-  <span class="nt">&lt;version&gt;</span>0.9.0<span 
class="nt">&lt;/version&gt;</span>
-<span class="nt">&lt;/dependency&gt;</span></code></pre></div>
-
-  </div>
-  <div data-lang="scala">
-
-    <div class="highlight"><pre><code class="language-xml" 
data-lang="xml"><span class="nt">&lt;dependency&gt;</span>
-  <span class="nt">&lt;groupId&gt;</span>org.apache.flink<span 
class="nt">&lt;/groupId&gt;</span>
-  <span class="nt">&lt;artifactId&gt;</span>flink-scala<span 
class="nt">&lt;/artifactId&gt;</span>
-  <span class="nt">&lt;version&gt;</span>0.9.0<span 
class="nt">&lt;/version&gt;</span>
-<span class="nt">&lt;/dependency&gt;</span>
-<span class="nt">&lt;dependency&gt;</span>
-  <span class="nt">&lt;groupId&gt;</span>org.apache.flink<span 
class="nt">&lt;/groupId&gt;</span>
-  <span class="nt">&lt;artifactId&gt;</span>flink-clients<span 
class="nt">&lt;/artifactId&gt;</span>
-  <span class="nt">&lt;version&gt;</span>0.9.0<span 
class="nt">&lt;/version&gt;</span>
-<span class="nt">&lt;/dependency&gt;</span></code></pre></div>
-
-    <p><strong>Important:</strong> When working with the Scala API you must 
have one of these two imports:</p>
-
-    <div class="highlight"><pre><code class="language-scala" 
data-lang="scala"><span class="k">import</span> <span 
class="nn">org.apache.flink.api.scala._</span></code></pre></div>
-
-    <p>or</p>
-
-    <div class="highlight"><pre><code class="language-scala" 
data-lang="scala"><span class="k">import</span> <span 
class="nn">org.apache.flink.api.scala.createTypeInformation</span></code></pre></div>
-
-    <p>The reason is that Flink analyzes the types that are used in a program 
and generates serializers
-and comparators for them. By having either of those imports you enable an 
implicit conversion
-that creates the type information for Flink operations.</p>
-  </div>
-</div>
-
-<h4 id="hadoop-dependency-versions">Hadoop Dependency Versions</h4>
-
-<p>If you are using Flink together with Hadoop, the version of the dependency 
may vary depending on the
-version of Hadoop (or more specifically, HDFS) that you want to use Flink 
with. Please refer to the
-<a href="http://flink.apache.org/downloads.html";>downloads page</a> for a list 
of available versions, and instructions
-on how to link with custom versions of Hadoop.</p>
-
-<p>In order to link against the latest SNAPSHOT versions of the code, please 
follow
-<a 
href="http://flink.apache.org/how-to-contribute.html#snapshots-nightly-builds";>this
 guide</a>.</p>
-
-<p>The <em>flink-clients</em> dependency is only necessary to invoke the Flink 
program locally (for example to
-run it standalone for testing and debugging).  If you intend to only export 
the program as a JAR
-file and <a href="cluster_execution.html">run it on a cluster</a>, you can 
skip that dependency.</p>
-
-<p><a href="#top">Back to top</a></p>
-
-<h2 id="program-skeleton">Program Skeleton</h2>
-
-<div class="codetabs">
-  <div data-lang="java">
-
-    <p>As we already saw in the example, Flink programs look like regular Java
-programs with a <code>main()</code> method. Each program consists of the same 
basic parts:</p>
-
-    <ol>
-      <li>Obtain an <code>ExecutionEnvironment</code>,</li>
-      <li>Load/create the initial data,</li>
-      <li>Specify transformations on this data,</li>
-      <li>Specify where to put the results of your computations, and</li>
-      <li>Trigger the program execution</li>
-    </ol>
-
-    <p>We will now give an overview of each of those steps, but please refer to 
the respective sections for
-more details. Note that all
-<a 
href="https://github.com/apache/flink/blob/master//flink-java/src/main/java/org/apache/flink/api/java";>core
 classes of the Java API</a>
-are found in the package <code>org.apache.flink.api.java</code>.</p>
-
-    <p>The <code>ExecutionEnvironment</code> is the basis for all Flink 
programs. You can
-obtain one using these static methods on class 
<code>ExecutionEnvironment</code>:</p>
-
-    <div class="highlight"><pre><code class="language-java" 
data-lang="java"><span class="n">getExecutionEnvironment</span><span 
class="o">()</span>
-
-<span class="n">createLocalEnvironment</span><span class="o">()</span>
-<span class="n">createLocalEnvironment</span><span class="o">(</span><span 
class="kt">int</span> <span class="n">parallelism</span><span class="o">)</span>
-<span class="n">createLocalEnvironment</span><span class="o">(</span><span 
class="n">Configuration</span> <span class="n">customConfiguration</span><span 
class="o">)</span>
-
-<span class="n">createRemoteEnvironment</span><span class="o">(</span><span 
class="n">String</span> <span class="n">host</span><span class="o">,</span> 
<span class="kt">int</span> <span class="n">port</span><span class="o">,</span> 
<span class="n">String</span><span class="o">...</span> <span 
class="n">jarFiles</span><span class="o">)</span>
-<span class="n">createRemoteEnvironment</span><span class="o">(</span><span 
class="n">String</span> <span class="n">host</span><span class="o">,</span> 
<span class="kt">int</span> <span class="n">port</span><span class="o">,</span> 
<span class="kt">int</span> <span class="n">parallelism</span><span 
class="o">,</span> <span class="n">String</span><span class="o">...</span> 
<span class="n">jarFiles</span><span class="o">)</span></code></pre></div>
-
-    <p>Typically, you only need to use <code>getExecutionEnvironment()</code>, 
since this
-will do the right thing depending on the context: if you are executing
-your program inside an IDE or as a regular Java program it will create
-a local environment that will execute your program on your local machine. If
-you created a JAR file from your program, and invoke it through the <a 
href="cli.html">command line</a>
-or the <a href="web_client.html">web interface</a>,
-the Flink cluster manager will execute your main method and 
<code>getExecutionEnvironment()</code> will return
-an execution environment for executing your program on a cluster.</p>
-
-    <p>For specifying data sources the execution environment has several 
methods
-to read from files: you can just read them line by line,
-as CSV files, or using completely custom data input formats. To just read
-a text file as a sequence of lines, you can use:</p>
-
-    <div class="highlight"><pre><code class="language-java" 
data-lang="java"><span class="kd">final</span> <span 
class="n">ExecutionEnvironment</span> <span class="n">env</span> <span 
class="o">=</span> <span class="n">ExecutionEnvironment</span><span 
class="o">.</span><span class="na">getExecutionEnvironment</span><span 
class="o">();</span>
-
-<span class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">&gt;</span> <span class="n">text</span> 
<span class="o">=</span> <span class="n">env</span><span 
class="o">.</span><span class="na">readTextFile</span><span 
class="o">(</span><span class="s">&quot;file:///path/to/file&quot;</span><span 
class="o">);</span></code></pre></div>
-
-    <p>This will give you a DataSet on which you can then apply 
transformations. For
-more information on data sources and input formats, please refer to
-<a href="#data-sources">Data Sources</a>.</p>
-
-    <p>Once you have a DataSet you can apply transformations to create a new
-DataSet which you can then write to a file, transform again, or
-combine with other DataSets. You apply transformations by calling
-methods on DataSet with your own custom transformation function. For example,
-a map transformation looks like this:</p>
-
-    <div class="highlight"><pre><code class="language-java" 
data-lang="java"><span class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">&gt;</span> <span class="n">input</span> 
<span class="o">=</span> <span class="o">...;</span>
-
-<span class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">Integer</span><span class="o">&gt;</span> <span 
class="n">tokenized</span> <span class="o">=</span> <span 
class="n">input</span><span class="o">.</span><span class="na">map</span><span 
class="o">(</span><span class="k">new</span> <span 
class="n">MapFunction</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">,</span> <span 
class="n">Integer</span><span class="o">&gt;()</span> <span class="o">{</span>
-    <span class="nd">@Override</span>
-    <span class="kd">public</span> <span class="n">Integer</span> <span 
class="nf">map</span><span class="o">(</span><span class="n">String</span> 
<span class="n">value</span><span class="o">)</span> <span class="o">{</span>
-        <span class="k">return</span> <span class="n">Integer</span><span 
class="o">.</span><span class="na">parseInt</span><span class="o">(</span><span 
class="n">value</span><span class="o">);</span>
-    <span class="o">}</span>
-<span class="o">});</span></code></pre></div>
-
-    <p>This will create a new DataSet by converting every String in the 
original
-set to an Integer. For more information and a list of all the transformations,
-please refer to <a href="#transformations">Transformations</a>.</p>
-
-    <p>Once you have a DataSet containing your final results, you can either 
write the result
-to a file system (HDFS or local) or print it.</p>
-
-    <div class="highlight"><pre><code class="language-java" 
data-lang="java"><span class="n">writeAsText</span><span 
class="o">(</span><span class="n">String</span> <span 
class="n">path</span><span class="o">)</span>
-<span class="n">writeAsCsv</span><span class="o">(</span><span 
class="n">String</span> <span class="n">path</span><span class="o">)</span>
-<span class="n">write</span><span class="o">(</span><span 
class="n">FileOutputFormat</span><span class="o">&lt;</span><span 
class="n">T</span><span class="o">&gt;</span> <span 
class="n">outputFormat</span><span class="o">,</span> <span 
class="n">String</span> <span class="n">filePath</span><span class="o">)</span>
-
-<span class="n">print</span><span class="o">()</span>
-<span class="n">printOnTaskManager</span><span class="o">()</span>
-
-<span class="n">collect</span><span class="o">()</span></code></pre></div>
-
-  </div>
-  <div data-lang="scala">
-
-    <p>As we already saw in the example, Flink programs look like regular Scala
-programs with a <code>main()</code> method. Each program consists of the same 
basic parts:</p>
-
-    <ol>
-      <li>Obtain an <code>ExecutionEnvironment</code>,</li>
-      <li>Load/create the initial data,</li>
-      <li>Specify transformations on this data,</li>
-      <li>Specify where to put the results of your computations, and</li>
-      <li>Trigger the program execution</li>
-    </ol>
-
-    <p>We will now give an overview of each of those steps but please refer to 
the respective sections for
-more details. Note that all core classes of the Scala API are found in the 
package 
-<a 
href="https://github.com/apache/flink/blob/master/flink-scala/src/main/scala/org/apache/flink/api/scala">org.apache.flink.api.scala</a>.</p>
-
-    <p>The <code>ExecutionEnvironment</code> is the basis for all Flink 
programs. You can
-obtain one using these static methods on class 
<code>ExecutionEnvironment</code>:</p>
-
-    <div class="highlight"><pre><code class="language-scala" 
data-lang="scala"><span class="k">def</span> <span 
class="n">getExecutionEnvironment</span>
-
-<span class="k">def</span> <span class="n">createLocalEnvironment</span><span 
class="o">(</span><span class="n">parallelism</span><span class="k">:</span> 
<span class="kt">Int</span> <span class="o">=</span> <span 
class="nc">Runtime</span><span class="o">.</span><span 
class="n">getRuntime</span><span class="o">.</span><span 
class="n">availableProcessors</span><span class="o">())</span>
-<span class="k">def</span> <span class="n">createLocalEnvironment</span><span 
class="o">(</span><span class="n">customConfiguration</span><span 
class="k">:</span> <span class="kt">Configuration</span><span class="o">)</span>
-
-<span class="k">def</span> <span class="n">createRemoteEnvironment</span><span 
class="o">(</span><span class="n">host</span><span class="k">:</span> <span 
class="kt">String</span><span class="o">,</span> <span 
class="n">port</span><span class="k">:</span> <span 
class="kt">Int</span><span class="o">,</span> <span 
class="n">jarFiles</span><span class="k">:</span> <span 
class="kt">String*</span><span class="o">)</span>
-<span class="k">def</span> <span class="n">createRemoteEnvironment</span><span 
class="o">(</span><span class="n">host</span><span class="k">:</span> <span 
class="kt">String</span><span class="o">,</span> <span 
class="n">port</span><span class="k">:</span> <span 
class="kt">Int</span><span class="o">,</span> <span 
class="n">parallelism</span><span class="k">:</span> <span 
class="kt">Int</span><span class="o">,</span> <span 
class="n">jarFiles</span><span class="k">:</span> <span 
class="kt">String*</span><span class="o">)</span></code></pre></div>
-
-    <p>Typically, you only need to use <code>getExecutionEnvironment()</code>, 
since this
-will do the right thing depending on the context: if you are executing
-your program inside an IDE or as a regular Scala program it will create
-a local environment that will execute your program on your local machine. If
-you created a JAR file from you program, and invoke it through the <a 
href="cli.html">command line</a>
-or the <a href="web_client.html">web interface</a>,
-the Flink cluster manager will execute your main method and 
<code>getExecutionEnvironment()</code> will return
-an execution environment for executing your program on a cluster.</p>
-
-    <p>For specifying data sources the execution environment has several 
methods
-to read from files using various methods: you can just read them line by line,
-as CSV files, or using completely custom data input formats. To just read
-a text file as a sequence of lines, you can use:</p>
-
-    <div class="highlight"><pre><code class="language-scala" 
data-lang="scala"><span class="k">val</span> <span class="n">env</span> <span 
class="k">=</span> <span class="nc">ExecutionEnvironment</span><span 
class="o">.</span><span class="n">getExecutionEnvironment</span><span 
class="o">()</span>
-
-<span class="k">val</span> <span class="n">text</span> <span 
class="k">=</span> <span class="n">env</span><span class="o">.</span><span 
class="n">readTextFile</span><span class="o">(</span><span 
class="s">&quot;file:///path/to/file&quot;</span><span 
class="o">)</span></code></pre></div>
-
-    <p>This will give you a DataSet on which you can then apply 
transformations. For
-more information on data sources and input formats, please refer to
-<a href="#data-sources">Data Sources</a>.</p>
-
-    <p>Once you have a DataSet you can apply transformations to create a new
-DataSet which you can then write to a file, transform again, or
-combine with other DataSets. You apply transformations by calling
-methods on DataSet with your own custom transformation function. For example,
-a map transformation looks like this:</p>
-
-    <div class="highlight"><pre><code class="language-scala" 
data-lang="scala"><span class="k">val</span> <span class="n">input</span><span 
class="k">:</span> <span class="kt">DataSet</span><span class="o">[</span><span 
class="kt">String</span><span class="o">]</span> <span class="k">=</span> <span 
class="o">...</span>
-
-<span class="k">val</span> <span class="n">mapped</span> <span 
class="k">=</span> <span class="n">input</span><span class="o">.</span><span 
class="n">map</span> <span class="o">{</span> <span class="n">x</span> <span 
class="k">=&gt;</span> <span class="n">x</span><span class="o">.</span><span 
class="n">toInt</span> <span class="o">}</span></code></pre></div>
-
-    <p>This will create a new DataSet by converting every String in the 
original
-set to an Integer. For more information and a list of all the transformations,
-please refer to <a href="#transformations">Transformations</a>.</p>
-
-    <p>Once you have a DataSet containing your final results, you can either 
write the result
-to a file system (HDFS or local) or print it.</p>
-
-    <div class="highlight"><pre><code class="language-scala" 
data-lang="scala"><span class="k">def</span> <span 
class="n">writeAsText</span><span class="o">(</span><span 
class="n">path</span><span class="k">:</span> <span 
class="kt">String</span><span class="o">,</span> <span 
class="n">writeMode</span><span class="k">:</span> <span 
class="kt">WriteMode</span> <span class="o">=</span> <span 
class="nc">WriteMode</span><span class="o">.</span><span 
class="nc">NO_OVERWRITE</span><span class="o">)</span>
-<span class="k">def</span> <span class="n">writeAsCsv</span><span 
class="o">(</span>
-    <span class="n">filePath</span><span class="k">:</span> <span 
class="kt">String</span><span class="o">,</span>
-    <span class="n">rowDelimiter</span><span class="k">:</span> <span 
class="kt">String</span> <span class="o">=</span> <span 
class="s">&quot;\n&quot;</span><span class="o">,</span>
-    <span class="n">fieldDelimiter</span><span class="k">:</span> <span 
class="kt">String</span> <span class="o">=</span> <span 
class="s">&quot;,&quot;</span><span class="o">,</span>
-    <span class="n">writeMode</span><span class="k">:</span> <span 
class="kt">WriteMode</span> <span class="o">=</span> <span 
class="nc">WriteMode</span><span class="o">.</span><span 
class="nc">NO_OVERWRITE</span><span class="o">)</span>
-<span class="k">def</span> <span class="n">write</span><span 
class="o">(</span><span class="n">outputFormat</span><span class="k">:</span> 
<span class="kt">FileOutputFormat</span><span class="o">[</span><span 
class="kt">T</span><span class="o">],</span>
-    <span class="n">path</span><span class="k">:</span> <span 
class="kt">String</span><span class="o">,</span>
-    <span class="n">writeMode</span><span class="k">:</span> <span 
class="kt">WriteMode</span> <span class="o">=</span> <span 
class="nc">WriteMode</span><span class="o">.</span><span 
class="nc">NO_OVERWRITE</span><span class="o">)</span>
-
-<span class="k">def</span> <span class="n">printOnTaskManager</span><span 
class="o">()</span>
-
-<span class="k">def</span> <span class="n">print</span><span 
class="o">()</span>
-
-<span class="k">def</span> <span class="n">collect</span><span 
class="o">()</span></code></pre></div>
-
-  </div>
-</div>
-
-<p>The first two methods (<code>writeAsText()</code> and 
<code>writeAsCsv()</code>) do as the name suggests, the third one 
-can be used to specify a custom data output format. Please refer to <a 
href="#data-sinks">Data Sinks</a> for 
-more information on writing to files and also about custom data output 
formats.</p>
-
-<p>The <code>print()</code> method is useful for developing/debugging. It will 
output the contents of the DataSet 
-to standard output (on the JVM starting the Flink execution). 
<strong>NOTE</strong> The behavior of the <code>print()</code>
-method changed with Flink 0.9.x. Before, it printed to the log file of the 
workers; now it 
-sends the DataSet results to the client and prints them there.</p>
-
-<p><code>collect()</code> retrieves the DataSet from the cluster to the local 
JVM. The <code>collect()</code> method 
-will return a <code>List</code> containing the elements.</p>
-
-<p>Both <code>print()</code> and <code>collect()</code> will trigger the 
execution of the program. You don’t need to further call 
<code>execute()</code>.</p>
-
-<p><strong>NOTE</strong> <code>print()</code> and <code>collect()</code> 
retrieve the data from the cluster to the client. Currently,
-the data sizes you can retrieve with <code>collect()</code> are limited due to 
our RPC system. It is not advised
-to collect DataSets larger than 10 MB.</p>
-
-<p>There is also a <code>printOnTaskManager()</code> method which will print 
the DataSet contents on the TaskManager 
-(so you have to get them from the log file). The 
<code>printOnTaskManager()</code> method will not trigger a
-program execution.</p>
-
-<p>Once you specified the complete program you need to <strong>trigger the 
program execution</strong>. You can call
-<code>execute()</code> directly on the <code>ExecutionEnvironment</code> or you 
implicitly trigger the execution with
-<code>collect()</code> or <code>print()</code>.
-Depending on the type of the <code>ExecutionEnvironment</code> the program 
will either be executed on your local 
-machine or submitted for execution on a cluster.</p>
-
-<p>Note that you cannot call both <code>print()</code> (or 
<code>collect()</code>) and <code>execute()</code> at the end of the program.</p>
-
-<p>The <code>execute()</code> method returns the 
<code>JobExecutionResult</code>, including execution times and
-accumulator results. <code>print()</code> and <code>collect()</code> do not 
return the result, but it can be
-accessed from the <code>getLastJobExecutionResult()</code> method.</p>
-
-<p><a href="#top">Back to top</a></p>
-
-<h2 id="dataset-abstraction">DataSet abstraction</h2>
-
-<p>The batch processing APIs of Flink are centered around the 
<code>DataSet</code> abstraction. A <code>DataSet</code> is only
-an abstract representation of a set of data that can contain duplicates.</p>
-
-<p>Also note that Flink is not always physically creating (materializing) each 
DataSet at runtime. This 
-depends on the used runtime, the configuration and optimizer decisions.</p>
-
-<p>The Flink runtime does not need to always materialize the DataSets because 
it is using a streaming runtime model.</p>
-
-<p>DataSets are only materialized to avoid distributed deadlocks (at points 
where the data flow graph branches out and joins again later) or if the 
execution mode has explicitly been set to a batched execution.</p>
-
-<p>When using Flink on Tez, all DataSets are materialized.</p>
-
-<p><a href="#top">Back to top</a></p>
-
-<h2 id="lazy-evaluation">Lazy Evaluation</h2>
-
-<p>All Flink programs are executed lazily: When the program’s main method is 
executed, the data loading
-and transformations do not happen directly. Rather, each operation is created 
and added to the
-program’s plan. The operations are actually executed when the execution is 
explicitly triggered by 
-an <code>execute()</code> call on the ExecutionEnvironment object. Also, 
<code>collect()</code> and <code>print()</code> will trigger
-the job execution. Whether the program is executed locally or on a cluster 
depends
-on the environment of the program.</p>
-
-<p>The lazy evaluation lets you construct sophisticated programs that Flink 
executes as one
-holistically planned unit.</p>
-
-<p><a href="#top">Back to top</a></p>
-
-<h2 id="transformations">Transformations</h2>
-
-<p>Data transformations transform one or more DataSets into a new DataSet. 
Programs can combine
-multiple transformations into sophisticated assemblies.</p>
-
-<p>This section gives a brief overview of the available transformations. The 
<a href="dataset_transformations.html">transformations
-documentation</a> has a full description of all transformations with
-examples.</p>
-
-<div class="codetabs">
-  <div data-lang="java">
-
-    <p><br /></p>
-
-    <table class="table table-bordered">
-  <thead>
-    <tr>
-      <th class="text-left" style="width: 20%">Transformation</th>
-      <th class="text-center">Description</th>
-    </tr>
-  </thead>
-
-  <tbody>
-    <tr>
-      <td><strong>Map</strong></td>
-      <td>
-        <p>Takes one element and produces one element.</p>
-
-<div class="highlight"><pre><code class="language-java" data-lang="java"><span 
class="n">data</span><span class="o">.</span><span class="na">map</span><span 
class="o">(</span><span class="k">new</span> <span 
class="n">MapFunction</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">,</span> <span 
class="n">Integer</span><span class="o">&gt;()</span> <span class="o">{</span>
-  <span class="kd">public</span> <span class="n">Integer</span> <span 
class="nf">map</span><span class="o">(</span><span class="n">String</span> 
<span class="n">value</span><span class="o">)</span> <span class="o">{</span> 
<span class="k">return</span> <span class="n">Integer</span><span 
class="o">.</span><span class="na">parseInt</span><span class="o">(</span><span 
class="n">value</span><span class="o">);</span> <span class="o">}</span>
-<span class="o">});</span></code></pre></div>
-
-      </td>
-    </tr>
-
-    <tr>
-      <td><strong>FlatMap</strong></td>
-      <td>
-        <p>Takes one element and produces zero, one, or more elements. </p>
-
-<div class="highlight"><pre><code class="language-java" data-lang="java"><span 
class="n">data</span><span class="o">.</span><span 
class="na">flatMap</span><span class="o">(</span><span class="k">new</span> 
<span class="n">FlatMapFunction</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">,</span> <span 
class="n">String</span><span class="o">&gt;()</span> <span class="o">{</span>
-  <span class="kd">public</span> <span class="kt">void</span> <span 
class="nf">flatMap</span><span class="o">(</span><span class="n">String</span> 
<span class="n">value</span><span class="o">,</span> <span 
class="n">Collector</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">&gt;</span> <span 
class="n">out</span><span class="o">)</span> <span class="o">{</span>
-    <span class="k">for</span> <span class="o">(</span><span 
class="n">String</span> <span class="n">s</span> <span class="o">:</span> <span 
class="n">value</span><span class="o">.</span><span 
class="na">split</span><span class="o">(</span><span class="s">&quot; 
&quot;</span><span class="o">))</span> <span class="o">{</span>
-      <span class="n">out</span><span class="o">.</span><span 
class="na">collect</span><span class="o">(</span><span class="n">s</span><span 
class="o">);</span>
-    <span class="o">}</span>
-  <span class="o">}</span>
-<span class="o">});</span></code></pre></div>
-
-      </td>
-    </tr>
-
-    <tr>
-      <td><strong>MapPartition</strong></td>
-      <td>
-        <p>Transforms a parallel partition in a single function call. The 
function gets the partition
-        as an <code>Iterable</code> stream and can produce an arbitrary number of result 
values. The number of
-        elements in each partition depends on the degree-of-parallelism and 
previous operations.</p>
-
-<div class="highlight"><pre><code class="language-java" data-lang="java"><span 
class="n">data</span><span class="o">.</span><span 
class="na">mapPartition</span><span class="o">(</span><span 
class="k">new</span> <span class="n">MapPartitionFunction</span><span 
class="o">&lt;</span><span class="n">String</span><span class="o">,</span> 
<span class="n">Long</span><span class="o">&gt;()</span> <span 
class="o">{</span>
-  <span class="kd">public</span> <span class="kt">void</span> <span 
class="nf">mapPartition</span><span class="o">(</span><span 
class="n">Iterable</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">&gt;</span> <span 
class="n">values</span><span class="o">,</span> <span 
class="n">Collector</span><span class="o">&lt;</span><span 
class="n">Long</span><span class="o">&gt;</span> <span 
class="n">out</span><span class="o">)</span> <span class="o">{</span>
-    <span class="kt">long</span> <span class="n">c</span> <span 
class="o">=</span> <span class="mi">0</span><span class="o">;</span>
-    <span class="k">for</span> <span class="o">(</span><span 
class="n">String</span> <span class="n">s</span> <span class="o">:</span> <span 
class="n">values</span><span class="o">)</span> <span class="o">{</span>
-      <span class="n">c</span><span class="o">++;</span>
-    <span class="o">}</span>
-    <span class="n">out</span><span class="o">.</span><span 
class="na">collect</span><span class="o">(</span><span class="n">c</span><span 
class="o">);</span>
-  <span class="o">}</span>
-<span class="o">});</span></code></pre></div>
-
-      </td>
-    </tr>
-
-    <tr>
-      <td><strong>Filter</strong></td>
-      <td>
-        <p>Evaluates a boolean function for each element and retains those for 
which the function
-        returns true.<br />
-        
-        <strong>IMPORTANT:</strong> The system assumes that the function does 
not modify the elements on which the predicate is applied. Violating this 
assumption
-        can lead to incorrect results.
-        </p>
-
-<div class="highlight"><pre><code class="language-java" data-lang="java"><span 
class="n">data</span><span class="o">.</span><span 
class="na">filter</span><span class="o">(</span><span class="k">new</span> 
<span class="n">FilterFunction</span><span class="o">&lt;</span><span 
class="n">Integer</span><span class="o">&gt;()</span> <span class="o">{</span>
-  <span class="kd">public</span> <span class="kt">boolean</span> <span 
class="nf">filter</span><span class="o">(</span><span class="n">Integer</span> 
<span class="n">value</span><span class="o">)</span> <span class="o">{</span> 
<span class="k">return</span> <span class="n">value</span> <span 
class="o">&gt;</span> <span class="mi">1000</span><span class="o">;</span> 
<span class="o">}</span>
-<span class="o">});</span></code></pre></div>
-
-      </td>
-    </tr>
-
-    <tr>
-      <td><strong>Reduce</strong></td>
-      <td>
-        <p>Combines a group of elements into a single element by repeatedly 
combining two elements
-        into one. Reduce may be applied on a full data set, or on a grouped 
data set.</p>
-
-<div class="highlight"><pre><code class="language-java" data-lang="java"><span 
class="n">data</span><span class="o">.</span><span 
class="na">reduce</span><span class="o">(</span><span class="k">new</span> 
<span class="n">ReduceFunction</span><span class="o">&lt;</span><span 
class="n">Integer</span><span class="o">&gt;()</span> <span class="o">{</span>
-  <span class="kd">public</span> <span class="n">Integer</span> <span 
class="nf">reduce</span><span class="o">(</span><span class="n">Integer</span> 
<span class="n">a</span><span class="o">,</span> <span class="n">Integer</span> 
<span class="n">b</span><span class="o">)</span> <span class="o">{</span> <span 
class="k">return</span> <span class="n">a</span> <span class="o">+</span> <span 
class="n">b</span><span class="o">;</span> <span class="o">}</span>
-<span class="o">});</span></code></pre></div>
-
-      </td>
-    </tr>
-
-    <tr>
-      <td><strong>ReduceGroup</strong></td>
-      <td>
-        <p>Combines a group of elements into one or more elements. ReduceGroup 
may be applied on a
-        full data set, or on a grouped data set.</p>
-
-<div class="highlight"><pre><code class="language-java" data-lang="java"><span 
class="n">data</span><span class="o">.</span><span 
class="na">reduceGroup</span><span class="o">(</span><span class="k">new</span> 
<span class="n">GroupReduceFunction</span><span class="o">&lt;</span><span 
class="n">Integer</span><span class="o">,</span> <span 
class="n">Integer</span><span class="o">&gt;()</span> <span class="o">{</span>
-  <span class="kd">public</span> <span class="kt">void</span> <span 
class="nf">reduce</span><span class="o">(</span><span 
class="n">Iterable</span><span class="o">&lt;</span><span 
class="n">Integer</span><span class="o">&gt;</span> <span 
class="n">values</span><span class="o">,</span> <span 
class="n">Collector</span><span class="o">&lt;</span><span 
class="n">Integer</span><span class="o">&gt;</span> <span 
class="n">out</span><span class="o">)</span> <span class="o">{</span>
-    <span class="kt">int</span> <span class="n">prefixSum</span> <span 
class="o">=</span> <span class="mi">0</span><span class="o">;</span>
-    <span class="k">for</span> <span class="o">(</span><span 
class="n">Integer</span> <span class="n">i</span> <span class="o">:</span> 
<span class="n">values</span><span class="o">)</span> <span class="o">{</span>
-      <span class="n">prefixSum</span> <span class="o">+=</span> <span 
class="n">i</span><span class="o">;</span>
-      <span class="n">out</span><span class="o">.</span><span 
class="na">collect</span><span class="o">(</span><span 
class="n">prefixSum</span><span class="o">);</span>
-    <span class="o">}</span>
-  <span class="o">}</span>
-<span class="o">});</span></code></pre></div>
-
-      </td>
-    </tr>
-
-    <tr>
-      <td><strong>Aggregate</strong></td>
-      <td>
-        <p>Aggregates a group of values into a single value. Aggregation 
functions can be thought of
-        as built-in reduce functions. Aggregate may be applied on a full data 
set, or on a grouped
-        data set.</p>
-
-<div class="highlight"><pre><code class="language-java" data-lang="java"><span 
class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">Tuple3</span><span class="o">&lt;</span><span 
class="n">Integer</span><span class="o">,</span> <span 
class="n">String</span><span class="o">,</span> <span 
class="n">Double</span><span class="o">&gt;&gt;</span> <span 
class="n">input</span> <span class="o">=</span> <span class="c1">// [...]</span>
-<span class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">Tuple3</span><span class="o">&lt;</span><span 
class="n">Integer</span><span class="o">,</span> <span 
class="n">String</span><span class="o">,</span> <span 
class="n">Double</span><span class="o">&gt;&gt;</span> <span 
class="n">output</span> <span class="o">=</span> <span 
class="n">input</span><span class="o">.</span><span 
class="na">aggregate</span><span class="o">(</span><span 
class="n">SUM</span><span class="o">,</span> <span class="mi">0</span><span 
class="o">).</span><span class="na">and</span><span class="o">(</span><span 
class="n">MIN</span><span class="o">,</span> <span class="mi">2</span><span 
class="o">);</span></code></pre></div>
-
-       <p>You can also use short-hand syntax for minimum, maximum, and sum 
aggregations.</p>
-       
-<div class="highlight"><pre><code class="language-java" data-lang="java"><span 
class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">Tuple3</span><span class="o">&lt;</span><span 
class="n">Integer</span><span class="o">,</span> <span 
class="n">String</span><span class="o">,</span> <span 
class="n">Double</span><span class="o">&gt;&gt;</span> <span 
class="n">input</span> <span class="o">=</span> <span class="c1">// [...]</span>
-<span class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">Tuple3</span><span class="o">&lt;</span><span 
class="n">Integer</span><span class="o">,</span> <span 
class="n">String</span><span class="o">,</span> <span 
class="n">Double</span><span class="o">&gt;&gt;</span> <span 
class="n">output</span> <span class="o">=</span> <span 
class="n">input</span><span class="o">.</span><span class="na">sum</span><span 
class="o">(</span><span class="mi">0</span><span class="o">).</span><span 
class="na">andMin</span><span class="o">(</span><span class="mi">2</span><span 
class="o">);</span></code></pre></div>
-
-      </td>
-    </tr>
-
-    
-    <tr>
-      <td><strong>Join</strong></td>
-      <td>
-        Joins two data sets by creating all pairs of elements that are equal 
on their keys.
-        Optionally uses a JoinFunction to turn the pair of elements into a 
single element, or a
-        FlatJoinFunction to turn the pair of elements into arbitrarily many 
(including none)
-        elements. See <a href="#specifying-keys">keys</a> on how to define 
join keys.
-
-<div class="highlight"><pre><code class="language-java" data-lang="java"><span 
class="n">result</span> <span class="o">=</span> <span 
class="n">input1</span><span class="o">.</span><span 
class="na">join</span><span class="o">(</span><span 
class="n">input2</span><span class="o">)</span>
-               <span class="o">.</span><span class="na">where</span><span 
class="o">(</span><span class="mi">0</span><span class="o">)</span>       <span 
class="c1">// key of the first input (tuple field 0)</span>
-               <span class="o">.</span><span class="na">equalTo</span><span 
class="o">(</span><span class="mi">1</span><span class="o">);</span>    <span 
class="c1">// key of the second input (tuple field 1)</span></code></pre></div>
-
-        You can specify the way that the runtime executes the join via <i>Join 
Hints</i>. The hints
-        describe whether the join happens through partitioning or 
broadcasting, and whether it uses
-        a sort-based or a hash-based algorithm. Please refer to the 
-        <a 
href="dataset_transformations.html#join-algorithm-hints">Transformations 
Guide</a> for
-        a list of possible hints and an example.
-        If no hint is specified, the system will try to make an estimate of 
the input sizes and
-        pick the best strategy according to those estimates. 
-
-<div class="highlight"><pre><code class="language-java" data-lang="java"><span 
class="c1">// This executes a join by broadcasting the first data set</span>
-<span class="c1">// using a hash table for the broadcasted data</span>
-<span class="n">result</span> <span class="o">=</span> <span 
class="n">input1</span><span class="o">.</span><span 
class="na">join</span><span class="o">(</span><span 
class="n">input2</span><span class="o">,</span> <span 
class="n">JoinHint</span><span class="o">.</span><span 
class="na">BROADCAST_HASH_FIRST</span><span class="o">)</span>
-               <span class="o">.</span><span class="na">where</span><span 
class="o">(</span><span class="mi">0</span><span class="o">).</span><span 
class="na">equalTo</span><span class="o">(</span><span class="mi">1</span><span 
class="o">);</span></code></pre></div>
-
-        Note that the join transformation works only for equi-joins. Other 
join types, for example outer-joins, need to be expressed using CoGroup.
-      </td>
-    </tr>
-    
-
-    <tr>
-      <td><strong>CoGroup</strong></td>
-      <td>
-        <p>The two-dimensional variant of the reduce operation. Groups each 
input on one or more
-        fields and then joins the groups. The transformation function is 
called per pair of groups.
-        See <a href="#specifying-keys">keys</a> on how to define coGroup 
keys.</p>
-
-<div class="highlight"><pre><code class="language-java" data-lang="java"><span 
class="n">data1</span><span class="o">.</span><span 
class="na">coGroup</span><span class="o">(</span><span 
class="n">data2</span><span class="o">)</span>
-     <span class="o">.</span><span class="na">where</span><span 
class="o">(</span><span class="mi">0</span><span class="o">)</span>
-     <span class="o">.</span><span class="na">equalTo</span><span 
class="o">(</span><span class="mi">1</span><span class="o">)</span>
-     <span class="o">.</span><span class="na">with</span><span 
class="o">(</span><span class="k">new</span> <span 
class="n">CoGroupFunction</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">,</span> <span 
class="n">String</span><span class="o">,</span> <span 
class="n">String</span><span class="o">&gt;()</span> <span class="o">{</span>
-         <span class="kd">public</span> <span class="kt">void</span> <span 
class="nf">coGroup</span><span class="o">(</span><span 
class="n">Iterable</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">&gt;</span> <span 
class="n">in1</span><span class="o">,</span> <span 
class="n">Iterable</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">&gt;</span> <span 
class="n">in2</span><span class="o">,</span> <span 
class="n">Collector</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">&gt;</span> <span 
class="n">out</span><span class="o">)</span> <span class="o">{</span>
-           <span class="n">out</span><span class="o">.</span><span 
class="na">collect</span><span class="o">(...);</span>
-         <span class="o">}</span>
-      <span class="o">});</span></code></pre></div>
-
-      </td>
-    </tr>
-
-    <tr>
-      <td><strong>Cross</strong></td>
-      <td>
-        <p>Builds the Cartesian product (cross product) of two inputs, 
creating all pairs of
-        elements. Optionally uses a CrossFunction to turn the pair of elements 
into a single
-        element.</p>
-
-<div class="highlight"><pre><code class="language-java" data-lang="java"><span 
class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">Integer</span><span class="o">&gt;</span> <span 
class="n">data1</span> <span class="o">=</span> <span class="c1">// [...]</span>
-<span class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">&gt;</span> <span class="n">data2</span> 
<span class="o">=</span> <span class="c1">// [...]</span>
-<span class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">Tuple2</span><span class="o">&lt;</span><span 
class="n">Integer</span><span class="o">,</span> <span 
class="n">String</span><span class="o">&gt;&gt;</span> <span 
class="n">result</span> <span class="o">=</span> <span 
class="n">data1</span><span class="o">.</span><span 
class="na">cross</span><span class="o">(</span><span 
class="n">data2</span><span class="o">);</span></code></pre></div>
-
-      <p>Note: Cross is potentially a <b>very</b> compute-intensive operation 
which can challenge even large compute clusters! It is advised to hint the 
system with the DataSet sizes by using <i>crossWithTiny()</i> and 
<i>crossWithHuge()</i>.</p>
-      </td>
-    </tr>
-    <tr>
-      <td><strong>Union</strong></td>
-      <td>
-        <p>Produces the union of two data sets. This operation happens 
implicitly if more than one
-        data set is used for a specific function input.</p>
-
-<div class="highlight"><pre><code class="language-java" data-lang="java"><span 
class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">&gt;</span> <span class="n">data1</span> 
<span class="o">=</span> <span class="c1">// [...]</span>
-<span class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">&gt;</span> <span class="n">data2</span> 
<span class="o">=</span> <span class="c1">// [...]</span>
-<span class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">&gt;</span> <span 
class="n">result</span> <span class="o">=</span> <span 
class="n">data1</span><span class="o">.</span><span 
class="na">union</span><span class="o">(</span><span 
class="n">data2</span><span class="o">);</span></code></pre></div>
-
-      </td>
-    </tr>
-    <tr>
-      <td><strong>Rebalance</strong></td>
-      <td>
-        <p>Evenly rebalances the parallel partitions of a data set to 
eliminate data skew. Only Map-like transformations may follow a rebalance 
transformation.</p>
-
-<div class="highlight"><pre><code class="language-java" data-lang="java"><span 
class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">&gt;</span> <span class="n">in</span> 
<span class="o">=</span> <span class="c1">// [...]</span>
-<span class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">&gt;</span> <span 
class="n">result</span> <span class="o">=</span> <span class="n">in</span><span 
class="o">.</span><span class="na">rebalance</span><span class="o">()</span>
-                           <span class="o">.</span><span 
class="na">map</span><span class="o">(</span><span class="k">new</span> <span 
class="nf">Mapper</span><span class="o">());</span></code></pre></div>
-
-      </td>
-    </tr>
-    <tr>
-      <td><strong>Hash-Partition</strong></td>
-      <td>
-        <p>Hash-partitions a data set on a given key. Keys can be specified as 
key-selector functions or field position keys.</p>
-
-<div class="highlight"><pre><code class="language-java" data-lang="java"><span 
class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">Tuple2</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">,</span><span 
class="n">Integer</span><span class="o">&gt;&gt;</span> <span 
class="n">in</span> <span class="o">=</span> <span class="c1">// [...]</span>
-<span class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">Integer</span><span class="o">&gt;</span> <span 
class="n">result</span> <span class="o">=</span> <span class="n">in</span><span 
class="o">.</span><span class="na">partitionByHash</span><span 
class="o">(</span><span class="mi">0</span><span class="o">)</span>
-                            <span class="o">.</span><span 
class="na">mapPartition</span><span class="o">(</span><span 
class="k">new</span> <span class="nf">PartitionMapper</span><span 
class="o">());</span></code></pre></div>
-
-      </td>
-    </tr>
-    <tr>
-      <td><strong>Custom Partitioning</strong></td>
-      <td>
-        <p>Manually specify a partitioning over the data.
-          <br />
-          <i>Note</i>: This method works only on single field keys.</p>
-
-<div class="highlight"><pre><code class="language-java" data-lang="java"><span 
class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">Tuple2</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">,</span><span 
class="n">Integer</span><span class="o">&gt;&gt;</span> <span 
class="n">in</span> <span class="o">=</span> <span class="c1">// [...]</span>
-<span class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">Integer</span><span class="o">&gt;</span> <span 
class="n">result</span> <span class="o">=</span> <span class="n">in</span><span 
class="o">.</span><span class="na">partitionCustom</span><span 
class="o">(</span><span class="n">Partitioner</span><span 
class="o">&lt;</span><span class="n">K</span><span class="o">&gt;</span> <span 
class="n">partitioner</span><span class="o">,</span> <span 
class="n">key</span><span class="o">)</span></code></pre></div>
-
-      </td>
-    </tr>
-    <tr>
-      <td><strong>Sort Partition</strong></td>
-      <td>
-        <p>Locally sorts all partitions of a data set on a specified field in 
a specified order. 
-          Fields can be specified as tuple positions or field expressions. 
-          Sorting on multiple fields is done by chaining sortPartition() 
calls.</p>
-
-<div class="highlight"><pre><code class="language-java" data-lang="java"><span 
class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">Tuple2</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">,</span><span 
class="n">Integer</span><span class="o">&gt;&gt;</span> <span 
class="n">in</span> <span class="o">=</span> <span class="c1">// [...]</span>
-<span class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">Integer</span><span class="o">&gt;</span> <span 
class="n">result</span> <span class="o">=</span> <span class="n">in</span><span 
class="o">.</span><span class="na">sortPartition</span><span 
class="o">(</span><span class="mi">1</span><span class="o">,</span> <span 
class="n">Order</span><span class="o">.</span><span 
class="na">ASCENDING</span><span class="o">)</span>
-                            <span class="o">.</span><span 
class="na">mapPartition</span><span class="o">(</span><span 
class="k">new</span> <span class="nf">PartitionMapper</span><span 
class="o">());</span></code></pre></div>
-
-      </td>
-    </tr>
-    <tr>
-      <td><strong>First-n</strong></td>
-      <td>
-        <p>Returns the first n (arbitrary) elements of a data set. First-n can 
be applied on a regular data set, a grouped data set, or a grouped-sorted data 
set. Grouping keys can be specified as key-selector functions or field position 
keys.</p>
-
-<div class="highlight"><pre><code class="language-java" data-lang="java"><span 
class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">Tuple2</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">,</span><span 
class="n">Integer</span><span class="o">&gt;&gt;</span> <span 
class="n">in</span> <span class="o">=</span> <span class="c1">// [...]</span>
-<span class="c1">// regular data set</span>
-<span class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">Tuple2</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">,</span><span 
class="n">Integer</span><span class="o">&gt;&gt;</span> <span 
class="n">result1</span> <span class="o">=</span> <span 
class="n">in</span><span class="o">.</span><span class="na">first</span><span 
class="o">(</span><span class="mi">3</span><span class="o">);</span>
-<span class="c1">// grouped data set</span>
-<span class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">Tuple2</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">,</span><span 
class="n">Integer</span><span class="o">&gt;&gt;</span> <span 
class="n">result2</span> <span class="o">=</span> <span 
class="n">in</span><span class="o">.</span><span class="na">groupBy</span><span 
class="o">(</span><span class="mi">0</span><span class="o">)</span>
-                                            <span class="o">.</span><span 
class="na">first</span><span class="o">(</span><span class="mi">3</span><span 
class="o">);</span>
-<span class="c1">// grouped-sorted data set</span>
-<span class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">Tuple2</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">,</span><span 
class="n">Integer</span><span class="o">&gt;&gt;</span> <span 
class="n">result3</span> <span class="o">=</span> <span 
class="n">in</span><span class="o">.</span><span class="na">groupBy</span><span 
class="o">(</span><span class="mi">0</span><span class="o">)</span>
-                                            <span class="o">.</span><span 
class="na">sortGroup</span><span class="o">(</span><span 
class="mi">1</span><span class="o">,</span> <span class="n">Order</span><span 
class="o">.</span><span class="na">ASCENDING</span><span class="o">)</span>
-                                            <span class="o">.</span><span 
class="na">first</span><span class="o">(</span><span class="mi">3</span><span 
class="o">);</span></code></pre></div>
-
-      </td>
-    </tr>
-  </tbody>
-</table>
-
-    <hr />
-
-    <p>The following transformations are available on data sets of Tuples:</p>
-
-    <table class="table table-bordered">
-  <thead>
-    <tr>
-      <th class="text-left" style="width: 20%">Transformation</th>
-      <th class="text-center">Description</th>
-    </tr>
-  </thead>
-  <tbody>
-   <tr>
-      <td><strong>Project</strong></td>
-      <td>
-        <p>Selects a subset of fields from the tuples.</p>
-
-<div class="highlight"><pre><code class="language-java" data-lang="java"><span 
class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">Tuple3</span><span class="o">&lt;</span><span 
class="n">Integer</span><span class="o">,</span> <span 
class="n">Double</span><span class="o">,</span> <span 
class="n">String</span><span class="o">&gt;&gt;</span> <span 
class="n">in</span> <span class="o">=</span> <span class="c1">// [...]</span>
-<span class="n">DataSet</span><span class="o">&lt;</span><span 
class="n">Tuple2</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">,</span> <span 
class="n">Integer</span><span class="o">&gt;&gt;</span> <span 
class="n">out</span> <span class="o">=</span> <span class="n">in</span><span 
class="o">.</span><span class="na">project</span><span class="o">(</span><span 
class="mi">2</span><span class="o">,</span><span class="mi">0</span><span 
class="o">);</span></code></pre></div>
-
-      </td>
-    </tr>
-  </tbody>
-</table>
-
-  </div>
-  <div data-lang="scala">
-    <p><br /></p>
-
-    <table class="table table-bordered">
-  <thead>
-    <tr>
-      <th class="text-left" style="width: 20%">Transformation</th>
-      <th class="text-center">Description</th>
-    </tr>
-  </thead>
-
-  <tbody>
-    <tr>
-      <td><strong>Map</strong></td>
-      <td>
-        <p>Takes one element and produces one element.</p>
-
-<div class="highlight"><pre><code class="language-scala" 
data-lang="scala"><span class="n">data</span><span class="o">.</span><span 
class="n">map</span> <span class="o">{</span> <span class="n">x</span> <span 
class="k">=&gt;</span> <span class="n">x</span><span class="o">.</span><span 
class="n">toInt</span> <span class="o">}</span></code></pre></div>
-
-      </td>
-    </tr>
-
-    <tr>
-      <td><strong>FlatMap</strong></td>
-      <td>
-        <p>Takes one element and produces zero, one, or more elements. </p>
-
-<div class="highlight"><pre><code class="language-scala" 
data-lang="scala"><span class="n">data</span><span class="o">.</span><span 
class="n">flatMap</span> <span class="o">{</span> <span class="n">str</span> 
<span class="k">=&gt;</span> <span class="n">str</span><span 
class="o">.</span><span class="n">split</span><span class="o">(</span><span 
class="s">&quot; &quot;</span><span class="o">)</span> <span 
class="o">}</span></code></pre></div>
-
-      </td>
-    </tr>
-
-    <tr>
-      <td><strong>MapPartition</strong></td>
-      <td>
-        <p>Transforms a parallel partition in a single function call. The 
function gets the partition
-        as an <code>Iterator</code> and can produce an arbitrary number of result values. 
The number of
-        elements in each partition depends on the degree-of-parallelism and 
previous operations.</p>
-
-<div class="highlight"><pre><code class="language-scala" 
data-lang="scala"><span class="n">data</span><span class="o">.</span><span 
class="n">mapPartition</span> <span class="o">{</span> <span 
class="n">in</span> <span class="k">=&gt;</span> <span class="n">in</span> 
<span class="n">map</span> <span class="o">{</span> <span 
class="o">(</span><span class="k">_</span><span class="o">,</span> <span 
class="mi">1</span><span class="o">)</span> <span class="o">}</span> <span 
class="o">}</span></code></pre></div>
-
-      </td>
-    </tr>
-
-    <tr>
-      <td><strong>Filter</strong></td>
-      <td>
-        <p>Evaluates a boolean function for each element and retains those for 
which the function
-        returns true.<br />
-        <strong>IMPORTANT:</strong> The system assumes that the function does 
not modify the element on which the predicate is applied.
-        Violating this assumption can lead to incorrect results.</p>
-
-<div class="highlight"><pre><code class="language-scala" 
data-lang="scala"><span class="n">data</span><span class="o">.</span><span 
class="n">filter</span> <span class="o">{</span> <span class="k">_</span> <span 
class="o">&gt;</span> <span class="mi">1000</span> <span 
class="o">}</span></code></pre></div>
-
-      </td>
-    </tr>
-
-    <tr>
-      <td><strong>Reduce</strong></td>
-      <td>
-        <p>Combines a group of elements into a single element by repeatedly 
combining two elements
-        into one. Reduce may be applied on a full data set, or on a grouped 
data set.</p>
-
-<div class="highlight"><pre><code class="language-scala" 
data-lang="scala"><span class="n">data</span><span class="o">.</span><span 
class="n">reduce</span> <span class="o">{</span> <span class="k">_</span> <span 
class="o">+</span> <span class="k">_</span> <span 
class="o">}</span></code></pre></div>
-
-      </td>
-    </tr>
-
-    <tr>
-      <td><strong>ReduceGroup</strong></td>
-      <td>
-        <p>Combines a group of elements into one or more elements. ReduceGroup 
may be applied on a
-        full data set, or on a grouped data set.</p>
-
-<div class="highlight"><pre><code class="language-scala" 
data-lang="scala"><span class="n">data</span><span class="o">.</span><span 
class="n">reduceGroup</span> <span class="o">{</span> <span 
class="n">elements</span> <span class="k">=&gt;</span> <span 
class="n">elements</span><span class="o">.</span><span class="n">sum</span> 
<span class="o">}</span></code></pre></div>
-
-      </td>
-    </tr>
-
-    <tr>
-      <td><strong>Aggregate</strong></td>
-      <td>
-        <p>Aggregates a group of values into a single value. Aggregation 
functions can be thought of
-        as built-in reduce functions. Aggregate may be applied on a full data 
set, or on a grouped
-        data set.</p>
-
-<div class="highlight"><pre><code class="language-scala" 
data-lang="scala"><span class="k">val</span> <span class="n">input</span><span 
class="k">:</span> <span class="kt">DataSet</span><span 
class="o">[(</span><span class="kt">Int</span>, <span class="kt">String</span>, 
<span class="kt">Double</span><span class="o">)]</span> <span 
class="k">=</span> <span class="c1">// [...]</span>
-<span class="k">val</span> <span class="n">output</span><span 
class="k">:</span> <span class="kt">DataSet</span><span 
class="o">[(</span><span class="kt">Int</span>, <span class="kt">String</span>, 
<span class="kt">Double</span><span class="o">)]</span> <span 
class="k">=</span> <span class="n">input</span><span class="o">.</span><span 
class="n">aggregate</span><span class="o">(</span><span 
class="nc">SUM</span><span class="o">,</span> <span class="mi">0</span><span 
class="o">).</span><span class="n">aggregate</span><span 
class="o">(</span><span class="nc">MIN</span><span class="o">,</span> <span 
class="mi">2</span><span class="o">);</span></code></pre></div>
-
-  <p>You can also use short-hand syntax for minimum, maximum, and sum 
aggregations.</p>
-
-<div class="highlight"><pre><code class="language-scala" 
data-lang="scala"><span class="k">val</span> <span class="n">input</span><span 
class="k">:</span> <span class="kt">DataSet</span><span 
class="o">[(</span><span class="kt">Int</span>, <span class="kt">String</span>, 
<span class="kt">Double</span><span class="o">)]</span> <span 
class="k">=</span> <span class="c1">// [...]</span>
-<span class="k">val</span> <span class="n">output</span><span 
class="k">:</span> <span class="kt">DataSet</span><span 
class="o">[(</span><span class="kt">Int</span>, <span class="kt">String</span>, 
<span class="kt">Double</span><span class="o">)]</span> <span 
class="k">=</span> <span class="n">input</span><span class="o">.</span><span 
class="n">sum</span><span class="o">(</span><span class="mi">0</span><span 
class="o">).</span><span class="n">min</span><span class="o">(</span><span 
class="mi">2</span><span class="o">)</span></code></pre></div>
-
-      </td>
-    </tr>
-
-    <tr>
-      <td><strong>Join</strong></td>
-      <td>
-        Joins two data sets by creating all pairs of elements that are equal 
on their keys.
-        Optionally uses a JoinFunction to turn the pair of elements into a 
single element, or a
-        FlatJoinFunction to turn the pair of elements into arbitrarily many 
(including none)
-        elements. See <a href="#specifying-keys">keys</a> on how to define 
join keys.
-
-<div class="highlight"><pre><code class="language-scala" 
data-lang="scala"><span class="c1">// In this case tuple fields are used as 
keys. &quot;0&quot; is the join field on the first tuple</span>
-<span class="c1">// &quot;1&quot; is the join field on the second tuple.</span>
-<span class="k">val</span> <span class="n">result</span> <span 
class="k">=</span> <span class="n">input1</span><span class="o">.</span><span 
class="n">join</span><span class="o">(</span><span class="n">input2</span><span 
class="o">).</span><span class="n">where</span><span class="o">(</span><span 
class="mi">0</span><span class="o">).</span><span class="n">equalTo</span><span 
class="o">(</span><span class="mi">1</span><span 
class="o">)</span></code></pre></div>
-
-        You can specify the way that the runtime executes the join via <i>Join 
Hints</i>. The hints
-        describe whether the join happens through partitioning or 
broadcasting, and whether it uses
-        a sort-based or a hash-based algorithm. Please refer to the 
-        <a 
href="dataset_transformations.html#join-algorithm-hints">Transformations 
Guide</a> for
-        a list of possible hints and an example.
-        If no hint is specified, the system will try to make an estimate of 
the input sizes and
-        pick the best strategy according to those estimates.
-
-<div class="highlight"><pre><code class="language-scala" 
data-lang="scala"><span class="c1">// This executes a join by broadcasting the 
first data set</span>
-<span class="c1">// using a hash table for the broadcasted data</span>
-<span class="k">val</span> <span class="n">result</span> <span 
class="k">=</span> <span class="n">input1</span><span class="o">.</span><span 
class="n">join</span><span class="o">(</span><span class="n">input2</span><span 
class="o">,</span> <span class="nc">JoinHint</span><span 
class="o">.</span><span class="nc">BROADCAST_HASH_FIRST</span><span 
class="o">)</span>
-                   <span class="o">.</span><span class="n">where</span><span 
class="o">(</span><span class="mi">0</span><span class="o">).</span><span 
class="n">equalTo</span><span class="o">(</span><span class="mi">1</span><span 
class="o">)</span></code></pre></div>
-
-          Note that the join transformation works only for equi-joins. Other 
join types, for example outer-joins, need to be expressed using CoGroup.
-      </td>
-    </tr>
-
-    <tr>
-      <td><strong>CoGroup</strong></td>
-      <td>
-        <p>The two-dimensional variant of the reduce operation. Groups each 
input on one or more
-        fields and then joins the groups. The transformation function is 
called per pair of groups.
-        See <a href="#specifying-keys">keys</a> on how to define coGroup 
keys.</p>
-
-<div class="highlight"><pre><code class="language-scala" 
data-lang="scala"><span class="n">data1</span><span class="o">.</span><span 
class="n">coGroup</span><span class="o">(</span><span 
class="n">data2</span><span class="o">).</span><span 
class="n">where</span><span class="o">(</span><span class="mi">0</span><span 
class="o">).</span><span class="n">equalTo</span><span class="o">(</span><span 
class="mi">1</span><span class="o">)</span></code></pre></div>
-
-      </td>
-    </tr>
-
-    <tr>
-      <td><strong>Cross</strong></td>
-      <td>
-        <p>Builds the Cartesian product (cross product) of two inputs, 
creating all pairs of
-        elements. Optionally uses a CrossFunction to turn the pair of elements 
into a single
-        element.</p>
-
-<div class="highlight"><pre><code class="language-scala" 
data-lang="scala"><span class="k">val</span> <span class="n">data1</span><span 
class="k">:</span> <span class="kt">DataSet</span><span class="o">[</span><span 
class="kt">Int</span><span class="o">]</span> <span class="k">=</span> <span 
class="c1">// [...]</span>
-<span class="k">val</span> <span class="n">data2</span><span 
class="k">:</span> <span class="kt">DataSet</span><span class="o">[</span><span 
class="kt">String</span><span class="o">]</span> <span class="k">=</span> <span 
class="c1">// [...]</span>
-<span class="k">val</span> <span class="n">result</span><span 
class="k">:</span> <span class="kt">DataSet</span><span 
class="o">[(</span><span class="kt">Int</span>, <span 
class="kt">String</span><span class="o">)]</span> <span class="k">=</span> 
<span class="n">data1</span><span class="o">.</span><span 
class="n">cross</span><span class="o">(</span><span class="n">data2</span><span 
class="o">)</span></code></pre></div>
-
-        <p>Note: Cross is potentially a <b>very</b> compute-intensive 
operation which can challenge even large compute clusters! It is advised to 
hint the system with the DataSet sizes by using <i>crossWithTiny()</i> and 
<i>crossWithHuge()</i>.</p>
-      </td>
-    </tr>
-    <tr>
-      <td><strong>Union</strong></td>
-      <td>
-        <p>Produces the union of two data sets.</p>
-
-<div class="highlight"><pre><code class="language-scala" 
data-lang="scala"><span class="n">data</span><span class="o">.</span><span 
class="n">union</span><span class="o">(</span><span class="n">data2</span><span 
class="o">)</span></code></pre></div>
-
-      </td>
-    </tr>
-    <tr>
-      <td><strong>Rebalance</strong></td>
-      <td>
-        <p>Evenly rebalances the parallel partitions of a data set to 
eliminate data skew. Only Map-like transformations may follow a rebalance 
transformation.</p>
-
-<div class="highlight"><pre><code class="language-scala" 
data-lang="scala"><span class="k">val</span> <span class="n">data1</span><span 
class="k">:</span> <span class="kt">DataSet</span><span class="o">[</span><span 
class="kt">Int</span><span class="o">]</span> <span class="k">=</span> <span 
class="c1">// [...]</span>
-<span class="k">val</span> <span class="n">result</span><span 
class="k">:</span> <span class="kt">DataSet</span><span 
class="o">[(</span><span class="kt">Int</span>, <span 
class="kt">String</span><span class="o">)]</span> <span class="k">=</span> 
<span class="n">data1</span><span class="o">.</span><span 
class="n">rebalance</span><span class="o">().</span><span 
class="n">map</span><span class="o">(...)</span></code></pre></div>
-
-      </td>
-    </tr>
-    <tr>
-      <td><strong>Hash-Partition</strong></td>
-      <td>
-        <p>Hash-partitions a data set on a given key. Keys can be specified as 
key-selector functions, tuple positions
-        or case class fields.</p>
-
-<div class="highlight"><pre><code class="language-scala" 
data-lang="scala"><span class="k">val</span> <span class="n">in</span><span 
class="k">:</span> <span class="kt">DataSet</span><span 
class="o">[(</span><span class="kt">Int</span>, <span 
class="kt">String</span><span class="o">)]</span> <span class="k">=</span> 
<span class="c1">// [...]</span>
-<span class="k">val</span> <span class="n">result</span> <span 
class="k">=</span> <span class="n">in</span><span class="o">.</span><span 
class="n">partitionByHash</span><span class="o">(</span><span 
class="mi">0</span><span class="o">).</span><span class="n">mapPartition</span> 
<span class="o">{</span> <span class="o">...</span> <span 
class="o">}</span></code></pre></div>
-
-      </td>
-    </tr>
-    
-    <tr>
-      <td><strong>Sort Partition</strong></td>
-      <td>
-        <p>Locall

<TRUNCATED>

Reply via email to