[47/51] [abbrv] [partial] incubator-beam-site git commit: Regenerate site

davor Fri, 04 Nov 2016 16:49:00 -0700

http://git-wip-us.apache.org/repos/asf/incubator-beam-site/blob/61ba656f/content/documentation/programming-guide/index.html
----------------------------------------------------------------------
diff --git a/content/documentation/programming-guide/index.html 
b/content/documentation/programming-guide/index.html
new file mode 100644
index 0000000..8c695d1
--- /dev/null
+++ b/content/documentation/programming-guide/index.html
@@ -0,0 +1,621 @@
+<!DOCTYPE html>
+<html lang="en">
+
+  <head>
+  <meta charset="utf-8">
+  <meta http-equiv="X-UA-Compatible" content="IE=edge">
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+
+  <title>Beam Programming Guide</title>
+  <meta name="description" content="Apache Beam is an open source, unified 
model and set of language-specific SDKs for defining and executing data 
processing workflows, and also data ingestion and integration flows, supporting 
Enterprise Integration Patterns (EIPs) and Domain Specific Languages (DSLs). 
Dataflow pipelines simplify the mechanics of large-scale batch and streaming 
data processing and can run on a number of runtimes like Apache Flink, Apache 
Spark, and Google Cloud Dataflow (a cloud service). Beam also brings DSL in 
different languages, allowing users to easily implement their data integration 
processes.
+">
+
+  <link rel="stylesheet" href="/styles/site.css">
+  <link rel="stylesheet" href="/css/theme.css">
+  <script 
src="https://ajax.googleapis.com/ajax/libs/jquery/2.2.0/jquery.min.js"></script>
+  <script src="/js/bootstrap.min.js"></script>
+  <script src="/js/language-switch.js"></script>
+  <link rel="canonical" 
href="http://beam.incubator.apache.org/documentation/programming-guide/" 
data-proofer-ignore>
+  <link rel="alternate" type="application/rss+xml" title="Apache Beam 
(incubating)" href="http://beam.incubator.apache.org/feed.xml">
+  <script>
+    
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+    (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+    
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+    
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+
+    ga('create', 'UA-73650088-1', 'auto');
+    ga('send', 'pageview');
+
+  </script>
+  <link rel="shortcut icon" type="image/x-icon" href="/images/favicon.ico">
+</head>
+
+
+  <body role="document">
+
+    <nav class="navbar navbar-default navbar-fixed-top">
+  <div class="container">
+    <div class="navbar-header">
+      <a href="/" class="navbar-brand" >
+        <img alt="Brand" style="height: 25px" 
src="/images/beam_logo_navbar.png">
+      </a>
+      <button type="button" class="navbar-toggle collapsed" 
data-toggle="collapse" data-target="#navbar" aria-expanded="false" 
aria-controls="navbar">
+        <span class="sr-only">Toggle navigation</span>
+        <span class="icon-bar"></span>
+        <span class="icon-bar"></span>
+        <span class="icon-bar"></span>
+      </button>
+    </div>
+    <div id="navbar" class="navbar-collapse collapse">
+      <ul class="nav navbar-nav">
+        <li class="dropdown">
+                 <a href="#" class="dropdown-toggle" data-toggle="dropdown" 
role="button" aria-haspopup="true" aria-expanded="false">Get Started <span 
class="caret"></span></a>
+                 <ul class="dropdown-menu">
+                         <li><a href="/get-started/beam-overview/">Beam 
Overview</a></li>
+              <li><a href="/get-started/quickstart/">Quickstart</a></li>
+                         <li role="separator" class="divider"></li>
+                         <li class="dropdown-header">Example Walkthroughs</li>
+                         <li><a 
href="/get-started/wordcount-example/">WordCount</a></li>
+                         <li><a 
href="/get-started/mobile-gaming-example/">Mobile Gaming</a></li>
+              <li role="separator" class="divider"></li>
+              <li class="dropdown-header">Resources</li>
+              <li><a href="/get-started/downloads">Downloads</a></li>
+              <li><a href="/get-started/releases">Release Notes</a></li>
+              <li><a href="/get-started/support">Support</a></li>
+                 </ul>
+           </li>
+        <li class="dropdown">
+                 <a href="#" class="dropdown-toggle" data-toggle="dropdown" 
role="button" aria-haspopup="true" aria-expanded="false">Documentation <span 
class="caret"></span></a>
+                 <ul class="dropdown-menu">
+                         <li><a href="/documentation">Using the 
Documentation</a></li>
+                         <li role="separator" class="divider"></li>
+                         <li class="dropdown-header">Beam Concepts</li>
+                         <li><a 
href="/documentation/programming-guide/">Programming Guide</a></li>
+                         <li><a href="/documentation/resources/">Additional 
Resources</a></li>
+                         <li role="separator" class="divider"></li>
+              <li class="dropdown-header">Pipeline Fundamentals</li>
+              <li><a 
href="/documentation/pipelines/design-your-pipeline/">Design Your 
Pipeline</a></li>
+              <li><a 
href="/documentation/pipelines/create-your-pipeline/">Create Your 
Pipeline</a></li>
+              <li><a href="/documentation/pipelines/test-your-pipeline/">Test 
Your Pipeline</a></li>
+              <li role="separator" class="divider"></li>
+                         <li class="dropdown-header">SDKs</li>
+                         <li><a href="/documentation/sdks/java/">Java 
SDK</a></li>
+                         <li><a href="/documentation/sdks/javadoc/">Java SDK 
API Reference</a></li>
+                         <li role="separator" class="divider"></li>
+                         <li class="dropdown-header">Runners</li>
+                         <li><a 
href="/documentation/runners/capability-matrix/">Capability Matrix</a></li>
+                         <li><a href="/documentation/runners/direct/">Direct 
Runner</a></li>
+                         <li><a href="/documentation/runners/flink/">Apache 
Flink Runner</a></li>
+                         <li><a href="/documentation/runners/spark/">Apache 
Spark Runner</a></li>
+                         <li><a href="/documentation/runners/dataflow/">Cloud 
Dataflow Runner</a></li>
+                 </ul>
+           </li>
+        <li class="dropdown">
+                 <a href="#" class="dropdown-toggle" data-toggle="dropdown" 
role="button" aria-haspopup="true" aria-expanded="false">Contribute <span 
class="caret"></span></a>
+                 <ul class="dropdown-menu">
+                         <li><a href="/contribute">Get Started 
Contributing</a></li>
+        <li role="separator" class="divider"></li>
+        <li class="dropdown-header">Guides</li>
+                         <li><a 
href="/contribute/contribution-guide/">Contribution Guide</a></li>
+        <li><a href="/contribute/testing/">Testing Guide</a></li>
+        <li><a href="/contribute/release-guide/">Release Guide</a></li>
+        <li role="separator" class="divider"></li>
+        <li class="dropdown-header">Technical References</li>
+        <li><a href="/contribute/design-principles/">Design Principles</a></li>
+                         <li><a href="/contribute/work-in-progress/">Ongoing 
Projects</a></li>
+        <li><a href="/contribute/source-repository/">Source 
Repository</a></li>      
+        <li role="separator" class="divider"></li>
+                         <li class="dropdown-header">Promotion</li>
+        <li><a href="/contribute/presentation-materials/">Presentation 
Materials</a></li>
+        <li><a href="/contribute/logos/">Logos and Design</a></li>
+        <li role="separator" class="divider"></li>
+        <li><a href="/contribute/team/">Team</a></li>
+                 </ul>
+           </li>
+
+        <li><a href="/blog">Blog</a></li>
+      </ul>
+      <ul class="nav navbar-nav navbar-right">
+        <li class="dropdown">
+          <a href="#" class="dropdown-toggle" data-toggle="dropdown" 
role="button" aria-haspopup="true" aria-expanded="false"><img 
src="https://www.apache.org/foundation/press/kit/feather_small.png" alt="Apache 
Logo" style="height:24px;">Apache Software Foundation<span 
class="caret"></span></a>
+          <ul class="dropdown-menu dropdown-menu-right">
+            <li><a href="http://www.apache.org/">ASF Homepage</a></li>
+            <li><a href="http://www.apache.org/licenses/">License</a></li>
+            <li><a href="http://www.apache.org/security/">Security</a></li>
+            <li><a 
href="http://www.apache.org/foundation/thanks.html">Thanks</a></li>
+            <li><a 
href="http://www.apache.org/foundation/sponsorship.html">Sponsorship</a></li>
+            <li><a 
href="https://www.apache.org/foundation/policies/conduct";>Code of 
Conduct</a></li>
+          </ul>
+        </li>
+      </ul>
+    </div><!--/.nav-collapse -->
+  </div>
+</nav>
+
+
+<link rel="stylesheet" href="">
+
+
+    <div class="container" role="main">
+
+      <div class="row">
+        <h1 id="apache-beam-programming-guide">Apache Beam Programming 
Guide</h1>
+
+<p>The <strong>Beam Programming Guide</strong> is intended for Beam users who 
want to use the Beam SDKs to create data processing pipelines. It provides 
guidance for using the Beam SDK classes to build and test your pipeline. It is 
not intended as an exhaustive reference, but as a language-agnostic, high-level 
guide to programmatically building your Beam pipeline. As the programming guide 
is filled out, the text will include code samples in multiple languages to help 
illustrate how to implement Beam concepts in your programs.</p>
+
+<h2 id="contents">Contents</h2>
+
+<ul>
+  <li><a href="#overview">Overview</a></li>
+  <li><a href="#pipeline">Creating the Pipeline</a></li>
+  <li><a href="#pcollection">Working with PCollections</a>
+    <ul>
+      <li><a href="#pccreate">Creating a PCollection</a></li>
+      <li><a href="#pccharacteristics">PCollection Characteristics</a>
+        <ul>
+          <li><a href="#pcelementtype">Element Type</a></li>
+          <li><a href="#pcimmutability">Immutability</a></li>
+          <li><a href="#pcrandomaccess">Random Access</a></li>
+          <li><a href="#pcsizebound">Size and Boundedness</a></li>
+          <li><a href="#pctimestamps">Element Timestamps</a></li>
+        </ul>
+      </li>
+    </ul>
+  </li>
+  <li><a href="#transforms">Applying Transforms</a>
+    <ul>
+      <li><a href="#transforms-pardo">Using ParDo</a></li>
+      <li><a href="#transforms-gbk">Using GroupByKey</a></li>
+      <li><a href="#transforms-combine">Using Combine</a></li>
+      <li><a href="#transforms-usercodereqs">General Requirements for Writing 
User Code for Beam Transforms</a></li>
+      <li><a href="#transforms-sideio">Side Inputs and Side Outputs</a></li>
+    </ul>
+  </li>
+  <li><a href="#io">I/O</a></li>
+  <li><a href="#running">Running the Pipeline</a></li>
+  <li><a href="#coders">Data Encoding and Type Safety</a></li>
+  <li><a href="#windowing">Working with Windowing</a></li>
+  <li><a href="#triggers">Working with Triggers</a></li>
+</ul>
+
+<h2 id="a-nameoverviewaoverview"><a name="overview"></a>Overview</h2>
+
+<p>To use Beam, you need to first create a driver program using the classes in 
one of the Beam SDKs. Your driver program <em>defines</em> your pipeline, 
including all of the inputs, transforms, and outputs; it also sets execution 
options for your pipeline (typically passed in using command-line options). 
These include the Pipeline Runner, which, in turn, determines what back-end 
your pipeline will run on.</p>
+
+<p>The Beam SDKs provide a number of abstractions that simplify the mechanics 
of large-scale distributed data processing. The same Beam abstractions work 
with both batch and streaming data sources. When you create your Beam pipeline, 
you can think about your data processing task in terms of these abstractions. 
They include:</p>
+
+<ul>
+  <li>
+    <p><code class="highlighter-rouge">Pipeline</code>: A <code 
class="highlighter-rouge">Pipeline</code> encapsulates your entire data 
processing task, from start to finish. This includes reading input data, 
transforming that data, and writing output data. All Beam driver programs must 
create a <code class="highlighter-rouge">Pipeline</code>. When you create the 
<code class="highlighter-rouge">Pipeline</code>, you must also specify the 
execution options that tell the <code class="highlighter-rouge">Pipeline</code> 
where and how to run.</p>
+  </li>
+  <li>
+    <p><code class="highlighter-rouge">PCollection</code>: A <code 
class="highlighter-rouge">PCollection</code> represents a distributed data set 
that your Beam pipeline operates on. The data set can be <em>bounded</em>, 
meaning it comes from a fixed source like a file, or <em>unbounded</em>, 
meaning it comes from a continuously updating source via a subscription or 
other mechanism. Your pipeline typically creates an initial <code 
class="highlighter-rouge">PCollection</code> by reading data from an external 
data source, but you can also create a <code 
class="highlighter-rouge">PCollection</code> from in-memory data within your 
driver program. From there, <code class="highlighter-rouge">PCollection</code>s 
are the inputs and outputs for each step in your pipeline.</p>
+  </li>
+  <li>
+    <p><code class="highlighter-rouge">Transform</code>: A <code 
class="highlighter-rouge">Transform</code> represents a data processing 
operation, or a step, in your pipeline. Every <code 
class="highlighter-rouge">Transform</code> takes one or more <code 
class="highlighter-rouge">PCollection</code> objects as input, performs a 
processing function that you provide on the elements of that <code 
class="highlighter-rouge">PCollection</code>, and produces one or more output 
<code class="highlighter-rouge">PCollection</code> objects.</p>
+  </li>
+  <li>
+    <p>I/O <code class="highlighter-rouge">Source</code> and <code 
class="highlighter-rouge">Sink</code>: Beam provides <code 
class="highlighter-rouge">Source</code> and <code 
class="highlighter-rouge">Sink</code> APIs to represent reading and writing 
data, respectively. <code class="highlighter-rouge">Source</code> encapsulates 
the code necessary to read data into your Beam pipeline from some external 
source, such as cloud file storage or a subscription to a streaming data 
source. <code class="highlighter-rouge">Sink</code> likewise encapsulates the 
code necessary to write the elements of a <code 
class="highlighter-rouge">PCollection</code> to an external data sink.</p>
+  </li>
+</ul>
+
+<p>A typical Beam driver program works as follows:</p>
+
+<ul>
+  <li>Create a <code class="highlighter-rouge">Pipeline</code> object and set 
the pipeline execution options, including the Pipeline Runner.</li>
+  <li>Create an initial <code class="highlighter-rouge">PCollection</code> for 
pipeline data, either using the <code class="highlighter-rouge">Source</code> 
API to read data from an external source, or using a <code 
class="highlighter-rouge">Create</code> transform to build a <code 
class="highlighter-rouge">PCollection</code> from in-memory data.</li>
+  <li>Apply <strong>Transforms</strong> to each <code 
class="highlighter-rouge">PCollection</code>. Transforms can change, filter, 
group, analyze, or otherwise process the elements in a <code 
class="highlighter-rouge">PCollection</code>. A transform creates a new output 
<code class="highlighter-rouge">PCollection</code> <em>without consuming the 
input collection</em>. A typical pipeline applies subsequent transforms to 
each new output <code class="highlighter-rouge">PCollection</code> in turn 
until processing is complete.</li>
+  <li>Output the final, transformed <code 
class="highlighter-rouge">PCollection</code>(s), typically using the <code 
class="highlighter-rouge">Sink</code> API to write data to an external 
source.</li>
+  <li><strong>Run</strong> the pipeline using the designated Pipeline 
Runner.</li>
+</ul>
+
+<p>When you run your Beam driver program, the Pipeline Runner that you 
designate constructs a <strong>workflow graph</strong> of your pipeline based 
on the <code class="highlighter-rouge">PCollection</code> objects you’ve 
created and transforms that you’ve applied. That graph is then executed using 
the appropriate distributed processing back-end, becoming an asynchronous 
“job” (or equivalent) on that back-end.</p>
+
+<h2 id="a-namepipelineacreating-the-pipeline"><a name="pipeline"></a>Creating 
the Pipeline</h2>
+
+<p>The <code class="highlighter-rouge">Pipeline</code> abstraction 
encapsulates all the data and steps in your data processing task. Your Beam 
driver program typically starts by constructing a <a 
href="https://github.com/apache/incubator-beam/blob/master/sdks/java/core/src/main/java/org/apache/beam/sdk/Pipeline.java">Pipeline</a>
 object, and then using that object as the basis for creating the pipeline’s 
data sets as <code class="highlighter-rouge">PCollection</code>s and its 
operations as <code class="highlighter-rouge">Transform</code>s.</p>
+
+<p>To use Beam, your driver program must first create an instance of the Beam 
SDK class <code class="highlighter-rouge">Pipeline</code> (typically in the 
<code class="highlighter-rouge">main()</code> function). When you create your 
<code class="highlighter-rouge">Pipeline</code>, you’ll also need to set some 
<strong>configuration options</strong>. You can set your pipeline’s 
configuration options programmatically, but it’s often easier to set the 
options ahead of time (or read them from the command line) and pass them to the 
<code class="highlighter-rouge">Pipeline</code> object when you create the 
object.</p>
+
+<p>The pipeline configuration options determine, among other things, the <code 
class="highlighter-rouge">PipelineRunner</code> that determines where the 
pipeline gets executed: locally, or using a distributed back-end of your 
choice. Depending on where your pipeline gets executed and what your specified 
Runner requires, the options can also help you specify other aspects of 
execution.</p>
+
+<p>To set your pipeline’s configuration options and create the pipeline, 
create an object of type <a 
href="https://github.com/apache/incubator-beam/blob/master/sdks/java/core/src/main/java/org/apache/beam/sdk/options/PipelineOptions.java">PipelineOptions</a>
 and pass it to <code class="highlighter-rouge">Pipeline.create()</code>. The 
most common way to do this is by parsing arguments from the command-line:</p>
+
+<div class="language-java highlighter-rouge"><pre 
class="highlight"><code><span class="kd">public</span> <span 
class="kd">static</span> <span class="kt">void</span> <span 
class="nf">main</span><span class="o">(</span><span 
class="n">String</span><span class="o">[]</span> <span 
class="n">args</span><span class="o">)</span> <span class="o">{</span>
+   <span class="c1">// Will parse the arguments passed into the application 
and construct a PipelineOptions</span>
+   <span class="c1">// Note that --help will print registered options, and 
--help=PipelineOptionsClassName</span>
+   <span class="c1">// will print out usage for the specific class.</span>
+   <span class="n">PipelineOptions</span> <span class="n">options</span> <span 
class="o">=</span>
+       <span class="n">PipelineOptionsFactory</span><span 
class="o">.</span><span class="na">fromArgs</span><span class="o">(</span><span 
class="n">args</span><span class="o">).</span><span 
class="na">create</span><span class="o">();</span>
+
+   <span class="n">Pipeline</span> <span class="n">p</span> <span 
class="o">=</span> <span class="n">Pipeline</span><span class="o">.</span><span 
class="na">create</span><span class="o">(</span><span 
class="n">options</span><span class="o">);</span>
+</code></pre>
+</div>
+
+<p>The Beam SDKs contain various subclasses of <code 
class="highlighter-rouge">PipelineOptions</code> that correspond to different 
Runners. For example, <code 
class="highlighter-rouge">DirectPipelineOptions</code> contains options for the 
Direct (local) pipeline runner, while <code 
class="highlighter-rouge">DataflowPipelineOptions</code> contains options for 
using the runner for Google Cloud Dataflow. You can also define your own custom 
<code class="highlighter-rouge">PipelineOptions</code> by creating an interface 
that extends the Beam SDKs’ <code 
class="highlighter-rouge">PipelineOptions</code> class.</p>
+
+<h2 id="a-namepcollectionaworking-with-pcollections"><a 
name="pcollection"></a>Working with PCollections</h2>
+
+<p>The <a 
href="https://github.com/apache/incubator-beam/blob/master/sdks/java/core/src/main/java/org/apache/beam/sdk/values/PCollection.java">PCollection</a>
 abstraction represents a potentially distributed, multi-element data set. You 
can think of a <code class="highlighter-rouge">PCollection</code> as 
“pipeline” data; Beam transforms use <code 
class="highlighter-rouge">PCollection</code> objects as inputs and outputs. As 
such, if you want to work with data in your pipeline, it must be in the form of 
a <code class="highlighter-rouge">PCollection</code>.</p>
+
+<p>After you’ve created your <code 
class="highlighter-rouge">Pipeline</code>, you’ll need to begin by creating 
at least one <code class="highlighter-rouge">PCollection</code> in some form. 
The <code class="highlighter-rouge">PCollection</code> you create serves as the 
input for the first operation in your pipeline.</p>
+
+<h3 id="a-namepccreateacreating-a-pcollection"><a name="pccreate"></a>Creating 
a PCollection</h3>
+
+<p>You create a <code class="highlighter-rouge">PCollection</code> either by 
reading data from an external source using Beam’s <a href="#io">Source 
API</a>, or by creating a <code class="highlighter-rouge">PCollection</code> 
of data stored in an in-memory collection class in your driver program. The 
former is typically how a production pipeline would ingest data; Beam’s 
Source APIs contain adapters to help you read from external sources like large 
cloud-based files, databases, or subscription services. The latter is primarily 
useful for testing and debugging purposes.</p>
+
+<h4 id="reading-from-an-external-source">Reading from an External Source</h4>
+
+<p>To read from an external source, you use one of the <a 
href="#io">Beam-provided I/O adapters</a>. The adapters vary in their exact 
usage, but all of them read from some external data source and return a <code 
class="highlighter-rouge">PCollection</code> whose elements represent the data 
records in that source.</p>
+
+<p>Each data source adapter has a <code class="highlighter-rouge">Read</code> 
transform; to read, you must apply that transform to the <code 
class="highlighter-rouge">Pipeline</code> object itself. <code 
class="highlighter-rouge">TextIO.Read</code>, for example, reads from an 
external text file and returns a <code 
class="highlighter-rouge">PCollection</code> whose elements are of type <code 
class="highlighter-rouge">String</code>; each <code 
class="highlighter-rouge">String</code> represents one line from the text file. 
Here’s how you would apply <code class="highlighter-rouge">TextIO.Read</code> 
to your <code class="highlighter-rouge">Pipeline</code> to create a <code 
class="highlighter-rouge">PCollection</code>:</p>
+
+<div class="language-java highlighter-rouge"><pre 
class="highlight"><code><span class="kd">public</span> <span 
class="kd">static</span> <span class="kt">void</span> <span 
class="nf">main</span><span class="o">(</span><span 
class="n">String</span><span class="o">[]</span> <span 
class="n">args</span><span class="o">)</span> <span class="o">{</span>
+    <span class="c1">// Create the pipeline.</span>
+    <span class="n">PipelineOptions</span> <span class="n">options</span> 
<span class="o">=</span> 
+        <span class="n">PipelineOptionsFactory</span><span 
class="o">.</span><span class="na">fromArgs</span><span class="o">(</span><span 
class="n">args</span><span class="o">).</span><span 
class="na">create</span><span class="o">();</span>
+    <span class="n">Pipeline</span> <span class="n">p</span> <span 
class="o">=</span> <span class="n">Pipeline</span><span class="o">.</span><span 
class="na">create</span><span class="o">(</span><span 
class="n">options</span><span class="o">);</span>
+
+    <span class="n">PCollection</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">&gt;</span> <span class="n">lines</span> 
<span class="o">=</span> <span class="n">p</span><span class="o">.</span><span 
class="na">apply</span><span class="o">(</span>
+      <span class="n">TextIO</span><span class="o">.</span><span 
class="na">Read</span><span class="o">.</span><span 
class="na">named</span><span class="o">(</span><span 
class="s">"ReadMyFile"</span><span class="o">).</span><span 
class="na">from</span><span class="o">(</span><span 
class="s">"gs://some/inputData.txt"</span><span class="o">));</span>
+<span class="o">}</span>
+</code></pre>
+</div>
+
+<p>See the <a href="#io">section on I/O</a> to learn more about how to read 
from the various data sources supported by the Beam SDK.</p>
+
+<h4 id="creating-a-pcollection-from-in-memory-data">Creating a PCollection 
from In-Memory Data</h4>
+
+<p>To create a <code class="highlighter-rouge">PCollection</code> from an 
in-memory Java <code class="highlighter-rouge">Collection</code>, you use the 
Beam-provided <code class="highlighter-rouge">Create</code> transform. Much 
like a data adapter’s <code class="highlighter-rouge">Read</code>, you apply 
<code class="highlighter-rouge">Create</code> directly to your <code 
class="highlighter-rouge">Pipeline</code> object itself.</p>
+
+<p>As parameters, <code class="highlighter-rouge">Create</code> accepts the 
Java <code class="highlighter-rouge">Collection</code> and a <code 
class="highlighter-rouge">Coder</code> object. The <code 
class="highlighter-rouge">Coder</code> specifies how the elements in the <code 
class="highlighter-rouge">Collection</code> should be <a 
href="#pcelementtype">encoded</a>.</p>
+
+<p>The following example code shows how to create a <code 
class="highlighter-rouge">PCollection</code> from an in-memory Java <code 
class="highlighter-rouge">List</code>:</p>
+
+<div class="language-java highlighter-rouge"><pre 
class="highlight"><code><span class="kd">public</span> <span 
class="kd">static</span> <span class="kt">void</span> <span 
class="nf">main</span><span class="o">(</span><span 
class="n">String</span><span class="o">[]</span> <span 
class="n">args</span><span class="o">)</span> <span class="o">{</span>
+    <span class="c1">// Create a Java Collection, in this case a List of 
Strings.</span>
+    <span class="kd">static</span> <span class="kd">final</span> <span 
class="n">List</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">&gt;</span> <span class="n">LINES</span> 
<span class="o">=</span> <span class="n">Arrays</span><span 
class="o">.</span><span class="na">asList</span><span class="o">(</span>
+      <span class="s">"To be, or not to be: that is the question: 
"</span><span class="o">,</span>
+      <span class="s">"Whether 'tis nobler in the mind to suffer "</span><span 
class="o">,</span>
+      <span class="s">"The slings and arrows of outrageous fortune, 
"</span><span class="o">,</span>
+      <span class="s">"Or to take arms against a sea of troubles, 
"</span><span class="o">);</span>
+
+    <span class="c1">// Create the pipeline.</span>
+    <span class="n">PipelineOptions</span> <span class="n">options</span> 
<span class="o">=</span> 
+        <span class="n">PipelineOptionsFactory</span><span 
class="o">.</span><span class="na">fromArgs</span><span class="o">(</span><span 
class="n">args</span><span class="o">).</span><span 
class="na">create</span><span class="o">();</span>
+    <span class="n">Pipeline</span> <span class="n">p</span> <span 
class="o">=</span> <span class="n">Pipeline</span><span class="o">.</span><span 
class="na">create</span><span class="o">(</span><span 
class="n">options</span><span class="o">);</span>
+
+    <span class="c1">// Apply Create, passing the list and the coder, to 
create the PCollection.</span>
+    <span class="n">p</span><span class="o">.</span><span 
class="na">apply</span><span class="o">(</span><span 
class="n">Create</span><span class="o">.</span><span class="na">of</span><span 
class="o">(</span><span class="n">LINES</span><span class="o">)).</span><span 
class="na">setCoder</span><span class="o">(</span><span 
class="n">StringUtf8Coder</span><span class="o">.</span><span 
class="na">of</span><span class="o">())</span>
+<span class="o">}</span>
+</code></pre>
+</div>
+<h3 id="a-namepccharacteristicspcollection-characteristics"><a 
name="pccharacteristics">PCollection Characteristics</a></h3>
+
+<p>A <code class="highlighter-rouge">PCollection</code> is owned by the 
specific <code class="highlighter-rouge">Pipeline</code> object for which it is 
created; multiple pipelines cannot share a <code 
class="highlighter-rouge">PCollection</code>. In some respects, a <code 
class="highlighter-rouge">PCollection</code> functions like a collection class. 
However, a <code class="highlighter-rouge">PCollection</code> can differ in a 
few key ways:</p>
+
+<h4 id="a-namepcelementtypeaelement-type"><a name="pcelementtype"></a>Element 
Type</h4>
+
+<p>The elements of a <code class="highlighter-rouge">PCollection</code> may be 
of any type, but must all be of the same type. However, to support distributed 
processing, Beam needs to be able to encode each individual element as a byte 
string (so elements can be passed around to distributed workers). The Beam SDKs 
provide a data encoding mechanism that includes built-in encoding for 
commonly-used types as well as support for specifying custom encodings as 
needed.</p>
+
+<h4 id="a-namepcimmutabilityaimmutability"><a 
name="pcimmutability"></a>Immutability</h4>
+
+<p>A <code class="highlighter-rouge">PCollection</code> is immutable. Once 
created, you cannot add, remove, or change individual elements. A Beam 
Transform might process each element of a <code 
class="highlighter-rouge">PCollection</code> and generate new pipeline data (as 
a new <code class="highlighter-rouge">PCollection</code>), <em>but it does not 
consume or modify the original input collection</em>.</p>
+
+<h4 id="a-namepcrandomaccessarandom-access"><a 
name="pcrandomaccess"></a>Random Access</h4>
+
+<p>A <code class="highlighter-rouge">PCollection</code> does not support 
random access to individual elements. Instead, Beam Transforms consider every 
element in a <code class="highlighter-rouge">PCollection</code> 
individually.</p>
+
+<h4 id="a-namepcsizeboundasize-and-boundedness"><a name="pcsizebound"></a>Size 
and Boundedness</h4>
+
+<p>A <code class="highlighter-rouge">PCollection</code> is a large, immutable 
“bag” of elements. There is no upper limit on how many elements a <code 
class="highlighter-rouge">PCollection</code> can contain; any given <code 
class="highlighter-rouge">PCollection</code> might fit in memory on a single 
machine, or it might represent a very large distributed data set backed by a 
persistent data store.</p>
+
+<p>A <code class="highlighter-rouge">PCollection</code> can be either 
<strong>bounded</strong> or <strong>unbounded</strong> in size. A 
<strong>bounded</strong> <code class="highlighter-rouge">PCollection</code> 
represents a data set of a known, fixed size, while an 
<strong>unbounded</strong> <code class="highlighter-rouge">PCollection</code> 
represents a data set of unlimited size. Whether a <code 
class="highlighter-rouge">PCollection</code> is bounded or unbounded depends on 
the source of the data set that it represents. Reading from a batch data 
source, such as a file or a database, creates a bounded <code 
class="highlighter-rouge">PCollection</code>. Reading from a streaming or 
continuously-updating data source, such as Pub/Sub or Kafka, creates an 
unbounded <code class="highlighter-rouge">PCollection</code> (unless you 
explicitly tell it not to).</p>
+
+<p>The bounded (or unbounded) nature of your <code 
class="highlighter-rouge">PCollection</code> affects how Beam processes your 
data. A bounded <code class="highlighter-rouge">PCollection</code> can be 
processed using a batch job, which might read the entire data set once, and 
perform processing in a job of finite length. An unbounded <code 
class="highlighter-rouge">PCollection</code> must be processed using a 
streaming job that runs continuously, as the entire collection can never be 
available for processing at any one time.</p>
+
+<p>When performing an operation that groups elements in an unbounded <code 
class="highlighter-rouge">PCollection</code>, Beam requires a concept called 
<strong>Windowing</strong> to divide a continuously updating data set into 
logical windows of finite size.  Beam processes each window as a bundle, and 
processing continues as the data set is generated. These logical windows are 
determined by some characteristic associated with a data element, such as a 
<strong>timestamp</strong>.</p>
+
+<h4 id="a-namepctimestampsaelement-timestamps"><a 
name="pctimestamps"></a>Element Timestamps</h4>
+
+<p>Each element in a <code class="highlighter-rouge">PCollection</code> has an 
associated intrinsic <strong>timestamp</strong>. The timestamp for each element 
is initially assigned by the <a href="#io">Source</a> that creates the <code 
class="highlighter-rouge">PCollection</code>. Sources that create an unbounded 
<code class="highlighter-rouge">PCollection</code> often assign each new 
element a timestamp that corresponds to when the element was read or added.</p>
+
+<blockquote>
+  <p><strong>Note</strong>: Sources that create a bounded <code 
class="highlighter-rouge">PCollection</code> for a fixed data set also 
automatically assign timestamps, but the most common behavior is to assign 
every element the same timestamp (<code 
class="highlighter-rouge">Long.MIN_VALUE</code>).</p>
+</blockquote>
+
+<p>Timestamps are useful for a <code 
class="highlighter-rouge">PCollection</code> that contains elements with an 
inherent notion of time. If your pipeline is reading a stream of events, like 
Tweets or other social media messages, each element might use the time the 
event was posted as the element timestamp.</p>
+
+<p>You can manually assign timestamps to the elements of a <code 
class="highlighter-rouge">PCollection</code> if the source doesn’t do it for 
you. You’ll want to do this if the elements have an inherent timestamp, but 
the timestamp is somewhere in the structure of the element itself (such as a 
“time” field in a server log entry). Beam has <a 
href="#transforms">Transforms</a> that take a <code 
class="highlighter-rouge">PCollection</code> as input and output an identical 
<code class="highlighter-rouge">PCollection</code> with timestamps attached; 
see <a href="#windowing">Assigning Timestamps</a> for more information on how 
to do so.</p>
+
+<h2 id="a-nametransformsaapplying-transforms"><a 
name="transforms"></a>Applying Transforms</h2>
+
+<p>In the Beam SDKs, <strong>transforms</strong> are the operations in your 
pipeline. A transform takes a <code 
class="highlighter-rouge">PCollection</code> (or more than one <code 
class="highlighter-rouge">PCollection</code>) as input, performs an operation 
that you specify on each element in that collection, and produces a new output 
<code class="highlighter-rouge">PCollection</code>. To invoke a transform, you 
must <strong>apply</strong> it to the input <code 
class="highlighter-rouge">PCollection</code>.</p>
+
+<p>In the Beam SDK for Java, each transform has a generic <code 
class="highlighter-rouge">apply</code> method. In the Beam SDK for Python, you 
use the pipe operator (<code class="highlighter-rouge">|</code>) to apply a 
transform. Invoking multiple Beam transforms is similar to <em>method 
chaining</em>, but with one slight difference: You apply the transform to the 
input <code class="highlighter-rouge">PCollection</code>, passing the transform 
itself as an argument, and the operation returns the output <code 
class="highlighter-rouge">PCollection</code>. This takes the general form:</p>
+
+<div class="language-java highlighter-rouge"><pre 
class="highlight"><code><span class="o">[</span><span class="n">Output</span> 
<span class="n">PCollection</span><span class="o">]</span> <span 
class="o">=</span> <span class="o">[</span><span class="n">Input</span> <span 
class="n">PCollection</span><span class="o">].</span><span 
class="na">apply</span><span class="o">([</span><span 
class="n">Transform</span><span class="o">])</span>
+</code></pre>
+</div>
+
+<p>Because Beam uses a generic <code class="highlighter-rouge">apply</code> 
method for <code class="highlighter-rouge">PCollection</code>, you can both 
chain transforms sequentially and also apply transforms that contain other 
transforms nested within (called <strong>composite transforms</strong> in the 
Beam SDKs).</p>
+
+<p>How you apply your pipeline’s transforms determines the structure of your 
pipeline. The best way to think of your pipeline is as a directed acyclic 
graph, where the nodes are <code class="highlighter-rouge">PCollection</code>s 
and the edges are transforms. For example, you can chain transforms to create a 
sequential pipeline, like this one:</p>
+
+<div class="language-java highlighter-rouge"><pre 
class="highlight"><code><span class="o">[</span><span class="n">Final</span> 
<span class="n">Output</span> <span class="n">PCollection</span><span 
class="o">]</span> <span class="o">=</span> <span class="o">[</span><span 
class="n">Initial</span> <span class="n">Input</span> <span 
class="n">PCollection</span><span class="o">].</span><span 
class="na">apply</span><span class="o">([</span><span class="n">First</span> 
<span class="n">Transform</span><span class="o">])</span>
+                                                       <span 
class="o">.</span><span class="na">apply</span><span class="o">([</span><span 
class="n">Second</span> <span class="n">Transform</span><span 
class="o">])</span>
+                                                       <span 
class="o">.</span><span class="na">apply</span><span class="o">([</span><span 
class="n">Third</span> <span class="n">Transform</span><span class="o">])</span>
+</code></pre>
+</div>
+
+<p>The resulting workflow graph of the above pipeline looks like this:</p>
+
+<p>[Sequential Graph Graphic]</p>
+
+<p>However, note that a transform <em>does not consume or otherwise alter</em> 
the input collection—remember that a <code 
class="highlighter-rouge">PCollection</code> is immutable by definition. This 
means that you can apply multiple transforms to the same input <code 
class="highlighter-rouge">PCollection</code> to create a branching pipeline, 
like so:</p>
+
+<div class="language-java highlighter-rouge"><pre 
class="highlight"><code><span class="o">[</span><span class="n">Output</span> 
<span class="n">PCollection</span> <span class="mi">1</span><span 
class="o">]</span> <span class="o">=</span> <span class="o">[</span><span 
class="n">Input</span> <span class="n">PCollection</span><span 
class="o">].</span><span class="na">apply</span><span class="o">([</span><span 
class="n">Transform</span> <span class="mi">1</span><span class="o">])</span>
+<span class="o">[</span><span class="n">Output</span> <span 
class="n">PCollection</span> <span class="mi">2</span><span class="o">]</span> 
<span class="o">=</span> <span class="o">[</span><span class="n">Input</span> 
<span class="n">PCollection</span><span class="o">].</span><span 
class="na">apply</span><span class="o">([</span><span 
class="n">Transform</span> <span class="mi">2</span><span class="o">])</span>
+</code></pre>
+</div>
+
+<p>The resulting workflow graph from the branching pipeline above looks like 
this:</p>
+
+<p>[Branching Graph Graphic]</p>
+
+<p>You can also build your own <a href="#transforms-composite">composite 
transforms</a> that nest multiple sub-steps inside a single, larger transform. 
Composite transforms are particularly useful for building a reusable sequence 
of simple steps that get used in a lot of different places.</p>
+
+<h3 id="transforms-in-the-beam-sdk">Transforms in the Beam SDK</h3>
+
+<p>The transforms in the Beam SDKs provide a generic <strong>processing 
framework</strong>, where you provide processing logic in the form of a 
function object (colloquially referred to as “user code”). The user code 
gets applied to the elements of the input <code 
class="highlighter-rouge">PCollection</code>. Instances of your user code might 
then be executed in parallel by many different workers across a cluster, 
depending on the pipeline runner and back-end that you choose to execute your 
Beam pipeline. The user code running on each worker generates the output 
elements that are ultimately added to the final output <code 
class="highlighter-rouge">PCollection</code> that the transform produces.</p>
+
+<h3 id="core-beam-transforms">Core Beam Transforms</h3>
+
+<p>Beam provides the following transforms, each of which represents a 
different processing paradigm:</p>
+
+<ul>
+  <li><code class="highlighter-rouge">ParDo</code></li>
+  <li><code class="highlighter-rouge">GroupByKey</code></li>
+  <li><code class="highlighter-rouge">Combine</code></li>
+  <li><code class="highlighter-rouge">Flatten</code></li>
+</ul>
+
+<h4 id="a-nametransforms-pardoapardo"><a name="transforms-pardo"></a>ParDo</h4>
+
+<p><code class="highlighter-rouge">ParDo</code> is a Beam transform for 
generic parallel processing. The <code class="highlighter-rouge">ParDo</code> 
processing paradigm is similar to the “Map” phase of a 
Map/Shuffle/Reduce-style algorithm: a <code 
class="highlighter-rouge">ParDo</code> transform considers each element in the 
input <code class="highlighter-rouge">PCollection</code>, performs some 
processing function (your user code) on that element, and emits zero, one, or 
multiple elements to an output <code 
class="highlighter-rouge">PCollection</code>.</p>
+
+<p><code class="highlighter-rouge">ParDo</code> is useful for a variety of 
common data processing operations, including:</p>
+
+<ul>
+  <li><strong>Filtering a data set.</strong> You can use <code 
class="highlighter-rouge">ParDo</code> to consider each element in a <code 
class="highlighter-rouge">PCollection</code> and either output that element to 
a new collection, or discard it.</li>
+  <li><strong>Formatting or type-converting each element in a data 
set.</strong> If your input <code class="highlighter-rouge">PCollection</code> 
contains elements that are of a different type or format than you want, you can 
use <code class="highlighter-rouge">ParDo</code> to perform a conversion on 
each element and output the result to a new <code 
class="highlighter-rouge">PCollection</code>.</li>
+  <li><strong>Extracting parts of each element in a data set.</strong> If you 
have a <code class="highlighter-rouge">PCollection</code> of records with 
multiple fields, for example, you can use a <code 
class="highlighter-rouge">ParDo</code> to parse out just the fields you want to 
consider into a new <code class="highlighter-rouge">PCollection</code>.</li>
+  <li><strong>Performing computations on each element in a data set.</strong> 
You can use <code class="highlighter-rouge">ParDo</code> to perform simple or 
complex computations on every element, or certain elements, of a <code 
class="highlighter-rouge">PCollection</code> and output the results as a new 
<code class="highlighter-rouge">PCollection</code>.</li>
+</ul>
+
+<p>In such roles, <code class="highlighter-rouge">ParDo</code> is a common 
intermediate step in a pipeline. You might use it to extract certain fields 
from a set of raw input records, or convert raw input into a different format; 
you might also use <code class="highlighter-rouge">ParDo</code> to convert 
processed data into a format suitable for output, like database table rows or 
printable strings.</p>
+
+<p>When you apply a <code class="highlighter-rouge">ParDo</code> transform, 
you’ll need to provide user code in the form of a <code 
class="highlighter-rouge">DoFn</code> object. <code 
class="highlighter-rouge">DoFn</code> is a Beam SDK class that defines a 
distributed processing function.</p>
+
+<blockquote>
+  <p>When you create a subclass of <code 
class="highlighter-rouge">DoFn</code>, note that your subclass should adhere to 
the <a href="#transforms-usercodereqs">General Requirements for Writing User 
Code for Beam Transforms</a>.</p>
+</blockquote>
+
+<h5 id="applying-pardo">Applying ParDo</h5>
+
+<p>Like all Beam transforms, you apply <code 
class="highlighter-rouge">ParDo</code> by calling the <code 
class="highlighter-rouge">apply</code> method on the input <code 
class="highlighter-rouge">PCollection</code> and passing <code 
class="highlighter-rouge">ParDo</code> as an argument, as shown in the 
following example code:</p>
+
+<div class="language-java highlighter-rouge"><pre 
class="highlight"><code><span class="c1">// The input PCollection of 
Strings.</span>
+<span class="n">PCollection</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">&gt;</span> <span class="n">words</span> 
<span class="o">=</span> <span class="o">...;</span>
+
+<span class="c1">// The DoFn to perform on each element in the input 
PCollection.</span>
+<span class="kd">static</span> <span class="kd">class</span> <span 
class="nc">ComputeWordLengthFn</span> <span class="kd">extends</span> <span 
class="n">DoFn</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">,</span> <span 
class="n">Integer</span><span class="o">&gt;</span> <span class="o">{</span> 
<span class="o">...</span> <span class="o">}</span>
+
+<span class="c1">// Apply a ParDo to the PCollection "words" to compute 
lengths for each word.</span>
+<span class="n">PCollection</span><span class="o">&lt;</span><span 
class="n">Integer</span><span class="o">&gt;</span> <span 
class="n">wordLengths</span> <span class="o">=</span> <span 
class="n">words</span><span class="o">.</span><span 
class="na">apply</span><span class="o">(</span>
+    <span class="n">ParDo</span>
+    <span class="o">.</span><span class="na">of</span><span 
class="o">(</span><span class="k">new</span> <span 
class="n">ComputeWordLengthFn</span><span class="o">()));</span>        <span 
class="c1">// The DoFn to perform on each element, which</span>
+                                            <span class="c1">// we define 
above.</span>
+</code></pre>
+</div>
+
+<p>In the example, our input <code 
class="highlighter-rouge">PCollection</code> contains <code 
class="highlighter-rouge">String</code> values. We apply a <code 
class="highlighter-rouge">ParDo</code> transform that specifies a function 
(<code class="highlighter-rouge">ComputeWordLengthFn</code>) to compute the 
length of each string, and outputs the result to a new <code 
class="highlighter-rouge">PCollection</code> of <code 
class="highlighter-rouge">Integer</code> values that stores the length of each 
word.</p>
+
+<h5 id="creating-a-dofn">Creating a DoFn</h5>
+
+<p>The <code class="highlighter-rouge">DoFn</code> object that you pass to 
<code class="highlighter-rouge">ParDo</code> contains the processing logic that 
gets applied to the elements in the input collection. When you use Beam, often 
the most important pieces of code you’ll write are these <code 
class="highlighter-rouge">DoFn</code>s—they’re what define your 
pipeline’s exact data processing tasks.</p>
+
+<blockquote>
+  <p><strong>Note:</strong> When you create your <code 
class="highlighter-rouge">DoFn</code>, be mindful of the <a 
href="#transforms-usercodereqs">General Requirements for Writing User Code for 
Beam Transforms</a> and ensure that your code follows them.</p>
+</blockquote>
+
+<p>A <code class="highlighter-rouge">DoFn</code> processes one element at a 
time from the input <code class="highlighter-rouge">PCollection</code>. When 
you create a subclass of <code class="highlighter-rouge">DoFn</code>, you’ll 
need to provide type parameters that match the types of the input and output 
elements. If your <code class="highlighter-rouge">DoFn</code> processes 
incoming <code class="highlighter-rouge">String</code> elements and produces 
<code class="highlighter-rouge">Integer</code> elements for the output 
collection (like our previous example, <code 
class="highlighter-rouge">ComputeWordLengthFn</code>), your class declaration 
would look like this:</p>
+
+<div class="language-java highlighter-rouge"><pre 
class="highlight"><code><span class="kd">static</span> <span 
class="kd">class</span> <span class="nc">ComputeWordLengthFn</span> <span 
class="kd">extends</span> <span class="n">DoFn</span><span 
class="o">&lt;</span><span class="n">String</span><span class="o">,</span> 
<span class="n">Integer</span><span class="o">&gt;</span> <span 
class="o">{</span> <span class="o">...</span> <span class="o">}</span>
+</code></pre>
+</div>
+
+<p>Inside your <code class="highlighter-rouge">DoFn</code> subclass, you’ll 
write a method annotated with <code 
class="highlighter-rouge">@ProcessElement</code> where you provide the actual 
processing logic. You don’t need to manually extract the elements from the 
input collection; the Beam SDKs handle that for you. Your <code 
class="highlighter-rouge">@ProcessElement</code> method should accept an object 
of type <code class="highlighter-rouge">ProcessContext</code>. The <code 
class="highlighter-rouge">ProcessContext</code> object gives you access to an 
input element and a method for emitting an output element:</p>
+
+<div class="language-java highlighter-rouge"><pre 
class="highlight"><code><span class="kd">static</span> <span 
class="kd">class</span> <span class="nc">ComputeWordLengthFn</span> <span 
class="kd">extends</span> <span class="n">DoFn</span><span 
class="o">&lt;</span><span class="n">String</span><span class="o">,</span> 
<span class="n">Integer</span><span class="o">&gt;</span> <span 
class="o">{</span>
+  <span class="nd">@ProcessElement</span>
+  <span class="kd">public</span> <span class="kt">void</span> <span 
class="nf">processElement</span><span class="o">(</span><span 
class="n">ProcessContext</span> <span class="n">c</span><span 
class="o">)</span> <span class="o">{</span>
+    <span class="c1">// Get the input element from ProcessContext.</span>
+    <span class="n">String</span> <span class="n">word</span> <span 
class="o">=</span> <span class="n">c</span><span class="o">.</span><span 
class="na">element</span><span class="o">();</span>
+    <span class="c1">// Use ProcessContext.output to emit the output 
element.</span>
+    <span class="n">c</span><span class="o">.</span><span 
class="na">output</span><span class="o">(</span><span 
class="n">word</span><span class="o">.</span><span 
class="na">length</span><span class="o">());</span>
+  <span class="o">}</span>
+<span class="o">}</span>
+</code></pre>
+</div>
+
+<blockquote>
+  <p><strong>Note:</strong> If the elements in your input <code 
class="highlighter-rouge">PCollection</code> are key/value pairs, you can 
access the key or value by using <code 
class="highlighter-rouge">ProcessContext.element().getKey()</code> or <code 
class="highlighter-rouge">ProcessContext.element().getValue()</code>, 
respectively.</p>
+</blockquote>
+
+<p>A given <code class="highlighter-rouge">DoFn</code> instance generally gets 
invoked one or more times to process some arbitrary bundle of elements. 
However, Beam doesn’t guarantee an exact number of invocations; it may be 
invoked multiple times on a given worker node to account for failures and 
retries. As such, you can cache information across multiple calls to your <code 
class="highlighter-rouge">@ProcessElement</code> method, but if you do so, make 
sure the implementation <strong>does not depend on the number of 
invocations</strong>.</p>
+
+<p>In your <code class="highlighter-rouge">@ProcessElement</code> method, 
you’ll also need to meet some immutability requirements to ensure that Beam 
and the processing back-end can safely serialize and cache the values in your 
pipeline. Your method should meet the following requirements:</p>
+
+<ul>
+  <li>You should not in any way modify an element returned by <code 
class="highlighter-rouge">ProcessContext.element()</code> or <code 
class="highlighter-rouge">ProcessContext.sideInput()</code> (the incoming 
elements from the input collection).</li>
+  <li>Once you output a value using <code 
class="highlighter-rouge">ProcessContext.output()</code> or <code 
class="highlighter-rouge">ProcessContext.sideOutput()</code>, you should not 
modify that value in any way.</li>
+</ul>
+
+<h5 id="lightweight-dofns-and-other-abstractions">Lightweight DoFns and Other 
Abstractions</h5>
+
+<p>If your function is relatively straightforward, you can simplify your use of 
<code class="highlighter-rouge">ParDo</code> by providing a lightweight <code 
class="highlighter-rouge">DoFn</code> in-line. In Java, you can specify your 
<code class="highlighter-rouge">DoFn</code> as an anonymous inner class 
instance, and in Python you can use a <code 
class="highlighter-rouge">Callable</code>.</p>
+
+<p>Here’s the previous example, <code class="highlighter-rouge">ParDo</code> 
with <code class="highlighter-rouge">ComputeWordLengthFn</code>, with the 
<code class="highlighter-rouge">DoFn</code> specified as an anonymous inner 
class instance:</p>
+
+<div class="language-java highlighter-rouge"><pre 
class="highlight"><code><span class="c1">// The input PCollection.</span>
+<span class="n">PCollection</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">&gt;</span> <span class="n">words</span> 
<span class="o">=</span> <span class="o">...;</span>
+
+<span class="c1">// Apply a ParDo with an anonymous DoFn to the PCollection 
words.</span>
+<span class="c1">// Save the result as the PCollection wordLengths.</span>
+<span class="n">PCollection</span><span class="o">&lt;</span><span 
class="n">Integer</span><span class="o">&gt;</span> <span 
class="n">wordLengths</span> <span class="o">=</span> <span 
class="n">words</span><span class="o">.</span><span 
class="na">apply</span><span class="o">(</span>
+  <span class="n">ParDo</span>
+    <span class="o">.</span><span class="na">named</span><span 
class="o">(</span><span class="s">"ComputeWordLengths"</span><span 
class="o">)</span>            <span class="c1">// the transform name</span>
+    <span class="o">.</span><span class="na">of</span><span 
class="o">(</span><span class="k">new</span> <span class="n">DoFn</span><span 
class="o">&lt;</span><span class="n">String</span><span class="o">,</span> 
<span class="n">Integer</span><span class="o">&gt;()</span> <span 
class="o">{</span>       <span class="c1">// a DoFn as an anonymous inner class 
instance</span>
+      <span class="nd">@ProcessElement</span>
+      <span class="kd">public</span> <span class="kt">void</span> <span 
class="nf">processElement</span><span class="o">(</span><span 
class="n">ProcessContext</span> <span class="n">c</span><span 
class="o">)</span> <span class="o">{</span>
+        <span class="n">c</span><span class="o">.</span><span 
class="na">output</span><span class="o">(</span><span class="n">c</span><span 
class="o">.</span><span class="na">element</span><span 
class="o">().</span><span class="na">length</span><span class="o">());</span>
+      <span class="o">}</span>
+    <span class="o">}));</span>
+</code></pre>
+</div>
+
+<p>If your <code class="highlighter-rouge">ParDo</code> performs a one-to-one 
mapping of input elements to output elements—that is, for each input element, 
it applies a function that produces <em>exactly one</em> output element—you 
can use the higher-level <code class="highlighter-rouge">MapElements</code> 
transform. <code class="highlighter-rouge">MapElements</code> can accept an 
anonymous Java 8 lambda function for additional brevity.</p>
+
+<p>Here’s the previous example using <code 
class="highlighter-rouge">MapElements</code>:</p>
+
+<div class="language-java highlighter-rouge"><pre 
class="highlight"><code><span class="c1">// The input PCollection.</span>
+<span class="n">PCollection</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">&gt;</span> <span 
class="n">words</span> <span class="o">=</span> <span class="o">...;</span>
+
+<span class="c1">// Apply a MapElements with an anonymous lambda function to 
the PCollection words.</span>
+<span class="c1">// Save the result as the PCollection wordLengths.</span>
+<span class="n">PCollection</span><span class="o">&lt;</span><span 
class="n">Integer</span><span class="o">&gt;</span> <span 
class="n">wordLengths</span> <span class="o">=</span> <span 
class="n">words</span><span class="o">.</span><span 
class="na">apply</span><span class="o">(</span>
+  <span class="n">MapElements</span><span class="o">.</span><span 
class="na">via</span><span class="o">((</span><span class="n">String</span> 
<span class="n">word</span><span class="o">)</span> <span 
class="o">-&gt;</span> <span 
class="n">word</span><span class="o">.</span><span 
class="na">length</span><span class="o">())</span>
+      <span class="o">.</span><span class="na">withOutputType</span><span 
class="o">(</span><span class="k">new</span> <span 
class="n">TypeDescriptor</span><span class="o">&lt;</span><span 
class="n">Integer</span><span class="o">&gt;()</span> <span 
class="o">{});</span>
+</code></pre>
+</div>
+
+<blockquote>
+  <p><strong>Note:</strong> You can use Java 8 lambda functions with several 
other Beam transforms, including <code class="highlighter-rouge">Filter</code>, 
<code class="highlighter-rouge">FlatMapElements</code>, and <code 
class="highlighter-rouge">Partition</code>.</p>
+</blockquote>
+
+<h4 id="a-nametransforms-gbkausing-groupbykey"><a 
name="transforms-gbk"></a>Using GroupByKey</h4>
+
+<p><code class="highlighter-rouge">GroupByKey</code> is a Beam transform for 
processing collections of key/value pairs. It’s a parallel reduction 
operation, analogous to the Shuffle phase of a Map/Shuffle/Reduce-style 
algorithm. The input to <code class="highlighter-rouge">GroupByKey</code> is a 
collection of key/value pairs that represents a <em>multimap</em>, where the 
collection contains multiple pairs that have the same key, but different 
values. Given such a collection, you use <code 
class="highlighter-rouge">GroupByKey</code> to collect all of the values 
associated with each unique key.</p>
+
+<p><code class="highlighter-rouge">GroupByKey</code> is a good way to 
aggregate data that has something in common. For example, if you have a 
collection that stores records of customer orders, you might want to group 
together all the orders from the same postal code (wherein the “key” of the 
key/value pair is the postal code field, and the “value” is the remainder 
of the record).</p>
+
+<p>Let’s examine the mechanics of <code 
class="highlighter-rouge">GroupByKey</code> with a simple example case, where 
our data set consists of words from a text file and the line number on which 
they appear. We want to group together all the line numbers (values) that share 
the same word (key), letting us see all the places in the text where a 
particular word appears.</p>
+
+<p>Our input is a <code class="highlighter-rouge">PCollection</code> of 
key/value pairs where each word is a key, and the value is a line number in the 
file where the word appears. Here’s a list of the key/value pairs in the 
input collection:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>cat, 1
+dog, 5
+and, 1
+jump, 3
+tree, 2
+cat, 5
+dog, 2
+and, 2
+cat, 9
+and, 6
+...
+</code></pre>
+</div>
+
+<p><code class="highlighter-rouge">GroupByKey</code> gathers up all the values 
with the same key and outputs a new pair consisting of the unique key and a 
collection of all of the values that were associated with that key in the input 
collection. If we apply <code class="highlighter-rouge">GroupByKey</code> to 
our input collection above, the output collection would look like this:</p>
+
+<div class="highlighter-rouge"><pre class="highlight"><code>cat, [1,5,9]
+dog, [5,2]
+and, [1,2,6]
+jump, [3]
+tree, [2]
+...
+</code></pre>
+</div>
+
+<p>Thus, <code class="highlighter-rouge">GroupByKey</code> represents a 
transform from a multimap (multiple keys to individual values) to a uni-map 
(unique keys to collections of values).</p>
+
+<blockquote>
+  <p><strong>A Note on Key/Value Pairs:</strong> Beam represents key/value 
pairs slightly differently depending on the language and SDK you’re using. In 
the Beam SDK for Java, you represent a key/value pair with an object of type 
<code class="highlighter-rouge">KV&lt;K, V&gt;</code>. In Python, you represent 
key/value pairs with 2-tuples.</p>
+</blockquote>
+
+<h4 id="a-nametransforms-combineausing-combine"><a 
name="transforms-combine"></a>Using Combine</h4>
+
+<h4 
id="a-nametransforms-usercodereqsageneral-requirements-for-writing-user-code-for-beam-transforms"><a
 name="transforms-usercodereqs"></a>General Requirements for Writing User Code 
for Beam Transforms</h4>
+
+<p>When you build user code for a Beam transform, you should keep in mind the 
distributed nature of execution. For example, there might be many copies of 
your function running on a lot of different machines in parallel, and those 
copies function independently, without communicating or sharing state with any 
of the other copies. Depending on the Pipeline Runner and processing back-end 
you choose for your pipeline, each copy of your user code function may be 
retried or run multiple times. As such, you should be cautious about including 
things like state dependency in your user code.</p>
+
+<p>In general, your user code must fulfill at least these requirements:</p>
+
+<ul>
+  <li>Your function object must be <strong>serializable</strong>.</li>
+  <li>Your function object must be <strong>thread-compatible</strong>, and be 
aware that <em>the Beam SDKs are not thread-safe</em>.</li>
+</ul>
+
+<p>In addition, it’s recommended that you make your function object 
<strong>idempotent</strong>.</p>
+
+<blockquote>
+  <p><strong>Note:</strong> These requirements apply to subclasses of <code 
class="highlighter-rouge">DoFn</code> (a function object used with the <a 
href="#transforms-pardo">ParDo</a> transform), <code 
class="highlighter-rouge">CombineFn</code> (a function object used with the <a 
href="#transforms-combine">Combine</a> transform), and <code 
class="highlighter-rouge">WindowFn</code> (a function object used with the <a 
href="#windowing">Window</a> transform).</p>
+</blockquote>
+
+<h5 id="serializability">Serializability</h5>
+
+<p>Any function object you provide to a transform must be <strong>fully 
serializable</strong>. This is because a copy of the function needs to be 
serialized and transmitted to a remote worker in your processing cluster. The 
base classes for user code, such as <code 
class="highlighter-rouge">DoFn</code>, <code 
class="highlighter-rouge">CombineFn</code>, and <code 
class="highlighter-rouge">WindowFn</code>, already implement <code 
class="highlighter-rouge">Serializable</code>; however, your subclass must not 
add any non-serializable members.</p>
+
+<p>Some other serializability factors you should keep in mind are:</p>
+
+<ul>
+  <li>Transient fields in your function object are <em>not</em> transmitted to 
worker instances, because they are not automatically serialized.</li>
+  <li>Avoid loading a field with a large amount of data before 
serialization.</li>
+  <li>Individual instances of your function object cannot share data.</li>
+  <li>Mutating a function object after it gets applied will have no 
effect.</li>
+  <li>Take care when declaring your function object inline by using an 
anonymous inner class instance. In a non-static context, your inner class 
instance will implicitly contain a pointer to the enclosing class and that 
class’s state. That enclosing class will also be serialized, and thus the same 
considerations that apply to the function object itself also apply to this 
outer class.</li>
+</ul>
+
+<h5 id="thread-compatibility">Thread-Compatibility</h5>
+
+<p>Your function object should be thread-compatible. Each instance of your 
function object is accessed by a single thread on a worker instance, unless you 
explicitly create your own threads. Note, however, that <strong>the Beam SDKs 
are not thread-safe</strong>. If you create your own threads in your user code, 
you must provide your own synchronization. Note that static members in your 
function object are not passed to worker instances and that multiple instances 
of your function may be accessed from different threads.</p>
+
+<h5 id="idempotence">Idempotence</h5>
+
+<p>It’s recommended that you make your function object idempotent—that is, 
that it can be repeated or retried as often as necessary without causing 
unintended side effects. The Beam model provides no guarantees as to the number 
of times your user code might be invoked or retried; as such, keeping your 
function object idempotent keeps your pipeline’s output deterministic, and 
your transforms’ behavior more predictable and easier to debug.</p>
+
+<p><a name="io"></a>
+<a name="running"></a>
+<a name="transforms-composite"></a>
+<a name="transforms-sideio"></a>
+<a name="coders"></a>
+<a name="windowing"></a>
+<a name="triggers"></a></p>
+
+<blockquote>
+  <p><strong>Note:</strong> This guide is still in progress. There is an open 
issue to finish the guide (<a 
href="https://issues.apache.org/jira/browse/BEAM-193">BEAM-193</a>).</p>
+</blockquote>
+
+      </div>
+
+
+    <hr>
+  <div class="row">
+      <div class="col-xs-12">
+          <footer>
+              <p class="text-center">&copy; Copyright 2016
+                <a href="http://www.apache.org">The Apache Software 
Foundation.</a> All Rights Reserved.</p>
+                <p class="text-center"><a href="/privacy_policy">Privacy 
Policy</a> |
+                <a href="/feed.xml">RSS Feed</a></p>
+          </footer>
+      </div>
+  </div>
+  <!-- container div end -->
+</div>
+
+
+  </body>
+
+</html>


http://git-wip-us.apache.org/repos/asf/incubator-beam-site/blob/61ba656f/content/documentation/resources/index.html
----------------------------------------------------------------------
diff --git a/content/documentation/resources/index.html 
b/content/documentation/resources/index.html
new file mode 100644
index 0000000..ff63294
--- /dev/null
+++ b/content/documentation/resources/index.html
@@ -0,0 +1,175 @@
+<!DOCTYPE html>
+<html lang="en">
+
+  <head>
+  <meta charset="utf-8">
+  <meta http-equiv="X-UA-Compatible" content="IE=edge">
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+
+  <title>Beam Learning Resources</title>
+  <meta name="description" content="Apache Beam is an open source, unified 
model and set of language-specific SDKs for defining and executing data 
processing workflows, and also data ingestion and integration flows, supporting 
Enterprise Integration Patterns (EIPs) and Domain Specific Languages (DSLs). 
Dataflow pipelines simplify the mechanics of large-scale batch and streaming 
data processing and can run on a number of runtimes like Apache Flink, Apache 
Spark, and Google Cloud Dataflow (a cloud service). Beam also brings DSL in 
different languages, allowing users to easily implement their data integration 
processes.
+">
+
+  <link rel="stylesheet" href="/styles/site.css">
+  <link rel="stylesheet" href="/css/theme.css">
+  <script 
src="https://ajax.googleapis.com/ajax/libs/jquery/2.2.0/jquery.min.js"></script>
+  <script src="/js/bootstrap.min.js"></script>
+  <script src="/js/language-switch.js"></script>
+  <link rel="canonical" 
href="http://beam.incubator.apache.org/documentation/resources/" 
data-proofer-ignore>
+  <link rel="alternate" type="application/rss+xml" title="Apache Beam 
(incubating)" href="http://beam.incubator.apache.org/feed.xml">
+  <script>
+    
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+    (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+    
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+    
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+
+    ga('create', 'UA-73650088-1', 'auto');
+    ga('send', 'pageview');
+
+  </script>
+  <link rel="shortcut icon" type="image/x-icon" href="/images/favicon.ico">
+</head>
+
+
+  <body role="document">
+
+    <nav class="navbar navbar-default navbar-fixed-top">
+  <div class="container">
+    <div class="navbar-header">
+      <a href="/" class="navbar-brand" >
+        <img alt="Brand" style="height: 25px" 
src="/images/beam_logo_navbar.png">
+      </a>
+      <button type="button" class="navbar-toggle collapsed" 
data-toggle="collapse" data-target="#navbar" aria-expanded="false" 
aria-controls="navbar">
+        <span class="sr-only">Toggle navigation</span>
+        <span class="icon-bar"></span>
+        <span class="icon-bar"></span>
+        <span class="icon-bar"></span>
+      </button>
+    </div>
+    <div id="navbar" class="navbar-collapse collapse">
+      <ul class="nav navbar-nav">
+        <li class="dropdown">
+                 <a href="#" class="dropdown-toggle" data-toggle="dropdown" 
role="button" aria-haspopup="true" aria-expanded="false">Get Started <span 
class="caret"></span></a>
+                 <ul class="dropdown-menu">
+                         <li><a href="/get-started/beam-overview/">Beam 
Overview</a></li>
+              <li><a href="/get-started/quickstart/">Quickstart</a></li>
+                         <li role="separator" class="divider"></li>
+                         <li class="dropdown-header">Example Walkthroughs</li>
+                         <li><a 
href="/get-started/wordcount-example/">WordCount</a></li>
+                         <li><a 
href="/get-started/mobile-gaming-example/">Mobile Gaming</a></li>
+              <li role="separator" class="divider"></li>
+              <li class="dropdown-header">Resources</li>
+              <li><a href="/get-started/downloads">Downloads</a></li>
+              <li><a href="/get-started/releases">Release Notes</a></li>
+              <li><a href="/get-started/support">Support</a></li>
+                 </ul>
+           </li>
+        <li class="dropdown">
+                 <a href="#" class="dropdown-toggle" data-toggle="dropdown" 
role="button" aria-haspopup="true" aria-expanded="false">Documentation <span 
class="caret"></span></a>
+                 <ul class="dropdown-menu">
+                         <li><a href="/documentation">Using the 
Documentation</a></li>
+                         <li role="separator" class="divider"></li>
+                         <li class="dropdown-header">Beam Concepts</li>
+                         <li><a 
href="/documentation/programming-guide/">Programming Guide</a></li>
+                         <li><a href="/documentation/resources/">Additional 
Resources</a></li>
+                         <li role="separator" class="divider"></li>
+              <li class="dropdown-header">Pipeline Fundamentals</li>
+              <li><a 
href="/documentation/pipelines/design-your-pipeline/">Design Your 
Pipeline</a></li>
+              <li><a 
href="/documentation/pipelines/create-your-pipeline/">Create Your 
Pipeline</a></li>
+              <li><a href="/documentation/pipelines/test-your-pipeline/">Test 
Your Pipeline</a></li>
+              <li role="separator" class="divider"></li>
+                         <li class="dropdown-header">SDKs</li>
+                         <li><a href="/documentation/sdks/java/">Java 
SDK</a></li>
+                         <li><a href="/documentation/sdks/javadoc/">Java SDK 
API Reference</a></li>
+                         <li role="separator" class="divider"></li>
+                         <li class="dropdown-header">Runners</li>
+                         <li><a 
href="/documentation/runners/capability-matrix/">Capability Matrix</a></li>
+                         <li><a href="/documentation/runners/direct/">Direct 
Runner</a></li>
+                         <li><a href="/documentation/runners/flink/">Apache 
Flink Runner</a></li>
+                         <li><a href="/documentation/runners/spark/">Apache 
Spark Runner</a></li>
+                         <li><a href="/documentation/runners/dataflow/">Cloud 
Dataflow Runner</a></li>
+                 </ul>
+           </li>
+        <li class="dropdown">
+                 <a href="#" class="dropdown-toggle" data-toggle="dropdown" 
role="button" aria-haspopup="true" aria-expanded="false">Contribute <span 
class="caret"></span></a>
+                 <ul class="dropdown-menu">
+                         <li><a href="/contribute">Get Started 
Contributing</a></li>
+        <li role="separator" class="divider"></li>
+        <li class="dropdown-header">Guides</li>
+                         <li><a 
href="/contribute/contribution-guide/">Contribution Guide</a></li>
+        <li><a href="/contribute/testing/">Testing Guide</a></li>
+        <li><a href="/contribute/release-guide/">Release Guide</a></li>
+        <li role="separator" class="divider"></li>
+        <li class="dropdown-header">Technical References</li>
+        <li><a href="/contribute/design-principles/">Design Principles</a></li>
+                         <li><a href="/contribute/work-in-progress/">Ongoing 
Projects</a></li>
+        <li><a href="/contribute/source-repository/">Source 
Repository</a></li>      
+        <li role="separator" class="divider"></li>
+                         <li class="dropdown-header">Promotion</li>
+        <li><a href="/contribute/presentation-materials/">Presentation 
Materials</a></li>
+        <li><a href="/contribute/logos/">Logos and Design</a></li>
+        <li role="separator" class="divider"></li>
+        <li><a href="/contribute/team/">Team</a></li>
+                 </ul>
+           </li>
+
+        <li><a href="/blog">Blog</a></li>
+      </ul>
+      <ul class="nav navbar-nav navbar-right">
+        <li class="dropdown">
+          <a href="#" class="dropdown-toggle" data-toggle="dropdown" 
role="button" aria-haspopup="true" aria-expanded="false"><img 
src="https://www.apache.org/foundation/press/kit/feather_small.png" alt="Apache 
Logo" style="height:24px;">Apache Software Foundation<span 
class="caret"></span></a>
+          <ul class="dropdown-menu dropdown-menu-right">
+            <li><a href="http://www.apache.org/">ASF Homepage</a></li>
+            <li><a href="http://www.apache.org/licenses/">License</a></li>
+            <li><a href="http://www.apache.org/security/">Security</a></li>
+            <li><a 
href="http://www.apache.org/foundation/thanks.html">Thanks</a></li>
+            <li><a 
href="http://www.apache.org/foundation/sponsorship.html">Sponsorship</a></li>
+            <li><a 
href="https://www.apache.org/foundation/policies/conduct">Code of 
Conduct</a></li>
+          </ul>
+        </li>
+      </ul>
+    </div><!--/.nav-collapse -->
+  </div>
+</nav>
+
+
+<link rel="stylesheet" href="">
+
+
+    <div class="container" role="main">
+
+      <div class="row">
+        <h1 
id="additional-resources-for-learning-about-apache-beam">Additional Resources 
for Learning about Apache Beam</h1>
+
+<p>This page is under construction (<a 
href="https://issues.apache.org/jira/browse/BEAM-509">BEAM-509</a>).</p>
+
+<p>Here are some links to some of our favorite articles and videos to get you 
started.</p>
+
+<ul>
+  <li><a 
href="https://www.oreilly.com/ideas/the-world-beyond-batch-streaming-101">The 
world beyond batch: Streaming 101</a></li>
+  <li><a 
href="https://www.oreilly.com/ideas/the-world-beyond-batch-streaming-102">The 
world beyond batch: Streaming 102</a></li>
+  <li><a 
href="https://cloud.google.com/dataflow/blog/dataflow-beam-and-spark-comparison">Dataflow/Beam
 &amp; Spark: A Programming Model Comparison</a></li>
+  <li><a 
href="http://googlecloudplatform.blogspot.com/2016/01/Dataflow-and-open-source-proposal-to-join-the-Apache-Incubator.html">Dataflow
 and open source - proposal to join the Apache Incubator</a></li>
+</ul>
+
+      </div>
+
+
+    <hr>
+  <div class="row">
+      <div class="col-xs-12">
+          <footer>
+              <p class="text-center">&copy; Copyright 2016
+                <a href="http://www.apache.org">The Apache Software 
Foundation.</a> All Rights Reserved.</p>
+                <p class="text-center"><a href="/privacy_policy">Privacy 
Policy</a> |
+                <a href="/feed.xml">RSS Feed</a></p>
+          </footer>
+      </div>
+  </div>
+  <!-- container div end -->
+</div>
+
+
+  </body>
+
+</html>

[47/51] [abbrv] [partial] incubator-beam-site git commit: Regenerate site

Reply via email to