Regenerate website

Project: http://git-wip-us.apache.org/repos/asf/beam-site/repo
Commit: http://git-wip-us.apache.org/repos/asf/beam-site/commit/75c59b81
Tree: http://git-wip-us.apache.org/repos/asf/beam-site/tree/75c59b81
Diff: http://git-wip-us.apache.org/repos/asf/beam-site/diff/75c59b81

Branch: refs/heads/asf-site
Commit: 75c59b817f85f9fdc91aa8cee3261968257a6c53
Parents: 69ee8d5
Author: Ahmet Altay <al...@google.com>
Authored: Tue May 9 14:24:56 2017 -0700
Committer: Ahmet Altay <al...@google.com>
Committed: Tue May 9 14:24:56 2017 -0700

----------------------------------------------------------------------
 .../documentation/io/built-in/hadoop/index.html | 373 +++++++++++++++++++
 content/documentation/io/built-in/index.html    |   2 +-
 2 files changed, 374 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/beam-site/blob/75c59b81/content/documentation/io/built-in/hadoop/index.html
----------------------------------------------------------------------
diff --git a/content/documentation/io/built-in/hadoop/index.html 
b/content/documentation/io/built-in/hadoop/index.html
new file mode 100644
index 0000000..7fda51a
--- /dev/null
+++ b/content/documentation/io/built-in/hadoop/index.html
@@ -0,0 +1,373 @@
+<!DOCTYPE html>
+<html lang="en">
+
+  <head>
+  <meta charset="utf-8">
+  <meta http-equiv="X-UA-Compatible" content="IE=edge">
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+
+  <title>Apache Hadoop InputFormat IO</title>
+  <meta name="description" content="Apache Beam is an open source, unified 
model and set of language-specific SDKs for defining and executing data 
processing workflows, and also data ingestion and integration flows, supporting 
Enterprise Integration Patterns (EIPs) and Domain Specific Languages (DSLs). 
Dataflow pipelines simplify the mechanics of large-scale batch and streaming 
data processing and can run on a number of runtimes like Apache Flink, Apache 
Spark, and Google Cloud Dataflow (a cloud service). Beam also brings DSL in 
different languages, allowing users to easily implement their data integration 
processes.
+">
+
+  <link rel="stylesheet" href="/styles/site.css">
+  <link rel="stylesheet" href="/css/theme.css">
+  <script 
src="https://ajax.googleapis.com/ajax/libs/jquery/2.2.0/jquery.min.js";></script>
+  <script src="/js/bootstrap.min.js"></script>
+  <script src="/js/language-switch.js"></script>
+  <link rel="canonical" 
href="https://beam.apache.org/documentation/io/built-in/hadoop/"; 
data-proofer-ignore>
+  <link rel="alternate" type="application/rss+xml" title="Apache Beam" 
href="https://beam.apache.org/feed.xml";>
+  <script>
+    
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+    (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+    
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+    
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+
+    ga('create', 'UA-73650088-1', 'auto');
+    ga('send', 'pageview');
+
+  </script>
+  <link rel="shortcut icon" type="image/x-icon" href="/images/favicon.ico">
+</head>
+
+
+  <body role="document">
+
+    <nav class="navbar navbar-default navbar-fixed-top">
+  <div class="container">
+    <div class="navbar-header">
+      <a href="/" class="navbar-brand" >
+        <img alt="Brand" style="height: 25px" 
src="/images/beam_logo_navbar.png">
+      </a>
+      <button type="button" class="navbar-toggle collapsed" 
data-toggle="collapse" data-target="#navbar" aria-expanded="false" 
aria-controls="navbar">
+        <span class="sr-only">Toggle navigation</span>
+        <span class="icon-bar"></span>
+        <span class="icon-bar"></span>
+        <span class="icon-bar"></span>
+      </button>
+    </div>
+    <div id="navbar" class="navbar-collapse collapse">
+      <ul class="nav navbar-nav">
+        <li class="dropdown">
+                 <a href="#" class="dropdown-toggle" data-toggle="dropdown" 
role="button" aria-haspopup="true" aria-expanded="false">Get Started <span 
class="caret"></span></a>
+                 <ul class="dropdown-menu">
+                         <li><a href="/get-started/beam-overview/">Beam 
Overview</a></li>
+        <li><a href="/get-started/quickstart-java/">Quickstart - Java</a></li>
+        <li><a href="/get-started/quickstart-py/">Quickstart - Python</a></li>
+                         <li role="separator" class="divider"></li>
+                         <li class="dropdown-header">Example Walkthroughs</li>
+                         <li><a 
href="/get-started/wordcount-example/">WordCount</a></li>
+                         <li><a 
href="/get-started/mobile-gaming-example/">Mobile Gaming</a></li>
+              <li role="separator" class="divider"></li>
+              <li class="dropdown-header">Resources</li>
+              <li><a href="/get-started/downloads">Downloads</a></li>
+              <li><a href="/get-started/support">Support</a></li>
+                 </ul>
+           </li>
+        <li class="dropdown">
+                 <a href="#" class="dropdown-toggle" data-toggle="dropdown" 
role="button" aria-haspopup="true" aria-expanded="false">Documentation <span 
class="caret"></span></a>
+                 <ul class="dropdown-menu">
+                         <li><a href="/documentation">Using the 
Documentation</a></li>
+                         <li role="separator" class="divider"></li>
+                         <li class="dropdown-header">Beam Concepts</li>
+                         <li><a 
href="/documentation/programming-guide/">Programming Guide</a></li>
+                         <li><a href="/documentation/resources/">Additional 
Resources</a></li>
+                         <li role="separator" class="divider"></li>
+              <li class="dropdown-header">Pipeline Fundamentals</li>
+              <li><a 
href="/documentation/pipelines/design-your-pipeline/">Design Your 
Pipeline</a></li>
+              <li><a 
href="/documentation/pipelines/create-your-pipeline/">Create Your 
Pipeline</a></li>
+              <li><a href="/documentation/pipelines/test-your-pipeline/">Test 
Your Pipeline</a></li>
+              <li><a href="/documentation/io/io-toc/">Pipeline I/O</a></li>
+              <li role="separator" class="divider"></li>
+                         <li class="dropdown-header">SDKs</li>
+                         <li><a href="/documentation/sdks/java/">Java 
SDK</a></li>
+                         <li><a href="/documentation/sdks/javadoc/0.6.0/" 
target="_blank">Java SDK API Reference <img src="/images/external-link-icon.png"
+                 width="14" height="14"
+                 alt="External link."></a>
+        </li>
+        <li><a href="/documentation/sdks/python/">Python SDK</a></li>
+        <li><a href="/documentation/sdks/pydoc/0.6.0/" target="_blank">Python 
SDK API Reference <img src="/images/external-link-icon.png"
+                 width="14" height="14"
+                 alt="External link."></a>
+        </li>
+                         <li role="separator" class="divider"></li>
+                         <li class="dropdown-header">Runners</li>
+                         <li><a 
href="/documentation/runners/capability-matrix/">Capability Matrix</a></li>
+                         <li><a href="/documentation/runners/direct/">Direct 
Runner</a></li>
+                         <li><a href="/documentation/runners/apex/">Apache 
Apex Runner</a></li>
+                         <li><a href="/documentation/runners/flink/">Apache 
Flink Runner</a></li>
+                         <li><a href="/documentation/runners/spark/">Apache 
Spark Runner</a></li>
+                         <li><a href="/documentation/runners/dataflow/">Cloud 
Dataflow Runner</a></li>
+                 </ul>
+           </li>
+        <li class="dropdown">
+                 <a href="#" class="dropdown-toggle" data-toggle="dropdown" 
role="button" aria-haspopup="true" aria-expanded="false">Contribute <span 
class="caret"></span></a>
+                 <ul class="dropdown-menu">
+                         <li><a href="/contribute">Get Started 
Contributing</a></li>
+        <li role="separator" class="divider"></li>
+        <li class="dropdown-header">Guides</li>
+                         <li><a 
href="/contribute/contribution-guide/">Contribution Guide</a></li>
+        <li><a href="/contribute/testing/">Testing Guide</a></li>
+        <li><a href="/contribute/release-guide/">Release Guide</a></li>
+        <li><a href="/contribute/ptransform-style-guide/">PTransform Style 
Guide</a></li>
+        <li role="separator" class="divider"></li>
+        <li class="dropdown-header">Technical References</li>
+        <li><a href="/contribute/design-principles/">Design Principles</a></li>
+                         <li><a href="/contribute/work-in-progress/">Ongoing 
Projects</a></li>
+        <li><a href="/contribute/source-repository/">Source Repository</a></li>
+        <li role="separator" class="divider"></li>
+                         <li class="dropdown-header">Promotion</li>
+        <li><a href="/contribute/presentation-materials/">Presentation 
Materials</a></li>
+        <li><a href="/contribute/logos/">Logos and Design</a></li>
+        <li role="separator" class="divider"></li>
+        <li><a href="/contribute/maturity-model/">Maturity Model</a></li>
+        <li><a href="/contribute/team/">Team</a></li>
+                 </ul>
+           </li>
+
+        <li><a href="/blog">Blog</a></li>
+      </ul>
+      <ul class="nav navbar-nav navbar-right">
+        <li class="dropdown">
+          <a href="#" class="dropdown-toggle" data-toggle="dropdown" 
role="button" aria-haspopup="true" aria-expanded="false"><img 
src="https://www.apache.org/foundation/press/kit/feather_small.png"; alt="Apache 
Logo" style="height:24px;">Apache Software Foundation<span 
class="caret"></span></a>
+          <ul class="dropdown-menu dropdown-menu-right">
+            <li><a href="http://www.apache.org/";>ASF Homepage</a></li>
+            <li><a href="http://www.apache.org/licenses/";>License</a></li>
+            <li><a href="http://www.apache.org/security/";>Security</a></li>
+            <li><a 
href="http://www.apache.org/foundation/thanks.html";>Thanks</a></li>
+            <li><a 
href="http://www.apache.org/foundation/sponsorship.html";>Sponsorship</a></li>
+            <li><a 
href="https://www.apache.org/foundation/policies/conduct";>Code of 
Conduct</a></li>
+          </ul>
+        </li>
+      </ul>
+    </div><!--/.nav-collapse -->
+  </div>
+</nav>
+
+
+<link rel="stylesheet" href="">
+
+
+    <div class="container" role="main">
+
+      <div class="row">
+        <p><a href="/documentation/io/io-toc/">Pipeline I/O Table of 
Contents</a></p>
+
+<h1 id="hadoop-inputformat-io">Hadoop InputFormat IO</h1>
+
+<p>A <code class="highlighter-rouge">HadoopInputFormatIO</code> is a transform 
for reading data from any source that implements Hadoop’s <code 
class="highlighter-rouge">InputFormat</code>. For example, Cassandra, 
Elasticsearch, HBase, Redis, Postgres, etc.</p>
+
+<p><code class="highlighter-rouge">HadoopInputFormatIO</code> allows you to 
connect to many data sources that do not yet have a Beam IO transform. However, 
<code class="highlighter-rouge">HadoopInputFormatIO</code> has to make several 
performance trade-offs in connecting to <code 
class="highlighter-rouge">InputFormat</code>. So, if there is another Beam IO 
transform for connecting specifically to your data source of choice, we 
recommend you use that one.</p>
+
+<p>You will need to pass a Hadoop <code 
class="highlighter-rouge">Configuration</code> with parameters specifying how 
the read will occur. Many properties of the <code 
class="highlighter-rouge">Configuration</code> are optional and some are 
required for certain <code class="highlighter-rouge">InputFormat</code> 
classes, but the following properties must be set for all <code 
class="highlighter-rouge">InputFormat</code> classes:</p>
+
+<ul>
+  <li><code class="highlighter-rouge">mapreduce.job.inputformat.class</code> - 
The <code class="highlighter-rouge">InputFormat</code> class used to connect to 
your data source of choice.</li>
+  <li><code class="highlighter-rouge">key.class</code> - The <code 
class="highlighter-rouge">Key</code> class returned by the <code 
class="highlighter-rouge">InputFormat</code> in <code 
class="highlighter-rouge">mapreduce.job.inputformat.class</code>.</li>
+  <li><code class="highlighter-rouge">value.class</code> - The <code 
class="highlighter-rouge">Value</code> class returned by the <code 
class="highlighter-rouge">InputFormat</code> in <code 
class="highlighter-rouge">mapreduce.job.inputformat.class</code>.</li>
+</ul>
+
+<p>For example:</p>
+<div class="language-java highlighter-rouge"><pre 
class="highlight"><code><span class="n">Configuration</span> <span 
class="n">myHadoopConfiguration</span> <span class="o">=</span> <span 
class="k">new</span> <span class="n">Configuration</span><span 
class="o">(</span><span class="kc">false</span><span class="o">);</span>
+<span class="c1">// Set Hadoop InputFormat, key and value class in 
configuration</span>
+<span class="n">myHadoopConfiguration</span><span class="o">.</span><span 
class="na">setClass</span><span class="o">(</span><span 
class="s">"mapreduce.job.inputformat.class"</span><span class="o">,</span> 
<span class="n">InputFormatClass</span><span class="o">,</span>
+  <span class="n">InputFormat</span><span class="o">.</span><span 
class="na">class</span><span class="o">);</span>
+<span class="n">myHadoopConfiguration</span><span class="o">.</span><span 
class="na">setClass</span><span class="o">(</span><span 
class="s">"key.class"</span><span class="o">,</span> <span 
class="n">InputFormatKeyClass</span><span class="o">,</span> <span 
class="n">Object</span><span class="o">.</span><span 
class="na">class</span><span class="o">);</span>
+<span class="n">myHadoopConfiguration</span><span class="o">.</span><span 
class="na">setClass</span><span class="o">(</span><span 
class="s">"value.class"</span><span class="o">,</span> <span 
class="n">InputFormatValueClass</span><span class="o">,</span> <span 
class="n">Object</span><span class="o">.</span><span 
class="na">class</span><span class="o">);</span>
+</code></pre>
+</div>
+
+<div class="language-py highlighter-rouge"><pre class="highlight"><code>  
<span class="c"># The Beam SDK for Python does not support Hadoop InputFormat 
IO.</span>
+</code></pre>
+</div>
+
+<p>You will need to check if the <code class="highlighter-rouge">Key</code> 
and <code class="highlighter-rouge">Value</code> classes output by the <code 
class="highlighter-rouge">InputFormat</code> have a Beam <code 
class="highlighter-rouge">Coder</code> available. If not, you can use <code 
class="highlighter-rouge">withKeyTranslation</code> or <code 
class="highlighter-rouge">withValueTranslation</code> to specify a method 
transforming instances of those classes into another class that is supported by 
a Beam <code class="highlighter-rouge">Coder</code>. These settings are 
optional and you don’t need to specify translation for both key and value.</p>
+
+<p>For example:</p>
+<div class="language-java highlighter-rouge"><pre 
class="highlight"><code><span class="n">SimpleFunction</span><span 
class="o">&lt;</span><span class="n">InputFormatKeyClass</span><span 
class="o">,</span> <span class="n">MyKeyClass</span><span class="o">&gt;</span> 
<span class="n">myOutputKeyType</span> <span class="o">=</span>
+<span class="k">new</span> <span class="n">SimpleFunction</span><span 
class="o">&lt;</span><span class="n">InputFormatKeyClass</span><span 
class="o">,</span> <span class="n">MyKeyClass</span><span 
class="o">&gt;()</span> <span class="o">{</span>
+  <span class="kd">public</span> <span class="n">MyKeyClass</span> <span 
class="nf">apply</span><span class="o">(</span><span 
class="n">InputFormatKeyClass</span> <span class="n">input</span><span 
class="o">)</span> <span class="o">{</span>
+  <span class="c1">// ...logic to transform InputFormatKeyClass to 
MyKeyClass</span>
+  <span class="o">}</span>
+<span class="o">};</span>
+<span class="n">SimpleFunction</span><span class="o">&lt;</span><span 
class="n">InputFormatValueClass</span><span class="o">,</span> <span 
class="n">MyValueClass</span><span class="o">&gt;</span> <span 
class="n">myOutputValueType</span> <span class="o">=</span>
+<span class="k">new</span> <span class="n">SimpleFunction</span><span 
class="o">&lt;</span><span class="n">InputFormatValueClass</span><span 
class="o">,</span> <span class="n">MyValueClass</span><span 
class="o">&gt;()</span> <span class="o">{</span>
+  <span class="kd">public</span> <span class="n">MyValueClass</span> <span 
class="nf">apply</span><span class="o">(</span><span 
class="n">InputFormatValueClass</span> <span class="n">input</span><span 
class="o">)</span> <span class="o">{</span>
+  <span class="c1">// ...logic to transform InputFormatValueClass to 
MyValueClass</span>
+  <span class="o">}</span>
+<span class="o">};</span>
+</code></pre>
+</div>
+
+<div class="language-py highlighter-rouge"><pre class="highlight"><code>  
<span class="c"># The Beam SDK for Python does not support Hadoop InputFormat 
IO.</span>
+</code></pre>
+</div>
+
+<h3 id="reading-using-hadoop-inputformat-io">Reading using Hadoop InputFormat 
IO</h3>
+
+<h4 id="read-data-only-with-hadoop-configuration">Read data only with Hadoop 
configuration.</h4>
+
+<div class="language-java highlighter-rouge"><pre 
class="highlight"><code><span class="n">p</span><span class="o">.</span><span 
class="na">apply</span><span class="o">(</span><span 
class="s">"read"</span><span class="o">,</span>
+  <span class="n">HadoopInputFormatIO</span><span class="o">.&lt;</span><span 
class="n">InputFormatKeyClass</span><span class="o">,</span> <span 
class="n">InputFormatKeyClass</span><span class="o">&gt;</span><span 
class="n">read</span><span class="o">()</span>
+  <span class="o">.</span><span class="na">withConfiguration</span><span 
class="o">(</span><span class="n">myHadoopConfiguration</span><span 
class="o">);</span>
+</code></pre>
+</div>
+
+<div class="language-py highlighter-rouge"><pre class="highlight"><code>  
<span class="c"># The Beam SDK for Python does not support Hadoop InputFormat 
IO.</span>
+</code></pre>
+</div>
+
+<h4 id="read-data-with-configuration-and-key-translation">Read data with 
configuration and key translation</h4>
+
+<p>For example scenario: Beam <code class="highlighter-rouge">Coder</code> is 
not available for key class hence key translation is required.</p>
+
+<div class="language-java highlighter-rouge"><pre 
class="highlight"><code><span class="n">p</span><span class="o">.</span><span 
class="na">apply</span><span class="o">(</span><span 
class="s">"read"</span><span class="o">,</span>
+  <span class="n">HadoopInputFormatIO</span><span class="o">.&lt;</span><span 
class="n">MyKeyClass</span><span class="o">,</span> <span 
class="n">InputFormatKeyClass</span><span class="o">&gt;</span><span 
class="n">read</span><span class="o">()</span>
+  <span class="o">.</span><span class="na">withConfiguration</span><span 
class="o">(</span><span class="n">myHadoopConfiguration</span><span 
class="o">)</span>
+  <span class="o">.</span><span class="na">withKeyTranslation</span><span 
class="o">(</span><span class="n">myOutputKeyType</span><span 
class="o">);</span>
+</code></pre>
+</div>
+
+<div class="language-py highlighter-rouge"><pre class="highlight"><code>  
<span class="c"># The Beam SDK for Python does not support Hadoop InputFormat 
IO.</span>
+</code></pre>
+</div>
+
+<h4 id="read-data-with-configuration-and-value-translation">Read data with 
configuration and value translation</h4>
+
+<p>For example scenario: Beam <code class="highlighter-rouge">Coder</code> is 
not available for value class hence value translation is required.</p>
+
+<div class="language-java highlighter-rouge"><pre 
class="highlight"><code><span class="n">p</span><span class="o">.</span><span 
class="na">apply</span><span class="o">(</span><span 
class="s">"read"</span><span class="o">,</span>
+  <span class="n">HadoopInputFormatIO</span><span class="o">.&lt;</span><span 
class="n">InputFormatKeyClass</span><span class="o">,</span> <span 
class="n">MyValueClass</span><span class="o">&gt;</span><span 
class="n">read</span><span class="o">()</span>
+  <span class="o">.</span><span class="na">withConfiguration</span><span 
class="o">(</span><span class="n">myHadoopConfiguration</span><span 
class="o">)</span>
+  <span class="o">.</span><span class="na">withValueTranslation</span><span 
class="o">(</span><span class="n">myOutputValueType</span><span 
class="o">);</span>
+</code></pre>
+</div>
+
+<div class="language-py highlighter-rouge"><pre class="highlight"><code>  
<span class="c"># The Beam SDK for Python does not support Hadoop InputFormat 
IO.</span>
+</code></pre>
+</div>
+
+<h4 
id="read-data-with-configuration-value-translation-and-key-translation">Read 
data with configuration, value translation and key translation</h4>
+
+<p>For example scenario: Beam Coders are not available for both <code 
class="highlighter-rouge">Key</code> class and <code 
class="highlighter-rouge">Value</code> class of <code 
class="highlighter-rouge">InputFormat</code> hence key and value translation is 
required.</p>
+
+<div class="language-java highlighter-rouge"><pre 
class="highlight"><code><span class="n">p</span><span class="o">.</span><span 
class="na">apply</span><span class="o">(</span><span 
class="s">"read"</span><span class="o">,</span>
+  <span class="n">HadoopInputFormatIO</span><span class="o">.&lt;</span><span 
class="n">MyKeyClass</span><span class="o">,</span> <span 
class="n">MyValueClass</span><span class="o">&gt;</span><span 
class="n">read</span><span class="o">()</span>
+  <span class="o">.</span><span class="na">withConfiguration</span><span 
class="o">(</span><span class="n">myHadoopConfiguration</span><span 
class="o">)</span>
+  <span class="o">.</span><span class="na">withKeyTranslation</span><span 
class="o">(</span><span class="n">myOutputKeyType</span><span class="o">)</span>
+  <span class="o">.</span><span class="na">withValueTranslation</span><span 
class="o">(</span><span class="n">myOutputValueType</span><span 
class="o">);</span>
+</code></pre>
+</div>
+
+<div class="language-py highlighter-rouge"><pre class="highlight"><code>  
<span class="c"># The Beam SDK for Python does not support Hadoop InputFormat 
IO.</span>
+</code></pre>
+</div>
+
+<h1 id="examples-for-specific-inputformats">Examples for specific 
InputFormats</h1>
+
+<h3 id="cassandra---cqlinputformat">Cassandra - CqlInputFormat</h3>
+
+<p>To read data from Cassandra, use <code 
class="highlighter-rouge">org.apache.cassandra.hadoop.cql3.CqlInputFormat</code>,
 which needs the following properties to be set:</p>
+
+<div class="language-java highlighter-rouge"><pre 
class="highlight"><code><span class="n">Configuration</span> <span 
class="n">cassandraConf</span> <span class="o">=</span> <span 
class="k">new</span> <span class="n">Configuration</span><span 
class="o">();</span>
+<span class="n">cassandraConf</span><span class="o">.</span><span 
class="na">set</span><span class="o">(</span><span 
class="s">"cassandra.input.thrift.port"</span><span class="o">,</span> <span 
class="s">"9160"</span><span class="o">);</span>
+<span class="n">cassandraConf</span><span class="o">.</span><span 
class="na">set</span><span class="o">(</span><span 
class="s">"cassandra.input.thrift.address"</span><span class="o">,</span> <span 
class="n">CassandraHostIp</span><span class="o">);</span>
+<span class="n">cassandraConf</span><span class="o">.</span><span 
class="na">set</span><span class="o">(</span><span 
class="s">"cassandra.input.partitioner.class"</span><span class="o">,</span> 
<span class="s">"Murmur3Partitioner"</span><span class="o">);</span>
+<span class="n">cassandraConf</span><span class="o">.</span><span 
class="na">set</span><span class="o">(</span><span 
class="s">"cassandra.input.keyspace"</span><span class="o">,</span> <span 
class="s">"myKeySpace"</span><span class="o">);</span>
+<span class="n">cassandraConf</span><span class="o">.</span><span 
class="na">set</span><span class="o">(</span><span 
class="s">"cassandra.input.columnfamily"</span><span class="o">,</span> <span 
class="s">"myColumnFamily"</span><span class="o">);</span>
+<span class="n">cassandraConf</span><span class="o">.</span><span 
class="na">setClass</span><span class="o">(</span><span 
class="s">"key.class"</span><span class="o">,</span> <span 
class="n">java</span><span class="o">.</span><span class="na">lang</span><span 
class="o">.</span><span class="na">Long</span> <span class="n">Long</span><span 
class="o">.</span><span class="na">class</span><span class="o">,</span> <span 
class="n">Object</span><span class="o">.</span><span 
class="na">class</span><span class="o">);</span>
+<span class="n">cassandraConf</span><span class="o">.</span><span 
class="na">setClass</span><span class="o">(</span><span 
class="s">"value.class"</span><span class="o">,</span> <span 
class="n">com</span><span class="o">.</span><span 
class="na">datastax</span><span class="o">.</span><span 
class="na">driver</span><span class="o">.</span><span 
class="na">core</span><span class="o">.</span><span class="na">Row</span> <span 
class="n">Row</span><span class="o">.</span><span class="na">class</span><span 
class="o">,</span> <span class="n">Object</span><span class="o">.</span><span 
class="na">class</span><span class="o">);</span>
+<span class="n">cassandraConf</span><span class="o">.</span><span 
class="na">setClass</span><span class="o">(</span><span 
class="s">"mapreduce.job.inputformat.class"</span><span class="o">,</span> 
<span class="n">org</span><span class="o">.</span><span 
class="na">apache</span><span class="o">.</span><span 
class="na">cassandra</span><span class="o">.</span><span 
class="na">hadoop</span><span class="o">.</span><span 
class="na">cql3</span><span class="o">.</span><span 
class="na">CqlInputFormat</span> <span class="n">CqlInputFormat</span><span 
class="o">.</span><span class="na">class</span><span class="o">,</span> <span 
class="n">InputFormat</span><span class="o">.</span><span 
class="na">class</span><span class="o">);</span>
+</code></pre>
+</div>
+
+<div class="language-py highlighter-rouge"><pre class="highlight"><code>  
<span class="c"># The Beam SDK for Python does not support Hadoop InputFormat 
IO.</span>
+</code></pre>
+</div>
+
+<p>Call Read transform as follows:</p>
+
+<div class="language-java highlighter-rouge"><pre 
class="highlight"><code><span class="n">PCollection</span><span 
class="o">&lt;</span><span class="n">KV</span><span class="o">&lt;</span><span 
class="n">Long</span><span class="o">,</span> <span 
class="n">String</span><span class="o">&gt;&gt;</span> <span 
class="n">cassandraData</span> <span class="o">=</span>
+  <span class="n">p</span><span class="o">.</span><span 
class="na">apply</span><span class="o">(</span><span 
class="s">"read"</span><span class="o">,</span>
+  <span class="n">HadoopInputFormatIO</span><span class="o">.&lt;</span><span 
class="n">Long</span><span class="o">,</span> <span 
class="n">String</span><span class="o">&gt;</span><span 
class="n">read</span><span class="o">()</span>
+  <span class="o">.</span><span class="na">withConfiguration</span><span 
class="o">(</span><span class="n">cassandraConf</span><span class="o">)</span>
+  <span class="o">.</span><span class="na">withValueTranslation</span><span 
class="o">(</span><span class="n">cassandraOutputValueType</span><span 
class="o">);</span>
+</code></pre>
+</div>
+
+<div class="language-py highlighter-rouge"><pre class="highlight"><code>  
<span class="c"># The Beam SDK for Python does not support Hadoop InputFormat 
IO.</span>
+</code></pre>
+</div>
+
+<p>The <code class="highlighter-rouge">CqlInputFormat</code> key class is 
<code class="highlighter-rouge">java.lang.Long</code> <code 
class="highlighter-rouge">Long</code>, which has a Beam <code 
class="highlighter-rouge">Coder</code>. The <code 
class="highlighter-rouge">CqlInputFormat</code> value class is <code 
class="highlighter-rouge">com.datastax.driver.core.Row</code> <code 
class="highlighter-rouge">Row</code>, which does not have a Beam <code 
class="highlighter-rouge">Coder</code>. Rather than write a new coder, you can 
provide your own translation method, as follows:</p>
+
+<div class="language-java highlighter-rouge"><pre 
class="highlight"><code><span class="n">SimpleFunction</span><span 
class="o">&lt;</span><span class="n">Row</span><span class="o">,</span> <span 
class="n">String</span><span class="o">&gt;</span> <span 
class="n">cassandraOutputValueType</span> <span class="o">=</span> <span 
class="n">SimpleFunction</span><span class="o">&lt;</span><span 
class="n">Row</span><span class="o">,</span> <span class="n">String</span><span 
class="o">&gt;()</span>
+<span class="o">{</span>
+  <span class="kd">public</span> <span class="n">String</span> <span 
class="nf">apply</span><span class="o">(</span><span class="n">Row</span> <span 
class="n">row</span><span class="o">)</span> <span class="o">{</span>
+    <span class="k">return</span> <span class="n">row</span><span 
class="o">.</span><span class="na">getString</span><span 
class="o">(</span><span class="err">'</span><span 
class="n">myColName</span><span class="err">'</span><span class="o">);</span>
+  <span class="o">}</span>
+<span class="o">};</span>
+</code></pre>
+</div>
+
+<div class="language-py highlighter-rouge"><pre class="highlight"><code>  
<span class="c"># The Beam SDK for Python does not support Hadoop InputFormat 
IO.</span>
+</code></pre>
+</div>
+
+<h3 id="elasticsearch---esinputformat">Elasticsearch - EsInputFormat</h3>
+
+<p>To read data from Elasticsearch, use <code 
class="highlighter-rouge">EsInputFormat</code>, which needs following 
properties to be set:</p>
+
+<div class="language-java highlighter-rouge"><pre 
class="highlight"><code><span class="n">Configuration</span> <span 
class="n">elasticSearchConf</span> <span class="o">=</span> <span 
class="k">new</span> <span class="n">Configuration</span><span 
class="o">();</span>
+<span class="n">elasticSearchConf</span><span class="o">.</span><span 
class="na">set</span><span class="o">(</span><span 
class="s">"es.nodes"</span><span class="o">,</span> <span 
class="n">ElasticsearchHostIp</span><span class="o">);</span>
+<span class="n">elasticSearchConf</span><span class="o">.</span><span 
class="na">set</span><span class="o">(</span><span 
class="s">"es.port"</span><span class="o">,</span> <span 
class="s">"9200"</span><span class="o">);</span>
+<span class="n">elasticSearchConf</span><span class="o">.</span><span 
class="na">set</span><span class="o">(</span><span 
class="s">"es.resource"</span><span class="o">,</span> <span 
class="s">"ElasticIndexName/ElasticTypeName"</span><span class="o">);</span>
+<span class="n">elasticSearchConf</span><span class="o">.</span><span 
class="na">setClass</span><span class="o">(</span><span 
class="s">"key.class"</span><span class="o">,</span> <span 
class="n">org</span><span class="o">.</span><span class="na">apache</span><span 
class="o">.</span><span class="na">hadoop</span><span class="o">.</span><span 
class="na">io</span><span class="o">.</span><span class="na">Text</span> <span 
class="n">Text</span><span class="o">.</span><span class="na">class</span><span 
class="o">,</span> <span class="n">Object</span><span class="o">.</span><span 
class="na">class</span><span class="o">);</span>
+<span class="n">elasticSearchConf</span><span class="o">.</span><span class="na">setClass</span><span class="o">(</span><span class="s">"value.class"</span><span class="o">,</span> <span class="n">LinkedMapWritable</span><span class="o">.</span><span class="na">class</span><span class="o">,</span> <span class="n">Object</span><span class="o">.</span><span class="na">class</span><span class="o">);</span>
+<span class="n">elasticSearchConf</span><span class="o">.</span><span class="na">setClass</span><span class="o">(</span><span class="s">"mapreduce.job.inputformat.class"</span><span class="o">,</span> <span class="n">EsInputFormat</span><span class="o">.</span><span class="na">class</span><span class="o">,</span> <span class="n">InputFormat</span><span class="o">.</span><span class="na">class</span><span class="o">);</span>
+</code></pre>
+</div>
+
+<div class="language-py highlighter-rouge"><pre class="highlight"><code>  
<span class="c"># The Beam SDK for Python does not support Hadoop InputFormat 
IO.</span>
+</code></pre>
+</div>
+
+<p>Call the Read transform as follows:</p>
+
+<div class="language-java highlighter-rouge"><pre 
class="highlight"><code><span class="n">PCollection</span><span 
class="o">&lt;</span><span class="n">KV</span><span class="o">&lt;</span><span 
class="n">Text</span><span class="o">,</span> <span 
class="n">LinkedMapWritable</span><span class="o">&gt;&gt;</span> <span 
class="n">elasticData</span> <span class="o">=</span> <span 
class="n">p</span><span class="o">.</span><span class="na">apply</span><span 
class="o">(</span><span class="s">"read"</span><span class="o">,</span>
+  <span class="n">HadoopInputFormatIO</span><span class="o">.&lt;</span><span 
class="n">Text</span><span class="o">,</span> <span 
class="n">LinkedMapWritable</span><span class="o">&gt;</span><span 
class="n">read</span><span class="o">().</span><span 
class="na">withConfiguration</span><span class="o">(</span><span 
class="n">elasticSearchConf</span><span class="o">));</span>
+</code></pre>
+</div>
+
+<div class="language-py highlighter-rouge"><pre class="highlight"><code>  
<span class="c"># The Beam SDK for Python does not support Hadoop InputFormat 
IO.</span>
+</code></pre>
+</div>
+
+<p>The <code class="highlighter-rouge">org.elasticsearch.hadoop.mr.EsInputFormat</code> key class is <code class="highlighter-rouge">org.apache.hadoop.io.Text</code>, and its value class is <code class="highlighter-rouge">org.elasticsearch.hadoop.mr.LinkedMapWritable</code>. Both key and value classes have Beam Coders.</p>
+
+      </div>
+
+
+    <hr>
+  <div class="row">
+      <div class="col-xs-12">
+          <footer>
+              <p class="text-center">
+                &copy; Copyright
+                <a href="http://www.apache.org";>The Apache Software 
Foundation</a>,
+                2017. All Rights Reserved.
+              </p>
+              <p class="text-center">
+                <a href="/privacy_policy">Privacy Policy</a> |
+                <a href="/feed.xml">RSS Feed</a>
+              </p>
+          </footer>
+      </div>
+  </div>
+  <!-- container div end -->
+</div>
+
+
+  </body>
+
+</html>

http://git-wip-us.apache.org/repos/asf/beam-site/blob/75c59b81/content/documentation/io/built-in/index.html
----------------------------------------------------------------------
diff --git a/content/documentation/io/built-in/index.html 
b/content/documentation/io/built-in/index.html
index cf262d3..3d2498e 100644
--- a/content/documentation/io/built-in/index.html
+++ b/content/documentation/io/built-in/index.html
@@ -181,7 +181,7 @@
     <p><a 
href="https://github.com/apache/beam/blob/master/sdks/java/core/src/main/java/org/apache/beam/sdk/io";>Google
 Cloud PubSub</a></p>
   </td>
   <td>
-    <p><a 
href="https://github.com/apache/beam/tree/master/sdks/java/io/hadoop";>Apache 
Hadoop InputFormat</a></p>
+    <p><a href="/documentation/io/built-in/hadoop/">Apache Hadoop 
InputFormat</a></p>
     <p><a 
href="https://github.com/apache/beam/tree/master/sdks/java/io/hbase";>Apache 
HBase</a></p>
     <p><a 
href="https://github.com/apache/beam/tree/master/sdks/java/io/mongodb";>MongoDB</a></p>
     <p><a 
href="https://github.com/apache/beam/tree/master/sdks/java/io/jdbc";>JDBC</a></p>

Reply via email to