Author: buildbot
Date: Wed Oct 29 03:38:42 2014
New Revision: 927238
Log:
Staging update by buildbot for crunch
Modified:
websites/staging/crunch/trunk/content/ (props changed)
websites/staging/crunch/trunk/content/user-guide.html
Propchange: websites/staging/crunch/trunk/content/
------------------------------------------------------------------------------
--- cms:source-revision (original)
+++ cms:source-revision Wed Oct 29 03:38:42 2014
@@ -1 +1 @@
-1635032
+1635033
Modified: websites/staging/crunch/trunk/content/user-guide.html
==============================================================================
--- websites/staging/crunch/trunk/content/user-guide.html (original)
+++ websites/staging/crunch/trunk/content/user-guide.html Wed Oct 29 03:38:42
2014
@@ -579,104 +579,100 @@ can be used to kick off a shuffle on the
}
</pre>
-<p>If you find yourself in a situation where you have a
PCollection<Pair<K, V>> and you need a PTable<K, V>, the
+<p>If you find yourself in a situation where you have a
<code>PCollection<Pair<K, V>></code> and you need a
<code>PTable<K, V></code>, the
<a href="apidocs/0.10.0/org/apache/crunch/lib/PTables.html">PTables</a>
library class has methods that will do the conversion for you.</p>
<p>Let's look at some more example PTypes created using the common primitive
and collection types. For most of your pipelines,
you will use one type family exclusively, and so you can cut down on some of
the boilerplate in your classes by importing
all of the methods from the <code>Writables</code> or <code>Avros</code>
classes into your class:</p>
-<pre>
-// Import all of the PType factory methods from Avros
-import static org.apache.crunch.types.avro.Avros.*;
+<div class="codehilite"><pre><span class="c1">// Import all of the PType
factory methods from Avros</span>
+<span class="kn">import</span> <span class="nn">static</span> <span
class="n">org</span><span class="p">.</span><span class="n">apache</span><span
class="p">.</span><span class="n">crunch</span><span class="p">.</span><span
class="n">types</span><span class="p">.</span><span class="n">avro</span><span
class="p">.</span><span class="n">Avros</span><span class="p">.</span><span
class="o">*</span><span class="p">;</span>
+
+<span class="kn">import</span> <span class="nn">org</span><span
class="p">.</span><span class="n">apache</span><span class="p">.</span><span
class="n">crunch</span><span class="p">.</span><span class="n">Pair</span><span
class="p">;</span>
+<span class="kn">import</span> <span class="nn">org</span><span
class="p">.</span><span class="n">apache</span><span class="p">.</span><span
class="n">crunch</span><span class="p">.</span><span
class="n">Tuple3</span><span class="p">;</span>
+<span class="kn">import</span> <span class="nn">org</span><span
class="p">.</span><span class="n">apache</span><span class="p">.</span><span
class="n">crunch</span><span class="p">.</span><span
class="n">TupleN</span><span class="p">;</span>
+
+<span class="kn">import</span> <span class="nn">java</span><span
class="p">.</span><span class="n">nio</span><span class="p">.</span><span
class="n">ByteBuffer</span><span class="p">;</span>
+<span class="kn">import</span> <span class="nn">java</span><span
class="p">.</span><span class="n">util</span><span class="p">.</span><span
class="n">Collection</span><span class="p">;</span>
+<span class="kn">import</span> <span class="nn">java</span><span
class="p">.</span><span class="n">util</span><span class="p">.</span><span
class="n">Map</span><span class="p">;</span>
+
+<span class="n">public</span> <span class="k">class</span> <span
class="n">MyPipeline</span> <span class="p">{</span>
+
+ <span class="c1">// Common primitive types</span>
+ <span class="n">PType</span><span class="o"><</span><span
class="n">Integer</span><span class="o">></span> <span
class="n">intType</span> <span class="o">=</span> <span
class="n">ints</span><span class="p">();</span>
+ <span class="n">PType</span><span class="o"><</span><span
class="n">Long</span><span class="o">></span> <span
class="n">longType</span> <span class="o">=</span> <span
class="n">longs</span><span class="p">();</span>
+ <span class="n">PType</span><span class="o"><</span><span
class="n">Double</span><span class="o">></span> <span
class="n">doubleType</span> <span class="o">=</span> <span
class="n">doubles</span><span class="p">();</span>
+ <span class="c1">// Bytes are represented by java.nio.ByteBuffer</span>
+ <span class="n">PType</span><span class="o"><</span><span
class="n">ByteBuffer</span><span class="o">></span> <span
class="n">bytesType</span> <span class="o">=</span> <span
class="n">bytes</span><span class="p">();</span>
+
+ <span class="c1">// A PTableType: using tableOf will return a PTable instead
of a</span>
+ <span class="c1">// PCollection from a parallelDo call.</span>
+ <span class="n">PTableType</span><span class="o"><</span><span
class="n">String</span><span class="p">,</span> <span
class="n">Boolean</span><span class="o">></span> <span
class="n">tableType</span> <span class="o">=</span> <span
class="n">tableOf</span><span class="p">(</span><span
class="n">strings</span><span class="p">(),</span> <span
class="n">booleans</span><span class="p">());</span>
+
+ <span class="c1">// Pair types: </span>
+ <span class="n">PType</span><span class="o"><</span><span
class="n">Pair</span><span class="o"><</span><span
class="n">String</span><span class="p">,</span> <span
class="n">Boolean</span><span class="o">>></span> <span
class="n">pairType</span> <span class="o">=</span> <span
class="n">pairs</span><span class="p">(</span><span
class="n">strings</span><span class="p">(),</span> <span
class="n">booleans</span><span class="p">());</span>
+ <span class="n">PType</span><span class="o"><</span><span
class="n">Pair</span><span class="o"><</span><span
class="n">String</span><span class="p">,</span> <span
class="n">Pair</span><span class="o"><</span><span
class="n">Long</span><span class="p">,</span> <span class="n">Long</span><span
class="o">>>></span> <span class="n">nestedPairType</span> <span
class="o">=</span> <span class="n">pairs</span><span class="p">(</span><span
class="n">strings</span><span class="p">(),</span> <span
class="n">pairs</span><span class="p">(</span><span class="n">longs</span><span
class="p">(),</span> <span class="n">longs</span><span class="p">()));</span>
+
+ <span class="c1">// A triple</span>
+ <span class="n">PType</span><span class="o"><</span><span
class="n">Tuple3</span><span class="o"><</span><span
class="n">Long</span><span class="p">,</span> <span class="n">Float</span><span
class="p">,</span> <span class="n">Float</span><span class="o">>></span>
<span class="n">tripType</span> <span class="o">=</span> <span
class="n">trips</span><span class="p">(</span><span class="n">longs</span><span
class="p">(),</span> <span class="n">floats</span><span class="p">(),</span>
<span class="n">floats</span><span class="p">());</span>
+ <span class="c1">// An arbitrary length tuple-- note that we lose the
generic type information</span>
+ <span class="n">PType</span><span class="o"><</span><span
class="n">TupleN</span><span class="o">></span> <span
class="n">tupleType</span> <span class="o">=</span> <span
class="n">tupleN</span><span class="p">(</span><span class="n">ints</span><span
class="p">(),</span> <span class="n">ints</span><span class="p">(),</span>
<span class="n">floats</span><span class="p">(),</span> <span
class="n">strings</span><span class="p">(),</span> <span
class="n">strings</span><span class="p">(),</span> <span
class="n">ints</span><span class="p">());</span>
+
+ <span class="c1">// A Collection type</span>
+ <span class="n">PType</span><span class="o"><</span><span
class="n">Collection</span><span class="o"><</span><span
class="n">Long</span><span class="o">>></span> <span
class="n">longsType</span> <span class="o">=</span> <span
class="n">collections</span><span class="p">(</span><span
class="n">longs</span><span class="p">());</span>
+ <span class="c1">// A Map Type-- note that the keys are always strings, we
only specify the value.</span>
+ <span class="n">PType</span><span class="o"><</span><span
class="n">Map</span><span class="o"><</span><span
class="n">String</span><span class="p">,</span> <span
class="n">Boolean</span><span class="o">>></span> <span
class="n">mapType</span> <span class="o">=</span> <span
class="n">maps</span><span class="p">(</span><span
class="n">booleans</span><span class="p">());</span>
+
+ <span class="c1">// A Pair of collections</span>
+ <span class="n">PType</span><span class="o"><</span><span
class="n">Pair</span><span class="o"><</span><span
class="n">Collection</span><span class="o"><</span><span
class="n">String</span><span class="o">></span><span class="p">,</span>
<span class="n">Collection</span><span class="o"><</span><span
class="n">Long</span><span class="o">>>></span> <span
class="n">pairColType</span> <span class="o">=</span> <span
class="n">pairs</span><span class="p">(</span>
+ <span class="n">collections</span><span class="p">(</span><span
class="n">strings</span><span class="p">()),</span>
+ <span class="n">collections</span><span class="p">(</span><span
class="n">longs</span><span class="p">()));</span>
+<span class="p">}</span>
+</pre></div>
-import org.apache.crunch.Pair;
-import org.apache.crunch.Tuple3;
-import org.apache.crunch.TupleN;
-
-import java.nio.ByteBuffer;
-import java.util.Collection;
-import java.util.Map;
-
-public class MyPipeline {
-
- // Common primitive types
- PType<Integer> intType = ints();
- PType<Long> longType = longs();
- PType<Double> doubleType = doubles();
- // Bytes are represented by java.nio.ByteBuffer
- PType<ByteBuffer> bytesType = bytes();
-
- // A PTableType: using tableOf will return a PTable instead of a
- // PCollection from a parallelDo call.
- PTableType<String, Boolean> tableType = tableOf(strings(), booleans());
-
- // Pair types:
- PType<Pair<String, Boolean>> pairType = pairs(strings(),
booleans());
- PType<Pair<String, Pair<Long, Long>> nestedPairType =
pairs(strings(), pairs(longs(), longs()));
-
- // A triple
- PType<Tuple3<Long, Float, Float>> tripType = trips(longs(),
floats(), floats());
- // An arbitrary length tuple-- note that we lose the generic type information
- PType<TupleN> tupleType = tupleN(ints(), ints(), floats(), strings(),
strings(), ints());
-
- // A Collection type
- PType<Collection<Long>> longsType = collections(longs());
- // A Map Type-- note that the keys are always strings, we only specify the
value.
- PType<Map<String, Boolean>> mapType = maps(booleans());
-
- // A Pair of collections
- PType<Pair<Collection<String>, Collection<Long>>>
pairColType = pairs(
- collections(strings()),
- collections(longs()));
-}
-</pre>
<p>Both type families also have a method named <code>PType<T>
records(Class<T> clazz)</code> that can be used to create PTypes that
support the common
record format for each type family. For the WritableTypeFamily, the records
method supports PTypes for implementations of the <code>Writable</code>
interface, and for the AvroTypeFamily, the records method supports PTypes for
implementations of Avro's <code>IndexedRecord</code> interface, which
includes both Avro generic and specific records:</p>
-<pre>
- PType<FooWritable> fwType1 = Writables.records(FooWritable.class);
- // The more obvious "writables" method also works.
- PType<FooWritable> fwType = Writables.writables(FooWritable.class);
-
- // For a generated Avro class, this works:
- PType<Person> personType1 = Avros.records(Person.class);
- // As does this:
- PType<Person> personType2 = Avros.containers(Person.class);
- // If you only have a schema, you can create a generic type, like this:
- org.apache.avro.Schema schema = ...;
- PType<Record> avroGenericType = Avros.generics(schema);
-</pre>
+<div class="codehilite"><pre><span class="n">PType</span><span
class="o"><</span><span class="n">FooWritable</span><span
class="o">></span> <span class="n">fwType1</span> <span class="p">=</span>
<span class="n">Writables</span><span class="p">.</span><span
class="n">records</span><span class="p">(</span><span
class="n">FooWritable</span><span class="p">.</span><span
class="n">class</span><span class="p">);</span>
+<span class="o">//</span> <span class="n">The</span> <span
class="n">more</span> <span class="n">obvious</span> "<span
class="n">writables</span>" <span class="n">method</span> <span
class="n">also</span> <span class="n">works</span><span class="p">.</span>
+<span class="n">PType</span><span class="o"><</span><span
class="n">FooWritable</span><span class="o">></span> <span
class="n">fwType</span> <span class="p">=</span> <span
class="n">Writables</span><span class="p">.</span><span
class="n">writables</span><span class="p">(</span><span
class="n">FooWritable</span><span class="p">.</span><span
class="n">class</span><span class="p">);</span>
+
+<span class="o">//</span> <span class="n">For</span> <span class="n">a</span>
<span class="n">generated</span> <span class="n">Avro</span> <span
class="n">class</span><span class="p">,</span> <span class="n">this</span>
<span class="n">works</span><span class="p">:</span>
+<span class="n">PType</span><span class="o"><</span><span
class="n">Person</span><span class="o">></span> <span
class="n">personType1</span> <span class="p">=</span> <span
class="n">Avros</span><span class="p">.</span><span
class="n">records</span><span class="p">(</span><span
class="n">Person</span><span class="p">.</span><span
class="n">class</span><span class="p">);</span>
+<span class="o">//</span> <span class="n">As</span> <span
class="n">does</span> <span class="n">this</span><span class="p">:</span>
+<span class="n">PType</span><span class="o"><</span><span
class="n">Person</span><span class="o">></span> <span
class="n">personType2</span> <span class="p">=</span> <span
class="n">Avros</span><span class="p">.</span><span
class="n">containers</span><span class="p">(</span><span
class="n">Person</span><span class="p">.</span><span
class="n">class</span><span class="p">);</span>
+<span class="o">//</span> <span class="n">If</span> <span class="n">you</span>
<span class="n">only</span> <span class="n">have</span> <span
class="n">a</span> <span class="n">schema</span><span class="p">,</span> <span
class="n">you</span> <span class="n">can</span> <span class="n">create</span>
<span class="n">a</span> <span class="n">generic</span> <span
class="n">type</span><span class="p">,</span> <span class="n">like</span> <span
class="n">this</span><span class="p">:</span>
+<span class="n">org</span><span class="p">.</span><span
class="n">apache</span><span class="p">.</span><span class="n">avro</span><span
class="p">.</span><span class="n">Schema</span> <span class="n">schema</span>
<span class="p">=</span> <span class="p">...;</span>
+<span class="n">PType</span><span class="o"><</span><span
class="n">Record</span><span class="o">></span> <span
class="n">avroGenericType</span> <span class="p">=</span> <span
class="n">Avros</span><span class="p">.</span><span
class="n">generics</span><span class="p">(</span><span
class="n">schema</span><span class="p">);</span>
+</pre></div>
+
<p>The <a
href="apidocs/0.10.0/org/apache/crunch/types/avro/Avros.html">Avros</a> class
also has a <code>reflects</code> method for creating PTypes
for POJOs using Avro's reflection-based serialization mechanism. There are a
couple of restrictions on the structure of
the POJO:</p>
<ol>
<li>It must have a default, no-arg constructor.</li>
-<li>All of its fields must be Avro primitive types or collection types that
have Avro equivalents, like <code>ArrayList</code> and
-<code>HashMap<String, T></code>. You may also have arrays of Avro
primitive types.</li>
-</ol>
-<pre>
- // Declare an inline data type and use it for Crunch serialization
- public static class UrlData {
- // The fields don't have to be public, just doing this for the example.
- double curPageRank;
- String[] outboundUrls;
-
- // Remember: you must have a no-arg constructor.
- public UrlData() { this(0.0, new String[0]); }
-
- // The regular constructor
- public UrlData(double pageRank, String[] outboundUrls) {
- this.curPageRank = pageRank;
- this.outboundUrls = outboundUrls;
- }
+<li>
+<p>All of its fields must be Avro primitive types or collection types that
have Avro equivalents, like <code>ArrayList</code> and
+<code>HashMap<String, T></code>. You may also have arrays of Avro
primitive types.</p>
+<pre>// Declare an inline data type and use it for Crunch serialization
+public static class UrlData {
+ // The fields don't have to be public, just doing this for the example.
+ double curPageRank;
+ String[] outboundUrls;
+ // Remember: you must have a no-arg constructor.
+ public UrlData() { this(0.0, new String[0]); }
+ // The regular constructor
+ public UrlData(double pageRank, String[] outboundUrls) {
+ this.curPageRank = pageRank;
+ this.outboundUrls = outboundUrls;
}
-
- PType<UrlData> urlDataType = Avros.reflects(UrlData.class);
- PTableType<String, UrlData> pageRankType =
Avros.tableOf(Avros.strings(), urlDataType);
-</pre>
-
+}
+PType<UrlData> urlDataType = Avros.reflects(UrlData.class);
+PTableType<String, UrlData> pageRankType = Avros.tableOf(Avros.strings(),
urlDataType);</pre>
+</li>
+</ol>
<p>Avro reflection is a great way to define intermediate types for your Crunch
pipelines; not only is your logic clear
and easy to test, but the fact that the data is written out as Avro records
means that you can use tools like Hive and Pig
to query intermediate results to aid in debugging pipeline failures.</p>