Author: jwills
Date: Wed Oct 29 03:38:33 2014
New Revision: 1635033
URL: http://svn.apache.org/r1635033
Log:
Battling Markdown, Take 3
Modified:
crunch/site/trunk/content/user-guide.mdtext
Modified: crunch/site/trunk/content/user-guide.mdtext
URL:
http://svn.apache.org/viewvc/crunch/site/trunk/content/user-guide.mdtext?rev=1635033&r1=1635032&r2=1635033&view=diff
==============================================================================
--- crunch/site/trunk/content/user-guide.mdtext (original)
+++ crunch/site/trunk/content/user-guide.mdtext Wed Oct 29 03:38:33 2014
@@ -478,77 +478,73 @@ can be used to kick off a shuffle on the
}
</pre>
-If you find yourself in a situation where you have a PCollection<Pair<K,
V>> and you need a PTable<K, V>, the
+If you find yourself in a situation where you have a `PCollection<Pair<K, V>>`
and you need a `PTable<K, V>`, the
[PTables](apidocs/0.10.0/org/apache/crunch/lib/PTables.html) library class has
methods that will do the conversion for you.
Let's look at some more example PTypes created using the common primitive and
collection types. For most of your pipelines,
you will use one type family exclusively, and so you can cut down on some of
the boilerplate in your classes by importing
all of the methods from the `Writables` or `Avros` classes into your class:
-<pre>
-// Import all of the PType factory methods from Avros
-import static org.apache.crunch.types.avro.Avros.*;
-
-import org.apache.crunch.Pair;
-import org.apache.crunch.Tuple3;
-import org.apache.crunch.TupleN;
-
-import java.nio.ByteBuffer;
-import java.util.Collection;
-import java.util.Map;
-
-public class MyPipeline {
-
- // Common primitive types
- PType<Integer> intType = ints();
- PType<Long> longType = longs();
- PType<Double> doubleType = doubles();
- // Bytes are represented by java.nio.ByteBuffer
- PType<ByteBuffer> bytesType = bytes();
-
- // A PTableType: using tableOf will return a PTable instead of a
- // PCollection from a parallelDo call.
- PTableType<String, Boolean> tableType = tableOf(strings(), booleans());
-
- // Pair types:
- PType<Pair<String, Boolean>> pairType = pairs(strings(),
booleans());
- PType<Pair<String, Pair<Long, Long>> nestedPairType =
pairs(strings(), pairs(longs(), longs()));
-
- // A triple
- PType<Tuple3<Long, Float, Float>> tripType = trips(longs(),
floats(), floats());
- // An arbitrary length tuple-- note that we lose the generic type information
- PType<TupleN> tupleType = tupleN(ints(), ints(), floats(), strings(),
strings(), ints());
-
- // A Collection type
- PType<Collection<Long>> longsType = collections(longs());
- // A Map Type-- note that the keys are always strings, we only specify the
value.
- PType<Map<String, Boolean>> mapType = maps(booleans());
-
- // A Pair of collections
- PType<Pair<Collection<String>, Collection<Long>>>
pairColType = pairs(
- collections(strings()),
- collections(longs()));
-}
-</pre>
+ // Import all of the PType factory methods from Avros
+ import static org.apache.crunch.types.avro.Avros.*;
+
+ import org.apache.crunch.Pair;
+ import org.apache.crunch.Tuple3;
+ import org.apache.crunch.TupleN;
+
+ import java.nio.ByteBuffer;
+ import java.util.Collection;
+ import java.util.Map;
+
+ public class MyPipeline {
+
+ // Common primitive types
+ PType<Integer> intType = ints();
+ PType<Long> longType = longs();
+ PType<Double> doubleType = doubles();
+ // Bytes are represented by java.nio.ByteBuffer
+ PType<ByteBuffer> bytesType = bytes();
+
+ // A PTableType: using tableOf will return a PTable instead of a
+ // PCollection from a parallelDo call.
+ PTableType<String, Boolean> tableType = tableOf(strings(),
booleans());
+
+ // Pair types:
+ PType<Pair<String, Boolean>> pairType = pairs(strings(), booleans());
+ PType<Pair<String, Pair<Long, Long>>> nestedPairType =
pairs(strings(), pairs(longs(), longs()));
+
+ // A triple
+ PType<Tuple3<Long, Float, Float>> tripType = trips(longs(), floats(),
floats());
+ // An arbitrary length tuple-- note that we lose the generic type
information
+ PType<TupleN> tupleType = tupleN(ints(), ints(), floats(), strings(),
strings(), ints());
+
+ // A Collection type
+ PType<Collection<Long>> longsType = collections(longs());
+ // A Map Type-- note that the keys are always strings, we only
specify the value.
+ PType<Map<String, Boolean>> mapType = maps(booleans());
+
+ // A Pair of collections
+ PType<Pair<Collection<String>, Collection<Long>>> pairColType = pairs(
+ collections(strings()),
+ collections(longs()));
+ }
Both type families also have a method named `PType<T> records(Class<T> clazz)`
that can be used to create PTypes that support the common
record format for each type family. For the WritableTypeFamily, the records
method supports PTypes for implementations of the `Writable`
interface, and for the AvroTypeFamily, the records method supports PTypes for
implementations of Avro's `IndexedRecord` interface, which
includes both Avro generic and specific records:
-<pre>
- PType<FooWritable> fwType1 = Writables.records(FooWritable.class);
- // The more obvious "writables" method also works.
- PType<FooWritable> fwType = Writables.writables(FooWritable.class);
-
- // For a generated Avro class, this works:
- PType<Person> personType1 = Avros.records(Person.class);
- // As does this:
- PType<Person> personType2 = Avros.containers(Person.class);
- // If you only have a schema, you can create a generic type, like this:
- org.apache.avro.Schema schema = ...;
- PType<Record> avroGenericType = Avros.generics(schema);
-</pre>
+ PType<FooWritable> fwType1 = Writables.records(FooWritable.class);
+ // The more obvious "writables" method also works.
+ PType<FooWritable> fwType = Writables.writables(FooWritable.class);
+
+ // For a generated Avro class, this works:
+ PType<Person> personType1 = Avros.records(Person.class);
+ // As does this:
+ PType<Person> personType2 = Avros.containers(Person.class);
+ // If you only have a schema, you can create a generic type, like this:
+ org.apache.avro.Schema schema = ...;
+ PType<Record> avroGenericType = Avros.generics(schema);
The [Avros](apidocs/0.10.0/org/apache/crunch/types/avro/Avros.html) class also
has a `reflects` method for creating PTypes
for POJOs using Avro's reflection-based serialization mechanism. There are a
couple of restrictions on the structure of
@@ -558,26 +554,24 @@ the POJO:
2. All of its fields must be Avro primitive types or collection types that
have Avro equivalents, like `ArrayList` and
`HashMap<String, T>`. You may also have arrays of Avro primitive types.
-<pre>
- // Declare an inline data type and use it for Crunch serialization
- public static class UrlData {
- // The fields don't have to be public, just doing this for the example.
- double curPageRank;
- String[] outboundUrls;
-
- // Remember: you must have a no-arg constructor.
- public UrlData() { this(0.0, new String[0]); }
-
- // The regular constructor
- public UrlData(double pageRank, String[] outboundUrls) {
- this.curPageRank = pageRank;
- this.outboundUrls = outboundUrls;
- }
- }
-
- PType<UrlData> urlDataType = Avros.reflects(UrlData.class);
- PTableType<String, UrlData> pageRankType =
Avros.tableOf(Avros.strings(), urlDataType);
-</pre>
+ // Declare an inline data type and use it for Crunch serialization
+ public static class UrlData {
+ // The fields don't have to be public, just doing this for the
example.
+ double curPageRank;
+ String[] outboundUrls;
+
+ // Remember: you must have a no-arg constructor.
+ public UrlData() { this(0.0, new String[0]); }
+
+ // The regular constructor
+ public UrlData(double pageRank, String[] outboundUrls) {
+ this.curPageRank = pageRank;
+ this.outboundUrls = outboundUrls;
+ }
+ }
+
+ PType<UrlData> urlDataType = Avros.reflects(UrlData.class);
+ PTableType<String, UrlData> pageRankType =
Avros.tableOf(Avros.strings(), urlDataType);
Avro reflection is a great way to define intermediate types for your Crunch
pipelines; not only is your logic clear
and easy to test, but the fact that the data is written out as Avro records
means that you can use tools like Hive and Pig