Added: websites/staging/crunch/trunk/content/crunch/mailing-lists.html
==============================================================================
--- websites/staging/crunch/trunk/content/crunch/mailing-lists.html (added)
+++ websites/staging/crunch/trunk/content/crunch/mailing-lists.html Sun Sep 16 
18:50:04 2012
@@ -0,0 +1,174 @@
+<!DOCTYPE html>
+
+
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
+  <head>
+    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <meta http-equiv="Content-Language" content="en" />
+
+    <title>Apache Crunch - Mailing Lists</title>
+
+    <link rel="stylesheet" href="/crunch/css/bootstrap-2.1.0.min.css" />
+    <link rel="stylesheet" href="/crunch/css/crunch.css" type="text/css">
+    <script type="text/javascript" 
src="/crunch/js/bootstrap-2.1.0.min.js"></script>
+  </head>
+  <body>
+
+    <div class="navbar navbar-inverse navbar-static-top">
+      
+        <div class="container-fluid">
+
+          <a class="nav pull-right brand" href="http://incubator.apache.org">
+            <img src="http://incubator.apache.org/images/egg-logo.png" alt="Apache Incubator Logo" />
+          </a>
+
+        </div>
+      
+    </div>
+
+    <ul class="breadcrumb">
+      <li>
+        <a href="/">Incubator</a>
+       <span class="divider">&raquo;</span>
+      </li>
+      <li>
+        <a href="/crunch/">Crunch</a>
+      </li>
+      
+    </ul>
+
+    <div class="container-fluid">
+      <div class="row-fluid">
+
+        <!-- SIDEBAR AREA -->
+        <div class="span2">
+          <div class="sidebar-nav">
+            <ul class="nav nav-list">
+              
+                
+                  <li class="nav-header">Apache Crunch</li>
+                
+              
+                
+                  
+                    <li><a href="/crunch/index.html">Overview</a></li>
+                  
+                
+              
+                
+                  
+                    <li><a href="/crunch/apidocs/">API</a></li>
+                  
+                
+              
+                
+                  
+                    <li><a href="https://cwiki.apache.org/confluence/display/CRUNCH/">Wiki</a></li>
+                  
+                
+              
+                
+                  <li class="nav-header">Project</li>
+                
+              
+                
+                  
+                    <li><a href="/crunch/source-repository.html">Source 
Code</a></li>
+                  
+                
+              
+                
+                  
+                    <li><b>Mailing Lists</b></li>
+                  
+                
+              
+                
+                  
+                    <li><a href="http://issues.apache.org/jira/browse/CRUNCH">Issue Tracking</a></li>
+                  
+                
+              
+                
+                  
+                    <li><a href="http://apache.org/licenses/LICENSE-2.0.html">License</a></li>
+                  
+                
+              
+            </ul>
+          </div> <!-- /sidebar-nav -->
+        </div> <!-- /span -->
+
+        <!-- CONTENT AREA -->
+        <div class="span10">
+          <h1 class="title">
+            Mailing Lists
+            
+          </h1>
+
+          <!--
+  Markdown-generated tables don't have the proper CSS classes,
+  so we use plain HTML tables.
+-->
+
+<p>There are several mailing lists for Apache Crunch. To subscribe or 
unsubscribe
+to a list send mail to the respective administrative address given below. You
+will then receive a confirmation mail with further instructions.</p>
+<table class="table">
+  <thead>
+    <tr>
+      <th>Name</th>
+      <th>Subscribe</th>
+      <th>Unsubscribe</th>
+      <th>Post</th>
+      <th>Archive</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>User List</td>
+      <td><a href="mailto:crunch-user-subscribe@incubator.apache.org">Subscribe</a></td>
+      <td><a href="mailto:crunch-user-unsubscribe@incubator.apache.org">Unsubscribe</a></td>
+      <td><a href="mailto:crunch-user@incubator.apache.org">Post</a></td>
+      <td><a href="http://mail-archives.apache.org/mod_mbox/incubator-crunch-user/">mail-archives.apache.org</a></td>
+    </tr>
+    <tr>
+      <td>Developer List</td>
+      <td><a href="mailto:crunch-dev-subscribe@incubator.apache.org">Subscribe</a></td>
+      <td><a href="mailto:crunch-dev-unsubscribe@incubator.apache.org">Unsubscribe</a></td>
+      <td><a href="mailto:crunch-dev@incubator.apache.org">Post</a></td>
+      <td><a href="http://mail-archives.apache.org/mod_mbox/incubator-crunch-dev/">mail-archives.apache.org</a></td>
+    </tr>
+    <tr>
+      <td>Commits List</td>
+      <td><a href="mailto:crunch-commits-subscribe@incubator.apache.org">Subscribe</a></td>
+      <td><a href="mailto:crunch-commits-unsubscribe@incubator.apache.org">Unsubscribe</a></td>
+      <td><a href="mailto:crunch-commits@incubator.apache.org">Post</a></td>
+      <td><a href="http://mail-archives.apache.org/mod_mbox/incubator-crunch-commits/">mail-archives.apache.org</a></td>
+    </tr>
+  </tbody>
+</table>
+        </div> <!-- /span -->
+
+      </div> <!-- /row-fluid -->
+
+    </div>
+
+    <hr/>
+
+    <footer>
+      <div class="container-fluid">
+        <div class="row span12">Copyright &copy; 2012
+          <a href="http://www.apache.org/">The Apache Software Foundation</a>,
+          licensed under the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>.
+         <p><small>Apache Incubator, Apache Hadoop, Hadoop, Apache, and the
+         Apache feather logo are trademarks of The Apache Software Foundation.
+         Other names appearing on the site may be trademarks of their
+         respective owners.</small></p>
+        </div>
+      </div>
+    </footer>
+
+  </body>
+</html>

Added: websites/staging/crunch/trunk/content/crunch/pipelines.html
==============================================================================
--- websites/staging/crunch/trunk/content/crunch/pipelines.html (added)
+++ websites/staging/crunch/trunk/content/crunch/pipelines.html Sun Sep 16 
18:50:04 2012
@@ -0,0 +1,205 @@
+<!DOCTYPE html>
+
+
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
+  <head>
+    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <meta http-equiv="Content-Language" content="en" />
+
+    <title>Apache Crunch - Writing Your Own Pipelines</title>
+
+    <link rel="stylesheet" href="/crunch/css/bootstrap-2.1.0.min.css" />
+    <link rel="stylesheet" href="/crunch/css/crunch.css" type="text/css">
+    <script type="text/javascript" 
src="/crunch/js/bootstrap-2.1.0.min.js"></script>
+  </head>
+  <body>
+
+    <div class="navbar navbar-inverse navbar-static-top">
+      
+        <div class="container-fluid">
+
+          <a class="nav pull-right brand" href="http://incubator.apache.org">
+            <img src="http://incubator.apache.org/images/egg-logo.png" alt="Apache Incubator Logo" />
+          </a>
+
+        </div>
+      
+    </div>
+
+    <ul class="breadcrumb">
+      <li>
+        <a href="/">Incubator</a>
+       <span class="divider">&raquo;</span>
+      </li>
+      <li>
+        <a href="/crunch/">Crunch</a>
+      </li>
+      
+    </ul>
+
+    <div class="container-fluid">
+      <div class="row-fluid">
+
+        <!-- SIDEBAR AREA -->
+        <div class="span2">
+          <div class="sidebar-nav">
+            <ul class="nav nav-list">
+              
+                
+                  <li class="nav-header">Apache Crunch</li>
+                
+              
+                
+                  
+                    <li><a href="/crunch/index.html">Overview</a></li>
+                  
+                
+              
+                
+                  
+                    <li><a href="/crunch/apidocs/">API</a></li>
+                  
+                
+              
+                
+                  
+                    <li><a href="https://cwiki.apache.org/confluence/display/CRUNCH/">Wiki</a></li>
+                  
+                
+              
+                
+                  <li class="nav-header">Project</li>
+                
+              
+                
+                  
+                    <li><a href="/crunch/source-repository.html">Source 
Code</a></li>
+                  
+                
+              
+                
+                  
+                    <li><a href="/crunch/mailing-lists.html">Mailing 
Lists</a></li>
+                  
+                
+              
+                
+                  
+                    <li><a href="http://issues.apache.org/jira/browse/CRUNCH">Issue Tracking</a></li>
+                  
+                
+              
+                
+                  
+                    <li><a href="http://apache.org/licenses/LICENSE-2.0.html">License</a></li>
+                  
+                
+              
+            </ul>
+          </div> <!-- /sidebar-nav -->
+        </div> <!-- /span -->
+
+        <!-- CONTENT AREA -->
+        <div class="span10">
+          <h1 class="title">
+            Writing Your Own Pipelines
+            
+          </h1>
+
+          <p>This section discusses the different steps of creating your own 
Crunch pipelines in more detail.</p>
+<h2 id="writing-a-dofn">Writing a DoFn</h2>
+<p>The DoFn class is designed to keep the complexity of the MapReduce APIs out 
of your way when you
+don't need them while still keeping them accessible when you do.</p>
+<h3 id="serialization">Serialization</h3>
+<p>First, all DoFn instances are required to be 
<code>java.io.Serializable</code>. This is a key aspect of Crunch's design:
+once a particular DoFn is assigned to the Map or Reduce stage of a MapReduce 
job, all of the state
+of that DoFn is serialized so that it may be distributed to all of the nodes 
in the Hadoop cluster that
+will be running that task. There are two important implications of this for 
developers:</p>
+<ol>
+<li>All member values of a DoFn must be either serializable or marked as 
<code>transient</code>.</li>
+<li>All anonymous DoFn instances must be defined in a static method or in a 
class that is itself serializable.</li>
+</ol>
+<p>Because sometimes you will need to work with non-serializable objects 
inside of a DoFn, every DoFn provides an
+<code>initialize</code> method that is called before the <code>process</code> 
method is ever called so that any initialization tasks,
+such as creating a non-serializable member variable, can be performed before 
processing begins. Similarly, all
+DoFn instances have a <code>cleanup</code> method that may be called after 
processing has finished to perform any required
+cleanup tasks.</p>
+<h3 id="scale-factor">Scale Factor</h3>
+<p>The DoFn class defines a <code>scaleFactor</code> method that can be used 
to signal to the MapReduce compiler that a particular
+DoFn implementation will yield an output PCollection that is larger 
(scaleFactor &gt; 1) or smaller (0 &lt; scaleFactor &lt; 1)
+than the input PCollection it is applied to. The compiler may use this 
information to determine how to optimally
+split processing tasks between the Map and Reduce phases of dependent 
MapReduce jobs.</p>
+<h3 id="other-utilities">Other Utilities</h3>
+<p>The DoFn base class provides convenience methods for accessing the 
<code>Configuration</code> and <code>Counter</code> objects that
+are associated with a MapReduce stage, so that they may be accessed during 
initialization, processing, and cleanup.</p>
+<h3 id="performing-cogroups-and-joins">Performing Cogroups and Joins</h3>
+<p>In Crunch, cogroups and joins are performed on PTable instances that have 
the same key type. This section walks through
+the basic flow of a cogroup operation, explaining how this higher-level 
operation is composed of Crunch's four primitives.
+In general, these common operations are provided as part of the core Crunch 
library or in extensions, so you do not need
+to write them yourself. But it can be useful to understand how they work under 
the covers.</p>
+<p>Assume we have a <code>PTable&lt;K, U&gt;</code> named "a" and a different 
<code>PTable&lt;K, V&gt;</code> named "b" that we would like to combine into a
+single <code>PTable&lt;K, Pair&lt;Collection&lt;U&gt;, 
Collection&lt;V&gt;&gt;&gt;</code>. First, we need to apply parallelDo 
operations to a and b that
+convert them into the same Crunch type, <code>PTable&lt;K, Pair&lt;U, 
V&gt;&gt;</code>:</p>
+<div class="codehilite"><pre><span class="sr">//</span> <span 
class="n">Perform</span> <span class="n">the</span> <span 
class="s">&quot;tagging&quot;</span> <span class="n">operation</span> <span 
class="n">as</span> <span class="n">a</span> <span class="n">parallelDo</span> 
<span class="n">on</span> <span class="n">PTable</span> <span class="n">a</span>
+<span class="n">PTable</span><span class="o">&lt;</span><span 
class="n">K</span><span class="p">,</span> <span class="n">Pair</span><span 
class="o">&lt;</span><span class="n">U</span><span class="p">,</span> <span 
class="n">V</span><span class="o">&gt;&gt;</span> <span class="n">aPrime</span> 
<span class="o">=</span> <span class="n">a</span><span class="o">.</span><span 
class="n">parallelDo</span><span class="p">(</span><span 
class="s">&quot;taga&quot;</span><span class="p">,</span> <span 
class="k">new</span> <span class="n">MapFn</span><span 
class="o">&lt;</span><span class="n">Pair</span><span 
class="o">&lt;</span><span class="n">K</span><span class="p">,</span> <span 
class="n">U</span><span class="o">&gt;</span><span class="p">,</span> <span 
class="n">Pair</span><span class="o">&lt;</span><span class="n">K</span><span 
class="p">,</span> <span class="n">Pair</span><span class="o">&lt;</span><span 
class="n">U</span><span class="p">,</span> <span class="n">V</span><span 
class="o">&gt;&gt;&gt;</span><span class="p">()</span> <span class="p">{</span>
+  <span class="n">public</span> <span class="n">Pair</span><span 
class="o">&lt;</span><span class="n">K</span><span class="p">,</span> <span 
class="n">Pair</span><span class="o">&lt;</span><span class="n">U</span><span 
class="p">,</span> <span class="n">V</span><span class="o">&gt;&gt;</span> 
<span class="nb">map</span><span class="p">(</span><span 
class="n">Pair</span><span class="o">&lt;</span><span class="n">K</span><span 
class="p">,</span> <span class="n">U</span><span class="o">&gt;</span> <span 
class="n">input</span><span class="p">)</span> <span class="p">{</span>
+    <span class="k">return</span> <span class="n">Pair</span><span 
class="o">.</span><span class="n">of</span><span class="p">(</span><span 
class="n">input</span><span class="o">.</span><span class="n">first</span><span 
class="p">(),</span> <span class="n">Pair</span><span class="o">.</span><span 
class="n">of</span><span class="p">(</span><span class="n">input</span><span 
class="o">.</span><span class="n">second</span><span class="p">(),</span> <span 
class="n">null</span><span class="p">));</span>
+  <span class="p">}</span>
+<span class="p">},</span> <span class="n">tableOf</span><span 
class="p">(</span><span class="n">a</span><span class="o">.</span><span 
class="n">getKeyType</span><span class="p">(),</span> <span 
class="n">pair</span><span class="p">(</span><span class="n">a</span><span 
class="o">.</span><span class="n">getValueType</span><span class="p">(),</span> 
<span class="n">b</span><span class="o">.</span><span 
class="n">getValueType</span><span class="p">())));</span>
+
+<span class="sr">//</span> <span class="n">Perform</span> <span 
class="n">the</span> <span class="s">&quot;tagging&quot;</span> <span 
class="n">operation</span> <span class="n">as</span> <span class="n">a</span> 
<span class="n">parallelDo</span> <span class="n">on</span> <span 
class="n">PTable</span> <span class="n">b</span>
+<span class="n">PTable</span><span class="o">&lt;</span><span 
class="n">K</span><span class="p">,</span> <span class="n">Pair</span><span 
class="o">&lt;</span><span class="n">U</span><span class="p">,</span> <span 
class="n">V</span><span class="o">&gt;&gt;</span> <span class="n">bPrime</span> 
<span class="o">=</span> <span class="n">b</span><span class="o">.</span><span 
class="n">parallelDo</span><span class="p">(</span><span 
class="s">&quot;tagb&quot;</span><span class="p">,</span> <span 
class="k">new</span> <span class="n">MapFn</span><span 
class="o">&lt;</span><span class="n">Pair</span><span 
class="o">&lt;</span><span class="n">K</span><span class="p">,</span> <span 
class="n">V</span><span class="o">&gt;</span><span class="p">,</span> <span 
class="n">Pair</span><span class="o">&lt;</span><span class="n">K</span><span 
class="p">,</span> <span class="n">Pair</span><span class="o">&lt;</span><span 
class="n">U</span><span class="p">,</span> <span class="n">V</span><span 
class="o">&gt;&gt;&gt;</span><span class="p">()</span> <span class="p">{</span>
+  <span class="n">public</span> <span class="n">Pair</span><span 
class="o">&lt;</span><span class="n">K</span><span class="p">,</span> <span 
class="n">Pair</span><span class="o">&lt;</span><span class="n">U</span><span 
class="p">,</span> <span class="n">V</span><span class="o">&gt;&gt;</span> 
<span class="nb">map</span><span class="p">(</span><span 
class="n">Pair</span><span class="o">&lt;</span><span class="n">K</span><span 
class="p">,</span> <span class="n">V</span><span class="o">&gt;</span> <span 
class="n">input</span><span class="p">)</span> <span class="p">{</span>
+    <span class="k">return</span> <span class="n">Pair</span><span 
class="o">.</span><span class="n">of</span><span class="p">(</span><span 
class="n">input</span><span class="o">.</span><span class="n">first</span><span 
class="p">(),</span> <span class="n">Pair</span><span class="o">.</span><span 
class="n">of</span><span class="p">(</span><span class="n">null</span><span 
class="p">,</span> <span class="n">input</span><span class="o">.</span><span 
class="n">second</span><span class="p">()));</span>
+  <span class="p">}</span>
+<span class="p">},</span> <span class="n">tableOf</span><span 
class="p">(</span><span class="n">a</span><span class="o">.</span><span 
class="n">getKeyType</span><span class="p">(),</span> <span 
class="n">pair</span><span class="p">(</span><span class="n">a</span><span 
class="o">.</span><span class="n">getValueType</span><span class="p">(),</span> 
<span class="n">b</span><span class="o">.</span><span 
class="n">getValueType</span><span class="p">())));</span>
+</pre></div>
+
+
+<p>Once the input PTables are tagged into a single type, we can apply the 
union operation to create a single PTable
+reference that includes both of the tagged PTables and then group the unioned 
PTable by the common key:</p>
+<div class="codehilite"><pre><span class="n">PTable</span><span 
class="o">&lt;</span><span class="n">K</span><span class="p">,</span> <span 
class="n">Pair</span><span class="o">&lt;</span><span class="n">U</span><span 
class="p">,</span> <span class="n">V</span><span class="o">&gt;&gt;</span> 
<span class="n">both</span> <span class="o">=</span> <span 
class="n">aPrime</span><span class="o">.</span><span 
class="n">union</span><span class="p">(</span><span 
class="n">bPrime</span><span class="p">);</span>
+<span class="n">PGroupedTable</span><span class="o">&lt;</span><span 
class="n">K</span><span class="p">,</span> <span class="n">Pair</span><span 
class="o">&lt;</span><span class="n">U</span><span class="p">,</span> <span 
class="n">V</span><span class="o">&gt;&gt;</span> <span 
class="n">grouped</span> <span class="o">=</span> <span 
class="n">both</span><span class="o">.</span><span 
class="n">groupByKey</span><span class="p">();</span>
+</pre></div>
+
+
+<p>The grouping operation will create an <code>Iterable&lt;Pair&lt;U, 
V&gt;&gt;</code> which we can then convert to a 
<code>Pair&lt;Collection&lt;U&gt;, Collection&lt;V&gt;&gt;</code>:</p>
+<div class="codehilite"><pre><span class="n">grouped</span><span 
class="o">.</span><span class="n">parallelDo</span><span 
class="p">(</span><span class="s">&quot;cogroup&quot;</span><span 
class="p">,</span> <span class="k">new</span> <span class="n">MapFn</span><span 
class="o">&lt;</span><span class="n">Pair</span><span 
class="o">&lt;</span><span class="n">K</span><span class="p">,</span> <span 
class="n">Iterable</span><span class="o">&lt;</span><span 
class="n">Pair</span><span class="o">&lt;</span><span class="n">U</span><span 
class="p">,</span> <span class="n">V</span><span 
class="o">&gt;&gt;&gt;</span><span class="p">,</span> <span 
class="n">Pair</span><span class="o">&lt;</span><span class="n">K</span><span 
class="p">,</span> <span class="n">Pair</span><span 
class="sr">&lt;Collection&lt;U&gt;</span><span class="p">,</span> <span 
class="n">Collection</span><span class="sr">&lt;V&gt;</span><span 
class="o">&gt;&gt;&gt;</span><span class="p">()</span> <span class="p">{</span>
+  <span class="n">public</span> <span class="n">Pair</span><span 
class="o">&lt;</span><span class="n">K</span><span class="p">,</span> <span 
class="n">Pair</span><span class="sr">&lt;Collection&lt;U&gt;</span><span 
class="p">,</span> <span class="n">Collection</span><span 
class="sr">&lt;V&gt;</span><span class="o">&gt;&gt;</span> <span 
class="nb">map</span><span class="p">(</span><span class="n">Pair</span><span 
class="o">&lt;</span><span class="n">K</span><span class="p">,</span> <span 
class="n">Iterable</span><span class="o">&lt;</span><span 
class="n">Pair</span><span class="o">&lt;</span><span class="n">U</span><span 
class="p">,</span> <span class="n">V</span><span class="o">&gt;&gt;&gt;</span> 
<span class="n">input</span><span class="p">)</span> <span class="p">{</span>
+    <span class="n">Collection</span><span class="sr">&lt;U&gt;</span> <span 
class="n">uValues</span> <span class="o">=</span> <span class="k">new</span> 
<span class="n">ArrayList</span><span class="sr">&lt;U&gt;</span><span 
class="p">();</span>
+    <span class="n">Collection</span><span class="sr">&lt;V&gt;</span> <span 
class="n">vValues</span> <span class="o">=</span> <span class="k">new</span> 
<span class="n">ArrayList</span><span class="sr">&lt;V&gt;</span><span 
class="p">();</span>
+    <span class="k">for</span> <span class="p">(</span><span 
class="n">Pair</span><span class="o">&lt;</span><span class="n">U</span><span 
class="p">,</span> <span class="n">V</span><span class="o">&gt;</span> <span 
class="n">pair</span> <span class="p">:</span> <span 
class="n">input</span><span class="o">.</span><span 
class="n">second</span><span class="p">())</span> <span class="p">{</span>
+      <span class="k">if</span> <span class="p">(</span><span 
class="n">pair</span><span class="o">.</span><span class="n">first</span><span 
class="p">()</span> <span class="o">!=</span> <span class="n">null</span><span 
class="p">)</span> <span class="p">{</span>
+        <span class="n">uValues</span><span class="o">.</span><span 
class="n">add</span><span class="p">(</span><span class="n">pair</span><span 
class="o">.</span><span class="n">first</span><span class="p">());</span>
+      <span class="p">}</span> <span class="k">else</span> <span 
class="p">{</span>
+        <span class="n">vValues</span><span class="o">.</span><span 
class="n">add</span><span class="p">(</span><span class="n">pair</span><span 
class="o">.</span><span class="n">second</span><span class="p">());</span>
+      <span class="p">}</span>
+    <span class="p">}</span>
+    <span class="k">return</span> <span class="n">Pair</span><span 
class="o">.</span><span class="n">of</span><span class="p">(</span><span 
class="n">input</span><span class="o">.</span><span class="n">first</span><span 
class="p">(),</span> <span class="n">Pair</span><span class="o">.</span><span 
class="n">of</span><span class="p">(</span><span class="n">uValues</span><span 
class="p">,</span> <span class="n">vValues</span><span class="p">));</span>
+  <span class="p">}</span>
+<span class="p">},</span> <span class="n">tableOf</span><span 
class="p">(</span><span class="n">grouped</span><span class="o">.</span><span 
class="n">getKeyType</span><span class="p">(),</span> <span 
class="n">pair</span><span class="p">(</span><span 
class="n">collections</span><span class="p">(</span><span 
class="n">a</span><span class="o">.</span><span 
class="n">getValueType</span><span class="p">()),</span> <span 
class="n">collections</span><span class="p">(</span><span 
class="n">b</span><span class="o">.</span><span 
class="n">getValueType</span><span class="p">()))));</span>
+</pre></div>
+        </div> <!-- /span -->
+
+      </div> <!-- /row-fluid -->
+
+    </div>
+
+    <hr/>
+
+    <footer>
+      <div class="container-fluid">
+        <div class="row span12">Copyright &copy; 2012
+          <a href="http://www.apache.org/">The Apache Software Foundation</a>,
+          licensed under the <a href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>.
+         <p><small>Apache Incubator, Apache Hadoop, Hadoop, Apache, and the
+         Apache feather logo are trademarks of The Apache Software Foundation.
+         Other names appearing on the site may be trademarks of their
+         respective owners.</small></p>
+        </div>
+      </div>
+    </footer>
+
+  </body>
+</html>

Added: websites/staging/crunch/trunk/content/crunch/scrunch.html
==============================================================================
--- websites/staging/crunch/trunk/content/crunch/scrunch.html (added)
+++ websites/staging/crunch/trunk/content/crunch/scrunch.html Sun Sep 16 
18:50:04 2012
@@ -0,0 +1,198 @@
+<!DOCTYPE html>
+
+
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
+  <head>
+    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <meta http-equiv="Content-Language" content="en" />
+
+    <title>Apache Crunch - Scrunch</title>
+
+    <link rel="stylesheet" href="/crunch/css/bootstrap-2.1.0.min.css" />
+    <link rel="stylesheet" href="/crunch/css/crunch.css" type="text/css">
+    <script type="text/javascript" 
src="/crunch/js/bootstrap-2.1.0.min.js"></script>
+  </head>
+  <body>
+
+    <div class="navbar navbar-inverse navbar-static-top">
+      
+        <div class="container-fluid">
+
+          <a class="nav pull-right brand" href="http://incubator.apache.org">
+            <img src="http://incubator.apache.org/images/egg-logo.png" alt="Apache Incubator Logo" />
+          </a>
+
+        </div>
+      
+    </div>
+
+    <ul class="breadcrumb">
+      <li>
+        <a href="/">Incubator</a>
+       <span class="divider">&raquo;</span>
+      </li>
+      <li>
+        <a href="/crunch/">Crunch</a>
+      </li>
+      
+    </ul>
+
+    <div class="container-fluid">
+      <div class="row-fluid">
+
+        <!-- SIDEBAR AREA -->
+        <div class="span2">
+          <div class="sidebar-nav">
+            <ul class="nav nav-list">
+              
+                
+                  <li class="nav-header">Apache Crunch</li>
+                
+              
+                
+                  
+                    <li><a href="/crunch/index.html">Overview</a></li>
+                  
+                
+              
+                
+                  
+                    <li><a href="/crunch/apidocs/">API</a></li>
+                  
+                
+              
+                
+                  
+                    <li><a href="https://cwiki.apache.org/confluence/display/CRUNCH/">Wiki</a></li>
+                  
+                
+              
+                
+                  <li class="nav-header">Project</li>
+                
+              
+                
+                  
+                    <li><a href="/crunch/source-repository.html">Source 
Code</a></li>
+                  
+                
+              
+                
+                  
+                    <li><a href="/crunch/mailing-lists.html">Mailing 
Lists</a></li>
+                  
+                
+              
+                
+                  
+                    <li><a href="http://issues.apache.org/jira/browse/CRUNCH">Issue Tracking</a></li>
+                  
+                
+              
+                
+                  
+                    <li><a href="http://apache.org/licenses/LICENSE-2.0.html">License</a></li>
+                  
+                
+              
+            </ul>
+          </div> <!-- /sidebar-nav -->
+        </div> <!-- /span -->
+
+        <!-- CONTENT AREA -->
+        <div class="span10">
+          <h1 class="title">
+            Scrunch
+            
+              <small>A Scala Wrapper for Apache Crunch</small>
+            
+          </h1>
+
+          <h2 id="introduction">Introduction</h2>
+<p>Scrunch is an experimental Scala wrapper for Crunch, based on the same 
ideas as the
+<a href="http://days2011.scala-lang.org/node/138/282">Cascade</a> project at 
Google, which created
+a Scala wrapper for FlumeJava.</p>
+<h2 id="why-scala">Why Scala?</h2>
+<p>In many ways, Scala is the perfect language for writing Crunch pipelines. 
Scala supports
+a mixture of functional and object-oriented programming styles and has 
powerful type-inference
+capabilities, allowing us to create complex pipelines using very few 
keystrokes. Here is
+the Scrunch analogue of the classic WordCount problem:</p>
+<div class="codehilite"><pre><span class="nb">import</span> <span 
class="n">org</span><span class="o">.</span><span class="n">apache</span><span 
class="o">.</span><span class="n">crunch</span><span class="o">.</span><span 
class="n">io</span><span class="o">.</span><span class="p">{</span><span 
class="n">From</span> <span class="o">=&gt;</span> <span 
class="n">from</span><span class="p">}</span>
+<span class="nb">import</span> <span class="n">org</span><span 
class="o">.</span><span class="n">apache</span><span class="o">.</span><span 
class="n">crunch</span><span class="o">.</span><span 
class="n">scrunch</span><span class="o">.</span><span class="n">_</span>
+<span class="nb">import</span> <span class="n">org</span><span 
class="o">.</span><span class="n">apache</span><span class="o">.</span><span 
class="n">crunch</span><span class="o">.</span><span 
class="n">scrunch</span><span class="o">.</span><span 
class="n">Conversions</span><span class="o">.</span><span 
class="n">_</span>  <span class="c1">// For implicit type conversions</span>
+
+<span class="n">class</span> <span class="n">WordCountExample</span> <span 
class="p">{</span>
+  <span class="n">val</span> <span class="n">pipeline</span> <span 
class="o">=</span> <span class="k">new</span> <span 
class="n">Pipeline</span><span class="p">[</span><span 
class="n">WordCountExample</span><span class="p">]</span>
+
+  <span class="n">def</span> <span class="n">wordCount</span><span 
class="p">(</span><span class="n">fileName:</span> <span 
class="n">String</span><span class="p">)</span> <span class="o">=</span> <span 
class="p">{</span>
+    <span class="n">pipeline</span><span class="o">.</span><span 
class="nb">read</span><span class="p">(</span><span class="n">from</span><span 
class="o">.</span><span class="n">textFile</span><span class="p">(</span><span 
class="n">fileName</span><span class="p">))</span>
+      <span class="o">.</span><span class="n">flatMap</span><span 
class="p">(</span><span class="n">_</span><span class="o">.</span><span 
class="n">toLowerCase</span><span class="o">.</span><span 
class="nb">split</span><span class="p">(</span><span 
class="s">&quot;\\W+&quot;</span><span class="p">))</span>
+      <span class="o">.</span><span class="n">filter</span><span 
class="p">(</span><span class="o">!</span><span class="n">_</span><span 
class="o">.</span><span class="n">isEmpty</span><span class="p">())</span>
+      <span class="o">.</span><span class="n">count</span>
+  <span class="p">}</span>
+<span class="p">}</span>
+</pre></div>
+
+
+<p>The Scala compiler can infer the return type of the flatMap function as an 
Array[String], and
+the Scrunch wrapper uses the type inference mechanism to figure out how to 
serialize the
+data between the Map and Reduce stages. Here's a slightly more complex 
example, in which we
+get the word counts for two different files and compute the deltas of how 
often different
+words occur, and then return only the words where the first file had more 
occurrences than
+the second:</p>
+<div class="codehilite"><pre><span class="n">class</span> <span 
class="n">WordCountExample</span> <span class="p">{</span>
+  <span class="n">def</span> <span class="n">wordGt</span><span 
class="p">(</span><span class="n">firstFile:</span> <span 
class="n">String</span><span class="p">,</span> <span 
class="n">secondFile:</span> <span class="n">String</span><span 
class="p">)</span> <span class="o">=</span> <span class="p">{</span>
+    <span class="n">wordCount</span><span class="p">(</span><span 
class="n">firstFile</span><span class="p">)</span><span class="o">.</span><span 
class="n">cogroup</span><span class="p">(</span><span 
class="n">wordCount</span><span class="p">(</span><span 
class="n">secondFile</span><span class="p">))</span>
+      <span class="o">.</span><span class="nb">map</span><span 
class="p">((</span><span class="n">k</span><span class="p">,</span> <span 
class="n">v</span><span class="p">)</span> <span class="o">=&gt;</span> <span 
class="p">(</span><span class="n">k</span><span class="p">,</span> <span 
class="p">(</span><span class="n">v</span><span class="o">.</span><span 
class="n">_1</span><span class="o">.</span><span class="n">sum</span> <span 
class="o">-</span> <span class="n">v</span><span class="o">.</span><span 
class="n">_2</span><span class="o">.</span><span class="n">sum</span><span 
class="p">)))</span>
+      <span class="o">.</span><span class="n">filter</span><span 
class="p">((</span><span class="n">k</span><span class="p">,</span> <span 
class="n">v</span><span class="p">)</span> <span class="o">=&gt;</span> <span 
class="n">v</span> <span class="o">&gt;</span> <span class="mi">0</span><span 
class="p">)</span><span class="o">.</span><span class="nb">map</span><span 
class="p">((</span><span class="n">k</span><span class="p">,</span> <span 
class="n">v</span><span class="p">)</span> <span class="o">=&gt;</span> <span 
class="n">k</span><span class="p">)</span>
+  <span class="p">}</span>
+<span class="p">}</span>
+</pre></div>
+
+
+<p>Note that all of the functions are using Scala Tuples, not Crunch Tuples. 
Under the covers,
+Scrunch uses Scala's implicit type conversion mechanism to transparently 
convert data from the
+Crunch format to the Scala format and back again.</p>
+<h2 id="materializing-job-outputs">Materializing Job Outputs</h2>
+<p>Scrunch also incorporates Crunch's materialize functionality, which allows 
us to easily read
+the output of a Crunch pipeline into the client:</p>
+<div class="codehilite"><pre><span class="n">class</span> <span 
class="n">WordCountExample</span> <span class="p">{</span>
+  <span class="n">def</span> <span class="n">hasHamlet</span> <span 
class="o">=</span> <span class="n">wordGt</span><span class="p">(</span><span 
class="s">&quot;shakespeare.txt&quot;</span><span class="p">,</span> <span 
class="s">&quot;maugham.txt&quot;</span><span class="p">)</span><span 
class="o">.</span><span class="n">materialize</span><span 
class="o">.</span><span class="nb">exists</span><span class="p">(</span><span 
class="n">_</span> <span class="o">==</span> <span 
class="s">&quot;hamlet&quot;</span><span class="p">)</span>
+<span class="p">}</span>
+</pre></div>
+
+
+<h2 id="notes-and-thanks">Notes and Thanks</h2>
+<p>Scrunch is alpha-quality code, written by someone who was learning Scala on 
the fly. There will be bugs,
+rough edges, and non-idiomatic Scala usage all over the place. This will 
improve with time, and we welcome
+contributions from Scala experts who are interested in helping us make Scrunch 
into a first-class project.</p>
+<p>Scrunch emerged out of conversations with <a 
href="http://twitter.com/#!/squarecog">Dmitriy Ryaboy</a>,
+<a href="http://twitter.com/#!/posco">Oscar Boykin</a>, and <a 
href="http://twitter.com/#!/avibryant">Avi Bryant</a> from Twitter.
+Many thanks to them for their feedback, guidance, and encouragement. We are 
also grateful to
+<a href="http://twitter.com/#!/matei_zaharia">Matei Zaharia</a>, whose <a 
href="http://www.spark-project.org/">Spark Project</a>
+inspired much of our implementation and was kind enough to loan us the 
ClosureCleaner implementation
+Spark developed for use in Scrunch.</p>
+        </div> <!-- /span -->
+
+      </div> <!-- /row-fluid -->
+
+    </div>
+
+    <hr/>
+
+    <footer>
+      <div class="container-fluid">
+        <div class="row span12">Copyright &copy; 2012
+          <a href="http://www.apache.org/">The Apache Software Foundation</a>,
+          licensed under the <a 
href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 
2.0</a>.
+         <p><small>Apache Incubator, Apache Hadoop, Hadoop, Apache, and the
+         Apache feather logo are trademarks of The Apache Software Foundation.
+         Other names appearing on the site may be trademarks of their
+         respective owners.</small></p>
+        </div>
+      </div>
+    </footer>
+
+  </body>
+</html>

Added: websites/staging/crunch/trunk/content/crunch/source-repository.html
==============================================================================
--- websites/staging/crunch/trunk/content/crunch/source-repository.html (added)
+++ websites/staging/crunch/trunk/content/crunch/source-repository.html Sun Sep 
16 18:50:04 2012
@@ -0,0 +1,139 @@
+<!DOCTYPE html>
+
+
+<html xmlns="http://www.w3.org/1999/xhtml" lang="en">
+  <head>
+    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <meta http-equiv="Content-Language" content="en" />
+
+    <title>Apache Crunch - Getting the Source Code</title>
+
+    <link rel="stylesheet" href="/crunch/css/bootstrap-2.1.0.min.css" />
+    <link rel="stylesheet" href="/crunch/css/crunch.css" type="text/css">
+    <script type="text/javascript" 
src="/crunch/js/bootstrap-2.1.0.min.js"></script>
+  </head>
+  <body>
+
+    <div class="navbar navbar-inverse navbar-static-top">
+      
+        <div class="container-fluid">
+
+          <a class="nav pull-right brand" href="http://incubator.apache.org">
+            <img src="http://incubator.apache.org/images/egg-logo.png" 
alt="Apache Incubator Logo" />
+          </a>
+
+        </div>
+      
+    </div>
+
+    <ul class="breadcrumb">
+      <li>
+        <a href="/">Incubator</a>
+       <span class="divider">&raquo;</span>
+      </li>
+      <li>
+        <a href="/crunch/">Crunch</a>
+      </li>
+      
+    </ul>
+
+    <div class="container-fluid">
+      <div class="row-fluid">
+
+        <!-- SIDEBAR AREA -->
+        <div class="span2">
+          <div class="sidebar-nav">
+            <ul class="nav nav-list">
+              
+                
+                  <li class="nav-header">Apache Crunch</li>
+                
+              
+                
+                  
+                    <li><a href="/crunch/index.html">Overview</a></li>
+                  
+                
+              
+                
+                  
+                    <li><a href="/crunch/apidocs/">API</a></li>
+                  
+                
+              
+                
+                  
+                    <li><a 
href="https://cwiki.apache.org/confluence/display/CRUNCH/">Wiki</a></li>
+                  
+                
+              
+                
+                  <li class="nav-header">Project</li>
+                
+              
+                
+                  
+                    <li><b>Source Code</b></li>
+                  
+                
+              
+                
+                  
+                    <li><a href="/crunch/mailing-lists.html">Mailing 
Lists</a></li>
+                  
+                
+              
+                
+                  
+                    <li><a 
href="http://issues.apache.org/jira/browse/CRUNCH">Issue Tracking</a></li>
+                  
+                
+              
+                
+                  
+                    <li><a 
href="http://apache.org/licenses/LICENSE-2.0.html">License</a></li>
+                  
+                
+              
+            </ul>
+          </div> <!-- /sidebar-nav -->
+        </div> <!-- /span -->
+
+        <!-- CONTENT AREA -->
+        <div class="span10">
+          <h1 class="title">
+            Getting the Source Code
+            
+          </h1>
+
+          <p>Apache Crunch uses <a href="http://git-scm.com/">Git</a> for 
version control. Run the
+following command to clone the repository:</p>
+<div class="codehilite"><pre><span class="n">git</span> <span 
class="n">clone</span> <span class="n">https:</span><span 
class="sr">//gi</span><span class="n">t</span><span class="o">-</span><span 
class="n">wip</span><span class="o">-</span><span class="n">us</span><span 
class="o">.</span><span class="n">apache</span><span class="o">.</span><span 
class="n">org</span><span class="sr">/repos/</span><span 
class="n">asf</span><span class="o">/</span><span 
class="n">incubator</span><span class="o">-</span><span 
class="n">crunch</span><span class="o">.</span><span class="n">git</span>
+</pre></div>
+
+
+<p>There is also a <a 
href="https://git-wip-us.apache.org/repos/asf/incubator-crunch.git">Web UI</a> 
to browse the repository online.</p>
+        </div> <!-- /span -->
+
+      </div> <!-- /row-fluid -->
+
+    </div>
+
+    <hr/>
+
+    <footer>
+      <div class="container-fluid">
+        <div class="row span12">Copyright &copy; 2012
+          <a href="http://www.apache.org/">The Apache Software Foundation</a>,
+          licensed under the <a 
href="http://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 
2.0</a>.
+         <p><small>Apache Incubator, Apache Hadoop, Hadoop, Apache, and the
+         Apache feather logo are trademarks of The Apache Software Foundation.
+         Other names appearing on the site may be trademarks of their
+         respective owners.</small></p>
+        </div>
+      </div>
+    </footer>
+
+  </body>
+</html>


Reply via email to