documentation...

yanfang Fri, 05 Dec 2014 10:55:02 -0800

Added: 
incubator/samza/site/learn/documentation/0.8/comparisons/introduction.html
URL: 
http://svn.apache.org/viewvc/incubator/samza/site/learn/documentation/0.8/comparisons/introduction.html?rev=1643389&view=auto
==============================================================================
--- incubator/samza/site/learn/documentation/0.8/comparisons/introduction.html 
(added)
+++ incubator/samza/site/learn/documentation/0.8/comparisons/introduction.html 
Fri Dec  5 18:52:32 2014
@@ -0,0 +1,252 @@
+<!DOCTYPE html>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+    <title>Samza - Comparison Introduction</title>
+    <link href='/css/ropa-sans.css' rel='stylesheet' type='text/css'/>
+    <link href="/css/bootstrap.min.css" rel="stylesheet"/>
+    <link href="/css/font-awesome.min.css" rel="stylesheet"/>
+    <link href="/css/main.css" rel="stylesheet"/>
+    <link href="/css/syntax.css" rel="stylesheet"/>
+    <link rel="icon" type="image/png" href="/img/samza-icon.png">
+    <script src="/js/jquery-1.11.1.min.js"></script>
+  </head>
+  <body>
+    <div class="wrapper">
+      <div class="wrapper-content">
+
+        <div class="masthead">
+          <div class="container">
+            <div class="masthead-logo">
+              <a href="/" class="logo">samza</a>
+            </div>
+            <div class="masthead-icons">
+              <div class="pull-right">
+                <a href="/startup/download"><i class="fa 
fa-arrow-circle-o-down masthead-icon"></i></a>
+                <a 
href="https://git-wip-us.apache.org/repos/asf?p=incubator-samza.git;a=tree" 
target="_blank"><i class="fa fa-code masthead-icon" style="font-weight: 
bold;"></i></a>
+                <a href="https://twitter.com/samzastream" target="_blank"><i 
class="fa fa-twitter masthead-icon"></i></a>
+                <!-- this icon only shows in versioned pages -->
+                
+                  
+                    
+                  
+                  <a 
href="http://samza.incubator.apache.org/learn/documentation/latest/comparisons/introduction.html"><i
 id="switch-version-button"></i></a>
+                   <!-- links for the navigation bar -->
+                
+
+              </div>
+            </div>
+          </div><!-- /.container -->
+        </div>
+
+        <div class="container">
+          <div class="menu">
+            <h1><i class="fa fa-rocket"></i> Getting Started</h1>
+            <ul>
+              <li><a href="/startup/hello-samza/0.8">Hello Samza</a></li>
+              <li><a href="/startup/download">Download</a></li>
+            </ul>
+
+            <h1><i class="fa fa-book"></i> Learn</h1>
+            <ul>
+              <li><a href="/learn/documentation/0.8">Documentation</a></li>
+              <li><a 
href="/learn/documentation/0.8/jobs/configuration-table.html">Configuration</a></li>
+              <li><a 
href="/learn/documentation/0.8/api/javadocs/">Javadocs</a></li>
+              <li><a href="/learn/tutorials/0.8">Tutorials</a></li>
+              <li><a href="http://wiki.apache.org/samza/FAQ">FAQ</a></li>
+              <li><a href="http://wiki.apache.org/samza">Wiki</a></li>
+              <li><a href="http://wiki.apache.org/samza/PapersAndTalks">Papers 
&amp; Talks</a></li>
+              <li><a href="http://blogs.apache.org/samza">Blog</a></li>
+            </ul>
+
+            <h1><i class="fa fa-comments"></i> Community</h1>
+            <ul>
+              <li><a href="/community/mailing-lists.html">Mailing 
Lists</a></li>
+              <li><a href="/community/irc.html">IRC</a></li>
+              <li><a 
href="https://issues.apache.org/jira/browse/SAMZA">Bugs</a></li>
+              <li><a href="http://wiki.apache.org/samza/PoweredBy">Powered 
by</a></li>
+              <li><a 
href="http://wiki.apache.org/samza/Ecosystem">Ecosystem</a></li>
+              <li><a href="/community/committers.html">Committers</a></li>
+            </ul>
+
+            <h1><i class="fa fa-code"></i> Contribute</h1>
+            <ul>
+              <li><a href="/contribute/rules.html">Rules</a></li>
+              <li><a href="/contribute/coding-guide.html">Coding Guide</a></li>
+              <li><a href="/contribute/projects.html">Projects</a></li>
+              <li><a href="/contribute/design-documents.html">Design 
Documents</a></li>
+              <li><a href="/contribute/code.html">Code</a></li>
+              <li><a href="https://reviews.apache.org/groups/samza">Review 
Board</a></li>
+              <li><a href="/contribute/tests.html">Tests</a></li>
+              <li><a href="/contribute/disclaimer.html">Disclaimer</a></li>
+            </ul>
+
+            <h1><i class="fa fa-history"></i> Archive</h1>
+            <ul>
+              <li><a href="/archive/index.html#latest">latest</a></li>
+              <li><a href="/archive/index.html#08">0.8</a></li>
+              <li><a href="/archive/index.html#07">0.7</a></li>
+            </ul>
+          </div>
+
+          <div class="content">
+            <!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<h2>Comparison Introduction</h2>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<p>Here are a few of the high-level design decisions that we think make Samza 
a bit different from other stream processing projects.</p>
+
+<h3 id="the-stream-model">The Stream Model</h3>
+
+<p>Streams are the input and output to Samza jobs. Samza has a very strong 
model of a stream&mdash;it is more than just a simple message exchange 
mechanism. A stream in Samza is a partitioned, ordered-per-partition, 
replayable, multi-subscriber, lossless sequence of messages. Streams are not 
just inputs and outputs to the system, but also buffers that isolate processing 
stages from each other.</p>
+
+<p>This stronger model requires persistence, fault-tolerance, and buffering in 
the stream implementation, but it has several benefits.</p>
+
+<p>First, delays in a downstream processing stage cannot block an upstream 
stage. A Samza job can stop consuming for a few minutes, or even a few hours 
(perhaps because of a bad deploy, or a long-running computation) without having 
any effect on upstream jobs. This makes Samza suitable for large deployments, 
such as processing all data flows in a large company: isolation between jobs is 
essential when they are written, owned, and run by different teams in different 
code bases with varying SLAs.</p>
+
+<p>This is motivated by our experience building analogous offline processing 
pipelines in Hadoop. In Hadoop the processing stages are MapReduce jobs, and 
the output of a processing stage is a directory of files on HDFS. The input to 
the next processing stage is simply the files produced by the earlier stage. We 
have found that this strong isolation between stages makes it possible to have 
hundreds of loosely coupled jobs, maintained by different teams, that comprise 
an offline processing ecosystem. Our goal is to replicate this kind of rich 
ecosystem in a near-real-time setting.</p>
+
+<p>The second benefit of this stronger model is that all stages are 
multi-subscriber. In practical terms this means that if one person adds a set 
of processing flows that create output data streams, others can see the output, 
consume it, and build on it, without introducing any coupling of code between 
the jobs. As a happy side-effect, this makes debugging flows easy, as you can 
manually inspect the output of any stage.</p>
+
+<p>Finally, this strong stream model greatly simplifies the implementation of 
features in the Samza framework. Each job need only be concerned with its own 
inputs and outputs, and in the case of a fault, each job can be recovered and 
restarted independently. There is no need for central control over the entire 
dataflow graph.</p>
+
+<p>The tradeoff we need to make for this stronger stream model is that 
messages are written to disk. We are willing to make this tradeoff because 
MapReduce and HDFS have shown that durable storage can offer very high read and 
write throughput, and almost limitless disk space. This observation is the 
foundation of Kafka, which allows hundreds of MB/sec of replicated throughput, 
and many TB of disk space per node. When used this way, disk throughput often 
isn&rsquo;t the bottleneck.</p>
+
+<p>MapReduce is sometimes criticized for writing to disk more than necessary. 
However, this criticism applies less to stream processing: batch processing 
like MapReduce often is used for processing large historical collections of 
data in a short period of time (e.g. query a month of data in ten minutes), 
whereas stream processing mostly needs to keep up with the steady-state flow of 
data (process 10 minutes worth of data in 10 minutes). This means that the raw 
throughput requirements for stream processing are, generally, orders of 
magnitude lower than for batch processing.</p>
+
+<h3 id="-state"><a name="state"></a> State</h3>
+
+<p>Only the very simplest stream processing problems are stateless (i.e. can 
process one message at a time, independently of all other messages). Many 
stream processing applications require a job to maintain some state. For 
example:</p>
+
+<ul>
+<li>If you want to know how many events have been seen for a particular user 
ID, you need to keep a counter for each user ID.</li>
+<li>If you want to know how many distinct users visit your site per day, you 
need to keep a set of all user IDs for which you&rsquo;ve seen at least one 
event today.</li>
+<li>If you want to join two streams (for example, if you want to determine the 
click-through rate of adverts by joining a stream of ad impression events with 
a stream of ad click events) you need to store the event from one stream until 
you receive the corresponding event from the other stream.</li>
+<li>If you want to augment events with some information from a database (for 
example, extending a page-view event with some information about the user who 
viewed the page), the job needs to access the current state of that 
database.</li>
+</ul>
+
+<p>Some kinds of state, such as counters, could be kept in-memory in the 
tasks, but then that state would be lost if the job is restarted. 
Alternatively, you can keep the state in a remote database, but performance can 
become unacceptable if you need to perform a database query for every message 
you process. Kafka can easily handle 100k-500k messages/sec per node (depending 
on message size), but throughput for queries against a remote key-value store 
tends to be closer to 1-5k requests per second &mdash; two orders of magnitude 
slower.</p>
+
+<p>In Samza, we have put particular effort into supporting high-performance, 
reliable state. The key is to keep state local to each node (so that queries 
don&rsquo;t need to go over the network), and to make it robust to machine 
failures by replicating state changes to another stream.</p>
+
+<p>This approach is especially interesting when combined with database 
<em>change capture</em>. Take the
+example above, where you have a stream of page-view events including the ID of 
the user who viewed the page, and you want to augment the events with more 
information about that user. At first glance, it looks as though you have no 
choice but to query the user database to look up every user ID you see (perhaps 
with some caching). With Samza, we can do better.</p>
+
+<p><em>Change capture</em> means that every time some data changes in your 
database, you get an event telling you what changed. If you have that stream of 
change events, going all the way back to when the database was created, you can 
reconstruct the entire contents of the database by replaying the stream. That 
<em>changelog</em> stream can also be used as input to a Samza job.</p>
+
+<p>Now you can write a Samza job that takes both the page-view event and the 
changelog as inputs. You make sure that they are partitioned on the same key 
(e.g. user ID). Every time a changelog event comes in, you write the updated 
user information to the task&rsquo;s local storage. Every time a page-view 
event comes in, you read the current information about that user from local 
storage. That way, you can keep all the state local to a task, and never need 
to query a remote database.</p>
+
+<p><img src="/img/0.8/learn/documentation/introduction/samza_state.png" 
alt="Stateful Processing" class="diagram-large"></p>
+
+<p>In effect, you now have a replica of the main database, broken into small 
partitions that are on the same machines as the Samza tasks. Database writes 
still need to go to the main database, but when you need to read from the 
database in order to process a message from the input stream, you can just 
consult the task&rsquo;s local state.</p>
+
+<p>This approach is not only much faster than querying a remote database, it 
is also much better for operations. If you are processing a high-volume stream 
with Samza, and making a remote query for every message, you can easily 
overwhelm the database with requests and affect other services using the same 
database. By contrast, when a task uses local state, it is isolated from 
everything else, so it cannot accidentally bring down other services.</p>
+
+<p>Partitioned local state is not always appropriate, and not required &mdash; 
nothing in Samza prevents calls to external databases. If you cannot produce a 
feed of changes from your database, or you need to rely on logic that exists 
only in a remote service, then it may be more convenient to call a remote 
service from your Samza job. But if you want to use local state, it works out 
of the box.</p>
+
+<h3 id="execution-framework">Execution Framework</h3>
+
+<p>One final decision we made was to not build a custom distributed execution 
system in Samza. Instead, execution is pluggable, and currently completely 
handled by YARN. This has two benefits.</p>
+
+<p>The first benefit is practical: there is another team of smart people 
working on the execution framework. YARN is developing at a rapid pace, and 
already supports a rich set of features around resource quotas and security. 
This allows you to control what portion of the cluster is allocated to which 
users and groups, and also control the resource utilization on individual nodes 
(CPU, memory, etc) via cgroups. YARN is run at massive scale to support Hadoop 
and will likely become a ubiquitous layer. Since Samza runs entirely through 
YARN, there are no separate daemons or masters to run beyond the YARN cluster 
itself. In other words, if you already have Kafka and YARN, you don&rsquo;t 
need to install anything else in order to run Samza jobs.</p>
+
+<p>Secondly, our integration with YARN is completely componentized. It exists 
in a separate package, and the main Samza framework does not depend on it at 
build time. This means that YARN can be replaced with other virtualization 
frameworks &mdash; in particular, we are interested in adding direct AWS 
integration. Many companies run in AWS which is itself a virtualization 
framework, which for Samza&rsquo;s purposes is equivalent to YARN: it allows 
you to create and destroy virtual &ldquo;container&rdquo; machines and 
guarantees fixed resources for these containers. Since stream processing jobs 
are long-running, it is a bit silly to run a YARN cluster inside AWS and then 
schedule individual jobs within this cluster. Instead, a more sensible approach 
would be to directly allocate a set of EC2 instances for your jobs.</p>
+
+<p>We think there will be a lot of innovation both in open source 
virtualization frameworks like Mesos and YARN and in commercial cloud providers 
like Amazon, so it makes sense to integrate with them.</p>
+
+<h2 id="mupd8-&raquo;"><a href="mupd8.html">MUPD8 &raquo;</a></h2>
+
+
+          </div>
+        </div>
+
+      </div><!-- /.wrapper-content -->
+    </div><!-- /.wrapper -->
+
+    <div class="footer">
+      <div class="container">
+        <!-- nothing for now. -->
+      </div>
+    </div>
+
+  
+    <script>
+      /*
+       * Version-switch icon: styles #switch-version-button as a visible
+       * masthead icon only when this page is also available under
+       * /learn/documentation/latest/.
+       */
+      (function( $ ) {
+        /*
+         * Tests whether a URL exists by issuing a synchronous HEAD request.
+         *   url     - the URL to probe.
+         *   returns - true when the response status is 2xx or 3xx; false for
+         *             any other status or when the request cannot be sent.
+         * Defined before the ready handler is registered so the handler can
+         * never run while the helper is still undefined.
+         */
+        $.fn.urlExists = function(url) {
+          var http = new XMLHttpRequest();
+          try {
+            http.open('HEAD', url, false);
+            http.send();
+          } catch (e) {
+            /* a synchronous send() throws on network failure: treat as missing */
+            return false;
+          }
+          /* error responses such as 403, 410 or 5xx do not mean the page exists */
+          return http.status >= 200 && http.status < 400;
+        };
+
+        $( document ).ready(function() {
+          if ( $.fn.urlExists( "/learn/documentation/latest/comparisons/introduction.html" ) ) {
+            $("#switch-version-button").addClass("fa fa-history masthead-icon");
+          }
+        });
+      }( jQuery ));
+    </script>
+  
+
+    <!--
+      Google Analytics: the minified loader below creates a script element
+      for //www.google-analytics.com/analytics.js, marks it async and inserts
+      it before the first script element in the document. Calls made to ga()
+      are pushed onto a queue (ga.q). The two calls after the loader create
+      the tracker for property UA-43122768-1 and send one pageview.
+    -->
+    <script>
+      
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+      (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+      
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+      
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+
+      ga('create', 'UA-43122768-1', 'apache.org');
+      ga('send', 'pageview');
+
+    </script>
+  </body>
+</html>


Added: incubator/samza/site/learn/documentation/0.8/comparisons/mupd8.html
URL: 
http://svn.apache.org/viewvc/incubator/samza/site/learn/documentation/0.8/comparisons/mupd8.html?rev=1643389&view=auto
==============================================================================
--- incubator/samza/site/learn/documentation/0.8/comparisons/mupd8.html (added)
+++ incubator/samza/site/learn/documentation/0.8/comparisons/mupd8.html Fri Dec 
 5 18:52:32 2014
@@ -0,0 +1,258 @@
+<!DOCTYPE html>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+    <title>Samza - MUPD8</title>
+    <link href='/css/ropa-sans.css' rel='stylesheet' type='text/css'/>
+    <link href="/css/bootstrap.min.css" rel="stylesheet"/>
+    <link href="/css/font-awesome.min.css" rel="stylesheet"/>
+    <link href="/css/main.css" rel="stylesheet"/>
+    <link href="/css/syntax.css" rel="stylesheet"/>
+    <link rel="icon" type="image/png" href="/img/samza-icon.png">
+    <script src="/js/jquery-1.11.1.min.js"></script>
+  </head>
+  <body>
+    <div class="wrapper">
+      <div class="wrapper-content">
+
+        <div class="masthead">
+          <div class="container">
+            <div class="masthead-logo">
+              <a href="/" class="logo">samza</a>
+            </div>
+            <div class="masthead-icons">
+              <div class="pull-right">
+                <a href="/startup/download"><i class="fa 
fa-arrow-circle-o-down masthead-icon"></i></a>
+                <a 
href="https://git-wip-us.apache.org/repos/asf?p=incubator-samza.git;a=tree" 
target="_blank"><i class="fa fa-code masthead-icon" style="font-weight: 
bold;"></i></a>
+                <a href="https://twitter.com/samzastream" target="_blank"><i 
class="fa fa-twitter masthead-icon"></i></a>
+                <!-- this icon only shows in versioned pages -->
+                
+                  
+                    
+                  
+                  <a 
href="http://samza.incubator.apache.org/learn/documentation/latest/comparisons/mupd8.html"><i
 id="switch-version-button"></i></a>
+                   <!-- links for the navigation bar -->
+                
+
+              </div>
+            </div>
+          </div><!-- /.container -->
+        </div>
+
+        <div class="container">
+          <div class="menu">
+            <h1><i class="fa fa-rocket"></i> Getting Started</h1>
+            <ul>
+              <li><a href="/startup/hello-samza/0.8">Hello Samza</a></li>
+              <li><a href="/startup/download">Download</a></li>
+            </ul>
+
+            <h1><i class="fa fa-book"></i> Learn</h1>
+            <ul>
+              <li><a href="/learn/documentation/0.8">Documentation</a></li>
+              <li><a 
href="/learn/documentation/0.8/jobs/configuration-table.html">Configuration</a></li>
+              <li><a 
href="/learn/documentation/0.8/api/javadocs/">Javadocs</a></li>
+              <li><a href="/learn/tutorials/0.8">Tutorials</a></li>
+              <li><a href="http://wiki.apache.org/samza/FAQ">FAQ</a></li>
+              <li><a href="http://wiki.apache.org/samza">Wiki</a></li>
+              <li><a href="http://wiki.apache.org/samza/PapersAndTalks">Papers 
&amp; Talks</a></li>
+              <li><a href="http://blogs.apache.org/samza">Blog</a></li>
+            </ul>
+
+            <h1><i class="fa fa-comments"></i> Community</h1>
+            <ul>
+              <li><a href="/community/mailing-lists.html">Mailing 
Lists</a></li>
+              <li><a href="/community/irc.html">IRC</a></li>
+              <li><a 
href="https://issues.apache.org/jira/browse/SAMZA">Bugs</a></li>
+              <li><a href="http://wiki.apache.org/samza/PoweredBy">Powered 
by</a></li>
+              <li><a 
href="http://wiki.apache.org/samza/Ecosystem">Ecosystem</a></li>
+              <li><a href="/community/committers.html">Committers</a></li>
+            </ul>
+
+            <h1><i class="fa fa-code"></i> Contribute</h1>
+            <ul>
+              <li><a href="/contribute/rules.html">Rules</a></li>
+              <li><a href="/contribute/coding-guide.html">Coding Guide</a></li>
+              <li><a href="/contribute/projects.html">Projects</a></li>
+              <li><a href="/contribute/design-documents.html">Design 
Documents</a></li>
+              <li><a href="/contribute/code.html">Code</a></li>
+              <li><a href="https://reviews.apache.org/groups/samza">Review 
Board</a></li>
+              <li><a href="/contribute/tests.html">Tests</a></li>
+              <li><a href="/contribute/disclaimer.html">Disclaimer</a></li>
+            </ul>
+
+            <h1><i class="fa fa-history"></i> Archive</h1>
+            <ul>
+              <li><a href="/archive/index.html#latest">latest</a></li>
+              <li><a href="/archive/index.html#08">0.8</a></li>
+              <li><a href="/archive/index.html#07">0.7</a></li>
+            </ul>
+          </div>
+
+          <div class="content">
+            <!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<h2>MUPD8</h2>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<p><em>People generally want to know how similar systems compare. We&rsquo;ve 
done our best to fairly contrast the feature sets of Samza with other systems. 
But we aren&rsquo;t experts in these frameworks, and we are, of course, totally 
biased. If we have goofed anything, please let us know and we will correct 
it.</em></p>
+
+<h3 id="durability">Durability</h3>
+
+<p>MUPD8 makes no durability or delivery guarantees. Within MUPD8, stream 
processor tasks receive messages at most once. Samza uses Kafka for messaging, 
which guarantees message delivery.</p>
+
+<h3 id="ordering">Ordering</h3>
+
+<p>As with durability, developers would ideally like their stream processors 
to receive messages in exactly the order that they were written.</p>
+
+<p>We don&rsquo;t entirely follow MUPD8&rsquo;s description of their ordering 
guarantees, but it seems to guarantee that all messages will be processed in 
the order in which they are written to MUPD8 queues, which is comparable to 
Kafka and Samza&rsquo;s guarantee.</p>
+
+<h3 id="buffering">Buffering</h3>
+
+<p>A critical issue for handling large data flows is handling back pressure 
when one downstream processing stage gets slow.</p>
+
+<p>MUPD8 buffers messages in an in-memory queue when passing messages between 
two MUPD8 tasks. When a queue fills up, developers have the option to either 
drop the messages on the floor, log the messages to local disk, or block until 
the queue frees up. All of these options are sub-optimal. Dropping messages 
leads to incorrect results. Blocking your stream processor can have a cascading 
effect, where the slowest processor blocks all upstream processors, which in 
turn block their upstream processors, until the whole system grinds to a halt. 
Logging to local disk is the most reasonable, but when a fault occurs, those 
messages are lost on failover.</p>
+
+<p>By adopting Kafka&rsquo;s broker as a remote buffer, Samza solves all of 
these problems. It doesn&rsquo;t need to block because consumers and producers 
are decoupled using the Kafka brokers&#39; disks as buffers. Messages are not 
dropped because Kafka brokers are highly available as of version 0.8. In the 
event of a failure, when a Samza job is restarted on another machine, its input 
and output are not lost, because they are stored remotely on replicated Kafka 
brokers.</p>
+
+<h3 id="state-management">State Management</h3>
+
+<p>As described in the <a href="introduction.html#state">introduction</a>, 
stream processors often need to maintain some state as they process messages. 
Different frameworks have different approaches to handling such state, and what 
to do in case of a failure.</p>
+
+<p>MUPD8 uses a write back caching strategy to manage in-memory state that is 
periodically written back to Cassandra.</p>
+
+<p>Samza maintains state locally with the task. This allows state larger than 
will fit in memory. State is persisted to an output stream to enable recovery 
should the task fail. We believe this design enables stronger fault tolerance 
semantics, because the change log captures the evolution of state, allowing the 
state of a task to be restored to a consistent point in time.</p>
+
+<h3 id="deployment-and-execution">Deployment and execution</h3>
+
+<p>MUPD8 includes a custom execution framework. The functionality that this 
framework supports in terms of users and resource limits isn&rsquo;t clear to 
us.</p>
+
+<p>Samza leverages YARN to deploy user code, and execute it in a distributed 
environment.</p>
+
+<h3 id="fault-tolerance">Fault Tolerance</h3>
+
+<p>What should a stream processing system do when a machine or processor 
fails?</p>
+
+<p>MUPD8 uses its custom equivalent to YARN to manage fault tolerance. When a 
stream processor is unable to send a message to a downstream processor, it 
notifies MUPD8&rsquo;s coordinator, and all other machines are notified. The 
machines then send all messages to a new machine based on the key hash 
that&rsquo;s used. Messages and state can be lost when this happens.</p>
+
+<p>Samza uses YARN to manage fault tolerance. YARN detects when nodes or Samza 
tasks fail, and notifies Samza&rsquo;s <a 
href="../yarn/application-master.html">ApplicationMaster</a>. At that point, 
it&rsquo;s up to Samza to decide what to do. Generally, this means re-starting 
the task on another machine. Since messages are persisted to Kafka brokers 
remotely, and there are no in-memory queues, no messages should be lost (unless 
the processors are using async Kafka producers, which offer higher performance 
but don&rsquo;t wait for messages to be committed).</p>
+
+<h3 id="workflow">Workflow</h3>
+
+<p>Sometimes more than one job or processing stage is needed to accomplish 
something. This is the case where you wish to re-partition a stream, for 
example. MUPD8 has a custom workflow system set up to define how to execute 
multiple jobs at once, and how to feed stream data from one into the other.</p>
+
+<p>Samza makes the individual jobs the level of granularity of execution. Jobs 
communicate via named input and output streams. This implicitly defines a data 
flow graph between all running jobs. We chose this model to enable data flow 
graphs with processing stages owned by different engineers on different teams, 
working in different code bases, without the need to wire everything together 
into a single topology.</p>
+
+<p>This was motivated by our experience with Hadoop, where the data flow 
between jobs is implicitly defined by their input and output directories. This 
decentralized model has proven itself to scale well to a large organization.</p>
+
+<h3 id="memory">Memory</h3>
+
+<p>MUPD8 executes all of its map/update processors inside a single JVM, using 
threads. This is memory-efficient, as the JVM memory overhead is shared across 
the threads.</p>
+
+<p>Samza uses a separate JVM for each <a 
href="../container/samza-container.html">stream processor container</a>. This 
has the disadvantage of using more memory compared to running multiple stream 
processing threads within a single JVM. However, the advantage is improved 
isolation between tasks, which can make them more reliable.</p>
+
+<h3 id="isolation">Isolation</h3>
+
+<p>MUPD8 provides no resource isolation between stream processors. A single 
badly behaved stream processor can bring down all processors on the node.</p>
+
+<p>Samza uses process level isolation between stream processor tasks, 
similarly to Hadoop&rsquo;s approach. We can enforce strict per-process memory 
limits. In addition, Samza supports CPU limits when used with YARN cgroups. As 
the YARN support for cgroups develops further, it should also become possible 
to support disk and network cgroup limits.</p>
+
+<h3 id="further-reading">Further Reading</h3>
+
+<p>The MUPD8 team has published a very good <a 
href="http://vldb.org/pvldb/vol5/p1814_wanglam_vldb2012.pdf">paper</a> on the 
design of their system.</p>
+
+<h2 id="storm-&raquo;"><a href="storm.html">Storm &raquo;</a></h2>
+
+
+          </div>
+        </div>
+
+      </div><!-- /.wrapper-content -->
+    </div><!-- /.wrapper -->
+
+    <div class="footer">
+      <div class="container">
+        <!-- nothing for now. -->
+      </div>
+    </div>
+
+  
+    <script>
+      /*
+       * Version-switch icon: styles #switch-version-button as a visible
+       * masthead icon only when this page is also available under
+       * /learn/documentation/latest/.
+       */
+      (function( $ ) {
+        /*
+         * Tests whether a URL exists by issuing a synchronous HEAD request.
+         *   url     - the URL to probe.
+         *   returns - true when the response status is 2xx or 3xx; false for
+         *             any other status or when the request cannot be sent.
+         * Defined before the ready handler is registered so the handler can
+         * never run while the helper is still undefined.
+         */
+        $.fn.urlExists = function(url) {
+          var http = new XMLHttpRequest();
+          try {
+            http.open('HEAD', url, false);
+            http.send();
+          } catch (e) {
+            /* a synchronous send() throws on network failure: treat as missing */
+            return false;
+          }
+          /* error responses such as 403, 410 or 5xx do not mean the page exists */
+          return http.status >= 200 && http.status < 400;
+        };
+
+        $( document ).ready(function() {
+          if ( $.fn.urlExists( "/learn/documentation/latest/comparisons/mupd8.html" ) ) {
+            $("#switch-version-button").addClass("fa fa-history masthead-icon");
+          }
+        });
+      }( jQuery ));
+    </script>
+  
+
+    <!--
+      Google Analytics: the minified loader below creates a script element
+      for //www.google-analytics.com/analytics.js, marks it async and inserts
+      it before the first script element in the document. Calls made to ga()
+      are pushed onto a queue (ga.q). The two calls after the loader create
+      the tracker for property UA-43122768-1 and send one pageview.
+    -->
+    <script>
+      
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+      (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+      
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+      
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+
+      ga('create', 'UA-43122768-1', 'apache.org');
+      ga('send', 'pageview');
+
+    </script>
+  </body>
+</html>

Added: 
incubator/samza/site/learn/documentation/0.8/comparisons/spark-streaming.html
URL: 
http://svn.apache.org/viewvc/incubator/samza/site/learn/documentation/0.8/comparisons/spark-streaming.html?rev=1643389&view=auto
==============================================================================
--- 
incubator/samza/site/learn/documentation/0.8/comparisons/spark-streaming.html 
(added)
+++ 
incubator/samza/site/learn/documentation/0.8/comparisons/spark-streaming.html 
Fri Dec  5 18:52:32 2014
@@ -0,0 +1,279 @@
+<!DOCTYPE html>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+    <title>Samza - Spark Streaming</title>
+    <link href='/css/ropa-sans.css' rel='stylesheet' type='text/css'/>
+    <link href="/css/bootstrap.min.css" rel="stylesheet"/>
+    <link href="/css/font-awesome.min.css" rel="stylesheet"/>
+    <link href="/css/main.css" rel="stylesheet"/>
+    <link href="/css/syntax.css" rel="stylesheet"/>
+    <link rel="icon" type="image/png" href="/img/samza-icon.png">
+    <script src="/js/jquery-1.11.1.min.js"></script>
+  </head>
+  <body>
+    <div class="wrapper">
+      <div class="wrapper-content">
+
+        <div class="masthead">
+          <div class="container">
+            <div class="masthead-logo">
+              <a href="/" class="logo">samza</a>
+            </div>
+            <div class="masthead-icons">
+              <div class="pull-right">
+                <a href="/startup/download"><i class="fa 
fa-arrow-circle-o-down masthead-icon"></i></a>
+                <a 
href="https://git-wip-us.apache.org/repos/asf?p=incubator-samza.git;a=tree"; 
target="_blank"><i class="fa fa-code masthead-icon" style="font-weight: 
bold;"></i></a>
+                <a href="https://twitter.com/samzastream"; target="_blank"><i 
class="fa fa-twitter masthead-icon"></i></a>
+                <!-- this icon only shows in versioned pages -->
+                
+                  
+                    
+                  
+                  <a 
href="http://samza.incubator.apache.org/learn/documentation/latest/comparisons/spark-streaming.html";><i
 id="switch-version-button"></i></a>
+                   <!-- links for the navigation bar -->
+                
+
+              </div>
+            </div>
+          </div><!-- /.container -->
+        </div>
+
+        <div class="container">
+          <div class="menu">
+            <h1><i class="fa fa-rocket"></i> Getting Started</h1>
+            <ul>
+              <li><a href="/startup/hello-samza/0.8">Hello Samza</a></li>
+              <li><a href="/startup/download">Download</a></li>
+            </ul>
+
+            <h1><i class="fa fa-book"></i> Learn</h1>
+            <ul>
+              <li><a href="/learn/documentation/0.8">Documentation</a></li>
+              <li><a 
href="/learn/documentation/0.8/jobs/configuration-table.html">Configuration</a></li>
+              <li><a 
href="/learn/documentation/0.8/api/javadocs/">Javadocs</a></li>
+              <li><a href="/learn/tutorials/0.8">Tutorials</a></li>
+              <li><a href="http://wiki.apache.org/samza/FAQ";>FAQ</a></li>
+              <li><a href="http://wiki.apache.org/samza";>Wiki</a></li>
+              <li><a href="http://wiki.apache.org/samza/PapersAndTalks";>Papers 
&amp; Talks</a></li>
+              <li><a href="http://blogs.apache.org/samza";>Blog</a></li>
+            </ul>
+
+            <h1><i class="fa fa-comments"></i> Community</h1>
+            <ul>
+              <li><a href="/community/mailing-lists.html">Mailing 
Lists</a></li>
+              <li><a href="/community/irc.html">IRC</a></li>
+              <li><a 
href="https://issues.apache.org/jira/browse/SAMZA";>Bugs</a></li>
+              <li><a href="http://wiki.apache.org/samza/PoweredBy";>Powered 
by</a></li>
+              <li><a 
href="http://wiki.apache.org/samza/Ecosystem";>Ecosystem</a></li>
+              <li><a href="/community/committers.html">Committers</a></li>
+            </ul>
+
+            <h1><i class="fa fa-code"></i> Contribute</h1>
+            <ul>
+              <li><a href="/contribute/rules.html">Rules</a></li>
+              <li><a href="/contribute/coding-guide.html">Coding Guide</a></li>
+              <li><a href="/contribute/projects.html">Projects</a></li>
+              <li><a href="/contribute/design-documents.html">Design 
Documents</a></li>
+              <li><a href="/contribute/code.html">Code</a></li>
+              <li><a href="https://reviews.apache.org/groups/samza";>Review 
Board</a></li>
+              <li><a href="/contribute/tests.html">Tests</a></li>
+              <li><a href="/contribute/disclaimer.html">Disclaimer</a></li>
+            </ul>
+
+            <h1><i class="fa fa-history"></i> Archive</h1>
+            <ul>
+              <li><a href="/archive/index.html#latest">latest</a></li>
+              <li><a href="/archive/index.html#08">0.8</a></li>
+              <li><a href="/archive/index.html#07">0.7</a></li>
+            </ul>
+          </div>
+
+          <div class="content">
+            <!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<h2>Spark Streaming</h2>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<p><em>People generally want to know how similar systems compare. We&rsquo;ve 
done our best to fairly contrast the feature sets of Samza with other systems. 
But we aren&rsquo;t experts in these frameworks, and we are, of course, totally 
biased. If we have goofed anything, please let us know and we will correct 
it.</em></p>
+
+<p><a 
href="http://spark.apache.org/docs/latest/streaming-programming-guide.html";>Spark
 Streaming</a> is a stream processing system that uses the core <a 
href="http://spark.apache.org/";>Apache Spark</a> API. Both Samza and Spark 
Streaming provide data consistency, fault tolerance, a programming API, etc. 
Spark&rsquo;s approach to streaming is different from Samza&rsquo;s. Samza 
processes messages as they are received, while Spark Streaming treats streaming 
as a series of deterministic batch operations. Spark Streaming groups the 
stream into batches of a fixed duration (such as 1 second). Each batch is 
represented as a Resilient Distributed Dataset (<a 
href="http://www.cs.berkeley.edu/%7Ematei/papers/2012/nsdi_spark.pdf";>RDD</a>). 
A neverending sequence of these RDDs is called a Discretized Stream (<a 
href="http://www.cs.berkeley.edu/%7Ematei/papers/2012/hotcloud_spark_streaming.pdf";>DStream</a>).</p>
+
+<h3 id="overview-of-spark-streaming">Overview of Spark Streaming</h3>
+
+<p>Before going into the comparison, here is a brief overview of the Spark 
Streaming application. If you already are familiar with Spark Streaming, you 
may skip this part. There are two main parts of a Spark Streaming application: 
data receiving and data processing. </p>
+
+<ul>
+<li>Data receiving is accomplished by a <a 
href="https://spark.apache.org/docs/latest/streaming-custom-receivers.html";>receiver</a>
 which receives data and stores data in Spark (though not in an RDD at this 
point). </li>
+<li>Data processing transfers the data stored in Spark into the DStream. You 
can then apply the two <a 
href="https://spark.apache.org/docs/latest/streaming-programming-guide.html#operations";>operations</a>
 &ndash; transformations and output operations &ndash; on the DStream. The 
operations for DStream are a little different from what you can use for the 
general Spark RDD because of the streaming environment.</li>
+</ul>
+
+<p>Here is an overview of Spark Streaming&rsquo;s <a 
href="https://spark.apache.org/docs/latest/cluster-overview.html">deployment</a>. 
Spark has a SparkContext (in Spark Streaming, it&rsquo;s called <a 
href="https://spark.apache.org/docs/1.0.0/api/scala/index.html#org.apache.spark.streaming.StreamingContext">StreamingContext</a>)
 object in the driver program. The SparkContext talks with the cluster manager 
(e.g. YARN, Mesos), which then allocates resources (that is, executors) for the 
Spark application. The executors then run tasks sent by the SparkContext (<a 
href="http://spark.apache.org/docs/latest/cluster-overview.html#components">read
 more</a>). In YARN&rsquo;s context, one executor is equivalent to one container. 
Tasks are what is running in the containers. The driver program runs on the 
client machine that submits the job (<a 
href="https://spark.apache.org/docs/latest/running-on-yarn.html#launching-spark-on-yarn">client
 mode</a>) or in the application manager (<a 
href="https://spark.apache.org/docs/latest/running-on-yarn.html#launching-spark-on-yarn">cluster 
mode</a>). Both data receiving and data processing are tasks for executors. One 
receiver (which receives one input stream) is a long-running task. Processing 
consists of a bunch of tasks. All the tasks are sent to the available executors.</p>
+
+<h3 id="ordering-and-guarantees">Ordering and Guarantees</h3>
+
+<p>Spark Streaming guarantees ordered processing of batches in a DStream. 
Since messages are processed in batches by side-effect-free operators, the 
exact ordering of messages is not important in Spark Streaming. Spark Streaming 
does not guarantee at-least-once or at-most-once messaging semantics because in 
some situations it may lose data when the driver program fails (see <a 
href="#fault-tolerance">fault-tolerance</a>). In addition, because Spark 
Streaming requires transformation operations to be deterministic, it is 
unsuitable for nondeterministic processing, e.g. a randomized machine learning 
algorithm.</p>
+
+<p>Samza guarantees that messages are processed in the order they appear in the 
partition of the stream. Samza also allows you to define a deterministic 
ordering of messages between partitions using a <a 
href="../container/streams.html">MessageChooser</a>. It provides an 
at-least-once message delivery guarantee. And it does not require operations to 
be deterministic.</p>
+
+<h3 id="state-management">State Management</h3>
+
+<p>Spark Streaming provides a state DStream which keeps the state for each key 
and a transformation operation called <a 
href="https://spark.apache.org/docs/latest/streaming-programming-guide.html#transformations">updateStateByKey</a>
 to mutate state. Every time updateStateByKey is applied, you will get a new 
state DStream where all of the state is updated by applying the function passed 
to updateStateByKey. This transformation can serve as a basic key-value store, 
though it has a few drawbacks:</p>
+
+<ul>
+<li>you can only apply the DStream operations to your state because 
essentially it&rsquo;s a DStream.</li>
+<li>it does not provide any key-value access to the data. If you want to access a 
certain key-value pair, you need to iterate over the whole DStream.</li>
+<li>it is inefficient when the state is large because every time a new batch 
is processed, Spark Streaming consumes the entire state DStream to update 
relevant keys and values.</li>
+</ul>
+
+<p>Spark Streaming periodically writes intermediate data of stateful operations 
(updateStateByKey and window-based operations) into HDFS. In the case of 
updateStateByKey, the entire state RDD is written into HDFS after every 
checkpointing interval. As we mentioned in the <em><a 
href="../container/state-management.html#in-memory-state-with-checkpointing">in 
memory state with checkpointing</a></em> section, writing the entire state to durable 
storage is very expensive when the state becomes large.</p>
+
+<p>Samza uses an embedded key-value store for <a 
href="../container/state-management.html#local-state-in-samza">state 
management</a>. This store is replicated as it&rsquo;s mutated, and supports 
both very high throughput writing and reading. And it gives you a lot of 
flexibility to decide what kind of state you want to maintain. What is more, 
you can also plug in other <a 
href="../container/state-management.html#other-storage-engines">storage 
engines</a>, which enables great flexibility in the stream processing 
algorithms you can use. A good comparison of different types of state manager 
approaches can be found <a 
href="../container/state-management.html#approaches-to-managing-task-state">here</a>.</p>
+
+<p>One of the common use cases in state management is <a 
href="../container/state-management.html#stream-stream-join">stream-stream 
join</a>. Though Spark Streaming has the <a 
href="https://spark.apache.org/docs/latest/streaming-programming-guide.html#transformations">join</a>
 operation, this operation only joins two batches that are in the same time 
interval. It does not deal with the situation where events in the two streams 
are mismatched. Using Spark Streaming&rsquo;s updateStateByKey to store mismatched 
events also has a limitation: if the number of mismatched events is 
large, there will be a large state, which makes Spark Streaming inefficient. 
Samza does not have this limitation.</p>
+
+<h3 id="partitioning-and-parallelism">Partitioning and Parallelism</h3>
+
+<p>Spark Streaming&rsquo;s parallelism is achieved by splitting the job into 
small tasks and sending them to executors. There are two types of <a 
href="http://spark.apache.org/docs/latest/streaming-programming-guide.html#level-of-parallelism-in-data-receiving">parallelism
 in Spark Streaming</a>: parallelism in receiving the stream and parallelism in 
processing the stream. On the receiving side, one input DStream creates one 
receiver, and one receiver receives one input stream of data and runs as a 
long-running task. So in order to parallelize the receiving process, you can 
split one input stream into multiple input streams based on some criteria (e.g. 
if you are receiving a Kafka stream with some partitions, you may split this 
stream based on the partition). Then you can create multiple input DStreams (so 
multiple receivers) for these streams and the receivers will run as multiple 
tasks. Accordingly, you should provide enough resources by increasing the core 
number of the executors 
 or bringing up more executors. Then you can combine all the input DStreams 
into one DStream during the processing if necessary. On the processing side, 
since a DStream is a continuous sequence of RDDs, the parallelism is simply 
accomplished by normal RDD operations, such as map, reduceByKey, reduceByWindow 
(check <a 
href="https://spark.apache.org/docs/latest/tuning.html#level-of-parallelism">here</a>).</p>
+
+<p>Samza&rsquo;s parallelism is achieved by splitting processing into independent 
<a href="../api/overview.html">tasks</a> which can be parallelized. You can run 
multiple tasks in one container or only one task per container. That depends on 
your workload and latency requirement. For example, if you want to quickly <a 
href="../jobs/reprocessing.html">reprocess a stream</a>, you may increase the 
number of containers to one task per container. It is important to notice that 
one container only uses <a href="../container/event-loop.html">one thread</a>, 
which maps to exactly one CPU. This design attempts to simplify resource 
management and the isolation between jobs.</p>
+
+<h3 id="buffering-&amp;-latency">Buffering &amp; Latency</h3>
+
+<p>Spark Streaming essentially is a sequence of small batch processes. With a 
fast execution engine, it can reach latency as low as one second (from 
their <a 
href="http://www.cs.berkeley.edu/%7Ematei/papers/2012/hotcloud_spark_streaming.pdf">paper</a>).
 If the processing is slower than receiving, the data will be queued as 
DStreams in memory and the queue will keep increasing. In order to run a 
healthy Spark Streaming application, the system should be <a 
href="http://spark.apache.org/docs/latest/streaming-programming-guide.html#performance-tuning">tuned</a>
 until the speed of processing is as fast as receiving.</p>
+
+<p>Samza jobs can have latency in the low milliseconds when running with 
Apache Kafka. It has a different approach to buffering. The buffering mechanism 
is dependent on the input and output system. For example, when using <a 
href="http://kafka.apache.org/";>Kafka</a> as the input and output system, data 
is actually buffered to disk. This design decision, by sacrificing a little 
latency, allows the buffer to absorb a large backlog of messages when a job has 
fallen behind in its processing.</p>
+
+<h3 id="fault-tolerance">Fault-tolerance</h3>
+
+<p>There are two kinds of failures in both Spark Streaming and Samza: worker 
node (running executors) failure in Spark Streaming (equivalent to container 
failure in Samza) and driver node (running driver program) failure (equivalent 
to application manager (AM) failure in Samza).</p>
+
+<p>When a worker node fails in Spark Streaming, it will be restarted by the 
cluster manager. When a container fails in Samza, the application manager will 
work with YARN to start a new container. </p>
+
+<p>When a driver node fails in Spark Streaming, Spark&rsquo;s <a 
href="http://spark.apache.org/docs/latest/spark-standalone.html">standalone 
cluster mode</a> will restart the driver node automatically. But it is 
currently not supported in YARN and Mesos. You will need other mechanisms to 
restart the driver node automatically. Spark Streaming can use the checkpoint 
in HDFS to recreate the StreamingContext. When the AM fails in Samza, YARN will 
handle restarting the AM. Samza will restart all the containers if the AM 
restarts.</p>
+
+<p>In terms of data loss, there is a difference between Spark Streaming and 
Samza. If the input stream is an active streaming system, such as Flume or Kafka, 
Spark Streaming may lose data if the failure happens when the data is received 
but not yet replicated to other nodes (also see <a 
href="https://issues.apache.org/jira/browse/SPARK-1647">SPARK-1647</a>). Samza 
will not lose data when the failure happens because it has the concept of <a 
href="../container/checkpointing.html">checkpointing</a> that stores the offset 
of the latest processed message and always commits the checkpoint after 
processing the data. There is no data-loss situation like the one Spark Streaming has. 
If a container fails, it reads from the latest checkpoint. When a Samza job 
recovers from a failure, it&rsquo;s possible that it will process some data 
more than once. This happens because the job restarts at the last checkpoint, 
and any messages that had been processed between that checkpoint and the 
failure are processed again. The amount of reprocessed data can be minimized 
by setting a small checkpoint interval period.</p>
+
+<h3 id="deployment-&amp;-execution">Deployment &amp; Execution</h3>
+
+<p>Spark has a SparkContext object to talk with cluster managers, which then 
allocate resources for the application. Currently Spark supports three types of 
cluster managers: <a 
href="http://spark.apache.org/docs/latest/spark-standalone.html";>Spark 
standalone</a>, <a href="http://mesos.apache.org/";>Apache Mesos</a> and <a 
href="http://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/YARN.html";>Hadoop
 YARN</a>. Besides these, Spark has a script for launching in <a 
href="http://spark.apache.org/docs/latest/ec2-scripts.html";>Amazon EC2</a>.</p>
+
+<p>Samza only supports YARN and local execution currently.</p>
+
+<h3 id="isolation">Isolation</h3>
+
+<p>Spark Streaming and Samza have the same isolation. Spark Streaming depends 
on cluster managers (e.g. Mesos or YARN) and Samza depends on YARN to provide 
processor isolation. Different applications run in different JVMs. Data cannot 
be shared among different applications unless it is written to external 
storage. Since Samza provides out-of-box Kafka integration, it is very easy to 
reuse the output of other Samza jobs (see <a 
href="../introduction/concepts.html#dataflow-graphs">here</a>).</p>
+
+<h3 id="language-support">Language Support</h3>
+
+<p>Spark Streaming is written in Java and Scala and provides Scala, Java, and 
Python APIs. Samza is written in Java and Scala and has a Java API.</p>
+
+<h3 id="workflow">Workflow</h3>
+
+<p>In Spark Streaming, you build an entire processing graph with a DSL API and 
deploy that entire graph as one unit. The communication between the nodes in 
that graph (in the form of DStreams) is provided by the framework. That is 
similar to Storm. Samza is totally different &ndash; each job is just a 
message-at-a-time processor, and there is no framework support for topologies. 
Output of a processing task always needs to go back to a message broker (e.g. 
Kafka).</p>
+
+<p>A positive consequence of Samza&rsquo;s design is that a job&rsquo;s output 
can be consumed by multiple unrelated jobs, potentially run by different teams, 
and those jobs are isolated from each other through Kafka&rsquo;s buffering. 
That is not the case with Storm&rsquo;s and Spark Streaming&rsquo;s 
framework-internal streams.</p>
+
+<p>Although a Storm/Spark Streaming job could in principle write its output to 
a message broker, the framework doesn&rsquo;t really make this easy. It seems 
that Storm/Spark aren&rsquo;t intended to be used in a way where one 
topology&rsquo;s output is another topology&rsquo;s input. By contrast, in 
Samza, that mode of usage is standard.</p>
+
+<h3 id="maturity">Maturity</h3>
+
+<p>Spark has an active user and developer community, and recently released 
version 1.0.0. It has a list of companies that use it on its <a 
href="https://cwiki.apache.org/confluence/display/SPARK/Powered+By+Spark">Powered
 by</a> page. Since Spark contains Spark Streaming, Spark SQL, MLlib, GraphX 
and Bagel, it&rsquo;s tough to tell what portion of companies on the list are 
actually using Spark Streaming, and not just Spark.</p>
+
+<p>Samza is still young, but has just released version 0.7.0. It has a 
responsive community and is being developed actively. That said, it is built on 
solid systems such as YARN and Kafka. Samza is heavily used at LinkedIn and we 
hope others will find it useful as well.</p>
+
+<h2 id="api-overview-&raquo;"><a href="../api/overview.html">API Overview 
&raquo;</a></h2>
+
+
+          </div>
+        </div>
+
+      </div><!-- /.wrapper-content -->
+    </div><!-- /.wrapper -->
+
+    <div class="footer">
+      <div class="container">
+        <!-- nothing for now. -->
+      </div>
+    </div>
+
+  
+    <script>
+      /*
+       * When the DOM is ready, ask the server whether the "latest" edition of
+       * this page is present; only then give the version-switch placeholder
+       * its icon classes so that it becomes visible in the masthead.
+       */
+      $( document ).ready(function() {
+        var latestPage = "/learn/documentation/latest/comparisons/spark-streaming.html";
+        if ( !$.fn.urlExists( latestPage ) ) {
+          return;
+        }
+        $("#switch-version-button").addClass("fa fa-history masthead-icon");
+      });
+
+      /*
+       * Registers $.fn.urlExists(url): sends a blocking (synchronous) HEAD
+       * request for `url` and returns true for every response status other
+       * than 404.
+       */
+      (function( $ ) {
+        $.fn.urlExists = function(url) {
+          var probe = new XMLHttpRequest();
+          probe.open('HEAD', url, false);
+          probe.send();
+          return probe.status != 404;
+        };
+      }( jQuery ));
+    </script>
+  
+
+    <!-- Google Analytics -->
+    <script>
+      /*
+       * Google Analytics loader. Records the global name 'ga' in
+       * window.GoogleAnalyticsObject, installs window.ga as a stub that queues
+       * its arguments on ga.q (unless ga already exists), stamps ga.l with the
+       * current time, then creates an async <script> element for analytics.js
+       * and inserts it before the first <script> element in the document.
+       */
+      
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+      (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+      
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+      
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+
+      /* Queue the tracker creation and a pageview hit through the ga() stub. */
+      ga('create', 'UA-43122768-1', 'apache.org');
+      ga('send', 'pageview');
+
+    </script>
+  </body>
+</html>

Added: incubator/samza/site/learn/documentation/0.8/comparisons/storm.html
URL: 
http://svn.apache.org/viewvc/incubator/samza/site/learn/documentation/0.8/comparisons/storm.html?rev=1643389&view=auto
==============================================================================
--- incubator/samza/site/learn/documentation/0.8/comparisons/storm.html (added)
+++ incubator/samza/site/learn/documentation/0.8/comparisons/storm.html Fri Dec 
 5 18:52:32 2014
@@ -0,0 +1,296 @@
+<!DOCTYPE html>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+    <title>Samza - Storm</title>
+    <link href='/css/ropa-sans.css' rel='stylesheet' type='text/css'/>
+    <link href="/css/bootstrap.min.css" rel="stylesheet"/>
+    <link href="/css/font-awesome.min.css" rel="stylesheet"/>
+    <link href="/css/main.css" rel="stylesheet"/>
+    <link href="/css/syntax.css" rel="stylesheet"/>
+    <link rel="icon" type="image/png" href="/img/samza-icon.png">
+    <script src="/js/jquery-1.11.1.min.js"></script>
+  </head>
+  <body>
+    <div class="wrapper">
+      <div class="wrapper-content">
+
+        <div class="masthead">
+          <div class="container">
+            <div class="masthead-logo">
+              <a href="/" class="logo">samza</a>
+            </div>
+            <div class="masthead-icons">
+              <div class="pull-right">
+                <a href="/startup/download"><i class="fa 
fa-arrow-circle-o-down masthead-icon"></i></a>
+                <a 
href="https://git-wip-us.apache.org/repos/asf?p=incubator-samza.git;a=tree"; 
target="_blank"><i class="fa fa-code masthead-icon" style="font-weight: 
bold;"></i></a>
+                <a href="https://twitter.com/samzastream"; target="_blank"><i 
class="fa fa-twitter masthead-icon"></i></a>
+                <!-- this icon only shows in versioned pages -->
+                
+                  
+                    
+                  
+                  <a 
href="http://samza.incubator.apache.org/learn/documentation/latest/comparisons/storm.html";><i
 id="switch-version-button"></i></a>
+                   <!-- links for the navigation bar -->
+                
+
+              </div>
+            </div>
+          </div><!-- /.container -->
+        </div>
+
+        <div class="container">
+          <div class="menu">
+            <h1><i class="fa fa-rocket"></i> Getting Started</h1>
+            <ul>
+              <li><a href="/startup/hello-samza/0.8">Hello Samza</a></li>
+              <li><a href="/startup/download">Download</a></li>
+            </ul>
+
+            <h1><i class="fa fa-book"></i> Learn</h1>
+            <ul>
+              <li><a href="/learn/documentation/0.8">Documentation</a></li>
+              <li><a 
href="/learn/documentation/0.8/jobs/configuration-table.html">Configuration</a></li>
+              <li><a 
href="/learn/documentation/0.8/api/javadocs/">Javadocs</a></li>
+              <li><a href="/learn/tutorials/0.8">Tutorials</a></li>
+              <li><a href="http://wiki.apache.org/samza/FAQ";>FAQ</a></li>
+              <li><a href="http://wiki.apache.org/samza";>Wiki</a></li>
+              <li><a href="http://wiki.apache.org/samza/PapersAndTalks";>Papers 
&amp; Talks</a></li>
+              <li><a href="http://blogs.apache.org/samza";>Blog</a></li>
+            </ul>
+
+            <h1><i class="fa fa-comments"></i> Community</h1>
+            <ul>
+              <li><a href="/community/mailing-lists.html">Mailing 
Lists</a></li>
+              <li><a href="/community/irc.html">IRC</a></li>
+              <li><a 
href="https://issues.apache.org/jira/browse/SAMZA";>Bugs</a></li>
+              <li><a href="http://wiki.apache.org/samza/PoweredBy";>Powered 
by</a></li>
+              <li><a 
href="http://wiki.apache.org/samza/Ecosystem";>Ecosystem</a></li>
+              <li><a href="/community/committers.html">Committers</a></li>
+            </ul>
+
+            <h1><i class="fa fa-code"></i> Contribute</h1>
+            <ul>
+              <li><a href="/contribute/rules.html">Rules</a></li>
+              <li><a href="/contribute/coding-guide.html">Coding Guide</a></li>
+              <li><a href="/contribute/projects.html">Projects</a></li>
+              <li><a href="/contribute/design-documents.html">Design 
Documents</a></li>
+              <li><a href="/contribute/code.html">Code</a></li>
+              <li><a href="https://reviews.apache.org/groups/samza";>Review 
Board</a></li>
+              <li><a href="/contribute/tests.html">Tests</a></li>
+              <li><a href="/contribute/disclaimer.html">Disclaimer</a></li>
+            </ul>
+
+            <h1><i class="fa fa-history"></i> Archive</h1>
+            <ul>
+              <li><a href="/archive/index.html#latest">latest</a></li>
+              <li><a href="/archive/index.html#08">0.8</a></li>
+              <li><a href="/archive/index.html#07">0.7</a></li>
+            </ul>
+          </div>
+
+          <div class="content">
+            <!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<h2>Storm</h2>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<p><em>People generally want to know how similar systems compare. We&rsquo;ve 
done our best to fairly contrast the feature sets of Samza with other systems. 
But we aren&rsquo;t experts in these frameworks, and we are, of course, totally 
biased. If we have goofed anything, please let us know and we will correct 
it.</em></p>
+
+<p><a href="http://storm-project.net/">Storm</a> and Samza are fairly similar. 
Both systems provide many of the same high-level features: a partitioned stream 
model, a distributed execution environment, an API for stream processing, fault 
tolerance, Kafka integration, etc.</p>
+
+<p>Storm and Samza use different words for similar concepts: <em>spouts</em> 
in Storm are similar to stream consumers in Samza, <em>bolts</em> are similar 
to tasks, and <em>tuples</em> are similar to messages in Samza. Storm also has 
some additional building blocks which don&rsquo;t have direct equivalents in 
Samza.</p>
+
+<h3 id="ordering-and-guarantees">Ordering and Guarantees</h3>
+
+<p>Storm allows you to choose the level of guarantee with which you want your 
messages to be processed:</p>
+
+<ul>
+<li>The simplest mode is <em>at-most-once delivery</em>, which drops messages 
if they are not processed correctly, or if the machine doing the processing 
fails. This mode requires no special logic, and processes messages in the order 
they were produced by the spout.</li>
+<li>There is also <em>at-least-once delivery</em>, which tracks whether each 
input tuple (and any downstream tuples it generated) was successfully processed 
within a configured timeout, by keeping an in-memory record of all emitted 
tuples. Any tuples that are not fully processed within the timeout are 
re-emitted by the spout. This implies that a bolt may see the same tuple more 
than once, and that messages can be processed out-of-order. This mechanism also 
requires some co-operation from the user code, which must maintain the ancestry 
of records in order to properly acknowledge its input. This is explained in 
depth on <a 
href="https://github.com/nathanmarz/storm/wiki/Guaranteeing-message-processing">Storm&rsquo;s
 wiki</a>.</li>
+<li>Finally, Storm offers <em>exactly-once semantics</em> using its <a 
href="https://github.com/nathanmarz/storm/wiki/Trident-tutorial">Trident</a> 
abstraction. This mode uses the same failure detection mechanism as the 
at-least-once mode. Tuples are actually processed at least once, but 
Storm&rsquo;s state implementation allows duplicates to be detected and 
ignored. (The duplicate detection only applies to state managed by Storm. If 
your code has other side-effects, e.g. sending messages to a service outside of 
the topology, it will not have exactly-once semantics.) In this mode, the spout 
breaks the input stream into batches, and processes batches in strictly 
sequential order.</li>
+</ul>
+
+<p>Samza also offers guaranteed delivery &mdash; currently only at-least-once 
delivery, but support for exactly-once semantics is planned. Within each stream 
partition, Samza always processes messages in the order they appear in the 
partition, but there is no guarantee of ordering across different input streams 
or partitions. This model allows Samza to offer at-least-once delivery without 
the overhead of ancestry tracking. In Samza, there would be no performance 
advantage to using at-most-once delivery (i.e. dropping messages on failure), 
which is why we don&rsquo;t offer that mode &mdash; message delivery is always 
guaranteed.</p>
+
+<p>Moreover, because Samza never processes messages in a partition 
out-of-order, it is better suited for handling keyed data. For example, if you 
have a stream of database updates &mdash; where later updates may replace 
earlier updates &mdash; then reordering the messages may change the final 
result. Provided that all updates for the same key appear in the same stream 
partition, Samza is able to guarantee a consistent state.</p>
+
+<h3 id="state-management">State Management</h3>
+
+<p>Storm&rsquo;s lower-level API of bolts does not offer any help for managing 
state in a stream process. A bolt can maintain in-memory state (which is lost 
if that bolt dies), or it can make calls to a remote database to read and write 
state. However, a topology can usually process messages at a much higher rate 
than calls to a remote database can be made, so making a remote call for each 
message quickly becomes a bottleneck.</p>
+
+<p>As part of its higher-level Trident API, Storm offers automatic <a 
href="https://github.com/nathanmarz/storm/wiki/Trident-state">state 
management</a>. It keeps state in memory, and periodically checkpoints it to a 
remote database (e.g. Cassandra) for durability, so the cost of the remote 
database call is amortized over several processed tuples. By maintaining 
metadata alongside the state, Trident is able to achieve exactly-once 
processing semantics &mdash; for example, if you are counting events, this 
mechanism allows the counters to be correct, even when machines fail and tuples 
are replayed.</p>
+
+<p>Storm&rsquo;s approach of caching and batching state changes works well if 
the amount of state in each bolt is fairly small &mdash; perhaps less than 
100kB. That makes it suitable for keeping track of counters, minimum, maximum 
and average values of a metric, and the like. However, if you need to maintain 
a large amount of state, this approach essentially degrades to making a 
database call per processed tuple, with the associated performance cost.</p>
+
+<p>Samza takes a <a href="../container/state-management.html">completely 
different approach</a> to state management. Rather than using a remote database 
for durable storage, each Samza task includes an embedded key-value store, 
located on the same machine. Reads and writes to this store are very fast, even 
when the contents of the store are larger than the available memory. Changes to 
this key-value store are replicated to other machines in the cluster, so that 
if one machine dies, the state of the tasks it was running can be restored on 
another machine.</p>
+
+<p>By co-locating storage and processing on the same machine, Samza is able to 
achieve very high throughput, even when there is a large amount of state. This 
is necessary if you want to perform stateful operations that are not just 
counters. For example, if you want to perform a window join of multiple 
streams, or join a stream with a database table (replicated to Samza through a 
changelog), or group several related messages into a bigger message, then you 
need to maintain so much state that it is much more efficient to keep the state 
local to the task.</p>
+
+<p>A limitation of Samza&rsquo;s state handling is that it currently does not 
support exactly-once semantics &mdash; only at-least-once is supported right 
now. But we&rsquo;re working on fixing that, so stay tuned for updates.</p>
+
+<h3 id="partitioning-and-parallelism">Partitioning and Parallelism</h3>
+
+<p>Storm&rsquo;s <a 
href="https://github.com/nathanmarz/storm/wiki/Understanding-the-parallelism-of-a-Storm-topology">parallelism
 model</a> is fairly similar to Samza&rsquo;s. Both frameworks split processing 
into independent <em>tasks</em> that can run in parallel. Resource allocation 
is independent of the number of tasks: a small job can keep all tasks in a 
single process on a single machine; a large job can spread the tasks over many 
processes on many machines.</p>
+
+<p>The biggest difference is that Storm uses one thread per task by default, 
whereas Samza uses single-threaded processes (containers). A Samza container 
may contain multiple tasks, but there is only one thread that invokes each of 
the tasks in turn. This means each container is mapped to exactly one CPU core, 
which makes the resource model much simpler and reduces interference from other 
tasks running on the same machine. Storm&rsquo;s multithreaded model has the 
benefit of making better use of excess capacity on an idle machine, at 
the cost of a less predictable resource model.</p>
+
+<p>Storm supports <em>dynamic rebalancing</em>, which means adding more 
threads or processes to a topology without restarting the topology or cluster. 
This is a convenient feature, especially during development. We haven&rsquo;t 
added this to Samza: philosophically we feel that this kind of change should go 
through a normal configuration management process (i.e. version control, 
notification, etc.) as it impacts production performance. In other words, the 
code and configuration of the jobs should fully recreate the state of the 
cluster.</p>
+
+<p>When using a transactional spout with Trident (a requirement for achieving 
exactly-once semantics), parallelism is potentially reduced. Trident relies on 
a global ordering in its input streams &mdash; that is, ordering across all 
partitions of a stream, not just within one partition. This means that the 
topology&rsquo;s input stream has to go through a single spout instance, 
effectively ignoring the partitioning of the input stream. This spout may 
become a bottleneck on high-volume streams. In Samza, all stream processing is 
parallel &mdash; there are no such choke points.</p>
+
+<h3 id="deployment-&amp;-execution">Deployment &amp; Execution</h3>
+
+<p>A Storm cluster is composed of a set of nodes running a <em>Supervisor</em> 
daemon. The supervisor daemons talk to a single master node running a daemon 
called <em>Nimbus</em>. The Nimbus daemon is responsible for assigning work and 
managing resources in the cluster. See Storm&rsquo;s <a 
href="https://github.com/nathanmarz/storm/wiki/Tutorial">Tutorial</a> page for 
details. This is quite similar to YARN; though YARN is a bit more fully 
featured and intended to be multi-framework, Nimbus is better integrated with 
Storm.</p>
+
+<p>Yahoo! has also released <a 
href="https://github.com/yahoo/storm-yarn">Storm-YARN</a>. As described in <a 
href="http://developer.yahoo.com/blogs/ydn/storm-yarn-released-open-source-143745133.html">this
 Yahoo! blog post</a>, Storm-YARN is a wrapper that starts a single Storm 
cluster (complete with Nimbus, and Supervisors) inside a YARN grid.</p>
+
+<p>There are a lot of similarities between Storm&rsquo;s Nimbus and 
YARN&rsquo;s ResourceManager, as well as between Storm&rsquo;s Supervisor and 
YARN&rsquo;s Node Managers. Rather than writing our own resource management 
framework, or running a second one inside of YARN, we decided that Samza should 
use YARN directly, as a first-class citizen in the YARN ecosystem. YARN is 
stable, well adopted, fully-featured, and inter-operable with Hadoop. It also 
provides a bunch of nice features like security (user authentication), cgroup 
process isolation, etc.</p>
+
+<p>The YARN support in Samza is pluggable, so you can swap it for a different 
execution framework if you wish.</p>
+
+<h3 id="language-support">Language Support</h3>
+
+<p>Storm is written in Java and Clojure but has good support for non-JVM 
languages. It follows a model similar to MapReduce Streaming: the non-JVM task 
is launched in a separate process, data is sent to its stdin, and output is 
read from its stdout.</p>
+
+<p>Samza is written in Java and Scala. It is built with multi-language support 
in mind, but currently only supports JVM languages.</p>
+
+<h3 id="workflow">Workflow</h3>
+
+<p>Storm provides modeling of <em>topologies</em> (a processing graph of 
multiple stages) <a href="https://github.com/nathanmarz/storm/wiki/Tutorial">in 
code</a>. Trident provides a further <a 
href="https://github.com/nathanmarz/storm/wiki/Trident-tutorial">higher-level 
API</a> on top of this, including familiar relational-like operators such as 
filters, grouping, aggregation and joins. This means the entire topology is 
wired up in one place, which has the advantage that it is documented in code, 
but has the disadvantage that the entire topology needs to be developed and 
deployed as a whole.</p>
+
+<p>In Samza, each job is an independent entity. You can define multiple jobs 
in a single codebase, or you can have separate teams working on different jobs 
using different codebases. Each job is deployed, started and stopped 
independently. Jobs communicate only through named streams, and you can add 
jobs to the system without affecting any other jobs. This makes Samza well 
suited for handling the data flow in a large company.</p>
+
+<p>Samza&rsquo;s approach can be emulated in Storm by connecting two separate 
topologies via a broker, such as Kafka. However, Storm&rsquo;s implementation 
of exactly-once semantics only works within a single topology.</p>
+
+<h3 id="maturity">Maturity</h3>
+
+<p>We can&rsquo;t speak to Storm&rsquo;s maturity, but it has an <a 
href="https://github.com/nathanmarz/storm/wiki/Powered-By">impressive number of 
adopters</a>, a strong feature set, and seems to be under active development. 
It integrates well with many common messaging systems (RabbitMQ, Kestrel, 
Kafka, etc).</p>
+
+<p>Samza is pretty immature, though it builds on solid components. YARN is 
fairly new, but is already being run on 3000+ node clusters at Yahoo!, and the 
project is under active development by both <a 
href="http://hortonworks.com/">Hortonworks</a> and <a 
href="http://www.cloudera.com/content/cloudera/en/home.html">Cloudera</a>. 
Kafka has a strong <a 
href="https://cwiki.apache.org/KAFKA/powered-by.html">powered by</a> page, and 
has seen increased adoption recently. It&rsquo;s also frequently used with 
Storm. Samza is a brand new project that is in use at LinkedIn. Our hope is 
that others will find it useful, and adopt it as well.</p>
+
+<h3 id="buffering-&amp;-latency">Buffering &amp; Latency</h3>
+
+<p>Storm uses <a href="http://zeromq.org/">ZeroMQ</a> for non-durable 
communication between bolts, which enables extremely low latency transmission 
of tuples. Samza does not have an equivalent mechanism, and always writes task 
output to a stream.</p>
+
+<p>On the flip side, when a bolt is trying to send messages using ZeroMQ, and 
the consumer can&rsquo;t read them fast enough, the ZeroMQ buffer in the 
producer&rsquo;s process begins to fill up with messages. If this buffer grows 
too much, the topology&rsquo;s processing timeout may be reached, which causes 
messages to be re-emitted at the spout and makes the problem worse by adding 
even more messages to the buffer. In order to prevent such overflow, you can 
configure a maximum number of messages that can be in flight in the topology at 
any one time; when that threshold is reached, the spout blocks until some of 
the messages in flight are fully processed. This mechanism allows back 
pressure, but requires <a 
href="http://nathanmarz.github.io/storm/doc/backtype/storm/Config.html#TOPOLOGY_MAX_SPOUT_PENDING">topology.max.spout.pending</a>
 to be carefully configured. If a single bolt in a topology starts running 
slow, the processing in the entire topology grinds to a halt.</p>
+
+<p>A lack of a broker between bolts also adds complexity when trying to deal 
with fault tolerance and messaging semantics.  Storm has a <a 
href="https://github.com/nathanmarz/storm/wiki/Guaranteeing-message-processing">clever
 mechanism</a> for detecting tuples that failed to be processed, but Samza 
doesn&rsquo;t need such a mechanism because every input and output stream is 
fault-tolerant and replicated.</p>
+
+<p>Samza takes a different approach to buffering. We buffer to disk at every 
hop between StreamTasks. This decision, and its trade-offs, are described in 
detail on the <a href="introduction.html">Comparison Introduction</a> page. 
This design decision makes durability guarantees easy, and has the advantage of 
allowing the buffer to absorb a large backlog of messages if a job has fallen 
behind in its processing. However, it comes at the price of slightly higher 
latency.</p>
+
+<p>As described in the <em>workflow</em> section above, Samza&rsquo;s approach 
can be emulated in Storm, but comes with a loss in functionality.</p>
+
+<h3 id="isolation">Isolation</h3>
+
+<p>Storm provides standard UNIX process-level isolation. Your topology can 
impact another topology&rsquo;s performance (or vice-versa) if too much CPU, 
disk, network, or memory is used.</p>
+
+<p>Samza relies on YARN to provide resource-level isolation. Currently, YARN 
provides explicit controls for memory and CPU limits (through <a 
href="../yarn/isolation.html">cgroups</a>), and both have been used 
successfully with Samza. No isolation for disk or network is provided by YARN 
at this time.</p>
+
+<h3 id="distributed-rpc">Distributed RPC</h3>
+
+<p>In Storm, you can write topologies which not only accept a stream of fixed 
events, but also allow clients to run distributed computations on demand. The 
query is sent into the topology as a tuple on a special spout, and when the 
topology has computed the answer, it is returned to the client (who was 
synchronously waiting for the answer). This facility is called <a 
href="https://github.com/nathanmarz/storm/wiki/Distributed-RPC">Distributed 
RPC</a> (DRPC).</p>
+
+<p>Samza does not currently have an equivalent API to DRPC, but you can build 
it yourself using Samza&rsquo;s stream processing primitives.</p>
+
+<h3 id="data-model">Data Model</h3>
+
+<p>Storm models all messages as <em>tuples</em> with a defined data model but 
pluggable serialization.</p>
+
+<p>Samza&rsquo;s serialization and data model are both pluggable. We are not 
terribly opinionated about which approach is best.</p>
+
+<h2 id="spark-streaming-&raquo;"><a href="spark-streaming.html">Spark 
Streaming &raquo;</a></h2>
+
+
+          </div>
+        </div>
+
+      </div><!-- /.wrapper-content -->
+    </div><!-- /.wrapper -->
+
+    <div class="footer">
+      <div class="container">
+        <!-- nothing for now. -->
+      </div>
+    </div>
+
+  
+    <script>
+      $( document ).ready(function() {
+        if ( $.fn.urlExists( 
"/learn/documentation/latest/comparisons/storm.html" ) ) {
+          $("#switch-version-button").addClass("fa fa-history masthead-icon");
+        }
+      });
+
+      /* a function to test whether the url exists or not */
+      (function( $ ) {
+        $.fn.urlExists = function(url) {
+          var http = new XMLHttpRequest();
+          http.open('HEAD', url, false);
+          http.send();
+          return http.status != 404;
+        };
+      }( jQuery ));
+    </script>
+  
+
+    <!-- Google Analytics -->
+    <script>
+      // Google Analytics loader: makes window.ga a stub that queues its
+      // arguments in ga.q, records a timestamp in ga.l, then inserts an async
+      // <script> element for analytics.js before the first script on the page.
+      
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+      (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+      
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+      
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+
+      // Queue a 'create' command for property UA-43122768-1 (domain
+      // 'apache.org'), followed by a 'pageview' hit for the current page.
+      ga('create', 'UA-43122768-1', 'apache.org');
+      ga('send', 'pageview');
+
+    </script>
+  </body>
+</html>

svn commit: r1643389 [20/27] - in /incubator/samza/site: ./ archive/ community/ contribute/ img/0.8/ img/0.8/learn/ img/0.8/learn/documentation/ img/0.8/learn/documentation/comparisons/ img/0.8/learn/documentation/container/ img/0.8/learn/documentation...

Reply via email to