Added: samza/site/learn/documentation/0.10/container/samza-container.html
URL: 
http://svn.apache.org/viewvc/samza/site/learn/documentation/0.10/container/samza-container.html?rev=1721445&view=auto
==============================================================================
--- samza/site/learn/documentation/0.10/container/samza-container.html (added)
+++ samza/site/learn/documentation/0.10/container/samza-container.html Tue Dec 
22 19:01:05 2015
@@ -0,0 +1,283 @@
+<!DOCTYPE html>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+    <title>Samza - SamzaContainer</title>
+    <link href='/css/ropa-sans.css' rel='stylesheet' type='text/css'/>
+    <link href="/css/bootstrap.min.css" rel="stylesheet"/>
+    <link href="/css/font-awesome.min.css" rel="stylesheet"/>
+    <link href="/css/main.css" rel="stylesheet"/>
+    <link href="/css/syntax.css" rel="stylesheet"/>
+    <link rel="icon" type="image/png" href="/img/samza-icon.png">
+    <script src="/js/jquery-1.11.1.min.js"></script>
+  </head>
+  <body>
+    <div class="wrapper">
+      <div class="wrapper-content">
+
+        <div class="masthead">
+          <div class="container">
+            <div class="masthead-logo">
+              <a href="/" class="logo">samza</a>
+            </div>
+            <div class="masthead-icons">
+              <div class="pull-right">
+                <a href="/startup/download"><i class="fa 
fa-arrow-circle-o-down masthead-icon"></i></a>
+                <a 
href="https://git-wip-us.apache.org/repos/asf?p=samza.git;a=tree"; 
target="_blank"><i class="fa fa-code masthead-icon" style="font-weight: 
bold;"></i></a>
+                <a href="https://twitter.com/samzastream"; target="_blank"><i 
class="fa fa-twitter masthead-icon"></i></a>
+                <!-- this icon only shows in versioned pages -->
+                
+                  
+                    
+                  
+                  <a 
href="http://samza.apache.org/learn/documentation/latest/container/samza-container.html";><i
 id="switch-version-button"></i></a>
+                   <!-- links for the navigation bar -->
+                
+
+              </div>
+            </div>
+          </div><!-- /.container -->
+        </div>
+
+        <div class="container">
+          <div class="menu">
+            <h1><i class="fa fa-rocket"></i> Getting Started</h1>
+            <ul>
+              <li><a href="/startup/hello-samza/0.10">Hello Samza</a></li>
+              <li><a href="/startup/download">Download</a></li>
+            </ul>
+
+            <h1><i class="fa fa-book"></i> Learn</h1>
+            <ul>
+              <li><a href="/learn/documentation/0.10">Documentation</a></li>
+              <li><a 
href="/learn/documentation/0.10/jobs/configuration-table.html">Configuration</a></li>
+              <li><a 
href="/learn/documentation/0.10/api/javadocs/">Javadocs</a></li>
+              <li><a href="/learn/tutorials/0.10">Tutorials</a></li>
+              <li><a 
href="https://cwiki.apache.org/confluence/display/SAMZA/FAQ";>FAQ</a></li>
+              <li><a 
href="https://cwiki.apache.org/confluence/display/SAMZA/Apache+Samza";>Wiki</a></li>
+              <li><a 
href="https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=51812876";>Papers
 &amp; Talks</a></li>
+              <li><a href="http://blogs.apache.org/samza";>Blog</a></li>
+            </ul>
+
+            <h1><i class="fa fa-comments"></i> Community</h1>
+            <ul>
+              <li><a href="/community/mailing-lists.html">Mailing 
Lists</a></li>
+              <li><a href="/community/irc.html">IRC</a></li>
+              <li><a 
href="https://issues.apache.org/jira/browse/SAMZA";>Bugs</a></li>
+              <li><a 
href="https://cwiki.apache.org/confluence/display/SAMZA/Powered+By";>Powered 
by</a></li>
+              <li><a 
href="https://cwiki.apache.org/confluence/display/SAMZA/Ecosystem";>Ecosystem</a></li>
+              <li><a href="/community/committers.html">Committers</a></li>
+            </ul>
+
+            <h1><i class="fa fa-code"></i> Contribute</h1>
+            <ul>
+              <li><a href="/contribute/rules.html">Rules</a></li>
+              <li><a href="/contribute/coding-guide.html">Coding Guide</a></li>
+              <li><a href="/contribute/projects.html">Projects</a></li>
+              <li><a href="/contribute/design-documents.html">Design 
Documents</a></li>
+              <li><a href="/contribute/code.html">Code</a></li>
+              <li><a href="https://reviews.apache.org/groups/samza";>Review 
Board</a></li>
+              <li><a href="/contribute/tests.html">Tests</a></li>
+            </ul>
+
+            <h1><i class="fa fa-history"></i> Archive</h1>
+            <ul>
+              <li><a href="/archive/index.html#latest">latest</a></li>
+              <li><a href="/archive/index.html#09">0.9</a></li>
+              <li><a href="/archive/index.html#08">0.8</a></li>
+              <li><a href="/archive/index.html#07">0.7</a></li>
+            </ul>
+          </div>
+
+          <div class="content">
+            <!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<h2>SamzaContainer</h2>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<p>The SamzaContainer is responsible for managing the startup, execution, and 
shutdown of one or more <a href="../api/overview.html">StreamTask</a> 
instances. Each SamzaContainer typically runs as an independent Java virtual 
machine. A Samza job can consist of several SamzaContainers, potentially 
running on different machines.</p>
+
+<p>When a SamzaContainer starts up, it does the following:</p>
+
+<ol>
+<li>Get last checkpointed offset for each input stream partition that it 
consumes</li>
+<li>Create a &ldquo;reader&rdquo; thread for every input stream partition that 
it consumes</li>
+<li>Start metrics reporters to report metrics</li>
+<li>Start a checkpoint timer to save your task&rsquo;s input stream offsets 
every so often</li>
+<li>Start a window timer to trigger your task&rsquo;s <a 
href="../api/javadocs/org/apache/samza/task/WindowableTask.html">window 
method</a>, if it is defined</li>
+<li>Instantiate and initialize your StreamTask once for each input stream 
partition</li>
+<li>Start an event loop that takes messages from the input stream reader 
threads, and gives them to your StreamTasks</li>
+<li>Notify lifecycle listeners during each one of these steps</li>
+</ol>
+
+<p>Let&rsquo;s start in the middle, with the instantiation of a StreamTask. 
The following sections of the documentation cover the other steps.</p>
+
+<h3 id="tasks-and-partitions">Tasks and Partitions</h3>
+
+<p>When the container starts, it creates instances of the <a 
href="../api/overview.html">task class</a> that you&rsquo;ve written. If the 
task class implements the <a 
href="../api/javadocs/org/apache/samza/task/InitableTask.html">InitableTask</a> 
interface, the SamzaContainer will also call the init() method.</p>
+
+<div class="highlight"><pre><code class="java"><span class="cm">/** Implement 
this if you want a callback when your task starts up. */</span>
+<span class="kd">public</span> <span class="kd">interface</span> <span 
class="nc">InitableTask</span> <span class="o">{</span>
+  <span class="kt">void</span> <span class="nf">init</span><span 
class="o">(</span><span class="n">Config</span> <span 
class="n">config</span><span class="o">,</span> <span 
class="n">TaskContext</span> <span class="n">context</span><span 
class="o">);</span>
+<span class="o">}</span></code></pre></div>
+
+<p>By default, how many instances of your task class are created depends on 
the number of partitions in the job&rsquo;s input streams. If your Samza job 
has ten partitions, there will be ten instantiations of your task class: one 
for each partition. The first task instance will receive all messages for 
partition one, the second instance will receive all messages for partition two, 
and so on.</p>
+
+<p><img src="/img/0.10/learn/documentation/container/tasks-and-partitions.svg" 
alt="Illustration of tasks consuming partitions" class="diagram-large"></p>
+
+<p>The number of partitions in the input streams is determined by the systems 
from which you are consuming. For example, if your input system is Kafka, you 
can specify the number of partitions when you create a topic from the command 
line or using the num.partitions setting in Kafka&rsquo;s server properties file.</p>
+
+<p>If a Samza job has more than one input stream, the number of task instances 
for the Samza job is the maximum number of partitions across all input streams. 
For example, if a Samza job is reading from PageViewEvent (12 partitions), and 
ServiceMetricEvent (14 partitions), then the Samza job would have 14 task 
instances (numbered 0 through 13). Task instances 12 and 13 only receive events 
from ServiceMetricEvent, because there is no corresponding PageViewEvent 
partition.</p>
+
+<p>With this default approach to assigning input streams to task instances, 
Samza is effectively performing a group-by operation on the input streams with 
their partitions as the key. Other strategies for grouping input stream 
partitions are possible by implementing a new <a 
href="../api/javadocs/org/apache/samza/container/grouper/stream/SystemStreamPartitionGrouper.html">SystemStreamPartitionGrouper</a>
 and factory, and configuring the job to use it via the 
job.systemstreampartition.grouper.factory configuration value.</p>
+
+<p>Samza provides the above-discussed per-partition grouper as well as the 
GroupBySystemStreamPartitionGrouper, which provides a separate task class 
instance for every input stream partition, effectively grouping by the input 
stream itself. This provides maximum scalability in terms of how many 
containers can be used to process those input streams and is appropriate for 
very high volume jobs that need no grouping of the input streams.</p>
+
+<p>Considering the above example of a PageViewEvent partitioned 12 ways and a 
ServiceMetricEvent partitioned 14 ways, the GroupBySystemStreamPartitionGrouper 
would create 12 + 14 = 26 task instances, which would then be distributed 
across the number of containers configured, as discussed below.</p>
+
+<p>Note that once a job has been started using a particular 
SystemStreamPartitionGrouper and that job is using state or checkpointing, it 
is not possible to change that grouping in subsequent job starts, as the 
previous checkpoints and state information would likely be incorrect under the 
new grouping approach.</p>
+
+<h3 id="containers-and-resource-allocation">Containers and resource 
allocation</h3>
+
+<p>Although the number of task instances is fixed &mdash; determined by the 
number of input partitions &mdash; you can configure how many containers you 
want to use for your job. If you are <a href="../jobs/yarn-jobs.html">using 
YARN</a>, the number of containers determines what CPU and memory resources are 
allocated to your job.</p>
+
+<p>If the data volume on your input streams is small, it might be sufficient 
to use just one SamzaContainer. In that case, Samza still creates one task 
instance per input partition, but all those tasks run within the same 
container. At the other extreme, you can create as many containers as you have 
partitions, and Samza will assign one task instance to each container.</p>
+
+<p>Each SamzaContainer is designed to use one CPU core, so it uses a <a 
href="event-loop.html">single-threaded event loop</a> for execution. It&rsquo;s 
not advisable to create your own threads within a SamzaContainer. If you need 
more parallelism, please configure your job to use more containers.</p>
+
+<p>Any <a href="state-management.html">state</a> in your job belongs to a task 
instance, not to a container. This is a key design decision for Samza&rsquo;s 
scalability: as your job&rsquo;s resource requirements grow and shrink, you can 
simply increase or decrease the number of containers, but the number of task 
instances remains unchanged. As you scale up or down, the same state remains 
attached to each task instance. Task instances may be moved from one container 
to another, and any persistent state managed by Samza will be moved with it. 
This allows the job&rsquo;s processing semantics to remain unchanged, even as 
you change the job&rsquo;s parallelism.</p>
+
+<h3 id="joining-multiple-input-streams">Joining multiple input streams</h3>
+
+<p>If your job has multiple input streams, Samza provides a simple but 
powerful mechanism for joining data from different streams: each task instance 
receives messages from one partition of <em>each</em> of the input streams. For 
example, say you have two input streams, A and B, each with four partitions. 
Samza creates four task instances to process them, and assigns the partitions 
as follows:</p>
+
+<table class="table table-condensed table-bordered table-striped">
+  <thead>
+    <tr>
+      <th>Task instance</th>
+      <th>Consumes stream partitions</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td>0</td><td>stream A partition 0, stream B partition 0</td>
+    </tr>
+    <tr>
+      <td>1</td><td>stream A partition 1, stream B partition 1</td>
+    </tr>
+    <tr>
+      <td>2</td><td>stream A partition 2, stream B partition 2</td>
+    </tr>
+    <tr>
+      <td>3</td><td>stream A partition 3, stream B partition 3</td>
+    </tr>
+  </tbody>
+</table>
+
+<p>Thus, if you want two events in different streams to be processed by the 
same task instance, you need to ensure they are sent to the same partition 
number. You can achieve this by using the same partitioning key when <a 
href="../api/overview.html">sending the messages</a>. Joining streams is 
discussed in detail in the <a href="state-management.html">state management</a> 
section.</p>
+
+<p>There is one caveat in all of this: Samza currently assumes that a 
stream&rsquo;s partition count will never change. Partition splitting or 
repartitioning is not supported. If an input stream has N partitions, it is 
expected that it has always had, and will always have N partitions. If you want 
to re-partition a stream, you can write a job that reads messages from the 
stream, and writes them out to a new stream with the required number of 
partitions. For example, you could read messages from PageViewEvent, and write 
them to PageViewEventRepartition.</p>
+
+<h3 id="broadcast-streams">Broadcast Streams</h3>
+
+<p>After 0.10.0, Samza supports broadcast streams. You can assign partitions 
from some streams to all the tasks. For example, if you want all the tasks to 
consume partitions 0 and 1 from a stream called broadcast-stream-1, and 
partition 2 from a stream called broadcast-stream-2, you can configure:</p>
+
+<div class="highlight"><pre><code class="jproperties"><span 
class="na">task.broadcast.inputs</span><span class="o">=</span><span 
class="s">yourSystem.broadcast-stream-1#[0-1], 
yourSystem.broadcast-stream-2#2</span></code></pre></div>
+
+<p>If you use &ldquo;[]&rdquo;, you are specifying a range.</p>
+
+<h2 id="streams-&raquo;"><a href="streams.html">Streams &raquo;</a></h2>
+
+
+          </div>
+        </div>
+
+      </div><!-- /.wrapper-content -->
+    </div><!-- /.wrapper -->
+
+    <div class="footer">
+      <div class="container">
+        <!-- nothing for now. -->
+      </div>
+    </div>
+
+  
+    <script>
+      $( document ).ready(function() {
+        if ( $.fn.urlExists( 
"/learn/documentation/latest/container/samza-container.html" ) ) {
+          $("#switch-version-button").addClass("fa fa-history masthead-icon");
+        }
+      });
+
+      /* a function to test whether the url exists or not */
+      (function( $ ) {
+        $.fn.urlExists = function(url) {
+          var http = new XMLHttpRequest();
+          http.open('HEAD', url, false);
+          http.send();
+          return http.status != 404;
+        };
+      }( jQuery ));
+    </script>
+  
+
+    <!-- Google Analytics -->
+    <script>
+      
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+      (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+      
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+      
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+
+      ga('create', 'UA-43122768-1', 'apache.org');
+      ga('send', 'pageview');
+
+    </script>
+  </body>
+</html>

Added: samza/site/learn/documentation/0.10/container/serialization.html
URL: 
http://svn.apache.org/viewvc/samza/site/learn/documentation/0.10/container/serialization.html?rev=1721445&view=auto
==============================================================================
--- samza/site/learn/documentation/0.10/container/serialization.html (added)
+++ samza/site/learn/documentation/0.10/container/serialization.html Tue Dec 22 
19:01:05 2015
@@ -0,0 +1,289 @@
+<!DOCTYPE html>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+    <title>Samza - Serialization</title>
+    <link href='/css/ropa-sans.css' rel='stylesheet' type='text/css'/>
+    <link href="/css/bootstrap.min.css" rel="stylesheet"/>
+    <link href="/css/font-awesome.min.css" rel="stylesheet"/>
+    <link href="/css/main.css" rel="stylesheet"/>
+    <link href="/css/syntax.css" rel="stylesheet"/>
+    <link rel="icon" type="image/png" href="/img/samza-icon.png">
+    <script src="/js/jquery-1.11.1.min.js"></script>
+  </head>
+  <body>
+    <div class="wrapper">
+      <div class="wrapper-content">
+
+        <div class="masthead">
+          <div class="container">
+            <div class="masthead-logo">
+              <a href="/" class="logo">samza</a>
+            </div>
+            <div class="masthead-icons">
+              <div class="pull-right">
+                <a href="/startup/download"><i class="fa 
fa-arrow-circle-o-down masthead-icon"></i></a>
+                <a 
href="https://git-wip-us.apache.org/repos/asf?p=samza.git;a=tree"; 
target="_blank"><i class="fa fa-code masthead-icon" style="font-weight: 
bold;"></i></a>
+                <a href="https://twitter.com/samzastream"; target="_blank"><i 
class="fa fa-twitter masthead-icon"></i></a>
+                <!-- this icon only shows in versioned pages -->
+                
+                  
+                    
+                  
+                  <a 
href="http://samza.apache.org/learn/documentation/latest/container/serialization.html";><i
 id="switch-version-button"></i></a>
+                   <!-- links for the navigation bar -->
+                
+
+              </div>
+            </div>
+          </div><!-- /.container -->
+        </div>
+
+        <div class="container">
+          <div class="menu">
+            <h1><i class="fa fa-rocket"></i> Getting Started</h1>
+            <ul>
+              <li><a href="/startup/hello-samza/0.10">Hello Samza</a></li>
+              <li><a href="/startup/download">Download</a></li>
+            </ul>
+
+            <h1><i class="fa fa-book"></i> Learn</h1>
+            <ul>
+              <li><a href="/learn/documentation/0.10">Documentation</a></li>
+              <li><a 
href="/learn/documentation/0.10/jobs/configuration-table.html">Configuration</a></li>
+              <li><a 
href="/learn/documentation/0.10/api/javadocs/">Javadocs</a></li>
+              <li><a href="/learn/tutorials/0.10">Tutorials</a></li>
+              <li><a 
href="https://cwiki.apache.org/confluence/display/SAMZA/FAQ";>FAQ</a></li>
+              <li><a 
href="https://cwiki.apache.org/confluence/display/SAMZA/Apache+Samza";>Wiki</a></li>
+              <li><a 
href="https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=51812876";>Papers
 &amp; Talks</a></li>
+              <li><a href="http://blogs.apache.org/samza";>Blog</a></li>
+            </ul>
+
+            <h1><i class="fa fa-comments"></i> Community</h1>
+            <ul>
+              <li><a href="/community/mailing-lists.html">Mailing 
Lists</a></li>
+              <li><a href="/community/irc.html">IRC</a></li>
+              <li><a 
href="https://issues.apache.org/jira/browse/SAMZA";>Bugs</a></li>
+              <li><a 
href="https://cwiki.apache.org/confluence/display/SAMZA/Powered+By";>Powered 
by</a></li>
+              <li><a 
href="https://cwiki.apache.org/confluence/display/SAMZA/Ecosystem";>Ecosystem</a></li>
+              <li><a href="/community/committers.html">Committers</a></li>
+            </ul>
+
+            <h1><i class="fa fa-code"></i> Contribute</h1>
+            <ul>
+              <li><a href="/contribute/rules.html">Rules</a></li>
+              <li><a href="/contribute/coding-guide.html">Coding Guide</a></li>
+              <li><a href="/contribute/projects.html">Projects</a></li>
+              <li><a href="/contribute/design-documents.html">Design 
Documents</a></li>
+              <li><a href="/contribute/code.html">Code</a></li>
+              <li><a href="https://reviews.apache.org/groups/samza";>Review 
Board</a></li>
+              <li><a href="/contribute/tests.html">Tests</a></li>
+            </ul>
+
+            <h1><i class="fa fa-history"></i> Archive</h1>
+            <ul>
+              <li><a href="/archive/index.html#latest">latest</a></li>
+              <li><a href="/archive/index.html#09">0.9</a></li>
+              <li><a href="/archive/index.html#08">0.8</a></li>
+              <li><a href="/archive/index.html#07">0.7</a></li>
+            </ul>
+          </div>
+
+          <div class="content">
+            <!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<h2>Serialization</h2>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<p>Every message that is read from or written to a <a 
href="streams.html">stream</a> or a <a href="state-management.html">persistent 
state store</a> needs to eventually be serialized to bytes (which are sent over 
the network or written to disk). There are various places where that 
serialization and deserialization can happen:</p>
+
+<ol>
+<li>In the client library: for example, the library for publishing to Kafka 
and consuming from Kafka supports pluggable serialization.</li>
+<li>In the task implementation: your <a href="../api/overview.html">process 
method</a> can use raw byte arrays as inputs and outputs, and do any parsing 
and serialization itself.</li>
+<li>Between the two: Samza provides a layer of serializers and deserializers, 
or <em>serdes</em> for short.</li>
+</ol>
+
+<p>You can use whatever makes sense for your job; Samza doesn&rsquo;t impose 
any particular data model or serialization scheme on you. However, the cleanest 
solution is usually to use Samza&rsquo;s serde layer. The following 
configuration example shows how to use it.</p>
+
+<div class="highlight"><pre><code class="jproperties"><span class="c"># Define 
a system called &quot;kafka&quot;</span>
+<span class="na">systems.kafka.samza.factory</span><span 
class="o">=</span><span 
class="s">org.apache.samza.system.kafka.KafkaSystemFactory</span>
+
+<span class="c"># The job is going to consume a topic called 
&quot;PageViewEvent&quot; from the &quot;kafka&quot; system</span>
+<span class="na">task.inputs</span><span class="o">=</span><span 
class="s">kafka.PageViewEvent</span>
+
+<span class="c"># Define a serde called &quot;json&quot; which 
parses/serializes JSON objects</span>
+<span class="na">serializers.registry.json.class</span><span 
class="o">=</span><span 
class="s">org.apache.samza.serializers.JsonSerdeFactory</span>
+
+<span class="c"># Define a serde called &quot;integer&quot; which encodes an 
integer as 4 binary bytes (big-endian)</span>
+<span class="na">serializers.registry.integer.class</span><span 
class="o">=</span><span 
class="s">org.apache.samza.serializers.IntegerSerdeFactory</span>
+
+<span class="c"># For messages in the &quot;PageViewEvent&quot; topic, the key 
(the ID of the user viewing the page)</span>
+<span class="c"># is encoded as a binary integer, and the message is encoded 
as JSON.</span>
+<span 
class="na">systems.kafka.streams.PageViewEvent.samza.key.serde</span><span 
class="o">=</span><span class="s">integer</span>
+<span 
class="na">systems.kafka.streams.PageViewEvent.samza.msg.serde</span><span 
class="o">=</span><span class="s">json</span>
+
+<span class="c"># Define a key-value store which stores the most recent page 
view for each user ID.</span>
+<span class="c"># Again, the key is an integer user ID, and the value is 
JSON.</span>
+<span class="na">stores.LastPageViewPerUser.factory</span><span 
class="o">=</span><span 
class="s">org.apache.samza.storage.kv.KeyValueStorageEngineFactory</span>
+<span class="na">stores.LastPageViewPerUser.changelog</span><span 
class="o">=</span><span class="s">kafka.last-page-view-per-user</span>
+<span class="na">stores.LastPageViewPerUser.key.serde</span><span 
class="o">=</span><span class="s">integer</span>
+<span class="na">stores.LastPageViewPerUser.msg.serde</span><span 
class="o">=</span><span class="s">json</span></code></pre></div>
+
+<p>Each serde is defined with a factory class. Samza comes with several 
builtin serdes for UTF-8 strings, binary-encoded integers, JSON and more. The 
following is a comprehensive list of supported serdes in Samza.</p>
+<style>
+            table th, table td {
+                text-align: left;
+                vertical-align: top;
+                padding: 12px;
+                border-bottom: 1px solid #ccc;
+                border-top: 1px solid #ccc;
+                border-left: 0;
+                border-right: 0;
+            }
+            table td.property, table td.default {
+                white-space: nowrap;
+            }
+            table th {
+                background-color: #eee;
+            }
+</style>
+<table>
+    <tr>
+        <th> Serde Name</th>
+        <th> Data Handled </th>
+    </tr>
+    <tr>
+        <td> string </td>
+        <td> UTF-8 strings </td>
+    </tr>
+    <tr>
+        <td> integer </td>
+        <td> binary-encoded integers </td>
+    </tr>
+    <tr>
+        <td> serializable </td>
+        <td> Serializable Object Type </td>
+    </tr>
+    <tr>
+        <td> long </td>
+        <td> long data type </td>
+    </tr>
+    <tr>
+        <td> json </td>
+        <td> JSON formatted data </td>
+    </tr>
+    <tr>
+        <td> byte </td>
+        <td> Plain Bytes (effectively no-op) - Useful for Binary Messages </td>
+    </tr>
+    <tr>
+        <td> bytebuffer </td>
+        <td> Byte Buffer </td>
+    </tr>
+</table>
+
+<p>You can also create your own serializer by implementing the <a 
href="../api/javadocs/org/apache/samza/serializers/SerdeFactory.html">SerdeFactory</a>
 interface.</p>
+
+<p>The name you give to a serde (such as &ldquo;json&rdquo; and 
&ldquo;integer&rdquo; in the example above) is only for convenience in your job 
configuration; you can choose whatever name you like. For each stream and each 
state store, you can use the serde name to declare how messages should be 
serialized and deserialized.</p>
+
+<p>If you don&rsquo;t declare a serde, Samza simply passes objects through 
between your task instance and the system stream. In that case your task needs 
to send and receive whatever type of object the underlying client library 
uses.</p>
+
+<p>All the Samza APIs for sending and receiving messages are typed as 
<em>Object</em>. This means that you have to cast messages to the correct type 
before you can use them. It&rsquo;s a little bit more code, but it has the 
advantage that Samza is not restricted to any particular data model.</p>
+
+<h2 id="checkpointing-&raquo;"><a href="checkpointing.html">Checkpointing 
&raquo;</a></h2>
+
+
+          </div>
+        </div>
+
+      </div><!-- /.wrapper-content -->
+    </div><!-- /.wrapper -->
+
+    <div class="footer">
+      <div class="container">
+        <!-- nothing for now. -->
+      </div>
+    </div>
+
+  
+    <script>
+      $( document ).ready(function() {
+        if ( $.fn.urlExists( 
"/learn/documentation/latest/container/serialization.html" ) ) {
+          $("#switch-version-button").addClass("fa fa-history masthead-icon");
+        }
+      });
+
+      /* a function to test whether the url exists or not */
+      (function( $ ) {
+        $.fn.urlExists = function(url) {
+          var http = new XMLHttpRequest();
+          http.open('HEAD', url, false);
+          http.send();
+          return http.status != 404;
+        };
+      }( jQuery ));
+    </script>
+  
+
+    <!-- Google Analytics -->
+    <script>
+      
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+      (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+      
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+      
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+
+      ga('create', 'UA-43122768-1', 'apache.org');
+      ga('send', 'pageview');
+
+    </script>
+  </body>
+</html>

Added: samza/site/learn/documentation/0.10/container/state-management.html
URL: 
http://svn.apache.org/viewvc/samza/site/learn/documentation/0.10/container/state-management.html?rev=1721445&view=auto
==============================================================================
--- samza/site/learn/documentation/0.10/container/state-management.html (added)
+++ samza/site/learn/documentation/0.10/container/state-management.html Tue Dec 
22 19:01:05 2015
@@ -0,0 +1,464 @@
+<!DOCTYPE html>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+<html lang="en">
+  <head>
+    <meta charset="utf-8">
+    <title>Samza - State Management</title>
+    <link href='/css/ropa-sans.css' rel='stylesheet' type='text/css'/>
+    <link href="/css/bootstrap.min.css" rel="stylesheet"/>
+    <link href="/css/font-awesome.min.css" rel="stylesheet"/>
+    <link href="/css/main.css" rel="stylesheet"/>
+    <link href="/css/syntax.css" rel="stylesheet"/>
+    <link rel="icon" type="image/png" href="/img/samza-icon.png">
+    <script src="/js/jquery-1.11.1.min.js"></script>
+  </head>
+  <body>
+    <div class="wrapper">
+      <div class="wrapper-content">
+
+        <div class="masthead">
+          <div class="container">
+            <div class="masthead-logo">
+              <a href="/" class="logo">samza</a>
+            </div>
+            <div class="masthead-icons">
+              <div class="pull-right">
+                <a href="/startup/download"><i class="fa 
fa-arrow-circle-o-down masthead-icon"></i></a>
+                <a 
href="https://git-wip-us.apache.org/repos/asf?p=samza.git;a=tree"; 
target="_blank"><i class="fa fa-code masthead-icon" style="font-weight: 
bold;"></i></a>
+                <a href="https://twitter.com/samzastream"; target="_blank"><i 
class="fa fa-twitter masthead-icon"></i></a>
+                <!-- this icon only shows in versioned pages -->
+                
+                  
+                    
+                  
+                  <a 
href="http://samza.apache.org/learn/documentation/latest/container/state-management.html";><i
 id="switch-version-button"></i></a>
+                   <!-- links for the navigation bar -->
+                
+
+              </div>
+            </div>
+          </div><!-- /.container -->
+        </div>
+
+        <div class="container">
+          <div class="menu">
+            <h1><i class="fa fa-rocket"></i> Getting Started</h1>
+            <ul>
+              <li><a href="/startup/hello-samza/0.10">Hello Samza</a></li>
+              <li><a href="/startup/download">Download</a></li>
+            </ul>
+
+            <h1><i class="fa fa-book"></i> Learn</h1>
+            <ul>
+              <li><a href="/learn/documentation/0.10">Documentation</a></li>
+              <li><a 
href="/learn/documentation/0.10/jobs/configuration-table.html">Configuration</a></li>
+              <li><a 
href="/learn/documentation/0.10/api/javadocs/">Javadocs</a></li>
+              <li><a href="/learn/tutorials/0.10">Tutorials</a></li>
+              <li><a 
href="https://cwiki.apache.org/confluence/display/SAMZA/FAQ";>FAQ</a></li>
+              <li><a 
href="https://cwiki.apache.org/confluence/display/SAMZA/Apache+Samza";>Wiki</a></li>
+              <li><a 
href="https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=51812876";>Papers
 &amp; Talks</a></li>
+              <li><a href="http://blogs.apache.org/samza";>Blog</a></li>
+            </ul>
+
+            <h1><i class="fa fa-comments"></i> Community</h1>
+            <ul>
+              <li><a href="/community/mailing-lists.html">Mailing 
Lists</a></li>
+              <li><a href="/community/irc.html">IRC</a></li>
+              <li><a 
href="https://issues.apache.org/jira/browse/SAMZA";>Bugs</a></li>
+              <li><a 
href="https://cwiki.apache.org/confluence/display/SAMZA/Powered+By";>Powered 
by</a></li>
+              <li><a 
href="https://cwiki.apache.org/confluence/display/SAMZA/Ecosystem";>Ecosystem</a></li>
+              <li><a href="/community/committers.html">Committers</a></li>
+            </ul>
+
+            <h1><i class="fa fa-code"></i> Contribute</h1>
+            <ul>
+              <li><a href="/contribute/rules.html">Rules</a></li>
+              <li><a href="/contribute/coding-guide.html">Coding Guide</a></li>
+              <li><a href="/contribute/projects.html">Projects</a></li>
+              <li><a href="/contribute/design-documents.html">Design 
Documents</a></li>
+              <li><a href="/contribute/code.html">Code</a></li>
+              <li><a href="https://reviews.apache.org/groups/samza";>Review 
Board</a></li>
+              <li><a href="/contribute/tests.html">Tests</a></li>
+            </ul>
+
+            <h1><i class="fa fa-history"></i> Archive</h1>
+            <ul>
+              <li><a href="/archive/index.html#latest">latest</a></li>
+              <li><a href="/archive/index.html#09">0.9</a></li>
+              <li><a href="/archive/index.html#08">0.8</a></li>
+              <li><a href="/archive/index.html#07">0.7</a></li>
+            </ul>
+          </div>
+
+          <div class="content">
+            <!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<h2>State Management</h2>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<p>One of the more interesting features of Samza is stateful stream 
processing. Tasks can store and query data through APIs provided by Samza. That 
data is stored on the same machine as the stream task; compared to connecting 
over the network to a remote database, Samza&rsquo;s local state allows you to 
read and write large amounts of data with better performance. Samza replicates 
this state across multiple machines for fault-tolerance (described in detail 
below).</p>
+
+<p>Some stream processing jobs don&rsquo;t require state: if you only need to 
transform one message at a time, or filter out messages based on some 
condition, your job can be simple. Every call to your task&rsquo;s <a 
href="../api/overview.html">process method</a> handles one incoming message, 
and each message is independent of all the other messages.</p>
+
+<p>However, being able to maintain state opens up many possibilities for 
sophisticated stream processing jobs: joining input streams, grouping messages 
and aggregating groups of messages. By analogy to SQL, the <em>select</em> and 
<em>where</em> clauses of a query are usually stateless, but <em>join</em>, 
<em>group by</em> and aggregation functions like <em>sum</em> and 
<em>count</em> require state. Samza doesn&rsquo;t yet provide a higher-level 
SQL-like language, but it does provide lower-level primitives that you can use 
to implement streaming aggregation and joins.</p>
+
+<h3 id="common-use-cases-for-stateful-processing">Common use cases for 
stateful processing</h3>
+
+<p>First, let&rsquo;s look at some simple examples of stateful stream 
processing that might be seen in the backend of a consumer website. Later in 
this page we&rsquo;ll discuss how to implement these applications using 
Samza&rsquo;s built-in key-value storage capabilities.</p>
+
+<h4 id="windowed-aggregation">Windowed aggregation</h4>
+
+<p><em>Example: Counting the number of page views for each user per 
hour</em></p>
+
+<p>In this case, your state typically consists of a number of counters which 
are incremented when a message is processed. The aggregation is typically 
limited to a time window (e.g. 1 minute, 1 hour, 1 day) so that you can observe 
changes of activity over time. This kind of windowed processing is common for 
ranking and relevance, detecting &ldquo;trending topics&rdquo;, as well as 
real-time reporting and monitoring.</p>
+
+<p>The simplest implementation keeps this state in memory (e.g. a hash map in 
the task instances), and writes it to a database or output stream at the end of 
every time window. However, you need to consider what happens when a container 
fails and your in-memory state is lost. You might be able to restore it by 
processing all the messages in the current window again, but that might take a 
long time if the window covers a long period of time. Samza can speed up this 
recovery by making the state fault-tolerant rather than trying to recompute 
it.</p>
+
+<h4 id="table-table-join">Table-table join</h4>
+
+<p><em>Example: Join a table of user profiles to a table of user settings by 
user_id and emit the joined stream</em></p>
+
+<p>You might wonder: does it make sense to join two tables in a stream 
processing system? It does if your database can supply a log of all the changes 
in the database. There is a <a 
href="http://engineering.linkedin.com/distributed-systems/log-what-every-software-engineer-should-know-about-real-time-datas-unifying";>duality
 between a database and a changelog stream</a>: you can publish every data 
change to a stream, and if you consume the entire stream from beginning to end, 
you can reconstruct the entire contents of the database. Samza is designed for 
data processing jobs that follow this philosophy.</p>
+
+<p>If you have changelog streams for several database tables, you can write a 
stream processing job which keeps the latest state of each table in a local 
key-value store, where you can access it much faster than by making queries to 
the original database. Now, whenever data in one table changes, you can join it 
with the latest data for the same key in the other table, and output the joined 
result.</p>
+
+<p>There are several real-life examples of data normalization which 
essentially work in this way:</p>
+
+<ul>
+<li>E-commerce companies like Amazon and EBay need to import feeds of 
merchandise from merchants, normalize them by product, and present products 
with all the associated merchants and pricing information.</li>
+<li>Web search requires building a crawler which creates essentially a <a 
href="http://labs.yahoo.com/files/YahooWebmap.pdf";>table of web page 
contents</a> and joins on all the relevance attributes such as click-through 
ratio or pagerank.</li>
+<li>Social networks take feeds of user-entered text and need to normalize out 
entities such as companies, schools, and skills.</li>
+</ul>
+
+<p>Each of these use cases is a massively complex data normalization problem 
that can be thought of as constructing a materialized view over many input 
tables. Samza can help implement such data processing pipelines robustly.</p>
+
+<h4 id="stream-table-join">Stream-table join</h4>
+
+<p><em>Example: Augment a stream of page view events with the user&rsquo;s ZIP 
code (perhaps to allow aggregation by zip code in a later stage)</em></p>
+
+<p>Joining side-information to a real-time feed is a classic use for stream 
processing. It&rsquo;s particularly common in advertising, relevance ranking, 
fraud detection and other domains. Activity events such as page views generally 
only include a small number of attributes, such as the ID of the viewer and the 
viewed items, but not detailed attributes of the viewer and the viewed items, 
such as the ZIP code of the user. If you want to aggregate the stream by 
attributes of the viewer or the viewed items, you need to join with the users 
table or the items table respectively.</p>
+
+<p>In data warehouse terminology, you can think of the raw event stream as 
rows in the central fact table, which needs to be joined with dimension tables 
so that you can use attributes of the dimensions in your analysis.</p>
+
+<h4 id="stream-stream-join">Stream-stream join</h4>
+
+<p><em>Example: Join a stream of ad clicks to a stream of ad impressions (to 
link the information on when the ad was shown to the information on when it was 
clicked)</em></p>
+
+<p>A stream join is useful for &ldquo;nearly aligned&rdquo; streams, where you 
expect to receive related events on several input streams, and you want to 
combine them into a single output event. You cannot rely on the events arriving 
at the stream processor at the same time, but you can set a maximum period of 
time over which you allow the events to be spread out.</p>
+
+<p>In order to perform a join between streams, your job needs to buffer events 
for the time window over which you want to join. For short time windows, you 
can do this in memory (at the risk of losing events if the machine fails). You 
can also use Samza&rsquo;s state store to buffer events, which supports 
buffering more messages than you can fit in memory.</p>
+
+<h4 id="more">More</h4>
+
+<p>There are many variations of joins and aggregations, but most are 
essentially variations and combinations of the above patterns.</p>
+
+<h3 id="approaches-to-managing-task-state">Approaches to managing task 
state</h3>
+
+<p>So how do systems support this kind of stateful processing? We&rsquo;ll 
lead in by describing what we have seen in other stream processing systems, and 
then describe what Samza does.</p>
+
+<h4 id="in-memory-state-with-checkpointing">In-memory state with 
checkpointing</h4>
+
+<p>A simple approach, common in academic stream processing systems, is to 
periodically save the task&rsquo;s entire in-memory data to durable storage. 
This approach works well if the in-memory state consists of only a few values. 
However, you have to store the complete task state on each checkpoint, which 
becomes increasingly expensive as task state grows. Unfortunately, many 
non-trivial use cases for joins and aggregation have large amounts of state 
&mdash; often many gigabytes. This makes full dumps of the state 
impractical.</p>
+
+<p>Some academic systems produce <em>diffs</em> in addition to full 
checkpoints, which are smaller if only some of the state has changed since the 
last checkpoint. <a href="../comparisons/storm.html">Storm&rsquo;s Trident 
abstraction</a> similarly keeps an in-memory cache of state, and periodically 
writes any changes to a remote store such as Cassandra. However, this 
optimization only helps if most of the state remains unchanged. In some use 
cases, such as stream joins, it is normal to have a lot of churn in the state, 
so this technique essentially degrades to making a remote database request for 
every message (see below).</p>
+
+<h4 id="using-an-external-store">Using an external store</h4>
+
+<p>Another common pattern for stateful processing is to store the state in an 
external database or key-value store. Conventional database replication can be 
used to make that database fault-tolerant. The architecture looks something 
like this:</p>
+
+<p><img src="/img/0.10/learn/documentation/container/stream_job_and_db.png" 
alt="state-kv-store"></p>
+
+<p>Samza allows this style of processing &mdash; there is nothing to stop you 
querying a remote database or service within your job. However, there are a few 
reasons why a remote database can be problematic for stateful stream 
processing:</p>
+
+<ol>
+<li><strong>Performance</strong>: Making database queries over a network is 
slow and expensive. A Kafka stream can deliver hundreds of thousands or even 
millions of messages per second per CPU core to a stream processor, but if you 
need to make a remote request for every message you process, your throughput is 
likely to drop by 2-3 orders of magnitude. You can somewhat mitigate this with 
careful caching of reads and batching of writes, but then you&rsquo;re back to 
the problems of checkpointing, discussed above.</li>
+<li><strong>Isolation</strong>: If your database or service also serves 
requests to users, it can be dangerous to use the same database with a stream 
processor. A scalable stream processing system can run with very high 
throughput, and easily generates a huge amount of load (for example when 
catching up on a queue backlog). If you&rsquo;re not very careful, you may 
cause a denial-of-service attack on your own database, and cause problems for 
interactive requests from users.</li>
+<li><strong>Query Capabilities</strong>: Many scalable databases expose very 
limited query interfaces (e.g. only supporting simple key-value lookups), 
because the equivalent of a &ldquo;full table scan&rdquo; or rich traversal 
would be too expensive. Stream processes are often less latency-sensitive, so 
richer query capabilities would be more feasible.</li>
+<li><strong>Correctness</strong>: When a stream processor fails and needs to 
be restarted, how is the database state made consistent with the processing 
task? For this purpose, some frameworks such as <a 
href="../comparisons/storm.html">Storm</a> attach metadata to database entries, 
but it needs to be handled carefully, otherwise the stream process generates 
incorrect output.</li>
+<li><strong>Reprocessing</strong>: Sometimes it can be useful to re-run a 
stream process on a large amount of historical data, e.g. after updating your 
processing task&rsquo;s code. However, the issues above make this impractical 
for jobs that make external queries.</li>
+</ol>
+
+<h3 id="local-state-in-samza">Local state in Samza</h3>
+
+<p>Samza allows tasks to maintain state in a way that is different from the 
approaches described above:</p>
+
+<ul>
+<li>The state is stored on disk, so the job can maintain more state than would 
fit in memory.</li>
+<li>It is stored on the same machine as the processing task, to avoid the 
performance problems of making database queries over the network.</li>
+<li>Each job has its own datastore, to avoid the isolation problems of a 
shared database (if you make an expensive query, it affects only the current 
task, nobody else).</li>
+<li>Different storage engines can be plugged in, enabling rich query 
capabilities.</li>
+<li>The state is continuously replicated, enabling fault tolerance without the 
problems of checkpointing large amounts of state.</li>
+</ul>
+
+<p>Imagine you take a remote database, partition it to match the number of 
tasks in the stream processing job, and co-locate each partition with its task. 
The result looks like this:</p>
+
+<p><img src="/img/0.10/learn/documentation/container/stateful_job.png" 
alt="state-local"></p>
+
+<p>If a machine fails, all the tasks running on that machine and their 
database partitions are lost. In order to make them highly available, all 
writes to the database partition are replicated to a durable changelog 
(typically Kafka). Now, when a machine fails, we can restart the tasks on 
another machine, and consume this changelog in order to restore the contents of 
the database partition.</p>
+
+<p>Note that each task only has access to its own database partition, not to 
any other task&rsquo;s partition. This is important: when you scale out your 
job by giving it more computing resources, Samza needs to move tasks from one 
machine to another. By giving each task its own state, tasks can be relocated 
without affecting the job&rsquo;s operation. If necessary, you can repartition 
your streams so that all messages for a particular database partition are 
routed to the same task instance.</p>
+
+<p><a href="http://kafka.apache.org/documentation.html#compaction";>Log 
compaction</a> runs in the background on the changelog topic, and ensures that 
the changelog does not grow indefinitely. If you overwrite the same value in 
the store many times, log compaction keeps only the most recent value, and 
throws away any old values in the log. If you delete an item from the store, 
log compaction also removes it from the log. With the right tuning, the 
changelog is not much bigger than the database itself.</p>
+
+<p>With this architecture, Samza allows tasks to maintain large amounts of 
fault-tolerant state, at a performance that is almost as good as a pure 
in-memory implementation. There are just a few limitations:</p>
+
+<ul>
+<li>If you have some data that you want to share between tasks (across 
partition boundaries), you need to go to some additional effort to repartition 
and distribute the data. Each task will need its own copy of the data, so this 
may use more space overall.</li>
+<li>When a container is restarted, it can take some time to restore the data 
in all of its state partitions. The time depends on the amount of data, the 
storage engine, your access patterns, and other factors. As a rule of thumb, 
50&nbsp;MB/sec is a reasonable restore time to expect.</li>
+</ul>
+
+<p>Nothing prevents you from using an external database if you want to, but 
for many use cases, Samza&rsquo;s local state is a powerful tool for enabling 
stateful stream processing.</p>
+
+<h3 id="key-value-storage">Key-value storage</h3>
+
+<p>Any storage engine can be plugged into Samza, as described below. Out of 
the box, Samza ships with a key-value store implementation that is built on <a 
href="http://rocksdb.org";>RocksDB</a> using a <a 
href="https://github.com/facebook/rocksdb/wiki/RocksJava-Basics";>JNI 
API</a>.</p>
+
+<p>RocksDB has several nice properties. Its memory allocation is outside of 
the Java heap, which makes it more memory-efficient and less prone to garbage 
collection pauses than a Java-based storage engine. It is very fast for small 
datasets that fit in memory; datasets larger than memory are slower but still 
possible. It is <a 
href="http://www.igvita.com/2012/02/06/sstable-and-log-structured-storage-leveldb/";>log-structured</a>,
 allowing very fast writes. It also includes support for block compression, 
which helps to reduce I/O and memory usage.</p>
+
+<p>Samza includes an additional in-memory caching layer in front of RocksDB, 
which avoids the cost of deserialization for frequently-accessed objects and 
batches writes. If the same key is updated multiple times in quick succession, 
the batching coalesces those updates into a single write. The writes are 
flushed to the changelog when a task <a 
href="checkpointing.html">commits</a>.</p>
+
+<p>To use a key-value store in your job, add the following to your job 
config:</p>
+
+<div class="highlight"><pre><code class="jproperties"><span class="c"># Use 
the key-value store implementation for a store called 
&quot;my-store&quot;</span>
+<span class="na">stores.my-store.factory</span><span class="o">=</span><span 
class="s">org.apache.samza.storage.kv.RocksDbKeyValueStorageEngineFactory</span>
+
+<span class="c"># Use the Kafka topic &quot;my-store-changelog&quot; as the 
changelog stream for this store.</span>
+<span class="c"># This enables automatic recovery of the store after a 
failure. If you don&#39;t</span>
+<span class="c"># configure this, no changelog stream will be generated.</span>
+<span class="na">stores.my-store.changelog</span><span class="o">=</span><span 
class="s">kafka.my-store-changelog</span>
+
+<span class="c"># Encode keys and values in the store as UTF-8 strings.</span>
+<span class="na">serializers.registry.string.class</span><span 
class="o">=</span><span 
class="s">org.apache.samza.serializers.StringSerdeFactory</span>
+<span class="na">stores.my-store.key.serde</span><span class="o">=</span><span 
class="s">string</span>
+<span class="na">stores.my-store.msg.serde</span><span class="o">=</span><span 
class="s">string</span></code></pre></div>
+
+<p>See the <a href="serialization.html">serialization section</a> for more 
information on the <em>serde</em> options.</p>
+
+<p>Here is a simple example that writes every incoming message to the 
store:</p>
+
+<div class="highlight"><pre><code class="java"><span class="kd">public</span> 
<span class="kd">class</span> <span class="nc">MyStatefulTask</span> <span 
class="kd">implements</span> <span class="n">StreamTask</span><span 
class="o">,</span> <span class="n">InitableTask</span> <span class="o">{</span>
+  <span class="kd">private</span> <span class="n">KeyValueStore</span><span 
class="o">&lt;</span><span class="n">String</span><span class="o">,</span> 
<span class="n">String</span><span class="o">&gt;</span> <span 
class="n">store</span><span class="o">;</span>
+
+  <span class="kd">public</span> <span class="kt">void</span> <span 
class="nf">init</span><span class="o">(</span><span class="n">Config</span> 
<span class="n">config</span><span class="o">,</span> <span 
class="n">TaskContext</span> <span class="n">context</span><span 
class="o">)</span> <span class="o">{</span>
+    <span class="k">this</span><span class="o">.</span><span 
class="na">store</span> <span class="o">=</span> <span class="o">(</span><span 
class="n">KeyValueStore</span><span class="o">&lt;</span><span 
class="n">String</span><span class="o">,</span> <span 
class="n">String</span><span class="o">&gt;)</span> <span 
class="n">context</span><span class="o">.</span><span 
class="na">getStore</span><span class="o">(</span><span 
class="s">&quot;my-store&quot;</span><span class="o">);</span>
+  <span class="o">}</span>
+
+  <span class="kd">public</span> <span class="kt">void</span> <span 
class="nf">process</span><span class="o">(</span><span 
class="n">IncomingMessageEnvelope</span> <span class="n">envelope</span><span 
class="o">,</span>
+                      <span class="n">MessageCollector</span> <span 
class="n">collector</span><span class="o">,</span>
+                      <span class="n">TaskCoordinator</span> <span 
class="n">coordinator</span><span class="o">)</span> <span class="o">{</span>
+    <span class="n">store</span><span class="o">.</span><span 
class="na">put</span><span class="o">((</span><span 
class="n">String</span><span class="o">)</span> <span 
class="n">envelope</span><span class="o">.</span><span 
class="na">getKey</span><span class="o">(),</span> <span 
class="o">(</span><span class="n">String</span><span class="o">)</span> <span 
class="n">envelope</span><span class="o">.</span><span 
class="na">getMessage</span><span class="o">());</span>
+  <span class="o">}</span>
+<span class="o">}</span></code></pre></div>
+
+<p>Here is the complete key-value store API:</p>
+
+<div class="highlight"><pre><code class="java"><span class="kd">public</span> 
<span class="kd">interface</span> <span class="nc">KeyValueStore</span><span 
class="o">&lt;</span><span class="n">K</span><span class="o">,</span> <span 
class="n">V</span><span class="o">&gt;</span> <span class="o">{</span>
+  <span class="n">V</span> <span class="nf">get</span><span 
class="o">(</span><span class="n">K</span> <span class="n">key</span><span 
class="o">);</span>
+  <span class="kt">void</span> <span class="nf">put</span><span 
class="o">(</span><span class="n">K</span> <span class="n">key</span><span 
class="o">,</span> <span class="n">V</span> <span class="n">value</span><span 
class="o">);</span>
+  <span class="kt">void</span> <span class="nf">putAll</span><span 
class="o">(</span><span class="n">List</span><span class="o">&lt;</span><span 
class="n">Entry</span><span class="o">&lt;</span><span class="n">K</span><span 
class="o">,</span><span class="n">V</span><span class="o">&gt;&gt;</span> <span 
class="n">entries</span><span class="o">);</span>
+  <span class="kt">void</span> <span class="nf">delete</span><span 
class="o">(</span><span class="n">K</span> <span class="n">key</span><span 
class="o">);</span>
+  <span class="n">KeyValueIterator</span><span class="o">&lt;</span><span 
class="n">K</span><span class="o">,</span><span class="n">V</span><span 
class="o">&gt;</span> <span class="n">range</span><span class="o">(</span><span 
class="n">K</span> <span class="n">from</span><span class="o">,</span> <span 
class="n">K</span> <span class="n">to</span><span class="o">);</span>
+  <span class="n">KeyValueIterator</span><span class="o">&lt;</span><span 
class="n">K</span><span class="o">,</span><span class="n">V</span><span 
class="o">&gt;</span> <span class="n">all</span><span class="o">();</span>
+<span class="o">}</span></code></pre></div>
+
+<p>Additional configuration properties for the key-value store are documented 
in the <a 
href="../jobs/configuration-table.html#keyvalue-rocksdb">configuration 
reference</a>.</p>
+
+<h3 id="debug-key-value-storage">Debug Key-value storage</h3>
+
+<h4 id="materialize-a-state-store-from-the-changelog">Materialize a state 
store from the changelog</h4>
+
+<p>Currently Samza provides a state storage tool which can recover the state 
store from the changelog stream to a user-specified directory, for reuse and 
debugging.</p>
+
+<div class="highlight"><pre><code 
class="bash">samza-example/target/bin/state-storage-tool.sh <span 
class="se">\</span>
+  --config-path<span class="o">=</span>file:///path/to/job/config.properties 
<span class="se">\</span>
+  --path<span 
class="o">=</span>directory/to/put/state/stores</code></pre></div>
+
+<h4 id="read-the-value-from-a-running-rocksdb">Read the value from a running 
RocksDB</h4>
+
+<p>Samza also provides a tool to read the value from a running job&rsquo;s 
RocksDB.</p>
+
+<div class="highlight"><pre><code 
class="bash">samza-example/target/bin/read-rocksdb-tool.sh <span 
class="se">\</span>
+  --config-path<span class="o">=</span>file:///path/to/job/config.properties 
<span class="se">\</span>
+  --db-path<span 
class="o">=</span>/tmp/nm-local-dir/state/test-state/Partition_0 <span 
class="se">\</span>
+  --db-name<span class="o">=</span><span class="nb">test</span>-state <span 
class="se">\</span>
+  --string-key<span class="o">=</span>a,b,c</code></pre></div>
+
+<ul>
+<li><code>--config-path</code>(required): your job&rsquo;s configuration 
file</li>
+<li><code>--db-path</code>(required): the location of your RocksDB. This is 
convenient if the RocksDB is on the same machine as the tool. E.g. if you are 
running hello-samza on your local machine, the location may be in 
+<em>/tmp/hadoop/nm-local-dir/usercache/username/appcache/applicationId/containerId/state/storeName/PartitionNumber</em></li>
+<li><code>--db-name</code>(required): if you only have one state store 
specified in the config file, you can ignore this one. Otherwise, you need to 
provide the state store name here.</li>
+<li><code>--string-key</code>: the key list. This one only works if your keys 
are strings. There are also two other options: <code>--integer-key</code>, 
<code>--long-key</code>. They work for integer keys and long keys 
respectively.</li>
+</ul>
+
+<p><strong>Limitations</strong>:</p>
+
+<ul>
+<li>This only works with three kinds of keys: string, integer and long. This 
is because we can only accept those kinds of keys from the command line (it is 
really tricky to accept bytes, avro, json, etc. from the command line). But it 
is also easy to use this tool programmatically (the key and value are both 
deserialized).</li>
+</ul>
+
+<div class="highlight"><pre><code class="bash">RocksDbKeyValueReader <span 
class="nv">kvReader</span> <span class="o">=</span> new 
RocksDbKeyValueReader<span class="o">(</span>dbName, pathOfdb, config<span 
class="o">)</span>
+Object <span class="nv">value</span> <span class="o">=</span> 
kvReader.get<span class="o">(</span>key<span 
class="o">)</span></code></pre></div>
+
+<ul>
+<li>Because a Samza job has some caches and buffers, you may not be able to 
see the expected values (or may not be able to see any value at all, if all the 
data is buffered). Some of the related configuration options are 
<code>stores.store-name.container.write.buffer.size.bytes</code>, 
<code>stores.store-name.write.batch.size</code>, 
<code>stores.store-name.object.cache.size</code>. You may want to set them to 
very small values for testing.</li>
+<li>Since the RocksDB memtable is not flushed to disk immediately on every 
write, you may not be able to see the expected values until they are written to 
the SST file on disk. For more details on RocksDB, you can refer to the docs <a 
href="https://github.com/facebook/rocksdb/wiki/RocksDB-Basics";>here</a>.</li>
+</ul>
+
+<h4 id="known-issues">Known Issues</h4>
+
+<p>RocksDB has several rough edges. It&rsquo;s recommended that you read the 
RocksDB <a 
href="https://github.com/facebook/rocksdb/wiki/RocksDB-Tuning-Guide";>tuning 
guide</a>. Some other notes to be aware of are:</p>
+
+<ol>
+<li>RocksDB is heavily optimized to run with SSD hard disks. Performance on 
non-SSDs degrades significantly.</li>
+<li>Samza&rsquo;s KeyValueStorageEngine.putAll() method does not currently use 
RocksDB&rsquo;s batching-put API because it&rsquo;s <a 
href="https://github.com/facebook/rocksdb/issues/262";>non-functional in 
Java</a>.</li>
+<li>Calling iterator.seekToFirst() is very slow <a 
href="https://github.com/facebook/rocksdb/issues/261";>if there are a lot of 
deletes in the store</a>.</li>
+</ol>
+
+<h3 id="implementing-common-use-cases-with-the-key-value-store">Implementing 
common use cases with the key-value store</h3>
+
+<p>Earlier in this section we discussed some example use cases for stateful 
stream processing. Let&rsquo;s look at how each of these could be implemented 
using a key-value storage engine such as Samza&rsquo;s RocksDB store.</p>
+
+<h4 id="windowed-aggregation">Windowed aggregation</h4>
+
+<p><em>Example: Counting the number of page views for each user per 
hour</em></p>
+
+<p>Implementation: You need two processing stages.</p>
+
+<ol>
+<li>The first one re-partitions the input data by user ID, so that all the 
events for a particular user are routed to the same stream task. If the input 
stream is already partitioned by user ID, you can skip this.</li>
+<li>The second stage does the counting, using a key-value store that maps a 
user ID to the running count. For each new event, the job reads the current 
count for the appropriate user from the store, increments it, and writes it 
back. When the window is complete (e.g. at the end of an hour), the job 
iterates over the contents of the store and emits the aggregates to an output 
stream.</li>
+</ol>
+
+<p>Note that this job effectively pauses at the hour mark to output its 
results. This is totally fine for Samza, as scanning over the contents of the 
key-value store is quite fast. The input stream is buffered while the job is 
doing this hourly work.</p>
+
+<h4 id="table-table-join">Table-table join</h4>
+
+<p><em>Example: Join a table of user profiles to a table of user settings by 
user_id and emit the joined stream</em></p>
+
+<p>Implementation: The job subscribes to the change streams for the user 
profiles database and the user settings database, both partitioned by user_id. 
The job keeps a key-value store keyed by user_id, which contains the latest 
profile record and the latest settings record for each user_id. When a new 
event comes in from either stream, the job looks up the current value in its 
store, updates the appropriate fields (depending on whether it was a profile 
update or a settings update), and writes back the new joined record to the 
store. The changelog of the store doubles as the output stream of the task.</p>
+
+<h4 id="table-stream-join">Table-stream join</h4>
+
+<p><em>Example: Augment a stream of page view events with the user&rsquo;s ZIP 
code (perhaps to allow aggregation by zip code in a later stage)</em></p>
+
+<p>Implementation: The job subscribes to the stream of user profile updates 
and the stream of page view events. Both streams must be partitioned by 
user_id. The job maintains a key-value store where the key is the user_id and 
the value is the user&rsquo;s ZIP code. Every time the job receives a profile 
update, it extracts the user&rsquo;s new ZIP code from the profile update and 
writes it to the store. Every time it receives a page view event, it reads the 
zip code for that user from the store, and emits the page view event with an 
added ZIP code field.</p>
+
+<p>If the next stage needs to aggregate by ZIP code, the ZIP code can be used 
as the partitioning key of the job&rsquo;s output stream. That ensures that all 
the events for the same ZIP code are sent to the same stream partition.</p>
+
+<h4 id="stream-stream-join">Stream-stream join</h4>
+
+<p><em>Example: Join a stream of ad clicks to a stream of ad impressions (to 
link the information on when the ad was shown to the information on when it was 
clicked)</em></p>
+
+<p>In this example we assume that each impression of an ad has a unique 
identifier, e.g. a UUID, and that the same identifier is included in both the 
impression and the click events. This identifier is used as the join key.</p>
+
+<p>Implementation: Partition the ad click and ad impression streams by the 
impression ID or user ID (assuming that two events with the same impression ID 
always have the same user ID). The task keeps two stores, one containing click 
events and one containing impression events, using the impression ID as key for 
both stores. When the job receives a click event, it looks for the 
corresponding impression in the impression store, and vice versa. If a match is 
found, the joined pair is emitted and the entry is deleted. If no match is 
found, the event is written to the appropriate store. Periodically the job 
scans over both stores and deletes any old events that were not matched within 
the time window of the join.</p>
+
+<h3 id="other-storage-engines">Other storage engines</h3>
+
+<p>Samza&rsquo;s fault-tolerance mechanism (sending a local store&rsquo;s 
writes to a replicated changelog) is completely decoupled from the storage 
engine&rsquo;s data structures and query APIs. While a key-value storage engine 
is good for general-purpose processing, you can easily add your own storage 
engines for other types of queries by implementing the <a 
href="../api/javadocs/org/apache/samza/storage/StorageEngine.html">StorageEngine</a>
 interface. Samza&rsquo;s model is especially amenable to embedded storage 
engines, which run as a library in the same process as the stream task. </p>
+
+<p>Some ideas for other storage engines that could be useful: a persistent 
heap (for running top-N queries), <a 
href="http://infolab.stanford.edu/%7Eullman/mmds/ch4.pdf";>approximate 
algorithms</a> such as <a 
href="http://en.wikipedia.org/wiki/Bloom_filter";>bloom filters</a> and <a 
href="http://research.google.com/pubs/pub40671.html";>hyperloglog</a>, or 
full-text indexes such as <a href="http://lucene.apache.org";>Lucene</a>. 
(Patches accepted!)</p>
+
+<h3 id="fault-tolerance-semantics-with-state">Fault tolerance semantics with 
state</h3>
+
+<p>As discussed in the section on <a 
href="checkpointing.html">checkpointing</a>, Samza currently only supports 
at-least-once delivery guarantees in the presence of failure (this is sometimes 
referred to as &ldquo;guaranteed delivery&rdquo;). This means that if a task 
fails, no messages are lost, but some messages may be redelivered.</p>
+
+<p>For many of the stateful processing use cases discussed above, this is not 
a problem: if the effect of a message on state is idempotent, it is safe for 
the same message to be processed more than once. For example, if the store 
contains the ZIP code for each user, then processing the same profile update 
twice has no effect, because the duplicate update does not change the ZIP 
code.</p>
+
+<p>However, for non-idempotent operations such as counting, at-least-once 
delivery guarantees can give incorrect results. If a Samza task fails and is 
restarted, it may double-count some messages that were processed shortly before 
the failure. We are planning to address this limitation in a future release of 
Samza.</p>
+
+<h2 id="metrics-&raquo;"><a href="metrics.html">Metrics &raquo;</a></h2>
+
+
+          </div>
+        </div>
+
+      </div><!-- /.wrapper-content -->
+    </div><!-- /.wrapper -->
+
+    <div class="footer">
+      <div class="container">
+        <!-- nothing for now. -->
+      </div>
+    </div>
+
+  
+    <script>
+      $( document ).ready(function() {
+        if ( $.fn.urlExists( 
"/learn/documentation/latest/container/state-management.html" ) ) {
+          $("#switch-version-button").addClass("fa fa-history masthead-icon");
+        }
+      });
+
+      /* a function to test whether the url exists or not */
+      (function( $ ) {
+        $.fn.urlExists = function(url) {
+          var http = new XMLHttpRequest();
+          http.open('HEAD', url, false);
+          http.send();
+          return http.status != 404;
+        };
+      }( jQuery ));
+    </script>
+  
+
+    <!-- Google Analytics -->
+    <script>
+      // Standard Google Analytics (analytics.js) bootstrap snippet, minified
+      // by Google: it defines window.ga as a command queue and injects the
+      // analytics.js script tag asynchronously. Left byte-for-byte as
+      // shipped; do not reformat.
+      
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+      (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new 
Date();a=s.createElement(o),
+      
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+      
})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
+
+      // Register the site's tracker and record one pageview for this page.
+      ga('create', 'UA-43122768-1', 'apache.org');
+      ga('send', 'pageview');
+
+    </script>
+  </body>
+</html>


Reply via email to