Added: samza/site/learn/documentation/0.10/container/streams.html URL: http://svn.apache.org/viewvc/samza/site/learn/documentation/0.10/container/streams.html?rev=1721445&view=auto ============================================================================== --- samza/site/learn/documentation/0.10/container/streams.html (added) +++ samza/site/learn/documentation/0.10/container/streams.html Tue Dec 22 19:01:05 2015 @@ -0,0 +1,299 @@ +<!DOCTYPE html> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<html lang="en"> + <head> + <meta charset="utf-8"> + <title>Samza - Streams</title> + <link href='/css/ropa-sans.css' rel='stylesheet' type='text/css'/> + <link href="/css/bootstrap.min.css" rel="stylesheet"/> + <link href="/css/font-awesome.min.css" rel="stylesheet"/> + <link href="/css/main.css" rel="stylesheet"/> + <link href="/css/syntax.css" rel="stylesheet"/> + <link rel="icon" type="image/png" href="/img/samza-icon.png"> + <script src="/js/jquery-1.11.1.min.js"></script> + </head> + <body> + <div class="wrapper"> + <div class="wrapper-content"> + + <div class="masthead"> + <div class="container"> + <div class="masthead-logo"> + <a href="/" class="logo">samza</a> + </div> + <div class="masthead-icons"> + <div class="pull-right"> + <a href="/startup/download"><i class="fa fa-arrow-circle-o-down masthead-icon"></i></a> + <a href="https://git-wip-us.apache.org/repos/asf?p=samza.git;a=tree" target="_blank"><i class="fa fa-code masthead-icon" style="font-weight: bold;"></i></a> + <a href="https://twitter.com/samzastream" target="_blank"><i class="fa fa-twitter masthead-icon"></i></a> + <!-- this icon only shows in versioned pages --> + + + + + <a href="http://samza.apache.org/learn/documentation/latest/container/streams.html"><i id="switch-version-button"></i></a> + <!-- links for the navigation bar --> + + + </div> + </div> + </div><!-- /.container --> + </div> + + <div class="container"> + <div class="menu"> + <h1><i class="fa fa-rocket"></i> Getting Started</h1> + <ul> + <li><a href="/startup/hello-samza/0.10">Hello Samza</a></li> + <li><a href="/startup/download">Download</a></li> + </ul> + + <h1><i class="fa fa-book"></i> Learn</h1> + <ul> + <li><a href="/learn/documentation/0.10">Documentation</a></li> + <li><a href="/learn/documentation/0.10/jobs/configuration-table.html">Configuration</a></li> + <li><a href="/learn/documentation/0.10/api/javadocs/">Javadocs</a></li> + <li><a href="/learn/tutorials/0.10">Tutorials</a></li> + <li><a href="https://cwiki.apache.org/confluence/display/SAMZA/FAQ">FAQ</a></li> + <li><a href="https://cwiki.apache.org/confluence/display/SAMZA/Apache+Samza">Wiki</a></li> + <li><a href="https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=51812876">Papers & Talks</a></li> + <li><a href="http://blogs.apache.org/samza">Blog</a></li> + </ul> + + <h1><i class="fa fa-comments"></i> Community</h1> + <ul> + <li><a href="/community/mailing-lists.html">Mailing Lists</a></li> + <li><a href="/community/irc.html">IRC</a></li> + <li><a href="https://issues.apache.org/jira/browse/SAMZA">Bugs</a></li> + <li><a href="https://cwiki.apache.org/confluence/display/SAMZA/Powered+By">Powered by</a></li> + <li><a href="https://cwiki.apache.org/confluence/display/SAMZA/Ecosystem">Ecosystem</a></li> + <li><a href="/community/committers.html">Committers</a></li> + </ul> + + <h1><i class="fa fa-code"></i> Contribute</h1> + <ul> + <li><a href="/contribute/rules.html">Rules</a></li> + <li><a href="/contribute/coding-guide.html">Coding Guide</a></li> + <li><a href="/contribute/projects.html">Projects</a></li> + <li><a href="/contribute/design-documents.html">Design Documents</a></li> + <li><a href="/contribute/code.html">Code</a></li> + <li><a href="https://reviews.apache.org/groups/samza">Review Board</a></li> + <li><a href="/contribute/tests.html">Tests</a></li> + </ul> + + <h1><i class="fa fa-history"></i> Archive</h1> + <ul> + <li><a href="/archive/index.html#latest">latest</a></li> + <li><a href="/archive/index.html#09">0.9</a></li> + <li><a href="/archive/index.html#08">0.8</a></li> + <li><a href="/archive/index.html#07">0.7</a></li> + </ul> + </div> + + <div class="content"> + <!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<h2>Streams</h2> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<p>The <a href="samza-container.html">samza container</a> reads and writes messages using the <a href="../api/javadocs/org/apache/samza/system/SystemConsumer.html">SystemConsumer</a> and <a href="../api/javadocs/org/apache/samza/system/SystemProducer.html">SystemProducer</a> interfaces. You can integrate any message broker with Samza by implementing these two interfaces.</p> + +<div class="highlight"><pre><code class="java"><span class="kd">public</span> <span class="kd">interface</span> <span class="nc">SystemConsumer</span> <span class="o">{</span> + <span class="kt">void</span> <span class="nf">start</span><span class="o">();</span> + + <span class="kt">void</span> <span class="nf">stop</span><span class="o">();</span> + + <span class="kt">void</span> <span class="nf">register</span><span class="o">(</span> + <span class="n">SystemStreamPartition</span> <span class="n">systemStreamPartition</span><span class="o">,</span> + <span class="n">String</span> <span class="n">lastReadOffset</span><span class="o">);</span> + + <span class="n">List</span><span class="o"><</span><span class="n">IncomingMessageEnvelope</span><span class="o">></span> <span class="nf">poll</span><span class="o">(</span> + <span class="n">Map</span><span class="o"><</span><span class="n">SystemStreamPartition</span><span class="o">,</span> <span class="n">Integer</span><span class="o">></span> <span class="n">systemStreamPartitions</span><span class="o">,</span> + <span class="kt">long</span> <span class="n">timeout</span><span class="o">)</span> + <span class="kd">throws</span> <span class="n">InterruptedException</span><span class="o">;</span> +<span class="o">}</span> + +<span class="kd">public</span> <span class="kd">class</span> <span class="nc">IncomingMessageEnvelope</span> <span class="o">{</span> + <span class="kd">public</span> <span class="n">Object</span> <span class="nf">getMessage</span><span class="o">()</span> <span class="o">{</span> <span class="o">...</span> <span class="o">}</span> + + <span class="kd">public</span> <span class="n">Object</span> <span class="nf">getKey</span><span class="o">()</span> <span class="o">{</span> <span class="o">...</span> <span class="o">}</span> + + <span class="kd">public</span> <span class="n">SystemStreamPartition</span> <span class="nf">getSystemStreamPartition</span><span class="o">()</span> <span class="o">{</span> <span class="o">...</span> <span class="o">}</span> +<span class="o">}</span> + +<span class="kd">public</span> <span class="kd">interface</span> <span class="nc">SystemProducer</span> <span class="o">{</span> + <span class="kt">void</span> <span class="nf">start</span><span class="o">();</span> + + <span class="kt">void</span> <span class="nf">stop</span><span class="o">();</span> + + <span class="kt">void</span> <span class="nf">register</span><span class="o">(</span><span class="n">String</span> <span class="n">source</span><span class="o">);</span> + + <span class="kt">void</span> <span class="nf">send</span><span class="o">(</span><span class="n">String</span> <span class="n">source</span><span class="o">,</span> <span class="n">OutgoingMessageEnvelope</span> <span class="n">envelope</span><span class="o">);</span> + + <span class="kt">void</span> <span class="nf">flush</span><span class="o">(</span><span class="n">String</span> <span class="n">source</span><span class="o">);</span> +<span class="o">}</span> + +<span class="kd">public</span> <span class="kd">class</span> <span class="nc">OutgoingMessageEnvelope</span> <span class="o">{</span> + <span class="o">...</span> + <span class="kd">public</span> <span class="n">Object</span> <span class="nf">getKey</span><span class="o">()</span> <span class="o">{</span> <span class="o">...</span> <span class="o">}</span> + + <span class="kd">public</span> <span class="n">Object</span> <span class="nf">getMessage</span><span class="o">()</span> <span class="o">{</span> <span class="o">...</span> <span class="o">}</span> +<span class="o">}</span></code></pre></div> + +<p>Out of the box, Samza supports Kafka (KafkaSystemConsumer and KafkaSystemProducer). However, any message bus system can be plugged in, as long as it can provide the semantics required by Samza, as described in the <a href="../api/javadocs/org/apache/samza/system/SystemConsumer.html">javadoc</a>.</p> + +<p>SystemConsumers and SystemProducers may read and write messages of any data type. It’s ok if they only support byte arrays — Samza has a separate <a href="serialization.html">serialization layer</a> which converts to and from objects that application code can use. Samza does not prescribe any particular data model or serialization format.</p> + +<p>The job configuration file can include properties that are specific to a particular consumer and producer implementation. For example, the configuration would typically indicate the hostname and port of the message broker to use, and perhaps connection options.</p> + +<h3 id="how-streams-are-processed">How streams are processed</h3> + +<p>If a job is consuming messages from more than one input stream, and all input streams have messages available, messages are processed in a round robin fashion by default. For example, if a job is consuming AdImpressionEvent and AdClickEvent, the task instance’s process() method is called with a message from AdImpressionEvent, then a message from AdClickEvent, then another message from AdImpressionEvent, … and continues to alternate between the two.</p> + +<p>If one of the input streams has no new messages available (the most recent message has already been consumed), that stream is skipped, and the job continues to consume from the other inputs. It continues to check for new messages becoming available.</p> + +<h4 id="messagechooser">MessageChooser</h4> + +<p>When a Samza container has several incoming messages on different stream partitions, how does it decide which to process first? The behavior is determined by a <a href="../api/javadocs/org/apache/samza/system/chooser/MessageChooser.html">MessageChooser</a>. The default chooser is RoundRobinChooser, but you can override it by implementing a custom chooser.</p> + +<p>To plug in your own message chooser, you need to implement the <a href="../api/javadocs/org/apache/samza/system/chooser/MessageChooserFactory.html">MessageChooserFactory</a> interface, and set the “task.chooser.class” configuration to the fully-qualified class name of your implementation:</p> + +<div class="highlight"><pre><code class="jproperties"><span class="na">task.chooser.class</span><span class="o">=</span><span class="s">com.example.samza.YourMessageChooserFactory</span></code></pre></div> + +<h4 id="prioritizing-input-streams">Prioritizing input streams</h4> + +<p>There are certain times when messages from one stream should be processed with higher priority than messages from another stream. For example, some Samza jobs consume two streams: one stream is fed by a real-time system and the other stream is fed by a batch system. In this case, it’s useful to prioritize the real-time stream over the batch stream, so that the real-time processing doesn’t slow down if there is a sudden burst of data on the batch stream.</p> + +<p>Samza provides a mechanism to prioritize one stream over another by setting this configuration parameter: systems.<system>.streams.<stream>.samza.priority=<number>. For example:</p> + +<div class="highlight"><pre><code class="jproperties"><span class="na">systems.kafka.streams.my-real-time-stream.samza.priority</span><span class="o">=</span><span class="s">2</span> +<span class="na">systems.kafka.streams.my-batch-stream.samza.priority</span><span class="o">=</span><span class="s">1</span></code></pre></div> + +<p>This declares that my-real-time-stream’s messages should be processed with higher priority than my-batch-stream’s messages. If my-real-time-stream has any messages available, they are processed first. Only if there are no messages currently waiting on my-real-time-stream, the Samza job continues processing my-batch-stream.</p> + +<p>Each priority level gets its own MessageChooser. It is valid to define two streams with the same priority. If messages are available from two streams at the same priority level, it’s up to the MessageChooser for that priority level to decide which message should be processed first.</p> + +<p>It’s also valid to only define priorities for some streams. All non-prioritized streams are treated as the lowest priority, and share a MessageChooser.</p> + +<h4 id="bootstrapping">Bootstrapping</h4> + +<p>Sometimes, a Samza job needs to fully consume a stream (from offset 0 up to the most recent message) before it processes messages from any other stream. This is useful in situations where the stream contains some prerequisite data that the job needs, and it doesn’t make sense to process messages from other streams until the job has loaded that prerequisite data. Samza supports this use case with <em>bootstrap streams</em>.</p> + +<p>A bootstrap stream seems similar to a stream with a high priority, but is subtly different. Before allowing any other stream to be processed, a bootstrap stream waits for the consumer to explicitly confirm that the stream has been fully consumed. Until then, the bootstrap stream is the exclusive input to the job: even if a network issue or some other factor causes the bootstrap stream consumer to slow down, other inputs can’t sneak their messages in.</p> + +<p>Another difference between a bootstrap stream and a high-priority stream is that the bootstrap stream’s special treatment is temporary: when it has been fully consumed (we say it has “caught up”), its priority drops to be the same as all the other input streams.</p> + +<p>To configure a stream called “my-bootstrap-stream” to be a fully-consumed bootstrap stream, use the following settings:</p> + +<div class="highlight"><pre><code class="jproperties"><span class="na">systems.kafka.streams.my-bootstrap-stream.samza.bootstrap</span><span class="o">=</span><span class="s">true</span> +<span class="na">systems.kafka.streams.my-bootstrap-stream.samza.reset.offset</span><span class="o">=</span><span class="s">true</span> +<span class="na">systems.kafka.streams.my-bootstrap-stream.samza.offset.default</span><span class="o">=</span><span class="s">oldest</span></code></pre></div> + +<p>The bootstrap=true parameter enables the bootstrap behavior (prioritization over other streams). The combination of reset.offset=true and offset.default=oldest tells Samza to always start reading the stream from the oldest offset, every time a container starts up (rather than starting to read from the most recent checkpoint).</p> + +<p>It is valid to define multiple bootstrap streams. In this case, the order in which they are bootstrapped is determined by the priority.</p> + +<h4 id="batching">Batching</h4> + +<p>In some cases, you can improve performance by consuming several messages from the same stream partition in sequence. Samza supports this mode of operation, called <em>batching</em>.</p> + +<p>For example, if you want to read 100 messages in a row from each stream partition (regardless of the MessageChooser), you can use this configuration parameter:</p> + +<div class="highlight"><pre><code class="jproperties"><span class="na">task.consumer.batch.size</span><span class="o">=</span><span class="s">100</span></code></pre></div> + +<p>With this setting, Samza tries to read a message from the most recently used <a href="../api/javadocs/org/apache/samza/system/SystemStreamPartition.html">SystemStreamPartition</a>. This behavior continues either until no more messages are available for that SystemStreamPartition, or until the batch size has been reached. When that happens, Samza defers to the MessageChooser to determine the next message to process. It then again tries to continue consume from the chosen message’s SystemStreamPartition until the batch size is reached.</p> + +<h2 id="serialization-»"><a href="serialization.html">Serialization »</a></h2> + + + </div> + </div> + + </div><!-- /.wrapper-content --> + </div><!-- /.wrapper --> + + <div class="footer"> + <div class="container"> + <!-- nothing for now. --> + </div> + </div> + + + <script> + $( document ).ready(function() { + if ( $.fn.urlExists( "/learn/documentation/latest/container/streams.html" ) ) { + $("#switch-version-button").addClass("fa fa-history masthead-icon"); + } + }); + + /* a function to test whether the url exists or not */ + (function( $ ) { + $.fn.urlExists = function(url) { + var http = new XMLHttpRequest(); + http.open('HEAD', url, false); + http.send(); + return http.status != 404; + }; + }( jQuery )); + </script> + + + <!-- Google Analytics --> + <script> + (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ + (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), + m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) + })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); + + ga('create', 'UA-43122768-1', 'apache.org'); + ga('send', 'pageview'); + + </script> + </body> +</html>
Added: samza/site/learn/documentation/0.10/container/windowing.html URL: http://svn.apache.org/viewvc/samza/site/learn/documentation/0.10/container/windowing.html?rev=1721445&view=auto ============================================================================== --- samza/site/learn/documentation/0.10/container/windowing.html (added) +++ samza/site/learn/documentation/0.10/container/windowing.html Tue Dec 22 19:01:05 2015 @@ -0,0 +1,227 @@ +<!DOCTYPE html> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<html lang="en"> + <head> + <meta charset="utf-8"> + <title>Samza - Windowing</title> + <link href='/css/ropa-sans.css' rel='stylesheet' type='text/css'/> + <link href="/css/bootstrap.min.css" rel="stylesheet"/> + <link href="/css/font-awesome.min.css" rel="stylesheet"/> + <link href="/css/main.css" rel="stylesheet"/> + <link href="/css/syntax.css" rel="stylesheet"/> + <link rel="icon" type="image/png" href="/img/samza-icon.png"> + <script src="/js/jquery-1.11.1.min.js"></script> + </head> + <body> + <div class="wrapper"> + <div class="wrapper-content"> + + <div class="masthead"> + <div class="container"> + <div class="masthead-logo"> + <a href="/" class="logo">samza</a> + </div> + <div class="masthead-icons"> + <div class="pull-right"> + <a href="/startup/download"><i class="fa fa-arrow-circle-o-down masthead-icon"></i></a> + <a href="https://git-wip-us.apache.org/repos/asf?p=samza.git;a=tree" target="_blank"><i class="fa fa-code masthead-icon" style="font-weight: bold;"></i></a> + <a href="https://twitter.com/samzastream" target="_blank"><i class="fa fa-twitter masthead-icon"></i></a> + <!-- this icon only shows in versioned pages --> + + + + + <a href="http://samza.apache.org/learn/documentation/latest/container/windowing.html"><i id="switch-version-button"></i></a> + <!-- links for the navigation bar --> + + + </div> + </div> + </div><!-- /.container --> + </div> + + <div class="container"> + <div class="menu"> + <h1><i class="fa fa-rocket"></i> Getting Started</h1> + <ul> + <li><a href="/startup/hello-samza/0.10">Hello Samza</a></li> + <li><a href="/startup/download">Download</a></li> + </ul> + + <h1><i class="fa fa-book"></i> Learn</h1> + <ul> + <li><a href="/learn/documentation/0.10">Documentation</a></li> + <li><a href="/learn/documentation/0.10/jobs/configuration-table.html">Configuration</a></li> + <li><a href="/learn/documentation/0.10/api/javadocs/">Javadocs</a></li> + <li><a href="/learn/tutorials/0.10">Tutorials</a></li> + <li><a href="https://cwiki.apache.org/confluence/display/SAMZA/FAQ">FAQ</a></li> + <li><a href="https://cwiki.apache.org/confluence/display/SAMZA/Apache+Samza">Wiki</a></li> + <li><a href="https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=51812876">Papers & Talks</a></li> + <li><a href="http://blogs.apache.org/samza">Blog</a></li> + </ul> + + <h1><i class="fa fa-comments"></i> Community</h1> + <ul> + <li><a href="/community/mailing-lists.html">Mailing Lists</a></li> + <li><a href="/community/irc.html">IRC</a></li> + <li><a href="https://issues.apache.org/jira/browse/SAMZA">Bugs</a></li> + <li><a href="https://cwiki.apache.org/confluence/display/SAMZA/Powered+By">Powered by</a></li> + <li><a href="https://cwiki.apache.org/confluence/display/SAMZA/Ecosystem">Ecosystem</a></li> + <li><a href="/community/committers.html">Committers</a></li> + </ul> + + <h1><i class="fa fa-code"></i> Contribute</h1> + <ul> + <li><a href="/contribute/rules.html">Rules</a></li> + <li><a href="/contribute/coding-guide.html">Coding Guide</a></li> + <li><a href="/contribute/projects.html">Projects</a></li> + <li><a href="/contribute/design-documents.html">Design Documents</a></li> + <li><a href="/contribute/code.html">Code</a></li> + <li><a href="https://reviews.apache.org/groups/samza">Review Board</a></li> + <li><a href="/contribute/tests.html">Tests</a></li> + </ul> + + <h1><i class="fa fa-history"></i> Archive</h1> + <ul> + <li><a href="/archive/index.html#latest">latest</a></li> + <li><a href="/archive/index.html#09">0.9</a></li> + <li><a href="/archive/index.html#08">0.8</a></li> + <li><a href="/archive/index.html#07">0.7</a></li> + </ul> + </div> + + <div class="content"> + <!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<h2>Windowing</h2> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<p>Sometimes a stream processing job needs to do something in regular time intervals, regardless of how many incoming messages the job is processing. For example, say you want to report the number of page views per minute. To do this, you increment a counter every time you see a page view event. Once per minute, you send the current counter value to an output stream and reset the counter to zero.</p> + +<p>Samza’s <em>windowing</em> feature provides a way for tasks to do something in regular time intervals, for example once per minute. To enable windowing, you just need to set one property in your job configuration:</p> + +<div class="highlight"><pre><code class="jproperties"><span class="c"># Call the window() method every 60 seconds</span> +<span class="na">task.window.ms</span><span class="o">=</span><span class="s">60000</span></code></pre></div> + +<p>Next, your stream task needs to implement the <a href="../api/javadocs/org/apache/samza/task/WindowableTask.html">WindowableTask</a> interface. This interface defines a window() method which is called by Samza in the regular interval that you configured.</p> + +<p>For example, this is how you would implement a basic per-minute event counter:</p> + +<div class="highlight"><pre><code class="java"><span class="kd">public</span> <span class="kd">class</span> <span class="nc">EventCounterTask</span> <span class="kd">implements</span> <span class="n">StreamTask</span><span class="o">,</span> <span class="n">WindowableTask</span> <span class="o">{</span> + + <span class="kd">public</span> <span class="kd">static</span> <span class="kd">final</span> <span class="n">SystemStream</span> <span class="n">OUTPUT_STREAM</span> <span class="o">=</span> + <span class="k">new</span> <span class="nf">SystemStream</span><span class="o">(</span><span class="s">"kafka"</span><span class="o">,</span> <span class="s">"events-per-minute"</span><span class="o">);</span> + + <span class="kd">private</span> <span class="kt">int</span> <span class="n">eventsSeen</span> <span class="o">=</span> <span class="mi">0</span><span class="o">;</span> + + <span class="kd">public</span> <span class="kt">void</span> <span class="nf">process</span><span class="o">(</span><span class="n">IncomingMessageEnvelope</span> <span class="n">envelope</span><span class="o">,</span> + <span class="n">MessageCollector</span> <span class="n">collector</span><span class="o">,</span> + <span class="n">TaskCoordinator</span> <span class="n">coordinator</span><span class="o">)</span> <span class="o">{</span> + <span class="n">eventsSeen</span><span class="o">++;</span> + <span class="o">}</span> + + <span class="kd">public</span> <span class="kt">void</span> <span class="nf">window</span><span class="o">(</span><span class="n">MessageCollector</span> <span class="n">collector</span><span class="o">,</span> + <span class="n">TaskCoordinator</span> <span class="n">coordinator</span><span class="o">)</span> <span class="o">{</span> + <span class="n">collector</span><span class="o">.</span><span class="na">send</span><span class="o">(</span><span class="k">new</span> <span class="n">OutgoingMessageEnvelope</span><span class="o">(</span><span class="n">OUTPUT_STREAM</span><span class="o">,</span> <span class="n">eventsSeen</span><span class="o">));</span> + <span class="n">eventsSeen</span> <span class="o">=</span> <span class="mi">0</span><span class="o">;</span> + <span class="o">}</span> +<span class="o">}</span></code></pre></div> + +<p>If you need to send messages to output streams, you can use the <a href="../api/javadocs/org/apache/samza/task/MessageCollector.html">MessageCollector</a> object passed to the window() method. Please only use that MessageCollector object for sending messages, and don’t use it outside of the call to window().</p> + +<p>Note that Samza uses <a href="event-loop.html">single-threaded execution</a>, so the window() call can never happen concurrently with a process() call. This has the advantage that you don’t need to worry about thread safety in your code (no need to synchronize anything), but the downside that the window() call may be delayed if your process() method takes a long time to return.</p> + +<h2 id="event-loop-»"><a href="event-loop.html">Event Loop »</a></h2> + + + </div> + </div> + + </div><!-- /.wrapper-content --> + </div><!-- /.wrapper --> + + <div class="footer"> + <div class="container"> + <!-- nothing for now. --> + </div> + </div> + + + <script> + $( document ).ready(function() { + if ( $.fn.urlExists( "/learn/documentation/latest/container/windowing.html" ) ) { + $("#switch-version-button").addClass("fa fa-history masthead-icon"); + } + }); + + /* a function to test whether the url exists or not */ + (function( $ ) { + $.fn.urlExists = function(url) { + var http = new XMLHttpRequest(); + http.open('HEAD', url, false); + http.send(); + return http.status != 404; + }; + }( jQuery )); + </script> + + + <!-- Google Analytics --> + <script> + (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ + (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), + m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) + })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); + + ga('create', 'UA-43122768-1', 'apache.org'); + ga('send', 'pageview'); + + </script> + </body> +</html> Added: samza/site/learn/documentation/0.10/hdfs/producer.html URL: http://svn.apache.org/viewvc/samza/site/learn/documentation/0.10/hdfs/producer.html?rev=1721445&view=auto ============================================================================== --- samza/site/learn/documentation/0.10/hdfs/producer.html (added) +++ samza/site/learn/documentation/0.10/hdfs/producer.html Tue Dec 22 19:01:05 2015 @@ -0,0 +1,231 @@ +<!DOCTYPE html> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<html lang="en"> + <head> + <meta charset="utf-8"> + <title>Samza - Isolation</title> + <link href='/css/ropa-sans.css' rel='stylesheet' type='text/css'/> + <link href="/css/bootstrap.min.css" rel="stylesheet"/> + <link href="/css/font-awesome.min.css" rel="stylesheet"/> + <link href="/css/main.css" rel="stylesheet"/> + <link href="/css/syntax.css" rel="stylesheet"/> + <link rel="icon" type="image/png" href="/img/samza-icon.png"> + <script src="/js/jquery-1.11.1.min.js"></script> + </head> + <body> + <div class="wrapper"> + <div class="wrapper-content"> + + <div class="masthead"> + <div class="container"> + <div class="masthead-logo"> + <a href="/" class="logo">samza</a> + </div> + <div class="masthead-icons"> + <div class="pull-right"> + <a href="/startup/download"><i class="fa fa-arrow-circle-o-down masthead-icon"></i></a> + <a href="https://git-wip-us.apache.org/repos/asf?p=samza.git;a=tree" target="_blank"><i class="fa fa-code masthead-icon" style="font-weight: bold;"></i></a> + <a href="https://twitter.com/samzastream" target="_blank"><i class="fa fa-twitter masthead-icon"></i></a> + <!-- this icon only shows in versioned pages --> + + + + + <a href="http://samza.apache.org/learn/documentation/latest/hdfs/producer.html"><i id="switch-version-button"></i></a> + <!-- links for the navigation bar --> + + + </div> + </div> + </div><!-- /.container --> + </div> + + <div class="container"> + <div class="menu"> + <h1><i class="fa fa-rocket"></i> Getting Started</h1> + <ul> + <li><a href="/startup/hello-samza/0.10">Hello Samza</a></li> + <li><a href="/startup/download">Download</a></li> + </ul> + + <h1><i class="fa fa-book"></i> Learn</h1> + <ul> + <li><a href="/learn/documentation/0.10">Documentation</a></li> + <li><a href="/learn/documentation/0.10/jobs/configuration-table.html">Configuration</a></li> + <li><a href="/learn/documentation/0.10/api/javadocs/">Javadocs</a></li> + <li><a href="/learn/tutorials/0.10">Tutorials</a></li> + <li><a href="https://cwiki.apache.org/confluence/display/SAMZA/FAQ">FAQ</a></li> + <li><a href="https://cwiki.apache.org/confluence/display/SAMZA/Apache+Samza">Wiki</a></li> + <li><a href="https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=51812876">Papers & Talks</a></li> + <li><a href="http://blogs.apache.org/samza">Blog</a></li> + </ul> + + <h1><i class="fa fa-comments"></i> Community</h1> + <ul> + <li><a href="/community/mailing-lists.html">Mailing Lists</a></li> + <li><a href="/community/irc.html">IRC</a></li> + <li><a href="https://issues.apache.org/jira/browse/SAMZA">Bugs</a></li> + <li><a href="https://cwiki.apache.org/confluence/display/SAMZA/Powered+By">Powered by</a></li> + <li><a href="https://cwiki.apache.org/confluence/display/SAMZA/Ecosystem">Ecosystem</a></li> + <li><a href="/community/committers.html">Committers</a></li> + </ul> + + <h1><i class="fa fa-code"></i> Contribute</h1> + <ul> + <li><a href="/contribute/rules.html">Rules</a></li> + <li><a href="/contribute/coding-guide.html">Coding Guide</a></li> + <li><a href="/contribute/projects.html">Projects</a></li> + <li><a href="/contribute/design-documents.html">Design Documents</a></li> + <li><a href="/contribute/code.html">Code</a></li> + <li><a href="https://reviews.apache.org/groups/samza">Review Board</a></li> + <li><a href="/contribute/tests.html">Tests</a></li> + </ul> + + <h1><i class="fa fa-history"></i> Archive</h1> + <ul> + <li><a href="/archive/index.html#latest">latest</a></li> + <li><a href="/archive/index.html#09">0.9</a></li> + <li><a href="/archive/index.html#08">0.8</a></li> + <li><a href="/archive/index.html#07">0.7</a></li> + </ul> + </div> + + <div class="content"> + <!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<h2>Isolation</h2> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<h3 id="writing-to-hdfs-from-samza">Writing to HDFS from Samza</h3> + +<p>The <code>samza-hdfs</code> module implements a Samza Producer to write to HDFS. The current implementation includes a ready-to-use <code>HdfsSystemProducer</code>, and two <code>HdfsWriter</code>s: One that writes messages of raw bytes to a <code>SequenceFile</code> of <code>BytesWritable</code> keys and values. The other writes UTF-8 <code>String</code>s to a <code>SequenceFile</code> with <code>LongWritable</code> keys and <code>Text</code> values.</p> + +<h3 id="configuring-an-hdfssystemproducer">Configuring an HdfsSystemProducer</h3> + +<p>You can configure an HdfsSystemProducer like any other Samza system: using configuration keys and values set in a <code>job.properties</code> file. +You might configure the system producer for use by your <code>StreamTasks</code> like this:</p> +<div class="highlight"><pre><code class="language-text" data-lang="text"># set the SystemFactory implementation to instantiate HdfsSystemProducer aliased to 'hdfs-clickstream' +systems.hdfs-clickstream.samza.factory=org.apache.samza.system.hdfs.HdfsSystemFactory + +# define a serializer/deserializer for the hdfs-clickstream system +systems.hdfs-clickstream.samza.msg.serde=some-serde-impl + +# consumer configs not needed for HDFS system, reader is not implemented yet + +# Assign a Metrics implementation via a label we defined earlier in the props file +systems.hdfs-clickstream.streams.metrics.samza.msg.serde=some-metrics-impl + +# Assign the implementation class for this system's HdfsWriter +systems.hdfs-clickstream.producer.hdfs.writer.class=org.apache.samza.system.hdfs.writer.TextSequenceFileHdfsWriter + +# Set HDFS SequenceFile compression type. Only BLOCK compression is supported currently +systems.hdfs-clickstream.producer.hdfs.compression.type=snappy + +# The base dir for HDFS output. The default Bucketer for SequenceFile HdfsWriters +# is currently /BASE/JOB_NAME/DATE_PATH/FILES, where BASE is set below +systems.hdfs-clickstream.producer.hdfs.base.output.dir=/user/me/analytics/clickstream_data + +# Assign the implementation class for the HdfsWriter's Bucketer +systems.hdfs-clickstream.producer.hdfs.bucketer.class=org.apache.samza.system.hdfs.writer.JobNameDateTimeBucketer + +# Configure the DATE_PATH the Bucketer will set to bucket output files by day for this job run. +systems.hdfs-clickstream.producer.hdfs.bucketer.date.path.format=yyyy_MM_dd + +# Optionally set the max output bytes per file. A new file will be cut and output +# continued on the next write call each time this many bytes are written. +systems.hdfs-clickstream.producer.hdfs.write.batch.size.bytes=134217728 +</code></pre></div> +<p>The above configuration assumes a Metrics and Serde implemnetation has been properly configured against the <code>some-serde-impl</code> and <code>some-metrics-impl</code> labels somewhere else in the same <code>job.properties</code> file. Each of these properties has a reasonable default, so you can leave out the ones you don’t need to customize for your job run.</p> + + + </div> + </div> + + </div><!-- /.wrapper-content --> + </div><!-- /.wrapper --> + + <div class="footer"> + <div class="container"> + <!-- nothing for now. --> + </div> + </div> + + + <script> + $( document ).ready(function() { + if ( $.fn.urlExists( "/learn/documentation/latest/hdfs/producer.html" ) ) { + $("#switch-version-button").addClass("fa fa-history masthead-icon"); + } + }); + + /* a function to test whether the url exists or not */ + (function( $ ) { + $.fn.urlExists = function(url) { + var http = new XMLHttpRequest(); + http.open('HEAD', url, false); + http.send(); + return http.status != 404; + }; + }( jQuery )); + </script> + + + <!-- Google Analytics --> + <script> + (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ + (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), + m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) + })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); + + ga('create', 'UA-43122768-1', 'apache.org'); + ga('send', 'pageview'); + + </script> + </body> +</html> Added: samza/site/learn/documentation/0.10/index.html URL: http://svn.apache.org/viewvc/samza/site/learn/documentation/0.10/index.html?rev=1721445&view=auto ============================================================================== --- samza/site/learn/documentation/0.10/index.html (added) +++ samza/site/learn/documentation/0.10/index.html Tue Dec 22 19:01:05 2015 @@ -0,0 +1,268 @@ +<!DOCTYPE html> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<html lang="en"> + <head> + <meta charset="utf-8"> + <title>Samza - Documentation</title> + <link href='/css/ropa-sans.css' rel='stylesheet' type='text/css'/> + <link href="/css/bootstrap.min.css" rel="stylesheet"/> + <link href="/css/font-awesome.min.css" rel="stylesheet"/> + <link href="/css/main.css" rel="stylesheet"/> + <link href="/css/syntax.css" rel="stylesheet"/> + <link rel="icon" type="image/png" href="/img/samza-icon.png"> + <script src="/js/jquery-1.11.1.min.js"></script> + </head> + <body> + <div class="wrapper"> + <div class="wrapper-content"> + + <div class="masthead"> + <div class="container"> + <div class="masthead-logo"> + <a href="/" class="logo">samza</a> + </div> + <div class="masthead-icons"> + <div class="pull-right"> + <a href="/startup/download"><i class="fa fa-arrow-circle-o-down masthead-icon"></i></a> + <a href="https://git-wip-us.apache.org/repos/asf?p=samza.git;a=tree" target="_blank"><i class="fa fa-code masthead-icon" style="font-weight: bold;"></i></a> + <a href="https://twitter.com/samzastream" target="_blank"><i class="fa fa-twitter masthead-icon"></i></a> + <!-- this icon only shows in versioned pages --> + + + + + <a href="http://samza.apache.org/learn/documentation/latest/index.html"><i id="switch-version-button"></i></a> + <!-- links for the navigation bar --> + + + </div> + </div> + </div><!-- /.container --> + </div> + + <div class="container"> + <div class="menu"> + <h1><i class="fa fa-rocket"></i> Getting Started</h1> + <ul> + <li><a href="/startup/hello-samza/0.10">Hello Samza</a></li> + <li><a href="/startup/download">Download</a></li> + </ul> + + <h1><i class="fa fa-book"></i> Learn</h1> + <ul> + <li><a href="/learn/documentation/0.10">Documentation</a></li> + <li><a href="/learn/documentation/0.10/jobs/configuration-table.html">Configuration</a></li> + <li><a href="/learn/documentation/0.10/api/javadocs/">Javadocs</a></li> + <li><a href="/learn/tutorials/0.10">Tutorials</a></li> + <li><a href="https://cwiki.apache.org/confluence/display/SAMZA/FAQ">FAQ</a></li> + <li><a href="https://cwiki.apache.org/confluence/display/SAMZA/Apache+Samza">Wiki</a></li> + <li><a href="https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=51812876">Papers & Talks</a></li> + <li><a href="http://blogs.apache.org/samza">Blog</a></li> + </ul> + + <h1><i class="fa fa-comments"></i> Community</h1> + <ul> + <li><a href="/community/mailing-lists.html">Mailing Lists</a></li> + <li><a href="/community/irc.html">IRC</a></li> + <li><a href="https://issues.apache.org/jira/browse/SAMZA">Bugs</a></li> + <li><a href="https://cwiki.apache.org/confluence/display/SAMZA/Powered+By">Powered by</a></li> + <li><a href="https://cwiki.apache.org/confluence/display/SAMZA/Ecosystem">Ecosystem</a></li> + <li><a href="/community/committers.html">Committers</a></li> + </ul> + + <h1><i class="fa fa-code"></i> Contribute</h1> + <ul> + <li><a href="/contribute/rules.html">Rules</a></li> + <li><a href="/contribute/coding-guide.html">Coding Guide</a></li> + <li><a href="/contribute/projects.html">Projects</a></li> + <li><a href="/contribute/design-documents.html">Design Documents</a></li> + <li><a href="/contribute/code.html">Code</a></li> + <li><a href="https://reviews.apache.org/groups/samza">Review Board</a></li> + <li><a href="/contribute/tests.html">Tests</a></li> + </ul> + + <h1><i class="fa fa-history"></i> Archive</h1> + <ul> + <li><a href="/archive/index.html#latest">latest</a></li> + <li><a href="/archive/index.html#09">0.9</a></li> + <li><a href="/archive/index.html#08">0.8</a></li> + <li><a href="/archive/index.html#07">0.7</a></li> + </ul> + </div> + + <div class="content"> + <!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<h2>Documentation</h2> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<h4>Introduction</h4> + +<ul class="documentation-list"> + <li><a href="introduction/background.html">Background</a></li> + <li><a href="introduction/concepts.html">Concepts</a></li> + <li><a href="introduction/architecture.html">Architecture</a></li> +</ul> + +<h4>Comparisons</h4> + +<ul class="documentation-list"> + <li><a href="comparisons/introduction.html">Introduction</a></li> + <li><a href="comparisons/mupd8.html">MUPD8</a></li> + <li><a href="comparisons/storm.html">Storm</a></li> + <li><a href="comparisons/spark-streaming.html">Spark Streaming</a></li> +<!-- TODO comparisons pages + <li><a href="comparisons/aurora.html">Aurora</a></li> + <li><a href="comparisons/jms.html">JMS</a></li> + <li><a href="comparisons/s4.html">S4</a></li> +--> +</ul> + +<h4>API</h4> + +<ul class="documentation-list"> + <li><a href="api/overview.html">Overview</a></li> + <li><a href="jobs/configuration-table.html">Configuration</a></li> + <li><a href="api/javadocs">Javadocs</a></li> +</ul> + +<h4>Core</h4> + +<ul class="documentation-list"> + <li><a href="container/samza-container.html">SamzaContainer</a></li> + <li><a href="container/streams.html">Streams</a></li> + <li><a href="container/serialization.html">Serialization</a></li> + <li><a href="container/checkpointing.html">Checkpointing</a></li> + <li><a href="container/state-management.html">State Management</a></li> + <li><a href="container/windowing.html">Windowing</a></li> + <li><a href="container/coordinator-stream.html">Coordinator Stream</a></li> + <li><a href="container/event-loop.html">Event Loop</a></li> + <li><a href="container/metrics.html">Metrics</a></li> + <li><a href="container/jmx.html">JMX</a></li> +</ul> + +<h4>Job Deployment</h4> + +<ul class="documentation-list"> + <li><a href="jobs/job-runner.html">JobRunner</a></li> + <li><a href="jobs/configuration.html">Configuration</a></li> + <li><a href="jobs/packaging.html">Packaging</a></li> + <li><a href="jobs/yarn-jobs.html">YARN Jobs</a></li> + <li><a href="jobs/logging.html">Logging</a></li> + <li><a href="jobs/reprocessing.html">Reprocessing</a></li> + <li><a href="jobs/web-ui-rest-api.html">Web UI and REST API</a></li> +</ul> + +<h4>YARN</h4> + +<ul class="documentation-list"> + <li><a href="yarn/application-master.html">Application Master</a></li> + <li><a href="yarn/isolation.html">Isolation</a></li> + <li><a href="yarn/yarn-host-affinity.html">Host Affinity & Yarn</a></li> + <li><a href="hdfs/producer.html">Writing to HDFS</a></li> +<!-- TODO write yarn pages + <li><a href="">Fault Tolerance</a></li> + <li><a href="">Security</a></li> +--> +</ul> + +<h4>Operations</h4> + +<ul class="documentation-list"> + <li><a href="operations/security.html">Security</a></li> + <li><a href="operations/kafka.html">Kafka</a></li> +</ul> +</div> + + + </div> + </div> + + </div><!-- /.wrapper-content --> + </div><!-- /.wrapper --> + + <div class="footer"> + <div class="container"> + <!-- nothing for now. --> + </div> + </div> + + + <script> + $( document ).ready(function() { + if ( $.fn.urlExists( "/learn/documentation/latest/index.html" ) ) { + $("#switch-version-button").addClass("fa fa-history masthead-icon"); + } + }); + + /* a function to test whether the url exists or not */ + (function( $ ) { + $.fn.urlExists = function(url) { + var http = new XMLHttpRequest(); + http.open('HEAD', url, false); + http.send(); + return http.status != 404; + }; + }( jQuery )); + </script> + + + <!-- Google Analytics --> + <script> + (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ + (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), + m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) + })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); + + ga('create', 'UA-43122768-1', 'apache.org'); + ga('send', 'pageview'); + + </script> + </body> +</html> Added: samza/site/learn/documentation/0.10/introduction/architecture.html URL: http://svn.apache.org/viewvc/samza/site/learn/documentation/0.10/introduction/architecture.html?rev=1721445&view=auto ============================================================================== --- samza/site/learn/documentation/0.10/introduction/architecture.html (added) +++ samza/site/learn/documentation/0.10/introduction/architecture.html Tue Dec 22 19:01:05 2015 @@ -0,0 +1,286 @@ +<!DOCTYPE html> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<html lang="en"> + <head> + <meta charset="utf-8"> + <title>Samza - Architecture</title> + <link href='/css/ropa-sans.css' rel='stylesheet' type='text/css'/> + <link href="/css/bootstrap.min.css" rel="stylesheet"/> + <link href="/css/font-awesome.min.css" rel="stylesheet"/> + <link href="/css/main.css" rel="stylesheet"/> + <link href="/css/syntax.css" rel="stylesheet"/> + <link rel="icon" type="image/png" href="/img/samza-icon.png"> + <script src="/js/jquery-1.11.1.min.js"></script> + </head> + <body> + <div class="wrapper"> + <div class="wrapper-content"> + + <div class="masthead"> + <div class="container"> + <div class="masthead-logo"> + <a href="/" class="logo">samza</a> + </div> + <div class="masthead-icons"> + <div class="pull-right"> + <a href="/startup/download"><i class="fa fa-arrow-circle-o-down masthead-icon"></i></a> + <a href="https://git-wip-us.apache.org/repos/asf?p=samza.git;a=tree" target="_blank"><i class="fa fa-code masthead-icon" style="font-weight: bold;"></i></a> + <a href="https://twitter.com/samzastream" target="_blank"><i class="fa fa-twitter masthead-icon"></i></a> + <!-- this icon only shows in versioned pages --> + + + + + <a href="http://samza.apache.org/learn/documentation/latest/introduction/architecture.html"><i id="switch-version-button"></i></a> + <!-- links for the navigation bar --> + + + </div> + </div> + </div><!-- /.container --> + </div> + + <div class="container"> + <div class="menu"> + <h1><i class="fa fa-rocket"></i> Getting Started</h1> + <ul> + <li><a href="/startup/hello-samza/0.10">Hello Samza</a></li> + <li><a href="/startup/download">Download</a></li> + </ul> + + <h1><i class="fa fa-book"></i> Learn</h1> + <ul> + <li><a href="/learn/documentation/0.10">Documentation</a></li> + <li><a href="/learn/documentation/0.10/jobs/configuration-table.html">Configuration</a></li> + <li><a href="/learn/documentation/0.10/api/javadocs/">Javadocs</a></li> + <li><a href="/learn/tutorials/0.10">Tutorials</a></li> + <li><a href="https://cwiki.apache.org/confluence/display/SAMZA/FAQ">FAQ</a></li> + <li><a href="https://cwiki.apache.org/confluence/display/SAMZA/Apache+Samza">Wiki</a></li> + <li><a href="https://cwiki.apache.org/confluence/pages/viewpage.action?pageId=51812876">Papers & Talks</a></li> + <li><a href="http://blogs.apache.org/samza">Blog</a></li> + </ul> + + <h1><i class="fa fa-comments"></i> Community</h1> + <ul> + <li><a href="/community/mailing-lists.html">Mailing Lists</a></li> + <li><a href="/community/irc.html">IRC</a></li> + <li><a href="https://issues.apache.org/jira/browse/SAMZA">Bugs</a></li> + <li><a href="https://cwiki.apache.org/confluence/display/SAMZA/Powered+By">Powered by</a></li> + <li><a href="https://cwiki.apache.org/confluence/display/SAMZA/Ecosystem">Ecosystem</a></li> + <li><a href="/community/committers.html">Committers</a></li> + </ul> + + <h1><i class="fa fa-code"></i> Contribute</h1> + <ul> + <li><a href="/contribute/rules.html">Rules</a></li> + <li><a href="/contribute/coding-guide.html">Coding Guide</a></li> + <li><a href="/contribute/projects.html">Projects</a></li> + <li><a href="/contribute/design-documents.html">Design Documents</a></li> + <li><a href="/contribute/code.html">Code</a></li> + <li><a href="https://reviews.apache.org/groups/samza">Review Board</a></li> + <li><a href="/contribute/tests.html">Tests</a></li> + </ul> + + <h1><i class="fa fa-history"></i> Archive</h1> + <ul> + <li><a href="/archive/index.html#latest">latest</a></li> + <li><a href="/archive/index.html#09">0.9</a></li> + <li><a href="/archive/index.html#08">0.8</a></li> + <li><a href="/archive/index.html#07">0.7</a></li> + </ul> + </div> + + <div class="content"> + <!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<h2>Architecture</h2> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<p>Samza is made up of three layers:</p> + +<ol> +<li>A streaming layer.</li> +<li>An execution layer.</li> +<li>A processing layer.</li> +</ol> + +<p>Samza provides out of the box support for all three layers.</p> + +<ol> +<li><strong>Streaming:</strong> <a href="http://kafka.apache.org/">Kafka</a></li> +<li><strong>Execution:</strong> <a href="http://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/YARN.html">YARN</a></li> +<li><strong>Processing:</strong> <a href="../api/overview.html">Samza API</a></li> +</ol> + +<p>These three pieces fit together to form Samza:</p> + +<p><img src="/img/0.10/learn/documentation/introduction/samza-ecosystem.png" alt="diagram-medium"></p> + +<p>This architecture follows a similar pattern to Hadoop (which also uses YARN as execution layer, HDFS for storage, and MapReduce as processing API):</p> + +<p><img src="/img/0.10/learn/documentation/introduction/samza-hadoop.png" alt="diagram-medium"></p> + +<p>Before going in-depth on each of these three layers, it should be noted that Samza’s support is not limited to Kafka and YARN. Both Samza’s execution and streaming layer are pluggable, and allow developers to implement alternatives if they prefer.</p> + +<h3 id="kafka">Kafka</h3> + +<p><a href="http://kafka.apache.org/">Kafka</a> is a distributed pub/sub and message queueing system that provides at-least once messaging guarantees (i.e. the system guarantees that no messages are lost, but in certain fault scenarios, a consumer might receive the same message more than once), and highly available partitions (i.e. a stream’s partitions continue to be available even if a machine goes down).</p> + +<p>In Kafka, each stream is called a <em>topic</em>. Each topic is partitioned and replicated across multiple machines called <em>brokers</em>. When a <em>producer</em> sends a message to a topic, it provides a key, which is used to determine which partition the message should be sent to. The Kafka brokers receive and store the messages that the producer sends. Kafka <em>consumers</em> can then read from a topic by subscribing to messages on all partitions of a topic.</p> + +<p>Kafka has some interesting properties: </p> + +<ul> +<li>All messages with the same key are guaranteed to be in the same topic partition. This means that if you wish to read all messages for a specific user ID, you only have to read the messages from the partition that contains the user ID, not the whole topic (assuming the user ID is used as key).</li> +<li>A topic partition is a sequence of messages in order of arrival, so you can reference any message in the partition using a monotonically increasing <em>offset</em> (like an index into an array). This means that the broker doesn’t need to keep track of which messages have been seen by a particular consumer — the consumer can keep track itself by storing the offset of the last message it has processed. It then knows that every message with a lower offset than the current offset has already been processed; every message with a higher offset has not yet been processed.</li> +</ul> + +<p>For more details on Kafka, see Kafka’s <a href="http://kafka.apache.org/documentation.html">documentation</a> pages.</p> + +<h3 id="yarn">YARN</h3> + +<p><a href="http://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/YARN.html">YARN</a> (Yet Another Resource Negotiator) is Hadoop’s next-generation cluster scheduler. It allows you to allocate a number of <em>containers</em> (processes) in a cluster of machines, and execute arbitrary commands on them.</p> + +<p>When an application interacts with YARN, it looks something like this:</p> + +<ol> +<li><strong>Application</strong>: I want to run command X on two machines with 512MB memory.</li> +<li><strong>YARN</strong>: Cool, where’s your code?</li> +<li><strong>Application</strong>: http://path.to.host/jobs/download/my.tgz</li> +<li><strong>YARN</strong>: I’m running your job on node-1.grid and node-2.grid.</li> +</ol> + +<p>Samza uses YARN to manage deployment, fault tolerance, logging, resource isolation, security, and locality. A brief overview of YARN is below; see <a href="http://hortonworks.com/blog/apache-hadoop-yarn-background-and-an-overview/">this page from Hortonworks</a> for a much better overview.</p> + +<h4 id="yarn-architecture">YARN Architecture</h4> + +<p>YARN has three important pieces: a <em>ResourceManager</em>, a <em>NodeManager</em>, and an <em>ApplicationMaster</em>. In a YARN grid, every machine runs a NodeManager, which is responsible for launching processes on that machine. A ResourceManager talks to all of the NodeManagers to tell them what to run. Applications, in turn, talk to the ResourceManager when they wish to run something on the cluster. The third piece, the ApplicationMaster, is actually application-specific code that runs in the YARN cluster. It’s responsible for managing the application’s workload, asking for containers (usually UNIX processes), and handling notifications when one of its containers fails.</p> + +<h4 id="samza-and-yarn">Samza and YARN</h4> + +<p>Samza provides a YARN ApplicationMaster and a YARN job runner out of the box. The integration between Samza and YARN is outlined in the following diagram (different colors indicate different host machines):</p> + +<p><img src="/img/0.10/learn/documentation/introduction/samza-yarn-integration.png" alt="diagram-small"></p> + +<p>The Samza client talks to the YARN RM when it wants to start a new Samza job. The YARN RM talks to a YARN NM to allocate space on the cluster for Samza’s ApplicationMaster. Once the NM allocates space, it starts the Samza AM. After the Samza AM starts, it asks the YARN RM for one or more YARN containers to run <a href="../container/samza-container.html">SamzaContainers</a>. Again, the RM works with NMs to allocate space for the containers. Once the space has been allocated, the NMs start the Samza containers.</p> + +<h3 id="samza">Samza</h3> + +<p>Samza uses YARN and Kafka to provide a framework for stage-wise stream processing and partitioning. Everything, put together, looks like this (different colors indicate different host machines):</p> + +<p><img src="/img/0.10/learn/documentation/introduction/samza-yarn-kafka-integration.png" alt="diagram-small"></p> + +<p>The Samza client uses YARN to run a Samza job: YARN starts and supervises one or more <a href="../container/samza-container.html">SamzaContainers</a>, and your processing code (using the <a href="../api/overview.html">StreamTask</a> API) runs inside those containers. The input and output for the Samza StreamTasks come from Kafka brokers that are (usually) co-located on the same machines as the YARN NMs.</p> + +<h3 id="example">Example</h3> + +<p>Let’s take a look at a real example: suppose we want to count the number of page views. In SQL, you would write something like:</p> + +<div class="highlight"><pre><code class="sql"><span class="k">SELECT</span> <span class="n">user_id</span><span class="p">,</span> <span class="k">COUNT</span><span class="p">(</span><span class="o">*</span><span class="p">)</span> <span class="k">FROM</span> <span class="n">PageViewEvent</span> <span class="k">GROUP</span> <span class="k">BY</span> <span class="n">user_id</span></code></pre></div> + +<p>Although Samza doesn’t support SQL right now, the idea is the same. Two jobs are required to calculate this query: one to group messages by user ID, and the other to do the counting.</p> + +<p>In the first job, the grouping is done by sending all messages with the same user ID to the same partition of an intermediate topic. You can do this by using the user ID as key of the messages that are emitted by the first job, and this key is mapped to one of the intermediate topic’s partitions (usually by taking a hash of the key mod the number of partitions). The second job consumes the intermediate topic. Each task in the second job consumes one partition of the intermediate topic, i.e. all the messages for a subset of user IDs. The task has a counter for each user ID in its partition, and the appropriate counter is incremented every time the task receives a message with a particular user ID.</p> + +<p><img src="/img/0.10/learn/documentation/introduction/group-by-example.png" alt="Repartitioning for a GROUP BY" class="diagram-large"></p> + +<p>If you are familiar with Hadoop, you may recognize this as a Map/Reduce operation, where each record is associated with a particular key in the mappers, records with the same key are grouped together by the framework, and then counted in the reduce step. The difference between Hadoop and Samza is that Hadoop operates on a fixed input, whereas Samza works with unbounded streams of data.</p> + +<p>Kafka takes the messages emitted by the first job and buffers them on disk, distributed across multiple machines. This helps make the system fault-tolerant: if one machine fails, no messages are lost, because they have been replicated to other machines. And if the second job goes slow or stops consuming messages for any reason, the first job is unaffected: the disk buffer can absorb the backlog of messages from the first job until the second job catches up again.</p> + +<p>By partitioning topics, and by breaking a stream process down into jobs and parallel tasks that run on multiple machines, Samza scales to streams with very high message throughput. By using YARN and Kafka, Samza achieves fault-tolerance: if a process or machine fails, it is automatically restarted on another machine and continues processing messages from the point where it left off.</p> + +<h2 id="comparison-introduction-»"><a href="../comparisons/introduction.html">Comparison Introduction »</a></h2> + + + </div> + </div> + + </div><!-- /.wrapper-content --> + </div><!-- /.wrapper --> + + <div class="footer"> + <div class="container"> + <!-- nothing for now. --> + </div> + </div> + + + <script> + $( document ).ready(function() { + if ( $.fn.urlExists( "/learn/documentation/latest/introduction/architecture.html" ) ) { + $("#switch-version-button").addClass("fa fa-history masthead-icon"); + } + }); + + /* a function to test whether the url exists or not */ + (function( $ ) { + $.fn.urlExists = function(url) { + var http = new XMLHttpRequest(); + http.open('HEAD', url, false); + http.send(); + return http.status != 404; + }; + }( jQuery )); + </script> + + + <!-- Google Analytics --> + <script> + (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){ + (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o), + m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m) + })(window,document,'script','//www.google-analytics.com/analytics.js','ga'); + + ga('create', 'UA-43122768-1', 'apache.org'); + ga('send', 'pageview'); + + </script> + </body> +</html>
