Modified: incubator/samza/site/learn/documentation/0.7.0/container/streams.html URL: http://svn.apache.org/viewvc/incubator/samza/site/learn/documentation/0.7.0/container/streams.html?rev=1609232&r1=1609231&r2=1609232&view=diff ============================================================================== --- incubator/samza/site/learn/documentation/0.7.0/container/streams.html (original) +++ incubator/samza/site/learn/documentation/0.7.0/container/streams.html Wed Jul 9 16:37:01 2014 @@ -23,6 +23,7 @@ <link href="/css/bootstrap.min.css" rel="stylesheet"/> <link href="/css/font-awesome.min.css" rel="stylesheet"/> <link href="/css/main.css" rel="stylesheet"/> + <link href="/css/syntax.css" rel="stylesheet"/> <link rel="icon" type="image/png" href="/img/samza-icon.png"> </head> <body> @@ -123,48 +124,49 @@ --> <p>The <a href="samza-container.html">samza container</a> reads and writes messages using the <a href="../api/javadocs/org/apache/samza/system/SystemConsumer.html">SystemConsumer</a> and <a href="../api/javadocs/org/apache/samza/system/SystemProducer.html">SystemProducer</a> interfaces. 
You can integrate any message broker with Samza by implementing these two interfaces.</p> -<div class="highlight"><pre><code class="language-text" data-lang="text">public interface SystemConsumer { - void start(); - void stop(); +<div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kd">public</span> <span class="kd">interface</span> <span class="nc">SystemConsumer</span> <span class="o">{</span> + <span class="kt">void</span> <span class="nf">start</span><span class="o">();</span> - void register( - SystemStreamPartition systemStreamPartition, - String lastReadOffset); + <span class="kt">void</span> <span class="nf">stop</span><span class="o">();</span> - List<IncomingMessageEnvelope> poll( - Map<SystemStreamPartition, Integer> systemStreamPartitions, - long timeout) - throws InterruptedException; -} + <span class="kt">void</span> <span class="nf">register</span><span class="o">(</span> + <span class="n">SystemStreamPartition</span> <span class="n">systemStreamPartition</span><span class="o">,</span> + <span class="n">String</span> <span class="n">lastReadOffset</span><span class="o">);</span> -public class IncomingMessageEnvelope { - public Object getMessage() { ... } + <span class="n">List</span><span class="o"><</span><span class="n">IncomingMessageEnvelope</span><span class="o">></span> <span class="nf">poll</span><span class="o">(</span> + <span class="n">Map</span><span class="o"><</span><span class="n">SystemStreamPartition</span><span class="o">,</span> <span class="n">Integer</span><span class="o">></span> <span class="n">systemStreamPartitions</span><span class="o">,</span> + <span class="kt">long</span> <span class="n">timeout</span><span class="o">)</span> + <span class="kd">throws</span> <span class="n">InterruptedException</span><span class="o">;</span> +<span class="o">}</span> - public Object getKey() { ... 
} +<span class="kd">public</span> <span class="kd">class</span> <span class="nc">IncomingMessageEnvelope</span> <span class="o">{</span> + <span class="kd">public</span> <span class="n">Object</span> <span class="nf">getMessage</span><span class="o">()</span> <span class="o">{</span> <span class="o">...</span> <span class="o">}</span> - public SystemStreamPartition getSystemStreamPartition() { ... } -} + <span class="kd">public</span> <span class="n">Object</span> <span class="nf">getKey</span><span class="o">()</span> <span class="o">{</span> <span class="o">...</span> <span class="o">}</span> -public interface SystemProducer { - void start(); + <span class="kd">public</span> <span class="n">SystemStreamPartition</span> <span class="nf">getSystemStreamPartition</span><span class="o">()</span> <span class="o">{</span> <span class="o">...</span> <span class="o">}</span> +<span class="o">}</span> - void stop(); +<span class="kd">public</span> <span class="kd">interface</span> <span class="nc">SystemProducer</span> <span class="o">{</span> + <span class="kt">void</span> <span class="nf">start</span><span class="o">();</span> - void register(String source); + <span class="kt">void</span> <span class="nf">stop</span><span class="o">();</span> - void send(String source, OutgoingMessageEnvelope envelope); + <span class="kt">void</span> <span class="nf">register</span><span class="o">(</span><span class="n">String</span> <span class="n">source</span><span class="o">);</span> - void flush(String source); -} + <span class="kt">void</span> <span class="nf">send</span><span class="o">(</span><span class="n">String</span> <span class="n">source</span><span class="o">,</span> <span class="n">OutgoingMessageEnvelope</span> <span class="n">envelope</span><span class="o">);</span> -public class OutgoingMessageEnvelope { - ... - public Object getKey() { ... 
} + <span class="kt">void</span> <span class="nf">flush</span><span class="o">(</span><span class="n">String</span> <span class="n">source</span><span class="o">);</span> +<span class="o">}</span> + +<span class="kd">public</span> <span class="kd">class</span> <span class="nc">OutgoingMessageEnvelope</span> <span class="o">{</span> + <span class="o">...</span> + <span class="kd">public</span> <span class="n">Object</span> <span class="nf">getKey</span><span class="o">()</span> <span class="o">{</span> <span class="o">...</span> <span class="o">}</span> + + <span class="kd">public</span> <span class="n">Object</span> <span class="nf">getMessage</span><span class="o">()</span> <span class="o">{</span> <span class="o">...</span> <span class="o">}</span> +<span class="o">}</span></code></pre></div> - public Object getMessage() { ... } -} -</code></pre></div> <p>Out of the box, Samza supports Kafka (KafkaSystemConsumer and KafkaSystemProducer). However, any message bus system can be plugged in, as long as it can provide the semantics required by Samza, as described in the <a href="../api/javadocs/org/apache/samza/system/SystemConsumer.html">javadoc</a>.</p> <p>SystemConsumers and SystemProducers may read and write messages of any data type. It’s ok if they only support byte arrays — Samza has a separate <a href="serialization.html">serialization layer</a> which converts to and from objects that application code can use. Samza does not prescribe any particular data model or serialization format.</p> @@ -182,16 +184,18 @@ public class OutgoingMessageEnvelope { <p>When a Samza container has several incoming messages on different stream partitions, how does it decide which to process first? The behavior is determined by a <a href="../api/javadocs/org/apache/samza/system/chooser/MessageChooser.html">MessageChooser</a>. 
The default chooser is RoundRobinChooser, but you can override it by implementing a custom chooser.</p> <p>To plug in your own message chooser, you need to implement the <a href="../api/javadocs/org/apache/samza/system/chooser/MessageChooserFactory.html">MessageChooserFactory</a> interface, and set the “task.chooser.class” configuration to the fully-qualified class name of your implementation:</p> -<div class="highlight"><pre><code class="language-text" data-lang="text">task.chooser.class=com.example.samza.YourMessageChooserFactory -</code></pre></div> + +<div class="highlight"><pre><code class="language-jproperties" data-lang="jproperties"><span class="na">task.chooser.class</span><span class="o">=</span><span class="s">com.example.samza.YourMessageChooserFactory</span></code></pre></div> + <h4 id="prioritizing-input-streams">Prioritizing input streams</h4> <p>There are certain times when messages from one stream should be processed with higher priority than messages from another stream. For example, some Samza jobs consume two streams: one stream is fed by a real-time system and the other stream is fed by a batch system. In this case, it’s useful to prioritize the real-time stream over the batch stream, so that the real-time processing doesn’t slow down if there is a sudden burst of data on the batch stream.</p> <p>Samza provides a mechanism to prioritize one stream over another by setting this configuration parameter: systems.&lt;system&gt;.streams.&lt;stream&gt;.samza.priority=&lt;number&gt;. 
For example:</p> -<div class="highlight"><pre><code class="language-text" data-lang="text">systems.kafka.streams.my-real-time-stream.samza.priority=2 -systems.kafka.streams.my-batch-stream.samza.priority=1 -</code></pre></div> + +<div class="highlight"><pre><code class="language-jproperties" data-lang="jproperties"><span class="na">systems.kafka.streams.my-real-time-stream.samza.priority</span><span class="o">=</span><span class="s">2</span> +<span class="na">systems.kafka.streams.my-batch-stream.samza.priority</span><span class="o">=</span><span class="s">1</span></code></pre></div> + <p>This declares that my-real-time-stream’s messages should be processed with higher priority than my-batch-stream’s messages. If my-real-time-stream has any messages available, they are processed first. Only if there are no messages currently waiting on my-real-time-stream does the Samza job continue processing my-batch-stream.</p> <p>Each priority level gets its own MessageChooser. It is valid to define two streams with the same priority. 
If messages are available from two streams at the same priority level, it’s up to the MessageChooser for that priority level to decide which message should be processed first.</p> @@ -207,10 +211,11 @@ systems.kafka.streams.my-batch-stream.sa <p>Another difference between a bootstrap stream and a high-priority stream is that the bootstrap stream’s special treatment is temporary: when it has been fully consumed (we say it has “caught up”), its priority drops to be the same as all the other input streams.</p> <p>To configure a stream called “my-bootstrap-stream” to be a fully-consumed bootstrap stream, use the following settings:</p> -<div class="highlight"><pre><code class="language-text" data-lang="text">systems.kafka.streams.my-bootstrap-stream.samza.bootstrap=true -systems.kafka.streams.my-bootstrap-stream.samza.reset.offset=true -systems.kafka.streams.my-bootstrap-stream.samza.offset.default=oldest -</code></pre></div> + +<div class="highlight"><pre><code class="language-jproperties" data-lang="jproperties"><span class="na">systems.kafka.streams.my-bootstrap-stream.samza.bootstrap</span><span class="o">=</span><span class="s">true</span> +<span class="na">systems.kafka.streams.my-bootstrap-stream.samza.reset.offset</span><span class="o">=</span><span class="s">true</span> +<span class="na">systems.kafka.streams.my-bootstrap-stream.samza.offset.default</span><span class="o">=</span><span class="s">oldest</span></code></pre></div> + <p>The bootstrap=true parameter enables the bootstrap behavior (prioritization over other streams). The combination of reset.offset=true and offset.default=oldest tells Samza to always start reading the stream from the oldest offset, every time a container starts up (rather than starting to read from the most recent checkpoint).</p> <p>It is valid to define multiple bootstrap streams. 
In this case, the order in which they are bootstrapped is determined by the priority.</p> @@ -220,8 +225,9 @@ systems.kafka.streams.my-bootstrap-strea <p>In some cases, you can improve performance by consuming several messages from the same stream partition in sequence. Samza supports this mode of operation, called <em>batching</em>.</p> <p>For example, if you want to read 100 messages in a row from each stream partition (regardless of the MessageChooser), you can use this configuration parameter:</p> -<div class="highlight"><pre><code class="language-text" data-lang="text">task.consumer.batch.size=100 -</code></pre></div> + +<div class="highlight"><pre><code class="language-jproperties" data-lang="jproperties"><span class="na">task.consumer.batch.size</span><span class="o">=</span><span class="s">100</span></code></pre></div> + <p>With this setting, Samza tries to read a message from the most recently used <a href="../api/javadocs/org/apache/samza/system/SystemStreamPartition.html">SystemStreamPartition</a>. This behavior continues either until no more messages are available for that SystemStreamPartition, or until the batch size has been reached. When that happens, Samza defers to the MessageChooser to determine the next message to process. It then again tries to continue consuming from the chosen message’s SystemStreamPartition until the batch size is reached.</p> <h2 id="serialization-»"><a href="serialization.html">Serialization »</a></h2>
Modified: incubator/samza/site/learn/documentation/0.7.0/container/windowing.html URL: http://svn.apache.org/viewvc/incubator/samza/site/learn/documentation/0.7.0/container/windowing.html?rev=1609232&r1=1609231&r2=1609232&view=diff ============================================================================== --- incubator/samza/site/learn/documentation/0.7.0/container/windowing.html (original) +++ incubator/samza/site/learn/documentation/0.7.0/container/windowing.html Wed Jul 9 16:37:01 2014 @@ -23,6 +23,7 @@ <link href="/css/bootstrap.min.css" rel="stylesheet"/> <link href="/css/font-awesome.min.css" rel="stylesheet"/> <link href="/css/main.css" rel="stylesheet"/> + <link href="/css/syntax.css" rel="stylesheet"/> <link rel="icon" type="image/png" href="/img/samza-icon.png"> </head> <body> @@ -125,32 +126,34 @@ <p>Sometimes a stream processing job needs to do something in regular time intervals, regardless of how many incoming messages the job is processing. For example, say you want to report the number of page views per minute. To do this, you increment a counter every time you see a page view event. Once per minute, you send the current counter value to an output stream and reset the counter to zero.</p> <p>Samza’s <em>windowing</em> feature provides a way for tasks to do something in regular time intervals, for example once per minute. 
To enable windowing, you just need to set one property in your job configuration:</p> -<div class="highlight"><pre><code class="language-text" data-lang="text"># Call the window() method every 60 seconds -task.window.ms=60000 -</code></pre></div> + +<div class="highlight"><pre><code class="language-jproperties" data-lang="jproperties"><span class="c"># Call the window() method every 60 seconds</span> +<span class="na">task.window.ms</span><span class="o">=</span><span class="s">60000</span></code></pre></div> + <p>Next, your stream task needs to implement the <a href="../api/javadocs/org/apache/samza/task/WindowableTask.html">WindowableTask</a> interface. This interface defines a window() method which is called by Samza in the regular interval that you configured.</p> <p>For example, this is how you would implement a basic per-minute event counter:</p> -<div class="highlight"><pre><code class="language-text" data-lang="text">public class EventCounterTask implements StreamTask, WindowableTask { - public static final SystemStream OUTPUT_STREAM = - new SystemStream("kafka", "events-per-minute"); +<div class="highlight"><pre><code class="language-java" data-lang="java"><span class="kd">public</span> <span class="kd">class</span> <span class="nc">EventCounterTask</span> <span class="kd">implements</span> <span class="n">StreamTask</span><span class="o">,</span> <span class="n">WindowableTask</span> <span class="o">{</span> + + <span class="kd">public</span> <span class="kd">static</span> <span class="kd">final</span> <span class="n">SystemStream</span> <span class="n">OUTPUT_STREAM</span> <span class="o">=</span> + <span class="k">new</span> <span class="nf">SystemStream</span><span class="o">(</span><span class="s">"kafka"</span><span class="o">,</span> <span class="s">"events-per-minute"</span><span class="o">);</span> + + <span class="kd">private</span> <span class="kt">int</span> <span class="n">eventsSeen</span> <span class="o">=</span> <span 
class="mi">0</span><span class="o">;</span> - private int eventsSeen = 0; + <span class="kd">public</span> <span class="kt">void</span> <span class="nf">process</span><span class="o">(</span><span class="n">IncomingMessageEnvelope</span> <span class="n">envelope</span><span class="o">,</span> + <span class="n">MessageCollector</span> <span class="n">collector</span><span class="o">,</span> + <span class="n">TaskCoordinator</span> <span class="n">coordinator</span><span class="o">)</span> <span class="o">{</span> + <span class="n">eventsSeen</span><span class="o">++;</span> + <span class="o">}</span> + + <span class="kd">public</span> <span class="kt">void</span> <span class="nf">window</span><span class="o">(</span><span class="n">MessageCollector</span> <span class="n">collector</span><span class="o">,</span> + <span class="n">TaskCoordinator</span> <span class="n">coordinator</span><span class="o">)</span> <span class="o">{</span> + <span class="n">collector</span><span class="o">.</span><span class="na">send</span><span class="o">(</span><span class="k">new</span> <span class="nf">OutgoingMessageEnvelope</span><span class="o">(</span><span class="n">OUTPUT_STREAM</span><span class="o">,</span> <span class="n">eventsSeen</span><span class="o">));</span> + <span class="n">eventsSeen</span> <span class="o">=</span> <span class="mi">0</span><span class="o">;</span> + <span class="o">}</span> +<span class="o">}</span></code></pre></div> - public void process(IncomingMessageEnvelope envelope, - MessageCollector collector, - TaskCoordinator coordinator) { - eventsSeen++; - } - - public void window(MessageCollector collector, - TaskCoordinator coordinator) { - collector.send(new OutgoingMessageEnvelope(OUTPUT_STREAM, eventsSeen)); - eventsSeen = 0; - } -} -</code></pre></div> <p>If you need to send messages to output streams, you can use the <a href="../api/javadocs/org/apache/samza/task/MessageCollector.html">MessageCollector</a> object passed to the window() method. 
Please only use that MessageCollector object for sending messages, and don’t use it outside of the call to window().</p> <p>Note that Samza uses <a href="event-loop.html">single-threaded execution</a>, so the window() call can never happen concurrently with a process() call. This has the advantage that you don’t need to worry about thread safety in your code (no need to synchronize anything), but the downside that the window() call may be delayed if your process() method takes a long time to return.</p> Modified: incubator/samza/site/learn/documentation/0.7.0/index.html URL: http://svn.apache.org/viewvc/incubator/samza/site/learn/documentation/0.7.0/index.html?rev=1609232&r1=1609231&r2=1609232&view=diff ============================================================================== --- incubator/samza/site/learn/documentation/0.7.0/index.html (original) +++ incubator/samza/site/learn/documentation/0.7.0/index.html Wed Jul 9 16:37:01 2014 @@ -23,6 +23,7 @@ <link href="/css/bootstrap.min.css" rel="stylesheet"/> <link href="/css/font-awesome.min.css" rel="stylesheet"/> <link href="/css/main.css" rel="stylesheet"/> + <link href="/css/syntax.css" rel="stylesheet"/> <link rel="icon" type="image/png" href="/img/samza-icon.png"> </head> <body> @@ -172,6 +173,7 @@ <li><a href="jobs/packaging.html">Packaging</a></li> <li><a href="jobs/yarn-jobs.html">YARN Jobs</a></li> <li><a href="jobs/logging.html">Logging</a></li> + <li><a href="jobs/reprocessing.html">Reprocessing</a></li> </ul> <h4>YARN</h4> Modified: incubator/samza/site/learn/documentation/0.7.0/introduction/architecture.html URL: http://svn.apache.org/viewvc/incubator/samza/site/learn/documentation/0.7.0/introduction/architecture.html?rev=1609232&r1=1609231&r2=1609232&view=diff ============================================================================== --- incubator/samza/site/learn/documentation/0.7.0/introduction/architecture.html (original) +++ 
incubator/samza/site/learn/documentation/0.7.0/introduction/architecture.html Wed Jul 9 16:37:01 2014 @@ -23,6 +23,7 @@ <link href="/css/bootstrap.min.css" rel="stylesheet"/> <link href="/css/font-awesome.min.css" rel="stylesheet"/> <link href="/css/main.css" rel="stylesheet"/> + <link href="/css/syntax.css" rel="stylesheet"/> <link rel="icon" type="image/png" href="/img/samza-icon.png"> </head> <body> @@ -201,8 +202,9 @@ <h3 id="example">Example</h3> <p>Let’s take a look at a real example: suppose we want to count the number of page views. In SQL, you would write something like:</p> -<div class="highlight"><pre><code class="language-text" data-lang="text">SELECT user_id, COUNT(*) FROM PageViewEvent GROUP BY user_id -</code></pre></div> + +<div class="highlight"><pre><code class="language-sql" data-lang="sql"><span class="k">SELECT</span> <span class="n">user_id</span><span class="p">,</span> <span class="k">COUNT</span><span class="p">(</span><span class="o">*</span><span class="p">)</span> <span class="k">FROM</span> <span class="n">PageViewEvent</span> <span class="k">GROUP</span> <span class="k">BY</span> <span class="n">user_id</span></code></pre></div> + <p>Although Samza doesn’t support SQL right now, the idea is the same. Two jobs are required to calculate this query: one to group messages by user ID, and the other to do the counting.</p> <p>In the first job, the grouping is done by sending all messages with the same user ID to the same partition of an intermediate topic. You can do this by using the user ID as key of the messages that are emitted by the first job, and this key is mapped to one of the intermediate topic’s partitions (usually by taking a hash of the key mod the number of partitions). The second job consumes the intermediate topic. Each task in the second job consumes one partition of the intermediate topic, i.e. all the messages for a subset of user IDs. 
The task has a counter for each user ID in its partition, and the appropriate counter is incremented every time the task receives a message with a particular user ID.</p> Modified: incubator/samza/site/learn/documentation/0.7.0/introduction/background.html URL: http://svn.apache.org/viewvc/incubator/samza/site/learn/documentation/0.7.0/introduction/background.html?rev=1609232&r1=1609231&r2=1609232&view=diff ============================================================================== --- incubator/samza/site/learn/documentation/0.7.0/introduction/background.html (original) +++ incubator/samza/site/learn/documentation/0.7.0/introduction/background.html Wed Jul 9 16:37:01 2014 @@ -23,6 +23,7 @@ <link href="/css/bootstrap.min.css" rel="stylesheet"/> <link href="/css/font-awesome.min.css" rel="stylesheet"/> <link href="/css/main.css" rel="stylesheet"/> + <link href="/css/syntax.css" rel="stylesheet"/> <link rel="icon" type="image/png" href="/img/samza-icon.png"> </head> <body> Modified: incubator/samza/site/learn/documentation/0.7.0/introduction/concepts.html URL: http://svn.apache.org/viewvc/incubator/samza/site/learn/documentation/0.7.0/introduction/concepts.html?rev=1609232&r1=1609231&r2=1609232&view=diff ============================================================================== --- incubator/samza/site/learn/documentation/0.7.0/introduction/concepts.html (original) +++ incubator/samza/site/learn/documentation/0.7.0/introduction/concepts.html Wed Jul 9 16:37:01 2014 @@ -23,6 +23,7 @@ <link href="/css/bootstrap.min.css" rel="stylesheet"/> <link href="/css/font-awesome.min.css" rel="stylesheet"/> <link href="/css/main.css" rel="stylesheet"/> + <link href="/css/syntax.css" rel="stylesheet"/> <link rel="icon" type="image/png" href="/img/samza-icon.png"> </head> <body>
