[08/13] apex-site git commit: from 60672495c84ed54ff505dfaec874cad9f5f13075

tushar Tue, 02 May 2017 11:07:06 -0700

http://git-wip-us.apache.org/repos/asf/apex-site/blob/82e5a921/content/docs/apex-3.6/css/theme_extra.css
----------------------------------------------------------------------
diff --git a/content/docs/apex-3.6/css/theme_extra.css 
b/content/docs/apex-3.6/css/theme_extra.css
new file mode 100644
index 0000000..9845d00
--- /dev/null
+++ b/content/docs/apex-3.6/css/theme_extra.css
@@ -0,0 +1,154 @@
+/*
+ * Sphinx doesn't have support for section dividers like we do in
+ * MkDocs, this styles the section titles in the nav
+ *
+ * https://github.com/mkdocs/mkdocs/issues/175
+ */
+.wy-menu-vertical span {
+    line-height: 18px;
+    padding: 0.4045em 1.618em;
+    display: block;
+    position: relative;
+    font-size: 90%;
+    color: #838383;
+}
+
+.wy-menu-vertical .subnav a {
+    padding: 0.4045em 2.427em;
+}
+
+/*
+ * Long navigations run off the bottom of the screen as the nav
+ * area doesn't scroll.
+ *
+ * https://github.com/mkdocs/mkdocs/pull/202
+ */
+.wy-nav-side {
+    height: 100%;
+    overflow-y: auto;
+}
+
+/*
+ * readthedocs theme hides nav items when the window height is
+ * too small to contain them.
+ *
+ * https://github.com/mkdocs/mkdocs/issues/348
+ */
+.wy-menu-vertical ul {
+  margin-bottom: 2em;
+}
+
+/*
+ * Fix wrapping in the code highlighting
+ *
+ * https://github.com/mkdocs/mkdocs/issues/233
+ */
+code {
+    white-space: pre;
+    padding: 2px 5px;
+}
+
+/*
+ * Wrap inline code samples otherwise they shoot off the side and
+ * can't be read at all.
+ *
+ * https://github.com/mkdocs/mkdocs/issues/313
+ */
+p code {
+    word-wrap: break-word;
+}
+
+/**
+ * Make code blocks display as blocks and give them the appropriate
+ * font size and padding.
+ *
+ * https://github.com/mkdocs/mkdocs/issues/855
+ */
+pre code {
+  display: block;
+  padding: 12px;
+  font-size: 12px;
+}
+
+/*
+ * Fix link colors when the link text is inline code.
+ *
+ * https://github.com/mkdocs/mkdocs/issues/718
+ */
+a code {
+    color: #2980B9;
+}
+a:hover code {
+    color: #3091d1;
+}
+a:visited code {
+    color: #9B59B6;
+}
+
+/*
+ * The CSS classes from highlight.js seem to clash with the
+ * ReadTheDocs theme causing some code to be incorrectly made
+ * bold and italic.
+ *
+ * https://github.com/mkdocs/mkdocs/issues/411
+ */
+code.cs, code.c {
+    font-weight: inherit;
+    font-style: inherit;
+}
+
+/*
+ * Fix some issues with the theme and non-highlighted code
+ * samples. Without any highlighting styles attached the
+ * formatting is broken.
+ *
+ * https://github.com/mkdocs/mkdocs/issues/319
+ */
+.no-highlight {
+  display: block;
+  padding: 0.5em;
+  color: #333;
+}
+
+
+/*
+ * Additions specific to the search functionality provided by MkDocs
+ */
+
+#mkdocs-search-results article h3
+{
+    margin-top: 23px;
+    border-top: 1px solid #E1E4E5;
+    padding-top: 24px;
+}
+
+#mkdocs-search-results article:first-child h3 {
+    border-top: none;
+}
+
+#mkdocs-search-query{
+    width: 100%;
+    border-radius: 50px;
+    padding: 6px 12px;
+    border-color: #D1D4D5;
+}
+
+.wy-menu-vertical li ul {
+    display: inherit;
+}
+
+.wy-menu-vertical li ul.subnav ul.subnav{
+    padding-left: 1em;
+}
+
+
+/*
+ * Improve inline code blocks within admonitions.
+ *
+ * https://github.com/mkdocs/mkdocs/issues/656
+ */
+ div.admonition code {
+  color: #404040;
+  border: 1px solid rgba(0, 0, 0, 0.2);
+  background: rgba(255, 255, 255, 0.7);
+}


http://git-wip-us.apache.org/repos/asf/apex-site/blob/82e5a921/content/docs/apex-3.6/development_best_practices/index.html
----------------------------------------------------------------------
diff --git a/content/docs/apex-3.6/development_best_practices/index.html 
b/content/docs/apex-3.6/development_best_practices/index.html
new file mode 100644
index 0000000..bbbff1b
--- /dev/null
+++ b/content/docs/apex-3.6/development_best_practices/index.html
@@ -0,0 +1,383 @@
+<!DOCTYPE html>
+<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
+<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
+<head>
+  <meta charset="utf-8">
+  <meta http-equiv="X-UA-Compatible" content="IE=edge">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  
+  
+  
+  <title>Best Practices - Apache Apex Documentation</title>
+  
+
+  <link rel="shortcut icon" href="../favicon.ico">
+  
+
+  
+  <link 
href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700'
 rel='stylesheet' type='text/css'>
+
+  <link rel="stylesheet" href="../css/theme.css" type="text/css" />
+  <link rel="stylesheet" href="../css/theme_extra.css" type="text/css" />
+  <link rel="stylesheet" href="../css/highlight.css">
+
+  
+  <script>
+    // Current page data
+    var mkdocs_page_name = "Best Practices";
+    var mkdocs_page_input_path = "development_best_practices.md";
+    var mkdocs_page_url = "/development_best_practices/";
+  </script>
+  
+  <script src="../js/jquery-2.1.1.min.js"></script>
+  <script src="../js/modernizr-2.8.3.min.js"></script>
+  <script type="text/javascript" src="../js/highlight.pack.js"></script>
+  <script src="../js/theme.js"></script> 
+
+  
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+
+    
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+      <div class="wy-side-nav-search">
+        <a href=".." class="icon icon-home"> Apache Apex Documentation</a>
+        <div role="search">
+  <form id ="rtd-search-form" class="wy-form" action="../search.html" 
method="get">
+    <input type="text" name="q" placeholder="Search docs" />
+  </form>
+</div>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" 
aria-label="main navigation">
+        <ul class="current">
+          
+            <li>
+    <li class="toctree-l1 ">
+        <a class="" href="..">Apache Apex</a>
+        
+    </li>
+<li>
+          
+            <li>
+    <ul class="subnav">
+    <li><span>Development</span></li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../apex_development_setup/">Development Setup</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../application_development/">Applications</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../application_packages/">Packages</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../operator_development/">Operators</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../autometrics/">AutoMetric API</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../control_tuples/">Custom Control Tuples</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 current">
+        <a class="current" href="./">Best Practices</a>
+        
+            <ul>
+            
+                <li class="toctree-l3"><a 
href="#development-best-practices">Development Best Practices</a></li>
+                
+                    <li><a class="toctree-l4" 
href="#operators">Operators</a></li>
+                
+                    <li><a class="toctree-l4" href="#input-operators">Input 
Operators</a></li>
+                
+                    <li><a class="toctree-l4" href="#output-operators">Output 
Operators</a></li>
+                
+                    <li><a class="toctree-l4" 
href="#partitioning">Partitioning</a></li>
+                
+                    <li><a class="toctree-l4" href="#threads">Threads</a></li>
+                
+            
+            </ul>
+        
+    </li>
+
+        
+    </ul>
+<li>
+          
+            <li>
+    <ul class="subnav">
+    <li><span>Operations</span></li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../apex_cli/">Apex CLI</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../security/">Security</a>
+        
+    </li>
+
+        
+    </ul>
+<li>
+          
+            <li>
+    <li class="toctree-l1 ">
+        <a class="" href="../compatibility/">Compatibility</a>
+        
+    </li>
+<li>
+          
+        </ul>
+      </div>
+      &nbsp;
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+
+      
+      <nav class="wy-nav-top" role="navigation" aria-label="top navigation">
+        <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+        <a href="..">Apache Apex Documentation</a>
+      </nav>
+
+      
+      <div class="wy-nav-content">
+        <div class="rst-content">
+          <div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="..">Docs</a> &raquo;</li>
+    
+      
+        
+          <li>Development &raquo;</li>
+        
+      
+    
+    <li>Best Practices</li>
+    <li class="wy-breadcrumbs-aside">
+      
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main">
+            <div class="section">
+              
+                <h1 id="development-best-practices">Development Best 
Practices</h1>
+<p>This document describes the best practices to follow when developing 
operators and other application components such as partitioners, stream codecs 
etc. on the Apache Apex platform.</p>
+<h2 id="operators">Operators</h2>
+<p>These are general guidelines for all operators that are covered in the 
current section. The subsequent sections talk about special considerations for 
input and output operators.</p>
+<ul>
+<li>When writing a new operator to be used in an application, consider 
breaking it down into<ul>
+<li>An abstract operator that encompasses the core functionality but leaves 
application specific schemas and logic to the implementation.</li>
+<li>An optional concrete operator also in the library that extends the 
abstract operator and provides commonly used schema types such as strings, 
byte[] or POJOs.</li>
+</ul>
+</li>
+<li>Follow these conventions for the life cycle methods:<ul>
+<li>Do one time initialization of entities that apply for the entire lifetime 
of the operator in the <strong>setup</strong> method, e.g., factory 
initializations. Initializations in <strong>setup</strong> are done in the 
container where the operator is deployed. Allocating memory for fields in the 
constructor is not efficient as it would lead to extra garbage in memory for 
the following reason. The operator is instantiated on the client from where the 
application is launched, serialized and started on one of the Hadoop nodes in a 
container. So the constructor is first called on the client and if it were to 
initialize any of the fields, that state would be saved during serialization. 
In the Hadoop container the operator is deserialized and started. This would 
invoke the constructor again, which will initialize the fields but their state 
will get overwritten by the serialized state and the initial values would 
become garbage in memory.</li>
+<li>Do one time initialization for live entities in <strong>activate</strong> 
method, e.g., opening connections to a database server or starting a thread for 
asynchronous operations. The <strong>activate</strong> method is called right 
before processing starts so it is a better place for these initializations than 
at <strong>setup</strong> which can lead to a delay before processing data from 
the live entity.  </li>
+<li>Perform periodic tasks based on processing time in application window 
boundaries.</li>
+<li>Perform initializations needed for each application window in 
<strong>beginWindow</strong>.</li>
+<li>Perform aggregations needed for each application window  in 
<strong>endWindow</strong>.</li>
+<li>Teardown of live entities (inverse of tasks performed during activate) 
should be in the <strong>deactivate</strong> method.</li>
+<li>Teardown of lifetime entities (those initialized in setup method) should 
happen in the <strong>teardown</strong> method.</li>
+<li>If the operator implementation is not finalized mark it with the 
<strong>@Evolving</strong> annotation.</li>
+</ul>
+</li>
+<li>If the operator needs to perform operations based on event time of the 
individual tuples and not the processing time, extend and use the 
<strong>WindowedOperator</strong>. Refer to documentation of that operator for 
details on how to use it.</li>
+<li>If an operator needs to do some work when it is not receiving any input, 
it should implement <strong>IdleTimeHandler</strong> interface. This interface 
contains <strong>handleIdleTime</strong> method which will be called whenever 
the platform isn't doing anything else and the operator can do the work in 
this method. If for any reason the operator does not have any work to do when 
this method is called, it should sleep for a small amount of time such as that 
specified by the <strong>SPIN_MILLIS</strong> attribute so that it does not 
cause a busy wait when called repeatedly by the platform. Also, the method 
should not block and return in a reasonable amount of time that is less than 
the streaming window size (which is 500ms by default).</li>
+<li>Often operators have customizable parameters such as information about 
locations of external systems or parameters that modify the behavior of the 
operator. Users should be able to specify these easily without having to change 
source code. This can be done by making them properties of the operator because 
they can then be initialized from external properties files.<ul>
+<li>Where possible default values should be provided for the properties in the 
source code.</li>
+<li>Validation rules should be specified for the properties using javax 
constraint validations that check whether the values specified for the 
properties are in the correct format, range or other operator requirements. 
Required properties should have at least a <strong>@NotNull</strong> validation 
specifying that they have to be specified by the user.</li>
+</ul>
+</li>
+</ul>
+<h3 id="checkpointing">Checkpointing</h3>
+<p>Checkpointing is a process of snapshotting the state of an operator and 
saving it so that in case of failure the state can be used to restore the 
operator to a prior state and continue processing. It is automatically 
performed by the platform at a configurable interval. All operators in the 
application are checkpointed in a distributed fashion, thus allowing the entire 
state of the application to be saved and available for recovery if needed. Here 
are some things to remember when it comes to checkpointing:</p>
+<ul>
+<li>The process of checkpointing involves snapshotting the state by 
serializing the operator and saving it to a store. This is done using a 
<strong>StorageAgent</strong>. By default a <em>StorageAgent</em> is already 
provided by the platform and it is called <strong>AsyncFSStorageAgent</strong>. 
It serializes the operator using Kryo and saves the serialized state 
asynchronously to a filesystem such as HDFS. There are other implementations of 
<em>StorageAgent</em> available such as 
<strong>GeodeKeyValueStorageAgent</strong> that stores the serialized state in 
Geode which is an in-memory replicated data grid.</li>
+<li>All variables in the operator marked neither transient nor final are saved 
so any variables in the operator that are not part of the state should be 
marked transient. Specifically any variables like connection objects, i/o 
streams, ports are transient, because they need to be setup again on failure 
recovery.</li>
+<li>If the operator does not keep any state between windows, mark it with the 
<strong>@Stateless</strong> annotation. This results in efficiencies during 
checkpointing and recovery. The operator will not be checkpointed and is always 
restored to the initial state.</li>
+<li>The checkpoint interval can be set using the 
<strong>CHECKPOINT_WINDOW_COUNT</strong> attribute which specifies the interval 
in terms of number of streaming windows.</li>
+<li>If the correct functioning of the operator requires the 
<strong>endWindow</strong> method be called before checkpointing can happen, 
then the checkpoint interval should align with application window interval 
i.e., it should be a multiple of application window interval. In this case the 
operator should be marked with <strong>OperatorAnnotation</strong> and 
<strong>checkpointableWithinAppWindow</strong> set to false. If the window 
intervals are configured by the user and they don't align, it will result in 
a DAG validation error and the application won't launch.</li>
+<li>In some cases the operator state related to a piece of data needs to be 
purged once that data is no longer required by the application, otherwise the 
state will continue to build up indefinitely. The platform provides a way to 
let the operator know about this using a callback listener called 
<strong>CheckpointNotificationListener</strong>. This listener has a callback 
method called <strong>committed</strong>, which is called by the platform from 
time to time with a window id that has been processed successfully by all the 
operators in the DAG and hence is no longer needed. The operator can delete all 
the state corresponding to window ids less than or equal to the provided window 
id.</li>
+<li>Sometimes operators need to perform some tasks just before checkpointing. 
For example, filesystem operators may want to flush the files just before 
checkpoint so they can be sure that all pending data is written to disk and no 
data is lost if there is an operator failure just after the checkpoint and the 
operator restarts from the checkpoint. To do this the operator would implement 
the same <em>CheckpointNotificationListener</em> interface and implement the 
<strong>beforeCheckpoint</strong> method where it can do these tasks.</li>
+<li>If the operator is going to have a large state, checkpointing the entire 
state each time becomes unviable. Furthermore, the amount of memory needed to 
hold the state could be larger than the amount of physical memory available. In 
these cases the operator should checkpoint the state incrementally and also 
manage the memory for the state more efficiently. The platform provides a 
utility called <strong>ManagedState</strong> that uses a combination of in 
memory and disk cache to efficiently store and retrieve data in a performant, 
fault tolerant way and also checkpoint it in an incremental fashion. There are 
operators in the platform that use <em>ManagedState</em> and can be used as a 
reference on how to use this utility such as Dedup or Join operators.</li>
+</ul>
+<h2 id="input-operators">Input Operators</h2>
+<p>Input operators have additional requirements:</p>
+<ul>
+<li>The <strong>emitTuples</strong> method implemented by the operator, is 
called by the platform, to give the operator an opportunity to emit some data. 
This method is always called within a window boundary but can be called 
multiple times within the same window. There are some important guidelines on 
how to implement this method:<ul>
+<li>This should not be a blocking method and should return in a reasonable 
time that is less than the streaming window size (which is 500ms by default). 
This also applies to other callback methods called by the platform such as 
<em>beginWindow</em>, <em>endWindow</em> etc., but is more important here since 
this method will be called continuously by the platform.</li>
+<li>If the operator needs to interact with external systems to obtain data and 
this can potentially take a long time, then this should be performed 
asynchronously in a different thread. Refer to the threading section below for 
the guidelines when using threading.</li>
+<li>In each invocation, the method can emit any number of data tuples.</li>
+</ul>
+</li>
+</ul>
+<h3 id="idempotence">Idempotence</h3>
+<p>Many applications write data to external systems using output operators. To 
ensure that data is present exactly once in the external system even in a 
failure recovery scenario, the output operators expect the replayed windows 
during recovery contain the same data as before the failure. This is called 
idempotency. Since operators within the DAG are merely responding to input data 
provided to them by the upstream operators and the input operator has no 
upstream operator, the responsibility of idempotent replay falls on the input 
operators.</p>
+<ul>
+<li>For idempotent replay of data, the operator needs to store some 
meta-information for every window that would allow it to identify what data was 
sent in that window. This is called the idempotent state.<ul>
+<li>If the external source of the input operator allows replayability, this 
could be information such as offset of last piece of data in the window, an 
identifier of the last piece of data itself or number of data tuples sent.</li>
+<li>However if the external source does not allow replayability from an 
operator specified point, then the entire data sent within the window may need 
to be persisted by the operator.</li>
+</ul>
+</li>
+<li>The platform provides a utility called <em>WindowDataManager</em> to allow 
operators to save and retrieve idempotent state every window. Operators should 
use this to implement idempotency.</li>
+</ul>
+<h2 id="output-operators">Output Operators</h2>
+<p>Output operators typically connect to external storage systems such as 
filesystems, databases or key value stores to store data.</p>
+<ul>
+<li>In some situations, the external systems may not be functioning in a 
reliable fashion. They may be having prolonged outages or performance problems. 
If the operator is being designed to work in such environments, it needs to be 
able to handle these problems gracefully and not block the DAG or fail. In 
these scenarios the operator should cache the data into a local store such as 
HDFS and interact with external systems in a separate thread so as to not have 
problems in the operator lifecycle thread. This pattern is called the 
<strong>Reconciler</strong> pattern and there are operators that implement this 
pattern available in the library for reference.</li>
+</ul>
+<h3 id="end-to-end-exactly-once">End-to-End Exactly Once</h3>
+<p>When output operators store data in external systems, it is important that 
they do not lose data or write duplicate data when there is a failure event and 
the DAG recovers from that failure. In failure recovery, the windows from the 
previous checkpoint are replayed and the operator receives this data again. The 
operator should ensure that it does not write this data again. Operator 
developers should figure out how to do this specifically for the operators they 
are developing depending on the logic of the operators. Below are examples of 
how a couple of existing output operators do this for reference.</p>
+<ul>
+<li>File output operator that writes data to files keeps track of the file 
lengths in the state. These lengths are checkpointed and restored on failure 
recovery. On restart, the operator truncates the file to the length equal to 
the length in the recovered state. This makes the data in the file same as it 
was at the time of checkpoint before the failure. The operator now writes the 
replayed data from the checkpoint in regular fashion as any other data. This 
ensures no data is lost or duplicated in the file.</li>
+<li>The JDBC output operator that writes data to a database table writes the 
data in a window in a single transaction. It also writes the current window id 
into a meta table along with the data as part of the same transaction. It 
commits the transaction at the end of the window. When there is an operator 
failure before the final commit, the state of the database is that it contains 
the data from the previous fully processed window and its window id since the 
current window transaction isn't yet committed. On recovery, the operator 
reads this window id back from the meta table. It ignores all the replayed 
windows whose window id is less than or equal to the recovered window id and 
thus ensures that it does not duplicate data already present in the database. 
It starts writing data normally again when window id of data becomes greater 
than recovered window thus ensuring no data is lost.</li>
+</ul>
+<h2 id="partitioning">Partitioning</h2>
+<p>Partitioning allows an operation to be scaled to handle more pieces of data 
than before but with a similar SLA. This is done by creating multiple instances 
of an operator and distributing the data among them. Input operators can also 
be partitioned to stream more pieces of data into the application. The platform 
provides a lot of flexibility and options for partitioning. Partitioning can 
happen once at startup or can be dynamically changed anytime while the 
application is running, and it can be done in a stateless or stateful way by 
distributing state from the old partitions to new partitions.</p>
+<p>In the platform, the responsibility for partitioning is shared among 
different entities. These are:</p>
+<ol>
+<li>A <strong>partitioner</strong> that specifies <em>how</em> to partition 
the operator, specifically it takes an old set of partitions and creates a new 
set of partitions. At the start of the application the old set has one 
partition and the partitioner can return more than one partition to start the 
application with multiple partitions. The partitioner can have any custom JAVA 
logic to determine the number of new partitions, set their initial state as a 
brand new state or derive it from the state of the old partitions. It also 
specifies how the data gets distributed among the new partitions. The new set 
doesn't have to contain only new partitions, it can carry over some old 
partitions if desired.</li>
+<li>An optional <strong>statistics (stats) listener</strong> that specifies 
<em>when</em> to partition. The reason it is optional is that it is needed only 
when dynamic partitioning is needed. With the stats listener, the stats can be 
used to determine when to partition.</li>
+<li>In some cases the <em>operator</em> itself should be aware of partitioning 
and would need to provide supporting code.<ul>
+<li>In case of input operators each partition should have a property or a set 
of properties that allow it to distinguish itself from the other partitions and 
fetch unique data.</li>
+</ul>
+</li>
+<li>When an operator that was originally a single instance is split into 
multiple partitions with each partition working on a subset of data, the 
results of the partitions may need to be combined together to compute the final 
result. The combining logic would depend on the logic of the operator. This 
would be specified by the developer using a <strong>Unifier</strong>, which is 
deployed as another operator by the platform. If no <em>Unifier</em> is 
specified, the platform inserts a <strong>default unifier</strong> that merges 
the results of the multiple partition streams into a single stream. Each output 
port can have a different <em>Unifier</em> and this is specified by returning 
the corresponding <em>Unifier</em> in the <strong>getUnifier</strong> method of 
the output port. The operator developer should provide a custom 
<em>Unifier</em> wherever applicable.</li>
+<li>The Apex <em>engine</em> that brings everything together and effects the 
partitioning.</li>
+</ol>
+<p>Since partitioning is critical for scalability of applications, operators 
must support it. There should be a strong reason for an operator to not support 
partitioning, such as, the logic performed by the operator not lending itself 
to parallelism. In order to support partitioning, an operator developer, apart 
from developing the functionality of the operator, may also need to provide a 
partitioner, stats listener and supporting code in the operator as described in 
the steps above. The next sections delve into this. </p>
+<h3 id="out-of-the-box-partitioning">Out of the box partitioning</h3>
+<p>The platform comes with some built-in partitioning utilities that can be 
used in certain scenarios.</p>
+<ul>
+<li>
+<p><strong>StatelessPartitioner</strong> provides a default partitioner, that 
can be used for an operator in certain conditions. If the operator satisfies 
these conditions, the partitioner can be specified for the operator with a 
simple setting and no other partitioning code is needed. The conditions are:</p>
+<ul>
+<li>No dynamic partitioning is needed, see next point about dynamic 
partitioning. </li>
+<li>There is no distinct initial state for the partitions, i.e., all 
partitions start with the same initial state submitted during application 
launch.</li>
+</ul>
+<p>Typically input or output operators do not fall into this category, 
although there are some exceptions. This partitioner is mainly used with 
operators that are in the middle of the DAG, after the input and before the 
output operators. When used with non-input operators, only the data for the 
first declared input port is distributed among the different partitions. All 
other input ports are treated as broadcast and all partitions receive all the 
data for that port.</p>
+</li>
+<li>
+<p><strong>StatelessThroughputBasedPartitioner</strong> in Malhar provides a 
dynamic partitioner based on throughput thresholds. Similarly 
<strong>StatelessLatencyBasedPartitioner</strong> provides a latency based 
dynamic partitioner in RTS. If these partitioners can be used, then separate 
partitioning related code is not needed. The conditions under which these can 
be used are:</p>
+<ul>
+<li>There is no distinct initial state for the partitions.</li>
+<li>There is no state being carried over by the operator from one window to 
the next, i.e., the operator is stateless.</li>
+</ul>
+</li>
+</ul>
+<h3 id="custom-partitioning">Custom partitioning</h3>
+<p>In many cases, operators don't satisfy the above conditions and a 
built-in partitioner cannot be used. Custom partitioning code needs to be 
written by the operator developer. Below are guidelines for it.</p>
+<ul>
+<li>Since the operator developer is providing a <em>partitioner</em> for the 
operator, the partitioning code should be added to the operator itself by 
making the operator implement the Partitioner interface and implementing the 
required methods, rather than creating a separate partitioner. The advantage is 
the user of the operator does not have to explicitly figure out the partitioner 
and set it for the operator but still has the option to override this built-in 
partitioner with a different one.</li>
+<li>The <em>partitioner</em> is responsible for setting the initial state of 
the new partitions, whether it is at the start of the application or when 
partitioning is happening while the application is running as in the dynamic 
partitioning case. In the dynamic partitioning scenario, the partitioner needs 
to take the state from the old partitions and distribute it among the new 
partitions. It is important to note that apart from the checkpointed state the 
partitioner also needs to distribute idempotent state.</li>
+<li>The <em>partitioner</em> interface has two methods, 
<strong>definePartitions</strong> and <strong>partitioned</strong>. The method 
<em>definePartitions</em> is first called to determine the new partitions, and 
if enough resources are available on the cluster, the <em>partitioned</em> 
method is called passing in the new partitions. This happens both during 
initial partitioning and dynamic partitioning. If resources are not available, 
partitioning is abandoned and existing partitions continue to run untouched. 
This means that any processing intensive operations should be deferred to the 
<em>partitioned</em> call instead of doing them in <em>definePartitions</em>, 
as they may not be needed if there are not enough resources available in the 
cluster.</li>
+<li>The <em>partitioner</em>, along with creating the new partitions, should 
also specify how the data gets distributed across the new partitions. It should 
do this by specifying a mapping called <strong>PartitionKeys</strong> for each 
partition that maps the data to that partition. This mapping needs to be 
specified for every input port in the operator. If the <em>partitioner</em> 
wants to use the standard mapping it can use a utility method called 
<strong>DefaultPartition.assignPartitionKeys</strong>.</li>
+<li>When the partitioner is scaling the operator up to more partitions, try to 
reuse the existing partitions and create new partitions to augment the current 
set. The reuse can be achieved by the partitioner returning the current 
partitions unchanged. This will result in the current partitions continuing to 
run untouched.</li>
+<li>In case of dynamic partitioning, as mentioned earlier, a stats listener is 
also needed to determine when to re-partition. Like the <em>Partitioner</em> 
interface, the operator can also implement the <em>StatsListener</em> interface 
to provide a stats listener implementation that will be automatically used.</li>
+<li>The <em>StatsListener</em> has access to all operator statistics to make 
its decision on partitioning. Apart from the statistics that the platform 
computes for the operators such as throughput, latency etc, operator developers 
can include their own business metrics by using the AutoMetric feature.</li>
+<li>If the operator is not partitionable, mark it so with 
<em>OperatorAnnotation</em> and <em>partitionable</em> element set to 
false.</li>
+</ul>
+<h3 id="streamcodecs">StreamCodecs</h3>
+<p>A <strong>StreamCodec</strong> is used in partitioning to distribute the 
data tuples among the partitions. The <em>StreamCodec</em> computes an integer 
hashcode for a data tuple and this is used along with <em>PartitionKeys</em> 
mapping to determine which partition or partitions receive the data tuple. If a 
<em>StreamCodec</em> is not specified, then a default one is used by the 
platform which returns the JAVA hashcode of the tuple. </p>
+<p><em>StreamCodec</em> is also useful in another aspect of the application. 
It is used to serialize and deserialize the tuple to transfer it between 
operators. The default <em>StreamCodec</em> uses Kryo library for 
serialization. </p>
+<p>The following guidelines are useful when considering a custom 
<em>StreamCodec</em>:</p>
+<ul>
+<li>A custom <em>StreamCodec</em> is needed if the tuples need to be 
distributed based on a criteria different from the hashcode of the tuple. If 
the correct working of an operator depends on the data from the upstream 
operator being distributed using a custom criteria such as being sticky on a 
"key" field within the tuple, then a custom <em>StreamCodec</em> should be 
provided by the operator developer. This codec can implement the custom 
criteria. The operator should also return this custom codec in the 
<strong>getStreamCodec</strong> method of the input port.</li>
+<li>When implementing a custom <em>StreamCodec</em> for the purpose of using a 
different criteria to distribute the tuples, the codec can extend an existing 
<em>StreamCodec</em> and implement the hashcode method, so that the codec does 
not have to worry about the serialization and deserialization functionality. 
The Apex platform provides two pre-built <em>StreamCodec</em> implementations 
for this purpose, one is <strong>KryoSerializableStreamCodec</strong> that uses 
Kryo for serialization and another one 
<strong>JavaSerializationStreamCodec</strong> that uses JAVA serialization.</li>
+<li>Different <em>StreamCodec</em> implementations can be used for different 
inputs in a stream with multiple inputs when different criteria of distributing 
the tuples is desired between the multiple inputs. </li>
+</ul>
+<h2 id="threads">Threads</h2>
+<p>The operator lifecycle methods such as <strong>setup</strong>, 
<strong>beginWindow</strong>, <strong>endWindow</strong>, 
<strong>process</strong> in <em>InputPorts</em> are all called from a single 
operator lifecycle thread, by the platform, unbeknownst to the user. So the 
user does not have to worry about dealing with the issues arising from 
multi-threaded code. Use of separate threads in an operator is discouraged 
because in most cases the motivation for this is parallelism, but parallelism 
can already be achieved by using multiple partitions and furthermore mistakes 
can be made easily when writing multi-threaded code. When dealing with high 
volume and velocity data, the corner cases with incorrectly written 
multi-threaded code are encountered more easily and exposed. However, there are 
times when separate threads are needed, for example, when interacting with 
external systems the delay in retrieving or sending data can be large at times, 
blocking the operator and other DAG processing such as committed windows. In 
these cases the following guidelines 
must be followed strictly.</p>
+<ul>
+<li>Threads should be started in <strong>activate</strong> and stopped in 
<strong>deactivate</strong>. In <em>deactivate</em> the operator should wait 
until any threads it launched have finished execution. It can do so by calling 
<strong>join</strong> on the threads or if using 
<strong>ExecutorService</strong>, calling <strong>awaitTermination</strong> on 
the service.</li>
+<li>Threads should not call any methods on the ports directly as this can 
cause concurrency exceptions and also result in invalid states.</li>
+<li>Threads can share state with the lifecycle methods using data structures 
that are either explicitly protected by synchronization or are inherently 
thread safe such as thread safe queues.</li>
+<li>If this shared state needs to be protected against failure then it needs 
to be persisted during checkpoint. To have a consistent checkpoint, the state 
should not be modified by the thread when it is being serialized and saved by 
the operator lifecycle thread during checkpoint. Since the checkpoint process 
happens outside the window boundary the thread should be quiesced between 
<strong>endWindow</strong> and <strong>beginWindow</strong> or more efficiently 
between pre-checkpoint and checkpointed callbacks.</li>
+</ul>
+              
+            </div>
+          </div>
+          <footer>
+  
+    <div class="rst-footer-buttons" role="navigation" aria-label="footer 
navigation">
+      
+        <a href="../apex_cli/" class="btn btn-neutral float-right" title="Apex 
CLI">Next <span class="icon icon-circle-arrow-right"></span></a>
+      
+      
+        <a href="../control_tuples/" class="btn btn-neutral" title="Custom 
Control Tuples"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+      
+    </div>
+  
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+    
+  </div>
+
+  Built with <a href="http://www.mkdocs.org">MkDocs</a> using a <a 
href="https://github.com/snide/sphinx_rtd_theme">theme</a> provided by <a 
href="https://readthedocs.org">Read the Docs</a>.
+</footer>
+         
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+<div class="rst-versions" role="note" style="cursor: pointer">
+    <span class="rst-current-version" data-toggle="rst-current-version">
+      
+      
+        <span><a href="../control_tuples/" style="color: #fcfcfc;">&laquo; 
Previous</a></span>
+      
+      
+        <span style="margin-left: 15px"><a href="../apex_cli/" style="color: 
#fcfcfc">Next &raquo;</a></span>
+      
+    </span>
+</div>
+
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/apex-site/blob/82e5a921/content/docs/apex-3.6/favicon.ico
----------------------------------------------------------------------
diff --git a/content/docs/apex-3.6/favicon.ico 
b/content/docs/apex-3.6/favicon.ico
new file mode 100644
index 0000000..c0b3dae
Binary files /dev/null and b/content/docs/apex-3.6/favicon.ico differ

http://git-wip-us.apache.org/repos/asf/apex-site/blob/82e5a921/content/docs/apex-3.6/fonts/fontawesome-webfont.eot
----------------------------------------------------------------------
diff --git a/content/docs/apex-3.6/fonts/fontawesome-webfont.eot 
b/content/docs/apex-3.6/fonts/fontawesome-webfont.eot
new file mode 100755
index 0000000..0662cb9
Binary files /dev/null and 
b/content/docs/apex-3.6/fonts/fontawesome-webfont.eot differ

[08/13] apex-site git commit: from 60672495c84ed54ff505dfaec874cad9f5f13075

Reply via email to