Repository: apex-site
Updated Branches:
  refs/heads/asf-site 76e8eb4de -> fb75848ff


http://git-wip-us.apache.org/repos/asf/apex-site/blob/fb75848f/docs/apex-3.6/operator_development/index.html
----------------------------------------------------------------------
diff --git a/docs/apex-3.6/operator_development/index.html 
b/docs/apex-3.6/operator_development/index.html
new file mode 100644
index 0000000..da6a106
--- /dev/null
+++ b/docs/apex-3.6/operator_development/index.html
@@ -0,0 +1,683 @@
+<!DOCTYPE html>
+<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
+<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
+<head>
+  <meta charset="utf-8">
+  <meta http-equiv="X-UA-Compatible" content="IE=edge">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  
+  
+  
+  <title>Operators - Apache Apex Documentation</title>
+  
+
+  <link rel="shortcut icon" href="../favicon.ico">
+  
+
+  
+  <link 
href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700'
 rel='stylesheet' type='text/css'>
+
+  <link rel="stylesheet" href="../css/theme.css" type="text/css" />
+  <link rel="stylesheet" href="../css/theme_extra.css" type="text/css" />
+  <link rel="stylesheet" href="../css/highlight.css">
+
+  
+  <script>
+    // Current page data
+    var mkdocs_page_name = "Operators";
+    var mkdocs_page_input_path = "operator_development.md";
+    var mkdocs_page_url = "/operator_development/";
+  </script>
+  
+  <script src="../js/jquery-2.1.1.min.js"></script>
+  <script src="../js/modernizr-2.8.3.min.js"></script>
+  <script type="text/javascript" src="../js/highlight.pack.js"></script>
+  <script src="../js/theme.js"></script> 
+
+  
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+
+    
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+      <div class="wy-side-nav-search">
+        <a href=".." class="icon icon-home"> Apache Apex Documentation</a>
+        <div role="search">
+  <form id ="rtd-search-form" class="wy-form" action="../search.html" 
method="get">
+    <input type="text" name="q" placeholder="Search docs" />
+  </form>
+</div>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" 
aria-label="main navigation">
+        <ul class="current">
+          
+            <li>
+    <li class="toctree-l1 ">
+        <a class="" href="..">Apache Apex</a>
+        
+    </li>
+<li>
+          
+            <li>
+    <ul class="subnav">
+    <li><span>Development</span></li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../apex_development_setup/">Development Setup</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../application_development/">Applications</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../application_packages/">Packages</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 current">
+        <a class="current" href="./">Operators</a>
+        
+            <ul>
+            
+                <li class="toctree-l3"><a 
href="#operator-development-guide">Operator Development Guide</a></li>
+                
+            
+                <li class="toctree-l3"><a href="#apache-apex-operators">Apache 
Apex Operators </a></li>
+                
+                    <li><a class="toctree-l4" 
href="#operators-what-in-a-nutshell">Operators - “What” in a 
nutshell</a></li>
+                
+                    <li><a class="toctree-l4" 
href="#operators-how-in-a-nutshell">Operators - “How” in a nutshell</a></li>
+                
+                    <li><a class="toctree-l4" href="#types-of-operators">Types 
of Operators</a></li>
+                
+                    <li><a class="toctree-l4" 
href="#operators-position-in-a-dag">Operators Position in a DAG</a></li>
+                
+                    <li><a class="toctree-l4" href="#ports">Ports</a></li>
+                
+                    <li><a class="toctree-l4" href="#how-operator-works">How 
Operator Works</a></li>
+                
+            
+                <li class="toctree-l3"><a 
href="#developing-custom-operators">Developing Custom Operators </a></li>
+                
+                    <li><a class="toctree-l4" 
href="#about-this-tutorial">About this tutorial</a></li>
+                
+                    <li><a class="toctree-l4" 
href="#introduction">Introduction</a></li>
+                
+                    <li><a class="toctree-l4" href="#design">Design</a></li>
+                
+                    <li><a class="toctree-l4" 
href="#configuration">Configuration</a></li>
+                
+                    <li><a class="toctree-l4" href="#code">Code</a></li>
+                
+            
+                <li class="toctree-l3"><a href="#operator-reference">Operator 
Reference </a></li>
+                
+                    <li><a class="toctree-l4" href="#the-operator-class">The 
Operator Class</a></li>
+                
+                    <li><a class="toctree-l4" 
href="#class-operator-properties">Class (Operator) properties</a></li>
+                
+                    <li><a class="toctree-l4" href="#the-constructor">The 
Constructor</a></li>
+                
+                    <li><a class="toctree-l4" href="#setup-call">Setup 
call</a></li>
+                
+                    <li><a class="toctree-l4" href="#begin-window-call">Begin 
Window call</a></li>
+                
+                    <li><a class="toctree-l4" 
href="#process-tuple-call">Process Tuple call</a></li>
+                
+                    <li><a class="toctree-l4" href="#end-window-call">End 
Window call</a></li>
+                
+                    <li><a class="toctree-l4" href="#teardown-call">Teardown 
call</a></li>
+                
+                    <li><a class="toctree-l4" 
href="#testing-your-operator">Testing your Operator</a></li>
+                
+            
+                <li class="toctree-l3"><a href="#advanced-features">Advanced 
Features </a></li>
+                
+                    <li><a class="toctree-l4" 
href="#control-tuple-support">Control Tuple Support</a></li>
+                
+            
+                <li class="toctree-l3"><a 
href="#malhar-operator-library">Malhar Operator Library</a></li>
+                
+            
+            </ul>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../autometrics/">AutoMetric API</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../control_tuples/">Custom Control Tuples</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../development_best_practices/">Best Practices</a>
+        
+    </li>
+
+        
+    </ul>
+<li>
+          
+            <li>
+    <ul class="subnav">
+    <li><span>Operations</span></li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../apex_cli/">Apex CLI</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../security/">Security</a>
+        
+    </li>
+
+        
+    </ul>
+<li>
+          
+            <li>
+    <li class="toctree-l1 ">
+        <a class="" href="../compatibility/">Compatibility</a>
+        
+    </li>
+<li>
+          
+        </ul>
+      </div>
+      &nbsp;
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+
+      
+      <nav class="wy-nav-top" role="navigation" aria-label="top navigation">
+        <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+        <a href="..">Apache Apex Documentation</a>
+      </nav>
+
+      
+      <div class="wy-nav-content">
+        <div class="rst-content">
+          <div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="..">Docs</a> &raquo;</li>
+    
+      
+        
+          <li>Development &raquo;</li>
+        
+      
+    
+    <li>Operators</li>
+    <li class="wy-breadcrumbs-aside">
+      
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main">
+            <div class="section">
+              
+                <h1 id="operator-development-guide">Operator Development 
Guide</h1>
+<p>Operators are basic building blocks of an application built to run on
+Apache Apex platform. An application may consist of one or more
+operators each of which define some logical operation to be done on the
+tuples arriving at the operator. These operators are connected together
+using streams forming a Directed Acyclic Graph (DAG). In other words, a 
streaming
+application is represented by a DAG that consists of operations (called 
operators) and
+data flow (called streams).</p>
+<p>In this document we will discuss details on how an operator works and
+its internals. This document is intended to serve the following purposes</p>
+<ol>
+<li><strong><a href="#apex_operators">Apache Apex Operators</a></strong> - 
Introduction to operator terminology and concepts.</li>
+<li><strong><a href="#writing_custom_operators">Writing Custom 
Operators</a></strong> - Designing, coding and testing new operators from 
scratch.  Includes code examples.</li>
+<li><strong><a href="#operator_reference">Operator Reference</a></strong> - 
Details of operator internals, lifecycle, and best practices and 
optimizations.</li>
+<li><strong><a href="#advanced_features">Advanced Features</a></strong> - 
Advanced features in operator development and its capabilities.</li>
+</ol>
+<hr />
+<h1 id="apache-apex-operators">Apache Apex Operators <a 
name="apex_operators"></a></h1>
+<h2 id="operators-what-in-a-nutshell">Operators - “What” in a nutshell</h2>
+<p>Operators are independent units of logical operations which can
+contribute in executing the business logic of a use case. For example,
+in an ETL workflow, a filtering operation can be represented by a single
+operator. This filtering operator will be responsible for doing just one
+task in the ETL pipeline, i.e. filter incoming tuples. Operators do not
+impose any restrictions on what can or cannot be done as part of an
+operator. An operator may as well contain the entire business logic.
+However, it is recommended, that the operators are light weight
+independent tasks, in
+order to take advantage of the distributed framework that Apache Apex
+provides. The structure of a streaming application shares resemblance
+with the way CPU pipelining works. CPU pipelining breaks down the
+computation engine into different stages viz. instruction fetch,
+instruction decode, etc. so that each of them can perform their task on
+different instructions
+in parallel. Similarly,
+Apache Apex APIs allow the user to break down their tasks into different
+stages so that all of the tasks can be executed on different tuples
+in parallel.</p>
+<p><img alt="" src="../images/operator/image05.png" /></p>
+<h2 id="operators-how-in-a-nutshell">Operators - “How” in a nutshell</h2>
+<p>An Apache Apex application runs as a YARN application. Hence, each of
+the operators that the application DAG contains, runs in one of the
+containers provisioned by YARN. Further, Apache Apex exposes APIs to
+allow the user to request bundling multiple operators in a single node,
+a single container or even a single thread. We shall look at these calls
+in the reference sections [cite reference sections]. For now, consider
+an operator as some piece of code that runs on some machine of a YARN
+cluster.</p>
+<h2 id="types-of-operators">Types of Operators</h2>
+<p>An operator works on one tuple at a time. These tuples may be supplied
+by other operators in the application or by external sources,
+such as a database or a message bus. Similarly, after the tuples are
+processed, these may be passed on to other operators, or stored into an 
external system.
+There are 3 types of operators based on function:
+<ol>
+<li><strong>Input Adapter</strong> - This is one of the starting points in
+    the application DAG and is responsible for getting tuples from an
+    external system. At the same time, such data may also be generated
+    by the operator itself, without interacting with the outside
+    world. These input tuples will form the initial universe of
+    data that the application works on.</li>
+<li><strong>Generic Operator</strong> - This type of operator accepts input 
tuples from
+    the previous operators and passes them on to the following operators
+    in the DAG.</li>
+<li><strong>Output Adapter</strong> - This is one of the ending points in the 
application
+    DAG and is responsible for writing the data out to some external
+    system.</li>
+</ol>
+<p>Note: There can be multiple operators of all types in an application
+DAG.</p>
+<h2 id="operators-position-in-a-dag">Operators Position in a DAG</h2>
+<p>We may refer to operators depending on their position with respect to
+one another. For any operator opr (see image below), there are two types of 
operators.</p>
+<ol>
+<li><strong>Upstream operators</strong> - These are the operators from which 
there is a
+    directed path to opr in the application DAG.</li>
+<li><strong>Downstream operators</strong> - These are the operators to which 
there is a
+    directed path from opr in the application DAG.</li>
+</ol>
+<p>Note that there are no cycles formed in the application DAG.</p>
+<p><img alt="" src="../images/operator/image00.png" /></p>
+<h2 id="ports">Ports</h2>
+<p>Operators in a DAG are connected together via directed flows
+called streams. Each stream has end-points located on the operators
+called ports. There are 2 types of ports.
+<ol>
+<li><strong>Input Port</strong> - This is a port through which an operator 
accepts input
+    tuples from an upstream operator.</li>
+<li><strong>Output port</strong> - This is a port through which an operator 
passes on the
+    processed data to downstream operators.</li>
+</ol>
+<p>Looking at the number of input ports, an Input Adapter is an operator
+with no input ports, a Generic operator has both input and output ports,
+while an Output Adapter has no output ports. At the same time, note that
+an operator may act as an Input Adapter while at the same time have an
+input port. In such cases, the operator is getting data from two
+different sources, viz. the input stream from the input port and an
+external source.</p>
+<p><img alt="" src="../images/operator/image02.png" /></p>
+<hr />
+<h2 id="how-operator-works">How Operator Works</h2>
+<p>An operator passes through various stages during its lifetime. Each
+stage is an API call that the Streaming Application Master makes for an
+operator.  The following figure illustrates the stages through which an
+operator passes.</p>
+<p><img alt="" src="../images/operator/image01.png" /></p>
+<ul>
+<li>The <em>setup()</em> call initializes the operator and prepares itself to
+    start processing tuples.</li>
+<li>The <em>beginWindow()</em> call marks the beginning of an application 
window
+    and allows for any processing to be done before a window starts.</li>
+<li>The <em>process()</em> call belongs to the <em>InputPort</em> and gets 
triggered when
+    any tuple arrives at the Input port of the operator. This call is
+    specific only to Generic and Output adapters, since Input Adapters
+    do not have an input port. This is made for all the tuples at the
+    input port until the end window marker tuple is received on the
+    input port.</li>
+<li>The <em>emitTuples()</em> is the counterpart of <em>process()</em> call 
for Input
+    Adapters.
+    This call is used by Input adapters to emit any tuples that are
+    fetched from the external systems, or generated by the operator.
+    This method is called continuously until the pre-configured window
+    time is elapsed, at which the end window marker tuple is sent out on
+    the output port.</li>
+<li>The <em>endWindow()</em> call marks the end of the window and allows for 
any
+    processing to be done after the window ends.</li>
+<li>The <em>teardown()</em> call is used for gracefully shutting down the
+    operator and releasing any resources held by the operator.</li>
+</ul>
+<h1 id="developing-custom-operators">Developing Custom Operators <a 
name="writing_custom_operators"></a></h1>
+<h2 id="about-this-tutorial">About this tutorial</h2>
+<p>This tutorial will guide the user towards developing an operator from
+scratch. It includes all aspects of writing an operator including
+design, code and unit testing.</p>
+<h2 id="introduction">Introduction</h2>
+<p>In this tutorial, we will design and write, from scratch, an operator
+called Word Count. This operator will accept tuples of type String,
+count the number of occurrences for each word appearing in the tuple and
+send out the updated counts for all the words encountered in the tuple.
+Further, the operator will also accept a file path on HDFS which will
+contain the stop-words which need to be ignored when counting
+occurrences.</p>
+<h2 id="design">Design</h2>
+<p>Design of the operator must be finalized before starting to write an
+operator. Many aspects including the functionality, the data sources,
+the types involved etc. need to be first finalized before writing the
+operator. Let us dive into each of these while considering the Word
+Count operator.</p>
+<h3 id="functionality">Functionality</h3>
+<p>We can define the scope of operator functionality using the following
+tasks:</p>
+<ol>
+<li>Parse the input tuple to identify the words in the tuple</li>
+<li>Identify the stop-words in the tuple by looking up the stop-word
+    file as configured</li>
+<li>For each non-stop-word in the tuple, count the occurrences in that
+    tuple and add it to the global counts</li>
+</ol>
+<p>Let’s consider an example. Suppose we have the following tuples flow
+into the Word Count operator.</p>
+<ol>
+<li><em>Humpty dumpty sat on a wall</em></li>
+<li><em>Humpty dumpty had a great fall</em></li>
+</ol>
+<p>Initially counts for all words are 0. Once the first tuple is processed,
+the counts that must be emitted are:</p>
+<pre><code class="java">humpty - 1
+dumpty - 1
+sat - 1
+wall - 1
+</code></pre>
+
+<p>Note that we are ignoring the stop-words, “on” and “a” in this case.
+Also note that as a rule, we’ll ignore the case of the words when
+counting occurrences.</p>
+<p>Similarly, after the second tuple is processed, the counts that must be
+emitted are:</p>
+<pre><code class="java">humpty - 2
+dumpty - 2
+great - 1
+fall - 1
+</code></pre>
+
+<p>Again, we ignore the words <em>“had”</em> and <em>“a”</em> since 
these are stop-words.</p>
+<p>Note that the most recent count for any word is the correct count for that
+word. In other words, any new output for a word, invalidates all the
+previous counts for that word.</p>
+<h3 id="inputs">Inputs</h3>
+<p>As seen from the example above, the following inputs are expected for
+the operator:</p>
+<ol>
+<li>Input stream whose tuple type is String</li>
+<li>Input HDFS file path, pointing to a file containing stop-words</li>
+</ol>
+<p>Only one input port is needed. The stop-word file will be small enough
+to be read completely in a single read. In addition this will be a one
+time activity for the lifetime of the operator. This does not need a
+separate input port.</p>
+<p><img alt="" src="../images/operator/image03.png" /></p>
+<h3 id="outputs">Outputs</h3>
+<p>We can define the output for this operator in multiple ways.</p>
+<ol>
+<li>The operator may send out the set of counts for which the counts
+    have changed after processing each tuple.</li>
+<li>Some applications might not need an update after every tuple, but
+    only after a certain time duration.</li>
+</ol>
+<p>Let us try and implement both these options depending on the
+configuration. Let us define a boolean configuration parameter
+<em>“sendPerTuple”</em>. The value of this parameter will indicate whether 
the
+updated counts for words need to be emitted after processing each
+tuple (true) or after a certain time duration (false).</p>
+<p>The type of information the operator will be sending out on the output
+port is the same for all the cases. This will be a <em>&lt; key, value 
&gt;</em> pair,
+where the key is the word while, the value is the latest count for that
+word. This means we just need one output port on which this information
+will go out.</p>
+<p><img alt="" src="../images/operator/image04.png" /></p>
+<h2 id="configuration">Configuration</h2>
+<p>We have the following configuration parameters:</p>
+<ol>
+<li><em>stopWordFilePath</em> - This parameter will store the path to the stop
+    word file on HDFS as configured by the user.</li>
+<li><em>sendPerTuple</em> - This parameter decides whether we send out the
+    updated counts after processing each tuple or at the end of a
+    window. When set to true, the operator will send out the updated
+    counts after each tuple, else it will send at the end of
+    each window.</li>
+</ol>
+<h2 id="code">Code</h2>
+<p>The source code for the tutorial can be found here:</p>
+<p><a 
href="https://github.com/DataTorrent/examples/tree/master/tutorials/operatorTutorial";>https://github.com/DataTorrent/examples/tree/master/tutorials/operatorTutorial</a></p>
+<h1 id="operator-reference">Operator Reference <a 
name="operator_reference"></a></h1>
+<h3 id="the-operator-class">The Operator Class</h3>
+<p>The operator will exist physically as a class which implements the
+Operator interface. This interface will require implementations for the
+following method calls:</p>
+<ul>
+<li>setup(OperatorContext context)</li>
+<li>beginWindow(long windowId)</li>
+<li>endWindow()</li>
+<li>tearDown()</li>
+</ul>
+<p>In order to simplify the creation of an operator, Apache Apex
+library also provides a base class “BaseOperator” which has empty
+implementations for these methods. Please refer to the <a 
href="#apex_operators">Apex Operators</a> section and the
+<a href="#operator_reference">Reference</a> section for details on these.</p>
+<p>We extend the class “BaseOperator” to create our own operator
+“WordCountOperator”.</p>
+<pre><code class="java">public class WordCountOperator extends BaseOperator
+{
+}
+</code></pre>
+
+<h3 id="class-operator-properties">Class (Operator) properties</h3>
+<p>We define the following class variables:</p>
+<ul>
+<li><em>sendPerTuple</em> - Configures the output frequency from the 
operator</li>
+</ul>
+<pre><code class="java">private boolean sendPerTuple = true; // default
+</code></pre>
+
+<ul>
+<li><em>stopWordFilePath</em> - Stores the path to the stop words file on 
HDFS</li>
+</ul>
+<pre><code class="java">private String stopWordFilePath; // no default
+</code></pre>
+
+<ul>
+<li><em>stopWords</em> - Stores the stop words read from the configured 
file</li>
+</ul>
+<pre><code class="java">private transient String[] stopWords;
+</code></pre>
+
+<ul>
+<li><em>globalCounts</em> - A Map which stores the counts of all the words
+    encountered so far. Note that this variable is non transient, which
+    means that this variable is saved as part of the checkpoint and can be 
recovered in event of a crash.</li>
+</ul>
+<pre><code class="java">private Map&lt;String, Long&gt; globalCounts;
+</code></pre>
+
+<ul>
+<li><em>updatedCounts</em> - A Map which stores the counts for only the most
+    recent tuple(s). sendPerTuple configuration determines whether to store 
the most recent or the recent
+    window worth of tuples.</li>
+</ul>
+<pre><code class="java">private transient Map&lt;String, Long&gt; 
updatedCounts;
+</code></pre>
+
+<ul>
+<li><em>input</em> - The input port for the operator. The type of this input 
port
+    is String which means it will only accept tuples of type String. The
+    definition of an input port requires implementation of a method
+    called process(String tuple), which should have the processing logic
+    for the input tuple which  arrives at this input port. We delegate
+    this task to another method called processTuple(String tuple). This
+    helps in keeping the operator classes extensible by overriding the
+    processing logic for the input tuples.</li>
+</ul>
+<pre><code class="java">public transient DefaultInputPort&lt;String&gt; input 
= new    
+DefaultInputPort&lt;String&gt;()
+{
+    @Override
+    public void process(String tuple)
+    {
+        processTuple(tuple);
+    }
+};
+</code></pre>
+
+<ul>
+<li>output - The output port for the operator. The type of this port is
+    Entry &lt; String, Long &gt;, which means the operator will emit &lt; word,
+    count &gt; pairs for the updated counts.</li>
+</ul>
+<pre><code class="java">public transient DefaultOutputPort 
&lt;Entry&lt;String, Long&gt;&gt; output = new
+DefaultOutputPort&lt;Entry&lt;String,Long&gt;&gt;();
+</code></pre>
+
+<h3 id="the-constructor">The Constructor</h3>
+<p>The constructor is the place where we initialize the non-transient data
+structures, since
+constructor is called just once per activation of an operator. With regards to 
Word Count operator, we initialize the globalCounts variable in the 
constructor.</p>
+<pre><code class="java">globalCounts = Maps.newHashMap();
+</code></pre>
+
+<h3 id="setup-call">Setup call</h3>
+<p>The setup method is called only once during an operator lifetime and its 
purpose is to allow
+the operator to set itself up for processing incoming streams. Transient 
objects in the operator are
+not serialized and checkpointed. Hence, it is essential that such objects 
are initialized in the setup call.
+In case of operator failure, the operator will be redeployed (most likely on a 
different container). The setup method called by the Apache Apex engine allows 
the operator to prepare for execution in the new container.</p>
+<p>The following tasks are executed as part of the setup call:</p>
+<ol>
+<li>Read the stop-word list from HDFS and store it in the
+    stopWords array</li>
+<li>Initialize updatedCounts variable. This will store the updated
+    counts for words in most recent tuples processed by the operator.
+    As a transient variable, the value will be lost when operator fails.</li>
+</ol>
+<h3 id="begin-window-call">Begin Window call</h3>
+<p>The begin window call signals the start of an application window. With
+regards to Word Count Operator, we are expecting updated counts for the most 
recent window of
+data if the sendPerTuple is set to false. Hence, we clear the updatedCounts 
variable in the begin window
+call and start accumulating the counts till the end window call.</p>
+<h3 id="process-tuple-call">Process Tuple call</h3>
+<p>The processTuple method is called by the process method of the input
+port, input. This method defines the processing logic for the current
+tuple that is received at the input port. As part of this method, we
+identify the words in the current tuple and update the globalCounts and
+the updatedCounts variables. In addition, if the sendPerTuple variable
+is set to true, we also emit the words and corresponding counts in
+updatedCounts to the output port. Note that in this case (sendPerTuple =
+true), we clear the updatedCounts variable in every call to
+processTuple.</p>
+<h3 id="end-window-call">End Window call</h3>
+<p>This call signals the end of an application window. With regards to Word
+Count Operator, we emit the updatedCounts to the output port if the
+sendPerTuple flag is set to false.</p>
+<h3 id="teardown-call">Teardown call</h3>
+<p>This method allows the operator to gracefully shut down itself after
+releasing the resources that it has acquired. With regards to our operator,
+we call the shutDown method which shuts down the operator along with any
+downstream operators.</p>
+<h2 id="testing-your-operator">Testing your Operator</h2>
+<p>As part of testing our operator, we test the following two facets:</p>
+<ol>
+<li>Test output of the operator after processing a single tuple</li>
+<li>Test output of the operator after processing of a window of tuples</li>
+</ol>
+<p>The unit tests for the WordCount operator are available in the class
+WordCountOperatorTest.java. We simulate the behavior of the engine by
+using the test utilities provided by Apache Apex libraries. We simulate
+the setup, beginWindow, process method of the input port and
+endWindow calls and compare the output received at the simulated output
+ports.</p>
+<ol>
+<li>Invoke constructor; non-transients initialized.</li>
+<li>Copy state from checkpoint -- initialized values from step 1 are
+replaced.</li>
+</ol>
+<h1 id="advanced-features">Advanced Features <a 
name="advanced_features"></a></h1>
+<h2 id="control-tuple-support">Control Tuple Support</h2>
+<p>Operators now also have the capability to emit control tuples. These 
control tuples are different from the control tuples used by the engine like 
BEGIN_WINDOW and END_WINDOW tuples. Operators can create and emit their own 
control tuples which can be used to communicate to the down stream operators 
regarding some event. Examples of such events can be BEGIN_FILE, or END_FILE.
+More details can be found at <a href="../control_tuples/">Control 
Tuples</a></p>
+<h1 id="malhar-operator-library">Malhar Operator Library</h1>
+<p>To see the full list of Apex Malhar operators along with related 
documentation, visit <a href="https://github.com/apache/apex-malhar";>Apex 
Malhar on Github</a></p>
+              
+            </div>
+          </div>
+          <footer>
+  
+    <div class="rst-footer-buttons" role="navigation" aria-label="footer 
navigation">
+      
+        <a href="../autometrics/" class="btn btn-neutral float-right" 
title="AutoMetric API">Next <span class="icon 
icon-circle-arrow-right"></span></a>
+      
+      
+        <a href="../application_packages/" class="btn btn-neutral" 
title="Packages"><span class="icon icon-circle-arrow-left"></span> Previous</a>
+      
+    </div>
+  
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+    
+  </div>
+
+  Built with <a href="http://www.mkdocs.org";>MkDocs</a> using a <a 
href="https://github.com/snide/sphinx_rtd_theme";>theme</a> provided by <a 
href="https://readthedocs.org";>Read the Docs</a>.
+</footer>
+         
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+<div class="rst-versions" role="note" style="cursor: pointer">
+    <span class="rst-current-version" data-toggle="rst-current-version">
+      
+      
+        <span><a href="../application_packages/" style="color: 
#fcfcfc;">&laquo; Previous</a></span>
+      
+      
+        <span style="margin-left: 15px"><a href="../autometrics/" 
style="color: #fcfcfc">Next &raquo;</a></span>
+      
+    </span>
+</div>
+
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/apex-site/blob/fb75848f/docs/apex-3.6/search.html
----------------------------------------------------------------------
diff --git a/docs/apex-3.6/search.html b/docs/apex-3.6/search.html
new file mode 100644
index 0000000..cb4c571
--- /dev/null
+++ b/docs/apex-3.6/search.html
@@ -0,0 +1,221 @@
+<!DOCTYPE html>
+<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
+<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
+<head>
+  <meta charset="utf-8">
+  <meta http-equiv="X-UA-Compatible" content="IE=edge">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  
+  
+  
+  <title>Apache Apex Documentation</title>
+  
+
+  <link rel="shortcut icon" href="favicon.ico">
+  
+
+  
+  <link 
href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700'
 rel='stylesheet' type='text/css'>
+
+  <link rel="stylesheet" href="./css/theme.css" type="text/css" />
+  <link rel="stylesheet" href="./css/theme_extra.css" type="text/css" />
+  <link rel="stylesheet" href="./css/highlight.css">
+
+  
+  <script src="./js/jquery-2.1.1.min.js"></script>
+  <script src="./js/modernizr-2.8.3.min.js"></script>
+  <script type="text/javascript" src="./js/highlight.pack.js"></script>
+  <script src="./js/theme.js"></script>
+  <script>var base_url = '.';</script>
+  <script data-main="./mkdocs/js/search.js" 
src="./mkdocs/js/require.js"></script>
+
+
+  
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+
+    
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+      <div class="wy-side-nav-search">
+        <a href="." class="icon icon-home"> Apache Apex Documentation</a>
+        <div role="search">
+  <form id ="rtd-search-form" class="wy-form" action="./search.html" 
method="get">
+    <input type="text" name="q" placeholder="Search docs" />
+  </form>
+</div>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" 
aria-label="main navigation">
+        <ul class="current">
+          
+            <li>
+    <li class="toctree-l1 ">
+        <a class="" href=".">Apache Apex</a>
+        
+    </li>
+<li>
+          
+            <li>
+    <ul class="subnav">
+    <li><span>Development</span></li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="apex_development_setup/">Development Setup</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="application_development/">Applications</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="application_packages/">Packages</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="operator_development/">Operators</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="autometrics/">AutoMetric API</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="control_tuples/">Custom Control Tuples</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="development_best_practices/">Best Practices</a>
+        
+    </li>
+
+        
+    </ul>
+<li>
+          
+            <li>
+    <ul class="subnav">
+    <li><span>Operations</span></li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="apex_cli/">Apex CLI</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="security/">Security</a>
+        
+    </li>
+
+        
+    </ul>
+<li>
+          
+            <li>
+    <li class="toctree-l1 ">
+        <a class="" href="compatibility/">Compatibility</a>
+        
+    </li>
+<li>
+          
+        </ul>
+      </div>
+      &nbsp;
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+
+      
+      <nav class="wy-nav-top" role="navigation" aria-label="top navigation">
+        <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+        <a href=".">Apache Apex Documentation</a>
+      </nav>
+
+      
+      <div class="wy-nav-content">
+        <div class="rst-content">
+          <div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href=".">Docs</a> &raquo;</li>
+    
+    
+    <li class="wy-breadcrumbs-aside">
+      
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main">
+            <div class="section">
+              
+
+  <h1 id="search">Search Results</h1>
+
+  <form id="content_search" action="search.html">
+    <span role="status" aria-live="polite" 
class="ui-helper-hidden-accessible"></span>
+    <input name="q" id="mkdocs-search-query" type="text" class="search_input 
search-query ui-autocomplete-input" placeholder="Search the Docs" 
autocomplete="off" autofocus>
+  </form>
+
+  <div id="mkdocs-search-results">
+    Searching...
+  </div>
+
+
+            </div>
+          </div>
+          <footer>
+  
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+    
+  </div>
+
+  Built with <a href="http://www.mkdocs.org";>MkDocs</a> using a <a 
href="https://github.com/snide/sphinx_rtd_theme";>theme</a> provided by <a 
href="https://readthedocs.org";>Read the Docs</a>.
+</footer>
+         
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+<div class="rst-versions" role="note" style="cursor: pointer">
+    <span class="rst-current-version" data-toggle="rst-current-version">
+      
+      
+      
+    </span>
+</div>
+
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/apex-site/blob/fb75848f/docs/apex-3.6/searchbox.html
----------------------------------------------------------------------
diff --git a/docs/apex-3.6/searchbox.html b/docs/apex-3.6/searchbox.html
new file mode 100644
index 0000000..177fcb3
--- /dev/null
+++ b/docs/apex-3.6/searchbox.html
@@ -0,0 +1,5 @@
+<div role="search">
+  <form id ="rtd-search-form" class="wy-form" action="{{ base_url 
}}/search.html" method="get">
+    <input type="text" name="q" placeholder="Search docs" />
+  </form>
+</div>

http://git-wip-us.apache.org/repos/asf/apex-site/blob/fb75848f/docs/apex-3.6/security/index.html
----------------------------------------------------------------------
diff --git a/docs/apex-3.6/security/index.html 
b/docs/apex-3.6/security/index.html
new file mode 100644
index 0000000..a83591d
--- /dev/null
+++ b/docs/apex-3.6/security/index.html
@@ -0,0 +1,435 @@
+<!DOCTYPE html>
+<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
+<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
+<head>
+  <meta charset="utf-8">
+  <meta http-equiv="X-UA-Compatible" content="IE=edge">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  
+  
+  
+  <title>Security - Apache Apex Documentation</title>
+  
+
+  <link rel="shortcut icon" href="../favicon.ico">
+  
+
+  
+  <link 
href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700'
 rel='stylesheet' type='text/css'>
+
+  <link rel="stylesheet" href="../css/theme.css" type="text/css" />
+  <link rel="stylesheet" href="../css/theme_extra.css" type="text/css" />
+  <link rel="stylesheet" href="../css/highlight.css">
+
+  
+  <script>
+    // Current page data
+    var mkdocs_page_name = "Security";
+    var mkdocs_page_input_path = "security.md";
+    var mkdocs_page_url = "/security/";
+  </script>
+  
+  <script src="../js/jquery-2.1.1.min.js"></script>
+  <script src="../js/modernizr-2.8.3.min.js"></script>
+  <script type="text/javascript" src="../js/highlight.pack.js"></script>
+  <script src="../js/theme.js"></script> 
+
+  
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+
+    
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+      <div class="wy-side-nav-search">
+        <a href=".." class="icon icon-home"> Apache Apex Documentation</a>
+        <div role="search">
+  <form id ="rtd-search-form" class="wy-form" action="../search.html" 
method="get">
+    <input type="text" name="q" placeholder="Search docs" />
+  </form>
+</div>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" 
aria-label="main navigation">
+        <ul class="current">
+          
+            <li>
+    <li class="toctree-l1 ">
+        <a class="" href="..">Apache Apex</a>
+        
+    </li>
+<li>
+          
+            <li>
+    <ul class="subnav">
+    <li><span>Development</span></li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../apex_development_setup/">Development Setup</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../application_development/">Applications</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../application_packages/">Packages</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../operator_development/">Operators</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../autometrics/">AutoMetric API</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../control_tuples/">Custom Control Tuples</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../development_best_practices/">Best Practices</a>
+        
+    </li>
+
+        
+    </ul>
+<li>
+          
+            <li>
+    <ul class="subnav">
+    <li><span>Operations</span></li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../apex_cli/">Apex CLI</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 current">
+        <a class="current" href="./">Security</a>
+        
+            <ul>
+            
+                <li class="toctree-l3"><a href="#security">Security</a></li>
+                
+                    <li><a class="toctree-l4" 
href="#kerberos-authentication">Kerberos Authentication</a></li>
+                
+                    <li><a class="toctree-l4" 
href="#configuring-security">Configuring security</a></li>
+                
+                    <li><a class="toctree-l4" 
href="#security-architecture">Security architecture</a></li>
+                
+                    <li><a class="toctree-l4" 
href="#conclusion">Conclusion</a></li>
+                
+            
+            </ul>
+        
+    </li>
+
+        
+    </ul>
+<li>
+          
+            <li>
+    <li class="toctree-l1 ">
+        <a class="" href="../compatibility/">Compatibility</a>
+        
+    </li>
+<li>
+          
+        </ul>
+      </div>
+      &nbsp;
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+
+      
+      <nav class="wy-nav-top" role="navigation" aria-label="top navigation">
+        <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+        <a href="..">Apache Apex Documentation</a>
+      </nav>
+
+      
+      <div class="wy-nav-content">
+        <div class="rst-content">
+          <div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="..">Docs</a> &raquo;</li>
+    
+      
+        
+          <li>Operations &raquo;</li>
+        
+      
+    
+    <li>Security</li>
+    <li class="wy-breadcrumbs-aside">
+      
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main">
+            <div class="section">
+              
+                <h1 id="security">Security</h1>
+<p>Applications built on Apex run as native YARN applications on Hadoop. The 
security framework and apparatus in Hadoop apply to the applications. The 
default security mechanism in Hadoop is Kerberos.</p>
+<h2 id="kerberos-authentication">Kerberos Authentication</h2>
+<p>Kerberos is a ticket based authentication system that provides 
authentication in a distributed environment where authentication is needed 
between multiple users, hosts and services. It is the de-facto authentication 
mechanism supported in Hadoop. To use Kerberos authentication, the Hadoop 
installation must first be configured for secure mode with Kerberos. Please 
refer to the administration guide of your Hadoop distribution on how to do 
that. Once Hadoop is configured, some configuration is needed on the Apex side 
as well.</p>
+<h2 id="configuring-security">Configuring security</h2>
+<p>The Apex command line interface (CLI) program, <code>apex</code>, is used 
to launch applications on the Hadoop cluster along with performing various 
other operations and administrative tasks on the applications. In a secure 
cluster additional configuration is needed for the CLI program 
<code>apex</code>.</p>
+<h3 id="cli-configuration">CLI Configuration</h3>
+<p>When Kerberos security is enabled in Hadoop, a Kerberos ticket granting 
ticket (TGT) or the Kerberos credentials of the user are needed by the CLI 
program <code>apex</code> to authenticate with Hadoop for any operation. 
Kerberos credentials are composed of a principal and either a <em>keytab</em> 
or a password. For security and operational reasons only keytabs are supported 
in Hadoop and by extension in Apex platform. When user credentials are 
specified, all operations including launching application are performed as that 
user.</p>
+<h4 id="using-kinit">Using kinit</h4>
+<p>A Kerberos ticket granting ticket (TGT) can be obtained by using the 
Kerberos command <code>kinit</code>. Detailed documentation for the command can 
be found online or in man pages. A sample usage of this command is</p>
+<pre><code>kinit -k -t path-to-keytab-file kerberos-principal
+</code></pre>
+<p>If this command is successful, the TGT is obtained, cached and available 
for other programs. The CLI program <code>apex</code> can then be started to 
launch applications and perform other operations.</p>
+<h4 id="using-kerberos-credentials">Using Kerberos credentials</h4>
+<p>The CLI program <code>apex</code> can also use the Kerberos credentials 
directly without requiring a TGT to be obtained separately. This can be useful 
in batch mode where <code>apex</code> is not launched manually and also in 
scenarios where running another program like <code>kinit</code> is not 
feasible.</p>
+<p>The credentials can be specified in the <code>dt-site.xml</code> 
configuration file. If only a single user is launching applications, the global 
<code>dt-site.xml</code> configuration file in the installation folder can be 
used. In a multi-user environment the users can use the 
<code>dt-site.xml</code> file in their
+home directory. The location of this file will be 
<code>$HOME/.dt/dt-site.xml</code>. If this file does not exist, the user can 
create a new one.</p>
+<p>The snippet below shows how the credentials can be specified in the 
configuration file as properties.</p>
+<pre><code class="xml">&lt;property&gt;
+        &lt;name&gt;dt.authentication.principal&lt;/name&gt;
+        &lt;value&gt;kerberos-principal-of-user&lt;/value&gt;
+&lt;/property&gt;
+&lt;property&gt;
+        &lt;name&gt;dt.authentication.keytab&lt;/name&gt;
+        &lt;value&gt;absolute-path-to-keytab-file&lt;/value&gt;
+&lt;/property&gt;
+</code></pre>
+
+<p>The property <code>dt.authentication.principal</code> specifies the 
Kerberos user principal and <code>dt.authentication.keytab</code> specifies the 
absolute path to the keytab file for the user.</p>
+<h3 id="web-services-security">Web Services security</h3>
+<p>Alongside every Apex application, there is an application master process 
called Streaming Container Manager (STRAM) running. STRAM manages the 
application by handling the various control aspects of the application such as 
orchestrating the execution of the application on the cluster, playing a key 
role in scalability and fault tolerance, providing application insight by 
collecting statistics among other functionality.</p>
+<p>STRAM provides a web service interface to introspect the state of the 
application and its various components and to make dynamic changes to the 
applications. Some examples of supported functionality are getting resource 
usage and partition information of various operators, getting operator 
statistics and changing properties of running operators.</p>
+<p>Access to the web services can be secured to prevent unauthorized access. 
By default it is automatically enabled in Hadoop secure mode environments and 
not enabled in non-secure environments. How the security actually works is 
described in <code>Security architecture</code> section below.</p>
+<p>There are additional options available for finer grained control on 
enabling it. This can be configured on a per-application basis using an 
application attribute. It can also be enabled or disabled based on Hadoop 
security configuration. The following security options are available</p>
+<ul>
+<li>Enable - Enable Authentication</li>
+<li>Follow Hadoop Authentication - Enable authentication if secure mode is 
enabled in Hadoop, the default</li>
+<li>Follow Hadoop HTTP Authentication - Enable authentication only if HTTP 
authentication is enabled in Hadoop and not just secure mode.</li>
+<li>Disable - Disable Authentication</li>
+</ul>
+<p>To specify the security option for an application the following 
configuration can be specified in the <code>dt-site.xml</code> file</p>
+<pre><code class="xml">&lt;property&gt;
+        
&lt;name&gt;dt.application.name.attr.STRAM_HTTP_AUTHENTICATION&lt;/name&gt;
+        &lt;value&gt;security-option&lt;/value&gt;
+&lt;/property&gt;
+</code></pre>
+
+<p>The security option value can be <code>ENABLED</code>, 
<code>FOLLOW_HADOOP_AUTH</code>, <code>FOLLOW_HADOOP_HTTP_AUTH</code> or 
<code>DISABLE</code> for the four options above respectively.</p>
+<p>The subsequent sections talk about how security works in Apex. This 
information is not needed by users but is intended for the inquisitive technical 
audience who want to know how security works.</p>
+<h4 id="cli-setup">CLI setup</h4>
+<p>The CLI program <code>apex</code> connects to the web service endpoint of 
the STRAM for a running application to query for information or to make changes 
to it. In order to do that, it has to first connect to the YARN proxy web 
service and get the necessary connection information and credentials to connect 
to STRAM. The proxy web service may have security enabled and in that case, the 
CLI program <code>apex</code> would first need to authenticate with the service 
before it can get any information.</p>
+<p>Hadoop allows a lot of flexibility in the kind of security to use for the 
proxy. It allows the user to plug-in their own authentication provider. The 
authentication provider is specified as a JAVA class name. It also comes 
bundled with a provider for Kerberos SPNEGO authentication. Some distributions 
also include a provider for BASIC authentication via SASL.</p>
+<p>The CLI <code>apex</code>, has built-in functionality for Kerberos SPNEGO, 
BASIC and DIGEST authentication mechanisms. Because of the way the 
authentication provider is configured for the proxy on the Hadoop side, there 
is no reliable way to determine beforehand what kind of authentication is 
being used. Only at runtime, when the CLI connects to the proxy web service 
will it know the type of authentication that the service is using. For this 
reason, <code>apex</code> allows the user to configure credentials for multiple 
authentication mechanisms it supports and will pick the one that matches what 
the service expects.</p>
+<p>If the authentication mechanism is Kerberos SPNEGO, the properties listed 
in the <a href="#using-kerberos-credentials">Using Kerberos credentials</a> 
section for general communication with Hadoop above are sufficient. No 
additional properties are needed.</p>
+<p>For BASIC authentication, the credentials can be specified using the 
following properties</p>
+<pre><code class="xml">&lt;property&gt;
+        &lt;name&gt;dt.authentication.basic.username&lt;/name&gt;
+        &lt;value&gt;username&lt;/value&gt;
+&lt;/property&gt;
+&lt;property&gt;
+        &lt;name&gt;dt.authentication.basic.password&lt;/name&gt;
+        &lt;value&gt;password&lt;/value&gt;
+&lt;/property&gt;
+</code></pre>
+
+<p>For DIGEST authentication, the credentials can be specified using the 
following properties</p>
+<pre><code class="xml">&lt;property&gt;
+        &lt;name&gt;dt.authentication.digest.username&lt;/name&gt;
+        &lt;value&gt;username&lt;/value&gt;
+&lt;/property&gt;
+&lt;property&gt;
+        &lt;name&gt;dt.authentication.digest.password&lt;/name&gt;
+        &lt;value&gt;password&lt;/value&gt;
+&lt;/property&gt;
+</code></pre>
+
+<h3 id="token-refresh">Token Refresh</h3>
+<p>Apex applications, at runtime, use delegation tokens to authenticate with 
Hadoop services when communicating with them as described in the security 
architecture section below. The delegation tokens are originally issued by 
these Hadoop services and have an expiry time period which is typically 7 days. 
The tokens become invalid beyond this time and the applications will no longer 
be able to communicate with the Hadoop services. For long running applications 
this presents a problem.</p>
+<p>To solve this problem one of the two approaches can be used. The first 
approach is to change the Hadoop configuration itself to extend the token 
expiry time period. This may not be possible in all environments as it requires 
a change in the security policy as the tokens will now be valid for a longer 
period of time and the change also requires administrator privileges to Hadoop. 
The second approach is to use a feature available in apex to auto-refresh the 
tokens before they expire. Both the approaches are detailed below and the users 
can choose the one that works best for them.</p>
+<h4 id="hadoop-configuration-approach">Hadoop configuration approach</h4>
+<p>An Apex application uses delegation tokens to authenticate with Hadoop 
services, Resource Manager (YARN) and Name Node (HDFS), and these tokens are 
issued by those services respectively. Since the application is long-running, 
the tokens can expire while the application is still running. Hadoop uses 
configuration settings for the maximum lifetime of these tokens. </p>
+<p>There are separate settings for ResourceManager and NameNode delegation 
tokens. In this approach the user increases the values of these settings to 
cover the lifetime of the application. Once these settings are changed, the 
YARN and HDFS services would have to be restarted. The values in these settings 
are of type <code>long</code> and have an upper limit so applications cannot run 
forever. This limitation is not present with the next approach described 
below.</p>
+<p>The Resource Manager delegation token max lifetime is specified in 
<code>yarn-site.xml</code> and can be specified as follows for a lifetime of 1 
year as an example</p>
+<pre><code class="xml">&lt;property&gt;
+  &lt;name&gt;yarn.resourcemanager.delegation.token.max-lifetime&lt;/name&gt;
+  &lt;value&gt;31536000000&lt;/value&gt;
+&lt;/property&gt;
+</code></pre>
+
+<p>The Name Node delegation token max lifetime is specified in
+hdfs-site.xml and can be specified as follows for a lifetime of 1 year as an 
example</p>
+<pre><code class="xml">&lt;property&gt;
+   &lt;name&gt;dfs.namenode.delegation.token.max-lifetime&lt;/name&gt;
+   &lt;value&gt;31536000000&lt;/value&gt;
+ &lt;/property&gt;
+</code></pre>
+
+<h4 id="auto-refresh-approach">Auto-refresh approach</h4>
+<p>In this approach the application, in anticipation of a token expiring, 
obtains a new token to replace the current one. It keeps repeating the process 
whenever a token is close to expiry so that the application can continue to run 
indefinitely.</p>
+<p>This requires the application having access to a keytab file at runtime 
because obtaining a new token requires a keytab. The keytab file should be 
present in HDFS so that the application can access it at runtime. The user can 
provide a HDFS location for the keytab file using a setting otherwise the 
keytab file specified for the <code>apex</code> CLI program above will be 
copied from the local filesystem into HDFS before the application is started 
and made available to the application. There are other optional settings 
available to configure the behavior of this feature. All the settings are 
described below.</p>
+<p>The location of the keytab can be specified by using the following setting 
in <code>dt-site.xml</code>. If it is not specified then the file specified in 
<code>dt.authentication.keytab</code> is copied into HDFS and used.</p>
+<pre><code class="xml">&lt;property&gt;
+        &lt;name&gt;dt.authentication.store.keytab&lt;/name&gt;
+        &lt;value&gt;hdfs-path-to-keytab-file&lt;/value&gt;
+&lt;/property&gt;
+</code></pre>
+
+<p>The expiry period of the Resource Manager and Name Node tokens needs to be 
known so that the application can renew them before they expire. These are 
automatically obtained using the 
<code>yarn.resourcemanager.delegation.token.max-lifetime</code> and 
<code>dfs.namenode.delegation.token.max-lifetime</code> properties from the 
hadoop configuration files. Sometimes however these properties are not 
available or kept up-to-date on the nodes running the applications. If that is 
the case then the following properties can be used to specify the expiry 
period, the values are in milliseconds. The example below shows how to specify 
these with values of 7 days.</p>
+<pre><code class="xml">&lt;property&gt;
+        
&lt;name&gt;dt.resourcemanager.delegation.token.max-lifetime&lt;/name&gt;
+        &lt;value&gt;604800000&lt;/value&gt;
+&lt;/property&gt;
+
+&lt;property&gt;
+        &lt;name&gt;dt.namenode.delegation.token.max-lifetime&lt;/name&gt;
+        &lt;value&gt;604800000&lt;/value&gt;
+&lt;/property&gt;
+</code></pre>
+
+<p>As explained earlier new tokens are obtained before the old ones expire. 
How early the new tokens are obtained before expiry is controlled by a setting. 
This setting is specified as a factor of the token expiration with a value 
between 0.0 and 1.0. The default value is <code>0.7</code>. This factor is 
multiplied with the expiration time to determine when to refresh the tokens. 
This setting can be changed by the user and the following example shows how 
this can be done</p>
+<pre><code class="xml">&lt;property&gt;
+        &lt;name&gt;dt.authentication.token.refresh.factor&lt;/name&gt;
+        &lt;value&gt;0.7&lt;/value&gt;
+&lt;/property&gt;
+</code></pre>
+
+<h3 id="impersonation">Impersonation</h3>
+<p>The CLI program <code>apex</code> supports Hadoop proxy user impersonation, 
allowing applications to be launched and other operations to be performed as 
a different user than the one specified by the Kerberos credentials. The 
Kerberos credentials are still used for authentication. This is useful in 
scenarios where a system using <code>apex</code> has to support multiple users 
but only has a single set of Kerberos credentials, those of a system user.</p>
+<h4 id="usage">Usage</h4>
+<p>To use this feature, the following environment variable should be set to 
the user name of the user being impersonated, before running <code>apex</code> 
and the operations will be performed as that user. For example, if launching an 
application, the application will run as the specified user and not as the user 
specified by the Kerberos credentials.</p>
+<pre><code>HADOOP_USER_NAME=&lt;username&gt;
+</code></pre>
+
+<h4 id="hadoop-configuration">Hadoop Configuration</h4>
+<p>For this feature to work, additional configuration settings are needed in 
Hadoop. These settings would allow a specified user, such as a system user, to 
impersonate other users. The example snippet below shows these settings. In 
this example, the specified user can impersonate users belonging to any group 
and can do so running from any host. Note that the user specified here is 
different from the user specified above in usage, there it is the user that is 
being impersonated and here it is the impersonating user such as a system 
user.</p>
+<pre><code class="xml">&lt;property&gt;
+  &lt;name&gt;hadoop.proxyuser.&lt;username&gt;.groups&lt;/name&gt;
+  &lt;value&gt;*&lt;/value&gt;
+&lt;/property&gt;
+
+&lt;property&gt;
+  &lt;name&gt;hadoop.proxyuser.&lt;username&gt;.hosts&lt;/name&gt;
+  &lt;value&gt;*&lt;/value&gt;
+&lt;/property&gt;
+</code></pre>
+
+<h2 id="security-architecture">Security architecture</h2>
+<p>In this section we will see how security works for applications built on 
Apex. We will look at the different methodologies involved in running the 
applications and in each case we will look into the different components that 
are involved. We will go into the architecture of these components and look at 
the different security mechanisms that are in play.</p>
+<h3 id="application-launch">Application Launch</h3>
+<p>To launch applications in Apache Apex the command line client 
<code>apex</code> can be used. The application artifacts such as binaries and 
properties are supplied as an application package. The client, during the 
various steps involved to launch the application needs to communicate with both 
the Resource Manager and the Name Node. The Resource Manager communication 
involves the client asking for new resources to run the application master and 
start the application launch process. The steps along with sample Java code are 
described in Writing YARN Applications. The Name Node communication includes 
the application artifacts being copied to HDFS so that they are available 
across the cluster for launching the different application containers.</p>
+<p>In secure mode, the communications with both Resource Manager and Name Node 
requires authentication and the mechanism is Kerberos. Below is an illustration 
showing this.</p>
+<p><img alt="" src="../images/security/image02.png" /></p>
+<p>The client <code>apex</code> supports Kerberos authentication and will 
automatically enable it in a secure environment. To authenticate, some Kerberos 
configuration namely the Kerberos credentials, are needed by the client. There 
are two parameters, the Kerberos principal and keytab to use for the client. 
These can be specified in the dt-site.xml configuration file. The properties 
are shown below</p>
+<pre><code>    &lt;property&gt;
+            &lt;name&gt;dt.authentication.principal&lt;/name&gt;
+            &lt;value&gt;kerberos-principal-of-user&lt;/value&gt;
+    &lt;/property&gt;
+    &lt;property&gt;
+            &lt;name&gt;dt.authentication.keytab&lt;/name&gt;
+            &lt;value&gt;absolute-path-to-keytab-file&lt;/value&gt;
+    &lt;/property&gt;
+</code></pre>
+<p>Refer to document Operation and Installation Guide section Multi Tenancy 
and Security subsection CLI Configuration in the documentation for more 
information. The document can also be accessed here client configuration</p>
+<p>There is another important functionality that is performed by the client 
and that is to retrieve what are called delegation tokens from the Resource 
Manager and Name Node to seed the application master container that is to be 
launched. This is detailed in the next section. </p>
+<h3 id="runtime-security">Runtime Security</h3>
+<p>When the application is completely up and running, there are different 
components of the application running as separate processes possibly on 
different nodes in the cluster as it is a distributed application. These 
components would be interacting with each other and the Hadoop 
services. In secure mode, all these interactions have to be authenticated 
before they can be successfully processed. The interactions are illustrated 
below in a diagram to give a complete overview. Each of them is explained in 
subsequent sections.</p>
+<p><img alt="" src="../images/security/image00.png" /></p>
+<h4 id="stram-and-hadoop">STRAM and Hadoop</h4>
+<p>Every Apache Apex application has a master process akin to any YARN 
application. In our case it is called STRAM (Streaming Application Master). It 
is a master process that runs in its own container and manages the different 
distributed components of the application. Among other tasks it requests 
Resource Manager for new resources as they are needed and gives back resources 
that are no longer needed. STRAM also needs to communicate with Name Node from 
time-to-time to access the persistent HDFS file system. </p>
+<p>In secure mode, STRAM has to authenticate with both Resource Manager and 
Name Node before it can send any requests and this is achieved using Delegation 
Tokens. Since STRAM runs as a managed application master, it runs in a Hadoop 
container. This container could have been allocated on any node based on what 
resources were available. Since there is no fixed node where STRAM runs, it 
does not have Kerberos credentials. Unlike launch client <code>apex</code>, it 
cannot authenticate with Hadoop services Resource Manager and Name Node using 
Kerberos. Instead, Delegation Tokens are used for authentication.</p>
+<h5 id="delegation-tokens">Delegation Tokens</h5>
+<p>Delegation tokens are tokens that are dynamically issued by the source and 
clients use them to authenticate with the source. The source stores the 
delegation tokens it has issued in a cache and checks the delegation token sent 
by a client against the cache. If a match is found, the authentication is 
successful else it fails. This is the second mode of authentication in secure 
Hadoop after Kerberos. More details can be found in the Hadoop security design 
document. In this case the delegation tokens are issued by Resource Manager and 
Name Node. STRAM would use these tokens to authenticate with them. But how does 
it get them in the first place? This is where the launch client 
<code>apex</code> comes in.</p>
+<p>The client <code>apex</code>, since it possesses Kerberos credentials as 
explained in the Application Launch section, is able to authenticate with 
Resource Manager and Name Node using Kerberos. It then requests for delegation 
tokens over the Kerberos authenticated connection. The servers return the 
delegation tokens in the response payload. The client in requesting the 
resource manager for the start of the application master container for STRAM 
seeds it with these tokens so that when STRAM starts it has these tokens. It 
can then use these tokens to authenticate with the Hadoop services.</p>
+<h4 id="streaming-container">Streaming Container</h4>
+<p>A streaming container is a process that runs a part of the application 
business logic. It is a container deployed on a node in the cluster. The part 
of business logic is implemented in what we call an operator. Multiple 
operators connected together make up the complete application and hence there 
are multiple streaming containers in an application. The streaming containers 
have different types of communications going on as illustrated in the diagram 
above. They are described below.</p>
+<h5 id="stram-delegation-token">STRAM Delegation Token</h5>
+<p>The streaming containers periodically communicate with the application 
master STRAM. In the communication they send what are called heartbeats with 
information such as statistics and receive commands from STRAM such as 
deployment or un-deployment of operators, changing properties of operators etc. 
In secure mode, this communication cannot just occur without any 
authentication. To facilitate this authentication special tokens called STRAM 
Delegation Tokens are used. These tokens are created and managed by STRAM. When 
a new streaming container is being started, since STRAM is the one negotiating 
resources from Resource Manager for the container and requesting to start the 
container, it seeds the container with the STRAM delegation token necessary to 
communicate with it. Thus, a streaming container has the STRAM delegation token 
to successfully authenticate and communicate with STRAM.</p>
+<h5 id="buffer-server-token">Buffer Server Token</h5>
+<p>As mentioned earlier an operator implements a piece of the business logic 
of the application and multiple operators together complete the application. In 
creating the application the operators are assembled together in a directed 
acyclic graph, a pipeline, with output of operators becoming the input for 
other operators. At runtime the stream containers hosting the operators are 
connected to each other and sending data to each other. In secure mode these 
connections should be authenticated too, more importantly than others, as they 
are involved in transferring application data.</p>
+<p>When operators are running there will be effective processing rate 
differences between them due to intrinsic reasons such as operator logic or 
external reasons such as different resource availability of CPU, memory, 
network bandwidth etc. as the operators are running in different containers. To 
maximize performance and utilization, the data flow is handled asynchronously to 
the regular operator function and a buffer is used to intermediately store the 
data that is being produced by the operator. This buffered data is served by a 
buffer server over the network connection to the downstream streaming container 
containing the operator that is supposed to receive the data from this 
operator. This connection is secured by a token called the buffer server token. 
These tokens are also generated and seeded by STRAM when the streaming 
containers are deployed and started and it uses different tokens for different 
buffer servers to have better security.</p>
+<h5 id="namenode-delegation-token">NameNode Delegation Token</h5>
+<p>Like STRAM, streaming containers also need to communicate with NameNode to 
use HDFS persistence for reasons such as saving the state of the operators. In 
secure mode they also use NameNode delegation tokens for authentication. These 
tokens are also seeded by STRAM for the streaming containers.</p>
+<h4 id="stram-web-services">Stram Web Services</h4>
+<p>Clients connect to STRAM and make web service requests to obtain 
operational information about running applications. When security is enabled we 
want this connection to also be authenticated. In this mode the client passes a 
web service token in the request and STRAM checks this token. If the token is 
valid, then the request is processed else it is denied.</p>
+<p>How does the client get the web service token in the first place? The 
client will have to first connect to STRAM via the Resource Manager Web 
Services Proxy which is a service run by Hadoop to proxy requests to 
application web services. This connection is authenticated by the proxy service 
using a protocol called SPNEGO when secure mode is enabled. SPNEGO is Kerberos 
over HTTP and the client also needs to support it. If the authentication is 
successful the proxy forwards the request to STRAM. STRAM in processing the 
request generates and sends back a web service token similar to a delegation 
token. This token is then used by the client in subsequent requests it makes 
directly to STRAM and STRAM is able to validate it since it generated the token 
in the first place.</p>
+<p><img alt="" src="../images/security/image03.png" /></p>
+<h2 id="conclusion">Conclusion</h2>
+<p>We looked at the different security configuration options that are 
available in Apex, saw the different security requirements for distributed 
applications in a secure Hadoop environment in detail and looked at how the 
various security mechanisms in Apex solve this.</p>
+              
+            </div>
+          </div>
+          <footer>
+  
+    <div class="rst-footer-buttons" role="navigation" aria-label="footer 
navigation">
+      
+        <a href="../compatibility/" class="btn btn-neutral float-right" 
title="Compatibility">Next <span class="icon 
icon-circle-arrow-right"></span></a>
+      
+      
+        <a href="../apex_cli/" class="btn btn-neutral" title="Apex CLI"><span 
class="icon icon-circle-arrow-left"></span> Previous</a>
+      
+    </div>
+  
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+    
+  </div>
+
+  Built with <a href="http://www.mkdocs.org";>MkDocs</a> using a <a 
href="https://github.com/snide/sphinx_rtd_theme";>theme</a> provided by <a 
href="https://readthedocs.org";>Read the Docs</a>.
+</footer>
+         
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+<div class="rst-versions" role="note" style="cursor: pointer">
+    <span class="rst-current-version" data-toggle="rst-current-version">
+      
+      
+        <span><a href="../apex_cli/" style="color: #fcfcfc;">&laquo; 
Previous</a></span>
+      
+      
+        <span style="margin-left: 15px"><a href="../compatibility/" 
style="color: #fcfcfc">Next &raquo;</a></span>
+      
+    </span>
+</div>
+
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/apex-site/blob/fb75848f/docs/apex-3.6/sitemap.xml
----------------------------------------------------------------------
diff --git a/docs/apex-3.6/sitemap.xml b/docs/apex-3.6/sitemap.xml
new file mode 100644
index 0000000..37a1dc2
--- /dev/null
+++ b/docs/apex-3.6/sitemap.xml
@@ -0,0 +1,82 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9";>
+
+    
+    <url>
+     <loc>/</loc>
+     <lastmod>2017-05-01</lastmod>
+     <changefreq>daily</changefreq>
+    </url>
+    
+
+    
+        
+    <url>
+     <loc>/apex_development_setup/</loc>
+     <lastmod>2017-05-01</lastmod>
+     <changefreq>daily</changefreq>
+    </url>
+        
+    <url>
+     <loc>/application_development/</loc>
+     <lastmod>2017-05-01</lastmod>
+     <changefreq>daily</changefreq>
+    </url>
+        
+    <url>
+     <loc>/application_packages/</loc>
+     <lastmod>2017-05-01</lastmod>
+     <changefreq>daily</changefreq>
+    </url>
+        
+    <url>
+     <loc>/operator_development/</loc>
+     <lastmod>2017-05-01</lastmod>
+     <changefreq>daily</changefreq>
+    </url>
+        
+    <url>
+     <loc>/autometrics/</loc>
+     <lastmod>2017-05-01</lastmod>
+     <changefreq>daily</changefreq>
+    </url>
+        
+    <url>
+     <loc>/control_tuples/</loc>
+     <lastmod>2017-05-01</lastmod>
+     <changefreq>daily</changefreq>
+    </url>
+        
+    <url>
+     <loc>/development_best_practices/</loc>
+     <lastmod>2017-05-01</lastmod>
+     <changefreq>daily</changefreq>
+    </url>
+        
+    
+
+    
+        
+    <url>
+     <loc>/apex_cli/</loc>
+     <lastmod>2017-05-01</lastmod>
+     <changefreq>daily</changefreq>
+    </url>
+        
+    <url>
+     <loc>/security/</loc>
+     <lastmod>2017-05-01</lastmod>
+     <changefreq>daily</changefreq>
+    </url>
+        
+    
+
+    
+    <url>
+     <loc>/compatibility/</loc>
+     <lastmod>2017-05-01</lastmod>
+     <changefreq>daily</changefreq>
+    </url>
+    
+
+</urlset>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/apex-site/blob/fb75848f/docs/apex-3.6/toc.html
----------------------------------------------------------------------
diff --git a/docs/apex-3.6/toc.html b/docs/apex-3.6/toc.html
new file mode 100644
index 0000000..6cd2fc9
--- /dev/null
+++ b/docs/apex-3.6/toc.html
@@ -0,0 +1,23 @@
+{% if nav_item.children %}
+    <ul class="subnav">
+    <li><span>{{ nav_item.title }}</span></li>
+
+        {% for nav_item in nav_item.children %}
+            {% include 'toc.html' %}
+        {% endfor %}
+    </ul>
+{% else %}
+    <li class="toctree-l1 {% if nav_item.active%}current{%endif%}">
+        <a class="{% if nav_item.active%}current{%endif%}" href="{{ 
nav_item.url }}">{{ nav_item.title }}</a>
+        {% if nav_item == current_page %}
+            <ul>
+            {% for toc_item in toc %}
+                <li class="toctree-l3"><a href="{{ toc_item.url }}">{{ 
toc_item.title }}</a></li>
+                {% for toc_item in toc_item.children %}
+                    <li><a class="toctree-l4" href="{{ toc_item.url }}">{{ 
toc_item.title }}</a></li>
+                {% endfor %}
+            {% endfor %}
+            </ul>
+        {% endif %}
+    </li>
+{% endif %}

http://git-wip-us.apache.org/repos/asf/apex-site/blob/fb75848f/docs/apex-3.6/versions.html
----------------------------------------------------------------------
diff --git a/docs/apex-3.6/versions.html b/docs/apex-3.6/versions.html
new file mode 100644
index 0000000..d12d197
--- /dev/null
+++ b/docs/apex-3.6/versions.html
@@ -0,0 +1,15 @@
+<div class="rst-versions" role="note" style="cursor: pointer">
+    <span class="rst-current-version" data-toggle="rst-current-version">
+      {% if repo_name == 'GitHub' %}
+          <a href="{{ repo_url }}" class="icon icon-github" style="float: 
left; color: #fcfcfc"> GitHub</a>
+      {% elif repo_name == 'Bitbucket' %}
+          <a href="{{ repo_url }}" class="icon icon-bitbucket" style="float: 
left; color: #fcfcfc"> BitBucket</a>
+      {% endif %}
+      {% if previous_page %}
+        <span><a href="{{ previous_page.url }}" style="color: 
#fcfcfc;">&laquo; Previous</a></span>
+      {% endif %}
+      {% if next_page %}
+        <span style="margin-left: 15px"><a href="{{ next_page.url }}" 
style="color: #fcfcfc">Next &raquo;</a></span>
+      {% endif %}
+    </span>
+</div>

Reply via email to