from cedfda71f7908cf704ae51d1e08cb54710ea13d5

Project: http://git-wip-us.apache.org/repos/asf/apex-site/repo
Commit: http://git-wip-us.apache.org/repos/asf/apex-site/commit/69d479dc
Tree: http://git-wip-us.apache.org/repos/asf/apex-site/tree/69d479dc
Diff: http://git-wip-us.apache.org/repos/asf/apex-site/diff/69d479dc

Branch: refs/heads/asf-site
Commit: 69d479dcb94e8a1a9c5e243faa290760cc4e61ec
Parents: 69764db
Author: Thomas Weise <[email protected]>
Authored: Mon Sep 5 16:38:34 2016 -0700
Committer: Thomas Weise <[email protected]>
Committed: Mon Sep 5 16:38:34 2016 -0700

----------------------------------------------------------------------
 content/docs/malhar                             |   2 +-
 content/docs/malhar-3.5/index.html              |   9 +-
 .../docs/malhar-3.5/mkdocs/search_index.json    | 210 ++++++
 .../operators/block_reader/index.html           |   7 +
 .../malhar-3.5/operators/deduper/index.html     | 724 +++++++++++++++++++
 .../malhar-3.5/operators/enricher/index.html    |  11 +
 .../malhar-3.5/operators/file_output/index.html |   7 +
 .../operators/file_splitter/index.html          |   7 +
 .../operators/images/deduper/image00.png        | Bin 0 -> 8612 bytes
 .../operators/images/deduper/image01.png        | Bin 0 -> 23903 bytes
 .../operators/images/deduper/image02.png        | Bin 0 -> 25300 bytes
 .../operators/images/deduper/image03.png        | Bin 0 -> 10901 bytes
 .../operators/images/deduper/image04.png        | Bin 0 -> 17387 bytes
 .../operators/kafkaInputOperator/index.html     |   7 +
 content/docs/malhar-3.5/search.html             |   7 +
 content/docs/malhar-3.5/sitemap.xml             |  18 +-
 16 files changed, 1001 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/apex-site/blob/69d479dc/content/docs/malhar
----------------------------------------------------------------------
diff --git a/content/docs/malhar b/content/docs/malhar
index 73b3aae..e2eb4d7 120000
--- a/content/docs/malhar
+++ b/content/docs/malhar
@@ -1 +1 @@
-malhar-3.3
\ No newline at end of file
+malhar-3.5
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/apex-site/blob/69d479dc/content/docs/malhar-3.5/index.html
----------------------------------------------------------------------
diff --git a/content/docs/malhar-3.5/index.html 
b/content/docs/malhar-3.5/index.html
index 6566963..cde5c67 100644
--- a/content/docs/malhar-3.5/index.html
+++ b/content/docs/malhar-3.5/index.html
@@ -125,6 +125,13 @@
     </li>
 
         
+            
+    <li class="toctree-l1 ">
+        <a class="" href="operators/deduper/">Deduper</a>
+        
+    </li>
+
+        
     </ul>
 <li>
           
@@ -251,5 +258,5 @@
 
 <!--
 MkDocs version : 0.15.3
-Build Date UTC : 2016-08-29 19:54:18.899629
+Build Date UTC : 2016-09-05 23:29:59.376834
 -->

http://git-wip-us.apache.org/repos/asf/apex-site/blob/69d479dc/content/docs/malhar-3.5/mkdocs/search_index.json
----------------------------------------------------------------------
diff --git a/content/docs/malhar-3.5/mkdocs/search_index.json 
b/content/docs/malhar-3.5/mkdocs/search_index.json
index 524fd0a..2f1eb01 100644
--- a/content/docs/malhar-3.5/mkdocs/search_index.json
+++ b/content/docs/malhar-3.5/mkdocs/search_index.json
@@ -519,6 +519,216 @@
             "location": "/operators/enricher/#latency-based", 
             "text": "Following code can be added to populateDAG method of 
application to dynamically partitioning POJOEnricher:      
StatelessLatencyBasedPartitioner POJOEnricher  partitioner = new 
StatelessLatencyBasedPartitioner ();\n    
partitioner.setCooldownMillis(conf.getLong(COOL_DOWN_MILLIS, 10000));\n    
partitioner.setMaximumLatency(conf.getLong(MAX_THROUGHPUT, 10));\n    
partitioner.setMinimumLatency(conf.getLong(MIN_THROUGHPUT, 3));\n    
dag.setAttribute(pojoEnricherObj, OperatorContext.STATS_LISTENERS, 
Arrays.asList(new StatsListener[]{partitioner}));\n    
dag.setAttribute(pojoEnricherObj, OperatorContext.PARTITIONER, partitioner);  
Above code will dynamically partition POJOEnricher when the overall latency of 
POJOEnricher changes.\nIf the overall latency of POJOEnricher goes beyond 10 ms 
or less than 3 ms, the platform will repartition POJOEnricher \nto balance 
latency of a single partition to be between 3 ms and 10 ms.\nCooldownMillis of 
10000 will be used as the thre
 shold time for which the latency change is observed.", 
             "title": "Latency based"
+        }, 
+        {
+            "location": "/operators/deduper/", 
+            "text": "Deduper - Operator 
Documentation\n\n\nIntroduction\n\n\nAbout this document\n\n\nThis document is 
intended as a guide for understanding and using\nthe Dedup 
operator.\n\n\nTerminology\n\n\nWe will refer to this operator as the Deduper 
or Dedup operator\ninterchangeably.\n\n\nOverview\n\n\nDedup - \u201cWhat\u201d 
in a Nutshell\n\n\nDedup is actually short for Deduplication. Duplicates are 
omnipresent and\ncan be found in almost any kind of data. Most of the times it 
is\nessential to discard, or at the very least separate out the data 
into\nunique\u00a0and duplicate\u00a0components. The entire purpose of 
this\noperator is to de-duplicate data. In other words, when data 
passes\nthrough this operator, it will be segregated into two different 
data\nsets, one of which contains all unique tuples, and the other which 
are\noccurring more than once in the original data set.\n\n\n\n\nDedup - 
\u201cHow\u201d in a Nutshell\n\n\nIn order to quickly decide whether an 
incoming
  tuple is duplicate\nor unique, it has to store each incoming tuple (or a 
signature, like key,\nfor example) to be used for comparison later. A plain 
in-memory storage\nmay work for small datasets, but will not scale for large 
ones. Deduper employs a large scale distributed persistent hashing mechanism 
(known as the Managed State) which allows\nit to identify if a particular tuple 
is duplicate or unique. Managed state is a layer on HDFS which allows all the 
stored data to be persisted in a distributed fashion.\nEach time it identifies 
a tuple as a unique tuple, it also\nstores it into the Managed state for 
future\nlookup.\n\n\n\n\nFollowing are the different components of the 
Deduper\n\n\n\n\nDedup Operator\n - This is responsible for the overall\n    
functionality of the operator. This in turn makes use of other\n    components 
to establish the end goal of deciding whether a tuple is\n    a duplicate of 
some earlier tuple, or is a unique tuple.\n\n\nManaged State\n - Since, all of 
 the data cannot be stored in\n    memory, this component allows us to persist 
existing unique keys on\n    HDFS in form of buckets. This is also responsible 
for fetching data as\n    requested by the Deduper. Since, it communicates with 
the HDFS, data access is slow and so it allows for asynchronous (non-blocking) 
calls to fetch data. This ensures that the Deduper is not blocked and can 
continue to process other tuples. It also supports an in-memory cache where it 
stores the fetched data so that repeated access to the same data is faster. 
Periodically, based on configuration, this also\n    discards data which is no 
longer needed.\n\n\n\n\nThis was a very basic introduction to the functioning 
of the\nDeduper. Following sections will go into more detail on each of 
the\ncomponents.\n\n\nUse cases - Basic Dedup\n\n\nDedup Key\n\n\nA dedup key 
is a set of one or more fields in the data tuple which\nacts as the 
key\u00a0for the tuples.\nThis is used by the deduper to compare tuples to ar
 rive at the\nconclusion on whether two tuples are duplicates.\n\n\nConsider an 
example schema and two sample tuples\n\n\n{Name, Phone, Email, Date, State, 
Zip, Country}\n\n\nTuple 1:\n\n\n{\n  Austin U. Saunders,\n  
+91-319-340-59385,\n  [email protected],\n  2015-11-09 
13:38:38,\n  Texas,\n  73301,\n  United States\n}\n\n\n\n\nTuple 2:\n\n\n{\n  
Austin U. Saunders,\n  +91-319-340-59385,\n  [email protected],\n  
2015-11-09 13:39:38,\n  Texas,\n  73301,\n  United States\n}\n\n\n\n\nLet us 
assume that the Dedup Key\nis \n{Name, Phone}\n. In\nthis case, the two tuples 
are duplicates because the key fields are same\nin both the tuples. However, if 
the Dedup Key is {Phone,Email},\nthe two are unique as the email values 
differ.\n\n\nUse case Details\n\n\nConsider the case of de-duplicating a master 
data set\nwhich is stored in a file.\u00a0Further also consider the\nfollowing 
schema for tuples in the data set.\n\n\n{Name, Phone, Email, Date, City, Zip, 
Country}\n
 \n\nAlso consider that we need to identify unique customers from the\nmaster 
data set. So, ultimately the output needed for the use case is\ntwo data sets - 
Unique Records\u00a0and Duplicate Records.\n\n\nAs part of configuring the 
operator for this use case, we need to\nset the following 
parameters:\n\n\n\n\nkeyExpression\n\u00a0- This can be set as\n    the primary 
key which can be used to uniquely identify a Customer.\n    For example, we can 
set it to \nName,Email\n\n\n\n\nThe above configuration is sufficient to 
address this use case.\n\n\nUse case - Dedup with 
Expiry\n\n\nMotivation\n\n\nThe Basic Dedup use case is the most 
straightforward and is\nusually applied when the amount of data to be processed 
is not huge.\nHowever, if the incoming data is huge, or even never-ending, it 
is\nusually not necessary to keep storing all the data. This is because 
in\nmost real world use cases, the duplicates occur only a short 
distance\napart. Hence, after a while, it is usually okay to for
 get part of\nthe history and consider only limited history for 
identifying\nduplicates, in the interest of efficiency. In other words, we 
expire\n(ignore) some tuples which are (or were supposed to be) delivered 
long\nback. Doing so, reduces the load on the storage mechanism (managed state) 
which effectively deletes part of the history, thus making the whole process 
more\nefficient. We call this use case, Dedup with expiry.\n\n\nExpiry 
Key\n\n\nThe easiest way to understand this use case is to 
consider\ntime\u00a0as the criterion for expiring\ntuples. Time\u00a0is a 
natural expiry\nkey and is in line with the concept of expiry. Formally, an 
expiry field\nis a field in the input tuple which can be used to discard 
incoming\ntuples as expired. This expiry key\nusually works with another 
parameter called Expiry Period defined\nnext.\n\n\nExpiry Period\n\n\nThe 
expiry period is the value supplied by the user to define the\nextent of 
history which should be considered while expiring\ntupl
 es.\n\n\nUse case Details\n\n\nConsider an incoming stream of system logs. The 
use case requires\nus to identify duplicate log messages and pass on only the 
unique ones.\nAnother relaxation in the use case is that the log messages which 
are\nolder than a day, may not be considered and must be filtered out 
as\nexpired. The expiry must be measured with respect to the time stamp in\nthe 
logs. For example, if the timestamp in the incoming message is\n\n30-12-2014 
00:00:00\n and the\nlatest message that the system has encountered had the time 
stamp\n\n31-12-2014 00:00:00\n, then the\nincoming message must be considered 
as expired. However, if the incoming\nmessage had any timestamp like 
\n30-12-2014\n00:11:00\n, it must be accepted into the system and be checked 
for a possible duplicate.\n\n\nThe expiry facet in the use case above gives us 
an advantage in\nthat we do not have to compare the incoming record with 
all\u00a0the data to check if it is a duplicate.\nAt the same time, all\u00a0
 the\nincoming data need not be stored; just a day worth of data is adequate to 
address the above use case.\n\n\nConfiguring the below parameters will solve 
the problem for this\nuse case:\n\n\n\n\nkeyExpression\n\u00a0- This is the 
dedup key for the incoming tuples (similar to the Basic Dedup use case). This 
can be any key which can uniquely identify a record. For log messages this can 
be a serial number attached in the log.\n\n\ntimeExpression\n\u00a0- This is 
the key which can help identify the expired records, as explained above. In 
this particular use case, it can be a timestamp field which indicates when the 
log message was generated.\n\n\nexpireBefore\n\u00a0- This is the period of 
expiry as explained above. In our example use case this will be 24 hour, 
specified in seconds.\n\n\n\n\nConfiguration of the above parameters is 
sufficient to address this use\ncase.\n\n\nUse cases - Summary\n\n\n\n\nBasic 
Dedup\n - Deduplication of\n    bounded datasets. Data is assumed to be bound
 ed. This use case is\n    not meant for never ending streams of data. For 
example:\n    Deduplication of master data like customer records, product 
catalogs\n    etc.\n\n\nTime Based Dedup\n\u00a0- Deduplication of\n    
unlimited streams of data. This use case handles unbounded streams\n    of data 
and can run forever. An expiry key and criterion is expected\n    as part of 
the input which helps avoid storing all the unique data.\n    This helps speed 
up performance. Any timestamp field in the incoming\n    tuple can be used as a 
time based expiry key.\n\n\nWith respect to system time\n\u00a0- Time 
progresses with system time. Any expiry criteria are executed with the notion 
of system time. This is possible if the incoming tuple does not have a time 
field, or the user does not specify a \ntimeExpression\n.\n\n\nWith respect to 
tuple time\n\u00a0- Time progresses based on the time in the incoming tuples. 
Expiry criteria are executed with the notion of time indicated by the incoming t
 uple. Specification of the time field (\ntimeExpression\n) is mandatory for 
this scenario.\n\n\n\n\n\n\n\n\nTechnical Architecture\n\n\nClass 
Structure\n\n\n\n\n\n\nArchitectural Details\n\n\n\n\nConcepts\n\n\nDedup Key - 
Specified by \nkeyExpression\n\u00a0parameter\n\n\nA dedup key is a set of one 
or more fields in the data tuple which\nacts as the key\u00a0for the 
tuples.\nThis is used by the deduper to compare tuples to arrive at 
the\nconclusion on whether two tuples are duplicates. If Dedup Key of 
two\ntuples match, then they are duplicates, else they are unique.\n\n\nExpiry 
Key - Specified by \ntimeExpression\n\u00a0parameter\n\n\nA tuple may or may 
not have an Expiry Key. Dedup operator cannot\nkeep storing all the data that 
is flowing into the operator. At some\npoint it becomes essential to discard 
some of the historical tuples in\ninterest of memory and efficiency.\n\n\nAt 
the same time, tuples are expected to arrive at the Dedup\noperator within some 
time after they are g
 enerated. After this time, the\ntuples may be considered as stale or 
obsolete.\n\n\nIn such cases, the Deduper considers these tuples 
as\nexpired\u00a0and takes no action other than\nseparating out these tuples on 
a different port in order to be processed\nby some other operator or stored 
offline for analysis.\n\n\nIn order to create a criterion for discarding such 
tuples, we\nintroduce an Expiry Key. Looking at the value of the Expiry Key in 
each\ntuple, we can decide whether or not to discard this tuple 
as\nexpired.\n\n\nThe expiry key that we consider in Time Based Dedup is\ntime. 
This usually works with\nanother parameter called Expiry Period defined 
next.\n\n\nExpiry Period\n\n\nThe Expiry Period is the value supplied by the 
user which decides\nwhen a particular tuple expires.\n\n\nTime Points\n\n\nFor 
every dataset that the deduper processes, a set of time points is 
maintained:\n\n\n\n\nLatest Point\n\u00a0- This is the maximum\n    time point 
observed in all the processed tup
 les.\n\n\nExpiry Point\n\u00a0- This is given by:\n    \nExpiry Point = Latest 
Point - Expiry Period\n\n\n\n\nThese points help the deduper to make decisions 
related to expiry\nof a tuple.\n\n\nExample - Expiry\n\n\n\n\n\n\n\n\nTuple 
Id\n\n\nExpiry Key (Expiry Period = 10)\n\n\nLatest Point\n\n\nExpiry 
Point\n\n\nDecision for 
Tuple\n\n\n\n\n\n\n\n\n\n\n1\n\n\n10\n\n\n10\n\n\n1\n\n\nNot 
Expired\n\n\n\n\n\n\n2\n\n\n20\n\n\n20\n\n\n11\n\n\nNot 
Expired\n\n\n\n\n\n\n3\n\n\n25\n\n\n25\n\n\n16\n\n\nNot 
Expired\n\n\n\n\n\n\n4\n\n\n40\n\n\n40\n\n\n31\n\n\nNot 
Expired\n\n\n\n\n\n\n5\n\n\n21\n\n\n40\n\n\n31\n\n\nExpired\n\n\n\n\n\n\n6\n\n\n35\n\n\n40\n\n\n31\n\n\nNot
 Expired\n\n\n\n\n\n\n7\n\n\n45\n\n\n45\n\n\n36\n\n\nNot 
Expired\n\n\n\n\n\n\n8\n\n\n57\n\n\n57\n\n\n48\n\n\nNot 
Expired\n\n\n\n\n\n\n\n\nTime Buckets (A component of Managed State)\n\n\nOne 
of the requirements of the Deduper is to store all the unique\ntuples 
(actually, just the keys of tuples). Keeping an ever growing\ncache in m
 emory is not scalable. So what we need is a limited cache\nbacked by a 
persistent store. When data is requested to be fetched from managed\nstate, it 
is also cached in an in-memory cache. Buckets help\nnarrow down the search of 
duplicates for incoming tuples. A Bucket is an\nabstraction for a collection of 
tuples all of which share a common hash\nvalue based on some hash function or a 
range of time, for example: a\nbucket of data for 5 contiguous minutes. A 
Bucket\u00a0has a span property called Bucket Span.\n\n\nBucket 
Span\n\n\nBucket span is simply the range of the domain\nthat is covered by the 
Bucket. This span is specified in\nthe domain of the Expiry key. If the 
Expiry\nKey is time, \u00a0then the Bucket span\nwill be specified in seconds. 
It is\nonly defined in case tuples have an Expiry Key.\n\n\nNumber of 
Buckets\n\n\nThe number of buckets can be given by - \nNum Buckets = 
Expiry\nPeriod / Bucket Span\n\n\nThis is because at any point of time, we need 
only store Expiry\nPe
 riod worth of data.\n\n\nExample - 
Buckets\n\n\n\n\nAssumptions\n\n\nAssumption 1 \n\n\nThis assumption is only 
applicable in case of Dedup with\nExpiry.\n\n\nFor any two tuples, t1 and t2 
having dedup keys d1 and d2, and\nexpiry keys e1 and e2, respectively, the 
following holds:\n\n\nIf d1 = d2,\n  then e1 = e2\n\n\n\n\nIn other words, 
there may never\nbe\u00a0two tuples t1 and t2 such that:\n\n\nTuple 1: d1, 
e1\nTuple 2: d2, e2\nd1 = d2 and e1 != e2\n\n\n\n\nIn other words, any two 
tuples with the same dedup key are assumed to have the\nsame expiry key as 
well.\nThis assumption was made with respect to certain use cases. These\nuse 
cases follow this assumption in that the records which are\nduplicates are 
exactly identical. An example use case is when log\nmessages are replayed 
erroneously, and we want to identify the duplicate\nlog messages. In such 
cases, we need not worry about two different log\nmessages having the same 
identifier but different timestamps. Since its\na replay 
 of the same data, the duplicate records are assumed to be\nexactly 
identical.\n\n\nIn case the duplicate tuple has a different value for expiry 
key, the behavior of\nthe deduper can be non-deterministic.\n\n\nFlow of a 
Tuple through Dedup Operator\n\n\nTuples flow through the Dedup operator one by 
one. Deduper may process a tuple immediately, or store it in some 
data\nstructure for later processing.\n\n\nWhen a tuple always arrives at the 
input\nport\u00a0of the Dedup operator, it does\nthe following 
tasks.\n\n\nCheck if tuple is Expired\n\n\nThis is only done in case of Dedup 
with expiry. The\nfollowing condition is used to check if the tuple is 
expired.\n\n\nif ( Latest Point - Expiry Key \n Expiry Point )\n  then 
Expired\n\n\n\n\nIf the tuple is expired, then send it to the expired 
port.\n\n\nCheck if tuple is a Duplicate or Unique\n\n\nOnce a tuple passes the 
check of expiry, we proceed to check if\nthe tuple is a duplicate of some 
earlier tuple. Note that\nif the tuple in quest
 ion is not expired, the duplicate will also not\nhave expired due to the 
assumption listed \nhere\n.\nThe Deduper queries the Managed state to fetch the 
value for the tuple key.\nThis request is processed by the Managed state in a 
separate asynchronous thread.\nOnce this request is submitted, the Deduper 
moves on to process other\ntuples. Additionally the Deduper also inserts the 
tuple being processed\ninto a waiting events\u00a0queue for later 
processing.\n\n\nProcess pending tuples\n\n\nOnce the Deduper has looked at the 
all the tuples in the current window,\nit starts to process the tuples in the 
waiting queue to finalize the decision\n(unique or duplicate) for these 
tuples.\nOnce the request to Managed state is completed for a tuple and the 
value is\nfetched from persistent storage, the Deduper can decide if the tuple 
in\nquestion is a duplicate or a unique.\nDepending on whether there is enough 
time left in the current window,\nit can do one of the 
following:\n\n\n\n\nProcess o
 nly the tuples for which the managed state has completed processing.\nThe 
tuples which are still being processed by managed state are skipped only to 
come back to them when it can no longer postpone it. This is typically done 
when the operator\nhas idle time as there are no tuples on the input ports and 
the current window\nhas still not ended.\n\n\nBlock on them to complete their 
processing. This will happen when the current\nwindow has no time left, and the 
decision cannot be postponed. Note: An operator can end its window, only when 
all the tuples have been completely processed.  \n\n\n\n\nPorts, Attributes and 
Properties\n\n\nPorts\n\n\nThe deduper has a single input port and multiple 
output\nports.\n\n\n\n\ninput\n - This is the input port through\n    which the 
tuples arrive at the Deduper.\n\n\nunique\n\u00a0- This is the output port on\n 
   which unique tuples are sent out by the Deduper.\n\n\nduplicate\n\u00a0- 
This is the output port on\n    which duplicate tuples are sent 
 out by the Deduper.\n\n\nexpired\n\u00a0- This is the output port on\n    
which expired tuples are sent out by the Deduper.\n\n\n\n\nThe user can choose 
which output ports to connect the down stream operators.\nAll the output ports 
are optional and can be used as required by the use 
case.\n\n\nAttributes\n\n\n\n\nInput port Attribute - 
input.TUPLE_CLASS\n\u00a0- Class or the fully\nqualified class 
name.\n\n\nMandatory attribute\n\n\nTells the operator about the type of the 
incoming\ntuple.\n\n\n\n\n\n\n\n\nProperties\n\n\n\n\n\n\nkeyExpression\n\u00a0-
 String\n\n\n\n\nMandatory parameter.\n\n\nThe java expression to extract the 
key fields in the incoming tuple 
(POJO)\n\n\n\n\n\n\n\n\ntimeExpression\n\u00a0- String - (Time Based Deduper 
only)\n\n\n\n\nThe java expression to extract the time field in the incoming 
tuple (POJO).\n\n\n\n\n\n\n\n\nexpireBefore\n\u00a0- Long (Seconds) - (Time 
Based Deduper only)\n\n\n\n\nThis is the total time period during which a tuple 
stays in the syste
 m and blocks any other tuple with the same 
key.\n\n\n\n\n\n\n\n\nbucketSpan\n\u00a0- Long (Seconds) - (Time Based Deduper 
only)\n\n\n\n\nMandatory parameter\n\n\nThis is the unit which describes how 
large a bucket can be. Typically this should be defined depending on the use 
case. For example, if we have expireBefore set to 1 hour, then typically we 
would be clubbing data in the order of minutes, so a \nbucketSpan\n of a few 
minutes would make sense. Note that in this case, the entire data worth the 
\nbucketSpan\n will expire as a whole. Setting it to 1 minute would make the 
number of time buckets in the system to be 1 hour / 1 minute = 60 buckets.  
Similarly setting bucketSpan to 5 minutes would make number of buckets to be 
12.\n\n\nNote that having too many or too few buckets could have a performance 
impact. If unsure, set the bucketSpan to the square root of \nexpireBefore\n. 
This way the number of buckets and bucket span are 
balanced.\n\n\n\n\n\n\n\n\nreferenceInstant\n\u00a0- \
 u00a0Long (Seconds) - (Time Based Deduper only)\n\n\n\n\nThe reference point 
from which to start the time which is use for expiry. Setting the 
referenceInstant to say, r seconds from the epoch, would initialize the start 
of expiry to be from that \ninstant = r\n. The start and end of the expiry 
window periodically move by the span of a single 
bucket.\n\n\n\n\n\n\n\n\nnumBuckets\n\u00a0- \u00a0Integer - (Bounded Deduper 
only)\n\n\n\n\nOptional parameter, but recommended to be provided by the 
user.\n\n\nThis is the number of buckets that need to be used for storing the 
keys of the incoming tuples.\n\n\nUsers can decide upon the proper value for 
this parameter by guessing the number of distinct keys in the application. A 
reasonable value is the square root of N, where N is the number of distinct 
keys. If omitted, the Java MAX_VALUE for integer is used for 
N.\n\n\n\n\n\n\n\n\nExample\n\n\nPlease refer to 
\nhttps://github.com/DataTorrent/examples/tree/master/tutorials/dedup\n\u00a0for\na
 n example on how to use Deduper.\n\n\nPartitioning\n\n\nDeduper can be 
statically partitioned using the operator\nattribute: PARTITIONER\n\n\nAdd the 
following property to the properties.xml 
file:\n\n\nproperty\n\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\nname\ndt.operator.{OperatorName}.attr.PARTITIONER\n/name\n\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\nvalue\ncom.datatorrent.common.partitioner.StatelessPartitioner:2\n/value\n\n\n/property\n\n\n\n\n\nThis
 will partition the Dedup operator into 2 static partitions. Change the 
number\nto the required number of partitions.\n\n\nDynamic partitioning is 
currently not supported in the Deduper.", 
+            "title": "Deduper"
+        }, 
+        {
+            "location": "/operators/deduper/#deduper-operator-documentation", 
+            "text": "", 
+            "title": "Deduper - Operator Documentation"
+        }, 
+        {
+            "location": "/operators/deduper/#introduction", 
+            "text": "", 
+            "title": "Introduction"
+        }, 
+        {
+            "location": "/operators/deduper/#about-this-document", 
+            "text": "This document is intended as a guide for understanding 
and using\nthe Dedup operator.", 
+            "title": "About this document"
+        }, 
+        {
+            "location": "/operators/deduper/#terminology", 
+            "text": "We will refer to this operator as the Deduper or Dedup 
operator\ninterchangeably.", 
+            "title": "Terminology"
+        }, 
+        {
+            "location": "/operators/deduper/#overview", 
+            "text": "", 
+            "title": "Overview"
+        }, 
+        {
+            "location": "/operators/deduper/#dedup-what-in-a-nutshell", 
+            "text": "Dedup is actually short for Deduplication. Duplicates are 
omnipresent and\ncan be found in almost any kind of data. Most of the times it 
is\nessential to discard, or at the very least separate out the data 
into\nunique\u00a0and duplicate\u00a0components. The entire purpose of 
this\noperator is to de-duplicate data. In other words, when data 
passes\nthrough this operator, it will be segregated into two different 
data\nsets, one of which contains all unique tuples, and the other which 
are\noccurring more than once in the original data set.", 
+            "title": "Dedup - \u201cWhat\u201d in a Nutshell"
+        }, 
+        {
+            "location": "/operators/deduper/#dedup-how-in-a-nutshell", 
+            "text": "In order to quickly decide whether an incoming tuple is 
duplicate\nor unique, it has to store each incoming tuple (or a signature, like 
key,\nfor example) to be used for comparison later. A plain in-memory 
storage\nmay work for small datasets, but will not scale for large ones. 
Deduper employs a large scale distributed persistent hashing mechanism (known 
as the Managed State) which allows\nit to identify if a particular tuple is 
duplicate or unique. Managed state is a layer on HDFS which allows all the 
stored data to be persisted in a distributed fashion.\nEach time it identifies 
a tuple as a unique tuple, it also\nstores it into the Managed state for 
future\nlookup.", 
+            "title": "Dedup - \u201cHow\u201d in a Nutshell"
+        }, 
+        {
+            "location": "/operators/deduper/#use-cases-basic-dedup", 
+            "text": "", 
+            "title": "Use cases - Basic Dedup"
+        }, 
+        {
+            "location": "/operators/deduper/#dedup-key", 
+            "text": "A dedup key is a set of one or more fields in the data 
tuple which\nacts as the key\u00a0for the tuples.\nThis is used by the deduper 
to compare tuples to arrive at the\nconclusion on whether two tuples are 
duplicates.  Consider an example schema and two sample tuples  {Name, Phone, 
Email, Date, State, Zip, Country}  Tuple 1:  {\n  Austin U. Saunders,\n  
+91-319-340-59385,\n  [email protected],\n  2015-11-09 
13:38:38,\n  Texas,\n  73301,\n  United States\n}  Tuple 2:  {\n  Austin U. 
Saunders,\n  +91-319-340-59385,\n  [email protected],\n  2015-11-09 
13:39:38,\n  Texas,\n  73301,\n  United States\n}  Let us assume that the Dedup 
Key\nis  {Name, Phone} . In\nthis case, the two tuples are duplicates because 
the key fields are same\nin both the tuples. However, if the Dedup Key is 
{Phone,Email},\nthe two are unique as the email values differ.", 
+            "title": "Dedup Key"
+        }, 
+        {
+            "location": "/operators/deduper/#use-case-details", 
+            "text": "Consider the case of de-duplicating a master data 
set\nwhich is stored in a file.\u00a0Further also consider the\nfollowing 
schema for tuples in the data set.  {Name, Phone, Email, Date, City, Zip, 
Country}  Also consider that we need to identify unique customers from 
the\nmaster data set. So, ultimately the output needed for the use case is\ntwo 
data sets - Unique Records\u00a0and Duplicate Records.  As part of configuring 
the operator for this use case, we need to\nset the following parameters:   
keyExpression \u00a0- This can be set as\n    the primary key which can be used 
to uniquely identify a Customer.\n    For example, we can set it to  Name,Email 
  The above configuration is sufficient to address this use case.", 
+            "title": "Use case Details"
+        }, 
+        {
+            "location": "/operators/deduper/#use-case-dedup-with-expiry", 
+            "text": "", 
+            "title": "Use case - Dedup with Expiry"
+        }, 
+        {
+            "location": "/operators/deduper/#motivation", 
+            "text": "The Basic Dedup use case is the most straightforward and 
is\nusually applied when the amount of data to be processed is not 
huge.\nHowever, if the incoming data is huge, or even never-ending, it 
is\nusually not necessary to keep storing all the data. This is because 
in\nmost real world use cases, the duplicates occur only a short 
distance\napart. Hence, after a while, it is usually okay to forget part 
of\nthe history and consider only limited history for identifying\nduplicates, 
in the interest of efficiency. In other words, we expire\n(ignore) some tuples 
which are (or were supposed to be) delivered long\nback. Doing so, reduces the 
load on the storage mechanism (managed state) which effectively deletes part of 
the history, thus making the whole process more\nefficient. We call this use 
case, Dedup with expiry.", 
+            "title": "Motivation"
+        }, 
+        {
+            "location": "/operators/deduper/#expiry-key", 
+            "text": "The easiest way to understand this use case is to 
consider\ntime\u00a0as the criterion for expiring\ntuples. Time\u00a0is a 
natural expiry\nkey and is in line with the concept of expiry. Formally, an 
expiry field\nis a field in the input tuple which can be used to discard 
incoming\ntuples as expired. This expiry key\nusually works with another 
parameter called Expiry Period defined\nnext.", 
+            "title": "Expiry Key"
+        }, 
+        {
+            "location": "/operators/deduper/#expiry-period", 
+            "text": "The expiry period is the value supplied by the user to 
define the\nextent of history which should be considered while 
expiring\ntuples.", 
+            "title": "Expiry Period"
+        }, 
+        {
+            "location": "/operators/deduper/#use-case-details_1", 
+            "text": "Consider an incoming stream of system logs. The use case 
requires\nus to identify duplicate log messages and pass on only the unique 
ones.\nAnother relaxation in the use case is that the log messages which 
are\nolder than a day, may not be considered and must be filtered out 
as\nexpired. The expiry must be measured with respect to the time stamp in\nthe 
logs. For example, if the timestamp in the incoming message is 30-12-2014 
00:00:00  and the\nlatest message that the system has encountered had the time 
stamp 31-12-2014 00:00:00 , then the\nincoming message must be considered as 
expired. However, if the incoming\nmessage had any timestamp like  
30-12-2014\n00:11:00 , it must be accepted into the system and be checked for a 
possible duplicate.  The expiry facet in the use case above gives us an 
advantage in\nthat we do not have to compare the incoming record with 
all\u00a0the data to check if it is a duplicate.\nAt the same time, 
all\u00a0the\nincoming data need 
 not be stored; just a day worth of data is adequate to address the above use 
case.  Configuring the below parameters will solve the problem for this\nuse 
case:   keyExpression \u00a0- This is the dedup key for the incoming tuples 
(similar to the Basic Dedup use case). This can be any key which can uniquely 
identify a record. For log messages this can be a serial number attached in the 
log.  timeExpression \u00a0- This is the key which can help identify the 
expired records, as explained above. In this particular use case, it can be a 
timestamp field which indicates when the log message was generated.  
expireBefore \u00a0- This is the period of expiry as explained above. In our 
example use case this will be 24 hours, specified in seconds.   Configuration of 
the above parameters is sufficient to address this use\ncase.", 
+            "title": "Use case Details"
+        }, 
+        {
+            "location": "/operators/deduper/#use-cases-summary", 
+            "text": "Basic Dedup  - Deduplication of\n    bounded datasets. 
Data is assumed to be bounded. This use case is\n    not meant for never ending 
streams of data. For example:\n    Deduplication of master data like customer 
records, product catalogs\n    etc.  Time Based Dedup \u00a0- Deduplication 
of\n    unlimited streams of data. This use case handles unbounded streams\n    
of data and can run forever. An expiry key and criterion is expected\n    as 
part of the input which helps avoid storing all the unique data.\n    This 
helps speed up performance. Any timestamp field in the incoming\n    tuple can 
be used as a time based expiry key.  With respect to system time \u00a0- Time 
progresses with system time. Any expiry criteria are executed with the notion 
of system time. This is possible if the incoming tuple does not have a time 
field, or the user does not specify a  timeExpression .  With respect to tuple 
time \u00a0- Time progresses based on the time in the incoming tu
 ples. Expiry criteria are executed with the notion of time indicated by the 
incoming tuple. Specification of the time field ( timeExpression ) is mandatory 
for this scenario.", 
+            "title": "Use cases - Summary"
+        }, 
+        {
+            "location": "/operators/deduper/#technical-architecture", 
+            "text": "", 
+            "title": "Technical Architecture"
+        }, 
+        {
+            "location": "/operators/deduper/#class-structure", 
+            "text": "", 
+            "title": "Class Structure"
+        }, 
+        {
+            "location": "/operators/deduper/#architectural-details", 
+            "text": "", 
+            "title": "Architectural Details"
+        }, 
+        {
+            "location": "/operators/deduper/#concepts", 
+            "text": "", 
+            "title": "Concepts"
+        }, 
+        {
+            "location": 
"/operators/deduper/#dedup-key-specified-by-keyexpression-parameter", 
+            "text": "A dedup key is a set of one or more fields in the data 
tuple which\nacts as the key\u00a0for the tuples.\nThis is used by the deduper 
to compare tuples to arrive at the\nconclusion on whether two tuples are 
duplicates. If Dedup Key of two\ntuples match, then they are duplicates, else 
they are unique.", 
+            "title": "Dedup Key - Specified by keyExpression\u00a0parameter"
+        }, 
+        {
+            "location": 
"/operators/deduper/#expiry-key-specified-by-timeexpression-parameter", 
+            "text": "A tuple may or may not have an Expiry Key. Dedup operator 
cannot\nkeep storing all the data that is flowing into the operator. At 
some\npoint it becomes essential to discard some of the historical tuples 
in\ninterest of memory and efficiency.  At the same time, tuples are expected 
to arrive at the Dedup\noperator within some time after they are generated. 
After this time, the\ntuples may be considered as stale or obsolete.  In such 
cases, the Deduper considers these tuples as\nexpired\u00a0and takes no action 
other than\nseparating out these tuples on a different port in order to be 
processed\nby some other operator or stored offline for analysis.  In order to 
create a criterion for discarding such tuples, we\nintroduce an Expiry Key. 
Looking at the value of the Expiry Key in each\ntuple, we can decide whether or 
not to discard this tuple as\nexpired.  The expiry key that we consider in Time 
Based Dedup is\ntime. This usually works with\nanother parameter called
  Expiry Period defined next.", 
+            "title": "Expiry Key - Specified by timeExpression\u00a0parameter"
+        }, 
+        {
+            "location": "/operators/deduper/#expiry-period_1", 
+            "text": "The Expiry Period is the value supplied by the user which 
decides\nwhen a particular tuple expires.", 
+            "title": "Expiry Period"
+        }, 
+        {
+            "location": "/operators/deduper/#time-points", 
+            "text": "For every dataset that the deduper processes, a set of 
time points is maintained:   Latest Point \u00a0- This is the maximum\n    time 
point observed in all the processed tuples.  Expiry Point \u00a0- This is given 
by:\n     Expiry Point = Latest Point - Expiry Period   These points help the 
deduper to make decisions related to expiry\nof a tuple.", 
+            "title": "Time Points"
+        }, 
+        {
+            "location": "/operators/deduper/#example-expiry", 
+            "text": "Tuple Id  Expiry Key (Expiry Period = 10)  Latest Point  
Expiry Point  Decision for Tuple      1  10  10  1  Not Expired    2  20  20  
11  Not Expired    3  25  25  16  Not Expired    4  40  40  31  Not Expired    
5  21  40  31  Expired    6  35  40  31  Not Expired    7  45  45  36  Not 
Expired    8  57  57  48  Not Expired", 
+            "title": "Example - Expiry"
+        }, 
+        {
+            "location": 
"/operators/deduper/#time-buckets-a-component-of-managed-state", 
+            "text": "One of the requirements of the Deduper is to store all 
the unique\ntuples (actually, just the keys of tuples). Keeping an ever 
growing\ncache in memory is not scalable. So what we need is a limited 
cache\nbacked by a persistent store. When data is requested to be fetched from 
managed\nstate, it is also cached in an in-memory cache. Buckets help\nnarrow 
down the search of duplicates for incoming tuples. A Bucket is an\nabstraction 
for a collection of tuples all of which share a common hash\nvalue based on 
some hash function or a range of time, for example: a\nbucket of data for 5 
contiguous minutes. A Bucket\u00a0has a span property called Bucket Span.", 
+            "title": "Time Buckets (A component of Managed State)"
+        }, 
+        {
+            "location": "/operators/deduper/#bucket-span", 
+            "text": "Bucket span is simply the range of the domain\nthat is 
covered by the Bucket. This span is specified in\nthe domain of the Expiry key. 
If the Expiry\nKey is time, \u00a0then the Bucket span\nwill be specified in 
seconds. It is\nonly defined in case tuples have an Expiry Key.", 
+            "title": "Bucket Span"
+        }, 
+        {
+            "location": "/operators/deduper/#number-of-buckets", 
+            "text": "The number of buckets can be given by -  Num Buckets = 
Expiry\nPeriod / Bucket Span  This is because at any point of time, we need 
only store Expiry\nPeriod worth of data.", 
+            "title": "Number of Buckets"
+        }, 
+        {
+            "location": "/operators/deduper/#example-buckets", 
+            "text": "", 
+            "title": "Example - Buckets"
+        }, 
+        {
+            "location": "/operators/deduper/#assumptions", 
+            "text": "", 
+            "title": "Assumptions"
+        }, 
+        {
+            "location": "/operators/deduper/#assumption-1", 
+            "text": "This assumption is only applicable in case of Dedup 
with\nExpiry.  For any two tuples, t1 and t2 having dedup keys d1 and d2, 
and\nexpiry keys e1 and e2, respectively, the following holds:  If d1 = d2,\n  
then e1 = e2  In other words, there may never\nbe\u00a0two tuples t1 and t2 
such that:  Tuple 1: d1, e1\nTuple 2: d2, e2\nd1 = d2 and e1 != e2  In other 
words, any two tuples with the same dedup key are assumed to have the\nsame 
expiry key as well.\nThis assumption was made with respect to certain use 
cases. These\nuse cases follow this assumption in that the records which 
are\nduplicates are exactly identical. An example use case is when 
log\nmessages are replayed erroneously, and we want to identify the 
duplicate\nlog messages. In such cases, we need not worry about two different 
log\nmessages having the same identifier but different timestamps. Since its\na 
replay of the same data, the duplicate records are assumed to be\nexactly 
identical.  In case the dupl
 icate tuple has a different value for expiry key, the behavior of\nthe deduper 
can be non-deterministic.", 
+            "title": "Assumption 1 "
+        }, 
+        {
+            "location": 
"/operators/deduper/#flow-of-a-tuple-through-dedup-operator", 
+            "text": "Tuples flow through the Dedup operator one by one. 
Deduper may process a tuple immediately, or store it in some data\nstructure 
for later processing.  Whenever a tuple arrives at the input\nport\u00a0of 
the Dedup operator, it does\nthe following tasks.", 
+            "title": "Flow of a Tuple through Dedup Operator"
+        }, 
+        {
+            "location": "/operators/deduper/#check-if-tuple-is-expired", 
+            "text": "This is only done in case of Dedup with expiry. 
The\nfollowing condition is used to check if the tuple is expired.  if ( Latest 
Point - Expiry Key   Expiry Point )\n  then Expired  If the tuple is expired, 
then send it to the expired port.", 
+            "title": "Check if tuple is Expired"
+        }, 
+        {
+            "location": 
"/operators/deduper/#check-if-tuple-is-a-duplicate-or-unique", 
+            "text": "Once a tuple passes the check of expiry, we proceed to 
check if\nthe tuple is a duplicate of some earlier tuple. Note that\nif the 
tuple in question is not expired, the duplicate will also not\nhave expired due 
to the assumption listed  here .\nThe Deduper queries the Managed state to 
fetch the value for the tuple key.\nThis request is processed by the Managed 
state in a separate asynchronous thread.\nOnce this request is submitted, the 
Deduper moves on to process other\ntuples. Additionally the Deduper also 
inserts the tuple being processed\ninto a waiting events\u00a0queue for later 
processing.", 
+            "title": "Check if tuple is a Duplicate or Unique"
+        }, 
+        {
+            "location": "/operators/deduper/#process-pending-tuples", 
+            "text": "Once the Deduper has looked at all the tuples in the 
current window,\nit starts to process the tuples in the waiting queue to 
finalize the decision\n(unique or duplicate) for these tuples.\nOnce the 
request to Managed state is completed for a tuple and the value is\nfetched 
from persistent storage, the Deduper can decide if the tuple in\nquestion is a 
duplicate or a unique.\nDepending on whether there is enough time left in the 
current window,\nit can do one of the following:   Process only the tuples for 
which the managed state has completed processing.\nThe tuples which are still 
being processed by managed state are skipped only to come back to them when it 
can no longer postpone it. This is typically done when the operator\nhas idle 
time as there are no tuples on the input ports and the current window\nhas 
still not ended.  Block on them to complete their processing. This will happen 
when the current\nwindow has no time left, and the decision cannot be po
 stponed. Note: An operator can end its window, only when all the tuples have 
been completely processed.", 
+            "title": "Process pending tuples"
+        }, 
+        {
+            "location": "/operators/deduper/#ports-attributes-and-properties", 
+            "text": "", 
+            "title": "Ports, Attributes and Properties"
+        }, 
+        {
+            "location": "/operators/deduper/#ports", 
+            "text": "The deduper has a single input port and multiple 
output\nports.   input  - This is the input port through\n    which the tuples 
arrive at the Deduper.  unique \u00a0- This is the output port on\n    which 
unique tuples are sent out by the Deduper.  duplicate \u00a0- This is the 
output port on\n    which duplicate tuples are sent out by the Deduper.  
expired \u00a0- This is the output port on\n    which expired tuples are sent 
out by the Deduper.   The user can choose which output ports to connect the 
down stream operators.\nAll the output ports are optional and can be used as 
required by the use case.", 
+            "title": "Ports"
+        }, 
+        {
+            "location": "/operators/deduper/#attributes", 
+            "text": "Input port Attribute - input.TUPLE_CLASS \u00a0- Class or 
the fully\nqualified class name.  Mandatory attribute  Tells the operator about 
the type of the incoming\ntuple.", 
+            "title": "Attributes"
+        }, 
+        {
+            "location": "/operators/deduper/#properties", 
+            "text": "keyExpression \u00a0- String   Mandatory parameter.  The 
java expression to extract the key fields in the incoming tuple (POJO)     
timeExpression \u00a0- String - (Time Based Deduper only)   The java expression 
to extract the time field in the incoming tuple (POJO).     expireBefore 
\u00a0- Long (Seconds) - (Time Based Deduper only)   This is the total time 
period during which a tuple stays in the system and blocks any other tuple with 
the same key.     bucketSpan \u00a0- Long (Seconds) - (Time Based Deduper only) 
  Mandatory parameter  This is the unit which describes how large a bucket can 
be. Typically this should be defined depending on the use case. For example, if 
we have expireBefore set to 1 hour, then typically we would be clubbing data in 
the order of minutes, so a  bucketSpan  of a few minutes would make sense. Note 
that in this case, the entire data worth the  bucketSpan  will expire as a 
whole. Setting it to 1 minute would make the number of time b
 uckets in the system to be 1 hour / 1 minute = 60 buckets.  Similarly setting 
bucketSpan to 5 minutes would make number of buckets to be 12.  Note that 
having too many or too few buckets could have a performance impact. If unsure, 
set the bucketSpan to the square root of  expireBefore . This way the number of 
buckets and bucket span are balanced.     referenceInstant \u00a0- \u00a0Long 
(Seconds) - (Time Based Deduper only)   The reference point from which to start 
the time which is use for expiry. Setting the referenceInstant to say, r 
seconds from the epoch, would initialize the start of expiry to be from that  
instant = r . The start and end of the expiry window periodically move by the 
span of a single bucket.     numBuckets \u00a0- \u00a0Integer - (Bounded 
Deduper only)   Optional parameter, but recommended to be provided by the user. 
 This is the number of buckets that need to be used for storing the keys of the 
incoming tuples.  Users can decide upon the proper value for this 
 parameter by guessing the number of distinct keys in the application. A 
reasonable value is the square root of N, where N is the number of distinct 
keys. If omitted, the Java MAX_VALUE for integer is used for N.", 
+            "title": "Properties"
+        }, 
+        {
+            "location": "/operators/deduper/#example", 
+            "text": "Please refer to  
https://github.com/DataTorrent/examples/tree/master/tutorials/dedup 
\u00a0for\nan example on how to use Deduper.", 
+            "title": "Example"
+        }, 
+        {
+            "location": "/operators/deduper/#partitioning", 
+            "text": "Deduper can be statically partitioned using the 
operator\nattribute: PARTITIONER  Add the following property to the 
properties.xml file:  property 
\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 name 
dt.operator.{OperatorName}.attr.PARTITIONER /name 
\n\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0\u00a0 value 
com.datatorrent.common.partitioner.StatelessPartitioner:2 /value  /property   
This will partition the Dedup operator into 2 static partitions. Change the 
number\nto the required number of partitions.  Dynamic partitioning is 
currently not supported in the Deduper.", 
+            "title": "Partitioning"
         }
     ]
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/apex-site/blob/69d479dc/content/docs/malhar-3.5/operators/block_reader/index.html
----------------------------------------------------------------------
diff --git a/content/docs/malhar-3.5/operators/block_reader/index.html 
b/content/docs/malhar-3.5/operators/block_reader/index.html
index 9459528..42ab2b3 100644
--- a/content/docs/malhar-3.5/operators/block_reader/index.html
+++ b/content/docs/malhar-3.5/operators/block_reader/index.html
@@ -141,6 +141,13 @@
     </li>
 
         
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../deduper/">Deduper</a>
+        
+    </li>
+
+        
     </ul>
 <li>
           

http://git-wip-us.apache.org/repos/asf/apex-site/blob/69d479dc/content/docs/malhar-3.5/operators/deduper/index.html
----------------------------------------------------------------------
diff --git a/content/docs/malhar-3.5/operators/deduper/index.html 
b/content/docs/malhar-3.5/operators/deduper/index.html
new file mode 100644
index 0000000..045a398
--- /dev/null
+++ b/content/docs/malhar-3.5/operators/deduper/index.html
@@ -0,0 +1,724 @@
+<!DOCTYPE html>
+<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
+<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
+<head>
+  <meta charset="utf-8">
+  <meta http-equiv="X-UA-Compatible" content="IE=edge">
+  <meta name="viewport" content="width=device-width, initial-scale=1.0">
+  
+  
+  
+  <title>Deduper - Apache Apex Malhar Documentation</title>
+  
+
+  <link rel="shortcut icon" href="../../favicon.ico">
+  
+
+  
+  <link 
href='https://fonts.googleapis.com/css?family=Lato:400,700|Roboto+Slab:400,700|Inconsolata:400,700'
 rel='stylesheet' type='text/css'>
+
+  <link rel="stylesheet" href="../../css/theme.css" type="text/css" />
+  <link rel="stylesheet" href="../../css/theme_extra.css" type="text/css" />
+  <link rel="stylesheet" href="../../css/highlight.css">
+
+  
+  <script>
+    // Current page data
+    var mkdocs_page_name = "Deduper";
+    var mkdocs_page_input_path = "operators/deduper.md";
+    var mkdocs_page_url = "/operators/deduper/";
+  </script>
+  
+  <script src="../../js/jquery-2.1.1.min.js"></script>
+  <script src="../../js/modernizr-2.8.3.min.js"></script>
+  <script type="text/javascript" src="../../js/highlight.pack.js"></script>
+  <script src="../../js/theme.js"></script> 
+
+  
+</head>
+
+<body class="wy-body-for-nav" role="document">
+
+  <div class="wy-grid-for-nav">
+
+    
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side stickynav">
+      <div class="wy-side-nav-search">
+        <a href="../.." class="icon icon-home"> Apache Apex Malhar 
Documentation</a>
+        <div role="search">
+  <form id ="rtd-search-form" class="wy-form" action="../../search.html" 
method="get">
+    <input type="text" name="q" placeholder="Search docs" />
+  </form>
+</div>
+      </div>
+
+      <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" 
aria-label="main navigation">
+        <ul class="current">
+          
+            <li>
+    <li class="toctree-l1 ">
+        <a class="" href="../..">Apache Apex Malhar</a>
+        
+    </li>
+<li>
+          
+            <li>
+    <ul class="subnav">
+    <li><span>Operators</span></li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../kafkaInputOperator/">Kafka Input</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../file_splitter/">File Splitter</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../block_reader/">Block Reader</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../file_output/">File Output</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../enricher/">Enricher</a>
+        
+    </li>
+
+        
+            
+    <li class="toctree-l1 current">
+        <a class="current" href="./">Deduper</a>
+        
+            <ul>
+            
+                <li class="toctree-l3"><a 
href="#deduper-operator-documentation">Deduper - Operator Documentation</a></li>
+                
+            
+                <li class="toctree-l3"><a 
href="#introduction">Introduction</a></li>
+                
+                    <li><a class="toctree-l4" 
href="#about-this-document">About this document</a></li>
+                
+                    <li><a class="toctree-l4" 
href="#terminology">Terminology</a></li>
+                
+            
+                <li class="toctree-l3"><a href="#overview">Overview</a></li>
+                
+                    <li><a class="toctree-l4" 
href="#dedup-what-in-a-nutshell">Dedup - “What” in a Nutshell</a></li>
+                
+                    <li><a class="toctree-l4" 
href="#dedup-how-in-a-nutshell">Dedup - “How” in a Nutshell</a></li>
+                
+            
+                <li class="toctree-l3"><a href="#use-cases-basic-dedup">Use 
cases - Basic Dedup</a></li>
+                
+                    <li><a class="toctree-l4" href="#dedup-key">Dedup 
Key</a></li>
+                
+                    <li><a class="toctree-l4" href="#use-case-details">Use 
case Details</a></li>
+                
+            
+                <li class="toctree-l3"><a 
href="#use-case-dedup-with-expiry">Use case - Dedup with Expiry</a></li>
+                
+                    <li><a class="toctree-l4" 
href="#motivation">Motivation</a></li>
+                
+                    <li><a class="toctree-l4" href="#expiry-key">Expiry 
Key</a></li>
+                
+                    <li><a class="toctree-l4" href="#expiry-period">Expiry 
Period</a></li>
+                
+                    <li><a class="toctree-l4" href="#use-case-details_1">Use 
case Details</a></li>
+                
+            
+                <li class="toctree-l3"><a href="#use-cases-summary">Use cases 
- Summary</a></li>
+                
+            
+                <li class="toctree-l3"><a 
href="#technical-architecture">Technical Architecture</a></li>
+                
+                    <li><a class="toctree-l4" href="#class-structure">Class 
Structure</a></li>
+                
+                    <li><a class="toctree-l4" 
href="#architectural-details">Architectural Details</a></li>
+                
+                    <li><a class="toctree-l4" 
href="#concepts">Concepts</a></li>
+                
+                    <li><a class="toctree-l4" 
href="#assumptions">Assumptions</a></li>
+                
+                    <li><a class="toctree-l4" 
href="#flow-of-a-tuple-through-dedup-operator">Flow of a Tuple through Dedup 
Operator</a></li>
+                
+            
+                <li class="toctree-l3"><a 
href="#ports-attributes-and-properties">Ports, Attributes and 
Properties</a></li>
+                
+                    <li><a class="toctree-l4" href="#ports">Ports</a></li>
+                
+                    <li><a class="toctree-l4" 
href="#attributes">Attributes</a></li>
+                
+                    <li><a class="toctree-l4" 
href="#properties">Properties</a></li>
+                
+            
+                <li class="toctree-l3"><a href="#example">Example</a></li>
+                
+            
+                <li class="toctree-l3"><a 
href="#partitioning">Partitioning</a></li>
+                
+            
+            </ul>
+        
+    </li>
+
+        
+    </ul>
+<li>
+          
+        </ul>
+      </div>
+      &nbsp;
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+
+      
+      <nav class="wy-nav-top" role="navigation" aria-label="top navigation">
+        <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+        <a href="../..">Apache Apex Malhar Documentation</a>
+      </nav>
+
+      
+      <div class="wy-nav-content">
+        <div class="rst-content">
+          <div role="navigation" aria-label="breadcrumbs navigation">
+  <ul class="wy-breadcrumbs">
+    <li><a href="../..">Docs</a> &raquo;</li>
+    
+      
+        
+          <li>Operators &raquo;</li>
+        
+      
+    
+    <li>Deduper</li>
+    <li class="wy-breadcrumbs-aside">
+      
+    </li>
+  </ul>
+  <hr/>
+</div>
+          <div role="main">
+            <div class="section">
+              
+                <h1 id="deduper-operator-documentation">Deduper - Operator 
Documentation</h1>
+<h1 id="introduction">Introduction</h1>
+<h2 id="about-this-document">About this document</h2>
+<p>This document is intended as a guide for understanding and using
+the Dedup operator.</p>
+<h2 id="terminology">Terminology</h2>
+<p>We will refer to this operator as the Deduper or Dedup operator
+interchangeably.</p>
+<h1 id="overview">Overview</h1>
+<h2 id="dedup-what-in-a-nutshell">Dedup - “What” in a Nutshell</h2>
+<p>Dedup is actually short for Deduplication. Duplicates are omnipresent and
+can be found in almost any kind of data. Most of the times it is
+essential to discard, or at the very least separate out the data into
+unique and duplicate components. The entire purpose of this
+operator is to de-duplicate data. In other words, when data passes
+through this operator, it will be segregated into two different data
+sets, one of which contains all unique tuples, and the other which are
+occurring more than once in the original data set.</p>
+<p><img alt="" src="../images/deduper/image00.png" /></p>
+<h2 id="dedup-how-in-a-nutshell">Dedup - “How” in a Nutshell</h2>
+<p>In order to quickly decide whether an incoming tuple is duplicate
+or unique, it has to store each incoming tuple (or a signature, like key,
+for example) to be used for comparison later. A plain in-memory storage
+may work for small datasets, but will not scale for large ones. Deduper 
employs a large scale distributed persistent hashing mechanism (known as the 
Managed State) which allows
+it to identify if a particular tuple is duplicate or unique. Managed state is 
a layer on HDFS which allows all the stored data to be persisted in a 
distributed fashion.
+Each time it identifies a tuple as a unique tuple, it also
+stores it into the Managed state for future
+lookup.</p>
+<h2 id="_1"><img alt="" src="../images/deduper/image04.png" /></h2>
+<p>Following are the different components of the Deduper</p>
+<ol>
+<li><strong>Dedup Operator</strong> - This is responsible for the overall
+    functionality of the operator. This in turn makes use of other
+    components to establish the end goal of deciding whether a tuple is
+    a duplicate of some earlier tuple, or is a unique tuple.</li>
+<li><strong>Managed State</strong> - Since all of the data cannot be stored in
+    memory, this component allows us to persist existing unique keys on
+    HDFS in form of buckets. This is also responsible for fetching data as
+    requested by the Deduper. Since it communicates with HDFS, data 
access is slow and so it allows for asynchronous (non-blocking) calls to fetch 
data. This ensures that the Deduper is not blocked and can continue to process 
other tuples. It also supports an in-memory cache where it stores the fetched 
data so that repeated access to the same data is faster. Periodically, based on 
configuration, this also
+    discards data which is no longer needed.</li>
+</ol>
+<p>This was a very basic introduction to the functioning of the
+Deduper. Following sections will go into more detail on each of the
+components.</p>
+<h1 id="use-cases-basic-dedup">Use cases - Basic Dedup</h1>
+<h2 id="dedup-key">Dedup Key</h2>
+<p>A dedup key is a set of one or more fields in the data tuple which
+acts as the key for the tuples.
+This is used by the deduper to compare tuples to arrive at the
+conclusion on whether two tuples are duplicates.</p>
+<p>Consider an example schema and two sample tuples</p>
+<p><code>{Name, Phone, Email, Date, State, Zip, Country}</code></p>
+<p>Tuple 1:</p>
+<pre><code>{
+  Austin U. Saunders,
+  +91-319-340-59385,
+  [email protected],
+  2015-11-09 13:38:38,
+  Texas,
+  73301,
+  United States
+}
+</code></pre>
+
+<p>Tuple 2:</p>
+<pre><code>{
+  Austin U. Saunders,
+  +91-319-340-59385,
+  [email protected],
+  2015-11-09 13:39:38,
+  Texas,
+  73301,
+  United States
+}
+</code></pre>
+
+<p>Let us assume that the Dedup Key
+is <code>{Name, Phone}</code>. In
+this case, the two tuples are duplicates because the key fields are same
+in both the tuples. However, if the Dedup Key is {Phone,Email},
+the two are unique as the email values differ.</p>
+<h2 id="use-case-details">Use case Details</h2>
+<p>Consider the case of de-duplicating a master data set
+which is stored in a file. Further also consider the
+following schema for tuples in the data set.</p>
+<p><code>{Name, Phone, Email, Date, City, Zip, Country}</code></p>
+<p>Also consider that we need to identify unique customers from the
+master data set. So, ultimately the output needed for the use case is
+two data sets - Unique Records and Duplicate Records.</p>
+<p>As part of configuring the operator for this use case, we need to
+set the following parameters:</p>
+<ul>
+<li><strong><em>keyExpression</em></strong> - This can be set as
+    the primary key which can be used to uniquely identify a Customer.
+    For example, we can set it to <code>Name,Email</code></li>
+</ul>
+<p>The above configuration is sufficient to address this use case.</p>
+<h1 id="use-case-dedup-with-expiry">Use case - Dedup with Expiry</h1>
+<h2 id="motivation">Motivation</h2>
+<p>The Basic Dedup use case is the most straightforward and is
+usually applied when the amount of data to be processed is not huge.
+However, if the incoming data is huge, or even never-ending, it is
+usually not necessary to keep storing all the data. This is because in
+most real world use cases, the duplicates occur only a short distance
+apart. Hence, after a while, it is usually okay to forget part of
+the history and consider only limited history for identifying
+duplicates, in the interest of efficiency. In other words, we expire
+(ignore) some tuples which are (or were supposed to be) delivered long
+ago. Doing so reduces the load on the storage mechanism (managed state) 
which effectively deletes part of the history, thus making the whole process 
more
+efficient. We call this use case, Dedup with expiry.</p>
+<h2 id="expiry-key">Expiry Key</h2>
+<p>The easiest way to understand this use case is to consider
+time as the criterion for expiring
+tuples. Time is a natural expiry
+key and is in line with the concept of expiry. Formally, an expiry field
+is a field in the input tuple which can be used to discard incoming
+tuples as expired. This expiry key
+usually works with another parameter called Expiry Period defined
+next.</p>
+<h2 id="expiry-period">Expiry Period</h2>
+<p>The expiry period is the value supplied by the user to define the
+extent of history which should be considered while expiring
+tuples.</p>
+<h2 id="use-case-details_1">Use case Details</h2>
+<p>Consider an incoming stream of system logs. The use case requires
+us to identify duplicate log messages and pass on only the unique ones.
+Another relaxation in the use case is that the log messages which are
+older than a day, may not be considered and must be filtered out as
+expired. The expiry must be measured with respect to the time stamp in
+the logs. For example, if the timestamp in the incoming message is
+<code>30-12-2014 00:00:00</code> and the
+latest message that the system has encountered had the time stamp
+<code>31-12-2014 00:00:00</code>, then the
+incoming message must be considered as expired. However, if the incoming
+message had any timestamp like <code>30-12-2014
+00:11:00</code>, it must be accepted into the system and be checked for a 
possible duplicate.</p>
+<p>The expiry facet in the use case above gives us an advantage in
+that we do not have to compare the incoming record with all the data to check 
if it is a duplicate.
+At the same time, all the
+incoming data need not be stored; just a day worth of data is adequate to 
address the above use case.</p>
+<p>Configuring the below parameters will solve the problem for this
+use case:</p>
+<ul>
+<li><strong><em>keyExpression</em></strong> - This is the dedup key for the 
incoming tuples (similar to the Basic Dedup use case). This can be any key 
which can uniquely identify a record. For log messages this can be a serial 
number attached in the log.</li>
+<li><strong><em>timeExpression</em></strong> - This is the key which can help 
identify the expired records, as explained above. In this particular use case, 
it can be a timestamp field which indicates when the log message was 
generated.</li>
+<li><strong><em>expireBefore</em></strong> - This is the period of expiry as 
explained above. In our example use case this will be 24 hours, specified in 
seconds.</li>
+</ul>
+<p>Configuration of the above parameters is sufficient to address this use
+case.</p>
+<h1 id="use-cases-summary">Use cases - Summary</h1>
+<ol>
+<li><strong>Basic Dedup</strong> - Deduplication of
+    bounded datasets. Data is assumed to be bounded. This use case is
+    not meant for never ending streams of data. For example:
+    Deduplication of master data like customer records, product catalogs
+    etc.</li>
+<li><strong>Time Based Dedup</strong> - Deduplication of
+    unlimited streams of data. This use case handles unbounded streams
+    of data and can run forever. An expiry key and criterion are expected
+    as part of the input which helps avoid storing all the unique data.
+    This helps speed up performance. Any timestamp field in the incoming
+    tuple can be used as a time based expiry key.<ul>
+<li><em>With respect to system time</em> - Time progresses with system time. 
Any expiry criteria are executed with the notion of system time. This is 
possible if the incoming tuple does not have a time field, or the user does not 
specify a <code>timeExpression</code>.</li>
+<li><em>With respect to tuple time</em> - Time progresses based on the time 
in the incoming tuples. Expiry criteria are executed with the notion of time 
indicated by the incoming tuple. Specification of the time field 
(<code>timeExpression</code>) is mandatory for this scenario.</li>
+</ul>
+</li>
+</ol>
+<h1 id="technical-architecture">Technical Architecture</h1>
+<h2 id="class-structure">Class Structure</h2>
+<p><img alt="" src="../images/deduper/image03.png" /></p>
+<hr />
+<h2 id="architectural-details">Architectural Details</h2>
+<p><img alt="" src="../images/deduper/image02.png" /></p>
+<h2 id="concepts">Concepts</h2>
+<h3 id="dedup-key-specified-by-keyexpression-parameter">Dedup Key - Specified 
by <em>keyExpression</em> parameter</h3>
+<p>A dedup key is a set of one or more fields in the data tuple which
+acts as the key for the tuples.
+This is used by the deduper to compare tuples to arrive at the
+conclusion on whether two tuples are duplicates. If Dedup Key of two
+tuples match, then they are duplicates, else they are unique.</p>
+<h3 id="expiry-key-specified-by-timeexpression-parameter">Expiry Key - 
Specified by <em>timeExpression</em> parameter</h3>
+<p>A tuple may or may not have an Expiry Key. Dedup operator cannot
+keep storing all the data that is flowing into the operator. At some
+point it becomes essential to discard some of the historical tuples in
+interest of memory and efficiency.</p>
+<p>At the same time, tuples are expected to arrive at the Dedup
+operator within some time after they are generated. After this time, the
+tuples may be considered as stale or obsolete.</p>
+<p>In such cases, the Deduper considers these tuples as
+expired and takes no action other than
+separating out these tuples on a different port in order to be processed
+by some other operator or stored offline for analysis.</p>
+<p>In order to create a criterion for discarding such tuples, we
+introduce an Expiry Key. Looking at the value of the Expiry Key in each
+tuple, we can decide whether or not to discard this tuple as
+expired.</p>
+<p>The expiry key that we consider in Time Based Dedup is
+time. This usually works with
+another parameter called Expiry Period defined next.</p>
+<h3 id="expiry-period_1">Expiry Period</h3>
+<p>The Expiry Period is the value supplied by the user which decides
+when a particular tuple expires.</p>
+<h3 id="time-points">Time Points</h3>
+<p>For every dataset that the deduper processes, a set of time points is 
maintained:</p>
+<ol>
+<li><em>Latest Point</em> - This is the maximum
+    time point observed in all the processed tuples.</li>
+<li><em>Expiry Point</em> - This is given by:
+    <code>Expiry Point = Latest Point - Expiry Period</code></li>
+</ol>
+<p>These points help the deduper to make decisions related to expiry
+of a tuple.</p>
+<h3 id="example-expiry">Example - Expiry</h3>
+<table>
+<thead>
+<tr>
+<th align="center">Tuple Id</th>
+<th align="center">Expiry Key (Expiry Period = 10)</th>
+<th align="center">Latest Point</th>
+<th align="center">Expiry Point</th>
+<th align="center">Decision for Tuple</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td align="center">1</td>
+<td align="center">10</td>
+<td align="center">10</td>
+<td align="center">1</td>
+<td align="center">Not Expired</td>
+</tr>
+<tr>
+<td align="center">2</td>
+<td align="center">20</td>
+<td align="center">20</td>
+<td align="center">11</td>
+<td align="center">Not Expired</td>
+</tr>
+<tr>
+<td align="center">3</td>
+<td align="center">25</td>
+<td align="center">25</td>
+<td align="center">16</td>
+<td align="center">Not Expired</td>
+</tr>
+<tr>
+<td align="center">4</td>
+<td align="center">40</td>
+<td align="center">40</td>
+<td align="center">31</td>
+<td align="center">Not Expired</td>
+</tr>
+<tr>
+<td align="center">5</td>
+<td align="center">21</td>
+<td align="center">40</td>
+<td align="center">31</td>
+<td align="center">Expired</td>
+</tr>
+<tr>
+<td align="center">6</td>
+<td align="center">35</td>
+<td align="center">40</td>
+<td align="center">31</td>
+<td align="center">Not Expired</td>
+</tr>
+<tr>
+<td align="center">7</td>
+<td align="center">45</td>
+<td align="center">45</td>
+<td align="center">36</td>
+<td align="center">Not Expired</td>
+</tr>
+<tr>
+<td align="center">8</td>
+<td align="center">57</td>
+<td align="center">57</td>
+<td align="center">48</td>
+<td align="center">Not Expired</td>
+</tr>
+</tbody>
+</table>
+<h3 id="time-buckets-a-component-of-managed-state">Time Buckets (A component 
of Managed State)</h3>
+<p>One of the requirements of the Deduper is to store all the unique
+tuples (actually, just the keys of tuples). Keeping an ever growing
+cache in memory is not scalable. So what we need is a limited cache
+backed by a persistent store. When data is requested to be fetched from managed
+state, it is also cached in an in-memory cache. Buckets help
+narrow down the search of duplicates for incoming tuples. A Bucket is an
+abstraction for a collection of tuples all of which share a common hash
+value based on some hash function or a range of time, for example: a
+bucket of data for 5 contiguous minutes. A Bucket has a span property called 
Bucket Span.</p>
+<h3 id="bucket-span">Bucket Span</h3>
+<p>Bucket span is simply the range of the domain
+that is covered by the Bucket. This span is specified in
+the domain of the Expiry key. If the Expiry
+Key is time,  then the Bucket span
+will be specified in seconds. It is
+only defined in case tuples have an Expiry Key.</p>
+<h3 id="number-of-buckets">Number of Buckets</h3>
+<p>The number of buckets can be given by - <code>Num Buckets = Expiry
+Period / Bucket Span</code></p>
+<p>This is because at any point of time, we need only store Expiry
+Period worth of data.</p>
+<h3 id="example-buckets">Example - Buckets</h3>
+<p><img alt="" src="../images/deduper/image01.png" /></p>
+<h2 id="assumptions">Assumptions</h2>
+<h3 id="assumption-1">Assumption 1 <a name="Assumption1"></a></h3>
+<p>This assumption is only applicable in case of Dedup with
+Expiry.</p>
+<p>For any two tuples, t1 and t2 having dedup keys d1 and d2, and
+expiry keys e1 and e2, respectively, the following holds:</p>
+<pre><code>If d1 = d2,
+  then e1 = e2
+</code></pre>
+
+<p>In other words, there may never
+be two tuples t1 and t2 such that:</p>
+<pre><code>Tuple 1: d1, e1
+Tuple 2: d2, e2
+d1 = d2 and e1 != e2
+</code></pre>
+
+<p>In other words, any two tuples with the same dedup key are assumed to have 
the
+same expiry key as well.
+This assumption was made with respect to certain use cases. These
+use cases follow this assumption in that the records which are
+duplicates are exactly identical. An example use case is when log
+messages are replayed erroneously, and we want to identify the duplicate
+log messages. In such cases, we need not worry about two different log
+messages having the same identifier but different timestamps. Since it's
+a replay of the same data, the duplicate records are assumed to be
+exactly identical.</p>
+<p>In case the duplicate tuple has a different value for expiry key, the 
behavior of
+the deduper can be non-deterministic.</p>
+<h2 id="flow-of-a-tuple-through-dedup-operator">Flow of a Tuple through Dedup 
Operator</h2>
+<p>Tuples flow through the Dedup operator one by one. Deduper may process a 
tuple immediately, or store it in some data
+structure for later processing.</p>
+<p>Whenever a tuple arrives at the input
+port of the Dedup operator, it does
+the following tasks.</p>
+<h4 id="check-if-tuple-is-expired">Check if tuple is Expired</h4>
+<p>This is only done in case of Dedup with expiry. The
+following condition is used to check if the tuple is expired.</p>
+<pre><code>if ( Latest Point - Expiry Key &lt; Expiry Point )
+  then Expired
+</code></pre>
+
+<p>If the tuple is expired, then send it to the expired port.</p>
+<h4 id="check-if-tuple-is-a-duplicate-or-unique">Check if tuple is a Duplicate 
or Unique</h4>
+<p>Once a tuple passes the check of expiry, we proceed to check if
+the tuple is a duplicate of some earlier tuple. Note that
+if the tuple in question is not expired, the duplicate will also not
+have expired due to the assumption listed <a href="#Assumption1">here</a>.
+The Deduper queries the Managed state to fetch the value for the tuple key.
+This request is processed by the Managed state in a separate asynchronous 
thread.
+Once this request is submitted, the Deduper moves on to process other
+tuples. Additionally the Deduper also inserts the tuple being processed
+into a waiting events queue for later processing.</p>
+<h4 id="process-pending-tuples">Process pending tuples</h4>
+<p>Once the Deduper has looked at all the tuples in the current window,
+it starts to process the tuples in the waiting queue to finalize the decision
+(unique or duplicate) for these tuples.
+Once the request to Managed state is completed for a tuple and the value is
+fetched from persistent storage, the Deduper can decide if the tuple in
+question is a duplicate or a unique.
+Depending on whether there is enough time left in the current window,
+it can do one of the following:</p>
+<ul>
+<li>Process only the tuples for which the managed state has completed 
processing.
+The tuples which are still being processed by managed state are skipped only 
to come back to them when it can no longer postpone it. This is typically done 
when the operator
+has idle time as there are no tuples on the input ports and the current window
+has still not ended.</li>
+<li>Block on them to complete their processing. This will happen when the 
current
+window has no time left, and the decision cannot be postponed. Note: An 
operator can end its window, only when all the tuples have been completely 
processed.  </li>
+</ul>
+<h1 id="ports-attributes-and-properties">Ports, Attributes and Properties</h1>
+<h2 id="ports">Ports</h2>
+<p>The deduper has a single input port and multiple output
+ports.</p>
+<ul>
+<li><strong><em>input</em></strong> - This is the input port through
+    which the tuples arrive at the Deduper.</li>
+<li><strong><em>unique</em></strong> - This is the output port on
+    which unique tuples are sent out by the Deduper.</li>
+<li><strong><em>duplicate</em></strong> - This is the output port on
+    which duplicate tuples are sent out by the Deduper.</li>
+<li><strong><em>expired</em></strong> - This is the output port on
+    which expired tuples are sent out by the Deduper.</li>
+</ul>
+<p>The user can choose which output ports to connect to the downstream operators.
+All the output ports are optional and can be used as required by the use 
case.</p>
+<h2 id="attributes">Attributes</h2>
+<ul>
+<li><strong><em>Input port Attribute - input.TUPLE_CLASS</em></strong> - 
Class or the fully
+qualified class name.<ul>
+<li>Mandatory attribute</li>
+<li>Tells the operator about the type of the incoming
+tuple.</li>
+</ul>
+</li>
+</ul>
+<h2 id="properties">Properties</h2>
+<ul>
+<li>
+<p><strong><em>keyExpression</em></strong> - String</p>
+<ul>
+<li>Mandatory parameter.</li>
+<li>The java expression to extract the key fields in the incoming tuple 
(POJO)</li>
+</ul>
+</li>
+<li>
+<p><strong><em>timeExpression</em></strong> - String - (Time Based Deduper 
only)</p>
+<ul>
+<li>The java expression to extract the time field in the incoming tuple 
(POJO).</li>
+</ul>
+</li>
+<li>
+<p><strong><em>expireBefore</em></strong> - Long (Seconds) - (Time Based 
Deduper only)</p>
+<ul>
+<li>This is the total time period during which a tuple stays in the system and 
blocks any other tuple with the same key.</li>
+</ul>
+</li>
+<li>
+<p><strong><em>bucketSpan</em></strong> - Long (Seconds) - (Time Based 
Deduper only)</p>
+<ul>
+<li>Mandatory parameter</li>
+<li>This is the unit which describes how large a bucket can be. Typically this 
should be defined depending on the use case. For example, if we have 
expireBefore set to 1 hour, then typically we would be clubbing data in the 
order of minutes, so a <code>bucketSpan</code> of a few minutes would make 
sense. Note that in this case, the entire data worth the 
<code>bucketSpan</code> will expire as a whole. Setting it to 1 minute would 
make the number of time buckets in the system to be 1 hour / 1 minute = 60 
buckets.  Similarly setting bucketSpan to 5 minutes would make number of 
buckets to be 12.</li>
+<li>Note that having too many or too few buckets could have a performance 
impact. If unsure, set the bucketSpan to the square root of 
<code>expireBefore</code>. This way the number of buckets and bucket span are 
balanced.</li>
+</ul>
+</li>
+<li>
+<p><strong><em>referenceInstant</em></strong> -  Long (Seconds) - (Time 
Based Deduper only)</p>
+<ul>
+<li>The reference point from which to start the time which is used for expiry. 
Setting the referenceInstant to say, r seconds from the epoch, would initialize 
the start of expiry to be from that <code>instant = r</code>. The start and end 
of the expiry window periodically move by the span of a single bucket.</li>
+</ul>
+</li>
+<li>
+<p><strong><em>numBuckets</em></strong> -  Integer - (Bounded Deduper 
only)</p>
+<ul>
+<li>Optional parameter, but recommended to be provided by the user.</li>
+<li>This is the number of buckets that need to be used for storing the keys of 
the incoming tuples.</li>
+<li>Users can decide upon the proper value for this parameter by guessing the 
number of distinct keys in the application. A reasonable value is the square 
root of N, where N is the number of distinct keys. If omitted, the Java 
MAX_VALUE for integer is used for N.</li>
+</ul>
+</li>
+</ul>
+<h1 id="example">Example</h1>
+<p>Please refer to <a 
href="https://github.com/DataTorrent/examples/tree/master/tutorials/dedup";>https://github.com/DataTorrent/examples/tree/master/tutorials/dedup</a>
 for
+an example on how to use Deduper.</p>
+<h1 id="partitioning">Partitioning</h1>
+<p>Deduper can be statically partitioned using the operator
+attribute: PARTITIONER</p>
+<p>Add the following property to the properties.xml file:</p>
+<pre><code>&lt;property&gt;
+        
&lt;name&gt;dt.operator.{OperatorName}.attr.PARTITIONER&lt;/name&gt;
+        
&lt;value&gt;com.datatorrent.common.partitioner.StatelessPartitioner:2&lt;/value&gt;
+&lt;/property&gt;
+</code></pre>
+
+<p>This will partition the Dedup operator into 2 static partitions. Change the 
number
+to the required number of partitions.</p>
+<p>Dynamic partitioning is currently not supported in the Deduper.</p>
+              
+            </div>
+          </div>
+          <footer>
+  
+    <div class="rst-footer-buttons" role="navigation" aria-label="footer 
navigation">
+      
+      
+        <a href="../enricher/" class="btn btn-neutral" title="Enricher"><span 
class="icon icon-circle-arrow-left"></span> Previous</a>
+      
+    </div>
+  
+
+  <hr/>
+
+  <div role="contentinfo">
+    <!-- Copyright etc -->
+    
+  </div>
+
+  Built with <a href="http://www.mkdocs.org";>MkDocs</a> using a <a 
href="https://github.com/snide/sphinx_rtd_theme";>theme</a> provided by <a 
href="https://readthedocs.org";>Read the Docs</a>.
+</footer>
+         
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+
+<div class="rst-versions" role="note" style="cursor: pointer">
+    <span class="rst-current-version" data-toggle="rst-current-version">
+      
+      
+        <span><a href="../enricher/" style="color: #fcfcfc;">&laquo; 
Previous</a></span>
+      
+      
+    </span>
+</div>
+
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/apex-site/blob/69d479dc/content/docs/malhar-3.5/operators/enricher/index.html
----------------------------------------------------------------------
diff --git a/content/docs/malhar-3.5/operators/enricher/index.html 
b/content/docs/malhar-3.5/operators/enricher/index.html
index 4223ee5..82fb8d3 100644
--- a/content/docs/malhar-3.5/operators/enricher/index.html
+++ b/content/docs/malhar-3.5/operators/enricher/index.html
@@ -134,6 +134,13 @@
     </li>
 
         
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../deduper/">Deduper</a>
+        
+    </li>
+
+        
     </ul>
 <li>
           
@@ -466,6 +473,8 @@ CooldownMillis of 10000 will be used as the threshold time 
for which the latency
   
     <div class="rst-footer-buttons" role="navigation" aria-label="footer 
navigation">
       
+        <a href="../deduper/" class="btn btn-neutral float-right" 
title="Deduper">Next <span class="icon icon-circle-arrow-right"></span></a>
+      
       
         <a href="../file_output/" class="btn btn-neutral" title="File 
Output"><span class="icon icon-circle-arrow-left"></span> Previous</a>
       
@@ -496,6 +505,8 @@ CooldownMillis of 10000 will be used as the threshold time 
for which the latency
         <span><a href="../file_output/" style="color: #fcfcfc;">&laquo; 
Previous</a></span>
       
       
+        <span style="margin-left: 15px"><a href="../deduper/" style="color: 
#fcfcfc">Next &raquo;</a></span>
+      
     </span>
 </div>
 

http://git-wip-us.apache.org/repos/asf/apex-site/blob/69d479dc/content/docs/malhar-3.5/operators/file_output/index.html
----------------------------------------------------------------------
diff --git a/content/docs/malhar-3.5/operators/file_output/index.html 
b/content/docs/malhar-3.5/operators/file_output/index.html
index df08695..6fa56bc 100644
--- a/content/docs/malhar-3.5/operators/file_output/index.html
+++ b/content/docs/malhar-3.5/operators/file_output/index.html
@@ -115,6 +115,13 @@
     </li>
 
         
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../deduper/">Deduper</a>
+        
+    </li>
+
+        
     </ul>
 <li>
           

http://git-wip-us.apache.org/repos/asf/apex-site/blob/69d479dc/content/docs/malhar-3.5/operators/file_splitter/index.html
----------------------------------------------------------------------
diff --git a/content/docs/malhar-3.5/operators/file_splitter/index.html 
b/content/docs/malhar-3.5/operators/file_splitter/index.html
index 9dcf91e..fe1f3df 100644
--- a/content/docs/malhar-3.5/operators/file_splitter/index.html
+++ b/content/docs/malhar-3.5/operators/file_splitter/index.html
@@ -121,6 +121,13 @@
     </li>
 
         
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../deduper/">Deduper</a>
+        
+    </li>
+
+        
     </ul>
 <li>
           

http://git-wip-us.apache.org/repos/asf/apex-site/blob/69d479dc/content/docs/malhar-3.5/operators/images/deduper/image00.png
----------------------------------------------------------------------
diff --git a/content/docs/malhar-3.5/operators/images/deduper/image00.png 
b/content/docs/malhar-3.5/operators/images/deduper/image00.png
new file mode 100644
index 0000000..ec3e292
Binary files /dev/null and 
b/content/docs/malhar-3.5/operators/images/deduper/image00.png differ

http://git-wip-us.apache.org/repos/asf/apex-site/blob/69d479dc/content/docs/malhar-3.5/operators/images/deduper/image01.png
----------------------------------------------------------------------
diff --git a/content/docs/malhar-3.5/operators/images/deduper/image01.png 
b/content/docs/malhar-3.5/operators/images/deduper/image01.png
new file mode 100644
index 0000000..b9e35a9
Binary files /dev/null and 
b/content/docs/malhar-3.5/operators/images/deduper/image01.png differ

http://git-wip-us.apache.org/repos/asf/apex-site/blob/69d479dc/content/docs/malhar-3.5/operators/images/deduper/image02.png
----------------------------------------------------------------------
diff --git a/content/docs/malhar-3.5/operators/images/deduper/image02.png 
b/content/docs/malhar-3.5/operators/images/deduper/image02.png
new file mode 100644
index 0000000..689bdfe
Binary files /dev/null and 
b/content/docs/malhar-3.5/operators/images/deduper/image02.png differ

http://git-wip-us.apache.org/repos/asf/apex-site/blob/69d479dc/content/docs/malhar-3.5/operators/images/deduper/image03.png
----------------------------------------------------------------------
diff --git a/content/docs/malhar-3.5/operators/images/deduper/image03.png 
b/content/docs/malhar-3.5/operators/images/deduper/image03.png
new file mode 100644
index 0000000..087a0b1
Binary files /dev/null and 
b/content/docs/malhar-3.5/operators/images/deduper/image03.png differ

http://git-wip-us.apache.org/repos/asf/apex-site/blob/69d479dc/content/docs/malhar-3.5/operators/images/deduper/image04.png
----------------------------------------------------------------------
diff --git a/content/docs/malhar-3.5/operators/images/deduper/image04.png 
b/content/docs/malhar-3.5/operators/images/deduper/image04.png
new file mode 100644
index 0000000..4d3bd32
Binary files /dev/null and 
b/content/docs/malhar-3.5/operators/images/deduper/image04.png differ

http://git-wip-us.apache.org/repos/asf/apex-site/blob/69d479dc/content/docs/malhar-3.5/operators/kafkaInputOperator/index.html
----------------------------------------------------------------------
diff --git a/content/docs/malhar-3.5/operators/kafkaInputOperator/index.html 
b/content/docs/malhar-3.5/operators/kafkaInputOperator/index.html
index d35d4c8..c8d6952 100644
--- a/content/docs/malhar-3.5/operators/kafkaInputOperator/index.html
+++ b/content/docs/malhar-3.5/operators/kafkaInputOperator/index.html
@@ -129,6 +129,13 @@
     </li>
 
         
+            
+    <li class="toctree-l1 ">
+        <a class="" href="../deduper/">Deduper</a>
+        
+    </li>
+
+        
     </ul>
 <li>
           

http://git-wip-us.apache.org/repos/asf/apex-site/blob/69d479dc/content/docs/malhar-3.5/search.html
----------------------------------------------------------------------
diff --git a/content/docs/malhar-3.5/search.html 
b/content/docs/malhar-3.5/search.html
index 07c2683..71a75d5 100644
--- a/content/docs/malhar-3.5/search.html
+++ b/content/docs/malhar-3.5/search.html
@@ -98,6 +98,13 @@
     </li>
 
         
+            
+    <li class="toctree-l1 ">
+        <a class="" href="operators/deduper/">Deduper</a>
+        
+    </li>
+
+        
     </ul>
 <li>
           

Reply via email to