Repository: nifi
Updated Branches:
  refs/heads/master 118f4e8cc -> e85081ed2


NIFI-3979 Documenting how ListHDFS maintains state and performs listings

Signed-off-by: Pierre Villard <[email protected]>

This closes #1860.


Project: http://git-wip-us.apache.org/repos/asf/nifi/repo
Commit: http://git-wip-us.apache.org/repos/asf/nifi/commit/e85081ed
Tree: http://git-wip-us.apache.org/repos/asf/nifi/tree/e85081ed
Diff: http://git-wip-us.apache.org/repos/asf/nifi/diff/e85081ed

Branch: refs/heads/master
Commit: e85081ed201bc7413e3d2d062c814e23c0ff5467
Parents: 118f4e8
Author: Bryan Bende <[email protected]>
Authored: Thu May 25 14:43:21 2017 -0400
Committer: Pierre Villard <[email protected]>
Committed: Fri May 26 10:40:22 2017 +0200

----------------------------------------------------------------------
 .../apache/nifi/processors/hadoop/ListHDFS.java  | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/nifi/blob/e85081ed/nifi-nar-bundles/nifi-hadoop-bundle/nifi-hdfs-processors/src/main/java/org/apache/nifi/processors/hadoop/ListHDFS.java
----------------------------------------------------------------------
diff --git a/nifi-nar-bundles/nifi-hadoop-bundle/nifi-hdfs-processors/src/main/java/org/apache/nifi/processors/hadoop/ListHDFS.java b/nifi-nar-bundles/nifi-hadoop-bundle/nifi-hdfs-processors/src/main/java/org/apache/nifi/processors/hadoop/ListHDFS.java
index 06e616c..a705ee8 100644
--- a/nifi-nar-bundles/nifi-hadoop-bundle/nifi-hdfs-processors/src/main/java/org/apache/nifi/processors/hadoop/ListHDFS.java
+++ b/nifi-nar-bundles/nifi-hadoop-bundle/nifi-hdfs-processors/src/main/java/org/apache/nifi/processors/hadoop/ListHDFS.java
@@ -63,10 +63,12 @@ import java.util.regex.Pattern;
 @TriggerWhenEmpty
 @InputRequirement(Requirement.INPUT_FORBIDDEN)
 @Tags({"hadoop", "HDFS", "get", "list", "ingest", "source", "filesystem"})
-@CapabilityDescription("Retrieves a listing of files from HDFS. For each file that is listed in HDFS, creates a FlowFile that represents "
-        + "the HDFS file so that it can be fetched in conjunction with FetchHDFS. This Processor is designed to run on Primary Node only "
-        + "in a cluster. If the primary node changes, the new Primary Node will pick up where the previous node left off without duplicating "
-        + "all of the data. Unlike GetHDFS, this Processor does not delete any data from HDFS.")
+@CapabilityDescription("Retrieves a listing of files from HDFS. Each time a listing is performed, the files with the latest timestamp will be excluded "
+        + "and picked up during the next execution of the processor. This is done to ensure that we do not miss any files, or produce duplicates, in the "
+        + "cases where files with the same timestamp are written immediately before and after a single execution of the processor. For each file that is "
+        + "listed in HDFS, this processor creates a FlowFile that represents the HDFS file to be fetched in conjunction with FetchHDFS. This Processor is "
+        +  "designed to run on Primary Node only in a cluster. If the primary node changes, the new Primary Node will pick up where the previous node left "
+        +  "off without duplicating all of the data. Unlike GetHDFS, this Processor does not delete any data from HDFS.")
 @WritesAttributes({
     @WritesAttribute(attribute="filename", description="The name of the file that was read from HDFS."),
     @WritesAttribute(attribute="path", description="The path is set to the absolute path of the file's directory on HDFS. For example, if the Directory property is set to /tmp, "
@@ -80,10 +82,11 @@ import java.util.regex.Pattern;
     @WritesAttribute(attribute="hdfs.permissions", description="The permissions for the file in HDFS. This is formatted as 3 characters for the owner, "
             + "3 for the group, and 3 for other users. For example rw-rw-r--")
 })
-@Stateful(scopes = Scope.CLUSTER, description = "After performing a listing of HDFS files, the timestamp of the newest file is stored, "
-    + "along with the filenames of all files that share that same timestamp. This allows the Processor to list only files that have been added or modified after "
-    + "this date the next time that the Processor is run. State is stored across the cluster so that this Processor can be run on Primary Node only and if a new Primary "
-    + "Node is selected, the new node can pick up where the previous node left off, without duplicating the data.")
+@Stateful(scopes = Scope.CLUSTER, description = "After performing a listing of HDFS files, the latest timestamp of all the files listed and the latest "
+        + "timestamp of all the files transferred are both stored. This allows the Processor to list only files that have been added or modified after "
+        + "this date the next time that the Processor is run, without having to store all of the actual filenames/paths which could lead to performance "
+        + "problems. State is stored across the cluster so that this Processor can be run on Primary Node only and if a new Primary "
+        + "Node is selected, the new node can pick up where the previous node left off, without duplicating the data.")
 @SeeAlso({GetHDFS.class, FetchHDFS.class, PutHDFS.class})
 public class ListHDFS extends AbstractHadoopProcessor {
 

Reply via email to