[FLINK-1081] Add HDFS file-stream source for streaming

Conflicts:
        flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/environment/StreamExecutionEnvironment.java


Project: http://git-wip-us.apache.org/repos/asf/flink/repo
Commit: http://git-wip-us.apache.org/repos/asf/flink/commit/bf5e39a8
Tree: http://git-wip-us.apache.org/repos/asf/flink/tree/bf5e39a8
Diff: http://git-wip-us.apache.org/repos/asf/flink/diff/bf5e39a8

Branch: refs/heads/master
Commit: bf5e39a8dea1e52de8f8eb92ded3d128ce66d572
Parents: 7aa9a50
Author: Chiwan Park <[email protected]>
Authored: Fri Dec 12 02:49:01 2014 +0900
Committer: Gyula Fora <[email protected]>
Committed: Sun Jan 25 20:21:22 2015 +0100

----------------------------------------------------------------------
 .../environment/StreamExecutionEnvironment.java |  47 ++++----
 .../function/source/FileMonitoringFunction.java | 117 +++++++++++++++++++
 .../api/function/source/FileReadFunction.java   |  51 ++++++++
 .../api/function/source/FileStreamFunction.java |  49 --------
 4 files changed, 191 insertions(+), 73 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/flink/blob/bf5e39a8/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/environment/StreamExecutionEnvironment.java
----------------------------------------------------------------------
diff --git a/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/environment/StreamExecutionEnvironment.java b/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/environment/StreamExecutionEnvironment.java
index 8a9be93..b7eeed2 100644
--- a/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/environment/StreamExecutionEnvironment.java
+++ b/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/environment/StreamExecutionEnvironment.java
@@ -29,6 +29,7 @@ import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
 import org.apache.flink.api.common.typeinfo.TypeInformation;
 import org.apache.flink.api.java.ExecutionEnvironment;
 import org.apache.flink.api.java.io.TextInputFormat;
+import org.apache.flink.api.java.tuple.Tuple3;
 import org.apache.flink.api.java.typeutils.TypeExtractor;
 import org.apache.flink.client.program.Client;
 import org.apache.flink.client.program.ContextEnvironment;
@@ -36,8 +37,10 @@ import org.apache.flink.core.fs.Path;
 import org.apache.flink.streaming.api.StreamGraph;
 import org.apache.flink.streaming.api.datastream.DataStream;
 import org.apache.flink.streaming.api.datastream.DataStreamSource;
+import org.apache.flink.streaming.api.function.source.FileMonitoringFunction;
+import org.apache.flink.streaming.api.function.source.FileMonitoringFunction.WatchType;
+import org.apache.flink.streaming.api.function.source.FileReadFunction;
 import org.apache.flink.streaming.api.function.source.FileSourceFunction;
-import org.apache.flink.streaming.api.function.source.FileStreamFunction;
 import org.apache.flink.streaming.api.function.source.FromElementsFunction;
 import org.apache.flink.streaming.api.function.source.GenSequenceFunction;
 import org.apache.flink.streaming.api.function.source.GenericSourceFunction;
@@ -214,34 +217,30 @@ public abstract class StreamExecutionEnvironment {
        }
 
        /**
-        * Creates a DataStream that represents the Strings produced by reading the
-        * given file line wise multiple times(infinite). The file will be read with
-        * the system's default character set. This functionality can be used for
-        * testing a topology.
+        * Creates a DataStream that contains the contents of file created while
+        * system watches the given path. The file will be read with the system's
+        * default character set.
         * 
         * @param filePath
         *            The path of the file, as a URI (e.g.,
-        *            "file:///some/local/file" or "hdfs://host:port/file/path").
-        * @return The DataStream representing the text file.
+        *            "file:///some/local/file" or "hdfs://host:port/file/path/").
+        * @param interval
+        *            The interval of file watching.
+        * @param watchType
+        *            The watch type of file stream. When watchType is
+        *            {@link WatchType.ONLY_NEW_FILES}, the system processes only
+        *            new files. {@link WatchType.REPROCESS_WITH_APPENDED} means
+        *            that the system re-processes all contents of appended file.
+        *            {@link WatchType.PROCESS_ONLY_APPENDED} means that the system
+        *            processes only appended contents of files.
+        * 
+        * @return The DataStream containing the given directory.
         */
-       public DataStreamSource<String> readTextStream(String filePath) {
-               checkIfFileExists(filePath);
-               return addSource(new FileStreamFunction(filePath), null, "textStream");
-       }
+       public DataStream<String> readFileStream(String filePath, long interval, WatchType watchType) {
+               DataStream<Tuple3<String, Long, Long>> source = addSource(new FileMonitoringFunction(
+                               filePath, interval, watchType));
 
-       private static void checkIfFileExists(String filePath) {
-               File file = new File(filePath);
-               if (!file.exists()) {
-                       throw new IllegalArgumentException("File not found: " + filePath);
-               }
-
-               if (!file.canRead()) {
-                       throw new IllegalArgumentException("Cannot read file: " + filePath);
-               }
-
-               if (file.isDirectory()) {
-                       throw new IllegalArgumentException("Given path is a directory: " + filePath);
-               }
+               return source.flatMap(new FileReadFunction());
        }
 
        /**

http://git-wip-us.apache.org/repos/asf/flink/blob/bf5e39a8/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/function/source/FileMonitoringFunction.java
----------------------------------------------------------------------
diff --git a/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/function/source/FileMonitoringFunction.java b/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/function/source/FileMonitoringFunction.java
new file mode 100644
index 0000000..c223c53
--- /dev/null
+++ b/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/function/source/FileMonitoringFunction.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.streaming.api.function.source;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.flink.api.java.tuple.Tuple3;
+import org.apache.flink.core.fs.FileStatus;
+import org.apache.flink.core.fs.FileSystem;
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.util.Collector;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class FileMonitoringFunction implements SourceFunction<Tuple3<String, Long, Long>> {
+       private static final long serialVersionUID = 1L;
+
+       private static final Logger LOG = LoggerFactory.getLogger(FileMonitoringFunction.class);
+
+       public enum WatchType {
+               ONLY_NEW_FILES, // Only new files will be processed.
+               REPROCESS_WITH_APPENDED, // When some files are appended, all contents of the files will be processed.
+               PROCESS_ONLY_APPENDED // When some files are appended, only appended contents will be processed.
+       }
+
+       private String path;
+       private long interval;
+       private WatchType watchType;
+
+       private FileSystem fileSystem;
+       private long lastModificationTime;
+       private Map<String, Long> offsetOfFiles;
+
+       public FileMonitoringFunction(String path, long interval, WatchType watchType) {
+               this.path = path;
+               this.interval = interval;
+               this.watchType = watchType;
+
+               this.lastModificationTime = System.currentTimeMillis();
+               this.offsetOfFiles = new HashMap<String, Long>();
+       }
+
+       @Override
+       public void invoke(Collector<Tuple3<String, Long, Long>> collector) throws Exception {
+               fileSystem = FileSystem.get(new URI(path));
+
+               while (true) {
+                       List<String> files = listNewFiles();
+                       for (String filePath : files) {
+                               if (watchType == WatchType.ONLY_NEW_FILES || watchType == WatchType.REPROCESS_WITH_APPENDED) {
+                                       collector.collect(new Tuple3<String, Long, Long>(filePath, 0L, -1L));
+                                       offsetOfFiles.put(filePath, -1L);
+                               } else if (watchType == WatchType.PROCESS_ONLY_APPENDED) {
+                                       long offset = 0;
+                                       long fileSize = fileSystem.getFileStatus(new Path(filePath)).getLen();
+                                       if (offsetOfFiles.containsKey(filePath)) {
+                                               offset = offsetOfFiles.get(filePath);
+                                       }
+
+                                       collector.collect(new Tuple3<String, Long, Long>(filePath, offset, fileSize));
+                                       offsetOfFiles.put(filePath, fileSize);
+
+                                       LOG.info("File processed: {}, {}, {}", filePath, offset, fileSize);
+                               }
+                       }
+
+                       Thread.sleep(interval);
+               }
+       }
+
+       private List<String> listNewFiles() throws IOException {
+               List<String> files = new ArrayList<String>();
+               FileStatus[] statuses = fileSystem.listStatus(new Path(path));
+
+               for (FileStatus status : statuses) {
+                       Path filePath = status.getPath();
+                       long modificationTime = status.getModificationTime();
+
+                       if (!isFiltered(filePath, modificationTime)) {
+                               files.add(filePath.toString());
+                       }
+               }
+
+               lastModificationTime = System.currentTimeMillis();
+
+               return files;
+       }
+
+       private boolean isFiltered(Path path, long modificationTime) {
+               String filename = path.getName();
+
+               return lastModificationTime > modificationTime // not modified file
+                               || (watchType == WatchType.ONLY_NEW_FILES && offsetOfFiles.containsKey(path.toString())) // modified file but already processed
+                               || filename.startsWith(".") // hidden file
+                               || filename.contains("_COPYING_"); // currently copying file
+       }
+}

http://git-wip-us.apache.org/repos/asf/flink/blob/bf5e39a8/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/function/source/FileReadFunction.java
----------------------------------------------------------------------
diff --git a/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/function/source/FileReadFunction.java b/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/function/source/FileReadFunction.java
new file mode 100644
index 0000000..0882d9e
--- /dev/null
+++ b/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/function/source/FileReadFunction.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.streaming.api.function.source;
+
+import org.apache.flink.api.common.functions.FlatMapFunction;
+import org.apache.flink.api.java.tuple.Tuple3;
+import org.apache.flink.core.fs.FSDataInputStream;
+import org.apache.flink.core.fs.FileSystem;
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.util.Collector;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.URI;
+
+public class FileReadFunction implements FlatMapFunction<Tuple3<String, Long, Long>, String> {
+
+       private static final long serialVersionUID = 1L;
+
+       @Override
+       public void flatMap(Tuple3<String, Long, Long> value, Collector<String> out) throws Exception {
+               FSDataInputStream stream = FileSystem.get(new URI(value.f0)).open(new Path(value.f0));
+               stream.seek(value.f1);
+
+               BufferedReader reader = new BufferedReader(new InputStreamReader(stream));
+               String line;
+
+               try {
+                       while ((line = reader.readLine()) != null && (value.f2 == -1L || stream.getPos() <= value.f2)) {
+                               out.collect(line);
+                       }
+               } finally {
+                       reader.close();
+               }
+       }
+}

http://git-wip-us.apache.org/repos/asf/flink/blob/bf5e39a8/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/function/source/FileStreamFunction.java
----------------------------------------------------------------------
diff --git a/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/function/source/FileStreamFunction.java b/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/function/source/FileStreamFunction.java
deleted file mode 100644
index 7371ac9..0000000
--- a/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/function/source/FileStreamFunction.java
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.flink.streaming.api.function.source;
-
-import java.io.BufferedReader;
-import java.io.FileReader;
-import java.io.IOException;
-
-import org.apache.flink.util.Collector;
-
-public class FileStreamFunction implements SourceFunction<String> {
-       private static final long serialVersionUID = 1L;
-
-       private final String path;
-
-       public FileStreamFunction(String path) {
-               this.path = path;
-       }
-
-       @Override
-       public void invoke(Collector<String> collector) throws IOException {
-               while (true) {
-                       BufferedReader br = new BufferedReader(new FileReader(path));
-                       String line = br.readLine();
-                       while (line != null) {
-                               if (!line.equals("")) {
-                                       collector.collect(line);
-                               }
-                               line = br.readLine();
-                       }
-                       br.close();
-               }
-       }
-}

Reply via email to