[FLINK-1081] Add HDFS file-stream source for streaming
Conflicts:
flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/environment/StreamExecutionEnvironment.java
Project: http://git-wip-us.apache.org/repos/asf/flink/repo
Commit: http://git-wip-us.apache.org/repos/asf/flink/commit/bf5e39a8
Tree: http://git-wip-us.apache.org/repos/asf/flink/tree/bf5e39a8
Diff: http://git-wip-us.apache.org/repos/asf/flink/diff/bf5e39a8
Branch: refs/heads/master
Commit: bf5e39a8dea1e52de8f8eb92ded3d128ce66d572
Parents: 7aa9a50
Author: Chiwan Park <[email protected]>
Authored: Fri Dec 12 02:49:01 2014 +0900
Committer: Gyula Fora <[email protected]>
Committed: Sun Jan 25 20:21:22 2015 +0100
----------------------------------------------------------------------
.../environment/StreamExecutionEnvironment.java | 47 ++++----
.../function/source/FileMonitoringFunction.java | 117 +++++++++++++++++++
.../api/function/source/FileReadFunction.java | 51 ++++++++
.../api/function/source/FileStreamFunction.java | 49 --------
4 files changed, 191 insertions(+), 73 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/flink/blob/bf5e39a8/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/environment/StreamExecutionEnvironment.java
----------------------------------------------------------------------
diff --git
a/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/environment/StreamExecutionEnvironment.java
b/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/environment/StreamExecutionEnvironment.java
index 8a9be93..b7eeed2 100644
---
a/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/environment/StreamExecutionEnvironment.java
+++
b/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/environment/StreamExecutionEnvironment.java
@@ -29,6 +29,7 @@ import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.io.TextInputFormat;
+import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.typeutils.TypeExtractor;
import org.apache.flink.client.program.Client;
import org.apache.flink.client.program.ContextEnvironment;
@@ -36,8 +37,10 @@ import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.StreamGraph;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
+import org.apache.flink.streaming.api.function.source.FileMonitoringFunction;
+import
org.apache.flink.streaming.api.function.source.FileMonitoringFunction.WatchType;
+import org.apache.flink.streaming.api.function.source.FileReadFunction;
import org.apache.flink.streaming.api.function.source.FileSourceFunction;
-import org.apache.flink.streaming.api.function.source.FileStreamFunction;
import org.apache.flink.streaming.api.function.source.FromElementsFunction;
import org.apache.flink.streaming.api.function.source.GenSequenceFunction;
import org.apache.flink.streaming.api.function.source.GenericSourceFunction;
@@ -214,34 +217,30 @@ public abstract class StreamExecutionEnvironment {
}
/**
- * Creates a DataStream that represents the Strings produced by reading
the
- * given file line wise multiple times(infinite). The file will be read
with
- * the system's default character set. This functionality can be used
for
- * testing a topology.
+ * Creates a DataStream that contains the contents of files created while
+ * the system watches the given path. The files will be read with the
+ * system's default character set.
*
* @param filePath
* The path of the file, as a URI (e.g.,
- * "file:///some/local/file" or
"hdfs://host:port/file/path").
- * @return The DataStream representing the text file.
+ * "file:///some/local/file" or
"hdfs://host:port/file/path/").
+ * @param interval
+ * The interval between checks of the watched path, in milliseconds.
+ * @param watchType
+ * The watch type of the file stream. When watchType is
+ * {@link WatchType#ONLY_NEW_FILES}, the system processes only
+ * new files. {@link WatchType#REPROCESS_WITH_APPENDED} means
+ * that the system re-processes the whole contents of appended files.
+ * {@link WatchType#PROCESS_ONLY_APPENDED} means that the system
+ * processes only the appended contents of files.
+ *
+ * @return The DataStream containing the lines of the files read from the given path.
*/
- public DataStreamSource<String> readTextStream(String filePath) {
- checkIfFileExists(filePath);
- return addSource(new FileStreamFunction(filePath), null,
"textStream");
- }
+ public DataStream<String> readFileStream(String filePath, long
interval, WatchType watchType) {
+ DataStream<Tuple3<String, Long, Long>> source = addSource(new
FileMonitoringFunction(
+ filePath, interval, watchType));
- private static void checkIfFileExists(String filePath) {
- File file = new File(filePath);
- if (!file.exists()) {
- throw new IllegalArgumentException("File not found: " +
filePath);
- }
-
- if (!file.canRead()) {
- throw new IllegalArgumentException("Cannot read file: "
+ filePath);
- }
-
- if (file.isDirectory()) {
- throw new IllegalArgumentException("Given path is a
directory: " + filePath);
- }
+ return source.flatMap(new FileReadFunction());
}
/**
http://git-wip-us.apache.org/repos/asf/flink/blob/bf5e39a8/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/function/source/FileMonitoringFunction.java
----------------------------------------------------------------------
diff --git
a/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/function/source/FileMonitoringFunction.java
b/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/function/source/FileMonitoringFunction.java
new file mode 100644
index 0000000..c223c53
--- /dev/null
+++
b/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/function/source/FileMonitoringFunction.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.streaming.api.function.source;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.flink.api.java.tuple.Tuple3;
+import org.apache.flink.core.fs.FileStatus;
+import org.apache.flink.core.fs.FileSystem;
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.util.Collector;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class FileMonitoringFunction implements SourceFunction<Tuple3<String,
Long, Long>> {
+ private static final long serialVersionUID = 1L;
+
+ private static final Logger LOG =
LoggerFactory.getLogger(FileMonitoringFunction.class);
+
+ public enum WatchType {
+ ONLY_NEW_FILES, // Only new files will be processed.
+ REPROCESS_WITH_APPENDED, // When some files are appended, all
contents of the files will be processed.
+ PROCESS_ONLY_APPENDED // When some files are appended, only
appended contents will be processed.
+ }
+
+ private String path;
+ private long interval;
+ private WatchType watchType;
+
+ private FileSystem fileSystem;
+ private long lastModificationTime;
+ private Map<String, Long> offsetOfFiles;
+
+ public FileMonitoringFunction(String path, long interval, WatchType
watchType) {
+ this.path = path;
+ this.interval = interval;
+ this.watchType = watchType;
+
+ this.lastModificationTime = System.currentTimeMillis();
+ this.offsetOfFiles = new HashMap<String, Long>();
+ }
+
+ @Override
+ public void invoke(Collector<Tuple3<String, Long, Long>> collector)
throws Exception {
+ fileSystem = FileSystem.get(new URI(path));
+
+ while (true) {
+ List<String> files = listNewFiles();
+ for (String filePath : files) {
+ if (watchType == WatchType.ONLY_NEW_FILES ||
watchType == WatchType.REPROCESS_WITH_APPENDED) {
+ collector.collect(new Tuple3<String,
Long, Long>(filePath, 0L, -1L));
+ offsetOfFiles.put(filePath, -1L);
+ } else if (watchType ==
WatchType.PROCESS_ONLY_APPENDED) {
+ long offset = 0;
+ long fileSize =
fileSystem.getFileStatus(new Path(filePath)).getLen();
+ if
(offsetOfFiles.containsKey(filePath)) {
+ offset =
offsetOfFiles.get(filePath);
+ }
+
+ collector.collect(new Tuple3<String,
Long, Long>(filePath, offset, fileSize));
+ offsetOfFiles.put(filePath, fileSize);
+
+ LOG.info("File processed: {}, {}, {}",
filePath, offset, fileSize);
+ }
+ }
+
+ Thread.sleep(interval);
+ }
+ }
+
+ private List<String> listNewFiles() throws IOException {
+ List<String> files = new ArrayList<String>();
+ FileStatus[] statuses = fileSystem.listStatus(new Path(path));
+
+ for (FileStatus status : statuses) {
+ Path filePath = status.getPath();
+ long modificationTime = status.getModificationTime();
+
+ if (!isFiltered(filePath, modificationTime)) {
+ files.add(filePath.toString());
+ }
+ }
+
+ lastModificationTime = System.currentTimeMillis();
+
+ return files;
+ }
+
+ private boolean isFiltered(Path path, long modificationTime) {
+ String filename = path.getName();
+
+ return lastModificationTime > modificationTime // not modified
file
+ || (watchType == WatchType.ONLY_NEW_FILES &&
offsetOfFiles.containsKey(path.toString())) // modified file but already
processed
+ || filename.startsWith(".") // hidden file
+ || filename.contains("_COPYING_"); // currently
copying file
+ }
+}
http://git-wip-us.apache.org/repos/asf/flink/blob/bf5e39a8/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/function/source/FileReadFunction.java
----------------------------------------------------------------------
diff --git
a/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/function/source/FileReadFunction.java
b/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/function/source/FileReadFunction.java
new file mode 100644
index 0000000..0882d9e
--- /dev/null
+++
b/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/function/source/FileReadFunction.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.streaming.api.function.source;
+
+import org.apache.flink.api.common.functions.FlatMapFunction;
+import org.apache.flink.api.java.tuple.Tuple3;
+import org.apache.flink.core.fs.FSDataInputStream;
+import org.apache.flink.core.fs.FileSystem;
+import org.apache.flink.core.fs.Path;
+import org.apache.flink.util.Collector;
+
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.net.URI;
+
+public class FileReadFunction implements FlatMapFunction<Tuple3<String, Long,
Long>, String> {
+
+ private static final long serialVersionUID = 1L;
+
+ @Override
+ public void flatMap(Tuple3<String, Long, Long> value, Collector<String>
out) throws Exception {
+ FSDataInputStream stream = FileSystem.get(new
URI(value.f0)).open(new Path(value.f0));
+ stream.seek(value.f1);
+
+ BufferedReader reader = new BufferedReader(new
InputStreamReader(stream));
+ String line;
+
+ try {
+ while ((line = reader.readLine()) != null && (value.f2
== -1L || stream.getPos() <= value.f2)) {
+ out.collect(line);
+ }
+ } finally {
+ reader.close();
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/flink/blob/bf5e39a8/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/function/source/FileStreamFunction.java
----------------------------------------------------------------------
diff --git
a/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/function/source/FileStreamFunction.java
b/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/function/source/FileStreamFunction.java
deleted file mode 100644
index 7371ac9..0000000
---
a/flink-addons/flink-streaming/flink-streaming-core/src/main/java/org/apache/flink/streaming/api/function/source/FileStreamFunction.java
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.flink.streaming.api.function.source;
-
-import java.io.BufferedReader;
-import java.io.FileReader;
-import java.io.IOException;
-
-import org.apache.flink.util.Collector;
-
-public class FileStreamFunction implements SourceFunction<String> {
- private static final long serialVersionUID = 1L;
-
- private final String path;
-
- public FileStreamFunction(String path) {
- this.path = path;
- }
-
- @Override
- public void invoke(Collector<String> collector) throws IOException {
- while (true) {
- BufferedReader br = new BufferedReader(new
FileReader(path));
- String line = br.readLine();
- while (line != null) {
- if (!line.equals("")) {
- collector.collect(line);
- }
- line = br.readLine();
- }
- br.close();
- }
- }
-}