[ https://issues.apache.org/jira/browse/FLINK-2314?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15308004#comment-15308004 ]
ASF GitHub Bot commented on FLINK-2314: --------------------------------------- Github user aljoscha commented on a diff in the pull request: https://github.com/apache/flink/pull/2020#discussion_r65212915 --- Diff: flink-streaming-java/src/main/java/org/apache/flink/streaming/api/functions/source/ContinuousFileMonitoringFunction.java --- @@ -0,0 +1,336 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.flink.streaming.api.functions.source; + +import org.apache.flink.annotation.Internal; +import org.apache.flink.api.common.io.FileInputFormat; +import org.apache.flink.api.java.tuple.Tuple2; +import org.apache.flink.api.java.tuple.Tuple3; +import org.apache.flink.configuration.Configuration; +import org.apache.flink.core.fs.FileInputSplit; +import org.apache.flink.core.fs.FileStatus; +import org.apache.flink.core.fs.FileSystem; +import org.apache.flink.core.fs.Path; +import org.apache.flink.runtime.JobException; +import org.apache.flink.streaming.api.checkpoint.Checkpointed; +import org.apache.flink.util.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.URI; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; + +/** + * This is the single (non-parallel) task which takes a {@link FileInputFormat} and is responsible for + * i) monitoring a user-provided path, ii) deciding which files should be further read and processed, + * iii) creating the {@link FileInputSplit FileInputSplits} corresponding to those files, and iv) assigning + * them to downstream tasks for further reading and processing. Which splits will be further processed + * depends on the user-provided {@link ProcessingMode} and the {@link FilePathFilter}. + * The splits of the files to be read are then forwarded to the downstream + * {@link ContinuousFileReaderOperator} which can have parallelism greater than one. 
+ */ +@Internal +public class ContinuousFileMonitoringFunction<OUT> + extends RichSourceFunction<FileInputSplit> implements Checkpointed<Tuple3<List<Tuple2<Long, List<FileInputSplit>>>, Tuple2<Long, List<FileInputSplit>>, Long>> { + + private static final long serialVersionUID = 1L; + + private static final Logger LOG = LoggerFactory.getLogger(ContinuousFileMonitoringFunction.class); + + /** + * The minimum interval allowed between consecutive path scans. This is applicable if the + * {@code watchType} is set to {@code PROCESS_CONTINUOUSLY}. + */ + public static final long MIN_MONITORING_INTERVAL = 100l; + + /** + * Specifies when computation will be triggered. + */ + public enum ProcessingMode { --- End diff -- We should probably move this outside of this class and name it `FileProcessingMode`, just in case we want to change the actual file source again. 😉 > Make Streaming File Sources Persistent > -------------------------------------- > > Key: FLINK-2314 > URL: https://issues.apache.org/jira/browse/FLINK-2314 > Project: Flink > Issue Type: Improvement > Components: Streaming > Affects Versions: 0.9 > Reporter: Stephan Ewen > Assignee: Kostas Kloudas > > Streaming File sources should participate in checkpointing. They should > track the number of bytes they have read from the file and checkpoint that offset. > One can look at the sequence-generating source function for an example of a > checkpointed source. -- This message was sent by Atlassian JIRA (v6.3.4#6332)