[GitHub] flink pull request: Refactoring the File Monitoring Source.

zentol Mon, 25 Apr 2016 08:10:38 -0700

Github user zentol commented on a diff in the pull request:

    https://github.com/apache/flink/pull/1929#discussion_r60929665
  
    --- Diff: 
flink-streaming-java/src/test/java/org/apache/flink/streaming/api/functions/source/FileSplitMonitoringFunctionTest.java
 ---
    @@ -0,0 +1,371 @@
    +/*
    + * Licensed to the Apache Software Foundation (ASF) under one
    + * or more contributor license agreements.  See the NOTICE file
    + * distributed with this work for additional information
    + * regarding copyright ownership.  The ASF licenses this file
    + * to you under the Apache License, Version 2.0 (the
    + * "License"); you may not use this file except in compliance
    + * with the License.  You may obtain a copy of the License at
    + *
    + *    http://www.apache.org/licenses/LICENSE-2.0
    + *
    + * Unless required by applicable law or agreed to in writing, software
    + * distributed under the License is distributed on an "AS IS" BASIS,
    + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    + * See the License for the specific language governing permissions and
    + * limitations under the License.
    + */
    +
    +package org.apache.flink.streaming.api.functions.source;
    +
    +import org.apache.commons.io.IOUtils;
    +import org.apache.flink.api.common.io.FileInputFormat;
    +import org.apache.flink.api.common.typeinfo.TypeInformation;
    +import org.apache.flink.api.java.typeutils.TypeExtractor;
    +import org.apache.flink.configuration.Configuration;
    +import org.apache.flink.core.fs.FileInputSplit;
    +import org.apache.flink.core.fs.Path;
    +import org.apache.flink.streaming.api.watermark.Watermark;
    +import org.apache.flink.streaming.runtime.streamrecord.StreamRecord;
    +import org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness;
    +import org.apache.hadoop.fs.FSDataOutputStream;
    +import org.apache.hadoop.fs.FileUtil;
    +import org.apache.hadoop.hdfs.MiniDFSCluster;
    +import org.junit.After;
    +import org.junit.Assert;
    +import org.junit.Before;
    +import org.junit.Test;
    +
    +import java.io.File;
    +import java.io.IOException;
    +import java.io.StringWriter;
    +import java.util.HashMap;
    +import java.util.HashSet;
    +import java.util.Map;
    +import java.util.Queue;
    +import java.util.Set;
    +
    +public class FileSplitMonitoringFunctionTest {
    +
    +   private static final int NO_OF_FILES = 10;
    +   private static final int LINES_PER_FILE = 10;
    +
    +   private static final long INTERVAL = 200;
    +
    +   private File baseDir;
    +
    +   private org.apache.hadoop.fs.FileSystem hdfs;
    +   private String hdfsURI;
    +   private MiniDFSCluster hdfsCluster;
    +
    +   private Set<org.apache.hadoop.fs.Path> hdPaths = new HashSet<>();
    +   private Set<String> hdPathNames = new HashSet<>();
    +   private Map<Integer, String> hdPathContents = new HashMap<>();
    +
    +   //                                              PREPARING FOR THE TESTS
    +
    +   @Before
    +   public void createHDFS() {
    +           try {
    +                   baseDir = new 
File("./target/hdfs/hdfsTesting").getAbsoluteFile();
    +                   FileUtil.fullyDelete(baseDir);
    +
    +                   org.apache.hadoop.conf.Configuration hdConf = new 
org.apache.hadoop.conf.Configuration();
    +                   hdConf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, 
baseDir.getAbsolutePath());
    +                   hdConf.set("dfs.block.size", String.valueOf(1048576)); 
// this is the minimum we can set.
    +
    +                   MiniDFSCluster.Builder builder = new 
MiniDFSCluster.Builder(hdConf);
    +                   hdfsCluster = builder.build();
    +
    +                   hdfsURI = "hdfs://" + hdfsCluster.getURI().getHost() + 
":" + hdfsCluster.getNameNodePort() +"/";
    +                   hdfs = new 
org.apache.hadoop.fs.Path(hdfsURI).getFileSystem(hdConf);
    +
    +           } catch(Throwable e) {
    +                   e.printStackTrace();
    +                   Assert.fail("Test failed " + e.getMessage());
    +           }
    +   }
    +
    +   @After
    +   public void destroyHDFS() {
    +           try {
    +                   for(org.apache.hadoop.fs.Path file: hdPaths) {
    +                           hdfs.delete(file, false);
    +                   }
    +                   FileUtil.fullyDelete(baseDir);
    +                   hdfsCluster.shutdown();
    +           } catch (IOException e) {
    +                   throw new RuntimeException(e);
    +           }
    +   }
    +
    +   //                                              END OF PREPARATIONS
    +
    +   //                                              TESTS
    +
    +   @Test
    +   public void testFileContents() throws IOException {
    +           // validates the output
    +           for (org.apache.hadoop.fs.Path file : hdPaths) {
    +                   org.apache.hadoop.fs.FSDataInputStream inStream = 
hdfs.open(file);
    +                   StringWriter writer = new StringWriter();
    +                   IOUtils.copy(inStream, writer);
    +                   inStream.close();
    +
    +                   String resultString = writer.toString();
    +                   Assert.assertEquals(hdPathContents.get(file), 
resultString);
    +           }
    +   }
    +
    +   @Test
    +   public void testFileReadingOperator() throws Exception {
    +
    +           // create the files to read.
    +           for(int i = 0; i < NO_OF_FILES; i++) {
    +                   fillWithData(hdfsURI, "file", i, "This is test line.");
    +           }
    +
    +           StringFileFormat format = new StringFileFormat();
    +           Configuration config = new Configuration();
    +           config.setString("input.file.path", hdfsURI);
    +
    +           TypeInformation<String> typeInfo = 
TypeExtractor.getInputFormatTypes(format);
    +
    +           FileSplitReadOperator reader = new 
FileSplitReadOperator(format, typeInfo, config);
    +
    +           OneInputStreamOperatorTestHarness<FileInputSplit, String> 
tester =
    +                   new OneInputStreamOperatorTestHarness<FileInputSplit, 
String>(reader);
    +           tester.open();
    +
    +           FileInputSplit[] splits = format.createInputSplits(
    +                   
reader.getRuntimeContext().getNumberOfParallelSubtasks());
    +
    +           for(FileInputSplit split: splits) {
    +                   tester.processElement(new StreamRecord<>(split));
    +           }
    +
    +           /*
    +           * Given that the reader is multithreaded, the test finishes 
before the reader finishes
    +           * reading. This results in files being deleted before they are 
read, thus throwing an exception.
    +           * In addition, even if file deletion happens at the end, the 
results are not ready for testing.
    +           * To address this, we wait until all the output is collected or 
until the waiting time exceeds 1000 ms, or 1s.
    +           * */
    +           long start = System.currentTimeMillis();
    +           Queue<Object> output;
    +           do {
    +                   output = tester.getOutput();
    +           } while ((output == null || output.size() != NO_OF_FILES * 
LINES_PER_FILE) && (System.currentTimeMillis() - start) < 1000);
    +
    +           tester.close();
    +
    +           Map<Integer, String> fileContent = new HashMap<>();
    +           for(Object line: tester.getOutput()) {
    +                   StreamRecord<String> element = (StreamRecord<String>) 
line;
    +
    +                   int fileIdx = 
Character.getNumericValue(element.getValue().charAt(0));
    +                   String content = fileContent.get(fileIdx);
    +                   if(content == null) {
    +                           content = "";
    +                   }
    +                   fileContent.put(fileIdx, content + element.getValue());
    +           }
    +
    +           // test if all the file contents are the expected ones.
    +           for(int fileIdx: fileContent.keySet()) {
    +                   Assert.assertEquals(fileContent.get(fileIdx), 
hdPathContents.get(fileIdx));
    +           }
    +   }
    +
    +
    +   @Test
    +   public void testFilePathFiltering() throws Exception {
    +           Set<String> uniqFilesFound = new HashSet<>();
    +
    +           // create the files to be discarded
    +           for(int i = 0; i < NO_OF_FILES; i++) {
    +                   fillWithData(hdfsURI, "**file", i, "This is test 
line.");
    +           }
    +
    +           // create the files to be kept
    +           for(int i = 0; i < NO_OF_FILES; i++) {
    +                   fillWithData(hdfsURI, "file", i, "This is test line.");
    +           }
    +
    +           StringFileFormat format = new StringFileFormat();
    +           Configuration config = new Configuration();
    +           config.setString("input.file.path", hdfsURI);
    +
    +           FileSplitMonitoringFunction<String> monitoringFunction =
    +                   new FileSplitMonitoringFunction<String>(format, 
hdfsURI, new PathFilter(),
    +                           
FileSplitMonitoringFunction.WatchType.REPROCESS_WITH_APPENDED, 1, INTERVAL);
    +
    +           monitoringFunction.open(config);
    +           monitoringFunction.run(new TestingSourceContext(null, 
monitoringFunction, uniqFilesFound));
    +
    +           Assert.assertTrue(uniqFilesFound.size() == NO_OF_FILES);
    +           for(int i = 0; i < NO_OF_FILES; i++) {
    +                   org.apache.hadoop.fs.Path file = new 
org.apache.hadoop.fs.Path(hdfsURI + "/file" + i);
    +                   
Assert.assertTrue(uniqFilesFound.contains(file.toString()));
    +           }
    +   }
    +
    +   private static class PathFilter implements FilePathFilter {
    +
    +           @Override
    +           public boolean filterPaths(Path filePath) {
    +                   return filePath.getName().startsWith("**");
    +           }
    +   }
    +
    +   @Test
    +   public void testFileSplitMonitoring() throws Exception {
    +           Set<String> uniqFilesFound = new HashSet<>();
    +
    +           FileCreator fc = new FileCreator(INTERVAL);
    +           Thread t = new Thread(fc);
    +           t.start();
    +
    +           StringFileFormat format = new StringFileFormat();
    +           Configuration config = new Configuration();
    +           config.setString("input.file.path", hdfsURI);
    +
    +           FileSplitMonitoringFunction<String> monitoringFunction =
    +                   new FileSplitMonitoringFunction<String>(format, 
hdfsURI, FileSplitMonitoringFunction.WatchType.REPROCESS_WITH_APPENDED, 1, 
INTERVAL);
    +           monitoringFunction.open(config);
    +           monitoringFunction.run(new TestingSourceContext(t, 
monitoringFunction, uniqFilesFound));
    +
    +           Assert.assertTrue(uniqFilesFound.size() == NO_OF_FILES);
    +   }
    +
    +   /**
    +    * A separate thread creating {@link #NO_OF_FILES} files, one file 
every {@link #INTERVAL} milliseconds.
    +    * It serves for testing the file monitoring functionality of the {@link 
FileSplitMonitoringFunction}.
    +    * The files are filled with data by the {@link #fillWithData(String, 
String, int, String)} method.
    +    * */
    +   private class FileCreator implements Runnable {
    +
    +           private final long interval;
    +
    +           FileCreator(long interval) {
    +                   this.interval = interval;
    +           }
    +
    +           public void run() {
    +                   try {
    +                           for(int i = 0; i < NO_OF_FILES; i++) {
    +                                   fillWithData(hdfsURI, "file", i, "This 
is test line.");
    +                                   Thread.sleep(interval);
    +                           }
    +                   } catch (IOException e) {
    +                           e.printStackTrace();
    +                   } catch (InterruptedException e) {
    +                           // we just close without any message.
    +                   }
    +           }
    +   }
    +
    +   private class TestingSourceContext implements 
SourceFunction.SourceContext<FileInputSplit> {
    +
    +           private final Thread fileCreator;
    +           private final FileSplitMonitoringFunction src;
    +           private final Set<String> filesFound;
    +
    +           TestingSourceContext(Thread fileCreationThread, 
FileSplitMonitoringFunction monitoringFunction, Set<String> uniqFilesFound) {
    +                   this.filesFound = uniqFilesFound;
    +                   this.src = monitoringFunction;
    +                   this.fileCreator = fileCreationThread;
    +           }
    +
    +           @Override
    +           public void collect(FileInputSplit element) {
    +
    +                   String filePath = element.getPath().toString();
    +                   Assert.assertTrue(hdPathNames.contains(filePath));
    +
    +                   if(filesFound.contains(filePath)) {
    +                           // check if we have duplicate splits that are 
open during the first time
    +                           // the monitor sees them, and then close, 
so the modification time changes.
    +                           // I should keep the list of found files, and 
prune it when I do not see the same file again.
    +                           Assert.fail("Duplicate file: " + filePath);
    +                   }
    +
    +                   filesFound.add(filePath);
    +                   try {
    +                           if (filesFound.size() == NO_OF_FILES) {
    +                                   this.src.cancel();
    +                                   this.src.close();
    +                                   if(this.fileCreator != null) {
    --- End diff --
    
    missing space after if



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastruct...@apache.org or file a JIRA ticket
with INFRA.
---

[GitHub] flink pull request: Refactoring the File Monitoring Source.

Reply via email to