[GitHub] flink pull request #2618: Refactoring the Continuous File Monitoring Functio...

mxm Wed, 12 Oct 2016 07:54:50 -0700

Github user mxm commented on a diff in the pull request:

    https://github.com/apache/flink/pull/2618#discussion_r83014810
  
    --- Diff: 
flink-fs-tests/src/test/java/org/apache/flink/hdfstests/ContinuousFileProcessingTests.java
 ---
    @@ -336,237 +348,294 @@ public int compare(String o1, String o2) {
                        Assert.assertEquals(expectedFileContents.get(fileIdx), 
cntntStr.toString());
                }
     
    -           for(org.apache.hadoop.fs.Path file: filesCreated) {
    +           for (org.apache.hadoop.fs.Path file: filesCreated) {
                        hdfs.delete(file, false);
                }
        }
     
    -   private static class PathFilter extends FilePathFilter {
    -
    -           @Override
    -           public boolean filterPath(Path filePath) {
    -                   return filePath.getName().startsWith("**");
    -           }
    -   }
    +   ////                            Monitoring Function Tests               
                //////
     
        @Test
        public void testFilePathFiltering() throws Exception {
    -           Set<String> uniqFilesFound = new HashSet<>();
                Set<org.apache.hadoop.fs.Path> filesCreated = new HashSet<>();
    +           Set<String> filesKept = new TreeSet<>();
     
                // create the files to be discarded
                for (int i = 0; i < NO_OF_FILES; i++) {
    -                   Tuple2<org.apache.hadoop.fs.Path, String> file = 
fillWithData(hdfsURI, "**file", i, "This is test line.");
    +                   Tuple2<org.apache.hadoop.fs.Path, String> file = 
createFileAndFillWithData(hdfsURI, "**file", i, "This is test line.");
                        filesCreated.add(file.f0);
                }
     
                // create the files to be kept
                for (int i = 0; i < NO_OF_FILES; i++) {
    -                   Tuple2<org.apache.hadoop.fs.Path, String> file = 
fillWithData(hdfsURI, "file", i, "This is test line.");
    +                   Tuple2<org.apache.hadoop.fs.Path, String> file =
    +                           createFileAndFillWithData(hdfsURI, "file", i, 
"This is test line.");
                        filesCreated.add(file.f0);
    +                   filesKept.add(file.f0.getName());
                }
     
                TextInputFormat format = new TextInputFormat(new Path(hdfsURI));
                format.setFilesFilter(new PathFilter());
    +
                ContinuousFileMonitoringFunction<String> monitoringFunction =
                        new ContinuousFileMonitoringFunction<>(format, hdfsURI,
                                FileProcessingMode.PROCESS_ONCE, 1, INTERVAL);
     
    +           final FileVerifyingSourceContext context =
    +                   new FileVerifyingSourceContext(new OneShotLatch(), 
monitoringFunction, 0, -1);
    +
                monitoringFunction.open(new Configuration());
    -           monitoringFunction.run(new 
TestingSourceContext(monitoringFunction, uniqFilesFound));
    +           monitoringFunction.run(context);
     
    -           Assert.assertEquals(NO_OF_FILES, uniqFilesFound.size());
    -           for(int i = 0; i < NO_OF_FILES; i++) {
    -                   org.apache.hadoop.fs.Path file = new 
org.apache.hadoop.fs.Path(hdfsURI + "/file" + i);
    -                   
Assert.assertTrue(uniqFilesFound.contains(file.toString()));
    -           }
    +           Assert.assertArrayEquals(filesKept.toArray(), 
context.getSeenFiles().toArray());
     
    -           for(org.apache.hadoop.fs.Path file: filesCreated) {
    +           // finally delete the files created for the test.
    +           for (org.apache.hadoop.fs.Path file: filesCreated) {
                        hdfs.delete(file, false);
                }
        }
     
    +   private static class PathFilter extends FilePathFilter {
    +           @Override
    +           public boolean filterPath(Path filePath) {
    +                   return filePath.getName().startsWith("**");
    +           }
    +   }
    +
        @Test
    -   public void testFileSplitMonitoringReprocessWithAppended() throws 
Exception {
    -           final Set<String> uniqFilesFound = new HashSet<>();
    +   public void testSortingOnModTime() throws Exception {
    +           final long[] modTimes = new long[NO_OF_FILES];
    +           final org.apache.hadoop.fs.Path[] filesCreated = new 
org.apache.hadoop.fs.Path[NO_OF_FILES];
    +
    +           // create some files
    +           for (int i = 0; i < NO_OF_FILES; i++) {
    +                   Tuple2<org.apache.hadoop.fs.Path, String> file =
    +                           createFileAndFillWithData(hdfsURI, "file", i, 
"This is test line.");
    +                   Thread.sleep(10);
    +
    +                   filesCreated[i] = file.f0;
    +                   modTimes[i] = 
hdfs.getFileStatus(file.f0).getModificationTime();
    +           }
    +
    +           TextInputFormat format = new TextInputFormat(new Path(hdfsURI));
    +           format.setFilesFilter(FilePathFilter.createDefaultFilter());
     
    -           FileCreator fc = new FileCreator(INTERVAL, NO_OF_FILES);
    -           fc.start();
    +           // this is just to verify that all splits have been forwarded 
later.
    +           FileInputSplit[] splits = format.createInputSplits(1);
     
    -           Thread t = new Thread(new Runnable() {
    +           ContinuousFileMonitoringFunction<String> monitoringFunction =
    +                   new ContinuousFileMonitoringFunction<>(format, hdfsURI,
    +                           FileProcessingMode.PROCESS_ONCE, 1, INTERVAL);
    +
    +           ModTimeVerifyingSourceContext context = new 
ModTimeVerifyingSourceContext(modTimes);
    +
    +           monitoringFunction.open(new Configuration());
    +           monitoringFunction.run(context);
    +           Assert.assertEquals(splits.length, context.getCounter());
    +
    +           // delete the created files.
    +           for (int i = 0; i < NO_OF_FILES; i++) {
    +                   hdfs.delete(filesCreated[i], false);
    +           }
    +   }
    +
    +   @Test
    +   public void testProcessOnce() throws Exception {
    +           final OneShotLatch latch = new OneShotLatch();
    +
    +           // create a single file in the directory
    +           Tuple2<org.apache.hadoop.fs.Path, String> bootstrap =
    +                   createFileAndFillWithData(hdfsURI, "file", NO_OF_FILES 
+ 1, "This is test line.");
    +           Assert.assertTrue(hdfs.exists(bootstrap.f0));
    +
    +           // the source is supposed to read only this file.
    +           final Set<String> filesToBeRead = new TreeSet<>();
    +           filesToBeRead.add(bootstrap.f0.getName());
    +
    +           TextInputFormat format = new TextInputFormat(new Path(hdfsURI));
    +           format.setFilesFilter(FilePathFilter.createDefaultFilter());
    +
    +           final ContinuousFileMonitoringFunction<String> 
monitoringFunction =
    +                   new ContinuousFileMonitoringFunction<>(format, hdfsURI,
    +                           FileProcessingMode.PROCESS_ONCE, 1, INTERVAL);
    +
    +           final FileVerifyingSourceContext context =
    +                   new FileVerifyingSourceContext(latch, 
monitoringFunction, 1, -1);
    +
    +           final Thread t = new Thread() {
                        @Override
                        public void run() {
    -                           TextInputFormat format = new 
TextInputFormat(new Path(hdfsURI));
    -                           
format.setFilesFilter(FilePathFilter.createDefaultFilter());
    -                           ContinuousFileMonitoringFunction<String> 
monitoringFunction =
    -                                   new 
ContinuousFileMonitoringFunction<>(format, hdfsURI,
    -                                           
FileProcessingMode.PROCESS_CONTINUOUSLY, 1, INTERVAL);
    -
                                try {
                                        monitoringFunction.open(new 
Configuration());
    -                                   monitoringFunction.run(new 
TestingSourceContext(monitoringFunction, uniqFilesFound));
    +                                   monitoringFunction.run(context);
                                } catch (Exception e) {
    -                                   // do nothing as we interrupted the 
thread.
    +                                   Assert.fail(e.getMessage());
                                }
                        }
    -           });
    +           };
                t.start();
     
    -           // wait until the sink also sees all the splits.
    -           synchronized (uniqFilesFound) {
    -                   uniqFilesFound.wait();
    +           if (!latch.isTriggered()) {
    +                   latch.await();
                }
    -           t.interrupt();
    -           fc.join();
     
    -           Assert.assertEquals(NO_OF_FILES, fc.getFilesCreated().size());
    -           Assert.assertEquals(NO_OF_FILES, uniqFilesFound.size());
    -
    -           Set<org.apache.hadoop.fs.Path> filesCreated = 
fc.getFilesCreated();
    -           Set<String> fileNamesCreated = new HashSet<>();
    -           for (org.apache.hadoop.fs.Path path: fc.getFilesCreated()) {
    -                   fileNamesCreated.add(path.toString());
    +           // create some additional files that would be processed in the 
case of PROCESS_CONTINUOUSLY
    +           final org.apache.hadoop.fs.Path[] filesCreated = new 
org.apache.hadoop.fs.Path[NO_OF_FILES];
    +           for (int i = 0; i < NO_OF_FILES; i++) {
    +                   Tuple2<org.apache.hadoop.fs.Path, String> ignoredFile =
    +                           createFileAndFillWithData(hdfsURI, "file", i, 
"This is test line.");
    +                   filesCreated[i] = ignoredFile.f0;
                }
     
    -           for(String file: uniqFilesFound) {
    -                   Assert.assertTrue(fileNamesCreated.contains(file));
    -           }
    +           // wait until the monitoring thread exits
    +           t.join();
     
    -           for(org.apache.hadoop.fs.Path file: filesCreated) {
    -                   hdfs.delete(file, false);
    +           Assert.assertArrayEquals(filesToBeRead.toArray(), 
context.getSeenFiles().toArray());
    +
    +           // finally delete the files created for the test.
    +           hdfs.delete(bootstrap.f0, false);
    +           for (org.apache.hadoop.fs.Path path: filesCreated) {
    +                   hdfs.delete(path, false);
                }
        }
     
        @Test
    -   public void testFileSplitMonitoringProcessOnce() throws Exception {
    -           Set<String> uniqFilesFound = new HashSet<>();
    -
    -           FileCreator fc = new FileCreator(INTERVAL, 1);
    -           Set<org.apache.hadoop.fs.Path> filesCreated = 
fc.getFilesCreated();
    -           fc.start();
    -
    -           // to make sure that at least one file is created
    -           if (filesCreated.size() == 0) {
    -                   synchronized (filesCreated) {
    -                           if (filesCreated.size() == 0) {
    -                                   filesCreated.wait();
    -                           }
    -                   }
    -           }
    -           Assert.assertTrue(fc.getFilesCreated().size() >= 1);
    +   public void testProcessContinuously() throws Exception {
    +           final OneShotLatch latch = new OneShotLatch();
    +
    +           // create a single file in the directory
    +           Tuple2<org.apache.hadoop.fs.Path, String> bootstrap =
    +                   createFileAndFillWithData(hdfsURI, "file", NO_OF_FILES 
+ 1, "This is test line.");
    +           Assert.assertTrue(hdfs.exists(bootstrap.f0));
    +
    +           // the source is supposed to read only this file.
    +           final Set<String> filesToBeRead = new TreeSet<>();
    +           filesToBeRead.add(bootstrap.f0.getName());
     
                TextInputFormat format = new TextInputFormat(new Path(hdfsURI));
                format.setFilesFilter(FilePathFilter.createDefaultFilter());
    -           ContinuousFileMonitoringFunction<String> monitoringFunction =
    +
    +           final ContinuousFileMonitoringFunction<String> 
monitoringFunction =
                        new ContinuousFileMonitoringFunction<>(format, hdfsURI,
    -                           FileProcessingMode.PROCESS_ONCE, 1, INTERVAL);
    +                           FileProcessingMode.PROCESS_CONTINUOUSLY, 1, 
INTERVAL);
     
    -           monitoringFunction.open(new Configuration());
    -           monitoringFunction.run(new 
TestingSourceContext(monitoringFunction, uniqFilesFound));
    +           final int totalNoOfFilesToBeRead = 11; // 1 for the bootstrap + 
NO_OF_FILES
    +           final FileVerifyingSourceContext context = new 
FileVerifyingSourceContext(latch,
    +                   monitoringFunction, 1, totalNoOfFilesToBeRead);
     
    -           // wait until all the files are created
    -           fc.join();
    +           final Thread t = new Thread() {
     
    -           Assert.assertEquals(NO_OF_FILES, filesCreated.size());
    +                   @Override
    +                   public void run() {
    +                           try {
    +                                   monitoringFunction.open(new 
Configuration());
    +                                   monitoringFunction.run(context);
    +                           } catch (Exception e) {
    +                                   Assert.fail(e.getMessage());
    +                           }
    +                   }
    +           };
    +           t.start();
     
    -           Set<String> fileNamesCreated = new HashSet<>();
    -           for (org.apache.hadoop.fs.Path path: fc.getFilesCreated()) {
    -                   fileNamesCreated.add(path.toString());
    +           if (!latch.isTriggered()) {
    +                   latch.await();
                }
     
    -           Assert.assertTrue(uniqFilesFound.size() >= 1 && 
uniqFilesFound.size() < fileNamesCreated.size());
    -           for(String file: uniqFilesFound) {
    -                   Assert.assertTrue(fileNamesCreated.contains(file));
    +           // create some additional files that would be processed in the 
case of PROCESS_CONTINUOUSLY
    +           final org.apache.hadoop.fs.Path[] filesCreated = new 
org.apache.hadoop.fs.Path[NO_OF_FILES];
    +           for (int i = 0; i < NO_OF_FILES; i++) {
    +                   Tuple2<org.apache.hadoop.fs.Path, String> file =
    +                           createFileAndFillWithData(hdfsURI, "file", i, 
"This is test line.");
    +                   filesCreated[i] = file.f0;
    +                   filesToBeRead.add(file.f0.getName());
                }
     
    -           for(org.apache.hadoop.fs.Path file: filesCreated) {
    -                   hdfs.delete(file, false);
    +           // wait until the monitoring thread exits
    +           t.join();
    +
    +           Assert.assertArrayEquals(filesToBeRead.toArray(), 
context.getSeenFiles().toArray());
    +
    +           // finally delete the files created for the test.
    +           hdfs.delete(bootstrap.f0, false);
    +           for (org.apache.hadoop.fs.Path path: filesCreated) {
    +                   hdfs.delete(path, false);
                }
        }
     
    -   // -------------                End of Tests
    +   ///////////                             Source Contexts Used by the 
tests                               /////////////////
     
    -   private int getLineNo(String line) {
    -           String[] tkns = line.split("\\s");
    -           Assert.assertEquals(6, tkns.length);
    -           return Integer.parseInt(tkns[tkns.length - 1]);
    -   }
    +   private static class FileVerifyingSourceContext extends 
DummySourceContext {
    --- End diff --
    
    I wonder, couldn't you just use `Mockito.mock(SourceContext.class)` for 
that?



---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---

[GitHub] flink pull request #2618: Refactoring the Continuous File Monitoring Functio...

Reply via email to