Github user mxm commented on a diff in the pull request:
https://github.com/apache/flink/pull/2618#discussion_r83014810
--- Diff:
flink-fs-tests/src/test/java/org/apache/flink/hdfstests/ContinuousFileProcessingTests.java
---
@@ -336,237 +348,294 @@ public int compare(String o1, String o2) {
Assert.assertEquals(expectedFileContents.get(fileIdx),
cntntStr.toString());
}
- for(org.apache.hadoop.fs.Path file: filesCreated) {
+ for (org.apache.hadoop.fs.Path file: filesCreated) {
hdfs.delete(file, false);
}
}
- private static class PathFilter extends FilePathFilter {
-
- @Override
- public boolean filterPath(Path filePath) {
- return filePath.getName().startsWith("**");
- }
- }
+ //// Monitoring Function Tests
//////
@Test
public void testFilePathFiltering() throws Exception {
- Set<String> uniqFilesFound = new HashSet<>();
Set<org.apache.hadoop.fs.Path> filesCreated = new HashSet<>();
+ Set<String> filesKept = new TreeSet<>();
// create the files to be discarded
for (int i = 0; i < NO_OF_FILES; i++) {
- Tuple2<org.apache.hadoop.fs.Path, String> file =
fillWithData(hdfsURI, "**file", i, "This is test line.");
+ Tuple2<org.apache.hadoop.fs.Path, String> file =
createFileAndFillWithData(hdfsURI, "**file", i, "This is test line.");
filesCreated.add(file.f0);
}
// create the files to be kept
for (int i = 0; i < NO_OF_FILES; i++) {
- Tuple2<org.apache.hadoop.fs.Path, String> file =
fillWithData(hdfsURI, "file", i, "This is test line.");
+ Tuple2<org.apache.hadoop.fs.Path, String> file =
+ createFileAndFillWithData(hdfsURI, "file", i,
"This is test line.");
filesCreated.add(file.f0);
+ filesKept.add(file.f0.getName());
}
TextInputFormat format = new TextInputFormat(new Path(hdfsURI));
format.setFilesFilter(new PathFilter());
+
ContinuousFileMonitoringFunction<String> monitoringFunction =
new ContinuousFileMonitoringFunction<>(format, hdfsURI,
FileProcessingMode.PROCESS_ONCE, 1, INTERVAL);
+ final FileVerifyingSourceContext context =
+ new FileVerifyingSourceContext(new OneShotLatch(),
monitoringFunction, 0, -1);
+
monitoringFunction.open(new Configuration());
- monitoringFunction.run(new
TestingSourceContext(monitoringFunction, uniqFilesFound));
+ monitoringFunction.run(context);
- Assert.assertEquals(NO_OF_FILES, uniqFilesFound.size());
- for(int i = 0; i < NO_OF_FILES; i++) {
- org.apache.hadoop.fs.Path file = new
org.apache.hadoop.fs.Path(hdfsURI + "/file" + i);
-
Assert.assertTrue(uniqFilesFound.contains(file.toString()));
- }
+ Assert.assertArrayEquals(filesKept.toArray(),
context.getSeenFiles().toArray());
- for(org.apache.hadoop.fs.Path file: filesCreated) {
+ // finally delete the files created for the test.
+ for (org.apache.hadoop.fs.Path file: filesCreated) {
hdfs.delete(file, false);
}
}
+ private static class PathFilter extends FilePathFilter {
+ @Override
+ public boolean filterPath(Path filePath) {
+ return filePath.getName().startsWith("**");
+ }
+ }
+
@Test
- public void testFileSplitMonitoringReprocessWithAppended() throws
Exception {
- final Set<String> uniqFilesFound = new HashSet<>();
+ public void testSortingOnModTime() throws Exception {
+ final long[] modTimes = new long[NO_OF_FILES];
+ final org.apache.hadoop.fs.Path[] filesCreated = new
org.apache.hadoop.fs.Path[NO_OF_FILES];
+
+ // create some files
+ for (int i = 0; i < NO_OF_FILES; i++) {
+ Tuple2<org.apache.hadoop.fs.Path, String> file =
+ createFileAndFillWithData(hdfsURI, "file", i,
"This is test line.");
+ Thread.sleep(10);
+
+ filesCreated[i] = file.f0;
+ modTimes[i] =
hdfs.getFileStatus(file.f0).getModificationTime();
+ }
+
+ TextInputFormat format = new TextInputFormat(new Path(hdfsURI));
+ format.setFilesFilter(FilePathFilter.createDefaultFilter());
- FileCreator fc = new FileCreator(INTERVAL, NO_OF_FILES);
- fc.start();
+ // this is just to verify that all splits have been forwarded
later.
+ FileInputSplit[] splits = format.createInputSplits(1);
- Thread t = new Thread(new Runnable() {
+ ContinuousFileMonitoringFunction<String> monitoringFunction =
+ new ContinuousFileMonitoringFunction<>(format, hdfsURI,
+ FileProcessingMode.PROCESS_ONCE, 1, INTERVAL);
+
+ ModTimeVerifyingSourceContext context = new
ModTimeVerifyingSourceContext(modTimes);
+
+ monitoringFunction.open(new Configuration());
+ monitoringFunction.run(context);
+ Assert.assertEquals(splits.length, context.getCounter());
+
+ // delete the created files.
+ for (int i = 0; i < NO_OF_FILES; i++) {
+ hdfs.delete(filesCreated[i], false);
+ }
+ }
+
+ @Test
+ public void testProcessOnce() throws Exception {
+ final OneShotLatch latch = new OneShotLatch();
+
+ // create a single file in the directory
+ Tuple2<org.apache.hadoop.fs.Path, String> bootstrap =
+ createFileAndFillWithData(hdfsURI, "file", NO_OF_FILES
+ 1, "This is test line.");
+ Assert.assertTrue(hdfs.exists(bootstrap.f0));
+
+ // the source is supposed to read only this file.
+ final Set<String> filesToBeRead = new TreeSet<>();
+ filesToBeRead.add(bootstrap.f0.getName());
+
+ TextInputFormat format = new TextInputFormat(new Path(hdfsURI));
+ format.setFilesFilter(FilePathFilter.createDefaultFilter());
+
+ final ContinuousFileMonitoringFunction<String>
monitoringFunction =
+ new ContinuousFileMonitoringFunction<>(format, hdfsURI,
+ FileProcessingMode.PROCESS_ONCE, 1, INTERVAL);
+
+ final FileVerifyingSourceContext context =
+ new FileVerifyingSourceContext(latch,
monitoringFunction, 1, -1);
+
+ final Thread t = new Thread() {
@Override
public void run() {
- TextInputFormat format = new
TextInputFormat(new Path(hdfsURI));
-
format.setFilesFilter(FilePathFilter.createDefaultFilter());
- ContinuousFileMonitoringFunction<String>
monitoringFunction =
- new
ContinuousFileMonitoringFunction<>(format, hdfsURI,
-
FileProcessingMode.PROCESS_CONTINUOUSLY, 1, INTERVAL);
-
try {
monitoringFunction.open(new
Configuration());
- monitoringFunction.run(new
TestingSourceContext(monitoringFunction, uniqFilesFound));
+ monitoringFunction.run(context);
} catch (Exception e) {
- // do nothing as we interrupted the
thread.
+ Assert.fail(e.getMessage());
}
}
- });
+ };
t.start();
- // wait until the sink also sees all the splits.
- synchronized (uniqFilesFound) {
- uniqFilesFound.wait();
+ if (!latch.isTriggered()) {
+ latch.await();
}
- t.interrupt();
- fc.join();
- Assert.assertEquals(NO_OF_FILES, fc.getFilesCreated().size());
- Assert.assertEquals(NO_OF_FILES, uniqFilesFound.size());
-
- Set<org.apache.hadoop.fs.Path> filesCreated =
fc.getFilesCreated();
- Set<String> fileNamesCreated = new HashSet<>();
- for (org.apache.hadoop.fs.Path path: fc.getFilesCreated()) {
- fileNamesCreated.add(path.toString());
+ // create some additional files that would be processed in the
case of PROCESS_CONTINUOUSLY
+ final org.apache.hadoop.fs.Path[] filesCreated = new
org.apache.hadoop.fs.Path[NO_OF_FILES];
+ for (int i = 0; i < NO_OF_FILES; i++) {
+ Tuple2<org.apache.hadoop.fs.Path, String> ignoredFile =
+ createFileAndFillWithData(hdfsURI, "file", i,
"This is test line.");
+ filesCreated[i] = ignoredFile.f0;
}
- for(String file: uniqFilesFound) {
- Assert.assertTrue(fileNamesCreated.contains(file));
- }
+ // wait until the monitoring thread exits
+ t.join();
- for(org.apache.hadoop.fs.Path file: filesCreated) {
- hdfs.delete(file, false);
+ Assert.assertArrayEquals(filesToBeRead.toArray(),
context.getSeenFiles().toArray());
+
+ // finally delete the files created for the test.
+ hdfs.delete(bootstrap.f0, false);
+ for (org.apache.hadoop.fs.Path path: filesCreated) {
+ hdfs.delete(path, false);
}
}
@Test
- public void testFileSplitMonitoringProcessOnce() throws Exception {
- Set<String> uniqFilesFound = new HashSet<>();
-
- FileCreator fc = new FileCreator(INTERVAL, 1);
- Set<org.apache.hadoop.fs.Path> filesCreated =
fc.getFilesCreated();
- fc.start();
-
- // to make sure that at least one file is created
- if (filesCreated.size() == 0) {
- synchronized (filesCreated) {
- if (filesCreated.size() == 0) {
- filesCreated.wait();
- }
- }
- }
- Assert.assertTrue(fc.getFilesCreated().size() >= 1);
+ public void testProcessContinuously() throws Exception {
+ final OneShotLatch latch = new OneShotLatch();
+
+ // create a single file in the directory
+ Tuple2<org.apache.hadoop.fs.Path, String> bootstrap =
+ createFileAndFillWithData(hdfsURI, "file", NO_OF_FILES
+ 1, "This is test line.");
+ Assert.assertTrue(hdfs.exists(bootstrap.f0));
+
+ // the source is supposed to read only this file.
+ final Set<String> filesToBeRead = new TreeSet<>();
+ filesToBeRead.add(bootstrap.f0.getName());
TextInputFormat format = new TextInputFormat(new Path(hdfsURI));
format.setFilesFilter(FilePathFilter.createDefaultFilter());
- ContinuousFileMonitoringFunction<String> monitoringFunction =
+
+ final ContinuousFileMonitoringFunction<String>
monitoringFunction =
new ContinuousFileMonitoringFunction<>(format, hdfsURI,
- FileProcessingMode.PROCESS_ONCE, 1, INTERVAL);
+ FileProcessingMode.PROCESS_CONTINUOUSLY, 1,
INTERVAL);
- monitoringFunction.open(new Configuration());
- monitoringFunction.run(new
TestingSourceContext(monitoringFunction, uniqFilesFound));
+ final int totalNoOfFilesToBeRead = 11; // 1 for the bootstrap +
NO_OF_FILES
+ final FileVerifyingSourceContext context = new
FileVerifyingSourceContext(latch,
+ monitoringFunction, 1, totalNoOfFilesToBeRead);
- // wait until all the files are created
- fc.join();
+ final Thread t = new Thread() {
- Assert.assertEquals(NO_OF_FILES, filesCreated.size());
+ @Override
+ public void run() {
+ try {
+ monitoringFunction.open(new
Configuration());
+ monitoringFunction.run(context);
+ } catch (Exception e) {
+ Assert.fail(e.getMessage());
+ }
+ }
+ };
+ t.start();
- Set<String> fileNamesCreated = new HashSet<>();
- for (org.apache.hadoop.fs.Path path: fc.getFilesCreated()) {
- fileNamesCreated.add(path.toString());
+ if (!latch.isTriggered()) {
+ latch.await();
}
- Assert.assertTrue(uniqFilesFound.size() >= 1 &&
uniqFilesFound.size() < fileNamesCreated.size());
- for(String file: uniqFilesFound) {
- Assert.assertTrue(fileNamesCreated.contains(file));
+ // create some additional files that would be processed in the
case of PROCESS_CONTINUOUSLY
+ final org.apache.hadoop.fs.Path[] filesCreated = new
org.apache.hadoop.fs.Path[NO_OF_FILES];
+ for (int i = 0; i < NO_OF_FILES; i++) {
+ Tuple2<org.apache.hadoop.fs.Path, String> file =
+ createFileAndFillWithData(hdfsURI, "file", i,
"This is test line.");
+ filesCreated[i] = file.f0;
+ filesToBeRead.add(file.f0.getName());
}
- for(org.apache.hadoop.fs.Path file: filesCreated) {
- hdfs.delete(file, false);
+ // wait until the monitoring thread exits
+ t.join();
+
+ Assert.assertArrayEquals(filesToBeRead.toArray(),
context.getSeenFiles().toArray());
+
+ // finally delete the files created for the test.
+ hdfs.delete(bootstrap.f0, false);
+ for (org.apache.hadoop.fs.Path path: filesCreated) {
+ hdfs.delete(path, false);
}
}
- // ------------- End of Tests
+ /////////// Source Contexts Used by the
tests /////////////////
- private int getLineNo(String line) {
- String[] tkns = line.split("\\s");
- Assert.assertEquals(6, tkns.length);
- return Integer.parseInt(tkns[tkns.length - 1]);
- }
+ private static class FileVerifyingSourceContext extends
DummySourceContext {
--- End diff --
I wonder, couldn't you just use `Mockito.mock(SourceContext.class)` for
that?
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at infrastructure@apache.org or file a JIRA ticket
with INFRA.
---