Author: sslavic
Date: Tue Aug 27 21:16:15 2013
New Revision: 1517996
URL: http://svn.apache.org/r1517996
Log:
MAHOUT-1302 Made order of processing mail archives and (sub)directories
deterministic and non-OS specific - first processing files then nested
directories, just as expected by unit test
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java?rev=1517996&r1=1517995&r2=1517996&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java
Tue Aug 27 21:16:15 2013
@@ -19,6 +19,10 @@ package org.apache.mahout.text;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.io.Closeables;
+
+import org.apache.commons.io.comparator.CompositeFileComparator;
+import org.apache.commons.io.comparator.DirectoryFileComparator;
+import org.apache.commons.io.comparator.PathFileComparator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
@@ -41,6 +45,8 @@ import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.nio.charset.Charset;
+import java.util.Arrays;
+import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
@@ -71,13 +77,21 @@ public final class SequenceFilesFromMail
private static final int MAX_JOB_SPLIT_LOCATIONS = 1000000;
+ @SuppressWarnings("unchecked")
+ private static final Comparator<File> FILE_COMPARATOR = new
CompositeFileComparator(
+ DirectoryFileComparator.DIRECTORY_REVERSE,
PathFileComparator.PATH_COMPARATOR);
+
public void createSequenceFiles(MailOptions options) throws IOException {
ChunkedWriter writer = new ChunkedWriter(getConf(),
options.getChunkSize(), new Path(options.getOutputDir()));
MailProcessor processor = new MailProcessor(options, options.getPrefix(),
writer);
try {
if (options.getInput().isDirectory()) {
+ File[] inputFilesAndDirs = options.getInput().listFiles();
+ Arrays.sort(inputFilesAndDirs, FILE_COMPARATOR);
PrefixAdditionFilter filter = new PrefixAdditionFilter(processor,
writer);
- options.getInput().listFiles(filter);
+ for (File aFile : inputFilesAndDirs) {
+ filter.accept(aFile);
+ }
log.info("Parsed {} messages from {}", filter.getMessageCount(),
options.getInput().getAbsolutePath());
} else {
long start = System.currentTimeMillis();
@@ -112,7 +126,11 @@ public final class SequenceFilesFromMail
PrefixAdditionFilter nested = new PrefixAdditionFilter(
new MailProcessor(processor.getOptions(), processor.getPrefix()
+ File.separator + current.getName(), writer), writer);
- current.listFiles(nested);
+ File[] nestedInputFilesAndDirs = current.listFiles();
+ Arrays.sort(nestedInputFilesAndDirs, FILE_COMPARATOR);
+ for (File aFile : nestedInputFilesAndDirs) {
+ nested.accept(aFile);
+ }
long dirCount = nested.getMessageCount();
log.info("Parsed {} messages from directory {}", dirCount,
current.getAbsolutePath());
messageCount += dirCount;