Author: isabel
Date: Thu Dec 16 09:31:06 2010
New Revision: 1049842

URL: http://svn.apache.org/viewvc?rev=1049842&view=rev
Log:
MAHOUT-560 - allow for more flexible file handling when converting text
files to sequence files.

Modified:
    mahout/trunk/utils/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java

Modified: mahout/trunk/utils/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java
URL: http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java?rev=1049842&r1=1049841&r2=1049842&view=diff
==============================================================================
--- mahout/trunk/utils/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java (original)
+++ mahout/trunk/utils/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java Thu Dec 16 09:31:06 2010
@@ -22,6 +22,8 @@ import java.io.File;
 import java.io.FileFilter;
 import java.io.FileNotFoundException;
 import java.io.IOException;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
 import java.nio.charset.Charset;
 
 import org.apache.commons.cli2.CommandLine;
@@ -63,9 +65,17 @@ public final class SequenceFilesFromDire
                                   String outputDir,
                                   String prefix,
                                   int chunkSizeInMB,
-                                  Charset charset) throws IOException {
+                                  Charset charset,
+                                  String filter) throws IOException, ClassNotFoundException, NoSuchMethodException, InvocationTargetException, IllegalAccessException, InstantiationException {
     ChunkedWriter writer = createNewChunkedWriter(chunkSizeInMB, outputDir);
-    parentDir.listFiles(new PrefixAdditionFilter(prefix, writer, charset));
+    if ("PrefixAdditionFilter".equals(filter)) {
+      parentDir.listFiles(new PrefixAdditionFilter(prefix, writer, charset));
+    } else {
+      Class filterClass = Class.forName(filter);
+      Constructor<FileFilter> constructor = filterClass.getConstructor(String.class, ChunkedWriter.class, Charset.class);
+      FileFilter fileFilter = constructor.newInstance(prefix, writer, charset);
+      parentDir.listFiles(fileFilter);
+    }
     writer.close();
   }
   
@@ -173,12 +183,16 @@ public final class SequenceFilesFromDire
     Option charsetOpt = obuilder.withLongName("charset").withRequired(true).withArgument(
       abuilder.withName("charset").withMinimum(1).withMaximum(1).create()).withDescription(
       "The name of the character encoding of the input files").withShortName("c").create();
+
+    Option fileFilterOpt = obuilder.withLongName("fileFilterClass").withArgument(
+      abuilder.withName("fileFilterClass").withMinimum(1).withMaximum(1).create()).withDescription(
+      "The name of the class to use for file parsing. Default: PrefixAdditionFilter").withShortName("filter").create();
     
     Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h")
         .create();
     
     Group group = gbuilder.withName("Options").withOption(keyPrefixOpt).withOption(chunkSizeOpt).withOption(
-      charsetOpt).withOption(outputDirOpt).withOption(helpOpt).withOption(parentOpt).create();
+      charsetOpt).withOption(outputDirOpt).withOption(fileFilterOpt).withOption(helpOpt).withOption(parentOpt).create();
     
     try {
       Parser parser = new Parser();
@@ -201,10 +215,16 @@ public final class SequenceFilesFromDire
       if (cmdLine.hasOption(keyPrefixOpt)) {
         prefix = (String) cmdLine.getValue(keyPrefixOpt);
       }
+
+      String filter = "PrefixAdditionFilter";
+      if (cmdLine.hasOption(fileFilterOpt)) {
+        filter = (String) cmdLine.getValue(fileFilterOpt);
+      }
+
       Charset charset = Charset.forName((String) cmdLine.getValue(charsetOpt));
       SequenceFilesFromDirectory dir = new SequenceFilesFromDirectory();
       
-      dir.createSequenceFiles(parentDir, outputDir, prefix, chunkSize, charset);
+      dir.createSequenceFiles(parentDir, outputDir, prefix, chunkSize, charset, filter);
     } catch (OptionException e) {
       log.error("Exception", e);
       CommandLineUtil.printHelp(group);


Reply via email to