Author: drew
Date: Mon Sep  6 01:16:38 2010
New Revision: 992920

URL: http://svn.apache.org/viewvc?rev=992920&view=rev
Log:
Throws IllegalArgumentException if input directory does not exist (instead of 
exiting silently). Added whitespace for readability.

Modified:
    
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java

Modified: 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java?rev=992920&r1=992919&r2=992920&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
 (original)
+++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
 Mon Sep  6 01:16:38 2010
@@ -56,74 +56,82 @@ import org.slf4j.LoggerFactory;
 
 public final class Driver {
   private static final Logger log = LoggerFactory.getLogger(Driver.class);
-  
+
   private Driver() { }
-  
+
   public static void main(String[] args) throws IOException {
+
     DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
     ArgumentBuilder abuilder = new ArgumentBuilder();
     GroupBuilder gbuilder = new GroupBuilder();
-    
+
     Option inputOpt = 
obuilder.withLongName("dir").withRequired(true).withArgument(
       abuilder.withName("dir").withMinimum(1).withMaximum(1).create())
         .withDescription("The Lucene directory").withShortName("d").create();
-    
+
     Option outputOpt = 
obuilder.withLongName("output").withRequired(true).withArgument(
       
abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription("The
 output file")
         .withShortName("o").create();
-    
+
     Option fieldOpt = 
obuilder.withLongName("field").withRequired(true).withArgument(
       
abuilder.withName("field").withMinimum(1).withMaximum(1).create()).withDescription(
       "The field in the index").withShortName("f").create();
-    
+
     Option idFieldOpt = 
obuilder.withLongName("idField").withRequired(false).withArgument(
       
abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).withDescription(
       "The field in the index containing the index.  If null, then the Lucene 
internal doc "
           + "id is used which is prone to error if the underlying index 
changes").withShortName("i").create();
-    
+
     Option dictOutOpt = 
obuilder.withLongName("dictOut").withRequired(true).withArgument(
       
abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).withDescription(
       "The output of the dictionary").withShortName("t").create();
-    
+
     Option weightOpt = 
obuilder.withLongName("weight").withRequired(false).withArgument(
       
abuilder.withName("weight").withMinimum(1).withMaximum(1).create()).withDescription(
       "The kind of weight to use. Currently TF or 
TFIDF").withShortName("w").create();
-    
+
     Option delimiterOpt = 
obuilder.withLongName("delimiter").withRequired(false).withArgument(
       
abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).withDescription(
       "The delimiter for outputing the 
dictionary").withShortName("l").create();
+
     Option powerOpt = 
obuilder.withLongName("norm").withRequired(false).withArgument(
       
abuilder.withName("norm").withMinimum(1).withMaximum(1).create()).withDescription(
       "The norm to use, expressed as either a double or \"INF\" if you want to 
use the Infinite norm.  "
           + "Must be greater or equal to 0.  The default is not to 
normalize").withShortName("n").create();
+
     Option maxOpt = 
obuilder.withLongName("max").withRequired(false).withArgument(
       
abuilder.withName("max").withMinimum(1).withMaximum(1).create()).withDescription(
       "The maximum number of vectors to output.  If not specified, then it 
will loop over all docs")
         .withShortName("m").create();
-    
+
     Option outWriterOpt = 
obuilder.withLongName("outputWriter").withRequired(false).withArgument(
       
abuilder.withName("outputWriter").withMinimum(1).withMaximum(1).create()).withDescription(
       "The VectorWriter to use, either seq "
           + "(SequenceFileVectorWriter - default) or file (Writes to a File 
using JSON format)")
         .withShortName("e").create();
+
     Option minDFOpt = 
obuilder.withLongName("minDF").withRequired(false).withArgument(
       
abuilder.withName("minDF").withMinimum(1).withMaximum(1).create()).withDescription(
       "The minimum document frequency.  Default is 
1").withShortName("md").create();
+
     Option maxDFPercentOpt = 
obuilder.withLongName("maxDFPercent").withRequired(false).withArgument(
       
abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create()).withDescription(
       "The max percentage of docs for the DF.  Can be used to remove really 
high frequency terms."
           + "  Expressed as an integer between 0 and 100. Default is 
99.").withShortName("x").create();
+
     Option helpOpt = obuilder.withLongName("help").withDescription("Print out 
help").withShortName("h")
         .create();
+
     Group group = 
gbuilder.withName("Options").withOption(inputOpt).withOption(idFieldOpt).withOption(
       
outputOpt).withOption(delimiterOpt).withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt)
         
.withOption(dictOutOpt).withOption(powerOpt).withOption(outWriterOpt).withOption(maxDFPercentOpt)
         .withOption(weightOpt).withOption(minDFOpt).create();
+
     try {
       Parser parser = new Parser();
       parser.setGroup(group);
       CommandLine cmdLine = parser.parse(args);
-      
+
       if (cmdLine.hasOption(helpOpt)) {
         
         CommandLineUtil.printHelp(group);
@@ -132,107 +140,121 @@ public final class Driver {
       // Springify all this
       if (cmdLine.hasOption(inputOpt)) { // Lucene case
         File file = new File(cmdLine.getValue(inputOpt).toString());
-        if (file.exists() && file.isDirectory()) {
-          long maxDocs = Long.MAX_VALUE;
-          if (cmdLine.hasOption(maxOpt)) {
-            maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString());
-          }
-          if (maxDocs < 0) {
-            throw new IllegalArgumentException("maxDocs must be >= 0");
-          }
-          Directory dir = FSDirectory.open(file);
-          IndexReader reader = IndexReader.open(dir, true);
-          Weight weight;
-          if (cmdLine.hasOption(weightOpt)) {
-            String wString = cmdLine.getValue(weightOpt).toString();
-            if (wString.equalsIgnoreCase("tf")) {
-              weight = new TF();
-            } else if (wString.equalsIgnoreCase("tfidf")) {
-              weight = new TFIDF();
-            } else {
-              throw new OptionException(weightOpt);
-            }
-          } else {
+        if (!file.isDirectory()) {
+          throw new IllegalArgumentException("Lucene directory: " + 
file.getName() + 
+              " does not exist or is not a directory");
+        }
+
+        long maxDocs = Long.MAX_VALUE;
+        if (cmdLine.hasOption(maxOpt)) {
+          maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString());
+        }
+        if (maxDocs < 0) {
+          throw new IllegalArgumentException("maxDocs must be >= 0");
+        }
+
+        Directory dir = FSDirectory.open(file);
+        IndexReader reader = IndexReader.open(dir, true);
+
+        Weight weight;
+        if (cmdLine.hasOption(weightOpt)) {
+          String wString = cmdLine.getValue(weightOpt).toString();
+          if (wString.equalsIgnoreCase("tf")) {
+            weight = new TF();
+          } else if (wString.equalsIgnoreCase("tfidf")) {
             weight = new TFIDF();
+          } else {
+            throw new OptionException(weightOpt);
           }
-          String field = cmdLine.getValue(fieldOpt).toString();
-          int minDf = 1;
-          if (cmdLine.hasOption(minDFOpt)) {
-            minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
-          }
-          int maxDFPercent = 99;
-          if (cmdLine.hasOption(maxDFPercentOpt)) {
-            maxDFPercent = 
Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
-          }
-          TermInfo termInfo = new CachedTermInfo(reader, field, minDf, 
maxDFPercent);
-          VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
-          double norm = LuceneIterable.NO_NORMALIZING;
-          if (cmdLine.hasOption(powerOpt)) {
-            String power = cmdLine.getValue(powerOpt).toString();
-            if (power.equals("INF")) {
-              norm = Double.POSITIVE_INFINITY;
-            } else {
-              norm = Double.parseDouble(power);
-            }
-          }
-          String idField = null;
-          if (cmdLine.hasOption(idFieldOpt)) {
-            idField = cmdLine.getValue(idFieldOpt).toString();
-          }
-          LuceneIterable iterable;
-          if (norm == LuceneIterable.NO_NORMALIZING) {
-            iterable = new LuceneIterable(reader, idField, field, mapper, 
LuceneIterable.NO_NORMALIZING);
+        } else {
+          weight = new TFIDF();
+        }
+
+        String field = cmdLine.getValue(fieldOpt).toString();
+
+        int minDf = 1;
+        if (cmdLine.hasOption(minDFOpt)) {
+          minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
+        }
+
+        int maxDFPercent = 99;
+        if (cmdLine.hasOption(maxDFPercentOpt)) {
+          maxDFPercent = 
Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
+        }
+
+        TermInfo termInfo = new CachedTermInfo(reader, field, minDf, 
maxDFPercent);
+        VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
+
+        double norm = LuceneIterable.NO_NORMALIZING;
+        if (cmdLine.hasOption(powerOpt)) {
+          String power = cmdLine.getValue(powerOpt).toString();
+          if (power.equals("INF")) {
+            norm = Double.POSITIVE_INFINITY;
           } else {
-            iterable = new LuceneIterable(reader, idField, field, mapper, 
norm);
+            norm = Double.parseDouble(power);
           }
-          String outFile = cmdLine.getValue(outputOpt).toString();
-          log.info("Output File: {}", outFile);
-          
-          VectorWriter vectorWriter;
-          if (cmdLine.hasOption(outWriterOpt)) {
-            String outWriter = cmdLine.getValue(outWriterOpt).toString();
-            if (outWriter.equals("file")) {
-              BufferedWriter writer = new BufferedWriter(new 
FileWriter(outFile));
-              vectorWriter = new JWriterVectorWriter(writer);
-            } else {
-              vectorWriter = getSeqFileWriter(outFile);
-            }
+        }
+
+        String idField = null;
+        if (cmdLine.hasOption(idFieldOpt)) {
+          idField = cmdLine.getValue(idFieldOpt).toString();
+        }
+
+        LuceneIterable iterable;
+        if (norm == LuceneIterable.NO_NORMALIZING) {
+          iterable = new LuceneIterable(reader, idField, field, mapper, 
LuceneIterable.NO_NORMALIZING);
+        } else {
+          iterable = new LuceneIterable(reader, idField, field, mapper, norm);
+        }
+
+        String outFile = cmdLine.getValue(outputOpt).toString();
+        log.info("Output File: {}", outFile);
+
+        VectorWriter vectorWriter;
+        if (cmdLine.hasOption(outWriterOpt)) {
+          String outWriter = cmdLine.getValue(outWriterOpt).toString();
+          if (outWriter.equals("file")) {
+            BufferedWriter writer = new BufferedWriter(new 
FileWriter(outFile));
+            vectorWriter = new JWriterVectorWriter(writer);
           } else {
             vectorWriter = getSeqFileWriter(outFile);
           }
-          
-          long numDocs = vectorWriter.write(iterable, maxDocs);
-          vectorWriter.close();
-          log.info("Wrote: {} vectors", numDocs);
-          
-          String delimiter = cmdLine.hasOption(delimiterOpt) ? 
cmdLine.getValue(delimiterOpt).toString()
-              : "\t";
-          File dictOutFile = new File(cmdLine.getValue(dictOutOpt).toString());
-          log.info("Dictionary Output file: {}", dictOutFile);
-          BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
-              new FileOutputStream(dictOutFile), Charset.forName("UTF8")));
-          JWriterTermInfoWriter tiWriter = new JWriterTermInfoWriter(writer, 
delimiter, field);
-          tiWriter.write(termInfo);
-          tiWriter.close();
-          writer.close();
+        } else {
+          vectorWriter = getSeqFileWriter(outFile);
         }
+
+        long numDocs = vectorWriter.write(iterable, maxDocs);
+        vectorWriter.close();
+        log.info("Wrote: {} vectors", numDocs);
+
+        String delimiter = cmdLine.hasOption(delimiterOpt) ? 
cmdLine.getValue(delimiterOpt).toString() : "\t";
+        
+        File dictOutFile = new File(cmdLine.getValue(dictOutOpt).toString());
+        log.info("Dictionary Output file: {}", dictOutFile);
+        BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
+            new FileOutputStream(dictOutFile), Charset.forName("UTF8")));
+        JWriterTermInfoWriter tiWriter = new JWriterTermInfoWriter(writer, 
delimiter, field);
+        tiWriter.write(termInfo);
+        tiWriter.close();
+        writer.close();
+
       }
-      
     } catch (OptionException e) {
       log.error("Exception", e);
       CommandLineUtil.printHelp(group);
     }
   }
-  
+
   private static VectorWriter getSeqFileWriter(String outFile) throws 
IOException {
     Path path = new Path(outFile);
     Configuration conf = new Configuration();
     FileSystem fs = FileSystem.get(conf);
     // TODO: Make this parameter driven
+
     SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, 
LongWritable.class,
       VectorWritable.class);
-    
+
     return new SequenceFileVectorWriter(seqWriter);
   }
-  
+
 }


Reply via email to