lucene: Driver.java LuceneIterable.java

srowen Thu, 02 Jun 2011 06:12:22 -0700

Author: srowen
Date: Thu Jun  2 13:11:46 2011
New Revision: 1130535

URL: http://svn.apache.org/viewvc?rev=1130535&view=rev
Log:
MAHOUT-671 make Lucene driver configuration use a POJO


Modified:
    
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
    
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java

Modified: 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java?rev=1130535&r1=1130534&r2=1130535&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
 (original)
+++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/Driver.java
 Thu Jun  2 13:11:46 2011
@@ -22,6 +22,7 @@ import java.io.IOException;
 import java.io.Writer;
 
 import com.google.common.base.Charsets;
+import com.google.common.base.Preconditions;
 import com.google.common.io.Files;
 import org.apache.commons.cli2.CommandLine;
 import org.apache.commons.cli2.Group;
@@ -52,9 +53,70 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 public final class Driver {
+
   private static final Logger log = LoggerFactory.getLogger(Driver.class);
 
-  private Driver() { }
+  private String luceneDir;
+  private String outFile;
+  private String field;
+  private String idField;
+  private String dictOut;
+  private String weightType = "tfidf";
+  private String delimiter = "\t";
+  private double norm = LuceneIterable.NO_NORMALIZING;
+  private long maxDocs = Long.MAX_VALUE;
+  private int minDf = 1;
+  private int maxDFPercent = 99;
+  private double maxPercentErrorDocs = 0.0;
+
+  public void dumpVectors() throws IOException {
+
+    File file = new File(luceneDir);
+    Preconditions.checkArgument(file.isDirectory(),
+                                "Lucene directory: " + file.getAbsolutePath()
+                                    + " does not exist or is not a directory");
+    Preconditions.checkArgument(maxDocs >= 0, "maxDocs must be >= 0");
+    Preconditions.checkArgument(minDf >= 1, "minDf must be >= 1");
+    Preconditions.checkArgument(maxDFPercent <= 99, "maxDFPercent must be <= 
99");
+
+    Directory dir = FSDirectory.open(file);
+    IndexReader reader = IndexReader.open(dir, true);
+
+    Weight weight;
+    if ("tf".equalsIgnoreCase(weightType)) {
+      weight = new TF();
+    } else if ("tfidf".equalsIgnoreCase(weightType)) {
+      weight = new TFIDF();
+    } else {
+      throw new IllegalArgumentException("Weight type " + weightType + " is 
not supported");
+    }
+
+    TermInfo termInfo = new CachedTermInfo(reader, field, minDf, maxDFPercent);
+    VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
+
+    LuceneIterable iterable;
+    if (norm == LuceneIterable.NO_NORMALIZING) {
+      iterable = new LuceneIterable(reader, idField, field, mapper, 
LuceneIterable.NO_NORMALIZING, maxPercentErrorDocs);
+    } else {
+      iterable = new LuceneIterable(reader, idField, field, mapper, norm, 
maxPercentErrorDocs);
+    }
+
+    log.info("Output File: {}", outFile);
+
+    VectorWriter vectorWriter = getSeqFileWriter(outFile);
+
+    long numDocs = vectorWriter.write(iterable, maxDocs);
+    vectorWriter.close();
+    log.info("Wrote: {} vectors", numDocs);
+
+    File dictOutFile = new File(dictOut);
+    log.info("Dictionary Output file: {}", dictOutFile);
+    Writer writer = Files.newWriter(dictOutFile, Charsets.UTF_8);
+    DelimitedTermInfoWriter tiWriter = new DelimitedTermInfoWriter(writer, 
delimiter, field);
+    tiWriter.write(termInfo);
+    tiWriter.close();
+    writer.close();
+  }
 
   public static void main(String[] args) throws IOException {
 
@@ -63,60 +125,66 @@ public final class Driver {
     GroupBuilder gbuilder = new GroupBuilder();
 
     Option inputOpt = 
obuilder.withLongName("dir").withRequired(true).withArgument(
-      abuilder.withName("dir").withMinimum(1).withMaximum(1).create())
+        abuilder.withName("dir").withMinimum(1).withMaximum(1).create())
         .withDescription("The Lucene directory").withShortName("d").create();
 
     Option outputOpt = 
obuilder.withLongName("output").withRequired(true).withArgument(
-      
abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription("The
 output file")
+        
abuilder.withName("output").withMinimum(1).withMaximum(1).create()).withDescription("The
 output file")
         .withShortName("o").create();
 
     Option fieldOpt = 
obuilder.withLongName("field").withRequired(true).withArgument(
-      
abuilder.withName("field").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The field in the index").withShortName("f").create();
+        
abuilder.withName("field").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The field in the index").withShortName("f").create();
 
     Option idFieldOpt = 
obuilder.withLongName("idField").withRequired(false).withArgument(
-      
abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The field in the index containing the index.  If null, then the Lucene 
internal doc "
-          + "id is used which is prone to error if the underlying index 
changes").withShortName("i").create();
+        
abuilder.withName("idField").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The field in the index containing the index.  If null, then the 
Lucene internal doc "
+            + "id is used which is prone to error if the underlying index 
changes").withShortName("i").create();
 
     Option dictOutOpt = 
obuilder.withLongName("dictOut").withRequired(true).withArgument(
-      
abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The output of the dictionary").withShortName("t").create();
+        
abuilder.withName("dictOut").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The output of the dictionary").withShortName("t").create();
 
     Option weightOpt = 
obuilder.withLongName("weight").withRequired(false).withArgument(
-      
abuilder.withName("weight").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The kind of weight to use. Currently TF or 
TFIDF").withShortName("w").create();
+        
abuilder.withName("weight").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The kind of weight to use. Currently TF or 
TFIDF").withShortName("w").create();
 
     Option delimiterOpt = 
obuilder.withLongName("delimiter").withRequired(false).withArgument(
-      
abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The delimiter for outputting the 
dictionary").withShortName("l").create();
+        
abuilder.withName("delimiter").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The delimiter for outputting the 
dictionary").withShortName("l").create();
 
     Option powerOpt = 
obuilder.withLongName("norm").withRequired(false).withArgument(
-      
abuilder.withName("norm").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The norm to use, expressed as either a double or \"INF\" if you want to 
use the Infinite norm.  "
-          + "Must be greater or equal to 0.  The default is not to 
normalize").withShortName("n").create();
+        
abuilder.withName("norm").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The norm to use, expressed as either a double or \"INF\" if you want 
to use the Infinite norm.  "
+            + "Must be greater or equal to 0.  The default is not to 
normalize").withShortName("n").create();
 
     Option maxOpt = 
obuilder.withLongName("max").withRequired(false).withArgument(
-      
abuilder.withName("max").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The maximum number of vectors to output.  If not specified, then it 
will loop over all docs")
+        
abuilder.withName("max").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The maximum number of vectors to output.  If not specified, then it 
will loop over all docs")
         .withShortName("m").create();
 
     Option minDFOpt = 
obuilder.withLongName("minDF").withRequired(false).withArgument(
-      
abuilder.withName("minDF").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The minimum document frequency.  Default is 
1").withShortName("md").create();
+        
abuilder.withName("minDF").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The minimum document frequency.  Default is 
1").withShortName("md").create();
 
     Option maxDFPercentOpt = 
obuilder.withLongName("maxDFPercent").withRequired(false).withArgument(
-      
abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create()).withDescription(
-      "The max percentage of docs for the DF.  Can be used to remove really 
high frequency terms."
-          + "  Expressed as an integer between 0 and 100. Default is 
99.").withShortName("x").create();
+        
abuilder.withName("maxDFPercent").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The max percentage of docs for the DF.  Can be used to remove really 
high frequency terms."
+            + "  Expressed as an integer between 0 and 100. Default is 
99.").withShortName("x").create();
+
+    Option maxPercentErrorDocsOpt = 
obuilder.withLongName("maxPercentErrorDocs").withRequired(false).withArgument(
+        
abuilder.withName("maxPercentErrorDocs").withMinimum(1).withMaximum(1).create()).withDescription(
+        "The max percentage of docs that can have a null term vector. These 
are noise document and can occur if the " +
+            "analyzer used strips out all terms in the target field. This 
percentage is expressed as a value between 0 and 1. " +
+            "The default is 0.").withShortName("err").create();
 
     Option helpOpt = obuilder.withLongName("help").withDescription("Print out 
help").withShortName("h")
         .create();
 
     Group group = 
gbuilder.withName("Options").withOption(inputOpt).withOption(idFieldOpt).withOption(
-      
outputOpt).withOption(delimiterOpt).withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt)
+        
outputOpt).withOption(delimiterOpt).withOption(helpOpt).withOption(fieldOpt).withOption(maxOpt)
         
.withOption(dictOutOpt).withOption(powerOpt).withOption(maxDFPercentOpt)
-        .withOption(weightOpt).withOption(minDFOpt).create();
+        
.withOption(weightOpt).withOption(minDFOpt).withOption(maxPercentErrorDocsOpt).create();
 
     try {
       Parser parser = new Parser();
@@ -124,99 +192,57 @@ public final class Driver {
       CommandLine cmdLine = parser.parse(args);
 
       if (cmdLine.hasOption(helpOpt)) {
-        
+
         CommandLineUtil.printHelp(group);
         return;
       }
-      // Springify all this
+
       if (cmdLine.hasOption(inputOpt)) { // Lucene case
-        File file = new File(cmdLine.getValue(inputOpt).toString());
-        if (!file.isDirectory()) {
-          throw new IllegalArgumentException("Lucene directory: " + 
file.getAbsolutePath()
-              +  " does not exist or is not a directory");
-        }
+        Driver luceneDriver = new Driver();
+        luceneDriver.setLuceneDir(cmdLine.getValue(inputOpt).toString());
 
-        long maxDocs = Long.MAX_VALUE;
         if (cmdLine.hasOption(maxOpt)) {
-          maxDocs = Long.parseLong(cmdLine.getValue(maxOpt).toString());
-        }
-        if (maxDocs < 0) {
-          throw new IllegalArgumentException("maxDocs must be >= 0");
+          
luceneDriver.setMaxDocs(Long.parseLong(cmdLine.getValue(maxOpt).toString()));
         }
 
-        Directory dir = FSDirectory.open(file);
-        IndexReader reader = IndexReader.open(dir, true);
-
-        Weight weight;
         if (cmdLine.hasOption(weightOpt)) {
-          String wString = cmdLine.getValue(weightOpt).toString();
-          if ("tf".equalsIgnoreCase(wString)) {
-            weight = new TF();
-          } else if ("tfidf".equalsIgnoreCase(wString)) {
-            weight = new TFIDF();
-          } else {
-            throw new OptionException(weightOpt);
-          }
-        } else {
-          weight = new TFIDF();
+          luceneDriver.setWeightType(cmdLine.getValue(weightOpt).toString());
         }
 
-        String field = cmdLine.getValue(fieldOpt).toString();
+        luceneDriver.setField(cmdLine.getValue(fieldOpt).toString());
 
-        int minDf = 1;
         if (cmdLine.hasOption(minDFOpt)) {
-          minDf = Integer.parseInt(cmdLine.getValue(minDFOpt).toString());
+          
luceneDriver.setMinDf(Integer.parseInt(cmdLine.getValue(minDFOpt).toString()));
         }
 
-        int maxDFPercent = 99;
         if (cmdLine.hasOption(maxDFPercentOpt)) {
-          maxDFPercent = 
Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString());
+          
luceneDriver.setMaxDFPercent(Integer.parseInt(cmdLine.getValue(maxDFPercentOpt).toString()));
         }
 
-        TermInfo termInfo = new CachedTermInfo(reader, field, minDf, 
maxDFPercent);
-        VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
-
-        double norm = LuceneIterable.NO_NORMALIZING;
         if (cmdLine.hasOption(powerOpt)) {
           String power = cmdLine.getValue(powerOpt).toString();
           if ("INF".equals(power)) {
-            norm = Double.POSITIVE_INFINITY;
+            luceneDriver.setNorm(Double.POSITIVE_INFINITY);
           } else {
-            norm = Double.parseDouble(power);
+            luceneDriver.setNorm(Double.parseDouble(power));
           }
         }
 
-        String idField = null;
         if (cmdLine.hasOption(idFieldOpt)) {
-          idField = cmdLine.getValue(idFieldOpt).toString();
+          luceneDriver.setIdField(cmdLine.getValue(idFieldOpt).toString());
         }
 
-        LuceneIterable iterable;
-        if (norm == LuceneIterable.NO_NORMALIZING) {
-          iterable = new LuceneIterable(reader, idField, field, mapper, 
LuceneIterable.NO_NORMALIZING);
-        } else {
-          iterable = new LuceneIterable(reader, idField, field, mapper, norm);
-        }
-
-        String outFile = cmdLine.getValue(outputOpt).toString();
-        log.info("Output File: {}", outFile);
-
-        VectorWriter vectorWriter = getSeqFileWriter(outFile);
-
-        long numDocs = vectorWriter.write(iterable, maxDocs);
-        vectorWriter.close();
-        log.info("Wrote: {} vectors", numDocs);
-
-        String delimiter = cmdLine.hasOption(delimiterOpt) ? 
cmdLine.getValue(delimiterOpt).toString() : "\t";
-        
-        File dictOutFile = new File(cmdLine.getValue(dictOutOpt).toString());
-        log.info("Dictionary Output file: {}", dictOutFile);
-        Writer writer = Files.newWriter(dictOutFile, Charsets.UTF_8);
-        DelimitedTermInfoWriter tiWriter = new DelimitedTermInfoWriter(writer, 
delimiter, field);
-        tiWriter.write(termInfo);
-        tiWriter.close();
-        writer.close();
+        if (cmdLine.hasOption(maxPercentErrorDocsOpt)) {
+          
luceneDriver.setMaxPercentErrorDocs(Double.parseDouble(cmdLine.getValue(maxPercentErrorDocsOpt).toString()));
+        }
+
+        luceneDriver.setOutFile(cmdLine.getValue(outputOpt).toString());
+
+        luceneDriver.setDelimiter(cmdLine.hasOption(delimiterOpt) ? 
cmdLine.getValue(delimiterOpt).toString() : "\t");
 
+        luceneDriver.setDictOut(cmdLine.getValue(dictOutOpt).toString());
+
+        luceneDriver.dumpVectors();
       }
     } catch (OptionException e) {
       log.error("Exception", e);
@@ -231,9 +257,56 @@ public final class Driver {
     // TODO: Make this parameter driven
 
     SequenceFile.Writer seqWriter = SequenceFile.createWriter(fs, conf, path, 
LongWritable.class,
-      VectorWritable.class);
+                                                              
VectorWritable.class);
 
     return new SequenceFileVectorWriter(seqWriter);
   }
 
+  public void setLuceneDir(String luceneDir) {
+    this.luceneDir = luceneDir;
+  }
+
+  public void setMaxDocs(long maxDocs) {
+    this.maxDocs = maxDocs;
+  }
+
+  public void setWeightType(String weightType) {
+    this.weightType = weightType;
+  }
+
+  public void setField(String field) {
+    this.field = field;
+  }
+
+  public void setMinDf(int minDf) {
+    this.minDf = minDf;
+  }
+
+  public void setMaxDFPercent(int maxDFPercent) {
+    this.maxDFPercent = maxDFPercent;
+  }
+
+  public void setNorm(double norm) {
+    this.norm = norm;
+  }
+
+  public void setIdField(String idField) {
+    this.idField = idField;
+  }
+
+  public void setOutFile(String outFile) {
+    this.outFile = outFile;
+  }
+
+  public void setDelimiter(String delimiter) {
+    this.delimiter = delimiter;
+  }
+
+  public void setDictOut(String dictOut) {
+    this.dictOut = dictOut;
+  }
+
+  public void setMaxPercentErrorDocs(double maxPercentErrorDocs) {
+    this.maxPercentErrorDocs = maxPercentErrorDocs;
+  }
 }

Modified: 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java?rev=1130535&r1=1130534&r2=1130535&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
 (original)
+++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
 Thu Jun  2 13:11:46 2011
@@ -53,6 +53,7 @@ public final class LuceneIterable implem
    * @param field  field to use for the Vector
    * @param mapper {@link VectorMapper} for creating {@link Vector}s from 
Lucene's TermVectors.
    * @param normPower the normalization value. Must be nonnegative, or {@link 
#NO_NORMALIZING}
+   * @param maxPercentErrorDocs the percentage of documents in the lucene 
index that can have a null term vector
    */
   public LuceneIterable(IndexReader indexReader,
                         String idField,

svn commit: r1130535 - in /mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene: Driver.java LuceneIterable.java

Reply via email to