svn commit: r1087990 - in /mahout/trunk/utils/src: main/java/org/apache/mahout/utils/vectors/lucene/ test/java/org/apache/mahout/utils/vectors/lucene/

srowen Sat, 02 Apr 2011 03:14:25 -0700

Author: srowen
Date: Sat Apr  2 10:14:00 2011
New Revision: 1087990

URL: http://svn.apache.org/viewvc?rev=1087990&view=rev
Log:
MAHOUT-644 check for term vectors in field, obey Iterator contract


Added:
    
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
Modified:
    
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
    
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
    
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java

Modified: 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java?rev=1087990&r1=1087989&r2=1087990&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
 (original)
+++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
 Sat Apr  2 10:14:00 2011
@@ -18,32 +18,23 @@
 package org.apache.mahout.utils.vectors.lucene;
 
 import java.io.IOException;
-import java.util.Collections;
 import java.util.Iterator;
 
-import com.google.common.base.Preconditions;
-import org.apache.lucene.document.FieldSelector;
-import org.apache.lucene.document.SetBasedFieldSelector;
 import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.TermDocs;
-import org.apache.mahout.math.NamedVector;
 import org.apache.mahout.math.Vector;
 
 /**
- * A LuceneIterable is an Iterable&lt;Vector&gt; that uses a Lucene index as 
the source for creating the
- * {@link Vector}. The Field used to create the Vector currently must have 
Term Vectors stored for it.
+ * {@link Iterable} counterpart to {@link LuceneIterator}.
  */
-public class LuceneIterable implements Iterable<Vector> {
+public final class LuceneIterable implements Iterable<Vector> {
 
   public static final double NO_NORMALIZING = -1.0;
 
   private final IndexReader indexReader;
   private final String field;
   private final String idField;
-  private final FieldSelector idFieldSelector;
-  
   private final VectorMapper mapper;
-  private double normPower = NO_NORMALIZING;
+  private final double normPower;
 
   public LuceneIterable(IndexReader reader, String idField, String field, 
VectorMapper mapper) {
     this(reader, idField, field, mapper, NO_NORMALIZING);
@@ -52,27 +43,14 @@ public class LuceneIterable implements I
   /**
    * Produce a LuceneIterable that can create the Vector plus normalize it.
    * 
-   * @param reader
-   *          The {@link org.apache.lucene.index.IndexReader} to read the 
documents from.
-   * @param idField
-   *          - The Field containing the id. May be null
-   * @param field
-   *          The field to use for the Vector
-   * @param mapper
-   *          The {@link org.apache.mahout.utils.vectors.lucene.VectorMapper} 
for creating
-   *          {@link org.apache.mahout.math.Vector}s from Lucene's TermVectors.
-   * @param normPower
-   *          The normalization value. Must be greater than or equal to 0 or 
equal to {@link #NO_NORMALIZING}
+   * @param indexReader {@link org.apache.lucene.index.IndexReader} to read 
the documents from.
+   * @param idField field containing the id. May be null.
+   * @param field  field to use for the Vector
+   * @param mapper {@link VectorMapper} for creating {@link Vector}s from 
Lucene's TermVectors.
+   * @param normPower the normalization value. Must be nonnegative, or {@link 
#NO_NORMALIZING}
    */
-  public LuceneIterable(IndexReader reader,
-                        String idField,
-                        String field,
-                        VectorMapper mapper,
-                        double normPower) {
-    Preconditions.checkArgument(normPower == NO_NORMALIZING || normPower >= 0,
-        "If specified normPower must be nonnegative", normPower);
-    idFieldSelector = new 
SetBasedFieldSelector(Collections.singleton(idField), 
Collections.<String>emptySet());
-    this.indexReader = reader;
+  public LuceneIterable(IndexReader indexReader, String idField, String field, 
VectorMapper mapper, double normPower) {
+    this.indexReader = indexReader;
     this.idField = idField;
     this.field = field;
     this.mapper = mapper;
@@ -82,66 +60,9 @@ public class LuceneIterable implements I
   @Override
   public Iterator<Vector> iterator() {
     try {
-      return new TDIterator();
+      return new LuceneIterator(indexReader, idField, field, mapper, 
normPower);
     } catch (IOException e) {
       throw new IllegalStateException(e);
     }
   }
-  
-  private final class TDIterator implements Iterator<Vector> {
-    private final TermDocs termDocs;
-    
-    private TDIterator() throws IOException {
-      // term docs(null) is a better way of iterating all the docs in Lucene
-      this.termDocs = indexReader.termDocs(null);
-    }
-    
-    @Override
-    public boolean hasNext() {
-      // TODO this doesn't work with the Iterator contract -- hasNext() cannot 
have a side effect
-      try {
-        return termDocs.next();
-      } catch (IOException e) {
-        throw new IllegalStateException(e);
-      }
-    }
-    
-    @Override
-    public Vector next() {
-      Vector result;
-      int doc = termDocs.doc();
-      //
-      try {
-        indexReader.getTermFreqVector(doc, field, mapper);
-        mapper.setDocumentNumber(doc);
-        result = mapper.getVector();
-        if (result == null) {
-          return null;
-        }
-        String name;
-        if (idField != null) {
-          name = indexReader.document(doc, idFieldSelector).get(idField);
-        } else {
-          name = String.valueOf(doc);
-        }
-        if (normPower == NO_NORMALIZING) {
-          result = new NamedVector(result, name);
-        } else {
-          result = new NamedVector(result.normalize(normPower), name);
-        }
-      } catch (IOException e) {
-        // Log?
-        throw new IllegalStateException(e);
-      }
-      
-      return result;
-    }
-    
-    @Override
-    public void remove() {
-      throw new UnsupportedOperationException();
-    }
-    
-  }
-  
 }

Added: 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java?rev=1087990&view=auto
==============================================================================
--- 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
 (added)
+++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
 Sat Apr  2 10:14:00 2011
@@ -0,0 +1,142 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.lucene;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+import com.google.common.base.Preconditions;
+import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.document.SetBasedFieldSelector;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.TermFreqVector;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.Vector;
+
+/**
+ * An {@link Iterator} over {@link Vector}s that uses a Lucene index as the 
source for creating the
+ * {@link Vector}s. The field used to create the vectors currently must have 
term vectors stored for it.
+ */
+public final class LuceneIterator implements Iterator<Vector> {
+
+  private final IndexReader indexReader;
+  private final String field;
+  private final String idField;
+  private final FieldSelector idFieldSelector;
+  private final VectorMapper mapper;
+  private final double normPower;
+  private final TermDocs termDocs;
+  private Vector current;
+  private boolean available;
+
+  /**
+   * Produce a LuceneIterator that can create the Vector plus normalize it.
+   *
+   * @param indexReader {@link org.apache.lucene.index.IndexReader} to read 
the documents from.
+   * @param idField field containing the id. May be null.
+   * @param field  field to use for the Vector
+   * @param mapper {@link VectorMapper} for creating {@link Vector}s from 
Lucene's TermVectors.
+   * @param normPower the normalization value. Must be nonnegative, or {@link 
LuceneIterable#NO_NORMALIZING}
+   */
+  public LuceneIterator(IndexReader indexReader,
+                        String idField,
+                        String field,
+                        VectorMapper mapper,
+                        double normPower) throws IOException {
+    // normPower must be nonnegative unless it is the NO_NORMALIZING sentinel
+    Preconditions.checkArgument(normPower == LuceneIterable.NO_NORMALIZING || 
normPower >= 0,
+                                "If specified normPower must be nonnegative", 
normPower);
+    idFieldSelector = new 
SetBasedFieldSelector(Collections.singleton(idField), 
Collections.<String>emptySet());
+    this.indexReader = indexReader;
+    this.idField = idField;
+    this.field = field;
+    this.mapper = mapper;
+    this.normPower = normPower;
+    // term docs(null) is a better way of iterating all the docs in Lucene
+    this.termDocs = indexReader.termDocs(null);
+    current = null;
+    available = false;
+  }
+
+  private void readVector() throws IOException {
+    available = termDocs.next();
+    if (!available) {
+      current = null;
+      return;
+    }
+    int doc = termDocs.doc();
+    TermFreqVector termFreqVector = indexReader.getTermFreqVector(doc, field);
+    if (termFreqVector == null) {
+      throw new IllegalStateException("Field '" + field + "' does not have 
term vectors");
+    }
+
+    indexReader.getTermFreqVector(doc, field, mapper);
+    mapper.setDocumentNumber(doc);
+    Vector result = mapper.getVector();
+    if (result == null) {
+      // TODO is this right? like the last version, this still yields null from the iteration
+      // in this case (available stays true), though that may not be desirable
+      current = null;
+      return;
+    }
+    String name;
+    if (idField != null) {
+      name = indexReader.document(doc, idFieldSelector).get(idField);
+    } else {
+      name = String.valueOf(doc);
+    }
+    if (normPower == LuceneIterable.NO_NORMALIZING) {
+      result = new NamedVector(result, name);
+    } else {
+      result = new NamedVector(result.normalize(normPower), name);
+    }
+    current = result;
+  }
+
+  @Override
+  public boolean hasNext() {
+    if (!available) {
+      try {
+        readVector();
+      } catch (IOException e) {
+        throw new IllegalStateException(e);
+      }
+    }
+    return available;
+  }
+
+  @Override
+  public Vector next() {
+    if (!hasNext()) {
+      throw new NoSuchElementException();
+    }
+    Vector next = current;
+    current = null;
+    available = false;
+    return next;
+  }
+
+  @Override
+  public void remove() {
+    throw new UnsupportedOperationException();
+  }
+
+}

Modified: 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java?rev=1087990&r1=1087989&r2=1087990&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
 (original)
+++ 
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
 Sat Apr  2 10:14:00 2011
@@ -30,10 +30,7 @@ import org.apache.mahout.vectorizer.Weig
  * Not thread-safe
  */
 public class TFDFMapper extends VectorMapper {
-  
-  //public static final int DEFAULT_CACHE_SIZE = 256;
-  
-  //private final IndexReader reader; // TODO never used?
+
   private Vector vector;
   
   private final Weight weight;
@@ -43,7 +40,6 @@ public class TFDFMapper extends VectorMa
   private final int numDocs;
   
   public TFDFMapper(IndexReader reader, Weight weight, TermInfo termInfo) {
-    //this.reader = reader;
     this.weight = weight;
     this.termInfo = termInfo;
     this.numDocs = reader.numDocs();

Modified: 
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java?rev=1087990&r1=1087989&r2=1087990&view=diff
==============================================================================
--- 
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
 (original)
+++ 
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
 Sat Apr  2 10:14:00 2011
@@ -33,6 +33,9 @@ import org.apache.mahout.vectorizer.TFID
 import org.apache.mahout.vectorizer.Weight;
 import org.junit.Test;
 
+import java.io.IOException;
+import java.util.Iterator;
+
 public final class LuceneIterableTest extends MahoutTestCase {
 
   private static final String [] DOCS = {
@@ -44,26 +47,11 @@ public final class LuceneIterableTest ex
   };
 
   private RAMDirectory directory;
-  
+
   @Override
   public void setUp() throws Exception {
     super.setUp();
-    directory = new RAMDirectory();
-    IndexWriter writer = new IndexWriter(
-        directory,
-        new StandardAnalyzer(Version.LUCENE_30),
-        true,
-        IndexWriter.MaxFieldLength.UNLIMITED);
-    for (int i = 0; i < LuceneIterableTest.DOCS.length; i++){
-      Document doc = new Document();
-      Fieldable id = new Field("id", "doc_" + i, Field.Store.YES, 
Field.Index.NOT_ANALYZED_NO_NORMS);
-      doc.add(id);
-      //Store both position and offset information
-      Fieldable text = new Field("content", DOCS[i], Field.Store.NO, 
Field.Index.ANALYZED, Field.TermVector.YES);
-      doc.add(text);
-      writer.addDocument(doc);
-    }
-    writer.close();
+    directory = createTestIndex(Field.TermVector.YES);
   }
 
   @Test
@@ -79,7 +67,7 @@ public final class LuceneIterableTest ex
       assertNotNull(vector);
       assertTrue("vector is not an instanceof " + NamedVector.class, vector 
instanceof NamedVector);
       assertTrue("vector Size: " + vector.size() + " is not greater than: " + 
0, vector.size() > 0);
-      assertTrue(((NamedVector)vector).getName().startsWith("doc_"));
+      assertTrue(((NamedVector) vector).getName().startsWith("doc_"));
     }
 
     iterable = new LuceneIterable(reader, "id", "content", mapper, 3);
@@ -89,10 +77,43 @@ public final class LuceneIterableTest ex
       assertNotNull(vector);
       assertTrue("vector is not an instanceof " + NamedVector.class, vector 
instanceof NamedVector);
       assertTrue("vector Size: " + vector.size() + " is not greater than: " + 
0, vector.size() > 0);
-      assertTrue(((NamedVector)vector).getName().startsWith("doc_"));
+      assertTrue(((NamedVector) vector).getName().startsWith("doc_"));
     }
 
   }
-  
-  
+
+  @Test(expected = IllegalStateException.class)
+  public void testIterable_noTermVectors() throws IOException {
+    RAMDirectory directory = createTestIndex(Field.TermVector.NO);
+
+    IndexReader reader = IndexReader.open(directory, true);
+    Weight weight = new TFIDF();
+    TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
+    VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
+    LuceneIterable iterable = new LuceneIterable(reader, "id", "content", 
mapper);
+
+    Iterator<Vector> iterator = iterable.iterator();
+    iterator.hasNext();
+    iterator.next();
+  }
+
+  private static RAMDirectory createTestIndex(Field.TermVector termVector) 
throws IOException {
+    RAMDirectory directory = new RAMDirectory();
+    IndexWriter writer = new IndexWriter(
+        directory,
+        new StandardAnalyzer(Version.LUCENE_30),
+        true,
+        IndexWriter.MaxFieldLength.UNLIMITED);
+    for (int i = 0; i < LuceneIterableTest.DOCS.length; i++) {
+      Document doc = new Document();
+      Fieldable id = new Field("id", "doc_" + i, Field.Store.YES, 
Field.Index.NOT_ANALYZED_NO_NORMS);
+      doc.add(id);
+      // Term vector storage for the content field is set by the termVector argument
+      Fieldable text = new Field("content", DOCS[i], Field.Store.NO, 
Field.Index.ANALYZED, termVector);
+      doc.add(text);
+      writer.addDocument(doc);
+    }
+    writer.close();
+    return directory;
+  }
 }

svn commit: r1087990 - in /mahout/trunk/utils/src: main/java/org/apache/mahout/utils/vectors/lucene/ test/java/org/apache/mahout/utils/vectors/lucene/

Reply via email to