Author: srowen
Date: Sat Apr 2 10:14:00 2011
New Revision: 1087990
URL: http://svn.apache.org/viewvc?rev=1087990&view=rev
Log:
MAHOUT-644 check for term vectors in field, obey Iterator contract
Added:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java?rev=1087990&r1=1087989&r2=1087990&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java
Sat Apr 2 10:14:00 2011
@@ -18,32 +18,23 @@
package org.apache.mahout.utils.vectors.lucene;
import java.io.IOException;
-import java.util.Collections;
import java.util.Iterator;
-import com.google.common.base.Preconditions;
-import org.apache.lucene.document.FieldSelector;
-import org.apache.lucene.document.SetBasedFieldSelector;
import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.TermDocs;
-import org.apache.mahout.math.NamedVector;
import org.apache.mahout.math.Vector;
/**
- * A LuceneIterable is an Iterable<Vector> that uses a Lucene index as
the source for creating the
- * {@link Vector}. The Field used to create the Vector currently must have
Term Vectors stored for it.
+ * {@link Iterable} counterpart to {@link LuceneIterator}.
*/
-public class LuceneIterable implements Iterable<Vector> {
+public final class LuceneIterable implements Iterable<Vector> {
public static final double NO_NORMALIZING = -1.0;
private final IndexReader indexReader;
private final String field;
private final String idField;
- private final FieldSelector idFieldSelector;
-
private final VectorMapper mapper;
- private double normPower = NO_NORMALIZING;
+ private final double normPower;
public LuceneIterable(IndexReader reader, String idField, String field,
VectorMapper mapper) {
this(reader, idField, field, mapper, NO_NORMALIZING);
@@ -52,27 +43,14 @@ public class LuceneIterable implements I
/**
* Produce a LuceneIterable that can create the Vector plus normalize it.
*
- * @param reader
- * The {@link org.apache.lucene.index.IndexReader} to read the
documents from.
- * @param idField
- * - The Field containing the id. May be null
- * @param field
- * The field to use for the Vector
- * @param mapper
- * The {@link org.apache.mahout.utils.vectors.lucene.VectorMapper}
for creating
- * {@link org.apache.mahout.math.Vector}s from Lucene's TermVectors.
- * @param normPower
- * The normalization value. Must be greater than or equal to 0 or
equal to {@link #NO_NORMALIZING}
+ * @param indexReader {@link org.apache.lucene.index.IndexReader} to read the documents from.
+ * @param idField field containing the id. May be null.
+ * @param field field to use for the Vector
+ * @param mapper {@link VectorMapper} for creating {@link Vector}s from Lucene's TermVectors.
+ * @param normPower the normalization value. Must be nonnegative, or {@link #NO_NORMALIZING}
*/
- public LuceneIterable(IndexReader reader,
- String idField,
- String field,
- VectorMapper mapper,
- double normPower) {
- Preconditions.checkArgument(normPower == NO_NORMALIZING || normPower >= 0,
- "If specified normPower must be nonnegative", normPower);
- idFieldSelector = new
SetBasedFieldSelector(Collections.singleton(idField),
Collections.<String>emptySet());
- this.indexReader = reader;
+ public LuceneIterable(IndexReader indexReader, String idField, String field,
VectorMapper mapper, double normPower) {
+ this.indexReader = indexReader;
this.idField = idField;
this.field = field;
this.mapper = mapper;
@@ -82,66 +60,9 @@ public class LuceneIterable implements I
@Override
public Iterator<Vector> iterator() {
try {
- return new TDIterator();
+ return new LuceneIterator(indexReader, idField, field, mapper,
normPower);
} catch (IOException e) {
throw new IllegalStateException(e);
}
}
-
- private final class TDIterator implements Iterator<Vector> {
- private final TermDocs termDocs;
-
- private TDIterator() throws IOException {
- // term docs(null) is a better way of iterating all the docs in Lucene
- this.termDocs = indexReader.termDocs(null);
- }
-
- @Override
- public boolean hasNext() {
- // TODO this doesn't work with the Iterator contract -- hasNext() cannot
have a side effect
- try {
- return termDocs.next();
- } catch (IOException e) {
- throw new IllegalStateException(e);
- }
- }
-
- @Override
- public Vector next() {
- Vector result;
- int doc = termDocs.doc();
- //
- try {
- indexReader.getTermFreqVector(doc, field, mapper);
- mapper.setDocumentNumber(doc);
- result = mapper.getVector();
- if (result == null) {
- return null;
- }
- String name;
- if (idField != null) {
- name = indexReader.document(doc, idFieldSelector).get(idField);
- } else {
- name = String.valueOf(doc);
- }
- if (normPower == NO_NORMALIZING) {
- result = new NamedVector(result, name);
- } else {
- result = new NamedVector(result.normalize(normPower), name);
- }
- } catch (IOException e) {
- // Log?
- throw new IllegalStateException(e);
- }
-
- return result;
- }
-
- @Override
- public void remove() {
- throw new UnsupportedOperationException();
- }
-
- }
-
}
Added:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java?rev=1087990&view=auto
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
(added)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java
Sat Apr 2 10:14:00 2011
@@ -0,0 +1,142 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.utils.vectors.lucene;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+import com.google.common.base.Preconditions;
+import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.document.SetBasedFieldSelector;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.TermFreqVector;
+import org.apache.mahout.math.NamedVector;
+import org.apache.mahout.math.Vector;
+
+/**
+ * An {@link Iterator} over {@link Vector}s that uses a Lucene index as the source for creating the
+ * {@link Vector}s. The field used to create the vectors currently must have term vectors stored for it.
+ */
+public final class LuceneIterator implements Iterator<Vector> {
+
+ private final IndexReader indexReader;
+ private final String field;
+ private final String idField;
+ private final FieldSelector idFieldSelector;
+ private final VectorMapper mapper;
+ private final double normPower;
+ private final TermDocs termDocs;
+ private Vector current;
+ private boolean available;
+
+ /**
+ * Produce a LuceneIterator that can create the Vectors and optionally normalize them.
+ *
+ * @param indexReader {@link org.apache.lucene.index.IndexReader} to read the documents from.
+ * @param idField field containing the id. May be null.
+ * @param field field to use for the Vector
+ * @param mapper {@link VectorMapper} for creating {@link Vector}s from Lucene's TermVectors.
+ * @param normPower the normalization value. Must be nonnegative, or {@link LuceneIterable#NO_NORMALIZING}
+ */
+ public LuceneIterator(IndexReader indexReader,
+ String idField,
+ String field,
+ VectorMapper mapper,
+ double normPower) throws IOException {
+ // normPower must be nonnegative unless it is the NO_NORMALIZING sentinel
+ Preconditions.checkArgument(normPower == LuceneIterable.NO_NORMALIZING ||
normPower >= 0,
+ "If specified normPower must be nonnegative",
normPower);
+ idFieldSelector = new
SetBasedFieldSelector(Collections.singleton(idField),
Collections.<String>emptySet());
+ this.indexReader = indexReader;
+ this.idField = idField;
+ this.field = field;
+ this.mapper = mapper;
+ this.normPower = normPower;
+ // term docs(null) is a better way of iterating all the docs in Lucene
+ this.termDocs = indexReader.termDocs(null);
+ current = null;
+ available = false;
+ }
+
+ private void readVector() throws IOException {
+ available = termDocs.next();
+ if (!available) {
+ current = null;
+ return;
+ }
+ int doc = termDocs.doc();
+ TermFreqVector termFreqVector = indexReader.getTermFreqVector(doc, field);
+ if (termFreqVector == null) {
+ throw new IllegalStateException("Field '" + field + "' does not have term vectors");
+ }
+
+ indexReader.getTermFreqVector(doc, field, mapper);
+ mapper.setDocumentNumber(doc);
+ Vector result = mapper.getVector();
+ if (result == null) {
+ // TODO is this right? last version would produce null in the iteration in this case,
+ // though it seems like that may not be desirable
+ current = null;
+ return;
+ }
+ String name;
+ if (idField != null) {
+ name = indexReader.document(doc, idFieldSelector).get(idField);
+ } else {
+ name = String.valueOf(doc);
+ }
+ if (normPower == LuceneIterable.NO_NORMALIZING) {
+ result = new NamedVector(result, name);
+ } else {
+ result = new NamedVector(result.normalize(normPower), name);
+ }
+ current = result;
+ }
+
+ @Override
+ public boolean hasNext() {
+ if (!available) {
+ try {
+ readVector();
+ } catch (IOException e) {
+ throw new IllegalStateException(e);
+ }
+ }
+ return available;
+ }
+
+ @Override
+ public Vector next() {
+ if (!hasNext()) {
+ throw new NoSuchElementException();
+ }
+ Vector next = current;
+ current = null;
+ available = false;
+ return next;
+ }
+
+ @Override
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+
+}
Modified:
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java?rev=1087990&r1=1087989&r2=1087990&view=diff
==============================================================================
---
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
(original)
+++
mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/TFDFMapper.java
Sat Apr 2 10:14:00 2011
@@ -30,10 +30,7 @@ import org.apache.mahout.vectorizer.Weig
* Not thread-safe
*/
public class TFDFMapper extends VectorMapper {
-
- //public static final int DEFAULT_CACHE_SIZE = 256;
-
- //private final IndexReader reader; // TODO never used?
+
private Vector vector;
private final Weight weight;
@@ -43,7 +40,6 @@ public class TFDFMapper extends VectorMa
private final int numDocs;
public TFDFMapper(IndexReader reader, Weight weight, TermInfo termInfo) {
- //this.reader = reader;
this.weight = weight;
this.termInfo = termInfo;
this.numDocs = reader.numDocs();
Modified:
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java?rev=1087990&r1=1087989&r2=1087990&view=diff
==============================================================================
---
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
(original)
+++
mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java
Sat Apr 2 10:14:00 2011
@@ -33,6 +33,9 @@ import org.apache.mahout.vectorizer.TFID
import org.apache.mahout.vectorizer.Weight;
import org.junit.Test;
+import java.io.IOException;
+import java.util.Iterator;
+
public final class LuceneIterableTest extends MahoutTestCase {
private static final String [] DOCS = {
@@ -44,26 +47,11 @@ public final class LuceneIterableTest ex
};
private RAMDirectory directory;
-
+
@Override
public void setUp() throws Exception {
super.setUp();
- directory = new RAMDirectory();
- IndexWriter writer = new IndexWriter(
- directory,
- new StandardAnalyzer(Version.LUCENE_30),
- true,
- IndexWriter.MaxFieldLength.UNLIMITED);
- for (int i = 0; i < LuceneIterableTest.DOCS.length; i++){
- Document doc = new Document();
- Fieldable id = new Field("id", "doc_" + i, Field.Store.YES,
Field.Index.NOT_ANALYZED_NO_NORMS);
- doc.add(id);
- //Store both position and offset information
- Fieldable text = new Field("content", DOCS[i], Field.Store.NO,
Field.Index.ANALYZED, Field.TermVector.YES);
- doc.add(text);
- writer.addDocument(doc);
- }
- writer.close();
+ directory = createTestIndex(Field.TermVector.YES);
}
@Test
@@ -79,7 +67,7 @@ public final class LuceneIterableTest ex
assertNotNull(vector);
assertTrue("vector is not an instanceof " + NamedVector.class, vector
instanceof NamedVector);
assertTrue("vector Size: " + vector.size() + " is not greater than: " +
0, vector.size() > 0);
- assertTrue(((NamedVector)vector).getName().startsWith("doc_"));
+ assertTrue(((NamedVector) vector).getName().startsWith("doc_"));
}
iterable = new LuceneIterable(reader, "id", "content", mapper, 3);
@@ -89,10 +77,43 @@ public final class LuceneIterableTest ex
assertNotNull(vector);
assertTrue("vector is not an instanceof " + NamedVector.class, vector
instanceof NamedVector);
assertTrue("vector Size: " + vector.size() + " is not greater than: " +
0, vector.size() > 0);
- assertTrue(((NamedVector)vector).getName().startsWith("doc_"));
+ assertTrue(((NamedVector) vector).getName().startsWith("doc_"));
}
}
-
-
+
+ @Test(expected = IllegalStateException.class)
+ public void testIterable_noTermVectors() throws IOException {
+ RAMDirectory directory = createTestIndex(Field.TermVector.NO);
+
+ IndexReader reader = IndexReader.open(directory, true);
+ Weight weight = new TFIDF();
+ TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100);
+ VectorMapper mapper = new TFDFMapper(reader, weight, termInfo);
+ LuceneIterable iterable = new LuceneIterable(reader, "id", "content",
mapper);
+
+ Iterator<Vector> iterator = iterable.iterator();
+ iterator.hasNext();
+ iterator.next();
+ }
+
+ private static RAMDirectory createTestIndex(Field.TermVector termVector)
throws IOException {
+ RAMDirectory directory = new RAMDirectory();
+ IndexWriter writer = new IndexWriter(
+ directory,
+ new StandardAnalyzer(Version.LUCENE_30),
+ true,
+ IndexWriter.MaxFieldLength.UNLIMITED);
+ for (int i = 0; i < LuceneIterableTest.DOCS.length; i++) {
+ Document doc = new Document();
+ Fieldable id = new Field("id", "doc_" + i, Field.Store.YES,
Field.Index.NOT_ANALYZED_NO_NORMS);
+ doc.add(id);
+ // Whether term vectors are stored for the content field is controlled by the termVector argument
+ Fieldable text = new Field("content", DOCS[i], Field.Store.NO,
Field.Index.ANALYZED, termVector);
+ doc.add(text);
+ writer.addDocument(doc);
+ }
+ writer.close();
+ return directory;
+ }
}