Fixed: the forgotten Bump125 class has now been added. Thanks for the catch.
On Thu, Apr 21, 2011 at 11:10 PM, Ted Dunning <[email protected]> wrote: > Yes. Forgotten add. > > > On Thu, Apr 21, 2011 at 10:21 PM, Dmitriy Lyubimov <[email protected]>wrote: > >> Hm. I am getting this after this commit. Forgotten class? >> >> [INFO] Compilation failure >> >> >> \projects\mahout\utils\src\main\java\org\apache\mahout\utils\vectors\lucene\Luce >> neIterator.java:[33,30] cannot find symbol >> symbol : class Bump125 >> location: package org.apache.mahout.utils >> >> \projects\mahout\utils\src\main\java\org\apache\mahout\utils\vectors\lucene\Luce >> neIterator.java:[55,10] cannot find symbol >> symbol : class Bump125 >> location: class org.apache.mahout.utils.vectors.lucene.LuceneIterator >> >> \projects\mahout\utils\src\main\java\org\apache\mahout\utils\vectors\lucene\Luce >> neIterator.java:[55,29] cannot find symbol >> symbol : class Bump125 >> location: class org.apache.mahout.utils.vectors.lucene.LuceneIterator >> >> [INFO] >> ------------------------------------------------------------------------ >> [INFO] For more information, run Maven with the -e switch >> [INFO] >> ------------------------------------------------------------------------ >> >> On Thu, Apr 21, 2011 at 9:58 PM, <[email protected]> wrote: >> > Author: tdunning >> > Date: Fri Apr 22 04:58:14 2011 >> > New Revision: 1095864 >> > >> > URL: http://svn.apache.org/viewvc?rev=1095864&view=rev >> > Log: >> > MAHOUT-675 - Add better handling of empty term vectors in lucene >> conversion to vectors. 
>> > >> > Modified: >> > >> >> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java >> > >> >> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java >> > >> >> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java >> > >> > Modified: >> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java >> > URL: >> http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java?rev=1095864&r1=1095863&r2=1095864&view=diff >> > >> ============================================================================== >> > --- >> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java >> (original) >> > +++ >> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterable.java >> Fri Apr 22 04:58:14 2011 >> > @@ -35,11 +35,16 @@ public final class LuceneIterable implem >> > private final String idField; >> > private final VectorMapper mapper; >> > private final double normPower; >> > + private final double maxPercentErrorDocs; >> > >> > public LuceneIterable(IndexReader reader, String idField, String >> field, VectorMapper mapper) { >> > this(reader, idField, field, mapper, NO_NORMALIZING); >> > } >> > >> > + public LuceneIterable(IndexReader indexReader, String idField, String >> field, VectorMapper mapper, double normPower) { >> > + this(indexReader, idField, field, mapper, normPower, 0); >> > + } >> > + >> > /** >> > * Produce a LuceneIterable that can create the Vector plus normalize >> it. >> > * >> > @@ -49,18 +54,19 @@ public final class LuceneIterable implem >> > * @param mapper {@link VectorMapper} for creating {@link Vector}s >> from Lucene's TermVectors. >> > * @param normPower the normalization value. 
Must be nonnegative, or >> {@link #NO_NORMALIZING} >> > */ >> > - public LuceneIterable(IndexReader indexReader, String idField, String >> field, VectorMapper mapper, double normPower) { >> > + public LuceneIterable(IndexReader indexReader, String idField, String >> field, VectorMapper mapper, double normPower, double maxPercentErrorDocs) { >> > this.indexReader = indexReader; >> > this.idField = idField; >> > this.field = field; >> > this.mapper = mapper; >> > this.normPower = normPower; >> > + this.maxPercentErrorDocs = maxPercentErrorDocs; >> > } >> > >> > @Override >> > public Iterator<Vector> iterator() { >> > try { >> > - return new LuceneIterator(indexReader, idField, field, mapper, >> normPower); >> > + return new LuceneIterator(indexReader, idField, field, mapper, >> normPower, maxPercentErrorDocs); >> > } catch (IOException e) { >> > throw new IllegalStateException(e); >> > } >> > >> > Modified: >> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java >> > URL: >> http://svn.apache.org/viewvc/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java?rev=1095864&r1=1095863&r2=1095864&view=diff >> > >> ============================================================================== >> > --- >> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java >> (original) >> > +++ >> mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/lucene/LuceneIterator.java >> Fri Apr 22 04:58:14 2011 >> > @@ -30,6 +30,7 @@ import org.apache.lucene.index.TermDocs; >> > import org.apache.lucene.index.TermFreqVector; >> > import org.apache.mahout.math.NamedVector; >> > import org.apache.mahout.math.Vector; >> > +import org.apache.mahout.utils.Bump125; >> > import org.slf4j.Logger; >> > import org.slf4j.LoggerFactory; >> > >> > @@ -48,8 +49,12 @@ public final class LuceneIterator extend >> > private final VectorMapper mapper; >> > private final double normPower; >> 
> private final TermDocs termDocs; >> > - private int numErrorDocs; >> > - private int maxErrorDocs; >> > + >> > + private int numErrorDocs = 0; >> > + private int maxErrorDocs = 0; >> > + private Bump125 bump = new Bump125(); >> > + private long nextLogRecord = bump.increment(); >> > + private int skippedErrorMessages = 0; >> > >> > /** >> > * Produce a LuceneIterable that can create the Vector plus normalize >> it. >> > @@ -65,7 +70,7 @@ public final class LuceneIterator extend >> > String field, >> > VectorMapper mapper, >> > double normPower) throws IOException { >> > - this(indexReader, idField, field, mapper, normPower, 1.0); >> > + this(indexReader, idField, field, mapper, normPower, 0.0); >> > } >> > >> > /** >> > @@ -91,7 +96,6 @@ public final class LuceneIterator extend >> > // term docs(null) is a better way of iterating all the docs in >> Lucene >> > this.termDocs = indexReader.termDocs(null); >> > this.maxErrorDocs = (int) (maxPercentErrorDocs * >> indexReader.numDocs()); >> > - this.numErrorDocs = 0; >> > } >> > >> > @Override >> > @@ -104,11 +108,22 @@ public final class LuceneIterator extend >> > int doc = termDocs.doc(); >> > TermFreqVector termFreqVector = indexReader.getTermFreqVector(doc, >> field); >> > if (termFreqVector == null) { >> > - if (++numErrorDocs >= maxErrorDocs) { >> > + numErrorDocs++; >> > + if (numErrorDocs >= maxErrorDocs) { >> > log.error("There are too many documents that do not have a >> term vector for {}", field); >> > throw new IllegalStateException("There are too many documents >> that do not have a term vector for " + field); >> > } >> > - log.warn("{} does not have a term vector for {}", >> indexReader.document(doc).get(idField), field); >> > + if (numErrorDocs >= nextLogRecord) { >> > + if (skippedErrorMessages == 0) { >> > + log.warn("{} does not have a term vector for {}", >> indexReader.document(doc).get(idField), field); >> > + } else { >> > + log.warn("{} documents do not have a term vector for {}", >> 
numErrorDocs, field); >> > + } >> > + nextLogRecord = bump.increment(); >> > + skippedErrorMessages = 0; >> > + } else { >> > + skippedErrorMessages++; >> > + } >> > computeNext(); >> > } >> > >> > >> > Modified: >> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java >> > URL: >> http://svn.apache.org/viewvc/mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java?rev=1095864&r1=1095863&r2=1095864&view=diff >> > >> ============================================================================== >> > --- >> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java >> (original) >> > +++ >> mahout/trunk/utils/src/test/java/org/apache/mahout/utils/vectors/lucene/LuceneIterableTest.java >> Fri Apr 22 04:58:14 2011 >> > @@ -97,16 +97,81 @@ public final class LuceneIterableTest ex >> > iterator.next(); >> > } >> > >> > + @Test >> > + public void testIterable_someNoiseTermVectors() throws IOException { >> > + //get noise vectors >> > + RAMDirectory directory = createTestIndex(Field.TermVector.YES, new >> RAMDirectory(), true, 0); >> > + //get real vectors >> > + createTestIndex(Field.TermVector.NO, directory, false, 5); >> > + >> > + IndexReader reader = IndexReader.open(directory, true); >> > + Weight weight = new TFIDF(); >> > + TermInfo termInfo = new CachedTermInfo(reader, "content", 1, 100); >> > + VectorMapper mapper = new TFDFMapper(reader, weight, termInfo); >> > + >> > + boolean exceptionThrown; >> > + //0 percent tolerance >> > + LuceneIterable iterable = new LuceneIterable(reader, "id", >> "content", mapper); >> > + try { >> > + Iterator<Vector> iterator = iterable.iterator(); >> > + while (iterator.hasNext()) { >> > + iterator.next(); >> > + } >> > + exceptionThrown = false; >> > + } >> > + catch(IllegalStateException ise) { >> > + exceptionThrown = true; >> > + } >> > + assertTrue(exceptionThrown); >> > + >> > + //100 percent tolerance >> > + 
iterable = new LuceneIterable(reader, "id", "content", mapper, -1, >> 1.0); >> > + try { >> > + Iterator<Vector> iterator = iterable.iterator(); >> > + while (iterator.hasNext()) { >> > + iterator.next(); >> > + } >> > + exceptionThrown = false; >> > + } >> > + catch(IllegalStateException ise) { >> > + exceptionThrown = true; >> > + } >> > + assertFalse(exceptionThrown); >> > + >> > + //50 percent tolerance >> > + iterable = new LuceneIterable(reader, "id", "content", mapper, -1, >> 0.5); >> > + Iterator<Vector> iterator = iterable.iterator(); >> > + iterator.next(); >> > + iterator.next(); >> > + iterator.next(); >> > + iterator.next(); >> > + iterator.next(); >> > + >> > + try { >> > + while (iterator.hasNext()) { >> > + iterator.next(); >> > + } >> > + exceptionThrown = false; >> > + } >> > + catch(IllegalStateException ise) { >> > + exceptionThrown = true; >> > + } >> > + assertTrue(exceptionThrown); >> > + } >> > + >> > private static RAMDirectory createTestIndex(Field.TermVector >> termVector) throws IOException { >> > - RAMDirectory directory = new RAMDirectory(); >> > + return createTestIndex(termVector, new RAMDirectory(), true, 0); >> > + } >> > + >> > + private static RAMDirectory createTestIndex(Field.TermVector >> termVector, RAMDirectory directory, boolean createNew, int startingId) >> throws IOException { >> > IndexWriter writer = new IndexWriter( >> > directory, >> > new StandardAnalyzer(Version.LUCENE_30), >> > - true, >> > + createNew, >> > IndexWriter.MaxFieldLength.UNLIMITED); >> > for (int i = 0; i < LuceneIterableTest.DOCS.length; i++) { >> > Document doc = new Document(); >> > - Fieldable id = new Field("id", "doc_" + i, Field.Store.YES, >> Field.Index.NOT_ANALYZED_NO_NORMS); >> > + Fieldable id = new Field("id", "doc_" + (i + startingId), >> Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS); >> > doc.add(id); >> > //Store both position and offset information >> > Fieldable text = new Field("content", DOCS[i], Field.Store.NO, >> 
Field.Index.ANALYZED, termVector); >> > >> > >> > >> > >
