Doug Cutting wrote on 11/07/2007 09:26 AM:
Hadoop's MapFile is similar to Lucene's term index, and supports a
feature where only a subset of the index entries are loaded
(determined by io.map.index.skip). It would not be difficult to add
such a feature to Lucene by changing TermInfosReader#ensureIndexIsRead().
Here's a (totally untested) patch.
Doug, thanks for this suggestion and your quick patch.
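For scale, here's a back-of-the-envelope sketch of what the divisor buys; the figures are assumed for illustration, not measured:

    // Assumed figures, for illustration only (not measured).
    public class DivisorSavings {
      public static void main(String[] args) {
        int numTerms = 1000000;          // terms in the segment (assumed)
        int termIndexInterval = 128;     // Lucene's default term index interval
        int tiiEntries = numTerms / termIndexInterval;     // 7812 entries in the .tii file
        int indexDivisor = 4;            // hypothetical divisor
        int loaded = 1 + (tiiEntries - 1) / indexDivisor;  // same formula as in the patch
        System.out.println(loaded);      // 1953 entries actually held in RAM
      }
    }

Each loaded entry is a Term plus a TermInfo plus an index pointer, so a divisor of 4 cuts that portion of the heap roughly by 4.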
I fleshed this out in the version of Lucene we are using, a bit after
2.1. There was an off-by-1 bug plus a few missing pieces. The attached
patch is for 2.1+, but might be useful as it at least contains the
corrections and missing elements. It also contains extensions to the
tests to exercise the patch.
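For what it's worth, the sizing formula is the piece to watch: plain integer division by the divisor can undersize the subsampled array and drop the last retained entry. A quick illustration with made-up numbers:

    // Made-up numbers, just to show why the "1 + (size - 1) / divisor" form is used.
    public class DivisorSizing {
      public static void main(String[] args) {
        int size = 10;          // entries in the full .tii index: offsets 0..9
        int indexDivisor = 3;   // keep offsets 0, 3, 6, 9 -- four entries
        System.out.println(size / indexDivisor);            // 3: one too few
        System.out.println(1 + (size - 1) / indexDivisor);  // 4: matches the patch
      }
    }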
I tried integrating this into 2.3, but enough has changed that it was not
straightforward (primarily for the test case extensions -- the
implementation itself looks like it will apply with just a bit of manual merging).
Unfortunately, I have so many local changes that it has become difficult
to track the latest Lucene. The task of syncing up will come soon.
I'll post a proper patch against the trunk in jira at a future date if
the issue is not already resolved before then.
Michael McCandless wrote on 11/08/2007 12:43 AM:
I'll open an issue and work through this patch.
Michael, I did not see the issue, or else I would have posted this there.
Unfortunately, I'm pretty far behind on Lucene mail these days.
One thing is: I'd prefer not to use a system property for this, since
it's so global, but I'm not sure how to do it better.
Agree strongly that this should not be global. Whether via ctors, an
index-specific properties object, or whatever, it is important to be able
to set this on some indexes and not others within a single application.
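Just to sketch one possible shape of an index-specific settings object (the names below are made up; nothing here is in the patch or in Lucene today):

    // Sketch only: a per-index settings object instead of a JVM-wide system property.
    import java.util.HashMap;
    import java.util.Map;

    public class IndexSettings {
      // index path -> divisor; anything absent uses the default of 1
      private final Map divisors = new HashMap();

      public void setTermIndexDivisor(String indexPath, int divisor) {
        if (divisor < 1)
          throw new IllegalArgumentException("divisor must be >= 1");
        divisors.put(indexPath, new Integer(divisor));
      }

      public int getTermIndexDivisor(String indexPath) {
        Integer d = (Integer) divisors.get(indexPath);
        return d == null ? 1 : d.intValue();   // default: load every .tii entry
      }

      public static void main(String[] args) {
        IndexSettings settings = new IndexSettings();
        settings.setTermIndexDivisor("/data/big-index", 4);  // subsample this one
        // "/data/small-index" is left at the default of 1
        System.out.println(settings.getTermIndexDivisor("/data/big-index"));    // 4
        System.out.println(settings.getTermIndexDivisor("/data/small-index"));  // 1
      }
    }

The ctor route would work just as well -- TermInfosReader would simply take the divisor as an argument rather than consult a property -- so long as it ends up settable per index rather than per JVM.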
Thanks for picking this up!
Chuck
Index: src/test/org/apache/lucene/index/DocHelper.java
===================================================================
--- src/test/org/apache/lucene/index/DocHelper.java (revision 2247)
+++ src/test/org/apache/lucene/index/DocHelper.java (working copy)
@@ -254,10 +254,25 @@
*/
public static void writeDoc(Directory dir, Analyzer analyzer, Similarity similarity, String segment, Document doc) throws IOException
{
- DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
- writer.addDocument(segment, doc);
+ writeDoc(dir, analyzer, similarity, segment, doc, IndexWriter.DEFAULT_TERM_INDEX_INTERVAL);
}
+ /**
+ * Writes the document to the directory segment using the analyzer, the similarity score, and the given term index interval
+ * @param dir the directory to write to
+ * @param analyzer the analyzer to use for the document
+ * @param similarity the Similarity function
+ * @param segment the name of the segment to write
+ * @param doc the document to write
+ * @param termIndexInterval the term index interval to use for the new segment
+ * @throws IOException if the document could not be written
+ */
+ public static void writeDoc(Directory dir, Analyzer analyzer, Similarity similarity, String segment, Document doc, int termIndexInterval) throws IOException
+ {
+ DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50, termIndexInterval);
+ writer.addDocument(segment, doc);
+ }
+
public static int numFields(Document doc) {
return doc.getFields().size();
}
Index: src/test/org/apache/lucene/index/TestSegmentTermDocs.java
===================================================================
--- src/test/org/apache/lucene/index/TestSegmentTermDocs.java (revision 2247)
+++ src/test/org/apache/lucene/index/TestSegmentTermDocs.java (working copy)
@@ -25,6 +25,7 @@
import org.apache.lucene.document.Field;
import java.io.IOException;
+import org.apache.lucene.search.Similarity;
public class TestSegmentTermDocs extends TestCase {
private Document testDoc = new Document();
@@ -212,6 +213,23 @@
dir.close();
}
+ public void testIndexDivisor() throws IOException {
+ dir = new RAMDirectory();
+ testDoc = new Document();
+ DocHelper.setupDoc(testDoc);
+ DocHelper.writeDoc(dir, new WhitespaceAnalyzer(), Similarity.getDefault(), "test", testDoc, 3);
+
+ assertNull(System.getProperty("lucene.term.index.divisor"));
+ System.setProperty("lucene.term.index.divisor", "2");
+ try {
+ testTermDocs();
+ testBadSeek();
+ testSkipTo();
+ } finally {
+ System.clearProperty("lucene.term.index.divisor");
+ }
+ }
+
private void addDoc(IndexWriter writer, String value) throws IOException
{
Document doc = new Document();
Index: src/test/org/apache/lucene/index/TestSegmentReader.java
===================================================================
--- src/test/org/apache/lucene/index/TestSegmentReader.java (revision 2247)
+++ src/test/org/apache/lucene/index/TestSegmentReader.java (working copy)
@@ -23,10 +23,12 @@
import java.util.List;
import junit.framework.TestCase;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.RAMDirectory;
public class TestSegmentReader extends TestCase {
@@ -207,4 +209,25 @@
assertTrue("We do not have 4 term freq vectors, we have: " + results.length, results.length == 4);
}
+ public void testIndexDivisor() throws IOException {
+ dir = new RAMDirectory();
+ testDoc = new Document();
+ DocHelper.setupDoc(testDoc);
+ DocHelper.writeDoc(dir, new WhitespaceAnalyzer(), Similarity.getDefault(), "test", testDoc, 5);
+
+ assertNull(System.getProperty("lucene.term.index.divisor"));
+ System.setProperty("lucene.term.index.divisor", "3");
+ try {
+ reader = SegmentReader.get(new SegmentInfo("test", 1, dir));
+ testDocument();
+ testDelete();
+ testGetFieldNameVariations();
+ testNorms();
+ testTerms();
+ testTermVectors();
+ } finally {
+ System.clearProperty("lucene.term.index.divisor");
+ }
+ }
+
}
Index: src/java/org/apache/lucene/index/TermInfosReader.java
===================================================================
--- src/java/org/apache/lucene/index/TermInfosReader.java (revision 2247)
+++ src/java/org/apache/lucene/index/TermInfosReader.java (working copy)
@@ -39,6 +39,8 @@
private long[] indexPointers;
private SegmentTermEnum indexEnum;
+
+ private int indexDivisor = 1;
TermInfosReader(Directory dir, String seg, FieldInfos fis)
throws IOException {
@@ -53,6 +55,11 @@
indexEnum =
new SegmentTermEnum(directory.openInput(segment + ".tii"),
fieldInfos, true);
+
+ String divisorString = System.getProperty("lucene.term.index.divisor");
+ if (divisorString != null)
+ indexDivisor = Integer.parseInt(divisorString);
+
}
public int getSkipInterval() {
@@ -82,10 +89,10 @@
}
private synchronized void ensureIndexIsRead() throws IOException {
- if (indexTerms != null) // index already read
- return; // do nothing
+ if (indexTerms != null) // index already read
+ return; // do nothing
try {
- int indexSize = (int)indexEnum.size; // otherwise read index
+ int indexSize = 1+((int)indexEnum.size-1)/indexDivisor; // otherwise read index
indexTerms = new Term[indexSize];
indexInfos = new TermInfo[indexSize];
@@ -95,6 +102,10 @@
indexTerms[i] = indexEnum.term();
indexInfos[i] = indexEnum.termInfo();
indexPointers[i] = indexEnum.indexPointer;
+
+ for (int j = 1; j < indexDivisor; j++)
+ if (!indexEnum.next())
+ break;
}
} finally {
indexEnum.close();
@@ -122,7 +133,7 @@
private final void seekEnum(int indexOffset) throws IOException {
getEnum().seek(indexPointers[indexOffset],
- (indexOffset * getEnum().indexInterval) - 1,
+ (indexOffset * indexDivisor * getEnum().indexInterval) - 1,
indexTerms[indexOffset], indexInfos[indexOffset]);
}
@@ -137,7 +148,7 @@
if (enumerator.term() != null // term is at or past current
&& ((enumerator.prev() != null && term.compareTo(enumerator.prev())> 0)
|| term.compareTo(enumerator.term()) >= 0)) {
- int enumOffset = (int)(enumerator.position/enumerator.indexInterval)+1;
+ int enumOffset = (int)(enumerator.position/enumerator.indexInterval/indexDivisor)+1;
if (indexTerms.length == enumOffset // but before end of block
|| term.compareTo(indexTerms[enumOffset]) < 0)
return scanEnum(term); // no need to seek
@@ -165,10 +176,10 @@
SegmentTermEnum enumerator = getEnum();
if (enumerator != null && enumerator.term() != null &&
position >= enumerator.position &&
- position < (enumerator.position + enumerator.indexInterval))
+ position < (enumerator.position + enumerator.indexInterval*indexDivisor))
return scanEnum(position); // can avoid seek
- seekEnum(position / enumerator.indexInterval); // must seek
+ seekEnum(position/enumerator.indexInterval/indexDivisor); // must seek
return scanEnum(position);
}
Index: src/java/org/apache/lucene/index/DocumentWriter.java
===================================================================
--- src/java/org/apache/lucene/index/DocumentWriter.java (revision 2247)
+++ src/java/org/apache/lucene/index/DocumentWriter.java (working copy)
@@ -62,6 +62,23 @@
this.maxFieldLength = maxFieldLength;
}
+ /** This ctor is used by test code only.
+ *
+ * @param directory The directory to write the document information to
+ * @param analyzer The analyzer to use for the document
+ * @param similarity The Similarity function
+ * @param maxFieldLength The maximum number of tokens a field may have
+ * @param termIndexInterval the term index interval to use for the index of the new segment
+ */
+ DocumentWriter(Directory directory, Analyzer analyzer, Similarity similarity,
+ int maxFieldLength, int termIndexInterval) {
+ this.directory = directory;
+ this.analyzer = analyzer;
+ this.similarity = similarity;
+ this.maxFieldLength = maxFieldLength;
+ this.termIndexInterval = termIndexInterval;
+ }
+
DocumentWriter(Directory directory, Analyzer analyzer, IndexWriter writer) {
this.directory = directory;
this.analyzer = analyzer;