Doug Cutting wrote on 11/07/2007 09:26 AM:
Hadoop's MapFile is similar to Lucene's term index, and supports a
feature where only a subset of the index entries are loaded
(determined by io.map.index.skip). It would not be difficult to add
such a feature to Lucene by changing TermInfosReader#ensureIndexIsRead().
Here's a (totally untested) patch.
Doug, thanks for this suggestion and your quick patch.
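For scale, here's a back-of-the-envelope sketch of what the divisor buys; the figures are assumed for illustration, not measured:

    // Assumed figures, for illustration only (not measured).
    public class DivisorSavings {
      public static void main(String[] args) {
        int numTerms = 1000000;          // terms in the segment (assumed)
        int termIndexInterval = 128;     // Lucene's default term index interval
        int tiiEntries = numTerms / termIndexInterval;     // 7812 entries in the .tii file
        int indexDivisor = 4;            // hypothetical divisor
        int loaded = 1 + (tiiEntries - 1) / indexDivisor;  // same formula as in the patch
        System.out.println(loaded);      // 1953 entries actually held in RAM
      }
    }

Each loaded entry is a Term plus a TermInfo plus an index pointer, so a divisor of 4 cuts that portion of the heap roughly by 4.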
I fleshed this out in the version of Lucene we are using, a bit after
2.1. There was an off-by-1 bug plus a few missing pieces. The attached
patch is for 2.1+, but might be useful as it at least contains the
corrections and missing elements. It also contains extensions to the
tests to exercise the patch.
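For what it's worth, the sizing formula is the piece to watch: plain integer division by the divisor can undersize the subsampled array and drop the last retained entry. A quick illustration with made-up numbers:

    // Made-up numbers, just to show why the "1 + (size - 1) / divisor" form is used.
    public class DivisorSizing {
      public static void main(String[] args) {
        int size = 10;          // entries in the full .tii index: offsets 0..9
        int indexDivisor = 3;   // keep offsets 0, 3, 6, 9 -- four entries
        System.out.println(size / indexDivisor);            // 3: one too few
        System.out.println(1 + (size - 1) / indexDivisor);  // 4: matches the patch
      }
    }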
I tried integrating this into 2.3, but enough has changed that it was not
straightforward (primarily for the test case extensions -- the
implementation itself looks like it will apply with just a bit of manual merging).
Unfortunately, I have so many local changes that it has become difficult
to track the latest Lucene. The task of syncing up will come soon.
I'll post a proper patch against the trunk in jira at a future date if
the issue is not already resolved before then.
Michael McCandless wrote on 11/08/2007 12:43 AM:
I'll open an issue and work through this patch.
Michael, I did not see the issue, or else I would have posted this there.
Unfortunately, I'm pretty far behind on Lucene mail these days.
One thing is: I'd prefer not to use a system property for this, since
it's so global, but I'm not sure how to do it better.
Agree strongly that this should not be global. Whether via ctors, an
index-specific properties object, or whatever, it is important to be able
to set this on some indexes and not others within a single application.
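Just to sketch one possible shape of an index-specific settings object (the names below are made up; nothing here is in the patch or in Lucene today):

    // Sketch only: a per-index settings object instead of a JVM-wide system property.
    import java.util.HashMap;
    import java.util.Map;

    public class IndexSettings {
      // index path -> divisor; anything absent uses the default of 1
      private final Map divisors = new HashMap();

      public void setTermIndexDivisor(String indexPath, int divisor) {
        if (divisor < 1)
          throw new IllegalArgumentException("divisor must be >= 1");
        divisors.put(indexPath, new Integer(divisor));
      }

      public int getTermIndexDivisor(String indexPath) {
        Integer d = (Integer) divisors.get(indexPath);
        return d == null ? 1 : d.intValue();   // default: load every .tii entry
      }

      public static void main(String[] args) {
        IndexSettings settings = new IndexSettings();
        settings.setTermIndexDivisor("/data/big-index", 4);  // subsample this one
        // "/data/small-index" is left at the default of 1
        System.out.println(settings.getTermIndexDivisor("/data/big-index"));    // 4
        System.out.println(settings.getTermIndexDivisor("/data/small-index"));  // 1
      }
    }

The ctor route would work just as well -- TermInfosReader would simply take the divisor as an argument rather than consult a property -- so long as it ends up settable per index rather than per JVM.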
Thanks for picking this up!
Chuck
Index: src/test/org/apache/lucene/index/DocHelper.java
===================================================================
--- src/test/org/apache/lucene/index/DocHelper.java (revision 2247)
+++ src/test/org/apache/lucene/index/DocHelper.java (working copy)
@@ -254,10 +254,25 @@
*/
public static void writeDoc(Directory dir, Analyzer analyzer, Similarity similarity, String segment, Document doc) throws IOException
{
- DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
- writer.addDocument(segment, doc);
+ writeDoc(dir, analyzer, similarity, segment, doc, IndexWriter.DEFAULT_TERM_INDEX_INTERVAL);
}
+ /**
+ * Writes the document to the directory segment using the analyzer, the similarity score, and the given term index interval
+ * @param dir the directory to write to
+ * @param analyzer the analyzer to use for the document
+ * @param similarity the Similarity function
+ * @param segment the name of the segment to write
+ * @param doc the document to write
+ * @param termIndexInterval the term index interval to use for the new segment
+ * @throws IOException if the document could not be written
+ */
+ public static void writeDoc(Directory dir, Analyzer analyzer, Similarity similarity, String segment, Document doc, int termIndexInterval) throws IOException
+ {
+ DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50, termIndexInterval);
+ writer.addDocument(segment, doc);
+ }
+
public static int numFields(Document doc) {
return doc.getFields().size();
}
Index: src/test/org/apache/lucene/index/TestSegmentTermDocs.java
===================================================================
--- src/test/org/apache/lucene/index/TestSegmentTermDocs.java (revision 2247)
+++ src/test/org/apache/lucene/index/TestSegmentTermDocs.java (working copy)
@@ -25,6 +25,7 @@
import org.apache.lucene.document.Field;
import java.io.IOException;
+import org.apache.lucene.search.Similarity;
public class TestSegmentTermDocs extends TestCase {
private Document testDoc = new Document();
@@ -212,6 +213,23 @@
dir.close();
}
+ public void testIndexDivisor() throws IOException {
+ dir = new RAMDirectory();
+ testDoc = new Document();
+ DocHelper.setupDoc(testDoc);
+ DocHelper.writeDoc(dir, new WhitespaceAnalyzer(), Similarity.getDefault(), "test", testDoc, 3);
+
+ assertNull(System.getProperty("lucene.term.index.divisor"));
+ System.setProperty("lucene.term.index.divisor", "2");
+ try {
+ testTermDocs();
+ testBadSeek();
+ testSkipTo();
+ } finally {
+ System.clearProperty("lucene.term.index.divisor");
+ }
+ }
+
private void addDoc(IndexWriter writer, String value) throws IOException
{
Document doc = new Document();
Index: src/test/org/apache/lucene/index/TestSegmentReader.java
===================================================================
--- src/test/org/apache/lucene/index/TestSegmentReader.java (revision 2247)
+++ src/test/org/apache/lucene/index/TestSegmentReader.java (working copy)
@@ -23,10 +23,12 @@
import java.util.List;
import junit.framework.TestCase;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.RAMDirectory;
public class TestSegmentReader extends TestCase {
@@ -207,4 +209,25 @@
assertTrue("We do not have 4 term freq vectors, we have: " + results.length, results.length == 4);
}
+ public void testIndexDivisor() throws IOException {
+ dir = new RAMDirectory();
+ testDoc = new Document();
+ DocHelper.setupDoc(testDoc);
+ DocHelper.writeDoc(dir, new WhitespaceAnalyzer(), Similarity.getDefault(), "test", testDoc, 5);
+
+ assertNull(System.getProperty("lucene.term.index.divisor"));
+ System.setProperty("lucene.term.index.divisor", "3");
+ try {
+ reader = SegmentReader.get(new SegmentInfo("test", 1, dir));
+ testDocument();
+ testDelete();
+ testGetFieldNameVariations();
+ testNorms();
+ testTerms();
+ testTermVectors();
+ } finally {
+ System.clearProperty("lucene.term.index.divisor");
+ }
+ }
+
}
Index: src/java/org/apache/lucene/index/TermInfosReader.java
===================================================================
--- src/java/org/apache/lucene/index/TermInfosReader.java (revision 2247)
+++ src/java/org/apache/lucene/index/TermInfosReader.java (working copy)
@@ -39,6 +39,8 @@
private long[] indexPointers;
private SegmentTermEnum indexEnum;
+
+ private int indexDivisor = 1;
TermInfosReader(Directory dir, String seg, FieldInfos fis)
throws IOException {
@@ -53,6 +55,11 @@
indexEnum =
new SegmentTermEnum(directory.openInput(segment + ".tii"),
fieldInfos, true);
+
+ String divisorString = System.getProperty("lucene.term.index.divisor");
+ if (divisorString != null)
+ indexDivisor = Integer.parseInt(divisorString);
+
}
public int getSkipInterval() {
@@ -82,10 +89,10 @@
}
private synchronized void ensureIndexIsRead() throws IOException {
- if (indexTerms != null) // index already read
- return; // do nothing
+ if (indexTerms != null) // index already read
+ return; // do nothing
try {
- int indexSize = (int)indexEnum.size; // otherwise read index
+ int indexSize = 1+((int)indexEnum.size-1)/indexDivisor; // otherwise read index
indexTerms = new Term[indexSize];
indexInfos = new TermInfo[indexSize];
@@ -95,6 +102,10 @@
indexTerms[i] = indexEnum.term();
indexInfos[i] = indexEnum.termInfo();
indexPointers[i] = indexEnum.indexPointer;
+
+ for (int j = 1; j < indexDivisor; j++)
+ if (!indexEnum.next())
+ break;
}
} finally {
indexEnum.close();
@@ -122,7 +133,7 @@
private final void seekEnum(int indexOffset) throws IOException {
getEnum().seek(indexPointers[indexOffset],
- (indexOffset * getEnum().indexInterval) - 1,
+ (indexOffset * indexDivisor * getEnum().indexInterval) - 1,
indexTerms[indexOffset], indexInfos[indexOffset]);
}
@@ -137,7 +148,7 @@
if (enumerator.term() != null // term is at or past current
&& ((enumerator.prev() != null && term.compareTo(enumerator.prev())> 0)
|| term.compareTo(enumerator.term()) >= 0)) {
- int enumOffset = (int)(enumerator.position/enumerator.indexInterval)+1;
+ int enumOffset = (int)(enumerator.position/enumerator.indexInterval/indexDivisor)+1;
if (indexTerms.length == enumOffset // but before end of block
|| term.compareTo(indexTerms[enumOffset]) < 0)
return scanEnum(term); // no need to seek
@@ -165,10 +176,10 @@
SegmentTermEnum enumerator = getEnum();
if (enumerator != null && enumerator.term() != null &&
position >= enumerator.position &&
- position < (enumerator.position + enumerator.indexInterval))
+ position < (enumerator.position + enumerator.indexInterval*indexDivisor))
return scanEnum(position); // can avoid seek
- seekEnum(position / enumerator.indexInterval); // must seek
+ seekEnum(position/enumerator.indexInterval/indexDivisor); // must seek
return scanEnum(position);
}
Index: src/java/org/apache/lucene/index/DocumentWriter.java
===================================================================
--- src/java/org/apache/lucene/index/DocumentWriter.java (revision 2247)
+++ src/java/org/apache/lucene/index/DocumentWriter.java (working copy)
@@ -62,6 +62,23 @@
this.maxFieldLength = maxFieldLength;
}
+ /** This ctor is used by test code only.
+ *
+ * @param directory The directory to write the document information to
+ * @param analyzer The analyzer to use for the document
+ * @param similarity The Similarity function
+ * @param maxFieldLength The maximum number of tokens a field may have
+ * @param termIndexInterval the term index interval to use for the index of the new segment
+ */
+ DocumentWriter(Directory directory, Analyzer analyzer, Similarity similarity,
+ int maxFieldLength, int termIndexInterval) {
+ this.directory = directory;
+ this.analyzer = analyzer;
+ this.similarity = similarity;
+ this.maxFieldLength = maxFieldLength;
+ this.termIndexInterval = termIndexInterval;
+ }
+
DocumentWriter(Directory directory, Analyzer analyzer, IndexWriter writer) {
this.directory = directory;
this.analyzer = analyzer;