On 21 Nov 2005, at 04:26, Erik Hatcher wrote:
What about adding an offset to Field, setPositionOffset(int
offset)? Looking at DocumentWriter, it looks like this would be
the simplest thing that could work, without precluding the
interesting option of modifying Analyzer to allow flags on
tokenStream.
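For reference, here's a minimal sketch of how the new method is meant
to be used at indexing time (the field name "body" and its text are
illustrative only, not part of the patch):

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;

    // Two instances of the same field. The second instance is pushed
    // 100 positions past the end of the first, so phrase queries
    // cannot match across the instance boundary.
    Document doc = new Document();
    doc.add(new Field("body", "first instance text",
        Field.Store.YES, Field.Index.TOKENIZED));
    Field second = new Field("body", "second instance text",
        Field.Store.YES, Field.Index.TOKENIZED);
    second.setPositionOffset(100); // the method this patch adds
    doc.add(second);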
I've made the few-line code change to implement this idea; the
patch is attached. I added tests to TestPhraseQuery and
TestDocumentWriter that show this working as desired. I reformatted
TestDocumentWriter to eliminate an unnecessary try/catch, so the diff
looks larger than it really is, sorry (and for now the new test code
just prints diagnostics, but I'll turn those into asserts).
TestPhraseQuery demonstrates two "repeated" fields being indexed,
with a position offset of 100 for the second instance. Without the
offset the test fails, as the phrase matches across the field
instance boundary; even with the offset it would fail with a phrase
slop of 100 or more, since that is enough to bridge the gap. The
test shows that with the offset and a slop of 99 the match still
isn't made.
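To make the arithmetic concrete (my reading of the patched
DocumentWriter, assuming WhitespaceAnalyzer, so the "-" counts as a
token): the first instance "this is a repeated field - first part"
occupies positions 0 through 7, leaving the position counter at 8;
with setPositionOffset(100) the second instance starts at 108 instead:

    this=0 is=1 a=2 repeated=3 field=4 -=5 first=6 part=7
    second=108 part=109 of=110 a=111 repeated=112 field=113

The gap of just over 100 positions between "part" (7) and "second"
(108) is what a slop of 99 cannot bridge.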
Would this be an acceptable change to commit?
Thanks,
Erik
$ svn diff src/
Index: src/test/org/apache/lucene/search/TestPhraseQuery.java
===================================================================
--- src/test/org/apache/lucene/search/TestPhraseQuery.java (revision 345677)
+++ src/test/org/apache/lucene/search/TestPhraseQuery.java (working copy)
@@ -45,6 +45,10 @@
Document doc = new Document();
doc.add(new Field("field", "one two three four five",
Field.Store.YES, Field.Index.TOKENIZED));
+ doc.add(new Field("repeated", "this is a repeated field - first
part", Field.Store.YES, Field.Index.TOKENIZED));
+ Field repeatedField = new Field("repeated", "second part of a
repeated field", Field.Store.YES, Field.Index.TOKENIZED);
+ repeatedField.setPositionOffset(100);
+ doc.add(repeatedField);
writer.addDocument(doc);
writer.optimize();
@@ -294,4 +298,15 @@
assertEquals(2, hits.id(2));
}
+ public void testWrappedPhrase() throws IOException {
+ query.add(new Term("repeated", "first"));
+ query.add(new Term("repeated", "part"));
+ query.add(new Term("repeated", "second"));
+ query.add(new Term("repeated", "part"));
+ query.setSlop(99);
+
+ Hits hits = searcher.search(query);
+ assertEquals(0, hits.length());
+ }
+
}
Index: src/test/org/apache/lucene/index/DocHelper.java
===================================================================
--- src/test/org/apache/lucene/index/DocHelper.java (revision 345677)
+++ src/test/org/apache/lucene/index/DocHelper.java (working copy)
@@ -68,6 +68,14 @@
public static Field unStoredField2 = new Field(UNSTORED_FIELD_2_KEY, UNSTORED_2_FIELD_TEXT,
Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES);
+ public static final String REPEATED_1_TEXT = "repeated one";
+ public static final String REPEATED_KEY = "repeated";
+ public static Field repeatedField1 = new Field(REPEATED_KEY, REPEATED_1_TEXT,
+ Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.NO);
+ public static final String REPEATED_2_TEXT = "repeated two";
+ public static Field repeatedField2 = new Field(REPEATED_KEY, REPEATED_2_TEXT,
+ Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.NO);
+
public static Map nameValues = null;
// ordered list of all the fields...
@@ -81,6 +89,8 @@
unIndField,
unStoredField1,
unStoredField2,
+ repeatedField1,
+ repeatedField2
};
// Map<String fieldName, Field field>
@@ -94,6 +104,7 @@
public static Map noNorms=new HashMap();
static {
+ repeatedField2.setPositionOffset(500);
for (int i=0; i<fields.length; i++) {
Field f = fields[i];
add(all,f);
Index: src/test/org/apache/lucene/index/TestDocumentWriter.java
===================================================================
--- src/test/org/apache/lucene/index/TestDocumentWriter.java (revision 345677)
+++ src/test/org/apache/lucene/index/TestDocumentWriter.java (working copy)
@@ -17,15 +17,13 @@
*/
import junit.framework.TestCase;
-import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
-import org.apache.lucene.search.Similarity;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.store.RAMDirectory;
-import java.io.IOException;
-
public class TestDocumentWriter extends TestCase {
private RAMDirectory dir = new RAMDirectory();
private Document testDoc = new Document();
@@ -48,54 +46,56 @@
}
- public void testAddDocument() {
+ public void testAddDocument() throws Exception {
Analyzer analyzer = new WhitespaceAnalyzer();
Similarity similarity = Similarity.getDefault();
DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
assertTrue(writer != null);
- try {
- String segName="test";
- writer.addDocument(segName, testDoc);
- //After adding the document, we should be able to read it back in
- SegmentReader reader = SegmentReader.get(new SegmentInfo(segName, 1, dir));
- assertTrue(reader != null);
- Document doc = reader.document(0);
- assertTrue(doc != null);
-
- //System.out.println("Document: " + doc);
- Field [] fields = doc.getFields("textField2");
- assertTrue(fields != null && fields.length == 1);
- assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_2_TEXT));
- assertTrue(fields[0].isTermVectorStored() == true);
-
- fields = doc.getFields("textField1");
- assertTrue(fields != null && fields.length == 1);
- assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_1_TEXT));
- assertTrue(fields[0].isTermVectorStored() == false);
-
- fields = doc.getFields("keyField");
- assertTrue(fields != null && fields.length == 1);
- assertTrue(fields[0].stringValue().equals(DocHelper.KEYWORD_TEXT));
+ String segName = "test";
+ writer.addDocument(segName, testDoc);
+ //After adding the document, we should be able to read it back in
+ SegmentReader reader = SegmentReader.get(new SegmentInfo(segName, 1, dir));
+ assertTrue(reader != null);
+ Document doc = reader.document(0);
+ assertTrue(doc != null);
- fields = doc.getFields(DocHelper.NO_NORMS_KEY);
- assertTrue(fields != null && fields.length == 1);
- assertTrue(fields[0].stringValue().equals(DocHelper.NO_NORMS_TEXT));
+ //System.out.println("Document: " + doc);
+ Field [] fields = doc.getFields("textField2");
+ assertTrue(fields != null && fields.length == 1);
+ assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_2_TEXT));
+ assertTrue(fields[0].isTermVectorStored() == true);
- fields = doc.getFields(DocHelper.TEXT_FIELD_3_KEY);
- assertTrue(fields != null && fields.length == 1);
- assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_3_TEXT));
+ fields = doc.getFields("textField1");
+ assertTrue(fields != null && fields.length == 1);
+ assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_1_TEXT));
+ assertTrue(fields[0].isTermVectorStored() == false);
- // test that the norm file is not present if omitNorms is true
- for (int i=0; i<reader.fieldInfos.size(); i++) {
- FieldInfo fi = reader.fieldInfos.fieldInfo(i);
- if (fi.isIndexed) {
- assertTrue(fi.omitNorms == !dir.fileExists(segName + ".f" + i));
- }
+ fields = doc.getFields("keyField");
+ assertTrue(fields != null && fields.length == 1);
+ assertTrue(fields[0].stringValue().equals(DocHelper.KEYWORD_TEXT));
+
+ fields = doc.getFields(DocHelper.NO_NORMS_KEY);
+ assertTrue(fields != null && fields.length == 1);
+ assertTrue(fields[0].stringValue().equals(DocHelper.NO_NORMS_TEXT));
+
+ fields = doc.getFields(DocHelper.TEXT_FIELD_3_KEY);
+ assertTrue(fields != null && fields.length == 1);
+ assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_3_TEXT));
+
+ // test that the norm file is not present if omitNorms is true
+ for (int i = 0; i < reader.fieldInfos.size(); i++) {
+ FieldInfo fi = reader.fieldInfos.fieldInfo(i);
+ if (fi.isIndexed) {
+ assertTrue(fi.omitNorms == !dir.fileExists(segName + ".f" + i));
}
+ }
- } catch (IOException e) {
- e.printStackTrace();
- assertTrue(false);
+ TermPositions termPositions = reader.termPositions(new Term(DocHelper.REPEATED_KEY, "repeated"));
+ assertTrue(termPositions.next());
+ int freq = termPositions.freq();
+ for (int i=0; i < freq; i++) {
+ int pos = termPositions.nextPosition();
+ System.out.println("pos = " + pos);
}
}
}
Index: src/java/org/apache/lucene/index/DocumentWriter.java
===================================================================
--- src/java/org/apache/lucene/index/DocumentWriter.java (revision 345677)
+++ src/java/org/apache/lucene/index/DocumentWriter.java (working copy)
@@ -134,7 +134,7 @@
int fieldNumber = fieldInfos.fieldNumber(fieldName);
int length = fieldLengths[fieldNumber]; // length of field
- int position = fieldPositions[fieldNumber]; // position in field
+ int position = fieldPositions[fieldNumber] + field.getPositionOffset(); // position in field
int offset = fieldOffsets[fieldNumber]; // offset field
if (field.isIndexed()) {
Index: src/java/org/apache/lucene/document/Field.java
===================================================================
--- src/java/org/apache/lucene/document/Field.java (revision 345677)
+++ src/java/org/apache/lucene/document/Field.java (working copy)
@@ -50,6 +50,8 @@
private boolean isCompressed = false;
private float boost = 1.0f;
+
+ private int positionOffset = 0;
public static final class Store extends Parameter implements Serializable {
@@ -179,6 +181,14 @@
return boost;
}
+ public void setPositionOffset(int offset) {
+ positionOffset = offset;
+ }
+
+ public int getPositionOffset() {
+ return positionOffset;
+ }
+
/** Constructs a String-valued Field that is not tokenized, but is indexed
and stored. Useful for non-text fields, e.g. date or url.
@deprecated use {@link #Field(String, String, Field.Store, Field.Index)}