On 21 Nov 2005, at 04:26, Erik Hatcher wrote:
What about adding an offset to Field, setPositionOffset(int offset)? Looking at DocumentWriter, it looks like this would be the simplest thing that could work, without precluding the interesting option of modifying Analyzer to allow this with flags on tokenStream.

I've made the few lines of code changes needed to implement this idea. The patch is attached. I added tests to TestPhraseQuery and TestDocumentWriter that show this working as desired. I reformatted TestDocumentWriter to eliminate an unnecessary try/catch, so the diff looks larger than it really is, sorry (and for now the test is just println diagnostics, but I'll turn those into asserts). TestPhraseQuery demonstrates two "repeated" fields being indexed, with a position offset of 100 for the second instance. The test fails without the offset, as the phrase matches across the field instance boundaries, and it even fails with the offset once the phrase slop is >= 100. The test shows that with the offset and a slop of 99 the match still isn't made.

Would this be an acceptable change to commit?

Thanks,
        Erik


$ svn diff src/
Index: src/test/org/apache/lucene/search/TestPhraseQuery.java
===================================================================
--- src/test/org/apache/lucene/search/TestPhraseQuery.java (revision 345677)
+++ src/test/org/apache/lucene/search/TestPhraseQuery.java (working copy)
@@ -45,6 +45,10 @@

     Document doc = new Document();
     doc.add(new Field("field", "one two three four five", Field.Store.YES, Field.Index.TOKENIZED));
+    doc.add(new Field("repeated", "this is a repeated field - first part", Field.Store.YES, Field.Index.TOKENIZED));
+    Field repeatedField = new Field("repeated", "second part of a repeated field", Field.Store.YES, Field.Index.TOKENIZED);
+    repeatedField.setPositionOffset(100);
+    doc.add(repeatedField);
     writer.addDocument(doc);

     writer.optimize();
@@ -294,4 +298,15 @@
     assertEquals(2, hits.id(2));
   }
+  public void testWrappedPhrase() throws IOException {
+    query.add(new Term("repeated", "first"));
+    query.add(new Term("repeated", "part"));
+    query.add(new Term("repeated", "second"));
+    query.add(new Term("repeated", "part"));
+    query.setSlop(99);
+
+    Hits hits = searcher.search(query);
+    assertEquals(0, hits.length());
+  }
+
}
Index: src/test/org/apache/lucene/index/DocHelper.java
===================================================================
--- src/test/org/apache/lucene/index/DocHelper.java (revision 345677)
+++ src/test/org/apache/lucene/index/DocHelper.java     (working copy)
@@ -68,6 +68,14 @@
public static Field unStoredField2 = new Field (UNSTORED_FIELD_2_KEY, UNSTORED_2_FIELD_TEXT,
       Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES);
+  public static final String REPEATED_1_TEXT = "repeated one";
+  public static final String REPEATED_KEY = "repeated";
+ public static Field repeatedField1 = new Field(REPEATED_KEY, REPEATED_1_TEXT,
+      Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.NO);
+  public static final String REPEATED_2_TEXT = "repeated two";
+ public static Field repeatedField2 = new Field(REPEATED_KEY, REPEATED_2_TEXT,
+      Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.NO);
+
   public static Map nameValues = null;
   // ordered list of all the fields...
@@ -81,6 +89,8 @@
     unIndField,
     unStoredField1,
     unStoredField2,
+    repeatedField1,
+    repeatedField2
   };
   // Map<String fieldName, Field field>
@@ -94,6 +104,7 @@
   public static Map noNorms=new HashMap();
   static {
+    repeatedField2.setPositionOffset(500);
     for (int i=0; i<fields.length; i++) {
       Field f = fields[i];
       add(all,f);
Index: src/test/org/apache/lucene/index/TestDocumentWriter.java
===================================================================
--- src/test/org/apache/lucene/index/TestDocumentWriter.java (revision 345677)
+++ src/test/org/apache/lucene/index/TestDocumentWriter.java (working copy)
@@ -17,15 +17,13 @@
  */
import junit.framework.TestCase;
-import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
-import org.apache.lucene.search.Similarity;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.lucene.search.Similarity;
+import org.apache.lucene.store.RAMDirectory;
-import java.io.IOException;
-
public class TestDocumentWriter extends TestCase {
   private RAMDirectory dir = new RAMDirectory();
   private Document testDoc = new Document();
@@ -48,54 +46,56 @@
   }
-  public void testAddDocument() {
+  public void testAddDocument() throws Exception {
     Analyzer analyzer = new WhitespaceAnalyzer();
     Similarity similarity = Similarity.getDefault();
DocumentWriter writer = new DocumentWriter(dir, analyzer, similarity, 50);
     assertTrue(writer != null);
-    try {
-      String segName="test";
-      writer.addDocument(segName, testDoc);
-      //After adding the document, we should be able to read it back in
- SegmentReader reader = SegmentReader.get(new SegmentInfo (segName, 1, dir));
-      assertTrue(reader != null);
-      Document doc = reader.document(0);
-      assertTrue(doc != null);
-
-      //System.out.println("Document: " + doc);
-      Field [] fields = doc.getFields("textField2");
-      assertTrue(fields != null && fields.length == 1);
- assertTrue(fields[0].stringValue().equals (DocHelper.FIELD_2_TEXT));
-      assertTrue(fields[0].isTermVectorStored() == true);
-
-      fields = doc.getFields("textField1");
-      assertTrue(fields != null && fields.length == 1);
- assertTrue(fields[0].stringValue().equals (DocHelper.FIELD_1_TEXT));
-      assertTrue(fields[0].isTermVectorStored() == false);
-
-      fields = doc.getFields("keyField");
-      assertTrue(fields != null && fields.length == 1);
- assertTrue(fields[0].stringValue().equals (DocHelper.KEYWORD_TEXT));
+    String segName = "test";
+    writer.addDocument(segName, testDoc);
+    //After adding the document, we should be able to read it back in
+ SegmentReader reader = SegmentReader.get(new SegmentInfo (segName, 1, dir));
+    assertTrue(reader != null);
+    Document doc = reader.document(0);
+    assertTrue(doc != null);
-      fields = doc.getFields(DocHelper.NO_NORMS_KEY);
-      assertTrue(fields != null && fields.length == 1);
- assertTrue(fields[0].stringValue().equals (DocHelper.NO_NORMS_TEXT));
+    //System.out.println("Document: " + doc);
+    Field [] fields = doc.getFields("textField2");
+    assertTrue(fields != null && fields.length == 1);
+    assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_2_TEXT));
+    assertTrue(fields[0].isTermVectorStored() == true);
-      fields = doc.getFields(DocHelper.TEXT_FIELD_3_KEY);
-      assertTrue(fields != null && fields.length == 1);
- assertTrue(fields[0].stringValue().equals (DocHelper.FIELD_3_TEXT));
+    fields = doc.getFields("textField1");
+    assertTrue(fields != null && fields.length == 1);
+    assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_1_TEXT));
+    assertTrue(fields[0].isTermVectorStored() == false);
-      // test that the norm file is not present if omitNorms is true
-      for (int i=0; i<reader.fieldInfos.size(); i++) {
-        FieldInfo fi = reader.fieldInfos.fieldInfo(i);
-        if (fi.isIndexed) {
- assertTrue(fi.omitNorms == !dir.fileExists(segName + ".f" + i));
-        }
+    fields = doc.getFields("keyField");
+    assertTrue(fields != null && fields.length == 1);
+    assertTrue(fields[0].stringValue().equals(DocHelper.KEYWORD_TEXT));
+
+    fields = doc.getFields(DocHelper.NO_NORMS_KEY);
+    assertTrue(fields != null && fields.length == 1);
+ assertTrue(fields[0].stringValue().equals (DocHelper.NO_NORMS_TEXT));
+
+    fields = doc.getFields(DocHelper.TEXT_FIELD_3_KEY);
+    assertTrue(fields != null && fields.length == 1);
+    assertTrue(fields[0].stringValue().equals(DocHelper.FIELD_3_TEXT));
+
+    // test that the norm file is not present if omitNorms is true
+    for (int i = 0; i < reader.fieldInfos.size(); i++) {
+      FieldInfo fi = reader.fieldInfos.fieldInfo(i);
+      if (fi.isIndexed) {
+ assertTrue(fi.omitNorms == !dir.fileExists(segName + ".f" + i));
       }
+    }
-    } catch (IOException e) {
-      e.printStackTrace();
-      assertTrue(false);
+ TermPositions termPositions = reader.termPositions(new Term (DocHelper.REPEATED_KEY, "repeated"));
+    assertTrue(termPositions.next());
+    int freq = termPositions.freq();
+    for (int i=0; i < freq; i++) {
+      int pos = termPositions.nextPosition();
+      System.out.println("pos = " + pos);
     }
   }
}
Index: src/java/org/apache/lucene/index/DocumentWriter.java
===================================================================
--- src/java/org/apache/lucene/index/DocumentWriter.java (revision 345677)
+++ src/java/org/apache/lucene/index/DocumentWriter.java (working copy)
@@ -134,7 +134,7 @@
       int fieldNumber = fieldInfos.fieldNumber(fieldName);
       int length = fieldLengths[fieldNumber];     // length of field
-      int position = fieldPositions[fieldNumber]; // position in field
+ int position = fieldPositions[fieldNumber] + field.getPositionOffset(); // position in field
       int offset = fieldOffsets[fieldNumber];       // offset field
       if (field.isIndexed()) {
Index: src/java/org/apache/lucene/document/Field.java
===================================================================
--- src/java/org/apache/lucene/document/Field.java (revision 345677)
+++ src/java/org/apache/lucene/document/Field.java      (working copy)
@@ -50,6 +50,8 @@
   private boolean isCompressed = false;

   private float boost = 1.0f;
+
+  private int positionOffset = 0;

public static final class Store extends Parameter implements Serializable {

@@ -179,6 +181,14 @@
     return boost;
   }
+  public void setPositionOffset(int offset) {
+    positionOffset = offset;
+  }
+
+  public int getPositionOffset() {
+    return positionOffset;
+  }
+
/** Constructs a String-valued Field that is not tokenized, but is indexed
     and stored.  Useful for non-text fields, e.g. date or url.
@deprecated use {@link #Field(String, String, Field.Store, Field.Index)

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Reply via email to