Re: Possible bug in SpanNearQuery

Moti Nisenson Mon, 07 May 2007 00:44:55 -0700

Paul,

The comment should be moved up into SpanNearQuery itself (as opposed to the
comments in the package-private implementation classes). Even so, that
comment is inaccurate regarding overlap: only "exact" overlap is handled.
Here are some additional tests for SpanNearQuery. They all fail except for
testNotExactOverlapInOrder, testTermOverlapStartInOrder and
testTermOverlapEndInOrder. (Note that the failures for the NotInOrder case
may be all right; there is no documentation indicating the desired behavior.)



import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

import junit.framework.TestCase;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.store.RAMDirectory;

public class SpanNearQueryTest extends TestCase {

   private RAMDirectory dir;

   @Override
   protected void setUp() throws Exception {
       super.setUp();
       dir = new RAMDirectory();
       Document doc = new Document();
       doc.add(new Field("field", new StringReader("one two two three four
five")));
       IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer());
       writer.addDocument(doc);
       writer.close();
   }

   public void testNearQueryInOrder() throws Exception {
       checkNearQuery(true);
   }

   public void testNearQueryNotInOrder() throws Exception {
       checkNearQuery(false);
   }

   private void checkNearQuery(boolean inOrder) throws Exception {
       SpanNearQuery query = buildQuery(5, inOrder, "one", "two");

       IndexReader reader = IndexReader.open(dir);
       Spans spans = query.getSpans(reader);

       int numSpans = countSpans(spans);

       reader.close();

       assertEquals(2, numSpans);
   }

   private int countSpans(Spans spans) throws IOException {
       int numSpans = 0;
       while (spans.next())
           numSpans++;
       return numSpans;
   }

   public void testMinimalSpanInOrder() throws Exception {
       checkMinimalSpan(true);
   }

   public void testMinimalSpanNotInOrder() throws Exception {
       checkMinimalSpan(false);
   }

   private void checkMinimalSpan(boolean inOrder) throws Exception {
       SpanNearQuery query = buildQuery(5, inOrder, "two", "three");

       IndexReader reader = IndexReader.open(dir);
       Spans spans = query.getSpans(reader);

       boolean firstSpan = true;
       int firstSlop = -1;
       int numSpans = 0;
       while (spans.next()) {
           numSpans++;
           if (firstSpan) {
               firstSlop = spans.end() - spans.start();
               firstSpan = false;
           }
       }

       reader.close();

       assertEquals(1, numSpans);
       assertEquals(1, firstSlop);
   }


   public void testNotContainingStartInOrder() throws Exception {
       checkNotContainingStart(true);
   }

   public void testNotContainingStartNotInOrder() throws Exception {
       checkNotContainingStart(false);
   }

   public void testNotContainingEndInOrder() throws Exception {
       checkNotContainingEnd(true);
   }

   public void testNotContainingEndNotInOrder() throws Exception {
       checkNotContainingEnd(false);
   }

   public void testNotOverlappingInOrder() throws Exception {
       checkNotOverlapping(true);
   }

   public void testNotOverlappingNotInOrder() throws Exception {
       checkNotOverlapping(false);
   }

   public void testNotExactOverlapInOrder() throws Exception {
       checkNotExactOverlap(true);
   }

   public void testNotExactOverlapNotInOrder() throws Exception {
       checkNotExactOverlap(false);
   }


   private void checkNotContainingEnd(boolean inOrder) throws Exception {
       SpanNearQuery query1 = buildQuery(5, inOrder, "one", "three");
       SpanNearQuery query2 = buildQuery(5, inOrder, "two", "three");

       SpanNearQuery query = new SpanNearQuery(new SpanQuery[] {query1,
query2}, 5, inOrder);

       IndexReader reader = IndexReader.open(dir);
       Spans spans = query.getSpans(reader);

       int numSpans = countSpans(spans);

       reader.close();

       assertEquals(0, numSpans);
   }

   private void checkNotContainingStart(boolean inOrder) throws Exception {
       SpanNearQuery query1 = buildQuery(5, inOrder, "three", "four");
       SpanNearQuery query2 = buildQuery(5, inOrder, "three", "five");

       SpanNearQuery query = new SpanNearQuery(new SpanQuery[] {query1,
query2}, 5, inOrder);

       IndexReader reader = IndexReader.open(dir);
       Spans spans = query.getSpans(reader);

       int numSpans = countSpans(spans);

       reader.close();

       assertEquals(0, numSpans);
   }

   private void checkNotOverlapping(boolean inOrder) throws Exception {
       SpanNearQuery query1 = buildQuery(5, inOrder, "three", "four");
       SpanNearQuery query2 = buildQuery(5, inOrder, "four", "five");

       SpanNearQuery query = new SpanNearQuery(new SpanQuery[] {query1,
query2}, 5, inOrder);

       IndexReader reader = IndexReader.open(dir);
       Spans spans = query.getSpans(reader);

       int numSpans = countSpans(spans);

       reader.close();

       assertEquals(0, numSpans);
   }

   private void checkNotExactOverlap(boolean inOrder) throws Exception {
       SpanNearQuery query1 = buildQuery(5, inOrder, "three", "four");
       SpanNearQuery query2 = buildQuery(5, inOrder, "three", "four");

       SpanNearQuery query = new SpanNearQuery(new SpanQuery[] {query1,
query2}, 5, inOrder);

       IndexReader reader = IndexReader.open(dir);
       Spans spans = query.getSpans(reader);

       int numSpans = countSpans(spans);

       reader.close();

       assertEquals(0, numSpans);
   }


   // for these two tests w2 has the same position as w1
   public void testTermOverlapStartInOrder() throws Exception {
       checkTermOverlap("w2", true, "w1", "w2", "w5");
   }
   public void testTermOverlapStartNotInOrder() throws Exception {
       checkTermOverlap("w2", false, "w1", "w2", "w5");
   }

   // for these two tests w5 has the same position as w4
   public void testTermOverlapEndInOrder() throws Exception {
       checkTermOverlap("w5", true, "w1", "w4", "w5");
   }
   public void testTermOverlapEndNotInOrder() throws Exception {
       checkTermOverlap("w5", false, "w1", "w4", "w5");
   }


   private void checkTermOverlap(String term, boolean inOrder, String...
queryTerms) throws Exception {
       RAMDirectory tempDir = new RAMDirectory();
       Document doc = new Document();
       doc.add(new Field("field", new StringReader("w1 w2 w3 w1 w4 w5")));
       IndexWriter writer = new IndexWriter(tempDir,
getPositionAnalyzer(term));
       writer.addDocument(doc);
       writer.close();

       SpanNearQuery query = buildQuery(7, inOrder, queryTerms);

       IndexReader reader = IndexReader.open(tempDir);
       Spans spans = query.getSpans(reader);

       int numSpans = countSpans(spans);

       reader.close();

       assertEquals(0, numSpans);
   }


   private Analyzer getPositionAnalyzer(final String term) {
       return new Analyzer() {

           @Override
           public TokenStream tokenStream(String fieldName, Reader reader)
{
               return new TokenFilter(new
StandardAnalyzer().tokenStream(fieldName, reader)) {

                   @Override
                   public Token next() throws IOException {
                       Token result = input.next();

                       if (result != null && result.termText
().equals(term))
                           result.setPositionIncrement(0);

                       return result;
                   }

               };
           }

       };
   }

   private SpanNearQuery buildQuery(int slop, boolean inOrder, String...
terms) {
       SpanQuery[] termQueries = new SpanQuery[terms.length];
       for (int i = 0; i < termQueries.length; i++)
           termQueries[i] = new SpanTermQuery(new Term("field", terms[i]));

       return new SpanNearQuery(termQueries, slop, inOrder);
   }


   @Override
   protected void tearDown() throws Exception {
       dir = null; // release directory
       super.tearDown();
   }



}

On 5/6/07, Paul Elschot <[EMAIL PROTECTED]> wrote:


Moti,

I tried your test and it fails in the way you describe; however, I don't
think the test shows a bug.

Below is the javadoc comment for the package private class
NearSpansOrdered.
Would that be sufficient documentation for the ordered case?

/** A Spans that is formed from the ordered subspans of a SpanNearQuery
* where the subspans do not overlap and have a maximum slop between them.
* <p>
* The formed spans only contains minimum slop matches.<br>
* The matching slop is computed from the distance(s) between
* the non overlapping matching Spans.<br>
* Successive matches are always formed from the successive Spans
* of the SpanNearQuery.
* <p>
* The formed spans may contain overlaps when the slop is at least 1.
* For example, when querying using
* <pre>t1 t2 t3</pre>
* with slop at least 1, the fragment:
* <pre>t1 t2 t1 t3 t2 t3</pre>
* matches twice:
* <pre>t1 t2 .. t3      </pre>
* <pre>      t1 .. t2 t3</pre>
*/

Unfortunately for the unordered case in NearSpansUnordered.java there is
no
class comment available in the code.

You can take a look at the existing span tests here:

http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/search/spans


Regards,
Paul Elschot



On Sunday 06 May 2007 16:11, Moti Nisenson wrote:
> Looking over the implementation of SpanNearQuery I came upon what looked
> like a bug. Below is a test which fails due to it. SpanNearQuery doesn't
> return all matching spans; once it's found a span it always increments
the
> span of the clause appearing first in that span (ie. in the example
below
> the two spans should be "one two" and "one two two" where the second has
a
> slop of 1 - unfortunately the span of "one" gets incremented after "one
two"
> is found and so no additional spans get returned). Both in-order and
> out-of-order SpanNearQueries fail this test.
>
> I  think this is an undocumented feature and that the assumption is that
if
> someone searches for "one" near "two"  they're interested in the "one
two"
> result and not necessarily the "one two two" result. However,
> SpanNearQueries can be combined and by not returning all matching spans
this
> can result in problems. For example were we to intersect (ie.
SpanNearQuery
> with 0 slop) between the results of different SpanNearQueries, it is
> possible that the shortest possible span won't intersect, while a longer
> span (with legal slop) would.
>
> In my mind this is a bug (at least until there is some documentation),
and I
> would expect there to be an option (either a boolean parameter or a
> different class) which would indeed return all spans which satisfy the
slop
> constraint.
>
> What I'd like to know is:
>
> 1) Is this a bug?
> 2) Is there any known workaround for this issue (besides rolling my own,
of
> course)?
> 3) Could this bug/feature lead to problems with document scoring?
>
> Thanks,
>
> Moti
>
>
>
> import java.io.StringReader;
>
> import junit.framework.TestCase;
>
> import org.apache.lucene.analysis.standard.StandardAnalyzer;
> import org.apache.lucene.document.Document;
> import org.apache.lucene.document.Field ;
> import org.apache.lucene.index.IndexReader;
> import org.apache.lucene.index.IndexWriter;
> import org.apache.lucene.index.Term;
> import org.apache.lucene.search.spans.SpanNearQuery;
> import org.apache.lucene.search.spans.SpanQuery ;
> import org.apache.lucene.search.spans.SpanTermQuery;
> import org.apache.lucene.search.spans.Spans;
> import org.apache.lucene.store.RAMDirectory;
>
> public class SpanNearQueryTest extends TestCase {
>
>     private RAMDirectory dir;
>
>     @Override
>     protected void setUp() throws Exception {
>         super.setUp();
>         dir = new RAMDirectory();
>         Document doc = new Document();
>         doc.add(new Field("field", new StringReader("one two two")));
>         IndexWriter writer = new IndexWriter(dir, new
StandardAnalyzer());
>         writer.addDocument(doc);
>         writer.close();
>     }
>
>     public void testNearQueryInOrder() throws Exception {
>         checkNearQuery(true);
>     }
>
>     public void testNearQueryNotInOrder() throws Exception {
>         checkNearQuery(false);
>     }
>
>     private void checkNearQuery(boolean inOrder) throws Exception {
>         SpanNearQuery query = new SpanNearQuery(new SpanQuery[]
>                     {new SpanTermQuery(new Term("field", "one")),
>                     new SpanTermQuery(new Term("field", "two"))}, 5,
> inOrder);
>
>         IndexReader reader = IndexReader.open(dir);
>         Spans spans = query.getSpans(reader);
>
>         int numSpans = 0;
>         while (spans.next())
>             numSpans++;
>
>         reader.close();
>
>         assertEquals(2, numSpans);
>     }
>
>
>     @Override
>     protected void tearDown() throws Exception {
>         dir = null; // release directory
>         super.tearDown();
>     }
>

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Re: Possible bug in SpanNearQuery

Reply via email to