RE: Advanced NearSpanQuery

Jeroen Lauwers Fri, 15 Jul 2011 08:12:24 -0700

Hi Mike and Simon,

Thanks again for your help, but I've created my own solution by writing a 
custom span query.
Now, I can perform searches where some of the terms that I supply in the query 
can be missing from the result.
This way it allows for a slop at the query side AND on the result side.


In case you are interested, here's the code: It's C# and not Java, I know, but 
hey, look at it as pseudo-code.

---------------------------
using System;

using IndexReader = Lucene.Net.Index.IndexReader;
using Query = Lucene.Net.Search.Query;
using ToStringUtils = Lucene.Net.Util.ToStringUtils;

namespace Lucene.Net.Search.Spans
{
    [Serializable]
    public class SpanNearWithMinRequiredSubSpanHitsQuery : SpanQuery
    {
        private System.Collections.ArrayList clauses;
        private int slop;
        private bool inOrder;
        private int minRequiredTerms;

        private System.String field;


        public SpanNearWithMinRequiredSubSpanHitsQuery(SpanQuery[] clauses, int 
slop, bool inOrder, int minRequiredSubSpanHits)
        {

            // copy clauses array into an ArrayList
            this.clauses = new System.Collections.ArrayList(clauses.Length);
            for (int i = 0; i < clauses.Length; i++)
            {
                SpanQuery clause = clauses[i];
                if (i == 0)
                {
                    // check field
                    field = clause.GetField();
                }
                else if (!clause.GetField().Equals(field))
                {
                    throw new System.ArgumentException("Clauses must have same 
field.");
                }
                this.clauses.Add(clause);
            }

            this.slop = slop;
            this.inOrder = inOrder;
            this.minRequiredTerms = minRequiredSubSpanHits;
        }

        /// <summary>Return the clauses whose spans are matched. </summary>
        public virtual SpanQuery[] GetClauses()
        {
            return (SpanQuery[])clauses.ToArray(typeof(SpanQuery));
        }

        /// <summary>Return the maximum number of unmatched terms 
permitted.</summary>
        public virtual int GetMinRequiredTerms()
        {
            return minRequiredTerms;
        }

        /// <summary>Return the maximum number of intervening unmatched 
positions permitted.</summary>
        public virtual int GetSlop()
        {
            return slop;
        }

        /// <summary>Return true if matches are required to be 
in-order.</summary>
        public virtual bool IsInOrder()
        {
            return inOrder;
        }

        public override System.String GetField()
        {
            return field;
        }


        public override System.Collections.ICollection GetTerms()
        {
            System.Collections.ArrayList terms = new 
System.Collections.ArrayList();
            System.Collections.IEnumerator i = clauses.GetEnumerator();
            while (i.MoveNext())
            {
                SpanQuery clause = (SpanQuery)i.Current;
                terms.AddRange(clause.GetTerms());
            }
            return terms;
        }

        public override void ExtractTerms(System.Collections.Hashtable terms)
        {
            System.Collections.IEnumerator i = clauses.GetEnumerator();
            while (i.MoveNext())
            {
                SpanQuery clause = (SpanQuery)i.Current;
                clause.ExtractTerms(terms);
            }
        }


        public override System.String ToString(System.String field)
        {
            System.Text.StringBuilder buffer = new System.Text.StringBuilder();
            buffer.Append("spanNear([");
            System.Collections.IEnumerator i = clauses.GetEnumerator();
            while (i.MoveNext())
            {
                SpanQuery clause = (SpanQuery)i.Current;
                buffer.Append(clause.ToString(field));
                buffer.Append(", ");
            }
            if (clauses.Count > 0) buffer.Length -= 2;
            buffer.Append("], ");
            buffer.Append(slop);
            buffer.Append(", ");
            buffer.Append(inOrder);
            buffer.Append(")");
            buffer.Append(ToStringUtils.Boost(GetBoost()));
            return buffer.ToString();
        }

        public override Spans GetSpans(IndexReader reader)
        {
            if (clauses.Count == 0)
                // optimize 0-clause case
                return new SpanOrQuery(GetClauses()).GetSpans(reader);

            if (clauses.Count == 1)
                // optimize 1-clause case
                return ((SpanQuery)clauses[0]).GetSpans(reader);

            return inOrder ? (Spans)new 
NearSpansWithMinRequiredSubSpanHitsOrdered(this, reader) : (Spans)new 
NearSpansWithMinRequiredSubSpanHitsUnordered(this, reader);
        }

        public override Query Rewrite(IndexReader reader)
        {
            SpanNearWithMinRequiredSubSpanHitsQuery clone = null;
            for (int i = 0; i < clauses.Count; i++)
            {
                SpanQuery c = (SpanQuery)clauses[i];
                SpanQuery query = (SpanQuery)c.Rewrite(reader);
                if (query != c)
                {
                    // clause rewrote: must clone
                    if (clone == null)
                        clone = 
(SpanNearWithMinRequiredSubSpanHitsQuery)this.Clone();
                    clone.clauses[i] = query;
                }
            }
            if (clone != null)
            {
                return clone; // some clauses rewrote
            }
            else
            {
                return this; // no clauses rewrote
            }
        }

        /// <summary>Returns true iff <code>o</code> is equal to this. 
</summary>
        public override bool Equals(System.Object o)
        {
            if (this == o)
                return true;
            if (!(o is SpanNearWithMinRequiredSubSpanHitsQuery))
                return false;

            SpanNearWithMinRequiredSubSpanHitsQuery spanNearQuery = 
(SpanNearWithMinRequiredSubSpanHitsQuery)o;

            if (inOrder != spanNearQuery.inOrder)
                return false;
            if (slop != spanNearQuery.slop)
                return false;
            if (clauses.Count != spanNearQuery.clauses.Count)
                return false;
            System.Collections.IEnumerator iter1 = clauses.GetEnumerator();
            System.Collections.IEnumerator iter2 = 
spanNearQuery.clauses.GetEnumerator();
            while (iter1.MoveNext() && iter2.MoveNext())
            {
                SpanNearWithMinRequiredSubSpanHitsQuery item1 = 
(SpanNearWithMinRequiredSubSpanHitsQuery)iter1.Current;
                SpanNearWithMinRequiredSubSpanHitsQuery item2 = 
(SpanNearWithMinRequiredSubSpanHitsQuery)iter2.Current;
                if (!item1.Equals(item2))
                    return false;
            }

            return GetBoost() == spanNearQuery.GetBoost();
        }

        public override int GetHashCode()
        {
            long result;
            result = clauses.GetHashCode();
            // Mix bits before folding in things like boost, since it could 
cancel the
            // last element of clauses.  This particular mix also serves to
            // differentiate SpanNearQuery hashcodes from others.
            result ^= ((result << 14) | (result >> 19)); // reversible
            result += System.Convert.ToInt32(GetBoost());
            result += slop;
            result ^= (inOrder ? (long)0x99AFD3BD : 0);
            return (int)result;
        }
    }
}

--------------------------------------------------------------
using System;
using System.Collections.Generic;

using IndexReader = Lucene.Net.Index.IndexReader;

namespace Lucene.Net.Search.Spans
{
    class NearSpansWithMinRequiredSubSpanHitsOrdered : Spans
    {
        private int allowedSlop;
        private bool firstTime = true;
        private bool more = true;

        /// <summary>The spans in the same order as the SpanNearQuery </summary>
        private Spans[] subSpans;

        private int matchDoc = -1;
        private int matchStart = -1;
        private int matchEnd = -1;

        private SpanNearWithMinRequiredSubSpanHitsQuery query;

        public 
NearSpansWithMinRequiredSubSpanHitsOrdered(SpanNearWithMinRequiredSubSpanHitsQuery
 spanNearWithTermSkipQuery, IndexReader reader)
        {
            if (spanNearWithTermSkipQuery.GetClauses().Length < 2)
            {
                throw new System.ArgumentException("Less than 2 clauses: " + 
spanNearWithTermSkipQuery);
            }
            allowedSlop = spanNearWithTermSkipQuery.GetSlop();

            // get sub spans
            SpanQuery[] clauses = spanNearWithTermSkipQuery.GetClauses();
            subSpans = new Spans[clauses.Length];
            for (int i = 0; i < clauses.Length; i++)
            {
                subSpans[i] = clauses[i].GetSpans(reader);
            }

            query = spanNearWithTermSkipQuery; // kept for toString() only.

        }

        // inherit javadocs
        public virtual int Doc()
        {
            return matchDoc;
        }

        // inherit javadocs
        public virtual int Start()
        {
            return matchStart;
        }

        // inherit javadocs
        public virtual int End()
        {
            return matchEnd;
        }

        // inherit javadocs
        public virtual bool Next()
        {
            if (firstTime)
            {
                firstTime = false;
                // read all sub spans
                for (int i = 0; i < subSpans.Length; i++)
                {
                    if (!subSpans[i].Next()) // read
                        subSpans[i] = null; // set null when EOF
                }
            }
            else if (more)
            {
                // read from the first still active sub span
                bool atLeastOneRead = false;
                for (int i = 0; i < subSpans.Length; i++)
                {
                    if (subSpans[i] != null)
                        if (subSpans[i].Next()) // read
                        {
                            atLeastOneRead = true;
                            break;
                        }
                        else
                            subSpans[i] = null; // set null when EOF
                }
                if (!atLeastOneRead) // if nothing read => quit
                {
                    more = false;
                    return false;
                }
            }
            else
                return false;

            more = AdvanceToNext(); // make shure all hits of the sub spans are 
in the same doc, in order and slop is OK

            return more;
        }

        // inherit javadocs
        public virtual bool SkipTo(int target)
        {
            if (firstTime)
            {
                firstTime = false;
                for (int i = 0; i < subSpans.Length; i++)
                {
                    if (!subSpans[i].SkipTo(target))
                        subSpans[i] = null;
                }
            }
            else if (more)
            {
                bool atLeastOneRead = false;
                for (int i = 0; i < subSpans.Length; i++)
                {
                    if (subSpans[i] != null)
                    {
                        if (subSpans[0].Doc() < target)
                        {
                            if (subSpans[i].SkipTo(target))
                            {
                                atLeastOneRead = true;
                                break;
                            }
                            else
                                subSpans[i] = null;
                        }
                        else
                            return true;
                    }
                }
                if (!atLeastOneRead)
                {
                    more = false;
                    return false;
                }
            }
            else
                return false;

            more = AdvanceToNext();

            return more;
        }

        private bool AdvanceToNext()
        {
            while (true)
            {
                // get doc id's from sub spans
                int[] docIds = new int[subSpans.Length];
                for (int i = 0; i < subSpans.Length; i++)
                {
                    if (subSpans[i] == null)
                        docIds[i] = int.MaxValue;
                    else
                        docIds[i] = subSpans[i].Doc();
                }

                // Sort doc ids
                Array.Sort(docIds);

                // get docid of 'GetMinRequiredTerms'-th item
                int docIdOfXthspan = docIds[query.GetMinRequiredTerms() - 1];

                // check too many missing
                if (docIdOfXthspan == int.MaxValue)
                {
                    more = false;
                    return false;
                }

                // we need at least ... items in the same doc, so the docId of 
the ...th item should be equal to the doc id of the first item.
                // if not, all items before the ...th can be moved forward to 
the doc id of the ...th item
                if (docIdOfXthspan != docIds[0])
                {
                    for (int i = 0; i < subSpans.Length; i++)
                    {
                        if (subSpans[i] != null)
                            if (subSpans[i].Doc() < docIdOfXthspan)
                                if (!subSpans[i].SkipTo(docIdOfXthspan))
                                    subSpans[i] = null;
                    }
                    continue;
                }

                //// get number of subspans in lowest doc id
                List<int> matchingSpans = new List<int>();
                for (int i = 0; i < subSpans.Length; i++)
                {
                    if (subSpans[i] != null && subSpans[i].Doc() == 
docIdOfXthspan)
                        matchingSpans.Add(i);
                }

                // check if spans are in order within the document
                bool extraReadNeeded = false;
                int tmpMatchDoc = subSpans[matchingSpans[0]].Doc();
                for(int i = 1; i < matchingSpans.Count; i++)
                {
                    if  (!DocSpansOrdered(subSpans[matchingSpans[i - 1]], 
subSpans[matchingSpans[i]]))
                    {
                        if (!subSpans[matchingSpans[i]].Next())
                            subSpans[matchingSpans[i]] = null;

                        extraReadNeeded = true;
                        break;
                    }
                }

                // check if a read was needed, if so restart from beginning 
(optimization possible)
                if (extraReadNeeded)
                {
                    matchDoc = -1;
                    continue;
                }

                // check slop
                matchStart = subSpans[matchingSpans[0]].Start();
                matchEnd = subSpans[matchingSpans[matchingSpans.Count-1]].End();
                int matchSlop = matchEnd - matchStart - 
(query.GetMinRequiredTerms() - 1);
                if (matchSlop > allowedSlop)
                {
                    if(!subSpans[matchingSpans[0]].Next())
                        subSpans[matchingSpans[0]] = null;
                    continue;
                }

                //all checks pass
                matchDoc = tmpMatchDoc;
                break;
            }

            return true;
        }

        internal static bool DocSpansOrdered(Spans spans1, Spans spans2)
        {
            if (spans1.Doc() == int.MaxValue || spans2.Doc() == int.MaxValue)
                return true;
            System.Diagnostics.Debug.Assert(spans1.Doc() == spans2.Doc(), "doc1 
" + spans1.Doc() + " != doc2 " + spans2.Doc());
            int start1 = spans1.Start();
            int start2 = spans2.Start();
            /* Do not call DocSpansOrdered(int,int,int,int) to avoid invoking 
.end() : */
            return (start1 == start2) ? (spans1.End() < spans2.End()) : (start1 
< start2);
        }

         public override System.String ToString()
        {
            return GetType().FullName + "(" + query.ToString() + ")@" + 
(firstTime ? "START" : (more ? (Doc() + ":" + Start() + "-" + End()) : "END"));
        }
    }
}

----------------------------------------------------

using System;
using System.Collections.Generic;

using IndexReader = Lucene.Net.Index.IndexReader;

namespace Lucene.Net.Search.Spans
{
    class NearSpansWithMinRequiredSubSpanHitsUnordered
    {
        private int allowedSlop;
        private bool firstTime = true;
        private bool more = true;

        /// <summary>The spans in the same order as the SpanNearQuery </summary>
        private Spans[] subSpans;

        private int matchDoc = -1;
        private int matchStart = -1;
        private int matchEnd = -1;

        private SpanNearWithMinRequiredSubSpanHitsQuery query;

        public 
NearSpansWithMinRequiredSubSpanHitsUnordered(SpanNearWithMinRequiredSubSpanHitsQuery
 spanNearWithTermSkipQuery, IndexReader reader)
        {
            if (spanNearWithTermSkipQuery.GetClauses().Length < 2)
            {
                throw new System.ArgumentException("Less than 2 clauses: " + 
spanNearWithTermSkipQuery);
            }
            allowedSlop = spanNearWithTermSkipQuery.GetSlop();

            // get sub spans
            SpanQuery[] clauses = spanNearWithTermSkipQuery.GetClauses();
            subSpans = new Spans[clauses.Length];
            for (int i = 0; i < clauses.Length; i++)
            {
                subSpans[i] = clauses[i].GetSpans(reader);
            }

            query = spanNearWithTermSkipQuery; // kept for toString() only.

        }

        // inherit javadocs
        public virtual int Doc()
        {
            return matchDoc;
        }

        // inherit javadocs
        public virtual int Start()
        {
            return matchStart;
        }

        // inherit javadocs
        public virtual int End()
        {
            return matchEnd;
        }

        // inherit javadocs
        public virtual bool Next()
        {
            if (firstTime)
            {
                firstTime = false;
                // read all sub spans
                for (int i = 0; i < subSpans.Length; i++)
                {
                    if (!subSpans[i].Next()) // read
                        subSpans[i] = null; // set null when EOF
                }
            }
            else if (more)
            {
                // read from the first still active sub span
                bool atLeastOneRead = false;
                for (int i = 0; i < subSpans.Length; i++)
                {
                    if (subSpans[i] != null)
                        if (subSpans[i].Next()) // read
                        {
                            atLeastOneRead = true;
                            break;
                        }
                        else
                            subSpans[i] = null; // set null when EOF
                }
                if (!atLeastOneRead) // if nothing read => quit
                {
                    more = false;
                    return false;
                }
            }
            else
                return false;

            more = AdvanceToNext(); // make shure all hits of the sub spans are 
in the same doc, in order and slop is OK

            return more;
        }

        // inherit javadocs
        public virtual bool SkipTo(int target)
        {
            if (firstTime)
            {
                firstTime = false;
                for (int i = 0; i < subSpans.Length; i++)
                {
                    if (!subSpans[i].SkipTo(target))
                        subSpans[i] = null;
                }
            }
            else if (more)
            {
                bool atLeastOneRead = false;
                for (int i = 0; i < subSpans.Length; i++)
                {
                    if (subSpans[i] != null)
                    {
                        if (subSpans[0].Doc() < target)
                        {
                            if (subSpans[i].SkipTo(target))
                            {
                                atLeastOneRead = true;
                                break;
                            }
                            else
                                subSpans[i] = null;
                        }
                        else
                            return true;
                    }
                }
                if (!atLeastOneRead)
                {
                    more = false;
                    return false;
                }
            }
            else
                return false;

            more = AdvanceToNext();

            return more;
        }

        private bool AdvanceToNext()
        {
            while (true)
            {
                // get doc id's from sub spans
                int[] docIds = new int[subSpans.Length];
                for (int i = 0; i < subSpans.Length; i++)
                {
                    if (subSpans[i] == null)
                        docIds[i] = int.MaxValue;
                    else
                        docIds[i] = subSpans[i].Doc();
                }

                // Sort doc ids
                Array.Sort(docIds);

                // get docid of 'GetMinRequiredTerms'-th item
                int docIdOfXthspan = docIds[query.GetMinRequiredTerms() - 1];

                // check too many missing
                if (docIdOfXthspan == int.MaxValue)
                {
                    more = false;
                    return false;
                }

                // we need at least ... items in the same doc, so the docId of 
the ...th item should be equal to the doc id of the first item.
                // if not, all items before the ...th can be moved forward to 
the doc id of the ...th item
                if (docIdOfXthspan != docIds[0])
                {
                    for (int i = 0; i < subSpans.Length; i++)
                    {
                        if (subSpans[i] != null)
                            if (subSpans[i].Doc() < docIdOfXthspan)
                                if (!subSpans[i].SkipTo(docIdOfXthspan))
                                    subSpans[i] = null;
                    }
                    continue;
                }

                //// get number of subspans in lowest doc id
                List<int> matchingSpans = new List<int>();
                for (int i = 0; i < subSpans.Length; i++)
                {
                    if (subSpans[i] != null && subSpans[i].Doc() == 
docIdOfXthspan)
                        matchingSpans.Add(i);
                }

                int tmpMatchDoc = subSpans[matchingSpans[0]].Doc();

                // check slop
                matchStart = subSpans[matchingSpans[0]].Start();
                matchEnd = subSpans[matchingSpans[matchingSpans.Count-1]].End();
                int matchSlop = matchEnd - matchStart - 
(query.GetMinRequiredTerms() - 1);
                if (matchSlop > allowedSlop)
                {
                    if(!subSpans[matchingSpans[0]].Next())
                        subSpans[matchingSpans[0]] = null;
                    continue;
                }

                //all checks pass
                matchDoc = tmpMatchDoc;
                break;
            }

            return true;
        }

        internal static bool DocSpansOrdered(Spans spans1, Spans spans2)
        {
            if (spans1.Doc() == int.MaxValue || spans2.Doc() == int.MaxValue)
                return true;
            System.Diagnostics.Debug.Assert(spans1.Doc() == spans2.Doc(), "doc1 
" + spans1.Doc() + " != doc2 " + spans2.Doc());
            int start1 = spans1.Start();
            int start2 = spans2.Start();
            /* Do not call DocSpansOrdered(int,int,int,int) to avoid invoking 
.end() : */
            return (start1 == start2) ? (spans1.End() < spans2.End()) : (start1 
< start2);
        }

         public override System.String ToString()
        {
            return GetType().FullName + "(" + query.ToString() + ")@" + 
(firstTime ? "START" : (more ? (Doc() + ":" + Start() + "-" + End()) : "END"));
        }
    }
}



-----Original Message-----
From: Mike Sokolov [mailto:[email protected]]
Sent: woensdag 13 juli 2011 23:02
To: [email protected]
Cc: Simon Willnauer
Subject: Re: Advanced NearSpanQuery

Sorry for the misdirection ...

On 07/13/2011 11:37 AM, Simon Willnauer wrote:
> I don't think this is possible with spans today. Once
> https://issues.apache.org/jira/browse/LUCENE-2878 is due this should
> be possible with a boolean query I think.
>
> to work around this you need to write a SpanOR query with a
> minShouldMatch functionality though.
>
> simon
>
> On Wed, Jul 13, 2011 at 5:09 PM, Jeroen Lauwers<[email protected]>  
> wrote:
>
>> Hi Mike,
>>
>> Thanks for your quick reply, but do not seem to find any documentation on 
>> "DisjunctionSumQuery" and I'm not familiar with that concept.
>>
>> Could you point me in the right direction?
>>
>> Jeroen
>>
>> -----Original Message-----
>> From: Mike Sokolov [mailto:[email protected]]
>> Sent: woensdag 13 juli 2011 15:23
>> To: [email protected]
>> Cc: Jeroen Lauwers
>> Subject: Re: Advanced NearSpanQuery
>>
>> Can you wrap a SpanNearQuery around an DisjunctionSumQuery with 
>> minNrShouldMatch=8?
>>
>> -Mike
>>
>> On 07/13/2011 08:53 AM, Jeroen Lauwers wrote:
>>
>>> Hi,
>>>
>>> I was wondering if anyone could help me on this:
>>>
>>> I want to search for:
>>>
>>> 1.       a set of words (eg. 10)
>>>
>>> 2.       only a couple of words may come in between (eg. 3) in the result 
>>> document
>>>
>>> 3.       of the supplied set of (10) words, at least 8 must be present (or 
>>> in other words: 2 of the supplied words can be missing)
>>>
>>> I use the SpanNearQuery for (1.) and (2.), but it is the third part that's 
>>> lacking.
>>>
>>> Any ideas?
>>>
>>> Jeroen
>>>
>>>
>>>
>> ---------------------------------------------------------------------
>> To unsubscribe, e-mail: [email protected]
>> For additional commands, e-mail: [email protected]
>>
>>
>>
>>
>> ---------------------------------------------------------------------
>> To unsubscribe, e-mail: [email protected]
>> For additional commands, e-mail: [email protected]
>>
>>
>>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: [email protected]
> For additional commands, e-mail: [email protected]
>
>

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

RE: Advanced NearSpanQuery

Reply via email to