Port ComplexPhraseQueryParser
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/2245f83e Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/2245f83e Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/2245f83e Branch: refs/heads/branch_4x Commit: 2245f83e43d7ebe12b2aac4e403aa9471264e1c5 Parents: c5e9e25 Author: Paul Irwin <[email protected]> Authored: Thu Oct 3 13:09:31 2013 -0400 Committer: Paul Irwin <[email protected]> Committed: Sat Oct 5 16:37:26 2013 -0400 ---------------------------------------------------------------------- .../QueryParsers/Classic/QueryParserBase.cs | 20 +- .../ComplexPhrase/ComplexPhraseQueryParser.cs | 395 +++++++++++++++++++ .../QueryParsers/Contrib.QueryParsers.csproj | 1 + 3 files changed, 406 insertions(+), 10 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2245f83e/src/contrib/QueryParsers/Classic/QueryParserBase.cs ---------------------------------------------------------------------- diff --git a/src/contrib/QueryParsers/Classic/QueryParserBase.cs b/src/contrib/QueryParsers/Classic/QueryParserBase.cs index 7e0a9fb..42520a1 100644 --- a/src/contrib/QueryParsers/Classic/QueryParserBase.cs +++ b/src/contrib/QueryParsers/Classic/QueryParserBase.cs @@ -88,7 +88,7 @@ namespace Lucene.Net.QueryParsers.Classic public abstract void ReInit(ICharStream stream); public abstract Query TopLevelQuery(String field); - public Query Parse(String query) + public virtual Query Parse(String query) { ReInit(new FastCharStream(new StringReader(query))); try @@ -615,46 +615,46 @@ namespace Lucene.Net.QueryParsers.Classic return NewRangeQuery(field, part1, part2, startInclusive, endInclusive); } - protected BooleanQuery NewBooleanQuery(bool disableCoord) + protected virtual BooleanQuery NewBooleanQuery(bool disableCoord) { return new BooleanQuery(disableCoord); } - protected BooleanClause 
NewBooleanClause(Query q, Occur occur) + protected virtual BooleanClause NewBooleanClause(Query q, Occur occur) { return new BooleanClause(q, occur); } - protected Query NewTermQuery(Term term) + protected virtual Query NewTermQuery(Term term) { return new TermQuery(term); } - protected PhraseQuery NewPhraseQuery() + protected virtual PhraseQuery NewPhraseQuery() { return new PhraseQuery(); } - protected MultiPhraseQuery NewMultiPhraseQuery() + protected virtual MultiPhraseQuery NewMultiPhraseQuery() { return new MultiPhraseQuery(); } - protected Query NewPrefixQuery(Term prefix) + protected virtual Query NewPrefixQuery(Term prefix) { PrefixQuery query = new PrefixQuery(prefix); query.SetRewriteMethod(multiTermRewriteMethod); return query; } - protected Query NewRegexpQuery(Term regexp) + protected virtual Query NewRegexpQuery(Term regexp) { RegexpQuery query = new RegexpQuery(regexp); query.SetRewriteMethod(multiTermRewriteMethod); return query; } - protected Query NewFuzzyQuery(Term term, float minimumSimilarity, int prefixLength) + protected virtual Query NewFuzzyQuery(Term term, float minimumSimilarity, int prefixLength) { // FuzzyQuery doesn't yet allow constant score rewrite String text = term.Text; @@ -714,7 +714,7 @@ namespace Lucene.Net.QueryParsers.Classic return BytesRef.DeepCopyOf(bytes); } - protected Query NewRangeQuery(String field, String part1, String part2, bool startInclusive, bool endInclusive) + protected virtual Query NewRangeQuery(String field, String part1, String part2, bool startInclusive, bool endInclusive) { BytesRef start; BytesRef end; http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2245f83e/src/contrib/QueryParsers/ComplexPhrase/ComplexPhraseQueryParser.cs ---------------------------------------------------------------------- diff --git a/src/contrib/QueryParsers/ComplexPhrase/ComplexPhraseQueryParser.cs b/src/contrib/QueryParsers/ComplexPhrase/ComplexPhraseQueryParser.cs new file mode 100644 index 0000000..fe5d893 --- /dev/null 
// Diff header carried over verbatim from the patch text this file was embedded in:
// +++ b/src/contrib/QueryParsers/ComplexPhrase/ComplexPhraseQueryParser.cs @@ -0,0 +1,395 @@
using Lucene.Net.Analysis;
using Lucene.Net.Index;
using Lucene.Net.QueryParsers.Classic;
using Lucene.Net.Search;
using Lucene.Net.Search.Spans;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Version = Lucene.Net.Util.Version;

namespace Lucene.Net.QueryParsers.ComplexPhrase
{
    /// <summary>
    /// A <see cref="QueryParser"/> that parses a query string in two passes.
    /// The first pass parses the top-level query and records every quoted phrase as a
    /// <see cref="ComplexPhraseQuery"/> placeholder; the second pass re-parses the text of
    /// each recorded phrase with this same parser so that the phrase contents can later be
    /// rewritten into span queries.
    /// </summary>
    /// <remarks>
    /// The parser mutates its own state while parsing (pass flag, current phrase, rewrite
    /// method), so a single instance must not be used for overlapping parses.
    /// </remarks>
    public class ComplexPhraseQueryParser : QueryParser
    {
        // Phrases discovered during the first pass; re-created at the start of every top-level Parse.
        private List<ComplexPhraseQuery> complexPhrases = null;

        // True only while the contents of the recorded phrases are being parsed (second pass).
        private bool isPass2ResolvingPhrases;

        // The phrase whose contents are currently being parsed in the second pass.
        private ComplexPhraseQuery currentPhraseQuery = null;

        /// <summary>
        /// Creates a parser; all arguments are forwarded unchanged to the base <see cref="QueryParser"/>.
        /// </summary>
        public ComplexPhraseQueryParser(Version matchVersion, String f, Analyzer a)
            : base(matchVersion, f, a)
        {
        }

        /// <summary>
        /// Wraps the phrase text in a <see cref="ComplexPhraseQuery"/> and records it so that its
        /// contents are parsed once the current pass has finished.
        /// </summary>
        protected override Query GetFieldQuery(string field, string queryText, int slop)
        {
            ComplexPhraseQuery cpq = new ComplexPhraseQuery(field, queryText, slop);
            // Remember the phrase; its contents are parsed after this pass is through.
            complexPhrases.Add(cpq);
            return cpq;
        }

        /// <summary>
        /// Parses <paramref name="query"/>. A top-level call runs the first pass and then parses the
        /// contents of every recorded phrase; a re-entrant call made during that second pass parses
        /// phrase contents with the scoring-boolean rewrite method temporarily in force.
        /// </summary>
        public override Query Parse(string query)
        {
            if (isPass2ResolvingPhrases)
            {
                MultiTermQuery.RewriteMethod oldMethod = this.MultiTermRewriteMethod;
                try
                {
                    // Temporarily force BooleanQuery rewrite so that the parser generates a
                    // visible collection of terms which can be converted into SpanQueries.
                    // ConstantScoreRewrite mode produces an opaque ConstantScoreQuery object
                    // which cannot be interrogated for terms in the same way a BooleanQuery can.
                    // QueryParser is not guaranteed threadsafe anyway, so this temporary state
                    // change should not present an issue.
                    this.MultiTermRewriteMethod = MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE;
                    return base.Parse(query);
                }
                finally
                {
                    this.MultiTermRewriteMethod = oldMethod;
                }
            }

            // First pass - parse the top-level query, recording any phrases which will need
            // to be resolved.
            complexPhrases = new List<ComplexPhraseQuery>();
            Query q = base.Parse(query);

            // Second pass - use this parser to parse the nested phrases with a different set
            // of syntax restrictions (i.e. all fields must be the same).
            isPass2ResolvingPhrases = true;
            try
            {
                foreach (ComplexPhraseQuery phrase in complexPhrases)
                {
                    // The field (not a loop-local of the same name) must track the phrase being
                    // resolved: CheckPhraseClauseIsForSameField reads it while the phrase
                    // contents are parsed. Declaring the loop variable with the field's name
                    // shadowed the field and left it null.
                    this.currentPhraseQuery = phrase;

                    // Parse the contents between the quotes as a separate parse operation.
                    phrase.ParsePhraseElements(this);
                }
            }
            finally
            {
                isPass2ResolvingPhrases = false;
            }
            return q;
        }

        /// <summary>
        /// Creates a term query; during the second pass the term's field is first checked against
        /// the field of the phrase being resolved.
        /// </summary>
        /// <exception cref="SystemException">
        /// Wraps the <see cref="ParseException"/> raised when the term targets a different field.
        /// </exception>
        protected override Query NewTermQuery(Term term)
        {
            if (isPass2ResolvingPhrases)
            {
                try
                {
                    CheckPhraseClauseIsForSameField(term.Field);
                }
                catch (ParseException pe)
                {
                    throw new SystemException("Error parsing complex phrase", pe);
                }
            }
            return base.NewTermQuery(term);
        }

        // Throws ParseException when a clause inside a phrase names a field other than the
        // field of the phrase currently being resolved.
        private void CheckPhraseClauseIsForSameField(string field)
        {
            if (!field.Equals(currentPhraseQuery.field))
            {
                throw new ParseException("Cannot have clause for field \"" + field
                    + "\" nested in phrase " + " for field \"" + currentPhraseQuery.field
                    + "\"");
            }
        }

        /// <summary>
        /// Delegates to the base parser after verifying, during the second pass, that the wildcard
        /// clause targets the field of the phrase being resolved.
        /// </summary>
        protected override Query GetWildcardQuery(string field, string termStr)
        {
            if (isPass2ResolvingPhrases)
            {
                CheckPhraseClauseIsForSameField(field);
            }
            return base.GetWildcardQuery(field, termStr);
        }

        /// <summary>
        /// Delegates to the base parser after verifying, during the second pass, that the range
        /// clause targets the field of the phrase being resolved.
        /// </summary>
        protected override Query GetRangeQuery(string field, string part1, string part2, bool startInclusive, bool endInclusive)
        {
            if (isPass2ResolvingPhrases)
            {
                CheckPhraseClauseIsForSameField(field);
            }
            return base.GetRangeQuery(field, part1, part2, startInclusive, endInclusive);
        }

        /// <summary>
        /// During the second pass builds a <see cref="TermRangeQuery"/> that uses the scoring-boolean
        /// rewrite method; otherwise defers to the base implementation.
        /// </summary>
        protected override Query NewRangeQuery(string field, string part1, string part2, bool startInclusive, bool endInclusive)
        {
            if (isPass2ResolvingPhrases)
            {
                // Must use old-style RangeQuery in order to produce a BooleanQuery
                // that can be turned into a SpanOr clause.
                TermRangeQuery rangeQuery = TermRangeQuery.NewStringRange(field, part1, part2, startInclusive, endInclusive);
                rangeQuery.SetRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
                return rangeQuery;
            }
            return base.NewRangeQuery(field, part1, part2, startInclusive, endInclusive);
        }

        /// <summary>
        /// Delegates to the base parser after verifying, during the second pass, that the fuzzy
        /// clause targets the field of the phrase being resolved.
        /// </summary>
        protected override Query GetFuzzyQuery(string field, string termStr, float minSimilarity)
        {
            if (isPass2ResolvingPhrases)
            {
                CheckPhraseClauseIsForSameField(field);
            }
            return base.GetFuzzyQuery(field, termStr, minSimilarity);
        }

        /// <summary>
        /// Placeholder query for a quoted phrase. It keeps the raw phrase text until
        /// <see cref="ParsePhraseElements"/> parses it, and <see cref="Rewrite"/> converts the
        /// parsed contents into span queries.
        /// </summary>
        public class ComplexPhraseQuery : Query
        {
            protected internal string field;

            protected internal string phrasedQueryStringContents;

            protected internal int slopFactor;

            // Parsed form of phrasedQueryStringContents; assigned by ParsePhraseElements.
            private Query contents;

            /// <summary>
            /// Stores the field, the raw text found between the quotes and the slop for later use.
            /// </summary>
            public ComplexPhraseQuery(string field, string phrasedQueryStringContents, int slopFactor)
                : base()
            {
                this.field = field;
                this.phrasedQueryStringContents = phrasedQueryStringContents;
                this.slopFactor = slopFactor;
            }

            /// <summary>
            /// Called by <see cref="ComplexPhraseQueryParser"/> for each phrase after the main parse
            /// is through; parses the phrase text with <paramref name="qp"/> and stores the result.
            /// </summary>
            protected internal void ParsePhraseElements(QueryParser qp)
            {
                // TODO ensure that field-sensitivity is preserved, i.e. the query string below
                // is parsed as field+":("+phrasedQueryStringContents+")" - but this will need
                // code in Rewrite to unwrap the first layer of boolean query.
                contents = qp.Parse(phrasedQueryStringContents);
            }

            /// <summary>
            /// Converts the parsed phrase contents into span queries: a lone term query is returned
            /// as is, a phrase without negative clauses becomes an ordered <see cref="SpanNearQuery"/>,
            /// and a phrase with negative clauses becomes a <see cref="SpanNotQuery"/>.
            /// </summary>
            /// <exception cref="ArgumentException">
            /// The contents, or one of the rewritten clauses, is neither a term query nor a
            /// boolean query.
            /// </exception>
            public override Query Rewrite(IndexReader reader)
            {
                if (contents is TermQuery)
                {
                    return contents;
                }

                // Build a sequence of Span clauses arranged in a SpanNear - child clauses can
                // be complex Booleans, e.g. nots and ors etc.
                int numNegatives = 0;
                if (!(contents is BooleanQuery))
                {
                    throw new ArgumentException("Unknown query type \""
                        + contents.GetType().Name
                        + "\" found in phrase query string \"" + phrasedQueryStringContents
                        + "\"");
                }
                BooleanQuery bq = (BooleanQuery)contents;
                BooleanClause[] bclauses = bq.Clauses;
                SpanQuery[] allSpanClauses = new SpanQuery[bclauses.Length];

                // For all clauses e.g. one* two~
                for (int i = 0; i < bclauses.Length; i++)
                {
                    // Rewrite this clause, e.g. one* becomes (one OR onerous)
                    Query qc = bclauses[i].Query.Rewrite(reader);
                    if (bclauses[i].Occur.Equals(Occur.MUST_NOT))
                    {
                        numNegatives++;
                    }

                    if (qc is BooleanQuery)
                    {
                        List<SpanQuery> sc = new List<SpanQuery>();
                        AddComplexPhraseClause(sc, (BooleanQuery)qc);
                        if (sc.Count > 0)
                        {
                            allSpanClauses[i] = sc[0];
                        }
                        else
                        {
                            // Insert fake term, e.g. the phrase query was for "Fred Smithe*"
                            // and there were no "Smithe*" terms - need to prevent a match on
                            // just "Fred".
                            allSpanClauses[i] = new SpanTermQuery(new Term(field,
                                "Dummy clause because no terms found - must match nothing"));
                        }
                    }
                    else if (qc is TermQuery)
                    {
                        TermQuery tq = (TermQuery)qc;
                        allSpanClauses[i] = new SpanTermQuery(tq.Term);
                    }
                    else
                    {
                        throw new ArgumentException("Unknown query type \""
                            + qc.GetType().Name
                            + "\" found in phrase query string \""
                            + phrasedQueryStringContents + "\"");
                    }
                }

                if (numNegatives == 0)
                {
                    // The simple case - no negative elements in phrase
                    return new SpanNearQuery(allSpanClauses, slopFactor, true);
                }

                // Complex case - we have mixed positives and negatives in the sequence.
                // Need to return a SpanNotQuery.
                List<SpanQuery> positiveClauses = new List<SpanQuery>();
                for (int j = 0; j < allSpanClauses.Length; j++)
                {
                    if (!bclauses[j].Occur.Equals(Occur.MUST_NOT))
                    {
                        positiveClauses.Add(allSpanClauses[j]);
                    }
                }

                SpanQuery[] includeClauses = positiveClauses.ToArray();

                SpanQuery include;
                if (includeClauses.Length == 1)
                {
                    include = includeClauses[0]; // only one positive clause
                }
                else
                {
                    // Need to increase slop factor based on gaps introduced by negatives.
                    include = new SpanNearQuery(includeClauses, slopFactor + numNegatives,
                        true);
                }

                // Use sequence of positive and negative values as the exclude.
                SpanNearQuery exclude = new SpanNearQuery(allSpanClauses, slopFactor,
                    true);
                return new SpanNotQuery(include, exclude);
            }

            // Flattens a rewritten boolean clause into span queries: non-negated term clauses are
            // OR-ed together, MUST_NOT clauses are collected separately and excluded through a
            // SpanNotQuery. Adds at most one query to spanClauses, and nothing when no non-negated
            // terms were found.
            private void AddComplexPhraseClause(IList<SpanQuery> spanClauses, BooleanQuery qc)
            {
                List<SpanQuery> ors = new List<SpanQuery>();
                List<SpanQuery> nots = new List<SpanQuery>();
                BooleanClause[] bclauses = qc.Clauses;

                // For all clauses e.g. one* two~
                for (int i = 0; i < bclauses.Length; i++)
                {
                    Query childQuery = bclauses[i].Query;

                    // Select the list to which we will add these options.
                    List<SpanQuery> chosenList = ors;
                    if (bclauses[i].Occur == Occur.MUST_NOT)
                    {
                        chosenList = nots;
                    }

                    if (childQuery is TermQuery)
                    {
                        TermQuery tq = (TermQuery)childQuery;
                        SpanTermQuery stq = new SpanTermQuery(tq.Term);
                        stq.Boost = tq.Boost;
                        chosenList.Add(stq);
                    }
                    else if (childQuery is BooleanQuery)
                    {
                        BooleanQuery cbq = (BooleanQuery)childQuery;
                        AddComplexPhraseClause(chosenList, cbq);
                    }
                    else
                    {
                        // TODO alternatively could call extract terms here?
                        throw new ArgumentException("Unknown query type:"
                            + childQuery.GetType().Name);
                    }
                }

                if (ors.Count == 0)
                {
                    return;
                }

                SpanOrQuery soq = new SpanOrQuery(ors.ToArray());
                if (nots.Count == 0)
                {
                    spanClauses.Add(soq);
                }
                else
                {
                    SpanOrQuery snqs = new SpanOrQuery(nots.ToArray());
                    SpanNotQuery snq = new SpanNotQuery(soq, snqs);
                    spanClauses.Add(snq);
                }
            }

            /// <summary>
            /// Returns the raw phrase text wrapped in double quotes; the <paramref name="field"/>
            /// argument is not used.
            /// </summary>
            public override string ToString(string field)
            {
                return "\"" + phrasedQueryStringContents + "\"";
            }

            /// <summary>
            /// Combines the base hash with the field, the raw phrase text and the slop.
            /// </summary>
            public override int GetHashCode()
            {
                // Hash mixing relies on wrap-around arithmetic; make that explicit so the
                // method cannot throw when the assembly is compiled with overflow checking.
                unchecked
                {
                    int prime = 31;
                    int result = base.GetHashCode();
                    result = prime * result + ((field == null) ? 0 : field.GetHashCode());
                    result = prime * result
                        + ((phrasedQueryStringContents == null) ? 0 : phrasedQueryStringContents.GetHashCode());
                    result = prime * result + slopFactor;
                    return result;
                }
            }

            /// <summary>
            /// Two instances are equal when they have the same runtime type, the base class
            /// considers them equal, and field, raw phrase text and slop all match.
            /// </summary>
            public override bool Equals(object obj)
            {
                if (ReferenceEquals(this, obj))
                {
                    return true;
                }
                if (obj == null || GetType() != obj.GetType())
                {
                    return false;
                }
                if (!base.Equals(obj))
                {
                    return false;
                }

                ComplexPhraseQuery other = (ComplexPhraseQuery)obj;
                // string.Equals(a, b) is an ordinal comparison that also copes with nulls.
                return string.Equals(field, other.field)
                    && string.Equals(phrasedQueryStringContents, other.phrasedQueryStringContents)
                    && slopFactor == other.slopFactor;
            }
        }
    }
}
// Diff header for the next file in the patch, carried over verbatim (it continues on the following line):
// http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2245f83e/src/contrib/QueryParsers/Contrib.QueryParsers.csproj ---------------------------------------------------------------------- diff --git a/src/contrib/QueryParsers/Contrib.QueryParsers.csproj b/src/contrib/QueryParsers/Contrib.QueryParsers.csproj index 46d8216..2d29fad 100644 --- a/src/contrib/QueryParsers/Contrib.QueryParsers.csproj +++ b/src/contrib/QueryParsers/Contrib.QueryParsers.csproj @@
-51,6 +51,7 @@ <Compile Include="Classic\QueryParserTokenManager.cs" /> <Compile Include="Classic\Token.cs" /> <Compile Include="Classic\TokenMgrError.cs" /> + <Compile Include="ComplexPhrase\ComplexPhraseQueryParser.cs" /> <Compile Include="Ext\ExtendableQueryParser.cs" /> <Compile Include="Ext\ExtensionQuery.cs" /> <Compile Include="Ext\Extensions.cs" />
