Port AnalyzingQueryParser
// Commit metadata preserved from the original patch notification:
//   Project:   http://git-wip-us.apache.org/repos/asf/lucenenet/repo
//   Commit:    http://git-wip-us.apache.org/repos/asf/lucenenet/commit/c5e9e254
//   Tree:      http://git-wip-us.apache.org/repos/asf/lucenenet/tree/c5e9e254
//   Diff:      http://git-wip-us.apache.org/repos/asf/lucenenet/diff/c5e9e254
//   Branch:    refs/heads/branch_4x
//   Commit:    c5e9e25426b709cad5a457040fa8c78225551e03
//   Parents:   819bcf4
//   Author:    Paul Irwin <[email protected]>
//   Authored:  Thu Oct 3 12:51:36 2013 -0400
//   Committer: Paul Irwin <[email protected]>
//   Committed: Sat Oct 5 16:37:25 2013 -0400
//   Files:     .../Analyzing/AnalyzingQueryParser.cs (new file, index 0000000..23c165a)
//              .../QueryParsers/Contrib.QueryParsers.csproj (1 insertion)
//   Path:      src/contrib/QueryParsers/Analyzing/AnalyzingQueryParser.cs

using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.QueryParsers.Classic;
using Lucene.Net.Search;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Version = Lucene.Net.Util.Version;

namespace Lucene.Net.QueryParsers.Analyzing
{
    /// <summary>
    /// A <see cref="QueryParser"/> that passes wildcard, prefix and fuzzy terms
    /// through the configured analyzer before handing them to the base parser.
    /// </summary>
    public class AnalyzingQueryParser : QueryParser
    {
        /// <summary>
        /// Creates the parser and turns on analysis of range terms.
        /// </summary>
        /// <param name="matchVersion">Version passed through to the base parser.</param>
        /// <param name="field">Field name passed through to the base parser.</param>
        /// <param name="analyzer">Analyzer used to tokenize the query terms.</param>
        public AnalyzingQueryParser(Version matchVersion, string field, Analyzer analyzer)
            : base(matchVersion, field, analyzer)
        {
            base.AnalyzeRangeTerms = true;
        }

        /// <summary>
        /// Analyzes the literal parts of a wildcard term and rebuilds the term with
        /// the wildcard runs put back in their original positions.
        /// </summary>
        /// <param name="field">Field the term applies to.</param>
        /// <param name="termStr">Raw term text, including '?' and '*' characters.</param>
        /// <returns>
        /// The query built by the base parser from the rebuilt term, or <c>null</c>
        /// when the term contains no literal text at all.
        /// </returns>
        /// <exception cref="ParseException">
        /// The analyzer produced a different number of non-empty tokens than there
        /// are literal runs in <paramref name="termStr"/>.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// <paramref name="termStr"/> contains no wildcard character.
        /// </exception>
        protected override Query GetWildcardQuery(string field, string termStr)
        {
            /* somewhat a hack: find/store wildcard chars
             * in order to put them back after analyzing */
            List<string> literals = new List<string>();
            List<string> wildcards = new List<string>();
            SplitWildcardTerm(termStr, literals, wildcards);

            // Analyze the whole term; empty tokens are ignored.
            List<string> analyzed = ReadTokens(field, termStr)
                .Where(token => !string.IsNullOrEmpty(token))
                .ToList();

            // A plain count comparison replaces the former catch of
            // IndexOutOfRangeException, which List<T>'s indexer never throws
            // (it throws ArgumentOutOfRangeException).
            if (analyzed.Count != literals.Count)
            {
                /* this means that the analyzer used either added or consumed
                 * (common for a stemmer) tokens, and we can't build a WildcardQuery */
                throw new ParseException("Cannot build WildcardQuery with analyzer "
                    + this.Analyzer.GetType() + " - tokens added or lost");
            }

            if (analyzed.Count == 0)
            {
                return null;
            }

            if (wildcards.Count == 0)
            {
                /* we should never get here! if so, this method was called
                 * with a termStr containing no wildcard ... */
                throw new ArgumentException("getWildcardQuery called without wildcard");
            }

            /* rebuild to one token with wildcards put back in position.
             * Literal and wildcard runs strictly alternate, so it is enough to know
             * which kind came first; termStr is non-empty here because at least one
             * wildcard run was found. */
            bool leadingWildcard = IsWildcard(termStr[0]);
            IList<string> first = leadingWildcard ? wildcards : analyzed;
            IList<string> second = leadingWildcard ? analyzed : wildcards;

            StringBuilder rebuilt = new StringBuilder();
            for (int i = 0; i < first.Count; i++)
            {
                rebuilt.Append(first[i]);
                if (i < second.Count)
                {
                    rebuilt.Append(second[i]);
                }
            }
            return base.GetWildcardQuery(field, rebuilt.ToString());
        }

        /// <summary>
        /// Analyzes a prefix term and builds the prefix query from the single
        /// resulting token.
        /// </summary>
        /// <param name="field">Field the term applies to.</param>
        /// <param name="termStr">Raw prefix text.</param>
        /// <returns>The query built by the base parser from the analyzed token.</returns>
        /// <exception cref="ParseException">
        /// The analyzer produced zero tokens or more than one token.
        /// </exception>
        protected override Query GetPrefixQuery(string field, string termStr)
        {
            List<string> tokens = ReadTokens(field, termStr);

            if (tokens.Count == 1)
            {
                return base.GetPrefixQuery(field, tokens[0]);
            }

            /* this means that the analyzer used either added or consumed
             * (common for a stemmer) tokens, and we can't build a PrefixQuery */
            throw new ParseException("Cannot build PrefixQuery with analyzer "
                + this.Analyzer.GetType()
                + (tokens.Count > 1 ? " - token(s) added" : " - token consumed"));
        }

        /// <summary>
        /// Analyzes a fuzzy term and builds the fuzzy query from the first token.
        /// </summary>
        /// <param name="field">Field the term applies to.</param>
        /// <param name="termStr">Raw term text.</param>
        /// <param name="minSimilarity">Similarity passed through to the base parser.</param>
        /// <returns>
        /// The query built by the base parser, or <c>null</c> when the analyzer
        /// produced no token or an <see cref="IOException"/> occurred while analyzing.
        /// </returns>
        /// <exception cref="ParseException">The analyzer produced more than one token.</exception>
        protected override Query GetFuzzyQuery(string field, string termStr, float minSimilarity)
        {
            TokenStream source = null;
            string nextToken = null;
            bool multipleTokens = false;

            try
            {
                source = this.Analyzer.TokenStream(field, new StringReader(termStr));
                ICharTermAttribute termAtt = source.AddAttribute<ICharTermAttribute>();
                source.Reset();
                if (source.IncrementToken())
                {
                    nextToken = termAtt.ToString();
                }
                multipleTokens = source.IncrementToken();
            }
            catch (IOException)
            {
                nextToken = null;
            }
            finally
            {
                // source is still null if TokenStream(...) itself threw.
                CloseQuietly(source);
            }

            if (multipleTokens)
            {
                throw new ParseException("Cannot build FuzzyQuery with analyzer " + this.Analyzer.GetType()
                    + " - tokens were added");
            }

            return (nextToken == null) ? null : base.GetFuzzyQuery(field, nextToken, minSimilarity);
        }

        /// <summary>Returns true for the two wildcard characters, '?' and '*'.</summary>
        private static bool IsWildcard(char c)
        {
            return c == '?' || c == '*';
        }

        /// <summary>
        /// Splits <paramref name="termStr"/> into alternating runs of literal and
        /// wildcard characters, appending each run to the matching list in order.
        /// An empty term yields a single empty literal run.
        /// </summary>
        private static void SplitWildcardTerm(string termStr, IList<string> literals, IList<string> wildcards)
        {
            bool inLiteral = termStr.Length == 0 || !IsWildcard(termStr[0]);
            StringBuilder run = new StringBuilder();

            foreach (char c in termStr)
            {
                bool literalChar = !IsWildcard(c);
                if (literalChar != inLiteral)
                {
                    // The kind of character changed: close the current run.
                    (inLiteral ? literals : wildcards).Add(run.ToString());
                    run.Length = 0;
                    inLiteral = literalChar;
                }
                run.Append(c);
            }

            (inLiteral ? literals : wildcards).Add(run.ToString());
        }

        /// <summary>
        /// Runs <paramref name="termStr"/> through the analyzer and returns the text
        /// of every token produced. An <see cref="IOException"/> while advancing the
        /// stream ends reading early; one thrown while opening or resetting the
        /// stream propagates to the caller. The stream is always closed.
        /// </summary>
        private List<string> ReadTokens(string field, string termStr)
        {
            List<string> tokens = new List<string>();
            TokenStream source = null;
            try
            {
                source = this.Analyzer.TokenStream(field, new StringReader(termStr));
                source.Reset();
                ICharTermAttribute termAtt = source.AddAttribute<ICharTermAttribute>();
                while (true)
                {
                    try
                    {
                        if (!source.IncrementToken())
                        {
                            break;
                        }
                    }
                    catch (IOException)
                    {
                        // best effort: keep the tokens read so far
                        break;
                    }
                    tokens.Add(termAtt.ToString());
                }
            }
            finally
            {
                CloseQuietly(source);
            }
            return tokens;
        }

        /// <summary>
        /// Ends and disposes <paramref name="source"/>, ignoring
        /// <see cref="IOException"/> from either call. Does nothing for <c>null</c>.
        /// </summary>
        private static void CloseQuietly(TokenStream source)
        {
            if (source == null)
            {
                return;
            }

            try
            {
                source.End();
            }
            catch (IOException)
            {
                // ignore
            }
            finally
            {
                try
                {
                    source.Dispose();
                }
                catch (IOException)
                {
                    // ignore
                }
            }
        }
    }
}

// The same commit also registered this file in
// src/contrib/QueryParsers/Contrib.QueryParsers.csproj
// (index a9f08d6..46d8216, hunk @@ -40,6 +40,7 @@) by adding, ahead of the
// existing Classic\FastCharStream.cs, Classic\ICharStream.cs and
// Classic\MultiFieldQueryParser.cs entries:
//     <Compile Include="Analyzing\AnalyzingQueryParser.cs" />
