http://git-wip-us.apache.org/repos/asf/lucenenet/blob/45ba8d83/src/contrib/Highlighter/VectorHighlight/StringUtils.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Highlighter/VectorHighlight/StringUtils.cs b/src/contrib/Highlighter/VectorHighlight/StringUtils.cs
new file mode 100644
index 0000000..7b27259
--- /dev/null
+++ b/src/contrib/Highlighter/VectorHighlight/StringUtils.cs
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Search.Vectorhighlight
+{
+    /// <summary>
+    /// Term-matching helpers for the vector highlighter. The only wildcard that is
+    /// understood is a single trailing '*', which turns a term into a prefix pattern.
+    /// </summary>
+    public static class StringUtils
+    {
+        /// <summary>
+        /// Checks whether <paramref name="termToMatch"/> is matched by <paramref name="term"/>.
+        /// A trailing '*' makes <paramref name="term"/> a prefix pattern; any other term
+        /// has to be equal to <paramref name="termToMatch"/>.
+        /// </summary>
+        /// <param name="term">The query term, optionally ending in '*'.</param>
+        /// <param name="termToMatch">The candidate term to test.</param>
+        /// <returns>true if the candidate matches; false otherwise (a null candidate never matches).</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="term"/> is null.</exception>
+        /// <exception cref="NotSupportedException"><paramref name="term"/> starts with '*' or '?'.</exception>
+        public static Boolean TermStringMatch(String term, String termToMatch)
+        {
+            if (term == null)
+                throw new ArgumentNullException("term");
+
+            // An empty term has no first or last character to inspect; it can only
+            // match an empty candidate.
+            if (term.Length == 0)
+                return termToMatch != null && termToMatch.Length == 0;
+
+            if (term[0] == '*' || term[0] == '?')
+                throw new NotSupportedException("Unable to do matching with wildcard at the beginning");
+
+            if (termToMatch == null)
+                return false;
+
+            if (term[term.Length - 1] == '*')
+            {
+                // Wildcard at the end: everything before the '*' must be an ordinal prefix.
+                int prefixLength = term.Length - 1;
+                return termToMatch.Length >= prefixLength
+                    && String.CompareOrdinal(term, 0, termToMatch, 0, prefixLength) == 0;
+            }
+
+            return term.Equals(termToMatch);
+        }
+
+        /// <summary>
+        /// Checks whether <paramref name="term"/> is matched by at least one entry of <paramref name="terms"/>.
+        /// </summary>
+        /// <param name="terms">Query terms, each optionally ending in '*'.</param>
+        /// <param name="term">The candidate term to test.</param>
+        /// <returns>true as soon as one entry matches; false if none does.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="terms"/> is null.</exception>
+        public static Boolean AnyTermMatch(IList<String> terms, String term)
+        {
+            if (terms == null)
+                throw new ArgumentNullException("terms");
+
+            foreach (String pattern in terms)
+            {
+                if (TermStringMatch(pattern, term))
+                    return true;
+            }
+            return false;
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/45ba8d83/src/contrib/Highlighter/VectorHighlight/Support.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Highlighter/VectorHighlight/Support.cs b/src/contrib/Highlighter/VectorHighlight/Support.cs
new file mode 100644
index 0000000..0dfbf43
--- /dev/null
+++ b/src/contrib/Highlighter/VectorHighlight/Support.cs
@@ -0,0 +1,108 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+*/
+
+using System;
+using System.Collections.Generic;
+using System.Text;
+
+namespace Lucene.Net.Search.Vectorhighlight
+{
+    /// <summary>
+    /// Dictionary that additionally accepts one null key, in the manner of Java's HashMap.
+    /// The null-key entry lives outside the underlying Dictionary, so only the members
+    /// declared here (Add, Count, the indexer, Get, Put) know about it; inherited members
+    /// such as ContainsKey, Remove, Clear and enumeration do not. The members are declared
+    /// with 'new', so they are bypassed when the map is used through a Dictionary reference.
+    /// </summary>
+    public class HashMap<K, V> : Dictionary<K, V>
+    {
+        // Value stored under the null key; meaningful only while _hasNullKey is true.
+        V _NullKeyValue = default(V);
+
+        // Presence of the null key is tracked explicitly: testing the stored value against
+        // null miscounts value-type values and a null value stored under the null key.
+        bool _hasNullKey;
+
+        /// <summary>
+        /// Adds an entry. A null key replaces any value previously stored under null;
+        /// a non-null key that is already present makes Dictionary.Add throw.
+        /// </summary>
+        public new void Add(K key, V value)
+        {
+            if (key == null)
+            {
+                _NullKeyValue = value;
+                _hasNullKey = true;
+            }
+            else
+            {
+                base.Add(key, value);
+            }
+        }
+
+        /// <summary>
+        /// Number of entries, including the null-key entry once one has been stored.
+        /// </summary>
+        public new int Count
+        {
+            get { return base.Count + (_hasNullKey ? 1 : 0); }
+        }
+
+        /// <summary>
+        /// Gets the value stored for the key (default(V) when absent), or stores a value,
+        /// replacing any existing one.
+        /// </summary>
+        public new V this[K key]
+        {
+            get { return Get(key); }
+            set { Put(key, value); }
+        }
+
+        /// <summary>
+        /// Returns the value stored for the key, or default(V) when the key is absent.
+        /// </summary>
+        public V Get(K key)
+        {
+            if (key == null) return _NullKeyValue;
+
+            V v;
+            base.TryGetValue(key, out v);
+            return v;
+        }
+
+        /// <summary>
+        /// Stores a value under the key, replacing any existing value instead of throwing.
+        /// </summary>
+        public void Put(K key, V val)
+        {
+            if (key == null)
+            {
+                _NullKeyValue = val;
+                _hasNullKey = true;
+            }
+            else
+            {
+                base[key] = val;
+            }
+        }
+    }
+}
+
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/45ba8d83/src/contrib/Highlighter/VectorHighlight/VectorHighlightMapper.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Highlighter/VectorHighlight/VectorHighlightMapper.cs b/src/contrib/Highlighter/VectorHighlight/VectorHighlightMapper.cs
new file mode 100644
index 0000000..da2966c
--- /dev/null
+++ b/src/contrib/Highlighter/VectorHighlight/VectorHighlightMapper.cs
@@ -0,0 +1,175 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if LUCENENET_350 //Lucene.Net specific code. See https://issues.apache.org/jira/browse/LUCENENET-350
+
+using System;
+using System.Collections.Generic;
+using Lucene.Net.Index;
+
+namespace Lucene.Net.Search.Vectorhighlight
+{
+    /// <summary>
+    /// TermVectorMapper that keeps only the terms matching a list of query terms (tested
+    /// with StringUtils.AnyTermMatch) and exposes what it kept as a term vector. A term's
+    /// index is its position in the order in which the terms were mapped.
+    /// </summary>
+    public class VectorHighlightMapper : TermVectorMapper, ITermFreqVector, TermPositionVector
+    {
+        private readonly List<string> _terms;
+        private Dictionary<string, TermVectorOffsetInfo[]> _tvoi;
+        private Dictionary<string, int[]> _positions;
+        private Dictionary<string, int> _frequency;
+        private List<string> _indexMap;
+        private string _field;
+        private bool _storeOffsets;
+        private bool _storePositions;
+
+        /// <param name="terms">Query terms to keep, as understood by StringUtils.AnyTermMatch.</param>
+        public VectorHighlightMapper(List<string> terms)
+        {
+            _terms = terms;
+            _tvoi = new Dictionary<string, TermVectorOffsetInfo[]>();
+            _positions = new Dictionary<string, int[]>();
+            _frequency = new Dictionary<string, int>();
+            _indexMap = new List<string>();
+        }
+
+        /// <summary>
+        /// Starts a new field: records its name and storage flags and discards everything
+        /// collected so far.
+        /// </summary>
+        public override void SetExpectations(string field, int numTerms, bool storeOffsets, bool storePositions)
+        {
+            _field = field;
+            _storeOffsets = storeOffsets;
+            _storePositions = storePositions;
+
+            // Every collection is recreated, including the ones this field will not fill,
+            // so offsets or positions of a previously mapped field cannot leak into this one.
+            _tvoi = new Dictionary<string, TermVectorOffsetInfo[]>(storeOffsets ? numTerms : 0);
+            _positions = new Dictionary<string, int[]>(storePositions ? numTerms : 0);
+            _frequency = new Dictionary<string, int>(numTerms);
+            _indexMap = new List<string>(numTerms);
+        }
+
+        /// <summary>
+        /// Keeps the term if it matches one of the query terms, together with its frequency
+        /// and whichever of offsets and positions the current field stores.
+        /// </summary>
+        public override void Map(string term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
+        {
+            if (!StringUtils.AnyTermMatch(_terms, term))
+                return;
+
+            // _frequency is filled for every kept term, so adding to it first rejects a
+            // repeated term before any of the other collections has been touched.
+            _frequency.Add(term, frequency);
+            if (_storeOffsets)
+                _tvoi.Add(term, offsets);
+            if (_storePositions)
+                _positions.Add(term, positions);
+            _indexMap.Add(term);
+        }
+
+        /// <summary>Name of the field passed to the last SetExpectations call.</summary>
+        public string Field
+        {
+            get { return _field; }
+        }
+
+        /// <summary>Number of terms kept for the current field.</summary>
+        public int Size
+        {
+            // Counted from the index map: _tvoi stays empty when the field stores no offsets.
+            get { return _indexMap.Count; }
+        }
+
+        /// <summary>Returns the kept terms in index order.</summary>
+        public string[] GetTerms()
+        {
+            return _indexMap.ToArray();
+        }
+
+        /// <summary>Returns the frequencies of the kept terms; slot i belongs to GetTerms()[i].</summary>
+        public int[] GetTermFrequencies()
+        {
+            // Built from the index map rather than Dictionary.Values, whose enumeration
+            // order is not specified.
+            int[] result = new int[_indexMap.Count];
+            for (int i = 0; i < result.Length; i++)
+            {
+                result[i] = _frequency[_indexMap[i]];
+            }
+            return result;
+        }
+
+        /// <summary>Returns the index of the term, or -1 if it was not kept.</summary>
+        public int IndexOf(string term)
+        {
+            return _indexMap.IndexOf(term);
+        }
+
+        /// <summary>
+        /// Looks up len entries of terms, starting at terms[start]. Slot i of the result is
+        /// the index of terms[start + i], or -1 if that term was not kept.
+        /// </summary>
+        public int[] IndexesOf(string[] terms, int start, int len)
+        {
+            // start and len select a slice of the argument array, as in Lucene's
+            // TermFreqVector.indexesOf; they are not a search range inside the index map.
+            int[] result = new int[len];
+            for (int i = 0; i < len; i++)
+            {
+                result[i] = _indexMap.IndexOf(terms[start + i]);
+            }
+            return result;
+        }
+
+        /// <summary>
+        /// Returns the positions of the term at index, or an empty array when the index is
+        /// out of range or no positions were stored for it.
+        /// </summary>
+        public int[] GetTermPositions(int index)
+        {
+            int[] positions;
+            if (index >= 0 && index < _indexMap.Count
+                && _positions.TryGetValue(_indexMap[index], out positions))
+            {
+                return positions;
+            }
+            return new int[0];
+        }
+
+        /// <summary>
+        /// Returns the offsets of the term at index, or an empty array when the index is
+        /// out of range or no offsets were stored for it.
+        /// </summary>
+        public TermVectorOffsetInfo[] GetOffsets(int index)
+        {
+            TermVectorOffsetInfo[] offsets;
+            if (index >= 0 && index < _indexMap.Count
+                && _tvoi.TryGetValue(_indexMap[index], out offsets))
+            {
+                return offsets;
+            }
+            return new TermVectorOffsetInfo[0];
+        }
+    }
+}
+
+#endif
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/45ba8d83/src/contrib/Highlighter/WeightedSpanTerm.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Highlighter/WeightedSpanTerm.cs b/src/contrib/Highlighter/WeightedSpanTerm.cs
deleted file mode 100644
index 7d94383..0000000
--- a/src/contrib/Highlighter/WeightedSpanTerm.cs
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; - -namespace Lucene.Net.Search.Highlight -{ - /// <summary> - /// Lightweight class to hold term, Weight, and positions used for scoring this term. - /// </summary> - public class WeightedSpanTerm : WeightedTerm - { - private bool _positionSensitive; - private readonly List<PositionSpan> _positionSpans = new List<PositionSpan>(); - - public WeightedSpanTerm(float weight, String term) - : base(weight, term) - { - - this._positionSpans = new List<PositionSpan>(); - } - - public WeightedSpanTerm(float weight, String term, bool positionSensitive) - : base(weight, term) - { - - this._positionSensitive = positionSensitive; - } - - /// <summary> - /// Checks to see if this term is valid at <c>position</c>. - /// </summary> - /// <param name="position">to check against valid term postions</param> - /// <returns>true iff this term is a hit at this position</returns> - public bool CheckPosition(int position) - { - // There would probably be a slight speed improvement if PositionSpans - // where kept in some sort of priority queue - that way this method - // could - // bail early without checking each PositionSpan. 
- - foreach (var positionSpan in _positionSpans) - { - if (((position >= positionSpan.Start) && (position <= positionSpan.End))) - { - return true; - } - } - - return false; - } - - public void AddPositionSpans(List<PositionSpan> positionSpans) - { - this._positionSpans.AddRange(positionSpans); - } - - public bool IsPositionSensitive() - { - return _positionSensitive; - } - - public void SetPositionSensitive(bool positionSensitive) - { - this._positionSensitive = positionSensitive; - } - - public List<PositionSpan> GetPositionSpans() - { - return _positionSpans; - } - } - - - // Utility class to store a Span - public class PositionSpan - { - public int Start { get; private set; } - public int End { get; private set; } - - public PositionSpan(int start, int end) - { - this.Start = start; - this.End = end; - } - } -} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/45ba8d83/src/contrib/Highlighter/WeightedSpanTermExtractor.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Highlighter/WeightedSpanTermExtractor.cs b/src/contrib/Highlighter/WeightedSpanTermExtractor.cs deleted file mode 100644 index df7a90f..0000000 --- a/src/contrib/Highlighter/WeightedSpanTermExtractor.cs +++ /dev/null @@ -1,667 +0,0 @@ -/* - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - * -*/ - -using System; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Text; -using Lucene.Net.Analysis; -using Lucene.Net.Index; -using Lucene.Net.Index.Memory; -using Lucene.Net.Search.Spans; -using Lucene.Net.Store; -using Lucene.Net.Support; -using Lucene.Net.Util; - -namespace Lucene.Net.Search.Highlight -{ - /// <summary> - /// Class used to extract <see cref="WeightedSpanTerm"/>s from a <see cref="Query"/> based on whether - /// <see cref="Term"/>s from the <see cref="Query"/> are contained in a supplied <see cref="Analysis.TokenStream"/>. - /// </summary> - public class WeightedSpanTermExtractor - { - private String fieldName; - private TokenStream tokenStream; - private IDictionary<String, IndexReader> readers = new HashMap<String, IndexReader>(10); - private String defaultField; - private bool expandMultiTermQuery; - private bool cachedTokenStream; - private bool wrapToCaching = true; - - public WeightedSpanTermExtractor() - { - } - - public WeightedSpanTermExtractor(String defaultField) - { - if (defaultField != null) - { - this.defaultField = StringHelper.Intern(defaultField); - } - } - - private void CloseReaders() - { - ICollection<IndexReader> readerSet = readers.Values; - - foreach (IndexReader reader in readerSet) - { - try - { - reader.Close(); - } - catch (IOException e) - { - // alert? - } - } - } - - /// <summary> - /// Fills a <c>Map</c> with <see cref="WeightedSpanTerm"/>s using the terms from the supplied <c>Query</c>. 
- /// </summary> - /// <param name="query">Query to extract Terms from</param> - /// <param name="terms">Map to place created WeightedSpanTerms in</param> - private void Extract(Query query, IDictionary<String, WeightedSpanTerm> terms) - { - if (query is BooleanQuery) - { - BooleanClause[] queryClauses = ((BooleanQuery) query).GetClauses(); - - for (int i = 0; i < queryClauses.Length; i++) - { - if (!queryClauses[i].IsProhibited) - { - Extract(queryClauses[i].Query, terms); - } - } - } - else if (query is PhraseQuery) - { - PhraseQuery phraseQuery = ((PhraseQuery) query); - Term[] phraseQueryTerms = phraseQuery.GetTerms(); - SpanQuery[] clauses = new SpanQuery[phraseQueryTerms.Length]; - for (int i = 0; i < phraseQueryTerms.Length; i++) - { - clauses[i] = new SpanTermQuery(phraseQueryTerms[i]); - } - int slop = phraseQuery.Slop; - int[] positions = phraseQuery.GetPositions(); - // add largest position increment to slop - if (positions.Length > 0) - { - int lastPos = positions[0]; - int largestInc = 0; - int sz = positions.Length; - for (int i = 1; i < sz; i++) - { - int pos = positions[i]; - int inc = pos - lastPos; - if (inc > largestInc) - { - largestInc = inc; - } - lastPos = pos; - } - if (largestInc > 1) - { - slop += largestInc; - } - } - - bool inorder = slop == 0; - - SpanNearQuery sp = new SpanNearQuery(clauses, slop, inorder); - sp.Boost = query.Boost; - ExtractWeightedSpanTerms(terms, sp); - } - else if (query is TermQuery) - { - ExtractWeightedTerms(terms, query); - } - else if (query is SpanQuery) - { - ExtractWeightedSpanTerms(terms, (SpanQuery) query); - } - else if (query is FilteredQuery) - { - Extract(((FilteredQuery) query).Query, terms); - } - else if (query is DisjunctionMaxQuery) - { - foreach (var q in ((DisjunctionMaxQuery) query)) - { - Extract(q, terms); - } - } - else if (query is MultiTermQuery && expandMultiTermQuery) - { - MultiTermQuery mtq = ((MultiTermQuery) query); - if (mtq.RewriteMethod != 
MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE) - { - mtq = (MultiTermQuery) mtq.Clone(); - mtq.RewriteMethod = MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE; - query = mtq; - } - FakeReader fReader = new FakeReader(); - MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE.Rewrite(fReader, mtq); - if (fReader.Field != null) - { - IndexReader ir = GetReaderForField(fReader.Field); - Extract(query.Rewrite(ir), terms); - } - } - else if (query is MultiPhraseQuery) - { - MultiPhraseQuery mpq = (MultiPhraseQuery) query; - IList<Term[]> termArrays = mpq.GetTermArrays(); - int[] positions = mpq.GetPositions(); - if (positions.Length > 0) - { - - int maxPosition = positions[positions.Length - 1]; - for (int i = 0; i < positions.Length - 1; ++i) - { - if (positions[i] > maxPosition) - { - maxPosition = positions[i]; - } - } - - var disjunctLists = new List<SpanQuery>[maxPosition + 1]; - int distinctPositions = 0; - - for (int i = 0; i < termArrays.Count; ++i) - { - Term[] termArray = termArrays[i]; - List<SpanQuery> disjuncts = disjunctLists[positions[i]]; - if (disjuncts == null) - { - disjuncts = (disjunctLists[positions[i]] = new List<SpanQuery>(termArray.Length)); - ++distinctPositions; - } - for (int j = 0; j < termArray.Length; ++j) - { - disjuncts.Add(new SpanTermQuery(termArray[j])); - } - } - - int positionGaps = 0; - int position = 0; - SpanQuery[] clauses = new SpanQuery[distinctPositions]; - for (int i = 0; i < disjunctLists.Length; ++i) - { - List<SpanQuery> disjuncts = disjunctLists[i]; - if (disjuncts != null) - { - clauses[position++] = new SpanOrQuery(disjuncts.ToArray()); - } - else - { - ++positionGaps; - } - } - - int slop = mpq.Slop; - bool inorder = (slop == 0); - - SpanNearQuery sp = new SpanNearQuery(clauses, slop + positionGaps, inorder); - sp.Boost = query.Boost; - ExtractWeightedSpanTerms(terms, sp); - } - } - } - - /// <summary> - /// Fills a <c>Map</c> with <see cref="WeightedSpanTerm"/>s using the terms from the supplied <c>SpanQuery</c>. 
- /// </summary> - /// <param name="terms">Map to place created WeightedSpanTerms in</param> - /// <param name="spanQuery">SpanQuery to extract Terms from</param> - private void ExtractWeightedSpanTerms(IDictionary<String, WeightedSpanTerm> terms, SpanQuery spanQuery) - { - HashSet<String> fieldNames; - - if (fieldName == null) - { - fieldNames = new HashSet<String>(); - CollectSpanQueryFields(spanQuery, fieldNames); - } - else - { - fieldNames = new HashSet<String>(); - fieldNames.Add(fieldName); - } - // To support the use of the default field name - if (defaultField != null) - { - fieldNames.Add(defaultField); - } - - IDictionary<String, SpanQuery> queries = new HashMap<String, SpanQuery>(); - - var nonWeightedTerms = Support.Compatibility.SetFactory.CreateHashSet<Term>(); - bool mustRewriteQuery = MustRewriteQuery(spanQuery); - if (mustRewriteQuery) - { - foreach (String field in fieldNames) - { - SpanQuery rewrittenQuery = (SpanQuery) spanQuery.Rewrite(GetReaderForField(field)); - queries[field] = rewrittenQuery; - rewrittenQuery.ExtractTerms(nonWeightedTerms); - } - } - else - { - spanQuery.ExtractTerms(nonWeightedTerms); - } - - List<PositionSpan> spanPositions = new List<PositionSpan>(); - - foreach (String field in fieldNames) - { - - IndexReader reader = GetReaderForField(field); - Spans.Spans spans; - if (mustRewriteQuery) - { - spans = queries[field].GetSpans(reader); - } - else - { - spans = spanQuery.GetSpans(reader); - } - - - // collect span positions - while (spans.Next()) - { - spanPositions.Add(new PositionSpan(spans.Start(), spans.End() - 1)); - } - - } - - if (spanPositions.Count == 0) - { - // no spans found - return; - } - - foreach (Term queryTerm in nonWeightedTerms) - { - - if (FieldNameComparator(queryTerm.Field)) - { - WeightedSpanTerm weightedSpanTerm = terms[queryTerm.Text]; - - if (weightedSpanTerm == null) - { - weightedSpanTerm = new WeightedSpanTerm(spanQuery.Boost, queryTerm.Text); - 
weightedSpanTerm.AddPositionSpans(spanPositions); - weightedSpanTerm.SetPositionSensitive(true); - terms[queryTerm.Text] = weightedSpanTerm; - } - else - { - if (spanPositions.Count > 0) - { - weightedSpanTerm.AddPositionSpans(spanPositions); - } - } - } - } - } - - /// <summary> - /// Fills a <c>Map</c> with <see cref="WeightedSpanTerm"/>s using the terms from the supplied <c>Query</c>. - /// </summary> - /// <param name="terms"></param> - /// <param name="query"></param> - private void ExtractWeightedTerms(IDictionary<String, WeightedSpanTerm> terms, Query query) - { - var nonWeightedTerms = Support.Compatibility.SetFactory.CreateHashSet<Term>(); - query.ExtractTerms(nonWeightedTerms); - - foreach (Term queryTerm in nonWeightedTerms) - { - - if (FieldNameComparator(queryTerm.Field)) - { - WeightedSpanTerm weightedSpanTerm = new WeightedSpanTerm(query.Boost, queryTerm.Text); - terms[queryTerm.Text] = weightedSpanTerm; - } - } - } - - /// <summary> - /// Necessary to implement matches for queries against <c>defaultField</c> - /// </summary> - private bool FieldNameComparator(String fieldNameToCheck) - { - bool rv = fieldName == null || fieldNameToCheck == fieldName - || fieldNameToCheck == defaultField; - return rv; - } - - private IndexReader GetReaderForField(String field) - { - if (wrapToCaching && !cachedTokenStream && !(tokenStream is CachingTokenFilter)) - { - tokenStream = new CachingTokenFilter(tokenStream); - cachedTokenStream = true; - } - IndexReader reader = readers[field]; - if (reader == null) - { - MemoryIndex indexer = new MemoryIndex(); - indexer.AddField(field, tokenStream); - tokenStream.Reset(); - IndexSearcher searcher = indexer.CreateSearcher(); - reader = searcher.IndexReader; - readers[field] = reader; - } - - return reader; - } - - /// <summary> - /// Creates a Map of <c>WeightedSpanTerms</c> from the given <c>Query</c> and <c>TokenStream</c>. 
- /// </summary> - /// <param name="query">query that caused hit</param> - /// <param name="tokenStream">TokenStream of text to be highlighted</param> - /// <returns>Map containing WeightedSpanTerms</returns> - public IDictionary<String, WeightedSpanTerm> GetWeightedSpanTerms(Query query, TokenStream tokenStream) - { - return GetWeightedSpanTerms(query, tokenStream, null); - } - - - /// <summary> - /// Creates a Map of <c>WeightedSpanTerms</c> from the given <c>Query</c> and <c>TokenStream</c>. - /// </summary> - /// <param name="query">query that caused hit</param> - /// <param name="tokenStream">tokenStream of text to be highlighted</param> - /// <param name="fieldName">restricts Term's used based on field name</param> - /// <returns>Map containing WeightedSpanTerms</returns> - public IDictionary<String, WeightedSpanTerm> GetWeightedSpanTerms(Query query, TokenStream tokenStream, - String fieldName) - { - if (fieldName != null) - { - this.fieldName = StringHelper.Intern(fieldName); - } - else - { - this.fieldName = null; - } - - IDictionary<String, WeightedSpanTerm> terms = new PositionCheckingMap<String>(); - this.tokenStream = tokenStream; - try - { - Extract(query, terms); - } - finally - { - CloseReaders(); - } - - return terms; - } - - /// <summary> - /// Creates a Map of <c>WeightedSpanTerms</c> from the given <c>Query</c> and <c>TokenStream</c>. Uses a supplied - /// <c>IndexReader</c> to properly Weight terms (for gradient highlighting). 
- /// </summary> - /// <param name="query">Query that caused hit</param> - /// <param name="tokenStream">Tokenstream of text to be highlighted</param> - /// <param name="fieldName">restricts Term's used based on field name</param> - /// <param name="reader">to use for scoring</param> - /// <returns>Map of WeightedSpanTerms with quasi tf/idf scores</returns> - public IDictionary<String, WeightedSpanTerm> GetWeightedSpanTermsWithScores(Query query, TokenStream tokenStream, - String fieldName, IndexReader reader) - { - if (fieldName != null) - { - this.fieldName = StringHelper.Intern(fieldName); - } - else - { - this.fieldName = null; - } - this.tokenStream = tokenStream; - - IDictionary<String, WeightedSpanTerm> terms = new PositionCheckingMap<String>(); - Extract(query, terms); - - int totalNumDocs = reader.NumDocs(); - var weightedTerms = terms.Keys; - - try - { - foreach (var wt in weightedTerms) - { - WeightedSpanTerm weightedSpanTerm = terms[wt]; - int docFreq = reader.DocFreq(new Term(fieldName, weightedSpanTerm.Term)); - // docFreq counts deletes - if (totalNumDocs < docFreq) - { - docFreq = totalNumDocs; - } - // IDF algorithm taken from DefaultSimilarity class - float idf = (float) (Math.Log((float) totalNumDocs/(double) (docFreq + 1)) + 1.0); - weightedSpanTerm.Weight *= idf; - } - } - finally - { - - CloseReaders(); - } - - return terms; - } - - private void CollectSpanQueryFields(SpanQuery spanQuery, HashSet<String> fieldNames) - { - if (spanQuery is FieldMaskingSpanQuery) - { - CollectSpanQueryFields(((FieldMaskingSpanQuery) spanQuery).MaskedQuery, fieldNames); - } - else if (spanQuery is SpanFirstQuery) - { - CollectSpanQueryFields(((SpanFirstQuery) spanQuery).Match, fieldNames); - } - else if (spanQuery is SpanNearQuery) - { - foreach (SpanQuery clause in ((SpanNearQuery) spanQuery).GetClauses()) - { - CollectSpanQueryFields(clause, fieldNames); - } - } - else if (spanQuery is SpanNotQuery) - { - CollectSpanQueryFields(((SpanNotQuery) 
spanQuery).Include, fieldNames); - } - else if (spanQuery is SpanOrQuery) - { - foreach (SpanQuery clause in ((SpanOrQuery) spanQuery).GetClauses()) - { - CollectSpanQueryFields(clause, fieldNames); - } - } - else - { - fieldNames.Add(spanQuery.Field); - } - } - - private bool MustRewriteQuery(SpanQuery spanQuery) - { - if (!expandMultiTermQuery) - { - return false; // Will throw UnsupportedOperationException in case of a SpanRegexQuery. - } - else if (spanQuery is FieldMaskingSpanQuery) - { - return MustRewriteQuery(((FieldMaskingSpanQuery)spanQuery).MaskedQuery); - } - else if (spanQuery is SpanFirstQuery) - { - return MustRewriteQuery(((SpanFirstQuery)spanQuery).Match); - } - else if (spanQuery is SpanNearQuery) - { - foreach (SpanQuery clause in ((SpanNearQuery) spanQuery).GetClauses()) - { - if (MustRewriteQuery(clause)) - { - return true; - } - } - return false; - } - else if (spanQuery is SpanNotQuery) - { - SpanNotQuery spanNotQuery = (SpanNotQuery) spanQuery; - return MustRewriteQuery(spanNotQuery.Include) || MustRewriteQuery(spanNotQuery.Exclude); - } - else if (spanQuery is SpanOrQuery) - { - foreach (SpanQuery clause in ((SpanOrQuery) spanQuery).GetClauses()) - { - if (MustRewriteQuery(clause)) - { - return true; - } - } - return false; - } - else if (spanQuery is SpanTermQuery) - { - return false; - } - else - { - return true; - } - } - - - /// <summary> - /// This class makes sure that if both position sensitive and insensitive - /// versions of the same term are added, the position insensitive one wins. 
- /// </summary> - /// <typeparam name="K"></typeparam> - private class PositionCheckingMap<K> : HashMap<K, WeightedSpanTerm> - { - public PositionCheckingMap() - { - - } - - public PositionCheckingMap(IEnumerable<KeyValuePair<K, WeightedSpanTerm>> m) - { - PutAll(m); - } - - public void PutAll(IEnumerable<KeyValuePair<K, WeightedSpanTerm>> m) - { - foreach (var entry in m) - { - Add(entry.Key, entry.Value); - } - } - - public override void Add(K key, WeightedSpanTerm value) - { - base.Add(key, value); - WeightedSpanTerm prev = this[key]; - - if (prev == null) return; - - WeightedSpanTerm prevTerm = prev; - WeightedSpanTerm newTerm = value; - if (!prevTerm.IsPositionSensitive()) - { - newTerm.SetPositionSensitive(false); - } - } - - } - - public bool ExpandMultiTermQuery - { - set { this.expandMultiTermQuery = value; } - get { return expandMultiTermQuery; } - } - - public bool IsCachedTokenStream - { - get { return cachedTokenStream; } - } - - public TokenStream TokenStream - { - get { return tokenStream; } - } - - - /// <summary> - /// By default, <see cref="Analysis.TokenStream"/>s that are not of the type - /// <see cref="CachingTokenFilter"/> are wrapped in a <see cref="CachingTokenFilter"/> to - /// <see cref="Analysis.TokenStream"/> impl and you don't want it to be wrapped, set this to - /// false. - /// </summary> - public void SetWrapIfNotCachingTokenFilter(bool wrap) - { - this.wrapToCaching = wrap; - } - - /// <summary> - /// A fake IndexReader class to extract the field from a MultiTermQuery - /// </summary> - protected internal sealed class FakeReader : FilterIndexReader - { - - private static IndexReader EMPTY_MEMORY_INDEX_READER = new MemoryIndex().CreateSearcher().IndexReader; - - public String Field { get; private set; } - - protected internal FakeReader() - : base(EMPTY_MEMORY_INDEX_READER) - { - - } - - public override TermEnum Terms(Term t) - { - // only set first fieldname, maybe use a Set? 
- if (t != null && Field == null) - Field = t.Field; - return base.Terms(t); - } - - - } - } -} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/45ba8d83/src/contrib/Highlighter/WeightedTerm.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Highlighter/WeightedTerm.cs b/src/contrib/Highlighter/WeightedTerm.cs deleted file mode 100644 index cfe3e12..0000000 --- a/src/contrib/Highlighter/WeightedTerm.cs +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -using System; - -namespace Lucene.Net.Search.Highlight -{ - /// <summary> - /// Lightweight class to hold term and a Weight value used for scoring this term - /// </summary> - public class WeightedTerm - { - public WeightedTerm(float weight, String term) - { - this.Weight = weight; - this.Term = term; - } - - /// <summary> - /// the term value (stemmed) - /// </summary> - public string Term { get; set; } - - /// <summary> - /// the Weight associated with this term - /// </summary> - /// <value> </value> - public float Weight { get; set; } - } -} \ No newline at end of file
