// http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Highlighter/GradientFormatter.cs ----------------------------------------------------------------------
// diff --git a/src/contrib/Highlighter/GradientFormatter.cs b/src/contrib/Highlighter/GradientFormatter.cs deleted file mode 100644 index 3b50d42..0000000 --- a/src/contrib/Highlighter/GradientFormatter.cs +++ /dev/null @@ -1,212 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;

namespace Lucene.Net.Search.Highlight
{
    /// <summary>
    /// Formats text with different color intensity depending on the score of the term.
    /// The term is wrapped in an HTML <c>&lt;font&gt;</c> element whose <c>color</c> and/or
    /// <c>bgcolor</c> attribute is interpolated between a "min" and a "max" color.
    /// </summary>
    public class GradientFormatter : IFormatter
    {
        private static readonly char[] hexDigits = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};

        // Score at (and above) which the "max" color is used; also the divisor when scaling.
        private readonly float maxScore;

        protected internal int fgRMin, fgGMin, fgBMin;
        protected internal int fgRMax, fgGMax, fgBMax;
        protected bool highlightForeground;
        protected internal int bgRMin, bgGMin, bgBMin;
        protected internal int bgRMax, bgGMax, bgBMax;
        protected bool highlightBackground;

        /// <summary>Sets the color range for the IDF scores.</summary>
        /// <param name="maxScore">
        /// The score (and above) displayed as maxColor. It is used as a divisor when scaling
        /// scores, so it should be greater than zero. (The original notes point at
        /// QueryScorer.getMaxWeight as a way to calibrate the scoring scale.)
        /// </param>
        /// <param name="minForegroundColor">
        /// The hex color used for representing IDF scores of zero, e.g.
        /// #FFFFFF (white), or null if no foreground color is required.
        /// </param>
        /// <param name="maxForegroundColor">
        /// The largest hex color used for representing IDF scores, e.g.
        /// #000000 (black), or null if no foreground color is required.
        /// </param>
        /// <param name="minBackgroundColor">
        /// The hex color used for representing IDF scores of zero, e.g.
        /// #FFFFFF (white), or null if no background color is required.
        /// </param>
        /// <param name="maxBackgroundColor">
        /// The largest hex color used for representing IDF scores, e.g.
        /// #000000 (black), or null if no background color is required.
        /// </param>
        /// <exception cref="ArgumentException">
        /// A color of an enabled pair is not exactly 7 characters long.
        /// </exception>
        /// <exception cref="FormatException">
        /// A color of an enabled pair contains a character that is not a hex digit.
        /// </exception>
        public GradientFormatter(float maxScore, string minForegroundColor, string maxForegroundColor, string minBackgroundColor, string maxBackgroundColor)
        {
            // A color pair is only active when BOTH of its ends are supplied.
            highlightForeground = (minForegroundColor != null) && (maxForegroundColor != null);
            if (highlightForeground)
            {
                // Both lengths are checked before anything is parsed.
                RequireHexColorLength(minForegroundColor, "minForegroundColor");
                RequireHexColorLength(maxForegroundColor, "maxForegroundColor");
                ParseHexColor(minForegroundColor, out fgRMin, out fgGMin, out fgBMin);
                ParseHexColor(maxForegroundColor, out fgRMax, out fgGMax, out fgBMax);
            }

            highlightBackground = (minBackgroundColor != null) && (maxBackgroundColor != null);
            if (highlightBackground)
            {
                RequireHexColorLength(minBackgroundColor, "minBackgroundColor");
                RequireHexColorLength(maxBackgroundColor, "maxBackgroundColor");
                ParseHexColor(minBackgroundColor, out bgRMin, out bgGMin, out bgBMin);
                ParseHexColor(maxBackgroundColor, out bgRMax, out bgGMax, out bgBMax);
            }

            this.maxScore = maxScore;
        }

        /// <summary>
        /// Wraps <paramref name="originalText"/> in a <c>&lt;font&gt;</c> element colored according
        /// to the total score of <paramref name="tokenGroup"/>.
        /// </summary>
        /// <param name="originalText">The text to mark up; it is emitted as-is inside the element.</param>
        /// <param name="tokenGroup">The token group whose <c>TotalScore</c> selects the color.</param>
        /// <returns>
        /// <paramref name="originalText"/> unchanged when the score is zero, otherwise the marked-up text.
        /// </returns>
        public virtual string HighlightTerm(string originalText, TokenGroup tokenGroup)
        {
            float score = tokenGroup.TotalScore;
            if (score == 0)
            {
                return originalText;
            }

            var sb = new System.Text.StringBuilder();
            sb.Append("<font ");
            if (highlightForeground)
            {
                sb.Append("color=\"");
                sb.Append(GetForegroundColorString(score));
                sb.Append("\" ");
            }
            if (highlightBackground)
            {
                sb.Append("bgcolor=\"");
                sb.Append(GetBackgroundColorString(score));
                sb.Append("\" ");
            }
            sb.Append(">");
            sb.Append(originalText);
            sb.Append("</font>");
            return sb.ToString();
        }

        /// <summary>Returns the foreground color for <paramref name="score"/> as <c>#RRGGBB</c>.</summary>
        protected internal virtual string GetForegroundColorString(float score)
        {
            return ToColorString(
                GetColorVal(fgRMin, fgRMax, score),
                GetColorVal(fgGMin, fgGMax, score),
                GetColorVal(fgBMin, fgBMax, score));
        }

        /// <summary>Returns the background color for <paramref name="score"/> as <c>#RRGGBB</c>.</summary>
        protected internal virtual string GetBackgroundColorString(float score)
        {
            return ToColorString(
                GetColorVal(bgRMin, bgRMax, score),
                GetColorVal(bgGMin, bgGMax, score),
                GetColorVal(bgBMin, bgBMax, score));
        }

        /// <summary>Throws when <paramref name="color"/> is not exactly 7 characters ("#RRGGBB").</summary>
        private static void RequireHexColorLength(string color, string parameterName)
        {
            if (color.Length != 7)
            {
                throw new ArgumentException(parameterName + " is not 7 bytes long eg a hex " + "RGB value such as #FFFFFF");
            }
        }

        /// <summary>Splits a 7-character "#RRGGBB" string into its three channel values.</summary>
        private static void ParseHexColor(string color, out int red, out int green, out int blue)
        {
            // Index 0 is the leading marker character; it is skipped, not validated.
            red = HexToInt(color.Substring(1, 2));
            green = HexToInt(color.Substring(3, 2));
            blue = HexToInt(color.Substring(5, 2));
        }

        /// <summary>Formats three channel values as <c>#RRGGBB</c>.</summary>
        private static string ToColorString(int red, int green, int blue)
        {
            var sb = new System.Text.StringBuilder();
            sb.Append("#");
            sb.Append(IntToHex(red));
            sb.Append(IntToHex(green));
            sb.Append(IntToHex(blue));
            return sb.ToString();
        }

        /// <summary>
        /// Scales one color channel by the score, capped at <c>maxScore</c>.
        /// </summary>
        private int GetColorVal(int colorMin, int colorMax, float score)
        {
            if (colorMin == colorMax)
            {
                return colorMin;
            }
            float scale = Math.Abs(colorMin - colorMax);
            float relScorePercent = Math.Min(maxScore, score) / maxScore;
            float colScore = scale * relScorePercent;
            // NOTE(review): the offset is added to the SMALLER of the two channel values, so when
            // colorMin > colorMax a score of zero yields colorMax rather than colorMin. Kept as-is
            // because existing output may depend on it -- confirm whether this is intended.
            return Math.Min(colorMin, colorMax) + (int) colScore;
        }

        /// <summary>Formats the low 8 bits of <paramref name="i"/> as two upper-case hex digits.</summary>
        private static string IntToHex(int i)
        {
            return "" + hexDigits[(i & 0xF0) >> 4] + hexDigits[i & 0x0F];
        }

        /// <summary>
        /// Converts a hex string into an int. The digits are shifted into a 32-bit value four bits
        /// at a time, so an 8-digit input is read as two's complement (e.g. "FFFFFFFF" yields -1)
        /// and any digits beyond the last eight push the earlier ones out of the result.
        /// </summary>
        /// <param name="hex">
        /// A string in capital or lower case hex, of no more than 16 characters.
        /// </param>
        /// <returns>The parsed value; 0 for an empty string.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="hex"/> is null.</exception>
        /// <exception cref="FormatException">if the string is more than 16 characters long, or if any
        /// character is not in the set [0-9a-fA-F]</exception>
        public static int HexToInt(string hex)
        {
            if (hex == null)
            {
                throw new ArgumentNullException("hex");
            }

            int len = hex.Length;
            if (len > 16)
            {
                throw new FormatException();
            }

            int l = 0;
            for (int i = 0; i < len; i++)
            {
                l <<= 4;
                int c = HexDigitValue(hex[i]);
                if (c < 0)
                {
                    throw new FormatException();
                }
                l |= c;
            }
            return l;
        }

        /// <summary>
        /// Returns the value (0-15) of a single hex digit, or -1 when the character is not one.
        /// Char.GetNumericValue cannot be used here: it reports -1 for the letters A-F.
        /// </summary>
        private static int HexDigitValue(char ch)
        {
            if (ch >= '0' && ch <= '9')
            {
                return ch - '0';
            }
            if (ch >= 'a' && ch <= 'f')
            {
                return ch - 'a' + 10;
            }
            if (ch >= 'A' && ch <= 'F')
            {
                return ch - 'A' + 10;
            }
            return -1;
        }
    }
}
// \ No newline at end of file
// http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Highlighter/HISTORY.txt ---------------------------------------------------------------------- diff --git a/src/contrib/Highlighter/HISTORY.txt b/src/contrib/Highlighter/HISTORY.txt deleted file mode 100644 index 4f7fe85..0000000 --- a/src/contrib/Highlighter/HISTORY.txt +++ /dev/null @@ -1,22 +0,0 @@ -Highlighter.Net History ------------------------ - - -29Jan08: - - Release: Highlighter.Net.2.3.2 build 001 "final" - - -07Jan07: - - Release: Highlighter.Net.2.0.0 build 001 "final" - - Fix: Lucene.Net.Highlight.QueryTermExtractor.GetTerms -- was using the wrong integrator - - Fix: Lucene.Net.Highlight.TextFragment.ToString -- offset error - - Issue: TestEncoding() is failing because the test code is not fully ported to .NET - - -27Dec06: - - Release: Highlighter.Net.2.0.0 build 000 "Alpha" - - Issue: Not fully tested / validated such that many of the NUnit tests are failing. - - -05Jan05: - - Release: Highlighter.Net.1.4.0 RC1 build 001
// http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Highlighter/Highlighter.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Highlighter/Highlighter.cs b/src/contrib/Highlighter/Highlighter.cs deleted file mode 100644 index 239c0f7..0000000 --- a/src/contrib/Highlighter/Highlighter.cs +++ /dev/null @@ -1,477 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;
using Lucene.Net.Util;

namespace Lucene.Net.Search.Highlight
{
    /// <summary>
    /// Class used to markup highlighted terms found in the best sections of a
    /// text, using configurable <see cref="IFragmenter"/>, <see cref="IScorer"/>, <see cref="IFormatter"/>,
    /// <see cref="IEncoder"/> and tokenizers.
    /// </summary>
    public class Highlighter
    {
        /// <summary>Default value of <see cref="MaxDocCharsToAnalyze"/> (50 * 1024 characters).</summary>
        public static readonly int DEFAULT_MAX_CHARS_TO_ANALYZE = 50*1024;

        private int _maxDocCharsToAnalyze = DEFAULT_MAX_CHARS_TO_ANALYZE;
        private readonly IFormatter _formatter;
        private IEncoder _encoder;
        private IFragmenter _textFragmenter = new SimpleFragmenter();
        private IScorer _fragmentScorer = null;

        /// <summary>Creates a highlighter that uses a <c>SimpleHTMLFormatter</c> and a <c>DefaultEncoder</c>.</summary>
        /// <param name="fragmentScorer">scores tokens and fragments</param>
        public Highlighter(IScorer fragmentScorer)
            : this(new SimpleHTMLFormatter(), fragmentScorer)
        {
        }

        /// <summary>Creates a highlighter that uses the given formatter and a <c>DefaultEncoder</c>.</summary>
        /// <param name="formatter">marks up each matched token group</param>
        /// <param name="fragmentScorer">scores tokens and fragments</param>
        public Highlighter(IFormatter formatter, IScorer fragmentScorer)
            : this(formatter, new DefaultEncoder(), fragmentScorer)
        {
        }

        /// <summary>Creates a highlighter from explicitly supplied collaborators.</summary>
        /// <param name="formatter">marks up each matched token group</param>
        /// <param name="encoder">encodes every piece of source text before it is emitted</param>
        /// <param name="fragmentScorer">scores tokens and fragments</param>
        public Highlighter(IFormatter formatter, IEncoder encoder, IScorer fragmentScorer)
        {
            _formatter = formatter;
            _encoder = encoder;
            _fragmentScorer = fragmentScorer;
        }

        /// <summary>
        /// Highlights chosen terms in a text, extracting the most relevant section.
        /// This is a convenience method that calls <see cref="GetBestFragment(TokenStream, string)"/>
        /// </summary>
        /// <param name="analyzer">the analyzer that will be used to split <c>text</c> into chunks</param>
        /// <param name="fieldName">Name of field used to influence analyzer's tokenization policy</param>
        /// <param name="text">text to highlight terms in</param>
        /// <returns>highlighted text fragment or null if no terms found</returns>
        /// <exception cref="InvalidTokenOffsetsException">thrown if any token's endOffset exceeds the provided text's length</exception>
        public String GetBestFragment(Analyzer analyzer, String fieldName, String text)
        {
            TokenStream tokenStream = analyzer.TokenStream(fieldName, new StringReader(text));
            return GetBestFragment(tokenStream, text);
        }

        /// <summary>
        /// Highlights chosen terms in a text, extracting the most relevant section.
        /// The document text is analysed in chunks to record hit statistics
        /// across the document. After accumulating stats, the fragment with the highest score
        /// is returned
        /// </summary>
        /// <param name="tokenStream">
        /// a stream of tokens identified in the text parameter, including offset information.
        /// This is typically produced by an analyzer re-parsing a document's text.
        /// </param>
        /// <param name="text">text to highlight terms in</param>
        /// <returns>highlighted text fragment or null if no terms found</returns>
        /// <exception cref="InvalidTokenOffsetsException">thrown if any token's endOffset exceeds the provided text's length</exception>
        public String GetBestFragment(TokenStream tokenStream, String text)
        {
            String[] results = GetBestFragments(tokenStream, text, 1);
            return results.Length > 0 ? results[0] : null;
        }

        /// <summary>
        /// Highlights chosen terms in a text, extracting the most relevant sections.
        /// This is a convenience method that calls <see cref="GetBestFragments(TokenStream, string, int)"/>
        /// </summary>
        /// <param name="analyzer">the analyzer that will be used to split <c>text</c> into chunks</param>
        /// <param name="fieldName">the name of the field being highlighted (used by analyzer)</param>
        /// <param name="text">text to highlight terms in</param>
        /// <param name="maxNumFragments">the maximum number of fragments.</param>
        /// <returns>highlighted text fragments (between 0 and maxNumFragments number of fragments)</returns>
        /// <exception cref="InvalidTokenOffsetsException">thrown if any token's endOffset exceeds the provided text's length</exception>
        public String[] GetBestFragments(
            Analyzer analyzer,
            String fieldName,
            String text,
            int maxNumFragments)
        {
            TokenStream tokenStream = analyzer.TokenStream(fieldName, new StringReader(text));
            return GetBestFragments(tokenStream, text, maxNumFragments);
        }

        /// <summary>
        /// Highlights chosen terms in a text, extracting the most relevant sections.
        /// The document text is analysed in chunks to record hit statistics
        /// across the document. After accumulating stats, the fragments with the highest scores
        /// are returned as an array of strings in order of score (contiguous fragments are merged into
        /// one in their original order to improve readability)
        /// </summary>
        /// <param name="tokenStream">a stream of tokens identified in <paramref name="text"/>, including offset information</param>
        /// <param name="text">text to highlight terms in</param>
        /// <param name="maxNumFragments">the maximum number of fragments; values below 1 are treated as 1.</param>
        /// <returns>highlighted text fragments (between 0 and maxNumFragments number of fragments)</returns>
        /// <exception cref="InvalidTokenOffsetsException">thrown if any token's endOffset exceeds the provided text's length</exception>
        public String[] GetBestFragments(TokenStream tokenStream, String text, int maxNumFragments)
        {
            maxNumFragments = Math.Max(1, maxNumFragments); //sanity check

            TextFragment[] frag = GetBestTextFragments(tokenStream, text, true, maxNumFragments);

            // Keep only the text of fragments that actually scored.
            var fragTexts = new List<String>();
            foreach (TextFragment fragment in frag)
            {
                if ((fragment != null) && (fragment.Score > 0))
                {
                    fragTexts.Add(fragment.ToString());
                }
            }
            return fragTexts.ToArray();
        }

        /// <summary>
        /// Low level api to get the most relevant (formatted) sections of the document.
        /// This method has been made public to allow visibility of score information held in TextFragment objects.
        /// Thanks to Jason Calabrese for help in redefining the interface.
        /// </summary>
        /// <param name="tokenStream">
        /// a stream of tokens identified in <paramref name="text"/>; it is reset before use and
        /// closed before this method returns or throws from the analysis loop.
        /// </param>
        /// <param name="text">text to highlight terms in</param>
        /// <param name="mergeContiguousFragments">
        /// when true, fragments that were adjacent in the text are merged and fragments
        /// that are null or have a score of zero or less are dropped from the result
        /// </param>
        /// <param name="maxNumFragments">the capacity of the queue used to select the best fragments</param>
        /// <returns>the selected fragments, best score first</returns>
        /// <exception cref="InvalidTokenOffsetsException">thrown if any token's offsets exceed the provided text's length</exception>
        public TextFragment[] GetBestTextFragments(
            TokenStream tokenStream,
            String text,
            bool mergeContiguousFragments,
            int maxNumFragments)
        {
            var docFrags = new List<TextFragment>();
            var newText = new StringBuilder();

            var termAtt = tokenStream.AddAttribute<ITermAttribute>();
            var offsetAtt = tokenStream.AddAttribute<IOffsetAttribute>();
            tokenStream.AddAttribute<IPositionIncrementAttribute>();
            tokenStream.Reset();

            var currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);
            // The scorer may hand back a replacement stream; null means "keep using the one passed in".
            var newStream = _fragmentScorer.Init(tokenStream);
            if (newStream != null)
            {
                tokenStream = newStream;
            }
            _fragmentScorer.StartFragment(currentFrag);
            docFrags.Add(currentFrag);

            var fragQueue = new FragmentQueue(maxNumFragments);

            try
            {
                int lastEndOffset = 0;
                _textFragmenter.Start(text, tokenStream);

                var tokenGroup = new TokenGroup(tokenStream);

                // Stop at the end of the stream or at the first token starting beyond the analysis limit.
                while (tokenStream.IncrementToken() && (offsetAtt.StartOffset < _maxDocCharsToAnalyze))
                {
                    if ((offsetAtt.EndOffset > text.Length)
                        ||
                        (offsetAtt.StartOffset > text.Length)
                        )
                    {
                        throw new InvalidTokenOffsetsException("Token " + termAtt.Term
                                                               + " exceeds length of provided text sized " + text.Length);
                    }
                    if ((tokenGroup.NumTokens > 0) && (tokenGroup.IsDistinct()))
                    {
                        //the current token is distinct from previous tokens -
                        // markup the cached token group info
                        lastEndOffset = AppendTokenGroup(text, tokenGroup, newText, lastEndOffset);
                        tokenGroup.Clear();

                        //check if current token marks the start of a new fragment
                        if (_textFragmenter.IsNewFragment())
                        {
                            currentFrag.Score = _fragmentScorer.FragmentScore;
                            //record stats for a new fragment
                            currentFrag.TextEndPos = newText.Length;
                            currentFrag = new TextFragment(newText, newText.Length, docFrags.Count);
                            _fragmentScorer.StartFragment(currentFrag);
                            docFrags.Add(currentFrag);
                        }
                    }

                    tokenGroup.AddToken(_fragmentScorer.GetTokenScore());
                }
                currentFrag.Score = _fragmentScorer.FragmentScore;

                if (tokenGroup.NumTokens > 0)
                {
                    //flush the token group that was still pending when the loop ended
                    lastEndOffset = AppendTokenGroup(text, tokenGroup, newText, lastEndOffset);
                }

                //Test what remains of the original text beyond the point where we stopped analyzing
                if (
                    // if there is text beyond the last token considered..
                    (lastEndOffset < text.Length)
                    &&
                    // and that text is not too large...
                    (text.Length <= _maxDocCharsToAnalyze)
                    )
                {
                    //append it to the last fragment
                    newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset)));
                }

                currentFrag.TextEndPos = newText.Length;

                //keep only the most relevant sections of the text; the queue drops the
                //lowest-ranked fragment once it is full
                foreach (var f in docFrags)
                {
                    fragQueue.InsertWithOverflow(f);
                }

                //the queue pops its least fragment first, so fill the array back to front
                var frag = new TextFragment[fragQueue.Size()];
                for (int i = frag.Length - 1; i >= 0; i--)
                {
                    frag[i] = fragQueue.Pop();
                }

                //merge any contiguous fragments to improve readability
                if (mergeContiguousFragments)
                {
                    MergeContiguousFragments(frag);
                    frag = frag.Where(t => (t != null) && (t.Score > 0)).ToArray();
                }

                return frag;
            }
            finally
            {
                if (tokenStream != null)
                {
                    try
                    {
                        tokenStream.Close();
                    }
                    catch (Exception)
                    {
                        // Deliberate best-effort close: a failure here must not mask the
                        // result or an exception coming out of the analysis above.
                    }
                }
            }
        }

        /// <summary>
        /// Marks up the text covered by <paramref name="tokenGroup"/> and appends it to
        /// <paramref name="newText"/>, preceded by any not-yet-emitted source text (whitespace etc.)
        /// lying between <paramref name="lastEndOffset"/> and the start of the group.
        /// </summary>
        /// <returns>the new "last end offset": the larger of the group's end offset and <paramref name="lastEndOffset"/></returns>
        private int AppendTokenGroup(String text, TokenGroup tokenGroup, StringBuilder newText, int lastEndOffset)
        {
            int startOffset = tokenGroup.MatchStartOffset;
            int endOffset = tokenGroup.MatchEndOffset;
            String tokenText = text.Substring(startOffset, endOffset - startOffset);
            String markedUpText = _formatter.HighlightTerm(_encoder.EncodeText(tokenText), tokenGroup);
            //store any whitespace etc from between this and last group
            if (startOffset > lastEndOffset)
            {
                newText.Append(_encoder.EncodeText(text.Substring(lastEndOffset, startOffset - lastEndOffset)));
            }
            newText.Append(markedUpText);
            return Math.Max(endOffset, lastEndOffset);
        }

        /// <summary>
        /// Improves readability of a score-sorted list of TextFragments by merging any fragments
        /// that were contiguous in the original text into one larger fragment with the correct order.
        /// This will leave a "null" in the array entry for the lesser scored fragment.
        /// </summary>
        /// <param name="frag">An array of document fragments in descending score</param>
        private void MergeContiguousFragments(TextFragment[] frag)
        {
            if (frag.Length <= 1)
            {
                return;
            }

            bool mergingStillBeingDone;
            do
            {
                mergingStillBeingDone = false; //initialise loop control flag
                //for each fragment, scan other frags looking for contiguous blocks
                for (int i = 0; i < frag.Length; i++)
                {
                    if (frag[i] == null)
                    {
                        continue;
                    }
                    //merge any contiguous blocks
                    for (int x = 0; x < frag.Length; x++)
                    {
                        if (frag[x] == null)
                        {
                            continue;
                        }
                        if (frag[i] == null)
                        {
                            // slot i was emptied by a merge earlier in this inner loop
                            break;
                        }

                        // earlier = the fragment that comes first in the text, later = the one following it
                        TextFragment earlier = null;
                        TextFragment later = null;
                        int earlierNum = 0;
                        int laterNum = 0;
                        //if blocks are contiguous....
                        if (frag[i].Follows(frag[x]))
                        {
                            earlier = frag[x];
                            earlierNum = x;
                            later = frag[i];
                            laterNum = i;
                        }
                        else if (frag[x].Follows(frag[i]))
                        {
                            earlier = frag[i];
                            earlierNum = i;
                            later = frag[x];
                            laterNum = x;
                        }

                        if (earlier == null)
                        {
                            continue; // not contiguous - nothing to merge
                        }

                        // Compare the scores BEFORE merging; the merged fragment takes the
                        // slot of the better-scoring one and the other slot is cleared.
                        bool earlierScoresHigher = earlier.Score > later.Score;
                        int bestScoringFragNum = earlierScoresHigher ? earlierNum : laterNum;
                        int worstScoringFragNum = earlierScoresHigher ? laterNum : earlierNum;

                        earlier.Merge(later);
                        frag[worstScoringFragNum] = null;
                        mergingStillBeingDone = true;
                        frag[bestScoringFragNum] = earlier;
                    }
                }
            } while (mergingStillBeingDone);
        }

        /// <summary>
        /// Highlights terms in the text , extracting the most relevant sections
        /// and concatenating the chosen fragments with a separator (typically "...").
        /// The document text is analysed in chunks to record hit statistics
        /// across the document. After accumulating stats, the fragments with the highest scores
        /// are returned in order as "separator" delimited strings.
        /// </summary>
        /// <param name="tokenStream">a stream of tokens identified in <paramref name="text"/>, including offset information</param>
        /// <param name="text">text to highlight terms in</param>
        /// <param name="maxNumFragments">the maximum number of fragments.</param>
        /// <param name="separator">the separator used to intersperse the document fragments (typically "...")</param>
        /// <returns>highlighted text</returns>
        public String GetBestFragments(
            TokenStream tokenStream,
            String text,
            int maxNumFragments,
            String separator)
        {
            string[] sections = GetBestFragments(tokenStream, text, maxNumFragments);
            return string.Join(separator, sections);
        }

        /// <summary>
        /// Analysis stops at the first token whose start offset is not below this value, and
        /// trailing text is only appended when the whole text is no longer than this value.
        /// </summary>
        public int MaxDocCharsToAnalyze
        {
            get { return _maxDocCharsToAnalyze; }
            set { this._maxDocCharsToAnalyze = value; }
        }

        /// <summary>Decides where fragments start; a <c>SimpleFragmenter</c> unless replaced.</summary>
        public IFragmenter TextFragmenter
        {
            get { return _textFragmenter; }
            set { _textFragmenter = value; }
        }

        /// <summary>Scores tokens and fragments.</summary>
        public IScorer FragmentScorer
        {
            get { return _fragmentScorer; }
            set { _fragmentScorer = value; }
        }

        /// <summary>Encodes every piece of source text before it is emitted.</summary>
        public IEncoder Encoder
        {
            get { return _encoder; }
            set { this._encoder = value; }
        }
    }

    /// <summary>
    /// Priority queue of fragments ordered by score; between fragments of equal score the one
    /// with the higher <c>FragNum</c> ranks lower.
    /// </summary>
    internal class FragmentQueue : PriorityQueue<TextFragment>
    {
        public FragmentQueue(int size)
        {
            Initialize(size);
        }

        public override bool LessThan(TextFragment fragA, TextFragment fragB)
        {
            if (fragA.Score == fragB.Score)
                return fragA.FragNum > fragB.FragNum;
            else
                return fragA.Score < fragB.Score;
        }
    }
}
// http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Highlighter/IEncoder.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Highlighter/IEncoder.cs b/src/contrib/Highlighter/IEncoder.cs deleted file mode 100644 index 0e45d49..0000000 --- a/src/contrib/Highlighter/IEncoder.cs +++ /dev/null @@ -1,26 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or
// more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */

namespace Lucene.Net.Search.Highlight
{
    /// <summary>
    /// Encodes original text. The IEncoder works with the Formatter to generate the output.
    /// </summary>
    public interface IEncoder
    {
        /// <summary>Encodes a section of the original text for output.</summary>
        /// <param name="originalText">The section of text being output</param>
        /// <returns>The encoded form of <paramref name="originalText"/>.</returns>
        string EncodeText(string originalText);
    }
}
// \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Highlighter/IFormatter.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Highlighter/IFormatter.cs b/src/contrib/Highlighter/IFormatter.cs deleted file mode 100644 index 2164afd..0000000 --- a/src/contrib/Highlighter/IFormatter.cs +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License.
// You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */

namespace Lucene.Net.Search.Highlight
{
    /// <summary>
    /// Processes terms found in the original text, typically by applying some form
    /// of mark-up to highlight terms in HTML search results pages.
    /// </summary>
    public interface IFormatter
    {
        /// <summary>Produces the output text for one token group.</summary>
        /// <param name="originalText">The section of text being considered for markup</param>
        /// <param name="tokenGroup">contains one or several overlapping Tokens along with
        /// their scores and positions.</param>
        /// <returns>The text to emit in place of <paramref name="originalText"/>.</returns>
        string HighlightTerm(string originalText, TokenGroup tokenGroup);
    }
}
// \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Highlighter/IFragmenter.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Highlighter/IFragmenter.cs b/src/contrib/Highlighter/IFragmenter.cs deleted file mode 100644 index 0f35195..0000000 --- a/src/contrib/Highlighter/IFragmenter.cs +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License.
// You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */

namespace Lucene.Net.Search.Highlight
{
    // Scoped to the namespace so the two files concatenated in this span do not
    // place a using directive after a namespace declaration.
    using Lucene.Net.Analysis;

    /// <summary>
    /// Implements the policy for breaking text into multiple fragments for consideration
    /// by the <see cref="Highlighter"/> class. A sophisticated implementation may do this on the basis
    /// of detecting end of sentences in the text.
    /// </summary>
    /// <author> [email protected]
    /// </author>
    public interface IFragmenter
    {
        /// <summary>
        /// Initializes the Fragmenter. You can grab references to the Attributes you are
        /// interested in from tokenStream and then access the values in <see cref="IsNewFragment"/>.
        /// </summary>
        /// <param name="originalText">the original source text</param>
        /// <param name="tokenStream">the <see cref="TokenStream" /> to be fragmented</param>
        void Start(string originalText, TokenStream tokenStream);

        /// <summary>
        /// Test to see if this token from the stream should be held in a new
        /// TextFragment. Every time this is called, the TokenStream
        /// passed to <see cref="Start"/> will have been incremented.
        /// </summary>
        /// <returns>true when the current token should start a new fragment</returns>
        bool IsNewFragment();
    }
}
// \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Highlighter/IScorer.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Highlighter/IScorer.cs b/src/contrib/Highlighter/IScorer.cs deleted file mode 100644 index 10d4095..0000000 --- a/src/contrib/Highlighter/IScorer.cs +++ /dev/null @@ -1,62 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

namespace Lucene.Net.Search.Highlight
{
    using System.IO;
    using Lucene.Net.Analysis;

    /// <summary>Adds to the score for a fragment based on its tokens</summary>
    public interface IScorer
    {
        /// <summary>
        /// Called to init the Scorer with a <see cref="TokenStream"/>. You can grab references to
        /// the attributes you are interested in here and access them from <see cref="GetTokenScore"/>.
        /// </summary>
        /// <param name="tokenStream">the <see cref="TokenStream"/> that will be scored.</param>
        /// <returns>
        /// either a <see cref="TokenStream"/> that the Highlighter should continue using (eg
        /// if you read the token stream in this method) or null to continue
        /// using the same <see cref="TokenStream"/> that was passed in.
        /// </returns>
        /// <exception cref="IOException"></exception>
        TokenStream Init(TokenStream tokenStream);

        /// <summary>
        /// Called when a new fragment is started for consideration.
        /// </summary>
        /// <param name="newFragment">the fragment that will be scored next</param>
        void StartFragment(TextFragment newFragment);

        /// <summary>
        /// Called for each token in the current fragment. The <see cref="Highlighter"/> will
        /// increment the <see cref="TokenStream"/> passed to <see cref="Init"/> on every call.
        /// </summary>
        /// <returns>a score which is passed to the <see cref="Highlighter"/> class to influence the
        /// mark-up of the text (this return value is NOT used to score the
        /// fragment)</returns>
        float GetTokenScore();

        /// <summary>
        /// Called when the <see cref="Highlighter"/> has no more tokens for the current fragment -
        /// the Scorer returns the weighting it has derived for the most recent
        /// fragment, typically based on the results of <see cref="GetTokenScore"/>.
        /// </summary>
        float FragmentScore { get; }
    }
}
// \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Highlighter/InvalidTokenOffsetsException.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Highlighter/InvalidTokenOffsetsException.cs b/src/contrib/Highlighter/InvalidTokenOffsetsException.cs deleted file mode 100644 index 2aa745d..0000000 --- a/src/contrib/Highlighter/InvalidTokenOffsetsException.cs +++ /dev/null @@ -1,51 +0,0 @@ -/* - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License.
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - * -*/ - -using System; -using System.Collections.Generic; -using System.Linq; -using System.Runtime.Serialization; -using System.Text; - -namespace Lucene.Net.Search.Highlight -{ - [Serializable] - public class InvalidTokenOffsetsException : Exception - { - public InvalidTokenOffsetsException() - { - } - - public InvalidTokenOffsetsException(string message) : base(message) - { - } - - public InvalidTokenOffsetsException(string message, Exception inner) : base(message, inner) - { - } - - protected InvalidTokenOffsetsException( - SerializationInfo info, - StreamingContext context) : base(info, context) - { - } - } -} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Highlighter/NullFragmenter.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Highlighter/NullFragmenter.cs b/src/contrib/Highlighter/NullFragmenter.cs deleted file mode 100644 index c86dda1..0000000 --- a/src/contrib/Highlighter/NullFragmenter.cs +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -using Lucene.Net.Analysis; - -namespace Lucene.Net.Search.Highlight -{ - - /// <summary> <see cref="IFragmenter"/> implementation which does not fragment the text. - /// This is useful for highlighting the entire content of a document or field. - /// </summary> - public class NullFragmenter : IFragmenter - { - public virtual void Start(string originalText, TokenStream tokenStream) - { } - - public virtual bool IsNewFragment() - { - return false; - } - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Highlighter/Package.html ---------------------------------------------------------------------- diff --git a/src/contrib/Highlighter/Package.html b/src/contrib/Highlighter/Package.html deleted file mode 100644 index 0646b4a..0000000 --- a/src/contrib/Highlighter/Package.html +++ /dev/null @@ -1,81 +0,0 @@ -<!-- - - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. 
See the License for the - specific language governing permissions and limitations - under the License. - ---> - -<html> - <body> - The highlight package contains classes to provide "keyword in context" features - typically used to highlight search terms in the text of results pages. The - Highlighter class is the central component and can be used to extract the most - interesting sections of a piece of text and highlight them, with the help of - Fragmenter, FragmentScorer, Formatter classes. - <h2>Example Usage</h2> - <pre> - IndexSearcher searcher = new IndexSearcher(ramDir); - Query query = QueryParser.Parse("Kenne*", FIELD_NAME, analyzer); - query = query.Rewrite(reader); //required to expand search terms - Hits hits = searcher.Search(query); - - Highlighter highlighter = new Highlighter(this, new QueryScorer(query)); - for (int i = 0; i < hits.Length(); i++) - { - String text = hits.Doc(i).Get(FIELD_NAME); - TokenStream tokenStream = analyzer.TokenStream(FIELD_NAME, new StringReader(text)); - // Get 3 best fragments and separate with a "..." - String result = highlighter.GetBestFragments(tokenStream, text, 3, "..."); - System.Out.Console.WriteLine(result); - } -</pre> - <h2>New features 06/02/2005</h2> - This release adds options for encoding (thanks to Nicko Cadell). An "IEncoder" - implementation such as the new SimpleHTMLEncoder class can be passed to the - highlighter to encode all those non-xhtml standard characters such as & - into legal values.
This simple class may not suffice for some languages - - Commons Lang has an implementation that could be used: escapeHtml(String) in - http://svn.apache.org/viewcvs.cgi/jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringEscapeUtils.java?rev=137958&view=markup - <h2>New features 22/12/2004</h2> - This release adds some new capabilities: - <ol> - <li> - Faster highlighting using Term vector support</li> - <li> - New formatting options to use color intensity to show informational value</li> - <li> - Options for better summarization by using term IDF scores to influence fragment - selection</li> - </ol> - <p> - The highlighter takes a TokenStream as input. Until now these streams have - typically been produced using an Analyzer but the new class TokenSources - provides helper methods for obtaining TokenStreams from the new TermVector - position support (see latest CVS version).</p> - <p>The new class GradientFormatter can use a scale of colors to highlight terms - according to their score. A subtle use of color can help emphasise the reasons - for matching (useful when doing "MoreLikeThis" queries and you want to see what - the basis of the similarities is).</p> - <p>The QueryScorer class has a new constructor which can use an IndexReader to - derive the IDF (inverse document frequency) for each term in order to - influence the score. This is useful for helping to extract the most - significant sections of a document and in supplying scores used by the new - GradientFormatter to color significant words more strongly. The
The - QueryScorer.getMaxWeight method is useful when passed to the GradientFormatter - constructor to define the top score which is associated with the top color.</p> - </body> -</html> http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Highlighter/QueryScorer.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Highlighter/QueryScorer.cs b/src/contrib/Highlighter/QueryScorer.cs deleted file mode 100644 index c4682a0..0000000 --- a/src/contrib/Highlighter/QueryScorer.cs +++ /dev/null @@ -1,270 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -using System; -using System.Collections.Generic; -using Lucene.Net.Analysis; -using Lucene.Net.Analysis.Tokenattributes; -using Lucene.Net.Index; -using Lucene.Net.Index.Memory; -using Lucene.Net.Search.Spans; -using Lucene.Net.Support; -using Lucene.Net.Util; - -namespace Lucene.Net.Search.Highlight -{ - - ///<summary> - /// <see cref="IScorer"/> implementation which scores text fragments by the number of - /// unique query terms found. 
This class converts appropriate <see cref="Query"/>s to - /// <see cref="SpanQuery"/>s and attempts to score only those terms that participated in - /// generating the 'hit' on the document. - /// </summary> - public class QueryScorer : IScorer - { - private float totalScore; - private ISet<String> foundTerms; - private IDictionary<String, WeightedSpanTerm> fieldWeightedSpanTerms; - private float maxTermWeight; - private int position = -1; - private String defaultField; - private ITermAttribute termAtt; - private IPositionIncrementAttribute posIncAtt; - private bool expandMultiTermQuery = true; - private Query query; - private String field; - private IndexReader reader; - private bool skipInitExtractor; - private bool wrapToCaching = true; - - /// <summary> - /// Constructs a new QueryScorer instance - /// </summary> - /// <param name="query">Query to use for highlighting</param> - public QueryScorer(Query query) - { - Init(query, null, null, true); - } - - /// <summary> - /// Constructs a new QueryScorer instance - /// </summary> - /// <param name="query">Query to use for highlighting</param> - /// <param name="field">Field to highlight - pass null to ignore fields</param> - public QueryScorer(Query query, String field) - { - Init(query, field, null, true); - } - - /// <summary> - /// Constructs a new QueryScorer instance - /// </summary> - /// <param name="query">Query to use for highlighting</param> - /// <param name="reader"><see cref="IndexReader"/> to use for quasi tf/idf scoring</param> - /// <param name="field">Field to highlight - pass null to ignore fields</param> - public QueryScorer(Query query, IndexReader reader, String field) - { - Init(query, field, reader, true); - } - - /// <summary> - /// Constructs a new QueryScorer instance - /// </summary> - /// <param name="query">Query to use for highlighting</param> - /// <param name="reader"><see cref="IndexReader"/> to use for quasi tf/idf scoring</param> - /// <param name="field">Field to highlight - 
pass null to ignore fields</param> - /// <param name="defaultField">The default field for queries with the field name unspecified</param> - public QueryScorer(Query query, IndexReader reader, String field, String defaultField) - { - this.defaultField = StringHelper.Intern(defaultField); - Init(query, field, reader, true); - } - - - /// <summary> - /// Constructs a new QueryScorer instance - /// </summary> - /// <param name="query">Query to use for highlighting</param> - /// <param name="field">Field to highlight - pass null to ignore fields</param> - /// <param name="defaultField">The default field for queries with the field name unspecified</param> - public QueryScorer(Query query, String field, String defaultField) - { - this.defaultField = StringHelper.Intern(defaultField); - Init(query, field, null, true); - } - - /// <summary> - /// Constructs a new QueryScorer instance - /// </summary> - /// <param name="weightedTerms">an array of pre-created <see cref="WeightedSpanTerm"/>s</param> - public QueryScorer(WeightedSpanTerm[] weightedTerms) - { - this.fieldWeightedSpanTerms = new HashMap<String, WeightedSpanTerm>(weightedTerms.Length); - - foreach (WeightedSpanTerm t in weightedTerms) - { - WeightedSpanTerm existingTerm = fieldWeightedSpanTerms[t.Term]; - - if ((existingTerm == null) || - (existingTerm.Weight < t.Weight)) - { - // if a term is defined more than once, always use the highest - // scoring Weight - fieldWeightedSpanTerms[t.Term] = t; - maxTermWeight = Math.Max(maxTermWeight, t.Weight); - } - } - skipInitExtractor = true; - } - - /// <seealso cref="IScorer.FragmentScore"/> - public float FragmentScore - { - get { return totalScore; } - } - - /// <summary> - /// The highest weighted term (useful for passing to GradientFormatter to set top end of coloring scale). 
- /// </summary> - public float MaxTermWeight - { - get { return maxTermWeight; } - } - - /// <seealso cref="IScorer.GetTokenScore"/> - public float GetTokenScore() - { - position += posIncAtt.PositionIncrement; - String termText = termAtt.Term; - - WeightedSpanTerm weightedSpanTerm; - - if ((weightedSpanTerm = fieldWeightedSpanTerms[termText]) == null) - { - return 0; - } - - if (weightedSpanTerm.IsPositionSensitive() && - !weightedSpanTerm.CheckPosition(position)) - { - return 0; - } - - float score = weightedSpanTerm.Weight; - - // found a query term - is it unique in this doc? - if (!foundTerms.Contains(termText)) - { - totalScore += score; - foundTerms.Add(termText); - } - - return score; - } - - /// <seealso cref="IScorer.Init"/> - public TokenStream Init(TokenStream tokenStream) - { - position = -1; - termAtt = tokenStream.AddAttribute<ITermAttribute>(); - posIncAtt = tokenStream.AddAttribute<IPositionIncrementAttribute>(); - if (!skipInitExtractor) - { - if (fieldWeightedSpanTerms != null) - { - fieldWeightedSpanTerms.Clear(); - } - return InitExtractor(tokenStream); - } - return null; - } - - /// <summary> - /// Retrieve the <see cref="WeightedSpanTerm"/> for the specified token. Useful for passing - /// Span information to a <see cref="IFragmenter"/>. - /// </summary> - /// <param name="token">token to get {@link WeightedSpanTerm} for</param> - /// <returns>WeightedSpanTerm for token</returns> - public WeightedSpanTerm GetWeightedSpanTerm(String token) - { - return fieldWeightedSpanTerms[token]; - } - - private void Init(Query query, String field, IndexReader reader, bool expandMultiTermQuery) - { - this.reader = reader; - this.expandMultiTermQuery = expandMultiTermQuery; - this.query = query; - this.field = field; - } - - private TokenStream InitExtractor(TokenStream tokenStream) - { - WeightedSpanTermExtractor qse = defaultField == null - ? 
new WeightedSpanTermExtractor() - : new WeightedSpanTermExtractor(defaultField); - - qse.ExpandMultiTermQuery = expandMultiTermQuery; - qse.SetWrapIfNotCachingTokenFilter(wrapToCaching); - if (reader == null) - { - this.fieldWeightedSpanTerms = qse.GetWeightedSpanTerms(query, - tokenStream, field); - } - else - { - this.fieldWeightedSpanTerms = qse.GetWeightedSpanTermsWithScores(query, - tokenStream, field, reader); - } - if (qse.IsCachedTokenStream) - { - return qse.TokenStream; - } - - return null; - } - - /// <seealso cref="IScorer.StartFragment"/> - public void StartFragment(TextFragment newFragment) - { - foundTerms = Support.Compatibility.SetFactory.CreateHashSet<string>(); - totalScore = 0; - } - - /// <summary> - /// Controls whether or not multi-term queries are expanded - /// against a <see cref="MemoryIndex"/> <see cref="IndexReader"/>. - /// </summary> - public bool IsExpandMultiTermQuery - { - get { return expandMultiTermQuery; } - set { this.expandMultiTermQuery = value; } - } - - /// <summary> - /// By default, <see cref="TokenStream"/>s that are not of the type - /// <see cref="CachingTokenFilter"/> are wrapped in a <see cref="CachingTokenFilter"/> to - /// ensure an efficient reset - if you are already using a different caching - /// <see cref="TokenStream"/> impl and you don't want it to be wrapped, set this to - /// false. 
- /// </summary> - public void SetWrapIfNotCachingTokenFilter(bool wrap) - { - this.wrapToCaching = wrap; - } - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Highlighter/QueryTermExtractor.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Highlighter/QueryTermExtractor.cs b/src/contrib/Highlighter/QueryTermExtractor.cs deleted file mode 100644 index 2a5bc7e..0000000 --- a/src/contrib/Highlighter/QueryTermExtractor.cs +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -using System; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using Lucene.Net.Index; -using Lucene.Net.Util; - -namespace Lucene.Net.Search.Highlight -{ - - /// <summary> Utility class used to extract the terms used in a query, plus any weights. 
- /// This class will not find terms for MultiTermQuery, RangeQuery and PrefixQuery classes - /// so the caller must pass a rewritten query (see Query.rewrite) to obtain a list of - /// expanded terms.</summary> - public static class QueryTermExtractor - { - - /// <summary> Extracts all terms texts of a given Query into an array of WeightedTerms - /// - /// </summary> - /// <param name="query"> Query to extract term texts from - /// </param> - /// <returns> an array of the terms used in a query, plus their weights. - /// </returns> - public static WeightedTerm[] GetTerms(Query query) - { - return GetTerms(query, false); - } - - /// <summary> Extracts all terms texts of a given Query into an array of WeightedTerms - /// - /// </summary> - /// <param name="query">Query to extract term texts from</param> - /// <param name="reader">used to compute IDF which can be used to a) score selected fragments better - /// b) use graded highlights eg chaning intensity of font color</param> - /// <param name="fieldName">the field on which Inverse Document Frequency (IDF) calculations are based</param> - /// <returns> an array of the terms used in a query, plus their weights.</returns> - public static WeightedTerm[] GetIdfWeightedTerms(Query query, IndexReader reader, string fieldName) - { - WeightedTerm[] terms = GetTerms(query, false, fieldName); - int totalNumDocs = reader.NumDocs(); - foreach (WeightedTerm t in terms) - { - try - { - int docFreq = reader.DocFreq(new Term(fieldName, t.Term)); - // docFreq counts deletes - if (totalNumDocs < docFreq) - { - docFreq = totalNumDocs; - } - //IDF algorithm taken from DefaultSimilarity class - var idf = (float)(Math.Log((float)totalNumDocs / (double)(docFreq + 1)) + 1.0); - t.Weight *= idf; - } - catch (IOException e) - { - //ignore - } - } - return terms; - } - - /// <summary>Extracts all terms texts of a given Query into an array of WeightedTerms</summary> - /// <param name="query">Query to extract term texts from</param> - /// 
<param name="prohibited"><c>true</c> to extract "prohibited" terms, too </param> - /// <param name="fieldName"> The fieldName used to filter query terms</param> - /// <returns>an array of the terms used in a query, plus their weights.</returns> - public static WeightedTerm[] GetTerms(Query query, bool prohibited, string fieldName) - { - var terms = new HashSet<WeightedTerm>(); - if (fieldName != null) - { - fieldName = StringHelper.Intern(fieldName); - } - GetTerms(query, terms, prohibited, fieldName); - return terms.ToArray(); - } - - /// <summary> Extracts all terms texts of a given Query into an array of WeightedTerms - /// - /// </summary> - /// <param name="query"> Query to extract term texts from - /// </param> - /// <param name="prohibited"><c>true</c> to extract "prohibited" terms, too - /// </param> - /// <returns> an array of the terms used in a query, plus their weights. - /// </returns> - public static WeightedTerm[] GetTerms(Query query, bool prohibited) - { - return GetTerms(query, prohibited, null); - } - - //fieldname MUST be interned prior to this call - private static void GetTerms(Query query, HashSet<WeightedTerm> terms, bool prohibited, string fieldName) - { - try - { - if (query is BooleanQuery) - GetTermsFromBooleanQuery((BooleanQuery) query, terms, prohibited, fieldName); - else if (query is FilteredQuery) - GetTermsFromFilteredQuery((FilteredQuery) query, terms, prohibited, fieldName); - else - { - var nonWeightedTerms = Support.Compatibility.SetFactory.CreateHashSet<Term>(); - query.ExtractTerms(nonWeightedTerms); - foreach (var term in nonWeightedTerms) - { - if ((fieldName == null) || (term.Field == fieldName)) - { - terms.Add(new WeightedTerm(query.Boost, term.Text)); - } - } - } - } - catch (System.NotSupportedException ignore) - { - //this is non-fatal for our purposes - } - } - - /// <summary> extractTerms is currently the only query-independent means of introspecting queries but it only reveals - /// a list of terms for that query - 
not the boosts each individual term in that query may or may not have. - /// "Container" queries such as BooleanQuery should be unwrapped to get at the boost info held - /// in each child element. - /// Some discussion around this topic here: - /// http://www.gossamer-threads.com/lists/lucene/java-dev/34208?search_string=introspection;#34208 - /// Unfortunately there seemed to be limited interest in requiring all Query objects to implement - /// something common which would allow access to child queries so what follows here are query-specific - /// implementations for accessing embedded query elements. - /// </summary> - private static void GetTermsFromBooleanQuery(BooleanQuery query, HashSet<WeightedTerm> terms, bool prohibited, string fieldName) - { - BooleanClause[] queryClauses = query.GetClauses(); - for (int i = 0; i < queryClauses.Length; i++) - { - if (prohibited || queryClauses[i].Occur != Occur.MUST_NOT) - GetTerms(queryClauses[i].Query, terms, prohibited, fieldName); - } - } - private static void GetTermsFromFilteredQuery(FilteredQuery query, HashSet<WeightedTerm> terms, bool prohibited, string fieldName) - { - GetTerms(query.Query, terms, prohibited, fieldName); - } - } -} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Highlighter/QueryTermScorer.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Highlighter/QueryTermScorer.cs b/src/contrib/Highlighter/QueryTermScorer.cs deleted file mode 100644 index 4fa6691..0000000 --- a/src/contrib/Highlighter/QueryTermScorer.cs +++ /dev/null @@ -1,190 +0,0 @@ -/* - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - * -*/ - -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using Lucene.Net.Analysis; -using Lucene.Net.Analysis.Tokenattributes; -using Lucene.Net.Index; -using Lucene.Net.Support; - -namespace Lucene.Net.Search.Highlight -{ - /* - * {@link Scorer} implementation which scores text fragments by the number of - * unique query terms found. This class uses the {@link QueryTermExtractor} - * class to process determine the query terms and their boosts to be used. 
- */ - // TODO: provide option to boost score of fragments near beginning of document - // based on fragment.getFragNum() - public class QueryTermScorer : IScorer - { - private TextFragment currentTextFragment = null; - private HashSet<String> uniqueTermsInFragment; - - private float totalScore = 0; - private float maxTermWeight = 0; - private HashMap<String, WeightedTerm> termsToFind; - - private ITermAttribute termAtt; - - /* - * - * @param query a Lucene query (ideally rewritten using query.rewrite before - * being passed to this class and the searcher) - */ - - public QueryTermScorer(Query query) - : this(QueryTermExtractor.GetTerms(query)) - { - } - - /* - * - * @param query a Lucene query (ideally rewritten using query.rewrite before - * being passed to this class and the searcher) - * @param fieldName the Field name which is used to match Query terms - */ - - public QueryTermScorer(Query query, String fieldName) - : this(QueryTermExtractor.GetTerms(query, false, fieldName)) - { - } - - /* - * - * @param query a Lucene query (ideally rewritten using query.rewrite before - * being passed to this class and the searcher) - * @param reader used to compute IDF which can be used to a) score selected - * fragments better b) use graded highlights eg set font color - * intensity - * @param fieldName the field on which Inverse Document Frequency (IDF) - * calculations are based - */ - - public QueryTermScorer(Query query, IndexReader reader, String fieldName) - : this(QueryTermExtractor.GetIdfWeightedTerms(query, reader, fieldName)) - { - } - - public QueryTermScorer(WeightedTerm[] weightedTerms) - { - termsToFind = new HashMap<String, WeightedTerm>(); - for (int i = 0; i < weightedTerms.Length; i++) - { - WeightedTerm existingTerm = termsToFind[weightedTerms[i].Term]; - if ((existingTerm == null) - || (existingTerm.Weight < weightedTerms[i].Weight)) - { - // if a term is defined more than once, always use the highest scoring - // Weight - 
termsToFind[weightedTerms[i].Term] = weightedTerms[i]; - maxTermWeight = Math.Max(maxTermWeight, weightedTerms[i].Weight); - } - } - } - - /* (non-Javadoc) - * @see org.apache.lucene.search.highlight.Scorer#init(org.apache.lucene.analysis.TokenStream) - */ - - public TokenStream Init(TokenStream tokenStream) - { - termAtt = tokenStream.AddAttribute<ITermAttribute>(); - return null; - } - - /* - * (non-Javadoc) - * - * @see - * org.apache.lucene.search.highlight.FragmentScorer#startFragment(org.apache - * .lucene.search.highlight.TextFragment) - */ - - public void StartFragment(TextFragment newFragment) - { - uniqueTermsInFragment = new HashSet<String>(); - currentTextFragment = newFragment; - totalScore = 0; - - } - - - /* (non-Javadoc) - * @see org.apache.lucene.search.highlight.Scorer#getTokenScore() - */ - - public float GetTokenScore() - { - String termText = termAtt.Term; - - WeightedTerm queryTerm = termsToFind[termText]; - if (queryTerm == null) - { - // not a query term - return - return 0; - } - // found a query term - is it unique in this doc? - if (!uniqueTermsInFragment.Contains(termText)) - { - totalScore += queryTerm.Weight; - uniqueTermsInFragment.Add(termText); - } - return queryTerm.Weight; - } - - - /* (non-Javadoc) - * @see org.apache.lucene.search.highlight.Scorer#getFragmentScore() - */ - - public float FragmentScore - { - get { return totalScore; } - } - - /* - * (non-Javadoc) - * - * @see - * org.apache.lucene.search.highlight.FragmentScorer#allFragmentsProcessed() - */ - - public void AllFragmentsProcessed() - { - // this class has no special operations to perform at end of processing - } - - /* - * - * @return The highest weighted term (useful for passing to GradientFormatter - * to set top end of coloring scale. 
- */ - - public float MaxTermWeight - { - get { return maxTermWeight; } - } - } -} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Highlighter/SimpleFragmenter.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Highlighter/SimpleFragmenter.cs b/src/contrib/Highlighter/SimpleFragmenter.cs deleted file mode 100644 index 8149dca..0000000 --- a/src/contrib/Highlighter/SimpleFragmenter.cs +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -using System; -using Lucene.Net.Analysis; -using Lucene.Net.Analysis.Tokenattributes; - -namespace Lucene.Net.Search.Highlight -{ - - /// <summary> <see cref="IFragmenter"/> implementation which breaks text up into same-size - /// fragments with no concerns over spotting sentence boundaries. 
- /// </summary> - /// <author> [email protected] - /// </author> - public class SimpleFragmenter : IFragmenter - { - private static int DEFAULT_FRAGMENT_SIZE = 100; - private int currentNumFrags; - private int fragmentSize; - private IOffsetAttribute offsetAtt; - - public SimpleFragmenter() - : this(DEFAULT_FRAGMENT_SIZE) - { - } - - /* - * - * @param fragmentSize size in number of characters of each fragment - */ - - public SimpleFragmenter(int fragmentSize) - { - this.fragmentSize = fragmentSize; - } - - - /* (non-Javadoc) - * @see org.apache.lucene.search.highlight.Fragmenter#start(java.lang.String, org.apache.lucene.analysis.TokenStream) - */ - - public void Start(String originalText, TokenStream stream) - { - offsetAtt = stream.AddAttribute<IOffsetAttribute>(); - currentNumFrags = 1; - } - - - /* (non-Javadoc) - * @see org.apache.lucene.search.highlight.Fragmenter#isNewFragment() - */ - - public bool IsNewFragment() - { - bool isNewFrag = offsetAtt.EndOffset >= (fragmentSize*currentNumFrags); - if (isNewFrag) - { - currentNumFrags++; - } - return isNewFrag; - } - - /// <summary> - /// Gets or sets the size in number of characters of each fragment - /// </summary> - public int FragmentSize - { - get { return fragmentSize; } - set { fragmentSize = value; } - } - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Highlighter/SimpleHTMLEncoder.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Highlighter/SimpleHTMLEncoder.cs b/src/contrib/Highlighter/SimpleHTMLEncoder.cs deleted file mode 100644 index 80833de..0000000 --- a/src/contrib/Highlighter/SimpleHTMLEncoder.cs +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -using System; -using System.Text; - -namespace Lucene.Net.Search.Highlight -{ - /// <summary> Simple <see cref="IEncoder"/> implementation to escape text for HTML output</summary> - public class SimpleHTMLEncoder : IEncoder - { - public SimpleHTMLEncoder() - { - } - - public String EncodeText(String originalText) - { - return HtmlEncode(originalText); - } - - /* - * Encode string into HTML - */ - public static String HtmlEncode(String plainText) - { - if (string.IsNullOrEmpty(plainText)) - { - return string.Empty; - } - - var result = new StringBuilder(plainText.Length); - - for (int index = 0; index < plainText.Length; index++) - { - char ch = plainText[index]; - - switch (ch) - { - case '"': - result.Append("&quot;"); - break; - - case '&': - result.Append("&amp;"); - break; - - case '<': - result.Append("&lt;"); - break; - - case '>': - result.Append("&gt;"); - break; - - default: - if (ch < 128) - { - result.Append(ch); - } - else - { - result.Append("&#").Append((int)ch).Append(";"); - } - break; - } - } - - return result.ToString(); - } - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Highlighter/SimpleHTMLFormatter.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Highlighter/SimpleHTMLFormatter.cs b/src/contrib/Highlighter/SimpleHTMLFormatter.cs deleted file mode 100644 index 
ea51b05..0000000 --- a/src/contrib/Highlighter/SimpleHTMLFormatter.cs +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -namespace Lucene.Net.Search.Highlight -{ - /// <summary> Simple <see cref="IFormatter"/> implementation to highlight terms with a pre and post tag</summary> - /// <author> MAHarwood - /// - /// </author> - public class SimpleHTMLFormatter : IFormatter - { - internal System.String preTag; - internal System.String postTag; - - - public SimpleHTMLFormatter(System.String preTag, System.String postTag) - { - this.preTag = preTag; - this.postTag = postTag; - } - - /// <summary> Default constructor uses HTML: &lt;B&gt; tags to markup terms - /// - /// - /// </summary> - public SimpleHTMLFormatter() - { - this.preTag = "<B>"; - this.postTag = "</B>"; - } - - /* (non-Javadoc) - * <see cref="Lucene.Net.Highlight.Formatter.highlightTerm(java.lang.String, Lucene.Net.Highlight.TokenGroup)"/> - */ - public virtual System.String HighlightTerm(System.String originalText, TokenGroup tokenGroup) - { - System.Text.StringBuilder returnBuffer; - if (tokenGroup.TotalScore > 0) - { - returnBuffer = new System.Text.StringBuilder(); - returnBuffer.Append(preTag); - 
returnBuffer.Append(originalText); - returnBuffer.Append(postTag); - return returnBuffer.ToString(); - } - return originalText; - } - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Highlighter/SimpleSpanFragmenter.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Highlighter/SimpleSpanFragmenter.cs b/src/contrib/Highlighter/SimpleSpanFragmenter.cs deleted file mode 100644 index 51ab54f..0000000 --- a/src/contrib/Highlighter/SimpleSpanFragmenter.cs +++ /dev/null @@ -1,112 +0,0 @@ -/* - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- * -*/ - -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using Lucene.Net.Analysis; -using Lucene.Net.Analysis.Tokenattributes; - -namespace Lucene.Net.Search.Highlight -{ - public class SimpleSpanFragmenter : IFragmenter - { - private static int DEFAULT_FRAGMENT_SIZE = 100; - private int fragmentSize; - private int currentNumFrags; - private int position = -1; - private QueryScorer queryScorer; - private int waitForPos = -1; - private int textSize; - private ITermAttribute termAtt; - private IPositionIncrementAttribute posIncAtt; - private IOffsetAttribute offsetAtt; - - /// <param name="queryScorer">QueryScorer that was used to score hits</param> - public SimpleSpanFragmenter(QueryScorer queryScorer) - : this(queryScorer, DEFAULT_FRAGMENT_SIZE) - { - - } - - /// <param name="queryScorer">QueryScorer that was used to score hits</param> - /// <param name="fragmentSize">size in bytes of each fragment</param> - public SimpleSpanFragmenter(QueryScorer queryScorer, int fragmentSize) - { - this.fragmentSize = fragmentSize; - this.queryScorer = queryScorer; - } - - /// <seealso cref="IFragmenter.IsNewFragment"/> - public bool IsNewFragment() - { - position += posIncAtt.PositionIncrement; - - if (waitForPos == position) - { - waitForPos = -1; - } - else if (waitForPos != -1) - { - return false; - } - - WeightedSpanTerm wSpanTerm = queryScorer.GetWeightedSpanTerm(termAtt.Term); - - if (wSpanTerm != null) - { - List<PositionSpan> positionSpans = wSpanTerm.GetPositionSpans(); - - for (int i = 0; i < positionSpans.Count; i++) - { - if (positionSpans[i].Start == position) - { - waitForPos = positionSpans[i].End + 1; - break; - } - } - } - - bool isNewFrag = offsetAtt.EndOffset >= (fragmentSize*currentNumFrags) - && (textSize - offsetAtt.EndOffset) >= ((uint) fragmentSize >> 1); - - - if (isNewFrag) - { - currentNumFrags++; - } - - return isNewFrag; - } - - /// <seealso cref="IFragmenter.Start(string, TokenStream)"/> - public void 
Start(String originalText, TokenStream tokenStream) - { - position = -1; - currentNumFrags = 1; - textSize = originalText.Length; - termAtt = tokenStream.AddAttribute<ITermAttribute>(); - posIncAtt = tokenStream.AddAttribute<IPositionIncrementAttribute>(); - offsetAtt = tokenStream.AddAttribute<IOffsetAttribute>(); - } - } -}
