This is an automated email from the ASF dual-hosted git repository. nightowl888 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/lucenenet.git
commit 2e320ea9cd19788db380d6872da53918a977cc3c Author: Shad Storhaug <[email protected]> AuthorDate: Tue Mar 30 19:42:54 2021 +0700 docs: Lucene.Net.Highlighter: Fixed broken formatting and links (see #284, #300) --- src/Lucene.Net.Highlighter/Highlight/package.md | 103 +++++++++++++++------ .../VectorHighlight/package.md | 62 +++++++------ src/Lucene.Net.Highlighter/overview.md | 11 ++- 3 files changed, 118 insertions(+), 58 deletions(-) diff --git a/src/Lucene.Net.Highlighter/Highlight/package.md b/src/Lucene.Net.Highlighter/Highlight/package.md index 9181d31..7d5de00 100644 --- a/src/Lucene.Net.Highlighter/Highlight/package.md +++ b/src/Lucene.Net.Highlighter/Highlight/package.md @@ -1,4 +1,4 @@ ---- +--- uid: Lucene.Net.Search.Highlight summary: *content --- @@ -25,35 +25,82 @@ The highlight package contains classes to provide "keyword in context" features typically used to highlight search terms in the text of results pages. The Highlighter class is the central component and can be used to extract the most interesting sections of a piece of text and highlight them, with the help of -Fragmenter, fragment Scorer, and Formatter classes. +[Fragmenter](xref:Lucene.Net.Search.Highlight.IFragmenter), fragment [Scorer](xref:Lucene.Net.Search.Highlight.IScorer), and [Formatter](xref:Lucene.Net.Search.Highlight.IFormatter) classes. ## Example Usage - //... 
Above, create documents with two fields, one with term vectors (tv) and one without (notv) - IndexSearcher searcher = new IndexSearcher(directory); - QueryParser parser = new QueryParser("notv", analyzer); - Query query = parser.parse("million"); - - TopDocs hits = searcher.search(query, 10); - - SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(); - Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query)); - for (int i = 0; i < 10;="" i++)="" {="" int="" id="hits.scoreDocs[i].doc;" document="" doc="searcher.doc(id);" string="" text="doc.get(" notv");"="" tokenstream="" tokenstream="TokenSources.getAnyTokenStream(searcher.getIndexReader()," id,="" "notv",="" analyzer);="" textfragment[]="" frag="highlighter.getBestTextFragments(tokenStream," text,="" false,="" 10);//highlighter.getbestfragments(tokenstream,="" text,="" 3,="" "...");="" for="" (int="" j="0;" j="">< frag.length;="" j++)=" [...] - System.out.println((frag[j].toString())); - } +```cs +const LuceneVersion matchVersion = LuceneVersion.LUCENE_48; +Analyzer analyzer = new StandardAnalyzer(matchVersion); + +// Create an index to search +string indexPath = Path.Combine(Path.GetTempPath(), Path.GetFileNameWithoutExtension(Path.GetTempFileName())); +Directory dir = FSDirectory.Open(indexPath); +using IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(matchVersion, analyzer)); + +// This field must store term vectors and term vector offsets +var fieldType = new FieldType(TextField.TYPE_STORED) +{ + StoreTermVectors = true, + StoreTermVectorOffsets = true +}; +fieldType.Freeze(); + +// Create documents with two fields, one with term vectors (tv) and one without (notv) +writer.AddDocument(new Document { + new Field("tv", "Thanks a million!", fieldType), + new TextField("notv", "A million ways to win.", Field.Store.YES) +}); +writer.AddDocument(new Document { + new Field("tv", "Hopefully, this won't highlight a million times.", fieldType), + new TextField("notv", 
"There are a million different ways to do that!", Field.Store.YES) +}); + +using IndexReader indexReader = writer.GetReader(applyAllDeletes: true); +writer.Dispose(); + +// Now search our index using an existing or new IndexReader + +IndexSearcher searcher = new IndexSearcher(indexReader); +QueryParser parser = new QueryParser(matchVersion, "notv", analyzer); +Query query = parser.Parse("million"); + +TopDocs hits = searcher.Search(query, 10); + +SimpleHTMLFormatter htmlFormatter = new SimpleHTMLFormatter(); +Highlighter highlighter = new Highlighter(htmlFormatter, new QueryScorer(query)); +int totalScoreDocs = hits.ScoreDocs.Length > 10 ? 10 : hits.ScoreDocs.Length; +for (int i = 0; i < totalScoreDocs; i++) +{ + int id = hits.ScoreDocs[i].Doc; + Document doc = searcher.Doc(id); + string text = doc.Get("notv"); + TokenStream tokenStream = TokenSources.GetAnyTokenStream(searcher.IndexReader, id, "notv", analyzer); + TextFragment[] frag = highlighter.GetBestTextFragments( + tokenStream, text, mergeContiguousFragments: false, maxNumFragments: 10); // highlighter.GetBestFragments(tokenStream, text, 3, "..."); + for (int j = 0; j < frag.Length; j++) + { + if (frag[j] != null && frag[j].Score > 0) + { + Console.WriteLine(frag[j].ToString()); } - //Term vector - text = doc.get("tv"); - tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), hits.scoreDocs[i].doc, "tv", analyzer); - frag = highlighter.getBestTextFragments(tokenStream, text, false, 10); - for (int j = 0; j < frag.length;="" j++)="" {="" if="" ((frag[j]="" !="null)" &&="" (frag[j].getscore()=""> 0)) { - System.out.println((frag[j].toString())); - } + } + //Term vector + text = doc.Get("tv"); + tokenStream = TokenSources.GetAnyTokenStream(searcher.IndexReader, hits.ScoreDocs[i].Doc, "tv", analyzer); + frag = highlighter.GetBestTextFragments(tokenStream, text, false, 10); + for (int j = 0; j < frag.Length; j++) + { + if (frag[j] != null && frag[j].Score > 0) + { + 
Console.WriteLine(frag[j].ToString()); } - System.out.println("-------------"); - } + } + Console.WriteLine("-------------"); +} +``` -## New features 06/02/2005 +## New features 2005-02-06 This release adds options for encoding (thanks to Nicko Cadell). @@ -62,7 +109,7 @@ all those non-xhtml standard characters such as & into legal values. This simple some languages - Commons Lang has an implementation that could be used: escapeHtml(String) in http://svn.apache.org/viewcvs.cgi/jakarta/commons/proper/lang/trunk/src/java/org/apache/commons/lang/StringEscapeUtils.java?rev=137958&view=markup -## New features 22/12/2004 +## New features 2004-12-22 This release adds some new capabilities: @@ -73,8 +120,8 @@ This release adds some new capabilities: 3. Options for better summarization by using term IDF scores to influence fragment selection - The highlighter takes a TokenStream as input. Until now these streams have typically been produced using an Analyzer but the new class TokenSources provides helper methods for obtaining TokenStreams from the new TermVector position support (see latest CVS version). +The highlighter takes a <xref:Lucene.Net.Analysis.TokenStream> as input. Until now these streams have typically been produced using an <xref:Lucene.Net.Analysis.Analyzer> but the new class TokenSources provides helper methods for obtaining TokenStreams from the new TermVector position support (see latest CVS version). -The new class GradientFormatter can use a scale of colors to highlight terms according to their score. A subtle use of color can help emphasise the reasons for matching (useful when doing "MoreLikeThis" queries and you want to see what the basis of the similarities are). +The new class <xref:Lucene.Net.Search.Highlight.GradientFormatter> can use a scale of colors to highlight terms according to their score. 
A subtle use of color can help emphasize the reasons for matching (useful when doing "MoreLikeThis" queries and you want to see what the basis of the similarities are). -The QueryScorer class has a new constructor which can use an IndexReader to derive the IDF (inverse document frequency) for each term in order to influence the score. This is useful for helping to extracting the most significant sections of a document and in supplying scores used by the new GradientFormatter to color significant words more strongly. The QueryScorer.getMaxWeight method is useful when passed to the GradientFormatter constructor to define the top score which is associated w [...] \ No newline at end of file +The <xref:Lucene.Net.Search.Highlight.QueryScorer> class has a new constructor which can use an <xref:Lucene.Net.Index.IndexReader> to derive the IDF (inverse document frequency) for each term in order to influence the score. This is useful for helping to extract the most significant sections of a document and in supplying scores used by the new GradientFormatter to color significant words more strongly. The [QueryScorer.MaxTermWeight](xref:Lucene.Net.Search.Highlight.QueryScorer#Luce [...] \ No newline at end of file diff --git a/src/Lucene.Net.Highlighter/VectorHighlight/package.md b/src/Lucene.Net.Highlighter/VectorHighlight/package.md index 3aaa474..224fa4f 100644 --- a/src/Lucene.Net.Highlighter/VectorHighlight/package.md +++ b/src/Lucene.Net.Highlighter/VectorHighlight/package.md @@ -1,4 +1,4 @@ ---- +--- uid: Lucene.Net.Search.VectorHighlight summary: *content --- @@ -32,8 +32,6 @@ This is an another highlighter implementation. * support multi-term (includes wildcard, range, regexp, etc) queries -* need Java 1.5 - * highlight fields need to be stored with Positions and Offsets * take into account query boost and/or IDF-weight to score fragments @@ -77,12 +75,15 @@ For your convenience, here is the offsets and positions info of the sample text. 
In Step 1, Fast Vector Highlighter generates <xref:Lucene.Net.Search.VectorHighlight.FieldQuery.QueryPhraseMap> from the user query. `QueryPhraseMap` consists of the following members: - public class QueryPhraseMap { - boolean terminal; - int slop; // valid if terminal == true and phraseHighlight == true - float boost; // valid if terminal == true - Map<String, QueryPhraseMap> subMap; - } +```cs +public class QueryPhraseMap +{ + bool terminal; + int slop; // valid if terminal == true and phraseHighlight == true + float boost; // valid if terminal == true + IDictionary<string, QueryPhraseMap> subMap; +} +``` `QueryPhraseMap` has subMap. The key of the subMap is a term text in the user query and the value is a subsequent `QueryPhraseMap`. If the query is a term (not phrase), then the subsequent `QueryPhraseMap` is marked as terminal. If the query is a phrase, then the subsequent `QueryPhraseMap` is not a terminal and it has the next term text in the phrase. @@ -93,13 +94,13 @@ From the sample user query, the following `QueryPhraseMap` will be generated: |"Lucene"|o+->|boost=2|*| * : terminal +--------+-+ +-------+-+ -+--------+-+ +---------+-+ +-------+------+-+ + +--------+-+ +---------+-+ +-------+------+-+ |"search"|o+->|"library"|o+->|boost=1|slop=1|*| +--------+-+ +---------+-+ +-------+------+-+ ### Step 2. -In Step 2, Fast Vector Highlighter generates <xref:Lucene.Net.Search.VectorHighlight.FieldTermStack>. Fast Vector Highlighter uses term vector data (must be stored [#setStoreTermVectorOffsets(boolean)](xref:Lucene.Net.Documents.FieldType) and [#setStoreTermVectorPositions(boolean)](xref:Lucene.Net.Documents.FieldType)) to generate it. `FieldTermStack` keeps the terms in the user query. Therefore, in this sample case, Fast Vector Highlighter generates the following `FieldTermStack`: +In Step 2, Fast Vector Highlighter generates <xref:Lucene.Net.Search.VectorHighlight.FieldTermStack>. 
Fast Vector Highlighter uses term vector data (must be stored [FieldType.StoreTermVectorOffsets = true](xref:Lucene.Net.Documents.FieldType#Lucene_Net_Documents_FieldType_StoreTermVectorOffsets) and [FieldType.StoreTermVectorPositions = true](xref:Lucene.Net.Documents.FieldType#Lucene_Net_Documents_FieldType_StoreTermVectorPositions)) to generate it. `FieldTermStack` keeps the terms in t [...] FieldTermStack +------------------+ @@ -136,25 +137,32 @@ In Step 4, Fast Vector Highlighter creates `FieldFragList` by reference to `Fiel +---------------------------------+ The calculation for each `FieldFragList.WeightedFragInfo.totalBoost` (weight) -depends on the implementation of `FieldFragList.add( ... )`: - - public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ) { - float totalBoost = 0; - List<SubInfo> subInfos = new ArrayList<SubInfo>(); - for( WeightedPhraseInfo phraseInfo : phraseInfoList ){ - subInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(), phraseInfo.getSeqnum() ) ); - totalBoost += phraseInfo.getBoost(); - } - getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, subInfos, totalBoost ) ); - } +depends on the implementation of `FieldFragList.Add( ... )`: + +```cs +public override void Add(int startOffset, int endOffset, IList<WeightedPhraseInfo> phraseInfoList) +{ + float totalBoost = 0; + List<SubInfo> subInfos = new List<SubInfo>(); + foreach (WeightedPhraseInfo phraseInfo in phraseInfoList) + { + subInfos.Add(new SubInfo(phraseInfo.GetText(), phraseInfo.TermsOffsets, phraseInfo.Seqnum, phraseInfo.Boost)); + totalBoost += phraseInfo.Boost; + } + FragInfos.Add(new WeightedFragInfo(startOffset, endOffset, subInfos, totalBoost)); +} +``` The used implementation of `FieldFragList` is noted in `BaseFragListBuilder.createFieldFragList( ... 
)`: - public FieldFragList createFieldFragList( FieldPhraseList fieldPhraseList, int fragCharSize ){ - return createFieldFragList( fieldPhraseList, new SimpleFieldFragList( fragCharSize ), fragCharSize ); - } +```cs +public override FieldFragList CreateFieldFragList(FieldPhraseList fieldPhraseList, int fragCharSize) +{ + return CreateFieldFragList(fieldPhraseList, new SimpleFieldFragList(fragCharSize), fragCharSize); +} +``` - Currently there are basically to approaches available: +Currently there are basically two approaches available: * `SimpleFragListBuilder using SimpleFieldFragList`: _sum-of-boosts_-approach. The totalBoost is calculated by summarizing the query-boosts per term. Per default a term is boosted by 1.0 @@ -187,4 +195,4 @@ Comparison of the two approaches: ### Step 5. -In Step 5, by using `FieldFragList` and the field stored data, Fast Vector Highlighter creates highlighted snippets! \ No newline at end of file +In Step 5, by using <xref:Lucene.Net.Search.VectorHighlight.FieldFragList> and the field stored data, Fast Vector Highlighter creates highlighted snippets! \ No newline at end of file diff --git a/src/Lucene.Net.Highlighter/overview.md b/src/Lucene.Net.Highlighter/overview.md index 4580146..a62ce0b 100644 --- a/src/Lucene.Net.Highlighter/overview.md +++ b/src/Lucene.Net.Highlighter/overview.md @@ -1,4 +1,4 @@ ---- +--- uid: Lucene.Net.Highlighter title: Lucene.Net.Highlighter summary: *content @@ -21,5 +21,10 @@ summary: *content limitations under the License. --> - The highlight package contains classes to provide "keyword in context" features - typically used to highlight search terms in the text of results pages. \ No newline at end of file +The highlight package contains classes to provide "keyword in context" features typically used to highlight search terms in the text of results pages. There are 3 main highlighters: + +* <xref:Lucene.Net.Search.Highlight> - A lightweight highlighter for basic usage. 
+ +* <xref:Lucene.Net.Search.PostingsHighlight> (In the <xref:Lucene.Net.ICU> package) - Highlighter implementation that uses offsets from postings lists. This highlighter supports Unicode. + +* <xref:Lucene.Net.Search.VectorHighlight> - This highlighter is fast for large docs, supports N-gram fields, multi-term highlighting, colored highlight tags, and more. There is a <xref:Lucene.Net.Search.VectorHighlight.BreakIteratorBoundaryScanner> in the <xref:Lucene.Net.ICU> package that can be added on for Unicode support. \ No newline at end of file
