ehatcher    2004/04/08 17:34:31

  Added:       contributions/highlighter build.xml
               contributions/highlighter/src/java/org/apache/lucene/search/highlight
                        Formatter.java Fragmenter.java Highlighter.java
                        HighlighterTest.java QueryScorer.java
                        QueryTermExtractor.java Scorer.java
                        SimpleFragmenter.java SimpleHTMLFormatter.java
                        TextFragment.java WeightedTerm.java package.html
  Log:
  Initial commit of Mark Harwood's Highlighter package
  
  Revision  Changes    Path
  1.1                  jakarta-lucene-sandbox/contributions/highlighter/build.xml
  
  Index: build.xml
  ===================================================================
  <?xml version="1.0"?>
  
  <project name="highlighter" default="default">
  
    <description>
      Hits highlighter
    </description>
  
    <import file="../common.xml"/>
  </project>
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/Formatter.java
  
  Index: Formatter.java
  ===================================================================
  package org.apache.lucene.search.highlight;

  /**

   * Copyright 2002-2004 The Apache Software Foundation

   *

   * Licensed under the Apache License, Version 2.0 (the "License");

   * you may not use this file except in compliance with the License.

   * You may obtain a copy of the License at

   *

   *     http://www.apache.org/licenses/LICENSE-2.0

   *

   * Unless required by applicable law or agreed to in writing, software

   * distributed under the License is distributed on an "AS IS" BASIS,

   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

   * See the License for the specific language governing permissions and

   * limitations under the License.

   */

  

  

  /**
   * Processes terms found in the original text, typically by applying some form 
   * of mark-up to highlight terms in HTML search results pages.
   *
   */
  public interface Formatter
  {
    /**
     * Highlights a search term. For example, an HTML Formatter could simply do:
     *
     * <p><dl><dt></dt><dd><code>return "&lt;b&gt;" + term + "&lt;/b&gt;";</code></dd></dl>
     *
     * @param originalTermText (unstemmed) term text to highlight
     * @param stemmedTerm the stemmed form of the originalTermText
     * @param score the score for this term returned by the fragment Scorer (zero if the term is not part of the query)
     * @param startOffset the position of the originalTermText in the text being highlighted
     *
     * @return highlighted term text
     */
    String highlightTerm(String originalTermText, String stemmedTerm, float score, int startOffset);
  }
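
  For illustration only (not part of this commit), a custom Formatter might look like the
  sketch below; the class name and markup are assumptions, and the SimpleHTMLFormatter
  committed further down plays this role in the package using <B> tags.

  package org.apache.lucene.search.highlight;

  // Hypothetical example: wrap query terms in a CSS-styled span.
  public class SpanFormatter implements Formatter
  {
    public String highlightTerm(String originalTermText, String stemmedTerm, float score, int startOffset)
    {
      if (score <= 0)
      {
        // a zero score means the token is not a query term - leave it unmarked
        return originalTermText;
      }
      return "<span class=\"highlight\">" + originalTermText + "</span>";
    }
  }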

  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/Fragmenter.java
  
  Index: Fragmenter.java
  ===================================================================
  package org.apache.lucene.search.highlight;

  /**

   * Copyright 2002-2004 The Apache Software Foundation

   *

   * Licensed under the Apache License, Version 2.0 (the "License");

   * you may not use this file except in compliance with the License.

   * You may obtain a copy of the License at

   *

   *     http://www.apache.org/licenses/LICENSE-2.0

   *

   * Unless required by applicable law or agreed to in writing, software

   * distributed under the License is distributed on an "AS IS" BASIS,

   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

   * See the License for the specific language governing permissions and

   * limitations under the License.

   */

  

  import org.apache.lucene.analysis.Token;

  

  /**
   * Implements the policy for breaking text into multiple fragments for consideration
   * by the {@link Highlighter} class. A sophisticated implementation may do this on the basis
   * of detecting end of sentences in the text. 
   * @author Mark Harwood
   */
  public interface Fragmenter
  {
        /**
         * Initializes the Fragmenter
         * @param originalText
         */
        public void start(String originalText);

        /**
         * Test to see if this token from the stream should be held in a new TextFragment
         * @param nextToken
         * @return true if a new fragment should be started for this token
         */
        public boolean isNewFragment(Token nextToken);
  }
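
  The javadoc above mentions sentence detection as a more sophisticated policy; as a rough
  sketch (class name and heuristic are assumptions, not part of this commit), such a
  Fragmenter could start a new fragment whenever sentence-ending punctuation appears in the
  gap between tokens:

  package org.apache.lucene.search.highlight;

  import org.apache.lucene.analysis.Token;

  // Hypothetical example: break fragments at '.', '!' or '?' boundaries.
  public class SentenceFragmenter implements Fragmenter
  {
    private String text;
    private int lastEndOffset;

    public void start(String originalText)
    {
      text = originalText;
      lastEndOffset = 0;
    }

    public boolean isNewFragment(Token nextToken)
    {
      // inspect the characters between the previous token and this one
      int gapStart = Math.min(lastEndOffset, nextToken.startOffset());
      String gap = text.substring(gapStart, nextToken.startOffset());
      lastEndOffset = nextToken.endOffset();
      return gap.indexOf('.') >= 0 || gap.indexOf('!') >= 0 || gap.indexOf('?') >= 0;
    }
  }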

  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
  
  Index: Highlighter.java
  ===================================================================
  package org.apache.lucene.search.highlight;

  /**

   * Copyright 2002-2004 The Apache Software Foundation

   *

   * Licensed under the Apache License, Version 2.0 (the "License");

   * you may not use this file except in compliance with the License.

   * You may obtain a copy of the License at

   *

   *     http://www.apache.org/licenses/LICENSE-2.0

   *

   * Unless required by applicable law or agreed to in writing, software

   * distributed under the License is distributed on an "AS IS" BASIS,

   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

   * See the License for the specific language governing permissions and

   * limitations under the License.

   */

  

  import java.io.IOException;

  import java.util.ArrayList;

  import java.util.Iterator;

  

  import org.apache.lucene.analysis.TokenStream;

  import org.apache.lucene.util.PriorityQueue;

  

  /**
   * Class used to markup highlighted terms found in the best sections of a 
   * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter} 
   * and tokenizers.
   * @author Mark Harwood
   */

  public class Highlighter

  {

  

        public static final  int DEFAULT_MAX_DOC_BYTES_TO_ANALYZE=50*1024;

        private int maxDocBytesToAnalyze=DEFAULT_MAX_DOC_BYTES_TO_ANALYZE;

        private Formatter formatter;

        private Fragmenter textFragmenter=new SimpleFragmenter();

        private Scorer fragmentScorer=null;

  

        public Highlighter(Scorer fragmentScorer)

        {

                this(new SimpleHTMLFormatter(),fragmentScorer);

        }

        

        

        public Highlighter(Formatter formatter, Scorer fragmentScorer)

        {

                this.formatter = formatter;

                this.fragmentScorer = fragmentScorer;

        }

        

  

  

  

        /**
         * Highlights chosen terms in a text, extracting the most relevant section.
         * The document text is analysed in chunks to record hit statistics
         * across the document. After accumulating stats, the fragment with the highest score
         * is returned.
         *
         * @param tokenStream   a stream of tokens identified in the text parameter, including offset information. 
         * This is typically produced by an analyzer re-parsing a document's 
         * text. Some work may be done on retrieving TokenStreams more efficiently 
         * by adding support for storing original text position data in the Lucene
         * index but this support is not currently available (as of Lucene 1.4 rc2).  
         * @param text text to highlight terms in
         *
         * @return highlighted text fragment or null if no terms found
         */

        public final String getBestFragment(TokenStream tokenStream, String text)

                throws IOException

        {

                String[] results = getBestFragments(tokenStream,text, 1);

                if (results.length > 0)

                {

                        return results[0];

                }

                return null;

        }

        /**
         * Highlights chosen terms in a text, extracting the most relevant sections.
         * The document text is analysed in chunks to record hit statistics
         * across the document. After accumulating stats, the fragments with the highest scores
         * are returned as an array of strings in order of score (contiguous fragments 
         * are merged into one in their original order to improve readability)
         *
         * @param text          text to highlight terms in
         * @param maxNumFragments  the maximum number of fragments.
         *
         * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
         */

        public final String[] getBestFragments(

                TokenStream tokenStream,        

                String text,

                int maxNumFragments)

                throws IOException

        {

                maxNumFragments = Math.max(1, maxNumFragments); //sanity check

                StringBuffer newText = new StringBuffer();

                

                TextFragment[] frag = getBestDocFragments(tokenStream, text, newText, maxNumFragments);

  

                mergeContiguousFragments(frag);

  

                //Get text

                ArrayList fragTexts = new ArrayList();

                int n = 0;

                for (int i = 0; i < frag.length; i++)

                {

                        if ((frag[i] != null) && (frag[i].getScore() > 0))

                        {

                                fragTexts.add(

                                        newText.substring(

                                                frag[i].textStartPos,

                                                frag[i].textEndPos));

                        }

                }

                return (String[]) fragTexts.toArray(new String[0]);

        }

  

        /**

         * Low level api to get the most relevant sections of the document

         * @param tokenStream

         * @param text

         * @param maxNumFragments

         * @return 

         * @throws IOException

         */

        private final TextFragment[] getBestDocFragments(

                TokenStream tokenStream,        

                String text,

                StringBuffer newText,

                int maxNumFragments)

                throws IOException

        {

                ArrayList docFrags = new ArrayList();

  

                TextFragment currentFrag = new TextFragment(newText.length(), docFrags.size());

                fragmentScorer.startFragment(currentFrag);

                docFrags.add(currentFrag);

        

                FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

  

                try

                {

                        org.apache.lucene.analysis.Token token;

                        String tokenText;

                        int startOffset;

                        int endOffset;

                        int lastEndOffset = 0;

                        textFragmenter.start(text);

  

                        while ((token = tokenStream.next()) != null)

                        {

                                

                                startOffset = token.startOffset();

                                endOffset = token.endOffset();          

                                //FIXME an issue was reported with CJKTokenizer that I couldn't reproduce
                                // where the analyzer was producing overlapping tokens.
                                // I suspect the fix is to make startOffset=Math.max(startOffset,lastEndOffset+1)
                                // but can't be sure so I'll just leave this comment in for now
                                tokenText = text.substring(startOffset, endOffset);

  

  

                                // append text between end of last token (or beginning of text) and start of current token
                                if (startOffset > lastEndOffset)
                                        newText.append(text.substring(lastEndOffset, startOffset));

  

                                // does query contain current token?
                                float score = fragmentScorer.getTokenScore(token);
                                newText.append(formatter.highlightTerm(tokenText, token.termText(), score, startOffset));

                                

  

                                if (textFragmenter.isNewFragment(token))
                                {
                                        currentFrag.setScore(fragmentScorer.getFragmentScore());
                                        //record stats for a new fragment
                                        currentFrag.textEndPos = newText.length();
                                        currentFrag = new TextFragment(newText.length(), docFrags.size());
                                        fragmentScorer.startFragment(currentFrag);
                                        docFrags.add(currentFrag);
                                }

  

                                lastEndOffset = endOffset;

                                if(lastEndOffset>maxDocBytesToAnalyze)

                                {

                                        break;

                                }

                        }

                        currentFrag.setScore(fragmentScorer.getFragmentScore());

                        

  

                        // append text after end of last token

                        if (lastEndOffset < text.length())

                                newText.append(text.substring(lastEndOffset));

  

                        currentFrag.textEndPos = newText.length();

  

                        //sort the most relevant sections of the text

                        int minScore = 0;

                        for (Iterator i = docFrags.iterator(); i.hasNext();)

                        {

                                currentFrag = (TextFragment) i.next();

  

                                //If you are running with a version of Lucene before 11th Sept 03
                                // you do not have PriorityQueue.insert() - so uncomment the code below
                                /*
                                if (currentFrag.getScore() >= minScore)
                                {
                                        fragQueue.put(currentFrag);
                                        if (fragQueue.size() > maxNumFragments)
                                        { // if hit queue overfull
                                                fragQueue.pop(); // remove lowest in hit queue
                                                minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
                                        }
                                }
                                */
                                //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
                                //fix to PriorityQueue. The correct method to use here is the new "insert" method
                                // USE ABOVE CODE IF THIS DOES NOT COMPILE!
                                fragQueue.insert(currentFrag);

                        }

  

                        //return the most relevant fragments

                        TextFragment frag[] = new TextFragment[fragQueue.size()];

                        for (int i = frag.length - 1; i >= 0; i--)

                        {

                                frag[i] = (TextFragment) fragQueue.pop();

                        }

                        return frag;

  

                }

                finally

                {

                        if (tokenStream != null)

                        {

                                try

                                {

                                        tokenStream.close();

                                }

                                catch (Exception e)

                                {

                                }

                        }

                }

        }

  

  

        /** Improves readability of a score-sorted list of TextFragments by merging any fragments 
         * that were contiguous in the original text into one larger fragment with the correct order.
         * This will leave a "null" in the array entry for the lesser scored fragment. 
         * 
         * @param frag An array of document fragments in descending score
         */

        private void mergeContiguousFragments(TextFragment[] frag)

        {

                boolean mergingStillBeingDone;

                if (frag.length > 1)

                        do

                        {

                                mergingStillBeingDone = false; //initialise loop control flag
                                //for each fragment, scan other frags looking for contiguous blocks

                                for (int i = 0; i < frag.length; i++)

                                {

                                        if (frag[i] == null)

                                        {

                                                continue;

                                        }

                                        //merge any contiguous blocks 

                                        for (int x = 0; x < frag.length; x++)

                                        {

                                                if (frag[x] == null)

                                                {

                                                        continue;

                                                }

                                                if (frag[i] == null)

                                                {

                                                        break;

                                                }

                                                TextFragment frag1 = null;

                                                TextFragment frag2 = null;

                                                int frag1Num = 0;

                                                int frag2Num = 0;

                                                int bestScoringFragNum;

                                                int worstScoringFragNum;

                                                //if blocks are contiguous....

                                                if (frag[i].follows(frag[x]))

                                                {

                                                        frag1 = frag[x];

                                                        frag1Num = x;

                                                        frag2 = frag[i];

                                                        frag2Num = i;

                                                }

                                                else

                                                        if (frag[x].follows(frag[i]))

                                                        {

                                                                frag1 = frag[i];

                                                                frag1Num = i;

                                                                frag2 = frag[x];

                                                                frag2Num = x;

                                                        }

                                                //merging required..
                                                if (frag1 != null)
                                                {
                                                        if (frag1.getScore() > frag2.getScore())
                                                        {
                                                                bestScoringFragNum = frag1Num;
                                                                worstScoringFragNum = frag2Num;
                                                        }
                                                        else
                                                        {
                                                                bestScoringFragNum = frag2Num;
                                                                worstScoringFragNum = frag1Num;
                                                        }
                                                        frag1.merge(frag2);
                                                        frag[worstScoringFragNum] = null;
                                                        mergingStillBeingDone = true;
                                                        frag[bestScoringFragNum] = frag1;
                                                }

                                        }

                                }

                        }

                        while (mergingStillBeingDone);

        }

        

        

        /**
         * Highlights terms in the text, extracting the most relevant sections
         * and concatenating the chosen fragments with a separator (typically "...").
         * The document text is analysed in chunks to record hit statistics
         * across the document. After accumulating stats, the fragments with the highest scores
         * are returned in order as "separator" delimited strings.
         *
         * @param text        text to highlight terms in
         * @param maxNumFragments  the maximum number of fragments.
         * @param separator  the separator used to intersperse the document fragments (typically "...")
         *
         * @return highlighted text
         */

        public final String getBestFragments(

                TokenStream tokenStream,        

                String text,

                int maxNumFragments,

                String separator)

                throws IOException

        {

                String sections[] = getBestFragments(tokenStream, text, maxNumFragments);

                StringBuffer result = new StringBuffer();

                for (int i = 0; i < sections.length; i++)

                {

                        if (i > 0)

                        {

                                result.append(separator);

                        }

                        result.append(sections[i]);

                }

                return result.toString();

        }

  

        /**

         * @return the maximum number of bytes to be tokenized per doc 

         */

        public int getMaxDocBytesToAnalyze()

        {

                return maxDocBytesToAnalyze;

        }

  

        /**

         * @param byteCount the maximum number of bytes to be tokenized per doc

         * (This can improve performance with large documents)

         */

        public void setMaxDocBytesToAnalyze(int byteCount)

        {

                maxDocBytesToAnalyze = byteCount;

        }

  

        /**

         * @return

         */

        public Fragmenter getTextFragmenter()

        {

                return textFragmenter;

        }

  

        /**

         * @param fragmenter

         */

        public void setTextFragmenter(Fragmenter fragmenter)

        {

                textFragmenter = fragmenter;

        }

  

        /**

         * @return Object used to score each text fragment 

         */

        public Scorer getFragmentScorer()

        {

                return fragmentScorer;

        }

  

  

        /**

         * @param scorer

         */

        public void setFragmentScorer(Scorer scorer)

        {

                fragmentScorer = scorer;

        }

  

  

  }

  class FragmentQueue extends PriorityQueue

  {

        public FragmentQueue(int size)

        {

                initialize(size);

        }

  

        public final boolean lessThan(Object a, Object b)

        {

                TextFragment fragA = (TextFragment) a;

                TextFragment fragB = (TextFragment) b;

                if (fragA.getScore() == fragB.getScore())

                        return fragA.fragNum > fragB.fragNum;

                else

                        return fragA.getScore() < fragB.getScore();

        }

  }
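
  Putting the pieces together, typical usage looks roughly like the sketch below. It is
  assembled from the HighlighterTest that follows; the helper name is hypothetical, and it
  assumes a prior search has produced hits, an Analyzer, and a query already expanded with
  query.rewrite(reader) so that wildcard/fuzzy/prefix terms can be found.

  // Hypothetical helper mirroring HighlighterTest.testSimpleHighlighter()
  void printBestFragments(Hits hits, Analyzer analyzer, Query query) throws IOException
  {
        Highlighter highlighter = new Highlighter(new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter(40));
        for (int i = 0; i < hits.length(); i++)
        {
                String text = hits.doc(i).get("contents");
                TokenStream tokenStream = analyzer.tokenStream("contents", new StringReader(text));
                // best two fragments, joined with "..."
                System.out.println(highlighter.getBestFragments(tokenStream, text, 2, "..."));
        }
  }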

  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/HighlighterTest.java
  
  Index: HighlighterTest.java
  ===================================================================
  package org.apache.lucene.search.highlight;

  /**

   * Copyright 2002-2004 The Apache Software Foundation

   *

   * Licensed under the Apache License, Version 2.0 (the "License");

   * you may not use this file except in compliance with the License.

   * You may obtain a copy of the License at

   *

   *     http://www.apache.org/licenses/LICENSE-2.0

   *

   * Unless required by applicable law or agreed to in writing, software

   * distributed under the License is distributed on an "AS IS" BASIS,

   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

   * See the License for the specific language governing permissions and

   * limitations under the License.

   */

  

  import java.io.IOException;

  import java.io.StringReader;

  

  import junit.framework.TestCase;

  

  import org.apache.lucene.analysis.Analyzer;

  import org.apache.lucene.analysis.TokenStream;

  //import org.apache.lucene.analysis.cjk.CJKAnalyzer;

  import org.apache.lucene.analysis.standard.StandardAnalyzer;

  import org.apache.lucene.document.Document;

  import org.apache.lucene.document.Field;

  import org.apache.lucene.index.IndexReader;

  import org.apache.lucene.index.IndexWriter;

  import org.apache.lucene.queryParser.ParseException;

  import org.apache.lucene.queryParser.QueryParser;

  import org.apache.lucene.search.Hits;

  import org.apache.lucene.search.IndexSearcher;

  import org.apache.lucene.search.MultiSearcher;

  import org.apache.lucene.search.Query;

  import org.apache.lucene.search.Searcher;

  import org.apache.lucene.store.RAMDirectory;

  

  /**
   * JUnit Test for Highlighter class.
   * @author Mark Harwood
   */

  public class HighlighterTest extends TestCase implements Formatter

  {

        private IndexReader reader;

        private static final String FIELD_NAME = "contents";

        private Query query;

        RAMDirectory ramDir;

        public Searcher searcher = null;

        public Hits hits = null;

        int numHighlights = 0;

        Analyzer analyzer=new StandardAnalyzer();

  

        String texts[] =
                {
                        "Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot",
                        "This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very long in the middle and finally ends with another reference to Kennedy",
                        "JFK has been shot",
                        "John Kennedy has been shot",
                        "This text has a typo in referring to Keneddy" };

  

        /**

         * Constructor for HighlightExtractorTest.

         * @param arg0

         */

        public HighlighterTest(String arg0)

        {

                super(arg0);

        }

  

        public void testSimpleHighlighter() throws Exception
        {
                doSearching("Kennedy");
                Highlighter highlighter = new Highlighter(new QueryScorer(query));
                highlighter.setTextFragmenter(new SimpleFragmenter(40));
                int maxNumFragmentsRequired = 2;
                for (int i = 0; i < hits.length(); i++)
                {
                        String text = hits.doc(i).get(FIELD_NAME);
                        TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));

                        String result =
                                highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
                        System.out.println("\t" + result);
                }
                //Not sure we can assert anything here - just running to check we don't throw any exceptions 
        }

  

  

  

        public void testGetBestFragmentsSimpleQuery() throws Exception

        {

                doSearching("Kennedy");

                doStandardHighlights();

                assertTrue("Failed to find correct number of highlights " + 
numHighlights + " found", numHighlights == 4);

        }

        public void testGetFuzzyFragments() throws Exception

        {

                doSearching("Kinnedy~");

                doStandardHighlights();

                assertTrue("Failed to find correct number of highlights " + 
numHighlights + " found", numHighlights == 4);

        }

  

        public void testGetWildCardFragments() throws Exception

        {

                doSearching("K?nnedy");

                doStandardHighlights();

                assertTrue("Failed to find correct number of highlights " + 
numHighlights + " found", numHighlights == 4);

        }

        public void testGetMidWildCardFragments() throws Exception

        {

                doSearching("K*dy");

                doStandardHighlights();

                assertTrue("Failed to find correct number of highlights " + 
numHighlights + " found", numHighlights == 5);

        }

        public void testGetRangeFragments() throws Exception

        {

                doSearching(FIELD_NAME + ":[kannedy TO kznnedy]"); //bug? needs lower case

                doStandardHighlights();

                assertTrue("Failed to find correct number of highlights " + 
numHighlights + " found", numHighlights == 5);

        }

  

        public void testGetBestFragmentsPhrase() throws Exception

        {

                doSearching("\"John Kennedy\"");

                doStandardHighlights();

                //Currently highlights "John" and "Kennedy" separately

                assertTrue("Failed to find correct number of highlights " + 
numHighlights + " found", numHighlights == 2);

        }

  

        public void testGetBestFragmentsMultiTerm() throws Exception

        {

                doSearching("John Kenn*");

                doStandardHighlights();

                assertTrue("Failed to find correct number of highlights " + 
numHighlights + " found", numHighlights == 5);

        }

        public void testGetBestFragmentsWithOr() throws Exception

        {

                doSearching("JFK OR Kennedy");

                doStandardHighlights();

                assertTrue("Failed to find correct number of highlights " + 
numHighlights + " found", numHighlights == 5);

        }

  

  

        public void testGetBestSingleFragment() throws Exception

        {

                doSearching("Kennedy");

  //            QueryHighlightExtractor highlighter = new 
QueryHighlightExtractor(this, query, new StandardAnalyzer());

                Highlighter highlighter =new Highlighter(this,new QueryScorer(query));

                highlighter.setTextFragmenter(new SimpleFragmenter(40));

  

                for (int i = 0; i < hits.length(); i++)

                {

                        String text = hits.doc(i).get(FIELD_NAME);

                        TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new 
StringReader(text));

                        String result = highlighter.getBestFragment(tokenStream,text);

                        System.out.println("\t" + result);

                }

                assertTrue("Failed to find correct number of highlights " + 
numHighlights + " found", numHighlights == 4);

        }

        

        public void testGetBestSingleFragmentWithWeights() throws Exception

        {

                WeightedTerm[]wTerms=new WeightedTerm[2];

                wTerms[0]=new WeightedTerm(10f,"hello");

                wTerms[1]=new WeightedTerm(1f,"kennedy");

                Highlighter highlighter =new Highlighter(new QueryScorer(wTerms));

                TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new 
StringReader(texts[0]));

                highlighter.setTextFragmenter(new SimpleFragmenter(2));

                

                String result = 
highlighter.getBestFragment(tokenStream,texts[0]).trim();

                assertTrue("Failed to find best section using weighted terms. Found: 
"+result

                        , "<B>Hello</B>".equals(result));

  

                //readjust weights

                wTerms[1].setWeight(50f);

                tokenStream=analyzer.tokenStream(FIELD_NAME,new 
StringReader(texts[0]));

                highlighter =new Highlighter(new QueryScorer(wTerms));

                highlighter.setTextFragmenter(new SimpleFragmenter(2));

                

                result = highlighter.getBestFragment(tokenStream,texts[0]).trim();

                assertTrue("Failed to find best section using weighted terms. Found: 
"+result

                        , "<B>kennedy</B>".equals(result));

        }

        

        

        

        public void testGetSimpleHighlight() throws Exception

        {

                doSearching("Kennedy");

                Highlighter highlighter =

                        new Highlighter(this,new QueryScorer(query));

  

                for (int i = 0; i < hits.length(); i++)

                {

                        String text = hits.doc(i).get(FIELD_NAME);

                        TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new 
StringReader(text));

                        

                        String result = highlighter.getBestFragment(tokenStream,text);

                        System.out.println("\t" + result);

                }

                assertTrue("Failed to find correct number of highlights " + 
numHighlights + " found", numHighlights == 4);

        }

  

        public void testMaxSizeHighlight() throws Exception

        {

                doSearching("meat");

                Highlighter highlighter =

                        new Highlighter(this,new QueryScorer(query));

                highlighter.setMaxDocBytesToAnalyze(30);

                TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new 
StringReader(texts[0]));

                String result = highlighter.getBestFragment(tokenStream,texts[0]);

                assertTrue("Setting MaxDocBytesToAnalyze should have prevented " +
                        "us from finding matches for this record: " + numHighlights +
                        " found", numHighlights == 0);

        }

  

  

        

        public void testUnRewrittenQuery() throws IOException, ParseException

        {

                //test to show how rewritten query can still be used

                searcher = new IndexSearcher(ramDir);

                Analyzer analyzer=new StandardAnalyzer();

                Query query = QueryParser.parse("JF? or Kenned*", FIELD_NAME, 
analyzer);

                System.out.println("Searching with primitive query");

                //forget to set this and...

                //query=query.rewrite(reader);

                Hits hits = searcher.search(query);

  

                //create an instance of the highlighter with the tags used to surround 
highlighted text

  //            QueryHighlightExtractor highlighter = new 
QueryHighlightExtractor(this, query, new StandardAnalyzer());

                Highlighter highlighter =

                        new Highlighter(this,new QueryScorer(query));

  

                highlighter.setTextFragmenter(new SimpleFragmenter(40));               
 

  

                int maxNumFragmentsRequired = 3;

  

                for (int i = 0; i < hits.length(); i++)

                {

                        String text = hits.doc(i).get(FIELD_NAME);

                        TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new 
StringReader(text));

                        

                        String highlightedText = 
highlighter.getBestFragments(tokenStream,text,maxNumFragmentsRequired,"...");

                        System.out.println(highlightedText);

                }

                //We expect to have zero highlights if the query is multi-terms and is 
not rewritten!

                assertTrue("Failed to find correct number of highlights " + 
numHighlights + " found", numHighlights == 0);

        }

        

        public void testNoFragments() throws Exception

        {

                doSearching("AnInvalidQueryWhichShouldYieldNoResults");

                Highlighter highlighter =

                        new Highlighter(this,new QueryScorer(query));

  

                int highlightFragmentSizeInBytes = 40;

                for (int i = 0; i < texts.length; i++)

                {

                        String text = texts[i];

                        TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new 
StringReader(text));

                        

                        String result = highlighter.getBestFragment(tokenStream,text);

                        assertNull("The highlight result should be null for text with 
no query terms", result);

                }

        }

        

        public void testMultiSearcher() throws Exception

        {

                //setup index 1

                RAMDirectory ramDir1 = new RAMDirectory();

                IndexWriter writer1 = new IndexWriter(ramDir1, new StandardAnalyzer(), 
true);

                Document d = new Document();

                Field f = new Field(FIELD_NAME, "multiOne", true, true, true);

                d.add(f);               

                writer1.addDocument(d);

                writer1.optimize();

                writer1.close();

                IndexReader reader1 = IndexReader.open(ramDir1);

  

                //setup index 2

                RAMDirectory ramDir2 = new RAMDirectory();

                IndexWriter writer2 = new IndexWriter(ramDir2, new StandardAnalyzer(), 
true);

                d = new Document();

                f = new Field(FIELD_NAME, "multiTwo", true, true, true);

                d.add(f);               

                writer2.addDocument(d);

                writer2.optimize();

                writer2.close();

                IndexReader reader2 = IndexReader.open(ramDir2);

  

                

  

                IndexSearcher searchers[]=new IndexSearcher[2]; 

                searchers[0] = new IndexSearcher(ramDir1);

                searchers[1] = new IndexSearcher(ramDir2);

                MultiSearcher multiSearcher=new MultiSearcher(searchers);

                query = QueryParser.parse("multi*", FIELD_NAME, new 
StandardAnalyzer());

                System.out.println("Searching for: " + query.toString(FIELD_NAME));

                //at this point the multisearcher calls combine(query[])

                hits = multiSearcher.search(query);

  

                //query = QueryParser.parse("multi*", FIELD_NAME, new 
StandardAnalyzer());

                Query expandedQueries[]=new Query[2];

                expandedQueries[0]=query.rewrite(reader1);

                expandedQueries[1]=query.rewrite(reader2);

                query=query.combine(expandedQueries);

                

                

                //create an instance of the highlighter with the tags used to surround 
highlighted text

                Highlighter highlighter =

                        new Highlighter(this,new QueryScorer(query));

  

                for (int i = 0; i < hits.length(); i++)

                {

                        String text = hits.doc(i).get(FIELD_NAME);

                        TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new 
StringReader(text));

                        String highlightedText = 
highlighter.getBestFragment(tokenStream,text);

                        System.out.println(highlightedText);

                }

                assertTrue("Failed to find correct number of highlights " + 
numHighlights + " found", numHighlights == 2);              

                

                

                

        }

        

  /*    

  

        public void testBigramAnalyzer() throws IOException, ParseException

        {

                //test to ensure analyzers with none-consecutive start/end offsets

                //dont double-highlight text

                //setup index 1

                RAMDirectory ramDir = new RAMDirectory();

                Analyzer bigramAnalyzer=new CJKAnalyzer();

                IndexWriter writer = new IndexWriter(ramDir,bigramAnalyzer , true);

                Document d = new Document();

                Field f = new Field(FIELD_NAME, "java abc def", true, true, true);

                d.add(f);

                writer.addDocument(d);          

                writer.close();

                IndexReader reader = IndexReader.open(ramDir);

  

                IndexSearcher searcher=new IndexSearcher(reader); 

                query = QueryParser.parse("abc", FIELD_NAME, bigramAnalyzer);

                System.out.println("Searching for: " + query.toString(FIELD_NAME));

                hits = searcher.search(query);

  

                Highlighter highlighter =

                        new Highlighter(this,new QueryFragmentScorer(query));

  

                for (int i = 0; i < hits.length(); i++)

                {

                        String text = hits.doc(i).get(FIELD_NAME);

                        TokenStream 
tokenStream=bigramAnalyzer.tokenStream(FIELD_NAME,new StringReader(text));

                        String highlightedText = 
highlighter.getBestFragment(tokenStream,text);

                        System.out.println(highlightedText);

                }               

                

        }

  */    

  

  

        public String highlightTerm(String originalText , String weightedTerm, float 
score, int startOffset)

        {

                if(score<=0)

                {

                        return originalText;

                }

                numHighlights++; //update stats used in assertions

                return "<b>" + originalText + "</b>";

        }

  

        public void doSearching(String queryString) throws Exception
        {
                searcher = new IndexSearcher(ramDir);
                query = QueryParser.parse(queryString, FIELD_NAME, new StandardAnalyzer());
                //for any multi-term queries to work (prefix, wildcard, range, fuzzy etc) you must use a rewritten query! 
                query = query.rewrite(reader);
                System.out.println("Searching for: " + query.toString(FIELD_NAME));
                hits = searcher.search(query);
        }

  

        void doStandardHighlights() throws Exception

        {

                Highlighter highlighter =new Highlighter(this,new QueryScorer(query));

                highlighter.setTextFragmenter(new SimpleFragmenter(20));

                for (int i = 0; i < hits.length(); i++)

                {

                        String text = hits.doc(i).get(FIELD_NAME);

                        int maxNumFragmentsRequired = 2;

                        String fragmentSeparator = "...";

                        TokenStream tokenStream=analyzer.tokenStream(FIELD_NAME,new 
StringReader(text));

                        

                        String result =

                                highlighter.getBestFragments(

                                        tokenStream,

                                        text,

                                        maxNumFragmentsRequired,

                                        fragmentSeparator);

                        System.out.println("\t" + result);

                }

        }

  

        /*

         * @see TestCase#setUp()

         */

        protected void setUp() throws Exception

        {

                ramDir = new RAMDirectory();

                IndexWriter writer = new IndexWriter(ramDir, new StandardAnalyzer(), 
true);

                for (int i = 0; i < texts.length; i++)

                {

                        addDoc(writer, texts[i]);

                }

  

                writer.optimize();

                writer.close();

                reader = IndexReader.open(ramDir);

                numHighlights = 0;

        }

  

        private void addDoc(IndexWriter writer, String text) throws IOException

        {

                Document d = new Document();

                Field f = new Field(FIELD_NAME, text, true, true, true);

                d.add(f);

                writer.addDocument(d);

  

        }

  

        /*

         * @see TestCase#tearDown()

         */

        protected void tearDown() throws Exception

        {

                super.tearDown();

        }

  

  }

  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java
  
  Index: QueryScorer.java
  ===================================================================
  package org.apache.lucene.search.highlight;

  /**

   * Copyright 2002-2004 The Apache Software Foundation

   *

   * Licensed under the Apache License, Version 2.0 (the "License");

   * you may not use this file except in compliance with the License.

   * You may obtain a copy of the License at

   *

   *     http://www.apache.org/licenses/LICENSE-2.0

   *

   * Unless required by applicable law or agreed to in writing, software

   * distributed under the License is distributed on an "AS IS" BASIS,

   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

   * See the License for the specific language governing permissions and

   * limitations under the License.

   */

  

  import java.util.HashMap;

  import java.util.HashSet;

  

  import org.apache.lucene.analysis.Token;

  import org.apache.lucene.search.Query;

  

  /**
   * {@link Scorer} implementation which scores text fragments by the number of unique query terms found.
   * This class uses the {@link QueryTermExtractor} class to determine the query terms and 
   * their boosts to be used. 
   * @author Mark Harwood
   */
  //TODO: provide option to roll idf into the scoring equation by passing a IndexReader.
  //TODO: provide option to boost score of fragments near beginning of document 
  // based on fragment.getFragNum()

  public class QueryScorer implements Scorer

  {

        TextFragment currentTextFragment=null;

        HashSet uniqueTermsInFragment;

        float totalScore=0;

        private HashMap termsToFind;

        

  

        /**

         * 

         * @param query a Lucene query (ideally rewritten using query.rewrite 

         * before being passed to this class and the searcher)

         */

        public QueryScorer(Query query)

        {

                this(QueryTermExtractor.getTerms(query));

        }

  

  

        public QueryScorer(WeightedTerm []weightedTerms )

        {

                termsToFind = new HashMap();

                for (int i = 0; i < weightedTerms.length; i++)

                {

                        termsToFind.put(weightedTerms[i].term,weightedTerms[i]);

                }

        }

        

  

        /* (non-Javadoc)

         * @see 
org.apache.lucene.search.highlight.FragmentScorer#startFragment(org.apache.lucene.search.highlight.TextFragment)

         */

        public void startFragment(TextFragment newFragment)

        {

                uniqueTermsInFragment = new HashSet();

                currentTextFragment=newFragment;

                totalScore=0;

                

        }

        

        /* (non-Javadoc)

         * @see 
org.apache.lucene.search.highlight.FragmentScorer#scoreToken(org.apache.lucene.analysis.Token)

         */

        public float getTokenScore(Token token)

        {

                String termText=token.termText();

                

                WeightedTerm queryTerm=(WeightedTerm) termsToFind.get(termText);

                if(queryTerm==null)

                {

                        //not a query term - return

                        return 0;

                }

                //found a query term - is it unique in this fragment?

                if(!uniqueTermsInFragment.contains(termText))

                {

                        totalScore+=queryTerm.getWeight();

                        uniqueTermsInFragment.add(termText);

                }

                return queryTerm.getWeight();

        }

        

        

        /* (non-Javadoc)

         * @see 
org.apache.lucene.search.highlight.FragmentScorer#endFragment(org.apache.lucene.search.highlight.TextFragment)

         */

        public float getFragmentScore()

        {

                return totalScore;              

        }

  

  

        /* (non-Javadoc)

         * @see 
org.apache.lucene.search.highlight.FragmentScorer#allFragmentsProcessed()

         */

        public void allFragmentsProcessed()

        {

                //this class has no special operations to perform at end of processing

        }

  

  }
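
  QueryScorer can also be driven from an explicit set of WeightedTerms instead of a Query,
  bypassing QueryTermExtractor; a minimal sketch, mirroring
  HighlighterTest.testGetBestSingleFragmentWithWeights() above:

        WeightedTerm[] wTerms = new WeightedTerm[2];
        wTerms[0] = new WeightedTerm(10f, "hello");   // weight "hello" heavily
        wTerms[1] = new WeightedTerm(1f, "kennedy");
        Highlighter highlighter = new Highlighter(new QueryScorer(wTerms));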

  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermExtractor.java
  
  Index: QueryTermExtractor.java
  ===================================================================
  package org.apache.lucene.search.highlight;

  /**

   * Copyright 2002-2004 The Apache Software Foundation

   *

   * Licensed under the Apache License, Version 2.0 (the "License");

   * you may not use this file except in compliance with the License.

   * You may obtain a copy of the License at

   *

   *     http://www.apache.org/licenses/LICENSE-2.0

   *

   * Unless required by applicable law or agreed to in writing, software

   * distributed under the License is distributed on an "AS IS" BASIS,

   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

   * See the License for the specific language governing permissions and

   * limitations under the License.

   */

  

  import java.util.HashSet;

  

  import org.apache.lucene.index.Term;

  import org.apache.lucene.search.BooleanClause;

  import org.apache.lucene.search.BooleanQuery;

  import org.apache.lucene.search.PhraseQuery;

  import org.apache.lucene.search.Query;

  import org.apache.lucene.search.TermQuery;

  

  /**
   * Utility class used to extract the terms used in a query, plus any weights.
   * This class will not find terms for MultiTermQuery, RangeQuery and PrefixQuery classes
   * so the caller must pass a rewritten query (see query.rewrite) to obtain a list of 
   * expanded terms. 
   * 
   */

  public final class QueryTermExtractor

  {

  

        /**
         * Extracts all term texts of a given Query into an array of WeightedTerms
         *
         * @param query      Query to extract term texts from
         * @return an array of the terms used in a query, plus their weights.
         * @throws IOException
         */

        public static final WeightedTerm[] getTerms(Query query) 

        {

                return getTerms(query,false);

        }

  

  

        /**
         * Extracts all term texts of a given Query into an array of WeightedTerms
         *
         * @param query      Query to extract term texts from
         * @param prohibited <code>true</code> to extract "prohibited" terms, too
         * @return an array of the terms used in a query, plus their weights.
         * @throws IOException
         */

        public static final WeightedTerm[] getTerms(Query query, boolean prohibited) 

        {

                HashSet terms=new HashSet();

                getTerms(query,terms,prohibited);

                return (WeightedTerm[]) terms.toArray(new WeightedTerm[0]);

        }

  

        private static final void getTerms(Query query, HashSet terms, boolean prohibited) 
        {
                if (query instanceof BooleanQuery)
                        getTermsFromBooleanQuery((BooleanQuery) query, terms, prohibited);
                else
                        if (query instanceof PhraseQuery)
                                getTermsFromPhraseQuery((PhraseQuery) query, terms);
                        else
                                if (query instanceof TermQuery)
                                        getTermsFromTermQuery((TermQuery) query, terms);
//                              else
//                                      if ((query instanceof PrefixQuery)
//                                              || (query instanceof RangeQuery)
//                                              || (query instanceof MultiTermQuery))
//                                      {
//                                              //client should call rewrite BEFORE calling highlighter
//                                              //Query expandedQuery = rewrite(reader, query);
//                                              //getTerms(reader, expandedQuery, terms, prohibited);
//                                      }
        }

  

        private static final void getTermsFromBooleanQuery(BooleanQuery query, HashSet terms, boolean prohibited)

        {

                BooleanClause[] queryClauses = query.getClauses();

                int i;

  

                for (i = 0; i < queryClauses.length; i++)

                {

                        if (prohibited || !queryClauses[i].prohibited)

                                getTerms(queryClauses[i].query, terms, prohibited);

                }

        }

  

        private static final void getTermsFromPhraseQuery(PhraseQuery query, HashSet terms)

        {

                Term[] queryTerms = query.getTerms();

                int i;

  

                for (i = 0; i < queryTerms.length; i++)

                {

                        terms.add(new WeightedTerm(query.getBoost(), queryTerms[i].text()));

                }

        }

  

        private static final void getTermsFromTermQuery(TermQuery query, HashSet terms)

        {

                terms.add(new WeightedTerm(query.getBoost(),query.getTerm().text()));

        }

  

  

  }
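
  A minimal sketch of the rewrite-then-extract step described in the javadoc above (the
  variable names are illustrative and "reader" is assumed to be an open IndexReader):

        // Rewriting expands wildcard, prefix, fuzzy and range queries into primitive
        // term queries so that QueryTermExtractor can find their terms.
        Query rewritten = query.rewrite(reader);
        WeightedTerm[] weightedTerms = QueryTermExtractor.getTerms(rewritten);
        Highlighter highlighter = new Highlighter(new QueryScorer(weightedTerms));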

  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/Scorer.java
  
  Index: Scorer.java
  ===================================================================
  package org.apache.lucene.search.highlight;
  /**
   * Copyright 2002-2004 The Apache Software Foundation
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */

  import org.apache.lucene.analysis.Token;

  /**
   * Adds to the score for a fragment based on its tokens.
   * @author [EMAIL PROTECTED]
   */
  public interface Scorer
  {
        /**
         * Called when a new fragment is started for consideration.
         * @param newFragment the fragment being considered
         */
        public void startFragment(TextFragment newFragment);

        /**
         * Called for each token in the current fragment.
         * @param token The token to be scored
         * @return a score which is passed to the Highlighter class to influence the
         * mark-up of the text (this return value is NOT used to score the fragment)
         */
        public float getTokenScore(Token token);

        /**
         * Called when the highlighter has no more tokens for the current fragment -
         * the scorer will typically call setScore() on the fragment passed in
         * startFragment to record the total.
         * @return the overall score for the current fragment
         */
        public float getFragmentScore();

  }
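
  For readers implementing this interface, a minimal, hypothetical Scorer (not part of this
  commit) that treats every token and every fragment as equally interesting could look like
  the sketch below; QueryScorer, earlier in this commit, is the real query-driven
  implementation.

        import org.apache.lucene.analysis.Token;
        import org.apache.lucene.search.highlight.Scorer;
        import org.apache.lucene.search.highlight.TextFragment;

        // Hypothetical example only: scores every token 1.0 so a Formatter such as
        // SimpleHTMLFormatter (which only marks up scores > 0) highlights every term.
        public class ConstantScorer implements Scorer
        {
            public void startFragment(TextFragment newFragment)
            {
                // nothing to reset for a constant scorer
            }

            public float getTokenScore(Token token)
            {
                return 1.0f;
            }

            public float getFragmentScore()
            {
                return 1.0f;   // every fragment is considered equally good
            }
        }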

  
  
  
  1.1                  
jakarta-lucene-sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/SimpleFragmenter.java
  
  Index: SimpleFragmenter.java
  ===================================================================
  package org.apache.lucene.search.highlight;
  /**
   * Copyright 2002-2004 The Apache Software Foundation
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */

  import org.apache.lucene.analysis.Token;

  /**
   * {@link Fragmenter} implementation which breaks text up into same-size
   * fragments with no concerns over spotting sentence boundaries.
   * @author [EMAIL PROTECTED]
   */
  public class SimpleFragmenter implements Fragmenter
  {
        private static final int DEFAULT_FRAGMENT_SIZE = 100;
        private int currentNumFrags;
        private int fragmentSize;


        public SimpleFragmenter()
        {
                this(DEFAULT_FRAGMENT_SIZE);
        }


        /**
         * @param fragmentSize size in bytes of each fragment
         */
        public SimpleFragmenter(int fragmentSize)
        {
                this.fragmentSize = fragmentSize;
        }

        /* (non-Javadoc)
         * @see org.apache.lucene.search.highlight.Fragmenter#start(java.lang.String)
         */
        public void start(String originalText)
        {
                currentNumFrags = 1;
        }

        /* (non-Javadoc)
         * @see org.apache.lucene.search.highlight.Fragmenter#isNewFragment(org.apache.lucene.analysis.Token)
         */
        public boolean isNewFragment(Token token)
        {
                boolean isNewFrag = token.endOffset() >= (fragmentSize * currentNumFrags);
                if (isNewFrag)
                {
                        currentNumFrags++;
                }
                return isNewFrag;
        }

        /**
         * @return size in bytes of each fragment
         */
        public int getFragmentSize()
        {
                return fragmentSize;
        }

        /**
         * @param size size in bytes of each fragment
         */
        public void setFragmentSize(int size)
        {
                fragmentSize = size;
        }

  }
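
  A small, hypothetical standalone exercise of the class (normally the Highlighter drives
  it): count how many roughly 50-character fragments an analyzer's token stream falls into.
  It assumes "text" and "analyzer" are in scope, and uses the TokenStream.next() iteration
  style seen in the package.html example later in this commit (the call can throw
  IOException).

        // Hypothetical usage sketch - not part of this commit.
        SimpleFragmenter fragmenter = new SimpleFragmenter(50);
        fragmenter.start(text);
        int numFragments = 1;
        TokenStream stream = analyzer.tokenStream("contents", new java.io.StringReader(text));
        for (Token token = stream.next(); token != null; token = stream.next())
        {
            if (fragmenter.isNewFragment(token))
            {
                numFragments++;   // token's end offset crossed the next size boundary
            }
        }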

  
  
  
  1.1                  
jakarta-lucene-sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/SimpleHTMLFormatter.java
  
  Index: SimpleHTMLFormatter.java
  ===================================================================
  package org.apache.lucene.search.highlight;
  /**
   * Copyright 2002-2004 The Apache Software Foundation
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */

  /**
   * Simple {@link Formatter} implementation to highlight terms with a pre and post tag.
   * @author MAHarwood
   */
  public class SimpleHTMLFormatter implements Formatter
  {
        String preTag;
        String postTag;

        public SimpleHTMLFormatter(String preTag, String postTag)
        {
                this.preTag = preTag;
                this.postTag = postTag;
        }

        /**
         * Default constructor uses HTML &lt;B&gt; tags to mark up terms.
         */
        public SimpleHTMLFormatter()
        {
                this.preTag = "<B>";
                this.postTag = "</B>";
        }

        public String highlightTerm(String originalText, String term, float score, int startOffset)
        {
                if (score <= 0)
                {
                        return originalText;
                }
                StringBuffer sb = new StringBuffer();
                sb.append(preTag);
                sb.append(originalText);
                sb.append(postTag);
                return sb.toString();
        }

  }
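
  Because highlightTerm simply wraps the original text whenever the score is positive,
  swapping the default <B> tags for something CSS-friendly is just a constructor call. A
  hypothetical example (tag strings and inputs are illustrative only):

        // Hypothetical example - not part of this commit.
        Formatter formatter = new SimpleHTMLFormatter("<span class=\"highlight\">", "</span>");
        // Normally the Highlighter calls this per matching token; shown directly here:
        String marked = formatter.highlightTerm("Kennedy", "kennedi", 1.0f, 0);
        // marked is now: <span class="highlight">Kennedy</span>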

  
  
  
  1.1                  
jakarta-lucene-sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/TextFragment.java
  
  Index: TextFragment.java
  ===================================================================
  package org.apache.lucene.search.highlight;
  /**
   * Copyright 2002-2004 The Apache Software Foundation
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */

  /**
   * Low-level class used to record information about a section of a document
   * with a score.
   * @author MAHarwood
   */
  public class TextFragment
  {
        int fragNum;
        int textStartPos;
        int textEndPos;
        float score;

        public TextFragment(int textStartPos, int fragNum)
        {
                this.textStartPos = textStartPos;
                this.fragNum = fragNum;
        }

        void setScore(float score)
        {
                this.score = score;
        }

        public float getScore()
        {
                return score;
        }

        /**
         * @param frag2 Fragment to be merged into this one
         */
        public void merge(TextFragment frag2)
        {
                textEndPos = frag2.textEndPos;
        }

        /**
         * @param fragment
         * @return true if this fragment follows the one passed
         */
        public boolean follows(TextFragment fragment)
        {
                return textStartPos == fragment.textEndPos;
        }

        /**
         * @return the fragment sequence number
         */
        public int getFragNum()
        {
                return fragNum;
        }

  }
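
  The follows()/merge() pair exists so that fragments sitting next to each other in the
  original text can be coalesced into one. A hypothetical coalescing loop, assuming "frags"
  is a non-empty TextFragment[] ordered by position in the text:

        // Hypothetical sketch - not part of this commit.
        java.util.List merged = new java.util.ArrayList();
        TextFragment current = frags[0];
        for (int i = 1; i < frags.length; i++)
        {
            if (frags[i].follows(current))
            {
                current.merge(frags[i]);   // extend current to cover the adjacent fragment
            }
            else
            {
                merged.add(current);
                current = frags[i];
            }
        }
        merged.add(current);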

  
  
  
  1.1                  
jakarta-lucene-sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/WeightedTerm.java
  
  Index: WeightedTerm.java
  ===================================================================
  package org.apache.lucene.search.highlight;
  /**
   * Copyright 2002-2004 The Apache Software Foundation
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */

  /**
   * Lightweight class to hold a term and a weight value used for scoring this term.
   * @author Mark Harwood
   */
  public class WeightedTerm
  {
        float weight;  // multiplier
        String term;   // stemmed form

        public WeightedTerm(float weight, String term)
        {
                this.weight = weight;
                this.term = term;
        }


        /**
         * @return the term value (stemmed)
         */
        public String getTerm()
        {
                return term;
        }

        /**
         * @return the weight associated with this term
         */
        public float getWeight()
        {
                return weight;
        }

        /**
         * @param term the term value (stemmed)
         */
        public void setTerm(String term)
        {
                this.term = term;
        }

        /**
         * @param weight the weight associated with this term
         */
        public void setWeight(float weight)
        {
                this.weight = weight;
        }

  }
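
  WeightedTerm instances are normally produced by QueryTermExtractor, but they can also be
  built by hand, for example to highlight a fixed vocabulary without deriving it from a
  Query. A hypothetical example (terms and weights are illustrative only):

        // Hypothetical example - not part of this commit.
        WeightedTerm[] terms = new WeightedTerm[] {
            new WeightedTerm(1.0f, "kennedy"),     // weight acts as a score multiplier
            new WeightedTerm(0.5f, "president")
        };
        for (int i = 0; i < terms.length; i++)
        {
            System.out.println(terms[i].getTerm() + " weight=" + terms[i].getWeight());
        }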

  
  
  
  1.1                  
jakarta-lucene-sandbox/contributions/highlighter/src/java/org/apache/lucene/search/highlight/package.html
  
  Index: package.html
  ===================================================================
  <html>
  <body>
  The highlight package contains classes to provide "keyword in context" features
  typically used to highlight search terms in the text of results pages. <br>
  The Highlighter class is the central component and can be used to extract the
  most interesting sections of a piece of text and highlight them, with the help of
  the Fragmenter, Scorer and Formatter classes.
  <h2>Example Usage</h2>

  <pre>
        IndexSearcher searcher = new IndexSearcher(ramDir);
        Query query = QueryParser.parse("Kenne*", FIELD_NAME, analyzer);
        query = query.rewrite(reader); // required to expand search terms
        Hits hits = searcher.search(query);

        Highlighter highlighter = new Highlighter(this, new QueryScorer(query));
        for (int i = 0; i < hits.length(); i++)
        {
                String text = hits.doc(i).get(FIELD_NAME);
                TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
                // Get the 3 best fragments and separate them with a "..."
                String result = highlighter.getBestFragments(tokenStream, text, 3, "...");
                System.out.println(result);
        }
  </pre>

  </body>
  </html>
  
  
