RE: HTML pages highlighter

Yagnesh Shah Thu, 07 Apr 2005 07:25:12 -0700

Hi! Erik,
        Yes, HighlightIt.java and HighlightTest.java work. I did attach the 
file; anyway, here is the source:


      1 <HTML><!-- -*-java-*- -->
      2 <!-- Lucene Search Demo via CompiledPageServlet -->
      3 <!-- Copyright (c) 1998,2000 Douglass R. Cutting. -->
      4
      5 <java type=import>
      6   javax.servlet.*
      7   javax.servlet.http.*
      8  java.io.StringWriter
      9  java.io.StringReader
     10  java.io.IOException
     11   org.apache.lucene.analysis.*
     12   org.apache.lucene.document.*
     13   org.apache.lucene.index.*
     14   org.apache.lucene.search.*
     15   org.apache.lucene.queryParser.*
     16   org.apache.lucene.demo.*
     17   org.apache.lucene.demo.html.Entities
     18   org.apache.lucene.search.highlight.*
     19  org.apache.lucene.analysis.standard.StandardAnalyzer
     20  org.apache.lucene.search.highlight.Formatter
     21  org.apache.lucene.search.highlight.Highlighter
     22  org.apache.lucene.search.highlight.QueryScorer
     23  org.apache.lucene.search.highlight.SimpleFragmenter
     24 </java>
     25
     26 <java>
     27   // get index from request
     28   String indexName = request.getParameter("index");
     29   if (indexName == null)                          // default to "index"
     30     indexName = "/opt/dynamo/prod/hww-doc/hww/help/index";
     31 IndexReader ireader = getReader(indexName);
     32
     33  Searcher searcher = new IndexSearcher(ireader);
     34
     35   // get query from request
     36   String queryString = request.getParameter("query");
     37   if (queryString == null)
     38     throw new ServletException("no query specified");
     39
     40   int start = 0;                                  // first hit to 
display
     41   String startString = request.getParameter("start");
     42   if (startString != null)
     43     start = Integer.parseInt(startString);
     44
     45   int hitsPerPage = 10;                           // number of hits to 
display
     46   String hitsString = request.getParameter("hitsPerPage");
     47   if (hitsString != null)
     48     hitsPerPage = Integer.parseInt(hitsString);
     49
     50   boolean showSummaries = true;                   // show summaries?
     51   if ("false".equals(request.getParameter("showSummaries")))
     52     showSummaries = false;
     53
     54   Query query = null;
     55   try {                                           // parse query
     56     query = QueryParser.parse(queryString, "contents", analyzer);
     57   } catch (ParseException e) {                    // error parsing query
     58     </java>
     59     <HEAD><TITLE>Error Parsing Query</TITLE></HEAD><BODY>
     60     <p>While parsing `queryString`: `e.getMessage()`
     61     <java>
     62     return;
     63   }
     64
     65   String servletPath = request.getRequestURI();   // getServletPath 
should work
     66   int j = servletPath.indexOf('?');               // here but doesn't, 
so we
     67   if (j != -1)                                    // remove query by 
hand...
     68     servletPath = servletPath.substring(0, j);
     69
     70 </java>
     71
     72 <head><title>Lucene Search Results</title></head><body>
     73
     74 <center>
     75  <form name=search action=`servletPath` method=get>
     76  <input name=query size=44 value='`queryString`'>
     77  <input type=hidden name=index value="`indexName`">
     78  <input type=hidden name=hitsPerPage value=`hitsPerPage`>
     79  <input type=hidden name=showSummaries value=`showSummaries`>
     80  <input type=submit value=Search>
     81  </form>
     82 </center>
     83 <java>
     84   Hits hits = searcher.search(query);             // perform query
     85   int end = Math.min(hits.length(), start + hitsPerPage);
     86 </java>
     87
     88
     89
     90 <p>Hits <b><java type=print>start+1</java>-<java 
type=print>end</java></b>
     91 (out of <java type=print>hits.length()</java> total matching documents):
     92
     93 <ul>
     94 <java>
     95     SimpleHTMLFormatter formatter =
     96         new SimpleHTMLFormatter();
     97 QueryScorer scorer = new QueryScorer(query);
     98 query.rewrite(ireader);
     99 Highlighter highlighter = new Highlighter(formatter, scorer);
    100 highlighter.setTextFragmenter(new SimpleFragmenter(50));
    101 String FIELD_NAME = "contents";
    102   String result = "not specific";
    103   String text;
    104   for (int i = start; i < end; i++) {             // display the hits
    105   Document doc = hits.doc(i);
    106   text = hits.doc(i).get(FIELD_NAME);
    107   int maxNumFragmentsRequired = 5;
    108   String fragmentSeparator = "...";
    109         if ( text != null){
    110   TokenStream tokenStream = new 
StandardAnalyzer().tokenStream(FIELD_NAME, new java.io.StringReader(text));
    111   result = 
highlighter.getBestFragments(tokenStream,text,maxNumFragmentsRequired,fragmentSeparator);
    112     System.out.println("text" + text + "start" + start + "end" +end + 
"i" +i +"result=" +result);
    113 }
    114
    115     String title = doc.get("title");
    116     if (title.equals(""))                         // use url for docs 
w/o title
    117       title = doc.get("path");
    118     </java>
    119     <p><b><java type=print>(int)(hits.score(i) * 100.0f)</java>%
    120     <a href="`doc.get("path")`">
    121     <java type=print>Entities.encode(title)</java>
    122     </b></a>
    123 <hr>
    124     <ul><i>Summary</i>:</ul>
    125     <java>
    126     if (text != null) {                          // maybe show summary
    127     </java>
    128         <java type=print>result</java>
    129     }
    130     elseif (showSummaries)
    131     {
    132     </java>
    133       <java type=print>Entities.encode(doc.get("summary"))</java>
    134     <java>
    135     }
    136   }
    137 </java>
    138 </ul>
    139
    140 <java>
    141   if (end < hits.length()) {                      // insert next page 
button
    142 </java>
    143     <center>
    144     <form name=search action=`servletPath` method=get>
    145     <input type=hidden name=query value='`queryString`'>
    146     <input type=hidden name=start value=`end`>
    147     <input type=hidden name=index value="`indexName`">
    148     <input type=hidden name=hitsPerPage value=`hitsPerPage`>
    149     <input type=hidden name=showSummaries value=`showSummaries`>
    150     <input type=submit value=Next>
    151     </form>
    152     </center>
    153 <java>
    154     }
    155 </java>
    156
    157 </body>
    158
    159 <java type=class>
    160
    161   Analyzer analyzer = new StopAnalyzer();         // used to tokenize 
queries
    162
    163   /** Keep a cache of open IndexReader's, so that an index does not 
have to be
    164       opened for each query.  The cache re-opens an index when it has 
changed
    165       so that additions and deletions are visible ASAP. */
    166
    167   static Hashtable indexCache = new Hashtable();  // name->CachedIndex
    168
    169   class CachedIndex {                             // an entry in the 
cache
    170     IndexReader reader;                           // an open reader
    171     long modified;                                // reader's modified 
date
    172
    173     CachedIndex(String name) throws IOException {
    174       modified = IndexReader.lastModified(name);  // get modified date
    175       reader = IndexReader.open(name);            // open reader
    176     }
    177   }
    178
    179   IndexReader getReader(String name) throws ServletException {
    180     CachedIndex index =                           // look in cache
    181       (CachedIndex)indexCache.get(name);
    182
    183     try {
    184       if (index != null &&                        // check up-to-date
    185           (index.modified == IndexReader.lastModified(name)))
    186         return index.reader;                      // cache hit
    187       else {
    188         index = new CachedIndex(name);            // cache miss
    189       }
    190     } catch (IOException e) {
    191       StringWriter writer = new StringWriter();
    192       PrintWriter pw = new PrintWriter(writer);
    193       throw new ServletException("Could not open index " + name + ": " +
    194                                  e.getClass().getName() + "--" +
    195                                  e.getMessage());
    196     }
    197
    198     indexCache.put(name, index);                  // add to cache
    199     return index.reader;
    200   }
    201 </java>

-----Original Message-----
From: Erik Hatcher [mailto:[EMAIL PROTECTED]
Sent: Wednesday, April 06, 2005 6:45 PM
To: [email protected]
Subject: Re: HTML pages highlighter


What file do those line numbers correspond to?  I'm lost.

Did the Lucene in Action highlighting code work for you?

        Erik

On Apr 6, 2005, at 6:16 PM, Yagnesh Shah wrote:

> Hi! Erik,
>       Yes, the basics seem to be working.
> a) My problem is that there is a chance that the query is not present in 
> the stored content of a file, so sometimes I am getting empty strings at 
> line#106, so I have to put a special check at line#109 and line#126. I 
> guess this is not a problem. What do you think?
> b) When I click on a doc path that was generated by line#120 and 
> line#121, the files that it opens do not have the searched query 
> highlighted. Any suggestions for this? How can I do it?
>
>
> -----Original Message-----
> From: Erik Hatcher [mailto:[EMAIL PROTECTED]
> Sent: Monday, April 04, 2005 8:45 PM
> To: [email protected]
> Subject: Re: HTML pages highlighter
>
>
> On Apr 4, 2005, at 5:35 PM, Yagnesh Shah wrote:
>>      I ended up purchasing your book "Lucene in Action". I have downloaded
>> your code samples. I am able to retrieve "result" only some of the time.
>> Below is the code I have taken from Search.jhtml in the Lucene demo. I
>> have 2 problems:
>>
>> a) I am unable to display "result" using
>> b) When I click on the title to retrieve document I do not see my
>> query highlighted.
>
> First things first.... get something very very simple working and
> expand from there.  Here is the simple code from our HighlightIt.java:
>
>      TermQuery query = new TermQuery(new Term("f", "ipsum"));
>      QueryScorer scorer = new QueryScorer(query);
>      SimpleHTMLFormatter formatter =
>          new SimpleHTMLFormatter("<span class=\"highlight\">",
>              "</span>");
>      Highlighter highlighter = new Highlighter(formatter, scorer);
>      Fragmenter fragmenter = new SimpleFragmenter(50);
>      highlighter.setTextFragmenter(fragmenter);
>
>      TokenStream tokenStream = new StandardAnalyzer()
>          .tokenStream("f", new StringReader(text));
>
>      String result =
>          highlighter.getBestFragments(tokenStream, text, 5, "...");
>
> One trick is that you must ensure the query you are passing to
> QueryScorer has been rewritten.  In our simple TermQuery case, that is
> not necessary, but in a general application it is.  You can call
> query.rewrite(reader) where reader is your IndexReader instance.  This
> ensures that range, fuzzy, and wildcard queries are expanded and
> highlightable.
>
> I'm not sure what is wrong with the code you are trying.  But again,
> start simple, just try out our HighlightIt or our HighlightTest.  If
> those work fine for you then move on to integrating further with your
> index.  Besides the Query.rewrite() trick, you have to be sure that the
> text you want to highlight is available.  If you're pulling it from the
> index, it must be in a stored field, otherwise you need to retrieve it
> from elsewhere.
>
>       Erik
>
>
>>
>> <java>
>>
>>   Searcher searcher = new IndexSearcher(getReader(indexName));
>>
>>   // get query from request
>>   String queryString = request.getParameter("query");
>>
>>   query = QueryParser.parse(queryString, "contents", analyzer);
>>   Hits hits = searcher.search(query);
>>   SimpleHTMLFormatter formatter =
>>   new SimpleHTMLFormatter();
>>   Highlighter highlighter = new Highlighter(formatter, new
>> QueryScorer(query));
>>   highlighter.setTextFragmenter(new SimpleFragmenter(50));
>>   String FIELD_NAME = "contents";
>>
>>   for (int i = start; i < end; i++) {             // display the hits
>>   Document doc = hits.doc(i);
>>   String text = hits.doc(i).get(FIELD_NAME);
>>   int maxNumFragmentsRequired = 5;
>>   String fragmentSeparator = "...";
>>   if ( text != null){
>>      TokenStream tokenStream = new
>> StandardAnalyzer().tokenStream(FIELD_NAME, new
>> java.io.StringReader(text));
>>      String result =
>> highlighter.getBestFragments         
>> (tokenStream,text,maxNumFragmentsRequired,fragmentSeparator);
>>      System.out.println("result=" +result);
>>   }
>>
>>     String title = doc.get("title");
>>     if (title.equals(""))                         // use url for docs
>> w/o title
>>       title = doc.get("path");
>>     </java>
>>     <p><b><java type=print>(int)(hits.score(i) * 100.0f)</java>%
>>     <a href="`doc.get("path")`">
>>     <java type=print>Entities.encode(title)</java>
>>     </b></a>
>>     <java>
>>     if (showSummaries) {                          // maybe show 
>> summary
>>     </java>
>>     <ul><i>Summary</i>:
>>       <java type=print>Entities.encode(doc.get("summary"))</java>
>>     </ul>
>>     <java>
>>     }
>>   }
>> </java>
>>
>>
>>
>> -----Original Message-----
>> From: Erik Hatcher [mailto:[EMAIL PROTECTED]
>> Sent: Thursday, March 31, 2005 8:04 PM
>> To: [email protected]
>> Subject: Re: HTML pages highlighter
>>
>>
>>
>> On Mar 31, 2005, at 6:36 PM, Yagnesh Shah wrote:
>>>     try {
>>>       fis = new FileInputStream(f);
>>>       HTMLParser parser = new HTMLParser(fis);
>>>
>>>       // Add the tag-stripped contents as a Reader-valued Text field
>>> so it will
>>>       // get tokenized and indexed.
>>> //      doc.add(new Field("contents", parser.getReader()));
>>>       LineNumberReader reader = new
>>> LineNumberReader(parser.getReader());
>>>       for (String l = reader.readLine(); l != null; l =
>>> reader.readLine())
>>> //        System.out.println(l);
>>>       doc.add(Field.Text("contents", l));
>>
>> Notice that your loop here is adding a "contents" field for *every*
>> line read since that is where the first semi-colon is.
>>
>> Look at using Luke to explore your index.  Try indexing just a dummy
>> String:
>>
>>      doc.add(Field.Text("contents", "some dummy text"));
>>
>> to show that it works.  Always always always simplify a complicated
>> situation by doing the most obvious thing that _should_ work.
>>
>> Also, the demo Lucene code is not really designed to be used in a
>> production application (sadly), so you're better off borrowing code
>> from the many articles or our book to begin with.
>>
>>      Erik
>>
>>
>>>
>>>       // Add the summary as a field that is stored and returned with
>>>       // hit documents for display.
>>>       doc.add(new Field("summary", parser.getSummary(),
>>> Field.Store.YES, Field.Index.NO));
>>>
>>>       // Add the title as a field that it can be searched and that is
>>> stored.
>>>       doc.add(new Field("title", parser.getTitle(), Field.Store.YES,
>>> Field.Index.TOKENIZED));
>>>     }
>>>
>>>
>>>
>>> -----Original Message-----
>>> From: Erik Hatcher [mailto:[EMAIL PROTECTED]
>>> Sent: Wednesday, March 30, 2005 7:38 PM
>>> To: [email protected]
>>> Subject: Re: HTML pages highlighter
>>>
>>>
>>>
>>> On Mar 30, 2005, at 4:46 PM, Yagnesh Shah wrote:
>>>
>>>> Hi! Eric,
>>>
>>> Erik - with a 'k' - Sorry, I let it slide once though :)
>>>
>>>>    I tried to modify that with this but I get a compile error. Do you
>>>> have
>>>> any code snippet of highlighting code to pull the contents from the
>>>> original source?
>>>
>>> I have a whole book full of code examples :)
>>> http://www.lucenebook.com - Grab the source code and look in
>>> src/lia/tools at Highlight*.java
>>>
>>>>  Or do you know how I can store the field?
>>>>
>>>>       doc.add(new Field("contents", parser.getReader(),
>>>> Field.Store.YES, Field.Index.NO));
>>>
>>> You cannot store it with a Reader.  You need to use 
>>> Field.Text(String,
>>> String), or one of the other variations.
>>>
>>>     Erik
>>>
>>>
>>> ---------------------------------------------------------------------
>>> To unsubscribe, e-mail: [EMAIL PROTECTED]
>>> For additional commands, e-mail: [EMAIL PROTECTED]
>>>
>>>
>>> ---------------------------------------------------------------------
>>> To unsubscribe, e-mail: [EMAIL PROTECTED]
>>> For additional commands, e-mail: [EMAIL PROTECTED]
>>
>>
>> ---------------------------------------------------------------------
>> To unsubscribe, e-mail: [EMAIL PROTECTED]
>> For additional commands, e-mail: [EMAIL PROTECTED]
>>
>>
>> ---------------------------------------------------------------------
>> To unsubscribe, e-mail: [EMAIL PROTECTED]
>> For additional commands, e-mail: [EMAIL PROTECTED]
>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: [EMAIL PROTECTED]
> For additional commands, e-mail: [EMAIL PROTECTED]
>
>
> ---------------------------------------------------------------------
> To unsubscribe, e-mail: [EMAIL PROTECTED]
> For additional commands, e-mail: [EMAIL PROTECTED]


---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]


---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

RE: HTML pages highlighter

Reply via email to