vgritsenko    02/01/24 20:34:19

  Modified:    src/java/org/apache/cocoon/components/search
                        IndexHelperField.java
                        LuceneIndexContentHandler.java
                        LuceneXMLIndexer.java
                        SimpleLuceneCocoonIndexerImpl.java
                        SimpleLuceneXMLIndexerImpl.java
  Log:
  Improve lucene searching:
   - When indexing, create one lucene document per resource, not one document per element
   - Allow adding attribute values to the body text if the element is marked by lucene:text-attr
  Result:
   - some important attributes (decided by the document author) can be indexed as well
   - AND searches work as expected now. Example: person@name:Donald AND History
  
  Revision  Changes    Path
  1.2       +3 -3      
xml-cocoon2/src/java/org/apache/cocoon/components/search/IndexHelperField.java
  
  Index: IndexHelperField.java
  ===================================================================
  RCS file: 
/home/cvs/xml-cocoon2/src/java/org/apache/cocoon/components/search/IndexHelperField.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- IndexHelperField.java     3 Jan 2002 12:31:13 -0000       1.1
  +++ IndexHelperField.java     25 Jan 2002 04:34:18 -0000      1.2
  @@ -24,7 +24,7 @@
    * A helper class for generating a lucene document in a SAX ContentHandler.
    *
    * @author     <a href="mailto:[EMAIL PROTECTED]">Bernhard Huber</a>
  - * @version    CVS $Id: IndexHelperField.java,v 1.1 2002/01/03 12:31:13 giacomo Exp 
$
  + * @version    CVS $Id: IndexHelperField.java,v 1.2 2002/01/25 04:34:18 vgritsenko 
Exp $
    */
   class IndexHelperField
   {
  @@ -89,8 +89,8 @@
        * @return    The text value
        * @since
        */
  -    public String getText() {
  -        return text.toString();
  +    public StringBuffer getText() {
  +        return text;
       }
   
   
  
  
  
  1.3       +32 -27    
xml-cocoon2/src/java/org/apache/cocoon/components/search/LuceneIndexContentHandler.java
  
  Index: LuceneIndexContentHandler.java
  ===================================================================
  RCS file: 
/home/cvs/xml-cocoon2/src/java/org/apache/cocoon/components/search/LuceneIndexContentHandler.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- LuceneIndexContentHandler.java    23 Jan 2002 19:06:38 -0000      1.2
  +++ LuceneIndexContentHandler.java    25 Jan 2002 04:34:18 -0000      1.3
  @@ -27,17 +27,22 @@
    * Parse XML and generate lucene document(s)
    *
    * @author     <a href="mailto:[EMAIL PROTECTED]">Bernhard Huber</a>
  - * @version    CVS $Id: LuceneIndexContentHandler.java,v 1.2 2002/01/23 19:06:38 
vgritsenko Exp $
  + * @version    CVS $Id: LuceneIndexContentHandler.java,v 1.3 2002/01/25 04:34:18 
vgritsenko Exp $
    */
   public class LuceneIndexContentHandler implements ContentHandler
   {
  +    public static final String LUCENE_URI = "http://apache.org/cocoon/lucene/1.0";
  +
  +    /** If this attribute is specified on an element, the values of all attributes
  +     * of this element are added to the text of the element, and to the document
  +     * body text */
  +    public static final String LUCENE_ATTR_TO_TEXT_ATTRIBUTE = "text-attr";
  +
       StringBuffer bodyText;
       private List documents;
       private Document bodyDocument;
  -
       private Stack elementStack;
   
  -
       /**
        *Constructor for the LuceneIndexContentHandler object
        *
  @@ -48,10 +53,9 @@
           this.bodyDocument = new Document();
           this.documents = new ArrayList();
           this.documents.add(this.bodyDocument);
  -        elementStack = new Stack();
  +        this.elementStack = new Stack();
       }
   
  -
       /**
        *Sets the documentLocator attribute of the LuceneIndexContentHandler object
        *
  @@ -92,10 +96,13 @@
        * @since
        */
       public void characters(char[] ch, int start, int length) {
  -        IndexHelperField tos = (IndexHelperField) elementStack.peek();
  -        if (ch.length > 0 && start >= 0 && length > 1 && tos != null) {
  +
  +        if (ch.length > 0 && start >= 0 && length > 1) {
               String text = new String(ch, start, length);
  -            tos.appendText(text);
  +            if (elementStack.size() > 0) {
  +                IndexHelperField tos = (IndexHelperField) elementStack.peek();
  +                tos.appendText(text);
  +            }
               bodyText.append(text);
           }
       }
  @@ -124,30 +131,28 @@
        */
       public void endElement(String namespaceURI, String localName, String qName) {
           IndexHelperField tos = (IndexHelperField) elementStack.pop();
  -        String text = tos.getText();
           String lname = tos.getLocalFieldName();
  -        String qname = tos.getQualifiedFieldName();
  -
  -        Document document = new Document();
  -        boolean add_document = false;
  -        if (text != null && text.length() > 0) {
  -            System.out.println("field qname " + qname);
  -            document.add(Field.UnStored(qName, text));
  -            add_document = true;
  -        }
  +        StringBuffer text = tos.getText();
   
  +        // (VG): Atts are never null, see startElement
           Attributes atts = tos.getAttributes();
  -        if (atts != null && atts.getLength() > 0) {
  -            for (int i = 0; i < atts.getLength(); i++) {
  -                String atts_qname = atts.getQName(i);
  -                String atts_value = atts.getValue(i);
  -                System.out.println("attribute field " + qname + "@" + atts_qname + 
": " + atts_value);
  -                document.add(Field.UnStored(qname + "@" + atts_qname, atts_value));
  -                add_document = true;
  +        boolean attributesToText = atts.getIndex(LUCENE_URI, 
LUCENE_ATTR_TO_TEXT_ATTRIBUTE) != -1;
  +        for (int i = 0; i < atts.getLength(); i++) {
  +            if (LUCENE_URI.equals(atts.getURI(i))) continue;
  +
  +            String atts_lname = atts.getLocalName(i);
  +            String atts_value = atts.getValue(i);
  +            bodyDocument.add(Field.UnStored(lname + "@" + atts_lname, atts_value));
  +            if (attributesToText) {
  +                text.append(atts_value);
  +                text.append(' ');
  +                bodyText.append(atts_value);
  +                bodyText.append(' ');
               }
           }
  -        if (add_document) {
  -            documents.add(document);
  +
  +        if (text != null && text.length() > 0) {
  +            bodyDocument.add(Field.UnStored(lname, text.toString()));
           }
       }
   
  
  
  
  1.3       +3 -27     
xml-cocoon2/src/java/org/apache/cocoon/components/search/LuceneXMLIndexer.java
  
  Index: LuceneXMLIndexer.java
  ===================================================================
  RCS file: 
/home/cvs/xml-cocoon2/src/java/org/apache/cocoon/components/search/LuceneXMLIndexer.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- LuceneXMLIndexer.java     23 Jan 2002 19:06:38 -0000      1.2
  +++ LuceneXMLIndexer.java     25 Jan 2002 04:34:18 -0000      1.3
  @@ -14,6 +14,7 @@
   import org.apache.avalon.framework.component.Component;
   
   import org.apache.cocoon.ProcessingException;
  +import org.apache.lucene.document.Document;
   
   /**
    * The avalon behavioural component interface of generating
  @@ -33,7 +34,7 @@
    * </p>
    *
    * @author     <a href="mailto:[EMAIL PROTECTED]">Bernhard Huber</a>
  - * @version    CVS $Id: LuceneXMLIndexer.java,v 1.2 2002/01/23 19:06:38 vgritsenko 
Exp $
  + * @version    CVS $Id: LuceneXMLIndexer.java,v 1.3 2002/01/25 04:34:18 vgritsenko 
Exp $
    */
   public interface LuceneXMLIndexer extends Component
   {
  @@ -92,31 +93,6 @@
        */
       String UID_FIELD = "uid";
   
  -
  -    /**
  -     * Return a list of all lucene documents generated by
  -     * the method build().
  -     *
  -     * @return    List list of lucene documents
  -     * @since
  -     * @see       java.util.List
  -     * @see       #build( URL url )
  -     */
  -    List allDocuments();
  -
  -
  -    /**
  -     * return an iterator of all lucene documents generated by
  -     * the method build().
  -     *
  -     * @return    Iterator iterator of lucene Documents
  -     * @since
  -     * @see       java.util.Iterator
  -     * @see       #build( URL url )
  -     */
  -    Iterator iterator();
  -
  -
       /**
        * Build lucene documents from a URL.
        * <p>
  @@ -129,5 +105,5 @@
        * @exception  ProcessingException  Description of Exception
        * @since
        */
  -    void build(URL url) throws ProcessingException;
  +    List build(URL url) throws ProcessingException;
   }
  
  
  
  1.3       +3 -5      
xml-cocoon2/src/java/org/apache/cocoon/components/search/SimpleLuceneCocoonIndexerImpl.java
  
  Index: SimpleLuceneCocoonIndexerImpl.java
  ===================================================================
  RCS file: 
/home/cvs/xml-cocoon2/src/java/org/apache/cocoon/components/search/SimpleLuceneCocoonIndexerImpl.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- SimpleLuceneCocoonIndexerImpl.java        23 Jan 2002 19:06:38 -0000      1.2
  +++ SimpleLuceneCocoonIndexerImpl.java        25 Jan 2002 04:34:18 -0000      1.3
  @@ -46,7 +46,7 @@
    * </p>
    *
    * @author     <a href="mailto:[EMAIL PROTECTED]">Bernhard Huber</a>
  - * @version    CVS $Revision: 1.2 $ $Date: 2002/01/23 19:06:38 $
  + * @version    CVS $Revision: 1.3 $ $Date: 2002/01/25 04:34:18 $
    */
   public class SimpleLuceneCocoonIndexerImpl extends AbstractLoggable
            implements LuceneCocoonIndexer, Configurable, Composable, Disposable
  @@ -206,13 +206,11 @@
                   }
   
                   // build lucene documents from the content of the crawl_url
  -                lxi.build(crawl_url);
  -                Iterator i = lxi.iterator();
  +                Iterator i = lxi.build(crawl_url).iterator();
   
                   // add all built lucene documents
                   while (i.hasNext()) {
  -                    Document document = (Document) i.next();
  -                    writer.addDocument(document);
  +                    writer.addDocument((Document) i.next());
                   }
               }
               // optimize it
  
  
  
  1.3       +7 -41     
xml-cocoon2/src/java/org/apache/cocoon/components/search/SimpleLuceneXMLIndexerImpl.java
  
  Index: SimpleLuceneXMLIndexerImpl.java
  ===================================================================
  RCS file: 
/home/cvs/xml-cocoon2/src/java/org/apache/cocoon/components/search/SimpleLuceneXMLIndexerImpl.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- SimpleLuceneXMLIndexerImpl.java   23 Jan 2002 19:06:38 -0000      1.2
  +++ SimpleLuceneXMLIndexerImpl.java   25 Jan 2002 04:34:18 -0000      1.3
  @@ -54,10 +54,10 @@
    * A simple class building lucene documents from xml content.
    *
    * @author     <a href="mailto:[EMAIL PROTECTED]">Bernhard Huber</a>
  - * @version    CVS $Revision: 1.2 $ $Date: 2002/01/23 19:06:38 $
  + * @version    CVS $Revision: 1.3 $ $Date: 2002/01/25 04:34:18 $
    */
   public class SimpleLuceneXMLIndexerImpl extends AbstractLoggable
  -         implements LuceneXMLIndexer, Configurable, Composable
  +         implements LuceneXMLIndexer, Configurable, Composable, ThreadSafe
   {
   
       /**
  @@ -68,13 +68,6 @@
       protected ComponentManager manager = null;
   
       /**
  -     * list of lucene Document objects
  -     *
  -     * @since
  -     */
  -    List documents;
  -
  -    /**
        * append this string to the url in order to get the
        * content view of the url
        *
  @@ -94,8 +87,6 @@
        * @since
        */
       public SimpleLuceneXMLIndexerImpl() {
  -        documents = null;
  -
           allowedContentType = new HashSet();
           allowedContentType.add("text/xml");
           allowedContentType.add("text/xhtml");
  @@ -126,40 +117,13 @@
   
   
       /**
  -     * return a list of all lucene documents generated by
  -     *
  -     * @return    List list of lucene Documents
  -     * @since
  -     * @see       build
  -     */
  -    public List allDocuments() {
  -        return documents;
  -    }
  -
  -
  -    /**
  -     * return an iterator of all lucene documents generated by
  -     *
  -     * @return    Iterator iterator of lucene Documents
  -     * @since
  -     * @see       build
  -     */
  -    public Iterator iterator() {
  -        if (documents == null) {
  -            return new ArrayList().iterator();
  -        }
  -        return documents.iterator();
  -    }
  -
  -
  -    /**
        * Build lucene documents from a URL
        *
        * @param  url                      the content of this url gets indexed.
        * @exception  ProcessingException  Description of Exception
        * @since
        */
  -    public void build(URL url)
  +    public List build(URL url)
                throws ProcessingException {
   
           try {
  @@ -187,15 +151,17 @@
                       // store ... false, index ... true, token ... false
                       d.add(new Field(UID_FIELD, uid(contentURLConnection), false, 
true, false));
                   }
  -                documents = luceneIndexContentHandler.allDocuments();
  +
  +                return luceneIndexContentHandler.allDocuments();
               } else {
                   if (getLogger().isDebugEnabled()) {
                       getLogger().debug("Ignoring " + contentURL + " (" + contentType 
+ ")");
                   }
  +
  +                return java.util.Collections.EMPTY_LIST;
               }
           } catch (IOException ioe) {
               throw new ProcessingException("Cannot read URL " + url, ioe);
  -        } finally {
           }
       }
   
  
  
  

----------------------------------------------------------------------
In case of troubles, e-mail:     [EMAIL PROTECTED]
To unsubscribe, e-mail:          [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

Reply via email to