vgritsenko 02/01/24 20:34:19 Modified: src/java/org/apache/cocoon/components/search IndexHelperField.java LuceneIndexContentHandler.java LuceneXMLIndexer.java SimpleLuceneCocoonIndexerImpl.java SimpleLuceneXMLIndexerImpl.java Log: Improve lucene searching: - When indexing, create one lucene document per resource, not one document per element - Allow adding attribute values to the body text if element is marked by lucene:text-attr Result: - some important attributes (decided by document author) could be indexed as well - AND searches work as expected now. Example: person@name:Donald AND History Revision Changes Path 1.2 +3 -3 xml-cocoon2/src/java/org/apache/cocoon/components/search/IndexHelperField.java Index: IndexHelperField.java =================================================================== RCS file: /home/cvs/xml-cocoon2/src/java/org/apache/cocoon/components/search/IndexHelperField.java,v retrieving revision 1.1 retrieving revision 1.2 diff -u -r1.1 -r1.2 --- IndexHelperField.java 3 Jan 2002 12:31:13 -0000 1.1 +++ IndexHelperField.java 25 Jan 2002 04:34:18 -0000 1.2 @@ -24,7 +24,7 @@ * A helper class for generating a lucene document in a SAX ContentHandler. 
* * @author <a href="mailto:[EMAIL PROTECTED]">Bernhard Huber</a> - * @version CVS $Id: IndexHelperField.java,v 1.1 2002/01/03 12:31:13 giacomo Exp $ + * @version CVS $Id: IndexHelperField.java,v 1.2 2002/01/25 04:34:18 vgritsenko Exp $ */ class IndexHelperField { @@ -89,8 +89,8 @@ * @return The text value * @since */ - public String getText() { - return text.toString(); + public StringBuffer getText() { + return text; } 1.3 +32 -27 xml-cocoon2/src/java/org/apache/cocoon/components/search/LuceneIndexContentHandler.java Index: LuceneIndexContentHandler.java =================================================================== RCS file: /home/cvs/xml-cocoon2/src/java/org/apache/cocoon/components/search/LuceneIndexContentHandler.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- LuceneIndexContentHandler.java 23 Jan 2002 19:06:38 -0000 1.2 +++ LuceneIndexContentHandler.java 25 Jan 2002 04:34:18 -0000 1.3 @@ -27,17 +27,22 @@ * Parse XML and generate lucene document(s) * * @author <a href="mailto:[EMAIL PROTECTED]">Bernhard Huber</a> - * @version CVS $Id: LuceneIndexContentHandler.java,v 1.2 2002/01/23 19:06:38 vgritsenko Exp $ + * @version CVS $Id: LuceneIndexContentHandler.java,v 1.3 2002/01/25 04:34:18 vgritsenko Exp $ */ public class LuceneIndexContentHandler implements ContentHandler { + public static final String LUCENE_URI = "http://apache.org/cocoon/lucene/1.0"; + + /** If this attribute is specified on element, values of all attributes + * if this element added to the text of the element, and to the document + * body text */ + public static final String LUCENE_ATTR_TO_TEXT_ATTRIBUTE = "text-attr"; + StringBuffer bodyText; private List documents; private Document bodyDocument; - private Stack elementStack; - /** *Constructor for the LuceneIndexContentHandler object * @@ -48,10 +53,9 @@ this.bodyDocument = new Document(); this.documents = new ArrayList(); this.documents.add(this.bodyDocument); - elementStack = new Stack(); + 
this.elementStack = new Stack(); } - /** *Sets the documentLocator attribute of the LuceneIndexContentHandler object * @@ -92,10 +96,13 @@ * @since */ public void characters(char[] ch, int start, int length) { - IndexHelperField tos = (IndexHelperField) elementStack.peek(); - if (ch.length > 0 && start >= 0 && length > 1 && tos != null) { + + if (ch.length > 0 && start >= 0 && length > 1) { String text = new String(ch, start, length); - tos.appendText(text); + if (elementStack.size() > 0) { + IndexHelperField tos = (IndexHelperField) elementStack.peek(); + tos.appendText(text); + } bodyText.append(text); } } @@ -124,30 +131,28 @@ */ public void endElement(String namespaceURI, String localName, String qName) { IndexHelperField tos = (IndexHelperField) elementStack.pop(); - String text = tos.getText(); String lname = tos.getLocalFieldName(); - String qname = tos.getQualifiedFieldName(); - - Document document = new Document(); - boolean add_document = false; - if (text != null && text.length() > 0) { - System.out.println("field qname " + qname); - document.add(Field.UnStored(qName, text)); - add_document = true; - } + StringBuffer text = tos.getText(); + // (VG): Atts are never null, see startElement Attributes atts = tos.getAttributes(); - if (atts != null && atts.getLength() > 0) { - for (int i = 0; i < atts.getLength(); i++) { - String atts_qname = atts.getQName(i); - String atts_value = atts.getValue(i); - System.out.println("attribute field " + qname + "@" + atts_qname + ": " + atts_value); - document.add(Field.UnStored(qname + "@" + atts_qname, atts_value)); - add_document = true; + boolean attributesToText = atts.getIndex(LUCENE_URI, LUCENE_ATTR_TO_TEXT_ATTRIBUTE) != -1; + for (int i = 0; i < atts.getLength(); i++) { + if (LUCENE_URI.equals(atts.getURI(i))) continue; + + String atts_lname = atts.getLocalName(i); + String atts_value = atts.getValue(i); + bodyDocument.add(Field.UnStored(lname + "@" + atts_lname, atts_value)); + if (attributesToText) { + 
text.append(atts_value); + text.append(' '); + bodyText.append(atts_value); + bodyText.append(' '); } } - if (add_document) { - documents.add(document); + + if (text != null && text.length() > 0) { + bodyDocument.add(Field.UnStored(lname, text.toString())); } } 1.3 +3 -27 xml-cocoon2/src/java/org/apache/cocoon/components/search/LuceneXMLIndexer.java Index: LuceneXMLIndexer.java =================================================================== RCS file: /home/cvs/xml-cocoon2/src/java/org/apache/cocoon/components/search/LuceneXMLIndexer.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- LuceneXMLIndexer.java 23 Jan 2002 19:06:38 -0000 1.2 +++ LuceneXMLIndexer.java 25 Jan 2002 04:34:18 -0000 1.3 @@ -14,6 +14,7 @@ import org.apache.avalon.framework.component.Component; import org.apache.cocoon.ProcessingException; +import org.apache.lucene.document.Document; /** * The avalon behavioural component interface of generating @@ -33,7 +34,7 @@ * </p> * * @author <a href="mailto:[EMAIL PROTECTED]">Bernhard Huber</a> - * @version CVS $Id: LuceneXMLIndexer.java,v 1.2 2002/01/23 19:06:38 vgritsenko Exp $ + * @version CVS $Id: LuceneXMLIndexer.java,v 1.3 2002/01/25 04:34:18 vgritsenko Exp $ */ public interface LuceneXMLIndexer extends Component { @@ -92,31 +93,6 @@ */ String UID_FIELD = "uid"; - - /** - * Return a list of all lucene documents generated by - * the method build(). - * - * @return List list of lucene documents - * @since - * @see java.util.List - * @see #build( URL url ) - */ - List allDocuments(); - - - /** - * return an iterator of all lucene documents generated by - * the method build(). - * - * @return Iterator iterator of lucene Documents - * @since - * @see java.util.Iterator - * @see #build( URL url ) - */ - Iterator iterator(); - - /** * Build lucene documents from a URL. 
* <p> @@ -129,5 +105,5 @@ * @exception ProcessingException Description of Exception * @since */ - void build(URL url) throws ProcessingException; + List build(URL url) throws ProcessingException; } 1.3 +3 -5 xml-cocoon2/src/java/org/apache/cocoon/components/search/SimpleLuceneCocoonIndexerImpl.java Index: SimpleLuceneCocoonIndexerImpl.java =================================================================== RCS file: /home/cvs/xml-cocoon2/src/java/org/apache/cocoon/components/search/SimpleLuceneCocoonIndexerImpl.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- SimpleLuceneCocoonIndexerImpl.java 23 Jan 2002 19:06:38 -0000 1.2 +++ SimpleLuceneCocoonIndexerImpl.java 25 Jan 2002 04:34:18 -0000 1.3 @@ -46,7 +46,7 @@ * </p> * * @author <a href="mailto:[EMAIL PROTECTED]">Bernhard Huber</a> - * @version CVS $Revision: 1.2 $ $Date: 2002/01/23 19:06:38 $ + * @version CVS $Revision: 1.3 $ $Date: 2002/01/25 04:34:18 $ */ public class SimpleLuceneCocoonIndexerImpl extends AbstractLoggable implements LuceneCocoonIndexer, Configurable, Composable, Disposable @@ -206,13 +206,11 @@ } // build lucene documents from the content of the crawl_url - lxi.build(crawl_url); - Iterator i = lxi.iterator(); + Iterator i = lxi.build(crawl_url).iterator(); // add all built lucene documents while (i.hasNext()) { - Document document = (Document) i.next(); - writer.addDocument(document); + writer.addDocument((Document) i.next()); } } // optimize it 1.3 +7 -41 xml-cocoon2/src/java/org/apache/cocoon/components/search/SimpleLuceneXMLIndexerImpl.java Index: SimpleLuceneXMLIndexerImpl.java =================================================================== RCS file: /home/cvs/xml-cocoon2/src/java/org/apache/cocoon/components/search/SimpleLuceneXMLIndexerImpl.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- SimpleLuceneXMLIndexerImpl.java 23 Jan 2002 19:06:38 -0000 1.2 +++ SimpleLuceneXMLIndexerImpl.java 25 Jan 2002 04:34:18 -0000 1.3 @@ -54,10 
+54,10 @@ * A simple class building lucene documents from xml content. * * @author <a href="mailto:[EMAIL PROTECTED]">Bernhard Huber</a> - * @version CVS $Revision: 1.2 $ $Date: 2002/01/23 19:06:38 $ + * @version CVS $Revision: 1.3 $ $Date: 2002/01/25 04:34:18 $ */ public class SimpleLuceneXMLIndexerImpl extends AbstractLoggable - implements LuceneXMLIndexer, Configurable, Composable + implements LuceneXMLIndexer, Configurable, Composable, ThreadSafe { /** @@ -68,13 +68,6 @@ protected ComponentManager manager = null; /** - * list of lucene Document objects - * - * @since - */ - List documents; - - /** * append this string to the url in order to get the * content view of the url * @@ -94,8 +87,6 @@ * @since */ public SimpleLuceneXMLIndexerImpl() { - documents = null; - allowedContentType = new HashSet(); allowedContentType.add("text/xml"); allowedContentType.add("text/xhtml"); @@ -126,40 +117,13 @@ /** - * return a list of all lucene documents generated by - * - * @return List list of lucene Documents - * @since - * @see build - */ - public List allDocuments() { - return documents; - } - - - /** - * return an iterator of all lucene documents generated by - * - * @return Iterator iterator of lucene Documents - * @since - * @see build - */ - public Iterator iterator() { - if (documents == null) { - return new ArrayList().iterator(); - } - return documents.iterator(); - } - - - /** * Build lucenen documents from a URL * * @param url the content of this url gets indexed. * @exception ProcessingException Description of Exception * @since */ - public void build(URL url) + public List build(URL url) throws ProcessingException { try { @@ -187,15 +151,17 @@ // store ... false, index ... true, token ... 
false d.add(new Field(UID_FIELD, uid(contentURLConnection), false, true, false)); } - documents = luceneIndexContentHandler.allDocuments(); + + return luceneIndexContentHandler.allDocuments(); } else { if (getLogger().isDebugEnabled()) { getLogger().debug("Ignoring " + contentURL + " (" + contentType + ")"); } + + return java.util.Collections.EMPTY_LIST; } } catch (IOException ioe) { throw new ProcessingException("Cannot read URL " + url, ioe); - } finally { } }
---------------------------------------------------------------------- In case of trouble, e-mail: [EMAIL PROTECTED] To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]