stefano 01/12/11 15:00:25 Added: src/org/apache/cocoon/components/lucene CocoonCrawler.java CocoonErrorHandler.java IndexHelperField.java LuceneCocoonHelper.java LuceneCocoonIndexer.java LuceneCocoonPager.java LuceneCocoonSearcher.java LuceneIndexContentHandler.java LuceneXMLIndexer.java SimpleCocoonCrawlerImpl.java SimpleLuceneCocoonIndexerImpl.java SimpleLuceneCocoonSearcherImpl.java SimpleLuceneXMLIndexerImpl.java Log: adding the search components Revision Changes Path 1.1 xml-cocoon2/src/org/apache/cocoon/components/lucene/CocoonCrawler.java Index: CocoonCrawler.java =================================================================== /***************************************************************************** * Copyright (C) The Apache Software Foundation. All rights reserved. * * ------------------------------------------------------------------------- * * This software is published under the terms of the Apache Software License * * version 1.1, a copy of which has been included with this distribution in * * the LICENSE file. * *****************************************************************************/ package org.apache.cocoon.components.lucene; import org.apache.avalon.framework.component.Component; import java.util.*; import java.net.*; /** * A cocoon crawler component */ public interface CocoonCrawler extends Component { public final static String ROLE = "org.apache.cocoon.components.lucene.CocoonCrawler"; /** * start crawlin the URL */ public void crawl( URL url ); /** * iterate over crawled URL */ public Iterator iterator(); } 1.1 xml-cocoon2/src/org/apache/cocoon/components/lucene/CocoonErrorHandler.java Index: CocoonErrorHandler.java =================================================================== /***************************************************************************** * Copyright (C) The Apache Software Foundation. All rights reserved. 
* * ------------------------------------------------------------------------- * * This software is published under the terms of the Apache Software License * * version 1.1, a copy of which has been included with this distribution in * * the LICENSE file. * *****************************************************************************/ package org.apache.cocoon.components.lucene; /** * Title: lucene * Description: Demo files using lucene indexer&searcher * Copyright: Copyright (c) 2001 * Company: * @author Bernhard Huber * @version 1.0 */ import org.xml.sax.SAXException; import org.xml.sax.ErrorHandler; import org.xml.sax.SAXParseException; public class CocoonErrorHandler implements ErrorHandler { private String getExceptionInfo(SAXParseException exception) { StringBuffer sb = new StringBuffer(); sb.append("pubId: ").append(exception.getPublicId()); sb.append(", "); sb.append("sysId: ").append(exception.getSystemId()); sb.append(", "); sb.append("col: ").append(String.valueOf(exception.getColumnNumber())); sb.append(", "); sb.append("line: ").append(String.valueOf(exception.getLineNumber())); sb.append(" "); return sb.toString(); } public void error(SAXParseException exception) { System.err.println("CocoonErrorHandler : " + getExceptionInfo(exception) + "error: " + exception.getMessage()); } public void fatalError(SAXParseException exception) { System.err.println("CocoonErrorHandler : " + getExceptionInfo(exception) + "fatalError: " + exception.getMessage()); } public void warning(SAXParseException exception) { System.err.println("CocoonErrorHandler : " + getExceptionInfo(exception) + "warning: " + exception.getMessage()); } } 1.1 xml-cocoon2/src/org/apache/cocoon/components/lucene/IndexHelperField.java Index: IndexHelperField.java =================================================================== /***************************************************************************** * Copyright (C) The Apache Software Foundation. All rights reserved. 
* * ------------------------------------------------------------------------- * * This software is published under the terms of the Apache Software License * * version 1.1, a copy of which has been included with this distribution in * * the LICENSE file. * *****************************************************************************/ package org.apache.cocoon.components.lucene; /** * Title: lucene * Description: Demo files using lucene indexer&searcher * Copyright: Copyright (c) 2001 * Company: * @author Bernhard Huber * @version 1.0 */ import org.xml.sax.ContentHandler; import org.xml.sax.ErrorHandler; import org.xml.sax.Locator; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; import org.xml.sax.XMLReader; import org.xml.sax.Attributes; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.DateField; class IndexHelperField { String localFieldName; String qualifiedFieldName; StringBuffer text; Attributes attributes; IndexHelperField(String lfn, String qfn, Attributes atts) { this.localFieldName = lfn; this.qualifiedFieldName = qfn; this.attributes = atts; this.text = new StringBuffer(); } public String getLocalFieldName() { return localFieldName; } public String getQualifiedFieldName() { return qualifiedFieldName; } public Attributes getAttributes() { return attributes; } public String getText() { return text.toString(); } public void appendText(String text) { this.text.append(text); } public void appendText(char[] str, int offset, int length) { this.text.append(str, offset, length); } } 1.1 xml-cocoon2/src/org/apache/cocoon/components/lucene/LuceneCocoonHelper.java Index: LuceneCocoonHelper.java =================================================================== /***************************************************************************** * Copyright (C) The Apache Software Foundation. All rights reserved. 
* * ------------------------------------------------------------------------- * * This software is published under the terms of the Apache Software License * * version 1.1, a copy of which has been included with this distribution in * * the LICENSE file. * *****************************************************************************/ package org.apache.cocoon.components.lucene; import org.apache.lucene.store.*; import org.apache.lucene.index.*; import org.apache.lucene.analysis.Analyzer; import java.io.File; import java.io.IOException; /** * This class encapsulates some helper methods. * */ public class LuceneCocoonHelper { public static Directory getDirectory( File directory, boolean create ) throws IOException { FSDirectory fsDirectory = FSDirectory.getDirectory( directory, create ); return fsDirectory; } public static Analyzer getAnalyzer( String analyzer_class_name ) { Analyzer analyzer = null; try { Class analyzer_class = Class.forName( analyzer_class_name ); analyzer = (Analyzer)analyzer_class.newInstance(); } catch (Exception e) { } return analyzer; } public static IndexReader getIndexReader( Directory directory ) throws IOException { IndexReader reader = IndexReader.open( directory ); return reader; } public static IndexWriter getIndexWriter( Directory index, Analyzer analyzer, boolean create ) throws IOException { IndexWriter writer = new IndexWriter( index, analyzer, create ); return writer; } } 1.1 xml-cocoon2/src/org/apache/cocoon/components/lucene/LuceneCocoonIndexer.java Index: LuceneCocoonIndexer.java =================================================================== /***************************************************************************** * Copyright (C) The Apache Software Foundation. All rights reserved. 
* * ------------------------------------------------------------------------- * * This software is published under the terms of the Apache Software License * * version 1.1, a copy of which has been included with this distribution in * * the LICENSE file. * *****************************************************************************/ package org.apache.cocoon.components.lucene; import org.apache.avalon.framework.component.Component; import org.apache.cocoon.ProcessingException; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.store.Directory; import java.net.URL; public interface LuceneCocoonIndexer extends Component { public final static String ROLE = "org.apache.cocoon.components.lucene.LuceneCocoonIndexer"; public void setAnalyzer( Analyzer analyzer ); public void index( Directory index, boolean create, URL base_url ) throws ProcessingException; } 1.1 xml-cocoon2/src/org/apache/cocoon/components/lucene/LuceneCocoonPager.java Index: LuceneCocoonPager.java =================================================================== /***************************************************************************** * Copyright (C) The Apache Software Foundation. All rights reserved. * * ------------------------------------------------------------------------- * * This software is published under the terms of the Apache Software License * * version 1.1, a copy of which has been included with this distribution in * * the LICENSE file. * *****************************************************************************/ package org.apache.cocoon.components.lucene; import org.apache.lucene.store.*; import org.apache.lucene.index.*; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.search.Hits; import org.apache.lucene.document.Document; import java.io.File; import java.io.IOException; import java.util.*; // implementtion of ListIterator /** * This class should help you to manage paging of hits. 
*
 * <p>
 * Each call to <code>next()</code> or <code>previous()</code> returns one
 * page: a <code>List</code> of {@link HitWrapper} objects holding at most
 * <code>countOfHitsPerPage</code> hits. The cursor (<code>hitsIndex</code>)
 * always points at the first hit of the page <code>next()</code> would
 * return.
 * </p>
 */
public class LuceneCocoonPager implements ListIterator {

    public final static int COUNT_OF_HITS_PER_PAGE_DEFAULT = 5;
    public final static int HITS_INDEX_START_DEFAULT = 0;

    /**
     * hits to iterate upon
     */
    private Hits hits;

    /**
     * current index of hit to return by next()
     */
    int hitsIndex = HITS_INDEX_START_DEFAULT;

    /**
     * maximum count of hits to return by next(), and previous()
     */
    int countOfHitsPerPage = COUNT_OF_HITS_PER_PAGE_DEFAULT;

    /**
     * Index of the first hit of the page delivered most recently, or -1
     * when the cursor was positioned explicitly. Needed because the last
     * page may hold fewer than countOfHitsPerPage hits, so its start
     * cannot be derived from hitsIndex alone.
     */
    private int lastPageStart = -1;

    /**
     * Value of hitsIndex right after the most recent page was delivered;
     * lastPageStart is only trusted while hitsIndex still has this value.
     */
    private int lastPageEnd = -1;

    /**
     * Create a pager over the given hits, positioned at the first hit.
     *
     * @param hits the hits to page through
     */
    public LuceneCocoonPager( Hits hits ) {
        setHits( hits );
    }

    /**
     * Create a pager without hits; call <code>setHits()</code> before use.
     */
    public LuceneCocoonPager() {
    }

    /**
     * Set the hits to page through and rewind to the first hit.
     *
     * @param hits the hits to page through
     */
    public void setHits( Hits hits ) {
        this.hits = hits;
        this.hitsIndex = HITS_INDEX_START_DEFAULT;
        forgetLastPage();
    }

    /**
     * Set count of hits displayed per single page
     */
    public void setCountOfHitsPerPage( int countOfHitsPerPage ) {
        this.countOfHitsPerPage = countOfHitsPerPage;
        forgetLastPage();
    }

    /**
     * Get count of hits displayed per single page
     */
    public int getCountOfHitsPerPage() {
        return this.countOfHitsPerPage;
    }

    /**
     * Calculate count of pages for displaying all hits
     *
     * @return number of pages, the last one possibly partially filled
     */
    public int getCountOfPages() {
        int count_of_pages = hits.length() / this.countOfHitsPerPage;
        int remainder = hits.length() % this.countOfHitsPerPage;
        if (remainder != 0) {
            count_of_pages += 1;
        }
        return count_of_pages;
    }

    /**
     * Get starting index for retrieving hits
     */
    public int getStartIndex() {
        return this.hitsIndex;
    }

    /**
     * Set starting index for retrieving hits
     */
    public void setStartIndex( int start_index ) {
        this.hitsIndex = start_index;
        forgetLastPage();
    }

    /**
     * Inserts the specified element into the list (optional operation).
     * Not supported.
     */
    public void add(Object o) throws UnsupportedOperationException {
        throw new UnsupportedOperationException();
    }

    /**
     * Returns true if this list iterator has more elements when traversing
     * the list in the forward direction.
     */
    public boolean hasNext() {
        return hitsIndex < hits.length();
    }

    /**
     * Returns true if this list iterator has more elements when traversing
     * the list in the reverse direction, i.e. if there is a page in front
     * of the current one.
     */
    public boolean hasPrevious() {
        return currentPageStart() > 0;
    }

    /**
     * Returns the next element in the list.
     *
     * @return a List of HitWrapper objects, at most countOfHitsPerPage long
     * @throws NoSuchElementException if there are no more hits, or a hit
     *   cannot be read
     */
    public Object next() {
        int endIndex = Math.min( hits.length(), hitsIndex + countOfHitsPerPage );
        if (hitsIndex >= endIndex) {
            throw new NoSuchElementException();
        }

        int pageStart = hitsIndex;
        ArrayList hitsPerPageList = new ArrayList();
        while (hitsIndex < endIndex) {
            hitsPerPageList.add( wrap( hitsIndex ) );
            hitsIndex++;
        }
        // remember where this page began; it may be shorter than a full page
        lastPageStart = pageStart;
        lastPageEnd = hitsIndex;
        return hitsPerPageList;
    }

    /**
     * Returns the index of the element that would be returned by a
     * subsequent call to next.
     */
    public int nextIndex() {
        return Math.min( hitsIndex, hits.length() );
    }

    /**
     * Returns the previous element in the list: the page in front of the
     * current page. Afterwards a call to next() returns the page that was
     * current before this call.
     *
     * @return a List of HitWrapper objects, at most countOfHitsPerPage long
     * @throws NoSuchElementException if there is no previous page, or a
     *   hit cannot be read
     */
    public Object previous() {
        int pageStart = currentPageStart();
        int startIndex = Math.max( 0, pageStart - countOfHitsPerPage );
        // endIndex is exclusive, hence clamp to length(), not length() - 1
        int endIndex = Math.min( hits.length(), pageStart );
        if (startIndex >= endIndex) {
            throw new NoSuchElementException();
        }

        ArrayList hitsPerPageList = new ArrayList();
        for (int i = startIndex; i < endIndex; i++) {
            hitsPerPageList.add( wrap( i ) );
        }
        hitsIndex = endIndex;
        lastPageStart = startIndex;
        lastPageEnd = endIndex;
        return hitsPerPageList;
    }

    /**
     * Returns the index of the element that would be returned by a
     * subsequent call to previous.
     */
    public int previousIndex() {
        return Math.max( 0, currentPageStart() - countOfHitsPerPage );
    }

    /**
     * Removes from the list the last element that was returned by next or
     * previous (optional operation). Not supported.
     */
    public void remove() {
        throw new UnsupportedOperationException();
    }

    /**
     * Replaces the last element returned by next or previous with the
     * specified element (optional operation). Not supported.
     */
    public void set(Object o) {
        throw new UnsupportedOperationException();
    }

    /**
     * Index of the first hit of the current page. When no page has been
     * delivered since the cursor was last positioned, the current page is
     * taken to be the full page ending at the cursor.
     */
    private int currentPageStart() {
        if (lastPageStart >= 0 && lastPageEnd == hitsIndex) {
            return lastPageStart;
        }
        return hitsIndex - countOfHitsPerPage;
    }

    /** Drop the remembered page bounds after the cursor was repositioned. */
    private void forgetLastPage() {
        lastPageStart = -1;
        lastPageEnd = -1;
    }

    /** Wrap score and document of the hit at the given index. */
    private HitWrapper wrap( int index ) {
        try {
            return new HitWrapper( hits.score( index ), hits.doc( index ) );
        } catch (IOException ioe) {
            throw new NoSuchElementException( "no more hits: " + ioe.getMessage() );
        }
    }

    /**
     * A helper class encapsulating found document, and its score
     */
    public static class HitWrapper {
        float score;
        Document document;

        public HitWrapper( float score, Document document ) {
            this.document = document;
            this.score = score;
        }

        /** @return the found document */
        public Document getDocument() {
            return document;
        }

        /** @return the score of the found document */
        public float getScore() {
            return score;
        }

        /**
         * @param field name of a document field
         * @return the value <code>document.get(field)</code> yields
         */
        public String getField(String field) {
            return document.get(field);
        }
    }
}

1.1                  xml-cocoon2/src/org/apache/cocoon/components/lucene/LuceneCocoonSearcher.java

Index: LuceneCocoonSearcher.java
===================================================================
/*****************************************************************************
 * Copyright (C) The Apache Software Foundation. All rights reserved.        *
 * ------------------------------------------------------------------------- *
 * This software is published under the terms of the Apache Software License *
 * version 1.1, a copy of which has been included with this distribution in  *
 * the LICENSE file.
* *****************************************************************************/ package org.apache.cocoon.components.lucene; import org.apache.avalon.framework.component.Component; import org.apache.cocoon.ProcessingException; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.store.Directory; import org.apache.lucene.search.Hits; public interface LuceneCocoonSearcher extends Component { public final static String ROLE = "org.apache.cocoon.components.lucene.LuceneCocoonSearcher"; public void setAnalyzer( Analyzer analyzer ); public void setDirectory( Directory directory ); public Hits search( String query_string, String default_field ) throws ProcessingException; } 1.1 xml-cocoon2/src/org/apache/cocoon/components/lucene/LuceneIndexContentHandler.java Index: LuceneIndexContentHandler.java =================================================================== /***************************************************************************** * Copyright (C) The Apache Software Foundation. All rights reserved. * * ------------------------------------------------------------------------- * * This software is published under the terms of the Apache Software License * * version 1.1, a copy of which has been included with this distribution in * * the LICENSE file. 
* *****************************************************************************/ package org.apache.cocoon.components.lucene; /** * Title: lucene * Description: Demo files using lucene indexer&searcher * Copyright: Copyright (c) 2001 * Company: * @author Bernhard Huber * @version 1.0 */ import org.xml.sax.ContentHandler; import org.xml.sax.Locator; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.Attributes; import org.xml.sax.helpers.AttributesImpl; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.DateField; import java.util.Stack; import java.util.Iterator; import java.util.List; import java.util.ArrayList; /** * Parse XML and generate lucene document(s) */ public class LuceneIndexContentHandler implements ContentHandler { private List documents; StringBuffer bodyText; private Document bodyDocument; private Stack elementStack; private int indentPos = 0; public LuceneIndexContentHandler() { this.bodyText = new StringBuffer(); this.bodyDocument = new Document(); this.documents = new ArrayList(); this.documents.add( this.bodyDocument ); elementStack = new Stack(); } public List allDocuments() { return documents; } public Iterator iterator() { return documents.iterator(); } private String indent() { final String LINE_PREFIX = " "; if (indentPos > LINE_PREFIX.length()) { return LINE_PREFIX; } else { return LINE_PREFIX.substring(0, indentPos); } } private void incIndent() { indentPos++; } private void decIndent() { indentPos--; } public void characters(char[] ch, int start, int length) { IndexHelperField tos = (IndexHelperField) elementStack.peek(); if (ch.length > 0 && start >= 0 && length > 1 && tos != null) { String text = new String( ch, start, length ); tos.appendText( text ); bodyText.append( text ); } } public void endDocument() { /* empty */ bodyDocument.add( Field.UnStored( LuceneXMLIndexer.BODY_FIELD, bodyText.toString()) ); } public void endElement(String 
namespaceURI, String localName, String qName) { //System.out.println(indent() + "ee: " + "localName " + localName + " " + "qName " + qName); IndexHelperField tos = (IndexHelperField) elementStack.pop(); String text = tos.getText(); String lname = tos.getLocalFieldName(); String qname = tos.getQualifiedFieldName(); Document document = new Document(); boolean add_document = false; if (text != null && text.length() > 0) { System.out.println( "field qname " + qname ); document.add( Field.UnStored( qName, text ) ); add_document = true; } Attributes atts = tos.getAttributes(); if (atts != null && atts.getLength() > 0) { for (int i = 0; i < atts.getLength(); i++ ) { String atts_qname = atts.getQName(i); String atts_value = atts.getValue(i); System.out.println("attribute field " + qname + "@" + atts_qname + ": " + atts_value ); document.add( Field.UnStored( qname + "@" + atts_qname, atts_value ) ); add_document = true; } } if (add_document) { documents.add( document ); } decIndent(); } public void endPrefixMapping(String prefix) { /* empty */ System.out.println(indent() + "endPrefixMapping " + prefix ); } public void ignorableWhitespace(char[] ch, int start, int length) { /* empty */ System.out.println(indent() + "ignorableWhitspace " ); } public void processingInstruction(String target, String data) { /* empty */ System.out.println(indent() + "processingInstruction " + target + " " + data ); } public void setDocumentLocator(Locator locator) { /* empty */ System.out.println(indent() + "startDocuementLocator " + locator ); } public void skippedEntity(String name) { /* empty */ System.out.println(indent() + "skippedEntity " + name ); } public void startDocument() { /* empty */ System.out.println(indent() + "startDocument" ); } public void startElement(String namespaceURI, String localName, String qName, Attributes atts) { incIndent(); //System.out.println(indent() + "se: " + "localName " + localName + " " + "qName " + qName); IndexHelperField ihf = new 
IndexHelperField(localName, qName, new AttributesImpl(atts) ); elementStack.push(ihf); } public void startPrefixMapping(String prefix, String uri) { /* empty */ System.out.println(indent() + "startPrefixMapping: " + prefix + " " + uri ); } } 1.1 xml-cocoon2/src/org/apache/cocoon/components/lucene/LuceneXMLIndexer.java Index: LuceneXMLIndexer.java =================================================================== /***************************************************************************** * Copyright (C) The Apache Software Foundation. All rights reserved. * * ------------------------------------------------------------------------- * * This software is published under the terms of the Apache Software License * * version 1.1, a copy of which has been included with this distribution in * * the LICENSE file. * *****************************************************************************/ package org.apache.cocoon.components.lucene; import org.apache.avalon.framework.component.Component; import org.apache.cocoon.ProcessingException; import java.util.List; import java.util.Iterator; import java.net.URL; /** * This interface specifies generating lucene documents from an * xml content. * * <p> * The well-known fields of a lucene documents are defined as * <code>*_FIELD</code> constants. * </p> * <p> * You may access generated lucene documents via * <code>allDocuments()</code>, or <code>iterator()</code>. * </p> * <p> * You trigger the generating of lucene documents via * <code>build()</code>. * </p> * */ public interface LuceneXMLIndexer extends Component { public final static String ROLE = "org.apache.cocoon.components.lucene.LuceneXMLIndexer"; /** Field of document's body, ie <tt>body</tt> (mandatory). 
*/ public static final String BODY_FIELD = "body"; public static final String URL_FIELD = "url"; public static final String UID_FIELD = "uid"; /** * return a list of all lucene documents generated by @see build * * @return List list of lucene Documents */ public List allDocuments(); /** * return an iterator of all lucene documents generated by @see build * * @return Iterator iterator of lucene Documents */ public Iterator iterator(); /** * Build lucenen documents from a URL * * @param url the content of this url gets indexed. */ public void build(URL url) throws ProcessingException; } 1.1 xml-cocoon2/src/org/apache/cocoon/components/lucene/SimpleCocoonCrawlerImpl.java Index: SimpleCocoonCrawlerImpl.java =================================================================== /***************************************************************************** * Copyright (C) The Apache Software Foundation. All rights reserved. * * ------------------------------------------------------------------------- * * This software is published under the terms of the Apache Software License * * version 1.1, a copy of which has been included with this distribution in * * the LICENSE file. 
* *****************************************************************************/ package org.apache.cocoon.components.lucene; import org.apache.avalon.excalibur.pool.Recyclable; import org.apache.avalon.framework.activity.Disposable; import org.apache.avalon.framework.configuration.Configurable; import org.apache.avalon.framework.configuration.Configuration; import org.apache.avalon.framework.configuration.ConfigurationException; import org.apache.avalon.framework.logger.AbstractLoggable; import org.apache.avalon.framework.parameters.Parameters; import org.apache.avalon.framework.thread.ThreadSafe; import org.apache.cocoon.Constants; import org.apache.cocoon.util.Tokenizer; import org.apache.log.Logger; import org.apache.regexp.RE; import org.apache.regexp.RESyntaxException; import java.io.*; import java.util.*; import java.net.*; /** * A simple cocoon crawler. */ public class SimpleCocoonCrawlerImpl extends AbstractLoggable implements CocoonCrawler, Configurable, Disposable, Recyclable { /** * Append this query, for querying the link view of an URL */ private final static String LINK_VIEW_QUERY_CONFIG = "link-view-query"; private final static String LINK_VIEW_QUERY_DEFAULT = "?cocoon-view=links"; private String linkViewQuery = LINK_VIEW_QUERY_DEFAULT; /** * Expected content-type of a link view response. 
*/ public final static String LINK_CONTENT_TYPE_CONFIG = "link-content-type"; public final String LINK_CONTENT_TYPE_DEFAULT = "application/x-cocoon-links"; private String linkContentType = LINK_CONTENT_TYPE_DEFAULT; private final static String EXCLUDE_CONFIG = "exclude"; private HashSet excludeCrawlingURL; private final static String INCLUDE_CONFIG = "include"; private HashSet includeCrawlingURL; private final static String USER_AGENT_CONFIG = "user-agent"; private final static String USER_AGENT_DEFAULT = Constants.COMPLETE_NAME; private String userAgent = USER_AGENT_DEFAULT; private final static String ACCEPT_CONFIG = "accept"; private final static String ACCEPT_DEFAULT = USER_AGENT_DEFAULT; private String accept = ACCEPT_DEFAULT; private HashSet crawled; private HashSet urlsToProcess; /** * configure component * allow: * <pre><tt> * <include>.x</include> or <include>.xxx, .yyy</include> * <exclude>.x</exclude> or <exclude>.xxx, .yyy</exclude> * <link-content-type>dfd</link-content-type> * <link-view-query>dfsd</link-view-query> * </tt></pre> */ public void configure( Configuration configuration ) throws ConfigurationException { Configuration []children; children = configuration.getChildren( INCLUDE_CONFIG ); if (children != null) { for (int i = 0; i < children.length; i++) { String pattern = children[i].getValue(); try { Tokenizer t = new Tokenizer( pattern, ", " ); while (t.hasMoreTokens()) { String tokenized_pattern = t.nextToken(); this.includeCrawlingURL.add( new RE( tokenized_pattern ) ); } } catch (RESyntaxException rese) { getLogger().error( "Cannot create includeing regular-expression for " + pattern, rese ); } } } children = configuration.getChildren( EXCLUDE_CONFIG ); if (children != null) { for (int i = 0; i < children.length; i++) { String pattern = children[i].getValue(); try { Tokenizer t = new Tokenizer( pattern, ", " ); while (t.hasMoreTokens()) { String tokenized_pattern = t.nextToken(); this.excludeCrawlingURL.add( new RE( tokenized_pattern ) ); 
} } catch (RESyntaxException rese) { getLogger().error( "Cannot create excluding regular-expression for " + pattern, rese ); } } } Configuration child; String value; child = configuration.getChild( LINK_CONTENT_TYPE_CONFIG, false ); if (child != null) { value = child.getValue(); if (value != null && value.length() > 0) { this.linkContentType = value; } } child = configuration.getChild( LINK_VIEW_QUERY_CONFIG, false ); if (child != null) { value = child.getValue(); if (value != null && value.length() > 0) { this.linkViewQuery = value; } } child = configuration.getChild( USER_AGENT_CONFIG, false ); if (child != null) { value = child.getValue(); if (value != null && value.length() > 0) { this.userAgent = value; } } child = configuration.getChild( ACCEPT_CONFIG, false ); if (child != null) { value = child.getValue(); if (value != null && value.length() > 0) { this.accept = value; } } } /** * dispose at end of life cycle, releasing all resources. */ public void dispose() { crawled = null; urlsToProcess = null; excludeCrawlingURL = null; includeCrawlingURL = null; } /** * recylcle this object, relasing resources */ public void recycle() { crawled = null; urlsToProcess = null; } public SimpleCocoonCrawlerImpl() { includeCrawlingURL = null; excludeCrawlingURL = new HashSet(); setImageExcludeFromCrawling(); } private void setImageExcludeFromCrawling() { String []EXCLUDE_FROM_CRAWLING_DEFAULT = { "*.\\.gif$", "*.\\.png$", "*.\\.jpe?g$" }; for (int i = 0; i < EXCLUDE_FROM_CRAWLING_DEFAULT.length; i++ ) { String pattern = EXCLUDE_FROM_CRAWLING_DEFAULT[i]; try { excludeCrawlingURL.add( new RE( pattern ) ); } catch (RESyntaxException rese) { getLogger().error( "Cannot create excluding regular-expression for " + pattern, rese ); } } } /** * start crawling */ public void crawl( URL url ) { crawled = new HashSet(); urlsToProcess = new HashSet(); urlsToProcess.add( url ); } public Iterator iterator() { return new CocoonCrawlerIterator(this); } public static class 
CocoonCrawlerIterator implements Iterator { private SimpleCocoonCrawlerImpl cocoonCrawler; CocoonCrawlerIterator( SimpleCocoonCrawlerImpl cocoonCrawler ) { this.cocoonCrawler = cocoonCrawler; } /** * check if crawling is finished. */ public boolean hasNext() { return cocoonCrawler.urlsToProcess.size() > 0; } /** * return the next URL */ public Object next() { URL url = null; Iterator i = cocoonCrawler.urlsToProcess.iterator(); if (i.hasNext()) { url = (URL)i.next(); cocoonCrawler.urlsToProcess.remove( url ); List url_links = cocoonCrawler.getLinks( url ); if (url_links != null) { cocoonCrawler.urlsToProcess.addAll( url_links ); } } return url; } /** * remove is not implemented * @exception UnsupportedOperationException is always thrown */ public void remove() { throw new UnsupportedOperationException( "remove is not implemented" ); } } /** * list of links from the parent url * * @param url * @return List of URLs */ private List getLinks( URL url ) { ArrayList url_links = null; if (!isIncludedURL( url.toString())) { return null; } // don't try to get links for url which is excluded if (isExcludedURL( url.toString())) { return null; } // don't try to get links for url which has been crawled already if (crawled.contains( url.toString() )) { return null; } // mark it as crawled crawled.add( url.toString() ); if (getLogger().isDebugEnabled()) { getLogger().debug( "Get links of URL: " + url.toString() ); } // get links of url try { URL links_url = new URL( url, url.getPath() + linkViewQuery ); URLConnection links_url_connection = links_url.openConnection(); InputStream is = links_url_connection.getInputStream(); BufferedReader br = new BufferedReader( new InputStreamReader( is ) ); String content_type = links_url_connection.getContentType(); if (getLogger().isDebugEnabled()) { getLogger().debug( "Content-type: " + content_type ); } if (content_type.equals( linkContentType )) { url_links = new ArrayList(); // content is supposed to be a list of links, // relative to 
current URL String line; while ((line = br.readLine()) != null) { URL new_url = new URL( url, line ); boolean add_url = true; add_url &= !crawled.contains( new_url.toString() ); add_url &= isIncludedURL( new_url.toString() ); add_url &= !isExcludedURL( new_url.toString() ); if (add_url) { if (getLogger().isDebugEnabled()) { getLogger().debug( "Add URL: " + new_url.toString() ); } url_links.add( new_url ); } } // now we have a list of URL which should be examined } } catch (IOException ioe) { getLogger().warn( "Problems get links of " + url, ioe ); } return url_links; } /** * check if URL is a candidate for indexing */ private boolean isExcludedURL( String url ) { // by default include URL for crawling if (excludeCrawlingURL == null) { return false; } final String s = url.toString(); Iterator i = excludeCrawlingURL.iterator(); while (i.hasNext()) { RE pattern = (RE)i.next(); if (pattern.match( s )) { return true; } } return false; } /** * check if URL is a candidate for indexing */ private boolean isIncludedURL( String url ) { // by default include URL for crawling if (excludeCrawlingURL == null) { return true; } final String s = url.toString(); Iterator i = includeCrawlingURL.iterator(); while (i.hasNext()) { RE pattern = (RE)i.next(); if (pattern.match( s )) { return true; } } return false; } } 1.1 xml-cocoon2/src/org/apache/cocoon/components/lucene/SimpleLuceneCocoonIndexerImpl.java Index: SimpleLuceneCocoonIndexerImpl.java =================================================================== /***************************************************************************** * Copyright (C) The Apache Software Foundation. All rights reserved. * * ------------------------------------------------------------------------- * * This software is published under the terms of the Apache Software License * * version 1.1, a copy of which has been included with this distribution in * * the LICENSE file. 
* *****************************************************************************/
package org.apache.cocoon.components.lucene;

import org.apache.avalon.excalibur.pool.Recyclable;
import org.apache.avalon.framework.activity.Disposable;
import org.apache.avalon.framework.component.ComponentException;
import org.apache.avalon.framework.component.ComponentManager;
import org.apache.avalon.framework.component.Composable;
import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;

import org.apache.cocoon.ProcessingException;

import org.apache.avalon.framework.logger.AbstractLoggable;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.DateField;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.store.Directory;
import org.apache.lucene.index.*;

import java.io.IOException;
import java.util.Iterator;
import java.net.URL;

/**
 * A lucene indexer which crawls from a base url using a
 * <code>CocoonCrawler</code>, builds lucene documents of each crawled url
 * using a <code>LuceneXMLIndexer</code>, and writes them into a lucene
 * index.
 */
public class SimpleLuceneCocoonIndexerImpl extends AbstractLoggable
  implements LuceneCocoonIndexer, Configurable, Composable, Disposable
{
  /** name of the configuration element holding the analyzer classname */
  protected final static String ANALYZER_CLASSNAME_CONFIG = "analyzer-classname";
  protected final static String ANALYZER_CLASSNAME_DEFAULT = "org.apache.lucene.analysis.standard.StandardAnalyzer";
  private String analyzerClassnameDefault = ANALYZER_CLASSNAME_DEFAULT;

  /** name of the configuration element holding the index directory */
  protected final static String DIRECTORY_CONFIG = "directory";
  protected final static String DIRECTORY_DEFAULT = null;
  private String directoryDefault = DIRECTORY_DEFAULT;

  /** name of the configuration element holding the lucene merge factor */
  private final static String MERGE_FACTOR_CONFIG = "merge-factor";
  private final static int MERGE_FACTOR_DEFAULT = 20;
  private int mergeFactor = MERGE_FACTOR_DEFAULT;

  /** The component manager instance */
  protected ComponentManager manager = null;

  /** analyzer handed to the index writer, set via setAnalyzer */
  Analyzer analyzer;

  /**
   * configure the analyzer classname, and the merge factor.
   *
   * @param conf the configuration of this component
   * @exception ConfigurationException if a configured value is invalid
   */
  public void configure( Configuration conf ) throws ConfigurationException {
    Configuration child;
    String value;

    // the values have to be read from the child element which has been
    // looked up, not from the component's own configuration element
    child = conf.getChild( ANALYZER_CLASSNAME_CONFIG, false );
    if (child != null) {
      value = child.getValue( ANALYZER_CLASSNAME_DEFAULT );
      if (value != null) {
        analyzerClassnameDefault = value;
      }
    }

    child = conf.getChild( MERGE_FACTOR_CONFIG, false );
    if (child != null) {
      mergeFactor = child.getValueAsInteger( MERGE_FACTOR_DEFAULT );
    }
  }

  /**
   * Set the current <code>ComponentManager</code> instance used by this
   * <code>Composable</code>.
   */
  public void compose(ComponentManager manager) throws ComponentException {
    this.manager=manager;
  }

  /**
   * nothing to dispose, the components looked up in index() are released
   * there.
   */
  public void dispose() {
  }

  /**
   * set the analyzer used for writing the index.
   *
   * @param analyzer the analyzer passed to the lucene index writer
   */
  public void setAnalyzer( Analyzer analyzer ) {
    this.analyzer = analyzer;
  }

  /**
   * index content of base_url, index content of links from base_url.
   *
   * @param index the lucene store to write the index to
   * @param create iff true create, or overwrite existing index, else
   *   update existing index.
   * @param base_url index content of base_url, and crawl through all its
   *   links recursivly. Crawled urls having a different host, or port
   *   than base_url are skipped.
   * @exception ProcessingException if writing the index fails, or if a
   *   needed component cannot be looked up
   */
  public void index( Directory index, boolean create, URL base_url )
    throws ProcessingException {

    IndexWriter writer = null;
    LuceneXMLIndexer lxi = null;
    CocoonCrawler cocoonCrawler = null;

    try {
      lxi = (LuceneXMLIndexer)manager.lookup( LuceneXMLIndexer.ROLE );

      writer = new IndexWriter( index, analyzer, create );
      writer.mergeFactor = this.mergeFactor;

      cocoonCrawler = (CocoonCrawler)manager.lookup( CocoonCrawler.ROLE );
      cocoonCrawler.crawl( base_url );

      Iterator cocoonCrawlerIterator = cocoonCrawler.iterator();
      while (cocoonCrawlerIterator.hasNext()) {
        URL crawl_url = (URL)cocoonCrawlerIterator.next();

        if (!crawl_url.getHost().equals( base_url.getHost() ) ||
          crawl_url.getPort() != base_url.getPort() ) {
          // skip urls using different host, or port than host,
          // or port of base url
          if (getLogger().isDebugEnabled()) {
            getLogger().debug( "Skipping crawling URL " + crawl_url.toString() +
              " as base_url is " + base_url.toString() );
          }
          continue;
        }

        // build lucene documents from the content of the crawl_url
        lxi.build( crawl_url );
        Iterator i = lxi.iterator();

        // add all built lucene documents
        while (i.hasNext()) {
          Document document = (Document)i.next();
          writer.addDocument( document );
        }
      }
      // optimize it
      writer.optimize();
    } catch (IOException ioe) {
      throw new ProcessingException( "IOException in index()", ioe );
    } catch (ComponentException ce) {
      throw new ProcessingException( "ComponentException in index()", ce );
    } finally {
      if (writer != null) {
        try {
          writer.close();
        } catch (IOException ioe) {
          // do not throw from finally, as this would hide an exception
          // thrown above; but do not lose the failure either
          getLogger().warn( "Cannot close index writer", ioe );
        }
        writer = null;
      }

      if (lxi != null) {
        manager.release( lxi );
        lxi = null;
      }
      if (cocoonCrawler != null) {
        manager.release( cocoonCrawler );
        cocoonCrawler = null;
      }
    }
  }

  /**
   * Helper walking the "uid" terms of an existing index, deleting the
   * documents of the terms visited.
   */
  class DocumentDeletableIterator {
    private IndexReader reader;  // existing index
    private TermEnum uidIter;    // document id iterator

    /**
     * @param directory the lucene store of the existing index
     * @exception IOException if the index cannot be opened
     */
    public DocumentDeletableIterator( Directory directory ) throws IOException {
      reader = IndexReader.open( directory );        // open existing index
      uidIter = reader.terms( new Term("uid", ""));  // init uid iterator
    }

    /**
     * close the uid iterator, and the index reader. Calling close more
     * than once is harmless.
     *
     * @exception IOException if closing fails
     */
    public void close() throws IOException {
      if (uidIter != null) {
        uidIter.close();  // close uid iterator
        uidIter = null;
      }
      if (reader != null) {
        reader.close();   // close existing index
        reader = null;
      }
    }

    /** safety net only, call close() explicitly when done */
    protected void finalize() throws Throwable {
      try {
        close();
      } finally {
        super.finalize();
      }
    }

    /**
     * delete the documents of all remaining uid terms.
     */
    public void deleteAllStaleDocuments() throws IOException {
      while (documentIsDeletable( uidIter.term() )) {
        reader.delete( uidIter.term() );
        uidIter.next();
      }
    }

    /**
     * delete the documents of all uid terms sorting before uid, then step
     * over the term equal to uid, if present.
     *
     * @param uid the uid of the current document
     */
    public void deleteModifiedDocuments( String uid ) throws IOException {
      while (documentHasBeenModified( uidIter.term(), uid )) {
        reader.delete( uidIter.term() );
        uidIter.next();
      }
      if (documentHasNotBeenModified( uidIter.term(), uid )) {
        uidIter.next();
      }
    }

    /** @return true iff term is a term of the "uid" field */
    boolean documentIsDeletable( Term term ) {
      // compare by value, do not depend on field names being interned
      return term != null && "uid".equals( term.field() );
    }

    /** @return true iff term is a uid term sorting before uid */
    boolean documentHasBeenModified( Term term, String uid ) {
      return documentIsDeletable( term ) &&
        term.text().compareTo(uid) < 0;
    }

    /** @return true iff term is a uid term equal to uid */
    boolean documentHasNotBeenModified( Term term, String uid ) {
      return documentIsDeletable( term ) &&
        term.text().compareTo(uid) == 0;
    }
  }
}



1.1
xml-cocoon2/src/org/apache/cocoon/components/lucene/SimpleLuceneCocoonSearcherImpl.java Index: SimpleLuceneCocoonSearcherImpl.java =================================================================== /***************************************************************************** * Copyright (C) The Apache Software Foundation. All rights reserved. * * ------------------------------------------------------------------------- * * This software is published under the terms of the Apache Software License * * version 1.1, a copy of which has been included with this distribution in * * the LICENSE file. * *****************************************************************************/ package org.apache.cocoon.components.lucene; import org.apache.avalon.excalibur.pool.Recyclable; import org.apache.avalon.framework.activity.Disposable; import org.apache.avalon.framework.component.ComponentException; import org.apache.avalon.framework.component.ComponentManager; import org.apache.avalon.framework.component.Composable; import org.apache.avalon.framework.configuration.Configurable; import org.apache.avalon.framework.configuration.Configuration; import org.apache.avalon.framework.configuration.ConfigurationException; import org.apache.cocoon.ProcessingException; import org.apache.avalon.framework.logger.AbstractLoggable; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.DateField; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.store.Directory; import org.apache.lucene.index.*; import org.apache.lucene.search.*; import org.apache.lucene.queryParser.*; import java.io.IOException; import java.util.Iterator; import java.net.URL; /** */ public class SimpleLuceneCocoonSearcherImpl extends AbstractLoggable implements LuceneCocoonSearcher, Configurable, Composable, Disposable, Recyclable { protected final static String ANALYZER_CLASSNAME_CONFIG = "analyzer-classname"; protected final static String 
ANALYZER_CLASSNAME_DEFAULT = "org.apache.lucene.analysis.standard.StandardAnalyzer"; private String analyzerClassnameDefault = ANALYZER_CLASSNAME_DEFAULT; protected final static String DEFAULT_SEARCH_FIELD_CONFIG = "default-search-field"; protected final static String DEFAULT_SEARCH_FIELD_DEFAULT = "body"; private String defaultSearchFieldDefault = DEFAULT_SEARCH_FIELD_DEFAULT; protected final static String DEFAULT_QUERY_CONFIG = "default-query"; protected final static String DEFAULT_QUERY_DEFAULT = null; private String defaultQueryDefault = DEFAULT_QUERY_DEFAULT; protected final static String QUERYPARSER_CLASSNAME_CONFIG = "queryparser-classname"; protected final static String QUERYPARSER_CLASSNAME_DEFAULT = "org.apache.lucene.queryParser.QueryParser"; private String queryparserClassnameDefault = QUERYPARSER_CLASSNAME_DEFAULT; protected final static String DIRECTORY_CONFIG = "directory"; protected final static String DIRECTORY_DEFAULT = null; private String directoryDefault = DIRECTORY_DEFAULT; /** The component manager instance */ protected ComponentManager manager=null; private Analyzer analyzer; private Directory directory; private IndexSearcher indexSearcher; private IndexReaderCache indexReaderCache; /** * configure */ public void configure( Configuration conf ) throws ConfigurationException { Configuration child; String value; child = conf.getChild( ANALYZER_CLASSNAME_CONFIG, false ); if (child != null) { value = conf.getValue( ANALYZER_CLASSNAME_DEFAULT ); if (value != null) { analyzerClassnameDefault = value; } } child = conf.getChild( DEFAULT_SEARCH_FIELD_CONFIG, false ); if (child != null) { value = conf.getValue( DEFAULT_SEARCH_FIELD_DEFAULT ); if (value != null) { defaultSearchFieldDefault = value; } } child = conf.getChild( DEFAULT_QUERY_CONFIG, false ); if (child != null) { value = conf.getValue( DEFAULT_QUERY_DEFAULT ); if (value != null) { defaultQueryDefault = value; } } child = conf.getChild( QUERYPARSER_CLASSNAME_CONFIG, false ); if (child != 
null) { value = conf.getValue( QUERYPARSER_CLASSNAME_DEFAULT ); if (value != null) { queryparserClassnameDefault = value; } } child = conf.getChild( DIRECTORY_CONFIG, false ); if (child != null) { value = conf.getValue( DIRECTORY_DEFAULT ); if (value != null) { directoryDefault = value; } } } /** * Set the current <code>ComponentManager</code> instance used by this * <code>Composable</code>. */ public void compose(ComponentManager manager) throws ComponentException { this.manager=manager; } public void dispose() { releaseIndexSearcher(); releaseIndexReaderCache(); } public void recycle() { releaseIndexSearcher(); } private void releaseIndexSearcher() { if (indexSearcher != null) { try { indexSearcher.close(); } catch (IOException ioe) { // ignore it } indexSearcher = null; } } private void releaseIndexReaderCache() { if (indexReaderCache != null) { indexReaderCache = null; } } /** * set an analyzer, overriding the analyzerClassnameDefault. */ public void setAnalyzer( Analyzer analyzer ) { this.analyzer = analyzer; } public void setDirectory( Directory directory ) { this.directory = directory; indexReaderCache = null; } public Hits search( String query_string, String default_field ) throws ProcessingException { Hits hits = null; try { Query query = QueryParser.parse( query_string, default_field, analyzer); // release index searcher for each new search releaseIndexSearcher(); IndexSearcher indexSearcher = new IndexSearcher( getReader() ); hits = indexSearcher.search( query ); // do not close indexSearcher now, as using hits needs an // opened indexSearcher indexSearcher.close(); } catch (ParseException pe) { throw new ProcessingException( "Cannot parse query " + query_string, pe ); } catch (IOException ioe) { throw new ProcessingException( "Cannot access hits", ioe ); } return hits; } /** * This class should help to minimise usage of IndexReaders. 
* */ static class IndexReaderCache { private Directory directory; private IndexReader indexReader; private long lastModified; IndexReaderCache( Directory directory ) { this.directory = directory; } protected void finalize() throws Throwable { if (indexReader != null) { indexReader.close(); indexReader = null; } } void setIndexReader( IndexReader reader ) throws IOException { if (indexReader != null) { indexReader.close(); } indexReader = reader; lastModified = indexReader.lastModified(this.directory); } IndexReader getIndexReader() { return indexReader; } boolean indexReaderIsValid() throws IOException { return indexReader != null && indexReader.lastModified(this.directory) == lastModified; } } public IndexReader getReader() throws IOException { if (indexReaderCache == null) { indexReaderCache = new IndexReaderCache( directory ); } IndexReader indexReader = null; if (indexReaderCache.indexReaderIsValid()) { indexReader = indexReaderCache.getIndexReader(); } else { indexReader = IndexReader.open( this.directory ); indexReaderCache.setIndexReader( indexReader ); } return indexReader; } } 1.1 xml-cocoon2/src/org/apache/cocoon/components/lucene/SimpleLuceneXMLIndexerImpl.java Index: SimpleLuceneXMLIndexerImpl.java =================================================================== /***************************************************************************** * Copyright (C) The Apache Software Foundation. All rights reserved. * * ------------------------------------------------------------------------- * * This software is published under the terms of the Apache Software License * * version 1.1, a copy of which has been included with this distribution in * * the LICENSE file. 
* *****************************************************************************/
package org.apache.cocoon.components.lucene;

import org.apache.avalon.framework.activity.Disposable;
import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.avalon.framework.component.ComponentException;
import org.apache.avalon.framework.component.ComponentManager;
import org.apache.avalon.framework.component.Composable;
import org.apache.avalon.framework.logger.AbstractLoggable;
import org.apache.avalon.framework.parameters.Parameters;
import org.apache.avalon.framework.thread.ThreadSafe;
import org.apache.avalon.framework.logger.AbstractLoggable;

import org.apache.cocoon.components.parser.Parser;
import org.apache.cocoon.ProcessingException;
import org.apache.cocoon.environment.Source;
import org.apache.cocoon.environment.SourceResolver;

import java.io.*;
import java.util.HashSet;
import java.util.Map;
import java.util.Iterator;
import java.util.List;
import java.util.ArrayList;
import java.net.URL;
import java.net.URLConnection;

import javax.xml.parsers.*;

import org.xml.sax.ContentHandler;
import org.xml.sax.ErrorHandler;
import org.xml.sax.Locator;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;
import org.xml.sax.Attributes;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.DateField;

/**
 * A simple class building lucene documents from xml content.
 */
public class SimpleLuceneXMLIndexerImpl extends AbstractLoggable
  implements LuceneXMLIndexer, Configurable, Composable
{
  /**
   * list of lucene Document objects built by the last call of build,
   * null if nothing has been built
   */
  List documents;

  /**
   * The component manager instance
   */
  protected ComponentManager manager=null;

  /**
   * append this string to the url in order to get the
   * content view of the url
   */
  final String CONTENT_QUERY = "?cocoon-view=content";

  /**
   * set of allowed content types
   */
  final HashSet allowedContentType;

  /**
   * create an indexer accepting the content types text/xml, and
   * text/xhtml.
   */
  public SimpleLuceneXMLIndexerImpl() {
    documents = null;
    allowedContentType = new HashSet();
    allowedContentType.add( "text/xml" );
    allowedContentType.add( "text/xhtml" );
  }

  /**
   * configure, currently there is nothing to configure
   */
  public void configure(Configuration conf) throws ConfigurationException {
  }

  /**
   * Set the current <code>ComponentManager</code> instance used by this
   * <code>Composable</code>.
   */
  public void compose(ComponentManager manager) throws ComponentException {
    this.manager = manager;
  }

  /**
   * return a list of all lucene documents generated by @see build
   *
   * @return List list of lucene Documents, null if the last build did not
   *   produce documents
   */
  public List allDocuments() {
    return documents;
  }

  /**
   * return an iterator of all lucene documents generated by @see build
   *
   * @return Iterator iterator of lucene Documents, an empty iterator if
   *   the last build did not produce documents
   */
  public Iterator iterator() {
    if (documents == null) {
      return new ArrayList().iterator();
    }
    return documents.iterator();
  }

  /**
   * Build lucene documents from a URL
   *
   * <p>The content view of the url is fetched; it is indexed only if its
   * content type, ignoring parameters like the charset, is one of the
   * allowed content types. The documents of a previous build are always
   * discarded, so documents of one url are never returned for another
   * url.</p>
   *
   * @param url the content of this url gets indexed.
   * @exception ProcessingException if the url cannot be read, or parsed
   */
  public void build(URL url) throws ProcessingException {
    // forget the documents of the previous url, else they would be
    // returned again if this url turns out to be not indexable
    documents = null;

    try {
      URL contentURL = new URL(url, url.getPath() + CONTENT_QUERY );
      URLConnection contentURLConnection = contentURL.openConnection();
      String contentType = baseContentType( contentURLConnection.getContentType() );

      if (contentType != null && allowedContentType.contains( contentType )) {
        LuceneIndexContentHandler luceneIndexContentHandler = new LuceneIndexContentHandler();

        indexDocument( contentURLConnection, luceneIndexContentHandler );
        //
        // document is parsed
        //
        Iterator it = luceneIndexContentHandler.iterator();
        while (it.hasNext()) {
          Document d = (Document)it.next();
          d.add(Field.UnIndexed( URL_FIELD, url.toString()));
          // store ... false, index ... true, token ... false
          d.add(new Field( UID_FIELD, uid(contentURLConnection), false, true, false));
        }
        documents = luceneIndexContentHandler.allDocuments();
      }
    } catch (IOException ioe) {
      throw new ProcessingException( "Cannot read URL " + url, ioe );
    }
  }

  /**
   * strip the parameters of a content type header value, ie
   * "text/xml; charset=UTF-8" gives "text/xml".
   *
   * @param contentType the content type, may be null
   * @return the content type without parameters, null if contentType is
   *   null
   */
  private static String baseContentType( String contentType ) {
    if (contentType == null) {
      return null;
    }
    int semicolon = contentType.indexOf( ';' );
    if (semicolon != -1) {
      contentType = contentType.substring( 0, semicolon );
    }
    return contentType.trim();
  }

  /**
   * index input stream producing lucene Documents
   *
   * @param contentURLConnection the xml content which should get indexed.
   * @param luceneIndexContentHandler ContentHandler for generating
   *   a lucene Document from XML content.
   * @exception ProcessingException if the content cannot be read, or
   *   parsed, or if the xml parser cannot be looked up
   */
  private void indexDocument( URLConnection contentURLConnection,
    LuceneIndexContentHandler luceneIndexContentHandler )
    throws ProcessingException {

    InputStream is = null;
    InputSource in = null;
    Parser parser = null;

    try {
      is = contentURLConnection.getInputStream();
      in = new InputSource(is);

      // get an XML parser
      parser = (Parser)this.manager.lookup(Parser.ROLE);
      //reader.setErrorHandler(new CocoonErrorHandler());
      parser.setContentHandler( luceneIndexContentHandler );
      parser.parse(in);
      //
      // document is parsed
      //
    } catch (IOException ioe) {
      throw new ProcessingException( "Cannot read!", ioe );
    } catch (SAXException saxe) {
      throw new ProcessingException("Cannot parse!", saxe);
    } catch (ComponentException ce) {
      throw new ProcessingException( "Cannot lookup xml parser!", ce );
    } finally {
      if (parser != null) this.manager.release(parser);

      // release the connection's stream in any case
      if (is != null) {
        try {
          is.close();
        } catch (IOException ioe) {
          getLogger().debug( "Cannot close content stream", ioe );
        }
      }
    }
  }

  /**
   * return a unique uid of a url connection
   *
   * @param urlConnection the connection to build the uid of
   * @return String unique uid of a urlConnection
   */
  private String uid( URLConnection urlConnection ) {
    // Append path and date into a string in such a way that lexicographic
    // sorting gives the same results as a walk of the file hierarchy.  Thus
    // null (\u0000) is used both to separate directory components and to
    // separate the path from the date.
    return urlConnection.toString().replace('/', '\u0000') +
      "\u0000" +
      DateField.timeToString(urlConnection.getLastModified());
  }
}
---------------------------------------------------------------------- In case of troubles, e-mail: [EMAIL PROTECTED] To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]