Here is a new LuceneIndexTransformer (more basic and flexible), based on the old LuceneIndexTransformer.
Example of input source:
<page xmlns:lucene="http://apache.org/cocoon/lucene/1.0">
  <lucene:index create="true"
                analyzer="org.apache.lucene.analysis.standard.StandardAnalyzer"
                directory="d:/indexbase"
                merge-factor="merge-factor">
    <lucene:document>
      <lucene:field name="tile" type="keyword">sqdqsdq</lucene:field>
      <lucene:field name="description" type="text">bla bal blalael balbal</lucene:field>
      <lucene:field name="date" type="date" dateformat="MM/dd/yyyy">10/12/2002</lucene:field>
      (see the Java API class SimpleDateFormat for more explanation about the dateformat attribute; a short sketch of the conversion follows this example)
      <lucene:field name="date" type="unstored">just indexed information (not stored)</lucene:field>
      <lucene:field name="date" type="unindexed">just stored information (not indexed)</lucene:field>
    </lucene:document>
    <lucene:document>
      <lucene:field name="author" type="keyword" boost="2">Mr Author</lucene:field>
      (boost the field for the search; see the Lucene documentation)
      <lucene:field name="langage" type="keyword">french</lucene:field>
    </lucene:document>
  </lucene:index>
  <lucene:delete directory="d:/indexbase">
    <lucene:document field="id" value="1E3RFE"/>
    (delete all documents with the field id = "1E3RFE")
    <lucene:document field="author" value="Mr Author"/>
  </lucene:delete>
</page>
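The dateformat attribute takes a java.text.SimpleDateFormat pattern. As a rough sketch of what happens to the date field from the example above (this just mirrors the TYPE_DATE case in endElement() of the code below, it is not extra functionality):

    SimpleDateFormat df = new SimpleDateFormat("MM/dd/yyyy");
    Document doc = new Document();
    try {
        // the parsed date is encoded with Lucene's DateField and indexed as a keyword
        doc.add(Field.Keyword("date", DateField.dateToString(df.parse("10/12/2002"))));
    } catch (ParseException e) {
        // the transformer wraps this into a SAXException
    }

(SimpleDateFormat and ParseException come from java.text; Document, Field and DateField from org.apache.lucene.document.)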
Example of output source:
<page xmlns:lucene="http://apache.org/cocoon/lucene/1.0">
  <lucene:index nbdocuments="2"/>
  <lucene:delete nbdocuments="1"/>
</page>
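For reference, this is roughly how the transformer could be declared and used in a sitemap. The component name "lucene-index", the match pattern and the source file are only placeholders for this sketch; the three configuration children correspond to the defaults read in configure():

  <map:transformers>
    <map:transformer name="lucene-index"
                     src="org.paris5.cocoon.transformation.LuceneIndexTransformer">
      <analyzer-classname>org.apache.lucene.analysis.standard.StandardAnalyzer</analyzer-classname>
      <directory>index</directory>
      <merge-factor>20</merge-factor>
    </map:transformer>
  </map:transformers>

  <map:match pattern="index">
    <map:generate src="documents-to-index.xml"/>
    <map:transform type="lucene-index"/>
    <map:serialize type="xml"/>
  </map:match>

Note that the directory is resolved relative to Cocoon's work directory (see contextualize() and the new File(workDir, directoryName) call in the code).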
package org.paris5.cocoon.transformation;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.Map;
import java.util.Stack;
import java.util.Date;
import java.text.SimpleDateFormat;

import org.apache.avalon.framework.activity.Disposable;
import org.apache.avalon.framework.component.ComponentException;
import org.apache.avalon.framework.component.ComponentManager;
import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.avalon.framework.context.Context;
import org.apache.avalon.framework.context.ContextException;
import org.apache.avalon.framework.context.Contextualizable;
import org.apache.avalon.framework.parameters.Parameters;
import org.apache.avalon.excalibur.pool.Recyclable;

import org.apache.cocoon.Constants;
import org.apache.cocoon.ProcessingException;
import org.apache.cocoon.caching.CacheableProcessingComponent;
import org.apache.cocoon.components.search.LuceneCocoonHelper;
import org.apache.cocoon.components.search.LuceneXMLIndexer;
import org.apache.cocoon.transformation.AbstractSAXTransformer;
import org.apache.cocoon.environment.SourceResolver;

import org.apache.excalibur.source.SourceValidity;
import org.apache.excalibur.source.impl.validity.NOPValidity;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.DateField;
import org.apache.lucene.store.*;
import org.apache.lucene.index.*;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

import java.text.*;

/**
 * A Lucene index creation transformer.
 *
 * <p><strong>Example of input source:</strong></p>
 * <pre>
 * &lt;page xmlns:lucene="http://apache.org/cocoon/lucene/1.0"&gt;
 *   &lt;lucene:index create="true"
 *                 analyzer="org.apache.lucene.analysis.standard.StandardAnalyzer"
 *                 directory="d:/indexbase"
 *                 merge-factor="merge-factor"&gt;
 *     &lt;lucene:document&gt;
 *       &lt;lucene:field name="tile" type="keyword"&gt;sqdqsdq&lt;/lucene:field&gt;
 *       &lt;lucene:field name="description" type="text"&gt;bla bal blalael balbal&lt;/lucene:field&gt;
 *       &lt;lucene:field name="date" type="date" dateformat="MM/dd/yyyy"&gt;10/12/2002&lt;/lucene:field&gt;
 *       (see the Java API class SimpleDateFormat for more explanation about the dateformat attribute)
 *       &lt;lucene:field name="date" type="unstored"&gt;just indexed information (not stored)&lt;/lucene:field&gt;
 *       &lt;lucene:field name="date" type="unindexed"&gt;just stored information (not indexed)&lt;/lucene:field&gt;
 *     &lt;/lucene:document&gt;
 *     &lt;lucene:document&gt;
 *       &lt;lucene:field name="author" type="keyword" boost="2"&gt;Mr Author&lt;/lucene:field&gt;
 *       (boost the field for the search; see the Lucene documentation)
 *       &lt;lucene:field name="langage" type="keyword"&gt;french&lt;/lucene:field&gt;
 *     &lt;/lucene:document&gt;
 *   &lt;/lucene:index&gt;
 *   &lt;lucene:delete directory="d:/indexbase"&gt;
 *     &lt;lucene:document field="author" value="Mr Author"/&gt;
 *     (delete all documents with the field author = "Mr Author")
 *     &lt;lucene:document field="id" value="1E3RFE"/&gt;
 *   &lt;/lucene:delete&gt;
 * &lt;/page&gt;
 * </pre>
 *
 * <p><strong>Example of output source:</strong></p>
 * <pre>
 * &lt;page xmlns:lucene="http://apache.org/cocoon/lucene/1.0"&gt;
 *   &lt;lucene:index nbdocuments="2"/&gt;
 *   &lt;lucene:delete nbdocuments="1"/&gt;
 * &lt;/page&gt;
 * </pre>
 *
 * @author Nicolas Maisoneuve
 */
public class LuceneIndexTransformer extends AbstractSAXTransformer
    implements Disposable, CacheableProcessingComponent, Recyclable, Configurable, Contextualizable {

    public static final String ANALYZER_CLASSNAME_CONFIG = "analyzer-classname";
    public static final String ANALYZER_CLASSNAME_PARAMETER = "analyzer-classname";
    public static final String DIRECTORY_CONFIG = "directory";
    public static final String DIRECTORY_PARAMETER = "directory";
    public static final String MERGE_FACTOR_CONFIG = "merge-factor";
    public static final String MERGE_FACTOR_PARAMETER = "merge-factor";

    public static final String DIRECTORY_DEFAULT = "index";
    public static final int MERGE_FACTOR_DEFAULT = 20;
    public static final String ANALYZER_CLASSNAME_DEFAULT = "org.apache.lucene.analysis.standard.StandardAnalyzer";

    public static final String LUCENE_URI = "http://apache.org/cocoon/lucene/1.0";

    public static final String LUCENE_QUERY_ELEMENT = "index";
    public static final String LUCENE_QUERY_ANALYZER_ATTRIBUTE = "analyzer";
    public static final String LUCENE_QUERY_DIRECTORY_ATTRIBUTE = "directory";
    public static final String LUCENE_QUERY_CREATE_ATTRIBUTE = "create";
    public static final String LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE = "merge-factor";

    public static final String LUCENE_DELETE_ELEMENT = "delete";

    public static final String DOCUMENT_NAME_ATTRIBUTE = "name";
    public static final String DOCUMENT_VALUE_ATTRIBUTE = "value";

    public static final String LUCENE_DOCUMENT_ELEMENT = "document";
    public static final String LUCENE_DOCUMENT_FIELD_ATTRIBUTE = "field";
    public static final String LUCENE_DOCUMENT_VALUE_ATTRIBUTE = "value";

    public static final String LUCENE_FIELD_ELEMENT = "field";
    public static final String LUCENE_FIELD_NAME_ATTRIBUTE = "name";
    public static final String LUCENE_FIELD_TYPE_ATTRIBUTE = "type";
    public static final String LUCENE_FIELD_DATEFORMAT_ATTRIBUTE = "dateformat";
    public static final String LUCENE_FIELD_BOOST_ATTRIBUTE = "boost";

    public static final int TYPE_KEYWORD = 1;
    public static final int TYPE_TEXT = 2;
    public static final int TYPE_DATE = 3;
    public static final int TYPE_UNSTORED = 4;
    public static final int TYPE_UNINDEXED = 5;

    public static final int ADD_ACTION = 1;
    public static final int DELETE_ACTION = 2;

    // Initialization time variables
    protected ComponentManager manager = null;
    protected File workDir = null;
    protected int nbdocuments;
    protected int action;

    // Declaration time parameters values
    private String analyzerClassnameDefault;
    private String directoryDefault;
    private int mergeFactorDefault;

    // Invocation time parameters values
    private String analyzerClassname;
    private String directory;
    private int mergeFactor;

    // Runtime variables
    private int processing;
    private IndexWriter writer;
    private IndexReader reader;
    private Term term;
    private Document bodyDocument;
    private String fieldname;
    private int fieldtype;
    private float fieldboost;
    private String fieldvalue;
    private SimpleDateFormat df;

    private static String uid(String url) {
        return url.replace('/', '\u0000');
        // + "\u0000" + DateField.timeToString(urlConnection.getLastModified());
    }

    public void configure(Configuration conf) throws ConfigurationException {
        this.analyzerClassnameDefault = conf.getChild(ANALYZER_CLASSNAME_CONFIG)
            .getValue(ANALYZER_CLASSNAME_DEFAULT);
        this.mergeFactorDefault = conf.getChild(MERGE_FACTOR_CONFIG)
            .getValueAsInteger(MERGE_FACTOR_DEFAULT);
        this.directoryDefault = conf.getChild(DIRECTORY_CONFIG)
            .getValue(DIRECTORY_DEFAULT);
    }

    /**
     * Setup the transformer.
     */
    public void setup(SourceResolver resolver, Map objectModel, String src, Parameters parameters)
        throws ProcessingException, SAXException, IOException {
        // We don't need all this stuff
        this.analyzerClassname = parameters.getParameter(ANALYZER_CLASSNAME_PARAMETER, analyzerClassnameDefault);
        this.directory = parameters.getParameter(DIRECTORY_PARAMETER, directoryDefault);
        this.mergeFactor = parameters.getParameterAsInteger(MERGE_FACTOR_PARAMETER, mergeFactorDefault);
    }

    public void compose(ComponentManager manager) throws ComponentException {
        this.manager = manager;
    }

    /**
     * Contextualize this class
     */
    public void contextualize(Context context) throws ContextException {
        this.workDir = (File) context.get(Constants.CONTEXT_WORK_DIR);
    }

    public void recycle() {
        this.processing = 0;
        if (this.writer != null) {
            try {
                this.writer.close();
            } catch (IOException ioe) {
            }
            this.writer = null;
        }
        if (this.reader != null) {
            try {
                this.reader.close();
            } catch (IOException ioe) {
            }
            this.reader = null;
        }
        this.bodyDocument = null;
    }

    public void dispose() {
    }

    /**
     * Generate the unique key.
     * This key must be unique inside the space of this component.
     *
     * @return The generated key
     */
    public Serializable getKey() {
        return "1";
    }

    /**
     * Generate the validity object.
     *
     * @return The generated validity object or <code>null</code> if the
     *         component is currently not cacheable.
     */
    public SourceValidity getValidity() {
        return NOPValidity.SHARED_INSTANCE;
    }

    public void startDocument() throws SAXException {
        super.startDocument();
    }

    public void endDocument() throws SAXException {
        super.endDocument();
    }

    /**
     * Begin the scope of a prefix-URI Namespace mapping.
     *
     * @param prefix The Namespace prefix being declared.
     * @param uri The Namespace URI the prefix is mapped to.
     */
    public void startPrefixMapping(String prefix, String uri) throws SAXException {
        if (processing == 0) {
            super.startPrefixMapping(prefix, uri);
        }
    }

    /**
     * End the scope of a prefix-URI mapping.
     *
     * @param prefix The prefix that was being mapped.
     */
    public void endPrefixMapping(String prefix) throws SAXException {
        if (processing == 0) {
            super.endPrefixMapping(prefix);
        }
    }

    public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
        throws SAXException {
        //System.out.println("START processing: " + processing + " " + localName);
        if (processing == 0) {
            if (LUCENE_URI.equals(namespaceURI)) {
                // INDEX ACTION
                if (LUCENE_QUERY_ELEMENT.equals(localName)) {
                    action = ADD_ACTION;
                    // create base parameter
                    String sCreate = atts.getValue(LUCENE_QUERY_CREATE_ATTRIBUTE);
                    boolean bCreate = sCreate != null
                        && (sCreate.equalsIgnoreCase("yes") || sCreate.equalsIgnoreCase("true"));
                    // analyzer parameter
                    String analyzerClassname = atts.getValue(LUCENE_QUERY_ANALYZER_ATTRIBUTE);
                    if (analyzerClassname == null) {
                        analyzerClassname = ANALYZER_CLASSNAME_DEFAULT;
                    }
                    Analyzer analyzer = LuceneCocoonHelper.getAnalyzer(analyzerClassname);
                    // mergeFactor parameter
                    String sMergeFactor = atts.getValue(LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE);
                    int mergeFactor = this.mergeFactor;
                    if (sMergeFactor != null) {
                        mergeFactor = Integer.parseInt(sMergeFactor);
                    }
                    // directory parameter
                    String directoryName = atts.getValue(LUCENE_QUERY_DIRECTORY_ATTRIBUTE);
                    if (directoryName == null) {
                        directoryName = this.directory;
                    }
                    //System.out.println("QUERY Create=" + bCreate + ", Directory=" + directoryName + ", Analyzer=" + analyzerClassname);
                    try {
                        Directory directory = LuceneCocoonHelper.getDirectory(new File(workDir, directoryName), bCreate);
                        writer = new IndexWriter(directory, analyzer, bCreate);
                        writer.mergeFactor = mergeFactor;
                    } catch (IOException e) {
                        throw new SAXException(e);
                    }
                    processing = 1;
                }
                // DELETE ACTION
                else if (LUCENE_DELETE_ELEMENT.equals(localName)) {
                    action = DELETE_ACTION;
                    // directory parameter
                    String directoryName = atts.getValue(LUCENE_QUERY_DIRECTORY_ATTRIBUTE);
                    if (directoryName == null) {
                        directoryName = this.directory;
                    }
                    try {
                        Directory directory = LuceneCocoonHelper.getDirectory(new File(workDir, directoryName), false);
                        reader = LuceneCocoonHelper.getIndexReader(directory);
                        //System.out.println("DELETE Directory=" + directoryName);
                    } catch (IOException e) {
                        throw new SAXException(e);
                    }
                    processing = 1;
                }
            } else {
                super.startElement(namespaceURI, localName, qName, atts);
            }
        } else if (processing == 1) {
            if (LUCENE_URI.equals(namespaceURI) && LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
                if (action == ADD_ACTION) {
                    this.bodyDocument = new Document();
                }
                if (action == DELETE_ACTION) {
                    this.term = new Term(atts.getValue(LUCENE_DOCUMENT_FIELD_ATTRIBUTE),
                                         atts.getValue(LUCENE_DOCUMENT_VALUE_ATTRIBUTE));
                }
                processing = 2;
            } else {
                throw new SAXException("<lucene:query> element can contain only <lucene:document> elements!");
            }
        } else if (processing == 2) {
            if (LUCENE_URI.equals(namespaceURI) && LUCENE_FIELD_ELEMENT.equals(localName)) {
                this.fieldname = atts.getValue(LUCENE_FIELD_NAME_ATTRIBUTE);
                if (this.fieldname == null || this.fieldname.equals("")) {
                    throw new SAXException("<lucene:field> element must contain a name attribute");
                }
                String fieldtype = atts.getValue(LUCENE_FIELD_TYPE_ATTRIBUTE);
                if (fieldtype == null || fieldtype.equals("")) {
                    throw new SAXException("<lucene:field> element must contain a type attribute");
                }
                if (fieldtype.equals("keyword")) {
                    this.fieldtype = TYPE_KEYWORD;
                } else if (fieldtype.equals("text")) {
                    this.fieldtype = TYPE_TEXT;
                } else if (fieldtype.equals("date")) {
                    this.fieldtype = TYPE_DATE;
                    String pattern = atts.getValue(LUCENE_FIELD_DATEFORMAT_ATTRIBUTE);
                    if (pattern == null || pattern.equals("")) {
                        throw new SAXException("<lucene:field type=\"date\"> element must contain a dateformat attribute");
                    }
                    df = new SimpleDateFormat(pattern);
                } else if (fieldtype.equals("unstored")) {
                    this.fieldtype = TYPE_UNSTORED;
                } else if (fieldtype.equals("unindexed")) {
                    this.fieldtype = TYPE_UNINDEXED;
                }
                String fieldboost = atts.getValue(LUCENE_FIELD_BOOST_ATTRIBUTE);
                if (fieldboost == null) {
                    this.fieldboost = 1.0f;
                } else {
                    this.fieldboost = Float.parseFloat(fieldboost);
                }
                System.out.println("fieldname: " + fieldname + " type: " + fieldtype + " boost: " + fieldboost);
                processing = 3;
            } else {
                throw new SAXException("<lucene:document> element can contain only <lucene:field> elements!");
            }
        }
    }

    public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
        //System.out.println("END: processing: " + processing + " el: " + localName);
        if (processing == 1) {
            if (LUCENE_URI.equals(namespaceURI)) {
                // ADD ACTION
                if (LUCENE_QUERY_ELEMENT.equals(localName)) {
                    // End query processing
                    AttributesImpl attrs = new AttributesImpl();
                    attrs.addAttribute(null, "nbdocuments", "nbdocuments", "CDATA", Integer.toString(nbdocuments));
                    super.startElement(namespaceURI, localName, qName, attrs);
                    super.endElement(namespaceURI, localName, qName);
                    nbdocuments = 0;
                    try {
                        this.writer.optimize();
                        this.writer.close();
                        this.writer = null;
                    } catch (IOException e) {
                        throw new SAXException(e);
                    }
                    this.processing = 0;
                }
                // DELETE ACTION
                else if (LUCENE_DELETE_ELEMENT.equals(localName)) {
                    try {
                        AttributesImpl attrs = new AttributesImpl();
                        attrs.addAttribute(null, "nbdocuments", "nbdocuments", "CDATA", Integer.toString(nbdocuments));
                        super.startElement(namespaceURI, localName, qName, attrs);
                        super.endElement(namespaceURI, localName, qName);
                        nbdocuments = 0;
                        this.reader.close();
                        this.reader = null;
                    } catch (IOException e) {
                        throw new SAXException(e);
                    }
                    this.processing = 0;
                }
            } else {
                if (action == ADD_ACTION) {
                    throw new SAXException("</lucene:" + LUCENE_QUERY_ELEMENT + "> was expected!");
                } else if (action == DELETE_ACTION) {
                    throw new SAXException("</lucene:" + LUCENE_DELETE_ELEMENT + "> was expected!");
                }
            }
        } else if (processing == 2) {
            if (action == ADD_ACTION) {
                try {
                    //System.out.println("DOCUMENT \n " + this.bodyDocument);
                    this.writer.addDocument(this.bodyDocument);
                    nbdocuments++;
                    this.bodyDocument = null;
                } catch (IOException e) {
                    throw new SAXException(e);
                }
                this.processing = 1;
            } else if (action == DELETE_ACTION) {
                try {
                    //System.out.println("term \n " + this.term);
                    nbdocuments += reader.delete(this.term);
                } catch (IOException e) {
                    throw new SAXException(e);
                }
                this.processing = 1;
            }
        } else if (processing == 3) {
            Field f = null;
            // add Field
            switch (fieldtype) {
            case TYPE_KEYWORD:
                f = Field.Keyword(fieldname, fieldvalue);
                break;
            case TYPE_TEXT:
                f = Field.Text(fieldname, fieldvalue);
                break;
            case TYPE_DATE:
                try {
                    f = Field.Keyword(fieldname, DateField.dateToString(df.parse(fieldvalue)));
                } catch (ParseException ex) {
                    throw new SAXException(ex);
                }
                break;
            case TYPE_UNSTORED:
                f = Field.UnStored(fieldname, fieldvalue);
                break;
            case TYPE_UNINDEXED:
                f = Field.UnIndexed(fieldname, fieldvalue);
                break;
            }
            if (fieldboost != 1.0f) {
                f.setBoost(fieldboost);
            }
            bodyDocument.add(f);
            processing = 2;
        } else {
            super.endElement(namespaceURI, localName, qName);
        }
    }

    public void characters(char[] ch, int start, int length) throws SAXException {
        if (processing == 3) {
            this.fieldvalue = new String(ch, start, length);
            //System.out.println("value: " + this.fieldvalue);
        } else {
            super.characters(ch, start, length);
        }
    }
}
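Once the pipeline has run, the resulting index can be checked with plain Lucene (1.x API). A minimal sketch, assuming the index ended up in an "indexbase" directory; adjust the path, since the transformer resolves the directory attribute against Cocoon's work directory:

    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.Hits;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.TermQuery;

    public class CheckIndex {
        public static void main(String[] args) throws Exception {
            IndexSearcher searcher = new IndexSearcher("indexbase");
            // keyword fields are not tokenized, so an exact TermQuery matches them
            Hits hits = searcher.search(new TermQuery(new Term("author", "Mr Author")));
            System.out.println(hits.length() + " document(s) found");
            searcher.close();
        }
    }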
