ozeigermann 2004/06/29 01:10:58 Added: src/share/org/apache/slide/extractor MSWordExtractor.java MSExcelExtractor.java MSPowerPointExtractor.java PDFExtractor.java lib tm-extractors-0.4.jar PDFBox-0.6.5.jar src/stores/org/apache/slide/index TextContentIndexer.java TextContainsExpression.java TextContainsExpressionFactory.java src/conf/webapp Extractor-Domain.xml Log: Added extractor classes donated by Ryan Rhodes as described in
http://issues.apache.org/bugzilla/show_bug.cgi?id=29842 Revision Changes Path 1.1 jakarta-slide/src/share/org/apache/slide/extractor/MSWordExtractor.java Index: MSWordExtractor.java =================================================================== /* * $Header: /home/cvs/jakarta-slide/src/share/org/apache/slide/extractor/MSWordExtractor.java,v 1.1 2004/06/29 08:10:57 ozeigermann Exp $ * $Revision: 1.1 $ * $Date: 2004/06/29 08:10:57 $ * * ==================================================================== * * Copyright 2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* */ package org.apache.slide.extractor; /** * Author: Ryan Rhodes * Date: Jun 26, 2004 * Time: 12:34:29 AM */ import java.io.*; import org.textmining.text.extraction.WordExtractor; public class MSWordExtractor extends AbstractContentExtractor { public MSWordExtractor(String uri, String contentType) { super(uri, contentType); } public Reader extract(InputStream content) throws ExtractorException { try { WordExtractor extractor = new WordExtractor(); String text = extractor.extractText(content); StringReader reader = new StringReader(text); return reader; } catch(Exception e) { throw new ExtractorException(e.getMessage()); } } public static void main(String[] args) throws Exception { FileInputStream in = new FileInputStream(args[0]); MSWordExtractor ex = new MSWordExtractor(null, null); Reader reader = ex.extract(in); int c; do { c = reader.read(); System.out.print((char)c); } while( c != -1 ); } } 1.1 jakarta-slide/src/share/org/apache/slide/extractor/MSExcelExtractor.java Index: MSExcelExtractor.java =================================================================== /* * $Header: /home/cvs/jakarta-slide/src/share/org/apache/slide/extractor/MSExcelExtractor.java,v 1.1 2004/06/29 08:10:57 ozeigermann Exp $ * $Revision: 1.1 $ * $Date: 2004/06/29 08:10:57 $ * * ==================================================================== * * Copyright 2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* */ package org.apache.slide.extractor; /** * Author: Ryan Rhodes * Date: Jun 26, 2004 * Time: 1:53:31 AM */ import java.io.*; import java.util.Iterator; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.hssf.usermodel.HSSFSheet; import org.apache.poi.hssf.usermodel.HSSFRow; import org.apache.poi.hssf.usermodel.HSSFCell; public class MSExcelExtractor extends AbstractContentExtractor { public MSExcelExtractor(String uri, String contentType) { super(uri, contentType); } public Reader extract(InputStream content) throws ExtractorException { try { CharArrayWriter writer = new CharArrayWriter(); POIFSFileSystem fs = new POIFSFileSystem(content); HSSFWorkbook workbook = new HSSFWorkbook(fs); for (int i = 0; i < workbook.getNumberOfSheets(); i++ ) { HSSFSheet sheet = workbook.getSheetAt(i); Iterator rows = sheet.rowIterator(); while( rows.hasNext() ) { HSSFRow row = (HSSFRow) rows.next(); Iterator cells = row.cellIterator(); while( cells.hasNext() ) { HSSFCell cell = (HSSFCell) cells.next(); switch ( cell.getCellType() ) { case HSSFCell.CELL_TYPE_NUMERIC: String num = Double.toString(cell.getNumericCellValue()).trim(); if(num.length() > 0) writer.write(num + " "); break; case HSSFCell.CELL_TYPE_STRING: String text = cell.getStringCellValue().trim(); if(text.length() > 0) writer.write(text + " "); break; } } } } return new CharArrayReader(writer.toCharArray()); } catch(Exception e ) { throw new ExtractorException(e.getMessage()); } } public static void main(String[] args) throws Exception { FileInputStream in = new FileInputStream(args[0]); MSExcelExtractor ex = new MSExcelExtractor(null, null); Reader reader = ex.extract(in); int c = 0; do { c = reader.read(); System.out.print((char)c); } while(c != -1); } } 1.1 jakarta-slide/src/share/org/apache/slide/extractor/MSPowerPointExtractor.java Index: MSPowerPointExtractor.java =================================================================== /* * 
$Header: /home/cvs/jakarta-slide/src/share/org/apache/slide/extractor/MSPowerPointExtractor.java,v 1.1 2004/06/29 08:10:57 ozeigermann Exp $ * $Revision: 1.1 $ * $Date: 2004/06/29 08:10:57 $ * * ==================================================================== * * Copyright 2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package org.apache.slide.extractor; import org.apache.poi.util.LittleEndian; import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener; import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent; import org.apache.poi.poifs.eventfilesystem.POIFSReader; import org.apache.poi.poifs.filesystem.DocumentInputStream; import java.io.*; /** * Author: Ryan Rhodes * Date: Jun 27, 2004 * Time: 3:45:39 AM */ public class MSPowerPointExtractor extends AbstractContentExtractor implements POIFSReaderListener { private ByteArrayOutputStream writer = new ByteArrayOutputStream(); public MSPowerPointExtractor(String uri, String contentType) { super(uri, contentType); } public Reader extract(InputStream content) throws ExtractorException { try { POIFSReader reader = new POIFSReader(); reader.registerListener(this); reader.read(content); return new InputStreamReader(new ByteArrayInputStream(writer.toByteArray())); } catch(Exception e) { throw new ExtractorException(e.getMessage()); } } public void processPOIFSReaderEvent(POIFSReaderEvent event) { try{ if(!event.getName().equalsIgnoreCase("PowerPoint Document")) return; 
DocumentInputStream input = event.getStream(); byte[] buffer = new byte[input.available()]; input.read(buffer, 0, input.available()); for(int i=0; i<buffer.length-20; i++) { long type = LittleEndian.getUShort(buffer,i+2); long size = LittleEndian.getUInt(buffer,i+4); if(type==4008) { writer.write(buffer, i + 4 + 1, (int) size); i = i + 4 + 1 + (int) size - 1; } } } catch (Exception e) { } } public static void main(String[] args) throws Exception { FileInputStream in = new FileInputStream(args[0]); MSPowerPointExtractor ex = new MSPowerPointExtractor(null, null); Reader reader = ex.extract(in); int c; do { c = reader.read(); System.out.print((char)c); } while( c != -1 ); } } 1.1 jakarta-slide/src/share/org/apache/slide/extractor/PDFExtractor.java Index: PDFExtractor.java =================================================================== /* * $Header: /home/cvs/jakarta-slide/src/share/org/apache/slide/extractor/PDFExtractor.java,v 1.1 2004/06/29 08:10:57 ozeigermann Exp $ * $Revision: 1.1 $ * $Date: 2004/06/29 08:10:57 $ * * ==================================================================== * * Copyright 2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* */ package org.apache.slide.extractor; import org.pdfbox.util.PDFTextStripper; import org.pdfbox.pdfparser.PDFParser; import org.pdfbox.pdmodel.PDDocument; import java.io.*; /** * Author: Ryan Rhodes * Date: Jun 26, 2004 * Time: 4:03:00 AM */ public class PDFExtractor extends AbstractContentExtractor { public PDFExtractor(String uri, String contentType) { super(uri, contentType); } public Reader extract(InputStream content) throws ExtractorException { try { PDFParser parser = new PDFParser( content ); parser.parse(); PDDocument document = parser.getPDDocument(); CharArrayWriter writer = new CharArrayWriter(); PDFTextStripper stripper = new PDFTextStripper(); stripper.setLineSeparator("\n"); stripper.writeText(document, writer); document.close(); writer.close(); return new CharArrayReader(writer.toCharArray()); } catch(Exception e ) { throw new ExtractorException(e.getMessage()); } } public static void main(String[] args) throws Exception { FileInputStream in = new FileInputStream(args[0]); PDFExtractor ex = new PDFExtractor(null, null); Reader reader = ex.extract(in); int c = 0; do { c = reader.read(); System.out.print((char)c); } while(c != -1); } } 1.1 jakarta-slide/lib/tm-extractors-0.4.jar <<Binary file>> 1.1 jakarta-slide/lib/PDFBox-0.6.5.jar <<Binary file>> 1.1 jakarta-slide/src/stores/org/apache/slide/index/TextContentIndexer.java Index: TextContentIndexer.java =================================================================== /* * $Header: /home/cvs/jakarta-slide/src/stores/org/apache/slide/index/TextContentIndexer.java,v 1.1 2004/06/29 08:10:58 ozeigermann Exp $ * $Revision: 1.1 $ * $Date: 2004/06/29 08:10:58 $ * * ==================================================================== * * Copyright 2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package org.apache.slide.index; import org.apache.slide.search.IndexException; import org.apache.slide.search.basic.IBasicExpressionFactory; import org.apache.slide.util.logger.Logger; import org.apache.slide.common.*; import org.apache.slide.content.NodeRevisionNumber; import org.apache.slide.content.NodeRevisionDescriptor; import org.apache.slide.content.NodeRevisionContent; import org.apache.slide.store.IndexStore; import org.apache.slide.extractor.ExtractorManager; import org.apache.slide.extractor.ExtractorException; import org.apache.slide.extractor.ContentExtractor; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import java.io.IOException; import java.io.CharArrayReader; import java.io.ByteArrayInputStream; import java.io.Reader; import java.util.Hashtable; /** * Author: Ryan Rhodes * Date: Jun 24, 2004 * Time: 10:34:45 PM */ public class TextContentIndexer extends AbstractService implements IndexStore { private static final String INDEX_PATH = "indexpath"; public static final String URI_FIELD = "uri"; public static final String CONTENT_TEXT = "content"; private String indexpath = ""; private boolean started = false; /** * Create Index, if not yet done. 
* * @param token a NamespaceAccessToken * * @throws org.apache.slide.common.ServiceInitializationFailedException * */ public void initialize(NamespaceAccessToken token) throws ServiceInitializationFailedException { IndexWriter indexWriter = null; try { indexWriter = new IndexWriter(indexpath, new StandardAnalyzer(), false); } // will fail, if not yet exists catch (IOException e) { try { // create index indexWriter = new IndexWriter(indexpath, new StandardAnalyzer(), true); } catch (IOException ex) { Domain.log("Error while initializing the Lucene index " + e.getMessage(), LOG_CHANNEL, Logger.ERROR); throw new ServiceInitializationFailedException(this, ex); } } try { indexWriter.close(); } catch (IOException e) { Domain.log("Error while initializing the Lucene index " + e.getMessage(), LOG_CHANNEL, Logger.ERROR); throw new ServiceInitializationFailedException (this, e); } Domain.log("Lucene is correctly initialized", LOG_CHANNEL, Logger.INFO); } /** * Index an object content. * * @param uri Uri * @exception IndexException Error accessing the Data Source */ synchronized public void createIndex (Uri uri, NodeRevisionDescriptor revisionDescriptor, NodeRevisionContent revisionContent) throws IndexException { IndexWriter indexWriter = null; try { indexWriter = new IndexWriter(indexpath, new StandardAnalyzer(), false); // Create document Document doc = new Document(); doc.add(Field.UnIndexed(URI_FIELD, uri.toString())); doc.add(Field.Text(CONTENT_TEXT, new CharArrayReader (revisionContent.getContent()))); if ( revisionContent != null && revisionDescriptor != null ) { ContentExtractor[] extractor = ExtractorManager.getInstance().getContentExtractors(null, revisionDescriptor); for ( int i = 0; i < extractor.length; i++ ) { Reader reader = extractor[i].extract(new ByteArrayInputStream(revisionContent.getContentBytes())); doc.add(Field.Text(CONTENT_TEXT, reader)); } } indexWriter.addDocument(doc); indexWriter.optimize(); Domain.log( "Added '" + uri.toString() + " - " + 
revisionDescriptor.getRevisionNumber().toString() + "' to index", LOG_CHANNEL, Logger.INFO); } catch (IOException e) { Domain.log( "Error creating an index with " + uri.toString() + " - " + revisionDescriptor.getRevisionNumber(), LOG_CHANNEL, Logger.ERROR); } catch( ExtractorException e) { Domain.log( "Error extracting content from " + uri.toString() + " - " + revisionDescriptor.getRevisionNumber(), LOG_CHANNEL, Logger.ERROR); } finally { try { if(indexWriter != null) indexWriter.close(); } catch(IOException ioe ) {} } } /** * Method updateIndex * * @param uri an Uri * @param revisionDescriptor a NodeRevisionDescriptor * @param revisionContent a NodeRevisionContent * * @throws IndexException * */ synchronized public void updateIndex(Uri uri, NodeRevisionDescriptor revisionDescriptor, NodeRevisionContent revisionContent) throws IndexException { IndexWriter indexWriter = null; try { // Delete entries from index IndexReader indexReader = IndexReader.open(indexpath); Term term = new Term(URI_FIELD, uri.toString()); indexReader.delete(term); indexReader.close(); indexWriter = new IndexWriter(indexpath, new StandardAnalyzer(), false); // Create document Document doc = new Document(); doc.add(Field.UnIndexed(URI_FIELD, uri.toString())); doc.add(Field.Text(CONTENT_TEXT, new CharArrayReader (revisionContent.getContent()))); if ( revisionContent != null && revisionDescriptor != null ) { ContentExtractor[] extractor = ExtractorManager.getInstance().getContentExtractors(null, revisionDescriptor); for ( int i = 0; i < extractor.length; i++ ) { Reader reader = extractor[i].extract(new ByteArrayInputStream(revisionContent.getContentBytes())); doc.add(Field.Text(CONTENT_TEXT, reader)); } } indexWriter.addDocument(doc); indexWriter.optimize(); Domain.log( "Updated '" + uri.toString() + " - " + revisionDescriptor.getRevisionNumber().toString() + "' to index", LOG_CHANNEL, Logger.INFO); } catch (IOException e) { Domain.log( "Error updating the index with " + uri.toString() + " - " + 
revisionDescriptor.getRevisionNumber(), LOG_CHANNEL, Logger.ERROR); } catch( ExtractorException e) { Domain.log( "Error extracting content from " + uri.toString() + " - " + revisionDescriptor.getRevisionNumber(), LOG_CHANNEL, Logger.ERROR); } finally { try { if(indexWriter != null) indexWriter.close(); } catch(IOException ioe ) {} } } /** * Drop an object revision from the index. * * @param uri Uri * @exception IndexException */ synchronized public void dropIndex(Uri uri, NodeRevisionNumber number) throws IndexException { IndexWriter indexWriter = null; try { IndexReader indexReader = IndexReader.open(indexpath); Term term = new Term(URI_FIELD, uri.toString()); indexReader.delete(term); indexReader.close(); indexWriter = new IndexWriter(indexpath, new StandardAnalyzer(), false); indexWriter.optimize(); Domain.log( "Deleted '" + uri.toString() + "' from the index", LOG_CHANNEL, Logger.INFO); } catch (IOException e) { Domain.log("Impossible to delete " + uri.toString() + " - " + number.toString() + " from the Lucene index"); } finally { try { if(indexWriter != null) indexWriter.close(); } catch(IOException ioe ) {} } } /** * Method getFactory * * @return an IBasicExpressionFactory * */ public IBasicExpressionFactory getBasicExpressionFactory() { return new TextContainsExpressionFactory(indexpath); } /** * Connects to the underlying data source (if any is needed). * * @exception ServiceConnectionFailedException Connection failed */ public void connect() throws ServiceConnectionFailedException { Domain.log( "TextContentIndexer: connect", LOG_CHANNEL, Logger.INFO); started = true; } /** * This function tells whether or not the service is connected. * * @return boolean true if we are connected * @exception ServiceAccessException Service access error */ public boolean isConnected() throws ServiceAccessException { return started; } /** * Initializes the service with a set of parameters. 
Those could be : * <li>User name, login info * <li>Host name on which to connect * <li>Remote port * <li>JDBC driver whoich is to be used :-) * <li>Anything else ... * * @param parameters Hashtable containing the parameters' names * and associated values * @exception ServiceParameterErrorException Incorrect service parameter * @exception ServiceParameterMissingException Service parameter missing */ public void setParameters (Hashtable parameters) throws ServiceParameterErrorException, ServiceParameterMissingException { indexpath = (String)parameters.get (INDEX_PATH); if (indexpath == null || indexpath.length() == 0) throw new ServiceParameterMissingException (this, INDEX_PATH); } /** * Disconnects from the underlying data source. * * @exception ServiceDisconnectionFailedException Disconnection failed */ public void disconnect() throws ServiceDisconnectionFailedException { Domain.log( "TextContentIndexer: disconnect", LOG_CHANNEL, Logger.INFO); started = false; } /** * Deletes service underlying data source, if possible (and meaningful). * * @exception ServiceResetFailedException Reset failed */ public void reset() throws ServiceResetFailedException { Domain.log( "TextContentIndexer: reset", LOG_CHANNEL, Logger.INFO); } } 1.1 jakarta-slide/src/stores/org/apache/slide/index/TextContainsExpression.java Index: TextContainsExpression.java =================================================================== /* * $Header: /home/cvs/jakarta-slide/src/stores/org/apache/slide/index/TextContainsExpression.java,v 1.1 2004/06/29 08:10:58 ozeigermann Exp $ * $Revision: 1.1 $ * $Date: 2004/06/29 08:10:58 $ * * ==================================================================== * * Copyright 2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
* You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * */ package org.apache.slide.index; import org.apache.slide.search.basic.*; import org.apache.slide.search.BadQueryException; import org.apache.slide.search.SearchException; import org.apache.slide.search.RequestedResource; import org.apache.slide.structure.ObjectNode; import org.apache.slide.structure.SubjectNode; import org.apache.slide.common.SlideException; import org.apache.slide.common.Domain; import org.apache.slide.util.logger.Logger; import org.apache.lucene.search.Searcher; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.Hits; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.document.Document; import java.util.Collection; /** * Author: Ryan Rhodes * Date: Jun 24, 2004 * Time: 11:45:30 PM */ public class TextContainsExpression implements IBasicExpression { protected static final String LOG_CHANNEL = TextContainsExpression.class.getName(); String searchedText; String indexPath; /** backptr to the factory */ IBasicExpressionFactory factory; /** * constructor for a compare expression like gt, eq, ... * For your concrete implementation you are free, which parameters have to * be passed, let the factory give you everything you need. 
*/ TextContainsExpression (String searchedText, String rootPath) { this.searchedText = searchedText; this.indexPath = rootPath; } /** * constructor for a merge expression */ TextContainsExpression (String mergeOperator, Collection children, IBasicExpressionFactory factory) throws BadQueryException { // this.factory = factory; // Iterator it = children.iterator(); // BasicExpressionTxtContainsSample firstChild = (BasicExpressionTxtContainsSample)it.next(); // // if (firstChild == null) // throw new BadQueryException (mergeOperator + " needs at least one nested element"); // // theExecutableCommand = firstChild.theExecutableCommand; // // // create the executable command // while (it.hasNext()) { // BasicExpressionTxtContainsSample exp = (BasicExpressionTxtContainsSample)it.next(); // theExecutableCommand += " " + mergeOperator + " " + exp.theExecutableCommand; // } } /** * Search the index for this expression using Lucene. * * @return an IBasicResultSet * * @throws org.apache.slide.search.SearchException * */ public IBasicResultSet execute() throws SearchException { IBasicResultSet result = new BasicResultSetImpl (false); try { Searcher searcher = new IndexSearcher(indexPath); Analyzer analyzer = new StandardAnalyzer(); Query query = QueryParser.parse(searchedText, TextContentIndexer.CONTENT_TEXT, analyzer); Hits hits = searcher.search (query); int noOfHits = hits.length(); for (int i = 0; i < noOfHits; i++) { Document doc = hits.doc(i); String uri = doc.get(TextContentIndexer.URI_FIELD); RequestedResource resource = createResource(uri); result.add (resource); } } catch (Exception e) { throw new SearchException (e); } Domain.log( "Executed Search for '" + searchedText + "' in the index", LOG_CHANNEL, Logger.INFO); return result; } private RequestedResource createResource(String uri) throws SearchException { ObjectNode node = new SubjectNode(uri); // this will return the root folder RequestedResource resource = null; IBasicQuery query = factory.getQuery(); try { 
resource = new ComparableResourceImpl (node, query.getSearchToken(), query.getScope(), factory.getPropertyProvider()); } catch (SlideException e) { throw new SearchException (e); } return resource; } public void setFactory (IBasicExpressionFactory factory) { this.factory = factory; } public IBasicExpressionFactory getFactory() { return this.factory; } } 1.1 jakarta-slide/src/stores/org/apache/slide/index/TextContainsExpressionFactory.java Index: TextContainsExpressionFactory.java =================================================================== /* * $Header: /home/cvs/jakarta-slide/src/stores/org/apache/slide/index/TextContainsExpressionFactory.java,v 1.1 2004/06/29 08:10:58 ozeigermann Exp $ * $Revision: 1.1 $ * $Date: 2004/06/29 08:10:58 $ * * ==================================================================== * * Copyright 2004 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. 
* */
package org.apache.slide.index;

import org.apache.slide.search.basic.IBasicExpressionFactory;
import org.apache.slide.search.basic.IBasicQuery;
import org.apache.slide.search.basic.IBasicExpression;
import org.apache.slide.search.PropertyProvider;
import org.apache.slide.search.BadQueryException;
import org.apache.slide.content.NodeProperty;
import org.jdom.Element;

import java.util.Collection;

/**
 * Author: Ryan Rhodes
 * Date: Jun 24, 2004
 * Time: 11:42:35 PM
 *
 * Expression factory that turns a "contains" element of the default namespace
 * into a {@link TextContainsExpression}.
 */
public class TextContainsExpressionFactory implements IBasicExpressionFactory
{
    private IBasicQuery query;
    protected PropertyProvider propertyProvider;

    private String rootPath;

    /**
     * Constructor
     *
     * @param   rootPath           path handed to every created expression as
     *                             the location of the index to search
     */
    public TextContainsExpressionFactory(String rootPath)
    {
        this.rootPath = rootPath;
    }

    /**
     * called for merge expressions (or, and). Not defined here
     *
     * @param    mergeOperator       and, or
     * @param    namespace           the namespace of this expression
     * @param    expressionsToMerge  all expressions, that shall be merged
     *
     * @return   always null
     *
     * @throws   BadQueryException
     */
    public IBasicExpression createMergeExpression(String mergeOperator,
                                                  String namespace,
                                                  Collection expressionsToMerge)
        throws BadQueryException
    {
        return null;
    }

    /**
     * Called by the expression compiler for each leaf expression.
     *
     * @param    element             an Element describing the expression
     *
     * @return   the expression for a "contains" element of the default
     *           namespace, or null if the element is not handled by this factory
     *
     * @throws   BadQueryException   if element is null
     */
    public IBasicExpression createExpression(Element element)
        throws BadQueryException
    {
        if (element == null) {
            throw new BadQueryException("expected a where criteria");
        }

        TextContainsExpression result = null;
        String namespace = element.getNamespace().getURI();
        if (namespace.equals(NodeProperty.NamespaceCache.DEFAULT_URI)) {
            result = createDAVExpression(element);
        }
        // allow store specific extensions
        //  else if (namespace.equals (MyNamespace))
        //      result = createMyExpression (element);

        // Unhandled elements yield null, as createMergeExpression does,
        // instead of failing with a NullPointerException.
        if (result != null) {
            result.setFactory(this);
        }
        return result;
    }

    /**
     * Called, when the expression is in the default (DAV:) namespace.
     *
     * @param    e                   an Element
     *
     * @return   a TextContainsExpression for a "contains" element searching
     *           for the element's trimmed text, null for any other element
     */
    private TextContainsExpression createDAVExpression(Element e)
    {
        if (!e.getName().equals("contains")) {
            return null;
        }
        return new TextContainsExpression(e.getTextTrim(), rootPath);
    }

    /**
     * called by BasicExpressionCompiler after construction.
     *
     * @param    query               the associated BasicQuery
     * @param    propertyProvider    the PropertyProvider for this expression.
     *
     * @throws   BadQueryException
     */
    public void init(IBasicQuery query, PropertyProvider propertyProvider)
        throws BadQueryException
    {
        this.query = query;
        this.propertyProvider = propertyProvider;
    }

    /**
     * Method getPropertyProvider
     *
     * @return   the PropertyProvider
     */
    public PropertyProvider getPropertyProvider()
    {
        return propertyProvider;
    }

    /**
     * Method getQuery
     *
     * @return   the IBasicQuery
     */
    public IBasicQuery getQuery()
    {
        return query;
    }
}

1.1 jakarta-slide/src/conf/webapp/Extractor-Domain.xml Index: Extractor-Domain.xml =================================================================== <?xml version="1.0"?> <slide logger-level="6" default="slide"> <namespace name="slide"> <definition> <store name="tx"> <parameter name="tlock-timeout">20</parameter> <nodestore classname="org.apache.slide.store.txfile.TxXMLFileDescriptorsStore"> <parameter name="rootpath">store/metadata</parameter> <parameter name="workpath">work/metadata</parameter> <parameter name="defer-saving">true</parameter> </nodestore> <sequencestore classname="org.apache.slide.store.txfile.FileSequenceStore"> <parameter name="rootpath">store/sequence</parameter> </sequencestore> <securitystore> <reference store="nodestore"/> </securitystore> <lockstore> <reference store="nodestore"/> </lockstore> <revisiondescriptorsstore> <reference store="nodestore"/> </revisiondescriptorsstore> <revisiondescriptorstore> <reference store="nodestore"/> </revisiondescriptorstore> <contentstore classname="org.apache.slide.store.txfile.TxFileContentStore"> <parameter name="rootpath">store/content</parameter> <parameter name="workpath">work/content</parameter> <parameter name="defer-saving">true</parameter> </contentstore> <contentindexer classname="org.apache.slide.index.TextContentIndexer"> <parameter name="indexpath">store/index</parameter> </contentindexer> </store> <scope match="/" store="tx"/> </definition> <configuration> <!-- Actions mapping --> <read-object>/actions/read</read-object> 
<create-object>/actions/write</create-object> <remove-object>/actions/write</remove-object> <grant-permission>/actions/write-acl</grant-permission> <revoke-permission>/actions/write-acl</revoke-permission> <read-permissions>/actions/read-acl</read-permissions> <read-own-permissions>/actions/read-current-user-privilege-set</read-own-permissions> <lock-object>/actions/write</lock-object> <kill-lock>/actions/unlock</kill-lock> <read-locks>/actions/read</read-locks> <read-revision-metadata>/actions/read</read-revision-metadata> <create-revision-metadata>/actions/write-properties</create-revision-metadata> <modify-revision-metadata>/actions/write-properties</modify-revision-metadata> <remove-revision-metadata>/actions/write-properties</remove-revision-metadata> <read-revision-content>/actions/read</read-revision-content> <create-revision-content>/actions/write-content</create-revision-content> <modify-revision-content>/actions/write-content</modify-revision-content> <remove-revision-content>/actions/write-content</remove-revision-content> <bind-member>/actions/bind</bind-member> <unbind-member>/actions/unbind</unbind-member> <!-- Paths configuration --> <userspath>/users</userspath> <rolespath>/roles</rolespath> <actionspath>/actions</actionspath> <filespath>/files</filespath> <parameter name="dav">true</parameter> <parameter name="standalone">true</parameter> <parameter name="acl_inheritance_type">path</parameter> <!-- Nested roles: 0 means no nesting (default), 1 means one sublevel, etc. 
--> <parameter name="nested_roles_maxdepth">0</parameter> </configuration> <data> <objectnode classname="org.apache.slide.structure.SubjectNode" uri="/"> <!-- Subject can be: any user "all" authenticated user "authenticated" unauthenticated user "unauthenticated" self "self" owner of resource "owner" a user "/users/john" a role "/roles/admin" --> <permission action="all" subject="/roles/root" inheritable="true"/> <permission action="/actions/read-acl" subject="all" inheritable="true" negative="true"/> <permission action="/actions/write-acl" subject="all" inheritable="true" negative="true"/> <permission action="/actions/unlock" subject="all" inheritable="true" negative="true"/> <permission action="/actions/read" subject="all" inheritable="true"/> <!-- /users --> <objectnode classname="org.apache.slide.structure.SubjectNode" uri="/users"> <permission action="all" subject="self" inheritable="true"/> <permission action="all" subject="unauthenticated" inheritable="true" negative="true"/> <!-- /users/root represents the administrator --> <objectnode classname="org.apache.slide.structure.SubjectNode" uri="/users/root"> <revision> <property namespace="http://jakarta.apache.org/slide/" name="password">root</property> </revision> </objectnode> <!-- /users/john and /users/john2 represent authenticated users --> <objectnode classname="org.apache.slide.structure.SubjectNode" uri="/users/john"> <revision> <property namespace="http://jakarta.apache.org/slide/" name="password">john</property> </revision> </objectnode> <objectnode classname="org.apache.slide.structure.SubjectNode" uri="/users/john2"> <revision> <property namespace="http://jakarta.apache.org/slide/" name="password">john2</property> </revision> </objectnode> <!-- /users/guest represents an authenticated or unauthenticated guest user --> <objectnode classname="org.apache.slide.structure.SubjectNode" uri="/users/guest"> <revision> <property namespace="http://jakarta.apache.org/slide/" name="password">guest</property> 
</revision> </objectnode> </objectnode> <!-- /roles --> <objectnode classname="org.apache.slide.structure.SubjectNode" uri="/roles"> <permission action="all" subject="self" inheritable="true"/> <permission action="all" subject="unauthenticated" inheritable="true" negative="true"/> <objectnode classname="org.apache.slide.structure.SubjectNode" uri="/roles/root"> <revision> <property name="group-member-set"><![CDATA[<D:href xmlns:D='DAV:'>/users/root</D:href>]]></property> </revision> </objectnode> <objectnode classname="org.apache.slide.structure.SubjectNode" uri="/roles/user"> <revision> <property name="group-member-set"><![CDATA[<D:href xmlns:D='DAV:'>/users/john</D:href><D:href xmlns:D='DAV:'>/users/john2</D:href><D:href xmlns:D='DAV:'>/users/root</D:href>]]></property> </revision> </objectnode> <objectnode classname="org.apache.slide.structure.SubjectNode" uri="/roles/guest"> <revision> <property name="group-member-set"><![CDATA[<D:href xmlns:D='DAV:'>/users/guest</D:href>]]></property> </revision> </objectnode> </objectnode> <!-- action --> <objectnode classname="org.apache.slide.structure.ActionNode" uri="/actions"> <objectnode classname="org.apache.slide.structure.ActionNode" uri="/actions/read"> <revision> <property name="privilege-member-set"><![CDATA[<D:href xmlns:D='DAV:'>/actions/read-acl</D:href> <D:href xmlns:D='DAV:'>/actions/read-current-user-privilege-set</D:href>]]></property> </revision> </objectnode> <objectnode classname="org.apache.slide.structure.ActionNode" uri="/actions/read-acl"> <revision> <property name="privilege-member-set"/> </revision> </objectnode> <objectnode classname="org.apache.slide.structure.ActionNode" uri="/actions/read-current-user-privilege-set"> <revision> <property name="privilege-member-set"/> </revision> </objectnode> <objectnode classname="org.apache.slide.structure.ActionNode" uri="/actions/write"> <revision> <property name="privilege-member-set"><![CDATA[<D:href xmlns:D='DAV:'>/actions/write-acl</D:href> <D:href 
xmlns:D='DAV:'>/actions/write-properties</D:href> <D:href xmlns:D='DAV:'>/actions/write-content</D:href>]]></property> </revision> </objectnode> <objectnode classname="org.apache.slide.structure.ActionNode" uri="/actions/write-acl"> <revision> <property name="privilege-member-set"/> </revision> </objectnode> <objectnode classname="org.apache.slide.structure.ActionNode" uri="/actions/write-properties"> <revision> <property name="privilege-member-set"/> </revision> </objectnode> <objectnode classname="org.apache.slide.structure.ActionNode" uri="/actions/write-content"> <revision> <property name="privilege-member-set"><![CDATA[<D:href xmlns:D='DAV:'>/actions/bind</D:href> <D:href xmlns:D='DAV:'>/actions/unbind</D:href>]]></property> </revision> </objectnode> <objectnode classname="org.apache.slide.structure.ActionNode" uri="/actions/bind"> <revision> <property name="privilege-member-set"/> </revision> </objectnode> <objectnode classname="org.apache.slide.structure.ActionNode" uri="/actions/unbind"> <revision> <property name="privilege-member-set"/> </revision> </objectnode> <objectnode classname="org.apache.slide.structure.ActionNode" uri="/actions/unlock"> <revision> <property name="privilege-member-set"/> </revision> </objectnode> </objectnode> <objectnode classname="org.apache.slide.structure.SubjectNode" uri="/files"> <permission action="all" subject="unauthenticated" inheritable="true"/> <permission action="/actions/write" subject="/roles/user" inheritable="true"/> <permission action="/actions/read-acl" subject="owner" inheritable="true"/> </objectnode> <!-- DeltaV: default history and workspace paths --> <objectnode classname="org.apache.slide.structure.SubjectNode" uri="/history"> <permission action="all" subject="unauthenticated" inheritable="true"/> <permission action="/actions/write" subject="/roles/user" inheritable="true"/> <permission action="/actions/read-acl" subject="owner" inheritable="true"/> </objectnode> <objectnode 
classname="org.apache.slide.structure.SubjectNode" uri="/workspace"> <permission action="all" subject="unauthenticated" inheritable="true"/> <permission action="/actions/write" subject="/roles/user" inheritable="true"/> <permission action="/actions/read-acl" subject="owner" inheritable="true"/> </objectnode> <objectnode classname="org.apache.slide.structure.SubjectNode" uri="/workingresource"> <permission action="all" subject="unauthenticated" inheritable="true"/> <permission action="/actions/write" subject="/roles/user" inheritable="true"/> <permission action="/actions/read-acl" subject="owner" inheritable="true"/> </objectnode> </objectnode> </data> </namespace> <!-- DeltaV global parameters ======================== * historypath (mandatory=no, default="/history"): Specifies a Slide path which determines the location where this DeltaV server stores history data. * workspacepath (mandatory=no, default="/workspace"): Specifies a Slide path which determines the location where this DeltaV server allows workspaces to reside. * workingresourcepath (mandatory=no, default="/workingresource"): Specifies a Slide path which determines the location where this DeltaV server stores working resources. * auto-version (mandatory=no, default="checkout-checkin"): Controls the DeltaV auto-version behaviour. * auto-version-control (mandatory=no, default="false"): Indicates if a resource just created by a PUT should be set under version-control. * versioncontrol-exclude (mandatory=no, default=""): Specifies a Slide path which determines resources which are excluded from version-control. The default value "" means no path is excluded. * checkout-fork (mandatory=no, default="forbidden"): Controls the DeltaV check-out behaviour when a version is already checked-out or has a successor. * checkin-fork (mandatory=no, default="forbidden"): Controls the DeltaV check-in behaviour when a version already has a successor. 
* standardLivePropertiesClass (mandatory=no, default="org.apache.slide.webdav.util.resourcekind.AbstractResourceKind"): Determines the "agent" knowing about what the standard live properties are. It should be a loadable class containing the following static methods: - boolean isLiveProperty(String propName) - boolean isProtectedProperty(String propName) - boolean isComputedProperty(String propName) - Set getAllLiveProperties() - Set getAllProtectedProperties() - Set getAllComputedProperties() * uriRedirectorClass (mandatory=no, default="org.apache.slide.webdav.util.DeltavUriRedirector"): Determines the URI redirector class. The DeltaV URI redirector is in charge of the following redirections: - version URI to history URI, e.g. /history/2/1.4 to /history/2 - latest revision number for history resource to 0.0 - latest revision number for version resource to last URI token, e.g. /history/2/1.4 to 1.4 It should be a loadable class containing the following static methods: - String redirectUri(String uri) - NodeRevisionNumber redirectLatestRevisionNumber(String uri) --> <parameter name="historypath">/history</parameter> <parameter name="workspacepath">/workspace</parameter> <parameter name="workingresourcepath">/workingresource</parameter> <parameter name="auto-version">checkout-checkin</parameter> <parameter name="auto-version-control">false</parameter> <parameter name="versioncontrol-exclude"/> <parameter name="checkout-fork">forbidden</parameter> <parameter name="checkin-fork">forbidden</parameter> <!-- Extractor configuration --> <extractors> <extractor classname="org.apache.slide.extractor.SimpleXmlExtractor" uri="/files/articles/test.xml"> <configuration> <instruction property="title" xpath="/article/title/text()" /> <instruction property="summary" xpath="/article/summary/text()" /> </configuration> </extractor> <extractor classname="org.apache.slide.extractor.OfficeExtractor" uri="/files/docs/"> <configuration> <instruction property="author" 
id="SummaryInformation-0-4" /> <instruction property="application" id="SummaryInformation-0-18" /> </configuration> </extractor> <extractor classname="org.apache.slide.extractor.MSWordExtractor" uri="/files/" content-type="application/ms-word"> </extractor> <extractor classname="org.apache.slide.extractor.MSExcelExtractor" uri="/files/" content-type="application/ms-excel"> </extractor> <extractor classname="org.apache.slide.extractor.MSPowerPointExtractor" uri="/files/" content-type="application/vnd.ms-powerpoint"> </extractor> <extractor classname="org.apache.slide.extractor.PDFExtractor" uri="/files/" content-type="application/pdf"> </extractor> </extractors> <!-- Event configuration --> <events> <event classname="org.apache.slide.webdav.event.WebdavEvent" enable="true" /> <event classname="org.apache.slide.event.ContentEvent" enable="true" /> <event classname="org.apache.slide.event.ContentEvent" method="retrieve" enable="false" /> <event classname="org.apache.slide.event.EventCollection" enable="true" /> <event classname="org.apache.slide.event.TransactionEvent" enable="true" /> <event classname="org.apache.slide.event.MacroEvent" enable="true"/> <listener classname="org.apache.slide.util.event.EventLogger" /> <listener classname="org.apache.slide.event.VetoableEventCollector" /> <listener classname="org.apache.slide.event.TransientEventCollector" /> <listener classname="org.apache.slide.webdav.event.NotificationTrigger"> <configuration> <notification protocol="tcp" include-events="false" /> </configuration> </listener> <listener classname="org.apache.slide.extractor.PropertyExtractorTrigger" /> <listener classname="org.apache.slide.search.IndexTrigger"> <configuration> <indexer classname="org.apache.slide.search.LoggingIndexer" synchronous="false" uri="/files/articles" /> </configuration> </listener> <listener classname="org.apache.slide.macro.MacroPropertyUpdater"> <!-- Listener that updates some properties if resources are copied or moved. 
This requires MacroEvents enabled (at least methods copy and move) --> <configuration> <update-displayname>true</update-displayname> <update-owner-on-move>false</update-owner-on-move> <update-owner-on-copy>true</update-owner-on-copy> </configuration> </listener> </events> </slide> --------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]