Author: natalia Date: Fri Aug 10 19:46:58 2007 New Revision: 564823 URL: http://svn.apache.org/viewvc?view=rev&rev=564823 Log: Full text indexer
/*
 * Commit notification context (svn revision 564823), preserved from the
 * original message:
 *
 * Added:
 *   xml/xindice/trunk/java/src/org/apache/xindice/core/indexer/LuceneIndexer.java (with props)
 *   xml/xindice/trunk/java/src/org/apache/xindice/core/query/TextQueryResolver.java (with props)
 * Modified:
 *   xml/xindice/trunk/java/src/org/apache/xindice/core/query/ProcessingException.java
 *
 * Added: xml/xindice/trunk/java/src/org/apache/xindice/core/indexer/LuceneIndexer.java
 * URL: http://svn.apache.org/viewvc/xml/xindice/trunk/java/src/org/apache/xindice/core/indexer/LuceneIndexer.java?view=auto&rev=564823
 * ==============================================================================
 * --- xml/xindice/trunk/java/src/org/apache/xindice/core/indexer/LuceneIndexer.java (added)
 * +++ xml/xindice/trunk/java/src/org/apache/xindice/core/indexer/LuceneIndexer.java Fri Aug 10 19:46:58 2007
 * @@ -0,0 +1,494 @@
 */

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * $Id$
 */

package org.apache.xindice.core.indexer;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.xindice.core.Collection;
import org.apache.xindice.core.DBObject;
import org.apache.xindice.core.DBException;
import org.apache.xindice.core.data.Key;
import org.apache.xindice.core.FaultCodes;
import org.apache.xindice.core.query.CompilationException;
import org.apache.xindice.core.query.ProcessingException;
import org.apache.xindice.util.Configuration;
import org.apache.xindice.util.XindiceException;
import org.apache.xindice.util.StringUtilities;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.Hit;
import org.apache.lucene.search.Query;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.queryParser.ParseException;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.ArrayList;

/**
 * LuceneIndexer is used for maintaining full text indexes. It operates on
 * documents instead of elements and allows searching for documents using
 * native Lucene queries. There can be only one LuceneIndexer per collection,
 * however, it may have more than one IndexPattern.<p>
 *
 * Every IndexPattern corresponds to a Lucene document field. For every Xindice
 * document, the values of all matching elements are indexed in a single Lucene
 * document, which allows searching across the patterns.</p><p>
 *
 * Sample LuceneIndexer configuration:
 * <pre>
 * &lt;index name='fulltext' class='org.apache.xindice.core.indexer.LuceneIndexer'
 *                       analyzer='org.apache.lucene.analysis.SimpleAnalyzer'&gt;
 *   &lt;pattern pattern='meta@title' alias='title'/&gt;
 *   &lt;pattern pattern='description' alias='text'/&gt;
 * &lt;/index&gt;</pre></p><p>
 *
 * To search over this sample index, one could issue a query <code>"title:tutorial
 * AND text:xml"</code>.</p><p>
 *
 * The index is accessed either through a Lucene writer or through a Lucene
 * reader/searcher, never both at once: opening one side closes the other.
 * All switching between the two is done while holding this indexer's
 * monitor.</p><p>
 *
 * For more details about LuceneIndexer configuration please see documentation for
 * {@link #setConfig(org.apache.xindice.util.Configuration)}
 * </p>
 *
 * @author Andy Armstrong
 * @version $Revision$, $Date$
 */
public final class LuceneIndexer implements Indexer, DBObject {

    private static final Log log = LogFactory.getLog(LuceneIndexer.class);

    // Names of configuration attributes / child elements
    private static final String NAME           = "name";
    private static final String PATTERN        = "pattern";
    private static final String ANALYZER       = "analyzer";
    private static final String PATTERN_STRING = "pattern";
    private static final String PATTERN_ALIAS  = "alias";

    /** Name of the Lucene field that stores the Xindice document key. */
    public static final String KEYNAME  = "key";
    public static final String TEXTNAME = "text";

    // Default analyzer to use
    private static final String DEFANALYZER = "org.apache.lucene.analysis.SimpleAnalyzer";
    private static final IndexMatch[] EMPTY_MATCHES = new IndexMatch[0];

    private File          idxFile;
    private IndexWriter   iw;
    private IndexReader   ir;
    private IndexSearcher is;
    private Analyzer      an;

    private Configuration config;
    private Collection    collection;

    private String  name;
    // Maps IndexPattern -> alias (name of the Lucene field)
    private HashMap patterns = new HashMap();

    // Keep a count of changes to the index
    private int docsAdded;
    private int docsDeleted;

    private void setFile(File f) {
        idxFile = f;
    }

    /**
     * @return the index directory
     * @throws IllegalStateException if the indexer has not been configured yet
     */
    private File getFile() {
        if (null == idxFile) {
            throw new IllegalStateException("Not bound to a file");
        }
        return idxFile;
    }

    public String getIndexStyle() {
        return STYLE_FULLTEXT;
    }

    /**
     * Returns this Indexer's patterns. LuceneIndexer may have more than one
     * pattern.
     * @return Indexer's patterns
     */
    public IndexPattern[] getPatterns() {
        return (IndexPattern[]) patterns.keySet().toArray(new IndexPattern[0]);
    }

    /**
     * Configures LuceneIndexer instance.
     * <dl>
     * <dt>index
     * <dd>Top Indexer configuration element. Can have one or more pattern
     * child elements. Its attributes:
     *
     * <ul><li>name - Indexer name. Required.
     * <li>class - Indexer class. Required.
     * org.apache.xindice.core.indexer.LuceneIndexer for full text index.
     * <li>analyzer - Analyzer to use for indexing. Optional,
     * org.apache.lucene.analysis.SimpleAnalyzer by default.</ul>
     *
     * <dl><dt>pattern
     * <dd>Child element. Its attributes:
     * <ul><li>pattern - IndexPattern. Required. For acceptable formats, see
     * {@link org.apache.xindice.core.indexer.Indexer#getPatterns()}
     * <li>alias - Name of the field to store/search values for that pattern.
     * Required.</ul></dl>
     * </dl>
     *
     * @param config Configuration to apply
     * @throws XindiceException Configuration does not have required information,
     *         Analyzer could not have been instantiated.
     */
    public void setConfig(Configuration config) throws XindiceException {
        this.config = config;
        try {
            name = config.getAttribute(NAME);
            if (StringUtilities.isBlank(name)) {
                // The name doubles as the index directory name below. A blank
                // name would make the index directory the collection root.
                throw new CannotCreateException("Configuration must have a name");
            }

            String analyzer = config.getAttribute(ANALYZER);
            String anc = StringUtilities.isBlank(analyzer) ? DEFANALYZER : analyzer;
            Class c = Class.forName(anc);
            an = (Analyzer) c.newInstance();

            Configuration[] patterns = config.getChildren(PATTERN);
            if (patterns.length == 0) {
                throw new CannotCreateException("Configuration must have at least one pattern");
            }

            for (int i = 0; i < patterns.length; i++) {
                String pattern = patterns[i].getAttribute(PATTERN_STRING);
                String alias = patterns[i].getAttribute(PATTERN_ALIAS);
                this.patterns.put(new IndexPattern(collection.getSymbols(), pattern, null), alias);
            }

            setFile(new File(collection.getCollectionRoot(), name));
        } catch (Exception e) {
            throw new XindiceException(e);
        }
    }

    public Configuration getConfig() {
        return config;
    }

    /**
     * @return true if a Lucene index exists in the index directory
     * @throws IllegalStateException if the indexer has not been configured yet
     */
    public synchronized boolean exists() {
        return IndexReader.indexExists(getFile());
    }

    /**
     * Creates necessary resources.
     *
     * @return true, if successful
     * @throws DBException There was a low-level IOException that prevented
     *         the index from creating resources.
     * @throws DuplicateIndexException Parent collection already has full text index
     */
    public synchronized boolean create() throws DBException {
        if (luceneIndexerFound()) {
            throw new DuplicateIndexException("Collection can only have one full text index.");
        }
        openWrite(true);
        return true;
    }

    /**
     * @return true if the parent collection already lists a LuceneIndexer
     */
    private boolean luceneIndexerFound() throws DBException {
        String indexers[] = collection.getIndexManager().list();
        for (int i = 0; i < indexers.length; i++) {
            Indexer indexer = collection.getIndexer(indexers[i]);
            if (indexer instanceof LuceneIndexer) {
                return true;
            }
        }

        return false;
    }

    public synchronized boolean open() throws DBException {
        openWrite(false);
        return true;
    }

    public synchronized boolean isOpened() {
        return (null != iw) || (null != ir);
    }

    public synchronized boolean close() throws DBException {
        closeWrite();
        closeRead();
        return true;
    }

    /**
     * Closes the index and deletes its directory.
     *
     * @return true if the index existed and was completely deleted
     * @throws DBException if the index could not be closed or deleted
     */
    public synchronized boolean drop() throws DBException {
        try {
            File dir = getFile();
            if (IndexReader.indexExists(dir)) {
                close();
                return deepDelete(dir);
            } else {
                return false;
            }
        } catch (IOException e) {
            throw new DBException(FaultCodes.IDX_CORRUPTED,
                                  "Failed to delete index " + name + ", collection " + collection.getCanonicalName(), e);
        }
    }

    public String getName() {
        return name;
    }

    public void setCollection(Collection collection) {
        this.collection = collection;
    }

    public Analyzer getAnalyzer() {
        return an;
    }

    /**
     * Makes sure an IndexReader is open, closing the writer first if needed.
     */
    private void openRead() throws DBException {
        if (log.isTraceEnabled()) {
            log.trace("Calling openRead()");
        }

        if (null == ir) {
            closeWrite();
            try {
                ir = IndexReader.open(getFile());
            } catch (IOException e) {
                throw new DBException(FaultCodes.IDX_CORRUPTED,
                                      "Failed to open index " + name + ", collection " + collection.getCanonicalName(), e);
            }
        }
    }

    /**
     * Makes sure an IndexSearcher (and the reader behind it) is open.
     */
    private void openSearch() throws DBException {
        if (log.isTraceEnabled()) {
            log.trace("Calling openSearch()");
        }

        if (null == is) {
            openRead();
            is = new IndexSearcher(ir);
        }
    }

    /**
     * Makes sure an IndexWriter is open, closing the reader/searcher first
     * if needed.
     *
     * @param create passed to the IndexWriter constructor; also selects the
     *        fault code reported on failure
     */
    private void openWrite(boolean create) throws DBException {
        if (log.isTraceEnabled()) {
            log.trace("Calling openWrite(" + create + ")");
        }

        if (null == iw) {
            closeRead();
            try {
                iw = new IndexWriter(getFile(), getAnalyzer(), create);
            } catch (IOException e) {
                if (create) {
                    throw new DBException(FaultCodes.IDX_CANNOT_CREATE,
                                          "Failed to create index " + name + ", collection " + collection.getCanonicalName(), e);
                } else {
                    throw new DBException(FaultCodes.IDX_CORRUPTED,
                                          "Failed to open index " + name + ", collection " + collection.getCanonicalName(), e);
                }
            }
        }
    }

    private void assertOpen() {
        if (!isOpened()) {
            throw new IllegalStateException("Index has not been opened");
        }
    }

    private void assertRead() throws DBException {
        assertOpen();
        openRead();
    }

    private void assertWrite() throws DBException {
        assertOpen();
        openWrite(false);
    }

    /**
     * Closes the searcher and the reader, if open.
     */
    private void closeRead() throws DBException {
        if (null != ir) {
            closeSearch();
            try {
                ir.close();
                ir = null;
            } catch (IOException e) {
                throw new DBException(FaultCodes.IDX_CORRUPTED,
                                      "Failed to close reader for index " + name + ", collection " + collection.getCanonicalName(), e);
            }
        }
    }

    private void closeSearch() throws DBException {
        if (null != is) {
            try {
                is.close();
                is = null;
            } catch (IOException e) {
                throw new DBException(FaultCodes.IDX_CORRUPTED,
                                      "Failed to close searcher for index " + name + ", collection " + collection.getCanonicalName(), e);
            }
        }
    }

    /**
     * Closes the writer, if open, optimizing the index first when enough
     * changes have accumulated. The writer is closed even when optimization
     * fails, so that a failed optimize does not leave it open.
     */
    private void closeWrite() throws DBException {
        if (null == iw) {
            return;
        }

        try {
            try {
                int nDocs = iw.docCount();
                /* Fairly arbitrary rules for triggering index optimisation. Need to
                 * play with these.
                 */
                if (docsAdded > nDocs / 10 || docsAdded > 50 || docsDeleted > 10) {
                    if (log.isDebugEnabled()) {
                        log.debug("Optimizing text index for " + collection.getCanonicalName() + "...");
                    }

                    iw.optimize();
                    docsAdded = 0;
                    docsDeleted = 0;
                }
            } finally {
                // Reached on success and on optimize failure alike. The
                // reference is only cleared once close() itself succeeded.
                iw.close();
                iw = null;
            }
        } catch (IOException e) {
            throw new DBException(FaultCodes.IDX_CORRUPTED,
                                  "Failed to close writer for index " + name + ", collection " + collection.getCanonicalName(), e);
        }
    }

    /**
     * Deletes a file, or a directory with all of its content.
     *
     * @return true if everything was deleted; false as soon as one deletion fails
     */
    private boolean deepDelete(File f) throws IOException {
        if (f.isDirectory()) {
            File fl[] = f.listFiles();
            // listFiles() returns null when the directory cannot be read. In
            // that case fall through and let delete() report the failure.
            if (fl != null) {
                for (int i = 0; i < fl.length; i++) {
                    if (!deepDelete(fl[i])) {
                        return false;
                    }
                }
            }
        }
        return f.delete();
    }

    /**
     * Flushes the writer, if one is currently open.
     *
     * @throws DBException if the underlying flush failed
     * @throws IllegalStateException if the index has not been opened
     */
    public synchronized void flush() throws DBException {
        try {
            assertOpen();
            if (iw != null) {
                iw.flush();
            }
        } catch (IOException e) {
            throw new DBException(FaultCodes.IDX_CORRUPTED,
                                  "Could not force unwritten data to disk for index " + name + ", collection " + collection.getCanonicalName(), e);
        }
    }

    /**
     * Creates new instance of a handler to listen to indexer events. For
     * every document that is being added there will be a separate handler
     * that will assemble all relevant values in a single Lucene document.
     *
     * @return new instance of IndexerEventHandler
     */
    public IndexerEventHandler getIndexerEventHandler() {
        return new BasicIndexerEventHandler() {
            Document doc;

            public synchronized void onDocumentAdded(Key key) throws DBException {
                if (doc == null) {
                    // No indexed value was reported for this document
                    return;
                }

                // The reader/writer pair is guarded by the indexer's monitor,
                // not by this handler's.
                synchronized (LuceneIndexer.this) {
                    assertWrite();

                    try {
                        iw.addDocument(doc);
                        docsAdded++;
                    } catch (IOException e) {
                        throw new DBException(FaultCodes.IDX_CORRUPTED,
                                              "Failed to add document to the index " + name + ", collection " + collection.getCanonicalName(), e);
                    }
                }
            }

            public synchronized void onDocumentDeleted(Key key) throws DBException {
                synchronized (LuceneIndexer.this) {
                    assertRead();

                    try {
                        ir.deleteDocuments(new Term(KEYNAME, key.toString()));
                        docsDeleted++;
                    } catch (IOException e) {
                        throw new DBException(FaultCodes.IDX_CORRUPTED,
                                              "Failed to delete document from the index " + name + ", collection " + collection.getCanonicalName(), e);
                    }
                }
            }

            public void onValueAdded(IndexPattern pattern, String value, Key key, int pos, int len, short elemID, short attrID) {
                if (doc == null) {
                    // First value for this document: start the Lucene document
                    // and store the key so that matches can be mapped back.
                    doc = new Document();
                    doc.add(new Field(KEYNAME, key.toString(), Field.Store.YES, Field.Index.UN_TOKENIZED));
                }

                String field = (String) patterns.get(pattern);
                doc.add(new Field(field, value, Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.YES));
            }
        };
    }

    /**
     * Parses the first value of the given query as a Lucene query and runs it.
     *
     * @param query index query; only the IndexQuery.TQ operator is supported
     * @return the resulting matches, or null if the operator is not IndexQuery.TQ
     * @throws DBException if the query could not be parsed or processed
     */
    public IndexMatch[] queryMatches(final IndexQuery query) throws DBException {
        // this indexer only supports text queries
        if (query.getOperator() != IndexQuery.TQ) {
            return null;
        }

        String textQuery = query.getValue(0).toString();
        try {
            return queryMatches(new QueryParser("", getAnalyzer()).parse(textQuery));
        } catch (ParseException e) {
            throw new CompilationException("Failed to parse query '" + textQuery + "'", e);
        }
    }

    /**
     * Same as {@link Indexer#queryMatches(IndexQuery)}, but accepts compiled Lucene query as
     * parameter.
     *
     * @param query Compiled Lucene query.
     * @return The resulting matches, one per hit; position and length of
     *         every match are set to -1
     * @throws DBException if the index could not be opened or searched
     */
    public synchronized IndexMatch[] queryMatches(Query query) throws DBException {
        ArrayList matches = new ArrayList();
        openSearch();
        try {
            Hits hits = is.search(query);
            for (Iterator i = hits.iterator(); i.hasNext(); ) {
                Hit hit = (Hit) i.next();
                Key key = new Key(hit.getDocument().getField(KEYNAME).stringValue());
                matches.add(new IndexMatch(key, -1, -1));
            }
        } catch (IOException e) {
            throw new ProcessingException("Failed to process a query", e);
        }

        return (IndexMatch[]) matches.toArray(EMPTY_MATCHES);
    }
}

/*
 * Remainder of this part of the commit notification, preserved from the
 * original message:
 *
 * \ No newline at end of file
 *
 * Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/indexer/LuceneIndexer.java
 * ------------------------------------------------------------------------------
 *     svn:eol-style = native
 *
 * Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/indexer/LuceneIndexer.java
 * ------------------------------------------------------------------------------
 *     svn:keywords = Id Revision Author Date
 *
 * Modified: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ProcessingException.java
 * URL: http://svn.apache.org/viewvc/xml/xindice/trunk/java/src/org/apache/xindice/core/query/ProcessingException.java?view=diff&rev=564823&r1=564822&r2=564823
 * ==============================================================================
 * --- xml/xindice/trunk/java/src/org/apache/xindice/core/query/ProcessingException.java (original)
 * +++ xml/xindice/trunk/java/src/org/apache/xindice/core/query/ProcessingException.java Fri Aug 10 19:46:58 2007
 * @@ -38,6 +38,6 @@
 *      }
 *
 *      public ProcessingException(String message, Throwable cause) {
 * -        super(FaultCodes.QRY_COMPILATION_ERROR, message, cause);
 * +        super(FaultCodes.QRY_PROCESSING_ERROR, message, cause);
 *      }
 *  }
 *
 * Added: xml/xindice/trunk/java/src/org/apache/xindice/core/query/TextQueryResolver.java
 * URL: http://svn.apache.org/viewvc/xml/xindice/trunk/java/src/org/apache/xindice/core/query/TextQueryResolver.java?view=auto&rev=564823
 */
/*
 * Commit notification context (svn revision 564823), preserved from the
 * original message:
 *
 * ==============================================================================
 * --- xml/xindice/trunk/java/src/org/apache/xindice/core/query/TextQueryResolver.java (added)
 * +++ xml/xindice/trunk/java/src/org/apache/xindice/core/query/TextQueryResolver.java Fri Aug 10 19:46:58 2007
 * @@ -0,0 +1,232 @@
 */

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * $Id$
 */

package org.apache.xindice.core.query;

import java.util.HashSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.xindice.core.Collection;
import org.apache.xindice.core.data.Key;
import org.apache.xindice.core.data.NodeSet;
import org.apache.xindice.core.data.Entry;
import org.apache.xindice.core.DBException;
import org.apache.xindice.core.FaultCodes;
import org.apache.xindice.core.indexer.LuceneIndexer;
import org.apache.xindice.core.indexer.Indexer;
import org.apache.xindice.core.indexer.IndexMatch;
import org.apache.xindice.util.SimpleConfigurable;
import org.apache.xindice.util.XindiceRuntimeException;
import org.apache.xindice.xml.dom.DBDocument;
import org.apache.xindice.xml.NamespaceMap;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.analysis.Analyzer;

import org.w3c.dom.Node;

/**
 * Query resolver for full text queries. Requires existing full text index
 * to work.
 *
 * @see org.apache.xindice.core.indexer.LuceneIndexer
 * @author Andy Armstrong
 * @version $Revision$, $Date$
 */
public class TextQueryResolver extends SimpleConfigurable implements QueryResolver {

    public final static String STYLE_FT = "Text";
    private static final Log log = LogFactory.getLog(TextQueryResolver.class);

    /**
     * A full text query compiled against the full text index of a collection.
     */
    private class TextQuery implements Query {
        private Collection context;
        private String query;
        private Key keys[];
        private LuceneIndexer idx;
        private org.apache.lucene.search.Query compiledQuery;

        /**
         * Looks up the full text indexer of the collection and compiles the
         * query string with that indexer's analyzer.
         *
         * @param context collection to query
         * @param query Lucene query string
         * @param keys optional key filter; null means no filtering
         * @throws QueryException if the collection has no full text indexer
         *         or the query could not be compiled
         */
        private TextQuery(Collection context, String query, Key[] keys) throws QueryException {
            this.context = context;
            this.keys = keys;
            this.query = query;

            try {
                idx = findIndex(context);
                if (null == idx) {
                    throw new QueryException(FaultCodes.QRY_STYLE_NOT_FOUND, "Could not find text indexer in this collection");
                }
                Analyzer an = idx.getAnalyzer();
                compiledQuery = new QueryParser("", an).parse(query);
            } catch (DBException e) {
                throw new QueryException(FaultCodes.QRY_COMPILATION_ERROR, "Failed to compile the query due to database error", e);
            } catch (ParseException e) {
                throw new QueryException(FaultCodes.QRY_COMPILATION_ERROR, "Failed to compile the query", e);
            }
        }

        public String getQueryStyle() {
            return STYLE_FT;
        }

        public Collection getQueryContext() {
            return context;
        }

        public String getQueryString() {
            return query;
        }

        public NamespaceMap getNamespaceMap() {
            return null;
        }

        public Key[] getKeySet() {
            return keys;
        }

        /**
         * Executes compiled Lucene query against existing index.
         *
         * @return NodeSet that contains document element of all matching
         *         documents
         * @throws QueryException if the index query failed
         */
        public NodeSet execute() throws QueryException {
            try {
                IndexMatch[] match = idx.queryMatches(compiledQuery);
                Key[] uniqueKeys = QueryEngine.getUniqueKeys(match);

                // convert keys filter to HashSet
                HashSet filter = null;
                if (keys != null) {
                    filter = new HashSet(keys.length);
                    for (int k = 0; k < keys.length; k++) {
                        filter.add(keys[k]);
                    }
                }

                // Keep the matching keys that pass the filter. Only the first
                // rkused slots of rk are meaningful.
                Key rk[] = new Key[uniqueKeys.length];
                int rkused = 0;
                for (int i = 0; i < uniqueKeys.length; i++) {
                    if (filter == null || filter.contains(uniqueKeys[i])) {
                        rk[rkused++] = uniqueKeys[i];
                    }
                }

                return new ResultSet(rk, rkused);

            } catch (DBException e) {
                throw new ProcessingException("Error executing full text query: " + e.getMessage(), e);
            }
        }

        /**
         * NodeSet over the document elements of the matching documents.
         * Always holds the next node to be returned, so that hasMoreNodes()
         * can skip keys that do not resolve to a document.
         */
        private class ResultSet implements NodeSet {
            private Key[] keySet;

            private int keyPos = 0;
            private int keyLen;
            private Node nextNode;

            /**
             * @param keySet keys of the matching documents
             * @param keyLen number of leading entries of keySet to use
             */
            public ResultSet(Key[] keySet, int keyLen) {
                this.keySet = keySet;
                this.keyLen = keyLen;

                try {
                    prepareNextNode();
                } catch (Exception e) {
                    // Keep the cause, as getNextNode() does
                    throw new XindiceRuntimeException(e);
                }
            }

            /**
             * Advances to the next key that resolves to a document entry with
             * a non-null value; leaves nextNode null when keys are exhausted.
             */
            private void prepareNextNode() throws DBException {
                nextNode = null;

                while (nextNode == null && keyPos < keyLen) {
                    Entry entry = context.getEntry(keySet[keyPos++]);
                    if (entry == null || entry.getEntryType() != Entry.DOCUMENT) {
                        continue;
                    }

                    DBDocument d = (DBDocument) entry.getValue();
                    if (d != null) {
                        nextNode = d.getDocumentElement();
                    }
                }
            }

            public boolean hasMoreNodes() {
                return nextNode != null;
            }

            public Object getNextNode() {
                Node n = nextNode;

                try {
                    prepareNextNode();
                } catch (Exception e) {
                    throw new XindiceRuntimeException(e);
                }

                return n;
            }
        }
    }

    /**
     * Finds the full text indexer of a collection.
     *
     * @param c collection to look in
     * @return the indexer the index manager reports for the full text style,
     *         or null if there is none or it is not a LuceneIndexer
     * @throws DBException if the index manager lookup failed
     */
    private LuceneIndexer findIndex(Collection c) throws DBException {
        Object best = c.getIndexManager().getBestIndexer(Indexer.STYLE_FULLTEXT, null);
        if (best instanceof LuceneIndexer) {
            return (LuceneIndexer) best;
        }
        return null;
    }

    public void setQueryEngine(QueryEngine engine) {
        // do nothing
        // FIXME: not used
    }

    public String getQueryStyle() {
        return STYLE_FT;
    }

    /**
     * Compiles a full text query without executing it.
     *
     * @param context collection to query
     * @param query Lucene query string
     * @param nsMap not used by this resolver
     * @param keys optional key filter; null means no filtering
     * @return compiled query
     * @throws QueryException if the query could not be compiled
     */
    public Query compileQuery(Collection context, String query, NamespaceMap nsMap, Key[] keys) throws QueryException {
        if (log.isTraceEnabled()) {
            log.trace("Compiling query for collection " + context.getCanonicalName() + ", query = " + query);
        }

        return new TextQuery(context, query, keys);
    }

    /**
     * Compiles and executes a full text query.
     *
     * @param context collection to query
     * @param query Lucene query string
     * @param nsMap not used by this resolver
     * @param keys optional key filter; null means no filtering
     * @return document elements of the matching documents
     * @throws QueryException if the query could not be compiled or executed;
     *         any other failure is wrapped in a ProcessingException
     */
    public NodeSet query(Collection context, String query, NamespaceMap nsMap, Key[] keys) throws QueryException {
        if (log.isTraceEnabled()) {
            log.trace("Querying collection " + context.getCanonicalName() + ", query = " + query);
        }
        try {
            Query tq = new TextQuery(context, query, keys);
            return tq.execute();
        } catch (QueryException e) {
            throw e;
        } catch (Exception e) {
            throw new ProcessingException("Failed to execute text query", e);
        }
    }
}

/*
 * Remainder of the commit notification, preserved from the original message:
 *
 * Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/TextQueryResolver.java
 * ------------------------------------------------------------------------------
 *     svn:eol-style = native
 *
 * Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/TextQueryResolver.java
 * ------------------------------------------------------------------------------
 *     svn:keywords = Id Revision Author Date
 */