Author: natalia
Date: Sat Oct 13 19:41:50 2007
New Revision: 584476

URL: http://svn.apache.org/viewvc?rev=584476&view=rev
Log:
Lucene-based full text search in the content of DOM nodes
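A rough sketch of how the new ftsearch package is meant to be driven, based only on the Searcher API added below. The StandardAnalyzer choice, the FTSearchExample class name, the "para" element name and the sample query are illustrative assumptions, not part of this commit:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.xindice.core.data.NodeSet;
import org.apache.xindice.core.query.ftsearch.Searcher;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

public class FTSearchExample {
    public static void search(Document doc) throws ParseException {
        // Any NodeList works; "para" is just an illustrative element name.
        NodeList candidates = doc.getElementsByTagName("para");

        // The analyzer tokenizes the node text; its choice affects what matches.
        Searcher searcher = new Searcher(candidates, new StandardAnalyzer());

        // Lucene query syntax, minus field names (see the Searcher javadoc below).
        NodeSet results = searcher.search("apache AND license");

        // Matches come back ordered by Lucene score, most relevant first.
        while (results.hasMoreNodes()) {
            Node match = (Node) results.getNextNode();
            System.out.println(match.getNodeName());
        }
    }
}

Note that the index is built in memory from the given NodeList on every search() call (via NodeReader); nothing is written to disk.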
Added: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/ xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeReader.java (with props) xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermDocs.java (with props) xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermEnum.java (with props) xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermPositions.java (with props) xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/Searcher.java (with props) Added: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeReader.java URL: http://svn.apache.org/viewvc/xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeReader.java?rev=584476&view=auto ============================================================================== --- xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeReader.java (added) +++ xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeReader.java Sat Oct 13 19:41:50 2007 @@ -0,0 +1,286 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * $Id$ + */ + +package org.apache.xindice.core.query.ftsearch; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.TermFreqVector; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.TermPositions; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.FieldSelector; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.search.DefaultSimilarity; +import org.apache.xindice.xml.dom.NodeImpl; +import org.w3c.dom.NodeList; +import org.w3c.dom.Node; + +import java.io.IOException; +import java.io.StringReader; +import java.util.Collection; +import java.util.Arrays; +import java.util.ArrayList; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.HashSet; +import java.util.List; + +/** + * Implementation of IndexReader that works with set of DOM nodes in memory. + * Set of nodes is constant - no modifications of it are allowed. 
+ * + * @version $Revision$, $Date$ + */ +public class NodeReader extends IndexReader { + // list of DOM nodes + private final ArrayList[] nodes; + + private byte[] norms; + + // maps term to a list of documents where it appears + private final SortedMap termMap; + + protected NodeReader(NodeList list, Analyzer analyzer) { + super(null); + + nodes = new ArrayList[list.getLength()]; + + for (int i = 0; i < nodes.length; i++) { + nodes[i] = new ArrayList(); + Node node = list.item(i); + + String text; + if (node instanceof NodeImpl) { + // DOM Level 3 method + text = ((NodeImpl) node).getTextContent(); + } else { + text = getTextContent(node); + } + + TokenStream stream = analyzer.tokenStream("", new StringReader(text)); + try { + Token token; + while ((token = stream.next()) != null) { + nodes[i].add(token.termText()); + } + } catch (IOException e) { + // won't happen + } + } + + // init norms + norms = new byte[nodes.length]; + Arrays.fill(norms, DefaultSimilarity.encodeNorm(1.0f)); + + // build term enumeration + termMap = buildTermMap(); + } + + public TermFreqVector[] getTermFreqVectors(int docNumber) { + throw new UnsupportedOperationException(); + } + + public TermFreqVector getTermFreqVector(int docNumber, String field) { + throw new UnsupportedOperationException(); + } + + public int numDocs() { + return nodes.length; + } + + public int maxDoc() { + return nodes.length; + } + + /** + * Method is not supported. + */ + public Document document(int n, FieldSelector fieldSelector) { + return null; + } + + /** + * Deletion is not supported. + */ + public boolean isDeleted(int n) { + return false; + } + + /** + * Deletion is not supported. + */ + public boolean hasDeletions() { + return false; + } + + public byte[] norms(String field) throws IOException { + return field.length() == 0 ? norms : null; + } + + public void norms(String field, byte[] bytes, int offset) { + System.arraycopy(norms, 0, bytes, offset, maxDoc()); + } + + protected void doSetNorm(int doc, String field, byte value) { + if (field.length() > 0) { + return; + } + + norms[doc] = value; + } + + public TermEnum terms() { + return new NodeTermEnum(termMap); + } + + public TermEnum terms(Term t) { + return new NodeTermEnum(termMap, t); + } + + /** + * Builds the map of all the terms in all the nodes to the list of + * node numbers where those terms appear. + * @return Map with keys sorted in ascending order + */ + private SortedMap buildTermMap() { + SortedMap map = new TreeMap(); + + for (int i = 0; i < nodes.length; i++) { + for (int j = 0; j < nodes[i].size(); j++) { + String term = (String) nodes[i].get(j); + + List docs; + if (map.containsKey(term)) { + docs = (List) map.get(term); + } else { + docs = new ArrayList(); + } + + docs.add(new Integer(i)); + map.put(term, docs); + } + } + + return map; + } + + public int docFreq(Term t) { + List docs = (List) termMap.get(t.text()); + + // no such term + if (docs == null) { + return 0; + } + + HashSet set = new HashSet(); + set.addAll(docs); + return set.size(); + } + + public TermDocs termDocs() { + return new NodeTermDocs(this); + } + + public TermPositions termPositions() { + return new NodeTermPositions(this); + } + + /** + * Deletion is not supported. + */ + protected void doDelete(int docNum) { + throw new UnsupportedOperationException(); + } + + /** + * Deletion is not supported. 
+ */ + protected void doUndeleteAll() { + throw new UnsupportedOperationException(); + } + + /** + * Not applicable + */ + protected void doCommit() { + } + + /** + * Not applicable + */ + protected void doClose() { + } + + /** + * Field names are not supported. + */ + public Collection getFieldNames(FieldOption fldOption) { + throw new UnsupportedOperationException(); + } + + ArrayList[] getNodes() { + return nodes; + } + + SortedMap getTermMap() { + return termMap; + } + + /** + * Get text content of a DOM node. This is the same as DOM Level 3 method + * getTextContent(). + * @param node + * @return The text content of this node and its descendants. + */ + private String getTextContent(Node node) { + String text = null; + switch (node.getNodeType()) { + case Node.ATTRIBUTE_NODE: + case Node.CDATA_SECTION_NODE: + case Node.COMMENT_NODE: + case Node.PROCESSING_INSTRUCTION_NODE: + case Node.TEXT_NODE: + text = node.getNodeValue(); + break; + case Node.ELEMENT_NODE: + case Node.DOCUMENT_FRAGMENT_NODE: + case Node.ENTITY_NODE: + case Node.ENTITY_REFERENCE_NODE: + StringBuffer val = new StringBuffer(); + + NodeList children = node.getChildNodes(); + if (children == null || children.getLength() == 0) { + text = ""; + break; + } + + for (int i = 0; i < children.getLength(); i++) { + val.append(getTextContent(children.item(i))); + } + text = val.toString(); + break; + } + + return text; + } +} Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeReader.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeReader.java ------------------------------------------------------------------------------ svn:keywords = Id Revision Author Date Added: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermDocs.java URL: http://svn.apache.org/viewvc/xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermDocs.java?rev=584476&view=auto ============================================================================== --- xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermDocs.java (added) +++ xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermDocs.java Sat Oct 13 19:41:50 2007 @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * $Id$ + */ + +package org.apache.xindice.core.query.ftsearch; + +import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermEnum; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.Map; +import java.util.List; + +/** + * TermDocs for searching over DOM nodes in memory + * + * @version $Revision$, $Date$ + */ +public class NodeTermDocs implements TermDocs { + protected int num; + protected ArrayList[] nodes; + protected Map termMap; + protected int[] freqs; + private boolean init; + + public NodeTermDocs(NodeReader reader) { + nodes = reader.getNodes(); + termMap = reader.getTermMap(); + + freqs = new int[nodes.length]; + } + + public void seek(Term term) { + List docs = (List) termMap.get(term.text()); + if (docs != null) { + for (Iterator i = docs.iterator(); i.hasNext(); ) { + int idx = ((Integer) i.next()).intValue(); + freqs[idx]++; + } + } + } + + public void seek(TermEnum termEnum) { + seek(termEnum.term()); + } + + public int doc() { + return num; + } + + public int freq() { + return freqs[num]; + } + + public boolean next() { + if (!init) { + init = true; + num = 0; + } else { + num++; + } + + while(num < nodes.length && freqs[num] == 0) { + num++; + } + + return num < nodes.length; + } + + public int read(int[] docs, int[] freqs) { + int count = 0; + for (int i = 0; i < docs.length && num < nodes.length; i++, num++) { + if (this.freqs[num] > 0) { + freqs[count] = this.freqs[num]; + docs[count] = num; + count++; + } + } + + return count; + } + + public boolean skipTo(int target) { + num = target; + return num < nodes.length; + } + + /** + * Not applicable + */ + public void close() { + } +} Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermDocs.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermDocs.java ------------------------------------------------------------------------------ svn:keywords = Id Revision Author Date Added: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermEnum.java URL: http://svn.apache.org/viewvc/xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermEnum.java?rev=584476&view=auto ============================================================================== --- xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermEnum.java (added) +++ xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermEnum.java Sat Oct 13 19:41:50 2007 @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * $Id$ + */ + +package org.apache.xindice.core.query.ftsearch; + +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.Term; + +import java.util.Iterator; +import java.util.SortedMap; +import java.util.HashSet; +import java.util.List; + +/** + * Ordered set of terms used by NodeReader. + * + * @version $Revision$, $Date$ + */ +public class NodeTermEnum extends TermEnum { + // map that backs this enumeration + private SortedMap termMap; + + private Iterator iterator; + + // value of the current term + private String curTerm; + + public NodeTermEnum(SortedMap termMap) { + this.termMap = termMap; + iterator = this.termMap.keySet().iterator(); + if (iterator.hasNext()) { + curTerm = (String) iterator.next(); + } + } + + public NodeTermEnum(SortedMap termMap, Term t) { + this(termMap.tailMap(t.text())); + } + + public boolean next() { + if (iterator.hasNext()) { + curTerm = (String) iterator.next(); + return true; + } + + return false; + } + + public Term term() { + if (curTerm != null) { + return new Term("", curTerm); + } + + return null; + } + + public int docFreq() { + List docs = (List) termMap.get(curTerm); + + HashSet set = new HashSet(); + set.addAll(docs); + return set.size(); + } + + /** + * Not applicable + */ + public void close() { + } +} Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermEnum.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermEnum.java ------------------------------------------------------------------------------ svn:keywords = Id Revision Author Date Added: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermPositions.java URL: http://svn.apache.org/viewvc/xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermPositions.java?rev=584476&view=auto ============================================================================== --- xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermPositions.java (added) +++ xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermPositions.java Sat Oct 13 19:41:50 2007 @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * $Id$ + */ + +package org.apache.xindice.core.query.ftsearch; + +import org.apache.lucene.index.TermPositions; +import org.apache.lucene.index.Term; + +import java.io.IOException; + +/** + * TermPositions for searching over DOM nodes in memory + * + * @version $Revision$, $Date$ + */ +public class NodeTermPositions extends NodeTermDocs implements TermPositions { + private int pos; + private String term; + + public NodeTermPositions(NodeReader reader) { + super(reader); + } + + public void seek(Term term) { + this.term = term.text(); + super.seek(term); + } + + public int nextPosition() throws IOException { + while (pos < nodes[num].size()) { + if ((nodes[num].get(pos)).equals(term)) { + return pos; + } + pos++; + } + + return 0; + } + + public boolean next() { + if (super.next()) { + pos = 0; + return true; + } + + return false; + } + + public int getPayloadLength() { + throw new UnsupportedOperationException(); + } + + public byte[] getPayload(byte[] data, int offset) throws IOException { + throw new UnsupportedOperationException(); + } + + public boolean isPayloadAvailable() { + return false; + } +} Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermPositions.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermPositions.java ------------------------------------------------------------------------------ svn:keywords = Id Revision Author Date Added: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/Searcher.java URL: http://svn.apache.org/viewvc/xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/Searcher.java?rev=584476&view=auto ============================================================================== --- xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/Searcher.java (added) +++ xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/Searcher.java Sat Oct 13 19:41:50 2007 @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * $Id$ + */ + +package org.apache.xindice.core.query.ftsearch; + +import org.w3c.dom.NodeList; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.queryParser.QueryParser; +import org.apache.lucene.queryParser.ParseException; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.Query; +import org.apache.xindice.core.data.NodeSet; + +import java.io.IOException; + +/** + * Searcher executes full text queries against list of nodes and returns + * nodes that match the query in the order of relevance (score) - most relevant + * results will be return first.<br> + * <br> + * Query syntax is the same as syntax of Lucene query, except it does not use + * field names. + * + * @version $Revision$, $Date$ + */ +public class Searcher { + private NodeList nodes; + private Analyzer analyzer; + + /** + * Builds new Searcher based on list of nodes and analyzer. + * + * @param nodes List of nodes to search + * @param analyzer Analyzer that will be used to tokenize text of the nodes. + * Choice of analyzer affects query results. + * @see org.apache.lucene.analysis.Analyzer + */ + public Searcher(NodeList nodes, Analyzer analyzer) { + this.nodes = nodes; + this.analyzer = analyzer; + } + + /** + * Executes query against list of nodes and returns matches in the order + * of relevance (score). + * + * @param query Full text query + * @return NodeSet that contains matching nodes + * @throws ParseException Query failed to be parsed + */ + public NodeSet search(String query) throws ParseException { + Query compQuery = new QueryParser("", analyzer).parse(query); + NodeReader reader = new NodeReader(nodes, analyzer); + IndexSearcher searcher = new IndexSearcher(reader); + Hits hits = null; + try { + hits = searcher.search(compQuery); + } catch (IOException e) { + // this searcher does not use file IO, exception won't happen + } + + return new ResultSet(hits); + } + + private class ResultSet implements NodeSet { + private Hits hits; + private int count; + + private ResultSet(Hits hits) { + this.hits = hits; + } + + public boolean hasMoreNodes() { + return count < hits.length(); + } + + public Object getNextNode() { + try { + return nodes.item(hits.id(count++)); + } catch (IOException e) { + // ignore, does not use IO + } + + return null; + } + } +} Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/Searcher.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/Searcher.java ------------------------------------------------------------------------------ svn:keywords = Id Revision Author Date
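The Searcher javadoc above notes that the choice of Analyzer affects query results. As a minimal illustration of why (assuming Lucene's stock StandardAnalyzer and WhitespaceAnalyzer; the class name and sample text are made up for the example), the snippet below runs the same tokenStream loop that NodeReader applies to each node's text content:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

public class TokenizeSketch {
    // Same loop shape as the NodeReader constructor: one term list per node.
    static List tokenize(Analyzer analyzer, String text) throws IOException {
        List terms = new ArrayList();
        TokenStream stream = analyzer.tokenStream("", new StringReader(text));
        Token token;
        while ((token = stream.next()) != null) {
            terms.add(token.termText());
        }
        return terms;
    }

    public static void main(String[] args) throws IOException {
        String text = "The Apache Software Foundation";
        // StandardAnalyzer lower-cases and drops English stop words ("The"),
        // while WhitespaceAnalyzer keeps the raw whitespace-separated tokens,
        // so the same query can match under one analyzer and not the other.
        System.out.println(tokenize(new StandardAnalyzer(), text));
        System.out.println(tokenize(new WhitespaceAnalyzer(), text));
    }
}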