Author: natalia
Date: Sat Oct 13 19:41:50 2007
New Revision: 584476
URL: http://svn.apache.org/viewvc?rev=584476&view=rev
Log:
Lucene-based full text search in the content of DOM nodes
Added:
xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/
xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeReader.java   (with props)
xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermDocs.java   (with props)
xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermEnum.java   (with props)
xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermPositions.java   (with props)
xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/Searcher.java   (with props)
Added: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeReader.java
URL: http://svn.apache.org/viewvc/xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeReader.java?rev=584476&view=auto
==============================================================================
--- xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeReader.java (added)
+++ xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeReader.java Sat Oct 13 19:41:50 2007
@@ -0,0 +1,286 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * $Id$
+ */
+
+package org.apache.xindice.core.query.ftsearch;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.TermFreqVector;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.TermPositions;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.xindice.xml.dom.NodeImpl;
+import org.w3c.dom.NodeList;
+import org.w3c.dom.Node;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.Collection;
+import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.HashSet;
+import java.util.List;
+
+/**
+ * Implementation of IndexReader that works with a set of DOM nodes in memory.
+ * The set of nodes is constant; no modifications to it are allowed.
+ *
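+ * A rough usage sketch (the constructor is protected, so this mirrors what
+ * Searcher does inside this package; StandardAnalyzer is only an example):
+ * <pre>
+ *   NodeList list = ...;                        // nodes to index
+ *   Analyzer analyzer = new StandardAnalyzer(); // any Lucene analyzer
+ *   IndexReader reader = new NodeReader(list, analyzer);
+ *   IndexSearcher searcher = new IndexSearcher(reader);
+ * </pre>
+ *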
+ * @version $Revision$, $Date$
+ */
+public class NodeReader extends IndexReader {
+ // tokenized content of the DOM nodes: nodes[i] holds the terms of node i
+ private final ArrayList[] nodes;
+
+ private byte[] norms;
+
+ // maps each term to the list of node numbers where it appears (one entry per occurrence)
+ private final SortedMap termMap;
+
+ protected NodeReader(NodeList list, Analyzer analyzer) {
+ super(null);
+
+ nodes = new ArrayList[list.getLength()];
+
+ for (int i = 0; i < nodes.length; i++) {
+ nodes[i] = new ArrayList();
+ Node node = list.item(i);
+
+ String text;
+ if (node instanceof NodeImpl) {
+ // DOM Level 3 method
+ text = ((NodeImpl) node).getTextContent();
+ } else {
+ text = getTextContent(node);
+ }
+
+ TokenStream stream = analyzer.tokenStream("", new StringReader(text));
+ try {
+ Token token;
+ while ((token = stream.next()) != null) {
+ nodes[i].add(token.termText());
+ }
+ } catch (IOException e) {
+ // won't happen: the stream reads from an in-memory StringReader
+ }
+ }
+
+ // init norms
+ norms = new byte[nodes.length];
+ Arrays.fill(norms, DefaultSimilarity.encodeNorm(1.0f));
+
+ // build term enumeration
+ termMap = buildTermMap();
+ }
+
+ public TermFreqVector[] getTermFreqVectors(int docNumber) {
+ throw new UnsupportedOperationException();
+ }
+
+ public TermFreqVector getTermFreqVector(int docNumber, String field) {
+ throw new UnsupportedOperationException();
+ }
+
+ public int numDocs() {
+ return nodes.length;
+ }
+
+ public int maxDoc() {
+ return nodes.length;
+ }
+
+ /**
+ * Documents are not supported; always returns <code>null</code>.
+ */
+ public Document document(int n, FieldSelector fieldSelector) {
+ return null;
+ }
+
+ /**
+ * Deletion is not supported.
+ */
+ public boolean isDeleted(int n) {
+ return false;
+ }
+
+ /**
+ * Deletion is not supported.
+ */
+ public boolean hasDeletions() {
+ return false;
+ }
+
+ public byte[] norms(String field) throws IOException {
+ return field.length() == 0 ? norms : null;
+ }
+
+ public void norms(String field, byte[] bytes, int offset) {
+ System.arraycopy(norms, 0, bytes, offset, maxDoc());
+ }
+
+ protected void doSetNorm(int doc, String field, byte value) {
+ if (field.length() > 0) {
+ return;
+ }
+
+ norms[doc] = value;
+ }
+
+ public TermEnum terms() {
+ return new NodeTermEnum(termMap);
+ }
+
+ public TermEnum terms(Term t) {
+ return new NodeTermEnum(termMap, t);
+ }
+
+ /**
+ * Builds a map from each term appearing in any of the nodes to the list of
+ * node numbers in which that term appears.
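+ * For example, if node 0 tokenizes to [cat, dog, cat] and node 1 to [dog],
+ * the resulting map is {cat=[0, 0], dog=[0, 1]}; duplicate entries are kept
+ * so that term frequencies can be derived from the lists.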
+ * @return Map with keys sorted in ascending order
+ */
+ private SortedMap buildTermMap() {
+ SortedMap map = new TreeMap();
+
+ for (int i = 0; i < nodes.length; i++) {
+ for (int j = 0; j < nodes[i].size(); j++) {
+ String term = (String) nodes[i].get(j);
+
+ List docs;
+ if (map.containsKey(term)) {
+ docs = (List) map.get(term);
+ } else {
+ docs = new ArrayList();
+ }
+
+ docs.add(new Integer(i));
+ map.put(term, docs);
+ }
+ }
+
+ return map;
+ }
+
+ public int docFreq(Term t) {
+ List docs = (List) termMap.get(t.text());
+
+ // no such term
+ if (docs == null) {
+ return 0;
+ }
+
+ HashSet set = new HashSet();
+ set.addAll(docs);
+ return set.size();
+ }
+
+ public TermDocs termDocs() {
+ return new NodeTermDocs(this);
+ }
+
+ public TermPositions termPositions() {
+ return new NodeTermPositions(this);
+ }
+
+ /**
+ * Deletion is not supported.
+ */
+ protected void doDelete(int docNum) {
+ throw new UnsupportedOperationException();
+ }
+
+ /**
+ * Deletion is not supported.
+ */
+ protected void doUndeleteAll() {
+ throw new UnsupportedOperationException();
+ }
+
+ /**
+ * Not applicable
+ */
+ protected void doCommit() {
+ }
+
+ /**
+ * Not applicable
+ */
+ protected void doClose() {
+ }
+
+ /**
+ * Field names are not supported.
+ */
+ public Collection getFieldNames(FieldOption fldOption) {
+ throw new UnsupportedOperationException();
+ }
+
+ ArrayList[] getNodes() {
+ return nodes;
+ }
+
+ SortedMap getTermMap() {
+ return termMap;
+ }
+
+ /**
+ * Gets the text content of a DOM node, similar to the DOM Level 3 method
+ * getTextContent().
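+ * For example, for an element &lt;a&gt;foo&lt;b&gt;bar&lt;/b&gt;&lt;/a&gt;
+ * the result is "foobar".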
+ * @param node DOM node whose text content is required
+ * @return The text content of this node and its descendants.
+ */
+ private String getTextContent(Node node) {
+ String text = null;
+ switch (node.getNodeType()) {
+ case Node.ATTRIBUTE_NODE:
+ case Node.CDATA_SECTION_NODE:
+ case Node.COMMENT_NODE:
+ case Node.PROCESSING_INSTRUCTION_NODE:
+ case Node.TEXT_NODE:
+ text = node.getNodeValue();
+ break;
+ case Node.ELEMENT_NODE:
+ case Node.DOCUMENT_FRAGMENT_NODE:
+ case Node.ENTITY_NODE:
+ case Node.ENTITY_REFERENCE_NODE:
+ StringBuffer val = new StringBuffer();
+
+ NodeList children = node.getChildNodes();
+ if (children == null || children.getLength() == 0) {
+ text = "";
+ break;
+ }
+
+ for (int i = 0; i < children.getLength(); i++) {
+ val.append(getTextContent(children.item(i)));
+ }
+ text = val.toString();
+ break;
+ }
+
+ return text;
+ }
+}
Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeReader.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeReader.java
------------------------------------------------------------------------------
svn:keywords = Id Revision Author Date
Added: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermDocs.java
URL: http://svn.apache.org/viewvc/xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermDocs.java?rev=584476&view=auto
==============================================================================
--- xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermDocs.java (added)
+++ xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermDocs.java Sat Oct 13 19:41:50 2007
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * $Id$
+ */
+
+package org.apache.xindice.core.query.ftsearch;
+
+import org.apache.lucene.index.TermDocs;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.List;
+
+/**
+ * TermDocs for searching over DOM nodes in memory
+ *
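+ * Document numbers are indexes into the NodeList backing the NodeReader.
+ * A minimal iteration sketch (reader is an already constructed NodeReader;
+ * the term value is illustrative and the field name is always empty in this
+ * package):
+ * <pre>
+ *   TermDocs docs = reader.termDocs();
+ *   docs.seek(new Term("", "lucene"));
+ *   while (docs.next()) {
+ *       int node = docs.doc();  // index of a node containing the term
+ *       int freq = docs.freq(); // number of occurrences in that node
+ *   }
+ * </pre>
+ *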
+ * @version $Revision$, $Date$
+ */
+public class NodeTermDocs implements TermDocs {
+ protected int num;
+ protected ArrayList[] nodes;
+ protected Map termMap;
+ protected int[] freqs;
+ private boolean init;
+
+ public NodeTermDocs(NodeReader reader) {
+ nodes = reader.getNodes();
+ termMap = reader.getTermMap();
+
+ freqs = new int[nodes.length];
+ }
+
+ public void seek(Term term) {
+ List docs = (List) termMap.get(term.text());
+ if (docs != null) {
+ for (Iterator i = docs.iterator(); i.hasNext(); ) {
+ int idx = ((Integer) i.next()).intValue();
+ freqs[idx]++;
+ }
+ }
+ }
+
+ public void seek(TermEnum termEnum) {
+ seek(termEnum.term());
+ }
+
+ public int doc() {
+ return num;
+ }
+
+ public int freq() {
+ return freqs[num];
+ }
+
+ public boolean next() {
+ if (!init) {
+ init = true;
+ num = 0;
+ } else {
+ num++;
+ }
+
+ while(num < nodes.length && freqs[num] == 0) {
+ num++;
+ }
+
+ return num < nodes.length;
+ }
+
+ public int read(int[] docs, int[] freqs) {
+ int count = 0;
+ for (int i = 0; i < docs.length && num < nodes.length; i++, num++) {
+ if (this.freqs[num] > 0) {
+ freqs[count] = this.freqs[num];
+ docs[count] = num;
+ count++;
+ }
+ }
+
+ return count;
+ }
+
+ public boolean skipTo(int target) {
+ init = true;
+ num = target;
+ while (num < nodes.length && freqs[num] == 0) { // skip nodes without the term
+ num++;
+ }
+ return num < nodes.length;
+ }
+
+ /**
+ * Not applicable
+ */
+ public void close() {
+ }
+}
Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermDocs.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermDocs.java
------------------------------------------------------------------------------
svn:keywords = Id Revision Author Date
Added: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermEnum.java
URL: http://svn.apache.org/viewvc/xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermEnum.java?rev=584476&view=auto
==============================================================================
--- xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermEnum.java (added)
+++ xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermEnum.java Sat Oct 13 19:41:50 2007
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * $Id$
+ */
+
+package org.apache.xindice.core.query.ftsearch;
+
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.index.Term;
+
+import java.util.Iterator;
+import java.util.SortedMap;
+import java.util.HashSet;
+import java.util.List;
+
+/**
+ * Ordered set of terms used by NodeReader.
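+ * Terms are produced in the lexicographic order of the backing SortedMap and
+ * always carry the empty string as their field name.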
+ *
+ * @version $Revision$, $Date$
+ */
+public class NodeTermEnum extends TermEnum {
+ // map that backs this enumeration
+ private SortedMap termMap;
+
+ private Iterator iterator;
+
+ // value of the current term
+ private String curTerm;
+
+ public NodeTermEnum(SortedMap termMap) {
+ this.termMap = termMap;
+ iterator = this.termMap.keySet().iterator();
+ if (iterator.hasNext()) {
+ curTerm = (String) iterator.next();
+ }
+ }
+
+ public NodeTermEnum(SortedMap termMap, Term t) {
+ this(termMap.tailMap(t.text()));
+ }
+
+ public boolean next() {
+ if (iterator.hasNext()) {
+ curTerm = (String) iterator.next();
+ return true;
+ }
+
+ return false;
+ }
+
+ public Term term() {
+ if (curTerm != null) {
+ return new Term("", curTerm);
+ }
+
+ return null;
+ }
+
+ public int docFreq() {
+ List docs = (List) termMap.get(curTerm);
+
+ HashSet set = new HashSet();
+ set.addAll(docs);
+ return set.size();
+ }
+
+ /**
+ * Not applicable
+ */
+ public void close() {
+ }
+}
Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermEnum.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermEnum.java
------------------------------------------------------------------------------
svn:keywords = Id Revision Author Date
Added: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermPositions.java
URL: http://svn.apache.org/viewvc/xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermPositions.java?rev=584476&view=auto
==============================================================================
--- xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermPositions.java (added)
+++ xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermPositions.java Sat Oct 13 19:41:50 2007
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * $Id$
+ */
+
+package org.apache.xindice.core.query.ftsearch;
+
+import org.apache.lucene.index.TermPositions;
+import org.apache.lucene.index.Term;
+
+import java.io.IOException;
+
+/**
+ * TermPositions for searching over DOM nodes in memory
+ *
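+ * Positions are indexes into the list of tokens that the analyzer produced
+ * for the current node.
+ *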
+ * @version $Revision$, $Date$
+ */
+public class NodeTermPositions extends NodeTermDocs implements TermPositions {
+ private int pos;
+ private String term;
+
+ public NodeTermPositions(NodeReader reader) {
+ super(reader);
+ }
+
+ public void seek(Term term) {
+ this.term = term.text();
+ super.seek(term);
+ }
+
+ public int nextPosition() throws IOException {
+ while (pos < nodes[num].size()) {
+ if ((nodes[num].get(pos)).equals(term)) {
+ // advance past the match so successive calls return successive occurrences
+ return pos++;
+ }
+ pos++;
+ }
+
+ return 0;
+ }
+
+ public boolean next() {
+ if (super.next()) {
+ pos = 0;
+ return true;
+ }
+
+ return false;
+ }
+
+ public int getPayloadLength() {
+ throw new UnsupportedOperationException();
+ }
+
+ public byte[] getPayload(byte[] data, int offset) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ public boolean isPayloadAvailable() {
+ return false;
+ }
+}
Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermPositions.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermPositions.java
------------------------------------------------------------------------------
svn:keywords = Id Revision Author Date
Added: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/Searcher.java
URL: http://svn.apache.org/viewvc/xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/Searcher.java?rev=584476&view=auto
==============================================================================
--- xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/Searcher.java (added)
+++ xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/Searcher.java Sat Oct 13 19:41:50 2007
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * $Id$
+ */
+
+package org.apache.xindice.core.query.ftsearch;
+
+import org.w3c.dom.NodeList;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.queryParser.ParseException;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.Query;
+import org.apache.xindice.core.data.NodeSet;
+
+import java.io.IOException;
+
+/**
+ * Searcher executes full text queries against a list of nodes and returns the
+ * nodes that match the query in order of relevance (score); the most relevant
+ * results are returned first.<br>
+ * <br>
+ * The query syntax is the same as the Lucene query syntax, except that it does
+ * not use field names.
+ *
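+ * A minimal usage sketch (the node list is a placeholder and StandardAnalyzer
+ * is only one possible analyzer):
+ * <pre>
+ *   NodeList nodes = ...;  // e.g. nodes selected from a DOM document
+ *   Searcher searcher = new Searcher(nodes, new StandardAnalyzer());
+ *   NodeSet results = searcher.search("apache AND lucene");
+ *   while (results.hasMoreNodes()) {
+ *       Node node = (Node) results.getNextNode();
+ *       // nodes arrive most relevant first
+ *   }
+ * </pre>
+ *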
+ * @version $Revision$, $Date$
+ */
+public class Searcher {
+ private NodeList nodes;
+ private Analyzer analyzer;
+
+ /**
+ * Builds a new Searcher from a list of nodes and an analyzer.
+ *
+ * @param nodes List of nodes to search
+ * @param analyzer Analyzer that will be used to tokenize the text of the
+ * nodes. The choice of analyzer affects query results.
+ * @see org.apache.lucene.analysis.Analyzer
+ */
+ public Searcher(NodeList nodes, Analyzer analyzer) {
+ this.nodes = nodes;
+ this.analyzer = analyzer;
+ }
+
+ /**
+ * Executes the query against the list of nodes and returns matches in order
+ * of relevance (score).
+ *
+ * @param query Full text query
+ * @return NodeSet that contains matching nodes
+ * @throws ParseException If the query cannot be parsed
+ */
+ public NodeSet search(String query) throws ParseException {
+ Query compQuery = new QueryParser("", analyzer).parse(query);
+ NodeReader reader = new NodeReader(nodes, analyzer);
+ IndexSearcher searcher = new IndexSearcher(reader);
+ Hits hits = null;
+ try {
+ hits = searcher.search(compQuery);
+ } catch (IOException e) {
+ // this searcher does not use file IO, exception won't happen
+ }
+
+ return new ResultSet(hits);
+ }
+
+ private class ResultSet implements NodeSet {
+ private Hits hits;
+ private int count;
+
+ private ResultSet(Hits hits) {
+ this.hits = hits;
+ }
+
+ public boolean hasMoreNodes() {
+ return count < hits.length();
+ }
+
+ public Object getNextNode() {
+ try {
+ return nodes.item(hits.id(count++));
+ } catch (IOException e) {
+ // won't happen: this searcher does not use file IO
+ }
+
+ return null;
+ }
+ }
+}
Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/Searcher.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/Searcher.java
------------------------------------------------------------------------------
svn:keywords = Id Revision Author Date