Author: natalia
Date: Sat Oct 13 19:41:50 2007
New Revision: 584476

URL: http://svn.apache.org/viewvc?rev=584476&view=rev
Log:
Lucene-based full text search in the content of DOM nodes
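A rough sketch of how the new ftsearch package is meant to be driven, based only on the Searcher API added below. The StandardAnalyzer choice, the FTSearchExample class name, the "para" element name and the sample query are illustrative assumptions, not part of this commit:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.xindice.core.data.NodeSet;
import org.apache.xindice.core.query.ftsearch.Searcher;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

public class FTSearchExample {
    public static void search(Document doc) throws ParseException {
        // Any NodeList works; "para" is just an illustrative element name.
        NodeList candidates = doc.getElementsByTagName("para");

        // The analyzer tokenizes the node text; its choice affects what matches.
        Searcher searcher = new Searcher(candidates, new StandardAnalyzer());

        // Lucene query syntax, minus field names (see the Searcher javadoc below).
        NodeSet results = searcher.search("apache AND license");

        // Matches come back ordered by Lucene score, most relevant first.
        while (results.hasMoreNodes()) {
            Node match = (Node) results.getNextNode();
            System.out.println(match.getNodeName());
        }
    }
}

Note that the index is built in memory from the given NodeList on every search() call (via NodeReader); nothing is written to disk.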
Added: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/ xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeReader.java (with props) xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermDocs.java (with props) xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermEnum.java (with props) xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermPositions.java (with props) xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/Searcher.java (with props) Added: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeReader.java URL: http://svn.apache.org/viewvc/xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeReader.java?rev=584476&view=auto ============================================================================== --- xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeReader.java (added) +++ xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeReader.java Sat Oct 13 19:41:50 2007 @@ -0,0 +1,286 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * $Id$ + */ + +package org.apache.xindice.core.query.ftsearch; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.TermFreqVector; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.TermPositions; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.FieldSelector; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.search.DefaultSimilarity; +import org.apache.xindice.xml.dom.NodeImpl; +import org.w3c.dom.NodeList; +import org.w3c.dom.Node; + +import java.io.IOException; +import java.io.StringReader; +import java.util.Collection; +import java.util.Arrays; +import java.util.ArrayList; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.HashSet; +import java.util.List; + +/** + * Implementation of IndexReader that works with set of DOM nodes in memory. + * Set of nodes is constant - no modifications of it are allowed. 
+ * + * @version $Revision$, $Date$ + */ +public class NodeReader extends IndexReader { + // list of DOM nodes + private final ArrayList[] nodes; + + private byte[] norms; + + // maps term to a list of documents where it appears + private final SortedMap termMap; + + protected NodeReader(NodeList list, Analyzer analyzer) { + super(null); + + nodes = new ArrayList[list.getLength()]; + + for (int i = 0; i < nodes.length; i++) { + nodes[i] = new ArrayList(); + Node node = list.item(i); + + String text; + if (node instanceof NodeImpl) { + // DOM Level 3 method + text = ((NodeImpl) node).getTextContent(); + } else { + text = getTextContent(node); + } + + TokenStream stream = analyzer.tokenStream("", new StringReader(text)); + try { + Token token; + while ((token = stream.next()) != null) { + nodes[i].add(token.termText()); + } + } catch (IOException e) { + // won't happen + } + } + + // init norms + norms = new byte[nodes.length]; + Arrays.fill(norms, DefaultSimilarity.encodeNorm(1.0f)); + + // build term enumeration + termMap = buildTermMap(); + } + + public TermFreqVector[] getTermFreqVectors(int docNumber) { + throw new UnsupportedOperationException(); + } + + public TermFreqVector getTermFreqVector(int docNumber, String field) { + throw new UnsupportedOperationException(); + } + + public int numDocs() { + return nodes.length; + } + + public int maxDoc() { + return nodes.length; + } + + /** + * Method is not supported. + */ + public Document document(int n, FieldSelector fieldSelector) { + return null; + } + + /** + * Deletion is not supported. + */ + public boolean isDeleted(int n) { + return false; + } + + /** + * Deletion is not supported. + */ + public boolean hasDeletions() { + return false; + } + + public byte[] norms(String field) throws IOException { + return field.length() == 0 ? norms : null; + } + + public void norms(String field, byte[] bytes, int offset) { + System.arraycopy(norms, 0, bytes, offset, maxDoc()); + } + + protected void doSetNorm(int doc, String field, byte value) { + if (field.length() > 0) { + return; + } + + norms[doc] = value; + } + + public TermEnum terms() { + return new NodeTermEnum(termMap); + } + + public TermEnum terms(Term t) { + return new NodeTermEnum(termMap, t); + } + + /** + * Builds the map of all the terms in all the nodes to the list of + * node numbers where those terms appear. + * @return Map with keys sorted in ascending order + */ + private SortedMap buildTermMap() { + SortedMap map = new TreeMap(); + + for (int i = 0; i < nodes.length; i++) { + for (int j = 0; j < nodes[i].size(); j++) { + String term = (String) nodes[i].get(j); + + List docs; + if (map.containsKey(term)) { + docs = (List) map.get(term); + } else { + docs = new ArrayList(); + } + + docs.add(new Integer(i)); + map.put(term, docs); + } + } + + return map; + } + + public int docFreq(Term t) { + List docs = (List) termMap.get(t.text()); + + // no such term + if (docs == null) { + return 0; + } + + HashSet set = new HashSet(); + set.addAll(docs); + return set.size(); + } + + public TermDocs termDocs() { + return new NodeTermDocs(this); + } + + public TermPositions termPositions() { + return new NodeTermPositions(this); + } + + /** + * Deletion is not supported. + */ + protected void doDelete(int docNum) { + throw new UnsupportedOperationException(); + } + + /** + * Deletion is not supported. 
+ */ + protected void doUndeleteAll() { + throw new UnsupportedOperationException(); + } + + /** + * Not applicable + */ + protected void doCommit() { + } + + /** + * Not applicable + */ + protected void doClose() { + } + + /** + * Field names are not supported. + */ + public Collection getFieldNames(FieldOption fldOption) { + throw new UnsupportedOperationException(); + } + + ArrayList[] getNodes() { + return nodes; + } + + SortedMap getTermMap() { + return termMap; + } + + /** + * Get text content of a DOM node. This is the same as DOM Level 3 method + * getTextContent(). + * @param node + * @return The text content of this node and its descendants. + */ + private String getTextContent(Node node) { + String text = null; + switch (node.getNodeType()) { + case Node.ATTRIBUTE_NODE: + case Node.CDATA_SECTION_NODE: + case Node.COMMENT_NODE: + case Node.PROCESSING_INSTRUCTION_NODE: + case Node.TEXT_NODE: + text = node.getNodeValue(); + break; + case Node.ELEMENT_NODE: + case Node.DOCUMENT_FRAGMENT_NODE: + case Node.ENTITY_NODE: + case Node.ENTITY_REFERENCE_NODE: + StringBuffer val = new StringBuffer(); + + NodeList children = node.getChildNodes(); + if (children == null || children.getLength() == 0) { + text = ""; + break; + } + + for (int i = 0; i < children.getLength(); i++) { + val.append(getTextContent(children.item(i))); + } + text = val.toString(); + break; + } + + return text; + } +} Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeReader.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeReader.java ------------------------------------------------------------------------------ svn:keywords = Id Revision Author Date Added: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermDocs.java URL: http://svn.apache.org/viewvc/xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermDocs.java?rev=584476&view=auto ============================================================================== --- xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermDocs.java (added) +++ xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermDocs.java Sat Oct 13 19:41:50 2007 @@ -0,0 +1,110 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * $Id$ + */ + +package org.apache.xindice.core.query.ftsearch; + +import org.apache.lucene.index.TermDocs; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermEnum; + +import java.util.ArrayList; +import java.util.Iterator; +import java.util.Map; +import java.util.List; + +/** + * TermDocs for searching over DOM nodes in memory + * + * @version $Revision$, $Date$ + */ +public class NodeTermDocs implements TermDocs { + protected int num; + protected ArrayList[] nodes; + protected Map termMap; + protected int[] freqs; + private boolean init; + + public NodeTermDocs(NodeReader reader) { + nodes = reader.getNodes(); + termMap = reader.getTermMap(); + + freqs = new int[nodes.length]; + } + + public void seek(Term term) { + List docs = (List) termMap.get(term.text()); + if (docs != null) { + for (Iterator i = docs.iterator(); i.hasNext(); ) { + int idx = ((Integer) i.next()).intValue(); + freqs[idx]++; + } + } + } + + public void seek(TermEnum termEnum) { + seek(termEnum.term()); + } + + public int doc() { + return num; + } + + public int freq() { + return freqs[num]; + } + + public boolean next() { + if (!init) { + init = true; + num = 0; + } else { + num++; + } + + while(num < nodes.length && freqs[num] == 0) { + num++; + } + + return num < nodes.length; + } + + public int read(int[] docs, int[] freqs) { + int count = 0; + for (int i = 0; i < docs.length && num < nodes.length; i++, num++) { + if (this.freqs[num] > 0) { + freqs[count] = this.freqs[num]; + docs[count] = num; + count++; + } + } + + return count; + } + + public boolean skipTo(int target) { + num = target; + return num < nodes.length; + } + + /** + * Not applicable + */ + public void close() { + } +} Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermDocs.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermDocs.java ------------------------------------------------------------------------------ svn:keywords = Id Revision Author Date Added: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermEnum.java URL: http://svn.apache.org/viewvc/xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermEnum.java?rev=584476&view=auto ============================================================================== --- xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermEnum.java (added) +++ xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermEnum.java Sat Oct 13 19:41:50 2007 @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * $Id$ + */ + +package org.apache.xindice.core.query.ftsearch; + +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.index.Term; + +import java.util.Iterator; +import java.util.SortedMap; +import java.util.HashSet; +import java.util.List; + +/** + * Ordered set of terms used by NodeReader. + * + * @version $Revision$, $Date$ + */ +public class NodeTermEnum extends TermEnum { + // map that backs this enumeration + private SortedMap termMap; + + private Iterator iterator; + + // value of the current term + private String curTerm; + + public NodeTermEnum(SortedMap termMap) { + this.termMap = termMap; + iterator = this.termMap.keySet().iterator(); + if (iterator.hasNext()) { + curTerm = (String) iterator.next(); + } + } + + public NodeTermEnum(SortedMap termMap, Term t) { + this(termMap.tailMap(t.text())); + } + + public boolean next() { + if (iterator.hasNext()) { + curTerm = (String) iterator.next(); + return true; + } + + return false; + } + + public Term term() { + if (curTerm != null) { + return new Term("", curTerm); + } + + return null; + } + + public int docFreq() { + List docs = (List) termMap.get(curTerm); + + HashSet set = new HashSet(); + set.addAll(docs); + return set.size(); + } + + /** + * Not applicable + */ + public void close() { + } +} Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermEnum.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermEnum.java ------------------------------------------------------------------------------ svn:keywords = Id Revision Author Date Added: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermPositions.java URL: http://svn.apache.org/viewvc/xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermPositions.java?rev=584476&view=auto ============================================================================== --- xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermPositions.java (added) +++ xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermPositions.java Sat Oct 13 19:41:50 2007 @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * $Id$ + */ + +package org.apache.xindice.core.query.ftsearch; + +import org.apache.lucene.index.TermPositions; +import org.apache.lucene.index.Term; + +import java.io.IOException; + +/** + * TermPositions for searching over DOM nodes in memory + * + * @version $Revision$, $Date$ + */ +public class NodeTermPositions extends NodeTermDocs implements TermPositions { + private int pos; + private String term; + + public NodeTermPositions(NodeReader reader) { + super(reader); + } + + public void seek(Term term) { + this.term = term.text(); + super.seek(term); + } + + public int nextPosition() throws IOException { + while (pos < nodes[num].size()) { + if ((nodes[num].get(pos)).equals(term)) { + return pos; + } + pos++; + } + + return 0; + } + + public boolean next() { + if (super.next()) { + pos = 0; + return true; + } + + return false; + } + + public int getPayloadLength() { + throw new UnsupportedOperationException(); + } + + public byte[] getPayload(byte[] data, int offset) throws IOException { + throw new UnsupportedOperationException(); + } + + public boolean isPayloadAvailable() { + return false; + } +} Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermPositions.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/NodeTermPositions.java ------------------------------------------------------------------------------ svn:keywords = Id Revision Author Date Added: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/Searcher.java URL: http://svn.apache.org/viewvc/xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/Searcher.java?rev=584476&view=auto ============================================================================== --- xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/Searcher.java (added) +++ xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/Searcher.java Sat Oct 13 19:41:50 2007 @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * $Id$ + */ + +package org.apache.xindice.core.query.ftsearch; + +import org.w3c.dom.NodeList; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.queryParser.QueryParser; +import org.apache.lucene.queryParser.ParseException; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.Query; +import org.apache.xindice.core.data.NodeSet; + +import java.io.IOException; + +/** + * Searcher executes full text queries against list of nodes and returns + * nodes that match the query in the order of relevance (score) - most relevant + * results will be return first.<br> + * <br> + * Query syntax is the same as syntax of Lucene query, except it does not use + * field names. + * + * @version $Revision$, $Date$ + */ +public class Searcher { + private NodeList nodes; + private Analyzer analyzer; + + /** + * Builds new Searcher based on list of nodes and analyzer. + * + * @param nodes List of nodes to search + * @param analyzer Analyzer that will be used to tokenize text of the nodes. + * Choice of analyzer affects query results. + * @see org.apache.lucene.analysis.Analyzer + */ + public Searcher(NodeList nodes, Analyzer analyzer) { + this.nodes = nodes; + this.analyzer = analyzer; + } + + /** + * Executes query against list of nodes and returns matches in the order + * of relevance (score). + * + * @param query Full text query + * @return NodeSet that contains matching nodes + * @throws ParseException Query failed to be parsed + */ + public NodeSet search(String query) throws ParseException { + Query compQuery = new QueryParser("", analyzer).parse(query); + NodeReader reader = new NodeReader(nodes, analyzer); + IndexSearcher searcher = new IndexSearcher(reader); + Hits hits = null; + try { + hits = searcher.search(compQuery); + } catch (IOException e) { + // this searcher does not use file IO, exception won't happen + } + + return new ResultSet(hits); + } + + private class ResultSet implements NodeSet { + private Hits hits; + private int count; + + private ResultSet(Hits hits) { + this.hits = hits; + } + + public boolean hasMoreNodes() { + return count < hits.length(); + } + + public Object getNextNode() { + try { + return nodes.item(hits.id(count++)); + } catch (IOException e) { + // ignore, does not use IO + } + + return null; + } + } +} Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/Searcher.java ------------------------------------------------------------------------------ svn:eol-style = native Propchange: xml/xindice/trunk/java/src/org/apache/xindice/core/query/ftsearch/Searcher.java ------------------------------------------------------------------------------ svn:keywords = Id Revision Author Date
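The Searcher javadoc above notes that the choice of Analyzer affects query results. As a minimal illustration of why (assuming Lucene's stock StandardAnalyzer and WhitespaceAnalyzer; the class name and sample text are made up for the example), the snippet below runs the same tokenStream loop that NodeReader applies to each node's text content:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

public class TokenizeSketch {
    // Same loop shape as the NodeReader constructor: one term list per node.
    static List tokenize(Analyzer analyzer, String text) throws IOException {
        List terms = new ArrayList();
        TokenStream stream = analyzer.tokenStream("", new StringReader(text));
        Token token;
        while ((token = stream.next()) != null) {
            terms.add(token.termText());
        }
        return terms;
    }

    public static void main(String[] args) throws IOException {
        String text = "The Apache Software Foundation";
        // StandardAnalyzer lower-cases and drops English stop words ("The"),
        // while WhitespaceAnalyzer keeps the raw whitespace-separated tokens,
        // so the same query can match under one analyzer and not the other.
        System.out.println(tokenize(new StandardAnalyzer(), text));
        System.out.println(tokenize(new WhitespaceAnalyzer(), text));
    }
}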