Author: vajda
Date: Fri Aug 26 14:54:59 2016
New Revision: 1757864

URL: http://svn.apache.org/viewvc?rev=1757864&view=rev
Log:
test_PositionIncrement.py passes
Added:
    lucene/pylucene/trunk/java/org/apache/pylucene/search/spans/PythonSpanCollector.java   (with props)

Removed:
    lucene/pylucene/trunk/test/MultiSpansWrapper.py

Modified:
    lucene/pylucene/trunk/extensions.xml
    lucene/pylucene/trunk/java/org/apache/pylucene/search/spans/PythonSpans.java
    lucene/pylucene/trunk/test/PyLuceneTestCase.py
    lucene/pylucene/trunk/test/test_PositionIncrement.py

Modified: lucene/pylucene/trunk/extensions.xml
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/extensions.xml?rev=1757864&r1=1757863&r2=1757864&view=diff
==============================================================================
--- lucene/pylucene/trunk/extensions.xml (original)
+++ lucene/pylucene/trunk/extensions.xml Fri Aug 26 14:54:59 2016
@@ -21,7 +21,7 @@
     <javac srcdir="java/org/apache/pylucene/util"
            destdir="${classes.dir}" classpathref="classpath" />
     <javac srcdir="java/org/apache/pylucene/search"
-           excludes="similarities/*,spans/*"
+           excludes="similarities/*"
            destdir="${classes.dir}" classpathref="classpath" />
     <javac srcdir="java/org/apache/pylucene/index"
            destdir="${classes.dir}" classpathref="classpath" />

Added: lucene/pylucene/trunk/java/org/apache/pylucene/search/spans/PythonSpanCollector.java
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/java/org/apache/pylucene/search/spans/PythonSpanCollector.java?rev=1757864&view=auto
==============================================================================
--- lucene/pylucene/trunk/java/org/apache/pylucene/search/spans/PythonSpanCollector.java (added)
+++ lucene/pylucene/trunk/java/org/apache/pylucene/search/spans/PythonSpanCollector.java Fri Aug 26 14:54:59 2016
@@ -0,0 +1,56 @@
+/* ====================================================================
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ====================================================================
+ */
+
+package org.apache.pylucene.search.spans;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.Term;
+
+import org.apache.lucene.search.spans.SpanCollector;
+
+
+public class PythonSpanCollector implements SpanCollector {
+
+    private long pythonObject;
+
+    public PythonSpanCollector()
+    {
+    }
+
+    public void pythonExtension(long pythonObject)
+    {
+        this.pythonObject = pythonObject;
+    }
+    public long pythonExtension()
+    {
+        return this.pythonObject;
+    }
+
+    public void finalize()
+        throws Throwable
+    {
+        pythonDecRef();
+    }
+
+    public native void pythonDecRef();
+
+    public native void collectLeaf(PostingsEnum postings, int position,
+                                   Term term)
+        throws IOException;
+
+    public native void reset();
+}

Propchange: lucene/pylucene/trunk/java/org/apache/pylucene/search/spans/PythonSpanCollector.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/java/org/apache/pylucene/search/spans/PythonSpanCollector.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: lucene/pylucene/trunk/java/org/apache/pylucene/search/spans/PythonSpans.java
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/java/org/apache/pylucene/search/spans/PythonSpans.java?rev=1757864&r1=1757863&r2=1757864&view=diff
==============================================================================
--- lucene/pylucene/trunk/java/org/apache/pylucene/search/spans/PythonSpans.java (original)
+++ lucene/pylucene/trunk/java/org/apache/pylucene/search/spans/PythonSpans.java Fri Aug 26 14:54:59 2016
@@ -16,9 +16,9 @@
 package org.apache.pylucene.search.spans;
 
 import java.io.IOException;
-import java.util.Collection;
 
 import org.apache.lucene.search.spans.Spans;
+import org.apache.lucene.search.spans.SpanCollector;
 
 
 public class PythonSpans extends Spans {
@@ -46,16 +46,18 @@ public class PythonSpans extends Spans {
 
     public native void pythonDecRef();
 
-    public native boolean next()
+    public native int docID();
+    public native int nextDoc()
         throws IOException;
-    public native boolean skipTo(int target)
-        throws IOException;
-    public native int doc();
-    public native int start();
-    public native int end();
-    public native Collection<byte[]> getPayload()
-        throws IOException;
-    public native boolean isPayloadAvailable()
+    public native int advance(int target)
         throws IOException;
     public native long cost();
+
+    public native int nextStartPosition();
+    public native int startPosition();
+    public native int endPosition();
+    public native int width();
+    public native void collect(SpanCollector collector)
+        throws IOException;
+    public native float positionsCost();
 }
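For reference, PythonSpans now mirrors the current Lucene Spans contract (docID/nextDoc/advance plus the position and collection methods) instead of the old next()/skipTo()/getPayload() API. The sketch below is illustrative only and is not part of this revision: a Python extension of PythonSpans that simply delegates to another Spans instance, showing the methods the new contract expects. The wrapper class and its delegate argument are assumptions made for the example.

from org.apache.pylucene.search.spans import PythonSpans

class _delegatingSpans(PythonSpans):
    # Illustrative wrapper; 'delegate' is any org.apache.lucene.search.spans.Spans.
    def __init__(_self, delegate):
        super(_delegatingSpans, _self).__init__()
        _self.delegate = delegate

    # DocIdSetIterator part of the contract
    def docID(_self):
        return _self.delegate.docID()
    def nextDoc(_self):
        return _self.delegate.nextDoc()
    def advance(_self, target):
        return _self.delegate.advance(target)
    def cost(_self):
        return _self.delegate.cost()

    # position iteration and span collection part of the contract
    def nextStartPosition(_self):
        return _self.delegate.nextStartPosition()
    def startPosition(_self):
        return _self.delegate.startPosition()
    def endPosition(_self):
        return _self.delegate.endPosition()
    def width(_self):
        return _self.delegate.width()
    def collect(_self, collector):
        _self.delegate.collect(collector)
    def positionsCost(_self):
        return _self.delegate.positionsCost()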
Modified: lucene/pylucene/trunk/test/PyLuceneTestCase.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/test/PyLuceneTestCase.py?rev=1757864&r1=1757863&r2=1757864&view=diff
==============================================================================
--- lucene/pylucene/trunk/test/PyLuceneTestCase.py (original)
+++ lucene/pylucene/trunk/test/PyLuceneTestCase.py Fri Aug 26 14:54:59 2016
@@ -61,13 +61,20 @@ class PyLuceneTestCase(TestCase):
             directory = self.directory
 
         return IndexWriter(directory, config)
-        
+
     def getSearcher(self, directory=None, reader=None):
         if reader is not None:
             return IndexSearcher(reader)
         return IndexSearcher(self.getReader(directory=directory))
-        
+
     def getReader(self, directory=None):
         if directory is None:
             directory = self.directory
 
         return DirectoryReader.open(directory)
+
+    def getOnlyLeafReader(self, reader):
+        subReaders = reader.leaves()
+        if subReaders.size() != 1:
+            raise ValueError(reader + " has " + subReaders.size() +
+                             " segments instead of exactly one")
+        return subReaders.get(0).reader()

Modified: lucene/pylucene/trunk/test/test_PositionIncrement.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/test/test_PositionIncrement.py?rev=1757864&r1=1757863&r2=1757864&view=diff
==============================================================================
--- lucene/pylucene/trunk/test/test_PositionIncrement.py (original)
+++ lucene/pylucene/trunk/test/test_PositionIncrement.py Fri Aug 26 14:54:59 2016
@@ -15,7 +15,6 @@
 import sys, lucene, unittest
 from lucene import JArray
 from PyLuceneTestCase import PyLuceneTestCase
-from MultiSpansWrapper import MultiSpansWrapper
 
 from java.io import StringReader
 from org.apache.lucene.analysis import Analyzer
@@ -25,15 +24,17 @@ from org.apache.lucene.analysis.tokenatt
     CharTermAttribute, OffsetAttribute, PayloadAttribute, \
     PositionIncrementAttribute
 from org.apache.lucene.document import Document, Field, TextField
-from org.apache.lucene.index import MultiFields, Term
+from org.apache.lucene.index import MultiFields, Term, PostingsEnum
 from org.apache.lucene.queryparser.classic import QueryParser
-from org.apache.lucene.search import MultiPhraseQuery, PhraseQuery
-from org.apache.lucene.search.payloads import PayloadSpanUtil
-from org.apache.lucene.search.spans import SpanNearQuery, SpanTermQuery
-from org.apache.lucene.util import BytesRef, Version
+from org.apache.lucene.search import \
+    MultiPhraseQuery, PhraseQuery, DocIdSetIterator
+from org.apache.lucene.search.spans import \
+    Spans, SpanNearQuery, SpanTermQuery, SpanWeight
+from org.apache.lucene.util import BytesRef
 from org.apache.pylucene.analysis import \
     PythonAnalyzer, PythonFilteringTokenFilter, PythonTokenFilter, \
     PythonTokenizer
+from org.apache.pylucene.search.spans import PythonSpanCollector
 
 
 class PositionIncrementTestCase(PyLuceneTestCase):
@@ -43,8 +44,8 @@ class PositionIncrementTestCase(PyLucene
 
     def testSetPosition(self):
 
         class _tokenizer(PythonTokenizer):
-            def __init__(_self, reader):
-                super(_tokenizer, _self).__init__(reader)
+            def __init__(_self):
+                super(_tokenizer, _self).__init__()
                 _self.TOKENS = ["1", "2", "3", "4", "5"]
                 _self.INCREMENTS = [1, 2, 1, 0, 1]
@@ -65,16 +66,13 @@ class PositionIncrementTestCase(PyLucene
 
                 return True
 
-            def end(_self):
-                pass
             def reset(_self):
-                pass
-            def close(_self):
-                pass
+                super(_tokenizer, _self).reset()
+                _self.i = 0
 
         class _analyzer(PythonAnalyzer):
-            def createComponents(_self, fieldName, reader):
-                return Analyzer.TokenStreamComponents(_tokenizer(reader))
+            def createComponents(_self, fieldName):
+                return Analyzer.TokenStreamComponents(_tokenizer())
 
         writer = self.getWriter(analyzer=_analyzer())
@@ -87,95 +85,95 @@ class PositionIncrementTestCase(PyLucene
 
         searcher = self.getSearcher()
         reader = searcher.getIndexReader()
 
-        pos = MultiFields.getTermPositionsEnum(reader, MultiFields.getLiveDocs(reader), "field", BytesRef("1"))
+        pos = MultiFields.getTermPositionsEnum(reader, "field", BytesRef("1"))
         pos.nextDoc()
         # first token should be at position 0
         self.assertEqual(0, pos.nextPosition())
-        
-        pos = MultiFields.getTermPositionsEnum(reader, MultiFields.getLiveDocs(reader), "field", BytesRef("2"))
+
+        pos = MultiFields.getTermPositionsEnum(reader, "field", BytesRef("2"))
         pos.nextDoc()
         # second token should be at position 2
        self.assertEqual(2, pos.nextPosition())
-        
-        q = PhraseQuery()
-        q.add(Term("field", "1"))
-        q.add(Term("field", "2"))
-        hits = searcher.search(q, None, 1000).scoreDocs
+
+        b = PhraseQuery.Builder()
+        b.add(Term("field", "1"))
+        b.add(Term("field", "2"))
+        hits = searcher.search(b.build(), 1000).scoreDocs
         self.assertEqual(0, len(hits))
 
         # same as previous, just specify positions explicitely.
-        q = PhraseQuery()
-        q.add(Term("field", "1"), 0)
-        q.add(Term("field", "2"), 1)
-        hits = searcher.search(q, None, 1000).scoreDocs
+        b = PhraseQuery.Builder()
+        b.add(Term("field", "1"), 0)
+        b.add(Term("field", "2"), 1)
+        hits = searcher.search(b.build(), 1000).scoreDocs
         self.assertEqual(0, len(hits))
 
         # specifying correct positions should find the phrase.
-        q = PhraseQuery()
-        q.add(Term("field", "1"), 0)
-        q.add(Term("field", "2"), 2)
-        hits = searcher.search(q, None, 1000).scoreDocs
-        self.assertEqual(1, len(hits))
-
-        q = PhraseQuery()
-        q.add(Term("field", "2"))
-        q.add(Term("field", "3"))
-        hits = searcher.search(q, None, 1000).scoreDocs
-        self.assertEqual(1, len(hits))
-
-        q = PhraseQuery()
-        q.add(Term("field", "3"))
-        q.add(Term("field", "4"))
-        hits = searcher.search(q, None, 1000).scoreDocs
+        b = PhraseQuery.Builder()
+        b.add(Term("field", "1"), 0)
+        b.add(Term("field", "2"), 2)
+        hits = searcher.search(b.build(), 1000).scoreDocs
+        self.assertEqual(1, len(hits))
+
+        b = PhraseQuery.Builder()
+        b.add(Term("field", "2"))
+        b.add(Term("field", "3"))
+        hits = searcher.search(b.build(), 1000).scoreDocs
+        self.assertEqual(1, len(hits))
+
+        b = PhraseQuery.Builder()
+        b.add(Term("field", "3"))
+        b.add(Term("field", "4"))
+        hits = searcher.search(b.build(), 1000).scoreDocs
         self.assertEqual(0, len(hits))
 
         # phrase query would find it when correct positions are specified.
-        q = PhraseQuery()
-        q.add(Term("field", "3"), 0)
-        q.add(Term("field", "4"), 0)
-        hits = searcher.search(q, None, 1000).scoreDocs
+        b = PhraseQuery.Builder()
+        b.add(Term("field", "3"), 0)
+        b.add(Term("field", "4"), 0)
+        hits = searcher.search(b.build(), 1000).scoreDocs
         self.assertEqual(1, len(hits))
 
         # phrase query should fail for non existing searched term
         # even if there exist another searched terms in the same searched
         # position.
-        q = PhraseQuery()
-        q.add(Term("field", "3"), 0)
-        q.add(Term("field", "9"), 0)
-        hits = searcher.search(q, None, 1000).scoreDocs
+        b = PhraseQuery.Builder()
+        b.add(Term("field", "3"), 0)
+        b.add(Term("field", "9"), 0)
+        hits = searcher.search(b.build(), 1000).scoreDocs
         self.assertEqual(0, len(hits))
 
         # multi-phrase query should succed for non existing searched term
         # because there exist another searched terms in the same searched
         # position.
-        mq = MultiPhraseQuery()
-        mq.add([Term("field", "3"), Term("field", "9")], 0)
-        hits = searcher.search(mq, None, 1000).scoreDocs
+        b = MultiPhraseQuery.Builder()
+        b.add([Term("field", "3"), Term("field", "9")], 0)
+        hits = searcher.search(b.build(), 1000).scoreDocs
         self.assertEqual(1, len(hits))
 
-        q = PhraseQuery()
-        q.add(Term("field", "2"))
-        q.add(Term("field", "4"))
-        hits = searcher.search(q, None, 1000).scoreDocs
+        b = PhraseQuery.Builder()
+        b.add(Term("field", "2"))
+        b.add(Term("field", "4"))
+        hits = searcher.search(b.build(), 1000).scoreDocs
         self.assertEqual(1, len(hits))
 
-        q = PhraseQuery()
-        q.add(Term("field", "3"))
-        q.add(Term("field", "5"))
-        hits = searcher.search(q, None, 1000).scoreDocs
+        b = PhraseQuery.Builder()
+        b.add(Term("field", "3"))
+        b.add(Term("field", "5"))
+        hits = searcher.search(b.build(), 1000).scoreDocs
         self.assertEqual(1, len(hits))
 
-        q = PhraseQuery()
-        q.add(Term("field", "4"))
-        q.add(Term("field", "5"))
-        hits = searcher.search(q, None, 1000).scoreDocs
+        b = PhraseQuery.Builder()
+        b.add(Term("field", "4"))
+        b.add(Term("field", "5"))
+        hits = searcher.search(b.build(), 1000).scoreDocs
         self.assertEqual(1, len(hits))
 
-        q = PhraseQuery()
-        q.add(Term("field", "2"))
-        q.add(Term("field", "5"))
-        hits = searcher.search(q, None, 1000).scoreDocs
+        b = PhraseQuery.Builder()
+        b.add(Term("field", "2"))
+        b.add(Term("field", "5"))
+        hits = searcher.search(b.build(), 1000).scoreDocs
         self.assertEqual(0, len(hits))
 
     def testPayloadsPos0(self):
@@ -186,97 +184,82 @@ class PositionIncrementTestCase(PyLucene
         doc.add(Field("content", "a a b c d e a f g h i j a b k k", TextField.TYPE_STORED))
         writer.addDocument(doc)
 
-        reader = writer.getReader()
+        reader = self.getOnlyLeafReader(writer.getReader())
         writer.close()
 
-        tp = MultiFields.getTermPositionsEnum(reader,
-                                              MultiFields.getLiveDocs(reader),
-                                              "content", BytesRef("a"))
+        tp = reader.postings(Term("content", "a"), PostingsEnum.ALL)
         count = 0
-        self.assert_(tp.nextDoc() != tp.NO_MORE_DOCS)
+        self.assert_(tp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
+
         # "a" occurs 4 times
         self.assertEqual(4, tp.freq())
-        
-        expected = 0
-        self.assertEqual(expected, tp.nextPosition())
+        self.assertEqual(0, tp.nextPosition())
         self.assertEqual(1, tp.nextPosition())
         self.assertEqual(3, tp.nextPosition())
         self.assertEqual(6, tp.nextPosition())
 
         # only one doc has "a"
-        self.assert_(tp.nextDoc() == tp.NO_MORE_DOCS)
+        self.assertEqual(DocIdSetIterator.NO_MORE_DOCS, tp.nextDoc())
+
+        searcher = self.getSearcher()
 
-        searcher = self.getSearcher(reader=reader)
-        
         stq1 = SpanTermQuery(Term("content", "a"))
         stq2 = SpanTermQuery(Term("content", "k"))
-        sqs = [stq1, stq2]
+        sqs = [stq1, stq2 ]
         snq = SpanNearQuery(sqs, 30, False)
 
-        count = 0
-        sawZero = False
-        pspans = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), snq)
-        while pspans.next():
-            payloads = pspans.getPayload()
-            sawZero |= pspans.start() == 0
+        count = 0;
+        collector = PayloadSpanCollector()
+        pspans = snq.createWeight(searcher, False).getSpans(
+            searcher.getIndexReader().leaves().get(0),
+            SpanWeight.Postings.PAYLOADS)
 
-            it = payloads.iterator()
-            while it.hasNext():
-                count += 1
-                it.next()
+        sawZero = False
+        while pspans.nextDoc() != Spans.NO_MORE_DOCS:
+            while pspans.nextStartPosition() != Spans.NO_MORE_POSITIONS:
+                collector.reset()
+                pspans.collect(collector)
+                sawZero = sawZero or pspans.startPosition() == 0
+                for payload in collector.payloads:
+                    count += 1
 
-        self.assertEqual(5, count)
         self.assert_(sawZero)
+        self.assertEquals(8, count)
 
-        spans = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), snq)
+        spans = snq.createWeight(searcher, False).getSpans(
+            searcher.getIndexReader().leaves().get(0),
+            SpanWeight.Postings.POSITIONS)
 
         count = 0
-        sawZero = False
-        while spans.next():
-            count += 1
-            sawZero |= spans.start() == 0
-        self.assertEqual(4, count)
-        self.assert_(sawZero)
-        sawZero = False
-        psu = PayloadSpanUtil(searcher.getTopReaderContext())
-        pls = psu.getPayloadsForQuery(snq)
-        count = pls.size()
-        it = pls.iterator()
-        while it.hasNext():
-            bytes = JArray('byte').cast_(it.next())
-            s = bytes.string_
-            sawZero |= s == "pos: 0"
+        while spans.nextDoc() != Spans.NO_MORE_DOCS:
+            while spans.nextStartPosition() != Spans.NO_MORE_POSITIONS:
+                count += 1
+                sawZero = sawZero or spans.startPosition() == 0
 
-        self.assertEqual(5, count)
+        self.assertEquals(4, count)
         self.assert_(sawZero)
 
 
-class StopWhitespaceAnalyzer(PythonAnalyzer):
-
-    def __init__(self, enablePositionIncrements):
-        super(StopWhitespaceAnalyzer, self).__init__()
-
-        self.enablePositionIncrements = enablePositionIncrements
-
-    def createComponents(self, fieldName, reader):
+class PayloadSpanCollector(PythonSpanCollector):
 
-        class _stopFilter(PythonFilteringTokenFilter):
-            def __init__(_self, tokenStream):
-                super(_stopFilter, _self).__init__(Version.LUCENE_CURRENT, tokenStream)
-                _self.termAtt = _self.addAttribute(CharTermAttribute.class_);
-            def accept(_self):
-                return _self.termAtt.toString() != "stop"
+    def __init__(_self):
+        super(PayloadSpanCollector, _self).__init__()
+        _self.payloads = []
+
+    def collectLeaf(_self, postings, position, term):
+        if postings.getPayload() is not None:
+            _self.payloads.append(BytesRef.deepCopyOf(postings.getPayload()))
 
-        source = WhitespaceTokenizer(Version.LUCENE_CURRENT, reader)
-        return Analyzer.TokenStreamComponents(source, _stopFilter(source))
+    def reset(_self):
+        del _self.payloads[:]
 
 
 class TestPayloadAnalyzer(PythonAnalyzer):
 
-    def createComponents(self, fieldName, reader):
-        source = LowerCaseTokenizer(Version.LUCENE_CURRENT, reader)
+    def createComponents(self, fieldName):
+        source = LowerCaseTokenizer()
 
         return Analyzer.TokenStreamComponents(source,
                                               PayloadFilter(source, fieldName))
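For reference, the pieces above fit together as in the following sketch, condensed from the updated testPayloadsPos0. It is not part of this revision; `searcher` (an IndexSearcher over a single-segment index whose tokens carry payloads) and `snq` (a SpanNearQuery) are assumed to be set up as in the test.

from org.apache.lucene.search.spans import Spans, SpanWeight
from org.apache.lucene.util import BytesRef
from org.apache.pylucene.search.spans import PythonSpanCollector

class _payloadCollector(PythonSpanCollector):
    # Keeps a copy of the payload of every position handed to collectLeaf().
    def __init__(_self):
        super(_payloadCollector, _self).__init__()
        _self.payloads = []
    def collectLeaf(_self, postings, position, term):
        payload = postings.getPayload()
        if payload is not None:
            _self.payloads.append(BytesRef.deepCopyOf(payload))
    def reset(_self):
        del _self.payloads[:]

collector = _payloadCollector()
spans = snq.createWeight(searcher, False).getSpans(
    searcher.getIndexReader().leaves().get(0),
    SpanWeight.Postings.PAYLOADS)

while spans.nextDoc() != Spans.NO_MORE_DOCS:
    while spans.nextStartPosition() != Spans.NO_MORE_POSITIONS:
        collector.reset()
        spans.collect(collector)
        # collector.payloads now holds the payloads of the current span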