Author: vajda
Date: Fri Aug 26 14:54:59 2016
New Revision: 1757864

URL: http://svn.apache.org/viewvc?rev=1757864&view=rev
Log:
test_PositionIncrement.py passes
Added:
    lucene/pylucene/trunk/java/org/apache/pylucene/search/spans/PythonSpanCollector.java   (with props)

Removed:
    lucene/pylucene/trunk/test/MultiSpansWrapper.py

Modified:
    lucene/pylucene/trunk/extensions.xml
    lucene/pylucene/trunk/java/org/apache/pylucene/search/spans/PythonSpans.java
    lucene/pylucene/trunk/test/PyLuceneTestCase.py
    lucene/pylucene/trunk/test/test_PositionIncrement.py

Modified: lucene/pylucene/trunk/extensions.xml
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/extensions.xml?rev=1757864&r1=1757863&r2=1757864&view=diff
==============================================================================
--- lucene/pylucene/trunk/extensions.xml (original)
+++ lucene/pylucene/trunk/extensions.xml Fri Aug 26 14:54:59 2016
@@ -21,7 +21,7 @@
     <javac srcdir="java/org/apache/pylucene/util"
            destdir="${classes.dir}" classpathref="classpath" />
     <javac srcdir="java/org/apache/pylucene/search"
-           excludes="similarities/*,spans/*"
+           excludes="similarities/*"
            destdir="${classes.dir}" classpathref="classpath" />
     <javac srcdir="java/org/apache/pylucene/index"
            destdir="${classes.dir}" classpathref="classpath" />

Added: lucene/pylucene/trunk/java/org/apache/pylucene/search/spans/PythonSpanCollector.java
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/java/org/apache/pylucene/search/spans/PythonSpanCollector.java?rev=1757864&view=auto
==============================================================================
--- lucene/pylucene/trunk/java/org/apache/pylucene/search/spans/PythonSpanCollector.java (added)
+++ lucene/pylucene/trunk/java/org/apache/pylucene/search/spans/PythonSpanCollector.java Fri Aug 26 14:54:59 2016
@@ -0,0 +1,56 @@
+/* ====================================================================
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ====================================================================
+ */
+
+package org.apache.pylucene.search.spans;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.Term;
+
+import org.apache.lucene.search.spans.SpanCollector;
+
+
+public class PythonSpanCollector implements SpanCollector {
+
+    private long pythonObject;
+
+    public PythonSpanCollector()
+    {
+    }
+
+    public void pythonExtension(long pythonObject)
+    {
+        this.pythonObject = pythonObject;
+    }
+    public long pythonExtension()
+    {
+        return this.pythonObject;
+    }
+
+    public void finalize()
+        throws Throwable
+    {
+        pythonDecRef();
+    }
+
+    public native void pythonDecRef();
+
+    public native void collectLeaf(PostingsEnum postings, int position,
+                                   Term term)
+        throws IOException;
+
+    public native void reset();
+}

Propchange: lucene/pylucene/trunk/java/org/apache/pylucene/search/spans/PythonSpanCollector.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/pylucene/trunk/java/org/apache/pylucene/search/spans/PythonSpanCollector.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: lucene/pylucene/trunk/java/org/apache/pylucene/search/spans/PythonSpans.java
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/java/org/apache/pylucene/search/spans/PythonSpans.java?rev=1757864&r1=1757863&r2=1757864&view=diff
==============================================================================
--- lucene/pylucene/trunk/java/org/apache/pylucene/search/spans/PythonSpans.java (original)
+++ lucene/pylucene/trunk/java/org/apache/pylucene/search/spans/PythonSpans.java Fri Aug 26 14:54:59 2016
@@ -16,9 +16,9 @@
 package org.apache.pylucene.search.spans;
 
 import java.io.IOException;
-import java.util.Collection;
 
 import org.apache.lucene.search.spans.Spans;
+import org.apache.lucene.search.spans.SpanCollector;
 
 
 public class PythonSpans extends Spans {
@@ -46,16 +46,18 @@ public class PythonSpans extends Spans {
 
     public native void pythonDecRef();
 
-    public native boolean next()
+    public native int docID();
+    public native int nextDoc()
         throws IOException;
-    public native boolean skipTo(int target)
-        throws IOException;
-    public native int doc();
-    public native int start();
-    public native int end();
-    public native Collection<byte[]> getPayload()
-        throws IOException;
-    public native boolean isPayloadAvailable()
+    public native int advance(int target)
         throws IOException;
     public native long cost();
+
+    public native int nextStartPosition();
+    public native int startPosition();
+    public native int endPosition();
+    public native int width();
+    public native void collect(SpanCollector collector)
+        throws IOException;
+    public native float positionsCost();
 }
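For reference, PythonSpans now mirrors the current Lucene Spans contract (docID/nextDoc/advance plus the position and collection methods) instead of the old next()/skipTo()/getPayload() API. The sketch below is illustrative only and is not part of this revision: a Python extension of PythonSpans that simply delegates to another Spans instance, showing the methods the new contract expects. The wrapper class and its delegate argument are assumptions made for the example.

from org.apache.pylucene.search.spans import PythonSpans

class _delegatingSpans(PythonSpans):
    # Illustrative wrapper; 'delegate' is any org.apache.lucene.search.spans.Spans.
    def __init__(_self, delegate):
        super(_delegatingSpans, _self).__init__()
        _self.delegate = delegate

    # DocIdSetIterator part of the contract
    def docID(_self):
        return _self.delegate.docID()
    def nextDoc(_self):
        return _self.delegate.nextDoc()
    def advance(_self, target):
        return _self.delegate.advance(target)
    def cost(_self):
        return _self.delegate.cost()

    # position iteration and span collection part of the contract
    def nextStartPosition(_self):
        return _self.delegate.nextStartPosition()
    def startPosition(_self):
        return _self.delegate.startPosition()
    def endPosition(_self):
        return _self.delegate.endPosition()
    def width(_self):
        return _self.delegate.width()
    def collect(_self, collector):
        _self.delegate.collect(collector)
    def positionsCost(_self):
        return _self.delegate.positionsCost()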
Modified: lucene/pylucene/trunk/test/PyLuceneTestCase.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/test/PyLuceneTestCase.py?rev=1757864&r1=1757863&r2=1757864&view=diff
==============================================================================
--- lucene/pylucene/trunk/test/PyLuceneTestCase.py (original)
+++ lucene/pylucene/trunk/test/PyLuceneTestCase.py Fri Aug 26 14:54:59 2016
@@ -61,13 +61,20 @@ class PyLuceneTestCase(TestCase):
             directory = self.directory
 
         return IndexWriter(directory, config)
-        
+
     def getSearcher(self, directory=None, reader=None):
         if reader is not None:
             return IndexSearcher(reader)
         return IndexSearcher(self.getReader(directory=directory))
-        
+
     def getReader(self, directory=None):
         if directory is None:
             directory = self.directory
 
         return DirectoryReader.open(directory)
+
+    def getOnlyLeafReader(self, reader):
+        subReaders = reader.leaves()
+        if subReaders.size() != 1:
+            raise ValueError(reader + " has " + subReaders.size() +
+                             " segments instead of exactly one")
+        return subReaders.get(0).reader()

Modified: lucene/pylucene/trunk/test/test_PositionIncrement.py
URL: http://svn.apache.org/viewvc/lucene/pylucene/trunk/test/test_PositionIncrement.py?rev=1757864&r1=1757863&r2=1757864&view=diff
==============================================================================
--- lucene/pylucene/trunk/test/test_PositionIncrement.py (original)
+++ lucene/pylucene/trunk/test/test_PositionIncrement.py Fri Aug 26 14:54:59 2016
@@ -15,7 +15,6 @@
 import sys, lucene, unittest
 from lucene import JArray
 from PyLuceneTestCase import PyLuceneTestCase
-from MultiSpansWrapper import MultiSpansWrapper
 
 from java.io import StringReader
 from org.apache.lucene.analysis import Analyzer
@@ -25,15 +24,17 @@ from org.apache.lucene.analysis.tokenatt
     CharTermAttribute, OffsetAttribute, PayloadAttribute, \
     PositionIncrementAttribute
 from org.apache.lucene.document import Document, Field, TextField
-from org.apache.lucene.index import MultiFields, Term
+from org.apache.lucene.index import MultiFields, Term, PostingsEnum
 from org.apache.lucene.queryparser.classic import QueryParser
-from org.apache.lucene.search import MultiPhraseQuery, PhraseQuery
-from org.apache.lucene.search.payloads import PayloadSpanUtil
-from org.apache.lucene.search.spans import SpanNearQuery, SpanTermQuery
-from org.apache.lucene.util import BytesRef, Version
+from org.apache.lucene.search import \
+    MultiPhraseQuery, PhraseQuery, DocIdSetIterator
+from org.apache.lucene.search.spans import \
+    Spans, SpanNearQuery, SpanTermQuery, SpanWeight
+from org.apache.lucene.util import BytesRef
 from org.apache.pylucene.analysis import \
     PythonAnalyzer, PythonFilteringTokenFilter, PythonTokenFilter, \
     PythonTokenizer
+from org.apache.pylucene.search.spans import PythonSpanCollector
 
 
 class PositionIncrementTestCase(PyLuceneTestCase):
@@ -43,8 +44,8 @@ class PositionIncrementTestCase(PyLucene
 
     def testSetPosition(self):
 
         class _tokenizer(PythonTokenizer):
-            def __init__(_self, reader):
-                super(_tokenizer, _self).__init__(reader)
+            def __init__(_self):
+                super(_tokenizer, _self).__init__()
                 _self.TOKENS = ["1", "2", "3", "4", "5"]
                 _self.INCREMENTS = [1, 2, 1, 0, 1]
@@ -65,16 +66,13 @@ class PositionIncrementTestCase(PyLucene
 
                 return True
 
-            def end(_self):
-                pass
             def reset(_self):
-                pass
-            def close(_self):
-                pass
+                super(_tokenizer, _self).reset()
+                _self.i = 0
 
         class _analyzer(PythonAnalyzer):
-            def createComponents(_self, fieldName, reader):
-                return Analyzer.TokenStreamComponents(_tokenizer(reader))
+            def createComponents(_self, fieldName):
+                return Analyzer.TokenStreamComponents(_tokenizer())
 
         writer = self.getWriter(analyzer=_analyzer())
@@ -87,95 +85,95 @@ class PositionIncrementTestCase(PyLucene
 
         searcher = self.getSearcher()
         reader = searcher.getIndexReader()
 
-        pos = MultiFields.getTermPositionsEnum(reader, MultiFields.getLiveDocs(reader), "field", BytesRef("1"))
+        pos = MultiFields.getTermPositionsEnum(reader, "field", BytesRef("1"))
         pos.nextDoc()
         # first token should be at position 0
         self.assertEqual(0, pos.nextPosition())
-        
-        pos = MultiFields.getTermPositionsEnum(reader, MultiFields.getLiveDocs(reader), "field", BytesRef("2"))
+
+        pos = MultiFields.getTermPositionsEnum(reader, "field", BytesRef("2"))
         pos.nextDoc()
         # second token should be at position 2
        self.assertEqual(2, pos.nextPosition())
-        
-        q = PhraseQuery()
-        q.add(Term("field", "1"))
-        q.add(Term("field", "2"))
-        hits = searcher.search(q, None, 1000).scoreDocs
+
+        b = PhraseQuery.Builder()
+        b.add(Term("field", "1"))
+        b.add(Term("field", "2"))
+        hits = searcher.search(b.build(), 1000).scoreDocs
         self.assertEqual(0, len(hits))
 
         # same as previous, just specify positions explicitely.
-        q = PhraseQuery()
-        q.add(Term("field", "1"), 0)
-        q.add(Term("field", "2"), 1)
-        hits = searcher.search(q, None, 1000).scoreDocs
+        b = PhraseQuery.Builder()
+        b.add(Term("field", "1"), 0)
+        b.add(Term("field", "2"), 1)
+        hits = searcher.search(b.build(), 1000).scoreDocs
         self.assertEqual(0, len(hits))
 
         # specifying correct positions should find the phrase.
-        q = PhraseQuery()
-        q.add(Term("field", "1"), 0)
-        q.add(Term("field", "2"), 2)
-        hits = searcher.search(q, None, 1000).scoreDocs
-        self.assertEqual(1, len(hits))
-
-        q = PhraseQuery()
-        q.add(Term("field", "2"))
-        q.add(Term("field", "3"))
-        hits = searcher.search(q, None, 1000).scoreDocs
-        self.assertEqual(1, len(hits))
-
-        q = PhraseQuery()
-        q.add(Term("field", "3"))
-        q.add(Term("field", "4"))
-        hits = searcher.search(q, None, 1000).scoreDocs
+        b = PhraseQuery.Builder()
+        b.add(Term("field", "1"), 0)
+        b.add(Term("field", "2"), 2)
+        hits = searcher.search(b.build(), 1000).scoreDocs
+        self.assertEqual(1, len(hits))
+
+        b = PhraseQuery.Builder()
+        b.add(Term("field", "2"))
+        b.add(Term("field", "3"))
+        hits = searcher.search(b.build(), 1000).scoreDocs
+        self.assertEqual(1, len(hits))
+
+        b = PhraseQuery.Builder()
+        b.add(Term("field", "3"))
+        b.add(Term("field", "4"))
+        hits = searcher.search(b.build(), 1000).scoreDocs
         self.assertEqual(0, len(hits))
 
         # phrase query would find it when correct positions are specified.
-        q = PhraseQuery()
-        q.add(Term("field", "3"), 0)
-        q.add(Term("field", "4"), 0)
-        hits = searcher.search(q, None, 1000).scoreDocs
+        b = PhraseQuery.Builder()
+        b.add(Term("field", "3"), 0)
+        b.add(Term("field", "4"), 0)
+        hits = searcher.search(b.build(), 1000).scoreDocs
         self.assertEqual(1, len(hits))
 
         # phrase query should fail for non existing searched term
         # even if there exist another searched terms in the same searched
         # position.
-        q = PhraseQuery()
-        q.add(Term("field", "3"), 0)
-        q.add(Term("field", "9"), 0)
-        hits = searcher.search(q, None, 1000).scoreDocs
+        b = PhraseQuery.Builder()
+        b.add(Term("field", "3"), 0)
+        b.add(Term("field", "9"), 0)
+        hits = searcher.search(b.build(), 1000).scoreDocs
         self.assertEqual(0, len(hits))
 
         # multi-phrase query should succed for non existing searched term
         # because there exist another searched terms in the same searched
         # position.
-        mq = MultiPhraseQuery()
-        mq.add([Term("field", "3"), Term("field", "9")], 0)
-        hits = searcher.search(mq, None, 1000).scoreDocs
+        b = MultiPhraseQuery.Builder()
+        b.add([Term("field", "3"), Term("field", "9")], 0)
+        hits = searcher.search(b.build(), 1000).scoreDocs
         self.assertEqual(1, len(hits))
 
-        q = PhraseQuery()
-        q.add(Term("field", "2"))
-        q.add(Term("field", "4"))
-        hits = searcher.search(q, None, 1000).scoreDocs
+        b = PhraseQuery.Builder()
+        b.add(Term("field", "2"))
+        b.add(Term("field", "4"))
+        hits = searcher.search(b.build(), 1000).scoreDocs
         self.assertEqual(1, len(hits))
 
-        q = PhraseQuery()
-        q.add(Term("field", "3"))
-        q.add(Term("field", "5"))
-        hits = searcher.search(q, None, 1000).scoreDocs
+        b = PhraseQuery.Builder()
+        b.add(Term("field", "3"))
+        b.add(Term("field", "5"))
+        hits = searcher.search(b.build(), 1000).scoreDocs
         self.assertEqual(1, len(hits))
 
-        q = PhraseQuery()
-        q.add(Term("field", "4"))
-        q.add(Term("field", "5"))
-        hits = searcher.search(q, None, 1000).scoreDocs
+        b = PhraseQuery.Builder()
+        b.add(Term("field", "4"))
+        b.add(Term("field", "5"))
+        hits = searcher.search(b.build(), 1000).scoreDocs
         self.assertEqual(1, len(hits))
 
-        q = PhraseQuery()
-        q.add(Term("field", "2"))
-        q.add(Term("field", "5"))
-        hits = searcher.search(q, None, 1000).scoreDocs
+        b = PhraseQuery.Builder()
+        b.add(Term("field", "2"))
+        b.add(Term("field", "5"))
+        hits = searcher.search(b.build(), 1000).scoreDocs
         self.assertEqual(0, len(hits))
 
     def testPayloadsPos0(self):
@@ -186,97 +184,82 @@ class PositionIncrementTestCase(PyLucene
         doc.add(Field("content", "a a b c d e a f g h i j a b k k", TextField.TYPE_STORED))
         writer.addDocument(doc)
 
-        reader = writer.getReader()
+        reader = self.getOnlyLeafReader(writer.getReader())
         writer.close()
 
-        tp = MultiFields.getTermPositionsEnum(reader,
-                                              MultiFields.getLiveDocs(reader),
-                                              "content", BytesRef("a"))
+        tp = reader.postings(Term("content", "a"), PostingsEnum.ALL)
         count = 0
-        self.assert_(tp.nextDoc() != tp.NO_MORE_DOCS)
+        self.assert_(tp.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
+
         # "a" occurs 4 times
         self.assertEqual(4, tp.freq())
-        
-        expected = 0
-        self.assertEqual(expected, tp.nextPosition())
+        self.assertEqual(0, tp.nextPosition())
         self.assertEqual(1, tp.nextPosition())
         self.assertEqual(3, tp.nextPosition())
         self.assertEqual(6, tp.nextPosition())
 
         # only one doc has "a"
-        self.assert_(tp.nextDoc() == tp.NO_MORE_DOCS)
+        self.assertEqual(DocIdSetIterator.NO_MORE_DOCS, tp.nextDoc())
+
+        searcher = self.getSearcher()
 
-        searcher = self.getSearcher(reader=reader)
-        
         stq1 = SpanTermQuery(Term("content", "a"))
         stq2 = SpanTermQuery(Term("content", "k"))
-        sqs = [stq1, stq2]
+        sqs = [stq1, stq2 ]
         snq = SpanNearQuery(sqs, 30, False)
 
-        count = 0
-        sawZero = False
-        pspans = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), snq)
-        while pspans.next():
-            payloads = pspans.getPayload()
-            sawZero |= pspans.start() == 0
+        count = 0;
+        collector = PayloadSpanCollector()
+        pspans = snq.createWeight(searcher, False).getSpans(
+            searcher.getIndexReader().leaves().get(0),
+            SpanWeight.Postings.PAYLOADS)
 
-            it = payloads.iterator()
-            while it.hasNext():
-                count += 1
-                it.next()
+        sawZero = False
+        while pspans.nextDoc() != Spans.NO_MORE_DOCS:
+            while pspans.nextStartPosition() != Spans.NO_MORE_POSITIONS:
+                collector.reset()
+                pspans.collect(collector)
+                sawZero = sawZero or pspans.startPosition() == 0
+                for payload in collector.payloads:
+                    count += 1
 
-        self.assertEqual(5, count)
         self.assert_(sawZero)
+        self.assertEquals(8, count)
 
-        spans = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), snq)
+        spans = snq.createWeight(searcher, False).getSpans(
+            searcher.getIndexReader().leaves().get(0),
+            SpanWeight.Postings.POSITIONS)
 
         count = 0
-        sawZero = False
-        while spans.next():
-            count += 1
-            sawZero |= spans.start() == 0
-        self.assertEqual(4, count)
-        self.assert_(sawZero)
-        sawZero = False
-        psu = PayloadSpanUtil(searcher.getTopReaderContext())
-        pls = psu.getPayloadsForQuery(snq)
-        count = pls.size()
-        it = pls.iterator()
-        while it.hasNext():
-            bytes = JArray('byte').cast_(it.next())
-            s = bytes.string_
-            sawZero |= s == "pos: 0"
+        while spans.nextDoc() != Spans.NO_MORE_DOCS:
+            while spans.nextStartPosition() != Spans.NO_MORE_POSITIONS:
+                count += 1
+                sawZero = sawZero or spans.startPosition() == 0
 
-        self.assertEqual(5, count)
+        self.assertEquals(4, count)
         self.assert_(sawZero)
 
 
-class StopWhitespaceAnalyzer(PythonAnalyzer):
-
-    def __init__(self, enablePositionIncrements):
-        super(StopWhitespaceAnalyzer, self).__init__()
-
-        self.enablePositionIncrements = enablePositionIncrements
-
-    def createComponents(self, fieldName, reader):
+class PayloadSpanCollector(PythonSpanCollector):
 
-        class _stopFilter(PythonFilteringTokenFilter):
-            def __init__(_self, tokenStream):
-                super(_stopFilter, _self).__init__(Version.LUCENE_CURRENT, tokenStream)
-                _self.termAtt = _self.addAttribute(CharTermAttribute.class_);
-            def accept(_self):
-                return _self.termAtt.toString() != "stop"
+    def __init__(_self):
+        super(PayloadSpanCollector, _self).__init__()
+        _self.payloads = []
+
+    def collectLeaf(_self, postings, position, term):
+        if postings.getPayload() is not None:
+            _self.payloads.append(BytesRef.deepCopyOf(postings.getPayload()))
 
-        source = WhitespaceTokenizer(Version.LUCENE_CURRENT, reader)
-        return Analyzer.TokenStreamComponents(source, _stopFilter(source))
+    def reset(_self):
+        del _self.payloads[:]
 
 
 class TestPayloadAnalyzer(PythonAnalyzer):
 
-    def createComponents(self, fieldName, reader):
-        source = LowerCaseTokenizer(Version.LUCENE_CURRENT, reader)
+    def createComponents(self, fieldName):
+        source = LowerCaseTokenizer()
 
         return Analyzer.TokenStreamComponents(source,
                                               PayloadFilter(source, fieldName))
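For reference, the pieces above fit together as in the following sketch, condensed from the updated testPayloadsPos0. It is not part of this revision; `searcher` (an IndexSearcher over a single-segment index whose tokens carry payloads) and `snq` (a SpanNearQuery) are assumed to be set up as in the test.

from org.apache.lucene.search.spans import Spans, SpanWeight
from org.apache.lucene.util import BytesRef
from org.apache.pylucene.search.spans import PythonSpanCollector

class _payloadCollector(PythonSpanCollector):
    # Keeps a copy of the payload of every position handed to collectLeaf().
    def __init__(_self):
        super(_payloadCollector, _self).__init__()
        _self.payloads = []
    def collectLeaf(_self, postings, position, term):
        payload = postings.getPayload()
        if payload is not None:
            _self.payloads.append(BytesRef.deepCopyOf(payload))
    def reset(_self):
        del _self.payloads[:]

collector = _payloadCollector()
spans = snq.createWeight(searcher, False).getSpans(
    searcher.getIndexReader().leaves().get(0),
    SpanWeight.Postings.PAYLOADS)

while spans.nextDoc() != Spans.NO_MORE_DOCS:
    while spans.nextStartPosition() != Spans.NO_MORE_POSITIONS:
        collector.reset()
        spans.collect(collector)
        # collector.payloads now holds the payloads of the current span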