hi everybody,

I ran into a problem using term vectors with position and offset
information in PyLucene. I index fields with
Field.TermVector.WITH_POSITIONS_OFFSETS set and use the getTermFreqVector
method of IndexReader to retrieve the term vector, but the result is of
type TermFreqVector and not TermPositionVector (a sub-interface of
TermFreqVector), which would provide the methods getTermPositions
and getOffsets that I want to use.

I patched lucene.cpp of the latest subversion trunk (of 2007-07-30) to
provide downcast methods from TermFreqVector to TermPositionVector
(isTermPositionVector and toTermPositionVector).

I'd like to share this patch, or to be corrected if this is the wrong
way to get the positions and offsets of terms in a document.

Attached are the patch and an example script that makes use of the
downcast TermPositionVector.

bernhard

--- lucene.cpp.ori	2007-07-30 11:39:15.000000000 +0200
+++ lucene.cpp	2007-07-30 11:41:51.000000000 +0200
@@ -2431,6 +2431,7 @@
                                           PyObject *arg);
 static PyObject *j_termfreqvector_indexesOf(j_termfreqvector *self,
                                             PyObject *args);
+DECLARE_DOWNCAST(j_termfreqvector, TermPositionVector);
 
 static PyMethodDef j_termfreqvector_methods[] = {
     DECLARE_METHOD(j_termfreqvector, getField, METH_NOARGS),
@@ -2439,6 +2440,7 @@
     DECLARE_METHOD(j_termfreqvector, getTermFrequencies, METH_NOARGS),
     DECLARE_METHOD(j_termfreqvector, indexOf, METH_O),
     DECLARE_METHOD(j_termfreqvector, indexesOf, METH_VARARGS),
+		DECLARE_DOWNCAST_METHODS(j_termfreqvector, TermPositionVector),
     { NULL, NULL, 0, NULL }
 };
 
@@ -9049,6 +9051,7 @@
     return PyErr_SetArgsError((PyObject *) self, "indexesOf", args);
 }
 
+DEFINE_DOWNCAST(j_termfreqvector, TermPositionVector, org::apache::lucene::index::TermPositionVector);
 
 /* TermPositionVector */
 
from PyLucene import StandardAnalyzer, RAMDirectory, Document, Field, IndexWriter, IndexReader

# Build a small in-memory index of two documents whose single field stores
# term vectors with positions and offsets, so they can be read back below.
analyzer = StandardAnalyzer()

directory = RAMDirectory()

# NOTE(review): the third argument is presumably Lucene's `create` flag
# (start a fresh index) -- confirm against the IndexWriter API in use.
iwriter = IndexWriter(directory, analyzer, True)
ts = ["this bernhard is the text to be index text",
      "this claudia is the text to be index"]
try:
    for t in ts:
        doc = Document()
        doc.add(Field("fieldname", t,
                      Field.Store.YES, Field.Index.TOKENIZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS))
        iwriter.addDocument(doc)
    iwriter.optimize()
finally:
    # Release the writer even if adding a document or optimizing failed.
    iwriter.close()


# Read back the term vector of document 0 and print, for every term, its
# frequency, its token positions and its character offsets.
ireader = IndexReader.open(directory)
try:
    # toTermPositionVector() is the downcast helper added by the lucene.cpp
    # patch in this post; it exposes getTermPositions() and getOffsets().
    tfv = ireader.getTermFreqVector(0, 'fieldname').toTermPositionVector()
    # enumerate() supplies the term index without an arbitrary upper bound
    # (the previous zip(..., xrange(100000)) stopped after 100000 terms).
    for i, (t, f) in enumerate(zip(tfv.getTerms(),
                                   tfv.getTermFrequencies())):
        # Single-argument print(...) behaves the same as the print statement
        # on Python 2 and is also valid on Python 3.
        print('term %s' % t)
        print('  freq: %i' % f)
        # Best effort: if positions/offsets cannot be read for this term,
        # report that instead of aborting. `except Exception` (rather than a
        # bare except) lets KeyboardInterrupt/SystemExit propagate.
        try:
            print('  pos: ' + str([p for p in tfv.getTermPositions(i)]))
        except Exception:
            print('  no pos')
        try:
            print('  off: ' +
                  str(["%i-%i" % (o.getStartOffset(), o.getEndOffset())
                       for o in tfv.getOffsets(i)]))
        except Exception:
            print('  no offsets')
finally:
    # Always release the reader, even if reading the term vector failed.
    ireader.close()
_______________________________________________
pylucene-dev mailing list
[email protected]
http://lists.osafoundation.org/mailman/listinfo/pylucene-dev

Reply via email to