hi everybody,
I ran into a problem when using term vectors with position and
offset information in PyLucene. I use fields with
Field.TermVector.WITH_POSITIONS_OFFSETS set and the getTermFreqVector
method of IndexReader to retrieve the term vector, but the returned
object is of type TermFreqVector and not TermPositionVector (a
sub-interface of TermFreqVector), which would provide the methods
getTermPositions and getOffsets that I want to use.
I patched lucene.cpp from the latest subversion trunk (as of 2007-07-30) to
provide downcast methods from TermFreqVector to TermPositionVector
(isTermPositionVector and toTermPositionVector).
I'd like to share this patch, or be corrected if I am going about this the
wrong way to get the positions and offsets of terms in a document.
Please find attached the patch and an example script that makes use of the
downcast TermPositionVector.
bernhard
--- lucene.cpp.ori 2007-07-30 11:39:15.000000000 +0200
+++ lucene.cpp 2007-07-30 11:41:51.000000000 +0200
@@ -2431,6 +2431,7 @@
PyObject *arg);
static PyObject *j_termfreqvector_indexesOf(j_termfreqvector *self,
PyObject *args);
+DECLARE_DOWNCAST(j_termfreqvector, TermPositionVector);
static PyMethodDef j_termfreqvector_methods[] = {
DECLARE_METHOD(j_termfreqvector, getField, METH_NOARGS),
@@ -2439,6 +2440,7 @@
DECLARE_METHOD(j_termfreqvector, getTermFrequencies, METH_NOARGS),
DECLARE_METHOD(j_termfreqvector, indexOf, METH_O),
DECLARE_METHOD(j_termfreqvector, indexesOf, METH_VARARGS),
+ DECLARE_DOWNCAST_METHODS(j_termfreqvector, TermPositionVector),
{ NULL, NULL, 0, NULL }
};
@@ -9049,6 +9051,7 @@
return PyErr_SetArgsError((PyObject *) self, "indexesOf", args);
}
+DEFINE_DOWNCAST(j_termfreqvector, TermPositionVector, org::apache::lucene::index::TermPositionVector);
/* TermPositionVector */
from PyLucene import StandardAnalyzer, RAMDirectory, Document, Field, IndexWriter, IndexReader
# Build a small in-memory index. Every document has one field, "fieldname",
# stored with Field.TermVector.WITH_POSITIONS_OFFSETS so that the term
# vector read back later carries positions and offsets.
analyzer = StandardAnalyzer()
directory = RAMDirectory()
iwriter = IndexWriter(directory, analyzer, True)

ts = [
    "this bernhard is the text to be index text",
    "this claudia is the text to be index",
]
for t in ts:
    field = Field(
        "fieldname",
        t,
        Field.Store.YES,
        Field.Index.TOKENIZED,
        Field.TermVector.WITH_POSITIONS_OFFSETS,
    )
    doc = Document()
    doc.add(field)
    iwriter.addDocument(doc)

# Finish writing before the index is opened for reading.
iwriter.optimize()
iwriter.close()
# Read back the term vector of document 0 for 'fieldname' and downcast it to
# a TermPositionVector (via the patched-in toTermPositionVector method) so
# that getTermPositions() and getOffsets() become available.
ireader = IndexReader.open(directory)
try:
    tfv = ireader.getTermFreqVector(0, 'fieldname').toTermPositionVector()

    # Terms and frequencies are walked in lockstep; enumerate supplies the
    # term index that getTermPositions()/getOffsets() are called with
    # (instead of zipping against an arbitrary xrange upper bound, which
    # would silently truncate the output).
    term_freqs = zip(tfv.getTerms(), tfv.getTermFrequencies())
    for i, (term, freq) in enumerate(term_freqs):
        print('term %s' % term)
        print(' freq: %i' % freq)

        # Best effort: report missing positions instead of aborting.
        # Exception (not a bare except) lets Ctrl-C / SystemExit propagate.
        try:
            print(' pos: ' + str(list(tfv.getTermPositions(i))))
        except Exception:
            print(' no pos')

        # Same best-effort handling for the character offsets.
        try:
            offsets = ["%i-%i" % (o.getStartOffset(), o.getEndOffset())
                       for o in tfv.getOffsets(i)]
            print(' off: ' + str(offsets))
        except Exception:
            print(' no offsets')
finally:
    # Release the reader even if the dump above fails part-way.
    ireader.close()
_______________________________________________
pylucene-dev mailing list
[email protected]
http://lists.osafoundation.org/mailman/listinfo/pylucene-dev