On Wed, 25 Mar 2015, ?? wrote:
Hi everyone,
I'm developing my own Analyzer in PyLucene 4.9.0 and have written a TokenFilter, CompoundTokenFilter, for compound word splitting in the analyzer, because DictionaryCompoundWordTokenFilter does not perform very well.
DictionaryCompoundWordTokenFilter uses a brute-force algorithm, but I'd like to split a compound word only when all of its subwords are in the dictionary; for example, "breastcancer" is split only when both "breast" and "cancer" are in the given dictionary.
The whole code is actually based on CompoundWordTokenFilterBase, and I just edited the decompose() method of the DictionaryCompoundWordTokenFilter class. But when running the program, it reports that "the attribute 'length' of 'CharTermAttribute' objects is not readable", and I cannot find what's wrong with it.
I've also searched for how to inherit Java classes in PyLucene with JCC, but cannot work it out. Could someone share some experience or give some help? Thanks!
from __future__ import division
import lucene, math, itertools

from java.lang import CharSequence
from java.io import IOException
from java.util import LinkedList
from org.apache.pylucene.analysis import PythonTokenStream, PythonTokenFilter
from org.apache.lucene.analysis import TokenFilter, TokenStream
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute
from org.apache.lucene.analysis.tokenattributes import OffsetAttribute
from org.apache.lucene.analysis.tokenattributes import PositionIncrementAttribute
from org.apache.lucene.analysis.util import CharArraySet
from org.apache.lucene.util import AttributeSource, Version


class CompoundTokenFilter(PythonTokenFilter):

    def __init__(self, matchVersion, input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE):
        super(CompoundTokenFilter, self).__init__(input)
        self.matchVersion = matchVersion
        self.dictionary = dictionary
        self.tokens = LinkedList()
        self.minWordSize = DEFAULT_MIN_WORD_SIZE
        self.minSubwordSize = DEFAULT_MIN_SUBWORD_SIZE
        self.maxSubwordSize = DEFAULT_MAX_SUBWORD_SIZE
        self.current = None
        self.termAtt = input.addAttribute(CharTermAttribute.class_)
        self.offsetAtt = input.addAttribute(OffsetAttribute.class_)
        self.posIncAtt = input.addAttribute(PositionIncrementAttribute.class_)
        self.input = input

    def decompose(self):
        # Emit subword tokens only when the whole term can be rebuilt from dictionary subwords.
        l = self.termAtt.length()
        s = self.termAtt.subSequence(0, l)
        if s in self.dictionary:
            self.tokens.add(CompoundToken(self.matchVersion, self.input, self.dictionary, self.minWordSize, self.minSubwordSize, self.maxSubwordSize, 0, l))
        else:
            # dictionary entries of acceptable length that occur in the term
            d = filter(lambda x: self.minSubwordSize <= len(x) <= self.maxSubwordSize and x in s, self.dictionary)
            if len(d) > 0:
                start = int(math.floor(l / self.maxSubwordSize))
                end = int(math.ceil(l / self.minSubwordSize))
                subwords_combinations = []
                for i in xrange(start, end + 1):
                    subwords_combinations.extend(itertools.permutations(d, i))
                # keep only the combinations that concatenate back to the full term
                subwords_combinations = filter(lambda x: ''.join(x) == s, subwords_combinations)
                subwords = sorted(set(reduce(lambda x, y: x + y, subwords_combinations)), key=lambda x: -1 * len(x))
                for subword in subwords:
                    self.tokens.add(CompoundToken(self.matchVersion, self.input, self.dictionary, self.minWordSize, self.minSubwordSize, self.maxSubwordSize, s.find(subword), s.find(subword) + len(subword)))

    def incrementToken(self):
        if not self.tokens.isEmpty():
            assert self.current is not None
            token = self.tokens.removeFirst()
            self.restoreState(self.current)
            self.termAtt.setEmpty().append(token.txt)
            self.offsetAtt.setOffset(token.startOffset, token.endOffset)
            self.posIncAtt.setPositionIncrement(0)
            return True
        self.current = None
        if self.input.incrementToken():
            if self.termAtt.length() >= self.minWordSize:
                self.decompose()
                if not self.tokens.isEmpty():
                    self.current = self.captureState()
            return True
        else:
            return False

    def reset(self):
        super(CompoundTokenFilter, self).reset()
        self.tokens.clear()
        self.current = None


class CompoundToken:

    def __init__(self, matchVersion, input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, offset, length):
        compoundTokenFilter = CompoundTokenFilter(matchVersion, input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE)
        self.txt = compoundTokenFilter.termAtt.subSequence(offset, offset + length)
        startOff = compoundTokenFilter.offsetAtt.startOffset()
        endOff = compoundTokenFilter.offsetAtt.endOffset()
        if matchVersion.onOrAfter(Version.LUCENE_4_4) or endOff - startOff != compoundTokenFilter.termAtt.length():
            self.startOffset = startOff
            self.endOffset = endOff
        else:
            newStart = startOff + offset
            self.startOffset = newStart
            self.endOffset = newStart + length
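For illustration, a minimal driver sketch of how the filter is meant to be plugged in (the WhitespaceTokenizer input, the size limits and the two-entry dictionary are only assumptions for the example, and it relies on the filter code above):

import lucene
from java.io import StringReader
from org.apache.lucene.analysis.core import WhitespaceTokenizer
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute
from org.apache.lucene.analysis.util import CharArraySet
from org.apache.lucene.util import Version

lucene.initVM()

# small example dictionary; ignoreCase=True
dictionary = CharArraySet(Version.LUCENE_4_9, 2, True)
dictionary.add("breast")
dictionary.add("cancer")

tokenizer = WhitespaceTokenizer(Version.LUCENE_4_9, StringReader("breastcancer"))
stream = CompoundTokenFilter(Version.LUCENE_4_9, tokenizer, dictionary, 5, 2, 15)
termAtt = stream.addAttribute(CharTermAttribute.class_)

stream.reset()
while stream.incrementToken():
    print termAtt.toString()
stream.end()
stream.close()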
I don't know enough about the particular API you're trying to use to be of
much help. You might want to try this in Java first and ask on
java-u...@lucene.apache.org.
About writing a Python extension of a Lucene class, there are a couple of
examples in the PyLucene tests that should be helpful:
$ cd test
$ grep -l Python *.py
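As a rough illustration of the pattern those tests use (this sketch is not copied from any test, so treat the details as approximate):

from org.apache.pylucene.analysis import PythonTokenFilter
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute

class UpperCaseFilter(PythonTokenFilter):
    """Illustrative only: upper-cases every term passing through."""

    def __init__(self, inStream):
        super(UpperCaseFilter, self).__init__(inStream)
        # keep a Python reference to the wrapped stream and its term attribute
        self.inStream = inStream
        self.termAtt = self.addAttribute(CharTermAttribute.class_)

    def incrementToken(self):
        if not self.inStream.incrementToken():
            return False
        text = self.termAtt.toString().upper()
        self.termAtt.setEmpty()
        self.termAtt.append(text)
        return True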
Andi..