everyone:

I'm developing my own Analyzer in PyLucene 4.9.0 and created a TokenFilter called
CompoundTokenFilter for compound word splitting in the analyzer, as the
DictionaryCompoundWordTokenFilter is not performing very well.

DictionaryCompoundWordTokenFilter uses a brute-force algorithm, but I'd like to split
compound words only when the subwords of the compound word are all in the
dictionary; for example, "breastcancer" is split only when "breast" and "cancer" are
both in the given dictionary.

Actually the whole code is based on CompoundWordTokenFilterBase, and I just
edited the decompose() method in the DictionaryCompoundWordTokenFilter class. But
when running the program, it shows that "the attribute 'length' of
'CharTermAttribute' objects is not readable", and I cannot find what's wrong
with it.
And I've searched for how to inherit java classes in pylucene with jcc, but 
cannot work it out, could someone share some experiences or give some help? 
Thanks!
from __future__ import division
import lucene, math, itertools

from java.lang importCharSequencefrom java.io importIOExceptionfrom java.util 
importLinkedListfrom org.apache.pylucene.analysis importPythonTokenStreamfrom 
org.apache.lucene.analysis importTokenFilterfrom org.apache.pylucene.analysis 
importPythonTokenFilterfrom org.apache.lucene.analysis importTokenStreamfrom 
org.apache.lucene.analysis.tokenattributes importCharTermAttributefrom 
org.apache.lucene.analysis.tokenattributes importOffsetAttributefrom 
org.apache.lucene.analysis.tokenattributes importPositionIncrementAttributefrom 
org.apache.lucene.analysis.util importCharArraySetfrom org.apache.lucene.util 
importAttributeSourcefrom org.apache.lucene.util 
importVersionclassCompoundTokenFilter(PythonTokenFilter):def 
__init__(self,matchVersion,input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE):super(CompoundTokenFilter,self).__init__(input)self.matchVersion=matchVersion
        self.dictionary=dictionary
        self.tokens=LinkedList()self.minWordSize=DEFAULT_MIN_WORD_SIZE
        self.minSubwordSize=DEFAULT_MIN_SUBWORD_SIZE
        self.maxSubwordSize=DEFAULT_MAX_SUBWORD_SIZE
        
self.current=AttributeSource.Stateself.termAtt=input.addAttribute(CharTermAttribute.class_)self.offsetAtt=input.addAttribute(OffsetAttribute.class_)self.posIncAtt=input.addAttribute(PositionIncrementAttribute.class_)self.input=input

    def decompose(self):
        l=self.termAtt.length()
        s=self.termAtt.subSequence(0,l)if s 
inself.dictionary:self.tokens.add(CompoundToken(self.matchVersion,self.input,self.dictionary,self.minWordSize,self.minSubwordSize,self.maxSubwordSize,0,l))else:

            d=filter(lambda x:len(x)>=self.minSubwordSize and 
len(x)<=self.maxSubwordSize in s,this.dictionary)if len(d)>0:
                
start=int(math.floor(l/self.maxSubwordSize))end=int(math.ceil(l/self.minSubwordSize))
                subwords_combinations=[]for i in xrange(start,end+1):
                    subwords_combinations.extend(itertools.permutations(d,i))
                subwords_combinations=filter(lambda 
x:''.join(x)==s,subwords_combinations)
                subwords=sorted(set(reduce(lambda 
x,y:x+y,subwords_combinations)),key=lambda x:-1*len(x))for subword in subwords:
                    
tokens.add(CompoundToken(self.matchVersion,self.input,self.dictionary,self.minWordSize,self.minSubwordSize,self.maxSubwordSize,s.find(subword),s.find(subword)+len(subword)))def
 incrementToken(self):if(notself.tokens.isEmpty()):assertself.current!=None
            
token=self.tokens.removeFirst()AttributeSource.restoreState(self.current)self.termAtt.setEmpty().append(token.txt)self.offsetAttribute.setOffset(token.startOffset,
 
token.endOffset)self.posIncAtt.setPositionIncrement(0)returnTrueself.current=Noneif(self.input.incrementToken()):ifself.termAtt.length()>=self.minWordSize:
                decompose()ifnot 
tokens.isEmpty():self.current=AttributeSource.captureState()returnTrueelse:returnFalsedef
 
reset(self):super(CompoundTokenFilter,self).reset()self.tokens.clear()self.current=NoneclassCompoundToken:def
 
__init__(self,matchVersion,input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE,offset,length):
            
compoundTokenFilter=CompoundTokenFilter(matchVersion,input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE)self.txt=compoundTokenFilter.termAtt.subSequence(offset,
 offset + length)

            startOff = compoundWordTokenFilterBase.this.offsetAtt.startOffset()
            endOff = compoundWordTokenFilterBase.this.offsetAtt.endOffset()if 
matchVersion.onOrAfter(Version.LUCENE_4_4)or endOff - startOff != 
compoundTokenFilter.termAtt.length():self.startOffset = startOff
                self.endOffset = endOff
            else:
                newStart = startOff + offset
                self.startOffset = newStart
                self.endOffset = newStart + length


Reply via email to