Hi everyone,
I'm developing my own Analyzer in PyLucene 4.9.0 and created a TokenFilter
called CompoundTokenFilter for compound-word splitting in the analyzer,
because the built-in DictionaryCompoundWordTokenFilter is not performing very well.
DictionaryCompoundWordTokenFilter uses a brute-force algorithm; I'd like to split
a compound word only when all of its subwords are in the
dictionary — e.g. "breastcancer" is split only when both "breast" and "cancer" are in
the given dictionary.
Actually the whole code is based on CompoundWordTokenFilterBase, and I just
edited the decompose() method from the DictionaryCompoundWordTokenFilter class. But
when running the program, it reports that "the attribute 'length' of
'CharTermAttribute' objects is not readable", and I cannot find what's wrong
with it.
I've also searched for how to inherit from Java classes in PyLucene with JCC, but
could not work it out. Could someone share some experience or give some help?
Thanks!
from __future__ import division
import lucene, math, itertools
from java.lang importCharSequencefrom java.io importIOExceptionfrom java.util
importLinkedListfrom org.apache.pylucene.analysis importPythonTokenStreamfrom
org.apache.lucene.analysis importTokenFilterfrom org.apache.pylucene.analysis
importPythonTokenFilterfrom org.apache.lucene.analysis importTokenStreamfrom
org.apache.lucene.analysis.tokenattributes importCharTermAttributefrom
org.apache.lucene.analysis.tokenattributes importOffsetAttributefrom
org.apache.lucene.analysis.tokenattributes importPositionIncrementAttributefrom
org.apache.lucene.analysis.util importCharArraySetfrom org.apache.lucene.util
importAttributeSourcefrom org.apache.lucene.util
importVersionclassCompoundTokenFilter(PythonTokenFilter):def
__init__(self,matchVersion,input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE):super(CompoundTokenFilter,self).__init__(input)self.matchVersion=matchVersion
self.dictionary=dictionary
self.tokens=LinkedList()self.minWordSize=DEFAULT_MIN_WORD_SIZE
self.minSubwordSize=DEFAULT_MIN_SUBWORD_SIZE
self.maxSubwordSize=DEFAULT_MAX_SUBWORD_SIZE
self.current=AttributeSource.Stateself.termAtt=input.addAttribute(CharTermAttribute.class_)self.offsetAtt=input.addAttribute(OffsetAttribute.class_)self.posIncAtt=input.addAttribute(PositionIncrementAttribute.class_)self.input=input
def decompose(self):
l=self.termAtt.length()
s=self.termAtt.subSequence(0,l)if s
inself.dictionary:self.tokens.add(CompoundToken(self.matchVersion,self.input,self.dictionary,self.minWordSize,self.minSubwordSize,self.maxSubwordSize,0,l))else:
d=filter(lambda x:len(x)>=self.minSubwordSize and
len(x)<=self.maxSubwordSize in s,this.dictionary)if len(d)>0:
start=int(math.floor(l/self.maxSubwordSize))end=int(math.ceil(l/self.minSubwordSize))
subwords_combinations=[]for i in xrange(start,end+1):
subwords_combinations.extend(itertools.permutations(d,i))
subwords_combinations=filter(lambda
x:''.join(x)==s,subwords_combinations)
subwords=sorted(set(reduce(lambda
x,y:x+y,subwords_combinations)),key=lambda x:-1*len(x))for subword in subwords:
tokens.add(CompoundToken(self.matchVersion,self.input,self.dictionary,self.minWordSize,self.minSubwordSize,self.maxSubwordSize,s.find(subword),s.find(subword)+len(subword)))def
incrementToken(self):if(notself.tokens.isEmpty()):assertself.current!=None
token=self.tokens.removeFirst()AttributeSource.restoreState(self.current)self.termAtt.setEmpty().append(token.txt)self.offsetAttribute.setOffset(token.startOffset,
token.endOffset)self.posIncAtt.setPositionIncrement(0)returnTrueself.current=Noneif(self.input.incrementToken()):ifself.termAtt.length()>=self.minWordSize:
decompose()ifnot
tokens.isEmpty():self.current=AttributeSource.captureState()returnTrueelse:returnFalsedef
reset(self):super(CompoundTokenFilter,self).reset()self.tokens.clear()self.current=NoneclassCompoundToken:def
class CompoundToken:
    """Value object holding one decomposed subword: its text plus the
    start/end offsets it should be emitted with.

    BUG in original: it constructed a brand-new CompoundTokenFilter just
    to reach the term/offset attributes, and referenced the Java
    inner-class idiom 'compoundWordTokenFilterBase.this', which does not
    exist in Python.  Read the attributes straight off the input stream
    instead; addAttribute() returns the stream's existing instances.
    """

    def __init__(self, matchVersion, input, dictionary,
                 DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE,
                 DEFAULT_MAX_SUBWORD_SIZE, offset, length):
        """offset/length locate the subword inside the current term of
        'input'; the size parameters are accepted for signature
        compatibility with the original code but are not used here."""
        termAtt = input.addAttribute(CharTermAttribute.class_)
        offsetAtt = input.addAttribute(OffsetAttribute.class_)
        self.txt = termAtt.subSequence(offset, offset + length)
        startOff = offsetAtt.startOffset()
        endOff = offsetAtt.endOffset()
        # Mirrors Lucene's CompoundWordTokenFilterBase.CompoundToken:
        # from 4.4 on, or when an earlier filter changed the term length,
        # keep the compound's own offsets; otherwise point inside the
        # original token.  (termAtt.length() is avoided -- it raises
        # through JCC; use toString().)
        if (matchVersion.onOrAfter(Version.LUCENE_4_4)
                or endOff - startOff != len(termAtt.toString())):
            self.startOffset = startOff
            self.endOffset = endOff
        else:
            newStart = startOff + offset
            self.startOffset = newStart
            self.endOffset = newStart + length