Hello,
I am trying collect stemming changes in my search index during the indexing
time. So I could collect a list of stemmed word -> [variety original word]
(e.g: plot -> [plots, plotting, plotted]) for a later use.
I am using k-stem filter + KeywordRepeatFilter
+ RemoveDuplicatesTokenFilter to produce the tokens. I am wondering what's
the best way to collecting such information?
I am think by comparing the term buffer, is this the right way to do it?
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import java.io.IOException;
import java.util.*;
public class FilterChangeWrapper extends TokenFilter {
private final TokenFilter fWrappedFilter;
private final CharTermAttribute termAttribute =
addAttribute(CharTermAttribute.class);
private final Map<String,Set<String>> fMappings = new HashMap<>();
public FilterChangeWrapper(TokenStream in, TokenFilter wrappedFilter) {
super(in);
fWrappedFilter = wrappedFilter;
}
@Override
public boolean incrementToken() throws IOException {
char[] startingTerm = termAttribute.buffer();
boolean result = fWrappedFilter.incrementToken();
char[] endingTerm = termAttribute.buffer();
if (!Arrays.equals(startingTerm, endingTerm)) {
addMapping(startingTerm, endingTerm);
}
return result;
}
private void addMapping(char[] startingTerm, char[] endingTerm) {
String startingString = new String(startingTerm);
String endingString = new String(endingTerm);
if (!fMappings.containsKey(startingString)) {
fMappings.put(startingString, new HashSet<String>());
}
fMappings.get(startingString).add(endingString);
}
public Map<String,Set<String>> getMappings() {
return Collections.unmodifiableMap(fMappings);
}
}
Thanks,
Xiaolong