Oh sorry, please ignore my previous code snippet, my intent was:
By checking the position increment?
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.*;

import java.io.IOException;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
/**
 * A {@link TokenFilter} that records, for each stemmed token, the set of
 * original (unstemmed) surface forms it was derived from.
 *
 * <p>Intended to sit after a stemmer combined with {@code KeywordRepeatFilter},
 * which emits the original token first and then the stemmed form at the same
 * position (position increment 0). When a zero-increment token is seen, the
 * immediately preceding term is taken to be its unstemmed source and the pair
 * is recorded.</p>
 *
 * <p>State is kept per filter instance (the original version used mutable
 * {@code static} fields, which would be shared — and corrupted — across every
 * analysis chain and indexing thread). This class is still not safe for
 * concurrent use of a single instance by multiple threads.</p>
 */
public class MwKStemCollectFilter extends TokenFilter {

    private final CharTermAttribute termAttribute =
            addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posAttribute =
            addAttribute(PositionIncrementAttribute.class);

    /** Stemmed term -> set of original surface forms, in insertion order. */
    private final Map<String, Set<String>> stemWordMapping = new LinkedHashMap<>();

    /** Text of the token emitted immediately before the current one. */
    private String previousTermString = "";

    MwKStemCollectFilter(TokenStream in) {
        super(in);
    }

    /**
     * Advances to the next token, recording a stem -&gt; original mapping
     * whenever the current token shares a position with the previous one.
     *
     * @return {@code true} if a token was produced, {@code false} at end of stream.
     * @throws IOException if there is a low-level I/O error.
     */
    @Override
    public final boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            return false;
        }
        // KeywordRepeatFilter emits the original term first; the stemmed form
        // follows at the same position, i.e. with a position increment of 0.
        if (posAttribute.getPositionIncrement() == 0 && !previousTermString.isEmpty()) {
            String currentStemmedString = termAttribute.toString();
            stemWordMapping
                    .computeIfAbsent(currentStemmedString, k -> new LinkedHashSet<>())
                    .add(previousTermString);
        }
        previousTermString = termAttribute.toString();
        return true;
    }

    /**
     * @return an unmodifiable view of the collected stem -&gt; originals mapping.
     */
    public Map<String, Set<String>> getStemWordMapping() {
        return Collections.unmodifiableMap(stemWordMapping);
    }
}
Sincerely,
--Xiaolong
On Fri, Feb 3, 2017 at 1:16 PM, Xiaolong Zheng <[email protected]>
wrote:
> Hello,
>
> I am trying to collect stemming changes in my search index during the
> indexing time. So I could collect a list of stemmed word -> [various
> original words] (e.g: plot -> [plots, plotting, plotted]) for a later use.
>
> I am using k-stem filter + KeywordRepeatFilter
> + RemoveDuplicatesTokenFilter to produce the tokens. I am wondering what's
> the best way to collect such information?
>
> I am thinking of comparing the term buffer — is this the right way to do it?
>
>
>
> import org.apache.lucene.analysis.TokenFilter;
> import org.apache.lucene.analysis.TokenStream;
> import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
>
> import java.io.IOException;
> import java.util.*;
>
> public class FilterChangeWrapper extends TokenFilter {
> private final TokenFilter fWrappedFilter;
> private final CharTermAttribute termAttribute =
> addAttribute(CharTermAttribute.class);
> private final Map<String,Set<String>> fMappings = new HashMap<>();
>
> public FilterChangeWrapper(TokenStream in, TokenFilter wrappedFilter) {
> super(in);
> fWrappedFilter = wrappedFilter;
> }
>
> @Override
> public boolean incrementToken() throws IOException {
> char[] startingTerm = termAttribute.buffer();
> boolean result = fWrappedFilter.incrementToken();
> char[] endingTerm = termAttribute.buffer();
> if (!Arrays.equals(startingTerm, endingTerm)) {
> addMapping(startingTerm, endingTerm);
> }
> return result;
> }
>
> private void addMapping(char[] startingTerm, char[] endingTerm) {
> String startingString = new String(startingTerm);
> String endingString = new String(endingTerm);
> if (!fMappings.containsKey(startingString)) {
> fMappings.put(startingString, new HashSet<String>());
> }
>
> fMappings.get(startingString).add(endingString);
> }
>
> public Map<String,Set<String>> getMappings() {
> return Collections.unmodifiableMap(fMappings);
> }
> }
>
>
>
>
> Thanks,
> Xiaolong
>
>