Dear Wiki user, You have subscribed to a wiki page or wiki category on "Solr Wiki" for change notification.
The following page has been changed by HarryWagner: http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters/Kstem ------------------------------------------------------------------------------ - [#http://ciir.cs.umass.edu/cgi-bin/downloads/downloads.cgi KStem] is an alternative to Porter for developers looking for a less aggressive stemmer. It was written by Bob Krovetz and ported to Lucene by Sergio Guzman-Lara (UMASS Amherst). Harry Wagner (OCLC) modified the KStemFilter source to KStemFilterFactory for use with Solr (see below). [#https://issues.apache.org/jira/browse/SOLR-379 Additional information] regarding KStem and Solr can be found here. Do the following to use KStem in your Solr implementation: + [#http://ciir.cs.umass.edu/cgi-bin/downloads/downloads.cgi KStem] is an alternative to Porter for developers looking for a less aggressive stemmer. It was written by Bob Krovetz and ported to Lucene by Sergio Guzman-Lara (UMASS Amherst). Harry Wagner (OCLC) modified the KStemFilter source for use with Solr (see below). [#https://issues.apache.org/jira/browse/SOLR-379 Additional information] regarding KStem and Solr can be found here. Do the following to use KStem in your Solr implementation: 1. Download [#http://ciir.cs.umass.edu/cgi-bin/downloads/downloads.cgi KStem] 2. Unpack the jar file 3. Modify the package name on the source files to match your install + 4. Rename KStemFilter.java to KStemFilterFactory.java and modify as follows (apparently there are licensing issues that prevent this code from being included in Solr or available as a download): - 4. Replace KStemFilter.java with KStemFilterFactory.java - + {{{#!java + /* + Copyright 2003, + Center for Intelligent Information Retrieval, + University of Massachusetts, Amherst. + All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + 1. 
Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + 3. The names "Center for Intelligent Information Retrieval" and + "University of Massachusetts" must not be used to endorse or promote products + derived from this software without prior written permission. To obtain + permission, contact [EMAIL PROTECTED] + + THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF MASSACHUSETTS AND OTHER CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE + GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + SUCH DAMAGE. + + Modified for Solr use: H. 
Wagner, OCLC 2007-09-29 + */ + package your.package.name.here; + + /** + * <p>Title: </p> + * <p>Description: This filter transforms an input word into its stemmed form + * using Bob Krovetz' kstem algorithm.</p> + * <p>Copyright: Copyright (c) 2003</p> + * <p>Company: CIIR Umass Amherst (http://ciir.cs.umass.edu) </p> + * @author Sergio Guzman-Lara + * @version 1.0 + */ + + import org.apache.solr.core.Config; + import org.apache.solr.analysis.BaseTokenFilterFactory; + import org.apache.lucene.analysis.StopFilter; + import org.apache.lucene.analysis.TokenStream; + import org.apache.lucene.analysis.TokenFilter; + import org.apache.lucene.analysis.Token; + + import java.util.Map; + import java.util.List; + import java.util.Set; + import java.io.IOException; + + /** Transforms the token stream according to the KStem stemming algorithm. + * For more information about KStem see <a href="http://ciir.cs.umass.edu/pubfiles/ir-35.pdf"> + "Viewing Morphology as an Inference Process"</a> + (Krovetz, R., Proceedings of the Sixteenth Annual International ACM SIGIR + Conference on Research and Development in Information Retrieval, 191-203, 1993). + + Note: the input to the stemming filter must already be in lower case, + so you will need to use LowerCaseFilter or LowerCaseTokenizer farther + down the Tokenizer chain in order for this to work properly! + <P> + To use this filter with other analyzers, you'll want to write an + Analyzer class that sets up the TokenStream chain as you want it. 
+ To use this with LowerCaseTokenizer, for example, you'd write an + analyzer like this: + <P> + <PRE> + class MyAnalyzer extends Analyzer { + public final TokenStream tokenStream(String fieldName, Reader reader) { + return new KStemFilter(new LowerCaseTokenizer(reader)); + } + } + </PRE> + + */ + + public class KStemFilterFactory extends BaseTokenFilterFactory { + public void init(Map<String, String> args) { + super.init(args); + String cacheSizeStr = args.get("cacheSize"); + if (cacheSizeStr != null) { + cacheSize = Integer.parseInt(cacheSizeStr); + } + } + + private int cacheSize = 20000; + + public TokenStream create(TokenStream input) { + return new KStemFilter(input,cacheSize); + } + } + + class KStemFilter extends TokenFilter { + private KStemmer stemmer; + + /** Create a KStemmer with the given cache size. + * @param in The TokenStream whose output will be the input to KStemFilter. + * @param cacheSize Maximum number of entries to store in the + * Stemmer's cache (stems stored in this cache do not need to be + * recomputed, speeding up the stemming process). + */ + public KStemFilter(TokenStream in, int cacheSize) { + super(in); + stemmer = new KStemmer(cacheSize); + } + + /** Create a KStemmer with the default cache size of 20 000 entries. + * @param in The TokenStream whose output will be the input to KStemFilter. + */ + public KStemFilter(TokenStream in) { + super(in); + stemmer = new KStemmer(); + } + + /** Returns the next, stemmed, input Token. + * @return The stemmed form of a token. 
+ * @throws IOException + */ + + + /** the original code from KStem + public final Token next() throws IOException { + Token token = input.next(); + if (token == null) + return null; + else { + String s = stemmer.stem(token.termText); + if (s != token.termText) // Yes, I mean object reference comparison here + token.termText = s; + return token; + } + } + **/ + + public final Token next() throws IOException { + Token tok = input.next(); + if (tok==null) return null; + String tokstr = tok.termText(); + + String s = stemmer.stem(tokstr); + if (s.equals(tokstr)) { + return tok; + } else { + Token newtok = new Token(s, tok.startOffset(), tok.endOffset(), tok.type()); + newtok.setPositionIncrement(tok.getPositionIncrement()); + return newtok; + } + } + } + }}} +
