ehatcher 2004/03/12 01:43:48 Modified: src/java/org/apache/lucene/analysis StopAnalyzer.java StopFilter.java src/java/org/apache/lucene/analysis/de GermanAnalyzer.java GermanStemFilter.java WordlistLoader.java src/java/org/apache/lucene/analysis/ru RussianAnalyzer.java src/java/org/apache/lucene/analysis/standard StandardAnalyzer.java Log: convert StopFilter to use Set, with supporting changes to avoid calling deprecated methods. never compromise on your ideals! Revision Changes Path 1.3 +5 -4 jakarta-lucene/src/java/org/apache/lucene/analysis/StopAnalyzer.java Index: StopAnalyzer.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/StopAnalyzer.java,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- StopAnalyzer.java 9 Dec 2002 19:02:20 -0000 1.2 +++ StopAnalyzer.java 12 Mar 2004 09:43:48 -0000 1.3 @@ -56,11 +56,12 @@ import java.io.Reader; import java.util.Hashtable; +import java.util.Set; /** Filters LetterTokenizer with LowerCaseFilter and StopFilter. */ public final class StopAnalyzer extends Analyzer { - private Hashtable stopTable; + private Set stopWords; /** An array containing some common English words that are not usually useful for searching. */ @@ -74,17 +75,17 @@ /** Builds an analyzer which removes words in ENGLISH_STOP_WORDS. */ public StopAnalyzer() { - stopTable = StopFilter.makeStopTable(ENGLISH_STOP_WORDS); + stopWords = StopFilter.makeStopSet(ENGLISH_STOP_WORDS); } /** Builds an analyzer which removes words in the provided array. */ public StopAnalyzer(String[] stopWords) { - stopTable = StopFilter.makeStopTable(stopWords); + this.stopWords = StopFilter.makeStopSet(stopWords); } /** Filters LowerCaseTokenizer with StopFilter. 
*/ public TokenStream tokenStream(String fieldName, Reader reader) { - return new StopFilter(new LowerCaseTokenizer(reader), stopTable); + return new StopFilter(new LowerCaseTokenizer(reader), stopWords); } } 1.10 +9 -4 jakarta-lucene/src/java/org/apache/lucene/analysis/StopFilter.java Index: StopFilter.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/StopFilter.java,v retrieving revision 1.9 retrieving revision 1.10 diff -u -r1.9 -r1.10 --- StopFilter.java 10 Mar 2004 23:17:37 -0000 1.9 +++ StopFilter.java 12 Mar 2004 09:43:48 -0000 1.10 @@ -57,6 +57,7 @@ import java.io.IOException; import java.util.HashSet; import java.util.Hashtable; +import java.util.Set; /** * Removes stop words from a token stream. @@ -64,7 +65,7 @@ public final class StopFilter extends TokenFilter { - private HashSet stopWords; + private Set stopWords; /** * Constructs a filter which removes words from the input @@ -79,7 +80,7 @@ * Constructs a filter which removes words from the input * TokenStream that are named in the Hashtable. * - * @deprecated Use {@link #StopFilter(TokenStream, HashSet)} StopFilter(TokenStream,Map)} instead + * @deprecated Use {@link #StopFilter(TokenStream, Set)} StopFilter(TokenStream,Map)} instead */ public StopFilter(TokenStream in, Hashtable stopTable) { super(in); @@ -89,8 +90,12 @@ /** * Constructs a filter which removes words from the input * TokenStream that are named in the Set. + * It is crucial that an efficient Set implementation is used + * for maximum performance. + * + * @see #makeStopSet(java.lang.String[]) */ - public StopFilter(TokenStream in, HashSet stopWords) { + public StopFilter(TokenStream in, Set stopWords) { super(in); this.stopWords = stopWords; } @@ -116,7 +121,7 @@ * This permits this stopWords construction to be cached once when * an Analyzer is constructed.
*/ - public static final HashSet makeStopSet(String[] stopWords) { + public static final Set makeStopSet(String[] stopWords) { HashSet stopTable = new HashSet(stopWords.length); for (int i = 0; i < stopWords.length; i++) stopTable.add(stopWords[i]); 1.8 +14 -12 jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java Index: GermanAnalyzer.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java,v retrieving revision 1.7 retrieving revision 1.8 diff -u -r1.7 -r1.8 --- GermanAnalyzer.java 9 Oct 2003 00:08:52 -0000 1.7 +++ GermanAnalyzer.java 12 Mar 2004 09:43:48 -0000 1.8 @@ -62,6 +62,8 @@ import java.io.File; import java.io.Reader; import java.util.Hashtable; +import java.util.Set; +import java.util.HashSet; /** * Analyzer for German language. Supports an external list of stopwords (words that @@ -96,19 +98,19 @@ /** * Contains the stopwords used with the StopFilter. */ - private Hashtable stoptable = new Hashtable(); + private Set stopSet = new HashSet(); /** * Contains words that should be indexed but not stemmed. */ - private Hashtable excltable = new Hashtable(); + private Set exclusionSet = new HashSet(); /** * Builds an analyzer. 
*/ public GermanAnalyzer() { - stoptable = StopFilter.makeStopTable( GERMAN_STOP_WORDS ); + stopSet = StopFilter.makeStopSet( GERMAN_STOP_WORDS ); } /** @@ -116,7 +118,7 @@ */ public GermanAnalyzer( String[] stopwords ) { - stoptable = StopFilter.makeStopTable( stopwords ); + stopSet = StopFilter.makeStopSet( stopwords ); } /** @@ -124,7 +126,7 @@ */ public GermanAnalyzer( Hashtable stopwords ) { - stoptable = stopwords; + stopSet = new HashSet(stopwords.keySet()); } /** @@ -132,7 +134,7 @@ */ public GermanAnalyzer( File stopwords ) { - stoptable = WordlistLoader.getWordtable( stopwords ); + stopSet = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet()); } /** @@ -140,7 +142,7 @@ */ public void setStemExclusionTable( String[] exclusionlist ) { - excltable = StopFilter.makeStopTable( exclusionlist ); + exclusionSet = StopFilter.makeStopSet( exclusionlist ); } /** @@ -148,7 +150,7 @@ */ public void setStemExclusionTable( Hashtable exclusionlist ) { - excltable = exclusionlist; + exclusionSet = new HashSet(exclusionlist.keySet()); } /** @@ -156,7 +158,7 @@ */ public void setStemExclusionTable( File exclusionlist ) { - excltable = WordlistLoader.getWordtable( exclusionlist ); + exclusionSet = new HashSet(WordlistLoader.getWordtable( exclusionlist ).keySet()); } /** @@ -170,8 +172,8 @@ TokenStream result = new StandardTokenizer( reader ); result = new StandardFilter( result ); // shouldn't there be a lowercaser before stop word filtering? 
- result = new StopFilter( result, stoptable ); - result = new GermanStemFilter( result, excltable ); + result = new StopFilter( result, stopSet ); + result = new GermanStemFilter( result, exclusionSet ); return result; } } 1.6 +28 -6 jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java Index: GermanStemFilter.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java,v retrieving revision 1.5 retrieving revision 1.6 diff -u -r1.5 -r1.6 --- GermanStemFilter.java 9 Dec 2002 19:02:21 -0000 1.5 +++ GermanStemFilter.java 12 Mar 2004 09:43:48 -0000 1.6 @@ -59,6 +59,8 @@ import org.apache.lucene.analysis.TokenStream; import java.io.IOException; import java.util.Hashtable; +import java.util.Set; +import java.util.HashSet; /** * A filter that stems German words. It supports a table of words that should @@ -75,7 +77,7 @@ */ private Token token = null; private GermanStemmer stemmer = null; - private Hashtable exclusions = null; + private Set exclusionSet = null; public GermanStemFilter( TokenStream in ) { @@ -85,13 +87,24 @@ /** * Builds a GermanStemFilter that uses an exclusiontable. + * @deprecated Use {@link #GermanStemFilter(org.apache.lucene.analysis.TokenStream, java.util.Set)} instead. */ public GermanStemFilter( TokenStream in, Hashtable exclusiontable ) { this( in ); - exclusions = exclusiontable; + exclusionSet = new HashSet(exclusiontable.keySet()); + } - + + /** + * Builds a GermanStemFilter that uses an exclusiontable.
+ */ + public GermanStemFilter( TokenStream in, Set exclusionSet ) + { + this( in ); + this.exclusionSet = exclusionSet; + } + /** * @return Returns the next token in the stream, or null at EOS */ @@ -102,7 +115,7 @@ return null; } // Check the exclusiontable - else if ( exclusions != null && exclusions.contains( token.termText() ) ) { + else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) { return token; } else { @@ -128,9 +141,18 @@ /** * Set an alternative exclusion list for this filter. + * @deprecated Use {@link #setExclusionSet(java.util.Set)} instead. */ public void setExclusionTable( Hashtable exclusiontable ) { - exclusions = exclusiontable; + exclusionSet = new HashSet(exclusiontable.keySet()); + } + + /** + * Set an alternative exclusion list for this filter. + */ + public void setExclusionSet( Set exclusionSet ) + { + this.exclusionSet = exclusionSet; } } 1.6 +4 -1 jakarta-lucene/src/java/org/apache/lucene/analysis/de/WordlistLoader.java Index: WordlistLoader.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/WordlistLoader.java,v retrieving revision 1.5 retrieving revision 1.6 diff -u -r1.5 -r1.6 --- WordlistLoader.java 10 Mar 2004 00:18:02 -0000 1.5 +++ WordlistLoader.java 12 Mar 2004 09:43:48 -0000 1.6 @@ -67,6 +67,8 @@ * * @author Gerhard Schwarz * @version $Id$ + * + * @todo refactor to convert to Sets instead of Hashtable */ public class WordlistLoader { /** @@ -92,6 +94,7 @@ /** * @param wordfile File containing the wordlist + * @todo Create a Set version of this method */ public static Hashtable getWordtable(File wordfile) { if (wordfile == null) { 1.6 +10 -7 jakarta-lucene/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java Index: RussianAnalyzer.java =================================================================== RCS file:
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java,v retrieving revision 1.5 retrieving revision 1.6 diff -u -r1.5 -r1.6 --- RussianAnalyzer.java 25 Jan 2004 14:18:12 -0000 1.5 +++ RussianAnalyzer.java 12 Mar 2004 09:43:48 -0000 1.6 @@ -60,6 +60,8 @@
import java.io.Reader; import java.util.Hashtable; +import java.util.Set; +import java.util.HashSet; /** * Analyzer for Russian language. Supports an external list of stopwords (words that @@ -215,7 +217,7 @@ /** * Contains the stopwords used with the StopFilter. */ - private Hashtable stoptable = new Hashtable(); + private Set stopSet = new HashSet(); /** * Charset for Russian letters. @@ -227,7 +229,7 @@ public RussianAnalyzer() { charset = RussianCharsets.UnicodeRussian; - stoptable = StopFilter.makeStopTable( + stopSet = StopFilter.makeStopSet( makeStopWords(RussianCharsets.UnicodeRussian)); } @@ -237,7 +239,7 @@ public RussianAnalyzer(char[] charset) { this.charset = charset; - stoptable = StopFilter.makeStopTable(makeStopWords(charset)); + stopSet = StopFilter.makeStopSet(makeStopWords(charset)); } /** @@ -246,7 +248,7 @@ public RussianAnalyzer(char[] charset, String[] stopwords) { this.charset = charset; - stoptable = StopFilter.makeStopTable(stopwords); + stopSet = StopFilter.makeStopSet(stopwords); } // Takes russian stop words and translates them to a String array, using @@ -270,11 +272,12 @@ /** * Builds an analyzer with the given stop words. 
+ * @todo create a Set version of this ctor */ public RussianAnalyzer(char[] charset, Hashtable stopwords) { this.charset = charset; - stoptable = stopwords; + stopSet = new HashSet(stopwords.keySet()); } /** @@ -287,7 +290,7 @@ { TokenStream result = new RussianLetterTokenizer(reader, charset); result = new RussianLowerCaseFilter(result, charset); - result = new StopFilter(result, stoptable); + result = new StopFilter(result, stopSet); result = new RussianStemFilter(result, charset); return result; } 1.7 +5 -5 jakarta-lucene/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java Index: StandardAnalyzer.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java,v retrieving revision 1.6 retrieving revision 1.7 diff -u -r1.6 -r1.7 --- StandardAnalyzer.java 10 Nov 2003 14:31:19 -0000 1.6 +++ StandardAnalyzer.java 12 Mar 2004 09:43:48 -0000 1.7 @@ -56,7 +56,7 @@ import org.apache.lucene.analysis.*; import java.io.Reader; -import java.util.Hashtable; +import java.util.Set; /** * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link @@ -65,7 +65,7 @@ * @version $Id$ */ public class StandardAnalyzer extends Analyzer { - private Hashtable stopTable; + private Set stopSet; /** An array containing some common English words that are usually not useful for searching. */ @@ -78,7 +78,7 @@ /** Builds an analyzer with the given stop words.
*/ public StandardAnalyzer(String[] stopWords) { - stopTable = StopFilter.makeStopTable(stopWords); + stopSet = StopFilter.makeStopSet(stopWords); } /** Constructs a {@link StandardTokenizer} filtered by a {@link @@ -87,7 +87,7 @@ TokenStream result = new StandardTokenizer(reader); result = new StandardFilter(result); result = new LowerCaseFilter(result); - result = new StopFilter(result, stopTable); + result = new StopFilter(result, stopSet); return result; } } --------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]