ehatcher 2004/03/12 01:45:17 Modified: src/java/org/apache/lucene/analysis/de GermanAnalyzer.java Log: format clean-up Revision Changes Path 1.9 +96 -104 jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java Index: GermanAnalyzer.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java,v retrieving revision 1.8 retrieving revision 1.9 diff -u -r1.8 -r1.9 --- GermanAnalyzer.java 12 Mar 2004 09:43:48 -0000 1.8 +++ GermanAnalyzer.java 12 Mar 2004 09:45:17 -0000 1.9 @@ -59,11 +59,12 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; + import java.io.File; import java.io.Reader; +import java.util.HashSet; import java.util.Hashtable; import java.util.Set; -import java.util.HashSet; /** * Analyzer for German language. Supports an external list of stopwords (words that @@ -72,108 +73,99 @@ * A default set of stopwords is used unless an alternative list is specified, the * exclusion list is empty by default. * - * @author Gerhard Schwarz - * @version $Id$ + * @author Gerhard Schwarz + * @version $Id$ */ -public class GermanAnalyzer extends Analyzer -{ - /** - * List of typical german stopwords. - */ - private String[] GERMAN_STOP_WORDS = { - "einer", "eine", "eines", "einem", "einen", - "der", "die", "das", "dass", "daß", - "du", "er", "sie", "es", - "was", "wer", "wie", "wir", - "und", "oder", "ohne", "mit", - "am", "im", "in", "aus", "auf", - "ist", "sein", "war", "wird", - "ihr", "ihre", "ihres", - "als", "für", "von", "mit", - "dich", "dir", "mich", "mir", - "mein", "sein", "kein", - "durch", "wegen", "wird" - }; - - /** - * Contains the stopwords used with the StopFilter. - */ - private Set stopSet = new HashSet(); - - /** - * Contains words that should be indexed but not stemmed. 
- */ - private Set exclusionSet = new HashSet(); - - /** - * Builds an analyzer. - */ - public GermanAnalyzer() - { - stopSet = StopFilter.makeStopSet( GERMAN_STOP_WORDS ); - } - - /** - * Builds an analyzer with the given stop words. - */ - public GermanAnalyzer( String[] stopwords ) - { - stopSet = StopFilter.makeStopSet( stopwords ); - } - - /** - * Builds an analyzer with the given stop words. - */ - public GermanAnalyzer( Hashtable stopwords ) - { - stopSet = new HashSet(stopwords.keySet()); - } - - /** - * Builds an analyzer with the given stop words. - */ - public GermanAnalyzer( File stopwords ) - { - stopSet = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet()); - } - - /** - * Builds an exclusionlist from an array of Strings. - */ - public void setStemExclusionTable( String[] exclusionlist ) - { - exclusionSet = StopFilter.makeStopSet( exclusionlist ); - } - - /** - * Builds an exclusionlist from a Hashtable. - */ - public void setStemExclusionTable( Hashtable exclusionlist ) - { - exclusionSet = new HashSet(exclusionlist.keySet()); - } - - /** - * Builds an exclusionlist from the words contained in the given file. - */ - public void setStemExclusionTable( File exclusionlist ) - { - exclusionSet = new HashSet(WordlistLoader.getWordtable( exclusionlist ).keySet()); - } - - /** - * Creates a TokenStream which tokenizes all the text in the provided Reader. - * - * @return A TokenStream build from a StandardTokenizer filtered with - * StandardFilter, StopFilter, GermanStemFilter - */ - public TokenStream tokenStream( String fieldName, Reader reader ) - { - TokenStream result = new StandardTokenizer( reader ); - result = new StandardFilter( result ); - // shouldn't there be a lowercaser before stop word filtering? - result = new StopFilter( result, stopSet ); - result = new GermanStemFilter( result, exclusionSet ); - return result; - } +public class GermanAnalyzer { + /** + * List of typical german stopwords.
+ */ + private String[] GERMAN_STOP_WORDS = { + "einer", "eine", "eines", "einem", "einen", + "der", "die", "das", "dass", "daß", + "du", "er", "sie", "es", + "was", "wer", "wie", "wir", + "und", "oder", "ohne", "mit", + "am", "im", "in", "aus", "auf", + "ist", "sein", "war", "wird", + "ihr", "ihre", "ihres", + "als", "für", "von", "mit", + "dich", "dir", "mich", "mir", + "mein", "sein", "kein", + "durch", "wegen", "wird" + }; + + /** + * Contains the stopwords used with the StopFilter. + */ + private Set stopSet = new HashSet(); + + /** + * Contains words that should be indexed but not stemmed. + */ + private Set exclusionSet = new HashSet(); + + /** + * Builds an analyzer. + */ + public GermanAnalyzer() { + stopSet = StopFilter.makeStopSet(GERMAN_STOP_WORDS); + } + + /** + * Builds an analyzer with the given stop words. + */ + public GermanAnalyzer(String[] stopwords) { + stopSet = StopFilter.makeStopSet(stopwords); + } + + /** + * Builds an analyzer with the given stop words. + */ + public GermanAnalyzer(Hashtable stopwords) { + stopSet = new HashSet(stopwords.keySet()); + } + + /** + * Builds an analyzer with the given stop words. + */ + public GermanAnalyzer(File stopwords) { + stopSet = new HashSet(WordlistLoader.getWordtable(stopwords).keySet()); + } + + /** + * Builds an exclusionlist from an array of Strings. + */ + public void setStemExclusionTable(String[] exclusionlist) { + exclusionSet = StopFilter.makeStopSet(exclusionlist); + } + + /** + * Builds an exclusionlist from a Hashtable. + */ + public void setStemExclusionTable(Hashtable exclusionlist) { + exclusionSet = new HashSet(exclusionlist.keySet()); + } + + /** + * Builds an exclusionlist from the words contained in the given file. + */ + public void setStemExclusionTable(File exclusionlist) { + exclusionSet = new HashSet(WordlistLoader.getWordtable(exclusionlist).keySet()); + } + + /** + * Creates a TokenStream which tokenizes all the text in the provided Reader.
+ * + * @return A TokenStream build from a StandardTokenizer filtered with + * StandardFilter, StopFilter, GermanStemFilter + */ + public TokenStream tokenStream(String fieldName, Reader reader) { + TokenStream result = new StandardTokenizer(reader); + result = new StandardFilter(result); +// shouldn't there be a lowercaser before stop word filtering? + result = new StopFilter(result, stopSet); + result = new GermanStemFilter(result, exclusionSet); + return result; + } }
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]