ehatcher 2004/03/12 01:45:17 Modified: src/java/org/apache/lucene/analysis/de GermanAnalyzer.java Log: format clean-up Revision Changes Path 1.9 +96 -104 jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java Index: GermanAnalyzer.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java,v retrieving revision 1.8 retrieving revision 1.9 diff -u -r1.8 -r1.9 --- GermanAnalyzer.java 12 Mar 2004 09:43:48 -0000 1.8 +++ GermanAnalyzer.java 12 Mar 2004 09:45:17 -0000 1.9 @@ -59,11 +59,12 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; + import java.io.File; import java.io.Reader; +import java.util.HashSet; import java.util.Hashtable; import java.util.Set; -import java.util.HashSet; /** * Analyzer for German language. Supports an external list of stopwords (words that @@ -72,108 +73,99 @@ * A default set of stopwords is used unless an alternative list is specified, the * exclusion list is empty by default. * - * @author Gerhard Schwarz - * @version $Id$ + * @author Gerhard Schwarz + * @version $Id$ */ -public class GermanAnalyzer extends Analyzer -{ - /** - * List of typical german stopwords. - */ - private String[] GERMAN_STOP_WORDS = { - "einer", "eine", "eines", "einem", "einen", - "der", "die", "das", "dass", "daß", - "du", "er", "sie", "es", - "was", "wer", "wie", "wir", - "und", "oder", "ohne", "mit", - "am", "im", "in", "aus", "auf", - "ist", "sein", "war", "wird", - "ihr", "ihre", "ihres", - "als", "für", "von", "mit", - "dich", "dir", "mich", "mir", - "mein", "sein", "kein", - "durch", "wegen", "wird" - }; - - /** - * Contains the stopwords used with the StopFilter. - */ - private Set stopSet = new HashSet(); - - /** - * Contains words that should be indexed but not stemmed. 
- */ - private Set exclusionSet = new HashSet(); - - /** - * Builds an analyzer. - */ - public GermanAnalyzer() - { - stopSet = StopFilter.makeStopSet( GERMAN_STOP_WORDS ); - } - - /** - * Builds an analyzer with the given stop words. - */ - public GermanAnalyzer( String[] stopwords ) - { - stopSet = StopFilter.makeStopSet( stopwords ); - } - - /** - * Builds an analyzer with the given stop words. - */ - public GermanAnalyzer( Hashtable stopwords ) - { - stopSet = new HashSet(stopwords.keySet()); - } - - /** - * Builds an analyzer with the given stop words. - */ - public GermanAnalyzer( File stopwords ) - { - stopSet = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet()); - } - - /** - * Builds an exclusionlist from an array of Strings. - */ - public void setStemExclusionTable( String[] exclusionlist ) - { - exclusionSet = StopFilter.makeStopSet( exclusionlist ); - } - - /** - * Builds an exclusionlist from a Hashtable. - */ - public void setStemExclusionTable( Hashtable exclusionlist ) - { - exclusionSet = new HashSet(exclusionlist.keySet()); - } - - /** - * Builds an exclusionlist from the words contained in the given file. - */ - public void setStemExclusionTable( File exclusionlist ) - { - exclusionSet = new HashSet(WordlistLoader.getWordtable( exclusionlist ).keySet()); - } - - /** - * Creates a TokenStream which tokenizes all the text in the provided Reader. - * - * @return A TokenStream build from a StandardTokenizer filtered with - * StandardFilter, StopFilter, GermanStemFilter - */ - public TokenStream tokenStream( String fieldName, Reader reader ) - { - TokenStream result = new StandardTokenizer( reader ); - result = new StandardFilter( result ); - // shouldn't there be a lowercaser before stop word filtering? - result = new StopFilter( result, stopSet ); - result = new GermanStemFilter( result, exclusionSet ); - return result; - } +public class GermanAnalyzer { + /** + * List of typical german stopwords.
+ */ + private String[] GERMAN_STOP_WORDS = { + "einer", "eine", "eines", "einem", "einen", + "der", "die", "das", "dass", "daß", + "du", "er", "sie", "es", + "was", "wer", "wie", "wir", + "und", "oder", "ohne", "mit", + "am", "im", "in", "aus", "auf", + "ist", "sein", "war", "wird", + "ihr", "ihre", "ihres", + "als", "für", "von", "mit", + "dich", "dir", "mich", "mir", + "mein", "sein", "kein", + "durch", "wegen", "wird" + }; + + /** + * Contains the stopwords used with the StopFilter. + */ + private Set stopSet = new HashSet(); + + /** + * Contains words that should be indexed but not stemmed. + */ + private Set exclusionSet = new HashSet(); + + /** + * Builds an analyzer. + */ + public GermanAnalyzer() { + stopSet = StopFilter.makeStopSet(GERMAN_STOP_WORDS); + } + + /** + * Builds an analyzer with the given stop words. + */ + public GermanAnalyzer(String[] stopwords) { + stopSet = StopFilter.makeStopSet(stopwords); + } + + /** + * Builds an analyzer with the given stop words. + */ + public GermanAnalyzer(Hashtable stopwords) { + stopSet = new HashSet(stopwords.keySet()); + } + + /** + * Builds an analyzer with the given stop words. + */ + public GermanAnalyzer(File stopwords) { + stopSet = new HashSet(WordlistLoader.getWordtable(stopwords).keySet()); + } + + /** + * Builds an exclusionlist from an array of Strings. + */ + public void setStemExclusionTable(String[] exclusionlist) { + exclusionSet = StopFilter.makeStopSet(exclusionlist); + } + + /** + * Builds an exclusionlist from a Hashtable. + */ + public void setStemExclusionTable(Hashtable exclusionlist) { + exclusionSet = new HashSet(exclusionlist.keySet()); + } + + /** + * Builds an exclusionlist from the words contained in the given file. + */ + public void setStemExclusionTable(File exclusionlist) { + exclusionSet = new HashSet(WordlistLoader.getWordtable(exclusionlist).keySet()); + } + + /** + * Creates a TokenStream which tokenizes all the text in the provided Reader.
+ * + * @return A TokenStream build from a StandardTokenizer filtered with + * StandardFilter, StopFilter, GermanStemFilter + */ + public TokenStream tokenStream(String fieldName, Reader reader) { + TokenStream result = new StandardTokenizer(reader); + result = new StandardFilter(result); +// shouldn't there be a lowercaser before stop word filtering? + result = new StopFilter(result, stopSet); + result = new GermanStemFilter(result, exclusionSet); + return result; + } }
--------------------------------------------------------------------- To unsubscribe, e-mail: [EMAIL PROTECTED] For additional commands, e-mail: [EMAIL PROTECTED]