standard StandardAnalyzer.java

ehatcher Fri, 12 Mar 2004 01:43:48 -0800

ehatcher    2004/03/12 01:43:48

  Modified:    src/java/org/apache/lucene/analysis StopAnalyzer.java
                        StopFilter.java
               src/java/org/apache/lucene/analysis/de GermanAnalyzer.java
                        GermanStemFilter.java WordlistLoader.java
               src/java/org/apache/lucene/analysis/ru RussianAnalyzer.java
               src/java/org/apache/lucene/analysis/standard
                        StandardAnalyzer.java
  Log:
  convert StopFilter to use Set, with supporting changes to avoid calling deprecated 
methods.  never compromise on your ideals!
  
  Revision  Changes    Path
  1.3       +5 -4      
jakarta-lucene/src/java/org/apache/lucene/analysis/StopAnalyzer.java
  
  Index: StopAnalyzer.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/StopAnalyzer.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- StopAnalyzer.java 9 Dec 2002 19:02:20 -0000       1.2
  +++ StopAnalyzer.java 12 Mar 2004 09:43:48 -0000      1.3
  @@ -56,11 +56,12 @@
   
   import java.io.Reader;
   import java.util.Hashtable;
  +import java.util.Set;
   
   /** Filters LetterTokenizer with LowerCaseFilter and StopFilter. */
   
   public final class StopAnalyzer extends Analyzer {
  -  private Hashtable stopTable;
  +  private Set stopWords;
   
     /** An array containing some common English words that are not usually useful
       for searching. */
  @@ -74,17 +75,17 @@
   
     /** Builds an analyzer which removes words in ENGLISH_STOP_WORDS. */
     public StopAnalyzer() {
  -    stopTable = StopFilter.makeStopTable(ENGLISH_STOP_WORDS);
  +    stopWords = StopFilter.makeStopSet(ENGLISH_STOP_WORDS);
     }
   
     /** Builds an analyzer which removes words in the provided array. */
     public StopAnalyzer(String[] stopWords) {
  -    stopTable = StopFilter.makeStopTable(stopWords);
  +    this.stopWords = StopFilter.makeStopSet(stopWords);
     }
   
     /** Filters LowerCaseTokenizer with StopFilter. */
     public TokenStream tokenStream(String fieldName, Reader reader) {
  -    return new StopFilter(new LowerCaseTokenizer(reader), stopTable);
  +    return new StopFilter(new LowerCaseTokenizer(reader), stopWords);
     }
   }
   
  
  
  
  1.10      +9 -4      
jakarta-lucene/src/java/org/apache/lucene/analysis/StopFilter.java
  
  Index: StopFilter.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/StopFilter.java,v
  retrieving revision 1.9
  retrieving revision 1.10
  diff -u -r1.9 -r1.10
  --- StopFilter.java   10 Mar 2004 23:17:37 -0000      1.9
  +++ StopFilter.java   12 Mar 2004 09:43:48 -0000      1.10
  @@ -57,6 +57,7 @@
   import java.io.IOException;
   import java.util.HashSet;
   import java.util.Hashtable;
  +import java.util.Set;
   
   /**
    * Removes stop words from a token stream.
  @@ -64,7 +65,7 @@
   
   public final class StopFilter extends TokenFilter {
   
  -  private HashSet stopWords;
  +  private Set stopWords;
   
     /**
      * Constructs a filter which removes words from the input
  @@ -79,7 +80,7 @@
      * Constructs a filter which removes words from the input
      * TokenStream that are named in the Hashtable.
      *
  -   * @deprecated Use {@link #StopFilter(TokenStream, HashSet)} StopFilter(TokenStream,Map)} instead
  +   * @deprecated Use {@link #StopFilter(TokenStream, Set)} StopFilter(TokenStream,Map)} instead
      */
     public StopFilter(TokenStream in, Hashtable stopTable) {
       super(in);
  @@ -89,8 +90,12 @@
     /**
      * Constructs a filter which removes words from the input
      * TokenStream that are named in the Set.
  +   * It is crucial that an efficient Set implementation is used
  +   * for maximum performance.
  +   *
  +   * @see #makeStopSet(java.lang.String[])
      */
  -  public StopFilter(TokenStream in, HashSet stopWords) {
  +  public StopFilter(TokenStream in, Set stopWords) {
       super(in);
       this.stopWords = stopWords;
     }
  @@ -116,7 +121,7 @@
      * This permits this stopWords construction to be cached once when
      * an Analyzer is constructed.
      */
  -  public static final HashSet makeStopSet(String[] stopWords) {
  +  public static final Set makeStopSet(String[] stopWords) {
       HashSet stopTable = new HashSet(stopWords.length);
       for (int i = 0; i < stopWords.length; i++)
         stopTable.add(stopWords[i]);
  
  
  
  1.8       +14 -12    
jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java
  
  Index: GermanAnalyzer.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanAnalyzer.java,v
  retrieving revision 1.7
  retrieving revision 1.8
  diff -u -r1.7 -r1.8
  --- GermanAnalyzer.java       9 Oct 2003 00:08:52 -0000       1.7
  +++ GermanAnalyzer.java       12 Mar 2004 09:43:48 -0000      1.8
  @@ -62,6 +62,8 @@
   import java.io.File;
   import java.io.Reader;
   import java.util.Hashtable;
  +import java.util.Set;
  +import java.util.HashSet;
   
   /**
    * Analyzer for German language. Supports an external list of stopwords (words that
  @@ -96,19 +98,19 @@
       /**
        * Contains the stopwords used with the StopFilter.
        */
  -    private Hashtable stoptable = new Hashtable();
  +    private Set stopSet = new HashSet();
   
       /**
        * Contains words that should be indexed but not stemmed.
        */
  -    private Hashtable excltable = new Hashtable();
  +    private Set exclusionSet = new HashSet();
   
       /**
        * Builds an analyzer.
        */
       public GermanAnalyzer()
       {
  -     stoptable = StopFilter.makeStopTable( GERMAN_STOP_WORDS );
  +     stopSet = StopFilter.makeStopSet( GERMAN_STOP_WORDS );
       }
   
       /**
  @@ -116,7 +118,7 @@
        */
       public GermanAnalyzer( String[] stopwords )
       {
  -     stoptable = StopFilter.makeStopTable( stopwords );
  +     stopSet = StopFilter.makeStopSet( stopwords );
       }
   
       /**
  @@ -124,7 +126,7 @@
        */
       public GermanAnalyzer( Hashtable stopwords )
       {
  -     stoptable = stopwords;
  +     stopSet = new HashSet(stopwords.keySet());
       }
   
       /**
  @@ -132,7 +134,7 @@
        */
       public GermanAnalyzer( File stopwords )
       {
  -     stoptable = WordlistLoader.getWordtable( stopwords );
  +     stopSet = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet());
       }
   
       /**
  @@ -140,7 +142,7 @@
        */
       public void setStemExclusionTable( String[] exclusionlist )
       {
  -     excltable = StopFilter.makeStopTable( exclusionlist );
  +     exclusionSet = StopFilter.makeStopSet( exclusionlist );
       }
   
       /**
  @@ -148,7 +150,7 @@
        */
       public void setStemExclusionTable( Hashtable exclusionlist )
       {
  -     excltable = exclusionlist;
  +     exclusionSet = new HashSet(exclusionlist.keySet());
       }
   
       /**
  @@ -156,7 +158,7 @@
        */
       public void setStemExclusionTable( File exclusionlist )
       {
  -     excltable = WordlistLoader.getWordtable( exclusionlist );
  +     exclusionSet = new HashSet(WordlistLoader.getWordtable( exclusionlist ).keySet());
       }
   
       /**
  @@ -170,8 +172,8 @@
        TokenStream result = new StandardTokenizer( reader );
        result = new StandardFilter( result );
     // shouldn't there be a lowercaser before stop word filtering?
  -  result = new StopFilter( result, stoptable );
  -     result = new GermanStemFilter( result, excltable );
  +  result = new StopFilter( result, stopSet );
  +     result = new GermanStemFilter( result, exclusionSet );
        return result;
       }
   }
  
  
  
  1.6       +28 -6     
jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java
  
  Index: GermanStemFilter.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/GermanStemFilter.java,v
  retrieving revision 1.5
  retrieving revision 1.6
  diff -u -r1.5 -r1.6
  --- GermanStemFilter.java     9 Dec 2002 19:02:21 -0000       1.5
  +++ GermanStemFilter.java     12 Mar 2004 09:43:48 -0000      1.6
  @@ -59,6 +59,8 @@
   import org.apache.lucene.analysis.TokenStream;
   import java.io.IOException;
   import java.util.Hashtable;
  +import java.util.Set;
  +import java.util.HashSet;
   
   /**
    * A filter that stems German words. It supports a table of words that should
  @@ -75,7 +77,7 @@
        */
       private Token token = null;
       private GermanStemmer stemmer = null;
  -    private Hashtable exclusions = null;
  +    private Set exclusionSet = null;
       
       public GermanStemFilter( TokenStream in )
       {
  @@ -85,13 +87,24 @@
       
       /**
        * Builds a GermanStemFilter that uses an exclusiontable.
  +     * @deprecated Use {@link #GermanStemFilter(org.apache.lucene.analysis.TokenStream, java.util.Set)} instead.
        */
       public GermanStemFilter( TokenStream in, Hashtable exclusiontable )
       {
        this( in );
  -     exclusions = exclusiontable;
  +     exclusionSet = new HashSet(exclusiontable.keySet());
  +
       }
  -    
  +
  +    /**
  +     * Builds a GermanStemFilter that uses an exclusiontable.
  +     */
  +    public GermanStemFilter( TokenStream in, Set exclusionSet )
  +    {
  +     this( in );
  +     this.exclusionSet = exclusionSet;
  +    }
  +
       /**
        * @return  Returns the next token in the stream, or null at EOS
        */
  @@ -102,7 +115,7 @@
            return null;
        }
        // Check the exclusiontable
  -     else if ( exclusions != null && exclusions.contains( token.termText() ) ) {
  +     else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) {
            return token;
        }
        else {
  @@ -128,9 +141,18 @@
   
       /**
        * Set an alternative exclusion list for this filter.
  +     * @deprecated Use {@link #setExclusionSet(java.util.Set)} instead.
        */
       public void setExclusionTable( Hashtable exclusiontable )
       {
  -     exclusions = exclusiontable;
  +     exclusionSet = new HashSet(exclusiontable.keySet());
  +    }
  +
  +    /**
  +     * Set an alternative exclusion list for this filter.
  +     */
  +    public void setExclusionSet( Set exclusionSet )
  +    {
  +     this.exclusionSet = exclusionSet;
       }
   }
  
  
  
  1.6       +4 -1      
jakarta-lucene/src/java/org/apache/lucene/analysis/de/WordlistLoader.java
  
  Index: WordlistLoader.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/de/WordlistLoader.java,v
  retrieving revision 1.5
  retrieving revision 1.6
  diff -u -r1.5 -r1.6
  --- WordlistLoader.java       10 Mar 2004 00:18:02 -0000      1.5
  +++ WordlistLoader.java       12 Mar 2004 09:43:48 -0000      1.6
  @@ -67,6 +67,8 @@
    *
    * @author    Gerhard Schwarz
    * @version   $Id$
  + *
  + * @todo refactor to convert to Sets instead of Hashtable
    */
   public class WordlistLoader {
     /**
  @@ -92,6 +94,7 @@
   
     /**
      * @param wordfile  File containing the wordlist
  +   * @todo Create a Set version of this method
      */
     public static Hashtable getWordtable(File wordfile) {
       if (wordfile == null) {
  
  
  
  1.6       +10 -7     
jakarta-lucene/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
  
  Index: RussianAnalyzer.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java,v
  retrieving revision 1.5
  retrieving revision 1.6
  diff -u -r1.5 -r1.6
  --- RussianAnalyzer.java      25 Jan 2004 14:18:12 -0000      1.5
  +++ RussianAnalyzer.java      12 Mar 2004 09:43:48 -0000      1.6
  @@ -60,6 +60,8 @@


   import java.io.Reader;

   import java.util.Hashtable;

  +import java.util.Set;

  +import java.util.HashSet;

   

   /**

    * Analyzer for Russian language. Supports an external list of stopwords (words that

  @@ -215,7 +217,7 @@
       /**

        * Contains the stopwords used with the StopFilter.

        */

  -    private Hashtable stoptable = new Hashtable();

  +    private Set stopSet = new HashSet();

   

       /**

        * Charset for Russian letters.

  @@ -227,7 +229,7 @@
   

       public RussianAnalyzer() {

           charset = RussianCharsets.UnicodeRussian;

  -        stoptable = StopFilter.makeStopTable(

  +        stopSet = StopFilter.makeStopSet(

                       makeStopWords(RussianCharsets.UnicodeRussian));

       }

   

  @@ -237,7 +239,7 @@
       public RussianAnalyzer(char[] charset)

       {

           this.charset = charset;

  -        stoptable = StopFilter.makeStopTable(makeStopWords(charset));

  +        stopSet = StopFilter.makeStopSet(makeStopWords(charset));

       }

   

       /**

  @@ -246,7 +248,7 @@
       public RussianAnalyzer(char[] charset, String[] stopwords)

       {

           this.charset = charset;

  -        stoptable = StopFilter.makeStopTable(stopwords);

  +        stopSet = StopFilter.makeStopSet(stopwords);

       }

   

       // Takes russian stop words and translates them to a String array, using

  @@ -270,11 +272,12 @@
   

       /**

        * Builds an analyzer with the given stop words.

  +     * @todo create a Set version of this ctor

        */

       public RussianAnalyzer(char[] charset, Hashtable stopwords)

       {

           this.charset = charset;

  -        stoptable = stopwords;

  +        stopSet = new HashSet(stopwords.keySet());

       }

   

       /**

  @@ -287,7 +290,7 @@
       {

           TokenStream result = new RussianLetterTokenizer(reader, charset);

           result = new RussianLowerCaseFilter(result, charset);

  -        result = new StopFilter(result, stoptable);

  +        result = new StopFilter(result, stopSet);

           result = new RussianStemFilter(result, charset);

           return result;

       }

  
  
  
  1.7       +5 -5      
jakarta-lucene/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
  
  Index: StandardAnalyzer.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java,v
  retrieving revision 1.6
  retrieving revision 1.7
  diff -u -r1.6 -r1.7
  --- StandardAnalyzer.java     10 Nov 2003 14:31:19 -0000      1.6
  +++ StandardAnalyzer.java     12 Mar 2004 09:43:48 -0000      1.7
  @@ -56,7 +56,7 @@
   
   import org.apache.lucene.analysis.*;
   import java.io.Reader;
  -import java.util.Hashtable;
  +import java.util.Set;
   
   /**
   * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
  @@ -65,7 +65,7 @@
    * @version $Id$
    */
   public class StandardAnalyzer extends Analyzer {
  -  private Hashtable stopTable;
  +  private Set stopSet;
   
     /** An array containing some common English words that are usually not
     useful for searching. */
  @@ -78,7 +78,7 @@
   
     /** Builds an analyzer with the given stop words. */
     public StandardAnalyzer(String[] stopWords) {
  -    stopTable = StopFilter.makeStopTable(stopWords);
  +    stopSet = StopFilter.makeStopSet(stopWords);
     }
   
     /** Constructs a {@link StandardTokenizer} filtered by a {@link
  @@ -87,7 +87,7 @@
       TokenStream result = new StandardTokenizer(reader);
       result = new StandardFilter(result);
       result = new LowerCaseFilter(result);
  -    result = new StopFilter(result, stopTable);
  +    result = new StopFilter(result, stopSet);
       return result;
     }
   }
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

cvs commit: jakarta-lucene/src/java/org/apache/lucene/analysis/standard StandardAnalyzer.java

Reply via email to