nl DutchAnalyzer.java DutchStemFilter.java DutchStemmer.java WordlistLoader.java

ehatcher Wed, 10 Mar 2004 19:05:31 -0800

ehatcher    2004/03/10 19:05:36

  Modified:    contributions/analyzers/src/java/org/apache/lucene/analysis
                        LengthFilter.java
               contributions/analyzers/src/java/org/apache/lucene/analysis/br
                        BrazilianAnalyzer.java BrazilianStemFilter.java
               contributions/analyzers/src/java/org/apache/lucene/analysis/cjk
                        CJKAnalyzer.java
               contributions/analyzers/src/java/org/apache/lucene/analysis/cz
                        CzechAnalyzer.java
               contributions/analyzers/src/java/org/apache/lucene/analysis/fr
                        FrenchAnalyzer.java FrenchStemFilter.java
               contributions/analyzers/src/java/org/apache/lucene/analysis/nl
                        DutchAnalyzer.java DutchStemFilter.java
                        DutchStemmer.java WordlistLoader.java
  Log:
  bringing sandbox analyzers up to date with changes to the core StopFilter and migrating away from using Hashtable
  
  Revision  Changes    Path
  1.2       +2 -2      
jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/LengthFilter.java
  
  Index: LengthFilter.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/LengthFilter.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- LengthFilter.java 2 Mar 2004 12:52:16 -0000       1.1
  +++ LengthFilter.java 11 Mar 2004 03:05:36 -0000      1.2
  @@ -35,7 +35,7 @@
      */
     public LengthFilter(TokenStream in, int min, int max)
     {
  -    input = in;
  +    super(in);
       this.min = min;
       this.max =max;
     }
  
  
  
  1.4       +10 -9     
jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java
  
  Index: BrazilianAnalyzer.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianAnalyzer.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- BrazilianAnalyzer.java    22 Jan 2004 20:54:46 -0000      1.3
  +++ BrazilianAnalyzer.java    11 Mar 2004 03:05:36 -0000      1.4
  @@ -64,6 +64,7 @@
   import java.io.File;
   import java.io.Reader;
   import java.util.Hashtable;
  +import java.util.HashSet;
   
   /**
    * Analyzer for brazilian language. Supports an external list of stopwords (words that
  @@ -102,57 +103,57 @@
        /**
         * Contains the stopwords used with the StopFilter.
         */
  -     private Hashtable stoptable = new Hashtable();
  +     private HashSet stoptable = new HashSet();
        /**
         * Contains words that should be indexed but not stemmed.
         */
  -     private Hashtable excltable = new Hashtable();
  +     private HashSet excltable = new HashSet();
   
        /**
         * Builds an analyzer.
         */
        public BrazilianAnalyzer() {
  -             stoptable = StopFilter.makeStopTable( BRAZILIAN_STOP_WORDS );
  +             stoptable = StopFilter.makeStopSet( BRAZILIAN_STOP_WORDS );
        }
   
        /**
         * Builds an analyzer with the given stop words.
         */
        public BrazilianAnalyzer( String[] stopwords ) {
  -             stoptable = StopFilter.makeStopTable( stopwords );
  +             stoptable = StopFilter.makeStopSet( stopwords );
        }
   
        /**
         * Builds an analyzer with the given stop words.
         */
        public BrazilianAnalyzer( Hashtable stopwords ) {
  -             stoptable = stopwords;
  +             stoptable = new HashSet(stopwords.keySet());
        }
   
        /**
         * Builds an analyzer with the given stop words.
         */
        public BrazilianAnalyzer( File stopwords ) {
  -             stoptable = WordlistLoader.getWordtable( stopwords );
  +             stoptable = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet());
        }
   
        /**
         * Builds an exclusionlist from an array of Strings.
         */
        public void setStemExclusionTable( String[] exclusionlist ) {
  -             excltable = StopFilter.makeStopTable( exclusionlist );
  +             excltable = StopFilter.makeStopSet( exclusionlist );
        }
        /**
         * Builds an exclusionlist from a Hashtable.
         */
        public void setStemExclusionTable( Hashtable exclusionlist ) {
  -             excltable = exclusionlist;
  +             excltable = new HashSet(exclusionlist.keySet());
        }
        /**
         * Builds an exclusionlist from the words contained in the given file.
         */
        public void setStemExclusionTable( File exclusionlist ) {
  -             excltable = WordlistLoader.getWordtable( exclusionlist );
  +             excltable = new HashSet(WordlistLoader.getWordtable( exclusionlist ).keySet());
        }
   
        /**
  
  
  
  1.5       +9 -1      
jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java
  
  Index: BrazilianStemFilter.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/br/BrazilianStemFilter.java,v
  retrieving revision 1.4
  retrieving revision 1.5
  diff -u -r1.4 -r1.5
  --- BrazilianStemFilter.java  22 Jan 2004 20:54:46 -0000      1.4
  +++ BrazilianStemFilter.java  11 Mar 2004 03:05:36 -0000      1.5
  @@ -59,6 +59,7 @@
   import org.apache.lucene.analysis.TokenStream;
   import java.io.IOException;
   import java.util.Hashtable;
  +import java.util.HashSet;
   
   /**
    * Based on (copied) the GermanStemFilter
  @@ -79,7 +80,7 @@
         */
        private Token token = null;
        private BrazilianStemmer stemmer = null;
  -     private Hashtable exclusions = null;
  +     private HashSet exclusions = null;
   
        public BrazilianStemFilter( TokenStream in ) {
       super(in);
  @@ -88,8 +89,15 @@
   
        /**
         * Builds a BrazilianStemFilter that uses an exclusiontable.
  +   * 
  +   * @deprecated
         */
        public BrazilianStemFilter( TokenStream in, Hashtable exclusiontable ) {
  +             this( in );
  +             this.exclusions = new HashSet(exclusiontable.keySet());
  +     }
  +
  +     public BrazilianStemFilter( TokenStream in, HashSet exclusiontable ) {
                this( in );
                this.exclusions = exclusiontable;
        }
  
  
  
  1.3       +5 -4      
jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java
  
  Index: CJKAnalyzer.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKAnalyzer.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- CJKAnalyzer.java  22 Jan 2004 20:54:47 -0000      1.2
  +++ CJKAnalyzer.java  11 Mar 2004 03:05:36 -0000      1.3
  @@ -63,6 +63,7 @@
   import java.io.Reader;
   
   import java.util.Hashtable;
  +import java.util.HashSet;
   
   
   /**
  @@ -91,7 +92,7 @@
       //~ Instance fields --------------------------------------------------------
   
       /** stop word list */
  -    private Hashtable stopTable;
  +    private HashSet stopTable;
   
       //~ Constructors -----------------------------------------------------------
   
  @@ -99,7 +100,7 @@
        * Builds an analyzer which removes words in STOP_WORDS.
        */
       public CJKAnalyzer() {
  -        stopTable = StopFilter.makeStopTable(stopWords);
  +        stopTable = StopFilter.makeStopSet(stopWords);
       }
   
       /**
  @@ -108,7 +109,7 @@
        * @param stopWords stop word array
        */
       public CJKAnalyzer(String[] stopWords) {
  -        stopTable = StopFilter.makeStopTable(stopWords);
  +        stopTable = StopFilter.makeStopSet(stopWords);
       }
   
       //~ Methods ----------------------------------------------------------------
  
  
  
  1.3       +14 -7     
jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java
  
  Index: CzechAnalyzer.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cz/CzechAnalyzer.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- CzechAnalyzer.java        22 Jan 2004 20:54:47 -0000      1.2
  +++ CzechAnalyzer.java        11 Mar 2004 03:05:36 -0000      1.3
  @@ -64,6 +64,7 @@
   
   import java.io.*;
   import java.util.Hashtable;
  +import java.util.HashSet;
   
   /**
    * Analyzer for Czech language. Supports an external list of stopwords (words that
  @@ -102,26 +103,32 @@
        /**
         * Contains the stopwords used with the StopFilter.
         */
  -     private Hashtable stoptable = new Hashtable();
  +     private HashSet stoptable;
   
        /**
         * Builds an analyzer.
         */
        public CzechAnalyzer() {
  -             stoptable = StopFilter.makeStopTable( STOP_WORDS );
  +             stoptable = StopFilter.makeStopSet( STOP_WORDS );
        }
   
        /**
         * Builds an analyzer with the given stop words.
         */
        public CzechAnalyzer( String[] stopwords ) {
  -             stoptable = StopFilter.makeStopTable( stopwords );
  +             stoptable = StopFilter.makeStopSet( stopwords );
        }
   
        /**
         * Builds an analyzer with the given stop words.
  +   *
  +   * @deprecated
         */
        public CzechAnalyzer( Hashtable stopwords ) {
  +             stoptable = new HashSet(stopwords.keySet());
  +     }
  +
  +     public CzechAnalyzer( HashSet stopwords ) {
                stoptable = stopwords;
        }
   
  @@ -129,7 +136,7 @@
         * Builds an analyzer with the given stop words.
         */
        public CzechAnalyzer( File stopwords ) {
  -             stoptable = WordlistLoader.getWordtable( stopwords );
  +             stoptable = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet());
        }
   
       /**
  @@ -139,12 +146,12 @@
        */
       public void loadStopWords( InputStream wordfile, String encoding ) {
           if ( wordfile == null ) {
  -            stoptable = new Hashtable();
  +            stoptable = new HashSet();
               return;
           }
           try {
               // clear any previous table (if present)
  -            stoptable = new Hashtable();
  +            stoptable = new HashSet();
   
               InputStreamReader isr;
               if (encoding == null)
  @@ -156,7 +163,7 @@
               LineNumberReader lnr = new LineNumberReader(isr);
               String word;
               while ( ( word = lnr.readLine() ) != null ) {
  -                stoptable.put(word, word);
  +                stoptable.add(word);
               }
   
           } catch ( IOException e ) {
  
  
  
  1.4       +14 -10    
jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java
  
  Index: FrenchAnalyzer.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchAnalyzer.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- FrenchAnalyzer.java       23 Jan 2004 12:49:34 -0000      1.3
  +++ FrenchAnalyzer.java       11 Mar 2004 03:05:36 -0000      1.4
  @@ -63,6 +63,8 @@
   import java.io.File;
   import java.io.Reader;
   import java.util.Hashtable;
  +import java.util.HashSet;
  +
   import org.apache.lucene.analysis.de.WordlistLoader;
   
   /**
  @@ -108,57 +110,59 @@
       /**
         * Contains the stopwords used with the StopFilter.
         */
  -     private Hashtable stoptable = new Hashtable();
  +     private HashSet stoptable = new HashSet();
        /**
         * Contains words that should be indexed but not stemmed.
         */
  -     private Hashtable excltable = new Hashtable();
  +     private HashSet excltable = new HashSet();
   
        /**
         * Builds an analyzer.
         */
        public FrenchAnalyzer() {
  -             stoptable = StopFilter.makeStopTable( FRENCH_STOP_WORDS );
  +             stoptable = StopFilter.makeStopSet( FRENCH_STOP_WORDS );
        }
   
        /**
         * Builds an analyzer with the given stop words.
         */
        public FrenchAnalyzer( String[] stopwords ) {
  -             stoptable = StopFilter.makeStopTable( stopwords );
  +             stoptable = StopFilter.makeStopSet( stopwords );
        }
   
        /**
         * Builds an analyzer with the given stop words.
  +   *
  +   * @deprecated
         */
        public FrenchAnalyzer( Hashtable stopwords ) {
  -             stoptable = stopwords;
  +             stoptable = new HashSet(stopwords.keySet());
        }
   
        /**
         * Builds an analyzer with the given stop words.
         */
        public FrenchAnalyzer( File stopwords ) {
  -             stoptable = WordlistLoader.getWordtable( stopwords );
  +             stoptable = new HashSet(WordlistLoader.getWordtable( stopwords ).keySet());
        }
   
        /**
         * Builds an exclusionlist from an array of Strings.
         */
        public void setStemExclusionTable( String[] exclusionlist ) {
  -             excltable = StopFilter.makeStopTable( exclusionlist );
  +             excltable = StopFilter.makeStopSet( exclusionlist );
        }
        /**
         * Builds an exclusionlist from a Hashtable.
         */
        public void setStemExclusionTable( Hashtable exclusionlist ) {
  -             excltable = exclusionlist;
  +             excltable = new HashSet(exclusionlist.keySet());
        }
        /**
         * Builds an exclusionlist from the words contained in the given file.
         */
        public void setStemExclusionTable( File exclusionlist ) {
  -             excltable = WordlistLoader.getWordtable( exclusionlist );
  +             excltable = new HashSet(WordlistLoader.getWordtable( exclusionlist ).keySet());
        }
   
        /**
  
  
  
  1.3       +10 -2     
jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java
  
  Index: FrenchStemFilter.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/fr/FrenchStemFilter.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- FrenchStemFilter.java     22 Jan 2004 20:54:47 -0000      1.2
  +++ FrenchStemFilter.java     11 Mar 2004 03:05:36 -0000      1.3
  @@ -59,6 +59,7 @@
   import org.apache.lucene.analysis.TokenStream;
   import java.io.IOException;
   import java.util.Hashtable;
  +import java.util.HashSet;
   
   /**
    * A filter that stemms french words. It supports a table of words that should
  @@ -74,7 +75,7 @@
         */
        private Token token = null;
        private FrenchStemmer stemmer = null;
  -     private Hashtable exclusions = null;
  +     private HashSet exclusions = null;
   
        public FrenchStemFilter( TokenStream in ) {
       super(in);
  @@ -83,9 +84,16 @@
   
        /**
         * Builds a FrenchStemFilter that uses an exclusiontable.
  +   *
  +   * @deprecated
         */
        public FrenchStemFilter( TokenStream in, Hashtable exclusiontable ) {
                this( in );
  +             exclusions = new HashSet(exclusiontable.keySet());
  +     }
  +
  +     public FrenchStemFilter( TokenStream in, HashSet exclusiontable ) {
  +             this( in );
                exclusions = exclusiontable;
        }
   
  @@ -122,7 +130,7 @@
         * Set an alternative exclusion list for this filter.
         */
        public void setExclusionTable( Hashtable exclusiontable ) {
  -             exclusions = exclusiontable;
  +             exclusions = new HashSet(exclusiontable.keySet());
        }
   }
   
  
  
  
  1.2       +127 -138  
jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java
  
  Index: DutchAnalyzer.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchAnalyzer.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- DutchAnalyzer.java        9 Mar 2004 14:55:08 -0000       1.1
  +++ DutchAnalyzer.java        11 Mar 2004 03:05:36 -0000      1.2
  @@ -21,148 +21,137 @@
   import org.apache.lucene.analysis.TokenStream;
   import org.apache.lucene.analysis.standard.StandardFilter;
   import org.apache.lucene.analysis.standard.StandardTokenizer;
  -import org.apache.lucene.analysis.Token;
  +
   import java.io.File;
  -import java.io.*;
   import java.io.Reader;
  -import java.util.Hashtable;
  +import java.util.HashMap;
  +import java.util.HashSet;
   
   /**
  - *
    * @author Edwin de Jonge
  - *
  - * Analyzer for Dutch language. Supports an external list of stopwords (words that
  - * will not be indexed at all), an external list of exclusions (word that will
  - * not be stemmed, but indexed) and an external list of word-stem pairs that overrule
  - * the algorithm (dictionary stemming).
  - * A default set of stopwords is used unless an alternative list is specified, the
  - * exclusion list is empty by default.
  - * As start for the Analyzer the German Analyzer was used. The stemming algorithm
  - * implemented can be found at @link
  + *         <p/>
  + *         Analyzer for Dutch language. Supports an external list of stopwords (words that
  + *         will not be indexed at all), an external list of exclusions (word that will
  + *         not be stemmed, but indexed) and an external list of word-stem pairs that overrule
  + *         the algorithm (dictionary stemming).
  + *         A default set of stopwords is used unless an alternative list is specified, the
  + *         exclusion list is empty by default.
  + *         As start for the Analyzer the German Analyzer was used. The stemming algorithm
  + *         implemented can be found at @link
    */
  -public class DutchAnalyzer extends Analyzer
  -{
  -     /**
  -      * List of typical Dutch stopwords.
  -      */
  -     private String[] DUTCH_STOP_WORDS =
  -     {
  -        "de","en","van","ik","te","dat","die","in","een",
  -        "hij","het","niet","zijn","is","was","op","aan","met","als","voor","had",
  -        "er","maar","om","hem","dan","zou","of","wat","mijn","men","dit","zo",
  -        "door","over","ze","zich","bij","ook","tot","je","mij","uit","der","daar",
  -        "haar","naar","heb","hoe","heeft","hebben","deze","u","want","nog","zal",
  -        "me","zij","nu","ge","geen","omdat","iets","worden","toch","al","waren",
  -        "veel","meer","doen","toen","moet","ben","zonder","kan","hun","dus",
  -        "alles","onder","ja","eens","hier","wie","werd","altijd","doch","wordt",
  -        "wezen","kunnen","ons","zelf","tegen","na","reeds","wil","kon","niets",
  -        "uw","iemand","geweest","andere"
  -     };
  -
  -
  -     /**
  -      * Contains the stopwords used with the StopFilter.
  -      */
  -     private Hashtable stoptable = new Hashtable();
  -
  -     /**
  -      * Contains words that should be indexed but not stemmed.
  -      */
  -     private Hashtable excltable = new Hashtable();
  -
  -     private Hashtable _stemdict = new Hashtable();
  -
  -
  -     /**
  -      * Builds an analyzer.
  -      */
  -     public DutchAnalyzer()
  -     {
  -             stoptable = StopFilter.makeStopTable( DUTCH_STOP_WORDS );
  -             _stemdict.put("fiets","fiets"); //otherwise fiet
  -             _stemdict.put("bromfiets","bromfiets"); //otherwise bromfiet
  -             _stemdict.put("ei","eier");
  -             _stemdict.put("kind","kinder");
  -     }
  -
  -     /**
  -      * Builds an analyzer with the given stop words.
  -      *
  -      * @param stopwords
  -      */
  -     public DutchAnalyzer( String[] stopwords )
  -     {
  -             stoptable = StopFilter.makeStopTable( stopwords );
  -     }
  -
  -     /**
  -      * Builds an analyzer with the given stop words.
  -      *
  -      * @param stopwords
  -      */
  -     public DutchAnalyzer( Hashtable stopwords )
  -     {
  -             stoptable = stopwords;
  -     }
  -
  -     /**
  -      * Builds an analyzer with the given stop words.
  -      *
  -      *  @param stopwords
  -      */
  -     public DutchAnalyzer( File stopwords )
  -     {
  -             stoptable = WordlistLoader.getWordtable( stopwords );
  -     }
  -
  -     /**
  -      * Builds an exclusionlist from an array of Strings.
  -      *
  -      * @param exclusionlist
  -      */
  -     public void setStemExclusionTable( String[] exclusionlist )
  -     {
  -             excltable = StopFilter.makeStopTable( exclusionlist );
  -     }
  -
  -     /**
  -      * Builds an exclusionlist from a Hashtable.
  -      */
  -     public void setStemExclusionTable( Hashtable exclusionlist )
  -     {
  -             excltable = exclusionlist;
  -     }
  -
  -     /**
  -      * Builds an exclusionlist from the words contained in the given file.
  -      */
  -     public void setStemExclusionTable(File exclusionlist)
  -     {
  -             excltable = WordlistLoader.getWordtable(exclusionlist);
  -     }
  -
  -     /**
  -      * Reads a stemdictionary file , that overrules the stemming algorithm
  -      * This is a textfile that contains per line
  -      * word\tstem
  -      * i.e: tabseperated
  -      */
  -     public void setStemDictionary(File stemdict)
  -     {
  -             _stemdict = WordlistLoader.getStemDict(stemdict);
  -     }
  -
  -     /**
  -      * Creates a TokenStream which tokenizes all the text in the provided TextReader.
  -      *
  -      * @return A TokenStream build from a StandardTokenizer filtered with StandardFilter, StopFilter, GermanStemFilter
  -      */
  -     public TokenStream tokenStream(String fieldName, Reader reader)
  -     {
  -             TokenStream result = new StandardTokenizer( reader );
  -             result = new StandardFilter( result );
  -             result = new StopFilter( result, stoptable );
  -             result = new DutchStemFilter( result, excltable, _stemdict);
  -             return result;
  -     }
  +public class DutchAnalyzer extends Analyzer {
  +  /**
  +   * List of typical Dutch stopwords.
  +   */
  +  private String[] DUTCH_STOP_WORDS =
  +      {
  +        "de", "en", "van", "ik", "te", "dat", "die", "in", "een",
  +        "hij", "het", "niet", "zijn", "is", "was", "op", "aan", "met", "als", "voor", "had",
  +        "er", "maar", "om", "hem", "dan", "zou", "of", "wat", "mijn", "men", "dit", "zo",
  +        "door", "over", "ze", "zich", "bij", "ook", "tot", "je", "mij", "uit", "der", "daar",
  +        "haar", "naar", "heb", "hoe", "heeft", "hebben", "deze", "u", "want", "nog", "zal",
  +        "me", "zij", "nu", "ge", "geen", "omdat", "iets", "worden", "toch", "al", "waren",
  +        "veel", "meer", "doen", "toen", "moet", "ben", "zonder", "kan", "hun", "dus",
  +        "alles", "onder", "ja", "eens", "hier", "wie", "werd", "altijd", "doch", "wordt",
  +        "wezen", "kunnen", "ons", "zelf", "tegen", "na", "reeds", "wil", "kon", "niets",
  +        "uw", "iemand", "geweest", "andere"
  +      };
  +
  +
  +  /**
  +   * Contains the stopwords used with the StopFilter.
  +   */
  +  private HashSet stoptable = new HashSet();
  +
  +  /**
  +   * Contains words that should be indexed but not stemmed.
  +   */
  +  private HashSet excltable = new HashSet();
  +
  +  private HashMap _stemdict = new HashMap();
  +
  +
  +  /**
  +   * Builds an analyzer.
  +   */
  +  public DutchAnalyzer() {
  +    stoptable = StopFilter.makeStopSet(DUTCH_STOP_WORDS);
  +    _stemdict.put("fiets", "fiets"); //otherwise fiet
  +    _stemdict.put("bromfiets", "bromfiets"); //otherwise bromfiet
  +    _stemdict.put("ei", "eier");
  +    _stemdict.put("kind", "kinder");
  +  }
  +
  +  /**
  +   * Builds an analyzer with the given stop words.
  +   *
  +   * @param stopwords
  +   */
  +  public DutchAnalyzer(String[] stopwords) {
  +    stoptable = StopFilter.makeStopSet(stopwords);
  +  }
  +
  +  /**
  +   * Builds an analyzer with the given stop words.
  +   *
  +   * @param stopwords
  +   */
  +  public DutchAnalyzer(HashSet stopwords) {
  +    stoptable = stopwords;
  +  }
  +
  +  /**
  +   * Builds an analyzer with the given stop words.
  +   *
  +   * @param stopwords
  +   */
  +  public DutchAnalyzer(File stopwords) {
  +    stoptable = new HashSet(WordlistLoader.getWordtable(stopwords).keySet());
  +  }
  +
  +  /**
  +   * Builds an exclusionlist from an array of Strings.
  +   *
  +   * @param exclusionlist
  +   */
  +  public void setStemExclusionTable(String[] exclusionlist) {
  +    excltable = StopFilter.makeStopSet(exclusionlist);
  +  }
  +
  +  /**
  +   * Builds an exclusionlist from a Hashtable.
  +   */
  +  public void setStemExclusionTable(HashSet exclusionlist) {
  +    excltable = exclusionlist;
  +  }
  +
  +  /**
  +   * Builds an exclusionlist from the words contained in the given file.
  +   */
  +  public void setStemExclusionTable(File exclusionlist) {
  +    excltable = new HashSet(WordlistLoader.getWordtable(exclusionlist).keySet());
  +  }
  +
  +  /**
  +   * Reads a stemdictionary file , that overrules the stemming algorithm
  +   * This is a textfile that contains per line
  +   * word\tstem
  +   * i.e: tabseperated
  +   */
  +  public void setStemDictionary(File stemdict) {
  +    _stemdict = WordlistLoader.getStemDict(stemdict);
  +  }
  +
  +  /**
  +   * Creates a TokenStream which tokenizes all the text in the provided TextReader.
  +   *
  +   * @return A TokenStream build from a StandardTokenizer filtered with StandardFilter, StopFilter, GermanStemFilter
  +   */
  +  public TokenStream tokenStream(String fieldName, Reader reader) {
  +    TokenStream result = new StandardTokenizer(reader);
  +    result = new StandardFilter(result);
  +    result = new StopFilter(result, stoptable);
  +    result = new DutchStemFilter(result, excltable, _stemdict);
  +    return result;
  +  }
   }
  
  
  
  1.2       +82 -96    
jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java
  
  Index: DutchStemFilter.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemFilter.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- DutchStemFilter.java      9 Mar 2004 14:55:08 -0000       1.1
  +++ DutchStemFilter.java      11 Mar 2004 03:05:36 -0000      1.2
  @@ -19,105 +19,91 @@
   import org.apache.lucene.analysis.Token;
   import org.apache.lucene.analysis.TokenFilter;
   import org.apache.lucene.analysis.TokenStream;
  +
   import java.io.IOException;
  -import java.util.Hashtable;
  +import java.util.HashMap;
  +import java.util.HashSet;
   
   /**
  - *
    * @author Edwin de Jonge
  - *
  - * A filter that stems Dutch words. It supports a table of words that should
  - * not be stemmed at all. The stemmer used can be changed at runtime after the
  - * filter object is created (as long as it is a DutchStemmer).
  + *         <p/>
  + *         A filter that stems Dutch words. It supports a table of words that should
  + *         not be stemmed at all. The stemmer used can be changed at runtime after the
  + *         filter object is created (as long as it is a DutchStemmer).
    */
  -public final class DutchStemFilter extends TokenFilter
  -{
  -     /**
  -      * The actual token in the input stream.
  -      */
  -     private Token token = null;
  -     private DutchStemmer stemmer = null;
  -     private Hashtable exclusions = null;
  -
  -     public DutchStemFilter( TokenStream _in )
  -     {
  -             super(_in);
  -             stemmer = new DutchStemmer();
  -     }
  -
  -     /**
  -      * Builds a DutchStemFilter that uses an exclusiontable.
  -      */
  -     public DutchStemFilter( TokenStream _in, Hashtable exclusiontable )
  -     {
  -             this(_in);
  -             exclusions = exclusiontable;
  -     }
  -
  -     /**
  -      * @param stemdictionary Dictionary of word stem pairs, that overrule the algorithm
  -      */
  -     public DutchStemFilter( TokenStream _in, Hashtable exclusiontable , Hashtable stemdictionary)
  -     {
  -             this(_in, exclusiontable);
  -             stemmer.setStemDictionary(stemdictionary);
  -     }
  -
  -     /**
  -      * @return Returns the next token in the stream, or null at EOS
  -      */
  -     public Token next() throws IOException
  -
  -     {
  -             if ( ( token = input.next() ) == null )
  -             {
  -                     return null;
  -             }
  -
  -             // Check the exclusiontable
  -             else if ( exclusions != null && exclusions.contains( token.termText() ) )
  -             {
  -                     return token;
  -             }
  -             else
  -             {
  -                     String s = stemmer.stem( token.termText() );
  -                     // If not stemmed, dont waste the time creating a new token
  -                     if ( !s.equals( token.termText() ) )
  -                     {
  -                             return new Token( s, token.startOffset(),
  -                                     token.endOffset(), token.type() );
  -                     }
  -                     return token;
  -             }
  -     }
  -
  -     /**
  -      * Set a alternative/custom DutchStemmer for this filter.
  -      */
  -     public void setStemmer( DutchStemmer stemmer )
  -     {
  -             if ( stemmer != null )
  -             {
  -                     this.stemmer = stemmer;
  -             }
  -     }
  -
  -     /**
  -      * Set an alternative exclusion list for this filter.
  -      */
  -     public void setExclusionTable( Hashtable exclusiontable )
  -     {
  -             exclusions = exclusiontable;
  -     }
  -
  -     /**
  -      * Set dictionary for stemming, this dictionary overrules the algorithm,
  -      * so you can correct for a particular unwanted word-stem pair.
  -      */
  -     public void setStemDictionary(Hashtable dict)
  -     {
  -             if (stemmer != null)
  -                     stemmer.setStemDictionary(dict);
  -     }
  +public final class DutchStemFilter extends TokenFilter {
  +  /**
  +   * The actual token in the input stream.
  +   */
  +  private Token token = null;
  +  private DutchStemmer stemmer = null;
  +  private HashSet exclusions = null;
  +
  +  public DutchStemFilter(TokenStream _in) {
  +    super(_in);
  +    stemmer = new DutchStemmer();
  +  }
  +
  +  /**
  +   * Builds a DutchStemFilter that uses an exclusiontable.
  +   */
  +  public DutchStemFilter(TokenStream _in, HashSet exclusiontable) {
  +    this(_in);
  +    exclusions = exclusiontable;
  +  }
  +
  +  /**
  +   * @param stemdictionary Dictionary of word stem pairs, that overrule the algorithm
  +   */
  +  public DutchStemFilter(TokenStream _in, HashSet exclusiontable, HashMap stemdictionary) {
  +    this(_in, exclusiontable);
  +    stemmer.setStemDictionary(stemdictionary);
  +  }
  +
  +  /**
  +   * @return Returns the next token in the stream, or null at EOS
  +   */
  +  public Token next() throws IOException {
  +    if ((token = input.next()) == null) {
  +      return null;
  +    }
  +
  +    // Check the exclusiontable
  +    else if (exclusions != null && exclusions.contains(token.termText())) {
  +      return token;
  +    } else {
  +      String s = stemmer.stem(token.termText());
  +      // If not stemmed, dont waste the time creating a new token
  +      if (!s.equals(token.termText())) {
  +        return new Token(s, token.startOffset(),
  +            token.endOffset(), token.type());
  +      }
  +      return token;
  +    }
  +  }
  +
  +  /**
  +   * Set a alternative/custom DutchStemmer for this filter.
  +   */
  +  public void setStemmer(DutchStemmer stemmer) {
  +    if (stemmer != null) {
  +      this.stemmer = stemmer;
  +    }
  +  }
  +
  +  /**
  +   * Set an alternative exclusion list for this filter.
  +   */
  +  public void setExclusionTable(HashSet exclusiontable) {
  +    exclusions = exclusiontable;
  +  }
  +
  +  /**
  +   * Set dictionary for stemming, this dictionary overrules the algorithm,
  +   * so you can correct for a particular unwanted word-stem pair.
  +   */
  +  public void setStemDictionary(HashMap dict) {
  +    if (stemmer != null)
  +      stemmer.setStemDictionary(dict);
  +  }
   }
  
  
  
  1.2       +379 -425  
jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java
  
  Index: DutchStemmer.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/DutchStemmer.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- DutchStemmer.java 9 Mar 2004 14:55:08 -0000       1.1
  +++ DutchStemmer.java 11 Mar 2004 03:05:36 -0000      1.2
  @@ -16,9 +16,8 @@
    * limitations under the License.
    */
   
  -import java.util.Hashtable;
  -import java.util.ArrayList;
  -import java.io.*;
  +import java.util.HashMap;
  +
   /*
    * @author Edwin de Jonge ([EMAIL PROTECTED])
    *
  @@ -26,427 +25,382 @@
    * the <see cref="http://snowball.tartarus.org/dutch/stemmer.html">dutch stemming</see>
    *  algorithm in snowball. Snowball is a project of Martin Porter (does Porter Stemmer ring a bell?):
    */
  -public class DutchStemmer
  -{
  -     /**
  -      * Buffer for the terms while stemming them.
  -      */
  -     private StringBuffer sb = new StringBuffer();
  -     private boolean _removedE;
  -     private Hashtable _stemDict;
  -
  -     private int _R1;
  -     private int _R2;
  -
  -     //TODO convert to internal
  -     /*
  -      * Stemms the given term to an unique <tt>discriminator</tt>.
  -      *
  -      * @param term The term that should be stemmed.
  -      * @return Discriminator for <tt>term</tt>
  -      */
  -     public String stem( String term )
  -     {
  -             term = term.toLowerCase();
  -             if ( !isStemmable( term ) )
  -                     return term;
  -             if (_stemDict != null && _stemDict.contains(term))
  -                     if (_stemDict.get(term) instanceof String)
  -                             return (String)_stemDict.get(term);
  -                     else return null;
  -
  -             // Reset the StringBuffer.
  -             sb.delete(0, sb.length());
  -             sb.insert(0, term);
  -             // Stemming starts here...
  -             substitute(sb);
  -             storeYandI(sb);
  -             _R1 = getRIndex(sb, 0);
  -             _R1 = Math.max(3,_R1);
  -             step1(sb);
  -             step2(sb);
  -             _R2 = getRIndex(sb, _R1);
  -             step3a(sb);
  -             step3b(sb);
  -             step4(sb);
  -             reStoreYandI(sb);
  -             return sb.toString();
  -     }
  -
  -     private boolean enEnding(StringBuffer sb)
  -     {
  -             String[] enend = new String[]{"ene","en"};
  -             for (int i = 0; i < enend.length; i++)
  -             {
  -                     String end = enend[i];
  -                     String s = sb.toString();
  -                     int index = s.length() - end.length();
  -                     if ( s.endsWith(end) &&
  -                               index >= _R1 &&
  -                               isValidEnEnding(sb,index-1)
  -                             )
  -                     {
  -                             sb.delete(index, index + end.length());
  -                             unDouble(sb,index);
  -                             return true;
  -                     }
  -             }
  -             return false;
  -     }
  -
  -
  -     private void step1(StringBuffer sb)
  -     {
  -             if (_R1 >= sb.length())
  -                     return;
  -
  -             String s = sb.toString();
  -             int lengthR1 = sb.length() - _R1;
  -             int index;
  -
  -             if (s.endsWith("heden"))
  -             {
  -                     sb.replace(_R1, lengthR1 + _R1, sb.substring(_R1, lengthR1 + _R1).replaceAll("heden", "heid"));
  -                     return;
  -             }
  -
  -             if (enEnding(sb))
  -                     return;
  -
  -             if (s.endsWith("se")              &&
  -                      (index = s.length() - 2) >= _R1  &&
  -                      isValidSEnding(sb, index -1)
  -                     )
  -             {
  -                     sb.delete(index, index + 2);
  -                     return;
  -             }
  -             if (s.endsWith("s") &&
  -                     (index = s.length() - 1) >= _R1  &&
  -                     isValidSEnding(sb, index - 1))
  -             {
  -                     sb.delete(index, index + 1);
  -             }
  -     }
  -
  -     /**
  -      * Delete suffix e if in R1 and
  -      * preceded by a non-vowel, and then undouble the ending
  -      *
  -      * @param sb String being stemmed
  -      */
  -     private void step2(StringBuffer sb)
  -     {
  -             _removedE = false;
  -             if (_R1 >= sb.length())
  -                     return;
  -             String s = sb.toString();
  -             int index = s.length() - 1;
  -             if ( index >= _R1   &&
  -                      s.endsWith("e") &&
  -                      !isVowel(sb.charAt(index-1)))
  -             {
  -                     sb.delete(index, index + 1);
  -                     unDouble(sb);
  -                     _removedE = true;
  -             }
  -     }
  -
  -     /**
  -      * Delete "heid"
  -      *
  -      * @param sb String being stemmed
  -      */
  -     private void step3a(StringBuffer sb)
  -     {
  -             if (_R2 >= sb.length())
  -                     return;
  -             String s = sb.toString();
  -             int index = s.length() - 4;
  -             if (s.endsWith("heid")&& index >= _R2 && sb.charAt(index - 1) != 'c')
  -             {
  -                     sb.delete(index, index + 4); //remove heid
  -                     enEnding(sb);
  -             }
  -     }
  -
  -     /**
  -      *  <p>A d-suffix, or derivational suffix, enables a new word,
  -      *  often with a different grammatical category, or with a different
  -      *  sense, to be built from another word. Whether a d-suffix can be
  -      *  attached is discovered not from the rules of grammar, but by
  -      *  referring to a dictionary. So in English, ness can be added to
  -      *  certain adjectives to form corresponding nouns (littleness,
  -      *  kindness, foolishness ...) but not to all adjectives
  -      *  (not for example, to big, cruel, wise ...) d-suffixes can be
  -      *  used to change meaning, often in rather exotic ways.</p>
  -      *  Remove "ing", "end", "ig", "lijk", "baar" and "bar"
  -      *
  -      * @param sb String being stemmed
  -      */
  -     private void step3b(StringBuffer sb)
  -     {
  -             if (_R2 >= sb.length())
  -                     return;
  -             String s = sb.toString();
  -             int index;
  -
  -             if ((s.endsWith("end") || s.endsWith("ing")) &&
  -                     (index = s.length() - 3) >= _R2)
  -             {
  -                     sb.delete(index, index + 3);
  -                     if (sb.charAt(index - 2) == 'i' &&
  -                             sb.charAt(index - 1) == 'g')
  -                     {
  -                             if (sb.charAt(index - 3) != 'e' & index-2 >= _R2)
  -                             {
  -                                     index -= 2;
  -                                     sb.delete(index, index + 2);
  -                             }
  -                     }
  -                     else
  -                     {
  -                             unDouble(sb,index);
  -                     }
  -                     return;
  -             }
  -             if ( s.endsWith("ig")    &&
  -                       (index = s.length() - 2) >= _R2
  -                     )
  -             {
  -                     if (sb.charAt(index - 1) != 'e')
  -                             sb.delete(index, index + 2);
  -                     return;
  -             }
  -             if (s.endsWith("lijk") &&
  -                      (index = s.length() - 4) >= _R2
  -                     )
  -             {
  -                     sb.delete(index, index + 4);
  -                     step2(sb);
  -                     return;
  -             }
  -             if (s.endsWith("baar") &&
  -                     (index = s.length() - 4) >= _R2
  -                     )
  -             {
  -                     sb.delete(index, index + 4);
  -                     return;
  -             }
  -             if (s.endsWith("bar")  &&
  -                      (index = s.length() - 3) >= _R2
  -                     )
  -             {
  -                     if (_removedE)
  -                             sb.delete(index, index + 3);
  -                     return;
  -             }
  -     }
  -
  -     /**
  -      * undouble vowel
  -      * If the words ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u, remove one of the vowels from V (for example, maan -> man, brood -> brod).
  -      *
  -      * @param sb String being stemmed
  -      */
  -     private void step4(StringBuffer sb)
  -     {
  -             if (sb.length() < 4)
  -                     return;
  -             String end = sb.substring(sb.length() - 4, sb.length());
  -             char c = end.charAt(0);
  -             char v1 = end.charAt(1);
  -             char v2 = end.charAt(2);
  -             char d = end.charAt(3);
  -             if (v1 == v2    &&
  -                      d != 'I'    &&
  -                      v1 != 'i'    &&
  -                      isVowel(v1) &&
  -                     !isVowel(d)  &&
  -                     !isVowel(c))
  -             {
  -                     sb.delete(sb.length() - 2, sb.length() - 1);
  -             }
  -     }
  -
  -     /**
  -      * Checks if a term could be stemmed.
  -      *
  -      * @return true if, and only if, the given term consists in letters.
  -      */
  -     private boolean isStemmable( String term )
  -     {
  -             for ( int c = 0; c < term.length(); c++ )
  -             {
  -                     if ( !Character.isLetter(term.charAt(c))) return false;
  -             }
  -             return true;
  -     }
  -
  -     /**
  -      * Substitute ä, ë, ï, ö, ü, á , é, í, ó, ú
  -      */
  -     private void substitute( StringBuffer buffer )
  -     {
  -             for ( int i = 0; i < buffer.length(); i++ )
  -             {
  -                     switch (buffer.charAt(i))
  -                     {
  -                             case 'ä':
  -                             case 'á':
  -                             {
  -                                     buffer.setCharAt(i, 'a');
  -                                     break;
  -                             }
  -                             case 'ë':
  -                             case 'é':
  -                             {
  -                                     buffer.setCharAt(i, 'e');
  -                                     break;
  -                             }
  -                             case 'ü':
  -                             case 'ú':
  -                             {
  -                                     buffer.setCharAt(i, 'u');
  -                                     break;
  -                             }
  -                             case 'ï':
  -                             case 'i':
  -                             {
  -                                     buffer.setCharAt(i, 'i');
  -                                     break;
  -                             }
  -                             case 'ö':
  -                             case 'ó':
  -                             {
  -                                     buffer.setCharAt(i, 'o');
  -                                     break;
  -                             }
  -                     }
  -             }
  -     }
  -
  -     private boolean isValidSEnding(StringBuffer sb)
  -     {
  -             return  isValidSEnding(sb,sb.length() - 1);
  -     }
  -
  -     private boolean isValidSEnding(StringBuffer sb, int index)
  -     {
  -             char c = sb.charAt(index);
  -             if (isVowel(c) || c == 'j')
  -                     return false;
  -             return true;
  -     }
  -
  -     private boolean isValidEnEnding(StringBuffer sb)
  -     {
  -             return isValidEnEnding(sb,sb.length() - 1);
  -     }
  -
  -     private boolean isValidEnEnding(StringBuffer sb, int index)
  -     {
  -             char c = sb.charAt(index);
  -             if (isVowel(c))
  -                     return false;
  -             if (c < 3)
  -                     return false;
  -             // ends with "gem"?
  -             if (c == 'm' && sb.charAt(index - 2) == 'g' && sb.charAt(index-1) == 'e')
  -                     return false;
  -             return true;
  -     }
  -
  -     private void unDouble(StringBuffer sb)
  -     {
  -             unDouble(sb, sb.length());
  -     }
  -
  -     private void unDouble(StringBuffer sb, int endIndex)
  -     {
  -             String s = sb.substring(0, endIndex);
  -             if (s.endsWith("kk") || s.endsWith("tt") || s.endsWith("dd") || s.endsWith("nn")|| s.endsWith("mm")|| s.endsWith("ff"))
  -             {
  -                     sb.delete(endIndex-1, endIndex);
  -             }
  -     }
  -
  -     private int getRIndex(StringBuffer sb, int start)
  -     {
  -             if (start == 0)
  -                     start = 1;
  -             int i = start;
  -             for (; i < sb.length(); i++)
  -             {
  -                     //first non-vowel preceded by a vowel
  -                     if (!isVowel(sb.charAt(i)) && isVowel(sb.charAt(i-1)))
  -                     {
  -                             return i + 1;
  -                     }
  -             }
  -             return i + 1;
  -     }
  -
  -     private void storeYandI(StringBuffer sb)
  -     {
  -             if (sb.charAt(0) == 'y')
  -                     sb.setCharAt(0, 'Y');
  -
  -             char c;
  -             int last = sb.length() - 1;
  -
  -             for (int i = 1; i < last; i++)
  -             {
  -                     switch (sb.charAt(i))
  -                     {
  -                             case 'i':
  -                             {
  -                                     if (isVowel(sb.charAt(i-1)) &&
  -                                             isVowel(sb.charAt(i+1))
  -                                             )
  -                                             sb.setCharAt(i, 'I');
  -                                     break;
  -                             }
  -                             case 'y':
  -                             {
  -                                     if (isVowel(sb.charAt(i-1)))
  -                                             sb.setCharAt(i, 'Y');
  -                                     break;
  -                             }
  -                     }
  -             }
  -             if (last > 0 && sb.charAt(last)=='y' && isVowel(sb.charAt(last-1)))
  -                     sb.setCharAt(last, 'Y');
  -     }
  -
  -     private void reStoreYandI(StringBuffer sb)
  -     {
  -             String tmp = sb.toString();
  -             sb.delete(0, sb.length());
  -             sb.insert(0, tmp.replaceAll("I","i").replaceAll("Y","y"));
  -     }
  -
  -     private boolean isVowel(char c)
  -     {
  -             switch (c)
  -             {
  -                     case 'e':
  -                     case 'a':
  -                     case 'o':
  -                     case 'i':
  -                     case 'u':
  -                     case 'y':
  -                     case 'è':
  -                     {
  -                             return true;
  -                     }
  -             }
  -             return false;
  -     }
  -
  -     void setStemDictionary(Hashtable dict)
  -     {
  -             _stemDict = dict;
  -     }
  +
  +public class DutchStemmer {
  +  /**
  +   * Buffer for the terms while stemming them.
  +   */
  +  private StringBuffer sb = new StringBuffer();
  +  private boolean _removedE;
  +  private HashMap _stemDict;
  +
  +  private int _R1;
  +  private int _R2;
  +
  +  //TODO convert to internal
  +  /*
  +   * Stemms the given term to an unique <tt>discriminator</tt>.
  +   *
  +   * @param term The term that should be stemmed.
  +   * @return Discriminator for <tt>term</tt>
  +   */
  +  public String stem(String term) {
  +    term = term.toLowerCase();
  +    if (!isStemmable(term))
  +      return term;
  +    if (_stemDict != null && _stemDict.containsKey(term))
  +      if (_stemDict.get(term) instanceof String)
  +        return (String) _stemDict.get(term);
  +      else
  +        return null;
  +
  +    // Reset the StringBuffer.
  +    sb.delete(0, sb.length());
  +    sb.insert(0, term);
  +    // Stemming starts here...
  +    substitute(sb);
  +    storeYandI(sb);
  +    _R1 = getRIndex(sb, 0);
  +    _R1 = Math.max(3, _R1);
  +    step1(sb);
  +    step2(sb);
  +    _R2 = getRIndex(sb, _R1);
  +    step3a(sb);
  +    step3b(sb);
  +    step4(sb);
  +    reStoreYandI(sb);
  +    return sb.toString();
  +  }
  +
  +  private boolean enEnding(StringBuffer sb) {
  +    String[] enend = new String[]{"ene", "en"};
  +    for (int i = 0; i < enend.length; i++) {
  +      String end = enend[i];
  +      String s = sb.toString();
  +      int index = s.length() - end.length();
  +      if (s.endsWith(end) &&
  +          index >= _R1 &&
  +          isValidEnEnding(sb, index - 1)
  +      ) {
  +        sb.delete(index, index + end.length());
  +        unDouble(sb, index);
  +        return true;
  +      }
  +    }
  +    return false;
  +  }
  +
  +
  +  private void step1(StringBuffer sb) {
  +    if (_R1 >= sb.length())
  +      return;
  +
  +    String s = sb.toString();
  +    int lengthR1 = sb.length() - _R1;
  +    int index;
  +
  +    if (s.endsWith("heden")) {
  +      sb.replace(_R1, lengthR1 + _R1, sb.substring(_R1, lengthR1 + _R1).replaceAll("heden", "heid"));
  +      return;
  +    }
  +
  +    if (enEnding(sb))
  +      return;
  +
  +    if (s.endsWith("se") &&
  +        (index = s.length() - 2) >= _R1 &&
  +        isValidSEnding(sb, index - 1)
  +    ) {
  +      sb.delete(index, index + 2);
  +      return;
  +    }
  +    if (s.endsWith("s") &&
  +        (index = s.length() - 1) >= _R1 &&
  +        isValidSEnding(sb, index - 1)) {
  +      sb.delete(index, index + 1);
  +    }
  +  }
  +
  +  /**
  +   * Delete suffix e if in R1 and
  +   * preceded by a non-vowel, and then undouble the ending
  +   *
  +   * @param sb String being stemmed
  +   */
  +  private void step2(StringBuffer sb) {
  +    _removedE = false;
  +    if (_R1 >= sb.length())
  +      return;
  +    String s = sb.toString();
  +    int index = s.length() - 1;
  +    if (index >= _R1 &&
  +        s.endsWith("e") &&
  +        !isVowel(sb.charAt(index - 1))) {
  +      sb.delete(index, index + 1);
  +      unDouble(sb);
  +      _removedE = true;
  +    }
  +  }
  +
  +  /**
  +   * Delete "heid"
  +   *
  +   * @param sb String being stemmed
  +   */
  +  private void step3a(StringBuffer sb) {
  +    if (_R2 >= sb.length())
  +      return;
  +    String s = sb.toString();
  +    int index = s.length() - 4;
  +    if (s.endsWith("heid") && index >= _R2 && sb.charAt(index - 1) != 'c') {
  +      sb.delete(index, index + 4); //remove heid
  +      enEnding(sb);
  +    }
  +  }
  +
  +  /**
  +   * <p>A d-suffix, or derivational suffix, enables a new word,
  +   * often with a different grammatical category, or with a different
  +   * sense, to be built from another word. Whether a d-suffix can be
  +   * attached is discovered not from the rules of grammar, but by
  +   * referring to a dictionary. So in English, ness can be added to
  +   * certain adjectives to form corresponding nouns (littleness,
  +   * kindness, foolishness ...) but not to all adjectives
  +   * (not for example, to big, cruel, wise ...) d-suffixes can be
  +   * used to change meaning, often in rather exotic ways.</p>
  +   * Remove "ing", "end", "ig", "lijk", "baar" and "bar"
  +   *
  +   * @param sb String being stemmed
  +   */
  +  private void step3b(StringBuffer sb) {
  +    if (_R2 >= sb.length())
  +      return;
  +    String s = sb.toString();
  +    int index = 0;
  +
  +    if ((s.endsWith("end") || s.endsWith("ing")) &&
  +        (index = s.length() - 3) >= _R2) {
  +      sb.delete(index, index + 3);
  +      if (sb.charAt(index - 2) == 'i' &&
  +          sb.charAt(index - 1) == 'g') {
  +        if (sb.charAt(index - 3) != 'e' & index - 2 >= _R2) {
  +          index -= 2;
  +          sb.delete(index, index + 2);
  +        }
  +      } else {
  +        unDouble(sb, index);
  +      }
  +      return;
  +    }
  +    if (s.endsWith("ig") &&
  +        (index = s.length() - 2) >= _R2
  +    ) {
  +      if (sb.charAt(index - 1) != 'e')
  +        sb.delete(index, index + 2);
  +      return;
  +    }
  +    if (s.endsWith("lijk") &&
  +        (index = s.length() - 4) >= _R2
  +    ) {
  +      sb.delete(index, index + 4);
  +      step2(sb);
  +      return;
  +    }
  +    if (s.endsWith("baar") &&
  +        (index = s.length() - 4) >= _R2
  +    ) {
  +      sb.delete(index, index + 4);
  +      return;
  +    }
  +    if (s.endsWith("bar") &&
  +        (index = s.length() - 3) >= _R2
  +    ) {
  +      if (_removedE)
  +        sb.delete(index, index + 3);
  +      return;
  +    }
  +  }
  +
  +  /**
  +   * undouble vowel
  +   * If the words ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u, remove one of the vowels from V (for example, maan -> man, brood -> brod).
  +   *
  +   * @param sb String being stemmed
  +   */
  +  private void step4(StringBuffer sb) {
  +    if (sb.length() < 4)
  +      return;
  +    String end = sb.substring(sb.length() - 4, sb.length());
  +    char c = end.charAt(0);
  +    char v1 = end.charAt(1);
  +    char v2 = end.charAt(2);
  +    char d = end.charAt(3);
  +    if (v1 == v2 &&
  +        d != 'I' &&
  +        v1 != 'i' &&
  +        isVowel(v1) &&
  +        !isVowel(d) &&
  +        !isVowel(c)) {
  +      sb.delete(sb.length() - 2, sb.length() - 1);
  +    }
  +  }
  +
  +  /**
  +   * Checks if a term could be stemmed.
  +   *
  +   * @return true if, and only if, the given term consists in letters.
  +   */
  +  private boolean isStemmable(String term) {
  +    for (int c = 0; c < term.length(); c++) {
  +      if (!Character.isLetter(term.charAt(c))) return false;
  +    }
  +    return true;
  +  }
  +
  +  /**
  +   * Substitute ä, ë, ï, ö, ü, á , é, í, ó, ú
  +   */
  +  private void substitute(StringBuffer buffer) {
  +    for (int i = 0; i < buffer.length(); i++) {
  +      switch (buffer.charAt(i)) {
  +        case 'ä':
  +        case 'á':
  +          {
  +            buffer.setCharAt(i, 'a');
  +            break;
  +          }
  +        case 'ë':
  +        case 'é':
  +          {
  +            buffer.setCharAt(i, 'e');
  +            break;
  +          }
  +        case 'ü':
  +        case 'ú':
  +          {
  +            buffer.setCharAt(i, 'u');
  +            break;
  +          }
  +        case 'ï':
  +        case 'i':
  +          {
  +            buffer.setCharAt(i, 'i');
  +            break;
  +          }
  +        case 'ö':
  +        case 'ó':
  +          {
  +            buffer.setCharAt(i, 'o');
  +            break;
  +          }
  +      }
  +    }
  +  }
  +
  +  private boolean isValidSEnding(StringBuffer sb) {
  +    return isValidSEnding(sb, sb.length() - 1);
  +  }
  +
  +  private boolean isValidSEnding(StringBuffer sb, int index) {
  +    char c = sb.charAt(index);
  +    if (isVowel(c) || c == 'j')
  +      return false;
  +    return true;
  +  }
  +
  +  private boolean isValidEnEnding(StringBuffer sb) {
  +    return isValidEnEnding(sb, sb.length() - 1);
  +  }
  +
  +  private boolean isValidEnEnding(StringBuffer sb, int index) {
  +    char c = sb.charAt(index);
  +    if (isVowel(c))
  +      return false;
  +    if (c < 3)
  +      return false;
  +    // ends with "gem"?
  +    if (c == 'm' && sb.charAt(index - 2) == 'g' && sb.charAt(index - 1) == 'e')
  +      return false;
  +    return true;
  +  }
  +
  +  private void unDouble(StringBuffer sb) {
  +    unDouble(sb, sb.length());
  +  }
  +
  +  private void unDouble(StringBuffer sb, int endIndex) {
  +    String s = sb.substring(0, endIndex);
  +    if (s.endsWith("kk") || s.endsWith("tt") || s.endsWith("dd") || s.endsWith("nn") || s.endsWith("mm") || s.endsWith("ff")) {
  +      sb.delete(endIndex - 1, endIndex);
  +    }
  +  }
  +
  +  private int getRIndex(StringBuffer sb, int start) {
  +    if (start == 0)
  +      start = 1;
  +    int i = start;
  +    for (; i < sb.length(); i++) {
  +      //first non-vowel preceded by a vowel
  +      if (!isVowel(sb.charAt(i)) && isVowel(sb.charAt(i - 1))) {
  +        return i + 1;
  +      }
  +    }
  +    return i + 1;
  +  }
  +
  +  private void storeYandI(StringBuffer sb) {
  +    if (sb.charAt(0) == 'y')
  +      sb.setCharAt(0, 'Y');
  +
  +    char c;
  +    int last = sb.length() - 1;
  +
  +    for (int i = 1; i < last; i++) {
  +      switch (sb.charAt(i)) {
  +        case 'i':
  +          {
  +            if (isVowel(sb.charAt(i - 1)) &&
  +                isVowel(sb.charAt(i + 1))
  +            )
  +              sb.setCharAt(i, 'I');
  +            break;
  +          }
  +        case 'y':
  +          {
  +            if (isVowel(sb.charAt(i - 1)))
  +              sb.setCharAt(i, 'Y');
  +            break;
  +          }
  +      }
  +    }
  +    if (last > 0 && sb.charAt(last) == 'y' && isVowel(sb.charAt(last - 1)))
  +      sb.setCharAt(last, 'Y');
  +  }
  +
  +  private void reStoreYandI(StringBuffer sb) {
  +    String tmp = sb.toString();
  +    sb.delete(0, sb.length());
  +    sb.insert(0, tmp.replaceAll("I", "i").replaceAll("Y", "y"));
  +  }
  +
  +  private boolean isVowel(char c) {
  +    switch (c) {
  +      case 'e':
  +      case 'a':
  +      case 'o':
  +      case 'i':
  +      case 'u':
  +      case 'y':
  +      case 'è':
  +        {
  +          return true;
  +        }
  +    }
  +    return false;
  +  }
  +
  +  void setStemDictionary(HashMap dict) {
  +    _stemDict = dict;
  +  }
   
   }
  
  
  
  1.2       +92 -111   
jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/WordlistLoader.java
  
  Index: WordlistLoader.java
  ===================================================================
  RCS file: 
/home/cvs/jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl/WordlistLoader.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- WordlistLoader.java       9 Mar 2004 14:55:08 -0000       1.1
  +++ WordlistLoader.java       11 Mar 2004 03:05:36 -0000      1.2
  @@ -20,123 +20,104 @@
   import java.io.FileReader;
   import java.io.IOException;
   import java.io.LineNumberReader;
  -import java.util.Hashtable;
  +import java.util.HashMap;
   
   /**
  - *
    * @author Gerhard Schwarz
  - *
  - * Loads a text file and adds every line as an entry to a Hashtable. Every line
  - * should contain only one word. If the file is not found or on any error, an
  - * empty table is returned.
  + *         <p/>
  + *         Loads a text file and adds every line as an entry to a Hashtable. Every line
  + *         should contain only one word. If the file is not found or on any error, an
  + *         empty table is returned.
    */
  -public class WordlistLoader
  -{
  -     /**
  -      * @param path Path to the wordlist
  -      * @param wordfile Name of the wordlist
  -      */
  -     public static Hashtable getWordtable( String path, String wordfile )
  -     {
  -             if ( path == null || wordfile == null )
  -             {
  -                     return new Hashtable();
  -             }
  -             return getWordtable(new File(path, wordfile));
  -     }
  +public class WordlistLoader {
  +  /**
  +   * @param path     Path to the wordlist
  +   * @param wordfile Name of the wordlist
  +   */
  +  public static HashMap getWordtable(String path, String wordfile) {
  +    if (path == null || wordfile == null) {
  +      return new HashMap();
  +    }
  +    return getWordtable(new File(path, wordfile));
  +  }
   
  -     /**
  -      * @param wordfile Complete path to the wordlist
  -      */
  -     public static Hashtable getWordtable( String wordfile )
  -     {
  -             if ( wordfile == null )
  -             {
  -                     return new Hashtable();
  -             }
  -             return getWordtable( new File( wordfile ) );
  -     }
  +  /**
  +   * @param wordfile Complete path to the wordlist
  +   */
  +  public static HashMap getWordtable(String wordfile) {
  +    if (wordfile == null) {
  +      return new HashMap();
  +    }
  +    return getWordtable(new File(wordfile));
  +  }
   
  -     /**
  -      * Reads a stemsdictionary. Each line contains:
  -     * word \t stem
  -      * i.e. tab seperated)
  -      *
  -      * @return Stem dictionary that overrules, the stemming algorithm
  -      */
  -     public static Hashtable getStemDict( File wordstemfile)
  -     {
  -             if ( wordstemfile == null )
  -             {
  -                     return new Hashtable();
  -             }
  -             Hashtable result = new Hashtable();
  -             try
  -             {
  -                     LineNumberReader lnr = new LineNumberReader(new FileReader(wordstemfile));
  -                     String line;
  -                     String[] wordstem;
  -                     while ((line = lnr.readLine()) != null)
  -                     {
  -                             wordstem = line.split("\t", 2);
  -                             result.put(wordstem[0], wordstem[1]);
  -                }
  -             }
  -             catch (IOException e)
  -             {}
  -             return result;
  -     }
  +  /**
  +   * Reads a stemsdictionary. Each line contains:
  +   * word \t stem
  +   * i.e. tab seperated)
  +   *
  +   * @return Stem dictionary that overrules, the stemming algorithm
  +   */
  +  public static HashMap getStemDict(File wordstemfile) {
  +    if (wordstemfile == null) {
  +      return new HashMap();
  +    }
  +    HashMap result = new HashMap();
  +    try {
  +      LineNumberReader lnr = new LineNumberReader(new FileReader(wordstemfile));
  +      String line;
  +      String[] wordstem;
  +      while ((line = lnr.readLine()) != null) {
  +        wordstem = line.split("\t", 2);
  +        result.put(wordstem[0], wordstem[1]);
  +      }
  +    } catch (IOException e) {
  +    }
  +    return result;
  +  }
   
  -     /**
  -      * @param wordfile File containing the wordlist
  -      */
  -     public static Hashtable getWordtable( File wordfile )
  -     {
  -             if ( wordfile == null )
  -             {
  -                     return new Hashtable();
  -             }
  -             Hashtable result = null;
  -             try
  -             {
  -                     LineNumberReader lnr = new LineNumberReader(new FileReader(wordfile));
  -                     String word = null;
  -                     String[] stopwords = new String[100];
  -                     int wordcount = 0;
  -                     while ( ( word = lnr.readLine() ) != null )
  -                     {
  -                             wordcount++;
  -                             if ( wordcount == stopwords.length )
  -                             {
  -                                     String[] tmp = new String[stopwords.length + 50];
  -                                     System.arraycopy( stopwords, 0, tmp, 0, wordcount );
  -                                     stopwords = tmp;
  -                             }
  -                             stopwords[wordcount-1] = word;
  -                     }
  -                     result = makeWordTable( stopwords, wordcount );
  -             }
  -                     // On error, use an empty table
  -             catch (IOException e)
  -             {
  -                     result = new Hashtable();
  -             }
  -             return result;
  -     }
  +  /**
  +   * @param wordfile File containing the wordlist
  +   */
  +  public static HashMap getWordtable(File wordfile) {
  +    if (wordfile == null) {
  +      return new HashMap();
  +    }
  +    HashMap result = null;
  +    try {
  +      LineNumberReader lnr = new LineNumberReader(new FileReader(wordfile));
  +      String word = null;
  +      String[] stopwords = new String[100];
  +      int wordcount = 0;
  +      while ((word = lnr.readLine()) != null) {
  +        wordcount++;
  +        if (wordcount == stopwords.length) {
  +          String[] tmp = new String[stopwords.length + 50];
  +          System.arraycopy(stopwords, 0, tmp, 0, wordcount);
  +          stopwords = tmp;
  +        }
  +        stopwords[wordcount - 1] = word;
  +      }
  +      result = makeWordTable(stopwords, wordcount);
  +    }
  +        // On error, use an empty table
  +    catch (IOException e) {
  +      result = new HashMap();
  +    }
  +    return result;
  +  }
   
  -     /**
  -      * Builds the wordlist table.
  -      *
  -      * @param words Word that where read
  -      * @param length Amount of words that where read into <tt>words</tt>
  -      */
  -     private static Hashtable makeWordTable( String[] words, int length )
  -     {
  -             Hashtable table = new Hashtable( length );
  -             for ( int i = 0; i < length; i++ )
  -             {
  -                     table.put(words[i], words[i]);
  -             }
  -             return table;
  -     }
  +  /**
  +   * Builds the wordlist table.
  +   *
  +   * @param words  Word that where read
  +   * @param length Amount of words that where read into <tt>words</tt>
  +   */
  +  private static HashMap makeWordTable(String[] words, int length) {
  +    HashMap table = new HashMap(length);
  +    for (int i = 0; i < length; i++) {
  +      table.put(words[i], words[i]);
  +    }
  +    return table;
  +  }
   }
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

cvs commit: jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/nl DutchAnalyzer.java DutchStemFilter.java DutchStemmer.java WordlistLoader.java

Reply via email to