de data.txt TestGermanStemFilter.java

dnaber Sun, 08 Aug 2004 03:55:31 -0700

dnaber      2004/08/08 03:55:27

  Added:       src/test/org/apache/lucene/analysis/de data.txt
                        TestGermanStemFilter.java
  Log:
  test case for the German stemmer which also shows its limitations
  
  Revision  Changes    Path
  1.1                  jakarta-lucene/src/test/org/apache/lucene/analysis/de/data.txt
  
  Index: data.txt
  ===================================================================
  # German special characters are replaced:
  häufig;haufig
  
  # here the stemmer works okay, it maps related words to the same stem:
  abschließen;abschliess
  abschließender;abschliess
  abschließendes;abschliess
  abschließenden;abschliess
  
  Tisch;tisch
  Tische;tisch
  Tischen;tisch
  
  Haus;hau
  Hauses;hau
  Häuser;hau
  Häusern;hau
  # here's a case where overstemming occurs, i.e. a word is 
  # mapped to the same stem as unrelated words:
  hauen;hau
  
  # here's a case where understemming occurs, i.e. two related words
  # are not mapped to the same stem. This is the case with basically
  # all irregular forms:
  Drama;drama
  Dramen;dram
  
  # TODO: known bug: "ß" at the end of a word isn't replaced:
  Ausmaß;ausmaß
  
  # fake words to test if suffixes are cut off:
  xxxxxe;xxxxx
  xxxxxs;xxxxx
  xxxxxn;xxxxx
  xxxxxt;xxxxx
  xxxxxem;xxxxx
  xxxxxer;xxxxx
  xxxxxnd;xxxxx
  # the suffixes are also removed when combined:
  xxxxxetende;xxxxx
  
  # words that are shorter than four characters are not changed:
  xxe;xxe
  # -em and -er are not removed from words shorter than five characters:
  xxem;xxem
  xxer;xxer
  # -nd is not removed from words shorter than six characters:
  xxxnd;xxxnd
  
  
  
  1.1                  
jakarta-lucene/src/test/org/apache/lucene/analysis/de/TestGermanStemFilter.java
  
  Index: TestGermanStemFilter.java
  ===================================================================
  package org.apache.lucene.analysis.de;
  
  /**
   * Copyright 2004 The Apache Software Foundation
   *
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   *
   *     http://www.apache.org/licenses/LICENSE-2.0
   *
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  
  import java.io.BufferedReader;
  import java.io.File;
  import java.io.FileInputStream;
  import java.io.IOException;
  import java.io.InputStreamReader;
  import java.io.StringReader;
  
  import junit.framework.TestCase;
  
  import org.apache.lucene.analysis.Token;
  import org.apache.lucene.analysis.standard.StandardTokenizer;
  
  /**
   * Test the German stemmer. The stemming algorithm is known to work less
   * than perfectly, as it doesn't use any word lists with exceptions. We
   * also check some of the cases where the algorithm is wrong.
   * 
   * @author Daniel Naber
   */
  public class TestGermanStemFilter extends TestCase {
  
    public void testStemming() {
      try {
        // read test cases from external file:
        File dataDir = new File(System.getProperty("dataDir"));
        File testFile = new File(dataDir, "org/apache/lucene/analysis/de/data.txt");
        FileInputStream fis = new FileInputStream(testFile);
        InputStreamReader isr = new InputStreamReader(fis, "iso-8859-1");
        BufferedReader breader = new BufferedReader(isr);
        while(true) {
          String line = breader.readLine();
          if (line == null)
            break;
          line = line.trim();
          if (line.startsWith("#") || line.equals(""))
            continue;    // ignore comments and empty lines
          String[] parts = line.split(";");
          //System.out.println(parts[0] + " -- " + parts[1]);
          check(parts[0], parts[1]);
        }
        breader.close();
        isr.close();
        fis.close();
      } catch (IOException e) {
         e.printStackTrace();
         fail();
      }
    }
  
    private void check(final String input, final String expected) throws IOException {
      StandardTokenizer tokenStream = new StandardTokenizer(new StringReader(input));
      GermanStemFilter filter = new GermanStemFilter(tokenStream);
      Token t = filter.next();
      if (t == null)
        fail();
      assertEquals(expected, t.termText());
      filter.close();
    }
  
  }


---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

cvs commit: jakarta-lucene/src/test/org/apache/lucene/analysis/de data.txt TestGermanStemFilter.java

Reply via email to