analysis TestStandardAnalyzer.java

ehatcher Tue, 18 Jan 2005 05:27:38 -0800

ehatcher    2005/01/18 05:27:23

  Added:       src/test/org/apache/lucene/analysis
                        TestStandardAnalyzer.java
  Log:
  StandardAnalyzer test contributed by Chris Lamprecht
  
  Revision  Changes    Path
  1.1                  jakarta-lucene/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
  
  Index: TestStandardAnalyzer.java
  ===================================================================
  package org.apache.lucene.analysis;
  
  import junit.framework.TestCase;
  import org.apache.lucene.analysis.standard.StandardAnalyzer;
  
  import java.io.StringReader;
  
  /**
   * Copyright 2004 The Apache Software Foundation
   * <p/>
   * Licensed under the Apache License, Version 2.0 (the "License");
   * you may not use this file except in compliance with the License.
   * You may obtain a copy of the License at
   * <p/>
   * http://www.apache.org/licenses/LICENSE-2.0
   * <p/>
   * Unless required by applicable law or agreed to in writing, software
   * distributed under the License is distributed on an "AS IS" BASIS,
   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   * See the License for the specific language governing permissions and
   * limitations under the License.
   */
  
  public class TestStandardAnalyzer extends TestCase {
  
    public void assertAnalyzesTo(Analyzer a, String input, String[] expected) 
throws Exception {
      TokenStream ts = a.tokenStream("dummy", new StringReader(input));
      for (int i = 0; i < expected.length; i++) {
        Token t = ts.next();
        assertNotNull(t);
        assertEquals(expected[i], t.termText());
      }
      assertNull(ts.next());
      ts.close();
    }
  
  
    public void testStandard() throws Exception {
      Analyzer a = new StandardAnalyzer();
  
      // alphanumeric tokens
      assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
      assertAnalyzesTo(a, "2B", new String[]{"2b"});
  
      // underscores are delimiters, but not in email addresses (below)
      assertAnalyzesTo(a, "word_having_underscore", new String[]{"word", 
"having", "underscore"});
      assertAnalyzesTo(a, "word_with_underscore_and_stopwords", new 
String[]{"word", "underscore", "stopwords"});
  
      // other delimiters: "-", "/", ","
      assertAnalyzesTo(a, "some-dashed-phrase",   new String[]{"some", 
"dashed", "phrase" });
      assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", 
"cats"});
      assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"});
  
      // internal apostrophes: O'Reilly, you're, O'Reilly's
      // possessives are actually removed by StardardFilter, not the tokenizer
      assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"});
      assertAnalyzesTo(a, "you're", new String[]{"you're"});
      assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly"});
  
      // company names
      assertAnalyzesTo(a, "AT&T", new String[]{"at&t"});
      assertAnalyzesTo(a, "[EMAIL PROTECTED]", new String[]{"[EMAIL 
PROTECTED]"});
  
      // domain names
      assertAnalyzesTo(a, "www.nutch.org",   new String[]{"www.nutch.org" });
  
      // email addresses, possibly with underscores, periods, etc
      assertAnalyzesTo(a, "[EMAIL PROTECTED]", new String[]{"[EMAIL 
PROTECTED]"});
      assertAnalyzesTo(a, "[EMAIL PROTECTED]", new String[]{"[EMAIL 
PROTECTED]"});
      assertAnalyzesTo(a, "[EMAIL PROTECTED]", new String[]{"[EMAIL 
PROTECTED]"});
  
      // floating point, serial, model numbers, ip addresses, etc.
      // every other segment must have at least one digit
      assertAnalyzesTo(a, "21.35", new String[]{"21.35"});
      assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"});
      assertAnalyzesTo(a, "216.239.63.104",   new String[]{"216.239.63.104"});
      assertAnalyzesTo(a, "1-2-3",   new String[]{"1-2-3"});
      assertAnalyzesTo(a, "a1-b2-c3",   new String[]{"a1-b2-c3"});
      assertAnalyzesTo(a, "a1-b-c3",   new String[]{"a1-b-c3"});
  
      // numbers
      assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", 
"5000", "bones"});
  
      // various
      assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", 
"embedded", "developers", "wanted" });
      assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", 
"bar"});
      assertAnalyzesTo(a, "foo      bar .  FOO <> BAR", new String[]{"foo", 
"bar", "foo", "bar"});
      assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"});
  
      // acronyms have their dots stripped
      assertAnalyzesTo(a, "U.S.A.", new String[]{ "usa" });
  
      // It would be nice to change the grammar in StandardTokenizer.jj to make 
"C#" and "C++" end up as tokens.
      assertAnalyzesTo(a, "C++", new String[]{"c"});
      assertAnalyzesTo(a, "C#", new String[]{"c"});
  
    }
  }


---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]

cvs commit: jakarta-lucene/src/test/org/apache/lucene/analysis TestStandardAnalyzer.java

Reply via email to