ehatcher    2003/11/27 18:03:14

  Modified:    src/java/org/apache/lucene/analysis StopFilter.java
               .        CHANGES.txt
  Added:       src/test/org/apache/lucene/analysis TestStopAnalyzer.java
  Log:
  Use position increments to account for removed stop words
  
  Revision  Changes    Path
  1.4       +22 -10    jakarta-lucene/src/java/org/apache/lucene/analysis/StopFilter.java
  
  Index: StopFilter.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/StopFilter.java,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- StopFilter.java   9 Dec 2002 19:02:20 -0000       1.3
  +++ StopFilter.java   28 Nov 2003 02:03:14 -0000      1.4
  @@ -57,29 +57,33 @@
   import java.io.IOException;
   import java.util.Hashtable;
   
  -/** Removes stop words from a token stream. */
  -
  +/**
  + * Removes stop words from a token stream.  Position increments
  + * on tokens emitted are adjusted to account for words
  + * removed.  Exact phrase queries will not match across holes left
  + * by stop word removal, but sloppy phrase queries may match.
  + */
   public final class StopFilter extends TokenFilter {
   
     private Hashtable table;
   
     /** Constructs a filter which removes words from the input
  -    TokenStream that are named in the array of words. */
  +   TokenStream that are named in the array of words. */
     public StopFilter(TokenStream in, String[] stopWords) {
       super(in);
       table = makeStopTable(stopWords);
     }
   
     /** Constructs a filter which removes words from the input
  -    TokenStream that are named in the Hashtable. */
  +   TokenStream that are named in the Hashtable. */
     public StopFilter(TokenStream in, Hashtable stopTable) {
       super(in);
       table = stopTable;
     }
  -  
  +
     /** Builds a Hashtable from an array of stop words, appropriate for passing
  -    into the StopFilter constructor.  This permits this table construction to
  -    be cached once when an Analyzer is constructed. */
  +   into the StopFilter constructor.  This permits this table construction to
  +   be cached once when an Analyzer is constructed. */
     public static final Hashtable makeStopTable(String[] stopWords) {
       Hashtable stopTable = new Hashtable(stopWords.length);
       for (int i = 0; i < stopWords.length; i++)
  @@ -89,10 +93,18 @@
   
     /** Returns the next input Token whose termText() is not a stop word. */
     public final Token next() throws IOException {
  +    int position = 1;
  +
       // return the first non-stop word found
  -    for (Token token = input.next(); token != null; token = input.next())
  -      if (table.get(token.termText) == null)
  -        return token;
  +    for (Token token = input.next(); token != null; token = input.next()) {
  +      if (table.get(token.termText) == null) {
  +        token.setPositionIncrement(position);
  +        position = 1;
  +        return token;
  +      }
  +
  +      position++;
  +    }
       // reached EOS -- return null
       return null;
     }
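
  For reference, a minimal sketch of the new behavior (a hypothetical
  demo class, not part of this commit; it assumes only the 1.3-era
  analysis API already used by the test below):

    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.StopAnalyzer;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;

    // Hypothetical demo class: prints each surviving term with its
    // position increment.  StopAnalyzer uses StopFilter internally.
    public class StopFilterDemo {
      public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StopAnalyzer();
        TokenStream stream = analyzer.tokenStream(
            "contents", new StringReader("the stop words are here"));
        for (Token t = stream.next(); t != null; t = stream.next()) {
          // Increments greater than 1 mark holes left by removed stop
          // words; this input prints stop:2, words:1, here:2.
          System.out.println(t.termText() + ":" + t.getPositionIncrement());
        }
      }
    }

  And since makeStopTable exists so the table can be built once, a
  custom analyzer might cache it like this (CachedStopAnalyzer and the
  choice of LowerCaseTokenizer are illustrative, not from this commit):

    import java.io.Reader;
    import java.util.Hashtable;
    import org.apache.lucene.analysis.*;

    // Hypothetical analyzer: the stop table is built once at class
    // load rather than on every tokenStream() call.
    public class CachedStopAnalyzer extends Analyzer {
      private static final Hashtable STOP_TABLE =
        StopFilter.makeStopTable(StopAnalyzer.ENGLISH_STOP_WORDS);

      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new StopFilter(new LowerCaseTokenizer(reader), STOP_TABLE);
      }
    }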
  
  
  
  1.1                  jakarta-lucene/src/test/org/apache/lucene/analysis/TestStopAnalyzer.java
  
  Index: TestStopAnalyzer.java
  ===================================================================
  package org.apache.lucene.analysis;
  
  import junit.framework.TestCase;
  import java.io.StringReader;
  import java.util.ArrayList;
  import org.apache.lucene.index.IndexWriter;
  import org.apache.lucene.index.Term;
  import org.apache.lucene.store.RAMDirectory;
  import org.apache.lucene.document.Document;
  import org.apache.lucene.document.Field;
  import org.apache.lucene.search.IndexSearcher;
  import org.apache.lucene.search.PhraseQuery;
  import org.apache.lucene.search.Hits;
  
  public class TestStopAnalyzer extends TestCase {
    private StopAnalyzer stopAnalyzer = new StopAnalyzer();
  
    public Token[] tokensFromAnalyzer(Analyzer analyzer, String text)
                                                    throws Exception {
      TokenStream stream =
        analyzer.tokenStream("contents", new StringReader(text));
      ArrayList tokenList = new ArrayList();
      while (true) {
        Token token = stream.next();
        if (token == null) break;
  
        tokenList.add(token);
      }
  
      return (Token[]) tokenList.toArray(new Token[0]);
    }
  
  
    public void testNoHoles() throws Exception {
      Token[] tokens = tokensFromAnalyzer(stopAnalyzer,
                                          "non-stop words");
  
      assertEquals(3, tokens.length);
  
      // ensure all words are in successive positions
      assertEquals("non", 1, tokens[0].getPositionIncrement());
      assertEquals("stop", 1, tokens[1].getPositionIncrement());
      assertEquals("words", 1, tokens[2].getPositionIncrement());
    }
  
    public void testHoles() throws Exception {
      Token[] tokens = tokensFromAnalyzer(stopAnalyzer,
                                          "the stop words are here");
  
      assertEquals(3, tokens.length);
  
      // check for the holes noted by position gaps
      assertEquals("stop", 2, tokens[0].getPositionIncrement());
      assertEquals("words", 1, tokens[1].getPositionIncrement());
      assertEquals("here", 2, tokens[2].getPositionIncrement());
    }
  
    public void testPhraseQuery() throws Exception {
      RAMDirectory directory = new RAMDirectory();
      IndexWriter writer = new IndexWriter(directory, stopAnalyzer, true);
      Document doc = new Document();
      doc.add(Field.Text("field", "the stop words are here"));
      writer.addDocument(doc);
      writer.close();
  
      IndexSearcher searcher = new IndexSearcher(directory);
  
      // valid exact phrase query
      PhraseQuery query = new PhraseQuery();
      query.add(new Term("field","stop"));
      query.add(new Term("field","words"));
      Hits hits = searcher.search(query);
      assertEquals(1, hits.length());
  
      // incorrect attempt at exact phrase query over stop word hole
      query = new PhraseQuery();
      query.add(new Term("field", "words"));
      query.add(new Term("field", "here"));
      hits = searcher.search(query);
      assertEquals(0, hits.length());
  
      // add some slop, and match over the hole
      query.setSlop(1);
      hits = searcher.search(query);
      assertEquals(1, hits.length());
  
      searcher.close();
    }
  }
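
  A note on the last assertion above: the slop needed to match across a
  hole is the number of stop words removed between the query terms.
  Here a single stop word ("are") was removed between "words" and
  "here", so setSlop(1) is enough to match.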
  
  
  
  1.60      +7 -1      jakarta-lucene/CHANGES.txt
  
  Index: CHANGES.txt
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/CHANGES.txt,v
  retrieving revision 1.59
  retrieving revision 1.60
  diff -u -r1.59 -r1.60
  --- CHANGES.txt       26 Nov 2003 11:10:54 -0000      1.59
  +++ CHANGES.txt       28 Nov 2003 02:03:14 -0000      1.60
  @@ -7,6 +7,12 @@
    1. Added catch of BooleanQuery$TooManyClauses in QueryParser to
       throw ParseException instead. (Erik Hatcher)
   
  + 2. Modified StopFilter to increment positions to account for
  +    stop words removed.  This prevents exact phrase queries from
  +    matching erroneously (use slop factor to account for missing
  +    stop words).  StopFilter is used by StopAnalyzer, StandardAnalyzer
  +    and some others.  (Erik Hatcher)
  +
   1.3 RC3
   
    1. Added minMergeDocs in IndexWriter.  This can be raised to speed
  
  
  
