ehatcher 2003/11/27 18:03:14 Modified: src/java/org/apache/lucene/analysis StopFilter.java . CHANGES.txt Added: src/test/org/apache/lucene/analysis TestStopAnalyzer.java Log: Use position increments to account for removed stop words Revision Changes Path 1.4 +22 -10 jakarta-lucene/src/java/org/apache/lucene/analysis/StopFilter.java Index: StopFilter.java =================================================================== RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/StopFilter.java,v retrieving revision 1.3 retrieving revision 1.4 diff -u -r1.3 -r1.4 --- StopFilter.java 9 Dec 2002 19:02:20 -0000 1.3 +++ StopFilter.java 28 Nov 2003 02:03:14 -0000 1.4 @@ -57,29 +57,33 @@ import java.io.IOException; import java.util.Hashtable; -/** Removes stop words from a token stream. */ - +/** + * Removes stop words from a token stream. Position increments + * on tokens emitted are adjusted to account for words + * removed. Exact phrase queries will not match across holes left + * by stop word removal, but sloppy phrase queries may match. + */ public final class StopFilter extends TokenFilter { private Hashtable table; /** Constructs a filter which removes words from the input - TokenStream that are named in the array of words. */ + TokenStream that are named in the array of words. */ public StopFilter(TokenStream in, String[] stopWords) { super(in); table = makeStopTable(stopWords); } /** Constructs a filter which removes words from the input - TokenStream that are named in the Hashtable. */ + TokenStream that are named in the Hashtable. */ public StopFilter(TokenStream in, Hashtable stopTable) { super(in); table = stopTable; } - + /** Builds a Hashtable from an array of stop words, appropriate for passing - into the StopFilter constructor. This permits this table construction to - be cached once when an Analyzer is constructed. */ + into the StopFilter constructor. This permits this table construction to + be cached once when an Analyzer is constructed. 
*/ public static final Hashtable makeStopTable(String[] stopWords) { Hashtable stopTable = new Hashtable(stopWords.length); for (int i = 0; i < stopWords.length; i++) @@ -89,10 +93,18 @@ /** Returns the next input Token whose termText() is not a stop word. */ public final Token next() throws IOException { + int position = 1; + // return the first non-stop word found - for (Token token = input.next(); token != null; token = input.next()) - if (table.get(token.termText) == null) - return token; + for (Token token = input.next(); token != null; token = input.next()) { + if (table.get(token.termText) == null) { + token.setPositionIncrement(position); + position = 1; + return token; + } + + position++; + } // reached EOS -- return null return null; } 1.1 jakarta-lucene/src/test/org/apache/lucene/analysis/TestStopAnalyzer.java Index: TestStopAnalyzer.java =================================================================== package org.apache.lucene.analysis; import junit.framework.TestCase; import java.io.StringReader; import java.util.ArrayList; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.Hits; public class TestStopAnalyzer extends TestCase { private StopAnalyzer stopAnalyzer = new StopAnalyzer(); public Token[] tokensFromAnalyzer(Analyzer analyzer, String text) throws Exception { TokenStream stream = analyzer.tokenStream("contents", new StringReader(text)); ArrayList tokenList = new ArrayList(); while (true) { Token token = stream.next(); if (token == null) break; tokenList.add(token); } return (Token[]) tokenList.toArray(new Token[0]); } public void testNoHoles() throws Exception { Token[] tokens = tokensFromAnalyzer(stopAnalyzer, "non-stop words"); assertEquals(3, tokens.length); // 
ensure all words are in successive positions assertEquals("non", 1, tokens[0].getPositionIncrement()); assertEquals("stop", 1, tokens[1].getPositionIncrement()); assertEquals("words", 1, tokens[2].getPositionIncrement()); } public void testHoles() throws Exception { Token[] tokens = tokensFromAnalyzer(stopAnalyzer, "the stop words are here"); assertEquals(3, tokens.length); // check for the holes noted by position gaps assertEquals("stop", 2, tokens[0].getPositionIncrement()); assertEquals("words", 1, tokens[1].getPositionIncrement()); assertEquals("here", 2, tokens[2].getPositionIncrement()); } public void testPhraseQuery() throws Exception { RAMDirectory directory = new RAMDirectory(); IndexWriter writer = new IndexWriter(directory, stopAnalyzer, true); Document doc = new Document(); doc.add(Field.Text("field", "the stop words are here")); writer.addDocument(doc); writer.close(); IndexSearcher searcher = new IndexSearcher(directory); // valid exact phrase query PhraseQuery query = new PhraseQuery(); query.add(new Term("field","stop")); query.add(new Term("field","words")); Hits hits = searcher.search(query); assertEquals(1, hits.length()); // incorrect attempt at exact phrase query over stop word hole query = new PhraseQuery(); query.add(new Term("field", "words")); query.add(new Term("field", "here")); hits = searcher.search(query); assertEquals(0, hits.length()); // add some slop, and match over the hole query.setSlop(1); hits = searcher.search(query); assertEquals(1, hits.length()); searcher.close(); } } 1.60 +7 -1 jakarta-lucene/CHANGES.txt Index: CHANGES.txt =================================================================== RCS file: /home/cvs/jakarta-lucene/CHANGES.txt,v retrieving revision 1.59 retrieving revision 1.60 diff -u -r1.59 -r1.60 --- CHANGES.txt 26 Nov 2003 11:10:54 -0000 1.59 +++ CHANGES.txt 28 Nov 2003 02:03:14 -0000 1.60 @@ -7,6 +7,12 @@ 1. Added catch of BooleanQuery$TooManyClauses in QueryParser to throw ParseException instead. 
(Erik Hatcher) + 2. Modified StopFilter to increment positions to account for + stop words removed. This prevents exact phrase queries from + matching erroneously (use slop factor to account for missing + stop words). StopFilter is used by StopAnalyzer, StandardAnalyzer + and some others. (Erik Hatcher) + 1.3 RC3 1. Added minMergeDocs in IndexWriter. This can be raised to speed
---------------------------------------------------------------------
To unsubscribe, e-mail: [EMAIL PROTECTED]
For additional commands, e-mail: [EMAIL PROTECTED]