Author: rmuir
Date: Wed Jun 2 18:35:10 2010
New Revision: 950711
URL: http://svn.apache.org/viewvc?rev=950711&view=rev
Log:
SOLR-1852: fix WordDelimiterFilterFactory bug where position increments were
not being applied properly to subwords
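For illustration only (not part of the commit itself), here is a minimal sketch of the behaviour this fix restores, mirroring the new test case added to TestWordDelimiterFilter below: a StopFilter with position increments enabled removes "the", and WordDelimiterFilter now carries that position gap onto the first subword it emits. The demo class name and the null protWords argument are assumptions made to keep the sketch self-contained; the flag values are copied from the test.

    package org.apache.solr.analysis;  // same package: WordDelimiterFilter is package-private

    import java.io.Reader;
    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.StopFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class WordDelimiterPosIncDemo {
      public static void main(String[] args) throws Exception {
        Analyzer a = new Analyzer() {
          public TokenStream tokenStream(String field, Reader reader) {
            // drop stopwords, but keep the position gap they leave behind
            StopFilter stop = new StopFilter(
                new WhitespaceTokenizer(reader), StandardAnalyzer.STOP_WORDS_SET);
            stop.setEnablePositionIncrements(true);
            // flag values mirror the new test; protWords is null here for brevity
            return new WordDelimiterFilter(stop, 1, 1, 0, 0, 1, 1, 0, 1, 1, null);
          }
        };

        TokenStream ts = a.tokenStream("field", new StringReader("the lucene.solr"));
        TermAttribute term = (TermAttribute) ts.addAttribute(TermAttribute.class);
        PositionIncrementAttribute posInc =
            (PositionIncrementAttribute) ts.addAttribute(PositionIncrementAttribute.class);
        while (ts.incrementToken()) {
          // with the fix: lucene(+2) solr(+1) lucenesolr(+0)
          System.out.println(term.term() + " posInc=" + posInc.getPositionIncrement());
        }
      }
    }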
Added:
lucene/solr/branches/branch-1.4/src/java/org/apache/solr/analysis/WordDelimiterIterator.java   (with props)
Modified:
lucene/solr/branches/branch-1.4/CHANGES.txt
lucene/solr/branches/branch-1.4/src/java/org/apache/solr/analysis/WordDelimiterFilter.java
lucene/solr/branches/branch-1.4/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
Modified: lucene/solr/branches/branch-1.4/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/CHANGES.txt?rev=950711&r1=950710&r2=950711&view=diff
==============================================================================
--- lucene/solr/branches/branch-1.4/CHANGES.txt (original)
+++ lucene/solr/branches/branch-1.4/CHANGES.txt Wed Jun 2 18:35:10 2010
@@ -121,6 +121,12 @@ Bug Fixes
* SOLR-1936: The JSON response format needed to escape unicode code point
U+2028 - 'LINE SEPARATOR' (Robert Hofstra, yonik)
+* SOLR-1852: Fix WordDelimiterFilterFactory bug where position increments
+ were not being applied properly to subwords. (Peter Wolanin via Robert Muir)
+
+* SOLR-1706: fixed WordDelimiterFilter for certain combinations of options
+ where it would output incorrect tokens. (Robert Muir, Chris Male)
+
================== Release 1.4.0 ==================
Release Date: See http://lucene.apache.org/solr for the official release date.
Modified: lucene/solr/branches/branch-1.4/src/java/org/apache/solr/analysis/WordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/src/java/org/apache/solr/analysis/WordDelimiterFilter.java?rev=950711&r1=950710&r2=950711&view=diff
==============================================================================
--- lucene/solr/branches/branch-1.4/src/java/org/apache/solr/analysis/WordDelimiterFilter.java (original)
+++ lucene/solr/branches/branch-1.4/src/java/org/apache/solr/analysis/WordDelimiterFilter.java Wed Jun 2 18:35:10 2010
@@ -19,12 +19,15 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.RamUsageEstimator;
import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
/**
* Splits words into subwords and performs optional transformations on subword groups.
@@ -50,118 +53,108 @@ import java.util.List;
* - "Super-Duper-XL500-42-AutoCoder!" -> 0:"Super", 1:"Duper", 2:"XL",
2:"SuperDuperXL", 3:"500" 4:"42", 5:"Auto", 6:"Coder", 6:"AutoCoder"
*
* One use for WordDelimiterFilter is to help match words with different
subword delimiters.
- * For example, if the source text contained "wi-fi" one may want "wifi"
"WiFi" "wi-fi" "wi+fi"
- * queries to all match.
- * One way of doing so is to specify combinations="1" in the analyzer
- * used for indexing, and combinations="0" (the default) in the analyzer
- * used for querying. Given that the current StandardTokenizer
- * immediately removes many intra-word delimiters, it is recommended that
- * this filter be used after a tokenizer that does not do this
- * (such as WhitespaceTokenizer).
+ * For example, if the source text contained "wi-fi" one may want "wifi"
"WiFi" "wi-fi" "wi+fi" queries to all match.
+ * One way of doing so is to specify combinations="1" in the analyzer used
for indexing, and combinations="0" (the default)
+ * in the analyzer used for querying. Given that the current
StandardTokenizer immediately removes many intra-word
+ * delimiters, it is recommended that this filter be used after a tokenizer
that does not do this (such as WhitespaceTokenizer).
*
* @version $Id$
*/
final class WordDelimiterFilter extends TokenFilter {
- private final byte[] charTypeTable;
-
- public static final int LOWER=0x01;
- public static final int UPPER=0x02;
- public static final int DIGIT=0x04;
- public static final int SUBWORD_DELIM=0x08;
+
+ public static final int LOWER = 0x01;
+ public static final int UPPER = 0x02;
+ public static final int DIGIT = 0x04;
+ public static final int SUBWORD_DELIM = 0x08;
// combinations: for testing, not for setting bits
- public static final int ALPHA=0x03;
- public static final int ALPHANUM=0x07;
-
- // TODO: should there be a WORD_DELIM category for
- // chars that only separate words (no catenation of subwords
- // will be done if separated by these chars?)
- // "," would be an obvious candidate...
-
- static byte[] defaultWordDelimTable;
- static {
- byte[] tab = new byte[256];
- for (int i=0; i<256; i++) {
- byte code = 0;
- if (Character.isLowerCase(i)) code |= LOWER;
- else if (Character.isUpperCase(i)) code |= UPPER;
- else if (Character.isDigit(i)) code |= DIGIT;
- if (code==0) code=SUBWORD_DELIM;
- tab[i]=code;
- }
- defaultWordDelimTable = tab;
- }
+ public static final int ALPHA = 0x03;
+ public static final int ALPHANUM = 0x07;
/**
- * If 1, causes parts of words to be generated:
+ * If true, causes parts of words to be generated:
* <p/>
* "PowerShot" => "Power" "Shot"
*/
- final int generateWordParts;
+ final boolean generateWordParts;
/**
- * If 1, causes number subwords to be generated:
+ * If true, causes number subwords to be generated:
* <p/>
* "500-42" => "500" "42"
*/
- final int generateNumberParts;
+ final boolean generateNumberParts;
/**
- * If 1, causes maximum runs of word parts to be catenated:
+ * If true, causes maximum runs of word parts to be catenated:
* <p/>
* "wi-fi" => "wifi"
*/
- final int catenateWords;
+ final boolean catenateWords;
/**
- * If 1, causes maximum runs of number parts to be catenated:
+ * If true, causes maximum runs of number parts to be catenated:
* <p/>
* "500-42" => "50042"
*/
- final int catenateNumbers;
+ final boolean catenateNumbers;
/**
- * If 1, causes all subword parts to be catenated:
+ * If true, causes all subword parts to be catenated:
* <p/>
* "wi-fi-4000" => "wifi4000"
*/
- final int catenateAll;
+ final boolean catenateAll;
/**
- * If 0, causes case changes to be ignored (subwords will only be generated
- * given SUBWORD_DELIM tokens). (Defaults to 1)
- */
- final int splitOnCaseChange;
-
- /**
- * If 1, original words are preserved and added to the subword list (Defaults to 0)
+ * If true, original words are preserved and added to the subword list (Defaults to false)
* <p/>
* "500-42" => "500" "42" "500-42"
*/
- final int preserveOriginal;
-
- /**
- * If 0, causes numeric changes to be ignored (subwords will only be generated
- * given SUBWORD_DELIM tokens). (Defaults to 1)
- */
- final int splitOnNumerics;
-
- /**
- * If 1, causes trailing "'s" to be removed for each subword. (Defaults to 1)
- * <p/>
- * "O'Neil's" => "O", "Neil"
- */
- final int stemEnglishPossessive;
+ final boolean preserveOriginal;
/**
* If not null is the set of tokens to protect from being delimited
*
*/
final CharArraySet protWords;
+
+ private final TermAttribute termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
+ private final OffsetAttribute offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ private final PositionIncrementAttribute posIncAttribute = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ private final TypeAttribute typeAttribute = (TypeAttribute) addAttribute(TypeAttribute.class);
+
+ // used for iterating word delimiter breaks
+ private final WordDelimiterIterator iterator;
+
+ // used for concatenating runs of similar typed subwords (word,number)
+ private final WordDelimiterConcatenation concat = new WordDelimiterConcatenation();
+ // number of subwords last output by concat.
+ private int lastConcatCount = 0;
+
+ // used for catenate all
+ private final WordDelimiterConcatenation concatAll = new WordDelimiterConcatenation();
+
+ // used for accumulating position increment gaps
+ private int accumPosInc = 0;
+
+ private char savedBuffer[] = new char[1024];
+ private int savedStartOffset;
+ private int savedEndOffset;
+ private String savedType;
+ private boolean hasSavedState = false;
+ // if length by start + end offsets doesn't match the term text then assume
+ // this is a synonym and don't adjust the offsets.
+ private boolean hasIllegalOffsets = false;
+
+ // for a run of the same subword type within a word, have we output anything?
+ private boolean hasOutputToken = false;
+ // when preserve original is on, have we output any token following it?
+ // this token must have posInc=0!
+ private boolean hasOutputFollowingOriginal = false;
/**
- *
* @param in Token stream to be filtered.
* @param charTypeTable
* @param generateWordParts If 1, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
@@ -175,19 +168,27 @@ final class WordDelimiterFilter extends
* @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
* @param protWords If not null is the set of tokens to protect from being delimited
*/
- public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, int stemEnglishPossessive, CharArraySet protWords) {
+ public WordDelimiterFilter(TokenStream in,
+ byte[] charTypeTable,
+ int generateWordParts,
+ int generateNumberParts,
+ int catenateWords,
+ int catenateNumbers,
+ int catenateAll,
+ int splitOnCaseChange,
+ int preserveOriginal,
+ int splitOnNumerics,
+ int stemEnglishPossessive,
+ CharArraySet protWords) {
super(in);
- this.generateWordParts = generateWordParts;
- this.generateNumberParts = generateNumberParts;
- this.catenateWords = catenateWords;
- this.catenateNumbers = catenateNumbers;
- this.catenateAll = catenateAll;
- this.splitOnCaseChange = splitOnCaseChange;
- this.preserveOriginal = preserveOriginal;
- this.charTypeTable = charTypeTable;
- this.splitOnNumerics = splitOnNumerics;
- this.stemEnglishPossessive = stemEnglishPossessive;
+ this.generateWordParts = generateWordParts != 0;
+ this.generateNumberParts = generateNumberParts != 0;
+ this.catenateWords = catenateWords != 0;
+ this.catenateNumbers = catenateNumbers != 0;
+ this.catenateAll = catenateAll != 0;
+ this.preserveOriginal = preserveOriginal != 0;
this.protWords = protWords;
+ this.iterator = new WordDelimiterIterator(charTypeTable, splitOnCaseChange != 0, splitOnNumerics != 0, stemEnglishPossessive != 0);
}
/**
@@ -198,8 +199,18 @@ final class WordDelimiterFilter extends
* instead.
*/
@Deprecated
- public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
- this(in,charTypeTable,generateWordParts,generateNumberParts,catenateWords,catenateNumbers,catenateAll,splitOnCaseChange,preserveOriginal, 1, 1, null);
+ public WordDelimiterFilter(TokenStream in,
+ byte[] charTypeTable,
+ int generateWordParts,
+ int generateNumberParts,
+ int catenateWords,
+ int catenateNumbers,
+ int catenateAll,
+ int splitOnCaseChange,
+ int preserveOriginal,
+ int splitOnNumerics,
+ CharArraySet protWords) {
+ this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, 1, 1, null);
}
/**
@@ -210,8 +221,16 @@ final class WordDelimiterFilter extends
* instead.
*/
@Deprecated
- public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) {
- this(in,charTypeTable,generateWordParts,generateNumberParts,catenateWords,catenateNumbers,catenateAll,splitOnCaseChange,preserveOriginal, 1, null);
+ public WordDelimiterFilter(TokenStream in,
+ byte[] charTypeTable,
+ int generateWordParts,
+ int generateNumberParts,
+ int catenateWords,
+ int catenateNumbers,
+ int catenateAll,
+ int splitOnCaseChange,
+ int preserveOriginal) {
+ this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, 1, null);
}
/**
@@ -227,8 +246,18 @@ final class WordDelimiterFilter extends
* @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
* @param protWords If not null is the set of tokens to protect from being delimited
*/
- public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, int stemEnglishPossessive, CharArraySet protWords) {
- this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, stemEnglishPossessive, protWords);
+ public WordDelimiterFilter(TokenStream in,
+ int generateWordParts,
+ int generateNumberParts,
+ int catenateWords,
+ int catenateNumbers,
+ int catenateAll,
+ int splitOnCaseChange,
+ int preserveOriginal,
+ int splitOnNumerics,
+ int stemEnglishPossessive,
+ CharArraySet protWords) {
+ this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, stemEnglishPossessive, protWords);
}
/**
@@ -237,8 +266,17 @@ final class WordDelimiterFilter extends
* instead.
*/
@Deprecated
- public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal,int splitOnNumerics, CharArraySet protWords) {
- this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, 1, protWords);
+ public WordDelimiterFilter(TokenStream in,
+ int generateWordParts,
+ int generateNumberParts,
+ int catenateWords,
+ int catenateNumbers,
+ int catenateAll,
+ int splitOnCaseChange,
+ int preserveOriginal,
+ int splitOnNumerics,
+ CharArraySet protWords) {
+ this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal, splitOnNumerics, 1, protWords);
}
/**
* Compatibility constructor
@@ -248,8 +286,15 @@ final class WordDelimiterFilter extends
* instead.
*/
@Deprecated
- public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll, int splitOnCaseChange, int preserveOriginal) {
- this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal);
+ public WordDelimiterFilter(TokenStream in,
+ int generateWordParts,
+ int generateNumberParts,
+ int catenateWords,
+ int catenateNumbers,
+ int catenateAll,
+ int splitOnCaseChange,
+ int preserveOriginal) {
+ this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, splitOnCaseChange, preserveOriginal);
}
/**
* Compatibility constructor
@@ -259,7 +304,13 @@ final class WordDelimiterFilter extends
* instead.
*/
@Deprecated
- public WordDelimiterFilter(TokenStream in, byte[] charTypeTable, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
+ public WordDelimiterFilter(TokenStream in,
+ byte[] charTypeTable,
+ int generateWordParts,
+ int generateNumberParts,
+ int catenateWords,
+ int catenateNumbers,
+ int catenateAll) {
this(in, charTypeTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0, 1, null);
}
/**
@@ -270,407 +321,365 @@ final class WordDelimiterFilter extends
* instead.
*/
@Deprecated
- public WordDelimiterFilter(TokenStream in, int generateWordParts, int generateNumberParts, int catenateWords, int catenateNumbers, int catenateAll) {
- this(in, defaultWordDelimTable, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0, 1, null);
+ public WordDelimiterFilter(TokenStream in,
+ int generateWordParts,
+ int generateNumberParts,
+ int catenateWords,
+ int catenateNumbers,
+ int catenateAll) {
+ this(in, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, generateWordParts, generateNumberParts, catenateWords, catenateNumbers, catenateAll, 1, 0, 1, null);
}
+
+ public boolean incrementToken() throws IOException {
+ while (true) {
+ if (!hasSavedState) {
+ // process a new input word
+ if (!input.incrementToken()) {
+ return false;
+ }
+ int termLength = termAttribute.termLength();
+ char[] termBuffer = termAttribute.termBuffer();
+
+ accumPosInc += posIncAttribute.getPositionIncrement();
+
+ iterator.setText(termBuffer, termLength);
+ iterator.next();
+
+ // word of no delimiters, or protected word: just return it
+ if ((iterator.current == 0 && iterator.end == termLength) ||
+ (protWords != null && protWords.contains(termBuffer, 0, termLength))) {
+ posIncAttribute.setPositionIncrement(accumPosInc);
+ accumPosInc = 0;
+ return true;
+ }
+
+ // word of simply delimiters
+ if (iterator.end == WordDelimiterIterator.DONE && !preserveOriginal) {
+ // if the posInc is 1, simply ignore it in the accumulation
+ if (posIncAttribute.getPositionIncrement() == 1) {
+ accumPosInc--;
+ }
+ continue;
+ }
- int charType(int ch) {
- if (ch<charTypeTable.length) {
- return charTypeTable[ch];
- }
- switch (Character.getType(ch)) {
- case Character.UPPERCASE_LETTER: return UPPER;
- case Character.LOWERCASE_LETTER: return LOWER;
-
- case Character.TITLECASE_LETTER:
- case Character.MODIFIER_LETTER:
- case Character.OTHER_LETTER:
- case Character.NON_SPACING_MARK:
- case Character.ENCLOSING_MARK: // depends what it encloses?
- case Character.COMBINING_SPACING_MARK:
- return ALPHA;
-
- case Character.DECIMAL_DIGIT_NUMBER:
- case Character.LETTER_NUMBER:
- case Character.OTHER_NUMBER:
- return DIGIT;
-
- // case Character.SPACE_SEPARATOR:
- // case Character.LINE_SEPARATOR:
- // case Character.PARAGRAPH_SEPARATOR:
- // case Character.CONTROL:
- // case Character.FORMAT:
- // case Character.PRIVATE_USE:
-
- case Character.SURROGATE: // prevent splitting
- return ALPHA|DIGIT;
-
- // case Character.DASH_PUNCTUATION:
- // case Character.START_PUNCTUATION:
- // case Character.END_PUNCTUATION:
- // case Character.CONNECTOR_PUNCTUATION:
- // case Character.OTHER_PUNCTUATION:
- // case Character.MATH_SYMBOL:
- // case Character.CURRENCY_SYMBOL:
- // case Character.MODIFIER_SYMBOL:
- // case Character.OTHER_SYMBOL:
- // case Character.INITIAL_QUOTE_PUNCTUATION:
- // case Character.FINAL_QUOTE_PUNCTUATION:
+ saveState();
- default: return SUBWORD_DELIM;
+ hasOutputToken = false;
+ hasOutputFollowingOriginal = !preserveOriginal;
+ lastConcatCount = 0;
+
+ if (preserveOriginal) {
+ posIncAttribute.setPositionIncrement(accumPosInc);
+ accumPosInc = 0;
+ return true;
+ }
+ }
+
+ // at the end of the string, output any concatenations
+ if (iterator.end == WordDelimiterIterator.DONE) {
+ if (!concat.isEmpty()) {
+ if (flushConcatenation(concat)) {
+ return true;
+ }
+ }
+
+ if (!concatAll.isEmpty()) {
+ // only if we haven't output this same combo above!
+ if (concatAll.subwordCount > lastConcatCount) {
+ concatAll.writeAndClear();
+ return true;
+ }
+ concatAll.clear();
+ }
+
+ // no saved concatenations, on to the next input word
+ hasSavedState = false;
+ continue;
+ }
+
+ // word surrounded by delimiters: always output
+ if (iterator.isSingleWord()) {
+ generatePart(true);
+ iterator.next();
+ return true;
+ }
+
+ int wordType = iterator.type();
+
+ // do we already have queued up incompatible concatenations?
+ if (!concat.isEmpty() && (concat.type & wordType) == 0) {
+ if (flushConcatenation(concat)) {
+ hasOutputToken = false;
+ return true;
+ }
+ hasOutputToken = false;
+ }
+
+ // add subwords depending upon options
+ if (shouldConcatenate(wordType)) {
+ if (concat.isEmpty()) {
+ concat.type = wordType;
+ }
+ concatenate(concat);
+ }
+
+ // add all subwords (catenateAll)
+ if (catenateAll) {
+ concatenate(concatAll);
+ }
+
+ // if we should output the word or number part
+ if (shouldGenerateParts(wordType)) {
+ generatePart(false);
+ iterator.next();
+ return true;
+ }
+
+ iterator.next();
}
}
- // use the type of the first char as the type
- // of the token.
- private int tokType(Token t) {
- return charType(t.termBuffer()[0]);
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ hasSavedState = false;
+ concat.clear();
+ concatAll.clear();
+ accumPosInc = 0;
}
- // There isn't really an efficient queue class, so we will
- // just use an array for now.
- private ArrayList<Token> queue = new ArrayList<Token>(4);
- private int queuePos=0;
-
- // temporary working queue
- private ArrayList<Token> tlist = new ArrayList<Token>(4);
+ // ================================================= Helper Methods ================================================
+ /**
+ * Saves the existing attribute states
+ */
+ private void saveState() {
+ // otherwise, we have delimiters, save state
+ savedStartOffset = offsetAttribute.startOffset();
+ savedEndOffset = offsetAttribute.endOffset();
+ // if length by start + end offsets doesn't match the term text then assume this is a synonym and don't adjust the offsets.
+ hasIllegalOffsets = (savedEndOffset - savedStartOffset != termAttribute.termLength());
+ savedType = typeAttribute.type();
- private Token newTok(Token orig, int start, int end) {
- int startOff = orig.startOffset();
- int endOff = orig.endOffset();
- // if length by start + end offsets doesn't match the term text then assume
- // this is a synonym and don't adjust the offsets.
- if (orig.termLength() == endOff-startOff) {
- endOff = startOff + end;
- startOff += start;
+ if (savedBuffer.length < termAttribute.termLength()) {
+ savedBuffer = new char[ArrayUtil.getNextSize(termAttribute.termLength())];
}
- return (Token)orig.clone(orig.termBuffer(), start, (end - start), startOff, endOff);
- }
-
-
- public final Token next(Token in) throws IOException {
+ System.arraycopy(termAttribute.termBuffer(), 0, savedBuffer, 0, termAttribute.termLength());
+ iterator.text = savedBuffer;
- // check the queue first
- if (queuePos<queue.size()) {
- return queue.get(queuePos++);
- }
+ hasSavedState = true;
+ }
- // reset the queue if it had been previously used
- if (queuePos!=0) {
- queuePos=0;
- queue.clear();
+ /**
+ * Flushes the given WordDelimiterConcatenation by either writing its concat and then clearing, or just clearing.
+ *
+ * @param concatenation WordDelimiterConcatenation that will be flushed
+ * @return {@code true} if the concatenation was written before it was cleared, {@code false} otherwise
+ */
+ private boolean flushConcatenation(WordDelimiterConcatenation concatenation) {
+ lastConcatCount = concatenation.subwordCount;
+ if (concatenation.subwordCount != 1 || !shouldGenerateParts(concatenation.type)) {
+ concatenation.writeAndClear();
+ return true;
}
+ concatenation.clear();
+ return false;
+ }
+ /**
+ * Determines whether to concatenate a word or number if the current word is the given type
+ *
+ * @param wordType Type of the current word used to determine if it should be concatenated
+ * @return {@code true} if concatenation should occur, {@code false} otherwise
+ */
+ private boolean shouldConcatenate(int wordType) {
+ return (catenateWords && isAlpha(wordType)) || (catenateNumbers && isDigit(wordType));
+ }
- // optimize for the common case: assume there will be
- // no subwords (just a simple word)
- //
- // Would it actually be faster to check for the common form
- // of isLetter() isLower()*, and then backtrack if it doesn't match?
-
- int origPosIncrement = 0;
- Token t;
- while(true) {
- // t is either returned, or a new token is made from it, so it should
- // be safe to use the next(Token) method.
- t = input.next(in);
- if (t == null) return null;
-
- char [] termBuffer = t.termBuffer();
- int len = t.termLength();
- int start=0;
- if (len ==0) continue;
-
- int posInc = t.getPositionIncrement();
- origPosIncrement += posInc;
-
- //skip protected tokens
- if (protWords != null && protWords.contains(termBuffer, 0, len)) {
- t.setPositionIncrement(origPosIncrement);
- return t;
- }
-
- // Avoid calling charType more than once for each char (basically
- // avoid any backtracking).
- // makes code slightly more difficult, but faster.
- int ch=termBuffer[start];
- int type=charType(ch);
-
- int numWords=0;
-
- while (start< len) {
- // first eat delimiters at the start of this subword
- while ((type & SUBWORD_DELIM)!=0 && ++start< len) {
- ch=termBuffer[start];
- type=charType(ch);
- }
-
- int pos=start;
-
- // save the type of the first char of the subword
- // as a way to tell what type of subword token this is (number, word, etc)
- int firstType=type;
- int lastType=type; // type of the previously read char
-
-
- while (pos< len) {
-
- if ((type & lastType)==0) { // no overlap in character type
- // check and remove "'s" from the end of a token.
- // the pattern to check for is
- // ALPHA "'" ("s"|"S") (SUBWORD_DELIM | END)
- if (stemEnglishPossessive != 0 && ((lastType & ALPHA)!=0)) {
- if (ch=='\'' && pos+1< len
- && (termBuffer[pos+1]=='s' || termBuffer[pos+1]=='S'))
- {
- int subWordEnd=pos;
- if (pos+2>= len) {
- // end of string detected after "'s"
- pos+=2;
- } else {
- // make sure that a delimiter follows "'s"
- int ch2 = termBuffer[pos+2];
- int type2 = charType(ch2);
- if ((type2 & SUBWORD_DELIM)!=0) {
- // if delimiter, move position pointer
- // to it (skipping over "'s"
- ch=ch2;
- type=type2;
- pos+=2;
- }
- }
-
- queue.add(newTok(t,start,subWordEnd));
- if ((firstType & ALPHA)!=0) numWords++;
- break;
- }
- }
-
- // For case changes, only split on a transition from
- // lower to upper case, not vice-versa.
- // That will correctly handle the
- // case of a word starting with a capital (won't split).
- // It will also handle pluralization of
- // an uppercase word such as FOOs (won't split).
-
- if (splitOnCaseChange == 0 &&
- (lastType & ALPHA) != 0 && (type & ALPHA) != 0) {
- // ALPHA->ALPHA: always ignore if case isn't considered.
- } else if ((lastType & UPPER)!=0 && (type & ALPHA)!=0) {
- // UPPER->letter: Don't split
- } else if(splitOnNumerics == 0 &&
- ( ((lastType & ALPHA) != 0 && (type & DIGIT) != 0) || ((lastType & DIGIT) != 0 && (type & ALPHA) != 0) ) ) {
- // ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split
- } else {
- // NOTE: this code currently assumes that only one flag
- // is set for each character now, so we don't have
- // to explicitly check for all the classes of transitions
- // listed below.
-
- // LOWER->UPPER
- // ALPHA->NUMERIC
- // NUMERIC->ALPHA
- // *->DELIMITER
- queue.add(newTok(t,start,pos));
- if ((firstType & ALPHA)!=0) numWords++;
- break;
- }
- }
-
- if (++pos >= len) {
- if (start==0) {
- // the subword is the whole original token, so
- // return it unchanged.
- t.setPositionIncrement(origPosIncrement);
- return t;
- }
-
- // optimization... if this is the only token,
- // return it immediately.
- if (queue.size()==0 && preserveOriginal == 0) {
- // just adjust the text w/o changing the rest
- // of the original token
- t.setTermBuffer(termBuffer, start, len-start);
- t.setStartOffset(t.startOffset() + start);
- t.setPositionIncrement(origPosIncrement);
- return t;
- }
-
- Token newtok = newTok(t,start,pos);
-
- queue.add(newtok);
- if ((firstType & ALPHA)!=0) numWords++;
- break;
- }
-
- lastType = type;
- ch = termBuffer[pos];
- type = charType(ch);
- }
-
- // start of the next subword is the current position
- start=pos;
- }
-
- // System.out.println("##########TOKEN=" + s + " ######### WORD DELIMITER QUEUE=" + str(queue));
-
- final int numtok = queue.size();
-
- // We reached the end of the current token.
- // If the queue is empty, we should continue by reading
- // the next token
- if (numtok==0) {
- // the token might have been all delimiters, in which
- // case return it if we're meant to preserve it
- if (preserveOriginal != 0) {
- return t;
- }
-
- // if this token had a "normal" gap of 1, remove it.
- if (posInc==1) origPosIncrement-=1;
- continue;
- }
-
- // if number of tokens is 1, there are no catenations to be done.
- if (numtok==1) {
- break;
- }
-
-
- final int numNumbers = numtok - numWords;
+ /**
+ * Determines whether a word/number part should be generated for a word of the given type
+ *
+ * @param wordType Type of the word used to determine if a word/number part should be generated
+ * @return {@code true} if a word/number part should be generated, {@code false} otherwise
+ */
+ private boolean shouldGenerateParts(int wordType) {
+ return (generateWordParts && isAlpha(wordType)) || (generateNumberParts && isDigit(wordType));
+ }
- // check conditions under which the current token
- // queue may be used as-is (no catenations needed)
- if (catenateAll==0 // no "everything" to catenate
- && (catenateWords==0 || numWords<=1) // no words to catenate
- && (catenateNumbers==0 || numNumbers<=1) // no numbers to catenate
- && (generateWordParts!=0 || numWords==0) // word generation is on
- && (generateNumberParts!=0 || numNumbers==0)) // number generation is on
- {
- break;
- }
+ /**
+ * Concatenates the saved buffer to the given WordDelimiterConcatenation
+ *
+ * @param concatenation WordDelimiterConcatenation to concatenate the buffer to
+ */
+ private void concatenate(WordDelimiterConcatenation concatenation) {
+ if (concatenation.isEmpty()) {
+ concatenation.startOffset = savedStartOffset + iterator.current;
+ }
+ concatenation.append(savedBuffer, iterator.current, iterator.end - iterator.current);
+ concatenation.endOffset = savedStartOffset + iterator.end;
+ }
+ /**
+ * Generates a word/number part, updating the appropriate attributes
+ *
+ * @param isSingleWord {@code true} if the generation is occurring from a single word, {@code false} otherwise
+ */
+ private void generatePart(boolean isSingleWord) {
+ clearAttributes();
+ termAttribute.setTermBuffer(savedBuffer, iterator.current, iterator.end - iterator.current);
- // swap queue and the temporary working list, then clear the
- // queue in preparation for adding all combinations back to it.
- ArrayList<Token> tmp=tlist;
- tlist=queue;
- queue=tmp;
- queue.clear();
-
- if (numWords==0) {
- // all numbers
- addCombos(tlist,0,numtok,generateNumberParts!=0,catenateNumbers!=0 || catenateAll!=0, 1);
- if (queue.size() > 0 || preserveOriginal!=0) break; else continue;
- } else if (numNumbers==0) {
- // all words
- addCombos(tlist,0,numtok,generateWordParts!=0,catenateWords!=0 || catenateAll!=0, 1);
- if (queue.size() > 0 || preserveOriginal!=0) break; else continue;
- } else if (generateNumberParts==0 && generateWordParts==0 && catenateNumbers==0 && catenateWords==0) {
- // catenate all *only*
- // OPT:could be optimized to add to current queue...
- addCombos(tlist,0,numtok,false,catenateAll!=0, 1);
- if (queue.size() > 0 || preserveOriginal!=0) break; else continue;
- }
+ int startOffSet = (isSingleWord || !hasIllegalOffsets) ? savedStartOffset + iterator.current : savedStartOffset;
+ int endOffSet = (hasIllegalOffsets) ? savedEndOffset : savedStartOffset + iterator.end;
- //
- // Find all adjacent tokens of the same type.
- //
- Token tok = tlist.get(0);
- boolean isWord = (tokType(tok) & ALPHA) != 0;
- boolean wasWord=isWord;
-
- for(int i=0; i<numtok;) {
- int j;
- for (j=i+1; j<numtok; j++) {
- wasWord=isWord;
- tok = tlist.get(j);
- isWord = (tokType(tok) & ALPHA) != 0;
- if (isWord != wasWord) break;
- }
- if (wasWord) {
- addCombos(tlist,i,j,generateWordParts!=0,catenateWords!=0,1);
- } else {
- addCombos(tlist,i,j,generateNumberParts!=0,catenateNumbers!=0,1);
- }
- i=j;
- }
+ offsetAttribute.setOffset(startOffSet, endOffSet);
+ posIncAttribute.setPositionIncrement(position(false));
+ typeAttribute.setType(savedType);
+ }
- // take care catenating all subwords
- if (catenateAll!=0) {
- addCombos(tlist,0,numtok,false,true,0);
- }
+ /**
+ * Get the position increment gap for a subword or concatenation
+ *
+ * @param inject true if this token wants to be injected
+ * @return position increment gap
+ */
+ private int position(boolean inject) {
+ int posInc = accumPosInc;
- // NOTE: in certain cases, queue may be empty (for instance, if catenate
- // and generate are both set to false). Only exit the loop if the queue
- // is not empty.
- if (queue.size() > 0 || preserveOriginal!=0) break;
+ if (hasOutputToken) {
+ accumPosInc = 0;
+ return inject ? 0 : Math.max(1, posInc);
}
- // System.out.println("##########AFTER COMBINATIONS:"+ str(queue));
-
- if (preserveOriginal != 0) {
- queuePos = 0;
- if (queue.size() > 0) {
- // overlap first token with the original
- queue.get(0).setPositionIncrement(0);
- }
- return t; // return the original
- } else {
- queuePos=1;
- Token tok = queue.get(0);
- tok.setPositionIncrement(origPosIncrement);
- return tok;
+ hasOutputToken = true;
+
+ if (!hasOutputFollowingOriginal) {
+ // the first token following the original is 0 regardless
+ hasOutputFollowingOriginal = true;
+ return 0;
}
+ // clear the accumulated position increment
+ accumPosInc = 0;
+ return Math.max(1, posInc);
}
+ /**
+ * Checks if the given word type includes {@link #ALPHA}
+ *
+ * @param type Word type to check
+ * @return {@code true} if the type contains ALPHA, {@code false} otherwise
+ */
+ static boolean isAlpha(int type) {
+ return (type & ALPHA) != 0;
+ }
- // index "a","b","c" as pos0="a", pos1="b", pos2="c", pos2="abc"
- private void addCombos(List<Token> lst, int start, int end, boolean generateSubwords, boolean catenateSubwords, int posOffset) {
- if (end-start==1) {
- // always generate a word alone, even if generateSubwords=0 because
- // the catenation of all the subwords *is* the subword.
- queue.add(lst.get(start));
- return;
- }
-
- StringBuilder sb = null;
- if (catenateSubwords) sb=new StringBuilder();
- Token firstTok=null;
- Token tok=null;
- for (int i=start; i<end; i++) {
- tok = lst.get(i);
- if (catenateSubwords) {
- if (i==start) firstTok=tok;
- sb.append(tok.termBuffer(), 0, tok.termLength());
- }
- if (generateSubwords) {
- queue.add(tok);
- }
- }
+ /**
+ * Checks if the given word type includes {@link #DIGIT}
+ *
+ * @param type Word type to check
+ * @return {@code true} if the type contains DIGIT, {@code false} otherwise
+ */
+ static boolean isDigit(int type) {
+ return (type & DIGIT) != 0;
+ }
- if (catenateSubwords) {
- Token concatTok = new Token(sb.toString(),
- firstTok.startOffset(),
- tok.endOffset(),
- firstTok.type());
- // if we indexed some other tokens, then overlap concatTok with the last.
- // Otherwise, use the value passed in as the position offset.
- concatTok.setPositionIncrement(generateSubwords==true ? 0 : posOffset);
- queue.add(concatTok);
- }
+ /**
+ * Checks if the given word type includes {@link #SUBWORD_DELIM}
+ *
+ * @param type Word type to check
+ * @return {@code true} if the type contains SUBWORD_DELIM, {@code false} otherwise
+ */
+ static boolean isSubwordDelim(int type) {
+ return (type & SUBWORD_DELIM) != 0;
}
- @Override
- public void reset() throws IOException {
- input.reset();
- queuePos=0;
- queue.clear();
+ /**
+ * Checks if the given word type includes {@link #UPPER}
+ *
+ * @param type Word type to check
+ * @return {@code true} if the type contains UPPER, {@code false} otherwise
+ */
+ static boolean isUpper(int type) {
+ return (type & UPPER) != 0;
}
+ // ================================================= Inner Classes =================================================
+
+ /**
+ * A WDF concatenated 'run'
+ */
+ final class WordDelimiterConcatenation {
+ final StringBuilder buffer = new StringBuilder();
+ int startOffset;
+ int endOffset;
+ int type;
+ int subwordCount;
+
+ /**
+ * Appends the given text of the given length, to the concatenation at the given offset
+ *
+ * @param text Text to append
+ * @param offset Offset in the concatenation to add the text
+ * @param length Length of the text to append
+ */
+ void append(char text[], int offset, int length) {
+ buffer.append(text, offset, length);
+ subwordCount++;
+ }
+
+ /**
+ * Writes the concatenation to the attributes
+ */
+ void write() {
+ clearAttributes();
+ if (termAttribute.termLength() < buffer.length()) {
+ termAttribute.resizeTermBuffer(buffer.length());
+ }
+ char termbuffer[] = termAttribute.termBuffer();
+
+ buffer.getChars(0, buffer.length(), termbuffer, 0);
+ termAttribute.setTermLength(buffer.length());
+
+ if (hasIllegalOffsets) {
+ offsetAttribute.setOffset(savedStartOffset, savedEndOffset);
+ }
+ else {
+ offsetAttribute.setOffset(startOffset, endOffset);
+ }
+ posIncAttribute.setPositionIncrement(position(true));
+ typeAttribute.setType(savedType);
+ accumPosInc = 0;
+ }
+
+ /**
+ * Determines if the concatenation is empty
+ *
+ * @return {@code true} if the concatenation is empty, {@code false} otherwise
+ */
+ boolean isEmpty() {
+ return buffer.length() == 0;
+ }
+
+ /**
+ * Clears the concatenation and resets its state
+ */
+ void clear() {
+ buffer.setLength(0);
+ startOffset = endOffset = type = subwordCount = 0;
+ }
+
+ /**
+ * Convenience method for the common scenario of having to write the concatenation and then clearing its state
+ */
+ void writeAndClear() {
+ write();
+ clear();
+ }
+ }
// questions:
// negative numbers? -42 indexed as just 42?
// dollar sign? $42
Added: lucene/solr/branches/branch-1.4/src/java/org/apache/solr/analysis/WordDelimiterIterator.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/src/java/org/apache/solr/analysis/WordDelimiterIterator.java?rev=950711&view=auto
==============================================================================
--- lucene/solr/branches/branch-1.4/src/java/org/apache/solr/analysis/WordDelimiterIterator.java (added)
+++ lucene/solr/branches/branch-1.4/src/java/org/apache/solr/analysis/WordDelimiterIterator.java Wed Jun 2 18:35:10 2010
@@ -0,0 +1,315 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static org.apache.solr.analysis.WordDelimiterFilter.*;
+
+/**
+ * A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterFilter rules.
+ */
+final class WordDelimiterIterator {
+
+ /** Indicates the end of iteration */
+ public static final int DONE = -1;
+
+ public static final byte[] DEFAULT_WORD_DELIM_TABLE;
+
+ char text[];
+ int length;
+
+ /** start position of text, excluding leading delimiters */
+ int startBounds;
+ /** end position of text, excluding trailing delimiters */
+ int endBounds;
+
+ /** Beginning of subword */
+ int current;
+ /** End of subword */
+ int end;
+
+ /* does this string end with a possessive such as 's */
+ private boolean hasFinalPossessive = false;
+
+ /**
+ * If false, causes case changes to be ignored (subwords will only be generated
+ * given SUBWORD_DELIM tokens). (Defaults to true)
+ */
+ final boolean splitOnCaseChange;
+
+ /**
+ * If false, causes numeric changes to be ignored (subwords will only be generated
+ * given SUBWORD_DELIM tokens). (Defaults to true)
+ */
+ final boolean splitOnNumerics;
+
+ /**
+ * If true, causes trailing "'s" to be removed for each subword. (Defaults to true)
+ * <p/>
+ * "O'Neil's" => "O", "Neil"
+ */
+ final boolean stemEnglishPossessive;
+
+ private final byte[] charTypeTable;
+
+ /** if true, need to skip over a possessive found in the last call to next() */
+ private boolean skipPossessive = false;
+
+ // TODO: should there be a WORD_DELIM category for chars that only separate words (no catenation of subwords will be
+ // done if separated by these chars?) "," would be an obvious candidate...
+ static {
+ byte[] tab = new byte[256];
+ for (int i = 0; i < 256; i++) {
+ byte code = 0;
+ if (Character.isLowerCase(i)) {
+ code |= LOWER;
+ }
+ else if (Character.isUpperCase(i)) {
+ code |= UPPER;
+ }
+ else if (Character.isDigit(i)) {
+ code |= DIGIT;
+ }
+ if (code == 0) {
+ code = SUBWORD_DELIM;
+ }
+ tab[i] = code;
+ }
+ DEFAULT_WORD_DELIM_TABLE = tab;
+ }
+
+ /**
+ * Create a new WordDelimiterIterator operating with the supplied rules.
+ *
+ * @param charTypeTable table containing character types
+ * @param splitOnCaseChange if true, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
+ * @param splitOnNumerics if true, causes "j2se" to be three tokens; "j" "2" "se"
+ * @param stemEnglishPossessive if true, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
+ */
+ WordDelimiterIterator(byte[] charTypeTable, boolean splitOnCaseChange, boolean splitOnNumerics, boolean stemEnglishPossessive) {
+ this.charTypeTable = charTypeTable;
+ this.splitOnCaseChange = splitOnCaseChange;
+ this.splitOnNumerics = splitOnNumerics;
+ this.stemEnglishPossessive = stemEnglishPossessive;
+ }
+
+ /**
+ * Advance to the next subword in the string.
+ *
+ * @return index of the next subword, or {@link #DONE} if all subwords have been returned
+ */
+ int next() {
+ current = end;
+ if (current == DONE) {
+ return DONE;
+ }
+
+ if (skipPossessive) {
+ current += 2;
+ skipPossessive = false;
+ }
+
+ int lastType = 0;
+
+ while (current < endBounds && (isSubwordDelim(lastType = charType(text[current])))) {
+ current++;
+ }
+
+ if (current >= endBounds) {
+ return end = DONE;
+ }
+
+ for (end = current + 1; end < endBounds; end++) {
+ int type = charType(text[end]);
+ if (isBreak(lastType, type)) {
+ break;
+ }
+ lastType = type;
+ }
+
+ if (end < endBounds - 1 && endsWithPossessive(end + 2)) {
+ skipPossessive = true;
+ }
+
+ return end;
+ }
+
+
+ /**
+ * Return the type of the current subword.
+ * This currently uses the type of the first character in the subword.
+ *
+ * @return type of the current word
+ */
+ int type() {
+ if (end == DONE) {
+ return 0;
+ }
+
+ int type = charType(text[current]);
+ switch (type) {
+ // return ALPHA word type for both lower and upper
+ case LOWER:
+ case UPPER:
+ return ALPHA;
+ default:
+ return type;
+ }
+ }
+
+ /**
+ * Reset the text to a new value, and reset all state
+ *
+ * @param text New text
+ * @param length length of the text
+ */
+ void setText(char text[], int length) {
+ this.text = text;
+ this.length = this.endBounds = length;
+ current = startBounds = end = 0;
+ skipPossessive = hasFinalPossessive = false;
+ setBounds();
+ }
+
+ // ================================================= Helper Methods ================================================
+
+ /**
+ * Determines whether the transition from lastType to type indicates a break
+ *
+ * @param lastType Last subword type
+ * @param type Current subword type
+ * @return {@code true} if the transition indicates a break, {@code false} otherwise
+ */
+ private boolean isBreak(int lastType, int type) {
+ if ((type & lastType) != 0) {
+ return false;
+ }
+
+ if (!splitOnCaseChange && isAlpha(lastType) && isAlpha(type)) {
+ // ALPHA->ALPHA: always ignore if case isn't considered.
+ return false;
+ } else if (isUpper(lastType) && isAlpha(type)) {
+ // UPPER->letter: Don't split
+ return false;
+ } else if (!splitOnNumerics && ((isAlpha(lastType) && isDigit(type)) || (isDigit(lastType) && isAlpha(type)))) {
+ // ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split
+ return false;
+ }
+
+ return true;
+ }
+
+ /**
+ * Determines if the current word contains only one subword. Note, it could be potentially surrounded by delimiters
+ *
+ * @return {@code true} if the current word contains only one subword, {@code false} otherwise
+ */
+ boolean isSingleWord() {
+ if (hasFinalPossessive) {
+ return current == startBounds && end == endBounds - 2;
+ }
+ else {
+ return current == startBounds && end == endBounds;
+ }
+ }
+
+ /**
+ * Set the internal word bounds (remove leading and trailing delimiters). Note, if a possessive is found, don't remove
+ * it yet, simply note it.
+ */
+ private void setBounds() {
+ while (startBounds < length && (isSubwordDelim(charType(text[startBounds])))) {
+ startBounds++;
+ }
+
+ while (endBounds > startBounds && (isSubwordDelim(charType(text[endBounds - 1])))) {
+ endBounds--;
+ }
+ if (endsWithPossessive(endBounds)) {
+ hasFinalPossessive = true;
+ }
+ current = startBounds;
+ }
+
+ /**
+ * Determines if the text at the given position indicates an English possessive which should be removed
+ *
+ * @param pos Position in the text to check if it indicates an English possessive
+ * @return {@code true} if the text at the position indicates an English possessive, {@code false} otherwise
+ */
+ private boolean endsWithPossessive(int pos) {
+ return (stemEnglishPossessive &&
+ pos > 2 &&
+ text[pos - 2] == '\'' &&
+ (text[pos - 1] == 's' || text[pos - 1] == 'S') &&
+ isAlpha(charType(text[pos - 3])) &&
+ (pos == endBounds || isSubwordDelim(charType(text[pos]))));
+ }
+
+ /**
+ * Determines the type of the given character
+ *
+ * @param ch Character whose type is to be determined
+ * @return Type of the character
+ */
+ private int charType(int ch) {
+ if (ch < charTypeTable.length) {
+ return charTypeTable[ch];
+ }
+ switch (Character.getType(ch)) {
+ case Character.UPPERCASE_LETTER: return UPPER;
+ case Character.LOWERCASE_LETTER: return LOWER;
+
+ case Character.TITLECASE_LETTER:
+ case Character.MODIFIER_LETTER:
+ case Character.OTHER_LETTER:
+ case Character.NON_SPACING_MARK:
+ case Character.ENCLOSING_MARK: // depends what it encloses?
+ case Character.COMBINING_SPACING_MARK:
+ return ALPHA;
+
+ case Character.DECIMAL_DIGIT_NUMBER:
+ case Character.LETTER_NUMBER:
+ case Character.OTHER_NUMBER:
+ return DIGIT;
+
+ // case Character.SPACE_SEPARATOR:
+ // case Character.LINE_SEPARATOR:
+ // case Character.PARAGRAPH_SEPARATOR:
+ // case Character.CONTROL:
+ // case Character.FORMAT:
+ // case Character.PRIVATE_USE:
+
+ case Character.SURROGATE: // prevent splitting
+ return ALPHA|DIGIT;
+
+ // case Character.DASH_PUNCTUATION:
+ // case Character.START_PUNCTUATION:
+ // case Character.END_PUNCTUATION:
+ // case Character.CONNECTOR_PUNCTUATION:
+ // case Character.OTHER_PUNCTUATION:
+ // case Character.MATH_SYMBOL:
+ // case Character.CURRENCY_SYMBOL:
+ // case Character.MODIFIER_SYMBOL:
+ // case Character.OTHER_SYMBOL:
+ // case Character.INITIAL_QUOTE_PUNCTUATION:
+ // case Character.FINAL_QUOTE_PUNCTUATION:
+
+ default: return SUBWORD_DELIM;
+ }
+ }
+}
\ No newline at end of file
Propchange: lucene/solr/branches/branch-1.4/src/java/org/apache/solr/analysis/WordDelimiterIterator.java
------------------------------------------------------------------------------
svn:eol-style = native
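
As an aside, a minimal sketch of how the new iterator is driven: the same protocol WordDelimiterFilter.incrementToken() uses, setText() followed by repeated next() calls until DONE, reading each subword as the [current, end) span. This fragment is illustrative only and assumes it lives in the org.apache.solr.analysis package, since the class, its constructor, and its fields are package-private:

    char[] text = "Wi-Fi-4000".toCharArray();
    WordDelimiterIterator it = new WordDelimiterIterator(
        WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, true, true, true);
    it.setText(text, text.length);
    while (it.next() != WordDelimiterIterator.DONE) {
      // each iteration exposes one subword as text[current, end)
      System.out.println(new String(text, it.current, it.end - it.current));  // Wi, Fi, 4000
    }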
Modified: lucene/solr/branches/branch-1.4/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java
URL: http://svn.apache.org/viewvc/lucene/solr/branches/branch-1.4/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java?rev=950711&r1=950710&r2=950711&view=diff
==============================================================================
--- lucene/solr/branches/branch-1.4/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java (original)
+++ lucene/solr/branches/branch-1.4/src/test/org/apache/solr/analysis/TestWordDelimiterFilter.java Wed Jun 2 18:35:10 2010
@@ -24,6 +24,8 @@ import org.apache.lucene.analysis.TokenF
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@@ -484,6 +486,29 @@ public class TestWordDelimiterFilter ext
new int[] { 0, 9, 15 },
new int[] { 6, 14, 19 },
new int[] { 1, 11, 1 });
+
+ Analyzer a3 = new Analyzer() {
+ public TokenStream tokenStream(String field, Reader reader) {
+ StopFilter filter = new StopFilter(
+ new WhitespaceTokenizer(reader), StandardAnalyzer.STOP_WORDS_SET);
+ filter.setEnablePositionIncrements(true);
+ return new WordDelimiterFilter(filter,
+ 1, 1, 0, 0, 1, 1, 0, 1, 1, protWords);
+ }
+ };
+
+ assertAnalyzesTo(a3, "lucene.solr",
+ new String[] { "lucene", "solr", "lucenesolr" },
+ new int[] { 0, 7, 0 },
+ new int[] { 6, 11, 11 },
+ new int[] { 1, 1, 0 });
+
+ /* the stopword should add a gap here */
+ assertAnalyzesTo(a3, "the lucene.solr",
+ new String[] { "lucene", "solr", "lucenesolr" },
+ new int[] { 4, 11, 4 },
+ new int[] { 10, 15, 15 },
+ new int[] { 2, 1, 0 });
}
private void assertAnalyzesTo(Analyzer a, String input, String[] output,