Re: [Lucene-users] Indexing part number

Robert J. Lebowitz Tue, 02 Oct 2001 07:38:33 -0700

> Could anybody let me know what should be changed in the "IndexFiles"
> demo to let me index and query "pure" digit part numbers? Currently only
> alphabetic queries seem to work; digits and special characters (-, _, /, ...)
> are ignored.


If I understand your question correctly, you want to index number (digit)
sequences, not just letter characters.  You'll need a modified Analyzer and
Tokenizer for this purpose.  See the examples below.  These are simply
slightly modified adaptations of one of the Analyzers found in the Lucene
distribution.

Rob

package org.firstcall.dod.index;
import org.firstcall.dod.index;
import com.lucene.analysis.*;
import java.io.Reader;
import java.util.Hashtable;

/**
 *  Analyzer that splits text with {@link AlphaNumericTokenizer}, so that runs
 *  of digits are indexed alongside runs of letters, then drops stop words and
 *  passes the surviving tokens through a PorterStemFilter.
 */
public class AlphaNumericAnalyzer extends Analyzer {

   /**
    *  An array containing some common English words that are not usually
    *  useful for searching.  Used as the stop list by the no-argument
    *  constructor.
    */
   public final static String[] ENGLISH_STOP_WORDS = {
         "a", "and", "are", "as", "at", "be", "but", "by",
         "for", "if", "in", "into", "is", "it",
         "no", "not", "of", "on", "or", "s", "such",
         "t", "that", "the", "their", "then", "there", "these",
         "they", "this", "to", "was", "will", "with"
         };

   /**
    *  Stop-word lookup table built once by StopFilter.makeStopTable and
    *  handed to every StopFilter this analyzer creates.
    */
   private final Hashtable stopTable;

   /**
    *  Builds an analyzer which removes words in ENGLISH_STOP_WORDS.
    */
   public AlphaNumericAnalyzer() {
      this(ENGLISH_STOP_WORDS);
   }

   /**
    *  Builds an analyzer which removes words in the provided array.
    *
    *@param  stopWords  the words to drop from the token stream
    */
   public AlphaNumericAnalyzer(String[] stopWords) {
      stopTable = StopFilter.makeStopTable(stopWords);
   }

   /**
    *  Builds the analysis chain for one input: an AlphaNumericTokenizer over
    *  the reader, wrapped in a StopFilter using this analyzer's stop table,
    *  wrapped in a PorterStemFilter.
    *
    *@param  reader  source of the text to analyze
    *@return         the PorterStemFilter at the head of the chain
    */
   public final TokenStream tokenStream(Reader reader) {
      TokenStream tokens = new AlphaNumericTokenizer(reader);
      TokenStream withoutStopWords = new StopFilter(tokens, stopTable);
      return new PorterStemFilter(withoutStopWords);
   }
}

package org.firstcall.dod.index;

import org.firstcall.dod.index;
import com.lucene.analysis.*;
import java.io.*;

/**
 *  Tokenizer that emits maximal runs of letters and digits, lower-cased.
 *  Every other character acts as a separator and is discarded.  A run that
 *  reaches MAX_WORD_LEN characters is cut there and the remainder is
 *  returned as the following token.
 */
public class AlphaNumericTokenizer extends Tokenizer {

   /** Longest token emitted; also the size of the token buffer. */
   private final static int MAX_WORD_LEN = 255;
   /** Number of chars requested from the reader per refill. */
   private final static int IO_BUFFER_SIZE = 1024;

   /** Characters of the token currently being assembled. */
   private final char[] buffer = new char[MAX_WORD_LEN];
   /** Most recent chunk read from the input. */
   private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

   /** Count of loop iterations so far, used as the character offset. */
   private int offset = 0;
   /** Index of the next unread char in ioBuffer. */
   private int bufferIndex = 0;
   /** Result of the last read: chars available in ioBuffer, or -1 at end. */
   private int dataLen = 0;

   /**
    *  Creates a tokenizer reading from the given reader.
    *
    *@param  in  source of characters to tokenize
    */
   public AlphaNumericTokenizer(Reader in) {
      input = in;
   }

   /**
    *  Returns the next run of letters/digits as a lower-cased Token.
    *
    *@return                 the next token, or null once the input is
    *                        exhausted and no characters are pending
    *@throws  IOException    if reading from the underlying reader fails
    */
   public final Token next() throws IOException {
      int tokenLen = 0;
      int tokenStart = offset;

      for (;;) {
         offset++;

         // Refill the I/O buffer once everything in it has been consumed.
         if (bufferIndex >= dataLen) {
            dataLen = input.read(ioBuffer);
            bufferIndex = 0;
         }

         // End of input: flush a pending token, otherwise signal the end.
         if (dataLen == -1) {
            if (tokenLen == 0) {
               return null;
            }
            break;
         }

         final char ch = ioBuffer[bufferIndex++];

         if (Character.isLetterOrDigit(ch)) {
            if (tokenLen == 0) {
               // First character of a new token; offset was already advanced.
               tokenStart = offset - 1;
            }
            buffer[tokenLen++] = Character.toLowerCase(ch);
            if (tokenLen == MAX_WORD_LEN) {
               // Token buffer is full: emit what we have.
               break;
            }
         }
         else if (tokenLen > 0) {
            // A separator after at least one token character ends the token.
            break;
         }
         // Separator with nothing buffered yet: keep scanning.
      }

      final String text = new String(buffer, 0, tokenLen);
      return new Token(text, tokenStart, tokenStart + tokenLen);
   }

}



_______________________________________________
Lucene-users mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/lucene-users

Re: [Lucene-users] Indexing part number

Reply via email to