Could anybody let me know what should be changed in the "IndexFiles" > demo to let me index and query "pure" digit part numbers? Currently only > alphabetic queries seem to work; digits and special characters (-, _, /, ...) > are ignored.
If I understand your question correctly, you want to index number (digit) sequences, not just letter characters. You'll need a modified Analyzer and Tokenizer for this purpose. See the examples below. These are simply slightly modified adaptations of one of the Analyzers found in the Lucene distribution.

Rob

package org.firstcall.dod.index;

import com.lucene.analysis.*;

import java.io.Reader;
import java.util.Hashtable;

/**
 * Analyzer that keeps both letters and digits as token characters, so
 * purely numeric part numbers are indexed instead of being dropped.
 * Pipeline: AlphaNumericTokenizer -> StopFilter -> PorterStemFilter.
 * (Lower-casing happens inside the tokenizer itself.)
 */
public class AlphaNumericAnalyzer extends Analyzer {

    /** Stop-word lookup table shared by all tokenStream() calls. */
    private Hashtable stopTable;

    /**
     * An array containing some common English words that are not usually
     * useful for searching.
     */
    public final static String[] ENGLISH_STOP_WORDS = {
        "a", "and", "are", "as", "at", "be", "but", "by",
        "for", "if", "in", "into", "is", "it",
        "no", "not", "of", "on", "or", "s", "such",
        "t", "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with"
    };

    /**
     * Builds an analyzer which removes words in ENGLISH_STOP_WORDS.
     */
    public AlphaNumericAnalyzer() {
        stopTable = StopFilter.makeStopTable(ENGLISH_STOP_WORDS);
    }

    /**
     * Builds an analyzer which removes words in the provided array.
     *
     * @param stopWords words to exclude from the token stream
     */
    public AlphaNumericAnalyzer(String[] stopWords) {
        stopTable = StopFilter.makeStopTable(stopWords);
    }

    /**
     * Filters AlphaNumericTokenizer with StopFilter and PorterStemFilter.
     *
     * @param reader source of the text to tokenize
     * @return the filtered token stream
     */
    public final TokenStream tokenStream(Reader reader) {
        return new PorterStemFilter(
            new StopFilter(new AlphaNumericTokenizer(reader), stopTable));
    }
}

package org.firstcall.dod.index;

import com.lucene.analysis.*;

import java.io.*;

/**
 * Tokenizer that treats letters AND digits as token characters (the stock
 * LetterTokenizer keeps only letters), lower-casing as it buffers. Anything
 * else (-, _, /, whitespace, ...) is a token separator.
 */
public class AlphaNumericTokenizer extends Tokenizer {

    private int offset = 0, bufferIndex = 0, dataLen = 0;
    private final char[] buffer = new char[MAX_WORD_LEN];
    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

    private final static int MAX_WORD_LEN = 255;
    private final static int IO_BUFFER_SIZE = 1024;

    public AlphaNumericTokenizer(Reader in) {
        input = in;
    }

    /**
     * Returns the next alphanumeric token, or null at end of input.
     *
     * @return the next Token, lower-cased, with start/end offsets set
     * @throws IOException if the underlying reader fails
     */
    public final Token next() throws IOException {
        int length = 0;
        int start = offset;
        while (true) {
            final char c;

            offset++;
            // Refill the I/O buffer when it is exhausted.
            if (bufferIndex >= dataLen) {
                dataLen = input.read(ioBuffer);
                bufferIndex = 0;
            }
            if (dataLen == -1) {
                // End of input: emit any pending token, else signal "done".
                if (length > 0) {
                    break;
                } else {
                    return null;
                }
            } else {
                c = (char) ioBuffer[bufferIndex++];
            }

            // Keep letters OR digits -- this is the change that makes
            // purely numeric part numbers indexable.
            if (Character.isLetterOrDigit(c)) {
                if (length == 0) {               // start of token
                    start = offset - 1;
                }
                buffer[length++] = Character.toLowerCase(c); // buffer it
                if (length == MAX_WORD_LEN) {    // buffer overflow!
                    break;
                }
            } else if (length > 0) {             // at non-alphanumeric w/ chars
                break;                           // return 'em
            }
        }
        return new Token(new String(buffer, 0, length), start, start + length);
    }
}

_______________________________________________ Lucene-users mailing list [EMAIL PROTECTED] https://lists.sourceforge.net/lists/listinfo/lucene-users