> Could anybody let me know what should be changed in the "IndexFiles"
> demo to let me index and query "pure" digit part numbers? Currently only
> alphabetic queries seem to work; digits and special characters (-, _, /, ...)
> are ignored.
If I understand your question correctly, you want to index digit
sequences as well as letters. You'll need a modified Analyzer and
Tokenizer for this purpose. See the examples below. They are slightly
modified adaptations of one of the Analyzers (and its Tokenizer) found in
the Lucene distribution. Note that the tokenizer below keeps letters and
digits only; special characters such as -, _ and / still act as token
separators.
Rob
package org.firstcall.dod.index;
import org.firstcall.dod.index;
import com.lucene.analysis.*;
import java.io.Reader;
import java.util.Hashtable;
/**
 * Analyzer whose token stream is an {@link AlphaNumericTokenizer} wrapped in a
 * {@link StopFilter} and then a {@link PorterStemFilter}.
 *
 * Because the tokenizer keeps digits as well as letters, purely numeric
 * terms (e.g. part numbers) reach the filters instead of being discarded.
 */
public class AlphaNumericAnalyzer extends Analyzer {

    /**
     * Default stop-word list: some common English words that are not
     * usually useful for searching.
     */
    public final static String[] ENGLISH_STOP_WORDS = {
        "a", "and", "are", "as", "at", "be", "but", "by",
        "for", "if", "in", "into", "is", "it",
        "no", "not", "of", "on", "or", "s", "such",
        "t", "that", "the", "their", "then", "there", "these",
        "they", "this", "to", "was", "will", "with"
    };

    /**
     * Stop table produced by StopFilter.makeStopTable from the stop words
     * given at construction; handed to every StopFilter this analyzer creates.
     */
    private final Hashtable stopTable;

    /**
     * Builds an analyzer that uses ENGLISH_STOP_WORDS as its stop-word list.
     */
    public AlphaNumericAnalyzer() {
        this(ENGLISH_STOP_WORDS);
    }

    /**
     * Builds an analyzer that uses the provided array as its stop-word list.
     *
     * @param stopWords the words passed to StopFilter.makeStopTable
     */
    public AlphaNumericAnalyzer(String[] stopWords) {
        this.stopTable = StopFilter.makeStopTable(stopWords);
    }

    /**
     * Creates the analysis chain for one reader: AlphaNumericTokenizer,
     * then StopFilter (with this analyzer's stop table), then
     * PorterStemFilter.
     *
     * @param reader source of the text to tokenize
     * @return the PorterStemFilter at the outer end of the chain
     */
    public final TokenStream tokenStream(Reader reader) {
        AlphaNumericTokenizer tokenizer = new AlphaNumericTokenizer(reader);
        StopFilter withoutStopWords = new StopFilter(tokenizer, stopTable);
        return new PorterStemFilter(withoutStopWords);
    }
}
package org.firstcall.dod.index;
import org.firstcall.dod.index;
import com.lucene.analysis.*;
import java.io.*;
/**
 * Tokenizer that emits maximal runs of letters and digits, lower-casing each
 * character with Character.toLowerCase. Every other character acts as a
 * separator and is dropped. A run that reaches MAX_WORD_LEN characters is
 * cut there; the remaining characters start the next token.
 */
public class AlphaNumericTokenizer extends Tokenizer {

    /** Maximum number of characters in a single token. */
    private final static int MAX_WORD_LEN = 255;
    /** Number of characters requested from the reader per read call. */
    private final static int IO_BUFFER_SIZE = 1024;

    /** Characters of the token currently being assembled. */
    private final char[] buffer = new char[MAX_WORD_LEN];
    /** Characters most recently read from the input. */
    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

    /**
     * offset: running count of loop iterations in next(), used as the
     * character position for token offsets; bufferIndex: next unread slot
     * in ioBuffer; dataLen: result of the last read (-1 at end of input).
     */
    private int offset = 0, bufferIndex = 0, dataLen = 0;

    /**
     * Creates a tokenizer over the given reader.
     *
     * @param in reader supplying the text; stored in the input field
     *           (presumably inherited from Tokenizer - it is not declared here)
     */
    public AlphaNumericTokenizer(Reader in) {
        input = in;
    }

    /**
     * Returns the next alphanumeric token, or null when the input is
     * exhausted and no token characters are pending.
     *
     * @return a Token built from the lower-cased text, the offset of its
     *         first character and that offset plus the token length
     * @throws IOException if reading from the input fails
     */
    public final Token next() throws IOException {
        int tokenLength = 0;
        int tokenStart = offset;
        boolean tokenComplete = false;

        while (!tokenComplete) {
            offset++;

            // Refill the I/O buffer once everything in it has been consumed.
            if (bufferIndex >= dataLen) {
                dataLen = input.read(ioBuffer);
                bufferIndex = 0;
            }

            if (dataLen == -1) {
                // End of input: nothing pending means there are no more tokens.
                if (tokenLength == 0) {
                    return null;
                }
                tokenComplete = true;
            } else {
                char ch = ioBuffer[bufferIndex++];
                if (Character.isLetterOrDigit(ch)) {
                    if (tokenLength == 0) {
                        // First character of a new token.
                        tokenStart = offset - 1;
                    }
                    buffer[tokenLength++] = Character.toLowerCase(ch);
                    // A full token buffer forces the token to end here.
                    tokenComplete = (tokenLength == MAX_WORD_LEN);
                } else {
                    // A separator ends a token in progress; leading ones are skipped.
                    tokenComplete = (tokenLength > 0);
                }
            }
        }

        String text = new String(buffer, 0, tokenLength);
        return new Token(text, tokenStart, tokenStart + tokenLength);
    }
}
_______________________________________________
Lucene-users mailing list
[EMAIL PROTECTED]
https://lists.sourceforge.net/lists/listinfo/lucene-users