Hi,
I am certainly not the first, and probably not the last, who has had
problems with accented characters in my index. But unfortunately I
couldn't find anything in either Lucene or the lucene-sandbox to solve
the problem.
So I wrote an accent filter and thought that I might as well share it
with you guys :)
--
Bo Gundersen
DBA/Software Developer
M.Sc.CS.
www.atira.dk
package dk.atira.search;
import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
/**
* This filter converts accented characters to their non-accented versions.
* It also strips unwanted characters from the tokens, meaning anything
* but A-Z, a-z, 0-9, ÆÅØæøå and -
* The valid characters can be changed by adding them to the string validCharsStr.
*
* Created by Bo Gundersen at Sep 28, 2004 12:39:04 PM
*
* @author Bo Gundersen ([EMAIL PROTECTED])
*/
public class AccentFilter
extends TokenFilter
{
/**
 * Set of characters this class treats as valid. It is filled exactly once,
 * by the static initializer below, with one boxed {@code Character} per
 * character of {@link #validCharsStr}.
 */
private static final Collection validChars = new HashSet();

/**
 * Literal list of every valid character: the lowercase letters a-z plus the
 * Danish ae / o-slash / a-ring, the same letters in uppercase, the digits
 * 0-9, and the hyphen. To accept another character, append it here.
 */
private static final String validCharsStr =
    "abcdefghijklmnopqrstuvwxyz\u00E6\u00F8\u00E5" +
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ\u00C6\u00D8\u00C5" +
    "0123456789" +
    "-";

static {
    // The collection is untyped (pre-generics), so each char is boxed explicitly.
    for (int i = 0; i < validCharsStr.length(); i++) {
        validChars.add(new Character(validCharsStr.charAt(i)));
    }
}
// Lookup table keyed by accented Character; the static initializer that
// follows registers one entry per accented code point.
// NOTE(review): the replacement values (A, a, E, ...) appear to have lost
// their literal delimiters in this copy - confirm whether they are meant to
// be String or char literals before compiling.
private static final Map accents = new HashMap();
static {
accents.put(new Character('\u00C0'), A);
accents.put(new Character('\u00C1'), A);
accents.put(new Character('\u00C2'), A);
accents.put(new Character('\u00C3'), A);
accents.put(new Character('\u00E0'), a);
accents.put(new Character('\u00E1'), a);
accents.put(new Character('\u00E2'), a);
accents.put(new Character('\u00E3'), a);
accents.put(new Character('\u00E4'), a);
accents.put(new Character('\u00C8'), E);
accents.put(new Character('\u00C9'), E);
accents.put(new Character('\u00CA'), E);
accents.put(new Character('\u00CB'), E);
accents.put(new Character('\u00E8'), e);
accents.put(new Character('\u00E9'), e);
accents.put(new Character('\u00EA'), e);
accents.put(new Character('\u00EB'), e);
accents.put(new Character('\u00CC'), I);
accents.put(new Character('\u00CD'), I);
accents.put(new Character('\u00CE'), I);
accents.put(new Character('\u00CF'), I);
accents.put(new Character('\u00EC'), i);
accents.put(new Character('\u00ED'), i);
accents.put(new Character('\u00EE'), i);
accents.put(new Character('\u00EF'), i);
accents.put(new Character('\u00D1'), N);
accents.put(new Character('\u00F1'), n);
accents.put(new Character('\u00D2'), O);
accents.put(new Character('\u00D3'), O);
accents.put(new Character('\u00D4'), O);
accents.put(new Character('\u00D5'), O);
accents.put(new Character('\u00D6'), O);
accents.put(new Character('\u00F2'), o);
accents.put(new Character('\u00F3'), o);
accents.put(new Character('\u00F4'), o);
accents.put(new Character('\u00F5'), o);
accents.put(new Character('\u00F6'), o);
accents.put(new Character('\u00D9'), U);
accents.put(new Character('\u00DA'), U);
accents.put(new Character('\u00DB'), U);
accents.put(new Character('\u00DC'), U);
accents.put(new Character('\u00F9'), u);
accents.put(new Character('\u00FA'), u);
accents.put(new Character('\u00FB'), u);
accents.put(new Character('\u00FC'), u);
accents.put(new Character('\u00DD'), Y);
accents.put(new Character('\u00FD'), y);
accents.put(new Character('\u00FF'), y