Accent filter

2004-09-28 Thread Bo Gundersen
Hi,
I am certainly not the first, and probably not the last, to have had
problems with accented characters in my index. But unfortunately I
couldn't find anything in either Lucene or the lucene-sandbox to solve
the problem.
So I wrote an accent filter and thought that I might as well share it
with you guys :)

--
Bo Gundersen
DBA/Software Developer
M.Sc.CS.
www.atira.dk
package dk.atira.search;

import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

/**
 * This filter converts accent characters to their non-accented versions.
 * It also strips unwanted characters from the tokens, meaning anything
 * but A-Z,a-z,0-9,ÆÅØæøå and -
 * The valid characters can be changed by adding them to the string validCharsStr.
 * 
 * Created by Bo Gundersen at Sep 28, 2004 12:39:04 PM 
 *
 * @author Bo Gundersen ([EMAIL PROTECTED])
 */
public class AccentFilter
extends TokenFilter
{
/**
 * Set of the characters listed in validCharsStr, stored as boxed
 * Character objects. Per the class Javadoc, anything outside this set
 * is stripped from tokens.
 */
private static final Collection validChars = new HashSet();

/**
 * Source string for validChars: a-z, A-Z, the Danish letters
 * (ae ligature, o with stroke, a with ring; lower and upper case),
 * the digits 0-9 and the hyphen. Append characters here to make them valid.
 */
private static final String validCharsStr =
    "abcdefghijklmnopqrstuvwxyz\u00E6\u00F8\u00E5" +
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ\u00C6\u00D8\u00C5" +
    "0123456789" +
    "-";

static {
    // Copy every character of validCharsStr into the set once, when the class is initialized.
    for (int i = 0; i < validCharsStr.length(); i++) {
        validChars.add(new Character(validCharsStr.charAt(i)));
    }
}

private static final Map accents = new HashMap();
static {
accents.put(new Character('\u00C0'), A);
accents.put(new Character('\u00C1'), A);
accents.put(new Character('\u00C2'), A);
accents.put(new Character('\u00C3'), A);
accents.put(new Character('\u00E0'), a);
accents.put(new Character('\u00E1'), a);
accents.put(new Character('\u00E2'), a);
accents.put(new Character('\u00E3'), a);
accents.put(new Character('\u00E4'), a);

accents.put(new Character('\u00C8'), E);
accents.put(new Character('\u00C9'), E);
accents.put(new Character('\u00CA'), E);
accents.put(new Character('\u00CB'), E);
accents.put(new Character('\u00E8'), e);
accents.put(new Character('\u00E9'), e);
accents.put(new Character('\u00EA'), e);
accents.put(new Character('\u00EB'), e);

accents.put(new Character('\u00CC'), I);
accents.put(new Character('\u00CD'), I);
accents.put(new Character('\u00CE'), I);
accents.put(new Character('\u00CF'), I);
accents.put(new Character('\u00EC'), i);
accents.put(new Character('\u00ED'), i);
accents.put(new Character('\u00EE'), i);
accents.put(new Character('\u00EF'), i);

accents.put(new Character('\u00D1'), N);
accents.put(new Character('\u00F1'), n);

accents.put(new Character('\u00D2'), O);
accents.put(new Character('\u00D3'), O);
accents.put(new Character('\u00D4'), O);
accents.put(new Character('\u00D5'), O);
accents.put(new Character('\u00D6'), O);
accents.put(new Character('\u00F2'), o);
accents.put(new Character('\u00F3'), o);
accents.put(new Character('\u00F4'), o);
accents.put(new Character('\u00F5'), o);
accents.put(new Character('\u00F6'), o);

accents.put(new Character('\u00D9'), U);
accents.put(new Character('\u00DA'), U);
accents.put(new Character('\u00DB'), U);
accents.put(new Character('\u00DC'), U);
accents.put(new Character('\u00F9'), u);
accents.put(new Character('\u00FA'), u);
accents.put(new Character('\u00FB'), u);
accents.put(new Character('\u00FC'), u);

accents.put(new Character('\u00DD'), Y);
accents.put(new Character('\u00FD'), y);
accents.put(new Character('\u00FF'), y

Re: Accent filter

2004-09-28 Thread John Moylan
Loads of very well thought out ISO-8859 + French/Irish Filters available 
here too: (I think they are all GPL'd)

http://www.nongnu.org/sdx/
Best Regards,
John
Bo Gundersen wrote:
Hi,
I am certainly not the first, and probably not the last, to have had
problems with accented characters in my index. But unfortunately I
couldn't find anything in either Lucene or the lucene-sandbox to solve
the problem.
So I wrote an accent filter and thought that I might as well share it
with you guys :)


package dk.atira.search;
import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
/**
 * This filter converts accent characters to their non-accented versions.
 * It also strips unwanted characters from the tokens, meaning anything
 * but A-Z,a-z,0-9,ÆÅØæøå and -
 * The valid characters can be changed by adding them to the string validCharsStr.
 * 
 * Created by Bo Gundersen at Sep 28, 2004 12:39:04 PM 
 *
 * @author Bo Gundersen ([EMAIL PROTECTED])
 */
public class AccentFilter
		extends TokenFilter
	{
		/**
		 * Set of the characters listed in validCharsStr, stored as boxed
		 * Character objects. Per the class Javadoc, anything outside this set
		 * is stripped from tokens.
		 */
		private static final Collection validChars = new HashSet();

		/**
		 * Source string for validChars: a-z, A-Z, the Danish letters
		 * (ae ligature, o with stroke, a with ring; lower and upper case),
		 * the digits 0-9 and the hyphen. Append characters here to make them valid.
		 */
		private static final String validCharsStr =
			"abcdefghijklmnopqrstuvwxyz\u00E6\u00F8\u00E5" +
			"ABCDEFGHIJKLMNOPQRSTUVWXYZ\u00C6\u00D8\u00C5" +
			"0123456789" +
			"-";

		static {
			// Copy every character of validCharsStr into the set once, when the class is initialized.
			for (int i = 0; i < validCharsStr.length(); i++) {
				validChars.add(new Character(validCharsStr.charAt(i)));
			}
		}
		
		private static final Map accents = new HashMap();
		static {
			accents.put(new Character('\u00C0'), A);
			accents.put(new Character('\u00C1'), A);
			accents.put(new Character('\u00C2'), A);
			accents.put(new Character('\u00C3'), A);
			accents.put(new Character('\u00E0'), a);
			accents.put(new Character('\u00E1'), a);
			accents.put(new Character('\u00E2'), a);
			accents.put(new Character('\u00E3'), a);
			accents.put(new Character('\u00E4'), a);
			
			accents.put(new Character('\u00C8'), E);
			accents.put(new Character('\u00C9'), E);
			accents.put(new Character('\u00CA'), E);
			accents.put(new Character('\u00CB'), E);
			accents.put(new Character('\u00E8'), e);
			accents.put(new Character('\u00E9'), e);
			accents.put(new Character('\u00EA'), e);
			accents.put(new Character('\u00EB'), e);

accents.put(new Character('\u00CC'), I);
accents.put(new Character('\u00CD'), I);
accents.put(new Character('\u00CE'), I);
accents.put(new Character('\u00CF'), I);
accents.put(new Character('\u00EC'), i);
accents.put(new Character('\u00ED'), i);
accents.put(new Character('\u00EE'), i);
accents.put(new Character('\u00EF'), i);
accents.put(new Character('\u00D1'), N);
accents.put(new Character('\u00F1'), n);

accents.put(new Character('\u00D2'), O);
accents.put(new Character('\u00D3'), O);
accents.put(new Character('\u00D4'), O);
accents.put(new Character('\u00D5'), O);
accents.put(new Character('\u00D6'), O);
accents.put(new Character('\u00F2'), o);
accents.put(new Character('\u00F3'), o);
accents.put(new Character('\u00F4'), o);
accents.put(new Character('\u00F5'), o);
accents.put(new Character('\u00F6'), o);

accents.put(new Character('\u00D9'), U);
accents.put(new Character('\u00DA'), U);
accents.put(new Character('\u00DB'), U);
accents.put(new Character('\u00DC'), U);
accents.put(new Character('\u00F9'), u);
accents.put(new Character('\u00FA'), u);
accents.put(new Character('\u00FB'), u);
accents.put(new Character('\u00FC'), u);

accents.put(new Character('\u00DD'), Y);
accents.put(new Character('\u00FD'), y);
accents.put(new Character('\u00FF'), y);

accents.put(new Character('\u00C6'), AE);
accents.put(new Character('\u00E6'), ae);
accents.put(new Character('\u00D8'), OE);
accents.put(new Character('\u00F8'), oe);
accents.put(new Character('\u00C5'), AA);
accents.put(new Character('\u00E5'), aa