I threw this code together a while ago and it seems to work for me. The performance could probably be improved, but if anyone wants, they're free to check it in. It goes under src/java/org/apache/nutch/util/AccentReplacer.java.
Howie /** * Copyright 2005 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.util; import java.util.HashMap; /** * A collection of String processing utility methods. */ public class AccentReplacer { private char[] translateChars; private HashMap translateMap; public AccentReplacer() { translateChars = new char[256]; translateMap = new HashMap(); initializeAccentReplacement(); } public void initializeAccentReplacement() { // Make an array of 256 with replace chars. for (char c=0; c<255; c++) { translateChars[c] = c; } translateChars['\300'] = 'A'; translateChars['\301'] = 'A'; translateChars['\302'] = 'A'; translateChars['\303'] = 'A'; translateChars['\304'] = 'A'; translateChars['\305'] = 'A'; translateChars['\307'] = 'C'; translateChars['\310'] = 'E'; translateChars['\311'] = 'E'; translateChars['\312'] = 'E'; translateChars['\313'] = 'E'; translateChars['\314'] = 'I'; translateChars['\315'] = 'I'; translateChars['\316'] = 'I'; translateChars['\317'] = 'I'; translateChars['\321'] = 'N'; translateChars['\322'] = 'O'; translateChars['\323'] = 'O'; translateChars['\324'] = 'O'; translateChars['\325'] = 'O'; translateChars['\326'] = 'O'; translateChars['\331'] = 'U'; translateChars['\332'] = 'U'; translateChars['\333'] = 'U'; translateChars['\334'] = 'U'; translateChars['\335'] = 'Y'; translateChars['\340'] = 'a'; translateChars['\341'] = 'a'; translateChars['\342'] = 'a'; translateChars['\343'] = 'a'; translateChars['\344'] = 'a'; translateChars['\345'] = 'a'; translateChars['\347'] = 'c'; translateChars['\350'] = 'e'; translateChars['\351'] = 'e'; translateChars['\352'] = 'e'; translateChars['\353'] = 'e'; translateChars['\354'] = 'i'; translateChars['\355'] = 'i'; translateChars['\356'] = 'i'; translateChars['\357'] = 'i'; translateChars['\361'] = 'n'; translateChars['\362'] = 'o'; translateChars['\363'] = 'o'; translateChars['\364'] = 'o'; translateChars['\365'] = 'o'; translateChars['\366'] = 'o'; translateChars['\371'] = 'u'; translateChars['\372'] = 'u'; translateChars['\373'] = 'u'; translateChars['\374'] = 'u'; translateChars['\375'] = 'y'; translateChars['\377'] = 'y'; translateMap.put("À", "A"); translateMap.put("Á", "A"); translateMap.put("Â", "A"); translateMap.put("Ã", "A"); translateMap.put("Ä", "A"); translateMap.put("Å", "A"); translateMap.put("À", "A"); translateMap.put("Á", "A"); translateMap.put("Â", "A"); translateMap.put("Ã", "A"); translateMap.put("Ä", "A"); translateMap.put("Å", "A"); translateMap.put("Æ", "Ae"); translateMap.put("Æ", "Ae"); translateMap.put("Ç", "C"); translateMap.put("Ç", "C"); translateMap.put("È", "E"); translateMap.put("É", "E"); translateMap.put("Ê", "E"); translateMap.put("Ë", "E"); translateMap.put("È", "E"); translateMap.put("É", "E"); translateMap.put("Ê", "E"); translateMap.put("Ë", "E"); translateMap.put("Ì", "I"); translateMap.put("Í", "I"); translateMap.put("Î", "I"); translateMap.put("Ï", "I"); translateMap.put("Ì", "I"); translateMap.put("Í", "I"); translateMap.put("Î", "I"); translateMap.put("Ï", "I"); translateMap.put("Ñ", "N"); translateMap.put("Ñ", "N"); translateMap.put("Ò", "O"); translateMap.put("Ó", "O"); translateMap.put("Ô", "O"); translateMap.put("Õ", "O"); translateMap.put("Ö", "O"); translateMap.put("Ò", "O"); translateMap.put("Ó", "O"); translateMap.put("Ô", "O"); translateMap.put("Õ", "O"); translateMap.put("Ö", "O"); translateMap.put("Ù", "U"); translateMap.put("Ú", "U"); translateMap.put("Û", "U"); translateMap.put("Ü", "U"); translateMap.put("Ù", "U"); translateMap.put("Ú", "U"); translateMap.put("Û", "U"); translateMap.put("Ü", "U"); translateMap.put("Ý", "Y"); translateMap.put("Ý", "Y"); translateMap.put("à", "a"); translateMap.put("á", "a"); translateMap.put("â", "a"); translateMap.put("ã", "a"); translateMap.put("ä", "a"); translateMap.put("å", "a"); translateMap.put("à", "A"); translateMap.put("á", "A"); translateMap.put("â", "A"); translateMap.put("ã", "A"); translateMap.put("ä", "A"); translateMap.put("å", "A"); translateMap.put("æ", "ae"); translateMap.put("ç", "c"); translateMap.put("è", "e"); translateMap.put("é", "e"); translateMap.put("ê", "e"); translateMap.put("ë", "e"); translateMap.put("è", "e"); translateMap.put("é", "e"); translateMap.put("ê", "e"); translateMap.put("ë", "e"); translateMap.put("ì", "i"); translateMap.put("í", "i"); translateMap.put("î", "i"); translateMap.put("ï", "i"); translateMap.put("ì", "i"); translateMap.put("í", "i"); translateMap.put("î", "i"); translateMap.put("ï", "i"); translateMap.put("ñ", "n"); translateMap.put("ñ", "n"); translateMap.put("ò", "o"); translateMap.put("ó", "o"); translateMap.put("ô", "o"); translateMap.put("õ", "o"); translateMap.put("ö", "o"); translateMap.put("ò", "o"); translateMap.put("ó", "o"); translateMap.put("ô", "o"); translateMap.put("õ", "o"); translateMap.put("ö", "o"); translateMap.put("ù", "u"); translateMap.put("ú", "u"); translateMap.put("û", "u"); translateMap.put("ü", "u"); translateMap.put("ù", "u"); translateMap.put("ú", "u"); translateMap.put("û", "u"); translateMap.put("ü", "u"); translateMap.put("ý", "y"); translateMap.put("ÿ", "y"); translateMap.put("ý", "y"); translateMap.put("ÿ", "y"); } public String replaceAccents(String s) { StringBuffer sb = new StringBuffer(s); int pos = 0; int end = -1; while (pos < sb.length()) { if (sb.charAt(pos) == '&') { end = findChar(sb, ';', pos, 8); if (end >= 0) { String temp = sb.substring(pos, end); String replace = (String)translateMap.get(temp); if (replace != null) sb.replace(pos, end, replace); } } else { if (sb.charAt(pos) < translateChars.length) sb.setCharAt(pos, translateChars[sb.charAt(pos)]); } pos++; } return sb.toString(); } public int findChar(StringBuffer sb, char ch, int start, int maxChars) { int end = start+maxChars; if (start+maxChars > sb.length()) { end = sb.length(); } for (int i=start; i<end; i++) { if (sb.charAt(i) == ch) { return i+1; } } return -1; } }
