I threw this code together a while ago and it seems to work for me.
The performance could probably be improved, but
if anyone wants, they're free to check it in. It goes under
src/java/org/apache/nutch/util/AccentReplacer.java.

Howie



/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.nutch.util;

import java.util.HashMap;

/**
 * Replaces accented Latin-1 letters with unaccented ASCII letters.
 *
 * <p>Two input forms are recognised by {@link #replaceAccents(String)}:
 * <ul>
 *   <li>raw characters in the range 192-255 (for example e-acute becomes
 *       {@code e});</li>
 *   <li>HTML character entities, numeric or named (for example
 *       {@code &#233;} and {@code &eacute;} both become {@code e}).</li>
 * </ul>
 * The AE ligatures are only replaced in their entity form ({@code Ae} /
 * {@code ae}); everything that is not in the tables is left untouched.
 */
public class AccentReplacer {

    /** Longest entity that is looked up, counting the leading '&' and trailing ';'. */
    private static final int MAX_ENTITY_LENGTH = 8;

    /*
     * Entity-name suffixes in Latin-1 code order. The n-th suffix of a group
     * belongs to the code point "first code of the group + n".
     */
    private static final String[] MARKS_A = {"grave", "acute", "circ", "tilde", "uml", "ring"};
    private static final String[] MARKS_O = {"grave", "acute", "circ", "tilde", "uml"};
    private static final String[] MARKS_EIU = {"grave", "acute", "circ", "uml"};
    private static final String[] MARK_CEDIL = {"cedil"};
    private static final String[] MARK_TILDE = {"tilde"};
    private static final String[] MARK_ACUTE = {"acute"};
    private static final String[] MARK_UML = {"uml"};

    /** Replacement for each char below 256; identity for unaccented chars. */
    private final char[] translateChars = new char[256];

    /** Maps a complete entity such as {@code &eacute;} to its replacement String. */
    private final HashMap translateMap = new HashMap();

    /** Creates a replacer with fully populated translation tables. */
    public AccentReplacer() {
        initializeAccentReplacement();
    }

    /**
     * (Re)builds the character and entity translation tables. Called by the
     * constructor; calling it again simply rewrites the same entries.
     */
    public void initializeAccentReplacement() {
        // Start from the identity mapping for every slot, including 255.
        for (int i = 0; i < translateChars.length; i++) {
            translateChars[i] = (char) i;
        }

        addLetter('A', 192, MARKS_A);
        addLetter('C', 199, MARK_CEDIL);
        addLetter('E', 200, MARKS_EIU);
        addLetter('I', 204, MARKS_EIU);
        addLetter('N', 209, MARK_TILDE);
        addLetter('O', 210, MARKS_O);
        addLetter('U', 217, MARKS_EIU);
        addLetter('Y', 221, MARK_ACUTE);

        addLetter('a', 224, MARKS_A);
        addLetter('c', 231, MARK_CEDIL);
        addLetter('e', 232, MARKS_EIU);
        addLetter('i', 236, MARKS_EIU);
        addLetter('n', 241, MARK_TILDE);
        addLetter('o', 242, MARKS_O);
        addLetter('u', 249, MARKS_EIU);
        addLetter('y', 253, MARK_ACUTE);
        addLetter('y', 255, MARK_UML);

        // Ligatures expand to two letters, so they cannot live in the
        // one-char-to-one-char array and are handled as entities only.
        addEntity(198, "AElig", "Ae");
        addEntity(230, "aelig", "ae");
    }

    /**
     * Registers one run of consecutive accented variants of a base letter:
     * the raw character, its numeric entity and its named entity.
     *
     * @param plain     unaccented replacement letter
     * @param firstCode Latin-1 code of the first variant in the run
     * @param marks     entity-name suffixes of the variants, in code order
     */
    private void addLetter(char plain, int firstCode, String[] marks) {
        for (int i = 0; i < marks.length; i++) {
            int code = firstCode + i;
            translateChars[code] = plain;
            addEntity(code, plain + marks[i], String.valueOf(plain));
        }
    }

    /**
     * Registers the numeric and the named entity for one code point.
     *
     * @param code        Latin-1 code, used to build the numeric entity
     * @param name        entity name without the surrounding '&' and ';'
     * @param replacement text substituted for either entity form
     */
    private void addEntity(int code, String name, String replacement) {
        translateMap.put("&#" + code + ";", replacement);
        translateMap.put("&" + name + ";", replacement);
    }

    /**
     * Returns a copy of {@code s} in which every known accented character
     * and every known accent entity is replaced by its unaccented form.
     *
     * @param s text to process; may be {@code null}
     * @return the processed text, or {@code null} if {@code s} is {@code null}
     */
    public String replaceAccents(String s) {
        if (s == null) {
            return null;
        }
        StringBuffer sb = new StringBuffer(s);

        int pos = 0;
        while (pos < sb.length()) {
            char c = sb.charAt(pos);
            if (c == '&') {
                int end = findChar(sb, ';', pos, MAX_ENTITY_LENGTH);
                if (end >= 0) {
                    String replacement = (String) translateMap.get(sb.substring(pos, end));
                    if (replacement != null) {
                        sb.replace(pos, end, replacement);
                        // The replacement is plain ASCII; step over it.
                        pos += replacement.length();
                        continue;
                    }
                }
            } else if (c < translateChars.length) {
                sb.setCharAt(pos, translateChars[c]);
            }
            pos++;
        }
        return sb.toString();
    }

    /**
     * Searches a bounded window of {@code sb} for a character.
     *
     * @param sb       buffer to search
     * @param ch       character to look for
     * @param start    index of the first position examined
     * @param maxChars maximum number of positions examined
     * @return the index just past the first occurrence of {@code ch} (usable
     *         directly as an exclusive end index), or -1 if it does not occur
     *         within the window
     */
    public int findChar(StringBuffer sb, char ch, int start, int maxChars) {
        int end = Math.min(start + maxChars, sb.length());
        for (int i = start; i < end; i++) {
            if (sb.charAt(i) == ch) {
                return i + 1;
            }
        }
        return -1;
    }

}


Reply via email to