// Extracted from the diff for src/Lucene.Net.Analysis.Phonetic/Language/DoubleMetaphone.cs (lucenenet, ref 1ee3a9cc).
// commons-codec version compatibility level: 1.9
using System;
using System.Globalization;
using System.Text;

namespace Lucene.Net.Analysis.Phonetic.Language
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Encodes a string into a double metaphone value. This implementation is based on the algorithm by
    /// <c>Lawrence Philips</c>.
    /// <para/>
    /// This class is conditionally thread-safe. The instance field <see cref="maxCodeLen"/> is mutable
    /// (through <see cref="MaxCodeLen"/>) but is not volatile, and accesses are not synchronized. If an
    /// instance of the class is shared between threads, the caller needs to ensure that suitable
    /// synchronization is used to ensure safe publication of the value between threads, and must not set
    /// <see cref="MaxCodeLen"/> after initial setup.
    /// <para/>
    /// See <a href="http://drdobbs.com/184401251?pgno=2">Original Article</a>
    /// <para/>
    /// See <a href="http://en.wikipedia.org/wiki/Metaphone">http://en.wikipedia.org/wiki/Metaphone</a>
    /// </summary>
    public class DoubleMetaphone : IStringEncoder
    {
        /// <summary>
        /// "Vowels" to test for
        /// </summary>
        private static readonly string VOWELS = "AEIOUY";

        /// <summary>
        /// Culture used by <see cref="CleanInput(string)"/> to upper-case the input. Cached so that a new
        /// <see cref="CultureInfo"/> is not allocated for every encoded value.
        /// </summary>
        private static readonly CultureInfo LOCALE_ENGLISH = new CultureInfo("en");

        /// <summary>
        /// Prefixes when present which are not pronounced
        /// </summary>
        private static readonly string[] SILENT_START =
            { "GN", "KN", "PN", "WR", "PS" };
        private static readonly string[] L_R_N_M_B_H_F_V_W_SPACE =
            { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " };
        private static readonly string[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER =
            { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" };
        private static readonly string[] L_T_K_S_N_M_B_Z =
            { "L", "T", "K", "S", "N", "M", "B", "Z" };

        /// <summary>
        /// Maximum length of an encoding, default is 4
        /// </summary>
        private int maxCodeLen = 4;

        /// <summary>
        /// Creates an instance of this <see cref="DoubleMetaphone"/> encoder
        /// </summary>
        public DoubleMetaphone()
        {
        }

        /// <summary>
        /// Encode a value with Double Metaphone (primary encoding).
        /// </summary>
        /// <param name="value">String to encode.</param>
        /// <returns>An encoded string, or <c>null</c> if <paramref name="value"/> is <c>null</c>,
        /// empty or consists only of white space.</returns>
        public virtual string GetDoubleMetaphone(string value)
        {
            return GetDoubleMetaphone(value, false);
        }

        /// <summary>
        /// Encode a value with Double Metaphone, optionally using the alternate encoding.
        /// </summary>
        /// <param name="value">String to encode.</param>
        /// <param name="alternate">Use alternate encode.</param>
        /// <returns>An encoded string of at most <see cref="MaxCodeLen"/> characters, or <c>null</c> if
        /// <paramref name="value"/> is <c>null</c>, empty or consists only of white space.</returns>
        public virtual string GetDoubleMetaphone(string value, bool alternate)
        {
            value = CleanInput(value);
            if (value == null)
            {
                return null;
            }

            bool slavoGermanic = IsSlavoGermanic(value);
            // A silent first letter (e.g. the 'K' of "KN") is skipped entirely.
            int index = IsSilentStart(value) ? 1 : 0;

            DoubleMetaphoneResult result = new DoubleMetaphoneResult(this.MaxCodeLen);

            // Each case consumes one or more characters and returns/sets the next index to inspect.
            while (!result.IsComplete && index < value.Length)
            {
                switch (value[index])
                {
                    case 'A':
                    case 'E':
                    case 'I':
                    case 'O':
                    case 'U':
                    case 'Y':
                        index = HandleAEIOUY(result, index);
                        break;
                    case 'B':
                        result.Append('P');
                        index = CharAt(value, index + 1) == 'B' ? index + 2 : index + 1;
                        break;
                    case '\u00C7':
                        // A C with a Cedilla
                        result.Append('S');
                        index++;
                        break;
                    case 'C':
                        index = HandleC(value, result, index);
                        break;
                    case 'D':
                        index = HandleD(value, result, index);
                        break;
                    case 'F':
                        result.Append('F');
                        index = CharAt(value, index + 1) == 'F' ? index + 2 : index + 1;
                        break;
                    case 'G':
                        index = HandleG(value, result, index, slavoGermanic);
                        break;
                    case 'H':
                        index = HandleH(value, result, index);
                        break;
                    case 'J':
                        index = HandleJ(value, result, index, slavoGermanic);
                        break;
                    case 'K':
                        result.Append('K');
                        index = CharAt(value, index + 1) == 'K' ? index + 2 : index + 1;
                        break;
                    case 'L':
                        index = HandleL(value, result, index);
                        break;
                    case 'M':
                        result.Append('M');
                        index = ConditionM0(value, index) ? index + 2 : index + 1;
                        break;
                    case 'N':
                        result.Append('N');
                        index = CharAt(value, index + 1) == 'N' ? index + 2 : index + 1;
                        break;
                    case '\u00D1':
                        // N with a tilde (spanish ene)
                        result.Append('N');
                        index++;
                        break;
                    case 'P':
                        index = HandleP(value, result, index);
                        break;
                    case 'Q':
                        result.Append('K');
                        index = CharAt(value, index + 1) == 'Q' ? index + 2 : index + 1;
                        break;
                    case 'R':
                        index = HandleR(value, result, index, slavoGermanic);
                        break;
                    case 'S':
                        index = HandleS(value, result, index, slavoGermanic);
                        break;
                    case 'T':
                        index = HandleT(value, result, index);
                        break;
                    case 'V':
                        result.Append('F');
                        index = CharAt(value, index + 1) == 'V' ? index + 2 : index + 1;
                        break;
                    case 'W':
                        index = HandleW(value, result, index);
                        break;
                    case 'X':
                        index = HandleX(value, result, index);
                        break;
                    case 'Z':
                        index = HandleZ(value, result, index, slavoGermanic);
                        break;
                    default:
                        // Any other character contributes nothing to the code.
                        index++;
                        break;
                }
            }

            return alternate ? result.Alternate : result.Primary;
        }

        // LUCENENET specific - in .NET we don't need an object overload, since strings are sealed anyway.

        /// <summary>
        /// Encode the value using DoubleMetaphone.
        /// </summary>
        /// <param name="value">String to encode.</param>
        /// <returns>An encoded string.</returns>
        public virtual string Encode(string value)
        {
            return GetDoubleMetaphone(value);
        }

        /// <summary>
        /// Check if the Double Metaphone values of two <see cref="string"/> values
        /// are equal.
        /// </summary>
        /// <param name="value1">The left-hand side of the encoded <see cref="string.Equals(object)"/>.</param>
        /// <param name="value2">The right-hand side of the encoded <see cref="string.Equals(object)"/>.</param>
        /// <returns><c>true</c> if the encoded <see cref="string"/>s are equal; <c>false</c> otherwise.</returns>
        public virtual bool IsDoubleMetaphoneEqual(string value1, string value2)
        {
            return IsDoubleMetaphoneEqual(value1, value2, false);
        }

        /// <summary>
        /// Check if the Double Metaphone values of two <see cref="string"/> values
        /// are equal, optionally using the alternate value.
        /// </summary>
        /// <param name="value1">The left-hand side of the encoded <see cref="string.Equals(object)"/>.</param>
        /// <param name="value2">The right-hand side of the encoded <see cref="string.Equals(object)"/>.</param>
        /// <param name="alternate">Use the alternate value if <c>true</c>.</param>
        /// <returns><c>true</c> if the encoded <see cref="string"/>s are equal (two inputs that both
        /// encode to <c>null</c> are considered equal); <c>false</c> otherwise.</returns>
        public virtual bool IsDoubleMetaphoneEqual(string value1, string value2, bool alternate)
        {
            // GetDoubleMetaphone returns null for null/blank input, so the comparison must be
            // null-safe; calling an instance Equals on the first result would throw.
            return string.Equals(
                GetDoubleMetaphone(value1, alternate),
                GetDoubleMetaphone(value2, alternate),
                StringComparison.Ordinal);
        }

        /// <summary>
        /// Gets or Sets the maxCodeLen.
        /// </summary>
        public virtual int MaxCodeLen
        {
            get { return this.maxCodeLen; }
            set { this.maxCodeLen = value; }
        }

        //-- BEGIN HANDLERS --//

        /// <summary>
        /// Handles 'A', 'E', 'I', 'O', 'U', and 'Y' cases. A vowel is only encoded (as 'A')
        /// when it is the first character of the value.
        /// </summary>
        private int HandleAEIOUY(DoubleMetaphoneResult result, int index)
        {
            if (index == 0)
            {
                result.Append('A');
            }
            return index + 1;
        }

        /// <summary>
        /// Handles 'C' cases.
        /// </summary>
        /// <returns>The index of the next character to process.</returns>
        private int HandleC(string value, DoubleMetaphoneResult result, int index)
        {
            if (ConditionC0(value, index))
            {  // very confusing, moved out
                result.Append('K');
                index += 2;
            }
            else if (index == 0 && Contains(value, index, 6, "CAESAR"))
            {
                result.Append('S');
                index += 2;
            }
            else if (Contains(value, index, 2, "CH"))
            {
                index = HandleCH(value, result, index);
            }
            else if (Contains(value, index, 2, "CZ") &&
                     !Contains(value, index - 2, 4, "WICZ"))
            {
                //-- "Czerny" --//
                result.Append('S', 'X');
                index += 2;
            }
            else if (Contains(value, index + 1, 3, "CIA"))
            {
                //-- "focaccia" --//
                result.Append('X');
                index += 3;
            }
            else if (Contains(value, index, 2, "CC") &&
                     !(index == 1 && CharAt(value, 0) == 'M'))
            {
                //-- double "cc" but not "McClelland" --//
                return HandleCC(value, result, index);
            }
            else if (Contains(value, index, 2, "CK", "CG", "CQ"))
            {
                result.Append('K');
                index += 2;
            }
            else if (Contains(value, index, 2, "CI", "CE", "CY"))
            {
                //-- Italian vs. English --//
                if (Contains(value, index, 3, "CIO", "CIE", "CIA"))
                {
                    result.Append('S', 'X');
                }
                else
                {
                    result.Append('S');
                }
                index += 2;
            }
            else
            {
                result.Append('K');
                if (Contains(value, index + 1, 2, " C", " Q", " G"))
                {
                    //-- Mac Caffrey, Mac Gregor --//
                    index += 3;
                }
                else if (Contains(value, index + 1, 1, "C", "K", "Q") &&
                         !Contains(value, index + 1, 2, "CE", "CI"))
                {
                    index += 2;
                }
                else
                {
                    index++;
                }
            }

            return index;
        }

        /// <summary>
        /// Handles 'CC' cases.
        /// </summary>
        /// <returns>The index of the next character to process.</returns>
        private int HandleCC(string value, DoubleMetaphoneResult result, int index)
        {
            if (Contains(value, index + 2, 1, "I", "E", "H") &&
                !Contains(value, index + 2, 2, "HU"))
            {
                //-- "bellocchio" but not "bacchus" --//
                if ((index == 1 && CharAt(value, index - 1) == 'A') ||
                    Contains(value, index - 1, 5, "UCCEE", "UCCES"))
                {
                    //-- "accident", "accede", "succeed" --//
                    result.Append("KS");
                }
                else
                {
                    //-- "bacci", "bertucci", other Italian --//
                    result.Append('X');
                }
                index += 3;
            }
            else
            {    // Pierce's rule
                result.Append('K');
                index += 2;
            }

            return index;
        }

        /// <summary>
        /// Handles 'CH' cases.
        /// </summary>
        /// <returns>The index of the next character to process (always <c>index + 2</c>).</returns>
        private int HandleCH(string value, DoubleMetaphoneResult result, int index)
        {
            if (index > 0 && Contains(value, index, 4, "CHAE"))
            {   // Michael
                result.Append('K', 'X');
            }
            else if (ConditionCH0(value, index))
            {
                //-- Greek roots ("chemistry", "chorus", etc.) --//
                result.Append('K');
            }
            else if (ConditionCH1(value, index))
            {
                //-- Germanic, Greek, or otherwise 'ch' for 'kh' sound --//
                result.Append('K');
            }
            else if (index > 0)
            {
                if (Contains(value, 0, 2, "MC"))
                {
                    result.Append('K');
                }
                else
                {
                    result.Append('X', 'K');
                }
            }
            else
            {
                result.Append('X');
            }
            return index + 2;
        }

        /// <summary>
        /// Handles 'D' cases.
        /// </summary>
        /// <returns>The index of the next character to process.</returns>
        private int HandleD(string value, DoubleMetaphoneResult result, int index)
        {
            if (Contains(value, index, 2, "DG"))
            {
                if (Contains(value, index + 2, 1, "I", "E", "Y"))
                {
                    //-- "Edge" --//
                    result.Append('J');
                    index += 3;
                }
                else
                {
                    //-- "Edgar" --//
                    result.Append("TK");
                    index += 2;
                }
            }
            else if (Contains(value, index, 2, "DT", "DD"))
            {
                result.Append('T');
                index += 2;
            }
            else
            {
                result.Append('T');
                index++;
            }
            return index;
        }

        /// <summary>
        /// Handles 'G' cases.
        /// </summary>
        /// <returns>The index of the next character to process.</returns>
        private int HandleG(string value, DoubleMetaphoneResult result, int index,
                            bool slavoGermanic)
        {
            if (CharAt(value, index + 1) == 'H')
            {
                index = HandleGH(value, result, index);
            }
            else if (CharAt(value, index + 1) == 'N')
            {
                if (index == 1 && IsVowel(CharAt(value, 0)) && !slavoGermanic)
                {
                    result.Append("KN", "N");
                }
                // NOTE(review): CharAt(value, index + 1) is always 'N' in this branch, so the
                // "!= 'Y'" test below is always true. It is kept unchanged so that encodings do
                // not change; presumably it mirrors the implementation this was ported from.
                else if (!Contains(value, index + 2, 2, "EY") &&
                         CharAt(value, index + 1) != 'Y' && !slavoGermanic)
                {
                    result.Append("N", "KN");
                }
                else
                {
                    result.Append("KN");
                }
                index = index + 2;
            }
            else if (Contains(value, index + 1, 2, "LI") && !slavoGermanic)
            {
                result.Append("KL", "L");
                index += 2;
            }
            else if (index == 0 &&
                     (CharAt(value, index + 1) == 'Y' ||
                      Contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER)))
            {
                //-- -ges-, -gep-, -gel-, -gie- at beginning --//
                result.Append('K', 'J');
                index += 2;
            }
            else if ((Contains(value, index + 1, 2, "ER") ||
                      CharAt(value, index + 1) == 'Y') &&
                     !Contains(value, 0, 6, "DANGER", "RANGER", "MANGER") &&
                     !Contains(value, index - 1, 1, "E", "I") &&
                     !Contains(value, index - 1, 3, "RGY", "OGY"))
            {
                //-- -ger-, -gy- --//
                result.Append('K', 'J');
                index += 2;
            }
            else if (Contains(value, index + 1, 1, "E", "I", "Y") ||
                     Contains(value, index - 1, 4, "AGGI", "OGGI"))
            {
                //-- Italian "biaggi" --//
                if (Contains(value, 0, 4, "VAN ", "VON ") ||
                    Contains(value, 0, 3, "SCH") ||
                    Contains(value, index + 1, 2, "ET"))
                {
                    //-- obvious germanic --//
                    result.Append('K');
                }
                else if (Contains(value, index + 1, 3, "IER"))
                {
                    result.Append('J');
                }
                else
                {
                    result.Append('J', 'K');
                }
                index += 2;
            }
            else if (CharAt(value, index + 1) == 'G')
            {
                index += 2;
                result.Append('K');
            }
            else
            {
                index++;
                result.Append('K');
            }
            return index;
        }

        /// <summary>
        /// Handles 'GH' cases.
        /// </summary>
        /// <returns>The index of the next character to process (always <c>index + 2</c>).</returns>
        private int HandleGH(string value, DoubleMetaphoneResult result, int index)
        {
            if (index > 0 && !IsVowel(CharAt(value, index - 1)))
            {
                result.Append('K');
                index += 2;
            }
            else if (index == 0)
            {
                if (CharAt(value, index + 2) == 'I')
                {
                    result.Append('J');
                }
                else
                {
                    result.Append('K');
                }
                index += 2;
            }
            else if ((index > 1 && Contains(value, index - 2, 1, "B", "H", "D")) ||
                     (index > 2 && Contains(value, index - 3, 1, "B", "H", "D")) ||
                     (index > 3 && Contains(value, index - 4, 1, "B", "H")))
            {
                //-- Parker's rule (with some further refinements) - "hugh"
                index += 2;
            }
            else
            {
                if (index > 2 && CharAt(value, index - 1) == 'U' &&
                    Contains(value, index - 3, 1, "C", "G", "L", "R", "T"))
                {
                    //-- "laugh", "McLaughlin", "cough", "gough", "rough", "tough"
                    result.Append('F');
                }
                else if (index > 0 && CharAt(value, index - 1) != 'I')
                {
                    result.Append('K');
                }
                index += 2;
            }
            return index;
        }

        /// <summary>
        /// Handles 'H' cases.
        /// </summary>
        /// <returns>The index of the next character to process.</returns>
        private int HandleH(string value, DoubleMetaphoneResult result, int index)
        {
            //-- only keep if first & before vowel or between 2 vowels --//
            if ((index == 0 || IsVowel(CharAt(value, index - 1))) &&
                IsVowel(CharAt(value, index + 1)))
            {
                result.Append('H');
                index += 2;
                //-- also takes care of "HH" --//
            }
            else
            {
                index++;
            }
            return index;
        }

        /// <summary>
        /// Handles 'J' cases.
        /// </summary>
        /// <returns>The index of the next character to process.</returns>
        private int HandleJ(string value, DoubleMetaphoneResult result, int index,
                            bool slavoGermanic)
        {
            if (Contains(value, index, 4, "JOSE") || Contains(value, 0, 4, "SAN "))
            {
                //-- obvious Spanish, "Jose", "San Jacinto" --//
                // Parentheses make the original operator precedence (&& before ||) explicit.
                if (((index == 0 && CharAt(value, index + 4) == ' ') ||
                     value.Length == 4) || Contains(value, 0, 4, "SAN "))
                {
                    result.Append('H');
                }
                else
                {
                    result.Append('J', 'H');
                }
                index++;
            }
            else
            {
                if (index == 0 && !Contains(value, index, 4, "JOSE"))
                {
                    result.Append('J', 'A');
                }
                else if (IsVowel(CharAt(value, index - 1)) && !slavoGermanic &&
                         (CharAt(value, index + 1) == 'A' || CharAt(value, index + 1) == 'O'))
                {
                    result.Append('J', 'H');
                }
                else if (index == value.Length - 1)
                {
                    result.Append('J', ' ');
                }
                else if (!Contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) &&
                         !Contains(value, index - 1, 1, "S", "K", "L"))
                {
                    result.Append('J');
                }

                if (CharAt(value, index + 1) == 'J')
                {
                    index += 2;
                }
                else
                {
                    index++;
                }
            }
            return index;
        }

        /// <summary>
        /// Handles 'L' cases.
        /// </summary>
        /// <returns>The index of the next character to process.</returns>
        private int HandleL(string value, DoubleMetaphoneResult result, int index)
        {
            if (CharAt(value, index + 1) == 'L')
            {
                if (ConditionL0(value, index))
                {
                    // Only the primary code gets the 'L'; the alternate omits it.
                    result.AppendPrimary('L');
                }
                else
                {
                    result.Append('L');
                }
                index += 2;
            }
            else
            {
                index++;
                result.Append('L');
            }
            return index;
        }

        /// <summary>
        /// Handles 'P' cases.
        /// </summary>
        /// <returns>The index of the next character to process.</returns>
        private int HandleP(string value, DoubleMetaphoneResult result, int index)
        {
            if (CharAt(value, index + 1) == 'H')
            {
                result.Append('F');
                index += 2;
            }
            else
            {
                result.Append('P');
                index = Contains(value, index + 1, 1, "P", "B") ? index + 2 : index + 1;
            }
            return index;
        }

        /// <summary>
        /// Handles 'R' cases.
        /// </summary>
        /// <returns>The index of the next character to process.</returns>
        private int HandleR(string value, DoubleMetaphoneResult result, int index,
                            bool slavoGermanic)
        {
            if (index == value.Length - 1 && !slavoGermanic &&
                Contains(value, index - 2, 2, "IE") &&
                !Contains(value, index - 4, 2, "ME", "MA"))
            {
                // Final "-IER" (not preceded by "ME"/"MA"): the 'R' goes to the alternate code only.
                result.AppendAlternate('R');
            }
            else
            {
                result.Append('R');
            }
            return CharAt(value, index + 1) == 'R' ? index + 2 : index + 1;
        }

        /// <summary>
        /// Handles 'S' cases.
        /// </summary>
        /// <returns>The index of the next character to process.</returns>
        private int HandleS(string value, DoubleMetaphoneResult result, int index,
                            bool slavoGermanic)
        {
            if (Contains(value, index - 1, 3, "ISL", "YSL"))
            {
                //-- special cases "island", "isle", "carlisle", "carlysle" --//
                index++;
            }
            else if (index == 0 && Contains(value, index, 5, "SUGAR"))
            {
                //-- special case "sugar-" --//
                result.Append('X', 'S');
                index++;
            }
            else if (Contains(value, index, 2, "SH"))
            {
                if (Contains(value, index + 1, 4, "HEIM", "HOEK", "HOLM", "HOLZ"))
                {
                    //-- germanic --//
                    result.Append('S');
                }
                else
                {
                    result.Append('X');
                }
                index += 2;
            }
            else if (Contains(value, index, 3, "SIO", "SIA") || Contains(value, index, 4, "SIAN"))
            {
                //-- Italian and Armenian --//
                if (slavoGermanic)
                {
                    result.Append('S');
                }
                else
                {
                    result.Append('S', 'X');
                }
                index += 3;
            }
            else if ((index == 0 && Contains(value, index + 1, 1, "M", "N", "L", "W")) ||
                     Contains(value, index + 1, 1, "Z"))
            {
                //-- german & anglicisations, e.g. "smith" match "schmidt" //
                // "snider" match "schneider" --//
                //-- also, -sz- in slavic language although in hungarian it //
                //   is pronounced "s" --//
                result.Append('S', 'X');
                index = Contains(value, index + 1, 1, "Z") ? index + 2 : index + 1;
            }
            else if (Contains(value, index, 2, "SC"))
            {
                index = HandleSC(value, result, index);
            }
            else
            {
                if (index == value.Length - 1 && Contains(value, index - 2, 2, "AI", "OI"))
                {
                    //-- french e.g. "resnais", "artois" --//
                    result.AppendAlternate('S');
                }
                else
                {
                    result.Append('S');
                }
                index = Contains(value, index + 1, 1, "S", "Z") ? index + 2 : index + 1;
            }
            return index;
        }

        /// <summary>
        /// Handles 'SC' cases.
        /// </summary>
        /// <returns>The index of the next character to process (always <c>index + 3</c>).</returns>
        private int HandleSC(string value, DoubleMetaphoneResult result, int index)
        {
            if (CharAt(value, index + 2) == 'H')
            {
                //-- Schlesinger's rule --//
                if (Contains(value, index + 3, 2, "OO", "ER", "EN", "UY", "ED", "EM"))
                {
                    //-- Dutch origin, e.g. "school", "schooner" --//
                    if (Contains(value, index + 3, 2, "ER", "EN"))
                    {
                        //-- "schermerhorn", "schenker" --//
                        result.Append("X", "SK");
                    }
                    else
                    {
                        result.Append("SK");
                    }
                }
                else
                {
                    if (index == 0 && !IsVowel(CharAt(value, 3)) && CharAt(value, 3) != 'W')
                    {
                        result.Append('X', 'S');
                    }
                    else
                    {
                        result.Append('X');
                    }
                }
            }
            else if (Contains(value, index + 2, 1, "I", "E", "Y"))
            {
                result.Append('S');
            }
            else
            {
                result.Append("SK");
            }
            return index + 3;
        }

        /// <summary>
        /// Handles 'T' cases.
        /// </summary>
        /// <returns>The index of the next character to process.</returns>
        private int HandleT(string value, DoubleMetaphoneResult result, int index)
        {
            if (Contains(value, index, 4, "TION"))
            {
                result.Append('X');
                index += 3;
            }
            else if (Contains(value, index, 3, "TIA", "TCH"))
            {
                result.Append('X');
                index += 3;
            }
            else if (Contains(value, index, 2, "TH") || Contains(value, index, 3, "TTH"))
            {
                if (Contains(value, index + 2, 2, "OM", "AM") ||
                    //-- special case "thomas", "thames" or germanic --//
                    Contains(value, 0, 4, "VAN ", "VON ") ||
                    Contains(value, 0, 3, "SCH"))
                {
                    result.Append('T');
                }
                else
                {
                    // '0' (zero) is the code emitted for "TH" in the primary encoding.
                    result.Append('0', 'T');
                }
                index += 2;
            }
            else
            {
                result.Append('T');
                index = Contains(value, index + 1, 1, "T", "D") ? index + 2 : index + 1;
            }
            return index;
        }

        /// <summary>
        /// Handles 'W' cases.
        /// </summary>
        /// <returns>The index of the next character to process.</returns>
        private int HandleW(string value, DoubleMetaphoneResult result, int index)
        {
            if (Contains(value, index, 2, "WR"))
            {
                //-- can also be in middle of word --//
                result.Append('R');
                index += 2;
            }
            else
            {
                if (index == 0 && (IsVowel(CharAt(value, index + 1)) ||
                                   Contains(value, index, 2, "WH")))
                {
                    if (IsVowel(CharAt(value, index + 1)))
                    {
                        //-- Wasserman should match Vasserman --//
                        result.Append('A', 'F');
                    }
                    else
                    {
                        //-- need Uomo to match Womo --//
                        result.Append('A');
                    }
                    index++;
                }
                else if ((index == value.Length - 1 && IsVowel(CharAt(value, index - 1))) ||
                         Contains(value, index - 1, 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") ||
                         Contains(value, 0, 3, "SCH"))
                {
                    //-- Arnow should match Arnoff --//
                    result.AppendAlternate('F');
                    index++;
                }
                else if (Contains(value, index, 4, "WICZ", "WITZ"))
                {
                    //-- Polish e.g. "filipowicz" --//
                    result.Append("TS", "FX");
                    index += 4;
                }
                else
                {
                    index++;
                }
            }
            return index;
        }

        /// <summary>
        /// Handles 'X' cases.
        /// </summary>
        /// <returns>The index of the next character to process.</returns>
        private int HandleX(string value, DoubleMetaphoneResult result, int index)
        {
            if (index == 0)
            {
                result.Append('S');
                index++;
            }
            else
            {
                if (!((index == value.Length - 1) &&
                      (Contains(value, index - 3, 3, "IAU", "EAU") ||
                       Contains(value, index - 2, 2, "AU", "OU"))))
                {
                    //-- everything except a final French-style X, e.g. "breaux" --//
                    result.Append("KS");
                }
                index = Contains(value, index + 1, 1, "C", "X") ? index + 2 : index + 1;
            }
            return index;
        }

        /// <summary>
        /// Handles 'Z' cases.
        /// </summary>
        /// <returns>The index of the next character to process.</returns>
        private int HandleZ(string value, DoubleMetaphoneResult result, int index,
                            bool slavoGermanic)
        {
            if (CharAt(value, index + 1) == 'H')
            {
                //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --//
                result.Append('J');
                index += 2;
            }
            else
            {
                if (Contains(value, index + 1, 2, "ZO", "ZI", "ZA") ||
                    (slavoGermanic && (index > 0 && CharAt(value, index - 1) != 'T')))
                {
                    result.Append("S", "TS");
                }
                else
                {
                    result.Append('S');
                }
                index = CharAt(value, index + 1) == 'Z' ? index + 2 : index + 1;
            }
            return index;
        }

        //-- BEGIN CONDITIONS --//

        /// <summary>
        /// Complex condition 0 for 'C'.
        /// </summary>
        private bool ConditionC0(string value, int index)
        {
            if (Contains(value, index, 4, "CHIA"))
            {
                return true;
            }
            else if (index <= 1)
            {
                return false;
            }
            else if (IsVowel(CharAt(value, index - 2)))
            {
                return false;
            }
            else if (!Contains(value, index - 1, 3, "ACH"))
            {
                return false;
            }
            else
            {
                char c = CharAt(value, index + 2);
                return (c != 'I' && c != 'E') ||
                        Contains(value, index - 2, 6, "BACHER", "MACHER");
            }
        }

        /// <summary>
        /// Complex condition 0 for 'CH': a leading "CH" followed by one of a fixed set of
        /// letter groups, excluding values that start with "CHORE".
        /// </summary>
        private bool ConditionCH0(string value, int index)
        {
            if (index != 0)
            {
                return false;
            }
            else if (!Contains(value, index + 1, 5, "HARAC", "HARIS") &&
                     !Contains(value, index + 1, 3, "HOR", "HYM", "HIA", "HEM"))
            {
                return false;
            }
            else if (Contains(value, 0, 5, "CHORE"))
            {
                return false;
            }
            else
            {
                return true;
            }
        }

        /// <summary>
        /// Complex condition 1 for 'CH'.
        /// </summary>
        private bool ConditionCH1(string value, int index)
        {
            return ((Contains(value, 0, 4, "VAN ", "VON ") || Contains(value, 0, 3, "SCH")) ||
                    Contains(value, index - 2, 6, "ORCHES", "ARCHIT", "ORCHID") ||
                    Contains(value, index + 2, 1, "T", "S") ||
                    ((Contains(value, index - 1, 1, "A", "O", "U", "E") || index == 0) &&
                     (Contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) || index + 1 == value.Length - 1)));
        }

        /// <summary>
        /// Complex condition 0 for 'L'.
        /// </summary>
        private bool ConditionL0(string value, int index)
        {
            if (index == value.Length - 3 &&
                Contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE"))
            {
                return true;
            }
            else if ((Contains(value, value.Length - 2, 2, "AS", "OS") ||
                      Contains(value, value.Length - 1, 1, "A", "O")) &&
                     Contains(value, index - 1, 4, "ALLE"))
            {
                return true;
            }
            else
            {
                return false;
            }
        }

        /// <summary>
        /// Complex condition 0 for 'M': <c>true</c> when the character after the 'M' should be
        /// skipped as well ("MM", or "UMB" at the end of the value or followed by "ER").
        /// </summary>
        private bool ConditionM0(string value, int index)
        {
            if (CharAt(value, index + 1) == 'M')
            {
                return true;
            }
            return Contains(value, index - 1, 3, "UMB") &&
                   ((index + 1) == value.Length - 1 || Contains(value, index + 2, 2, "ER"));
        }

        //-- BEGIN HELPER FUNCTIONS --//

        /// <summary>
        /// Determines whether or not a value is of slavo-germanic origin. A value is
        /// of slavo-germanic origin if it contains any of 'W', 'K', 'CZ', or 'WITZ'.
        /// </summary>
        private bool IsSlavoGermanic(string value)
        {
            // Ordinal search: the result must not depend on the current thread culture.
            // (The "WITZ" test can only succeed when the 'W' test already has; it is kept so the
            // code reads the same as the rule stated above.)
            return value.IndexOf('W') > -1 || value.IndexOf('K') > -1 ||
                value.IndexOf("CZ", StringComparison.Ordinal) > -1 ||
                value.IndexOf("WITZ", StringComparison.Ordinal) > -1;
        }

        /// <summary>
        /// Determines whether or not a character is a vowel or not
        /// </summary>
        private bool IsVowel(char ch)
        {
            return VOWELS.IndexOf(ch) != -1;
        }

        /// <summary>
        /// Determines whether or not the value starts with a silent letter.  It will
        /// return <c>true</c> if the value starts with any of 'GN', 'KN',
        /// 'PN', 'WR' or 'PS'.
        /// </summary>
        private bool IsSilentStart(string value)
        {
            foreach (string element in SILENT_START)
            {
                if (value.StartsWith(element, StringComparison.Ordinal))
                {
                    return true;
                }
            }
            return false;
        }

        /// <summary>
        /// Cleans the input: trims surrounding white space and upper-cases the value using the
        /// "en" culture.
        /// </summary>
        /// <returns>The cleaned value, or <c>null</c> if <paramref name="input"/> is <c>null</c>
        /// or empty after trimming.</returns>
        private string CleanInput(string input)
        {
            if (input == null)
            {
                return null;
            }
            input = input.Trim();
            if (input.Length == 0)
            {
                return null;
            }
            return LOCALE_ENGLISH.TextInfo.ToUpper(input);
        }

        /// <summary>
        /// Gets the character at index <paramref name="index"/> if available, otherwise
        /// it returns <see cref="char.MinValue"/> so that there is some sort
        /// of a default.
        /// </summary>
        protected virtual char CharAt(string value, int index)
        {
            if (index < 0 || index >= value.Length)
            {
                return char.MinValue;
            }
            return value[index];
        }

        /// <summary>
        /// Determines whether <paramref name="value"/> contains any of the criteria starting at index <paramref name="start"/> and
        /// matching up to length <paramref name="length"/>.
        /// </summary>
        /// <param name="value">The string to inspect.</param>
        /// <param name="start">Index in <paramref name="value"/> at which the match must begin.</param>
        /// <param name="length">Number of characters to compare; a criterion only matches if it has exactly this length.</param>
        /// <param name="criteria">The candidate strings.</param>
        /// <returns><c>true</c> if the <paramref name="length"/> characters of <paramref name="value"/> starting at
        /// <paramref name="start"/> equal one of the <paramref name="criteria"/>; <c>false</c> otherwise, including when
        /// the requested range does not lie inside <paramref name="value"/>.</returns>
        protected static bool Contains(string value, int start, int length,
                                       params string[] criteria)
        {
            if (start < 0 || start + length > value.Length)
            {
                return false;
            }

            foreach (string element in criteria)
            {
                // Compare in place rather than allocating a substring for every call.
                if (element != null && element.Length == length &&
                    string.CompareOrdinal(value, start, element, 0, length) == 0)
                {
                    return true;
                }
            }
            return false;
        }

        //-- BEGIN INNER CLASSES --//

        /// <summary>
        /// Inner class for storing results, since there is the optional alternate encoding.
        /// Both the primary and the alternate code are capped at the maximum length passed to the
        /// constructor; characters appended beyond that length are dropped.
        /// </summary>
        public class DoubleMetaphoneResult
        {
            private readonly StringBuilder primary;
            private readonly StringBuilder alternate;
            private readonly int maxLength;

            /// <summary>
            /// Creates an empty result.
            /// </summary>
            /// <param name="maxLength">Maximum length of the primary and of the alternate code.</param>
            public DoubleMetaphoneResult(int maxLength)
            {
                this.maxLength = maxLength;
                this.primary = new StringBuilder(maxLength);
                this.alternate = new StringBuilder(maxLength);
            }

            /// <summary>
            /// Appends <paramref name="value"/> to both the primary and the alternate code.
            /// </summary>
            public virtual void Append(char value)
            {
                AppendPrimary(value);
                AppendAlternate(value);
            }

            /// <summary>
            /// Appends <paramref name="primary"/> to the primary code and
            /// <paramref name="alternate"/> to the alternate code.
            /// </summary>
            public virtual void Append(char primary, char alternate)
            {
                AppendPrimary(primary);
                AppendAlternate(alternate);
            }

            /// <summary>
            /// Appends <paramref name="value"/> to the primary code unless it is already full.
            /// </summary>
            public virtual void AppendPrimary(char value)
            {
                if (this.primary.Length < this.maxLength)
                {
                    this.primary.Append(value);
                }
            }

            /// <summary>
            /// Appends <paramref name="value"/> to the alternate code unless it is already full.
            /// </summary>
            public virtual void AppendAlternate(char value)
            {
                if (this.alternate.Length < this.maxLength)
                {
                    this.alternate.Append(value);
                }
            }

            /// <summary>
            /// Appends <paramref name="value"/> to both the primary and the alternate code.
            /// </summary>
            public virtual void Append(string value)
            {
                AppendPrimary(value);
                AppendAlternate(value);
            }

            /// <summary>
            /// Appends <paramref name="primary"/> to the primary code and
            /// <paramref name="alternate"/> to the alternate code.
            /// </summary>
            public virtual void Append(string primary, string alternate)
            {
                AppendPrimary(primary);
                AppendAlternate(alternate);
            }

            /// <summary>
            /// Appends as much of <paramref name="value"/> to the primary code as still fits.
            /// </summary>
            public virtual void AppendPrimary(string value)
            {
                int addChars = this.maxLength - this.primary.Length;
                if (value.Length <= addChars)
                {
                    this.primary.Append(value);
                }
                else
                {
                    // Truncate to the remaining capacity without allocating a substring.
                    this.primary.Append(value, 0, addChars);
                }
            }

            /// <summary>
            /// Appends as much of <paramref name="value"/> to the alternate code as still fits.
            /// </summary>
            public virtual void AppendAlternate(string value)
            {
                int addChars = this.maxLength - this.alternate.Length;
                if (value.Length <= addChars)
                {
                    this.alternate.Append(value);
                }
                else
                {
                    // Truncate to the remaining capacity without allocating a substring.
                    this.alternate.Append(value, 0, addChars);
                }
            }

            /// <summary>
            /// Gets the primary code built so far.
            /// </summary>
            public virtual string Primary
            {
                get { return this.primary.ToString(); }
            }

            /// <summary>
            /// Gets the alternate code built so far.
            /// </summary>
            public virtual string Alternate
            {
                get { return this.alternate.ToString(); }
            }

            /// <summary>
            /// Gets a value indicating whether both the primary and the alternate code have
            /// reached the maximum length.
            /// </summary>
            public virtual bool IsComplete
            {
                get
                {
                    return this.primary.Length >= this.maxLength &&
                           this.alternate.Length >= this.maxLength;
                }
            }
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/MatchRatingApproachEncoder.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/MatchRatingApproachEncoder.cs b/src/Lucene.Net.Analysis.Phonetic/Language/MatchRatingApproachEncoder.cs new file mode 100644 index 0000000..c30e571 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/MatchRatingApproachEncoder.cs @@ -0,0 +1,425 @@ +// commons-codec version compatibility level: 1.9 +using System; +using System.Globalization; +using System.Text; +using System.Text.RegularExpressions; + +namespace Lucene.Net.Analysis.Phonetic.Language +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Match Rating Approach Phonetic Algorithm Developed by <c>Western Airlines</c> in 1977. + /// <para/> + /// This class is immutable and thread-safe. 
+ /// <para/> + /// See: <a href="http://en.wikipedia.org/wiki/Match_rating_approach">Wikipedia - Match Rating Approach</a> + /// <para/> + /// since 1.8 + /// </summary> + public class MatchRatingApproachEncoder : IStringEncoder + { + private static readonly string SPACE = " "; + + private static readonly string EMPTY = ""; + + /// <summary> + /// Constants used mainly for the min rating value. + /// </summary> + private static readonly int ONE = 1, TWO = 2, THREE = 3, FOUR = 4, FIVE = 5, SIX = 6, SEVEN = 7, EIGHT = 8, + ELEVEN = 11, TWELVE = 12; + + /// <summary> + /// The plain letter equivalent of the accented letters. + /// </summary> + private static readonly string PLAIN_ASCII = "AaEeIiOoUu" + // grave + "AaEeIiOoUuYy" + // acute + "AaEeIiOoUuYy" + // circumflex + "AaOoNn" + // tilde + "AaEeIiOoUuYy" + // umlaut + "Aa" + // ring + "Cc" + // cedilla + "OoUu"; // double acute + + /// <summary> + /// Unicode characters corresponding to various accented letters. For example: \u00DA is U acute etc... + /// </summary> + private static readonly string UNICODE = "\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" + + "\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" + + "\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177" + + "\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1" + + "\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF" + + "\u00C5\u00E5" + "\u00C7\u00E7" + "\u0150\u0151\u0170\u0171"; + + private static readonly string[] DOUBLE_CONSONANT = + new string[] { "BB", "CC", "DD", "FF", "GG", "HH", "JJ", "KK", "LL", "MM", "NN", "PP", "QQ", "RR", "SS", + "TT", "VV", "WW", "XX", "YY", "ZZ" }; + + /// <summary> + /// Cleans up a name: 1. Upper-cases everything 2. Removes some common punctuation 3. Removes accents 4. Removes any + /// spaces. 
/// </summary>
/// <param name="name">The name to be cleaned.</param>
/// <returns>The cleaned name.</returns>
/// <remarks>
/// The steps run in this order: upper-casing with the "en" culture, removal of the characters
/// <c>- &amp; ' . ,</c>, accent folding via <see cref="RemoveAccents(string)"/>, and finally
/// removal of every run of whitespace.
/// </remarks>
internal string CleanName(string name)
{
    string cleaned = new CultureInfo("en").TextInfo.ToUpper(name);

    // Each pattern matches one punctuation character; every occurrence is deleted.
    foreach (string punctuation in new string[] { "\\-", "[&]", "\\'", "\\.", "[\\,]" })
    {
        cleaned = Regex.Replace(cleaned, punctuation, EMPTY);
    }

    // Fold accents first, then squeeze out all whitespace.
    return Regex.Replace(RemoveAccents(cleaned), "\\s+", EMPTY);
}

// LUCENENET specific - in .NET we don't need an object overload, since strings are sealed anyway.
// **
// * Encodes an Object using the Match Rating Approach algorithm. Method is here to satisfy the requirements of the
// * Encoder interface Throws an EncoderException if input object is not of type java.lang.string.
// *
// * @param pObject
// * Object to encode
// * @return An object (or type java.lang.string) containing the Match Rating Approach code which corresponds to the
// * string supplied.
// * @throws EncoderException
// * if the parameter supplied is not of type java.lang.string
// */
//public Object encode(Object pObject) throws EncoderException
//{
//if (!(pObject instanceof string)) {
// throw new EncoderException(
// "Parameter supplied to Match Rating Approach encoder is not of type java.lang.string");
// }
//return encode((string) pObject);
//}

/// <summary>
/// Encodes a string using the Match Rating Approach (MRA) algorithm.
/// </summary>
/// <param name="name">String to encode.</param>
/// <returns>
/// The MRA code corresponding to the string supplied, or an empty string when
/// <paramref name="name"/> is <c>null</c>, shorter than two characters, or contains nothing
/// that survives <see cref="CleanName(string)"/>.
/// </returns>
public string Encode(string name)
{
    // Null, empty and one-character input (the single space included) has no code.
    if (name == null || name.Length <= 1)
    {
        return EMPTY;
    }

    // Preprocessing
    name = CleanName(name);

    // Cleaning can strip every character (for example "--" or a run of spaces). Stop here,
    // because RemoveVowels reads the first letter and would throw on an empty string.
    if (name.Length == 0)
    {
        return EMPTY;
    }

    // 1. Delete all vowels unless the vowel begins the word
    name = RemoveVowels(name);

    // 2. Remove second consonant from any double consonant
    name = RemoveDoubleConsonants(name);

    // 3. Reduce codex to 6 letters by joining the first 3 and last 3 letters
    return GetFirst3Last3(name);
}

/// <summary>
/// Joins the first three and the last three letters of a name that is longer than six
/// characters; shorter names are returned unchanged.
/// </summary>
/// <param name="name">The string to get the substrings from.</param>
/// <returns>The first and last three letters of the input joined together, or the input itself.</returns>
internal string GetFirst3Last3(string name)
{
    if (name.Length <= SIX)
    {
        return name;
    }

    return name.Substring(0, THREE) + name.Substring(name.Length - THREE);
}

/// <summary>
/// Obtains the min rating of the length sum of the 2 names. In essence the larger the sum length the smaller the
/// min rating. Values strictly from documentation.
/// </summary>
/// <param name="sumLength">The length of 2 strings sent down.</param>
/// <returns>The min rating value.</returns>
internal int GetMinRating(int sumLength)
{
    if (sumLength <= FOUR)
    {
        return FIVE;
    }
    if (sumLength <= SEVEN)
    {
        return FOUR;
    }
    if (sumLength <= ELEVEN)
    {
        return THREE;
    }
    if (sumLength == TWELVE)
    {
        return TWO;
    }

    return ONE; // docs said little here.
}

/// <summary>
/// Determines if two names are homophonous via Match Rating Approach (MRA) algorithm. It should be noted that the
/// strings are cleaned in the same way as <see cref="Encode(string)"/>.
/// </summary>
/// <param name="name1">First of the 2 strings (names) to compare.</param>
/// <param name="name2">Second of the 2 names to compare.</param>
/// <returns>
/// <c>true</c> if the names are equal ignoring case or their similarity rating reaches the
/// minimum rating for their combined encoded length; <c>false</c> otherwise, including when
/// either name is <c>null</c>, shorter than two characters, or empty after cleaning.
/// </returns>
public virtual bool IsEncodeEquals(string name1, string name2)
{
    // Null, empty and one-character names (the single space included) never match.
    if (name1 == null || name2 == null || name1.Length <= 1 || name2.Length <= 1)
    {
        return false;
    }
    if (name1.Equals(name2, StringComparison.OrdinalIgnoreCase))
    {
        return true;
    }

    // Preprocessing
    name1 = CleanName(name1);
    name2 = CleanName(name2);

    // A name made only of punctuation or whitespace is empty by now. Stop here, because
    // RemoveVowels reads the first letter and would throw on an empty string.
    if (name1.Length == 0 || name2.Length == 0)
    {
        return false;
    }

    // Actual MRA Algorithm

    // 1. Remove vowels
    name1 = RemoveVowels(name1);
    name2 = RemoveVowels(name2);

    // 2. Remove double consonants
    name1 = RemoveDoubleConsonants(name1);
    name2 = RemoveDoubleConsonants(name2);

    // 3. Reduce to at most 6 letters (first 3 + last 3)
    name1 = GetFirst3Last3(name1);
    name2 = GetFirst3Last3(name2);

    // 4. Check for length difference - if 3 or greater then no similarity
    // comparison is done
    if (Math.Abs(name1.Length - name2.Length) >= THREE)
    {
        return false;
    }

    // 5. Obtain the minimum rating value by calculating the length sum of the
    // encoded strings and sending it down.
    int minRating = GetMinRating(name1.Length + name2.Length);

    // 6. Process the encoded strings from left to right and remove any
    // identical characters found from both strings respectively.
    int count = LeftToRightThenRightToLeftProcessing(name1, name2);

    // 7. Each PNI item that has a similarity rating equal to or greater than
    // the min is considered to be a good candidate match
    return count >= minRating;
}

/// <summary>
/// Determines if a letter is a vowel (A, E, I, O or U, in either case).
/// </summary>
/// <param name="letter">The letter under investigation.</param>
/// <returns><c>true</c> if a vowel, else <c>false</c>.</returns>
internal bool IsVowel(string letter)
{
    return letter.Equals("E", StringComparison.OrdinalIgnoreCase)
        || letter.Equals("A", StringComparison.OrdinalIgnoreCase)
        || letter.Equals("O", StringComparison.OrdinalIgnoreCase)
        || letter.Equals("I", StringComparison.OrdinalIgnoreCase)
        || letter.Equals("U", StringComparison.OrdinalIgnoreCase);
}

/// <summary>
/// Processes the names from left to right (first) then right to left removing identical letters in same positions.
/// Then subtracts the longer string that remains from 6 and returns this.
/// </summary>
/// <param name="name1">First encoded name.</param>
/// <param name="name2">Second encoded name.</param>
/// <returns>
/// The absolute difference between 6 and the length of the longer of the two strings that
/// remain once the matching letters have been removed.
/// </returns>
internal int LeftToRightThenRightToLeftProcessing(string name1, string name2)
{
    char[] remaining1 = name1.ToCharArray();
    char[] remaining2 = name2.ToCharArray();

    int last1 = name1.Length - 1;
    int last2 = name2.Length - 1;

    // Only positions present in both names can be compared.
    int comparable = Math.Min(name1.Length, name2.Length);

    for (int offset = 0; offset < comparable; offset++)
    {
        // Left to right: same distance from the start of each name.
        if (name1[offset] == name2[offset])
        {
            remaining1[offset] = ' ';
            remaining2[offset] = ' ';
        }

        // Right to left: same distance from the end of each name.
        if (name1[last1 - offset] == name2[last2 - offset])
        {
            remaining1[last1 - offset] = ' ';
            remaining2[last2 - offset] = ' ';
        }
    }

    // Matched positions were blanked out above; drop the blanks to get the unmatched letters.
    string unmatched1 = Regex.Replace(new string(remaining1), "\\s+", EMPTY);
    string unmatched2 = Regex.Replace(new string(remaining2), "\\s+", EMPTY);

    // Subtract the longer remainder from 6.
    int longest = Math.Max(unmatched1.Length, unmatched2.Length);
    return Math.Abs(SIX - longest);
}

/// <summary>
/// Removes accented letters and replaces with non-accented ascii equivalent Case is preserved.
/// http://www.codecodex.com/wiki/Remove_accent_from_letters_%28ex_.%C3%A9_to_e%29
/// </summary>
/// <param name="accentedWord">The word that may have accents in it.</param>
/// <returns>De-accented word, or <c>null</c> when <paramref name="accentedWord"/> is <c>null</c>.</returns>
internal string RemoveAccents(string accentedWord)
{
    if (accentedWord == null)
    {
        return null;
    }

    // The output has exactly one character per input character, so size the buffer up front.
    StringBuilder plain = new StringBuilder(accentedWord.Length);

    foreach (char c in accentedWord)
    {
        // UNICODE and PLAIN_ASCII are parallel tables: the same index holds the accented
        // letter and its plain replacement.
        int pos = UNICODE.IndexOf(c);
        plain.Append(pos > -1 ? PLAIN_ASCII[pos] : c);
    }

    return plain.ToString();
}

/// <summary>
/// Replaces any double consonant pair with the single letter equivalent.
/// </summary>
/// <param name="name">String to have double consonants removed.</param>
/// <returns>Single consonant word, upper-cased.</returns>
internal string RemoveDoubleConsonants(string name)
{
    string replacedName = name.ToUpperInvariant();
    foreach (string dc in DOUBLE_CONSONANT)
    {
        // Replace leaves the string untouched when the pair does not occur, so no
        // separate Contains check is needed.
        replacedName = replacedName.Replace(dc, dc.Substring(0, 1));
    }
    return replacedName;
}

/// <summary>
/// Deletes all vowels unless the vowel begins the word.
/// </summary>
/// <param name="name">The name to have vowels removed; expected to be upper case already.</param>
/// <returns>De-voweled word; an empty input is returned unchanged.</returns>
internal string RemoveVowels(string name)
{
    // Nothing to strip, and no first letter to read.
    if (name.Length == 0)
    {
        return name;
    }

    // Extract first letter
    string firstLetter = name.Substring(0, 1);

    // Only upper-case vowels are removed, as before.
    name = name.Replace("A", EMPTY)
               .Replace("E", EMPTY)
               .Replace("I", EMPTY)
               .Replace("O", EMPTY)
               .Replace("U", EMPTY);

    name = Regex.Replace(name, "\\s{2,}\\b", SPACE);

    // A leading vowel was deleted with the others; put it back.
    return IsVowel(firstLetter) ? firstLetter + name : name;
}
}
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Metaphone.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Metaphone.cs b/src/Lucene.Net.Analysis.Phonetic/Language/Metaphone.cs new file mode 100644 index 0000000..dd3038f --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Metaphone.cs @@ -0,0 +1,494 @@ +// commons-codec version compatibility level: 1.9 +using System.Globalization; +using System.Text; + +namespace Lucene.Net.Analysis.Phonetic.Language +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Encodes a string into a Metaphone value. + /// <para/> + /// Initial Java implementation by <c>William B. Brogden. December, 1997</c>. + /// Permission given by <c>wbrogden</c> for code to be used anywhere. + /// <para/> + /// <c>Hanging on the Metaphone</c> by <c>Lawrence Philips</c> in <c>Computer Language of Dec. 
1990, + /// p 39.</c> + /// <para/> + /// Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations: + /// <para/> + /// <list type="bullet"> + /// <item><description><a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a> (broken link 4/30/2013) </description></item> + /// <item><description><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a> (link checked 4/30/2013) </description></item> + /// </list> + /// <para/> + /// They have had undocumented changes from the originally published algorithm. + /// For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>. + /// <para/> + /// This class is conditionally thread-safe. + /// The instance field <see cref="maxCodeLen"/> is mutable <see cref="MaxCodeLen"/> + /// but is not volatile, and accesses are not synchronized. + /// If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization + /// is used to ensure safe publication of the value between threads, and must not set <see cref="MaxCodeLen"/> + /// after initial setup. + /// </summary> + public class Metaphone : IStringEncoder + { + /// <summary> + /// Five values in the English language + /// </summary> + private static readonly string VOWELS = "AEIOU"; + + /// <summary> + /// Variable used in Metaphone algorithm + /// </summary> + private static readonly string FRONTV = "EIY"; + + /// <summary> + /// Variable used in Metaphone algorithm + /// </summary> + private static readonly string VARSON = "CSPTG"; + + /// <summary> + /// The max code length for metaphone is 4 + /// </summary> + private int maxCodeLen = 4; + + /// <summary> + /// Creates an instance of the <see cref="Metaphone"/> encoder + /// </summary> + public Metaphone() + : base() + { + } + + /// <summary> + /// Find the metaphone value of a string. 
/// This is similar to the soundex algorithm, but better at finding similar sounding words.
/// All input is converted to upper case.
/// Limitations: Input format is expected to be a single ASCII word
/// with only characters in the A - Z range, no punctuation or numbers.
/// </summary>
/// <param name="txt">String to find the metaphone code for.</param>
/// <returns>
/// A metaphone code corresponding to the string supplied: an empty string for <c>null</c> or
/// empty input, the upper-cased character for one-character input, and otherwise a code of at
/// most <see cref="MaxCodeLen"/> characters.
/// </returns>
public virtual string GetMetaphone(string txt)
{
    if (string.IsNullOrEmpty(txt))
    {
        return "";
    }

    string upper = new CultureInfo("en").TextInfo.ToUpper(txt);

    // single character is itself
    if (txt.Length == 1)
    {
        return upper;
    }

    StringBuilder word = ApplyInitialExceptions(upper.ToCharArray()); // working copy
    StringBuilder code = new StringBuilder(10);                       // output
    int length = word.Length;
    int pos = 0;

    while (code.Length < this.MaxCodeLen && pos < length)
    {
        char current = word[pos];

        // A letter that repeats its predecessor is skipped, except for C.
        bool repeated = current != 'C' && IsPreviousChar(word, pos, current);
        if (!repeated)
        {
            switch (current)
            {
                case 'A':
                case 'E':
                case 'I':
                case 'O':
                case 'U':
                    // A vowel is only coded when it is the leading character.
                    if (pos == 0)
                    {
                        code.Append(current);
                    }
                    break;

                case 'B':
                    // B is silent when the word ends in MB.
                    if (!(IsPreviousChar(word, pos, 'M') && IsLastChar(length, pos)))
                    {
                        code.Append(current);
                    }
                    break;

                case 'C':
                {
                    bool frontVowelFollows = !IsLastChar(length, pos) &&
                                             FRONTV.IndexOf(word[pos + 1]) >= 0;

                    if (IsPreviousChar(word, pos, 'S') && frontVowelFollows)
                    {
                        break; // SCI, SCE, SCY: the C is discarded
                    }

                    if (RegionMatch(word, pos, "CIA"))
                    {
                        code.Append('X');
                    }
                    else if (frontVowelFollows)
                    {
                        code.Append('S'); // CI, CE, CY
                    }
                    else if (!IsNextChar(word, pos, 'H'))
                    {
                        code.Append('K');
                    }
                    else if (IsPreviousChar(word, pos, 'S'))
                    {
                        code.Append('K'); // SCH
                    }
                    else if (pos == 0 && length >= 3 && IsVowel(word, 2))
                    {
                        code.Append('K'); // word-initial CH with a vowel as third letter
                    }
                    else
                    {
                        code.Append('X'); // any other CH
                    }
                    break;
                }

                case 'D':
                    // DGE, DGI and DGY become J and also consume the two letters after the D.
                    if (!IsLastChar(length, pos + 1) &&
                        IsNextChar(word, pos, 'G') &&
                        FRONTV.IndexOf(word[pos + 2]) >= 0)
                    {
                        code.Append('J');
                        pos += 2;
                    }
                    else
                    {
                        code.Append('T');
                    }
                    break;

                case 'G':
                {
                    // GH is silent at the end of the word or when the letter after it is not a vowel.
                    if (IsNextChar(word, pos, 'H') &&
                        (IsLastChar(length, pos + 1) || !IsVowel(word, pos + 2)))
                    {
                        break;
                    }

                    // Non-initial G followed by N is silent.
                    if (pos > 0 &&
                        (RegionMatch(word, pos, "GN") || RegionMatch(word, pos, "GNED")))
                    {
                        break;
                    }

                    // NOTE: repeated letters are skipped above, so a preceding G is not expected here.
                    bool hard = IsPreviousChar(word, pos, 'G');
                    bool soft = !IsLastChar(length, pos) &&
                                FRONTV.IndexOf(word[pos + 1]) >= 0 &&
                                !hard;
                    code.Append(soft ? 'J' : 'K');
                    break;
                }

                case 'H':
                {
                    // H is dropped at the end of the word and after C, S, P, T or G.
                    bool silent = IsLastChar(length, pos) ||
                                  (pos > 0 && VARSON.IndexOf(word[pos - 1]) >= 0);
                    if (!silent && IsVowel(word, pos + 1))
                    {
                        code.Append('H'); // H followed by a vowel
                    }
                    break;
                }

                case 'F':
                case 'J':
                case 'L':
                case 'M':
                case 'N':
                case 'R':
                    code.Append(current);
                    break;

                case 'K':
                    // K is dropped when it follows C.
                    if (pos == 0 || !IsPreviousChar(word, pos, 'C'))
                    {
                        code.Append(current);
                    }
                    break;

                case 'P':
                    // PH -> F
                    code.Append(IsNextChar(word, pos, 'H') ? 'F' : current);
                    break;

                case 'Q':
                    code.Append('K');
                    break;

                case 'S':
                {
                    bool hushed = RegionMatch(word, pos, "SH") ||
                                  RegionMatch(word, pos, "SIO") ||
                                  RegionMatch(word, pos, "SIA");
                    code.Append(hushed ? 'X' : 'S');
                    break;
                }

                case 'T':
                    if (RegionMatch(word, pos, "TIA") || RegionMatch(word, pos, "TIO"))
                    {
                        code.Append('X');
                    }
                    else if (!RegionMatch(word, pos, "TCH")) // T is silent in "TCH"
                    {
                        // numeral 0 stands in for TH (resembles theta after all)
                        code.Append(RegionMatch(word, pos, "TH") ? '0' : 'T');
                    }
                    break;

                case 'V':
                    code.Append('F');
                    break;

                case 'W':
                case 'Y':
                    // silent if not followed by vowel
                    if (!IsLastChar(length, pos) && IsVowel(word, pos + 1))
                    {
                        code.Append(current);
                    }
                    break;

                case 'X':
                    code.Append('K');
                    code.Append('S');
                    break;

                case 'Z':
                    code.Append('S');
                    break;

                default:
                    // anything else contributes nothing
                    break;
            }
        }

        pos++;

        // A two-character code (X -> KS) can overshoot the limit by one.
        if (code.Length > this.MaxCodeLen)
        {
            code.Length = this.MaxCodeLen;
        }
    }

    return code.ToString();
}

// Builds the working string from an upper-cased word of at least two characters, applying the
// initial-letter exceptions: KN, GN, PN, AE and WR lose their first letter, WH becomes W and
// a leading X becomes S.
private static StringBuilder ApplyInitialExceptions(char[] upper)
{
    char first = upper[0];
    char second = upper[1];
    bool dropFirst = false;

    switch (first)
    {
        case 'K':
        case 'G':
        case 'P':
            dropFirst = second == 'N';
            break;
        case 'A':
            dropFirst = second == 'E';
            break;
        case 'W':
            dropFirst = second == 'R' || second == 'H';
            break;
        case 'X':
            upper[0] = 'S';
            break;
        default:
            break;
    }

    StringBuilder word = new StringBuilder(40);
    if (dropFirst)
    {
        word.Append(upper, 1, upper.Length - 1);
    }
    else
    {
        word.Append(upper);
    }

    if (first == 'W' && second == 'H')
    {
        word[0] = 'W'; // WH -> W
    }
    return word;
}

// True when the character at index is one of A, E, I, O, U.
private bool IsVowel(StringBuilder sb, int index)
{
    return VOWELS.IndexOf(sb[index]) != -1;
}

// True when index has a predecessor inside sb and that predecessor equals c.
private bool IsPreviousChar(StringBuilder sb, int index, char c)
{
    return index > 0 && index < sb.Length && sb[index - 1] == c;
}

// True when index has a successor inside sb and that successor equals c.
private bool IsNextChar(StringBuilder sb, int index, char c)
{
    return index >= 0 && index < sb.Length - 1 && sb[index + 1] == c;
}

// True when test occurs in sb starting exactly at index.
private bool RegionMatch(StringBuilder sb, int index, string test)
{
    if (index < 0 || index + test.Length - 1 >= sb.Length)
    {
        return false;
    }

    for (int i = 0; i < test.Length; i++)
    {
        if (sb[index + i] != test[i])
        {
            return false;
        }
    }
    return true;
}

// True when n is the index of the last character of a word of wdsz characters.
private bool IsLastChar(int wdsz, int n)
{
    return n == wdsz - 1;
}

// LUCENENET specific - in .NET we don't need an object overload, since strings are sealed anyway.
// /**
// * Encodes an Object using the metaphone algorithm. This method
// * is provided in order to satisfy the requirements of the
// * Encoder interface, and will throw an EncoderException if the
// * supplied object is not of type java.lang.String.
// *
// * @param obj Object to encode
// * @return An object (or type java.lang.String) containing the
// * metaphone code which corresponds to the String supplied.
// * @throws EncoderException if the parameter supplied is not
// * of type java.lang.String
// */
// @Override
//public object encode(object obj)
// {
// if (!(obj is String)) {
// throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
// }
// return GetMetaphone((String) obj);
// }

/// <summary>
/// Encodes a string using the <see cref="Metaphone"/> algorithm; this simply forwards to
/// <see cref="GetMetaphone(string)"/>.
/// </summary>
/// <param name="str">String to encode.</param>
/// <returns>The metaphone code corresponding to the string supplied.</returns>
public virtual string Encode(string str)
{
    return GetMetaphone(str);
}

/// <summary>
/// Tests if the metaphones of two strings are identical.
/// </summary>
/// <param name="str1">First of two strings to compare.</param>
/// <param name="str2">Second of two strings to compare.</param>
/// <returns><c>true</c> if the metaphones of these strings are identical, <c>false</c> otherwise.</returns>
public virtual bool IsMetaphoneEqual(string str1, string str2)
{
    // Encode both inputs, first argument first, then compare the two codes.
    string firstCode = GetMetaphone(str1);
    string secondCode = GetMetaphone(str2);
    return firstCode.Equals(secondCode);
}

/// <summary>
/// Gets or sets the maximum length of the code produced by <see cref="GetMetaphone(string)"/>
/// (backed by <see cref="maxCodeLen"/>, which starts at 4).
/// </summary>
public virtual int MaxCodeLen
{
    get { return this.maxCodeLen; }
    set { this.maxCodeLen = value; }
}
} // closes the Metaphone class
} // closes the namespace
