[08/15] lucenenet git commit: Added Lucene.Net.Analysis.Phonetic + tests. Rather than porting over the entire commons-codec library, only the language features were ported and added to this library.

nightowl888 Tue, 27 Jun 2017 13:34:16 -0700

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/DoubleMetaphone.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/DoubleMetaphone.cs 
b/src/Lucene.Net.Analysis.Phonetic/Language/DoubleMetaphone.cs
new file mode 100644
index 0000000..d54968d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/DoubleMetaphone.cs
@@ -0,0 +1,1280 @@
+// commons-codec version compatibility level: 1.9
+using System;
+using System.Globalization;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Phonetic.Language
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Encodes a string into a double metaphone value. This implementation is based on the algorithm by
+    /// <c>Lawrence Philips</c>.
+    /// <para/>
+    /// This class is conditionally thread-safe. The instance field <see cref="maxCodeLen"/> is mutable
+    /// (through the <see cref="MaxCodeLen"/> property) but is not volatile, and accesses to it are not
+    /// synchronized. If an instance of the class is shared between threads, the caller needs to ensure that
+    /// suitable synchronization is used to ensure safe publication of the value between threads, and must
+    /// not set <see cref="MaxCodeLen"/> after initial setup.
+    /// <para/>
+    /// See <a href="http://drdobbs.com/184401251?pgno=2">Original Article</a>
+    /// <para/>
+    /// See <a href="http://en.wikipedia.org/wiki/Metaphone">http://en.wikipedia.org/wiki/Metaphone</a>
+    /// </summary>
+    public class DoubleMetaphone : IStringEncoder
+    {
+        /// <summary>
+        /// "Vowels" to test for
+        /// </summary>
+        private static readonly string VOWELS = "AEIOUY";
+
+        /// <summary>
+        /// Prefixes when present which are not pronounced
+        /// </summary>
+        private static readonly string[] SILENT_START =
+            { "GN", "KN", "PN", "WR", "PS" };
+        // Single-character lookup table (includes the space character).
+        // NOTE(review): its consumer is not visible in this part of the file; presumably one of the
+        // "CH" condition helpers - confirm before relying on that.
+        private static readonly string[] L_R_N_M_B_H_F_V_W_SPACE =
+            { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " };
+        // Two-letter sequences tested directly after a 'G' at index 0 (see HandleG).
+        private static readonly string[] ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER =
+            { "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" 
};
+        // Letters tested directly after a 'J' (see HandleJ).
+        private static readonly string[] L_T_K_S_N_M_B_Z =
+            { "L", "T", "K", "S", "N", "M", "B", "Z" };
+
+        /// <summary>
+        /// Maximum length of an encoding, default is 4
+        /// </summary>
+        private int maxCodeLen = 4;
+
+        /// <summary>
+        /// Initializes a new <see cref="DoubleMetaphone"/> encoder; the maximum code
+        /// length keeps its field default until <see cref="MaxCodeLen"/> is set.
+        /// </summary>
+        public DoubleMetaphone()
+        {
+        }
+
+        /// <summary>
+        /// Computes the primary Double Metaphone code of <paramref name="value"/>.
+        /// </summary>
+        /// <param name="value">The text to encode.</param>
+        /// <returns>The result of <see cref="GetDoubleMetaphone(string, bool)"/> with
+        /// <c>alternate</c> set to <c>false</c>.</returns>
+        public virtual string GetDoubleMetaphone(string value)
+        {
+            const bool useAlternate = false;
+            return GetDoubleMetaphone(value, useAlternate);
+        }
+
+        /// <summary>
+        /// Computes the Double Metaphone code of <paramref name="value"/> and returns either
+        /// the primary or the alternate encoding.
+        /// </summary>
+        /// <param name="value">The text to encode.</param>
+        /// <param name="alternate"><c>true</c> to return the alternate encoding;
+        /// <c>false</c> to return the primary encoding.</param>
+        /// <returns>The requested encoding, or <c>null</c> when the cleaned input is <c>null</c>.</returns>
+        public virtual string GetDoubleMetaphone(string value, bool alternate)
+        {
+            value = CleanInput(value);
+            if (value == null)
+            {
+                return null;
+            }
+
+            bool slavoGermanic = IsSlavoGermanic(value);
+
+            // A silent leading letter is skipped by starting at the second character.
+            int pos = IsSilentStart(value) ? 1 : 0;
+
+            DoubleMetaphoneResult result = new DoubleMetaphoneResult(this.MaxCodeLen);
+
+            // Each case consumes one or more characters and advances pos accordingly.
+            while (!result.IsComplete && pos < value.Length)
+            {
+                char current = value[pos];
+                switch (current)
+                {
+                    case 'A':
+                    case 'E':
+                    case 'I':
+                    case 'O':
+                    case 'U':
+                    case 'Y':
+                        pos = HandleAEIOUY(result, pos);
+                        break;
+                    case 'B':
+                        result.Append('P');
+                        pos += (CharAt(value, pos + 1) == 'B') ? 2 : 1;
+                        break;
+                    case '\u00C7':
+                        // C with a cedilla
+                        result.Append('S');
+                        pos++;
+                        break;
+                    case 'C':
+                        pos = HandleC(value, result, pos);
+                        break;
+                    case 'D':
+                        pos = HandleD(value, result, pos);
+                        break;
+                    case 'F':
+                        result.Append('F');
+                        pos += (CharAt(value, pos + 1) == 'F') ? 2 : 1;
+                        break;
+                    case 'G':
+                        pos = HandleG(value, result, pos, slavoGermanic);
+                        break;
+                    case 'H':
+                        pos = HandleH(value, result, pos);
+                        break;
+                    case 'J':
+                        pos = HandleJ(value, result, pos, slavoGermanic);
+                        break;
+                    case 'K':
+                        result.Append('K');
+                        pos += (CharAt(value, pos + 1) == 'K') ? 2 : 1;
+                        break;
+                    case 'L':
+                        pos = HandleL(value, result, pos);
+                        break;
+                    case 'M':
+                        result.Append('M');
+                        pos += ConditionM0(value, pos) ? 2 : 1;
+                        break;
+                    case 'N':
+                        result.Append('N');
+                        pos += (CharAt(value, pos + 1) == 'N') ? 2 : 1;
+                        break;
+                    case '\u00D1':
+                        // N with a tilde (Spanish "ene")
+                        result.Append('N');
+                        pos++;
+                        break;
+                    case 'P':
+                        pos = HandleP(value, result, pos);
+                        break;
+                    case 'Q':
+                        result.Append('K');
+                        pos += (CharAt(value, pos + 1) == 'Q') ? 2 : 1;
+                        break;
+                    case 'R':
+                        pos = HandleR(value, result, pos, slavoGermanic);
+                        break;
+                    case 'S':
+                        pos = HandleS(value, result, pos, slavoGermanic);
+                        break;
+                    case 'T':
+                        pos = HandleT(value, result, pos);
+                        break;
+                    case 'V':
+                        result.Append('F');
+                        pos += (CharAt(value, pos + 1) == 'V') ? 2 : 1;
+                        break;
+                    case 'W':
+                        pos = HandleW(value, result, pos);
+                        break;
+                    case 'X':
+                        pos = HandleX(value, result, pos);
+                        break;
+                    case 'Z':
+                        pos = HandleZ(value, result, pos, slavoGermanic);
+                        break;
+                    default:
+                        // Any other character contributes nothing to the code.
+                        pos++;
+                        break;
+                }
+            }
+
+            if (alternate)
+            {
+                return result.Alternate;
+            }
+            return result.Primary;
+        }
+
+        // LUCENENET specific - in .NET we don't need an object overload, 
since strings are sealed anyway.
+        //    /**
+        //     * Encode the value using DoubleMetaphone.  It will only work if
+        //     * <code>obj</code> is a <code>String</code> (like 
<code>Metaphone</code>).
+        //     *
+        //     * @param obj Object to encode (should be of type String)
+        //     * @return An encoded Object (will be of type String)
+        //     * @throws EncoderException encode parameter is not of type 
String
+        //     */
+
+        //public virtual object Encode(object obj) 
+        //    {
+        //    if (!(obj is String)) {
+        //            throw new EncoderException("DoubleMetaphone encode 
parameter is not of type String");
+        //        }
+        //    return GetDoubleMetaphone((String) obj);
+        //    }
+
+        /// <summary>
+        /// Encodes <paramref name="value"/> with Double Metaphone.
+        /// </summary>
+        /// <param name="value">The text to encode.</param>
+        /// <returns>The result of <see cref="GetDoubleMetaphone(string)"/> for <paramref name="value"/>.</returns>
+        public virtual string Encode(string value)
+        {
+            string encoded = GetDoubleMetaphone(value);
+            return encoded;
+        }
+
+        /// <summary>
+        /// Checks whether two <see cref="string"/> values have the same primary
+        /// Double Metaphone encoding.
+        /// </summary>
+        /// <param name="value1">The first value to encode and compare.</param>
+        /// <param name="value2">The second value to encode and compare.</param>
+        /// <returns>The result of <see cref="IsDoubleMetaphoneEqual(string, string, bool)"/>
+        /// with <c>alternate</c> set to <c>false</c>.</returns>
+        public virtual bool IsDoubleMetaphoneEqual(string value1, string value2)
+        {
+            const bool useAlternate = false;
+            return IsDoubleMetaphoneEqual(value1, value2, useAlternate);
+        }
+
+        /// <summary>
+        /// Checks whether two <see cref="string"/> values have the same Double Metaphone
+        /// encoding, optionally comparing the alternate encodings instead of the primary ones.
+        /// </summary>
+        /// <param name="value1">The first value to encode and compare.</param>
+        /// <param name="value2">The second value to encode and compare.</param>
+        /// <param name="alternate"><c>true</c> to compare the alternate encodings;
+        /// <c>false</c> to compare the primary encodings.</param>
+        /// <returns><c>true</c> if both encodings are equal (including the case where both
+        /// are <c>null</c>); <c>false</c> otherwise.</returns>
+        public virtual bool IsDoubleMetaphoneEqual(string value1, string value2, bool alternate)
+        {
+            string code1 = GetDoubleMetaphone(value1, alternate);
+            string code2 = GetDoubleMetaphone(value2, alternate);
+
+            // GetDoubleMetaphone returns null when the cleaned input is null, so calling
+            // code1.Equals(...) on the instance could throw a NullReferenceException.
+            // The static ordinal comparison accepts null on either side.
+            return string.Equals(code1, code2, StringComparison.Ordinal);
+        }
+
+        /// <summary>
+        /// Gets or sets the maximum length of an encoding produced by this encoder.
+        /// </summary>
+        public virtual int MaxCodeLen
+        {
+            get
+            {
+                return maxCodeLen;
+            }
+            set
+            {
+                maxCodeLen = value;
+            }
+        }
+
+        //-- BEGIN HANDLERS --//
+
+        /// <summary>
+        /// Handles the 'A', 'E', 'I', 'O', 'U' and 'Y' cases: appends 'A' only when the
+        /// letter sits at index 0.
+        /// </summary>
+        /// <param name="result">The result being built.</param>
+        /// <param name="index">Index of the letter being handled.</param>
+        /// <returns>The index of the next character to process.</returns>
+        private int HandleAEIOUY(DoubleMetaphoneResult result, int index)
+        {
+            bool atStart = index == 0;
+            if (atStart)
+            {
+                result.Append('A');
+            }
+
+            int next = index + 1;
+            return next;
+        }
+
+        /// <summary>
+        /// Handles 'C' cases. The rules are tried in order; the first one that matches
+        /// decides what is appended and how many characters are consumed.
+        /// </summary>
+        /// <param name="value">The cleaned input being encoded.</param>
+        /// <param name="result">The result being built.</param>
+        /// <param name="index">Index of the 'C' being handled.</param>
+        /// <returns>The index of the next character to process.</returns>
+        private int HandleC(string value, DoubleMetaphoneResult result, int index)
+        {
+            if (ConditionC0(value, index))
+            {
+                // The condition is involved enough to live in its own method.
+                result.Append('K');
+                return index + 2;
+            }
+
+            if (index == 0 && Contains(value, index, 6, "CAESAR"))
+            {
+                result.Append('S');
+                return index + 2;
+            }
+
+            if (Contains(value, index, 2, "CH"))
+            {
+                return HandleCH(value, result, index);
+            }
+
+            if (Contains(value, index, 2, "CZ") && !Contains(value, index - 2, 4, "WICZ"))
+            {
+                // e.g. "Czerny"
+                result.Append('S', 'X');
+                return index + 2;
+            }
+
+            if (Contains(value, index + 1, 3, "CIA"))
+            {
+                // e.g. "focaccia"
+                result.Append('X');
+                return index + 3;
+            }
+
+            if (Contains(value, index, 2, "CC") && (index != 1 || CharAt(value, 0) != 'M'))
+            {
+                // Double "cc", but not "McClelland"
+                return HandleCC(value, result, index);
+            }
+
+            if (Contains(value, index, 2, "CK", "CG", "CQ"))
+            {
+                result.Append('K');
+                return index + 2;
+            }
+
+            if (Contains(value, index, 2, "CI", "CE", "CY"))
+            {
+                // Italian vs. English
+                bool italianForm = Contains(value, index, 3, "CIO", "CIE", "CIA");
+                if (italianForm)
+                {
+                    result.Append('S', 'X');
+                }
+                else
+                {
+                    result.Append('S');
+                }
+                return index + 2;
+            }
+
+            // No special rule matched: encode as 'K' and work out how much to skip.
+            result.Append('K');
+            if (Contains(value, index + 1, 2, " C", " Q", " G"))
+            {
+                // e.g. "Mac Caffrey", "Mac Gregor"
+                return index + 3;
+            }
+
+            if (Contains(value, index + 1, 1, "C", "K", "Q") &&
+                !Contains(value, index + 1, 2, "CE", "CI"))
+            {
+                return index + 2;
+            }
+
+            return index + 1;
+        }
+
+        /// <summary>
+        /// Handles 'CC' cases.
+        /// </summary>
+        /// <param name="value">The cleaned input being encoded.</param>
+        /// <param name="result">The result being built.</param>
+        /// <param name="index">Index of the first 'C' of the pair.</param>
+        /// <returns>The index of the next character to process.</returns>
+        private int HandleCC(string value, DoubleMetaphoneResult result, int index)
+        {
+            bool followedByIEH = Contains(value, index + 2, 1, "I", "E", "H") &&
+                                 !Contains(value, index + 2, 2, "HU");
+            if (!followedByIEH)
+            {
+                // Pierce's rule
+                result.Append('K');
+                return index + 2;
+            }
+
+            // "bellocchio" but not "bacchus"
+            if ((index == 1 && CharAt(value, index - 1) == 'A') ||
+                Contains(value, index - 1, 5, "UCCEE", "UCCES"))
+            {
+                // "accident", "accede", "succeed"
+                result.Append("KS");
+            }
+            else
+            {
+                // "bacci", "bertucci", other Italian
+                result.Append('X');
+            }
+            return index + 3;
+        }
+
+        /// <summary>
+        /// Handles 'CH' cases. Every rule consumes exactly two characters.
+        /// </summary>
+        /// <param name="value">The cleaned input being encoded.</param>
+        /// <param name="result">The result being built.</param>
+        /// <param name="index">Index of the 'C' in "CH".</param>
+        /// <returns>The index of the next character to process (always <c>index + 2</c>).</returns>
+        private int HandleCH(string value, DoubleMetaphoneResult result, int index)
+        {
+            int next = index + 2;
+
+            if (index > 0 && Contains(value, index, 4, "CHAE"))
+            {
+                // e.g. "Michael"
+                result.Append('K', 'X');
+                return next;
+            }
+
+            // Greek roots ("chemistry", "chorus", etc.), then Germanic, Greek,
+            // or otherwise 'ch' for 'kh' sound.
+            if (ConditionCH0(value, index) || ConditionCH1(value, index))
+            {
+                result.Append('K');
+                return next;
+            }
+
+            if (index <= 0)
+            {
+                result.Append('X');
+            }
+            else if (Contains(value, 0, 2, "MC"))
+            {
+                result.Append('K');
+            }
+            else
+            {
+                result.Append('X', 'K');
+            }
+            return next;
+        }
+
+        /// <summary>
+        /// Handles 'D' cases.
+        /// </summary>
+        /// <param name="value">The cleaned input being encoded.</param>
+        /// <param name="result">The result being built.</param>
+        /// <param name="index">Index of the 'D' being handled.</param>
+        /// <returns>The index of the next character to process.</returns>
+        private int HandleD(string value, DoubleMetaphoneResult result, int index)
+        {
+            if (Contains(value, index, 2, "DG"))
+            {
+                if (Contains(value, index + 2, 1, "I", "E", "Y"))
+                {
+                    // e.g. "Edge"
+                    result.Append('J');
+                    return index + 3;
+                }
+
+                // e.g. "Edgar"
+                result.Append("TK");
+                return index + 2;
+            }
+
+            // Plain 'T'; "DT" and "DD" are consumed as a pair.
+            bool pair = Contains(value, index, 2, "DT", "DD");
+            result.Append('T');
+            return index + (pair ? 2 : 1);
+        }
+
+        /// <summary>
+        /// Handles 'G' cases. The rules are tried in order; the first one that matches
+        /// decides what is appended and how many characters are consumed.
+        /// </summary>
+        /// <param name="value">The cleaned input being encoded.</param>
+        /// <param name="result">The result being built.</param>
+        /// <param name="index">Index of the 'G' being handled.</param>
+        /// <param name="slavoGermanic">The Slavo-Germanic flag computed for the whole input.</param>
+        /// <returns>The index of the next character to process.</returns>
+        private int HandleG(string value, DoubleMetaphoneResult result, int index,
+                            bool slavoGermanic)
+        {
+            if (CharAt(value, index + 1) == 'H')
+            {
+                return HandleGH(value, result, index);
+            }
+
+            if (CharAt(value, index + 1) == 'N')
+            {
+                if (index == 1 && IsVowel(CharAt(value, 0)) && !slavoGermanic)
+                {
+                    result.Append("KN", "N");
+                }
+                // NOTE(review): the character at index + 1 was just matched as 'N', so the
+                // "!= 'Y'" test below looks always true; kept unchanged to preserve the encoding.
+                else if (!Contains(value, index + 2, 2, "EY") &&
+                         CharAt(value, index + 1) != 'Y' && !slavoGermanic)
+                {
+                    result.Append("N", "KN");
+                }
+                else
+                {
+                    result.Append("KN");
+                }
+                return index + 2;
+            }
+
+            if (Contains(value, index + 1, 2, "LI") && !slavoGermanic)
+            {
+                result.Append("KL", "L");
+                return index + 2;
+            }
+
+            // -ges-, -gep-, -gel-, -gie- at beginning
+            bool initialSoft = index == 0 &&
+                               (CharAt(value, index + 1) == 'Y' ||
+                                Contains(value, index + 1, 2, ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER));
+            if (initialSoft)
+            {
+                result.Append('K', 'J');
+                return index + 2;
+            }
+
+            // -ger-, -gy-
+            if ((Contains(value, index + 1, 2, "ER") ||
+                 CharAt(value, index + 1) == 'Y') &&
+                !Contains(value, 0, 6, "DANGER", "RANGER", "MANGER") &&
+                !Contains(value, index - 1, 1, "E", "I") &&
+                !Contains(value, index - 1, 3, "RGY", "OGY"))
+            {
+                result.Append('K', 'J');
+                return index + 2;
+            }
+
+            if (Contains(value, index + 1, 1, "E", "I", "Y") ||
+                Contains(value, index - 1, 4, "AGGI", "OGGI"))
+            {
+                // Italian "biaggi"
+                if (Contains(value, 0, 4, "VAN ", "VON ") ||
+                    Contains(value, 0, 3, "SCH") ||
+                    Contains(value, index + 1, 2, "ET"))
+                {
+                    // obvious germanic
+                    result.Append('K');
+                }
+                else if (Contains(value, index + 1, 3, "IER"))
+                {
+                    result.Append('J');
+                }
+                else
+                {
+                    result.Append('J', 'K');
+                }
+                return index + 2;
+            }
+
+            // Plain 'K'; a doubled "GG" is consumed as a pair.
+            bool doubled = CharAt(value, index + 1) == 'G';
+            result.Append('K');
+            return index + (doubled ? 2 : 1);
+        }
+
+        /// <summary>
+        /// Handles 'GH' cases. Every rule consumes exactly two characters; some append nothing.
+        /// </summary>
+        /// <param name="value">The cleaned input being encoded.</param>
+        /// <param name="result">The result being built.</param>
+        /// <param name="index">Index of the 'G' in "GH".</param>
+        /// <returns>The index of the next character to process (always <c>index + 2</c>).</returns>
+        private int HandleGH(string value, DoubleMetaphoneResult result, int index)
+        {
+            int next = index + 2;
+
+            if (index > 0 && !IsVowel(CharAt(value, index - 1)))
+            {
+                result.Append('K');
+                return next;
+            }
+
+            if (index == 0)
+            {
+                if (CharAt(value, index + 2) == 'I')
+                {
+                    result.Append('J');
+                }
+                else
+                {
+                    result.Append('K');
+                }
+                return next;
+            }
+
+            // Parker's rule (with some further refinements), e.g. "hugh": nothing is appended.
+            bool silent =
+                (index > 1 && Contains(value, index - 2, 1, "B", "H", "D")) ||
+                (index > 2 && Contains(value, index - 3, 1, "B", "H", "D")) ||
+                (index > 3 && Contains(value, index - 4, 1, "B", "H"));
+            if (silent)
+            {
+                return next;
+            }
+
+            if (index > 2 && CharAt(value, index - 1) == 'U' &&
+                Contains(value, index - 3, 1, "C", "G", "L", "R", "T"))
+            {
+                // "laugh", "McLaughlin", "cough", "gough", "rough", "tough"
+                result.Append('F');
+            }
+            else if (index > 0 && CharAt(value, index - 1) != 'I')
+            {
+                result.Append('K');
+            }
+            return next;
+        }
+
+        /// <summary>
+        /// Handles 'H' cases: the 'H' is kept only when it is first and before a vowel,
+        /// or when it sits between two vowels.
+        /// </summary>
+        /// <param name="value">The cleaned input being encoded.</param>
+        /// <param name="result">The result being built.</param>
+        /// <param name="index">Index of the 'H' being handled.</param>
+        /// <returns>The index of the next character to process.</returns>
+        private int HandleH(string value, DoubleMetaphoneResult result, int index)
+        {
+            bool startOrAfterVowel = index == 0 || IsVowel(CharAt(value, index - 1));
+            if (startOrAfterVowel && IsVowel(CharAt(value, index + 1)))
+            {
+                // The vowel that follows is consumed together with the 'H'.
+                result.Append('H');
+                return index + 2;
+            }
+
+            // Otherwise nothing is appended for this 'H'.
+            return index + 1;
+        }
+
+        /// <summary>
+        /// Handles 'J' cases. Input containing "JOSE" at <paramref name="index"/> or starting
+        /// with "SAN " is handled by the first branch and always advances by one character;
+        /// every other input goes through the general rules, which skip a doubled "JJ".
+        /// </summary>
+        /// <param name="value">The cleaned input being encoded.</param>
+        /// <param name="result">The result being built.</param>
+        /// <param name="index">Index of the 'J' being handled.</param>
+        /// <param name="slavoGermanic">The Slavo-Germanic flag computed for the whole input.</param>
+        /// <returns>The index of the next character to process.</returns>
+        private int HandleJ(string value, DoubleMetaphoneResult result, int 
index,
+                            bool slavoGermanic)
+        {
+            if (Contains(value, index, 4, "JOSE") || Contains(value, 0, 4, 
"SAN "))
+            {
+                //-- obvious Spanish, "Jose", "San Jacinto" --//
+                // NOTE(review): '&&' binds tighter than '||', so the test below reads as
+                // (index == 0 && next-after-"JOSE" is a space) || value.Length == 4 || starts with "SAN ".
+                // The length check therefore applies at any index - confirm this is intended.
+                if ((index == 0 && (CharAt(value, index + 4) == ' ') ||
+                     value.Length == 4) || Contains(value, 0, 4, "SAN "))
+                {
+                    result.Append('H');
+                }
+                else
+                {
+                    result.Append('J', 'H');
+                }
+                index++;
+            }
+            else
+            {
+                // NOTE(review): "JOSE" at index was already ruled out by the outer condition,
+                // so the second operand below looks redundant here.
+                if (index == 0 && !Contains(value, index, 4, "JOSE"))
+                {
+                    result.Append('J', 'A');
+                }
+                else if (IsVowel(CharAt(value, index - 1)) && !slavoGermanic &&
+                         (CharAt(value, index + 1) == 'A' || CharAt(value, 
index + 1) == 'O'))
+                {
+                    result.Append('J', 'H');
+                }
+                else if (index == value.Length - 1)
+                {
+                    // 'J' is the last character of the input.
+                    result.Append('J', ' ');
+                }
+                else if (!Contains(value, index + 1, 1, L_T_K_S_N_M_B_Z) &&
+                         !Contains(value, index - 1, 1, "S", "K", "L"))
+                {
+                    result.Append('J');
+                }
+                // When none of the four tests above matches, nothing is appended.
+
+                // A doubled "JJ" is consumed as a pair.
+                if (CharAt(value, index + 1) == 'J')
+                {
+                    index += 2;
+                }
+                else
+                {
+                    index++;
+                }
+            }
+            return index;
+        }
+
+        /// <summary>
+        /// Handles 'L' cases. A doubled "LL" is consumed as a pair and, when
+        /// <c>ConditionL0</c> holds, is appended through <c>AppendPrimary</c> only.
+        /// </summary>
+        /// <param name="value">The cleaned input being encoded.</param>
+        /// <param name="result">The result being built.</param>
+        /// <param name="index">Index of the 'L' being handled.</param>
+        /// <returns>The index of the next character to process.</returns>
+        private int HandleL(string value, DoubleMetaphoneResult result, int index)
+        {
+            if (CharAt(value, index + 1) != 'L')
+            {
+                // Single 'L'
+                result.Append('L');
+                return index + 1;
+            }
+
+            if (ConditionL0(value, index))
+            {
+                result.AppendPrimary('L');
+            }
+            else
+            {
+                result.Append('L');
+            }
+            return index + 2;
+        }
+
+        /// <summary>
+        /// Handles 'P' cases: "PH" is appended as 'F'; otherwise 'P' is appended and a
+        /// following 'P' or 'B' is skipped.
+        /// </summary>
+        /// <param name="value">The cleaned input being encoded.</param>
+        /// <param name="result">The result being built.</param>
+        /// <param name="index">Index of the 'P' being handled.</param>
+        /// <returns>The index of the next character to process.</returns>
+        private int HandleP(string value, DoubleMetaphoneResult result, int index)
+        {
+            if (CharAt(value, index + 1) == 'H')
+            {
+                result.Append('F');
+                return index + 2;
+            }
+
+            result.Append('P');
+            if (Contains(value, index + 1, 1, "P", "B"))
+            {
+                return index + 2;
+            }
+            return index + 1;
+        }
+
+        /// <summary>
+        /// Handles 'R' cases. A final 'R' preceded by "IE" (but not by "ME"/"MA" two
+        /// characters earlier) in non-Slavo-Germanic input is appended through
+        /// <c>AppendAlternate</c> only; every other 'R' is appended normally.
+        /// A doubled "RR" is consumed as a pair.
+        /// </summary>
+        /// <param name="value">The cleaned input being encoded.</param>
+        /// <param name="result">The result being built.</param>
+        /// <param name="index">Index of the 'R' being handled.</param>
+        /// <param name="slavoGermanic">The Slavo-Germanic flag computed for the whole input.</param>
+        /// <returns>The index of the next character to process.</returns>
+        private int HandleR(string value, DoubleMetaphoneResult result, int index,
+                            bool slavoGermanic)
+        {
+            bool alternateOnly = index == value.Length - 1 && !slavoGermanic &&
+                                 Contains(value, index - 2, 2, "IE") &&
+                                 !Contains(value, index - 4, 2, "ME", "MA");
+            if (alternateOnly)
+            {
+                result.AppendAlternate('R');
+            }
+            else
+            {
+                result.Append('R');
+            }
+
+            int step = CharAt(value, index + 1) == 'R' ? 2 : 1;
+            return index + step;
+        }
+
+        /// <summary>
+        /// Handles 'S' cases. The rules are tried in order; the first one that matches
+        /// decides what is appended and how many characters are consumed.
+        /// </summary>
+        /// <param name="value">The cleaned input being encoded.</param>
+        /// <param name="result">The result being built.</param>
+        /// <param name="index">Index of the 'S' being handled.</param>
+        /// <param name="slavoGermanic">The Slavo-Germanic flag computed for the whole input.</param>
+        /// <returns>The index of the next character to process.</returns>
+        private int HandleS(string value, DoubleMetaphoneResult result, int index,
+                            bool slavoGermanic)
+        {
+            if (Contains(value, index - 1, 3, "ISL", "YSL"))
+            {
+                // Special cases "island", "isle", "carlisle", "carlysle": nothing is appended.
+                return index + 1;
+            }
+
+            if (index == 0 && Contains(value, index, 5, "SUGAR"))
+            {
+                // Special case "sugar-"
+                result.Append('X', 'S');
+                return index + 1;
+            }
+
+            if (Contains(value, index, 2, "SH"))
+            {
+                bool germanic = Contains(value, index + 1, 4, "HEIM", "HOEK", "HOLM", "HOLZ");
+                if (germanic)
+                {
+                    result.Append('S');
+                }
+                else
+                {
+                    result.Append('X');
+                }
+                return index + 2;
+            }
+
+            if (Contains(value, index, 3, "SIO", "SIA") || Contains(value, index, 4, "SIAN"))
+            {
+                // Italian and Armenian
+                if (slavoGermanic)
+                {
+                    result.Append('S');
+                }
+                else
+                {
+                    result.Append('S', 'X');
+                }
+                return index + 3;
+            }
+
+            if ((index == 0 && Contains(value, index + 1, 1, "M", "N", "L", "W")) ||
+                Contains(value, index + 1, 1, "Z"))
+            {
+                // German and anglicisations, e.g. "smith" match "schmidt", "snider" match
+                // "schneider"; also -sz- in Slavic languages, although in Hungarian it is
+                // pronounced "s".
+                result.Append('S', 'X');
+                if (Contains(value, index + 1, 1, "Z"))
+                {
+                    return index + 2;
+                }
+                return index + 1;
+            }
+
+            if (Contains(value, index, 2, "SC"))
+            {
+                return HandleSC(value, result, index);
+            }
+
+            if (index == value.Length - 1 && Contains(value, index - 2, 2, "AI", "OI"))
+            {
+                // French, e.g. "resnais", "artois"
+                result.AppendAlternate('S');
+            }
+            else
+            {
+                result.Append('S');
+            }
+
+            if (Contains(value, index + 1, 1, "S", "Z"))
+            {
+                return index + 2;
+            }
+            return index + 1;
+        }
+
+        /// <summary>
+        /// Handles 'SC' cases. Every rule consumes exactly three characters.
+        /// </summary>
+        /// <param name="value">The cleaned input being encoded.</param>
+        /// <param name="result">The result being built.</param>
+        /// <param name="index">Index of the 'S' in "SC".</param>
+        /// <returns>The index of the next character to process (always <c>index + 3</c>).</returns>
+        private int HandleSC(string value, DoubleMetaphoneResult result, int index)
+        {
+            int next = index + 3;
+
+            if (CharAt(value, index + 2) != 'H')
+            {
+                if (Contains(value, index + 2, 1, "I", "E", "Y"))
+                {
+                    result.Append('S');
+                }
+                else
+                {
+                    result.Append("SK");
+                }
+                return next;
+            }
+
+            // Schlesinger's rule
+            if (Contains(value, index + 3, 2, "OO", "ER", "EN", "UY", "ED", "EM"))
+            {
+                // Dutch origin, e.g. "school", "schooner"
+                if (Contains(value, index + 3, 2, "ER", "EN"))
+                {
+                    // "schermerhorn", "schenker"
+                    result.Append("X", "SK");
+                }
+                else
+                {
+                    result.Append("SK");
+                }
+                return next;
+            }
+
+            if (index == 0 && !IsVowel(CharAt(value, 3)) && CharAt(value, 3) != 'W')
+            {
+                result.Append('X', 'S');
+            }
+            else
+            {
+                result.Append('X');
+            }
+            return next;
+        }
+
+        /// <summary>
+        /// Handles 'T' cases.
+        /// </summary>
+        /// <param name="value">The cleaned input being encoded.</param>
+        /// <param name="result">The result being built.</param>
+        /// <param name="index">Index of the 'T' being handled.</param>
+        /// <returns>The index of the next character to process.</returns>
+        private int HandleT(string value, DoubleMetaphoneResult result, int index)
+        {
+            // "TION", "TIA" and "TCH" all append 'X' and consume three characters.
+            if (Contains(value, index, 4, "TION") || Contains(value, index, 3, "TIA", "TCH"))
+            {
+                result.Append('X');
+                return index + 3;
+            }
+
+            if (Contains(value, index, 2, "TH") || Contains(value, index, 3, "TTH"))
+            {
+                // Special case "thomas", "thames" or germanic
+                bool plainT = Contains(value, index + 2, 2, "OM", "AM") ||
+                              Contains(value, 0, 4, "VAN ", "VON ") ||
+                              Contains(value, 0, 3, "SCH");
+                if (plainT)
+                {
+                    result.Append('T');
+                }
+                else
+                {
+                    result.Append('0', 'T');
+                }
+                return index + 2;
+            }
+
+            // Plain 'T'; a following 'T' or 'D' is skipped.
+            result.Append('T');
+            if (Contains(value, index + 1, 1, "T", "D"))
+            {
+                return index + 2;
+            }
+            return index + 1;
+        }
+
+        /// <summary>
+        /// Handles 'W' cases.
+        /// </summary>
+        private int HandleW(string value, DoubleMetaphoneResult result, int 
index)
+        {
+            if (Contains(value, index, 2, "WR"))
+            {
+                //-- can also be in middle of word --//
+                result.Append('R');
+                index += 2;
+            }
+            else
+            {
+                if (index == 0 && (IsVowel(CharAt(value, index + 1)) ||
+                                   Contains(value, index, 2, "WH")))
+                {
+                    if (IsVowel(CharAt(value, index + 1)))
+                    {
+                        //-- Wasserman should match Vasserman --//
+                        result.Append('A', 'F');
+                    }
+                    else
+                    {
+                        //-- need Uomo to match Womo --//
+                        result.Append('A');
+                    }
+                    index++;
+                }
+                else if ((index == value.Length - 1 && IsVowel(CharAt(value, 
index - 1))) ||
+                         Contains(value, index - 1, 5, "EWSKI", "EWSKY", 
"OWSKI", "OWSKY") ||
+                         Contains(value, 0, 3, "SCH"))
+                {
+                    //-- Arnow should match Arnoff --//
+                    result.AppendAlternate('F');
+                    index++;
+                }
+                else if (Contains(value, index, 4, "WICZ", "WITZ"))
+                {
+                    //-- Polish e.g. "filipowicz" --//
+                    result.Append("TS", "FX");
+                    index += 4;
+                }
+                else
+                {
+                    index++;
+                }
+            }
+            return index;
+        }
+
+        /// <summary>
+        /// Handles 'X' cases.
+        /// </summary>
+        private int HandleX(string value, DoubleMetaphoneResult result, int 
index)
+        {
+            if (index == 0)
+            {
+                result.Append('S');
+                index++;
+            }
+            else
+            {
+                if (!((index == value.Length - 1) &&
+                      (Contains(value, index - 3, 3, "IAU", "EAU") ||
+                       Contains(value, index - 2, 2, "AU", "OU"))))
+                {
+                    //-- French e.g. breaux --//
+                    result.Append("KS");
+                }
+                index = Contains(value, index + 1, 1, "C", "X") ? index + 2 : 
index + 1;
+            }
+            return index;
+        }
+
+        /// <summary>
+        /// Handles 'Z' cases.
+        /// </summary>
+        private int HandleZ(string value, DoubleMetaphoneResult result, int 
index,
+                            bool slavoGermanic)
+        {
+            if (CharAt(value, index + 1) == 'H')
+            {
+                //-- Chinese pinyin e.g. "zhao" or Angelina "Zhang" --//
+                result.Append('J');
+                index += 2;
+            }
+            else
+            {
+                if (Contains(value, index + 1, 2, "ZO", "ZI", "ZA") ||
+                    (slavoGermanic && (index > 0 && CharAt(value, index - 1) 
!= 'T')))
+                {
+                    result.Append("S", "TS");
+                }
+                else
+                {
+                    result.Append('S');
+                }
+                index = CharAt(value, index + 1) == 'Z' ? index + 2 : index + 
1;
+            }
+            return index;
+        }
+
+        //-- BEGIN CONDITIONS --//
+
+        /// <summary>
+        /// Complex condition 0 for 'C'.
+        /// </summary>
+        private bool ConditionC0(string value, int index)
+        {
+            if (Contains(value, index, 4, "CHIA"))
+            {
+                return true;
+            }
+            else if (index <= 1)
+            {
+                return false;
+            }
+            else if (IsVowel(CharAt(value, index - 2)))
+            {
+                return false;
+            }
+            else if (!Contains(value, index - 1, 3, "ACH"))
+            {
+                return false;
+            }
+            else
+            {
+                char c = CharAt(value, index + 2);
+                return (c != 'I' && c != 'E') ||
+                        Contains(value, index - 2, 6, "BACHER", "MACHER");
+            }
+        }
+
+        /// <summary>
+        /// Complex condition 0 for 'CH'.
+        /// </summary>
+        private bool ConditionCH0(string value, int index)
+        {
+            if (index != 0)
+            {
+                return false;
+            }
+            else if (!Contains(value, index + 1, 5, "HARAC", "HARIS") &&
+                     !Contains(value, index + 1, 3, "HOR", "HYM", "HIA", 
"HEM"))
+            {
+                return false;
+            }
+            else if (Contains(value, 0, 5, "CHORE"))
+            {
+                return false;
+            }
+            else
+            {
+                return true;
+            }
+        }
+
+        /// <summary>
+        /// Complex condition 1 for 'CH'.
+        /// </summary>
+        private bool ConditionCH1(string value, int index)
+        {
+            return ((Contains(value, 0, 4, "VAN ", "VON ") || Contains(value, 
0, 3, "SCH")) ||
+                    Contains(value, index - 2, 6, "ORCHES", "ARCHIT", 
"ORCHID") ||
+                    Contains(value, index + 2, 1, "T", "S") ||
+                    ((Contains(value, index - 1, 1, "A", "O", "U", "E") || 
index == 0) &&
+                     (Contains(value, index + 2, 1, L_R_N_M_B_H_F_V_W_SPACE) 
|| index + 1 == value.Length - 1)));
+        }
+
+        /// <summary>
+        /// Complex condition 0 for 'L'.
+        /// </summary>
+        private bool ConditionL0(string value, int index)
+        {
+            if (index == value.Length - 3 &&
+                Contains(value, index - 1, 4, "ILLO", "ILLA", "ALLE"))
+            {
+                return true;
+            }
+            else if ((Contains(value, value.Length - 2, 2, "AS", "OS") ||
+                      Contains(value, value.Length - 1, 1, "A", "O")) &&
+                     Contains(value, index - 1, 4, "ALLE"))
+            {
+                return true;
+            }
+            else
+            {
+                return false;
+            }
+        }
+
+        /// <summary>
+        /// Complex condition 0 for 'M'.
+        /// </summary>
+        private bool ConditionM0(string value, int index)
+        {
+            if (CharAt(value, index + 1) == 'M')
+            {
+                return true;
+            }
+            return Contains(value, index - 1, 3, "UMB") &&
+                   ((index + 1) == value.Length - 1 || Contains(value, index + 
2, 2, "ER"));
+        }
+
+        //-- BEGIN HELPER FUNCTIONS --//
+
+        /// <summary>
+        /// Determines whether or not a value is of slavo-germanic origin. A 
value is
+        /// of slavo-germanic origin if it contains any of 'W', 'K', 'CZ', or 
'WITZ'.
+        /// </summary>
+        private bool IsSlavoGermanic(string value)
+        {
+            return value.IndexOf('W') > -1 || value.IndexOf('K') > -1 ||
+                value.IndexOf("CZ") > -1 || value.IndexOf("WITZ") > -1;
+        }
+
+        /// <summary>
+        /// Determines whether or not a character is a vowel.
+        /// </summary>
+        private bool IsVowel(char ch)
+        {
+            return VOWELS.IndexOf(ch) != -1;
+        }
+
+        /// <summary>
+        /// Determines whether or not the value starts with a silent letter.  
It will
+        /// return <c>true</c> if the value starts with any of 'GN', 'KN',
+        /// 'PN', 'WR' or 'PS'.
+        /// </summary>
+        private bool IsSilentStart(string value)
+        {
+            bool result = false;
+            foreach (string element in SILENT_START)
+            {
+                if (value.StartsWith(element, StringComparison.Ordinal))
+                {
+                    result = true;
+                    break;
+                }
+            }
+            return result;
+        }
+
+        /// <summary>
+        /// Cleans the input.
+        /// </summary>
+        private string CleanInput(string input)
+        {
+            if (input == null)
+            {
+                return null;
+            }
+            input = input.Trim();
+            if (input.Length == 0)
+            {
+                return null;
+            }
+            return new CultureInfo("en").TextInfo.ToUpper(input);
+        }
+
+        /// <summary>
+        /// Gets the character at index <paramref name="index"/> if available, 
otherwise
+        /// it returns <see cref="char.MinValue"/> so that there is some sort
+        /// of a default.
+        /// </summary>
+        protected virtual char CharAt(string value, int index)
+        {
+            if (index < 0 || index >= value.Length)
+            {
+                return char.MinValue;
+            }
+            return value[index];
+        }
+
+        /// <summary>
+        /// Determines whether <paramref name="value"/> contains any of the 
criteria starting at index <paramref name="start"/> and
+        /// matching up to length <paramref name="length"/>.
+        /// </summary>
+        protected static bool Contains(string value, int start, int length,
+                                          params string[] criteria)
+        {
+            bool result = false;
+            if (start >= 0 && start + length <= value.Length)
+            {
+                string target = value.Substring(start, length);
+
+                foreach (string element in criteria)
+                {
+                    if (target.Equals(element))
+                    {
+                        result = true;
+                        break;
+                    }
+                }
+            }
+            return result;
+        }
+
+        //-- BEGIN INNER CLASSES --//
+
+        /// <summary>
+        /// Inner class for storing results, since there is the optional 
alternate encoding.
+        /// </summary>
+        public class DoubleMetaphoneResult
+        {
+            private readonly StringBuilder primary;
+            private readonly StringBuilder alternate;
+            private readonly int maxLength;
+
+            public DoubleMetaphoneResult(int maxLength)
+            {
+                this.maxLength = maxLength;
+                this.primary = new StringBuilder(maxLength);
+                this.alternate = new StringBuilder(maxLength);
+            }
+
+            public virtual void Append(char value)
+            {
+                AppendPrimary(value);
+                AppendAlternate(value);
+            }
+
+            public virtual void Append(char primary, char alternate)
+            {
+                AppendPrimary(primary);
+                AppendAlternate(alternate);
+            }
+
+            public virtual void AppendPrimary(char value)
+            {
+                if (this.primary.Length < this.maxLength)
+                {
+                    this.primary.Append(value);
+                }
+            }
+
+            public virtual void AppendAlternate(char value)
+            {
+                if (this.alternate.Length < this.maxLength)
+                {
+                    this.alternate.Append(value);
+                }
+            }
+
+            public virtual void Append(string value)
+            {
+                AppendPrimary(value);
+                AppendAlternate(value);
+            }
+
+            public virtual void Append(string primary, string alternate)
+            {
+                AppendPrimary(primary);
+                AppendAlternate(alternate);
+            }
+
+            public virtual void AppendPrimary(string value)
+            {
+                int addChars = this.maxLength - this.primary.Length;
+                if (value.Length <= addChars)
+                {
+                    this.primary.Append(value);
+                }
+                else
+                {
+                    this.primary.Append(value.Substring(0, addChars - 0));
+                }
+            }
+
+            public virtual void AppendAlternate(string value)
+            {
+                int addChars = this.maxLength - this.alternate.Length;
+                if (value.Length <= addChars)
+                {
+                    this.alternate.Append(value);
+                }
+                else
+                {
+                    this.alternate.Append(value.Substring(0, addChars - 0));
+                }
+            }
+
+            public virtual string Primary
+            {
+                get { return this.primary.ToString(); }
+            }
+
+            public virtual string Alternate
+            {
+                get { return this.alternate.ToString(); }
+            }
+
+            public virtual bool IsComplete
+            {
+                get
+                {
+                    return this.primary.Length >= this.maxLength &&
+                     this.alternate.Length >= this.maxLength;
+                }
+            }
+        }
+    }
+}


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/MatchRatingApproachEncoder.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Analysis.Phonetic/Language/MatchRatingApproachEncoder.cs 
b/src/Lucene.Net.Analysis.Phonetic/Language/MatchRatingApproachEncoder.cs
new file mode 100644
index 0000000..c30e571
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/MatchRatingApproachEncoder.cs
@@ -0,0 +1,425 @@
+// commons-codec version compatibility level: 1.9
+using System;
+using System.Globalization;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Analysis.Phonetic.Language
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Match Rating Approach Phonetic Algorithm Developed by <c>Western 
Airlines</c> in 1977.
+    /// <para/>
+    /// This class is immutable and thread-safe.
+    /// <para/>
+    /// See: <a 
href="http://en.wikipedia.org/wiki/Match_rating_approach">Wikipedia - Match 
Rating Approach</a>
+    /// <para/>
+    /// since 1.8
+    /// </summary>
+    public class MatchRatingApproachEncoder : IStringEncoder
+    {
+        private static readonly string SPACE = " ";
+
+        private static readonly string EMPTY = "";
+
+        /// <summary>
+        /// Constants used mainly for the min rating value.
+        /// </summary>
+        private static readonly int ONE = 1, TWO = 2, THREE = 3, FOUR = 4, 
FIVE = 5, SIX = 6, SEVEN = 7, EIGHT = 8,
+                                 ELEVEN = 11, TWELVE = 12;
+
+        /// <summary>
+        /// The plain letter equivalent of the accented letters.
+        /// </summary>
+        private static readonly string PLAIN_ASCII = "AaEeIiOoUu" + // grave
+            "AaEeIiOoUuYy" + // acute
+            "AaEeIiOoUuYy" + // circumflex
+            "AaOoNn" + // tilde
+            "AaEeIiOoUuYy" + // umlaut
+            "Aa" + // ring
+            "Cc" + // cedilla
+            "OoUu"; // double acute
+
+        /// <summary>
+        /// Unicode characters corresponding to various accented letters. For 
example: \u00DA is U acute etc...
+        /// </summary>
+        private static readonly string UNICODE = 
"\u00C0\u00E0\u00C8\u00E8\u00CC\u00EC\u00D2\u00F2\u00D9\u00F9" +
+                
"\u00C1\u00E1\u00C9\u00E9\u00CD\u00ED\u00D3\u00F3\u00DA\u00FA\u00DD\u00FD" +
+                
"\u00C2\u00E2\u00CA\u00EA\u00CE\u00EE\u00D4\u00F4\u00DB\u00FB\u0176\u0177" +
+                "\u00C3\u00E3\u00D5\u00F5\u00D1\u00F1" +
+                
"\u00C4\u00E4\u00CB\u00EB\u00CF\u00EF\u00D6\u00F6\u00DC\u00FC\u0178\u00FF" +
+                "\u00C5\u00E5" + "\u00C7\u00E7" + "\u0150\u0151\u0170\u0171";
+
+        private static readonly string[] DOUBLE_CONSONANT =
+                new string[] { "BB", "CC", "DD", "FF", "GG", "HH", "JJ", "KK", 
"LL", "MM", "NN", "PP", "QQ", "RR", "SS",
+                           "TT", "VV", "WW", "XX", "YY", "ZZ" };
+
+        /// <summary>
+        /// Cleans up a name: 1. Upper-cases everything 2. Removes some common 
punctuation 3. Removes accents 4. Removes any
+        /// spaces.
+        /// </summary>
+        /// <param name="name">The name to be cleaned.</param>
+        /// <returns>The cleaned name.</returns>
+        internal string CleanName(string name)
+        {
+            string upperName = new CultureInfo("en").TextInfo.ToUpper(name);
+
+            string[] charsToTrim = { "\\-", "[&]", "\\'", "\\.", "[\\,]" };
+            foreach (string str in charsToTrim)
+            {
+                upperName = Regex.Replace(upperName, str, EMPTY);
+            }
+
+            upperName = RemoveAccents(upperName);
+            upperName = Regex.Replace(upperName, "\\s+", EMPTY);
+
+            return upperName;
+        }
+
+        // LUCENENET specific - in .NET we don't need an object overload, 
since strings are sealed anyway.
+        // **
+        // * Encodes an Object using the Match Rating Approach algorithm. 
Method is here to satisfy the requirements of the
+        // * Encoder interface. Throws an EncoderException if input object is 
not of type java.lang.string.
+        // *
+        // * @param pObject
+        // *            Object to encode
+        // * @return An object (or type java.lang.string) containing the Match 
Rating Approach code which corresponds to the
+        // *         string supplied.
+        // * @throws EncoderException
+        // *             if the parameter supplied is not of type 
java.lang.string
+        // */
+        //public Object encode(Object pObject) throws EncoderException
+        //{
+        //if (!(pObject instanceof string)) {
+        //        throw new EncoderException(
+        //                "Parameter supplied to Match Rating Approach encoder 
is not of type java.lang.string");
+        //    }
+        //return encode((string) pObject);
+        //}
+
+        /// <summary>
+        /// Encodes a string using the Match Rating Approach (MRA) algorithm.
+        /// </summary>
+        /// <param name="name">String to encode.</param>
+        /// <returns>The MRA code corresponding to the string 
supplied.</returns>
+        public string Encode(string name)
+        {
+            // Bulletproof for trivial input - NINO
+            if (name == null || EMPTY.Equals(name, 
StringComparison.OrdinalIgnoreCase) ||
+                SPACE.Equals(name, StringComparison.OrdinalIgnoreCase) || 
name.Length == 1)
+            {
+                return EMPTY;
+            }
+
+            // Preprocessing
+            name = CleanName(name);
+
+            // BEGIN: Actual encoding part of the algorithm...
+            // 1. Delete all vowels unless the vowel begins the word
+            name = RemoveVowels(name);
+
+            // 2. Remove second consonant from any double consonant
+            name = RemoveDoubleConsonants(name);
+
+            // 3. Reduce codex to 6 letters by joining the first 3 and last 3 
letters
+            name = GetFirst3Last3(name);
+
+            return name;
+        }
+
+        /// <summary>
+        /// Gets the first &amp; last 3 letters of a name (if &gt; 6 
characters); otherwise just returns the name.
+        /// </summary>
+        /// <param name="name">The string to get the substrings from.</param>
+        /// <returns>Annexed first &amp; last 3 letters of input 
word.</returns>
+        internal string GetFirst3Last3(string name)
+        {
+            int nameLength = name.Length;
+
+            if (nameLength > SIX)
+            {
+                string firstThree = name.Substring(0, THREE - 0);
+                string lastThree = name.Substring(nameLength - THREE, 
nameLength - (nameLength - THREE));
+                return firstThree + lastThree;
+            }
+            else
+            {
+                return name;
+            }
+        }
+
+        /// <summary>
+        /// Obtains the min rating of the length sum of the 2 names. In 
essence the larger the sum length the smaller the
+        /// min rating. Values strictly from documentation.
+        /// </summary>
+        /// <param name="sumLength">The length of 2 strings sent down.</param>
+        /// <returns>The min rating value.</returns>
+        internal int GetMinRating(int sumLength)
+        {
+            int minRating = 0;
+
+            if (sumLength <= FOUR)
+            {
+                minRating = FIVE;
+            }
+            else if (sumLength >= FIVE && sumLength <= SEVEN)
+            {
+                minRating = FOUR;
+            }
+            else if (sumLength >= EIGHT && sumLength <= ELEVEN)
+            {
+                minRating = THREE;
+            }
+            else if (sumLength == TWELVE)
+            {
+                minRating = TWO;
+            }
+            else
+            {
+                minRating = ONE; // docs said little here.
+            }
+
+            return minRating;
+        }
+
+        /// <summary>
+        /// Determines if two names are homophonous via Match Rating Approach 
(MRA) algorithm. It should be noted that the
+        /// strings are cleaned in the same way as <see 
cref="Encode(string)"/>.
+        /// </summary>
+        /// <param name="name1">First of the 2 strings (names) to 
compare.</param>
+        /// <param name="name2">Second of the 2 names to compare.</param>
+        /// <returns><c>true</c> if the encodings are identical <c>false</c> 
otherwise.</returns>
+        public virtual bool IsEncodeEquals(string name1, string name2)
+        {
+            // Bulletproof for trivial input - NINO
+            if (name1 == null || EMPTY.Equals(name1, 
StringComparison.OrdinalIgnoreCase) || SPACE.Equals(name1, 
StringComparison.OrdinalIgnoreCase))
+            {
+                return false;
+            }
+            else if (name2 == null || EMPTY.Equals(name2, 
StringComparison.OrdinalIgnoreCase) || SPACE.Equals(name2, 
StringComparison.OrdinalIgnoreCase))
+            {
+                return false;
+            }
+            else if (name1.Length == 1 || name2.Length == 1)
+            {
+                return false;
+            }
+            else if (name1.Equals(name2, StringComparison.OrdinalIgnoreCase))
+            {
+                return true;
+            }
+
+            // Preprocessing
+            name1 = CleanName(name1);
+            name2 = CleanName(name2);
+
+            // Actual MRA Algorithm
+
+            // 1. Remove vowels
+            name1 = RemoveVowels(name1);
+            name2 = RemoveVowels(name2);
+
+            // 2. Remove double consonants
+            name1 = RemoveDoubleConsonants(name1);
+            name2 = RemoveDoubleConsonants(name2);
+
+            // 3. Reduce down to at most 6 letters (first 3 + last 3)
+            name1 = GetFirst3Last3(name1);
+            name2 = GetFirst3Last3(name2);
+
+            // 4. Check for length difference - if 3 or greater then no 
similarity
+            // comparison is done
+            if (Math.Abs(name1.Length - name2.Length) >= THREE)
+            {
+                return false;
+            }
+
+            // 5. Obtain the minimum rating value by calculating the length 
sum of the
+            // encoded strings and sending it down.
+            int sumLength = Math.Abs(name1.Length + name2.Length);
+            int minRating = 0;
+            minRating = GetMinRating(sumLength);
+
+            // 6. Process the encoded strings from left to right and remove any
+            // identical characters found from both strings respectively.
+            int count = LeftToRightThenRightToLeftProcessing(name1, name2);
+
+            // 7. Each PNI item that has a similarity rating equal to or 
greater than
+            // the min is considered to be a good candidate match
+            return count >= minRating;
+
+        }
+
+        /// <summary>
+        /// Determines if a letter is a vowel.
+        /// </summary>
+        /// <param name="letter">The letter under investigation.</param>
+        /// <returns><c>true</c> if a vowel, else <c>false</c>.</returns>
+        internal bool IsVowel(string letter)
+        {
+            return letter.Equals("E", StringComparison.OrdinalIgnoreCase) || 
letter.Equals("A", StringComparison.OrdinalIgnoreCase) || letter.Equals("O", 
StringComparison.OrdinalIgnoreCase) ||
+                   letter.Equals("I", StringComparison.OrdinalIgnoreCase) || 
letter.Equals("U", StringComparison.OrdinalIgnoreCase);
+        }
+
+        /// <summary>
+        /// Processes the names from left to right (first) then right to left 
removing identical letters in same positions.
+        /// Then subtracts the length of the longer remaining string from 6 and 
returns the result.
+        /// </summary>
+        /// <param name="name1"></param>
+        /// <param name="name2"></param>
+        /// <returns></returns>
+        internal int LeftToRightThenRightToLeftProcessing(string name1, string 
name2)
+        {
+            char[] name1Char = name1.ToCharArray();
+            char[] name2Char = name2.ToCharArray();
+
+            int name1Size = name1.Length - 1;
+            int name2Size = name2.Length - 1;
+
+            string name1LtRStart = EMPTY;
+            string name1LtREnd = EMPTY;
+
+            string name2RtLStart = EMPTY;
+            string name2RtLEnd = EMPTY;
+
+            for (int i = 0; i < name1Char.Length; i++)
+            {
+                if (i > name2Size)
+                {
+                    break;
+                }
+
+                name1LtRStart = name1.Substring(i, 1);
+                name1LtREnd = name1.Substring(name1Size - i, 1);
+
+                name2RtLStart = name2.Substring(i, 1);
+                name2RtLEnd = name2.Substring(name2Size - i, 1);
+
+                // Left to right...
+                if (name1LtRStart.Equals(name2RtLStart, 
StringComparison.Ordinal))
+                {
+                    name1Char[i] = ' ';
+                    name2Char[i] = ' ';
+                }
+
+                // Right to left...
+                if (name1LtREnd.Equals(name2RtLEnd, StringComparison.Ordinal))
+                {
+                    name1Char[name1Size - i] = ' ';
+                    name2Char[name2Size - i] = ' ';
+                }
+            }
+
+            // Char arrays -> string & remove extraneous space
+            string strA = Regex.Replace(new string(name1Char), "\\s+", EMPTY);
+            string strB = Regex.Replace(new string(name2Char), "\\s+", EMPTY);
+
+            // Final bit - subtract longest string from 6 and return this int 
value
+            if (strA.Length > strB.Length)
+            {
+                return Math.Abs(SIX - strA.Length);
+            }
+            else
+            {
+                return Math.Abs(SIX - strB.Length);
+            }
+        }
+
+        /// <summary>
+        /// Removes accented letters and replaces with non-accented ascii 
equivalent. Case is preserved.
+        /// 
http://www.codecodex.com/wiki/Remove_accent_from_letters_%28ex_.%C3%A9_to_e%29
+        /// </summary>
+        /// <param name="accentedWord">The word that may have accents in 
it.</param>
+        /// <returns>De-accented word.</returns>
+        internal string RemoveAccents(string accentedWord)
+        {
+            if (accentedWord == null)
+            {
+                return null;
+            }
+
+            StringBuilder sb = new StringBuilder();
+            int n = accentedWord.Length;
+
+            for (int i = 0; i < n; i++)
+            {
+                char c = accentedWord[i];
+                int pos = UNICODE.IndexOf(c);
+                if (pos > -1)
+                {
+                    sb.Append(PLAIN_ASCII[pos]);
+                }
+                else
+                {
+                    sb.Append(c);
+                }
+            }
+
+            return sb.ToString();
+        }
+
+        /// <summary>
+        /// Replaces any double consonant pair with the single letter 
equivalent.
+        /// </summary>
+        /// <param name="name">String to have double consonants 
removed.</param>
+        /// <returns>Single consonant word.</returns>
+        internal string RemoveDoubleConsonants(string name)
+        {
+            string replacedName = name.ToUpperInvariant();
+            foreach (string dc in DOUBLE_CONSONANT)
+            {
+                if (replacedName.Contains(dc))
+                {
+                    string singleLetter = dc.Substring(0, 1 - 0);
+                    replacedName = replacedName.Replace(dc, singleLetter);
+                }
+            }
+            return replacedName;
+        }
+
+        /// <summary>
+        /// Deletes all vowels unless the vowel begins the word.
+        /// </summary>
+        /// <param name="name">The name to have vowels removed.</param>
+        /// <returns>De-voweled word.</returns>
+        internal string RemoveVowels(string name)
+        {
+            // Extract first letter
+            string firstLetter = name.Substring(0, 1 - 0);
+
+            name = Regex.Replace(name, "A", EMPTY);
+            name = Regex.Replace(name, "E", EMPTY);
+            name = Regex.Replace(name, "I", EMPTY);
+            name = Regex.Replace(name, "O", EMPTY);
+            name = Regex.Replace(name, "U", EMPTY);
+
+            name = Regex.Replace(name, "\\s{2,}\\b", SPACE);
+
+            // return isVowel(firstLetter) ? (firstLetter + name) : name;
+            if (IsVowel(firstLetter))
+            {
+                return firstLetter + name;
+            }
+            else
+            {
+                return name;
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Metaphone.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Metaphone.cs 
b/src/Lucene.Net.Analysis.Phonetic/Language/Metaphone.cs
new file mode 100644
index 0000000..dd3038f
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/Metaphone.cs
@@ -0,0 +1,494 @@
+// commons-codec version compatibility level: 1.9
+using System.Globalization;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Phonetic.Language
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Encodes a string into a Metaphone value.
+    /// <para/>
+    /// Initial Java implementation by <c>William B. Brogden. December, 1997</c>.
+    /// Permission given by <c>wbrogden</c> for code to be used anywhere.
+    /// <para/>
+    /// <c>Hanging on the Metaphone</c> by <c>Lawrence Philips</c> in <c>Computer Language of Dec. 1990,
+    /// p 39.</c>
+    /// <para/>
+    /// Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations:
+    /// <para/>
+    /// <list type="bullet">
+    ///     <item><description><a href="http://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a> (broken link 4/30/2013) </description></item>
+    ///     <item><description><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a> (link checked 4/30/2013) </description></item>
+    /// </list>
+    /// <para/>
+    /// They have had undocumented changes from the originally published algorithm.
+    /// For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>.
+    /// <para/>
+    /// This class is conditionally thread-safe.
+    /// The instance field <see cref="maxCodeLen"/> is mutable (via <see cref="MaxCodeLen"/>)
+    /// but is not volatile, and accesses are not synchronized.
+    /// If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization
+    /// is used to ensure safe publication of the value between threads, and must not set <see cref="MaxCodeLen"/>
+    /// after initial setup.
+    /// </summary>
+    public class Metaphone : IStringEncoder
+    {
+        /// <summary>
+        /// The five upper-case vowels. <c>IsVowel</c> tests a character against this set.
+        /// </summary>
+        private static readonly string VOWELS = "AEIOU";
+
+        /// <summary>
+        /// The letters E, I and Y. <see cref="GetMetaphone(string)"/> checks the letter
+        /// that follows a C, a G or the pair DG against this set when choosing the
+        /// soft encoding (S or J) for it.
+        /// </summary>
+        private static readonly string FRONTV = "EIY";
+
+        /// <summary>
+        /// The letters C, S, P, T and G. In <see cref="GetMetaphone(string)"/> an H that
+        /// directly follows one of these letters is not encoded.
+        /// </summary>
+        private static readonly string VARSON = "CSPTG";
+
+        /// <summary>
+        /// Backing field of <see cref="MaxCodeLen"/>: the code length at which
+        /// <see cref="GetMetaphone(string)"/> stops encoding. Initialized to 4.
+        /// </summary>
+        private int maxCodeLen = 4;
+
+        /// <summary>
+        /// Initializes a new <see cref="Metaphone"/> encoder.
+        /// </summary>
+        public Metaphone()
+        {
+        }
+
+        /// <summary>
+        /// Find the metaphone value of a string. This is similar to the
+        /// soundex algorithm, but better at finding similar sounding words.
+        /// All input is converted to upper case (using the "en" culture).
+        /// Limitations: Input format is expected to be a single ASCII word
+        /// with only characters in the A - Z range, no punctuation or numbers.
+        /// <para/>
+        /// A <c>null</c> or empty input yields an empty string, and a single-character
+        /// input is returned upper-cased without any further encoding. For longer input,
+        /// encoding stops as soon as the code has reached <see cref="MaxCodeLen"/> characters.
+        /// </summary>
+        /// <param name="txt">String to find the metaphone code for.</param>
+        /// <returns>A metaphone code corresponding to the string supplied.</returns>
+        public virtual string GetMetaphone(string txt)
+        {
+            bool hard = false; // only written and read inside the 'G' case below
+            if (txt == null || txt.Length == 0)
+            {
+                return "";
+            }
+            // single character is itself
+            if (txt.Length == 1)
+            {
+                return new CultureInfo("en").TextInfo.ToUpper(txt);
+            }
+
+            char[] inwd = new CultureInfo("en").TextInfo.ToUpper(txt).ToCharArray();
+
+            StringBuilder local = new StringBuilder(40); // manipulate
+            StringBuilder code = new StringBuilder(10); //   output
+
+            // handle initial 2 characters exceptions
+            // (inwd holds at least two characters here, so inwd[1] is always valid)
+            switch (inwd[0])
+            {
+                case 'K':
+                case 'G':
+                case 'P': /* looking for KN, etc*/
+                    if (inwd[1] == 'N')
+                    {
+                        // KN, GN, PN: drop the first letter
+                        local.Append(inwd, 1, inwd.Length - 1);
+                    }
+                    else
+                    {
+                        local.Append(inwd);
+                    }
+                    break;
+                case 'A': /* looking for AE */
+                    if (inwd[1] == 'E')
+                    {
+                        // AE: drop the A
+                        local.Append(inwd, 1, inwd.Length - 1);
+                    }
+                    else
+                    {
+                        local.Append(inwd);
+                    }
+                    break;
+                case 'W': /* looking for WR or WH */
+                    if (inwd[1] == 'R')
+                    {   // WR -> R
+                        local.Append(inwd, 1, inwd.Length - 1);
+                        break;
+                    }
+                    if (inwd[1] == 'H')
+                    {
+                        // copy from the H onwards, then overwrite that H with W
+                        local.Append(inwd, 1, inwd.Length - 1);
+                        local[0] = 'W'; // WH -> W
+                    }
+                    else
+                    {
+                        local.Append(inwd);
+                    }
+                    break;
+                case 'X': /* initial X becomes S */
+                    inwd[0] = 'S';
+                    local.Append(inwd);
+                    break;
+                default:
+                    local.Append(inwd);
+                    break;
+            } // now local has working string with initials fixed
+
+            int wdsz = local.Length;
+            int n = 0; // index of the character of local currently being encoded
+
+            while (code.Length < this.MaxCodeLen &&
+                   n < wdsz)
+            { // max code size of 4 works well
+                char symb = local[n];
+                // remove duplicate letters except C
+                if (symb != 'C' && IsPreviousChar(local, n, symb))
+                {
+                    n++;
+                }
+                else
+                { // not dup
+                    switch (symb)
+                    {
+                        case 'A':
+                        case 'E':
+                        case 'I':
+                        case 'O':
+                        case 'U':
+                            if (n == 0)
+                            {
+                                code.Append(symb);
+                            }
+                            break; // only use vowel if leading char
+                        case 'B':
+                            if (IsPreviousChar(local, n, 'M') &&
+                                 IsLastChar(wdsz, n))
+                            { // B is silent if word ends in MB
+                                break;
+                            }
+                            code.Append(symb);
+                            break;
+                        case 'C': // lots of C special cases
+                                  /* discard if SCI, SCE or SCY */
+                            if (IsPreviousChar(local, n, 'S') &&
+                                 !IsLastChar(wdsz, n) &&
+                                 FRONTV.IndexOf(local[n + 1]) >= 0)
+                            {
+                                break;
+                            }
+                            if (RegionMatch(local, n, "CIA"))
+                            { // "CIA" -> X
+                                code.Append('X');
+                                break;
+                            }
+                            if (!IsLastChar(wdsz, n) &&
+                                FRONTV.IndexOf(local[n + 1]) >= 0)
+                            {
+                                code.Append('S');
+                                break; // CI,CE,CY -> S
+                            }
+                            if (IsPreviousChar(local, n, 'S') &&
+                                IsNextChar(local, n, 'H'))
+                            { // SCH->sk
+                                code.Append('K');
+                                break;
+                            }
+                            if (IsNextChar(local, n, 'H'))
+                            { // detect CH
+                                if (n == 0 &&
+                                    wdsz >= 3 &&
+                                    IsVowel(local, 2))
+                                { // word-initial CH followed by a vowel -> K
+                                  // NOTE: the inherited comment here read "CH consonant -> K consonant",
+                                  // which does not match the IsVowel condition above
+                                    code.Append('K');
+                                }
+                                else
+                                {
+                                    code.Append('X'); // every other CH -> X
+                                }
+                            }
+                            else
+                            {
+                                code.Append('K');
+                            }
+                            break;
+                        case 'D':
+                            if (!IsLastChar(wdsz, n + 1) &&
+                                IsNextChar(local, n, 'G') &&
+                                FRONTV.IndexOf(local[n + 2]) >= 0)
+                            { // DGE DGI DGY -> J
+                                // n += 2 here plus the n++ after the switch also skips the G and the E/I/Y
+                                code.Append('J'); n += 2;
+                            }
+                            else
+                            {
+                                code.Append('T');
+                            }
+                            break;
+                        case 'G': // GH silent at end or before consonant
+                            if (IsLastChar(wdsz, n + 1) &&
+                                IsNextChar(local, n, 'H'))
+                            {
+                                // word ends in GH: nothing is appended for the G
+                                break;
+                            }
+                            if (!IsLastChar(wdsz, n + 1) &&
+                                IsNextChar(local, n, 'H') &&
+                                !IsVowel(local, n + 2))
+                            {
+                                // GH followed by a non-vowel: nothing is appended for the G
+                                break;
+                            }
+                            // the "GNED" test is already covered by the "GN" test, since
+                            // any region that starts with GNED also starts with GN
+                            if (n > 0 &&
+                                (RegionMatch(local, n, "GN") ||
+                                  RegionMatch(local, n, "GNED")))
+                            {
+                                break; // silent G
+                            }
+                            if (IsPreviousChar(local, n, 'G'))
+                            {
+                                // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true
+                                hard = true;
+                            }
+                            else
+                            {
+                                hard = false;
+                            }
+                            // G before E, I or Y -> J (unless flagged hard); any other G -> K
+                            if (!IsLastChar(wdsz, n) &&
+                                FRONTV.IndexOf(local[n + 1]) >= 0 &&
+                                !hard)
+                            {
+                                code.Append('J');
+                            }
+                            else
+                            {
+                                code.Append('K');
+                            }
+                            break;
+                        case 'H':
+                            if (IsLastChar(wdsz, n))
+                            {
+                                break; // terminal H
+                            }
+                            if (n > 0 &&
+                                VARSON.IndexOf(local[n - 1]) >= 0)
+                            {
+                                // H directly after C, S, P, T or G is not encoded
+                                break;
+                            }
+                            if (IsVowel(local, n + 1))
+                            {
+                                code.Append('H'); // Hvowel
+                            }
+                            break;
+                        case 'F':
+                        case 'J':
+                        case 'L':
+                        case 'M':
+                        case 'N':
+                        case 'R':
+                            // these letters encode as themselves
+                            code.Append(symb);
+                            break;
+                        case 'K':
+                            if (n > 0)
+                            { // not initial
+                                // K directly after C (CK) is not encoded
+                                if (!IsPreviousChar(local, n, 'C'))
+                                {
+                                    code.Append(symb);
+                                }
+                            }
+                            else
+                            {
+                                code.Append(symb); // initial K
+                            }
+                            break;
+                        case 'P':
+                            if (IsNextChar(local, n, 'H'))
+                            {
+                                // PH -> F
+                                code.Append('F');
+                            }
+                            else
+                            {
+                                code.Append(symb);
+                            }
+                            break;
+                        case 'Q':
+                            code.Append('K');
+                            break;
+                        case 'S':
+                            // SH, SIO, SIA -> X; any other S -> S
+                            if (RegionMatch(local, n, "SH") ||
+                                RegionMatch(local, n, "SIO") ||
+                                RegionMatch(local, n, "SIA"))
+                            {
+                                code.Append('X');
+                            }
+                            else
+                            {
+                                code.Append('S');
+                            }
+                            break;
+                        case 'T':
+                            // TIA, TIO -> X
+                            if (RegionMatch(local, n, "TIA") ||
+                                RegionMatch(local, n, "TIO"))
+                            {
+                                code.Append('X');
+                                break;
+                            }
+                            if (RegionMatch(local, n, "TCH"))
+                            {
+                                // Silent if in "TCH"
+                                break;
+                            }
+                            // substitute numeral 0 for TH (resembles theta after all)
+                            if (RegionMatch(local, n, "TH"))
+                            {
+                                code.Append('0');
+                            }
+                            else
+                            {
+                                code.Append('T');
+                            }
+                            break;
+                        case 'V':
+                            code.Append('F'); break;
+                        case 'W':
+                        case 'Y': // silent if not followed by vowel
+                            if (!IsLastChar(wdsz, n) &&
+                                IsVowel(local, n + 1))
+                            {
+                                code.Append(symb);
+                            }
+                            break;
+                        case 'X':
+                            // X -> KS (two characters; see the truncation after the switch)
+                            code.Append('K');
+                            code.Append('S');
+                            break;
+                        case 'Z':
+                            code.Append('S');
+                            break;
+                        default:
+                            // do nothing
+                            break;
+                    } // end switch
+                    n++;
+                } // end else from symb != 'C'
+                // a single step can append two characters (X -> KS), so trim back to the limit
+                if (code.Length > this.MaxCodeLen)
+                {
+                    code.Length = this.MaxCodeLen;
+                }
+            }
+            return code.ToString();
+        }
+
+        /// <summary>
+        /// Tells whether the character at <paramref name="index"/> of <paramref name="sb"/> is one of <see cref="VOWELS"/>.
+        /// </summary>
+        private bool IsVowel(StringBuilder sb, int index)
+        {
+            char candidate = sb[index];
+            return VOWELS.IndexOf(candidate) != -1;
+        }
+
+        /// <summary>
+        /// Tells whether the character just before <paramref name="index"/> equals <paramref name="c"/>.
+        /// Returns <c>false</c> when <paramref name="index"/> is not greater than zero or is not
+        /// less than the length of <paramref name="sb"/>.
+        /// </summary>
+        private bool IsPreviousChar(StringBuilder sb, int index, char c)
+        {
+            if (index <= 0 || index >= sb.Length)
+            {
+                return false;
+            }
+            return sb[index - 1] == c;
+        }
+
+        /// <summary>
+        /// Tells whether the character just after <paramref name="index"/> equals <paramref name="c"/>.
+        /// Returns <c>false</c> when <paramref name="index"/> is negative or has no following
+        /// character in <paramref name="sb"/>.
+        /// </summary>
+        private bool IsNextChar(StringBuilder sb, int index, char c)
+        {
+            if (index < 0 || index >= sb.Length - 1)
+            {
+                return false;
+            }
+            return sb[index + 1] == c;
+        }
+
+        /// <summary>
+        /// Tells whether the characters of <paramref name="sb"/> starting at <paramref name="index"/>
+        /// are exactly <paramref name="test"/> (character-by-character comparison).
+        /// Returns <c>false</c> when <paramref name="index"/> is negative or the region would run
+        /// past the end of <paramref name="sb"/>.
+        /// </summary>
+        private bool RegionMatch(StringBuilder sb, int index, string test)
+        {
+            int span = test.Length;
+            if (index < 0 || index + span - 1 >= sb.Length)
+            {
+                return false;
+            }
+
+            for (int offset = 0; offset < span; offset++)
+            {
+                if (sb[index + offset] != test[offset])
+                {
+                    return false;
+                }
+            }
+            return true;
+        }
+
+        /// <summary>
+        /// Tells whether position <paramref name="n"/> is the final position of a word that is
+        /// <paramref name="wdsz"/> characters long.
+        /// </summary>
+        private bool IsLastChar(int wdsz, int n)
+        {
+            int next = n + 1;
+            return wdsz == next;
+        }
+
+        // LUCENENET specific - in .NET we don't need an object overload, since strings are sealed anyway.
+        //    /**
+        //     * Encodes an Object using the metaphone algorithm.  This method
+        //     * is provided in order to satisfy the requirements of the
+        //     * Encoder interface, and will throw an EncoderException if the
+        //     * supplied object is not of type java.lang.String.
+        //     *
+        //     * @param obj Object to encode
+        //     * @return An object (or type java.lang.String) containing the
+        //     *         metaphone code which corresponds to the String supplied.
+        //     * @throws EncoderException if the parameter supplied is not
+        //     *                          of type java.lang.String
+        //     */
+        //    @Override
+        //public object encode(object obj) 
+        //    {
+        //    if (!(obj is String)) {
+        //            throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String");
+        //        }
+        //    return GetMetaphone((String) obj);
+        //    }
+
+        /// <summary>
+        /// Encodes a string using the <see cref="Metaphone"/> algorithm by delegating
+        /// to <see cref="GetMetaphone(string)"/>.
+        /// </summary>
+        /// <param name="str">String to encode.</param>
+        /// <returns>The metaphone code corresponding to the string supplied.</returns>
+        public virtual string Encode(string str)
+        {
+            string metaphone = GetMetaphone(str);
+            return metaphone;
+        }
+
+        /// <summary>
+        /// Tests if the metaphones of two strings are identical.
+        /// </summary>
+        /// <param name="str1">First of two strings to compare.</param>
+        /// <param name="str2">Second of two strings to compare.</param>
+        /// <returns><c>true</c> if the metaphones of these strings are identical, <c>false</c> otherwise.</returns>
+        public virtual bool IsMetaphoneEqual(string str1, string str2)
+        {
+            // Encode both inputs (first, then second) and compare the resulting codes.
+            string first = GetMetaphone(str1);
+            string second = GetMetaphone(str2);
+            return first.Equals(second);
+        }
+
+        /// <summary>
+        /// Gets or sets the code length at which <see cref="GetMetaphone(string)"/> stops encoding;
+        /// backed by <see cref="maxCodeLen"/>.
+        /// </summary>
+        public virtual int MaxCodeLen
+        {
+            get
+            {
+                return maxCodeLen;
+            }
+            set
+            {
+                maxCodeLen = value;
+            }
+        }
+    }
+}

[08/15] lucenenet git commit: Added Lucene.Net.Analysis.Phonetic + tests. Rather than porting over the entire commons-codec library, only the language features were ported and added to this library.

Reply via email to