http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_french.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_french.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_french.txt new file mode 100644 index 0000000..de636f8 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_french.txt @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Sephardic + +// CONSONANTS +"kh" "" "" "x" // foreign +"ph" "" "" "f" + +"ç" "" "" "s" +"x" "" "" "ks" +"ch" "" "" "S" +"c" "" "[eiyéèê]" "s" +"c" "" "" "k" +"gn" "" "" "(n|gn)" +"g" "" "[eiy]" "Z" +"gue" "" "$" "k" +"gu" "" "[eiy]" "g" +//"aill" "" "e" "aj" // non Jewish +//"ll" "" "e" "(l|j)" // non Jewish +"que" "" "$" "k" +"qu" "" "" "k" +"q" "" "" "k" +"s" "[aeiouyéèê]" "[aeiouyéèê]" "z" +"h" "[bdgt]" "" "" // translit from Arabic +"h" "" "$" "" // foreign +"j" "" "" "Z" +"w" "" "" "v" +"ouh" "" "[aioe]" "(v|uh)" +"ou" "" "[aeio]" "v" +"uo" "" "" "(vo|o)" +"u" "" "[aeio]" "v" + +// VOWELS +"aue" "" "" "aue" +"eau" "" "" "o" +//"au" "" "" "(o|au)" // non Jewish +"ai" "" "" "aj" // [e] is non Jewish +"ay" "" "" "aj" // [e] is non Jewish +"é" "" "" "e" +"ê" "" "" "e" +"è" "" "" "e" +"à " "" "" "a" +"â" "" "" "a" +"où" "" "" "u" +"ou" "" "" "u" +"oi" "" "" "oj" // [ua] is non Jewish +"ei" "" "" "ej" // [e] is non Jewish, in Ashk should be aj +"ey" "" "" "ej" // [e] non Jewish, in Ashk should be aj +//"eu" "" "" "(e|o)" // non Jewish +"y" "[ou]" "" "j" +"e" "" "$" "(e|)" +"i" "" "[aou]" "j" +"y" "" "[aoeu]" "j" +"y" "" "" "i" + +// TRIVIAL +"a" "" "" "a" +"b" "" "" "b" +"d" "" "" "d" +"e" "" "" "e" +"f" "" "" "f" +"g" "" "" "g" +"h" "" "" "h" +"i" "" "" "i" +"k" "" "" "k" +"l" "" "" "l" +"m" "" "" "m" +"n" "" "" "n" +"o" "" "" "o" +"p" "" "" "p" +"r" "" "" "r" +"s" "" "" "s" +"t" "" "" "t" +"u" "" "" "u" +"v" "" "" "v" +"z" "" "" "z"
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_hebrew.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_hebrew.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_hebrew.txt new file mode 100644 index 0000000..91cf5ba --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_hebrew.txt @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// Sephardic + +"××" "" "" "i" +"×¢×" "" "" "i" +"×¢×" "" "" "VV" +"××" "" "" "VV" + +"×׳" "" "" "Z" +"×׳" "" "" "dZ" + +"×" "" "" "L" +"×" "" "" "b" +"×" "" "" "g" +"×" "" "" "d" + +"×" "^" "" "1" +"×" "" "$" "1" +"×" "" "" "" + +"××" "" "" "V" +"××" "" "" "WW" +"×" "" "" "W" +"×" "" "" "z" +"×" "" "" "X" +"×" "" "" "T" +"××" "" "" "i" +"×" "" "" "i" +"×" "" "" "X" +"×" "^" "" "K" +"×" "" "" "k" +"×" "" "" "l" +"×" "" "" "m" +"×" "" "" "m" +"×" "" "" "n" +"× " "" "" "n" +"ס" "" "" "s" +"×¢" "" "" "L" +"×£" "" "" "f" +"פ" "" "" "f" +"×¥" "" "" "C" +"צ" "" "" "C" +"×§" "" "" "K" +"ר" "" "" "r" +"ש" "" "" "s" +"ת" "" "" "T" // Special for Sephardim http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_italian.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_italian.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_italian.txt new file mode 100644 index 0000000..76cf14b --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_italian.txt @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +"kh" "" "" "x" // foreign + +"gli" "" "" "(l|gli)" +"gn" "" "[aeou]" "(n|nj|gn)" +"gni" "" "" "(ni|gni)" + +"gi" "" "[aeou]" "dZ" +"gg" "" "[ei]" "dZ" +"g" "" "[ei]" "dZ" +"h" "[bdgt]" "" "g" // gh is It; others from Arabic translit + +"ci" "" "[aeou]" "tS" +"ch" "" "[ei]" "k" +"sc" "" "[ei]" "S" +"cc" "" "[ei]" "tS" +"c" "" "[ei]" "tS" +"s" "[aeiou]" "[aeiou]" "z" + +"i" "[aeou]" "" "j" +"i" "" "[aeou]" "j" +"y" "[aeou]" "" "j" // foreign +"y" "" "[aeou]" "j" // foreign + +"qu" "" "" "k" +"uo" "" "" "(vo|o)" +"u" "" "[aei]" "v" + +"�" "" "" "e" +"�" "" "" "e" +"�" "" "" "o" +"�" "" "" "o" + +// LATIN ALPHABET +"a" "" "" "a" +"b" "" "" "b" +"c" "" "" "k" +"d" "" "" "d" +"e" "" "" "e" +"f" "" "" "f" +"g" "" "" "g" +"h" "" "" "h" +"i" "" "" "i" +"j" "" "" "(Z|dZ|j)" // foreign +"k" "" "" "k" +"l" "" "" "l" +"m" "" "" "m" +"n" "" "" "n" +"o" "" "" "o" +"p" "" "" "p" +"q" "" "" "k" +"r" "" "" "r" +"s" "" "" "s" +"t" "" "" "t" +"u" "" "" "u" +"v" "" "" "v" +"w" "" "" "v" // foreign +"x" "" "" "ks" // foreign +"y" "" "" "i" // foreign +"z" "" "" "(ts|dz)" http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_portuguese.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_portuguese.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_portuguese.txt new file mode 100644 index 0000000..67cbd9b --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_portuguese.txt @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +"kh" "" "" "x" // foreign +"ch" "" "" "S" +"ss" "" "" "s" +"sc" "" "[ei]" "s" +"sç" "" "[aou]" "s" +"ç" "" "" "s" +"c" "" "[ei]" "s" +// "c" "" "[aou]" "(k|C)" + +"s" "^" "" "s" +"s" "[aáuiÃoóeéêy]" "[aáuiÃoóeéêy]" "z" +"s" "" "[dglmnrv]" "(Z|S)" // Z is Brazil + +"z" "" "$" "(Z|s|S)" // s and S in Brazil +"z" "" "[bdgv]" "(Z|z)" // Z in Brazil +"z" "" "[ptckf]" "(s|S|z)" // s and S in Brazil + +"gu" "" "[eiu]" "g" +"gu" "" "[ao]" "gv" +"g" "" "[ei]" "Z" +"qu" "" "[eiu]" "k" +"qu" "" "[ao]" "kv" + +"uo" "" "" "(vo|o|u)" +"u" "" "[aei]" "v" + +"lh" "" "" "l" +"nh" "" "" "nj" +"h" "[bdgt]" "" "" // translit. 
from Arabic + +"ex" "" "[aáuiÃoóeéêy]" "(ez|eS|eks)" // ez in Brazil +"ex" "" "[cs]" "e" + +"y" "[aáuiÃoóeéê]" "" "j" +"y" "" "[aeiÃou]" "j" +"m" "" "[bcdfglnprstv]" "(m|n)" // maybe to add a rule for m/n before a consonant that disappears [preceding vowel becomes nasalized] +"m" "" "$" "(m|n)" // maybe to add a rule for final m/n that disappears [preceding vowel becomes nasalized] + +"ão" "" "" "(au|an|on)" +"ãe" "" "" "(aj|an)" +"ãi" "" "" "(aj|an)" +"õe" "" "" "(oj|on)" +"i" "[aáuoóeéê]" "" "j" +"i" "" "[aeou]" "j" + +"â" "" "" "a" +"à " "" "" "a" +"á" "" "" "a" +"ã" "" "" "(a|an|on)" +"é" "" "" "e" +"ê" "" "" "e" +"Ã" "" "" "i" +"ô" "" "" "o" +"ó" "" "" "o" +"õ" "" "" "(o|on)" +"ú" "" "" "u" +"ü" "" "" "u" + +"aue" "" "" "aue" + +// LATIN ALPHABET +"a" "" "" "a" +"b" "" "" "b" +"c" "" "" "k" +"d" "" "" "d" +"e" "" "" "(e|i)" +"f" "" "" "f" +"g" "" "" "g" +"h" "" "" "h" +"i" "" "" "i" +"j" "" "" "Z" +"k" "" "" "k" +"l" "" "" "l" +"m" "" "" "m" +"n" "" "" "n" +"o" "" "" "(o|u)" +"p" "" "" "p" +"q" "" "" "k" +"r" "" "" "r" +"s" "" "" "S" +"t" "" "" "t" +"u" "" "" "u" +"v" "" "" "v" +"w" "" "" "v" +"x" "" "" "(S|ks)" +"y" "" "" "i" +"z" "" "" "z" http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_spanish.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_spanish.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_spanish.txt new file mode 100644 index 0000000..b900e7e --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/sep_rules_spanish.txt @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +//Sephardic + +// Includes both Spanish (Castillian) & Catalan + +// CONSONANTS +"ñ" "" "" "(n|nj)" +"ny" "" "" "nj" // Catalan +"ç" "" "" "s" // Catalan + +"ig" "[aeiou]" "" "(tS|ig)" // tS is Catalan +"ix" "[aeiou]" "" "S" // Catalan +"tx" "" "" "tS" // Catalan +"tj" "" "$" "tS" // Catalan +"tj" "" "" "dZ" // Catalan +"tg" "" "" "(tg|dZ)" // dZ is Catalan +"ch" "" "" "(tS|dZ)" // dZ is typical for Argentina +"bh" "" "" "b" // translit. from Arabic +"h" "[dgt]" "" "" // translit. 
from Arabic + +"j" "" "" "(x|Z)" // Z is Catalan +"x" "" "" "(ks|gz|S)" // ks is Spanish, all are Catalan + +//"ll" "" "" "(l|Z)" // Z is typical for Argentina, only Ashkenazic +"w" "" "" "v" // foreign words + +"v" "^" "" "(B|v)" +"b" "^" "" "(b|V)" +"v" "" "" "(b|v)" +"b" "" "" "(b|v)" +"m" "" "[bpvf]" "(m|n)" + +"c" "" "[ei]" "s" +// "c" "" "[aou]" "(k|C)" +"c" "" "" "k" + +"z" "" "" "(z|s)" // as "c" befoire "e" or "i", in Spain it is like unvoiced English "th" + +"gu" "" "[ei]" "(g|gv)" // "gv" because "u" can actually be "ü" +"g" "" "[ei]" "(x|g|dZ)" // "g" only for foreign words; dZ is Catalan + +"qu" "" "" "k" +"q" "" "" "k" + +"uo" "" "" "(vo|o)" +"u" "" "[aei]" "v" + +// "y" "" "" "(i|j|S|Z)" // S or Z are peculiar to South America; only Ashkenazic +"y" "" "" "(i|j)" + +// VOWELS +"ü" "" "" "v" +"á" "" "" "a" +"é" "" "" "e" +"Ã" "" "" "i" +"ó" "" "" "o" +"ú" "" "" "u" +"à " "" "" "a" // Catalan +"è" "" "" "e" // Catalan +"ò" "" "" "o" // Catalan + +// TRIVIAL +"a" "" "" "a" +"d" "" "" "d" +"e" "" "" "e" +"f" "" "" "f" +"g" "" "" "g" +"h" "" "" "h" +"i" "" "" "i" +"k" "" "" "k" +"l" "" "" "l" +"m" "" "" "m" +"n" "" "" "n" +"o" "" "" "o" +"p" "" "" "p" +"r" "" "" "r" +"s" "" "" "s" +"t" "" "" "t" +"u" "" "" "u" http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Caverphone1.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Caverphone1.cs b/src/Lucene.Net.Analysis.Phonetic/Language/Caverphone1.cs new file mode 100644 index 0000000..1abfcd1 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Caverphone1.cs @@ -0,0 +1,131 @@ +// commons-codec version compatibility level: 1.9 +using System.Globalization; +using System.Text.RegularExpressions; + +namespace Lucene.Net.Analysis.Phonetic.Language +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Encodes a string into a Caverphone 1.0 value. + /// <para/> + /// This is an algorithm created by the Caversham Project at the University of Otago. It implements the Caverphone 1.0 + /// algorithm: + /// <para/> + /// See: <a href="http://en.wikipedia.org/wiki/Caverphone">Wikipedia - Caverphone</a> + /// <para/> + /// See: <a href="http://caversham.otago.ac.nz/files/working/ctp060902.pdf">Caverphone 1.0 specification</a> + /// <para/> + /// This class is immutable and thread-safe. + /// <para/> + /// since 1.5 + /// </summary> + public class Caverphone1 : AbstractCaverphone + { + private static readonly string SIX_1 = "111111"; + + /// <summary> + /// Encodes the given string into a Caverphone value. + /// </summary> + /// <param name="source">The source string.</param> + /// <returns>A caverphone code for the given string.</returns> + public override string Encode(string source) + { + string txt = source; + if (txt == null || txt.Length == 0) + { + return SIX_1; + } + + // 1. Convert to lowercase + txt = txt.ToLowerInvariant(); // LUCENENET NOTE: This doesn't work right under "en" language, but does under invariant + + // 2. Remove anything not A-Z + txt = Regex.Replace(txt, "[^a-z]", ""); + + // 3. 
Handle various start options + // 2 is a temporary placeholder to indicate a consonant which we are no longer interested in. + txt = Regex.Replace(txt, "^cough", "cou2f"); + txt = Regex.Replace(txt, "^rough", "rou2f"); + txt = Regex.Replace(txt, "^tough", "tou2f"); + txt = Regex.Replace(txt, "^enough", "enou2f"); + txt = Regex.Replace(txt, "^gn", "2n"); + + // End + txt = Regex.Replace(txt, "mb$", "m2"); + + // 4. Handle replacements + txt = Regex.Replace(txt, "cq", "2q"); + txt = Regex.Replace(txt, "ci", "si"); + txt = Regex.Replace(txt, "ce", "se"); + txt = Regex.Replace(txt, "cy", "sy"); + txt = Regex.Replace(txt, "tch", "2ch"); + txt = Regex.Replace(txt, "c", "k"); + txt = Regex.Replace(txt, "q", "k"); + txt = Regex.Replace(txt, "x", "k"); + txt = Regex.Replace(txt, "v", "f"); + txt = Regex.Replace(txt, "dg", "2g"); + txt = Regex.Replace(txt, "tio", "sio"); + txt = Regex.Replace(txt, "tia", "sia"); + txt = Regex.Replace(txt, "d", "t"); + txt = Regex.Replace(txt, "ph", "fh"); + txt = Regex.Replace(txt, "b", "p"); + txt = Regex.Replace(txt, "sh", "s2"); + txt = Regex.Replace(txt, "z", "s"); + txt = Regex.Replace(txt, "^[aeiou]", "A"); + // 3 is a temporary placeholder marking a vowel + txt = Regex.Replace(txt, "[aeiou]", "3"); + txt = Regex.Replace(txt, "3gh3", "3kh3"); + txt = Regex.Replace(txt, "gh", "22"); + txt = Regex.Replace(txt, "g", "k"); + txt = Regex.Replace(txt, "s+", "S"); + txt = Regex.Replace(txt, "t+", "T"); + txt = Regex.Replace(txt, "p+", "P"); + txt = Regex.Replace(txt, "k+", "K"); + txt = Regex.Replace(txt, "f+", "F"); + txt = Regex.Replace(txt, "m+", "M"); + txt = Regex.Replace(txt, "n+", "N"); + txt = Regex.Replace(txt, "w3", "W3"); + txt = Regex.Replace(txt, "wy", "Wy"); // 1.0 only + txt = Regex.Replace(txt, "wh3", "Wh3"); + txt = Regex.Replace(txt, "why", "Why"); // 1.0 only + txt = Regex.Replace(txt, "w", "2"); + txt = Regex.Replace(txt, "^h", "A"); + txt = Regex.Replace(txt, "h", "2"); + txt = Regex.Replace(txt, "r3", "R3"); + txt = 
Regex.Replace(txt, "ry", "Ry"); // 1.0 only + txt = Regex.Replace(txt, "r", "2"); + txt = Regex.Replace(txt, "l3", "L3"); + txt = Regex.Replace(txt, "ly", "Ly"); // 1.0 only + txt = Regex.Replace(txt, "l", "2"); + txt = Regex.Replace(txt, "j", "y"); // 1.0 only + txt = Regex.Replace(txt, "y3", "Y3"); // 1.0 only + txt = Regex.Replace(txt, "y", "2"); // 1.0 only + + // 5. Handle removals + txt = Regex.Replace(txt, "2", ""); + txt = Regex.Replace(txt, "3", ""); + + // 6. put six 1s on the end + txt = txt + SIX_1; + + // 7. take the first six characters as the code + return txt.Substring(0, SIX_1.Length - 0); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Caverphone2.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Caverphone2.cs b/src/Lucene.Net.Analysis.Phonetic/Language/Caverphone2.cs new file mode 100644 index 0000000..cec7388 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Caverphone2.cs @@ -0,0 +1,133 @@ +// commons-codec version compatibility level: 1.9 +using System.Globalization; +using System.Text.RegularExpressions; + +namespace Lucene.Net.Analysis.Phonetic.Language +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Encodes a string into a Caverphone 2.0 value. + /// <para/> + /// This is an algorithm created by the Caversham Project at the University of Otago. It implements the Caverphone 2.0 + /// algorithm: + /// <para/> + /// See: <a href="http://en.wikipedia.org/wiki/Caverphone">Wikipedia - Caverphone</a> + /// <para/> + /// See: <a href="http://caversham.otago.ac.nz/files/working/ctp150804.pdf">Caverphone 2.0 specification</a> + /// <para/> + /// This class is immutable and thread-safe. + /// </summary> + public class Caverphone2 : AbstractCaverphone + { + private static readonly string TEN_1 = "1111111111"; + + /// <summary> + /// Encodes the given string into a Caverphone 2.0 value. + /// </summary> + /// <param name="source">The source string.</param> + /// <returns>A caverphone code for the given string.</returns> + public override string Encode(string source) + { + string txt = source; + if (txt == null || txt.Length == 0) + { + return TEN_1; + } + + // 1. Convert to lowercase + txt = new CultureInfo("en").TextInfo.ToLower(txt); + + // 2. Remove anything not A-Z + txt = Regex.Replace(txt, "[^a-z]", ""); + + // 2.5. Remove final e + txt = Regex.Replace(txt, "e$", ""); // 2.0 only + + // 3. Handle various start options + txt = Regex.Replace(txt, "^cough", "cou2f"); + txt = Regex.Replace(txt, "^rough", "rou2f"); + txt = Regex.Replace(txt, "^tough", "tou2f"); + txt = Regex.Replace(txt, "^enough", "enou2f"); // 2.0 only + txt = Regex.Replace(txt, "^trough", "trou2f"); // 2.0 only + // note the spec says ^enough here again, c+p error I assume + txt = Regex.Replace(txt, "^gn", "2n"); + + // End + txt = Regex.Replace(txt, "mb$", "m2"); + + // 4. 
Handle replacements + txt = Regex.Replace(txt, "cq", "2q"); + txt = Regex.Replace(txt, "ci", "si"); + txt = Regex.Replace(txt, "ce", "se"); + txt = Regex.Replace(txt, "cy", "sy"); + txt = Regex.Replace(txt, "tch", "2ch"); + txt = Regex.Replace(txt, "c", "k"); + txt = Regex.Replace(txt, "q", "k"); + txt = Regex.Replace(txt, "x", "k"); + txt = Regex.Replace(txt, "v", "f"); + txt = Regex.Replace(txt, "dg", "2g"); + txt = Regex.Replace(txt, "tio", "sio"); + txt = Regex.Replace(txt, "tia", "sia"); + txt = Regex.Replace(txt, "d", "t"); + txt = Regex.Replace(txt, "ph", "fh"); + txt = Regex.Replace(txt, "b", "p"); + txt = Regex.Replace(txt, "sh", "s2"); + txt = Regex.Replace(txt, "z", "s"); + txt = Regex.Replace(txt, "^[aeiou]", "A"); + txt = Regex.Replace(txt, "[aeiou]", "3"); + txt = Regex.Replace(txt, "j", "y"); // 2.0 only + txt = Regex.Replace(txt, "^y3", "Y3"); // 2.0 only + txt = Regex.Replace(txt, "^y", "A"); // 2.0 only + txt = Regex.Replace(txt, "y", "3"); // 2.0 only + txt = Regex.Replace(txt, "3gh3", "3kh3"); + txt = Regex.Replace(txt, "gh", "22"); + txt = Regex.Replace(txt, "g", "k"); + txt = Regex.Replace(txt, "s+", "S"); + txt = Regex.Replace(txt, "t+", "T"); + txt = Regex.Replace(txt, "p+", "P"); + txt = Regex.Replace(txt, "k+", "K"); + txt = Regex.Replace(txt, "f+", "F"); + txt = Regex.Replace(txt, "m+", "M"); + txt = Regex.Replace(txt, "n+", "N"); + txt = Regex.Replace(txt, "w3", "W3"); + txt = Regex.Replace(txt, "wh3", "Wh3"); + txt = Regex.Replace(txt, "w$", "3"); // 2.0 only + txt = Regex.Replace(txt, "w", "2"); + txt = Regex.Replace(txt, "^h", "A"); + txt = Regex.Replace(txt, "h", "2"); + txt = Regex.Replace(txt, "r3", "R3"); + txt = Regex.Replace(txt, "r$", "3"); // 2.0 only + txt = Regex.Replace(txt, "r", "2"); + txt = Regex.Replace(txt, "l3", "L3"); + txt = Regex.Replace(txt, "l$", "3"); // 2.0 only + txt = Regex.Replace(txt, "l", "2"); + + // 5. 
Handle removals + txt = Regex.Replace(txt, "2", ""); + txt = Regex.Replace(txt, "3$", "A"); // 2.0 only + txt = Regex.Replace(txt, "3", ""); + + // 6. put ten 1s on the end + txt = txt + TEN_1; + + // 7. take the first ten characters as the code + return txt.Substring(0, TEN_1.Length); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/ColognePhonetic.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/ColognePhonetic.cs b/src/Lucene.Net.Analysis.Phonetic/Language/ColognePhonetic.cs new file mode 100644 index 0000000..a4824b3 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/ColognePhonetic.cs @@ -0,0 +1,501 @@ +// commons-codec version compatibility level: 1.9 +using System.Globalization; + +namespace Lucene.Net.Analysis.Phonetic.Language +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Encodes a string into a Cologne Phonetic value. + /// </summary> + /// <remarks> + /// Implements the <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">Kölner Phonetik</a> + /// (Cologne Phonetic) algorithm issued by Hans Joachim Postel in 1969. 
+ /// <para/> + /// The <i>Kölner Phonetik</i> is a phonetic algorithm which is optimized for the German language. + /// It is related to the well-known soundex algorithm. + /// <para/> + /// <h2>Algorithm</h2> + /// <list type="bullet"> + /// <item> + /// <term>Step 1:</term> + /// <description> + /// After preprocessing (conversion to upper case, transcription of <a + /// href="http://en.wikipedia.org/wiki/Germanic_umlaut">germanic umlauts</a>, removal of non alphabetical characters) the + /// letters of the supplied text are replaced by their phonetic code according to the following table. + /// <list type="table"> + /// <listheader> + /// <term>Letter</term> + /// <term>Context</term> + /// <term>Code</term> + /// </listheader> + /// <item> + /// <term>A, E, I, J, O, U, Y</term> + /// <term></term> + /// <term>0</term> + /// </item> + /// <item> + /// <term>H</term> + /// <term></term> + /// <term>-</term> + /// </item> + /// <item> + /// <term>B</term> + /// <term></term> + /// <term>1</term> + /// </item> + /// <item> + /// <term>P</term> + /// <term>not before H</term> + /// <term>1</term> + /// </item> + /// <item> + /// <term>D, T</term> + /// <term>not before C, S, Z</term> + /// <term>2</term> + /// </item> + /// <item> + /// <term>F, V, W</term> + /// <term></term> + /// <term>3</term> + /// </item> + /// <item> + /// <term>P</term> + /// <term>before H</term> + /// <term>3</term> + /// </item> + /// <item> + /// <term>G, K, Q</term> + /// <term></term> + /// <term>4</term> + /// </item> + /// <item> + /// <term>C</term> + /// <term>at onset before A, H, K, L, O, Q, R, U, X <para>OR</para> + /// before A, H, K, O, Q, U, X except after S, Z</term> + /// <term>4</term> + /// </item> + /// <item> + /// <term>X</term> + /// <term>not after C, K, Q</term> + /// <term>48</term> + /// </item> + /// <item> + /// <term>L</term> + /// <term></term> + /// <term>5</term> + /// </item> + /// <item> + /// <term>M, N</term> + /// <term></term> + /// <term>6</term> + 
/// </item> + /// <item> + /// <term>R</term> + /// <term></term> + /// <term>7</term> + /// </item> + /// <item> + /// <term>S, Z</term> + /// <term></term> + /// <term>8</term> + /// </item> + /// <item> + /// <term>C</term> + /// <term>after S, Z <para>OR</para> + /// at onset except before A, H, K, L, O, Q, R, U, X <para>OR</para> + /// not before A, H, K, O, Q, U, X + /// </term> + /// <term>8</term> + /// </item> + /// <item> + /// <term>D, T</term> + /// <term>before C, S, Z</term> + /// <term>8</term> + /// </item> + /// <item> + /// <term>X</term> + /// <term>after C, K, Q</term> + /// <term>8</term> + /// </item> + /// </list> + /// <para> + /// <small><i>(Source: <a href= "http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik#Buchstabencodes" >Wikipedia (de): + /// Kölner Phonetik -- Buchstabencodes</a>)</i></small> + /// </para> + /// <h4>Example:</h4> + /// <c>"Müller-Lüdenscheidt" => "MULLERLUDENSCHEIDT" => "6005507500206880022"</c> + /// </description> + /// </item> + /// <item> + /// <term>Step 2:</term> + /// <description> + /// Collapse of all multiple consecutive code digits. + /// <h4>Example:</h4> + /// <c>"6005507500206880022" => "6050750206802"</c> + /// </description> + /// </item> + /// <item> + /// <term>Step 3:</term> + /// <description> + /// Removal of all codes "0" except at the beginning. This means that two or more identical consecutive digits can occur + /// if they occur after removing the "0" digits. + /// <h4>Example:</h4> + /// <c>"6050750206802" => "65752682"</c> + /// </description> + /// </item> + /// </list> + /// <para/> + /// This class is thread-safe. 
+ /// <para/> + /// See: <a href="http://de.wikipedia.org/wiki/K%C3%B6lner_Phonetik">Wikipedia (de): Kölner Phonetik (in German)</a> + /// <para/> + /// since 1.5 + /// </remarks> + public class ColognePhonetic : IStringEncoder + { + // Predefined char arrays for better performance and less GC load + private static readonly char[] AEIJOUY = new char[] { 'A', 'E', 'I', 'J', 'O', 'U', 'Y' }; + private static readonly char[] SCZ = new char[] { 'S', 'C', 'Z' }; + private static readonly char[] WFPV = new char[] { 'W', 'F', 'P', 'V' }; + private static readonly char[] GKQ = new char[] { 'G', 'K', 'Q' }; + private static readonly char[] CKQ = new char[] { 'C', 'K', 'Q' }; + private static readonly char[] AHKLOQRUX = new char[] { 'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X' }; + private static readonly char[] SZ = new char[] { 'S', 'Z' }; + private static readonly char[] AHOUKQX = new char[] { 'A', 'H', 'O', 'U', 'K', 'Q', 'X' }; + private static readonly char[] TDX = new char[] { 'T', 'D', 'X' }; + + /// <summary> + /// This class is not thread-safe; the field <see cref="length"/> is mutable. + /// However, it is not shared between threads, as it is constructed on demand + /// by the method <see cref="ColognePhonetic.GetColognePhonetic(string)"/>. 
/// </summary>
private abstract class CologneBuffer
{
    /// <summary>Backing array shared with the concrete buffer implementations.</summary>
    protected readonly char[] data;

    /// <summary>Number of characters currently held by the buffer.</summary>
    protected int length = 0;

    /// <summary>
    /// Wraps an existing array; the buffer starts out holding every element of it.
    /// </summary>
    public CologneBuffer(char[] data)
    {
        this.data = data;
        length = data.Length;
    }

    /// <summary>
    /// Allocates a new array of <paramref name="buffSize"/> chars; the buffer starts out empty.
    /// </summary>
    public CologneBuffer(int buffSize)
    {
        data = new char[buffSize];
        length = 0;
    }

    /// <summary>
    /// Returns a copy of <paramref name="length"/> characters of the logical content,
    /// beginning at logical offset <paramref name="start"/>.
    /// </summary>
    protected abstract char[] CopyData(int start, int length);

    /// <summary>Gets the number of characters currently held.</summary>
    public virtual int Length
    {
        get { return length; }
    }

    /// <summary>Returns the current logical content as a string.</summary>
    public override string ToString()
    {
        char[] snapshot = CopyData(0, length);
        return new string(snapshot);
    }
}

/// <summary>
/// Buffer that grows to the right: content occupies the first <c>length</c> slots of the array.
/// </summary>
private class CologneOutputBuffer : CologneBuffer
{
    public CologneOutputBuffer(int buffSize)
        : base(buffSize)
    {
    }

    /// <summary>Appends <paramref name="chr"/> after the current content.</summary>
    public void AddRight(char chr)
    {
        // Write first, then grow, so a failed write leaves the length untouched.
        int slot = length;
        data[slot] = chr;
        length = slot + 1;
    }

    protected override char[] CopyData(int start, int length)
    {
        char[] copy = new char[length];
        System.Array.Copy(data, start, copy, 0, length);
        return copy;
    }
}

/// <summary>
/// Buffer that is consumed from the left: content occupies the last <c>length</c> slots of the array.
/// </summary>
private class CologneInputBuffer : CologneBuffer
{
    public CologneInputBuffer(char[] data)
        : base(data)
    {
    }

    /// <summary>Pushes <paramref name="ch"/> in front of the remaining content.</summary>
    public virtual void AddLeft(char ch)
    {
        // Grow first: the next-position computation depends on the new length.
        length++;
        int slot = GetNextPos();
        data[slot] = ch;
    }

    protected override char[] CopyData(int start, int length)
    {
        // The content is right-aligned in the array, so translate the logical offset.
        int sourceIndex = data.Length - this.length + start;
        char[] copy = new char[length];
        System.Array.Copy(data, sourceIndex, copy, 0, length);
        return copy;
    }

    /// <summary>Returns the next character without consuming it.</summary>
    public virtual char GetNextChar()
    {
        int slot = GetNextPos();
        return data[slot];
    }

    /// <summary>Array index of the first remaining character.</summary>
    protected virtual int GetNextPos()
    {
        return data.Length - length;
    }

    /// <summary>Consumes and returns the next character.</summary>
    public virtual char RemoveNext()
    {
        char next = GetNextChar();
        length--;
        return next;
    }
}

/// <summary>
/// Maps some Germanic characters to plain for internal processing.
/// The following characters are mapped:
/// <list type="bullet">
/// <item><description>capital a, umlaut mark</description></item>
/// <item><description>capital u, umlaut mark</description></item>
/// <item><description>capital o, umlaut mark</description></item>
/// <item><description>small sharp s, German</description></item>
/// </list>
/// </summary>
private static readonly char[][] PREPROCESS_MAP =
{
    // Each entry is { character to replace, replacement }.
    new[] { '\u00C4', 'A' }, // capital a, umlaut mark
    new[] { '\u00DC', 'U' }, // capital u, umlaut mark
    new[] { '\u00D6', 'O' }, // capital o, umlaut mark
    new[] { '\u00DF', 'S' }  // small sharp s, German
};

/// <summary>
/// Tells whether <paramref name="key"/> occurs anywhere in <paramref name="arr"/>.
/// </summary>
/// <param name="arr">The characters to search.</param>
/// <param name="key">The character to look for.</param>
/// <returns><c>true</c> if at least one element equals <paramref name="key"/>; otherwise <c>false</c>.</returns>
private static bool ArrayContains(char[] arr, char key)
{
    for (int i = 0; i < arr.Length; i++)
    {
        if (arr[i] == key)
        {
            return true;
        }
    }
    return false;
}

/// <summary>
/// <para>
/// Implements the <i>Kölner Phonetik</i> algorithm.
/// </para>
/// <para>
/// In contrast to the initial description of the algorithm, this implementation does the encoding in one pass.
/// </para>
///
/// </summary>
/// <param name="text">The text to encode; may be <c>null</c>.</param>
/// <returns>The corresponding encoding according to the <i>Kölner Phonetik</i> algorithm,
/// or <c>null</c> if <paramref name="text"/> is <c>null</c>.</returns>
public virtual string GetColognePhonetic(string text)
{
    if (text == null)
    {
        return null;
    }

    text = Preprocess(text);

    // Twice the input length leaves room for the two codes an 'X' can produce
    // ('4' directly, plus the code of the 'S' pushed back onto the input).
    CologneOutputBuffer output = new CologneOutputBuffer(text.Length * 2);
    CologneInputBuffer input = new CologneInputBuffer(text.ToCharArray());

    char nextChar;

    char lastChar = '-'; // '-' stands for "no character"
    char lastCode = '/'; // '/' stands for "nothing processed yet"
    char code;
    char chr;

    int rightLength = input.Length;

    while (rightLength > 0)
    {
        chr = input.RemoveNext();

        if ((rightLength = input.Length) > 0)
        {
            nextChar = input.GetNextChar();
        }
        else
        {
            nextChar = '-';
        }

        if (ArrayContains(AEIJOUY, chr))
        {
            code = '0';
        }
        else if (chr == 'H' || chr < 'A' || chr > 'Z')
        {
            // 'H' and anything outside A-Z produce no code. At the very beginning they are
            // skipped entirely, so lastChar/lastCode keep their "start" markers.
            if (lastCode == '/')
            {
                continue;
            }
            code = '-';
        }
        else if (chr == 'B' || (chr == 'P' && nextChar != 'H'))
        {
            code = '1';
        }
        else if ((chr == 'D' || chr == 'T') && !ArrayContains(SCZ, nextChar))
        {
            code = '2';
        }
        else if (ArrayContains(WFPV, chr))
        {
            code = '3';
        }
        else if (ArrayContains(GKQ, chr))
        {
            code = '4';
        }
        else if (chr == 'X' && !ArrayContains(CKQ, lastChar))
        {
            // 'X' counts as "KS": emit '4' now and push an 'S' back so the next
            // iteration contributes the '8'.
            code = '4';
            input.AddLeft('S');
            rightLength++;
        }
        else if (chr == 'S' || chr == 'Z')
        {
            code = '8';
        }
        else if (chr == 'C')
        {
            if (lastCode == '/')
            {
                // 'C' at the onset
                if (ArrayContains(AHKLOQRUX, nextChar))
                {
                    code = '4';
                }
                else
                {
                    code = '8';
                }
            }
            else
            {
                if (ArrayContains(SZ, lastChar) || !ArrayContains(AHOUKQX, nextChar))
                {
                    code = '8';
                }
                else
                {
                    code = '4';
                }
            }
        }
        else if (ArrayContains(TDX, chr))
        {
            // D/T before C, S, Z and X after C, K, Q (the other cases were handled above)
            code = '8';
        }
        else if (chr == 'R')
        {
            code = '7';
        }
        else if (chr == 'L')
        {
            code = '5';
        }
        else if (chr == 'M' || chr == 'N')
        {
            code = '6';
        }
        else
        {
            code = chr;
        }

        // Emit the code unless it is the "no code" marker, repeats the previous code,
        // or is a '0' anywhere but at the very beginning.
        if (code != '-' && (lastCode != code && (code != '0' || lastCode == '/') || code < '0' || code > '8'))
        {
            output.AddRight(code);
        }

        lastChar = chr;
        lastCode = code;
    }
    return output.ToString();
}

// LUCENENET specific - in .NET we don't need an object overload, since strings are sealed anyway.
//@Override
// public Object encode(final Object object) throws EncoderException
//{
// if (!(object instanceof String)) {
// throw new EncoderException("This method's parameter was expected to be of the type " +
// String.class.getName() +
// ". But actually it was of the type " +
// object.getClass().getName() +
// ".");
// }
// return encode((String) object);
// }


/// <summary>
/// Encodes <paramref name="text"/> by delegating to <see cref="GetColognePhonetic(string)"/>.
/// </summary>
/// <param name="text">The text to encode; may be <c>null</c>.</param>
/// <returns>The Cologne Phonetic code, or <c>null</c> if <paramref name="text"/> is <c>null</c>.</returns>
public virtual string Encode(string text)
{
    return GetColognePhonetic(text);
}

/// <summary>
/// Tells whether two texts have the same Cologne Phonetic code.
/// </summary>
/// <param name="text1">The first text; may be <c>null</c>.</param>
/// <param name="text2">The second text; may be <c>null</c>.</param>
/// <returns><c>true</c> if both codes are equal (two <c>null</c> inputs compare equal); otherwise <c>false</c>.</returns>
public virtual bool IsEncodeEqual(string text1, string text2)
{
    // The static comparison tolerates null codes; calling Equals on the first code
    // would throw a NullReferenceException when text1 is null.
    return string.Equals(GetColognePhonetic(text1), GetColognePhonetic(text2));
}

/// <summary>
/// Converts the string to upper case and replaces germanic characters as defined in <see cref="PREPROCESS_MAP"/>.
+ /// </summary> + private string Preprocess(string text) + { + text = new CultureInfo("de").TextInfo.ToUpper(text); + + char[] chrs = text.ToCharArray(); + + for (int index = 0; index < chrs.Length; index++) + { + if (chrs[index] > 'Z') + { + foreach (char[] element in PREPROCESS_MAP) + { + if (chrs[index] == element[0]) + { + chrs[index] = element[1]; + break; + } + } + } + } + return new string(chrs); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/DaitchMokotoffSoundex.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/DaitchMokotoffSoundex.cs b/src/Lucene.Net.Analysis.Phonetic/Language/DaitchMokotoffSoundex.cs new file mode 100644 index 0000000..e72bc38 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/DaitchMokotoffSoundex.cs @@ -0,0 +1,620 @@ +// commons-codec version compatibility level: 1.10 +using Lucene.Net.Support; +using System; +using System.Collections.Generic; +using System.IO; +using System.Reflection; +using System.Text; +using System.Text.RegularExpressions; + +namespace Lucene.Net.Analysis.Phonetic.Language +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Encodes a string into a Daitch-Mokotoff Soundex value. + /// </summary> + /// <remarks> + /// The Daitch-Mokotoff Soundex algorithm is a refinement of the Russell and American Soundex algorithms, yielding greater + /// accuracy in matching especially Slavic and Yiddish surnames with similar pronunciation but differences in spelling. + /// <para/> + /// The main differences compared to the other soundex variants are: + /// <list type="bullet"> + /// <item><description>coded names are 6 digits long</description></item> + /// <item><description>the initial character of the name is coded</description></item> + /// <item><description>rules to encode multi-character n-grams</description></item> + /// <item><description>multiple possible encodings for the same name (branching)</description></item> + /// </list> + /// <para/> + /// This implementation supports branching, depending on the method used: + /// <list type="bullet"> + /// <item><term><see cref="Encode(string)"/></term><description>branching disabled, only the first code will be returned</description></item> + /// <item><term><see cref="GetSoundex(string)"/></term><description>branching enabled, all codes will be returned, separated by '|'</description></item> + /// </list> + /// <para/> + /// Note: this implementation has additional branching rules compared to the original description of the algorithm. The + /// rules can be customized by overriding the default rules contained in the resource file + /// <c>Lucene.Net.Analysis.Phonetic.Language.dmrules.txt</c>. + /// <para/> + /// This class is thread-safe. 
/// <para/>
/// See: <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a>
/// <para/>
/// See: <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a>
/// <para/>
/// since 1.10
/// </remarks>
/// <seealso cref="Soundex"/>
public class DaitchMokotoffSoundex : IStringEncoder
{
    /// <summary>
    /// Inner class representing a branch during DM soundex encoding.
    /// </summary>
    private sealed class Branch
    {
        private readonly StringBuilder builder;
        private string cachedString; // result of the last ToString(); null whenever the builder has changed
        private string lastReplacement;

        internal Branch()
        {
            builder = new StringBuilder();
            lastReplacement = null;
            cachedString = null;
        }

        /// <summary>
        /// Creates a new branch, identical to this branch.
        /// </summary>
        /// <returns>A new, identical branch.</returns>
        public Branch CreateBranch()
        {
            Branch branch = new Branch();
            branch.builder.Append(ToString());
            branch.lastReplacement = this.lastReplacement;
            return branch;
        }

        /// <summary>
        /// Two branches are equal when their current codes are equal.
        /// </summary>
        public override bool Equals(object other)
        {
            if (this == other)
            {
                return true;
            }
            if (!(other is Branch))
            {
                return false;
            }

            return ToString().Equals(((Branch)other).ToString());
        }

        /// <summary>
        /// Finish this branch by appending '0's until the maximum code length has been reached.
        /// </summary>
        public void Finish()
        {
            while (builder.Length < MAX_LENGTH)
            {
                builder.Append('0');
                cachedString = null;
            }
        }

        public override int GetHashCode()
        {
            return ToString().GetHashCode();
        }

        /// <summary>
        /// Process the next replacement to be added to this branch.
        /// </summary>
        /// <param name="replacement">The next replacement to append.</param>
        /// <param name="forceAppend">Indicates if the default processing shall be overridden.</param>
        public void ProcessNextReplacement(string replacement, bool forceAppend)
        {
            // By default a replacement is skipped when the previous replacement already ends with it.
            bool append = lastReplacement == null || !lastReplacement.EndsWith(replacement, StringComparison.Ordinal) || forceAppend;

            if (append && builder.Length < MAX_LENGTH)
            {
                builder.Append(replacement);
                // remove all characters after the maximum length
                if (builder.Length > MAX_LENGTH)
                {
                    //builder.delete(MAX_LENGTH, builder.Length);
                    builder.Remove(MAX_LENGTH, builder.Length - MAX_LENGTH);
                }
                cachedString = null;
            }

            lastReplacement = replacement;
        }

        public override string ToString()
        {
            if (cachedString == null)
            {
                cachedString = builder.ToString();
            }
            return cachedString;
        }
    }

    /// <summary>
    /// Inner class for storing rules.
    /// </summary>
    private sealed class Rule
    {
        private readonly string pattern;
        private readonly string[] replacementAtStart;
        private readonly string[] replacementBeforeVowel;
        private readonly string[] replacementDefault;

        /// <summary>
        /// Creates a rule; each replacement argument may hold several alternatives separated by '|'.
        /// </summary>
        internal Rule(string pattern, string replacementAtStart, string replacementBeforeVowel,
                string replacementDefault)
        {
            this.pattern = pattern;
            this.replacementAtStart = Regex.Split(replacementAtStart, "\\|");
            this.replacementBeforeVowel = Regex.Split(replacementBeforeVowel, "\\|");
            this.replacementDefault = Regex.Split(replacementDefault, "\\|");
        }

        // LUCENENET specific - need read access to pattern
        public string Pattern
        {
            get { return pattern; }
        }

        public int PatternLength
        {
            get { return pattern.Length; }
        }

        /// <summary>
        /// Selects the replacements that apply to <paramref name="context"/>: the "at start" set when
        /// <paramref name="atStart"/> is set, the "before vowel" set when the character following the
        /// pattern is a vowel, and the default set otherwise.
        /// </summary>
        public string[] GetReplacements(string context, bool atStart)
        {
            if (atStart)
            {
                return replacementAtStart;
            }

            int nextIndex = PatternLength;
            bool nextCharIsVowel = nextIndex < context.Length ? IsVowel(context[nextIndex]) : false;
            if (nextCharIsVowel)
            {
                return replacementBeforeVowel;
            }

            return replacementDefault;
        }

        private bool IsVowel(char ch)
        {
            return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
        }

        /// <summary>
        /// Tells whether <paramref name="context"/> starts with this rule's pattern.
        /// </summary>
        public bool Matches(string context)
        {
            return context.StartsWith(pattern, StringComparison.Ordinal);
        }

        public override string ToString()
        {
            return string.Format("{0}=({1},{2},{3})", pattern, Collections.ToString(replacementAtStart),
                    Collections.ToString(replacementBeforeVowel), Collections.ToString(replacementDefault));
        }
    }

    private static readonly string COMMENT = "//";
    private static readonly string DOUBLE_QUOTE = "\"";

    private static readonly string MULTILINE_COMMENT_END = "*/";

    private static readonly string MULTILINE_COMMENT_START = "/*";

    /// <summary>The resource file containing the replacement and folding rules</summary>
    private static readonly string RESOURCE_FILE = "dmrules.txt";

    /// <summary>The code length of a DM soundex value.</summary>
    private static readonly int MAX_LENGTH = 6;

    /// <summary>Transformation rules indexed by the first character of their pattern.</summary>
    private static readonly IDictionary<char, IList<Rule>> RULES = new Dictionary<char, IList<Rule>>();

    /// <summary>Folding rules.</summary>
    private static readonly IDictionary<char, char> FOLDINGS = new Dictionary<char, char>();

    /// <summary>Orders rules by pattern length, longest first.</summary>
    private class DaitchMokotoffRuleComparer : IComparer<Rule>
    {
        public int Compare(Rule rule1, Rule rule2)
        {
            return rule2.PatternLength - rule1.PatternLength;
        }
    }

    /// <summary>
    /// Loads the rules and foldings from the embedded resource and sorts each rule list.
    /// </summary>
    static DaitchMokotoffSoundex()
    {
        Stream rulesIS = typeof(DaitchMokotoffSoundex).GetTypeInfo().Assembly.FindAndGetManifestResourceStream(typeof(DaitchMokotoffSoundex), RESOURCE_FILE);
        if (rulesIS == null)
        {
            throw new ArgumentException("Unable to load resource: " + RESOURCE_FILE);
        }

        using (TextReader scanner = new StreamReader(rulesIS, Encoding.UTF8))
        {
            ParseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS);
        }

        // sort RULES by pattern length in descending order
        foreach (var rule in RULES)
        {
            IList<Rule> ruleList = rule.Value;
            ruleList.Sort(new DaitchMokotoffRuleComparer());
        }
    }

    /// <summary>
    /// Reads rule and folding statements line by line from <paramref name="scanner"/>.
    /// <para/>
    /// Lines inside <c>/* ... */</c> blocks, text after <c>//</c> and blank lines are ignored. A line
    /// containing '=' is a folding (<c>x=y</c>, single characters on both sides); any other line is a
    /// rule made of four whitespace-separated, optionally double-quoted parts.
    /// </summary>
    /// <param name="scanner">The reader supplying the rule text.</param>
    /// <param name="location">Name of the source, used in error messages only.</param>
    /// <param name="ruleMapping">Receives the rules, keyed by the first character of their pattern.</param>
    /// <param name="asciiFoldings">Receives the folding statements.</param>
    /// <exception cref="ArgumentException">If a folding or rule statement is malformed.</exception>
    private static void ParseRules(TextReader scanner, string location,
        IDictionary<char, IList<Rule>> ruleMapping, IDictionary<char, char> asciiFoldings)
    {
        int currentLine = 0;
        bool inMultilineComment = false;

        string rawLine;
        while ((rawLine = scanner.ReadLine()) != null)
        {
            currentLine++;
            string line = rawLine;

            if (inMultilineComment)
            {
                if (line.EndsWith(MULTILINE_COMMENT_END, StringComparison.Ordinal))
                {
                    inMultilineComment = false;
                }
                continue;
            }

            if (line.StartsWith(MULTILINE_COMMENT_START, StringComparison.Ordinal))
            {
                inMultilineComment = true;
            }
            else
            {
                // discard comments; ordinal search, the marker is syntax rather than linguistic text
                int cmtI = line.IndexOf(COMMENT, StringComparison.Ordinal);
                if (cmtI >= 0)
                {
                    line = line.Substring(0, cmtI);
                }

                // trim leading-trailing whitespace
                line = line.Trim();

                if (line.Length == 0)
                {
                    continue; // empty lines can be safely skipped
                }

                if (line.Contains("="))
                {
                    // folding
                    string[] parts = line.Split(new string[] { "=" }, StringSplitOptions.RemoveEmptyEntries);
                    if (parts.Length != 2)
                    {
                        throw new ArgumentException("Malformed folding statement split into " + parts.Length +
                                " parts: " + rawLine + " in " + location);
                    }
                    else
                    {
                        string leftCharacter = parts[0];
                        string rightCharacter = parts[1];

                        if (leftCharacter.Length != 1 || rightCharacter.Length != 1)
                        {
                            throw new ArgumentException("Malformed folding statement - " +
                                    "patterns are not single characters: " + rawLine + " in " + location);
                        }

                        asciiFoldings[leftCharacter[0]] = rightCharacter[0];
                    }
                }
                else
                {
                    // rule
                    string[] parts = Regex.Split(line, "\\s+");
                    if (parts.Length != 4)
                    {
                        throw new ArgumentException("Malformed rule statement split into " + parts.Length +
                                " parts: " + rawLine + " in " + location);
                    }
                    else
                    {
                        try
                        {
                            string pattern = StripQuotes(parts[0]);
                            string replacement1 = StripQuotes(parts[1]);
                            string replacement2 = StripQuotes(parts[2]);
                            string replacement3 = StripQuotes(parts[3]);

                            Rule r = new Rule(pattern, replacement1, replacement2, replacement3);
                            char patternKey = r.Pattern[0];
                            IList<Rule> rules;
                            if (!ruleMapping.TryGetValue(patternKey, out rules) || rules == null)
                            {
                                rules = new List<Rule>();
                                ruleMapping[patternKey] = rules;
                            }
                            rules.Add(r);
                        }
                        catch (ArgumentException e)
                        {
                            throw new InvalidOperationException(
                                "Problem parsing line '" + currentLine + "' in " + location, e);
                        }
                    }
                }
            }
        }
    }

    /// <summary>
    /// Removes one leading and one trailing double quote from <paramref name="str"/>, if present.
    /// </summary>
    private static string StripQuotes(string str)
    {
        if (str.StartsWith(DOUBLE_QUOTE, StringComparison.Ordinal))
        {
            str = str.Substring(1);
        }

        if (str.EndsWith(DOUBLE_QUOTE, StringComparison.Ordinal))
        {
            str = str.Substring(0, str.Length - 1);
        }

        return str;
    }

    /// <summary>Whether to use ASCII folding prior to encoding.</summary>
    private readonly bool folding;

    /// <summary>
    /// Creates a new instance with ASCII-folding enabled.
    /// </summary>
    public DaitchMokotoffSoundex()
        : this(true)
    {
    }

    /// <summary>
    /// Creates a new instance.
    /// <para/>
    /// With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g.
    /// è -> e.
    /// </summary>
    /// <param name="folding">If ASCII-folding shall be performed before encoding.</param>
    public DaitchMokotoffSoundex(bool folding)
    {
        this.folding = folding;
    }

    /// <summary>
    /// Performs a cleanup of the input string before the actual soundex transformation.
    /// <para/>
    /// Removes all whitespace characters and performs ASCII folding if enabled.
/// </summary>
/// <param name="input">The input string to cleanup.</param>
/// <returns>A cleaned up string.</returns>
private string Cleanup(string input)
{
    StringBuilder sb = new StringBuilder(input.Length);
    foreach (char c in input)
    {
        if (char.IsWhiteSpace(c))
        {
            continue;
        }

        char ch = char.ToLowerInvariant(c);

        // Single dictionary lookup instead of ContainsKey followed by the indexer.
        char folded;
        if (folding && FOLDINGS.TryGetValue(ch, out folded))
        {
            ch = folded;
        }
        sb.Append(ch);
    }
    return sb.ToString();
}

// LUCENENET specific - in .NET we don't need an object overload, since strings are sealed anyway.
//**
// * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching.
// * <p>
// * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
// * EncoderException if the supplied object is not of type java.lang.String.
// * </p>
// *
// * @see #soundex(String)
// *
// * @param obj
// * Object to encode
// * @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String
// * supplied.
// * @throws EncoderException
// * if the parameter supplied is not of type java.lang.String
// * @throws IllegalArgumentException
// * if a character is not mapped
// */
//@Override
// public Object encode(object obj)
//{
// if (!(obj instanceof String)) {
// throw new EncoderException(
// "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String");
// }
// return encode((String) obj);
//}

/// <summary>
/// Encodes a string using the Daitch-Mokotoff soundex algorithm without branching.
/// </summary>
/// <param name="source">A string to encode; may be <c>null</c>.</param>
/// <returns>A DM Soundex code corresponding to the string supplied, or <c>null</c> if
/// <paramref name="source"/> is <c>null</c>.</returns>
/// <seealso cref="GetSoundex(string)"/>
public virtual string Encode(string source)
{
    if (source == null)
    {
        return null;
    }
    return GetSoundex(source, false)[0];
}

/// <summary>
/// Encodes a string using the Daitch-Mokotoff soundex algorithm with branching.
/// <para/>
/// In case a string is encoded into multiple codes (see branching rules), the result will contain all codes,
/// separated by '|'.
/// <para/>
/// Example: the name "AUERBACH" is encoded as both
/// <list type="bullet">
/// <item><description>097400</description></item>
/// <item><description>097500</description></item>
/// </list>
/// <para/>
/// Thus the result will be "097400|097500".
/// </summary>
/// <param name="source">A string to encode; may be <c>null</c>.</param>
/// <returns>A string containing a set of DM Soundex codes corresponding to the string supplied, or
/// <c>null</c> if <paramref name="source"/> is <c>null</c>.</returns>
public virtual string GetSoundex(string source)
{
    string[] branches = GetSoundex(source, true);
    if (branches == null)
    {
        // A null source yields a null array; return null, as Encode does, instead of failing on it.
        return null;
    }
    return string.Join("|", branches);
}

/// <summary>
/// Perform the actual DM Soundex algorithm on the input string.
/// </summary>
/// <param name="source">A string to encode.</param>
/// <param name="branching">If branching shall be performed.</param>
/// <returns>A string array containing all DM Soundex codes corresponding to the string supplied depending on the selected branching mode.</returns>
private string[] GetSoundex(string source, bool branching)
{
    if (source == null)
    {
        return null;
    }

    string cleaned = Cleanup(source);

    // LinkedHashSet preserves input order. In .NET we can use List for that purpose.
    IList<Branch> active = new List<Branch>();
    active.Add(new Branch());

    char previous = '\0'; // '\0' until the first rule-bearing character has been processed
    for (int pos = 0; pos < cleaned.Length; pos++)
    {
        char current = cleaned[pos];

        // ignore whitespace inside a name
        if (char.IsWhiteSpace(current))
        {
            continue;
        }

        string context = cleaned.Substring(pos);

        // Characters that have no rules are skipped without touching "previous".
        IList<Rule> candidates;
        bool known = RULES.TryGetValue(current, out candidates);
        if (!known || candidates == null)
        {
            continue;
        }

        // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access
        IList<Branch> pending = branching ? new List<Branch>() : Collections.EmptyList<Branch>();

        // special rule: occurrences of mn or nm are treated differently
        bool force = (previous == 'm' && current == 'n') || (previous == 'n' && current == 'm');

        foreach (Rule candidate in candidates)
        {
            if (!candidate.Matches(context))
            {
                continue;
            }

            if (branching)
            {
                pending.Clear();
            }

            string[] codes = candidate.GetReplacements(context, previous == '\0');
            bool fork = codes.Length > 1 && branching;

            foreach (Branch branch in active)
            {
                foreach (string code in codes)
                {
                    // if we have multiple replacements, always create a new branch
                    Branch target = fork ? branch.CreateBranch() : branch;
                    target.ProcessNextReplacement(code, force);

                    if (!branching)
                    {
                        // without branching only the first replacement is applied
                        break;
                    }

                    if (!pending.Contains(target))
                    {
                        pending.Add(target);
                    }
                }
            }

            if (branching)
            {
                active.Clear();
                active.AddRange(pending);
            }

            // Skip the rest of the matched pattern; only the first matching rule is applied.
            pos += candidate.PatternLength - 1;
            break;
        }

        previous = current;
    }

    string[] result = new string[active.Count];
    int slot = 0;
    foreach (Branch finished in active)
    {
        finished.Finish();
        result[slot] = finished.ToString();
        slot++;
    }

    return result;
}
} // end of class DaitchMokotoffSoundex
} // end of namespace Lucene.Net.Analysis.Phonetic.Language
