http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/DoubleMetaphoneFilterTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/DoubleMetaphoneFilterTest.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/DoubleMetaphoneFilterTest.cs
new file mode 100644
index 0000000..07e7f66
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Phonetic/DoubleMetaphoneFilterTest.cs
@@ -0,0 +1,111 @@
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Phonetic
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements. See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License. You may obtain a copy of the License at
+     *
+     * http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Checks the tokens produced by <see cref="DoubleMetaphoneFilter"/> for fixed inputs,
+    /// for random data and for the empty term.
+    /// </summary>
+    public class DoubleMetaphoneFilterTest : BaseTokenStreamTestCase
+    {
+        /// <summary>
+        /// Splits <paramref name="text"/> on whitespace and wraps the resulting stream in the
+        /// filter under test, passing <paramref name="maxCodeLength"/> and <paramref name="inject"/>
+        /// straight through to the filter constructor.
+        /// </summary>
+        private TokenStream NewFilter(string text, int maxCodeLength, bool inject)
+        {
+            TokenStream source = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(text));
+            return new DoubleMetaphoneFilter(source, maxCodeLength, inject);
+        }
+
+        /// <summary>
+        /// Builds an analyzer that feeds a whitespace <see cref="MockTokenizer"/> into the filter under test.
+        /// </summary>
+        private Analyzer NewMockAnalyzer(int maxCodeLength, bool inject)
+        {
+            return Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+            {
+                Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+                return new TokenStreamComponents(source, new DoubleMetaphoneFilter(source, maxCodeLength, inject));
+            });
+        }
+
+        [Test]
+        public void TestSize4FalseInject()
+        {
+            AssertTokenStreamContents(NewFilter("international", 4, false), new string[] { "ANTR" });
+        }
+
+        [Test]
+        public void TestSize4TrueInject()
+        {
+            AssertTokenStreamContents(NewFilter("international", 4, true), new string[] { "international", "ANTR" });
+        }
+
+        [Test]
+        public void TestAlternateInjectFalse()
+        {
+            AssertTokenStreamContents(NewFilter("Kuczewski", 4, false), new string[] { "KSSK", "KXFS" });
+        }
+
+        [Test]
+        public void TestSize8FalseInject()
+        {
+            AssertTokenStreamContents(NewFilter("international", 8, false), new string[] { "ANTRNXNL" });
+        }
+
+        [Test]
+        public void TestNonConvertableStringsWithInject()
+        {
+            AssertTokenStreamContents(NewFilter("12345 #$%@#^%&", 8, true), new string[] { "12345", "#$%@#^%&" });
+        }
+
+        [Test]
+        public void TestNonConvertableStringsWithoutInject()
+        {
+            AssertTokenStreamContents(NewFilter("12345 #$%@#^%&", 8, false), new string[] { "12345", "#$%@#^%&" });
+
+            // A convertible token following the non-convertible ones must still be encoded.
+            AssertTokenStreamContents(NewFilter("12345 #$%@#^%& hello", 8, false), new string[] { "12345", "#$%@#^%&", "HL" });
+        }
+
+        [Test]
+        public void TestRandom()
+        {
+            // One random code length is shared by the non-injecting and the injecting run.
+            int maxCodeLength = TestUtil.NextInt(Random(), 1, 8);
+
+            Analyzer withoutInject = NewMockAnalyzer(maxCodeLength, false);
+            CheckRandomData(Random(), withoutInject, 1000 * RANDOM_MULTIPLIER);
+
+            Analyzer withInject = NewMockAnalyzer(maxCodeLength, true);
+            CheckRandomData(Random(), withInject, 1000 * RANDOM_MULTIPLIER);
+        }
+
+        [Test]
+        public void TestEmptyTerm()
+        {
+            Analyzer keywordAnalyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+            {
+                Tokenizer source = new KeywordTokenizer(reader);
+                // The inject flag is chosen at random each time the components are built.
+                return new TokenStreamComponents(source, new DoubleMetaphoneFilter(source, 8, Random().nextBoolean()));
+            });
+
+            CheckOneTerm(keywordAnalyzer, "", "");
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/BeiderMorseEncoderTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/BeiderMorseEncoderTest.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/BeiderMorseEncoderTest.cs
new file mode 100644
index 0000000..bd3681b
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/BeiderMorseEncoderTest.cs
@@ -0,0 +1,255 @@
+using NUnit.Framework;
+using System;
+using System.Globalization;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Analysis.Phonetic.Language.Bm
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements. See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License. You may obtain a copy of the License at
+     *
+     * http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Tests BeiderMorseEncoder.
+    /// </summary>
+    public class BeiderMorseEncoderTest : StringEncoderAbstractTest<BeiderMorseEncoder>
+    {
+        // Alphabet cycled through by TestSpeedCheck to build its growing input.
+        private static readonly char[] TEST_CHARS = new char[] { 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'o', 'u' };
+
+        /// <summary>
+        /// Fails, reporting <paramref name="value"/>, when encoding it yields the empty string.
+        /// </summary>
+        private void AssertNotEmpty(BeiderMorseEncoder bmpm, string value)
+        {
+            Assert.False(bmpm.Encode(value).Equals(""), value);
+        }
+
+        /// <summary>
+        /// Creates an encoder configured with <c>NameType.GENERIC</c> and <c>RuleType.APPROX</c>.
+        /// </summary>
+        private BeiderMorseEncoder CreateGenericApproxEncoder()
+        {
+            BeiderMorseEncoder encoder = new BeiderMorseEncoder();
+            encoder.NameType = NameType.GENERIC;
+            encoder.RuleType = RuleType.APPROX;
+            return encoder;
+        }
+
+        protected override BeiderMorseEncoder CreateStringEncoder()
+        {
+            return new BeiderMorseEncoder();
+        }
+
+        /// <summary>
+        /// Tests we do not blow up: encodes every single-character string from
+        /// <c>char.MinValue</c> up to (but not including) <c>char.MaxValue</c>.
+        /// </summary>
+        [Test]
+        public void TestAllChars()
+        {
+            BeiderMorseEncoder bmpm = CreateGenericApproxEncoder();
+            for (char c = char.MinValue; c < char.MaxValue; c++)
+            {
+                bmpm.Encode(c.ToString());
+            }
+        }
+
+        [Test]
+        public void TestAsciiEncodeNotEmpty1Letter()
+        {
+            BeiderMorseEncoder bmpm = CreateGenericApproxEncoder();
+            for (char c = 'a'; c <= 'z'; c++)
+            {
+                string value = c.ToString();
+                string valueU = value.ToUpperInvariant();
+                AssertNotEmpty(bmpm, value);
+                AssertNotEmpty(bmpm, valueU);
+            }
+        }
+
+        [Test]
+        public void TestAsciiEncodeNotEmpty2Letters()
+        {
+            BeiderMorseEncoder bmpm = CreateGenericApproxEncoder();
+            for (char c1 = 'a'; c1 <= 'z'; c1++)
+            {
+                for (char c2 = 'a'; c2 <= 'z'; c2++)
+                {
+                    string value = new string(new char[] { c1, c2 });
+                    string valueU = value.ToUpperInvariant();
+                    AssertNotEmpty(bmpm, value);
+                    AssertNotEmpty(bmpm, valueU);
+                }
+            }
+        }
+
+        [Test]
+        public void TestEncodeAtzNotEmpty()
+        {
+            BeiderMorseEncoder bmpm = CreateGenericApproxEncoder();
+            // Escaped spellings of: "ácz", "átz", "Ignácz", "Ignátz", "Ignác"
+            string[] names = { "\u00e1cz", "\u00e1tz", "Ign\u00e1cz", "Ign\u00e1tz", "Ign\u00e1c" };
+            foreach (string name in names)
+            {
+                AssertNotEmpty(bmpm, name);
+            }
+        }
+
+        /// <summary>
+        /// Tests https://issues.apache.org/jira/browse/CODEC-125?focusedCommentId=13071566&amp;page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel#comment-13071566
+        /// </summary>
+        [Test]
+        public void TestEncodeGna()
+        {
+            BeiderMorseEncoder bmpm = CreateGenericApproxEncoder();
+            bmpm.Encode("gna");
+        }
+
+        [Test] // Java: @Test(expected = IllegalArgumentException.class)
+        public void TestInvalidLangIllegalArgumentException()
+        {
+            Assert.Throws<ArgumentException>(() => Rule.GetInstance(NameType.GENERIC, RuleType.APPROX, "noSuchLanguage"));
+        }
+
+        [Test] // Java: @Test(expected = IllegalStateException.class)
+        public void TestInvalidLangIllegalStateException()
+        {
+            Assert.Throws<InvalidOperationException>(() => Lang.LoadFromResource("thisIsAMadeUpResourceName", Languages.GetInstance(NameType.GENERIC)));
+        }
+
+        [Test] // Java: @Test(expected = IllegalArgumentException.class)
+        public void TestInvalidLanguageIllegalArgumentException()
+        {
+            Assert.Throws<ArgumentException>(() => Languages.GetInstance("thereIsNoSuchLanguage"));
+        }
+
+        [Test] // Java: @Test(timeout = 10000L) - no timeout is enforced here
+        public void TestLongestEnglishSurname()
+        {
+            BeiderMorseEncoder bmpm = CreateGenericApproxEncoder();
+            bmpm.Encode("MacGhilleseatheanaich");
+        }
+
+        [Test] // Java: @Test(expected = IndexOutOfBoundsException.class)
+        public void TestNegativeIndexForRuleMatchIndexOutOfBoundsException()
+        {
+            Assert.Throws<ArgumentOutOfRangeException>(() =>
+            {
+                Rule r = new Rule("a", "", "", new Phoneme("", Languages.ANY_LANGUAGE));
+                r.PatternAndContextMatches("bob", -1);
+            });
+        }
+
+        /// <summary>
+        /// Encodes a long markup-like phrase with the phoneme limit set to 10 and checks that the
+        /// result is non-empty and contains at most 10 '|'-separated entries.
+        /// </summary>
+        [Test]
+        public void TestOOM()
+        {
+            string phrase = "200697900'-->�</ bceaeef >aadaabcf\"aedfbff<!--\'-->?>cae" +
+                "cfaaa><?&#<!--</script>&lang&fc;aadeaf?>>&bdquo< cc =\"abff\" /></ afe >" +
+                "<script><!-- f(';< cf aefbeef = \"bfabadcf\" ebbfeedd = fccabeb >";
+
+            BeiderMorseEncoder encoder = new BeiderMorseEncoder();
+            encoder.NameType = NameType.GENERIC;
+            encoder.RuleType = RuleType.EXACT;
+            encoder.SetMaxPhonemes(10);
+
+            string phonemes = encoder.Encode(phrase);
+            Assert.True(phonemes.Length > 0);
+
+            // The separator is a single literal character, so a plain split replaces the regex.
+            string[] phonemeArr = phonemes.Split('|');
+            Assert.True(phonemeArr.Length <= 10);
+        }
+
+        [Test]
+        public void TestSetConcat()
+        {
+            BeiderMorseEncoder bmpm = new BeiderMorseEncoder();
+            bmpm.IsConcat = false;
+            Assert.False(bmpm.IsConcat, "Should be able to set concat to false");
+        }
+
+        [Test]
+        public void TestSetNameTypeAsh()
+        {
+            BeiderMorseEncoder bmpm = new BeiderMorseEncoder();
+            bmpm.NameType = NameType.ASHKENAZI;
+            Assert.AreEqual(NameType.ASHKENAZI, bmpm.NameType, "Name type should have been set to ash");
+        }
+
+        [Test]
+        public void TestSetRuleTypeExact()
+        {
+            BeiderMorseEncoder bmpm = new BeiderMorseEncoder();
+            bmpm.RuleType = RuleType.EXACT;
+            Assert.AreEqual(RuleType.EXACT, bmpm.RuleType, "Rule type should have been set to exact");
+        }
+
+        [Test] // Java: @Test(expected = IllegalArgumentException.class)
+        public void TestSetRuleTypeToRulesIllegalArgumentException()
+        {
+            Assert.Throws<ArgumentException>(() =>
+            {
+                BeiderMorseEncoder bmpm = new BeiderMorseEncoder();
+                bmpm.RuleType = RuleType.RULES;
+            });
+        }
+
+        /// <summary>
+        /// (Un)luckily, the worst performing test because of the data in <see cref="TEST_CHARS"/>.
+        /// Encodes inputs of length 1 through 40 built by cycling through <see cref="TEST_CHARS"/>.
+        /// </summary>
+        [Test] // Java: timeout = 20000L - no timeout is enforced here
+        public void TestSpeedCheck()
+        {
+            BeiderMorseEncoder bmpm = this.CreateGenericApproxEncoder();
+            StringBuilder stringBuffer = new StringBuilder();
+            stringBuffer.Append(TEST_CHARS[0]);
+            for (int i = 0, j = 1; i < 40; i++, j++)
+            {
+                // Wrap around once the end of the alphabet is reached.
+                if (j == TEST_CHARS.Length)
+                {
+                    j = 0;
+                }
+                bmpm.Encode(stringBuffer.ToString());
+                stringBuffer.Append(TEST_CHARS[j]);
+            }
+        }
+
+        [Test]
+        public void TestSpeedCheck2()
+        {
+            BeiderMorseEncoder bmpm = this.CreateGenericApproxEncoder();
+            string phrase = "ItstheendoftheworldasweknowitandIfeelfine";
+
+            // Encode every non-empty prefix of the phrase.
+            for (int i = 1; i <= phrase.Length; i++)
+            {
+                bmpm.Encode(phrase.Substring(0, i));
+            }
+        }
+
+        [Test]
+        public void TestSpeedCheck3()
+        {
+            BeiderMorseEncoder bmpm = this.CreateGenericApproxEncoder();
+            string phrase = "abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz";
+
+            // Encode every non-empty prefix of the phrase.
+            for (int i = 1; i <= phrase.Length; i++)
+            {
+                bmpm.Encode(phrase.Substring(0, i));
+            }
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/CacheSubSequencePerformanceTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/CacheSubSequencePerformanceTest.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/CacheSubSequencePerformanceTest.cs
new file mode 100644
index 0000000..45e9513
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/CacheSubSequencePerformanceTest.cs
@@ -0,0 +1,138 @@
+using Lucene.Net.Attributes;
+using Lucene.Net.Support;
+using NUnit.Framework;
+using System;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Phonetic.Language.Bm
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements. See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License. You may obtain a copy of the License at
+     *
+     * http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Prints rough timings for taking every sub-sequence of "Angelo" from a string, a
+    /// <see cref="StringBuilder"/>, and a caching <see cref="ICharSequence"/> wrapper around each.
+    /// </summary>
+    public class CacheSubSequencePerformanceTest
+    {
+        [Test, LongRunningTest]
+        public void Test()
+        {
+            //int times = 10000000;
+            int times = 100000; // LUCENENET: 10 million times would take several minutes to run - decreasing to 100,000
+            Console.WriteLine("Test with String : ");
+            Test("Angelo", times);
+            Console.WriteLine("Test with StringBuilder : ");
+            Test(new StringBuilder("Angelo"), times);
+            // The cached runs must go through ICharSequence.SubSequence. Converting the wrapper
+            // with ToString() first would only time a plain string and bypass the cache.
+            Console.WriteLine("Test with cached String : ");
+            Test(CacheSubSequence("Angelo"), times);
+            Console.WriteLine("Test with cached StringBuilder : ");
+            Test(CacheSubSequence(new StringBuilder("Angelo")), times);
+        }
+
+        /// <summary>Runs the string scenario <paramref name="times"/> times and prints the elapsed milliseconds.</summary>
+        private void Test(string input, int times)
+        {
+            System.Diagnostics.Stopwatch watch = System.Diagnostics.Stopwatch.StartNew();
+            for (int i = 0; i < times; i++)
+            {
+                Test(input);
+            }
+            watch.Stop();
+            Console.WriteLine(watch.ElapsedMilliseconds + " millis");
+        }
+
+        /// <summary>Runs the StringBuilder scenario <paramref name="times"/> times and prints the elapsed milliseconds.</summary>
+        private void Test(StringBuilder input, int times)
+        {
+            System.Diagnostics.Stopwatch watch = System.Diagnostics.Stopwatch.StartNew();
+            for (int i = 0; i < times; i++)
+            {
+                Test(input);
+            }
+            watch.Stop();
+            Console.WriteLine(watch.ElapsedMilliseconds + " millis");
+        }
+
+        /// <summary>Runs the char-sequence scenario <paramref name="times"/> times and prints the elapsed milliseconds.</summary>
+        private void Test(ICharSequence input, int times)
+        {
+            System.Diagnostics.Stopwatch watch = System.Diagnostics.Stopwatch.StartNew();
+            for (int i = 0; i < times; i++)
+            {
+                Test(input);
+            }
+            watch.Stop();
+            Console.WriteLine(watch.ElapsedMilliseconds + " millis");
+        }
+
+        /// <summary>Takes every substring [i, j) of <paramref name="input"/> with i &lt;= j, discarding the results.</summary>
+        private void Test(string input)
+        {
+            for (int i = 0; i < input.Length; i++)
+            {
+                for (int j = i; j <= input.Length; j++)
+                {
+                    input.Substring(i, (j - i));
+                }
+            }
+        }
+
+        /// <summary>Takes every substring [i, j) of <paramref name="input"/> with i &lt;= j, discarding the results.</summary>
+        private void Test(StringBuilder input)
+        {
+            for (int i = 0; i < input.Length; i++)
+            {
+                for (int j = i; j <= input.Length; j++)
+                {
+                    input.ToString(i, (j - i));
+                }
+            }
+        }
+
+        /// <summary>Takes every sub-sequence [i, j) of <paramref name="input"/> with i &lt;= j, discarding the results.</summary>
+        private void Test(ICharSequence input)
+        {
+            for (int i = 0; i < input.Length; i++)
+            {
+                for (int j = i; j <= input.Length; j++)
+                {
+                    input.SubSequence(i, j);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Char sequence over a fixed string that remembers each non-empty sub-sequence it has
+        /// produced in a table indexed by [start][end - 1].
+        /// </summary>
+        private class CachedCharSequence : ICharSequence
+        {
+            private readonly string[][] cache;
+            private readonly string cached;
+
+            public CachedCharSequence(string[][] cache, string cached)
+            {
+                this.cache = cache;
+                this.cached = cached;
+            }
+
+            public char this[int index]
+            {
+                get
+                {
+                    return cached[index];
+                }
+            }
+
+            public int Length
+            {
+                get
+                {
+                    return cached.Length;
+                }
+            }
+
+            public ICharSequence SubSequence(int start, int end)
+            {
+                // Empty ranges have no slot in the table (end - 1 could be -1).
+                if (start == end)
+                {
+                    return "".ToCharSequence();
+                }
+                string res = cache[start][end - 1];
+                if (res == null)
+                {
+                    res = cached.Substring(start, end - start);
+                    cache[start][end - 1] = res;
+                }
+                return res.ToCharSequence();
+            }
+
+            /// <summary>Returns the wrapped text rather than the type name.</summary>
+            public override string ToString()
+            {
+                return cached;
+            }
+        }
+
+        /// <summary>Wraps <paramref name="cached"/> in a caching sequence with an empty Length x Length table.</summary>
+        private ICharSequence CacheSubSequence(string cached)
+        {
+            string[][] cache = Support.RectangularArrays.ReturnRectangularArray<string>(cached.Length, cached.Length);
+            return new CachedCharSequence(cache, cached);
+        }
+
+        /// <summary>Wraps the current content of <paramref name="cached"/> in a caching sequence with an empty Length x Length table.</summary>
+        private ICharSequence CacheSubSequence(StringBuilder cached)
+        {
+            string[][] cache = Support.RectangularArrays.ReturnRectangularArray<string>(cached.Length, cached.Length);
+            return new CachedCharSequence(cache, cached.ToString());
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/LanguageGuessingTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/LanguageGuessingTest.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/LanguageGuessingTest.cs
new file mode 100644
index 0000000..d50c6f7
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/LanguageGuessingTest.cs
@@ -0,0 +1,84 @@
+using Lucene.Net.Support;
+using NUnit.Framework;
+using System;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Analysis.Phonetic.Language.Bm
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements. See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.
You may obtain a copy of the License at
+     *
+     * http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Tests guessLanguages API.
+    /// <para/>
+    /// since 1.6
+    /// </summary>
+    public class LanguageGuessingTest
+    {
+        private const string EXACT = "exact";
+        private const string ONE_OF = "one of";
+
+        /// <summary>
+        /// Test cases as { name, expected language, exactness } rows, consumed through
+        /// <c>[TestCaseSource("Values")]</c>.
+        /// </summary>
+        public static List<object[]> Values = Arrays.AsList(new object[][] {
+            new object[] { "Renault", "french", EXACT },
+            new object[] { "Mickiewicz", "polish", EXACT },
+            new object[] { "Thompson", "english", ONE_OF }, // this also hits german and greeklatin
+            new object[] { "Nu\u00f1ez", "spanish", EXACT }, // Nuñez
+            new object[] { "Carvalho", "portuguese", EXACT },
+            new object[] { "\u010capek", "czech", EXACT }, // Čapek
+            new object[] { "Sjneijder", "dutch", EXACT },
+            new object[] { "Klausewitz", "german", EXACT },
+            new object[] { "K\u00fc\u00e7\u00fck", "turkish", EXACT }, // Küçük
+            new object[] { "Giacometti", "italian", EXACT },
+            new object[] { "Nagy", "hungarian", EXACT },
+            new object[] { "Ceau\u015fescu", "romanian", EXACT }, // Ceauşescu
+            new object[] { "Angelopoulos", "greeklatin", EXACT },
+            new object[] { "\u0391\u03b3\u03b3\u03b5\u03bb\u03cc\u03c0\u03bf\u03c5\u03bb\u03bf\u03c2", "greek", EXACT }, // Αγγελόπουλος
+            new object[] { "\u041f\u0443\u0448\u043a\u0438\u043d", "cyrillic", EXACT }, // Пушкин
+            new object[] { "\u05db\u05d4\u05df", "hebrew", EXACT }, // כהן
+            new object[] { "\u00e1cz", "any", EXACT }, // ácz
+            new object[] { "\u00e1tz", "any", EXACT } // átz
+        });
+
+        private readonly Lang lang = Lang.GetInstance(NameType.GENERIC);
+
+        // LUCENENET: name, language and exactness are passed to the test method as arguments
+        // instead of being stored in fields by a parameterized constructor.
+
+        /// <summary>
+        /// Asserts that the languages guessed for <paramref name="name"/> contain
+        /// <paramref name="language"/>.
+        /// </summary>
+        /// <param name="name">Name to guess the language of.</param>
+        /// <param name="language">Language the guess must contain.</param>
+        /// <param name="exactness">Part of each test-case row; not used by the assertion.</param>
+        [Test]
+        [TestCaseSource("Values")]
+        public void TestLanguageGuessing(string name, string language, string exactness)
+        {
+            LanguageSet guesses = this.lang.GuessLanguages(name);
+
+            Assert.True(guesses.Contains(language),
+                "language predicted for name '" + name + "' is wrong: " + guesses + " should contain '" + language + "'"
+                );
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/PhoneticEnginePerformanceTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/PhoneticEnginePerformanceTest.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/PhoneticEnginePerformanceTest.cs
new file mode 100644
index 0000000..7b8b400
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/PhoneticEnginePerformanceTest.cs
@@ -0,0 +1,141 @@
+using NUnit.Framework;
+using System;
+
+namespace Lucene.Net.Analysis.Phonetic.Language.Bm
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements. See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License. You may obtain a copy of the License at
+     *
+     * http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /** + * Tests performance for {@link PhoneticEngine}. + * <p> + * See <a href="https://issues.apache.org/jira/browse/CODEC-174">[CODEC-174] Improve performance of Beider Morse + * encoder</a>. + * </p> + * <p> + * Results for November 7, 2013, project SVN revision 1539678. + * </p> + * <p> + * Environment: + * </p> + * <ul> + * <li>java version "1.7.0_45"</li> + * <li>Java(TM) SE Runtime Environment (build 1.7.0_45-b18)</li> + * <li>Java HotSpot(TM) 64-Bit Server VM (build 24.45-b08, mixed mode)</li> + * <li>OS name: "windows 7", version: "6.1", arch: "amd64", family: "windows")</li> + * </ul> + * <ol> + * <li>Time for encoding 80,000 times the input 'Angelo': 33,039 millis.</li> + * <li>Time for encoding 80,000 times the input 'Angelo': 32,297 millis.</li> + * <li>Time for encoding 80,000 times the input 'Angelo': 32,857 millis.</li> + * <li>Time for encoding 80,000 times the input 'Angelo': <b>31,561 millis.</b></li> + * <li>Time for encoding 80,000 times the input 'Angelo': 32,665 millis.</li> + * <li>Time for encoding 80,000 times the input 'Angelo': 32,215 millis.</li> + * </ol> + * <p> + * On this file's revision 1539678, with patch <a + * href="https://issues.apache.org/jira/secure/attachment/12611963/CODEC-174-change-rules-storage-to-Map.patch" + * >CODEC-174-change-rules-storage-to-Map</a>: + * </p> + * <ol> + * <li>Time for encoding 80,000 times the input 'Angelo': 18,196 millis.</li> + * <li>Time for encoding 80,000 times the input 'Angelo': 13,858 millis.</li> + * <li>Time for encoding 80,000 times the input 'Angelo': 13,644 millis.</li> + * <li>Time for encoding 80,000 times the input 'Angelo': <b>13,591 millis.</b></li> + * <li>Time for encoding 80,000 times the input 'Angelo': 13,861 millis.</li> + * <li>Time for encoding 80,000 times the input 'Angelo': 13,696 millis.</li> + * </ol> + * <p> + * Patch applied, committed revision 
1539783. + * </p> + * <p> + * On this file's revision 1539783, with patch <a + * href="https://issues.apache.org/jira/secure/attachment/12611962/CODEC-174-delete-subsequence-cache.patch" + * >CODEC-174-delete-subsequence-cache.patch</a>: + * </p> + * <ol> + * <li>Time for encoding 80,000 times the input 'Angelo': 13,547 millis.</li> + * <li>Time for encoding 80,000 times the input 'Angelo': <b>13,501 millis.</b></li> + * <li>Time for encoding 80,000 times the input 'Angelo': 13,528 millis.</li> + * <li>Time for encoding 80,000 times the input 'Angelo': 17,110 millis.</li> + * <li>Time for encoding 80,000 times the input 'Angelo': 13,910 millis.</li> + * <li>Time for encoding 80,000 times the input 'Angelo': 16,969 millis.</li> + * </ol> + * <p> + * Patch not applied. + * </p> + * <p> + * On this file's revision 1539787, with patch <a + * href="https://issues.apache.org/jira/secure/attachment/12612178/CODEC-174-reuse-set-in-PhonemeBuilder.patch" + * >CODEC-174-reuse-set-in-PhonemeBuilder.patch</a>: + * </p> + * <ol> + * <li>Time for encoding 80,000 times the input 'Angelo': 13,724 millis.</li> + * <li>Time for encoding 80,000 times the input 'Angelo': 13,451 millis.</li> + * <li>Time for encoding 80,000 times the input 'Angelo': 13,742 millis.</li> + * <li>Time for encoding 80,000 times the input 'Angelo': <b>13,186 millis.</b></li> + * <li>Time for encoding 80,000 times the input 'Angelo': 13,600 millis.</li> + * <li>Time for encoding 80,000 times the input 'Angelo': 16,405 millis.</li> + * </ol> + * <p> + * Patch applied, committed revision 1539788. 
+ * </p> + * <p> + * Before patch https://issues.apache.org/jira/secure/attachment/12613371/CODEC-174-refactor-restrictTo-method-in-SomeLanguages.patch + * </p> + * <ol> + * <li>Time for encoding 80,000 times the input 'Angelo': 13,133 millis.</li> + * <li>Time for encoding 80,000 times the input 'Angelo': 13,064 millis.</li> + * <li>Time for encoding 80,000 times the input 'Angelo': <b>12,838 millis.</b></li> + * <li>Time for encoding 80,000 times the input 'Angelo': 12,970 millis.</li> + * <li>Time for encoding 80,000 times the input 'Angelo': 13,122 millis.</li> + * <li>Time for encoding 80,000 times the input 'Angelo': 13,293 millis.</li> + * </ol> + * <p> + * After patch https://issues.apache.org/jira/secure/attachment/12613371/CODEC-174-refactor-restrictTo-method-in-SomeLanguages.patch + * </p> + * <ol> + * <li>Time for encoding 80,000 times the input 'Angelo': 11,576 millis.</li> + * <li>Time for encoding 80,000 times the input 'Angelo': 11,506 millis.</li> + * <li>Time for encoding 80,000 times the input 'Angelo': 11,361 millis.</li> + * <li>Time for encoding 80,000 times the input 'Angelo': <b>11,142 millis.</b></li> + * <li>Time for encoding 80,000 times the input 'Angelo': 11,430 millis.</li> + * <li>Time for encoding 80,000 times the input 'Angelo': 11,297 millis.</li> + * </ol> + * <p> + * Patch applied, committed revision 1541234. 
+     * </p>
+     */
+    public class PhoneticEnginePerformanceTest
+    {
+        // Number of times the input is encoded in a single run.
+        private static readonly int LOOP = 80000;
+
+        /// <summary>
+        /// Encodes "Angelo" <see cref="LOOP"/> times with a GENERIC/APPROX engine and prints the
+        /// elapsed wall-clock time in milliseconds. Nothing is asserted.
+        /// </summary>
+        [Test]
+        public void Test()
+        {
+            PhoneticEngine engine = new PhoneticEngine(NameType.GENERIC, RuleType.APPROX, true);
+            String input = "Angelo";
+            // Stopwatch reports real milliseconds; a DateTime tick delta is in 100 ns units.
+            System.Diagnostics.Stopwatch watch = System.Diagnostics.Stopwatch.StartNew();
+            for (int i = 0; i < LOOP; i++)
+            {
+                engine.Encode(input);
+            }
+            watch.Stop();
+            long totalMillis = watch.ElapsedMilliseconds;
+            Console.WriteLine(String.Format("Time for encoding {0} times the input '{1}': {2} millis.", LOOP, input, totalMillis));
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/PhoneticEngineRegressionTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/PhoneticEngineRegressionTest.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/PhoneticEngineRegressionTest.cs
new file mode 100644
index 0000000..cb9a40d
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/PhoneticEngineRegressionTest.cs
@@ -0,0 +1,234 @@
+using Lucene.Net.Support;
+using NUnit.Framework;
+using System;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Analysis.Phonetic.Language.Bm
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements. See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License. You may obtain a copy of the License at
+     *
+     * http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */

    /// <summary>
    /// Tests <see cref="PhoneticEngine"/> and <see cref="LanguageSet"/> in ways very similar to code found in solr-3.6.0.
    /// <para/>
    /// since 1.7
    /// </summary>
    public class PhoneticEngineRegressionTest
    {
        [Test]
        public void TestSolrGENERIC()
        {
            const string approxAngelo = "agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo";
            const string approxDAngelo = "(agilo|angilo|aniilo|anilo|anxilo|anzilo|ogilo|ongilo|oniilo|onilo|onxilo|onzilo)-(dagilo|dangilo|daniilo|danilo|danxilo|danzilo|dogilo|dongilo|doniilo|donilo|donxilo|donzilo)";
            const string approxRestricted = "angilo|anxilo|anzilo|ongilo|onxilo|onzilo";
            const string exactAngelo = "anZelo|andZelo|angelo|anhelo|anjelo|anxelo";
            const string exactDAngelo = "(anZelo|andZelo|angelo|anhelo|anjelo|anxelo)-(danZelo|dandZelo|dangelo|danhelo|danjelo|danxelo)";
            const string exactRestricted = "andZelo|angelo|anxelo";

            // Only the first scenario sets nameType explicitly; the remaining three pass
            // null and therefore exercise Encode's GENERIC default.

            // concat is true, ruleType is EXACT
            AssertSolrScenario("GENERIC", "EXACT", true, approxAngelo, exactAngelo, exactDAngelo, exactRestricted);
            // concat is false, ruleType is EXACT
            AssertSolrScenario(null, "EXACT", false, approxAngelo, exactAngelo, exactDAngelo, exactRestricted);
            // concat is true, ruleType is APPROX
            AssertSolrScenario(null, "APPROX", true, approxAngelo, approxAngelo, approxDAngelo, approxRestricted);
            // concat is false, ruleType is APPROX
            AssertSolrScenario(null, "APPROX", false, approxAngelo, approxAngelo, approxDAngelo, approxRestricted);
        }

        [Test]
        public void TestSolrASHKENAZI()
        {
            const string approxAngelo = "AnElO|AnSelO|AngElO|AngzelO|AnkselO|AnzelO";
            const string approxDAngelo = "dAnElO|dAnSelO|dAngElO|dAngzelO|dAnkselO|dAnzelO";
            const string approxRestricted = "AnSelO|AngElO|AngzelO|AnkselO";
            const string exactAngelo = "andZelo|angelo|anhelo|anxelo";
            const string exactDAngelo = "dandZelo|dangelo|danhelo|danxelo";
            const string exactRestricted = "angelo|anxelo";

            // concat is true, ruleType is EXACT
            AssertSolrScenario("ASHKENAZI", "EXACT", true, approxAngelo, exactAngelo, exactDAngelo, exactRestricted);
            // concat is false, ruleType is EXACT
            AssertSolrScenario("ASHKENAZI", "EXACT", false, approxAngelo, exactAngelo, exactDAngelo, exactRestricted);
            // concat is true, ruleType is APPROX
            AssertSolrScenario("ASHKENAZI", "APPROX", true, approxAngelo, approxAngelo, approxDAngelo, approxRestricted);
            // concat is false, ruleType is APPROX
            AssertSolrScenario("ASHKENAZI", "APPROX", false, approxAngelo, approxAngelo, approxDAngelo, approxRestricted);
        }

        [Test]
        public void TestSolrSEPHARDIC()
        {
            const string approxAngelo = "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu";
            const string exactAngelo = "anZelo|andZelo|anxelo";
            const string exactRestricted = "andZelo|anxelo";

            // For this name type the expected "D'Angelo" encoding differs between
            // concat = true and concat = false, so it is spelled out per scenario.

            // concat is true, ruleType is EXACT
            AssertSolrScenario("SEPHARDIC", "EXACT", true, approxAngelo, exactAngelo, "anZelo|andZelo|anxelo", exactRestricted);
            // concat is false, ruleType is EXACT
            AssertSolrScenario("SEPHARDIC", "EXACT", false, approxAngelo, exactAngelo, "danZelo|dandZelo|danxelo", exactRestricted);
            // concat is true, ruleType is APPROX
            AssertSolrScenario("SEPHARDIC", "APPROX", true, approxAngelo, approxAngelo, "anhila|anhilu|anzila|anzilu|nhila|nhilu|nzila|nzilu", approxAngelo);
            // concat is false, ruleType is APPROX
            AssertSolrScenario("SEPHARDIC", "APPROX", false, approxAngelo, approxAngelo, "danhila|danhilu|danzila|danzilu|nhila|nhilu|nzila|nzilu", approxAngelo);
        }

        /// <summary>
        /// Runs one scenario: builds the argument map step by step and checks the encoding
        /// after each step. Expected values are passed first to <c>Assert.AreEqual</c> so
        /// that failure messages label expected and actual correctly.
        /// </summary>
        /// <param name="nameType">Value for the "nameType" argument, or <c>null</c> to leave it unset.</param>
        /// <param name="ruleType">Value for the "ruleType" argument, added after the first check.</param>
        /// <param name="concat">Passed through to <see cref="Encode"/>.</param>
        /// <param name="expectedDefault">Expected encoding of "Angelo" before "ruleType" is set.</param>
        /// <param name="expectedAngelo">Expected encoding of "Angelo" once "ruleType" is set.</param>
        /// <param name="expectedDAngelo">Expected encoding of "D'Angelo" once "ruleType" is set.</param>
        /// <param name="expectedRestricted">Expected encoding of "Angelo" once "languageSet" is "italian,greek,spanish".</param>
        private static void AssertSolrScenario(string nameType, string ruleType, bool concat,
            string expectedDefault, string expectedAngelo, string expectedDAngelo, string expectedRestricted)
        {
            IDictionary<String, String> args = new SortedDictionary<String, String>();
            if (nameType != null)
            {
                args["nameType"] = nameType;
            }
            Assert.AreEqual(expectedDefault, Encode(args, concat, "Angelo"));

            args["ruleType"] = ruleType;
            Assert.AreEqual(expectedAngelo, Encode(args, concat, "Angelo"));
            Assert.AreEqual(expectedDAngelo, Encode(args, concat, "D'Angelo"));

            args["languageSet"] = "italian,greek,spanish";
            Assert.AreEqual(expectedRestricted, Encode(args, concat, "Angelo"));
            Assert.AreEqual("", Encode(args, concat, "1234"));
        }

        /// <summary>
        /// This code is similar in style to code found in Solr:
        /// solr/core/src/java/org/apache/solr/analysis/BeiderMorseFilterFactory.java
        /// <para/>
        /// Making a test out of it to protect Solr from possible future
        /// regressions in Commons-Codec.
        /// </summary>
        /// <param name="args">Optional "nameType", "ruleType" and "languageSet" settings; missing keys fall back to defaults.</param>
        /// <param name="concat">Passed to the <see cref="PhoneticEngine"/> constructor.</param>
        /// <param name="input">The text to encode.</param>
        /// <returns>The result of <c>PhoneticEngine.Encode</c> for <paramref name="input"/>.</returns>
        private static String Encode(IDictionary<String, String> args, bool concat, String input)
        {
            // PhoneticEngine = NameType + RuleType + concat
            // we use common-codec's defaults: GENERIC + APPROX + true
            String nameTypeArg;
            args.TryGetValue("nameType", out nameTypeArg);
            NameType nameType = (nameTypeArg == null)
                ? NameType.GENERIC
                : (NameType)Enum.Parse(typeof(NameType), nameTypeArg, true);

            String ruleTypeArg;
            args.TryGetValue("ruleType", out ruleTypeArg);
            RuleType ruleType = (ruleTypeArg == null)
                ? RuleType.APPROX
                : (RuleType)Enum.Parse(typeof(RuleType), ruleTypeArg, true);

            PhoneticEngine engine = new PhoneticEngine(nameType, ruleType, concat);

            // LanguageSet: defaults to automagic, otherwise a comma-separated list.
            String languageSetArg;
            args.TryGetValue("languageSet", out languageSetArg);

            /*
                org/apache/lucene/analysis/phonetic/BeiderMorseFilter.java (lines 96-98) does this:

                encoded = (languages == null)
                    ? engine.encode(termAtt.toString())
                    : engine.encode(termAtt.toString(), languages);

                Hence our approach, below:
            */
            if (languageSetArg == null || string.Equals(languageSetArg, "auto", StringComparison.Ordinal))
            {
                return engine.Encode(input);
            }

            LanguageSet languageSet = LanguageSet.From(
                new HashSet<String>(languageSetArg.Split(new string[] { "," }, StringSplitOptions.RemoveEmptyEntries)));
            return engine.Encode(input, languageSet);
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/PhoneticEngineTest.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/PhoneticEngineTest.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/PhoneticEngineTest.cs new file mode 100644 index 0000000..281fc45 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/PhoneticEngineTest.cs @@ -0,0 +1,89 @@ +using NUnit.Framework; +using System; +using System.Collections.Generic; +using System.Text.RegularExpressions; + +namespace Lucene.Net.Analysis.Phonetic.Language.Bm +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */

    /// <summary>
    /// Data-driven checks of <see cref="PhoneticEngine"/>: each row of <see cref="Values"/>
    /// supplies one set of arguments for <see cref="TestEncode"/>.
    /// </summary>
    public class PhoneticEngineTest
    {
        // Phoneme limit shared by most rows; declared before Values, whose initializer reads it.
        private static readonly int DefaultMaxPhonemes = 10;

        // Row layout: name, expected encoding, name type, rule type, concat, max phonemes.
        public static List<Object[]> Values = new List<object[]>
        {
            new Object[] { "Renault", "rinD|rinDlt|rina|rinalt|rino|rinolt|rinu|rinult", NameType.GENERIC, RuleType.APPROX, true, DefaultMaxPhonemes },
            new Object[] { "Renault", "rYnDlt|rYnalt|rYnult|rinDlt|rinalt|rinult", NameType.ASHKENAZI, RuleType.APPROX, true, DefaultMaxPhonemes },
            new Object[] { "Renault", "rYnDlt", NameType.ASHKENAZI, RuleType.APPROX, true, 1 },
            new Object[] { "Renault", "rinDlt", NameType.SEPHARDIC, RuleType.APPROX, true, DefaultMaxPhonemes },
            new Object[] { "SntJohn-Smith", "sntjonsmit", NameType.GENERIC, RuleType.EXACT, true, DefaultMaxPhonemes },
            new Object[] { "d'ortley", "(ortlaj|ortlej)-(dortlaj|dortlej)", NameType.GENERIC, RuleType.EXACT, true, DefaultMaxPhonemes },
            new Object[]
            {
                "van helsing",
                "(elSink|elsink|helSink|helsink|helzink|xelsink)-(banhelsink|fanhelsink|fanhelzink|vanhelsink|vanhelzink|vanjelsink)",
                NameType.GENERIC,
                RuleType.EXACT,
                false,
                DefaultMaxPhonemes
            }
        };

        // The per-case state that the previously commented-out fields and constructor held
        // (name, phoneticExpected, nameType, ruleType, concat, maxPhonemes) is passed
        // straight to TestEncode as parameters instead.

        /// <summary>
        /// Encodes <paramref name="name"/> and checks both the exact result and that no
        /// '|'-separated group holds more than <paramref name="maxPhonemes"/> entries.
        /// </summary>
        [Test] // the Java original was annotated @Test(timeout = 10000L)
        [TestCaseSource("Values")]
        public void TestEncode(String name, String phoneticExpected, NameType nameType,
                               RuleType ruleType, bool concat, int maxPhonemes)
        {
            PhoneticEngine engine = new PhoneticEngine(nameType, ruleType, concat, maxPhonemes);

            String phoneticActual = engine.Encode(name);

            Assert.AreEqual(phoneticExpected, phoneticActual, "phoneme incorrect");

            // With concat the whole result is counted as one group; otherwise each
            // '-'-separated word is counted on its own.
            String[] groups = concat
                ? new String[] { phoneticActual }
                : phoneticActual.Split(new string[] { "-" }, StringSplitOptions.RemoveEmptyEntries);

            foreach (String group in groups)
            {
                int alternatives = Regex.Split(group, "\\|").Length;
                Assert.True(alternatives <= maxPhonemes);
            }
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/RuleTest.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/RuleTest.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/RuleTest.cs new file mode 100644 index 0000000..fd2e8a2 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Bm/RuleTest.cs @@ -0,0 +1,163 @@ +using NUnit.Framework; +using System; + +namespace Lucene.Net.Analysis.Phonetic.Language.Bm +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */

    /// <summary>
    /// Tests Rule.
    /// <para/>
    /// since 1.6
    /// </summary>
    public class RuleTest
    {
        //    private static class NegativeIntegerBaseMatcher : BaseMatcher<Integer> {
        //        @Override
        //        public void describeTo(final Description description)
        //        {
        //            description.appendText("value should be negative");
        //        }

        //        @Override
        //        public boolean matches(final Object item)
        //        {
        //            return ((Integer)item).intValue() < 0;
        //        }
        //}

        /// <summary>
        /// Builds two groups of phonemes (all with <c>Languages.NO_LANGUAGES</c>); the
        /// comparison tests below rely on the order in which the texts are listed.
        /// </summary>
        private Phoneme[][] MakePhonemes()
        {
            String[][] texts =
            {
                new string[] { "rinD", "rinDlt", "rina", "rinalt", "rino", "rinolt", "rinu", "rinult" },
                new string[] { "dortlaj", "dortlej", "ortlaj", "ortlej", "ortlej-dortlaj" }
            };

            return Array.ConvertAll(
                texts,
                group => Array.ConvertAll(group, text => new Phoneme(text, Languages.NO_LANGUAGES)));
        }

        [Test]
        public void TestPhonemeComparedToLaterIsNegative()
        {
            foreach (Phoneme[] group in MakePhonemes())
            {
                for (int earlier = 0; earlier < group.Length; earlier++)
                {
                    for (int later = earlier + 1; later < group.Length; later++)
                    {
                        int result = Phoneme.COMPARER.Compare(group[earlier], group[later]);

                        Assert.True(result < 0,
                            "Comparing " + group[earlier].GetPhonemeText() + " to " + group[later].GetPhonemeText() + " should be negative");
                    }
                }
            }
        }

        [Test]
        public void TestPhonemeComparedToSelfIsZero()
        {
            foreach (Phoneme[] group in MakePhonemes())
            {
                foreach (Phoneme phoneme in group)
                {
                    int result = Phoneme.COMPARER.Compare(phoneme, phoneme);
                    Assert.AreEqual(0, result,
                        "Phoneme compared to itself should be zero: " + phoneme.GetPhonemeText());
                }
            }
        }

        [Test]
        public void TestSubSequenceWorks()
        {
            // AppendableCharSequence is private to Rule. We can only make it through a Phoneme.

            Phoneme a = new Phoneme("a", null);
            Phoneme b = new Phoneme("b", null);
            Phoneme cd = new Phoneme("cd", null);
            Phoneme ef = new Phoneme("ef", null);
            Phoneme ghi = new Phoneme("ghi", null);
            Phoneme jkl = new Phoneme("jkl", null);

            AssertCharacters("a", a);
            AssertCharacters("b", b);
            AssertCharacters("cd", cd);
            AssertCharacters("ef", ef);
            AssertCharacters("ghi", ghi);
            AssertCharacters("jkl", jkl);

            // Joined phonemes: check every character and every non-empty substring.
            Phoneme a_b = new Phoneme(a, b);
            AssertCharacters("ab", a_b);
            AssertAllSubstrings("ab", a_b);

            Phoneme cd_ef = new Phoneme(cd, ef);
            AssertCharacters("cdef", cd_ef);
            AssertAllSubstrings("cdef", cd_ef);

            Phoneme a_b_cd = new Phoneme(new Phoneme(a, b), cd);
            AssertCharacters("abcd", a_b_cd);
            AssertAllSubstrings("abcd", a_b_cd);
        }

        // Asserts that the phoneme text matches 'expected' at every index of 'expected'.
        private static void AssertCharacters(string expected, Phoneme phoneme)
        {
            for (int index = 0; index < expected.Length; index++)
            {
                Assert.AreEqual(expected[index], phoneme.GetPhonemeText()[index]);
            }
        }

        // Asserts that every non-empty contiguous substring of the phoneme text equals
        // the same slice of 'expected'.
        private static void AssertAllSubstrings(string expected, Phoneme phoneme)
        {
            for (int start = 0; start < expected.Length; start++)
            {
                for (int end = start + 1; end <= expected.Length; end++)
                {
                    Assert.AreEqual(
                        expected.Substring(start, end - start),
                        phoneme.GetPhonemeText().Substring(start, end - start).ToString());
                }
            }
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Caverphone1Test.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Caverphone1Test.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Caverphone1Test.cs new 
file mode 100644 index 0000000..9112ed4 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Caverphone1Test.cs @@ -0,0 +1,109 @@ +using NUnit.Framework; +using System; + +namespace Lucene.Net.Analysis.Phonetic.Language +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */

    /// <summary>
    /// Tests Caverphone1.
    /// </summary>
    public class Caverphone1Test : StringEncoderAbstractTest<Caverphone1>
    {
        protected override Caverphone1 CreateStringEncoder()
        {
            return new Caverphone1();
        }

        /// <summary>
        /// Tests example adapted from version 2.0 http://caversham.otago.ac.nz/files/working/ctp150804.pdf
        /// <para/>
        /// AT1111 words: add, aid, at, art, eat, earth, head, hit, hot, hold, hard, heart, it, out, old
        /// </summary>
        [Test]
        public void TestCaverphoneRevisitedCommonCodeAT1111()
        {
            String[] words =
            {
                "add", "aid", "at", "art", "eat",
                "earth", "head", "hit", "hot", "hold",
                "hard", "heart", "it", "out", "old"
            };
            this.CheckEncodingVariations("AT1111", words);
        }

        [Test]
        public void TestEndMb()
        {
            String[][] pairs =
            {
                new string[] { "mb", "M11111" },
                new string[] { "mbmb", "MPM111" }
            };
            this.CheckEncodings(pairs);
        }

        /// <summary>
        /// Tests some examples from version 2.0 http://caversham.otago.ac.nz/files/working/ctp150804.pdf
        /// </summary>
        [Test]
        public void TestIsCaverphoneEquals()
        {
            Caverphone1 encoder = new Caverphone1();

            bool peterVsStevenson = encoder.IsEncodeEqual("Peter", "Stevenson");
            Assert.False(peterVsStevenson, "Caverphone encodings should not be equal");

            bool peterVsPeady = encoder.IsEncodeEqual("Peter", "Peady");
            Assert.True(peterVsPeady, "Caverphone encodings should be equal");
        }

        /// <summary>
        /// Tests example from http://caversham.otago.ac.nz/files/working/ctp060902.pdf
        /// </summary>
        [Test]
        public void TestSpecificationV1Examples()
        {
            String[][] pairs =
            {
                new string[] { "David", "TFT111" },
                new string[] { "Whittle", "WTL111" }
            };
            this.CheckEncodings(pairs);
        }

        /// <summary>
        /// Tests examples from http://en.wikipedia.org/wiki/Caverphone
        /// </summary>
        [Test]
        public void TestWikipediaExamples()
        {
            String[][] pairs =
            {
                new string[] { "Lee", "L11111" },
                new string[] { "Thompson", "TMPSN1" }
            };
            this.CheckEncodings(pairs);
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Caverphone2Test .cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Caverphone2Test .cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Caverphone2Test .cs new file mode 100644 index 0000000..4ec1daa --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/Caverphone2Test .cs @@ -0,0 +1,375 @@ +using NUnit.Framework; +using System; + +namespace Lucene.Net.Analysis.Phonetic.Language +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */

    /// <summary>
    /// Tests Caverphone2.
    /// </summary>
    public class Caverphone2Test : StringEncoderAbstractTest<Caverphone2>
    {
        protected override Caverphone2 CreateStringEncoder()
        {
            return new Caverphone2();
        }

        /// <summary>
        /// See http://caversham.otago.ac.nz/files/working/ctp150804.pdf
        /// <para/>
        /// AT11111111 words: add, aid, at, art, eat, earth, head, hit, hot, hold, hard, heart, it, out, old
        /// </summary>
        [Test]
        public void TestCaverphoneRevisitedCommonCodeAT11111111()
        {
            String[] words =
            {
                "add", "aid", "at", "art", "eat",
                "earth", "head", "hit", "hot", "hold",
                "hard", "heart", "it", "out", "old"
            };
            this.CheckEncodingVariations("AT11111111", words);
        }

        /// <summary>
        /// See http://caversham.otago.ac.nz/files/working/ctp150804.pdf
        /// </summary>
        [Test]
        public void TestCaverphoneRevisitedExamples()
        {
            String[][] pairs =
            {
                new string[] { "Stevenson", "STFNSN1111" },
                new string[] { "Peter", "PTA1111111" }
            };
            this.CheckEncodings(pairs);
        }

        /// <summary>
        /// See http://caversham.otago.ac.nz/files/working/ctp150804.pdf
        /// </summary>
        [Test]
        public void TestCaverphoneRevisitedRandomNameKLN1111111()
        {
            String[] names =
            {
                "Cailean", "Calan", "Calen", "Callahan", "Callan", "Callean",
                "Carleen", "Carlen", "Carlene", "Carlin", "Carline", "Carlyn",
                "Carlynn", "Carlynne", "Charlean", "Charleen", "Charlene", "Charline",
                "Cherlyn", "Chirlin", "Clein", "Cleon", "Cline", "Cohleen",
                "Colan", "Coleen", "Colene", "Colin", "Colleen", "Collen",
                "Collin", "Colline", "Colon", "Cullan", "Cullen", "Cullin",
                "Gaelan", "Galan", "Galen", "Garlan", "Garlen", "Gaulin",
                "Gayleen", "Gaylene", "Giliane", "Gillan", "Gillian", "Glen",
                "Glenn", "Glyn", "Glynn", "Gollin", "Gorlin", "Kalin",
                "Karlan", "Karleen", "Karlen", "Karlene", "Karlin", "Karlyn",
                "Kaylyn", "Keelin", "Kellen", "Kellene", "Kellyann", "Kellyn",
                "Khalin", "Kilan", "Kilian", "Killen", "Killian", "Killion",
                "Klein", "Kleon", "Kline", "Koerlin", "Kylen", "Kylynn",
                "Quillan", "Quillon", "Qulllon", "Xylon"
            };
            this.CheckEncodingVariations("KLN1111111", names);
        }

        /// <summary>
        /// See http://caversham.otago.ac.nz/files/working/ctp150804.pdf
        /// </summary>
        [Test]
        public void TestCaverphoneRevisitedRandomNameTN11111111()
        {
            String[] names =
            {
                "Dan", "Dane", "Dann", "Darn", "Daune", "Dawn",
                "Ddene", "Dean", "Deane", "Deanne", "DeeAnn", "Deeann",
                "Deeanne", "Deeyn", "Den", "Dene", "Denn", "Deonne",
                "Diahann", "Dian", "Diane", "Diann", "Dianne", "Diannne",
                "Dine", "Dion", "Dione", "Dionne", "Doane", "Doehne",
                "Don", "Donn", "Doone", "Dorn", "Down", "Downe",
                "Duane", "Dun", "Dunn", "Duyne", "Dyan", "Dyane",
                "Dyann", "Dyanne", "Dyun", "Tan", "Tann", "Teahan",
                "Ten", "Tenn", "Terhune", "Thain", "Thaine", "Thane",
                "Thanh", "Thayne", "Theone", "Thin", "Thorn", "Thorne",
                "Thun", "Thynne", "Tien", "Tine", "Tjon", "Town",
                "Towne", "Turne", "Tyne"
            };
            this.CheckEncodingVariations("TN11111111", names);
        }

        /// <summary>
        /// See http://caversham.otago.ac.nz/files/working/ctp150804.pdf
        /// </summary>
        [Test]
        public void TestCaverphoneRevisitedRandomNameTTA1111111()
        {
            String[] names =
            {
                "Darda", "Datha", "Dedie", "Deedee", "Deerdre", "Deidre",
                "Deirdre", "Detta", "Didi", "Didier", "Dido", "Dierdre",
                "Dieter", "Dita", "Ditter", "Dodi", "Dodie", "Dody",
                "Doherty", "Dorthea", "Dorthy", "Doti", "Dotti", "Dottie",
                "Dotty", "Doty", "Doughty", "Douty", "Dowdell", "Duthie",
                "Tada", "Taddeo", "Tadeo", "Tadio", "Tati", "Teador",
                "Tedda", "Tedder", "Teddi", "Teddie", "Teddy", "Tedi",
                "Tedie", "Teeter", "Teodoor", "Teodor", "Terti", "Theda",
                "Theodor", "Theodore", "Theta", "Thilda", "Thordia", "Tilda",
                "Tildi", "Tildie", "Tildy", "Tita", "Tito", "Tjader",
                "Toddie", "Toddy", "Torto", "Tuddor", "Tudor", "Turtle",
                "Tuttle", "Tutto"
            };
            this.CheckEncodingVariations("TTA1111111", names);
        }

        /// <summary>
        /// See http://caversham.otago.ac.nz/files/working/ctp150804.pdf
        /// </summary>
        [Test]
        public void TestCaverphoneRevisitedRandomWords()
        {
            String[] rtaWords = { "rather", "ready", "writer" };
            this.CheckEncodingVariations("RTA1111111", rtaWords);

            this.CheckEncoding("SSA1111111", "social");

            String[] apaWords = { "able", "appear" };
            this.CheckEncodingVariations("APA1111111", apaWords);
        }

        [Test]
        public void TestEndMb()
        {
            String[][] pairs =
            {
                new string[] { "mb", "M111111111" },
                new string[] { "mbmb", "MPM1111111" }
            };
            this.CheckEncodings(pairs);
        }

        // Caverphone Revisited
        [Test]
        public void TestIsCaverphoneEquals()
        {
            Caverphone2 encoder = new Caverphone2();

            bool peterVsStevenson = encoder.IsEncodeEqual("Peter", "Stevenson");
            Assert.False(peterVsStevenson, "Caverphone encodings should not be equal");

            bool peterVsPeady = encoder.IsEncodeEqual("Peter", "Peady");
            Assert.True(peterVsPeady, "Caverphone encodings should be equal");
        }

        [Test]
        public void TestSpecificationExamples()
        {
            String[][] pairs =
            {
                new string[] { "Peter", "PTA1111111" },
                new string[] { "ready", "RTA1111111" },
                new string[] { "social", "SSA1111111" },
                new string[] { "able", "APA1111111" },
                new string[] { "Tedder", "TTA1111111" },
                new string[] { "Karleen", "KLN1111111" },
                new string[] { "Dyun", "TN11111111" }
            };
            this.CheckEncodings(pairs);
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/ColognePhoneticTest.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/ColognePhoneticTest.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/ColognePhoneticTest.cs new file mode 100644 index 
0000000..46b14ff --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/ColognePhoneticTest.cs @@ -0,0 +1,171 @@ +using NUnit.Framework; +using System; + +namespace Lucene.Net.Analysis.Phonetic.Language +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */

    /// <summary>
    /// Tests the <see cref="ColognePhonetic"/> class.
    /// </summary>
    public class ColognePhoneticTest : StringEncoderAbstractTest<ColognePhonetic>
    {
        protected override ColognePhonetic CreateStringEncoder()
        {
            return new ColognePhonetic();
        }

        [Test]
        public void TestAabjoe()
        {
            this.CheckEncoding("01", "Aabjoe");
        }

        [Test]
        public void TestAaclan()
        {
            this.CheckEncoding("0856", "Aaclan");
        }

        /// <summary>
        /// Tests [CODEC-122]
        /// </summary>
        [Test]
        public void TestAychlmajrForCodec122()
        {
            this.CheckEncoding("04567", "Aychlmajr");
        }

        [Test]
        public void TestEdgeCases()
        {
            // Each pair is { input, expected code }.
            String[][] pairs =
            {
                new string[] { "a", "0" },
                new string[] { "e", "0" },
                new string[] { "i", "0" },
                new string[] { "o", "0" },
                new string[] { "u", "0" },
                new string[] { "\u00E4", "0" }, // a-umlaut
                new string[] { "\u00F6", "0" }, // o-umlaut
                new string[] { "\u00FC", "0" }, // u-umlaut
                new string[] { "aa", "0" },
                new string[] { "ha", "0" },
                new string[] { "h", "" },
                new string[] { "aha", "0" },
                new string[] { "b", "1" },
                new string[] { "p", "1" },
                new string[] { "ph", "3" },
                new string[] { "f", "3" },
                new string[] { "v", "3" },
                new string[] { "w", "3" },
                new string[] { "g", "4" },
                new string[] { "k", "4" },
                new string[] { "q", "4" },
                new string[] { "x", "48" },
                new string[] { "ax", "048" },
                new string[] { "cx", "48" },
                new string[] { "l", "5" },
                new string[] { "cl", "45" },
                new string[] { "acl", "085" },
                new string[] { "mn", "6" },
                new string[] { "r", "7" }
            };
            this.CheckEncodings(pairs);
        }

        [Test]
        public void TestExamples()
        {
            // Each pair is { input, expected code }.
            String[][] pairs =
            {
                new string[] { "m\u00DCller", "657" }, // mÜller - why upper case U-umlaut?
                new string[] { "schmidt", "862" },
                new string[] { "schneider", "8627" },
                new string[] { "fischer", "387" },
                new string[] { "weber", "317" },
                new string[] { "wagner", "3467" },
                new string[] { "becker", "147" },
                new string[] { "hoffmann", "0366" },
                new string[] { "sch\u00C4fer", "837" }, // schÄfer - why upper case A-umlaut ?
                new string[] { "Breschnew", "17863" },
                new string[] { "Wikipedia", "3412" },
                new string[] { "peter", "127" },
                new string[] { "pharma", "376" },
                new string[] { "m\u00f6nchengladbach", "664645214" }, // mönchengladbach
                new string[] { "deutsch", "28" },
                new string[] { "deutz", "28" },
                new string[] { "hamburg", "06174" },
                new string[] { "hannover", "0637" },
                new string[] { "christstollen", "478256" },
                new string[] { "Xanthippe", "48621" },
                new string[] { "Zacharias", "8478" },
                new string[] { "Holzbau", "0581" },
                new string[] { "matsch", "68" },
                new string[] { "matz", "68" },
                new string[] { "Arbeitsamt", "071862" },
                new string[] { "Eberhard", "01772" },
                new string[] { "Eberhardt", "01772" },
                new string[] { "heithabu", "021" }
            };
            this.CheckEncodings(pairs);
        }

        [Test]
        public void TestHyphen()
        {
            String[][] pairs =
            {
                new string[] { "bergisch-gladbach", "174845214" },
                new string[] { "M\u00fcller-L\u00fcdenscheidt", "65752682" } // Müller-Lüdenscheidt
            };
            this.CheckEncodings(pairs);
        }

        [Test]
        public void TestIsEncodeEquals()
        {
            String[][] pairs =
            {
                new string[] { "Meyer", "M\u00fcller" }, // Müller
                new string[] { "Meyer", "Mayr" },
                new string[] { "house", "house" },
                new string[] { "House", "house" },
                new string[] { "Haus", "house" },
                new string[] { "ganz", "Gans" },
                new string[] { "ganz", "G\u00e4nse" }, // Gänse
                new string[] { "Miyagi", "Miyako" }
            };

            // NOTE(review): the result of IsEncodeEqual is discarded, so this test only
            // shows that the calls complete without throwing - confirm whether an
            // assertion on the result was intended.
            for (int i = 0; i < pairs.Length; i++)
            {
                String[] pair = pairs[i];
                this.StringEncoder.IsEncodeEqual(pair[1], pair[0]);
            }
        }

        [Test]
        public void TestVariationsMella()
        {
            String[] spellings = { "mella", "milah", "moulla", "mellah", "muehle", "mule" };
            this.CheckEncodingVariations("65", spellings);
        }

        [Test]
        public void TestVariationsMeyer()
        {
            String[] spellings = { "Meier", "Maier", "Mair", "Meyer", "Meyr", "Mejer", "Major" };
            this.CheckEncodingVariations("67", spellings);
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/DaitchMokotoffSoundexTest.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/DaitchMokotoffSoundexTest.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/DaitchMokotoffSoundexTest.cs new file mode 100644 index 0000000..84bb5d3 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/DaitchMokotoffSoundexTest.cs @@ -0,0 +1,176 @@ +// commons-codec version compatibility level: 1.10 +using NUnit.Framework; +using System; + +namespace Lucene.Net.Analysis.Phonetic.Language +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Tests <see cref="DaitchMokotoffSoundex"/>. 
+    /// <para/>
+    /// since 1.10
+    /// </summary>
+    public class DaitchMokotoffSoundexTest : StringEncoderAbstractTest<DaitchMokotoffSoundex>
+    {
+        /// <summary>
+        /// Creates the <see cref="DaitchMokotoffSoundex"/> encoder exercised by these tests.
+        /// </summary>
+        /// <returns>A new <see cref="DaitchMokotoffSoundex"/> instance.</returns>
+        protected override DaitchMokotoffSoundex CreateStringEncoder()
+        {
+            return new DaitchMokotoffSoundex();
+        }
+
+        /// <summary>
+        /// Shorthand that forwards to <c>StringEncoder.GetSoundex</c>.
+        /// </summary>
+        /// <param name="source">The text to encode.</param>
+        /// <returns>Whatever the encoder returns; in these tests, one or more '|'-separated codes.</returns>
+        private string GetSoundex(string source)
+        {
+            return StringEncoder.GetSoundex(source);
+        }
+
+        /// <summary>
+        /// Shorthand that forwards to <c>StringEncoder.Encode</c>.
+        /// </summary>
+        /// <param name="source">The text to encode.</param>
+        /// <returns>Whatever the encoder returns; in these tests, always a single code.</returns>
+        private string Encode(string source)
+        {
+            return StringEncoder.Encode(source);
+        }
+
+        /// <summary>
+        /// Accented letters are expected to produce the same code as their plain counterparts.
+        /// </summary>
+        [Test]
+        public void TestAccentedCharacterFolding()
+        {
+            // The non-ASCII letters are written as \u escapes so the test does not depend on the
+            // encoding the source file is stored or transmitted in.
+            Assert.AreEqual("294795", GetSoundex("Stra\u00DFburg")); // U+00DF, sharp s
+            Assert.AreEqual("294795", GetSoundex("Strasburg"));
+
+            // NOTE(review): the first letter was reconstructed from a mis-encoded literal as
+            // U+00C9 (E-acute) - confirm against the upstream test data.
+            Assert.AreEqual("095600", GetSoundex("\u00C9regon"));
+            Assert.AreEqual("095600", GetSoundex("Eregon"));
+        }
+
+        [Test]
+        public void TestAdjacentCodes()
+        {
+            // AKSSOL
+            // A-KS-S-O-L
+            // 0-54-4---8 -> wrong
+            // 0-54-----8 -> correct
+            Assert.AreEqual("054800", GetSoundex("AKSSOL"));
+
+            // GERSCHFELD
+            // G-E-RS-CH-F-E-L-D
+            // 5--4/94-5/4-7-8-3 -> wrong
+            // 5--4/94-5/--7-8-3 -> correct
+            Assert.AreEqual("547830|545783|594783|594578", GetSoundex("GERSCHFELD"));
+        }
+
+        [Test]
+        public void TestEncodeBasic()
+        {
+            // Same names as the second half of TestSoundexBasic, but without branching: Encode is
+            // expected to yield a single code even where GetSoundex yields '|'-separated alternatives.
+            Assert.AreEqual("097400", Encode("AUERBACH"));
+            Assert.AreEqual("097400", Encode("OHRBACH"));
+            Assert.AreEqual("874400", Encode("LIPSHITZ"));
+            Assert.AreEqual("874400", Encode("LIPPSZYC"));
+            Assert.AreEqual("876450", Encode("LEWINSKY"));
+            Assert.AreEqual("876450", Encode("LEVINSKI"));
+            Assert.AreEqual("486740", Encode("SZLAMAWICZ"));
+            Assert.AreEqual("486740", Encode("SHLAMOVITZ"));
+        }
+
+        /// <summary>
+        /// An apostrophe at any position must not change the expected code.
+        /// </summary>
+        [Test]
+        public void TestEncodeIgnoreApostrophes()
+        {
+            this.CheckEncodingVariations("079600", new string[]
+            {
+                "OBrien", "'OBrien", "O'Brien", "OB'rien", "OBr'ien",
+                "OBri'en", "OBrie'n", "OBrien'"
+            });
+        }
+
+        /// <summary>
+        /// Test data from http://www.myatt.demon.co.uk/sxalg.htm
+        /// </summary>
+        [Test]
+        public void TestEncodeIgnoreHyphens()
+        {
+            this.CheckEncodingVariations("565463", new string[]
+            {
+                "KINGSMITH", "-KINGSMITH", "K-INGSMITH", "KI-NGSMITH",
+                "KIN-GSMITH", "KING-SMITH", "KINGS-MITH", "KINGSM-ITH", "KINGSMI-TH", "KINGSMIT-H", "KINGSMITH-"
+            });
+        }
+
+        /// <summary>
+        /// Leading and trailing whitespace must not change the expected code.
+        /// </summary>
+        [Test]
+        public void TestEncodeIgnoreTrimmable()
+        {
+            Assert.AreEqual("746536", Encode(" \t\n\r Washington \t\n\r "));
+            Assert.AreEqual("746536", Encode("Washington"));
+        }
+
+        /// <summary>
+        /// Examples from http://www.jewishgen.org/infofiles/soundex.html
+        /// </summary>
+        [Test]
+        public void TestSoundexBasic()
+        {
+            Assert.AreEqual("583600", GetSoundex("GOLDEN"));
+            Assert.AreEqual("087930", GetSoundex("Alpert"));
+            Assert.AreEqual("791900", GetSoundex("Breuer"));
+            Assert.AreEqual("579000", GetSoundex("Haber"));
+            Assert.AreEqual("665600", GetSoundex("Mannheim"));
+            Assert.AreEqual("664000", GetSoundex("Mintz"));
+            Assert.AreEqual("370000", GetSoundex("Topf"));
+            Assert.AreEqual("586660", GetSoundex("Kleinmann"));
+            Assert.AreEqual("769600", GetSoundex("Ben Aron"));
+
+            Assert.AreEqual("097400|097500", GetSoundex("AUERBACH"));
+            Assert.AreEqual("097400|097500", GetSoundex("OHRBACH"));
+            Assert.AreEqual("874400", GetSoundex("LIPSHITZ"));
+            Assert.AreEqual("874400|874500", GetSoundex("LIPPSZYC"));
+            Assert.AreEqual("876450", GetSoundex("LEWINSKY"));
+            Assert.AreEqual("876450", GetSoundex("LEVINSKI"));
+            Assert.AreEqual("486740", GetSoundex("SZLAMAWICZ"));
+            Assert.AreEqual("486740", GetSoundex("SHLAMOVITZ"));
+        }
+
+        /// <summary>
+        /// Examples from http://www.avotaynu.com/soundex.htm
+        /// </summary>
+        [Test]
+        public void TestSoundexBasic2()
+        {
+            Assert.AreEqual("467000|567000", GetSoundex("Ceniow"));
+            Assert.AreEqual("467000", GetSoundex("Tsenyuv"));
+            Assert.AreEqual("587400|587500", GetSoundex("Holubica"));
+            Assert.AreEqual("587400", GetSoundex("Golubitsa"));
+            Assert.AreEqual("746480|794648", GetSoundex("Przemysl"));
+            Assert.AreEqual("746480", GetSoundex("Pshemeshil"));
+            Assert.AreEqual("944744|944745|944754|944755|945744|945745|945754|945755", GetSoundex("Rosochowaciec"));
+            Assert.AreEqual("945744", GetSoundex("Rosokhovatsets"));
+        }
+
+        /// <summary>
+        /// Examples from http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex
+        /// </summary>
+        [Test]
+        public void TestSoundexBasic3()
+        {
+            Assert.AreEqual("734000|739400", GetSoundex("Peters"));
+            Assert.AreEqual("734600|739460", GetSoundex("Peterson"));
+            Assert.AreEqual("645740", GetSoundex("Moskowitz"));
+            Assert.AreEqual("645740", GetSoundex("Moskovitz"));
+            Assert.AreEqual("154600|145460|454600|445460", GetSoundex("Jackson"));
+            Assert.AreEqual("154654|154645|154644|145465|145464|454654|454645|454644|445465|445464",
+                    GetSoundex("Jackson-Jackson"));
+        }
+
+        /// <summary>
+        /// Both Romanian "t with mark below" letters are expected to produce the same pair of codes.
+        /// </summary>
+        [Test]
+        public void TestSpecialRomanianCharacters()
+        {
+            // Written as \u escapes so the two visually similar letters stay distinct regardless of
+            // the encoding the source file is stored or transmitted in.
+            Assert.AreEqual("364000|464000", GetSoundex("\u0163amas")); // t-cedilla (U+0163)
+            Assert.AreEqual("364000|464000", GetSoundex("\u021Bamas")); // t-comma (U+021B)
+        }
+    }
+}
