http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/MetaphoneTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/MetaphoneTest.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/MetaphoneTest.cs
new file mode 100644
index 0000000..18a9e59
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/MetaphoneTest.cs
@@ -0,0 +1,518 @@
using NUnit.Framework;
using System;

namespace Lucene.Net.Analysis.Phonetic.Language
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Tests <see cref="Metaphone"/>: equality of codes between names and the
    /// exact codes produced for selected words.
    /// </summary>
    public class MetaphoneTest : StringEncoderAbstractTest<Metaphone>
    {
        /// <summary>
        /// Asserts that <paramref name="source"/> is Metaphone-equal to every entry of
        /// <paramref name="matches"/>, and that every entry is Metaphone-equal to every
        /// other entry (including itself).
        /// </summary>
        /// <param name="source">The reference word.</param>
        /// <param name="matches">Words expected to share the reference word's code.</param>
        public void AssertIsMetaphoneEqual(string source, string[] matches)
        {
            // match source to all matches
            foreach (string match in matches)
            {
                Assert.True(this.StringEncoder.IsMetaphoneEqual(source, match),
                    "Source: " + source + ", should have same Metaphone as: " + match);
            }
            // match to each other
            foreach (string match in matches)
            {
                foreach (string match2 in matches)
                {
                    // Name both words so a failure identifies the offending pair.
                    Assert.True(this.StringEncoder.IsMetaphoneEqual(match, match2),
                        "Expected same Metaphone for: " + match + " and: " + match2);
                }
            }
        }

        /// <summary>
        /// Validates the fixture and then asserts, in both argument orders, that the
        /// two words of every pair are Metaphone-equal.
        /// </summary>
        /// <param name="pairs">Two-element arrays of words expected to match.</param>
        public void AssertMetaphoneEqual(String[][] pairs)
        {
            this.ValidateFixture(pairs);
            foreach (String[] pair in pairs)
            {
                String name0 = pair[0];
                String name1 = pair[1];
                String failMsg = "Expected match between " + name0 + " and " + name1;
                Assert.True(this.StringEncoder.IsMetaphoneEqual(name0, name1), failMsg);
                Assert.True(this.StringEncoder.IsMetaphoneEqual(name1, name0), failMsg);
            }
        }

        /// <summary>
        /// Creates the encoder under test using the default constructor.
        /// </summary>
        protected override Metaphone CreateStringEncoder()
        {
            return new Metaphone();
        }

        [Test]
        public void TestIsMetaphoneEqual1()
        {
            this.AssertMetaphoneEqual(new String[][] {
                new string[] { "Case", "case" },
                new string[] { "CASE", "Case" },
                new string[] { "caSe", "cAsE" },
                new string[] { "quick", "cookie" }
            });
        }

        /// <summary>
        /// Matches computed from http://www.lanw.com/java/phonetic/default.htm
        /// </summary>
        [Test]
        public void TestIsMetaphoneEqual2()
        {
            this.AssertMetaphoneEqual(new String[][] {
                new string[] { "Lawrence", "Lorenza" },
                new string[] { "Gary", "Cahra" },
            });
        }

        /// <summary>
        /// Initial AE case.
        /// <para/>
        /// Match data computed from http://www.lanw.com/java/phonetic/default.htm
        /// </summary>
        [Test]
        public void TestIsMetaphoneEqualAero()
        {
            this.AssertIsMetaphoneEqual("Aero", new String[] { "Eure" });
        }

        /// <summary>
        /// Initial WH case.
        /// <para/>
        /// Match data computed from http://www.lanw.com/java/phonetic/default.htm
        /// </summary>
        [Test]
        public void TestIsMetaphoneEqualWhite()
        {
            this.AssertIsMetaphoneEqual(
                "White",
                new String[] { "Wade", "Wait", "Waite", "Wat", "Whit", "Wiatt", "Wit", "Wittie", "Witty", "Wood", "Woodie", "Woody" });
        }

        /// <summary>
        /// Initial A, not followed by an E case.
        /// <para/>
        /// Match data computed from http://www.lanw.com/java/phonetic/default.htm
        /// </summary>
        [Test]
        public void TestIsMetaphoneEqualAlbert()
        {
            this.AssertIsMetaphoneEqual("Albert", new String[] { "Ailbert", "Alberik", "Albert", "Alberto", "Albrecht" });
        }

        /// <summary>
        /// Match data computed from http://www.lanw.com/java/phonetic/default.htm
        /// </summary>
        [Test]
        public void TestIsMetaphoneEqualGary()
        {
            this.AssertIsMetaphoneEqual(
                "Gary",
                new String[] {
                    "Cahra", "Cara", "Carey", "Cari", "Caria", "Carie", "Caro", "Carree",
                    "Carri", "Carrie", "Carry", "Cary", "Cora", "Corey", "Cori", "Corie",
                    "Correy", "Corri", "Corrie", "Corry", "Cory", "Gray", "Kara", "Kare",
                    "Karee", "Kari", "Karia", "Karie", "Karrah", "Karrie", "Karry", "Kary",
                    "Keri", "Kerri", "Kerrie", "Kerry", "Kira", "Kiri", "Kora", "Kore",
                    "Kori", "Korie", "Korrie", "Korry" });
        }

        /// <summary>
        /// Match data computed from http://www.lanw.com/java/phonetic/default.htm
        /// </summary>
        [Test]
        public void TestIsMetaphoneEqualJohn()
        {
            this.AssertIsMetaphoneEqual(
                "John",
                new String[] {
                    "Gena", "Gene", "Genia", "Genna", "Genni", "Gennie", "Genny", "Giana",
                    "Gianna", "Gina", "Ginni", "Ginnie", "Ginny", "Jaine", "Jan", "Jana",
                    "Jane", "Janey", "Jania", "Janie", "Janna", "Jany", "Jayne", "Jean",
                    "Jeana", "Jeane", "Jeanie", "Jeanna", "Jeanne", "Jeannie", "Jen", "Jena",
                    "Jeni", "Jenn", "Jenna", "Jennee", "Jenni", "Jennie", "Jenny", "Jinny",
                    "Jo Ann", "Jo-Ann", "Jo-Anne", "Joan", "Joana", "Joane", "Joanie", "Joann",
                    "Joanna", "Joanne", "Joeann", "Johna", "Johnna", "Joni", "Jonie", "Juana",
                    "June", "Junia", "Junie" });
        }

        /// <summary>
        /// Initial KN case.
        /// <para/>
        /// Match data computed from http://www.lanw.com/java/phonetic/default.htm
        /// </summary>
        [Test]
        public void TestIsMetaphoneEqualKnight()
        {
            this.AssertIsMetaphoneEqual(
                "Knight",
                new String[] {
                    "Hynda", "Nada", "Nadia", "Nady", "Nat", "Nata", "Natty", "Neda",
                    "Nedda", "Nedi", "Netta", "Netti", "Nettie", "Netty", "Nita", "Nydia" });
        }

        /// <summary>
        /// Match data computed from http://www.lanw.com/java/phonetic/default.htm
        /// </summary>
        [Test]
        public void TestIsMetaphoneEqualMary()
        {
            this.AssertIsMetaphoneEqual(
                "Mary",
                new String[] {
                    "Mair", "Maire", "Mara", "Mareah", "Mari", "Maria", "Marie", "Mary",
                    "Maura", "Maure", "Meara", "Merrie", "Merry", "Mira", "Moira", "Mora",
                    "Moria", "Moyra", "Muire", "Myra", "Myrah" });
        }

        /// <summary>
        /// Match data computed from http://www.lanw.com/java/phonetic/default.htm
        /// </summary>
        [Test]
        public void TestIsMetaphoneEqualParis()
        {
            this.AssertIsMetaphoneEqual("Paris", new String[] { "Pearcy", "Perris", "Piercy", "Pierz", "Pryse" });
        }

        /// <summary>
        /// Match data computed from http://www.lanw.com/java/phonetic/default.htm
        /// </summary>
        [Test]
        public void TestIsMetaphoneEqualPeter()
        {
            this.AssertIsMetaphoneEqual(
                "Peter",
                new String[] { "Peadar", "Peder", "Pedro", "Peter", "Petr", "Peyter", "Pieter", "Pietro", "Piotr" });
        }

        /// <summary>
        /// Match data computed from http://www.lanw.com/java/phonetic/default.htm
        /// </summary>
        [Test]
        public void TestIsMetaphoneEqualRay()
        {
            this.AssertIsMetaphoneEqual("Ray", new String[] { "Ray", "Rey", "Roi", "Roy", "Ruy" });
        }

        /// <summary>
        /// Match data computed from http://www.lanw.com/java/phonetic/default.htm
        /// </summary>
        [Test]
        public void TestIsMetaphoneEqualSusan()
        {
            this.AssertIsMetaphoneEqual(
                "Susan",
                new String[] {
                    "Siusan", "Sosanna", "Susan", "Susana", "Susann", "Susanna",
                    "Susannah", "Susanne", "Suzann", "Suzanna", "Suzanne", "Zuzana" });
        }

        /// <summary>
        /// Initial WR case.
        /// <para/>
        /// Match data computed from http://www.lanw.com/java/phonetic/default.htm
        /// </summary>
        [Test]
        public void TestIsMetaphoneEqualWright()
        {
            this.AssertIsMetaphoneEqual("Wright", new String[] { "Rota", "Rudd", "Ryde" });
        }

        /// <summary>
        /// Match data computed from http://www.lanw.com/java/phonetic/default.htm
        /// </summary>
        [Test]
        public void TestIsMetaphoneEqualXalan()
        {
            this.AssertIsMetaphoneEqual(
                "Xalan",
                new String[] { "Celene", "Celina", "Celine", "Selena", "Selene", "Selina", "Seline", "Suellen", "Xylina" });
        }

        [Test]
        public void TestMetaphone()
        {
            Assert.AreEqual("HL", this.StringEncoder.GetMetaphone("howl"));
            Assert.AreEqual("TSTN", this.StringEncoder.GetMetaphone("testing"));
            Assert.AreEqual("0", this.StringEncoder.GetMetaphone("The"));
            Assert.AreEqual("KK", this.StringEncoder.GetMetaphone("quick"));
            Assert.AreEqual("BRN", this.StringEncoder.GetMetaphone("brown"));
            Assert.AreEqual("FKS", this.StringEncoder.GetMetaphone("fox"));
            Assert.AreEqual("JMPT", this.StringEncoder.GetMetaphone("jumped"));
            Assert.AreEqual("OFR", this.StringEncoder.GetMetaphone("over"));
            Assert.AreEqual("0", this.StringEncoder.GetMetaphone("the"));
            Assert.AreEqual("LS", this.StringEncoder.GetMetaphone("lazy"));
            Assert.AreEqual("TKS", this.StringEncoder.GetMetaphone("dogs"));
        }

        [Test]
        public void TestWordEndingInMB()
        {
            Assert.AreEqual("KM", this.StringEncoder.GetMetaphone("COMB"));
            Assert.AreEqual("TM", this.StringEncoder.GetMetaphone("TOMB"));
            Assert.AreEqual("WM", this.StringEncoder.GetMetaphone("WOMB"));
        }

        [Test]
        public void TestDiscardOfSCEOrSCIOrSCY()
        {
            Assert.AreEqual("SNS", this.StringEncoder.GetMetaphone("SCIENCE"));
            Assert.AreEqual("SN", this.StringEncoder.GetMetaphone("SCENE"));
            Assert.AreEqual("S", this.StringEncoder.GetMetaphone("SCY"));
        }

        /// <summary>
        /// Tests (CODEC-57) Metaphone.metaphone(String) returns an empty string when passed the word "why"
        /// </summary>
        [Test]
        public void TestWhy()
        {
            // PHP returns "H". The original metaphone returns an empty string.
            Assert.AreEqual("", this.StringEncoder.GetMetaphone("WHY"));
        }

        [Test]
        public void TestWordsWithCIA()
        {
            Assert.AreEqual("XP", this.StringEncoder.GetMetaphone("CIAPO"));
        }

        [Test]
        public void TestTranslateOfSCHAndCH()
        {
            Assert.AreEqual("SKTL", this.StringEncoder.GetMetaphone("SCHEDULE"));
            Assert.AreEqual("SKMT", this.StringEncoder.GetMetaphone("SCHEMATIC"));

            Assert.AreEqual("KRKT", this.StringEncoder.GetMetaphone("CHARACTER"));
            Assert.AreEqual("TX", this.StringEncoder.GetMetaphone("TEACH"));
        }

        [Test]
        public void TestTranslateToJOfDGEOrDGIOrDGY()
        {
            Assert.AreEqual("TJ", this.StringEncoder.GetMetaphone("DODGY"));
            Assert.AreEqual("TJ", this.StringEncoder.GetMetaphone("DODGE"));
            Assert.AreEqual("AJMT", this.StringEncoder.GetMetaphone("ADGIEMTI"));
        }

        [Test]
        public void TestDiscardOfSilentHAfterG()
        {
            Assert.AreEqual("KNT", this.StringEncoder.GetMetaphone("GHENT"));
            Assert.AreEqual("B", this.StringEncoder.GetMetaphone("BAUGH"));
        }

        [Test]
        public void TestDiscardOfSilentGN()
        {
            // NOTE: This does not test for silent GN, but for starting with GN
            Assert.AreEqual("N", this.StringEncoder.GetMetaphone("GNU"));

            // NOTE: Trying to test for GNED, but expected code does not appear to execute
            Assert.AreEqual("SNT", this.StringEncoder.GetMetaphone("SIGNED"));
        }

        [Test]
        public void TestPHTOF()
        {
            Assert.AreEqual("FX", this.StringEncoder.GetMetaphone("PHISH"));
        }

        [Test]
        public void TestSHAndSIOAndSIAToX()
        {
            Assert.AreEqual("XT", this.StringEncoder.GetMetaphone("SHOT"));
            Assert.AreEqual("OTXN", this.StringEncoder.GetMetaphone("ODSIAN"));
            Assert.AreEqual("PLXN", this.StringEncoder.GetMetaphone("PULSION"));
        }

        [Test]
        public void TestTIOAndTIAToX()
        {
            Assert.AreEqual("OX", this.StringEncoder.GetMetaphone("OTIA"));
            Assert.AreEqual("PRXN", this.StringEncoder.GetMetaphone("PORTION"));
        }

        [Test]
        public void TestTCH()
        {
            Assert.AreEqual("RX", this.StringEncoder.GetMetaphone("RETCH"));
            Assert.AreEqual("WX", this.StringEncoder.GetMetaphone("WATCH"));
        }

        [Test]
        public void TestExceedLength()
        {
            // should be AKSKS, but is truncated by Max Code Length
            Assert.AreEqual("AKSK", this.StringEncoder.GetMetaphone("AXEAXE"));
        }

        /// <summary>
        /// Checks that a code is cut off at a caller-supplied maximum code length.
        /// </summary>
        [Test]
        public void TestSetMaxLengthWithTruncation()
        {
            // Use a dedicated encoder rather than this.StringEncoder: NUnit reuses one
            // fixture instance for all tests by default, so if the base class keeps the
            // encoder in a field, a raised MaxCodeLen would leak into tests that rely on
            // the default length (e.g. TestExceedLength, TestTranslateOfSCHAndCH).
            Metaphone encoder = this.CreateStringEncoder();
            encoder.MaxCodeLen = 6;

            // the code is truncated to the six characters allowed by Max Code Length
            Assert.AreEqual("AKSKSK", encoder.GetMetaphone("AXEAXEAXE"));
        }

        /// <summary>
        /// Fails the current test when <paramref name="pairs"/> is empty or contains an
        /// entry that does not have exactly two elements.
        /// </summary>
        /// <param name="pairs">The fixture data to check.</param>
        public void ValidateFixture(String[][] pairs)
        {
            if (pairs.Length == 0)
            {
                Assert.Fail("Test fixture is empty");
            }
            for (int i = 0; i < pairs.Length; i++)
            {
                if (pairs[i].Length != 2)
                {
                    Assert.Fail("Error in test fixture in the data array at index " + i);
                }
            }
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/NysiisTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/NysiisTest.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/NysiisTest.cs
new file mode 100644
index 0000000..d1c04d1
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/NysiisTest.cs
@@ -0,0 +1,319 @@
using NUnit.Framework;
using System;

namespace Lucene.Net.Analysis.Phonetic.Language
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Tests <see cref="Nysiis"/> against name lists and against the individual
    /// transcoding rules.
    /// </summary>
    public class NysiisTest : StringEncoderAbstractTest<Nysiis>
    {
        // Encoder built with the constructor argument 'false'; used by AssertEncodings.
        // NOTE(review): presumably the non-truncating variant (TestTrueVariant expects
        // the 'true' variant to yield at most 6 characters) - confirm against Nysiis.
        private readonly Nysiis fullNysiis = new Nysiis(false);

        /// <summary>
        /// Encodes the first element of every pair with <c>fullNysiis</c> and asserts
        /// that the result equals the pair's second element.
        /// </summary>
        /// <param name="testValues">Pairs of { input, expected encoding }.</param>
        private void AssertEncodings(params String[][] testValues)
        {
            for (int i = 0; i < testValues.Length; i++)
            {
                string[] pair = testValues[i];
                Assert.AreEqual(pair[1], fullNysiis.Encode(pair[0]), "Problem with " + pair[0]);
            }
        }

        /// <summary>
        /// Creates the encoder under test using the default constructor.
        /// </summary>
        protected override Nysiis CreateStringEncoder()
        {
            return new Nysiis();
        }

        /// <summary>
        /// Asserts that the fixture's encoder maps every given string to
        /// <paramref name="expectedEncoding"/>.
        /// </summary>
        /// <param name="strings">Inputs to encode.</param>
        /// <param name="expectedEncoding">Encoding expected for each input.</param>
        private void EncodeAll(String[] strings, String expectedEncoding)
        {
            for (int i = 0; i < strings.Length; i++)
            {
                string input = strings[i];
                Assert.AreEqual(expectedEncoding, StringEncoder.Encode(input), "Problem with " + input);
            }
        }

        [Test]
        public void TestBran()
        {
            EncodeAll(new[] { "Brian", "Brown", "Brun" }, "BRAN");
        }

        [Test]
        public void TestCap()
        {
            EncodeAll(new[] { "Capp", "Cope", "Copp", "Kipp" }, "CAP");
        }

        [Test]
        public void TestDad()
        {
            // "Data Quality and Record Linkage Techniques", p. 121, lists DAN here,
            // but DAD is the correct code (also confirmed with dropby.com).
            EncodeAll(new[] { "Dent" }, "DAD");
        }

        [Test]
        public void TestDan()
        {
            EncodeAll(new[] { "Dane", "Dean", "Dionne" }, "DAN");
        }

        /// <summary>
        /// Tests data gathered from around the internet.
        /// <para/>
        /// See http://www.dropby.com/NYSIISTextStrings.html
        /// </summary>
        [Test]
        public void TestDropBy()
        {
            // Where this implementation differs from the one at dropby.com, the reason is
            // given in the comment preceding the pair. Rule numbers refer to the steps
            // outlined in the class description of Nysiis.

            AssertEncodings(
                // 1. Transcode first characters of name
                new[] { "MACINTOSH", "MCANT" },
                // violates 4j: the second N should not be added, because the first
                // key character is already an N
                new[] { "KNUTH", "NAT" }, // Original: NNAT; modified: NATH
                // O and E become A through rule 4a, and H becomes A through rule 4h;
                // the N is mysteriously lost, perhaps due to a wrongly implemented
                // rule 4h that skips the following character in this situation?
                // The remaining A is dropped by rule 7.
                new[] { "KOEHN", "CAN" }, // Original: C
                // violates 4j: see also KNUTH
                new[] { "PHILLIPSON", "FALAPSAN" }, // Original: FFALAP[SAN]
                // violates 4j: see also KNUTH
                new[] { "PFEISTER", "FASTAR" }, // Original: FFASTA[R]
                // violates 4j: see also KNUTH
                new[] { "SCHOENHOEFT", "SANAFT" }, // Original: SSANAF[T]
                // 2. Transcode last characters of name:
                new[] { "MCKEE", "MCY" },
                new[] { "MACKIE", "MCY" },
                new[] { "HEITSCHMIDT", "HATSNAD" },
                new[] { "BART", "BAD" },
                new[] { "HURD", "HAD" },
                new[] { "HUNT", "HAD" },
                new[] { "WESTERLUND", "WASTARLAD" },
                // 4. Transcode remaining characters by following these rules,
                // incrementing by one character each time:
                new[] { "CASSTEVENS", "CASTAFAN" },
                new[] { "VASQUEZ", "VASG" },
                new[] { "FRAZIER", "FRASAR" },
                new[] { "BOWMAN", "BANAN" },
                new[] { "MCKNIGHT", "MCNAGT" },
                new[] { "RICKERT", "RACAD" },
                // violates 5: the last S is not removed.
                // Compared with DEUTS, which is phonetically similar, the result
                // is also DAT, which is arguably correct for DEUTSCH as well.
                new[] { "DEUTSCH", "DAT" }, // Original: DATS
                new[] { "WESTPHAL", "WASTFAL" },
                // violates 4h: the H should be transcoded to S and therefore ignored,
                // because the first key character is also S
                new[] { "SHRIVER", "SRAVAR" }, // Original: SHRAVA[R]
                // same as KOEHN: the L is mysteriously lost
                new[] { "KUHL", "CAL" }, // Original: C
                new[] { "RAWSON", "RASAN" },
                // If last character is S, remove it
                new[] { "JILES", "JAL" },
                // violates 6: if the last two characters are AY, remove A
                new[] { "CARRAWAY", "CARY" }, // Original: CARAY
                new[] { "YAMADA", "YANAD" });
        }

        [Test]
        public void TestFal()
        {
            EncodeAll(new[] { "Phil" }, "FAL");
        }

        /// <summary>
        /// Tests further data gathered from around the internet.
        /// </summary>
        [Test]
        public void TestOthers()
        {
            AssertEncodings(
                new[] { "O'Daniel", "ODANAL" },
                new[] { "O'Donnel", "ODANAL" },
                new[] { "Cory", "CARY" },
                new[] { "Corey", "CARY" },
                new[] { "Kory", "CARY" },
                //
                new[] { "FUZZY", "FASY" });
        }

        /// <summary>
        /// Tests rule 1: Translate first characters of name:
        /// MAC -> MCC, KN -> N, K -> C, PH, PF -> FF, SCH -> SSS
        /// </summary>
        [Test]
        public void TestRule1()
        {
            AssertEncodings(
                new[] { "MACX", "MCX" },
                new[] { "KNX", "NX" },
                new[] { "KX", "CX" },
                new[] { "PHX", "FX" },
                new[] { "PFX", "FX" },
                new[] { "SCHX", "SX" });
        }

        /// <summary>
        /// Tests rule 2: Translate last characters of name:
        /// EE -> Y, IE -> Y, DT, RT, RD, NT, ND -> D
        /// </summary>
        [Test]
        public void TestRule2()
        {
            AssertEncodings(
                new[] { "XEE", "XY" },
                new[] { "XIE", "XY" },
                new[] { "XDT", "XD" },
                new[] { "XRT", "XD" },
                new[] { "XRD", "XD" },
                new[] { "XNT", "XD" },
                new[] { "XND", "XD" });
        }

        /// <summary>
        /// Tests rule 4.1: EV -> AF else A, E, I, O, U -> A
        /// </summary>
        [Test]
        public void TestRule4Dot1()
        {
            AssertEncodings(
                new[] { "XEV", "XAF" },
                new[] { "XAX", "XAX" },
                new[] { "XEX", "XAX" },
                new[] { "XIX", "XAX" },
                new[] { "XOX", "XAX" },
                new[] { "XUX", "XAX" });
        }

        /// <summary>
        /// Tests rule 4.2: Q -> G, Z -> S, M -> N
        /// </summary>
        [Test]
        public void TestRule4Dot2()
        {
            AssertEncodings(
                new[] { "XQ", "XG" },
                new[] { "XZ", "X" },
                new[] { "XM", "XN" });
        }

        /// <summary>
        /// Tests rule 5: If last character is S, remove it.
        /// </summary>
        [Test]
        public void TestRule5()
        {
            AssertEncodings(
                new[] { "XS", "X" },
                new[] { "XSS", "X" });
        }

        /// <summary>
        /// Tests rule 6: If last characters are AY, replace with Y.
        /// </summary>
        [Test]
        public void TestRule6()
        {
            AssertEncodings(
                new[] { "XAY", "XY" },
                new[] { "XAYS", "XY" }); // Rules 5, 6
        }

        /// <summary>
        /// Tests rule 7: If last character is A, remove it.
        /// </summary>
        [Test]
        public void TestRule7()
        {
            AssertEncodings(
                new[] { "XA", "X" },
                new[] { "XAS", "X" }); // Rules 5, 7
        }

        [Test]
        public void TestSnad()
        {
            // "Data Quality and Record Linkage Techniques", p. 121, lists SNAT here,
            // but SNAD is the correct code.
            EncodeAll(new[] { "Schmidt" }, "SNAD");
        }

        [Test]
        public void TestSnat()
        {
            EncodeAll(new[] { "Smith", "Schmit" }, "SNAT");
        }

        [Test]
        public void TestSpecialBranches()
        {
            EncodeAll(new[] { "Kobwick" }, "CABWAC");
            EncodeAll(new[] { "Kocher" }, "CACAR");
            EncodeAll(new[] { "Fesca" }, "FASC");
            EncodeAll(new[] { "Shom" }, "SAN");
            EncodeAll(new[] { "Ohlo" }, "OL");
            EncodeAll(new[] { "Uhu" }, "UH");
            EncodeAll(new[] { "Um" }, "UN");
        }

        [Test]
        public void TestTranan()
        {
            EncodeAll(new[] { "Trueman", "Truman" }, "TRANAN");
        }

        /// <summary>
        /// Checks the encoder built with the constructor argument 'true': the code for
        /// WESTERLUND must be at most six characters long and equal to WASTAR.
        /// </summary>
        [Test]
        public void TestTrueVariant()
        {
            var trueVariant = new Nysiis(true);

            string code = trueVariant.Encode("WESTERLUND");
            Assert.True(code.Length <= 6);
            Assert.AreEqual("WASTAR", code);
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/RefinedSoundexTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/RefinedSoundexTest.cs
b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/RefinedSoundexTest.cs
new file mode 100644
index 0000000..eca1827
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/RefinedSoundexTest.cs
@@ -0,0 +1,99 @@
using NUnit.Framework;

namespace Lucene.Net.Analysis.Phonetic.Language
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Tests RefinedSoundex.
    /// </summary>
    public class RefinedSoundexTest : StringEncoderAbstractTest<RefinedSoundex>
    {
        /// <summary>
        /// Creates the encoder under test using the default constructor.
        /// </summary>
        protected override RefinedSoundex CreateStringEncoder()
        {
            return new RefinedSoundex();
        }

        /// <summary>
        /// Pins the values returned by <c>Difference</c> for null/empty/blank input
        /// and for a set of name pairs.
        /// </summary>
        [Test]
        public void TestDifference()
        {
            // Edge cases
            Assert.AreEqual(0, this.StringEncoder.Difference(null, null));
            Assert.AreEqual(0, this.StringEncoder.Difference("", ""));
            Assert.AreEqual(0, this.StringEncoder.Difference(" ", " "));
            // Normal cases
            Assert.AreEqual(6, this.StringEncoder.Difference("Smith", "Smythe"));
            Assert.AreEqual(3, this.StringEncoder.Difference("Ann", "Andrew"));
            Assert.AreEqual(1, this.StringEncoder.Difference("Margaret", "Andrew"));
            Assert.AreEqual(1, this.StringEncoder.Difference("Janet", "Margaret"));
            // Examples from
            // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp
            Assert.AreEqual(5, this.StringEncoder.Difference("Green", "Greene"));
            Assert.AreEqual(1, this.StringEncoder.Difference("Blotchet-Halls", "Greene"));
            // Examples from
            // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp
            Assert.AreEqual(6, this.StringEncoder.Difference("Smith", "Smythe"));
            Assert.AreEqual(8, this.StringEncoder.Difference("Smithers", "Smythers"));
            Assert.AreEqual(5, this.StringEncoder.Difference("Anothers", "Brothers"));
        }

        /// <summary>
        /// Pins the codes returned by <c>Encode</c> for a set of words, via both the
        /// fixture's encoder and the static <c>US_ENGLISH</c> instance.
        /// </summary>
        [Test]
        public void TestEncode()
        {
            Assert.AreEqual("T6036084", this.StringEncoder.Encode("testing"));
            Assert.AreEqual("T6036084", this.StringEncoder.Encode("TESTING"));
            Assert.AreEqual("T60", this.StringEncoder.Encode("The"));
            Assert.AreEqual("Q503", this.StringEncoder.Encode("quick"));
            Assert.AreEqual("B1908", this.StringEncoder.Encode("brown"));
            Assert.AreEqual("F205", this.StringEncoder.Encode("fox"));
            Assert.AreEqual("J408106", this.StringEncoder.Encode("jumped"));
            Assert.AreEqual("O0209", this.StringEncoder.Encode("over"));
            Assert.AreEqual("T60", this.StringEncoder.Encode("the"));
            Assert.AreEqual("L7050", this.StringEncoder.Encode("lazy"));
            Assert.AreEqual("D6043", this.StringEncoder.Encode("dogs"));

            // Testing CODEC-56
            Assert.AreEqual("D6043", RefinedSoundex.US_ENGLISH.Encode("dogs"));
        }

        /// <summary>
        /// Checks that the mapping code returned for the non-letter '#' equals zero.
        /// </summary>
        [Test]
        public void TestGetMappingCodeNonLetter()
        {
            char code = this.StringEncoder.GetMappingCode('#');
            Assert.AreEqual(0, code, "Code does not equals zero");
        }

        /// <summary>
        /// Encodes with an instance created by the default constructor.
        /// </summary>
        [Test]
        public void TestNewInstance()
        {
            Assert.AreEqual("D6043", new RefinedSoundex().GetSoundex("dogs"));
        }

        /// <summary>
        /// Encodes with an instance created from the US English mapping as a char array.
        /// </summary>
        [Test]
        public void TestNewInstance2()
        {
            // System.String exposes ToCharArray(); the Java-cased toCharArray() does not exist in .NET.
            Assert.AreEqual("D6043", new RefinedSoundex(RefinedSoundex.US_ENGLISH_MAPPING_STRING.ToCharArray()).GetSoundex("dogs"));
        }

        /// <summary>
        /// Encodes with an instance created from the US English mapping string.
        /// </summary>
        [Test]
        public void TestNewInstance3()
        {
            Assert.AreEqual("D6043", new RefinedSoundex(RefinedSoundex.US_ENGLISH_MAPPING_STRING).GetSoundex("dogs"));
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/SoundexTest.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/SoundexTest.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/SoundexTest.cs
new file mode 100644
index 0000000..5cc01ec
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/SoundexTest.cs
@@ -0,0 +1,424 @@
+// commons-codec version compatibility level: 1.10
+using NUnit.Framework;
+using System;
+
+namespace Lucene.Net.Analysis.Phonetic.Language
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Tests <see cref="Soundex"/> + /// </summary> + public class SoundexTest : StringEncoderAbstractTest<Soundex> + { + protected override Soundex CreateStringEncoder() + { + return new Soundex(); + } + + [Test] + public void TestB650() + { + this.CheckEncodingVariations("B650", new string[]{ + "BARHAM", + "BARONE", + "BARRON", + "BERNA", + "BIRNEY", + "BIRNIE", + "BOOROM", + "BOREN", + "BORN", + "BOURN", + "BOURNE", + "BOWRON", + "BRAIN", + "BRAME", + "BRANN", + "BRAUN", + "BREEN", + "BRIEN", + "BRIM", + "BRIMM", + "BRINN", + "BRION", + "BROOM", + "BROOME", + "BROWN", + "BROWNE", + "BRUEN", + "BRUHN", + "BRUIN", + "BRUMM", + "BRUN", + "BRUNO", + "BRYAN", + "BURIAN", + "BURN", + "BURNEY", + "BYRAM", + "BYRNE", + "BYRON", + "BYRUM"}); + } + + [Test] + public void TestBadCharacters() + { + Assert.AreEqual("H452", this.StringEncoder.Encode("HOL>MES")); + + } + + [Test] + public void TestDifference() + { + // Edge cases + Assert.AreEqual(0, this.StringEncoder.Difference(null, null)); + Assert.AreEqual(0, this.StringEncoder.Difference("", "")); + Assert.AreEqual(0, this.StringEncoder.Difference(" ", " ")); + // Normal cases + Assert.AreEqual(4, this.StringEncoder.Difference("Smith", "Smythe")); + Assert.AreEqual(2, this.StringEncoder.Difference("Ann", "Andrew")); + Assert.AreEqual(1, this.StringEncoder.Difference("Margaret", "Andrew")); + Assert.AreEqual(0, this.StringEncoder.Difference("Janet", "Margaret")); + // Examples from http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp + 
Assert.AreEqual(4, this.StringEncoder.Difference("Green", "Greene")); + Assert.AreEqual(0, this.StringEncoder.Difference("Blotchet-Halls", "Greene")); + // Examples from http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp + Assert.AreEqual(4, this.StringEncoder.Difference("Smith", "Smythe")); + Assert.AreEqual(4, this.StringEncoder.Difference("Smithers", "Smythers")); + Assert.AreEqual(2, this.StringEncoder.Difference("Anothers", "Brothers")); + } + + [Test] + public void TestEncodeBasic() + { + Assert.AreEqual("T235", this.StringEncoder.Encode("testing")); + Assert.AreEqual("T000", this.StringEncoder.Encode("The")); + Assert.AreEqual("Q200", this.StringEncoder.Encode("quick")); + Assert.AreEqual("B650", this.StringEncoder.Encode("brown")); + Assert.AreEqual("F200", this.StringEncoder.Encode("fox")); + Assert.AreEqual("J513", this.StringEncoder.Encode("jumped")); + Assert.AreEqual("O160", this.StringEncoder.Encode("over")); + Assert.AreEqual("T000", this.StringEncoder.Encode("the")); + Assert.AreEqual("L200", this.StringEncoder.Encode("lazy")); + Assert.AreEqual("D200", this.StringEncoder.Encode("dogs")); + } + + /** + * Examples from http://www.bradandkathy.com/genealogy/overviewofsoundex.html + */ + [Test] + public void RestEncodeBatch2() + { + Assert.AreEqual("A462", this.StringEncoder.Encode("Allricht")); + Assert.AreEqual("E166", this.StringEncoder.Encode("Eberhard")); + Assert.AreEqual("E521", this.StringEncoder.Encode("Engebrethson")); + Assert.AreEqual("H512", this.StringEncoder.Encode("Heimbach")); + Assert.AreEqual("H524", this.StringEncoder.Encode("Hanselmann")); + Assert.AreEqual("H431", this.StringEncoder.Encode("Hildebrand")); + Assert.AreEqual("K152", this.StringEncoder.Encode("Kavanagh")); + Assert.AreEqual("L530", this.StringEncoder.Encode("Lind")); + Assert.AreEqual("L222", this.StringEncoder.Encode("Lukaschowsky")); + Assert.AreEqual("M235", this.StringEncoder.Encode("McDonnell")); + 
Assert.AreEqual("M200", this.StringEncoder.Encode("McGee")); + Assert.AreEqual("O155", this.StringEncoder.Encode("Opnian")); + Assert.AreEqual("O155", this.StringEncoder.Encode("Oppenheimer")); + Assert.AreEqual("R355", this.StringEncoder.Encode("Riedemanas")); + Assert.AreEqual("Z300", this.StringEncoder.Encode("Zita")); + Assert.AreEqual("Z325", this.StringEncoder.Encode("Zitzmeinn")); + } + + /** + * Examples from http://www.archives.gov/research_room/genealogy/census/soundex.html + */ + [Test] + public void TestEncodeBatch3() + { + Assert.AreEqual("W252", this.StringEncoder.Encode("Washington")); + Assert.AreEqual("L000", this.StringEncoder.Encode("Lee")); + Assert.AreEqual("G362", this.StringEncoder.Encode("Gutierrez")); + Assert.AreEqual("P236", this.StringEncoder.Encode("Pfister")); + Assert.AreEqual("J250", this.StringEncoder.Encode("Jackson")); + Assert.AreEqual("T522", this.StringEncoder.Encode("Tymczak")); + // For VanDeusen: D-250 (D, 2 for the S, 5 for the N, 0 added) is also + // possible. 
+ Assert.AreEqual("V532", this.StringEncoder.Encode("VanDeusen")); + } + + /** + * Examples from: http://www.myatt.demon.co.uk/sxalg.htm + */ + [Test] + public void TestEncodeBatch4() + { + Assert.AreEqual("H452", this.StringEncoder.Encode("HOLMES")); + Assert.AreEqual("A355", this.StringEncoder.Encode("ADOMOMI")); + Assert.AreEqual("V536", this.StringEncoder.Encode("VONDERLEHR")); + Assert.AreEqual("B400", this.StringEncoder.Encode("BALL")); + Assert.AreEqual("S000", this.StringEncoder.Encode("SHAW")); + Assert.AreEqual("J250", this.StringEncoder.Encode("JACKSON")); + Assert.AreEqual("S545", this.StringEncoder.Encode("SCANLON")); + Assert.AreEqual("S532", this.StringEncoder.Encode("SAINTJOHN")); + + } + + [Test] + public void TestEncodeIgnoreApostrophes() + { + this.CheckEncodingVariations("O165", new string[]{ + "OBrien", + "'OBrien", + "O'Brien", + "OB'rien", + "OBr'ien", + "OBri'en", + "OBrie'n", + "OBrien'"}); + } + + /** + * Test data from http://www.myatt.demon.co.uk/sxalg.htm + * + * @throws EncoderException + */ + [Test] + public void TestEncodeIgnoreHyphens() + { + this.CheckEncodingVariations("K525", new String[]{ + "KINGSMITH", + "-KINGSMITH", + "K-INGSMITH", + "KI-NGSMITH", + "KIN-GSMITH", + "KING-SMITH", + "KINGS-MITH", + "KINGSM-ITH", + "KINGSMI-TH", + "KINGSMIT-H", + "KINGSMITH-"}); + } + + [Test] + public void TestEncodeIgnoreTrimmable() + { + Assert.AreEqual("W252", this.StringEncoder.Encode(" \t\n\r Washington \t\n\r ")); + } + + /** + * Consonants from the same code group separated by W or H are treated as one. + */ + [Test] + public void TestHWRuleEx1() + { + // From + // http://www.archives.gov/research_room/genealogy/census/soundex.html: + // Ashcraft is coded A-261 (A, 2 for the S, C ignored, 6 for the R, 1 + // for the F). It is not coded A-226. + Assert.AreEqual("A261", this.StringEncoder.Encode("Ashcraft")); + } + + /** + * Consonants from the same code group separated by W or H are treated as one. 
+ * + * Test data from http://www.myatt.demon.co.uk/sxalg.htm + */ + [Test] + public void TestHWRuleEx2() + { + Assert.AreEqual("B312", this.StringEncoder.Encode("BOOTHDAVIS")); + Assert.AreEqual("B312", this.StringEncoder.Encode("BOOTH-DAVIS")); + } + + /** + * Consonants from the same code group separated by W or H are treated as one. + * + * @throws EncoderException + */ + [Test] + public void TestHWRuleEx3() + { + Assert.AreEqual("S460", this.StringEncoder.Encode("Sgler")); + Assert.AreEqual("S460", this.StringEncoder.Encode("Swhgler")); + // Also S460: + this.CheckEncodingVariations("S460", new String[]{ + "SAILOR", + "SALYER", + "SAYLOR", + "SCHALLER", + "SCHELLER", + "SCHILLER", + "SCHOOLER", + "SCHULER", + "SCHUYLER", + "SEILER", + "SEYLER", + "SHOLAR", + "SHULER", + "SILAR", + "SILER", + "SILLER"}); + } + + /** + * Examples for MS SQLServer from + * http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_setu-sus_3o6w.asp + */ + [Test] + public void TestMsSqlServer1() + { + Assert.AreEqual("S530", this.StringEncoder.Encode("Smith")); + Assert.AreEqual("S530", this.StringEncoder.Encode("Smythe")); + } + + /** + * Examples for MS SQLServer from + * http://support.microsoft.com/default.aspx?scid=http://support.microsoft.com:80/support + * /kb/articles/Q100/3/65.asp&NoWebContent=1 + * + * @throws EncoderException + */ + [Test] + public void TestMsSqlServer2() + { + this.CheckEncodingVariations("E625", new String[] { "Erickson", "Erickson", "Erikson", "Ericson", "Ericksen", "Ericsen" }); + } + + /** + * Examples for MS SQLServer from http://databases.about.com/library/weekly/aa042901a.htm + */ + [Test] + public void TestMsSqlServer3() + { + Assert.AreEqual("A500", this.StringEncoder.Encode("Ann")); + Assert.AreEqual("A536", this.StringEncoder.Encode("Andrew")); + Assert.AreEqual("J530", this.StringEncoder.Encode("Janet")); + Assert.AreEqual("M626", this.StringEncoder.Encode("Margaret")); + Assert.AreEqual("S315", 
this.StringEncoder.Encode("Steven")); + Assert.AreEqual("M240", this.StringEncoder.Encode("Michael")); + Assert.AreEqual("R163", this.StringEncoder.Encode("Robert")); + Assert.AreEqual("L600", this.StringEncoder.Encode("Laura")); + Assert.AreEqual("A500", this.StringEncoder.Encode("Anne")); + } + + /** + * https://issues.apache.org/jira/browse/CODEC-54 https://issues.apache.org/jira/browse/CODEC-56 + */ + [Test] + public void TestNewInstance() + { + Assert.AreEqual("W452", new Soundex().GetSoundex("Williams")); + } + + [Test] + public void TestNewInstance2() + { + Assert.AreEqual("W452", new Soundex(Soundex.US_ENGLISH_MAPPING_STRING.toCharArray()).GetSoundex("Williams")); + } + + [Test] + public void TestNewInstance3() + { + Assert.AreEqual("W452", new Soundex(Soundex.US_ENGLISH_MAPPING_STRING).GetSoundex("Williams")); + } + + [Test] + public void TestSoundexUtilsConstructable() + { + new SoundexUtils(); + } + + [Test] + public void TestSoundexUtilsNullBehaviour() + { + Assert.AreEqual(null, SoundexUtils.Clean(null)); + Assert.AreEqual("", SoundexUtils.Clean("")); + Assert.AreEqual(0, SoundexUtils.DifferenceEncoded(null, "")); + Assert.AreEqual(0, SoundexUtils.DifferenceEncoded("", null)); + } + + /** + * https://issues.apache.org/jira/browse/CODEC-54 https://issues.apache.org/jira/browse/CODEC-56 + */ + [Test] + public void TestUsEnglishStatic() + { + Assert.AreEqual("W452", Soundex.US_ENGLISH.GetSoundex("Williams")); + } + + /** + * Fancy characters are not mapped by the default US mapping. 
+ * + * http://issues.apache.org/bugzilla/show_bug.cgi?id=29080 + */ + [Test] + public void TestUsMappingEWithAcute() + { + Assert.AreEqual("E000", this.StringEncoder.Encode("e")); + if (char.IsLetter('\u00e9')) + { // e-acute + try + { + // uppercase E-acute + Assert.AreEqual("\u00c9000", this.StringEncoder.Encode("\u00e9")); + Assert.Fail("Expected IllegalArgumentException not thrown"); + } +#pragma warning disable 168 + catch (ArgumentException e) +#pragma warning restore 168 + { + // expected + } + } + else + { + Assert.AreEqual("", this.StringEncoder.Encode("\u00e9")); + } + } + + /** + * Fancy characters are not mapped by the default US mapping. + * + * http://issues.apache.org/bugzilla/show_bug.cgi?id=29080 + */ + [Test] + public void TestUsMappingOWithDiaeresis() + { + Assert.AreEqual("O000", this.StringEncoder.Encode("o")); + if (char.IsLetter('\u00f6')) + { // o-umlaut + try + { + // uppercase O-umlaut + Assert.AreEqual("\u00d6000", this.StringEncoder.Encode("\u00f6")); + Assert.Fail("Expected IllegalArgumentException not thrown"); + } +#pragma warning disable 168 + catch (ArgumentException e) +#pragma warning restore 168 + { + // expected + } + } + else + { + Assert.AreEqual("", this.StringEncoder.Encode("\u00f6")); + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Language/StringEncoderAbstractTest.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Language/StringEncoderAbstractTest.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/StringEncoderAbstractTest.cs new file mode 100644 index 0000000..8fd8b7f --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Language/StringEncoderAbstractTest.cs @@ -0,0 +1,164 @@ +using NUnit.Framework; +using System; +using System.Globalization; +using System.Threading; + +namespace Lucene.Net.Analysis.Phonetic.Language +{ + /* + * Licensed to the Apache Software Foundation 
(ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + public abstract class StringEncoderAbstractTest<T> + where T : IStringEncoder + { + protected T stringEncoder; + + [SetUp] + public void SetUp() + { + stringEncoder = this.CreateStringEncoder(); + } + + public virtual void CheckEncoding(string expected, string source) + { + Assert.AreEqual(expected, this.StringEncoder.Encode(source), "Source: " + source); + } + + protected virtual void CheckEncodings(string[][] data) + { + foreach (string[] + element in data) + { + this.CheckEncoding(element[1], element[0]); + } + } + + protected virtual void CheckEncodingVariations(string expected, string[] data) + { + foreach (string element in data) + { + this.CheckEncoding(expected, element); + } + } + + protected abstract T CreateStringEncoder(); + + public virtual T StringEncoder + { + get { return this.stringEncoder; } + } + + [Test] + public virtual void TestEncodeEmpty() + { + IStringEncoder encoder = this.StringEncoder; + encoder.Encode(""); + encoder.Encode(" "); + encoder.Encode("\t"); + } + + // LUCENENET specific - since strings are sealed in .NET, there + // is no point in implementing IEncoder or running these tests. 
+ // Our version only accepts strings + [Test] + public virtual void TestEncodeNull() + { + IStringEncoder encoder = this.StringEncoder; + try + { + encoder.Encode(null); + } +#pragma warning disable 168 + catch (/*Encoder*/Exception ee) +#pragma warning restore 168 + { + // An exception should be thrown + } + } + + //[Test] + //public virtual void TestEncodeWithInvalidObject() + //{ + // bool exceptionThrown = false; + // try + // { + // IStringEncoder encoder = this.StringEncoder; + // encoder.Encode(3.4f); + // } + // catch (Exception e) + // { + // exceptionThrown = true; + // } + // Assert.True(exceptionThrown, "An exception was not thrown when we tried to encode " + "a Float object"); + //} + + [Test] + public virtual void TestLocaleIndependence() + { + IStringEncoder encoder = this.StringEncoder; + + string[] + data = { "I", "i", }; + + CultureInfo orig = CultureInfo.CurrentCulture; + CultureInfo[] locales = { new CultureInfo("en"), new CultureInfo("tr"), CultureInfo.CurrentCulture }; + + try + { + foreach (string element in data) + { + string @ref = null; + for (int j = 0; j < locales.Length; j++) + { + //Locale.setDefault(locales[j]); +#if NETSTANDARD + CultureInfo.CurrentCulture = locales[j]; +#else + Thread.CurrentThread.CurrentCulture = locales[j]; +#endif + if (j <= 0) + { + @ref = encoder.Encode(element); + } + else + { + string cur = null; + try + { + cur = encoder.Encode(element); + } + catch (Exception e) + { + Assert.Fail(CultureInfo.CurrentCulture.ToString() + ": " + e.Message); + } + Assert.AreEqual(@ref, cur, CultureInfo.CurrentCulture.ToString() + ": "); + } + } + } + } + finally + { + //Locale.setDefault(orig); +#if NETSTANDARD + CultureInfo.CurrentCulture = orig; +#else + Thread.CurrentThread.CurrentCulture = orig; +#endif + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Lucene.Net.Tests.Analysis.Phonetic.csproj 
---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Lucene.Net.Tests.Analysis.Phonetic.csproj b/src/Lucene.Net.Tests.Analysis.Phonetic/Lucene.Net.Tests.Analysis.Phonetic.csproj new file mode 100644 index 0000000..5c38e1f --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Lucene.Net.Tests.Analysis.Phonetic.csproj @@ -0,0 +1,108 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" /> + <PropertyGroup> + <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration> + <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform> + <ProjectGuid>{A2867797-0A5D-4878-8F59-58C399C9A4E4}</ProjectGuid> + <OutputType>Library</OutputType> + <AppDesignerFolder>Properties</AppDesignerFolder> + <RootNamespace>Lucene.Net.Analysis.Phonetic</RootNamespace> + <AssemblyName>Lucene.Net.Tests.Analysis.Phonetic</AssemblyName> + <TargetFrameworkVersion>v4.5.1</TargetFrameworkVersion> + <FileAlignment>512</FileAlignment> + </PropertyGroup> + <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "> + <DebugSymbols>true</DebugSymbols> + <DebugType>full</DebugType> + <Optimize>false</Optimize> + <OutputPath>bin\Debug\</OutputPath> + <DefineConstants>DEBUG;TRACE</DefineConstants> + <ErrorReport>prompt</ErrorReport> + <WarningLevel>4</WarningLevel> + </PropertyGroup> + <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' "> + <DebugType>pdbonly</DebugType> + <Optimize>true</Optimize> + <OutputPath>bin\Release\</OutputPath> + <DefineConstants>TRACE</DefineConstants> + <ErrorReport>prompt</ErrorReport> + <WarningLevel>4</WarningLevel> + </PropertyGroup> + <PropertyGroup> + 
<DefineConstants>$(DefineConstants);FEATURE_SERIALIZABLE</DefineConstants> + </PropertyGroup> + <ItemGroup> + <Reference Include="System" /> + <Reference Include="System.Core" /> + <Reference Include="System.Xml.Linq" /> + <Reference Include="System.Data.DataSetExtensions" /> + <Reference Include="Microsoft.CSharp" /> + <Reference Include="System.Data" /> + <Reference Include="System.Net.Http" /> + <Reference Include="System.Xml" /> + </ItemGroup> + <ItemGroup> + <Compile Include="DoubleMetaphoneFilterTest.cs" /> + <Compile Include="Language\Bm\BeiderMorseEncoderTest.cs" /> + <Compile Include="Language\Bm\CacheSubSequencePerformanceTest.cs" /> + <Compile Include="Language\Bm\LanguageGuessingTest.cs" /> + <Compile Include="Language\Bm\PhoneticEnginePerformanceTest.cs" /> + <Compile Include="Language\Bm\PhoneticEngineRegressionTest.cs" /> + <Compile Include="Language\Bm\PhoneticEngineTest.cs" /> + <Compile Include="Language\Bm\RuleTest.cs" /> + <Compile Include="Language\Caverphone1Test.cs" /> + <Compile Include="Language\Caverphone2Test .cs" /> + <Compile Include="Language\ColognePhoneticTest.cs" /> + <Compile Include="Language\DaitchMokotoffSoundexTest.cs" /> + <Compile Include="Language\DoubleMetaphone2Test.cs" /> + <Compile Include="Language\DoubleMetaphoneTest.cs" /> + <Compile Include="Language\MatchRatingApproachEncoderTest.cs" /> + <Compile Include="Language\MetaphoneTest.cs" /> + <Compile Include="Language\NysiisTest.cs" /> + <Compile Include="Language\RefinedSoundexTest.cs" /> + <Compile Include="Language\SoundexTest.cs" /> + <Compile Include="Language\StringEncoderAbstractTest.cs" /> + <Compile Include="Properties\AssemblyInfo.cs" /> + <Compile Include="TestBeiderMorseFilter.cs" /> + <Compile Include="TestBeiderMorseFilterFactory.cs" /> + <Compile Include="TestDoubleMetaphoneFilterFactory.cs" /> + <Compile Include="TestPhoneticFilter.cs" /> + <Compile Include="TestPhoneticFilterFactory.cs" /> + <Compile Include="..\CommonAssemblyInfo.cs"> + 
<Link>Properties\CommonAssemblyInfo.cs</Link> + </Compile> + </ItemGroup> + <ItemGroup> + <ProjectReference Include="..\Lucene.Net.Analysis.Common\Lucene.Net.Analysis.Common.csproj"> + <Project>{4ADD0BBC-B900-4715-9526-D871DE8EEA64}</Project> + <Name>Lucene.Net.Analysis.Common</Name> + </ProjectReference> + <ProjectReference Include="..\Lucene.Net.Analysis.Phonetic\Lucene.Net.Analysis.Phonetic.csproj"> + <Project>{DAFE3B64-616A-4A2F-90E5-1F135E8A9AF5}</Project> + <Name>Lucene.Net.Analysis.Phonetic</Name> + </ProjectReference> + <ProjectReference Include="..\Lucene.Net.TestFramework\Lucene.Net.TestFramework.csproj"> + <Project>{b2c0d749-ce34-4f62-a15e-00cb2ff5ddb3}</Project> + <Name>Lucene.Net.TestFramework</Name> + </ProjectReference> + <ProjectReference Include="..\Lucene.Net\Lucene.Net.csproj"> + <Project>{5D4AD9BE-1FFB-41AB-9943-25737971BF57}</Project> + <Name>Lucene.Net</Name> + </ProjectReference> + </ItemGroup> + <ItemGroup> + <None Include="Lucene.Net.Tests.Analysis.Phonetic.project.json" /> + </ItemGroup> + <ItemGroup> + <Service Include="{82A7F48D-3B50-4B1E-B82E-3ADA8210C358}" /> + </ItemGroup> + <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" /> + <!-- To modify your build process, add your task inside one of the targets below and uncomment it. + Other similar extension points exist, see Microsoft.Common.targets. 
+ <Target Name="BeforeBuild"> + </Target> + <Target Name="AfterBuild"> + </Target> + --> +</Project> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Lucene.Net.Tests.Analysis.Phonetic.project.json ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Lucene.Net.Tests.Analysis.Phonetic.project.json b/src/Lucene.Net.Tests.Analysis.Phonetic/Lucene.Net.Tests.Analysis.Phonetic.project.json new file mode 100644 index 0000000..8c631ab --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Lucene.Net.Tests.Analysis.Phonetic.project.json @@ -0,0 +1,11 @@ +{ + "runtimes": { + "win": {} + }, + "dependencies": { + "NUnit": "3.5.0" + }, + "frameworks": { + "net451": {} + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Lucene.Net.Tests.Analysis.Phonetic.xproj ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Lucene.Net.Tests.Analysis.Phonetic.xproj b/src/Lucene.Net.Tests.Analysis.Phonetic/Lucene.Net.Tests.Analysis.Phonetic.xproj new file mode 100644 index 0000000..16b7fef --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Lucene.Net.Tests.Analysis.Phonetic.xproj @@ -0,0 +1,21 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="14.0.25420" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup> + <VisualStudioVersion Condition="'$(VisualStudioVersion)' == ''">14.0.25420</VisualStudioVersion> + <VSToolsPath Condition="'$(VSToolsPath)' == ''">$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)</VSToolsPath> + </PropertyGroup> + <Import Project="$(VSToolsPath)\DotNet\Microsoft.DotNet.Props" Condition="'$(VSToolsPath)' != ''" /> + <PropertyGroup Label="Globals"> + 
<ProjectGuid>1fe12ef7-4c89-4d49-bdd1-e49dc285f21b</ProjectGuid> + <RootNamespace>Lucene.Net.Tests.Analysis.Phonetic</RootNamespace> + <BaseIntermediateOutputPath Condition="'$(BaseIntermediateOutputPath)'=='' ">.\obj</BaseIntermediateOutputPath> + <OutputPath Condition="'$(OutputPath)'=='' ">.\bin\</OutputPath> + </PropertyGroup> + <PropertyGroup> + <SchemaVersion>2.0</SchemaVersion> + </PropertyGroup> + <ItemGroup> + <Service Include="{82a7f48d-3b50-4b1e-b82e-3ada8210c358}" /> + </ItemGroup> + <Import Project="$(VSToolsPath)\DotNet\Microsoft.DotNet.targets" Condition="'$(VSToolsPath)' != ''" /> +</Project> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/Properties/AssemblyInfo.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/Properties/AssemblyInfo.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/Properties/AssemblyInfo.cs new file mode 100644 index 0000000..14e5b1c --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Phonetic/Properties/AssemblyInfo.cs @@ -0,0 +1,42 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ * +*/ + +using System.Reflection; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +// General Information about an assembly is controlled through the following +// set of attributes. Change these attribute values to modify the information +// associated with an assembly. +[assembly: AssemblyTitle("Lucene.Net.Tests.Analysis.Phonetic")] +[assembly: AssemblyDescription("")] +[assembly: AssemblyConfiguration("")] +[assembly: AssemblyCulture("")] + +// Setting ComVisible to false makes the types in this assembly not visible +// to COM components. If you need to access a type in this assembly from +// COM, set the ComVisible attribute to true on that type. +[assembly: ComVisible(false)] + +// The following GUID is for the ID of the typelib if this project is exposed to COM +[assembly: Guid("a2867797-0a5d-4878-8f59-58c399c9a4e4")] + +// NOTE: Version information is in CommonAssemblyInfo.cs http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/TestBeiderMorseFilter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/TestBeiderMorseFilter.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/TestBeiderMorseFilter.cs new file mode 100644 index 0000000..cc0e897 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Phonetic/TestBeiderMorseFilter.cs @@ -0,0 +1,132 @@ +using Lucene.Net.Analysis.Core; +using Lucene.Net.Analysis.Miscellaneous; +using Lucene.Net.Analysis.Phonetic.Language.Bm; +using Lucene.Net.Analysis.TokenAttributes; +using NUnit.Framework; +using System; +using System.Collections.Generic; +using System.IO; +using System.Text.RegularExpressions; + +namespace Lucene.Net.Analysis.Phonetic +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Tests <see cref="BeiderMorseFilter"/> + /// </summary> + public class TestBeiderMorseFilter : BaseTokenStreamTestCase + { + private Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => + { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + return new TokenStreamComponents(tokenizer, + new BeiderMorseFilter(tokenizer, new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true))); + }); + + + /** generic, "exact" configuration */ + [Test] + public void TestBasicUsage() + { + AssertAnalyzesTo(analyzer, "Angelo", + new String[] { "anZelo", "andZelo", "angelo", "anhelo", "anjelo", "anxelo" }, + new int[] { 0, 0, 0, 0, 0, 0 }, + new int[] { 6, 6, 6, 6, 6, 6 }, + new int[] { 1, 0, 0, 0, 0, 0 }); + + + AssertAnalyzesTo(analyzer, "D'Angelo", + new String[] { "anZelo", "andZelo", "angelo", "anhelo", "anjelo", "anxelo", + "danZelo", "dandZelo", "dangelo", "danhelo", "danjelo", "danxelo" }, + new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + new int[] { 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 }, + new int[] { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }); + } + + /** restrict the output to a set of possible origin languages */ + [Test] + public void TestLanguageSet() + { + LanguageSet languages = LanguageSet.From(new HashSet<String>() { + "italian", "greek", "spanish" + }); + Analyzer analyzer = 
Analyzer.NewAnonymous(createComponents: (fieldName, reader) => + { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + return new TokenStreamComponents(tokenizer, + new BeiderMorseFilter(tokenizer, + new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true), languages)); + }); + + AssertAnalyzesTo(analyzer, "Angelo", + new String[] { "andZelo", "angelo", "anxelo" }, + new int[] { 0, 0, 0, }, + new int[] { 6, 6, 6, }, + new int[] { 1, 0, 0, }); + } + + /** for convenience, if the input yields no output, we pass it thru as-is */ + [Test] + public void TestNumbers() + { + AssertAnalyzesTo(analyzer, "1234", + new String[] { "1234" }, + new int[] { 0 }, + new int[] { 4 }, + new int[] { 1 }); + } + + [Test] + public void TestRandom() + { + CheckRandomData(Random(), analyzer, 1000 * RANDOM_MULTIPLIER); + } + + [Test] + public void TestEmptyTerm() + { + Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => + { + Tokenizer tokenizer = new KeywordTokenizer(reader); + return new TokenStreamComponents(tokenizer, new BeiderMorseFilter(tokenizer, new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true))); + }); + + CheckOneTerm(a, "", ""); + } + + [Test] + public void TestCustomAttribute() + { + TokenStream stream = new KeywordTokenizer(new StringReader("D'Angelo")); + stream = new PatternKeywordMarkerFilter(stream, new Regex(".*")); + stream = new BeiderMorseFilter(stream, new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true)); + IKeywordAttribute keyAtt = stream.AddAttribute<IKeywordAttribute>(); + stream.Reset(); + int i = 0; + while (stream.IncrementToken()) + { + assertTrue(keyAtt.IsKeyword); + i++; + } + assertEquals(12, i); + stream.End(); + stream.Dispose(); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/TestBeiderMorseFilterFactory.cs ---------------------------------------------------------------------- diff --git 
a/src/Lucene.Net.Tests.Analysis.Phonetic/TestBeiderMorseFilterFactory.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/TestBeiderMorseFilterFactory.cs new file mode 100644 index 0000000..5bdf1b7 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Phonetic/TestBeiderMorseFilterFactory.cs @@ -0,0 +1,89 @@ +using Lucene.Net.Support; +using NUnit.Framework; +using System; +using System.Collections.Generic; +using System.IO; + +namespace Lucene.Net.Analysis.Phonetic +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + /// <summary> + /// Simple tests for <see cref="BeiderMorseFilterFactory"/> + /// </summary> + public class TestBeiderMorseFilterFactory : BaseTokenStreamTestCase + { + [Test] + public void TestBasics() + { + BeiderMorseFilterFactory factory = new BeiderMorseFilterFactory(new Dictionary<String, String>()); + TokenStream ts = factory.Create(new MockTokenizer(new StringReader("Weinberg"), MockTokenizer.WHITESPACE, false)); + AssertTokenStreamContents(ts, + new String[] { "vDnbirk", "vanbirk", "vinbirk", "wDnbirk", "wanbirk", "winbirk" }, + new int[] { 0, 0, 0, 0, 0, 0 }, + new int[] { 8, 8, 8, 8, 8, 8 }, + new int[] { 1, 0, 0, 0, 0, 0 }); + } + + [Test] + public void TestLanguageSet() + { + IDictionary<String, String> args = new Dictionary<string, string>(); + args.Put("languageSet", "polish"); + BeiderMorseFilterFactory factory = new BeiderMorseFilterFactory(args); + TokenStream ts = factory.Create(new MockTokenizer(new StringReader("Weinberg"), MockTokenizer.WHITESPACE, false)); + AssertTokenStreamContents(ts, + new String[] { "vDmbYrk", "vDmbirk", "vambYrk", "vambirk", "vimbYrk", "vimbirk" }, + new int[] { 0, 0, 0, 0, 0, 0 }, + new int[] { 8, 8, 8, 8, 8, 8 }, + new int[] { 1, 0, 0, 0, 0, 0 }); + } + + [Test] + public void TestOptions() + { + IDictionary<String, String> args = new Dictionary<string, string>(); + args.Put("nameType", "ASHKENAZI"); + args.Put("ruleType", "EXACT"); + BeiderMorseFilterFactory factory = new BeiderMorseFilterFactory(args); + TokenStream ts = factory.Create(new MockTokenizer(new StringReader("Weinberg"), MockTokenizer.WHITESPACE, false)); + AssertTokenStreamContents(ts, + new String[] { "vajnberk" }, + new int[] { 0 }, + new int[] { 8 }, + new int[] { 1 }); + } + + /** Test that bogus arguments result in exception */ + [Test] + public void TestBogusArguments() + { + try + { + new BeiderMorseFilterFactory(new Dictionary<String, String>() { + { "bogusArg", "bogusValue" } + }); + fail(); + } + catch (ArgumentException expected) + { 
+ assertTrue(expected.Message.Contains("Unknown parameters")); + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/TestDoubleMetaphoneFilterFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/TestDoubleMetaphoneFilterFactory.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/TestDoubleMetaphoneFilterFactory.cs new file mode 100644 index 0000000..5ba337b --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Phonetic/TestDoubleMetaphoneFilterFactory.cs @@ -0,0 +1,70 @@ +using NUnit.Framework; +using System; +using System.Collections.Generic; +using System.IO; + +namespace Lucene.Net.Analysis.Phonetic +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + public class TestDoubleMetaphoneFilterFactory : BaseTokenStreamTestCase + { + [Test] + public void TestDefaults() + { + DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory(new Dictionary<String, String>()); + TokenStream inputStream = new MockTokenizer(new StringReader("international"), MockTokenizer.WHITESPACE, false); + + TokenStream filteredStream = factory.Create(inputStream); + assertEquals(typeof(DoubleMetaphoneFilter), filteredStream.GetType()); + AssertTokenStreamContents(filteredStream, new String[] { "international", "ANTR" }); + } + + [Test] + public void TestSettingSizeAndInject() + { + IDictionary<string, string> parameters = new Dictionary<string, string>(); + parameters["inject"] = "false"; + parameters["maxCodeLength"] = "8"; + DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory(parameters); + + TokenStream inputStream = new MockTokenizer(new StringReader("international"), MockTokenizer.WHITESPACE, false); + + TokenStream filteredStream = factory.Create(inputStream); + assertEquals(typeof(DoubleMetaphoneFilter), filteredStream.GetType()); + AssertTokenStreamContents(filteredStream, new String[] { "ANTRNXNL" }); + } + + /** Test that bogus arguments result in exception */ + [Test] + public void TestBogusArguments() + { + try + { + new DoubleMetaphoneFilterFactory(new Dictionary<String, String>() { + { "bogusArg", "bogusValue" } + }); + fail(); + } + catch (ArgumentException expected) + { + assertTrue(expected.Message.Contains("Unknown parameters")); + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Tests.Analysis.Phonetic/TestPhoneticFilter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.Phonetic/TestPhoneticFilter.cs b/src/Lucene.Net.Tests.Analysis.Phonetic/TestPhoneticFilter.cs new file mode 100644 index 0000000..387765f --- /dev/null +++ 
b/src/Lucene.Net.Tests.Analysis.Phonetic/TestPhoneticFilter.cs @@ -0,0 +1,122 @@ +using Lucene.Net.Analysis.Core; +using Lucene.Net.Analysis.Phonetic.Language; +using NUnit.Framework; +using System; +using System.IO; + +namespace Lucene.Net.Analysis.Phonetic +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + /// <summary> + /// Tests <see cref="PhoneticFilter"/> + /// </summary> + public class TestPhoneticFilter : BaseTokenStreamTestCase + { + [Test] + public void TestAlgorithms() + { + assertAlgorithm(new Metaphone(), true, "aaa bbb ccc easgasg", + new String[] { "A", "aaa", "B", "bbb", "KKK", "ccc", "ESKS", "easgasg" }); + assertAlgorithm(new Metaphone(), false, "aaa bbb ccc easgasg", + new String[] { "A", "B", "KKK", "ESKS" }); + + + assertAlgorithm(new DoubleMetaphone(), true, "aaa bbb ccc easgasg", + new String[] { "A", "aaa", "PP", "bbb", "KK", "ccc", "ASKS", "easgasg" }); + assertAlgorithm(new DoubleMetaphone(), false, "aaa bbb ccc easgasg", + new String[] { "A", "PP", "KK", "ASKS" }); + + + assertAlgorithm(new Soundex(), true, "aaa bbb ccc easgasg", + new String[] { "A000", "aaa", "B000", "bbb", "C000", "ccc", "E220", "easgasg" }); + assertAlgorithm(new Soundex(), false, "aaa bbb ccc easgasg", + new String[] { "A000", "B000", "C000", "E220" }); + + + assertAlgorithm(new RefinedSoundex(), true, "aaa bbb ccc easgasg", + new String[] { "A0", "aaa", "B1", "bbb", "C3", "ccc", "E034034", "easgasg" }); + assertAlgorithm(new RefinedSoundex(), false, "aaa bbb ccc easgasg", + new String[] { "A0", "B1", "C3", "E034034" }); + + + assertAlgorithm(new Caverphone2(), true, "Darda Karleen Datha Carlene", + new String[] { "TTA1111111", "Darda", "KLN1111111", "Karleen", + "TTA1111111", "Datha", "KLN1111111", "Carlene" }); + assertAlgorithm(new Caverphone2(), false, "Darda Karleen Datha Carlene", + new String[] { "TTA1111111", "KLN1111111", "TTA1111111", "KLN1111111" }); + } + + + static void assertAlgorithm(IStringEncoder encoder, bool inject, String input, + String[] expected) + { + Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, + new StringReader(input)); + PhoneticFilter filter = new PhoneticFilter(tokenizer, encoder, inject); + AssertTokenStreamContents(filter, expected); + } + + /** blast some random strings through the analyzer */ + [Test] + 
public void TestRandomStrings() + { + IStringEncoder[] encoders = new IStringEncoder[] { + new Metaphone(), new DoubleMetaphone(), new Soundex()/*, new RefinedSoundex()*/, new Caverphone2() + }; + + foreach (IStringEncoder e in encoders) + { + Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => + { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + return new TokenStreamComponents(tokenizer, new PhoneticFilter(tokenizer, e, false)); + }); + + CheckRandomData(Random(), a, 1000 * RANDOM_MULTIPLIER); + + Analyzer b = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => + { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + return new TokenStreamComponents(tokenizer, new PhoneticFilter(tokenizer, e, false)); + }); + + + CheckRandomData(Random(), b, 1000 * RANDOM_MULTIPLIER); + } + } + + [Test] + public void TestEmptyTerm() + { + IStringEncoder[] encoders = new IStringEncoder[] { + new Metaphone(), new DoubleMetaphone(), new Soundex()/*, new RefinedSoundex()*/, new Caverphone2() + }; + foreach (IStringEncoder e in encoders) + { + Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => + { + Tokenizer tokenizer = new KeywordTokenizer(reader); + return new TokenStreamComponents(tokenizer, new PhoneticFilter(tokenizer, e, Random().nextBoolean())); + }); + + CheckOneTerm(a, "", ""); + } + } + } +}
