// http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Nysiis.cs
// ----------------------------------------------------------------------
// diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Nysiis.cs b/src/Lucene.Net.Analysis.Phonetic/Language/Nysiis.cs
// new file mode 100644
// index 0000000..a80d4f4
// --- /dev/null
// +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Nysiis.cs
// @@ -0,0 +1,370 @@
// commons-codec version compatibility level: 1.9
using System;
using System.Text;
using System.Text.RegularExpressions;

namespace Lucene.Net.Analysis.Phonetic.Language
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Encodes a string into a NYSIIS value. NYSIIS is an encoding used to relate similar names, but can also be used as a
    /// general purpose scheme to find words with similar phonemes.
    /// </summary>
    /// <remarks>
    /// NYSIIS features an accuracy increase of 2.7% over the traditional Soundex algorithm.
    /// <para/>
    /// Algorithm description:
    /// <list type="number">
    ///     <item>
    ///         <term>Transcode first characters of name</term>
    ///         <description>
    ///             <list type="number">
    ///                 <item><description>MAC -> MCC</description></item>
    ///                 <item><description>KN -> NN</description></item>
    ///                 <item><description>K -> C</description></item>
    ///                 <item><description>PH -> FF</description></item>
    ///                 <item><description>PF -> FF</description></item>
    ///                 <item><description>SCH -> SSS</description></item>
    ///             </list>
    ///         </description>
    ///     </item>
    ///     <item>
    ///         <term>Transcode last characters of name</term>
    ///         <description>
    ///             <list type="number">
    ///                 <item><description>EE, IE -> Y</description></item>
    ///                 <item><description>DT,RT,RD,NT,ND -> D</description></item>
    ///             </list>
    ///         </description>
    ///     </item>
    ///     <item>
    ///         <term>First character of key = first character of name</term>
    ///     </item>
    ///     <item>
    ///         <term>Transcode remaining characters by following these rules, incrementing by one character each time</term>
    ///         <description>
    ///             <list type="number">
    ///                 <item><description>EV -> AF else A,E,I,O,U -> A</description></item>
    ///                 <item><description>Q -> G</description></item>
    ///                 <item><description>Z -> S</description></item>
    ///                 <item><description>M -> N</description></item>
    ///                 <item><description>KN -> N else K -> C</description></item>
    ///                 <item><description>SCH -> SSS</description></item>
    ///                 <item><description>PH -> FF</description></item>
    ///                 <item><description>H -> If previous or next is nonvowel, previous</description></item>
    ///                 <item><description>W -> If previous is vowel, previous</description></item>
    ///                 <item><description>Add current to key if current != last key character</description></item>
    ///             </list>
    ///         </description>
    ///     </item>
    ///     <item>
    ///         <term>If last character is S, remove it</term>
    ///     </item>
    ///     <item>
    ///         <term>If last characters are AY, replace with Y</term>
    ///     </item>
    ///     <item>
    ///         <term>If last character is A, remove it</term>
    ///     </item>
    ///     <item>
    ///         <term>Collapse all strings of repeated characters</term>
    ///     </item>
    ///     <item>
    ///         <term>Add original first character of name as first character of key</term>
    ///     </item>
    /// </list>
    /// <para/>
    /// This class is immutable and thread-safe.
    /// <para/>
    /// See: <a href="http://en.wikipedia.org/wiki/NYSIIS">NYSIIS on Wikipedia</a>
    /// <para/>
    /// See: <a href="http://www.dropby.com/NYSIIS.html">NYSIIS on dropby.com</a>
    /// <para/>
    /// since 1.7
    /// </remarks>
    /// <seealso cref="Soundex"/>
    public class Nysiis : IStringEncoder
    {
        // Shared replacement sequences returned by TranscodeRemaining. They are only ever
        // copied from (never written to), so sharing a single instance is safe.
        private static readonly char[] CHARS_A = new char[] { 'A' };
        private static readonly char[] CHARS_AF = new char[] { 'A', 'F' };
        private static readonly char[] CHARS_C = new char[] { 'C' };
        private static readonly char[] CHARS_FF = new char[] { 'F', 'F' };
        private static readonly char[] CHARS_G = new char[] { 'G' };
        private static readonly char[] CHARS_N = new char[] { 'N' };
        private static readonly char[] CHARS_NN = new char[] { 'N', 'N' };
        private static readonly char[] CHARS_S = new char[] { 'S' };
        private static readonly char[] CHARS_SSS = new char[] { 'S', 'S', 'S' };

        // Prefix patterns (anchored at the start) and suffix patterns (anchored at the end).
        private static readonly Regex PAT_MAC = new Regex("^MAC", RegexOptions.Compiled);
        private static readonly Regex PAT_KN = new Regex("^KN", RegexOptions.Compiled);
        private static readonly Regex PAT_K = new Regex("^K", RegexOptions.Compiled);
        private static readonly Regex PAT_PH_PF = new Regex("^(PH|PF)", RegexOptions.Compiled);
        private static readonly Regex PAT_SCH = new Regex("^SCH", RegexOptions.Compiled);
        private static readonly Regex PAT_EE_IE = new Regex("(EE|IE)$", RegexOptions.Compiled);
        private static readonly Regex PAT_DT_ETC = new Regex("(DT|RT|RD|NT|ND)$", RegexOptions.Compiled);

        /// <summary>Placeholder used for the look-ahead characters once the end of the input is reached.</summary>
        private const char SPACE = ' ';

        /// <summary>Maximum length of an encoded key in strict mode.</summary>
        private const int TRUE_LENGTH = 6;

        /// <summary>
        /// Tests if the given character is a vowel.
        /// </summary>
        /// <param name="c">The character to test.</param>
        /// <returns><c>true</c> if the character is one of the upper-case vowels A, E, I, O, U; <c>false</c> otherwise.</returns>
        private static bool IsVowel(char c)
        {
            return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U';
        }

        /// <summary>
        /// Transcodes the remaining parts of the string. The method operates on a sliding window, looking at 4 characters at
        /// a time: [i-1, i, i+1, i+2].
        /// </summary>
        /// <param name="prev">The previous character.</param>
        /// <param name="curr">The current character.</param>
        /// <param name="next">The next character.</param>
        /// <param name="aNext">The after next character.</param>
        /// <returns>
        /// A transcoded array of characters, starting from the current position. The array may be one of the
        /// shared constant arrays, so callers must not modify it.
        /// </returns>
        private static char[] TranscodeRemaining(char prev, char curr, char next, char aNext)
        {
            // 1. EV -> AF (must be tested before the generic vowel rule, since E is a vowel)
            if (curr == 'E' && next == 'V')
            {
                return CHARS_AF;
            }

            // A, E, I, O, U -> A
            if (IsVowel(curr))
            {
                return CHARS_A;
            }

            // 2. Q -> G, Z -> S, M -> N
            if (curr == 'Q')
            {
                return CHARS_G;
            }
            else if (curr == 'Z')
            {
                return CHARS_S;
            }
            else if (curr == 'M')
            {
                return CHARS_N;
            }

            // 3. KN -> NN else K -> C
            if (curr == 'K')
            {
                if (next == 'N')
                {
                    return CHARS_NN;
                }
                else
                {
                    return CHARS_C;
                }
            }

            // 4. SCH -> SSS
            if (curr == 'S' && next == 'C' && aNext == 'H')
            {
                return CHARS_SSS;
            }

            // PH -> FF
            if (curr == 'P' && next == 'H')
            {
                return CHARS_FF;
            }

            // 5. H -> If previous or next is a non vowel, previous.
            if (curr == 'H' && (!IsVowel(prev) || !IsVowel(next)))
            {
                return new char[] { prev };
            }

            // 6. W -> If previous is vowel, previous.
            if (curr == 'W' && IsVowel(prev))
            {
                return new char[] { prev };
            }

            return new char[] { curr };
        }

        /// <summary>Indicates the strict mode.</summary>
        private readonly bool strict;

        /// <summary>
        /// Creates an instance of the <see cref="Nysiis"/> encoder with strict mode (original form),
        /// i.e. encoded strings have a maximum length of 6.
        /// </summary>
        public Nysiis()
            : this(true)
        {
        }

        /// <summary>
        /// Creates an instance of the <see cref="Nysiis"/> encoder with the specified strict mode:
        /// <list type="bullet">
        ///     <item><term><c>true</c>:</term><description>encoded strings have a maximum length of 6</description></item>
        ///     <item><term><c>false</c>:</term><description>encoded strings may have arbitrary length</description></item>
        /// </list>
        /// </summary>
        /// <param name="strict">The strict mode.</param>
        public Nysiis(bool strict)
        {
            this.strict = strict;
        }

        // LUCENENET specific - in .NET we don't need an object overload, since strings are sealed anyway.

        /// <summary>
        /// Encodes a string using the NYSIIS algorithm.
        /// </summary>
        /// <param name="str">A string object to encode.</param>
        /// <returns>A <see cref="Nysiis"/> code corresponding to the string supplied.</returns>
        /// <exception cref="ArgumentException">If a character is not mapped.</exception>
        public virtual string Encode(string str)
        {
            return this.GetNysiis(str);
        }

        /// <summary>
        /// Indicates the strict mode for this <see cref="Nysiis"/> encoder.
        /// <c>true</c> if the encoder is configured for strict mode, <c>false</c> otherwise.
        /// </summary>
        public virtual bool IsStrict
        {
            get { return this.strict; }
        }

        /// <summary>
        /// Retrieves the NYSIIS code for a given string.
        /// </summary>
        /// <param name="str">String to encode using the NYSIIS algorithm.</param>
        /// <returns>
        /// A NYSIIS code for the string supplied; <c>null</c> if <paramref name="str"/> is <c>null</c>,
        /// and the cleaned (empty) string if nothing remains after cleaning.
        /// </returns>
        public virtual string GetNysiis(string str)
        {
            if (str == null)
            {
                return null;
            }

            // Use the same clean rules as Soundex
            str = SoundexUtils.Clean(str);

            if (str.Length == 0)
            {
                return str;
            }

            // Translate first characters of name:
            // MAC -> MCC, KN -> NN, K -> C, PH | PF -> FF, SCH -> SSS
            str = PAT_MAC.Replace(str, "MCC", 1);
            str = PAT_KN.Replace(str, "NN", 1);
            str = PAT_K.Replace(str, "C", 1);
            str = PAT_PH_PF.Replace(str, "FF", 1);
            str = PAT_SCH.Replace(str, "SSS", 1);

            // Translate last characters of name:
            // EE -> Y, IE -> Y, DT | RT | RD | NT | ND -> D
            str = PAT_EE_IE.Replace(str, "Y", 1);
            str = PAT_DT_ETC.Replace(str, "D", 1);

            // First character of key = first character of name.
            StringBuilder key = new StringBuilder(str.Length);
            key.Append(str[0]);

            // Transcode remaining characters, incrementing by one character each time
            char[] chars = str.ToCharArray();
            int len = chars.Length;

            for (int i = 1; i < len; i++)
            {
                char next = i < len - 1 ? chars[i + 1] : SPACE;
                char aNext = i < len - 2 ? chars[i + 2] : SPACE;
                char[] transcoded = TranscodeRemaining(chars[i - 1], chars[i], next, aNext);

                // Write the result back into the working buffer: later iterations read the
                // already-transcoded characters as their "previous"/"current" input.
                System.Array.Copy(transcoded, 0, chars, i, transcoded.Length);

                // only append the current char to the key if it is different from the last one
                if (chars[i] != chars[i - 1])
                {
                    key.Append(chars[i]);
                }
            }

            if (key.Length > 1)
            {
                char lastChar = key[key.Length - 1];

                // If last character is S, remove it.
                if (lastChar == 'S')
                {
                    key.Remove(key.Length - 1, 1);
                    lastChar = key[key.Length - 1];
                }

                if (key.Length > 2)
                {
                    char last2Char = key[key.Length - 2];
                    // If last characters are AY, replace with Y.
                    if (last2Char == 'A' && lastChar == 'Y')
                    {
                        key.Remove(key.Length - 2, 1);
                    }
                }

                // If last character is A, remove it.
                // NOTE(review): when the S-removal above leaves a single 'A' (e.g. input "AS"),
                // this strips the key down to an empty string. Left as is: this port declares
                // commons-codec 1.9 compatibility, so presumably this matches upstream - confirm
                // before changing.
                if (lastChar == 'A')
                {
                    key.Remove(key.Length - 1, 1);
                }
            }

            string result = key.ToString();
            return this.IsStrict ? result.Substring(0, Math.Min(TRUE_LENGTH, result.Length)) : result;
        }
    }
}
// http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/RefinedSoundex.cs
// ----------------------------------------------------------------------
// diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/RefinedSoundex.cs b/src/Lucene.Net.Analysis.Phonetic/Language/RefinedSoundex.cs
// new file mode 100644
// index 0000000..e0f9071
// --- /dev/null
// +++ b/src/Lucene.Net.Analysis.Phonetic/Language/RefinedSoundex.cs
// @@ -0,0 +1,202 @@
// commons-codec version compatibility level: 1.9
using System.Globalization;
using System.Text;

namespace Lucene.Net.Analysis.Phonetic.Language
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Encodes a string into a Refined Soundex value. A refined soundex code is
    /// optimized for spell checking words. Soundex method originally developed by
    /// <c>Margaret Odell</c> and <c>Robert Russell</c>.
    /// <para/>
    /// This class is immutable and thread-safe.
    /// </summary>
    public class RefinedSoundex : IStringEncoder
    {
        // NOTE: the static fields below are initialized in textual order; US_ENGLISH must stay
        // after US_ENGLISH_MAPPING because its constructor reads that array.

        /// <summary>
        /// since 1.4
        /// </summary>
        public static readonly string US_ENGLISH_MAPPING_STRING = "01360240043788015936020505";

        /// <summary>
        /// RefinedSoundex is *refined* for a number of reasons one being that the
        /// mappings have been altered. This implementation contains default
        /// mappings for US English.
        /// </summary>
        private static readonly char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.ToCharArray();

        /// <summary>
        /// Every letter of the alphabet is "mapped" to a numerical value. This char
        /// array holds the values to which each letter is mapped. This
        /// implementation contains a default map for US_ENGLISH.
        /// </summary>
        private readonly char[] soundexMapping;

        /// <summary>
        /// This static variable contains an instance of the RefinedSoundex using
        /// the US_ENGLISH mapping.
        /// </summary>
        public static readonly RefinedSoundex US_ENGLISH = new RefinedSoundex();

        /// <summary>
        /// Creates an instance of the <see cref="RefinedSoundex"/> object using the default US
        /// English mapping.
        /// </summary>
        public RefinedSoundex()
        {
            this.soundexMapping = US_ENGLISH_MAPPING;
        }

        /// <summary>
        /// Creates a refined soundex instance using a custom mapping. This
        /// constructor can be used to customize the mapping, and/or possibly
        /// provide an internationalized mapping for a non-Western character set.
        /// </summary>
        /// <param name="mapping">
        /// Mapping array to use when finding the corresponding code for a given character.
        /// The array is copied, so later changes by the caller do not affect this instance.
        /// </param>
        public RefinedSoundex(char[] mapping)
        {
            this.soundexMapping = new char[mapping.Length];
            System.Array.Copy(mapping, 0, this.soundexMapping, 0, mapping.Length);
        }

        /// <summary>
        /// Creates a refined Soundex instance using a custom mapping. This constructor can be used to customize the mapping,
        /// and/or possibly provide an internationalized mapping for a non-Western character set.
        /// </summary>
        /// <param name="mapping">Mapping string to use when finding the corresponding code for a given character.</param>
        public RefinedSoundex(string mapping)
        {
            this.soundexMapping = mapping.ToCharArray();
        }

        /// <summary>
        /// Returns the number of characters in the two encoded strings that are the
        /// same. This return value ranges from 0 to the length of the shortest
        /// encoded string: 0 indicates little or no similarity, and 4 out of 4 (for
        /// example) indicates strong similarity or identical values. For refined
        /// Soundex, the return value can be greater than 4.
        /// <para/>
        /// See: <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
        /// MS T-SQL DIFFERENCE</a>
        /// <para/>
        /// since 1.3
        /// </summary>
        /// <param name="s1">A string that will be encoded and compared.</param>
        /// <param name="s2">A string that will be encoded and compared.</param>
        /// <returns>The number of characters in the two encoded strings that are the same from 0 to the length of the shortest encoded string.</returns>
        /// <seealso cref="SoundexUtils.Difference(IStringEncoder, string, string)"/>
        public virtual int Difference(string s1, string s2)
        {
            return SoundexUtils.Difference(this, s1, s2);
        }

        // LUCENENET specific - in .NET we don't need an object overload, since strings are sealed anyway.

        /// <summary>
        /// Encodes a string using the refined soundex algorithm.
        /// </summary>
        /// <param name="str">A string object to encode.</param>
        /// <returns>A Soundex code corresponding to the string supplied.</returns>
        public virtual string Encode(string str)
        {
            return GetSoundex(str);
        }

        /// <summary>
        /// Returns the mapping code for a given character. The mapping codes are
        /// maintained in an internal char array named soundexMapping, and the
        /// default values of these mappings are US English.
        /// </summary>
        /// <param name="c"><see cref="char"/> to get mapping for.</param>
        /// <returns>
        /// A character (really a numeral) to return for the given <see cref="char"/>, or
        /// <c>(char)0</c> when <paramref name="c"/> is not a letter or is a letter that falls
        /// outside the configured mapping (for example an accented or non-Latin letter).
        /// </returns>
        internal char GetMappingCode(char c)
        {
            if (!char.IsLetter(c))
            {
                return (char)0;
            }

            // char.IsLetter accepts every Unicode letter, but the mapping only covers the
            // positions starting at 'A'. Treat anything outside the table as unmapped instead
            // of indexing past the end of the array.
            int index = char.ToUpperInvariant(c) - 'A';
            if (index < 0 || index >= this.soundexMapping.Length)
            {
                return (char)0;
            }
            return this.soundexMapping[index];
        }

        /// <summary>
        /// Retrieves the Refined Soundex code for a given string.
        /// </summary>
        /// <param name="str">String to encode using the Refined Soundex algorithm.</param>
        /// <returns>
        /// A soundex code for the string supplied; <c>null</c> if <paramref name="str"/> is <c>null</c>,
        /// and the cleaned (empty) string if nothing remains after cleaning.
        /// </returns>
        public virtual string GetSoundex(string str)
        {
            if (str == null)
            {
                return null;
            }
            str = SoundexUtils.Clean(str);
            if (str.Length == 0)
            {
                return str;
            }

            StringBuilder sBuf = new StringBuilder();
            sBuf.Append(str[0]);

            char last, current;
            last = '*'; // sentinel that never equals a mapping code, so the first code is always emitted

            // The loop deliberately starts at 0: the first letter is emitted verbatim above
            // AND contributes its own code to the output.
            for (int i = 0; i < str.Length; i++)
            {
                current = GetMappingCode(str[i]);
                if (current == last)
                {
                    // collapse runs of identical codes
                    continue;
                }
                else if (current != 0)
                {
                    sBuf.Append(current);
                }

                last = current;
            }

            return sBuf.ToString();
        }
    }
}
// http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Soundex.cs
// ----------------------------------------------------------------------
// diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Soundex.cs b/src/Lucene.Net.Analysis.Phonetic/Language/Soundex.cs
// new file mode 100644
// index 0000000..abb70c3
// --- /dev/null
// +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Soundex.cs
// @@ -0,0 +1,318 @@
// commons-codec version compatibility level: 1.10
using System;

namespace Lucene.Net.Analysis.Phonetic.Language
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

    /// <summary>
    /// Produces the Soundex code of a string. Soundex groups names that sound alike, and it can
    /// also serve as a general-purpose way of finding words with similar phonemes.
    /// <para/>
    /// This class is thread-safe.
    /// It is not strictly immutable, but the <see cref="maxLength"/> field is never consulted by the encoder.
    /// </summary>
    public class Soundex : IStringEncoder
    {
        // NOTE: static fields are initialized in textual order. The shared instances further
        // down rely on SILENT_MARKER and US_ENGLISH_MAPPING already being set, so keep the order.

        /// <summary>
        /// Mapping code that marks a letter as silent (ignored).
        /// Silent letters are skipped unless they are the first character of the input.
        /// <para/>
        /// Note: <see cref="US_ENGLISH_MAPPING_STRING"/> does not use this marker, because
        /// changing it might break existing code. A mapping without any silent marker is
        /// handled as though H and W were silent.
        /// <para/>
        /// Use the <see cref="Soundex(string, bool)"/> constructor to override that default.
        /// <para/>
        /// since 1.11
        /// </summary>
        public static readonly char SILENT_MARKER = '-';

        /// <summary>
        /// Default mapping for the 26 letters of US English. A code of <c>0</c> means the letter is
        /// not encoded, but it still separates two consonants that share the same code.
        /// <para/>
        /// (The constant exists both as an implementation convenience and so that documentation
        /// can show its value on the constant values page.)
        /// <para/>
        /// <b>Letters H and W get special treatment.</b>
        /// After the first letter they are ignored and do not separate consonants with the same code.
        /// </summary>
        /// <seealso cref="US_ENGLISH_MAPPING"/>
        //                                                              ABCDEFGHIJKLMNOPQRSTUVWXYZ
        public static readonly string US_ENGLISH_MAPPING_STRING = "01230120022455012623010202";

        /// <summary>
        /// Character-array form of the default US English mapping. A code of <c>0</c> means
        /// the letter is not encoded.
        /// </summary>
        /// <seealso cref="Soundex.Soundex(char[])"/>
        private static readonly char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.ToCharArray();

        /// <summary>
        /// Shared Soundex instance based on the US_ENGLISH_MAPPING mapping.
        /// H and W are silent here: except as the first letter they are ignored,
        /// and they do not separate duplicate codes.
        /// </summary>
        /// <seealso cref="US_ENGLISH_MAPPING"/>
        /// <seealso cref="US_ENGLISH_MAPPING_STRING"/>
        public static readonly Soundex US_ENGLISH = new Soundex();

        /// <summary>
        /// Shared Soundex instance for the Simplified Soundex mapping described at
        /// http://west-penwith.org.uk/misc/soundex.htm
        /// <para/>
        /// H and W are handled like the vowels (AEIOUY): after the first letter they are not
        /// encoded, but they do separate duplicate codes.
        /// In every other respect the mapping equals that of <see cref="US_ENGLISH"/>.
        /// <para/>
        /// since 1.11
        /// </summary>
        public static readonly Soundex US_ENGLISH_SIMPLIFIED = new Soundex(US_ENGLISH_MAPPING_STRING, false);

        /// <summary>
        /// Shared Soundex instance for the mapping used by the Genealogy site:
        /// http://www.genealogy.com/articles/research/00000060.html
        /// <para/>
        /// Vowels (AEIOUY), H and W are all silent: after the first letter they are ignored
        /// and do not separate duplicate codes.
        /// <para/>
        /// Consonant codes are the same as in
        /// <see cref="US_ENGLISH_MAPPING_STRING"/> and <see cref="US_ENGLISH_SIMPLIFIED"/>.
        /// <para/>
        /// since 1.11
        /// </summary>
        public static readonly Soundex US_ENGLISH_GENEALOGY = new Soundex("-123-12--22455-12623-1-2-2");
        //                                                                 ABCDEFGHIJKLMNOPQRSTUVWXYZ

        /// <summary>
        /// Maximum length of a Soundex code; by definition a Soundex code has four characters.
        /// </summary>
        [Obsolete("This feature is not needed since the encoding size must be constant. Will be removed in 2.0.")]
        private int maxLength = 4;

        /// <summary>
        /// Code table: the entry at position <c>letter - 'A'</c> is the code for that letter.
        /// </summary>
        private readonly char[] soundexMapping;

        /// <summary>
        /// Whether H and W receive special treatment.
        /// <para/>
        /// Before version 1.11 the code always treated H and W as silent (ignored) letters.
        /// When this field is <c>false</c>, H and W are handled like any other letter.
        /// </summary>
        private readonly bool specialCaseHW;

        /// <summary>
        /// Creates an instance that uses <see cref="US_ENGLISH_MAPPING"/> and treats H and W as silent.
        /// </summary>
        /// <seealso cref="Soundex.Soundex(char[])"/>
        /// <seealso cref="US_ENGLISH_MAPPING"/>
        public Soundex()
        {
            this.specialCaseHW = true;
            this.soundexMapping = US_ENGLISH_MAPPING;
        }

        /// <summary>
        /// Creates a Soundex instance from the given mapping array. Use this to supply an
        /// internationalized mapping for a non-Western character set.
        /// <para/>
        /// Each letter of the alphabet is mapped to a code; the array holds the code for each
        /// letter. The array is copied, so later changes by the caller have no effect.
        /// <para/>
        /// If the mapping contains <see cref="SILENT_MARKER"/>, H and W are not given special treatment.
        /// </summary>
        /// <param name="mapping">Mapping array to use when finding the corresponding code for a given character.</param>
        public Soundex(char[] mapping)
        {
            char[] copy = new char[mapping.Length];
            System.Array.Copy(mapping, 0, copy, 0, mapping.Length);
            this.soundexMapping = copy;
            this.specialCaseHW = !HasMarker(copy);
        }

        /// <summary>
        /// Reports whether <paramref name="mapping"/> contains at least one <see cref="SILENT_MARKER"/>.
        /// </summary>
        private bool HasMarker(char[] mapping)
        {
            for (int pos = 0; pos < mapping.Length; pos++)
            {
                if (mapping[pos] == SILENT_MARKER)
                {
                    return true;
                }
            }
            return false;
        }

        /// <summary>
        /// Creates a Soundex instance from a custom mapping string. Use this to customize the
        /// mapping and/or supply an internationalized mapping for a non-Western character set.
        /// <para/>
        /// If the mapping contains <see cref="SILENT_MARKER"/>, H and W are not given special treatment.
        /// <para/>
        /// since 1.4
        /// </summary>
        /// <param name="mapping">Mapping string to use when finding the corresponding code for a given character.</param>
        public Soundex(string mapping)
        {
            char[] table = mapping.ToCharArray();
            this.soundexMapping = table;
            this.specialCaseHW = !HasMarker(table);
        }

        /// <summary>
        /// Creates a Soundex instance from a custom mapping string with explicit control over
        /// the handling of H and W.
        /// <para/>
        /// since 1.11
        /// </summary>
        /// <param name="mapping">Mapping string to use when finding the corresponding code for a given character.</param>
        /// <param name="specialCaseHW">
        /// If <c>true</c>, H and W after the first letter are skipped entirely while encoding;
        /// if <c>false</c>, they are looked up in the mapping like any other letter.
        /// </param>
        public Soundex(string mapping, bool specialCaseHW)
        {
            this.specialCaseHW = specialCaseHW;
            this.soundexMapping = mapping.ToCharArray();
        }

        /// <summary>
        /// Encodes both strings and counts the characters that their codes have in common. The
        /// result ranges from 0 through 4: 0 indicates little or no similarity, and 4 indicates
        /// strong similarity or identical values.
        /// <para/>
        /// See: <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS
        /// T-SQL DIFFERENCE </a>
        /// <para/>
        /// since 1.3
        /// </summary>
        /// <param name="s1">A string that will be encoded and compared.</param>
        /// <param name="s2">A string that will be encoded and compared.</param>
        /// <returns>The number of characters in the two encoded strings that are the same from 0 to 4.</returns>
        /// <seealso cref="SoundexUtils.Difference(IStringEncoder, string, string)"/>
        public virtual int Difference(string s1, string s2)
        {
            int matching = SoundexUtils.Difference(this, s1, s2);
            return matching;
        }

        // LUCENENET specific - in .NET we don't need an object overload, since strings are sealed anyway.

        /// <summary>
        /// Encodes a string using the soundex algorithm.
        /// </summary>
        /// <param name="str">A string to encode.</param>
        /// <returns>A Soundex code corresponding to the string supplied.</returns>
        /// <exception cref="ArgumentException">If a character is not mapped.</exception>
        public virtual string Encode(string str)
        {
            return GetSoundex(str);
        }

        /// <summary>
        /// Gets or sets the maxLength. The value is stored but not used by the encoder.
        /// </summary>
        [Obsolete("This feature is not needed since the encoding size must be constant. Will be removed in 2.0.")]
        public virtual int MaxLength
        {
            get { return this.maxLength; }
            set { this.maxLength = value; }
        }

        /// <summary>
        /// Looks up the Soundex code of an upper-case character.
        /// </summary>
        /// <param name="ch">An upper-case character.</param>
        /// <returns>A Soundex code.</returns>
        /// <exception cref="ArgumentException">Thrown if <paramref name="ch"/> is not mapped.</exception>
        private char Map(char ch)
        {
            int index = ch - 'A';
            bool inTable = index >= 0 && index < this.soundexMapping.Length;
            if (inTable)
            {
                return this.soundexMapping[index];
            }
            throw new ArgumentException("The character is not mapped: " + ch + " (index=" + index + ")");
        }

        /// <summary>
        /// Retrieves the Soundex code for a given string.
        /// </summary>
        /// <param name="str">String to encode using the Soundex algorithm.</param>
        /// <returns>
        /// A four-character soundex code for the string supplied; <c>null</c> for <c>null</c> input,
        /// and the cleaned (empty) string if nothing remains after cleaning.
        /// </returns>
        /// <exception cref="ArgumentException">If a character is not mapped.</exception>
        public virtual string GetSoundex(string str)
        {
            if (str == null)
            {
                return null;
            }

            string cleaned = SoundexUtils.Clean(str);
            if (cleaned.Length == 0)
            {
                return cleaned;
            }

            // The code is always four characters long; unused slots keep the '0' padding.
            char[] code = { '0', '0', '0', '0' };
            char leading = cleaned[0];
            code[0] = leading;
            int filled = 1;
            char previousDigit = Map(leading);

            int pos = 1;
            while (pos < cleaned.Length && filled < code.Length)
            {
                char letter = cleaned[pos];
                pos++;

                // H and W are dropped completely when special-cased: they neither
                // produce a digit nor reset the "previous digit" used for de-duplication.
                bool isSilentHW = this.specialCaseHW && (letter == 'H' || letter == 'W');
                if (isSilentHW)
                {
                    continue;
                }

                char digit = Map(letter);
                if (digit == SILENT_MARKER)
                {
                    continue;
                }

                // vowels ('0') and repeats of the previous digit are not stored
                bool store = digit != '0' && digit != previousDigit;
                if (store)
                {
                    code[filled] = digit;
                    filled++;
                }
                previousDigit = digit;
            }

            return new string(code);
        }
    }
}
// http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/SoundexUtils.cs
// ----------------------------------------------------------------------
// diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/SoundexUtils.cs b/src/Lucene.Net.Analysis.Phonetic/Language/SoundexUtils.cs
// new file mode 100644
// index 0000000..e6079c2
// --- /dev/null
// +++ b/src/Lucene.Net.Analysis.Phonetic/Language/SoundexUtils.cs
// @@ -0,0 +1,123 @@
// commons-codec version compatibility level: 1.9
using System;
using System.Globalization;

namespace Lucene.Net.Analysis.Phonetic.Language
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License. You may obtain a copy of the License at
+     *
+     * http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Utility methods for <see cref="Soundex"/> and <see cref="RefinedSoundex"/> classes.
+    /// <para/>
+    /// This class is immutable and thread-safe.
+    /// <para/>
+    /// @since 1.3
+    /// </summary>
+    internal sealed class SoundexUtils
+    {
+        /// <summary>
+        /// English culture used for upper-casing. Created once and only read afterwards,
+        /// instead of allocating a new <see cref="CultureInfo"/> on every <see cref="Clean(string)"/> call.
+        /// </summary>
+        private static readonly CultureInfo EnglishCulture = new CultureInfo("en");
+
+        /// <summary>
+        /// Cleans up the input string before Soundex processing: every character for which
+        /// <see cref="char.IsLetter(char)"/> is false is dropped, and the remaining letters are
+        /// converted to upper case using the English culture.
+        /// </summary>
+        /// <param name="str">The string to clean.</param>
+        /// <returns>
+        /// The cleaned, upper-cased string; <paramref name="str"/> itself when it is <c>null</c> or empty.
+        /// </returns>
+        public static string Clean(string str)
+        {
+            if (string.IsNullOrEmpty(str))
+            {
+                return str;
+            }
+            int len = str.Length;
+            char[] chars = new char[len];
+            int count = 0;
+            for (int i = 0; i < len; i++)
+            {
+                if (char.IsLetter(str[i]))
+                {
+                    chars[count++] = str[i];
+                }
+            }
+            // When nothing was filtered out the input can be upper-cased directly,
+            // which avoids building an intermediate copy.
+            string letters = count == len ? str : new string(chars, 0, count);
+            return EnglishCulture.TextInfo.ToUpper(letters);
+        }
+
+        /// <summary>
+        /// Encodes the Strings and returns the number of characters in the two
+        /// encoded Strings that are the same.
+        /// <list type="bullet">
+        /// <item><description>
+        /// For Soundex, this return value ranges from 0 through 4: 0 indicates
+        /// little or no similarity, and 4 indicates strong similarity or identical
+        /// values.
+        /// </description></item>
+        /// <item><description>For refined Soundex, the return value can be greater than 4.</description></item>
+        /// </list>
+        /// <para/>
+        /// See: <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
+        /// MS T-SQL DIFFERENCE</a>
+        /// </summary>
+        /// <param name="encoder">The encoder to use to encode the strings.</param>
+        /// <param name="s1">A string that will be encoded and compared.</param>
+        /// <param name="s2">A string that will be encoded and compared.</param>
+        /// <returns>The number of characters in the two Soundex encoded strings that are the same.</returns>
+        /// <seealso cref="DifferenceEncoded(string, string)"/>
+        public static int Difference(IStringEncoder encoder, string s1, string s2)
+        {
+            return DifferenceEncoded(encoder.Encode(s1), encoder.Encode(s2));
+        }
+
+        /// <summary>
+        /// Returns the number of characters in the two Soundex encoded strings that
+        /// are the same. Characters are compared position by position, up to the
+        /// length of the shorter string.
+        /// <list type="bullet">
+        /// <item><description>
+        /// For Soundex, this return value ranges from 0 through 4: 0 indicates
+        /// little or no similarity, and 4 indicates strong similarity or identical
+        /// values.
+        /// </description></item>
+        /// <item><description>For refined Soundex, the return value can be greater than 4.</description></item>
+        /// </list>
+        /// <para/>
+        /// See: <a href="http://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp">
+        /// MS T-SQL DIFFERENCE</a>
+        /// </summary>
+        /// <param name="es1">An encoded string.</param>
+        /// <param name="es2">An encoded string.</param>
+        /// <returns>
+        /// The number of positions at which the two strings hold the same character;
+        /// 0 when either argument is <c>null</c>.
+        /// </returns>
+        public static int DifferenceEncoded(string es1, string es2)
+        {
+            if (es1 == null || es2 == null)
+            {
+                return 0;
+            }
+            int lengthToMatch = Math.Min(es1.Length, es2.Length);
+            int diff = 0;
+            for (int i = 0; i < lengthToMatch; i++)
+            {
+                if (es1[i] == es2[i])
+                {
+                    diff++;
+                }
+            }
+            return diff;
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/StringEncoder.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/StringEncoder.cs b/src/Lucene.Net.Analysis.Phonetic/Language/StringEncoder.cs
new file mode 100644
index 0000000..b4137a4
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/StringEncoder.cs
@@ -0,0 +1,35 @@
+// commons-codec version compatibility level: 1.9
+namespace Lucene.Net.Analysis.Phonetic.Language
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements. See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.
You may obtain a copy of the License at
+     *
+     * http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Defines common encoding methods for <see cref="string"/> encoders.
+    /// </summary>
+    public interface IStringEncoder
+    {
+        /// <summary>
+        /// Encodes a <see cref="string"/> and returns a <see cref="string"/>.
+        /// </summary>
+        /// <param name="source">the <see cref="string"/> to encode</param>
+        /// <returns>the encoded <see cref="string"/></returns>
+        // LUCENENET specific - EncoderException not ported, as it was only thrown on a conversion from object to string type
+        // <exception cref="EncoderException">thrown if there is an error condition during the encoding process.</exception>
+        string Encode(string source);
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/dmrules.txt
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/dmrules.txt b/src/Lucene.Net.Analysis.Phonetic/Language/dmrules.txt
new file mode 100644
index 0000000..db8367d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/Language/dmrules.txt
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Format +// "pattern" "replacement at start of word" "replacement before a vowel" "replacement in other cases" + +// Vowels + +"a" "0" "" "" +"e" "0" "" "" +"i" "0" "" "" +"o" "0" "" "" +"u" "0" "" "" + +// Consonants + +"b" "7" "7" "7" +"d" "3" "3" "3" +"f" "7" "7" "7" +"g" "5" "5" "5" +"h" "5" "5" "" +"k" "5" "5" "5" +"l" "8" "8" "8" +"m" "6" "6" "6" +"n" "6" "6" "6" +"p" "7" "7" "7" +"q" "5" "5" "5" +"r" "9" "9" "9" +"s" "4" "4" "4" +"t" "3" "3" "3" +"v" "7" "7" "7" +"w" "7" "7" "7" +"x" "5" "54" "54" +"y" "1" "" "" +"z" "4" "4" "4" + +// Romanian t-cedilla and t-comma should be equivalent +"ţ" "3|4" "3|4" "3|4" +"ț" "3|4" "3|4" "3|4" + +// Polish characters (e-ogonek and a-ogonek): default case branch either not coded or 6 +"ę" "" "" "|6" +"ą" "" "" "|6" + +// Other terms + +"schtsch" "2" "4" "4" +"schtsh" "2" "4" "4" +"schtch" "2" "4" "4" +"shtch" "2" "4" "4" +"shtsh" "2" "4" "4" +"stsch" "2" "4" "4" +"ttsch" "4" "4" "4" +"zhdzh" "2" "4" "4" +"shch" "2" "4" "4" +"scht" "2" "43" "43" +"schd" "2" "43" "43" +"stch" "2" "4" "4" +"strz" "2" "4" "4" +"strs" "2" "4" "4" +"stsh" "2" "4" "4" +"szcz" "2" "4" "4" +"szcs" "2" "4" "4" +"ttch" "4" "4" "4" +"tsch" "4" "4" "4" +"ttsz" "4" "4" "4" +"zdzh" "2" "4" "4" +"zsch" "4" "4" "4" +"chs" "5" "54" "54" +"csz" "4" "4" "4" +"czs" "4" "4" "4" +"drz" "4" "4" "4" +"drs" "4" "4" "4" +"dsh" "4" "4" "4" +"dsz" "4" "4" "4" +"dzh" "4" "4" "4" +"dzs" "4" "4" "4" +"sch" "4" "4" "4" +"sht" "2" "43" "43" +"szt" "2" "43" "43" +"shd" "2" "43" "43" +"szd" "2" "43" "43" +"tch" "4" "4" "4" +"trz" "4" "4" "4" +"trs" 
"4" "4" "4" +"tsh" "4" "4" "4" +"tts" "4" "4" "4" +"ttz" "4" "4" "4" +"tzs" "4" "4" "4" +"tsz" "4" "4" "4" +"zdz" "2" "4" "4" +"zhd" "2" "43" "43" +"zsh" "4" "4" "4" +"ai" "0" "1" "" +"aj" "0" "1" "" +"ay" "0" "1" "" +"au" "0" "7" "" +"cz" "4" "4" "4" +"cs" "4" "4" "4" +"ds" "4" "4" "4" +"dz" "4" "4" "4" +"dt" "3" "3" "3" +"ei" "0" "1" "" +"ej" "0" "1" "" +"ey" "0" "1" "" +"eu" "1" "1" "" +"fb" "7" "7" "7" +"ia" "1" "" "" +"ie" "1" "" "" +"io" "1" "" "" +"iu" "1" "" "" +"ks" "5" "54" "54" +"kh" "5" "5" "5" +"mn" "66" "66" "66" +"nm" "66" "66" "66" +"oi" "0" "1" "" +"oj" "0" "1" "" +"oy" "0" "1" "" +"pf" "7" "7" "7" +"ph" "7" "7" "7" +"sh" "4" "4" "4" +"sc" "2" "4" "4" +"st" "2" "43" "43" +"sd" "2" "43" "43" +"sz" "4" "4" "4" +"th" "3" "3" "3" +"ts" "4" "4" "4" +"tc" "4" "4" "4" +"tz" "4" "4" "4" +"ui" "0" "1" "" +"uj" "0" "1" "" +"uy" "0" "1" "" +"ue" "0" "1" "" +"zd" "2" "43" "43" +"zh" "4" "4" "4" +"zs" "4" "4" "4" + +// Branching cases + +"c" "4|5" "4|5" "4|5" +"ch" "4|5" "4|5" "4|5" +"ck" "5|45" "5|45" "5|45" +"rs" "4|94" "4|94" "4|94" +"rz" "4|94" "4|94" "4|94" +"j" "1|4" "|4" "|4" + + +// ASCII foldings + +Ã=s +à =a +á=a +â=a +ã=a +ä=a +Ã¥=a +æ=a +ç=c +è=e +é=e +ê=e +ë=e +ì=i +Ã=i +î=i +ï=i +ð=d +ñ=n +ò=o +ó=o +ô=o +õ=o +ö=o +ø=o +ù=u +ú=u +û=u +ý=y +ý=y +þ=b +ÿ=y +Ä=c +Å=l +Å=s +ż=z +ź=z http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Lucene.Net.Analysis.Phonetic.csproj ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Lucene.Net.Analysis.Phonetic.csproj b/src/Lucene.Net.Analysis.Phonetic/Lucene.Net.Analysis.Phonetic.csproj new file mode 100644 index 0000000..2a60aff --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Lucene.Net.Analysis.Phonetic.csproj @@ -0,0 +1,225 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <Import 
Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" /> + <PropertyGroup> + <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration> + <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform> + <ProjectGuid>{DAFE3B64-616A-4A2F-90E5-1F135E8A9AF5}</ProjectGuid> + <OutputType>Library</OutputType> + <AppDesignerFolder>Properties</AppDesignerFolder> + <RootNamespace>Lucene.Net.Analysis.Phonetic</RootNamespace> + <AssemblyName>Lucene.Net.Analysis.Phonetic</AssemblyName> + <TargetFrameworkVersion>v4.5.1</TargetFrameworkVersion> + <FileAlignment>512</FileAlignment> + </PropertyGroup> + <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "> + <DebugSymbols>true</DebugSymbols> + <DebugType>full</DebugType> + <Optimize>false</Optimize> + <OutputPath>bin\Debug\</OutputPath> + <DefineConstants>DEBUG;TRACE</DefineConstants> + <ErrorReport>prompt</ErrorReport> + <WarningLevel>4</WarningLevel> + </PropertyGroup> + <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' "> + <DebugType>pdbonly</DebugType> + <Optimize>true</Optimize> + <OutputPath>bin\Release\</OutputPath> + <DefineConstants>TRACE</DefineConstants> + <ErrorReport>prompt</ErrorReport> + <WarningLevel>4</WarningLevel> + </PropertyGroup> + <PropertyGroup> + <DefineConstants>$(DefineConstants);FEATURE_SERIALIZABLE</DefineConstants> + </PropertyGroup> + <ItemGroup> + <Reference Include="System" /> + <Reference Include="System.Core" /> + <Reference Include="System.Xml.Linq" /> + <Reference Include="System.Data.DataSetExtensions" /> + <Reference Include="Microsoft.CSharp" /> + <Reference Include="System.Data" /> + <Reference Include="System.Net.Http" /> + <Reference Include="System.Xml" /> + </ItemGroup> + <ItemGroup> + <Compile Include="BeiderMorseFilter.cs" /> + <Compile Include="BeiderMorseFilterFactory.cs" /> + <Compile 
Include="DoubleMetaphoneFilter.cs" /> + <Compile Include="DoubleMetaphoneFilterFactory.cs" /> + <Compile Include="Language\AbstractCaverphone .cs" /> + <Compile Include="Language\Bm\BeiderMorseEncoder.cs" /> + <Compile Include="Language\Bm\Lang.cs" /> + <Compile Include="Language\Bm\Languages.cs" /> + <Compile Include="Language\Bm\NameType.cs" /> + <Compile Include="Language\Bm\PhoneticEngine.cs" /> + <Compile Include="Language\Bm\ResourceConstants.cs" /> + <Compile Include="Language\Bm\Rule.cs" /> + <Compile Include="Language\Bm\RuleType.cs" /> + <Compile Include="Language\Caverphone1.cs" /> + <Compile Include="Language\Caverphone2.cs" /> + <Compile Include="Language\ColognePhonetic.cs" /> + <Compile Include="Language\DaitchMokotoffSoundex.cs" /> + <Compile Include="Language\DoubleMetaphone.cs" /> + <Compile Include="Language\MatchRatingApproachEncoder.cs" /> + <Compile Include="Language\Metaphone.cs" /> + <Compile Include="Language\Nysiis.cs" /> + <Compile Include="Language\RefinedSoundex.cs" /> + <Compile Include="Language\Soundex.cs" /> + <Compile Include="Language\SoundexUtils.cs" /> + <Compile Include="Language\StringEncoder.cs" /> + <Compile Include="PhoneticFilter.cs" /> + <Compile Include="PhoneticFilterFactory.cs" /> + <Compile Include="Properties\AssemblyInfo.cs" /> + <Compile Include="..\CommonAssemblyInfo.cs"> + <Link>Properties\CommonAssemblyInfo.cs</Link> + </Compile> + </ItemGroup> + <ItemGroup> + <ProjectReference Include="..\Lucene.Net.Analysis.Common\Lucene.Net.Analysis.Common.csproj"> + <Project>{4ADD0BBC-B900-4715-9526-D871DE8EEA64}</Project> + <Name>Lucene.Net.Analysis.Common</Name> + </ProjectReference> + <ProjectReference Include="..\Lucene.Net\Lucene.Net.csproj"> + <Project>{5D4AD9BE-1FFB-41AB-9943-25737971BF57}</Project> + <Name>Lucene.Net</Name> + </ProjectReference> + </ItemGroup> + <ItemGroup> + <EmbeddedResource Include="Language\Bm\ash_approx_any.txt" /> + <EmbeddedResource Include="Language\Bm\ash_approx_common.txt" /> + 
<EmbeddedResource Include="Language\Bm\ash_approx_cyrillic.txt" /> + <EmbeddedResource Include="Language\Bm\ash_approx_english.txt" /> + <EmbeddedResource Include="Language\Bm\ash_approx_french.txt" /> + <EmbeddedResource Include="Language\Bm\ash_approx_german.txt" /> + <EmbeddedResource Include="Language\Bm\ash_approx_hebrew.txt" /> + <EmbeddedResource Include="Language\Bm\ash_approx_hungarian.txt" /> + <EmbeddedResource Include="Language\Bm\ash_approx_polish.txt" /> + <EmbeddedResource Include="Language\Bm\ash_approx_romanian.txt" /> + <EmbeddedResource Include="Language\Bm\ash_approx_russian.txt" /> + <EmbeddedResource Include="Language\Bm\ash_approx_spanish.txt" /> + <EmbeddedResource Include="Language\Bm\ash_exact_any.txt" /> + <EmbeddedResource Include="Language\Bm\ash_exact_approx_common.txt" /> + <EmbeddedResource Include="Language\Bm\ash_exact_common.txt" /> + <EmbeddedResource Include="Language\Bm\ash_exact_cyrillic.txt" /> + <EmbeddedResource Include="Language\Bm\ash_exact_english.txt" /> + <EmbeddedResource Include="Language\Bm\ash_exact_french.txt" /> + <EmbeddedResource Include="Language\Bm\ash_exact_german.txt" /> + <EmbeddedResource Include="Language\Bm\ash_exact_hebrew.txt" /> + <EmbeddedResource Include="Language\Bm\ash_exact_hungarian.txt" /> + <EmbeddedResource Include="Language\Bm\ash_exact_polish.txt" /> + <EmbeddedResource Include="Language\Bm\ash_exact_romanian.txt" /> + <EmbeddedResource Include="Language\Bm\ash_exact_russian.txt" /> + <EmbeddedResource Include="Language\Bm\ash_exact_spanish.txt" /> + <EmbeddedResource Include="Language\Bm\ash_hebrew_common.txt" /> + <EmbeddedResource Include="Language\Bm\ash_languages.txt" /> + <EmbeddedResource Include="Language\Bm\ash_rules_any.txt" /> + <EmbeddedResource Include="Language\Bm\ash_rules_cyrillic.txt" /> + <EmbeddedResource Include="Language\Bm\ash_rules_english.txt" /> + <EmbeddedResource Include="Language\Bm\ash_rules_french.txt" /> + <EmbeddedResource 
Include="Language\Bm\ash_rules_german.txt" /> + <EmbeddedResource Include="Language\Bm\ash_rules_hebrew.txt" /> + <EmbeddedResource Include="Language\Bm\ash_rules_hungarian.txt" /> + <EmbeddedResource Include="Language\Bm\ash_rules_polish.txt" /> + <EmbeddedResource Include="Language\Bm\ash_rules_romanian.txt" /> + <EmbeddedResource Include="Language\Bm\ash_rules_russian.txt" /> + <EmbeddedResource Include="Language\Bm\ash_rules_spanish.txt" /> + <EmbeddedResource Include="Language\Bm\gen_approx_any.txt" /> + <EmbeddedResource Include="Language\Bm\gen_approx_arabic.txt" /> + <EmbeddedResource Include="Language\Bm\gen_approx_common.txt" /> + <EmbeddedResource Include="Language\Bm\gen_approx_cyrillic.txt" /> + <EmbeddedResource Include="Language\Bm\gen_approx_czech.txt" /> + <EmbeddedResource Include="Language\Bm\gen_approx_dutch.txt" /> + <EmbeddedResource Include="Language\Bm\gen_approx_english.txt" /> + <EmbeddedResource Include="Language\Bm\gen_approx_french.txt" /> + <EmbeddedResource Include="Language\Bm\gen_approx_german.txt" /> + <EmbeddedResource Include="Language\Bm\gen_approx_greek.txt" /> + <EmbeddedResource Include="Language\Bm\gen_approx_greeklatin.txt" /> + <EmbeddedResource Include="Language\Bm\gen_approx_hebrew.txt" /> + <EmbeddedResource Include="Language\Bm\gen_approx_hungarian.txt" /> + <EmbeddedResource Include="Language\Bm\gen_approx_italian.txt" /> + <EmbeddedResource Include="Language\Bm\gen_approx_polish.txt" /> + <EmbeddedResource Include="Language\Bm\gen_approx_portuguese.txt" /> + <EmbeddedResource Include="Language\Bm\gen_approx_romanian.txt" /> + <EmbeddedResource Include="Language\Bm\gen_approx_russian.txt" /> + <EmbeddedResource Include="Language\Bm\gen_approx_spanish.txt" /> + <EmbeddedResource Include="Language\Bm\gen_approx_turkish.txt" /> + <EmbeddedResource Include="Language\Bm\gen_exact_any.txt" /> + <EmbeddedResource Include="Language\Bm\gen_exact_approx_common.txt" /> + <EmbeddedResource 
Include="Language\Bm\gen_exact_arabic.txt" /> + <EmbeddedResource Include="Language\Bm\gen_exact_common.txt" /> + <EmbeddedResource Include="Language\Bm\gen_exact_cyrillic.txt" /> + <EmbeddedResource Include="Language\Bm\gen_exact_czech.txt" /> + <EmbeddedResource Include="Language\Bm\gen_exact_dutch.txt" /> + <EmbeddedResource Include="Language\Bm\gen_exact_english.txt" /> + <EmbeddedResource Include="Language\Bm\gen_exact_french.txt" /> + <EmbeddedResource Include="Language\Bm\gen_exact_german.txt" /> + <EmbeddedResource Include="Language\Bm\gen_exact_greek.txt" /> + <EmbeddedResource Include="Language\Bm\gen_exact_greeklatin.txt" /> + <EmbeddedResource Include="Language\Bm\gen_exact_hebrew.txt" /> + <EmbeddedResource Include="Language\Bm\gen_exact_hungarian.txt" /> + <EmbeddedResource Include="Language\Bm\gen_exact_italian.txt" /> + <EmbeddedResource Include="Language\Bm\gen_exact_polish.txt" /> + <EmbeddedResource Include="Language\Bm\gen_exact_portuguese.txt" /> + <EmbeddedResource Include="Language\Bm\gen_exact_romanian.txt" /> + <EmbeddedResource Include="Language\Bm\gen_exact_russian.txt" /> + <EmbeddedResource Include="Language\Bm\gen_exact_spanish.txt" /> + <EmbeddedResource Include="Language\Bm\gen_exact_turkish.txt" /> + <EmbeddedResource Include="Language\Bm\gen_hebrew_common.txt" /> + <EmbeddedResource Include="Language\Bm\gen_languages.txt" /> + <EmbeddedResource Include="Language\Bm\gen_rules_any.txt" /> + <EmbeddedResource Include="Language\Bm\gen_rules_arabic.txt" /> + <EmbeddedResource Include="Language\Bm\gen_rules_cyrillic.txt" /> + <EmbeddedResource Include="Language\Bm\gen_rules_czech.txt" /> + <EmbeddedResource Include="Language\Bm\gen_rules_dutch.txt" /> + <EmbeddedResource Include="Language\Bm\gen_rules_english.txt" /> + <EmbeddedResource Include="Language\Bm\gen_rules_french.txt" /> + <EmbeddedResource Include="Language\Bm\gen_rules_german.txt" /> + <EmbeddedResource Include="Language\Bm\gen_rules_greek.txt" /> + <EmbeddedResource 
Include="Language\Bm\gen_rules_greeklatin.txt" /> + <EmbeddedResource Include="Language\Bm\gen_rules_hebrew.txt" /> + <EmbeddedResource Include="Language\Bm\gen_rules_hungarian.txt" /> + <EmbeddedResource Include="Language\Bm\gen_rules_italian.txt" /> + <EmbeddedResource Include="Language\Bm\gen_rules_polish.txt" /> + <EmbeddedResource Include="Language\Bm\gen_rules_portuguese.txt" /> + <EmbeddedResource Include="Language\Bm\gen_rules_romanian.txt" /> + <EmbeddedResource Include="Language\Bm\gen_rules_russian.txt" /> + <EmbeddedResource Include="Language\Bm\gen_rules_spanish.txt" /> + <EmbeddedResource Include="Language\Bm\gen_rules_turkish.txt" /> + <EmbeddedResource Include="Language\Bm\lang.txt" /> + <EmbeddedResource Include="Language\Bm\sep_approx_any.txt" /> + <EmbeddedResource Include="Language\Bm\sep_approx_common.txt" /> + <EmbeddedResource Include="Language\Bm\sep_approx_french.txt" /> + <EmbeddedResource Include="Language\Bm\sep_approx_hebrew.txt" /> + <EmbeddedResource Include="Language\Bm\sep_approx_italian.txt" /> + <EmbeddedResource Include="Language\Bm\sep_approx_portuguese.txt" /> + <EmbeddedResource Include="Language\Bm\sep_approx_spanish.txt" /> + <EmbeddedResource Include="Language\Bm\sep_exact_any.txt" /> + <EmbeddedResource Include="Language\Bm\sep_exact_approx_common.txt" /> + <EmbeddedResource Include="Language\Bm\sep_exact_common.txt" /> + <EmbeddedResource Include="Language\Bm\sep_exact_french.txt" /> + <EmbeddedResource Include="Language\Bm\sep_exact_hebrew.txt" /> + <EmbeddedResource Include="Language\Bm\sep_exact_italian.txt" /> + <EmbeddedResource Include="Language\Bm\sep_exact_portuguese.txt" /> + <EmbeddedResource Include="Language\Bm\sep_exact_spanish.txt" /> + <EmbeddedResource Include="Language\Bm\sep_hebrew_common.txt" /> + <EmbeddedResource Include="Language\Bm\sep_languages.txt" /> + <EmbeddedResource Include="Language\Bm\sep_rules_any.txt" /> + <EmbeddedResource Include="Language\Bm\sep_rules_french.txt" /> + <EmbeddedResource 
Include="Language\Bm\sep_rules_hebrew.txt" /> + <EmbeddedResource Include="Language\Bm\sep_rules_italian.txt" /> + <EmbeddedResource Include="Language\Bm\sep_rules_portuguese.txt" /> + <EmbeddedResource Include="Language\Bm\sep_rules_spanish.txt" /> + </ItemGroup> + <ItemGroup> + <EmbeddedResource Include="Language\dmrules.txt" /> + </ItemGroup> + <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" /> + <!-- To modify your build process, add your task inside one of the targets below and uncomment it. + Other similar extension points exist, see Microsoft.Common.targets. + <Target Name="BeforeBuild"> + </Target> + <Target Name="AfterBuild"> + </Target> + --> +</Project> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Lucene.Net.Analysis.Phonetic.project.json ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Lucene.Net.Analysis.Phonetic.project.json b/src/Lucene.Net.Analysis.Phonetic/Lucene.Net.Analysis.Phonetic.project.json new file mode 100644 index 0000000..86d1c12 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Lucene.Net.Analysis.Phonetic.project.json @@ -0,0 +1,8 @@ +{ + "runtimes": { + "win": {} + }, + "frameworks": { + "net451": {} + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Lucene.Net.Analysis.Phonetic.xproj ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Lucene.Net.Analysis.Phonetic.xproj b/src/Lucene.Net.Analysis.Phonetic/Lucene.Net.Analysis.Phonetic.xproj new file mode 100644 index 0000000..321b9b2 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Lucene.Net.Analysis.Phonetic.xproj @@ -0,0 +1,19 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="14.0.25420" DefaultTargets="Build" 
xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup> + <VisualStudioVersion Condition="'$(VisualStudioVersion)' == ''">14.0.25420</VisualStudioVersion> + <VSToolsPath Condition="'$(VSToolsPath)' == ''">$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)</VSToolsPath> + </PropertyGroup> + <Import Project="$(VSToolsPath)\DotNet\Microsoft.DotNet.Props" Condition="'$(VSToolsPath)' != ''" /> + <PropertyGroup Label="Globals"> + <ProjectGuid>56b2ffb7-6870-4420-8bc7-187adf5341d9</ProjectGuid> + <RootNamespace>Lucene.Net.Analysis.Phonetic</RootNamespace> + <BaseIntermediateOutputPath Condition="'$(BaseIntermediateOutputPath)'=='' ">.\obj</BaseIntermediateOutputPath> + <OutputPath Condition="'$(OutputPath)'=='' ">.\bin\</OutputPath> + </PropertyGroup> + + <PropertyGroup> + <SchemaVersion>2.0</SchemaVersion> + </PropertyGroup> + <Import Project="$(VSToolsPath)\DotNet\Microsoft.DotNet.targets" Condition="'$(VSToolsPath)' != ''" /> +</Project> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/PhoneticFilter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/PhoneticFilter.cs b/src/Lucene.Net.Analysis.Phonetic/PhoneticFilter.cs new file mode 100644 index 0000000..c5d2886 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/PhoneticFilter.cs @@ -0,0 +1,109 @@ +// lucene version compatibility level: 4.8.1 +using Lucene.Net.Analysis.Phonetic.Language; +using Lucene.Net.Analysis.TokenAttributes; +using System; + +namespace Lucene.Net.Analysis.Phonetic +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License. You may obtain a copy of the License at
+     *
+     * http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Create tokens for phonetic matches.
+    /// See the Language namespace.
+    /// </summary>
+    public sealed class PhoneticFilter : TokenFilter
+    {
+        /// <summary>true if encoded tokens should be added as synonyms</summary>
+        private readonly bool inject = true;
+        /// <summary>phonetic encoder</summary>
+        private readonly IStringEncoder encoder = null;
+        /// <summary>captured state, non-null when <c>inject=true</c> and a token is buffered</summary>
+        private State save = null;
+        private readonly ICharTermAttribute termAtt;
+        private readonly IPositionIncrementAttribute posAtt;
+
+        /// <summary>
+        /// Creates a <see cref="PhoneticFilter"/> with the specified encoder, and either
+        /// adding encoded forms as synonyms (<c>inject=true</c>) or
+        /// replacing them.
+        /// </summary>
+        /// <param name="input">The token stream to filter.</param>
+        /// <param name="encoder">The phonetic encoder applied to each term.</param>
+        /// <param name="inject">
+        /// <c>true</c> to emit the encoded form in addition to the original term;
+        /// <c>false</c> to overwrite the term with its encoded form.
+        /// </param>
+        public PhoneticFilter(TokenStream input, IStringEncoder encoder, bool inject)
+            : base(input)
+        {
+            this.encoder = encoder;
+            this.inject = inject;
+            this.termAtt = AddAttribute<ICharTermAttribute>();
+            this.posAtt = AddAttribute<IPositionIncrementAttribute>();
+        }
+
+        /// <summary>
+        /// Advances to the next token. When injecting, the phonetic form is returned first
+        /// (with the original position increment) and the original term is returned by the
+        /// following call with a position increment of 0.
+        /// </summary>
+        /// <returns><c>true</c> if a token was produced; <c>false</c> when the input is exhausted.</returns>
+        public override bool IncrementToken()
+        {
+            if (save != null)
+            {
+                // The original term was buffered by the previous call: emit it now.
+                // clearAttributes(); // not currently necessary
+                RestoreState(save);
+                save = null;
+                return true;
+            }
+
+            if (!m_input.IncrementToken())
+            {
+                return false;
+            }
+
+            // pass through zero-length terms
+            if (termAtt.Length == 0)
+            {
+                return true;
+            }
+
+            string value = termAtt.ToString();
+            string phonetic = null;
+            try
+            {
+                string v = encoder.Encode(value);
+                // Explicit null/empty check instead of relying on the catch below to
+                // absorb a NullReferenceException when the encoder returns null.
+                if (!string.IsNullOrEmpty(v) && !value.Equals(v))
+                {
+                    phonetic = v;
+                }
+            }
+            catch (Exception) { /* ignored */ } // just use the direct text
+
+            if (phonetic == null)
+            {
+                return true;
+            }
+
+            if (!inject)
+            {
+                // just modify this token
+                termAtt.SetEmpty().Append(phonetic);
+                return true;
+            }
+
+            // We need to return both the original and the phonetic tokens.
+            // To avoid an orig=captureState() change_to_phonetic() saved=captureState() restoreState(orig)
+            // sequence, we return the phonetic alternative first.
+
+            int origPositionIncrement = posAtt.PositionIncrement;
+            // The buffered original term is stacked on the phonetic one (increment 0).
+            posAtt.PositionIncrement = 0;
+            save = CaptureState();
+
+            posAtt.PositionIncrement = origPositionIncrement;
+            termAtt.SetEmpty().Append(phonetic);
+            return true;
+        }
+
+        /// <summary>
+        /// Resets the wrapped input and discards any buffered token.
+        /// </summary>
+        public override void Reset()
+        {
+            m_input.Reset();
+            save = null;
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/PhoneticFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Phonetic/PhoneticFilterFactory.cs b/src/Lucene.Net.Analysis.Phonetic/PhoneticFilterFactory.cs
new file mode 100644
index 0000000..8af2e5f
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Phonetic/PhoneticFilterFactory.cs
@@ -0,0 +1,187 @@
+// lucene version compatibility level: 4.8.1
+using Lucene.Net.Analysis.Phonetic.Language;
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Support;
+using System;
+using System.Collections.Generic;
+using System.Globalization;
+using System.Reflection;
+
+namespace Lucene.Net.Analysis.Phonetic
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements. See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License. You may obtain a copy of the License at
+     *
+     * http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Factory for <see cref="PhoneticFilter"/>. + /// <para/> + /// Create tokens based on phonetic encoders from the Language namespace. + /// <para/> + /// This takes one required argument, "encoder", and the rest are optional: + /// <list type="bullet"> + /// <item> + /// <term>encoder</term> + /// <description> + /// required, one of "DoubleMetaphone", "Metaphone", "Soundex", "RefinedSoundex", "Caverphone" (v2.0), + /// or "ColognePhonetic" (case insensitive). If encoder isn't one of these, it'll be resolved as a class name either by + /// itself if it already contains a '.' or otherwise as in the same package as these others. + /// </description> + /// </item> + /// <item> + /// <term>inject</term> + /// <description> + /// (default=true) add tokens to the stream with the offset=0 + /// </description> + /// </item> + /// <item> + /// <term>maxCodeLength</term> + /// <description> + /// The maximum length of the phonetic codes, as defined by the encoder. If an encoder doesn't + /// support this then specifying this is an error. 
+ /// </description> + /// </item> + /// </list> + /// + /// <code> + /// <fieldType name="text_phonetic" class="solr.TextField" positionIncrementGap="100"> + /// <analyzer> + /// <tokenizer class="solr.WhitespaceTokenizerFactory"/> + /// <filter class="solr.PhoneticFilterFactory" encoder="DoubleMetaphone" inject="true"/> + /// </analyzer> + /// </fieldType> + /// </code> + /// </summary> + /// <seealso cref="PhoneticFilter"/> + public class PhoneticFilterFactory : TokenFilterFactory, IResourceLoaderAware + { + /// <summary>parameter name: either a short name or a full class name</summary> + public static readonly string ENCODER = "encoder"; + /// <summary>parameter name: true if encoded tokens should be added as synonyms</summary> + public static readonly string INJECT = "inject"; // boolean + /** parameter name: restricts the length of the phonetic code */ + public static readonly string MAX_CODE_LENGTH = "maxCodeLength"; + private static readonly string PACKAGE_CONTAINING_ENCODERS = "Lucene.Net.Analysis.Phonetic.Language."; + + //Effectively constants; uppercase keys + private static readonly IDictionary<string, Type> registry = new Dictionary<string, Type>(6); + + static PhoneticFilterFactory() + { + registry["DoubleMetaphone".ToUpperInvariant()] = typeof(DoubleMetaphone); + registry["Metaphone".ToUpperInvariant()] = typeof(Metaphone); + registry["Soundex".ToUpperInvariant()] = typeof(Soundex); + registry["RefinedSoundex".ToUpperInvariant()] = typeof(RefinedSoundex); + registry["Caverphone".ToUpperInvariant()] = typeof(Caverphone2); + registry["ColognePhonetic".ToUpperInvariant()] = typeof(ColognePhonetic); + } + + internal bool inject; //accessed by the test + private readonly string name; + private readonly int? 
maxCodeLength;
+        private Type clazz = null;
+        private MethodInfo setMaxCodeLenMethod = null;
+
+        /// <summary>Creates a new <see cref="PhoneticFilterFactory"/>.</summary>
+        /// <param name="args">
+        /// Factory arguments: <see cref="ENCODER"/> (required), <see cref="INJECT"/> (defaults to <c>true</c>)
+        /// and <see cref="MAX_CODE_LENGTH"/> (optional, parsed as an invariant-culture integer).
+        /// </param>
+        /// <exception cref="ArgumentException">If any argument is left over after the known ones were read.</exception>
+        public PhoneticFilterFactory(IDictionary<string, string> args)
+            : base(args)
+        {
+            inject = GetBoolean(args, INJECT, true);
+            name = Require(args, ENCODER);
+            string v = Get(args, MAX_CODE_LENGTH);
+            if (v != null)
+            {
+                maxCodeLength = int.Parse(v, CultureInfo.InvariantCulture);
+            }
+            else
+            {
+                maxCodeLength = null;
+            }
+            // NOTE(review): presumably GetBoolean/Require/Get remove the keys they read, so anything
+            // still present here is an unrecognized parameter - confirm against the base class.
+            if (args.Count > 0)
+            {
+                // List the offending keys; concatenating the dictionary itself only prints its type name.
+                throw new ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
+            }
+        }
+
+
+        /// <summary>
+        /// Resolves the configured encoder type (registry short name first, then class-name lookup through
+        /// <paramref name="loader"/>), locates the <c>MaxCodeLen</c> setter when <see cref="MAX_CODE_LENGTH"/>
+        /// was specified, and creates one encoder instance so configuration problems surface immediately.
+        /// </summary>
+        /// <param name="loader">Resource loader used to resolve encoders that are not in the registry.</param>
+        /// <exception cref="ArgumentException">
+        /// If the encoder cannot be resolved or initialized, or if <see cref="MAX_CODE_LENGTH"/> was specified
+        /// but the encoder exposes no public <c>MaxCodeLen</c> setter.
+        /// </exception>
+        public virtual void Inform(IResourceLoader loader)
+        {
+            // Registry keys are stored upper-cased, so the short-name lookup is case insensitive.
+            registry.TryGetValue(name.ToUpperInvariant(), out clazz);
+            if (clazz == null)
+            {
+                clazz = ResolveEncoder(name, loader);
+            }
+
+            if (maxCodeLength != null)
+            {
+                try
+                {
+                    setMaxCodeLenMethod = clazz.GetMethod("set_MaxCodeLen");
+                }
+                catch (Exception e)
+                {
+                    throw new ArgumentException("Encoder " + name + " / " + clazz + " does not support " + MAX_CODE_LENGTH, e);
+                }
+                // Type.GetMethod reports a missing method by returning null instead of throwing,
+                // so the unsupported case has to be detected explicitly or the option is silently ignored.
+                if (setMaxCodeLenMethod == null)
+                {
+                    throw new ArgumentException("Encoder " + name + " / " + clazz + " does not support " + MAX_CODE_LENGTH);
+                }
+            }
+
+            GetEncoder();//trigger initialization for potential problems to be thrown now
+        }
+
+        /// <summary>
+        /// Resolves an encoder type from a class name. A name without a '.' is looked up inside
+        /// <see cref="PACKAGE_CONTAINING_ENCODERS"/>; any other name is passed to the loader unchanged.
+        /// </summary>
+        /// <exception cref="ArgumentException">If the loader fails to instantiate the encoder.</exception>
+        private Type ResolveEncoder(string name, IResourceLoader loader)
+        {
+            string lookupName = name;
+            if (name.IndexOf('.') == -1)
+            {
+                lookupName = PACKAGE_CONTAINING_ENCODERS + name;
+            }
+            try
+            {
+                return loader.NewInstance<IStringEncoder>(lookupName).GetType();
+            }
+            catch (Exception e)
+            {
+                throw new ArgumentException("Error loading encoder '" + name + "': must be full class name or one of " + Collections.ToString(registry.Keys), e);
+            }
+        }
+
+        /// <summary>Must be thread-safe.</summary>
+        protected internal virtual IStringEncoder GetEncoder()
+        {
+            // Unfortunately, Commons-Codec doesn't offer any thread-safe guarantees so we must play it safe and instantiate
+            // every time. A simple benchmark showed this as negligible.
+ try + { + IStringEncoder encoder = (IStringEncoder)Activator.CreateInstance(clazz); + // Try to set the maxCodeLength + if (maxCodeLength != null && setMaxCodeLenMethod != null) + { + setMaxCodeLenMethod.Invoke(encoder, new object[] { maxCodeLength }); + } + return encoder; + } + catch (Exception e) + { + Exception t = (e is TargetInvocationException) ? e.InnerException : e; + throw new ArgumentException("Error initializing encoder: " + name + " / " + clazz, t); + } + } + + public override TokenStream Create(TokenStream input) + { + return new PhoneticFilter(input, GetEncoder(), inject); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Properties/AssemblyInfo.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Properties/AssemblyInfo.cs b/src/Lucene.Net.Analysis.Phonetic/Properties/AssemblyInfo.cs new file mode 100644 index 0000000..b7cd03f --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Properties/AssemblyInfo.cs @@ -0,0 +1,48 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ * +*/ + +using System.Reflection; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +// General Information about an assembly is controlled through the following +// set of attributes. Change these attribute values to modify the information +// associated with an assembly. +[assembly: AssemblyTitle("Lucene.Net.Analysis.Phonetic")] +[assembly: AssemblyDescription( + "Analyzer for indexing phonetic signatures (for sounds-alike search) " + + "for the Lucene.Net full-text search engine library from The Apache Software Foundation.")] +[assembly: AssemblyConfiguration("")] +[assembly: AssemblyDefaultAlias("Lucene.Net.Analysis.Phonetic")] +[assembly: AssemblyCulture("")] + +// Setting ComVisible to false makes the types in this assembly not visible +// to COM components. If you need to access a type in this assembly from +// COM, set the ComVisible attribute to true on that type. +[assembly: ComVisible(false)] + +// The following GUID is for the ID of the typelib if this project is exposed to COM +[assembly: Guid("dafe3b64-616a-4a2f-90e5-1f135e8a9af5")] + +// for testing +[assembly: InternalsVisibleTo("Lucene.Net.Tests.Analysis.Phonetic")] + +// NOTE: Version information is in CommonAssemblyInfo.cs http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/project.json ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/project.json b/src/Lucene.Net.Analysis.Phonetic/project.json new file mode 100644 index 0000000..460721b --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/project.json @@ -0,0 +1,54 @@ +{ + "version": "4.8.0", + "title": "Lucene.Net.Analysis.Phonetic", + "description": "Analyzer for indexing phonetic signatures (for sounds-alike search) for the Lucene.Net full-text search engine library from The Apache Software Foundation.", + "authors": [ "The Apache Software Foundation" ], + "packOptions": { + "projectUrl": 
"http://lucenenet.apache.org/", + "licenseUrl": "https://github.com/apache/lucenenet/blob/master/LICENSE.txt", + "iconUrl": "https://github.com/apache/lucenenet/blob/master/branding/logo/lucene-net-icon-128x128.png?raw=true", + "owners": [ "The Apache Software Foundation" ], + "repository": { "url": "https://github.com/apache/lucenenet" }, + "tags": [ "lucene.net", "core", "text", "search", "information", "retrieval", "lucene", "apache", "analysis", "index", "query", "soundex", "double", "metaphone", "sounds", "like", "beider", "morse", "cologne", "caverphone", "nysiis", "match", "rating" ] + }, + "buildOptions": { + "compile": { + "includeFiles": [ "../CommonAssemblyInfo.cs" ] + }, + "embed": { + "include": [ + "Language/Bm/ash_*.txt", + "Language/Bm/gen_*.txt", + "Language/Bm/sep_*.txt" + ], + "includeFiles": [ + "Language/Bm/lang.txt", + "Language/dmrules.txt" + ] + }, + "nowarn": [ "1591", "1573" ] + }, + "dependencies": { + "icu.net": "54.1.1-alpha", + "Lucene.Net": "4.8.0", + "Lucene.Net.Analysis.Common": "4.8.0" + }, + "frameworks": { + "netstandard1.5": { + "imports": "dnxcore50", + "buildOptions": { + "debugType": "portable", + "define": [ "NETSTANDARD" ] + }, + "dependencies": { + "NETStandard.Library": "1.6.0" + } + }, + "net451": { + "buildOptions": { + "debugType": "full", + "define": [ "FEATURE_SERIALIZABLE" ] + } + } + } +}
