// Source: http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/PhoneticEngine.cs
// (new file: src/Lucene.Net.Analysis.Phonetic/Language/Bm/PhoneticEngine.cs, index 0000000..3cf5c7a)
// commons-codec version compatibility level: 1.9
using Lucene.Net.Support;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;

namespace Lucene.Net.Analysis.Phonetic.Language.Bm
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Converts words into potential phonetic representations.
    /// </summary>
    /// <remarks>
    /// This is a two-stage process. Firstly, the word is converted into a phonetic representation that takes
    /// into account the likely source language. Next, this phonetic representation is converted into a
    /// pan-European 'average' representation, allowing comparison between different versions of essentially
    /// the same word from different languages.
    /// <para/>
    /// This class is intentionally immutable and thread-safe.
    /// If you wish to alter the settings for a PhoneticEngine, you
    /// must make a new one with the updated settings.
    /// <para/>
    /// Ported from phoneticengine.php
    /// <para/>
    /// since 1.6
    /// </remarks>
    public class PhoneticEngine
    {
        // One compiled instance shared by every engine, so the pattern is not recompiled per engine.
        private static readonly Regex SHARED_WHITESPACE = new Regex("\\s+", RegexOptions.Compiled);

        // Kept as an internal instance field so existing internal callers keep compiling.
        internal Regex WHITESPACE = SHARED_WHITESPACE;

        /// <summary>
        /// Utility for manipulating a set of phonemes as they are being built up. Not intended for use outside
        /// this package, and probably not outside the <see cref="PhoneticEngine"/> class.
        /// <para/>
        /// since 1.6
        /// </summary>
        internal sealed class PhonemeBuilder
        {
            /// <summary>
            /// An empty builder where all phonemes must come from some set of languages. This will contain a single
            /// phoneme of zero characters. This can then be appended to. This should be the only way to create a new
            /// phoneme from scratch.
            /// </summary>
            /// <param name="languages">The set of languages.</param>
            /// <returns>A new, empty phoneme builder.</returns>
            public static PhonemeBuilder Empty(LanguageSet languages)
            {
                return new PhonemeBuilder(new Phoneme("", languages));
            }

            private readonly IList<Phoneme> phonemes;

            private PhonemeBuilder(Phoneme phoneme)
            {
                // LUCENENET NOTE: LinkedHashSet cares about insertion order - in .NET, we can just use List<T> for that
                this.phonemes = new List<Phoneme>();
                this.phonemes.Add(phoneme);
            }

            /// <summary>
            /// Wraps an existing phoneme list. The list is stored as-is (not copied), so later
            /// calls on this builder mutate the list that was passed in.
            /// </summary>
            /// <param name="phonemes">The phonemes this builder starts with.</param>
            internal PhonemeBuilder(IList<Phoneme> phonemes)
            {
                this.phonemes = phonemes;
            }

            /// <summary>
            /// Appends <paramref name="str"/> to every phoneme held by this builder, in place.
            /// </summary>
            /// <param name="str">The characters to append to the phonemes.</param>
            public void Append(ICharSequence str)
            {
                foreach (Phoneme ph in this.phonemes)
                {
                    ph.Append(str.ToString());
                }
            }

            /// <summary>
            /// Appends <paramref name="str"/> to every phoneme held by this builder, in place.
            /// </summary>
            /// <param name="str">The characters to append to the phonemes.</param>
            // LUCENENET specific
            public void Append(string str)
            {
                foreach (Phoneme ph in this.phonemes)
                {
                    ph.Append(str);
                }
            }

            /// <summary>
            /// Appends <paramref name="str"/> to every phoneme held by this builder, in place.
            /// </summary>
            /// <param name="str">The characters to append to the phonemes.</param>
            // LUCENENET specific
            public void Append(StringBuilder str)
            {
                foreach (Phoneme ph in this.phonemes)
                {
                    ph.Append(str.ToString());
                }
            }

            /// <summary>
            /// Applies the given phoneme expression to all phonemes in this phoneme builder.
            /// <para/>
            /// This will lengthen phonemes that have compatible language sets to the expression, and drop those that are
            /// incompatible.
            /// </summary>
            /// <param name="phonemeExpr">The expression to apply.</param>
            /// <param name="maxPhonemes">The maximum number of phonemes to build up.</param>
            public void Apply(IPhonemeExpr phonemeExpr, int maxPhonemes)
            {
                // LUCENENET NOTE: LinkedHashSet cares about insertion order - in .NET, we can just use List<T> for that
                IList<Phoneme> newPhonemes = new List<Phoneme>(maxPhonemes);
                bool limitReached = false;

                foreach (Phoneme left in this.phonemes)
                {
                    foreach (Phoneme right in phonemeExpr.Phonemes)
                    {
                        LanguageSet languages = left.Languages.RestrictTo(right.Languages);
                        if (languages.IsEmpty)
                        {
                            continue; // no language in common: this combination is dropped
                        }

                        Phoneme join = new Phoneme(left, right, languages);
                        if (newPhonemes.Count < maxPhonemes)
                        {
                            newPhonemes.Add(join);
                            if (newPhonemes.Count >= maxPhonemes)
                            {
                                limitReached = true;
                                break;
                            }
                        }
                    }

                    if (limitReached)
                    {
                        break;
                    }
                }

                this.phonemes.Clear();

                // LUCENENET: We need to filter out any duplicates, since we converted from LinkedHashSet
                // to List. This is done with an explicit loop so the outcome does not depend on a deferred
                // LINQ query observing the list while it is being refilled.
                foreach (Phoneme candidate in newPhonemes)
                {
                    if (!ContainsEqual(this.phonemes, candidate))
                    {
                        this.phonemes.Add(candidate);
                    }
                }
            }

            /// <summary>
            /// Returns <c>true</c> if any element of <paramref name="list"/> reports itself equal to <paramref name="candidate"/>.
            /// </summary>
            private static bool ContainsEqual(IList<Phoneme> list, Phoneme candidate)
            {
                foreach (Phoneme existing in list)
                {
                    if (existing.Equals(candidate))
                    {
                        return true;
                    }
                }
                return false;
            }

            /// <summary>
            /// Gets underlying phoneme set. Please don't mutate.
            /// </summary>
            public IList<Phoneme> Phonemes
            {
                get { return this.phonemes; }
            }

            /// <summary>
            /// Stringifies the phoneme set. This produces a single string of the strings of each phoneme,
            /// joined with a pipe. This is explicitly provided in place of <see cref="object.ToString()"/> as it is a potentially
            /// expensive operation, which should be avoided when debugging.
            /// </summary>
            /// <returns>The stringified phoneme set.</returns>
            public string MakeString()
            {
                StringBuilder sb = new StringBuilder();

                foreach (Phoneme ph in this.phonemes)
                {
                    if (sb.Length > 0)
                    {
                        sb.Append("|");
                    }
                    sb.Append(ph.GetPhonemeText());
                }

                return sb.ToString();
            }
        }

        /// <summary>
        /// A function closure capturing the application of a list of rules to an input sequence at a particular offset.
        /// After invocation, the values <c>i</c> and <c>found</c> are updated. <c>i</c> points to the
        /// index of the next char in <c>input</c> that must be processed next (the input up to that index having been
        /// processed already), and <c>found</c> indicates if a matching rule was found or not. In the case where a
        /// matching rule was found, the phonemes held by <c>phonemeBuilder</c> are updated by the matching rule.
        /// <para/>
        /// Although this class is not thread-safe (it has mutable unprotected fields), it is not shared between threads
        /// as it is constructed as needed by the calling methods.
        /// <para/>
        /// since 1.6
        /// </summary>
        private sealed class RulesApplication
        {
            private readonly IDictionary<string, IList<Rule>> finalRules;
            private readonly string input;

            private PhonemeBuilder phonemeBuilder;
            private int i;
            private readonly int maxPhonemes;
            private bool found;

            /// <summary>
            /// Captures everything needed to apply <paramref name="finalRules"/> to <paramref name="input"/> at offset <paramref name="i"/>.
            /// </summary>
            /// <exception cref="ArgumentNullException"><paramref name="finalRules"/> is <c>null</c>.</exception>
            public RulesApplication(IDictionary<string, IList<Rule>> finalRules, string input,
                                    PhonemeBuilder phonemeBuilder, int i, int maxPhonemes)
            {
                if (finalRules == null)
                {
                    // first argument is the parameter name, second is the message
                    throw new ArgumentNullException("finalRules", "The finalRules argument must not be null");
                }
                this.finalRules = finalRules;
                this.phonemeBuilder = phonemeBuilder;
                this.input = input;
                this.i = i;
                this.maxPhonemes = maxPhonemes;
            }

            public int I
            {
                get { return this.i; }
            }

            public PhonemeBuilder PhonemeBuilder
            {
                get { return this.phonemeBuilder; }
            }

            /// <summary>
            /// Invokes the rules. Loops over the rules list, stopping at the first one that has a matching context
            /// and pattern. Then applies this rule to the phoneme builder to produce updated phonemes. If there was no
            /// match, <c>i</c> is advanced one and the character is silently dropped from the phonetic spelling.
            /// </summary>
            /// <returns><c>this</c></returns>
            public RulesApplication Invoke()
            {
                this.found = false;
                int patternLength = 1;
                IList<Rule> rules;

                // Rules are looked up by the single character at the current offset.
                if (this.finalRules.TryGetValue(input.Substring(i, patternLength), out rules) && rules != null)
                {
                    foreach (Rule rule in rules)
                    {
                        string pattern = rule.Pattern;
                        patternLength = pattern.Length;
                        if (rule.PatternAndContextMatches(this.input, this.i))
                        {
                            this.phonemeBuilder.Apply(rule.Phoneme, maxPhonemes);
                            this.found = true;
                            break;
                        }
                    }
                }

                if (!this.found)
                {
                    // no rule consumed the input: step over exactly one character
                    patternLength = 1;
                }

                this.i += patternLength;
                return this;
            }

            public bool IsFound
            {
                get { return this.found; }
            }
        }

        private static readonly IDictionary<NameType, ISet<string>> NAME_PREFIXES = new Dictionary<NameType, ISet<string>>();

        static PhoneticEngine()
        {
            NAME_PREFIXES[NameType.ASHKENAZI] =
                Collections.UnmodifiableSet(
                    new HashSet<string>() { "bar", "ben", "da", "de", "van", "von" });
            NAME_PREFIXES[NameType.SEPHARDIC] =
                Collections.UnmodifiableSet(
                    new HashSet<string>() { "al", "el", "da", "dal", "de", "del", "dela", "de la",
                                            "della", "des", "di", "do", "dos", "du", "van", "von" });
            NAME_PREFIXES[NameType.GENERIC] =
                Collections.UnmodifiableSet(
                    new HashSet<string>() { "da", "dal", "de", "del", "dela", "de la", "della",
                                            "des", "di", "do", "dos", "du", "van", "von" });
        }

        /// <summary>
        /// Joins some strings with an internal separator.
        /// </summary>
        /// <param name="strings">Strings to join.</param>
        /// <param name="sep">String to separate them with.</param>
        /// <returns>A single string consisting of each element of <paramref name="strings"/> interleaved by <paramref name="sep"/>.</returns>
        private static string Join(IEnumerable<string> strings, string sep)
        {
            return string.Join(sep, strings);
        }

        private static readonly int DEFAULT_MAX_PHONEMES = 20;

        private readonly Lang lang;

        private readonly NameType nameType;

        private readonly RuleType ruleType;

        private readonly bool concat;

        private readonly int maxPhonemes;

        /// <summary>
        /// Generates a new, fully-configured phonetic engine.
        /// </summary>
        /// <param name="nameType">The type of names it will use.</param>
        /// <param name="ruleType">The type of rules it will apply.</param>
        /// <param name="concat">If it will concatenate multiple encodings.</param>
        public PhoneticEngine(NameType nameType, RuleType ruleType, bool concat)
            : this(nameType, ruleType, concat, DEFAULT_MAX_PHONEMES)
        {
        }

        /// <summary>
        /// Generates a new, fully-configured phonetic engine.
        /// <para/>
        /// since 1.7
        /// </summary>
        /// <param name="nameType">The type of names it will use.</param>
        /// <param name="ruleType">The type of rules it will apply.</param>
        /// <param name="concat">If it will concatenate multiple encodings.</param>
        /// <param name="maxPhonemes">The maximum number of phonemes that will be handled.</param>
        /// <exception cref="ArgumentException"><paramref name="ruleType"/> is <see cref="RuleType.RULES"/>.</exception>
        public PhoneticEngine(NameType nameType, RuleType ruleType, bool concat,
                              int maxPhonemes)
        {
            if (ruleType == RuleType.RULES)
            {
                throw new ArgumentException("ruleType must not be " + RuleType.RULES);
            }
            this.nameType = nameType;
            this.ruleType = ruleType;
            this.concat = concat;
            this.lang = Lang.GetInstance(nameType);
            this.maxPhonemes = maxPhonemes;
        }

        /// <summary>
        /// Applies the final rules to convert from a language-specific phonetic representation to a
        /// language-independent representation.
        /// </summary>
        /// <param name="phonemeBuilder">The current phonemes.</param>
        /// <param name="finalRules">The final rules to apply.</param>
        /// <returns>The resulting phonemes.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="finalRules"/> is <c>null</c>.</exception>
        private PhonemeBuilder ApplyFinalRules(PhonemeBuilder phonemeBuilder,
                                               IDictionary<string, IList<Rule>> finalRules)
        {
            if (finalRules == null)
            {
                // first argument is the parameter name, second is the message
                throw new ArgumentNullException("finalRules", "finalRules can not be null");
            }
            if (finalRules.Count == 0)
            {
                return phonemeBuilder;
            }

            // Results are collected in a sorted set ordered by Phoneme.COMPARER.
            ISet<Phoneme> phonemes = new SortedSet<Phoneme>(Phoneme.COMPARER);

            foreach (Phoneme phoneme in phonemeBuilder.Phonemes)
            {
                PhonemeBuilder subBuilder = PhonemeBuilder.Empty(phoneme.Languages);
                string phonemeText = phoneme.GetPhonemeText();

                // the offset is advanced manually from the rules application
                for (int i = 0; i < phonemeText.Length;)
                {
                    RulesApplication rulesApplication =
                        new RulesApplication(finalRules, phonemeText, subBuilder, i, maxPhonemes).Invoke();
                    bool found = rulesApplication.IsFound;
                    subBuilder = rulesApplication.PhonemeBuilder;

                    if (!found)
                    {
                        // not found, appending as-is
                        subBuilder.Append(phonemeText.Substring(i, 1));
                    }

                    i = rulesApplication.I;
                }

                phonemes.UnionWith(subBuilder.Phonemes);
            }

            return new PhonemeBuilder(phonemes.ToList());
        }

        /// <summary>
        /// Encodes a string to its phonetic representation.
        /// </summary>
        /// <param name="input">The string to encode.</param>
        /// <returns>The encoding of the input.</returns>
        public virtual string Encode(string input)
        {
            LanguageSet languageSet = this.lang.GuessLanguages(input);
            return Encode(input, languageSet);
        }

        /// <summary>
        /// Encodes an input string into an output phonetic representation, given a set of possible origin languages.
        /// </summary>
        /// <param name="input">String to phoneticise; a string with dashes or spaces separating each word.</param>
        /// <param name="languageSet">The languages the input may originate from; used to select the rule sets.</param>
        /// <returns>
        /// A phonetic representation of the input; a string containing '-'-separated phonetic representations of the input.
        /// An empty string is returned when every word of the input was discarded as a name prefix.
        /// </returns>
        public virtual string Encode(string input, LanguageSet languageSet)
        {
            IDictionary<string, IList<Rule>> rules = Rule.GetInstanceMap(this.nameType, RuleType.RULES, languageSet);
            // rules common across many (all) languages
            IDictionary<string, IList<Rule>> finalRules1 = Rule.GetInstanceMap(this.nameType, this.ruleType, "common");
            // rules that apply to a specific language that may be ambiguous or wrong if applied to other languages
            IDictionary<string, IList<Rule>> finalRules2 = Rule.GetInstanceMap(this.nameType, this.ruleType, languageSet);

            // tidy the input
            // lower case is a locale-dependent operation
            input = input.ToLowerInvariant().Replace('-', ' ').Trim();

            if (this.nameType == NameType.GENERIC)
            {
                if (input.StartsWith("d'", StringComparison.Ordinal))
                { // check for d'
                    string remainder = input.Substring(2);
                    string combined = "d" + remainder;
                    return "(" + Encode(remainder) + ")-(" + Encode(combined) + ")";
                }
                foreach (string l in NAME_PREFIXES[this.nameType])
                {
                    // handle generic prefixes
                    if (input.StartsWith(l + " ", StringComparison.Ordinal))
                    {
                        // check for any prefix in the words list
                        string remainder = input.Substring(l.Length + 1); // input without the prefix
                        string combined = l + remainder; // input with prefix without space
                        return "(" + Encode(remainder) + ")-(" + Encode(combined) + ")";
                    }
                }
            }

            IList<string> words = WHITESPACE.Split(input).ToList();
            IList<string> words2 = new List<string>();

            // special-case handling of word prefixes based upon the name type
            switch (this.nameType)
            {
                case NameType.SEPHARDIC:
                    foreach (string aWord in words)
                    {
                        string[] parts = aWord.Split(new char[] { '\'' }, StringSplitOptions.RemoveEmptyEntries);
                        // An empty word (or one made only of apostrophes) yields no parts;
                        // treat it as an empty word instead of indexing out of range.
                        string lastPart = parts.Length > 0 ? parts[parts.Length - 1] : string.Empty;
                        words2.Add(lastPart);
                    }
                    words2.RemoveAll(NAME_PREFIXES[this.nameType]);
                    break;
                case NameType.ASHKENAZI:
                    words2.AddRange(words);
                    words2.RemoveAll(NAME_PREFIXES[this.nameType]);
                    break;
                case NameType.GENERIC:
                    words2.AddRange(words);
                    break;
                default:
                    throw new InvalidOperationException("Unreachable case: " + this.nameType);
            }

            if (this.concat)
            {
                // concat mode enabled
                input = Join(words2, " ");
            }
            else if (words2.Count == 1)
            {
                // not a multi-word name
                // NOTE: deliberately takes the first of the unfiltered words, as the ported code does
                //input = words.iterator().next();
                input = words.FirstOrDefault();
            }
            else
            {
                if (words2.Count == 0)
                {
                    // every word was a prefix: nothing is left to encode
                    return string.Empty;
                }

                // encode each word in a multi-word name separately (normally used for approx matches)
                StringBuilder result = new StringBuilder();
                foreach (string word in words2)
                {
                    result.Append("-").Append(Encode(word));
                }
                // return the result without the leading "-"
                return result.ToString(1, result.Length - 1);
            }

            PhonemeBuilder phonemeBuilder = PhonemeBuilder.Empty(languageSet);

            // loop over each char in the input - we will handle the increment manually
            for (int i = 0; i < input.Length;)
            {
                RulesApplication rulesApplication =
                    new RulesApplication(rules, input, phonemeBuilder, i, maxPhonemes).Invoke();
                i = rulesApplication.I;
                phonemeBuilder = rulesApplication.PhonemeBuilder;
            }

            // Apply the general rules
            phonemeBuilder = ApplyFinalRules(phonemeBuilder, finalRules1);
            // Apply the language-specific rules
            phonemeBuilder = ApplyFinalRules(phonemeBuilder, finalRules2);

            return phonemeBuilder.MakeString();
        }

        /// <summary>
        /// Gets the Lang language guessing rules being used.
        /// </summary>
        public virtual Lang Lang
        {
            get { return this.lang; }
        }

        /// <summary>
        /// Gets the <see cref="Bm.NameType"/> being used.
        /// </summary>
        public virtual NameType NameType
        {
            get { return this.nameType; }
        }

        /// <summary>
        /// Gets the <see cref="Bm.RuleType"/> being used.
        /// </summary>
        public virtual RuleType RuleType
        {
            get { return this.ruleType; }
        }

        /// <summary>
        /// Gets if multiple phonetic encodings are concatenated or if just the first one is kept.
        /// Returns <c>true</c> if multiple phonetic encodings are returned, <c>false</c> if just the first is.
        /// </summary>
        public virtual bool IsConcat
        {
            get { return this.concat; }
        }

        /// <summary>
        /// Gets the maximum number of phonemes the engine will calculate for a given input.
        /// <para/>
        /// since 1.7
        /// </summary>
        public virtual int MaxPhonemes
        {
            get { return this.maxPhonemes; }
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ResourceConstants.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ResourceConstants.cs b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ResourceConstants.cs new file mode 100644 index 0000000..c70d404 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ResourceConstants.cs @@ -0,0 +1,37 @@ +// commons-codec version compatibility level: 1.9 +using System.Text; + +namespace Lucene.Net.Analysis.Phonetic.Language.Bm +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Constants used to process resource files. + /// <para/> + /// This class is immutable and thread-safe. 
+ /// <para/> + /// since 1.6 + /// </summary> + internal class ResourceConstants + { + /// <summary>Marker that introduces an end-of-line comment in a resource file.</summary> + public static readonly string CMT = "//"; + /// <summary>Text encoding (UTF-8) used when reading resource files.</summary> + public static readonly Encoding ENCODING = Encoding.UTF8; + /// <summary>Marker that closes a multi-line comment in a resource file.</summary> + public static readonly string EXT_CMT_END = "*/"; + /// <summary>Marker that opens a multi-line comment in a resource file.</summary> + public static readonly string EXT_CMT_START = "/*"; + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/Rule.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/Rule.cs b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/Rule.cs new file mode 100644 index 0000000..52f3d9a --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/Rule.cs @@ -0,0 +1,1069 @@ +// commons-codec version compatibility level: 1.9 +using Lucene.Net.Support; +using System; +using System.Collections.Generic; +using System.IO; +using System.Reflection; +using System.Text; +using System.Text.RegularExpressions; + +namespace Lucene.Net.Analysis.Phonetic.Language.Bm +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// A phoneme rule. 
+ /// </summary> + /// <remarks> + /// Rules have a pattern, left context, right context, output phoneme, set of languages for which they apply + /// and a logical flag indicating if all languages must be in play. A rule matches if: + /// <list type="bullet"> + /// <item><description>the pattern matches at the current position</description></item> + /// <item><description>the string up until the beginning of the pattern matches the left context</description></item> + /// <item><description>the string from the end of the pattern matches the right context</description></item> + /// <item><description>logical is ALL and all languages are in scope; or</description></item> + /// <item><description>logical is any other value and at least one language is in scope</description></item> + /// </list> + /// <para/> + /// Rules are typically generated by parsing rules resources. In normal use, there will be no need for the user + /// to explicitly construct their own. + /// <para/> + /// Rules are immutable and thread-safe. + /// <para/> + /// <b>Rules resources</b> + /// <para/> + /// Rules are typically loaded from resource files. These are UTF-8 encoded text files. They are systematically + /// named following the pattern: + /// <c>Lucene.Net.Analysis.Phonetic.Language.Bm.<see cref="NameType"/>_<see cref="RuleType"/>_[language].txt</c> + /// <para/> + /// The format of these resources is the following: + /// <list type="table"> + /// <item> + /// <term>Rules:</term> + /// <description> + /// whitespace separated, double-quoted strings. 
There should be 4 columns to each row, and these + /// will be interpreted as: + /// <list type="number"> + /// <item><description>pattern</description></item> + /// <item><description>left context</description></item> + /// <item><description>right context</description></item> + /// <item><description>phoneme</description></item> + /// </list> + /// </description> + /// </item> + /// <item> + /// <term>End-of-line comments:</term> + /// <description>Any occurrence of '//' will cause all text following on that line to be discarded as a comment.</description> + /// </item> + /// <item> + /// <term>Multi-line comments:</term> + /// <description>Any line starting with '/*' will start multi-line commenting mode. This will skip all content until a line ending in '*' and '/' is found.</description> + /// </item> + /// <item> + /// <term>Blank lines:</term> + /// <description>All blank lines will be skipped.</description> + /// </item> + /// </list> + /// <para/> + /// since 1.6 + /// </remarks> + public class Rule + { + private static Regex PIPE = new Regex("[|]", RegexOptions.Compiled); + private static Regex WHITESPACE = new Regex("\\s+", RegexOptions.Compiled); + private static Regex PLUS = new Regex("[+]", RegexOptions.Compiled); + + private class AllStringsRMatcher : IRPattern + { + public bool IsMatch(StringBuilder input) + { + return true; + } + + public bool IsMatch(string input) + { + return true; + } + + public bool IsMatch(ICharSequence input) + { + return true; + } + } + + public static readonly IRPattern ALL_STRINGS_RMATCHER = new AllStringsRMatcher(); + + + public static readonly string ALL = "ALL"; + + private static readonly string DOUBLE_QUOTE = "\""; + + private static readonly string HASH_INCLUDE = "#include"; + + private static readonly IDictionary<NameType, IDictionary<RuleType, IDictionary<string, IDictionary<string, IList<Rule>>>>> RULES = + new Dictionary<NameType, IDictionary<RuleType, IDictionary<string, IDictionary<string, IList<Rule>>>>>(); + 
+ static Rule() + { + foreach (NameType s in Enum.GetValues(typeof(NameType))) + { + IDictionary<RuleType, IDictionary<string, IDictionary<string, IList<Rule>>>> rts = + new Dictionary<RuleType, IDictionary<string, IDictionary<string, IList<Rule>>>>(); + + foreach (RuleType rt in Enum.GetValues(typeof(RuleType))) + { + IDictionary<string, IDictionary<string, IList<Rule>>> rs = new Dictionary<string, IDictionary<string, IList<Rule>>>(); + + Languages ls = Languages.GetInstance(s); + foreach (string l in ls.GetLanguages()) + { + try + { + rs[l] = ParseRules(CreateScanner(s, rt, l), CreateResourceName(s, rt, l)); + } + catch (InvalidOperationException e) + { + throw new InvalidOperationException("Problem processing " + CreateResourceName(s, rt, l), e); + } + } + if (!rt.Equals(RuleType.RULES)) + { + rs["common"] = ParseRules(CreateScanner(s, rt, "common"), CreateResourceName(s, rt, "common")); + } + + rts[rt] = Collections.UnmodifiableMap(rs); + } + + RULES[s] = Collections.UnmodifiableMap(rts); + } + } + + private static bool Contains(ICharSequence chars, char input) + { + for (int i = 0; i < chars.Length; i++) + { + if (chars[i] == input) + { + return true; + } + } + return false; + } + private static bool Contains(string chars, char input) + { + for (int i = 0; i < chars.Length; i++) + { + if (chars[i] == input) + { + return true; + } + } + return false; + } + private static bool Contains(StringBuilder chars, char input) + { + for (int i = 0; i < chars.Length; i++) + { + if (chars[i] == input) + { + return true; + } + } + return false; + } + + private static string CreateResourceName(NameType nameType, RuleType rt, string lang) + { + return string.Format("{0}_{1}_{2}.txt", + nameType.GetName(), rt.GetName(), lang); + } + + private static TextReader CreateScanner(NameType nameType, RuleType rt, string lang) + { + string resName = CreateResourceName(nameType, rt, lang); + Stream rulesIS = 
typeof(Languages).GetTypeInfo().Assembly.FindAndGetManifestResourceStream(typeof(Languages), resName); + + if (rulesIS == null) + { + throw new ArgumentException("Unable to load resource: " + resName); + } + + return new StreamReader(rulesIS, ResourceConstants.ENCODING); + } + + private static TextReader CreateScanner(string lang) + { + string resName = string.Format("{0}.txt", lang); + Stream rulesIS = typeof(Languages).GetTypeInfo().Assembly.FindAndGetManifestResourceStream(typeof(Languages), resName); + + if (rulesIS == null) + { + throw new ArgumentException("Unable to load resource: " + resName); + } + + return new StreamReader(rulesIS, ResourceConstants.ENCODING); + } + + private static bool EndsWith(ICharSequence input, string suffix) + { + if (suffix.Length > input.Length) + { + return false; + } + for (int i = input.Length - 1, j = suffix.Length - 1; j >= 0; i--, j--) + { + if (input[i] != suffix[j]) + { + return false; + } + } + return true; + } + + private static bool EndsWith(string input, string suffix) + { + if (suffix.Length > input.Length) + { + return false; + } + for (int i = input.Length - 1, j = suffix.Length - 1; j >= 0; i--, j--) + { + if (input[i] != suffix[j]) + { + return false; + } + } + return true; + } + + private static bool EndsWith(StringBuilder input, string suffix) + { + if (suffix.Length > input.Length) + { + return false; + } + for (int i = input.Length - 1, j = suffix.Length - 1; j >= 0; i--, j--) + { + if (input[i] != suffix[j]) + { + return false; + } + } + return true; + } + + /// <summary> + /// Gets rules for a combination of name type, rule type and languages. 
+ /// </summary> + /// <param name="nameType">The <see cref="NameType"/> to consider.</param> + /// <param name="rt">The <see cref="RuleType"/> to consider.</param> + /// <param name="langs">The set of languages to consider.</param> + /// <returns>A list of <see cref="Rule"/>s that apply.</returns> + public static IList<Rule> GetInstance(NameType nameType, RuleType rt, + LanguageSet langs) + { + IDictionary<string, IList<Rule>> ruleMap = GetInstanceMap(nameType, rt, langs); + IList<Rule> allRules = new List<Rule>(); + foreach (IList<Rule> rules in ruleMap.Values) + { + allRules.AddRange(rules); + } + return allRules; + } + + /// <summary> + /// Gets rules for a combination of name type, rule type and a single language. + /// </summary> + /// <param name="nameType">The <see cref="NameType"/> to consider.</param> + /// <param name="rt">The <see cref="RuleType"/> to consider.</param> + /// <param name="lang">The language to consider.</param> + /// <returns>A list of <see cref="Rule"/>s that apply.</returns> + public static IList<Rule> GetInstance(NameType nameType, RuleType rt, string lang) + { + return GetInstance(nameType, rt, LanguageSet.From(new HashSet<string>() { lang })); + } + + /// <summary> + /// Gets rules for a combination of name type, rule type and languages. + /// <para/> + /// since 1.9 + /// </summary> + /// <param name="nameType">The <see cref="NameType"/> to consider.</param> + /// <param name="rt">The <see cref="RuleType"/> to consider.</param> + /// <param name="langs">The set of languages to consider.</param> + /// <returns>A map containing all <see cref="Rule"/>s that apply, grouped by the first character of the rule pattern.</returns> + public static IDictionary<string, IList<Rule>> GetInstanceMap(NameType nameType, RuleType rt, + LanguageSet langs) + { + return langs.IsSingleton ? 
GetInstanceMap(nameType, rt, langs.GetAny()) :
                GetInstanceMap(nameType, rt, Languages.ANY);
        }

        /// <summary>
        /// Gets rules for a combination of name type, rule type and a single language.
        /// <para/>
        /// since 1.9
        /// </summary>
        /// <param name="nameType">The <see cref="NameType"/> to consider.</param>
        /// <param name="rt">The <see cref="RuleType"/> to consider.</param>
        /// <param name="lang">The language to consider.</param>
        /// <returns>A map containing all <see cref="Rule"/>s that apply, grouped by the first character of the rule pattern.</returns>
        /// <exception cref="ArgumentException">No rule map is registered for the given combination.</exception>
        public static IDictionary<string, IList<Rule>> GetInstanceMap(NameType nameType, RuleType rt,
                                                                      string lang)
        {
            IDictionary<RuleType, IDictionary<string, IDictionary<string, IList<Rule>>>> nameTypes;
            IDictionary<string, IDictionary<string, IList<Rule>>> ruleTypes;
            IDictionary<string, IList<Rule>> rules = null;

            // Walk the three-level lookup (name type -> rule type -> language); any missing
            // or null level means there is nothing to return.
            bool found =
                RULES.TryGetValue(nameType, out nameTypes) && nameTypes != null &&
                nameTypes.TryGetValue(rt, out ruleTypes) && ruleTypes != null &&
                ruleTypes.TryGetValue(lang, out rules) && rules != null;

            if (!found)
            {
                throw new ArgumentException(string.Format("No rules found for {0}, {1}, {2}.",
                                                          nameType.GetName(), rt.GetName(), lang));
            }

            return rules;
        }

        /// <summary>
        /// Parses a single phoneme of the form <c>text</c> or <c>text[lang1+lang2]</c>.
        /// Without a bracketed language list the phoneme applies to any language.
        /// </summary>
        /// <param name="ph">The phoneme expression.</param>
        /// <returns>The parsed <see cref="Bm.Phoneme"/>.</returns>
        /// <exception cref="ArgumentException">The expression contains '[' but does not end in ']'.</exception>
        private static Phoneme ParsePhoneme(string ph)
        {
            // char overload: ordinal search, independent of the current culture
            int open = ph.IndexOf('[');
            if (open < 0)
            {
                return new Phoneme(ph, Languages.ANY_LANGUAGE);
            }

            if (!ph.EndsWith("]", StringComparison.Ordinal))
            {
                throw new ArgumentException("Phoneme expression contains a '[' but does not end in ']'");
            }

            string before = ph.Substring(0, open);
            // text between '[' and the trailing ']'
            string input = ph.Substring(open + 1, ph.Length - open - 2);
            ISet<string> langs = new HashSet<string>(PLUS.Split(input));

            return new Phoneme(before, LanguageSet.From(langs));
        }

        /// <summary>
        /// Parses a phoneme expression: either a single phoneme or a parenthesised,
        /// pipe-separated list of alternatives such as <c>(a|b[french])</c>.
        /// </summary>
        /// <param name="ph">The phoneme expression.</param>
        /// <returns>A <see cref="Bm.Phoneme"/> or a <see cref="PhonemeList"/>.</returns>
        /// <exception cref="ArgumentException">The expression starts with '(' but does not end with ')'.</exception>
        private static IPhonemeExpr ParsePhonemeExpr(string ph)
        {
            if (!ph.StartsWith("(", StringComparison.Ordinal))
            {
                return ParsePhoneme(ph);
            }

            // we have a bracketed list of options
            if (!ph.EndsWith(")", StringComparison.Ordinal))
            {
                throw new ArgumentException("Phoneme starts with '(' so must end with ')'");
            }

            IList<Phoneme> phs = new List<Phoneme>();
            string body = ph.Substring(1, ph.Length - 2);
            foreach (string part in PIPE.Split(body))
            {
                phs.Add(ParsePhoneme(part));
            }
            // A leading or trailing '|' additionally contributes an explicit empty alternative.
            if (body.StartsWith("|", StringComparison.Ordinal) || body.EndsWith("|", StringComparison.Ordinal))
            {
                phs.Add(new Phoneme("", Languages.ANY_LANGUAGE));
            }

            return new PhonemeList(phs);
        }

        /// <summary>
        /// A <see cref="Rule"/> that remembers the line number and location it was parsed from,
        /// and reports them from <see cref="ToString()"/>.
        /// </summary>
        private class RuleAnonymousHelper : Rule
        {
            private readonly int myLine;
            private readonly string loc;

            public RuleAnonymousHelper(string pat, string lCon, string rCon, IPhonemeExpr ph, int cLine, string location)
                : base(pat, lCon, rCon, ph)
            {
                this.myLine = cLine;
                this.loc = location;
            }

            public override string ToString()
            {
                StringBuilder sb = new StringBuilder();
                sb.Append("Rule");
                sb.Append("{line=").Append(myLine);
                sb.Append(", loc='").Append(loc).Append('\'');
                sb.Append('}');
                return sb.ToString();
            }
        }

        /// <summary>
        /// Reads rule definitions from <paramref name="reader"/> and groups them by the first
        /// character of their pattern. Comments and blank lines are skipped, and include
        /// statements are resolved recursively. The reader is always disposed.
        /// </summary>
        /// <param name="reader">The source of rule lines; disposed before this method returns.</param>
        /// <param name="location">A description of the source, used in error messages.</param>
        /// <returns>The parsed rules keyed by the first character of each rule pattern.</returns>
        /// <exception cref="ArgumentException">An include statement or a rule line is malformed.</exception>
        /// <exception cref="InvalidOperationException">A well-formed rule line could not be turned into a <see cref="Rule"/>.</exception>
        private static IDictionary<string, IList<Rule>> ParseRules(TextReader reader, string location)
        {
            IDictionary<string, IList<Rule>> lines = new HashMap<string, IList<Rule>>();
            int currentLine = 0;

            bool inMultilineComment = false;
            string rawLine;
            try
            {
                while ((rawLine = reader.ReadLine()) != null)
                {
                    currentLine++;
                    string line = rawLine;

                    // The comment markers are syntax tokens, so compare them ordinally
                    // rather than with the current culture.
                    if (inMultilineComment)
                    {
                        if (line.EndsWith(ResourceConstants.EXT_CMT_END, StringComparison.Ordinal))
                        {
                            inMultilineComment = false;
                        }
                        continue;
                    }

                    if (line.StartsWith(ResourceConstants.EXT_CMT_START, StringComparison.Ordinal))
                    {
                        inMultilineComment = true;
                        continue;
                    }

                    // discard comments
                    int cmtI = line.IndexOf(ResourceConstants.CMT, StringComparison.Ordinal);
                    if (cmtI >= 0)
                    {
                        line = line.Substring(0, cmtI);
                    }

                    // trim leading-trailing whitespace
                    line = line.Trim();

                    if (line.Length == 0)
                    {
                        continue; // empty lines can be safely skipped
                    }

                    if (line.StartsWith(HASH_INCLUDE, StringComparison.Ordinal))
                    {
                        // include statement
                        string incl = line.Substring(HASH_INCLUDE.Length).Trim();
                        if (incl.Contains(" "))
                        {
                            throw new ArgumentException("Malformed import statement '" + rawLine + "' in " +
                                                        location);
                        }

                        lines.PutAll(ParseRules(CreateScanner(incl), location + "->" + incl));
                        continue;
                    }

                    // rule: pattern, left context, right context, phoneme expression
                    string[] parts = WHITESPACE.Split(line);
                    if (parts.Length != 4)
                    {
                        throw new ArgumentException("Malformed rule statement split into " + parts.Length +
                                                    " parts: " + rawLine + " in " + location);
                    }

                    try
                    {
                        string pat = StripQuotes(parts[0]);
                        string lCon = StripQuotes(parts[1]);
                        string rCon = StripQuotes(parts[2]);
                        IPhonemeExpr ph = ParsePhonemeExpr(StripQuotes(parts[3]));
                        Rule r = new RuleAnonymousHelper(pat, lCon, rCon, ph, currentLine, location);

                        string patternKey = r.pattern.Substring(0, 1);
                        IList<Rule> rules;
                        if (!lines.TryGetValue(patternKey, out rules) || rules == null)
                        {
                            rules = new List<Rule>();
                            lines[patternKey] = rules;
                        }
                        rules.Add(r);
                    }
                    catch (ArgumentException e)
                    {
                        throw new InvalidOperationException("Problem parsing line '" + currentLine + "' in " +
                                                            location, e);
                    }
                }
            }
            finally
            {
                reader.Dispose();
            }

            return lines;
        }

        /// <summary>
        /// An <see cref="IRPattern"/> that forwards each overload to a supplied delegate.
        /// </summary>
        private class RPatternHelper : IRPattern
        {
            private readonly Func<StringBuilder, bool> isMatchSB;
            private readonly Func<string, bool> isMatchStr;
            private readonly Func<ICharSequence, bool> isMatchCS;

            public RPatternHelper(Func<StringBuilder, bool> isMatchSB, Func<string, bool> isMatchStr, Func<ICharSequence, bool> isMatchCS)
            {
                this.isMatchSB = isMatchSB;
                this.isMatchStr = isMatchStr;
                this.isMatchCS = isMatchCS;
            }

            public bool IsMatch(StringBuilder input)
            {
                return isMatchSB(input);
            }

            public bool IsMatch(string input)
            {
                return isMatchStr(input);
            }

            public bool IsMatch(ICharSequence input)
            {
                return isMatchCS(input);
            }
        }

        /// <summary>
        /// Attempts to compile the regex into direct string ops, falling back to <see cref="Regex"/> and <see cref="Match"/> in the worst case.
        /// </summary>
        /// <param name="regex">The regular expression to compile.</param>
        /// <returns>An RPattern that will match this regex.</returns>
        private static IRPattern GetPattern(string regex)
        {
            bool startsWith = regex.StartsWith("^", StringComparison.Ordinal);
            bool endsWith = regex.EndsWith("$", StringComparison.Ordinal);
            int contentStart = startsWith ? 1 : 0;
            int contentEnd = endsWith ? regex.Length - 1 : regex.Length;
            // the regex with its anchors removed
            string content = regex.Substring(contentStart, contentEnd - contentStart);
            bool boxes = content.Contains("[");

            if (!boxes)
            {
                if (startsWith && endsWith)
                {
                    // exact match
                    if (content.Length == 0)
                    {
                        // empty
                        return new RPatternHelper(
                            isMatchSB: input => input.Length == 0,
                            isMatchStr: input => input.Length == 0,
                            isMatchCS: input => input.Length == 0);
                    }

                    return new RPatternHelper(
                        // StringBuilder.Equals(string) can bind to Object.Equals (reference
                        // equality), which never matches; compare the characters instead.
                        isMatchSB: input => input.Length == content.Length && StartsWith(input, content),
                        isMatchStr: input => input.Equals(content, StringComparison.Ordinal),
                        // NOTE(review): relies on the ICharSequence implementation comparing
                        // its content against a string in Equals - confirm.
                        isMatchCS: input => input.Equals(content));
                }
                else if ((startsWith || endsWith) && content.Length == 0)
                {
                    // matches every string
                    return ALL_STRINGS_RMATCHER;
                }
                else if (startsWith)
                {
                    // matches from start
                    return new RPatternHelper(
                        isMatchSB: input => StartsWith(input, content),
                        isMatchStr: input => StartsWith(input, content),
                        isMatchCS: input => StartsWith(input, content));
                }
                else if (endsWith)
                {
                    // matches from end
                    return new RPatternHelper(
                        isMatchSB: input => EndsWith(input, content),
                        isMatchStr: input => EndsWith(input, content),
                        isMatchCS: input => EndsWith(input, content));
                }
            }
            else
            {
                bool startsWithBox = content.StartsWith("[", StringComparison.Ordinal);
                bool endsWithBox = content.EndsWith("]", StringComparison.Ordinal);

                if (startsWithBox && endsWithBox)
                {
                    string boxContent = content.Substring(1, content.Length - 2);
                    if (!boxContent.Contains("["))
                    {
                        // box containing alternatives
                        bool negate = boxContent.StartsWith("^", StringComparison.Ordinal);
                        if (negate)
                        {
                            boxContent = boxContent.Substring(1);
                        }
                        string bContent = boxContent;
                        bool shouldMatch = !negate;

                        if (startsWith && endsWith)
                        {
                            // exact match: a single character that is (or is not) in the box
                            return new RPatternHelper(
                                isMatchSB: input => input.Length == 1 && Contains(bContent, input[0]) == shouldMatch,
                                isMatchStr: input => input.Length == 1 && Contains(bContent, input[0]) == shouldMatch,
                                isMatchCS: input => input.Length == 1 && Contains(bContent, input[0]) == shouldMatch);
                        }
                        else if (startsWith)
                        {
                            // first char
                            return new RPatternHelper(
                                isMatchSB: input => input.Length > 0 && Contains(bContent, input[0]) == shouldMatch,
                                isMatchStr: input => input.Length > 0 && Contains(bContent, input[0]) == shouldMatch,
                                isMatchCS: input => input.Length > 0 && Contains(bContent, input[0]) == shouldMatch);
                        }
                        else if (endsWith)
                        {
                            // last char
                            return new RPatternHelper(
                                isMatchSB: input => input.Length > 0 && Contains(bContent, input[input.Length - 1]) == shouldMatch,
                                isMatchStr: input => input.Length > 0 && Contains(bContent, input[input.Length - 1]) == shouldMatch,
                                isMatchCS: input => input.Length > 0 && Contains(bContent, input[input.Length - 1]) == shouldMatch);
                        }
                    }
                }
            }

            // No shortcut applies: fall back to a real regular expression.
            Regex pattern = new Regex(regex, RegexOptions.Compiled);

            return new RPatternHelper(
                isMatchSB: input => pattern.IsMatch(input.ToString()),
                isMatchStr: input => pattern.IsMatch(input),
                isMatchCS: input => pattern.IsMatch(input.ToString()));
        }

        /// <summary>
        /// Returns <c>true</c> if <paramref name="input"/> begins with <paramref name="prefix"/>,
        /// comparing character by character.
        /// </summary>
        private static bool StartsWith(ICharSequence input, string prefix)
        {
            if (prefix.Length > input.Length)
            {
                return false;
            }
            for (int i = 0; i < prefix.Length; i++)
            {
                if (input[i] != prefix[i])
                {
                    return false;
                }
            }
            return true;
        }

        /// <summary>
        /// Returns <c>true</c> if <paramref name="input"/> begins with <paramref name="prefix"/>,
        /// comparing character by character.
        /// </summary>
        private static bool StartsWith(string input, string prefix)
        {
            if (prefix.Length > input.Length)
            {
                return false;
            }
            for (int i = 0; i < prefix.Length; i++)
            {
                if (input[i] != prefix[i])
                {
                    return false;
                }
            }
            return true;
        }

        /// <summary>
        /// Returns <c>true</c> if <paramref name="input"/> begins with <paramref name="prefix"/>,
        /// comparing character by character.
        /// </summary>
        private static bool StartsWith(StringBuilder input, string prefix)
        {
            if (prefix.Length > input.Length)
            {
                return false;
            }
            for (int i = 0; i < prefix.Length; i++)
            {
                if (input[i] != prefix[i])
                {
                    return false;
                }
            }
            return true;
        }

        /// <summary>
        /// Removes one leading and one trailing double quote from <paramref name="str"/>, if present.
        /// </summary>
        private static string StripQuotes(string str)
        {
            if (str.StartsWith(DOUBLE_QUOTE, StringComparison.Ordinal))
            {
                str = str.Substring(1);
            }

            if (str.EndsWith(DOUBLE_QUOTE, StringComparison.Ordinal))
            {
                str = str.Substring(0, str.Length - 1);
            }

            return str;
        }

        private readonly IRPattern lContext;

        private readonly string pattern;

        private readonly IPhonemeExpr phoneme;

        private readonly IRPattern rContext;

        /// <summary>
        /// Creates a new rule.
        /// </summary>
        /// <param name="pattern">The pattern.</param>
        /// <param name="lContext">The left context.</param>
        /// <param name="rContext">The right context.</param>
        /// <param name="phoneme">The resulting phoneme.</param>
        public Rule(string pattern, string lContext, string rContext, IPhonemeExpr phoneme)
        {
            this.pattern = pattern;
            // The left context must match up to the end of the text before the pattern,
            // the right context from the start of the text after it.
            this.lContext = GetPattern(lContext + "$");
            this.rContext = GetPattern("^" + rContext);
            this.phoneme = phoneme;
        }

        /// <summary>
        /// Gets the left context pattern. This is a regular expression that must match to the left of the pattern.
/// </summary>
        public virtual IRPattern LContext
        {
            get
            {
                return lContext;
            }
        }

        /// <summary>
        /// Gets the pattern. This is a string-literal that must exactly match.
        /// </summary>
        public virtual string Pattern
        {
            get
            {
                return pattern;
            }
        }

        /// <summary>
        /// Gets the phoneme. If the rule matches, this is the phoneme associated with the pattern match.
        /// </summary>
        public virtual IPhonemeExpr Phoneme
        {
            get
            {
                return phoneme;
            }
        }

        /// <summary>
        /// Gets the right context pattern. This is a regular expression that must match to the right of the pattern.
        /// </summary>
        public virtual IRPattern RContext
        {
            get
            {
                return rContext;
            }
        }

        /// <summary>
        /// Decides if the pattern and context match the input starting at a position. It is a match if the
        /// <see cref="LContext"/> matches <paramref name="input"/> up to <paramref name="i"/>, <see cref="Pattern"/> matches at <paramref name="i"/> and
        /// <see cref="RContext"/> matches from the end of the match of <see cref="Pattern"/> to the end of <paramref name="input"/>.
/// </summary>
        /// <param name="input">The input <see cref="ICharSequence"/>.</param>
        /// <param name="i">The int position within the input.</param>
        /// <returns><c>true</c> if the pattern and left/right context match, <c>false</c> otherwise.</returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="i"/> is negative.</exception>
        public virtual bool PatternAndContextMatches(ICharSequence input, int i)
        {
            if (i < 0)
            {
                // The single-string constructor interprets its argument as the parameter
                // name, so the name and the message are passed separately.
                throw new ArgumentOutOfRangeException("i", "Can not match pattern at negative indexes");
            }

            int patternLength = this.pattern.Length;
            int ipl = i + patternLength;

            if (ipl > input.Length)
            {
                // not enough room for the pattern to match
                return false;
            }

            // evaluate the pattern, left context and right context
            // fail early if any of the evaluations is not successful
            if (!input.SubSequence(i, ipl).Equals(this.pattern))
            {
                return false;
            }

            if (!this.rContext.IsMatch(input.SubSequence(ipl, input.Length)))
            {
                return false;
            }

            return this.lContext.IsMatch(input.SubSequence(0, i));
        }

        /// <summary>
        /// Decides if the pattern and context match the input starting at a position. It is a match if the
        /// <see cref="LContext"/> matches <paramref name="input"/> up to <paramref name="i"/>, <see cref="Pattern"/> matches at <paramref name="i"/> and
        /// <see cref="RContext"/> matches from the end of the match of <see cref="Pattern"/> to the end of <paramref name="input"/>.
/// </summary>
        /// <param name="input">The input <see cref="string"/>.</param>
        /// <param name="i">The int position within the input.</param>
        /// <returns><c>true</c> if the pattern and left/right context match, <c>false</c> otherwise.</returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="i"/> is negative.</exception>
        // LUCENENET specific
        public virtual bool PatternAndContextMatches(string input, int i)
        {
            if (i < 0)
            {
                // The single-string constructor interprets its argument as the parameter
                // name, so the name and the message are passed separately.
                throw new ArgumentOutOfRangeException("i", "Can not match pattern at negative indexes");
            }

            int patternLength = this.pattern.Length;
            int ipl = i + patternLength;

            if (ipl > input.Length)
            {
                // not enough room for the pattern to match
                return false;
            }

            // evaluate the pattern, left context and right context
            // fail early if any of the evaluations is not successful

            // Ordinal comparison of the literal pattern in place, without allocating a substring.
            if (string.CompareOrdinal(input, i, this.pattern, 0, patternLength) != 0)
            {
                return false;
            }

            if (!this.rContext.IsMatch(input.Substring(ipl)))
            {
                return false;
            }

            return this.lContext.IsMatch(input.Substring(0, i));
        }

        /// <summary>
        /// Decides if the pattern and context match the input starting at a position. It is a match if the
        /// <see cref="LContext"/> matches <paramref name="input"/> up to <paramref name="i"/>, <see cref="Pattern"/> matches at <paramref name="i"/> and
        /// <see cref="RContext"/> matches from the end of the match of <see cref="Pattern"/> to the end of <paramref name="input"/>.
/// </summary>
        /// <param name="input">The input <see cref="StringBuilder"/>.</param>
        /// <param name="i">The int position within the input.</param>
        /// <returns><c>true</c> if the pattern and left/right context match, <c>false</c> otherwise.</returns>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="i"/> is negative.</exception>
        // LUCENENET specific
        public virtual bool PatternAndContextMatches(StringBuilder input, int i)
        {
            if (i < 0)
            {
                // The single-string constructor interprets its argument as the parameter
                // name, so the name and the message are passed separately.
                throw new ArgumentOutOfRangeException("i", "Can not match pattern at negative indexes");
            }

            int patternLength = this.pattern.Length;
            int ipl = i + patternLength;

            if (ipl > input.Length)
            {
                // not enough room for the pattern to match
                return false;
            }

            // evaluate the pattern, left context and right context
            // fail early if any of the evaluations is not successful
            if (!input.ToString(i, patternLength).Equals(this.pattern, StringComparison.Ordinal))
            {
                return false;
            }

            if (!this.rContext.IsMatch(input.ToString(ipl, input.Length - ipl)))
            {
                return false;
            }

            return this.lContext.IsMatch(input.ToString(0, i));
        }

    }

    /// <summary>
    /// A piece of phonetic text together with the set of languages it applies to.
    /// </summary>
    public sealed class Phoneme : IPhonemeExpr
    {
        /// <summary>
        /// Orders phonemes by their text, character by character; when one text is a
        /// prefix of the other, the shorter one sorts first.
        /// </summary>
        private class PhonemeComparer : IComparer<Phoneme>
        {
            public int Compare(Phoneme o1, Phoneme o2)
            {
                for (int i = 0; i < o1.phonemeText.Length; i++)
                {
                    if (i >= o2.phonemeText.Length)
                    {
                        // o2 is a proper prefix of o1
                        return +1;
                    }
                    int c = o1.phonemeText[i] - o2.phonemeText[i];
                    if (c != 0)
                    {
                        return c;
                    }
                }

                if (o1.phonemeText.Length < o2.phonemeText.Length)
                {
                    // o1 is a proper prefix of o2
                    return -1;
                }

                return 0;
            }
        }

        /// <summary>
        /// Compares phonemes by their text only; the language sets are not considered.
        /// </summary>
        public static readonly IComparer<Phoneme> COMPARER = new PhonemeComparer();
        private readonly StringBuilder phonemeText;
        private readonly LanguageSet languages;

        /// <summary>
        /// Creates a phoneme from text and a language set.
        /// </summary>
        public Phoneme(string phonemeText, LanguageSet languages)
        {
            this.phonemeText = new StringBuilder(phonemeText);
            this.languages = languages;
        }

        /// <summary>
        /// Creates a phoneme from a copy of the given text and a language set.
        /// </summary>
        public Phoneme(StringBuilder phonemeText, LanguageSet languages)
        {
            this.phonemeText = new StringBuilder(phonemeText.ToString());
            this.languages = languages;
        }

        /// <summary>
        /// Creates a phoneme from a copy of the given text and a language set.
        /// </summary>
        public Phoneme(ICharSequence phonemeText, LanguageSet languages)
        {
            this.phonemeText = new StringBuilder(phonemeText.ToString());
            this.languages = languages;
        }

        /// <summary>
        /// Creates a phoneme whose text is the left text followed by the right text,
        /// with the languages of <paramref name="phonemeLeft"/>.
        /// </summary>
        public Phoneme(Phoneme phonemeLeft, Phoneme phonemeRight)
            : this(phonemeLeft.phonemeText, phonemeLeft.languages)
        {
            this.phonemeText.Append(phonemeRight.phonemeText);
        }

        /// <summary>
        /// Creates a phoneme whose text is the left text followed by the right text,
        /// with the given <paramref name="languages"/>.
        /// </summary>
        public Phoneme(Phoneme phonemeLeft, Phoneme phonemeRight, LanguageSet languages)
            : this(phonemeLeft.phonemeText, languages)
        {
            this.phonemeText.Append(phonemeRight.phonemeText);
        }

        /// <summary>
        /// Appends <paramref name="str"/> to this phoneme's text in place.
        /// </summary>
        /// <returns>This instance.</returns>
        public Phoneme Append(string str)
        {
            this.phonemeText.Append(str);
            return this;
        }

        /// <summary>
        /// Gets the languages this phoneme applies to.
        /// </summary>
        public LanguageSet Languages
        {
            get { return this.languages; }
        }

        /// <summary>
        /// Gets a one-element list containing this phoneme; a new array is created on each call.
        /// </summary>
        public IList<Phoneme> Phonemes
        {
            get { return new Phoneme[] { this }; }
        }

        /// <summary>
        /// Gets the current phoneme text as a string.
        /// </summary>
        public string GetPhonemeText()
        {
            return this.phonemeText.ToString();
        }

        /// <summary>
        /// Creates a new phoneme whose text is this text followed by <paramref name="right"/>'s text,
        /// with this phoneme's languages restricted to those of <paramref name="right"/>.
        /// </summary>
        [Obsolete("since 1.9")]
        public Phoneme Join(Phoneme right)
        {
            return new Phoneme(this.phonemeText.ToString() + right.phonemeText.ToString(),
                               this.languages.RestrictTo(right.Languages));
        }
    }

    /// <summary>
    /// An expression that yields one or more <see cref="Phoneme"/>s.
    /// </summary>
    public interface IPhonemeExpr
    {
        IList<Phoneme> Phonemes { get; }
    }

    /// <summary>
    /// An <see cref="IPhonemeExpr"/> backed by a list of alternative phonemes.
    /// </summary>
    public sealed class PhonemeList : IPhonemeExpr
    {
        private readonly IList<Phoneme> phonemes;

        /// <summary>
        /// Wraps the given list; the list is stored as-is, not copied.
        /// </summary>
        public PhonemeList(IList<Phoneme> phonemes)
        {
            this.phonemes = phonemes;
        }

        public IList<Phoneme> Phonemes
        {
            get { return this.phonemes; }
        }
    }

    /// <summary>
    /// A minimal wrapper around the functionality of <see cref="Rule"/> Pattern that we use, to allow for alternate implementations.
+ /// </summary> + public interface IRPattern + { + bool IsMatch(ICharSequence input); + bool IsMatch(string input); + bool IsMatch(StringBuilder input); + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/RuleType.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/RuleType.cs b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/RuleType.cs new file mode 100644 index 0000000..ff3af97 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/RuleType.cs @@ -0,0 +1,68 @@ +// commons-codec version compatibility level: 1.9 +using System; + +namespace Lucene.Net.Analysis.Phonetic.Language.Bm +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Types of rule. + /// <para/> + /// since 1.6 + /// </summary> + public enum RuleType + { + /// <summary> + /// Approximate rules, which will lead to the largest number of phonetic interpretations. + /// </summary> + APPROX, + + /// <summary> + /// Exact rules, which will lead to a minimum number of phonetic interpretations. + /// </summary> + EXACT, + + /// <summary> + /// For internal use only. 
Please use <see cref="APPROX"/> or <see cref="EXACT"/>. + /// </summary> + RULES + } + + public static class RuleTypeExtensions + { + /// <summary> + /// Gets the rule name. + /// </summary> + /// <param name="ruleType">The <see cref="RuleType"/>.</param> + /// <returns>The rule name.</returns> + public static string GetName(this RuleType ruleType) + { + switch (ruleType) + { + case RuleType.APPROX: + return "approx"; + case RuleType.EXACT: + return "exact"; + case RuleType.RULES: + return "rules"; + } + + throw new ArgumentException("Invalid ruleType"); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_any.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_any.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_any.txt new file mode 100644 index 0000000..3f4f4c9 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_any.txt @@ -0,0 +1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// CONSONANTS +"ph" "" "" "f" // foreign +"sh" "" "" "S" // foreign +"kh" "" "" "x" // foreign + +"gli" "" "" "(gli|l[italian])" +"gni" "" "" "(gni|ni[italian+french])" +"gn" "" "[aeou]" "(n[italian+french]|nj[italian+french]|gn)" +"gh" "" "" "g" // It + translit. from Arabic +"dh" "" "" "d" // translit. from Arabic +"bh" "" "" "d" // translit. from Arabic +"th" "" "" "t" // translit. from Arabic +"lh" "" "" "l" // Port +"nh" "" "" "nj" // Port + +"ig" "[aeiou]" "" "(ig|tS[spanish])" +"ix" "[aeiou]" "" "S" // Sp +"tx" "" "" "tS" // Sp +"tj" "" "$" "tS" // Sp +"tj" "" "" "dZ" // Sp +"tg" "" "" "(tg|dZ[spanish])" + +"gi" "" "[aeou]" "dZ" // Italian +"g" "" "y" "Z" // French +"gg" "" "[ei]" "(gZ[portuguese+french]|dZ[italian+spanish]|x[spanish])" +"g" "" "[ei]" "(Z[portuguese+french]|dZ[italian+spanish]|x[spanish])" + +"guy" "" "" "gi" +"gue" "" "$" "(k[french]|ge)" +"gu" "" "[ei]" "(g|gv)" // not It +"gu" "" "[ao]" "gv" // not It + +"ñ" "" "" "(n|nj)" +"ny" "" "" "nj" + +"sc" "" "[ei]" "(s|S[italian])" +"sç" "" "[aeiou]" "s" // not It +"ss" "" "" "s" +"ç" "" "" "s" // not It + +"ch" "" "[ei]" "(k[italian]|S[portuguese+french]|tS[spanish]|dZ[spanish])" +"ch" "" "" "(S|tS[spanish]|dZ[spanish])" + +"ci" "" "[aeou]" "(tS[italian]|si)" +"cc" "" "[eiyéèê]" "(tS[italian]|ks[portuguese+french+spanish])" +"c" "" "[eiyéèê]" "(tS[italian]|s[portuguese+french+spanish])" + //array("c" "" "[aou]" "(k|C[".($portuguese+$spanish)."])" // "C" means that the actual letter could be "ç" (cedille omitted) + +"s" "^" "" "s" +"s" "[aáuiíoóeéêy]" "[aáuiíoóeéêy]" "(s[spanish]|z[portuguese+french+italian])" +"s" "" "[dglmnrv]" "(z|Z[portuguese])" + +"z" "" "$" "(s|ts[italian]|S[portuguese])" // ts It, s/S/Z Port, s in Sp, z Fr +"z" "" "[bdgv]" "(z|dz[italian]|Z[portuguese])" // dz It, Z/z Port, z Sp & Fr +"z" "" "[ptckf]" "(s|ts[italian]|S[portuguese])" // ts It, s/S/z Port, z/s Sp +"z" "" "" "(z|dz[italian]|ts[italian]|s[spanish])" // ts/dz It, z Port & Fr, z/s Sp + +"que" "" "$" 
"(k[french]|ke)" +"qu" "" "[eiu]" "k" +"qu" "" "[ao]" "(kv|k)" // k is It + +"ex" "" "[aáuiíoóeéêy]" "(ez[portuguese]|eS[portuguese]|eks|egz)" +"ex" "" "[cs]" "(e[portuguese]|ek)" + +"m" "" "[cdglnrst]" "(m|n[portuguese])" +"m" "" "[bfpv]" "(m|n[portuguese+spanish])" +"m" "" "$" "(m|n[portuguese])" + +"b" "^" "" "(b|V[spanish])" +"v" "^" "" "(v|B[spanish])" + + // VOWELS +"eau" "" "" "o" // Fr + +"ouh" "" "[aioe]" "(v[french]|uh)" +"uh" "" "[aioe]" "(v|uh)" +"ou" "" "[aioe]" "v" // french +"uo" "" "" "(vo|o)" +"u" "" "[aie]" "v" + +"i" "[aáuoóeéê]" "" "j" +"i" "" "[aeou]" "j" +"y" "[aáuiíoóeéê]" "" "j" +"y" "" "[aeiíou]" "j" +"e" "" "$" "(e|E[$french])" + +"ão" "" "" "(au|an)" // Port +"ãe" "" "" "(aj|an)" // Port +"ãi" "" "" "(aj|an)" // Port +"õe" "" "" "(oj|on)" // Port +"où" "" "" "u" // Fr +"ou" "" "" "(ou|u[french])" + +"â" "" "" "a" // Port & Fr +"à" "" "" "a" // Port +"á" "" "" "a" // Port & Sp +"ã" "" "" "(a|an)" // Port +"é" "" "" "e" +"ê" "" "" "e" // Port & Fr +"è" "" "" "e" // Sp & Fr & It +"í" "" "" "i" // Port & Sp +"î" "" "" "i" // Fr +"ô" "" "" "o" // Port & Fr +"ó" "" "" "o" // Port & Sp & It +"õ" "" "" "(o|on)" // Port +"ò" "" "" "o" // Sp & It +"ú" "" "" "u" // Port & Sp +"ü" "" "" "u" // Port & Sp + + // LATIN ALPHABET +"a" "" "" "a" +"b" "" "" "(b|v[spanish])" +"c" "" "" "k" +"d" "" "" "d" +"e" "" "" "e" +"f" "" "" "f" +"g" "" "" "g" +"h" "" "" "h" +"i" "" "" "i" +"j" "" "" "(x[spanish]|Z)" // not It +"k" "" "" "k" +"l" "" "" "l" +"m" "" "" "m" +"n" "" "" "n" +"o" "" "" "o" +"p" "" "" "p" +"q" "" "" "k" +"r" "" "" "r" +"s" "" "" "(s|S[portuguese])" +"t" "" "" "t" +"u" "" "" "u" +"v" "" "" "(v|b[spanish])" +"w" "" "" "v" // foreign +"x" "" "" "(ks|gz|S[portuguese+spanish])" // S/ks Port & Sp, gz Sp, It only ks +"y" "" "" "i" +"z" "" "" "z" http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_common.txt ---------------------------------------------------------------------- diff 
--git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_common.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_common.txt new file mode 100644 index 0000000..e95a756 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_common.txt @@ -0,0 +1,219 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include ash_exact_approx_common + +// REGRESSIVE ASSIMILATION OF CONSONANTS +"n" "" "[bp]" "m" + +// PECULIARITY OF "h" +"h" "" "" "" +"H" "" "" "(x|)" + +// POLISH OGONEK IMPOSSIBLE +"F" "" "[bdgkpstvzZ]h" "e" +"F" "" "[bdgkpstvzZ]x" "e" +"B" "" "[bdgkpstvzZ]h" "a" +"B" "" "[bdgkpstvzZ]x" "a" + +// "e" and "i" ARE TO BE OMITTED BEFORE (SYLLABIC) n & l: Halperin=Halpern; Frankel = Frankl, Finkelstein = Finklstein +"e" "[bdfgklmnprsStvzZ]" "[ln]$" "" +"i" "[bdfgklmnprsStvzZ]" "[ln]$" "" +"E" "[bdfgklmnprsStvzZ]" "[ln]$" "" +"I" "[bdfgklmnprsStvzZ]" "[ln]$" "" +"F" "[bdfgklmnprsStvzZ]" "[ln]$" "" +"Q" "[bdfgklmnprsStvzZ]" "[ln]$" "" +"Y" "[bdfgklmnprsStvzZ]" "[ln]$" "" + +"e" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" "" +"i" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" "" +"E" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" "" +"I" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" "" +"F" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" "" +"Q" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" "" +"Y" "[bdfgklmnprsStvzZ]" "[ln][bdfgklmnprsStvzZ]" "" + +"lEs" "" "" "(lEs|lz)" // Applebaum < Appelbaum (English + blend English-something forms as Finklestein) +"lE" "[bdfgkmnprStvzZ]" "" "(lE|l)" // Applebaum < Appelbaum (English + blend English-something forms as Finklestein) + +// SIMPLIFICATION: (TRIPHTHONGS & DIPHTHONGS) -> ONE GENERIC DIPHTHONG "D" +"aue" "" "" "D" +"oue" "" "" "D" + +"AvE" "" "" "(D|AvE)" +"Ave" "" "" "(D|Ave)" +"avE" "" "" "(D|avE)" +"ave" "" "" "(D|ave)" + +"OvE" "" "" "(D|OvE)" +"Ove" "" "" "(D|Ove)" +"ovE" "" "" "(D|ovE)" +"ove" "" "" "(D|ove)" + +"ea" "" "" "(D|ea)" +"EA" "" "" "(D|EA)" +"Ea" "" "" "(D|Ea)" +"eA" "" "" "(D|eA)" + +"aji" "" "" "D" +"ajI" "" "" "D" +"aje" "" "" "D" +"ajE" "" "" "D" + +"Aji" "" "" "D" +"AjI" "" "" "D" +"Aje" "" "" "D" +"AjE" "" "" "D" + +"oji" "" "" "D" +"ojI" "" "" "D" +"oje" "" "" "D" +"ojE" "" "" "D" + +"Oji" "" "" "D" +"OjI" "" "" "D" +"Oje" "" "" "D" +"OjE" "" "" "D" + +"eji" "" "" "D" +"ejI" "" "" "D" 
+"eje" "" "" "D" +"ejE" "" "" "D" + +"Eji" "" "" "D" +"EjI" "" "" "D" +"Eje" "" "" "D" +"EjE" "" "" "D" + +"uji" "" "" "D" +"ujI" "" "" "D" +"uje" "" "" "D" +"ujE" "" "" "D" + +"Uji" "" "" "D" +"UjI" "" "" "D" +"Uje" "" "" "D" +"UjE" "" "" "D" + +"iji" "" "" "D" +"ijI" "" "" "D" +"ije" "" "" "D" +"ijE" "" "" "D" + +"Iji" "" "" "D" +"IjI" "" "" "D" +"Ije" "" "" "D" +"IjE" "" "" "D" + +"aja" "" "" "D" +"ajA" "" "" "D" +"ajo" "" "" "D" +"ajO" "" "" "D" +"aju" "" "" "D" +"ajU" "" "" "D" + +"Aja" "" "" "D" +"AjA" "" "" "D" +"Ajo" "" "" "D" +"AjO" "" "" "D" +"Aju" "" "" "D" +"AjU" "" "" "D" + +"oja" "" "" "D" +"ojA" "" "" "D" +"ojo" "" "" "D" +"ojO" "" "" "D" +"Aju" "" "" "D" +"AjU" "" "" "D" + +"Oja" "" "" "D" +"OjA" "" "" "D" +"Ojo" "" "" "D" +"OjO" "" "" "D" +"Aju" "" "" "D" +"AjU" "" "" "D" + +"eja" "" "" "D" +"ejA" "" "" "D" +"ejo" "" "" "D" +"ejO" "" "" "D" +"Aju" "" "" "D" +"AjU" "" "" "D" + +"Eja" "" "" "D" +"EjA" "" "" "D" +"Ejo" "" "" "D" +"EjO" "" "" "D" +"Aju" "" "" "D" +"AjU" "" "" "D" + +"uja" "" "" "D" +"ujA" "" "" "D" +"ujo" "" "" "D" +"ujO" "" "" "D" +"Aju" "" "" "D" +"AjU" "" "" "D" + +"Uja" "" "" "D" +"UjA" "" "" "D" +"Ujo" "" "" "D" +"UjO" "" "" "D" +"Aju" "" "" "D" +"AjU" "" "" "D" + +"ija" "" "" "D" +"ijA" "" "" "D" +"ijo" "" "" "D" +"ijO" "" "" "D" +"Aju" "" "" "D" +"AjU" "" "" "D" + +"Ija" "" "" "D" +"IjA" "" "" "D" +"Ijo" "" "" "D" +"IjO" "" "" "D" +"Aju" "" "" "D" +"AjU" "" "" "D" + +"j" "" "" "i" + +// lander = lender = länder +"lYndEr" "" "$" "lYnder" +"lander" "" "$" "lYnder" +"lAndEr" "" "$" "lYnder" +"lAnder" "" "$" "lYnder" +"landEr" "" "$" "lYnder" +"lender" "" "$" "lYnder" +"lEndEr" "" "$" "lYnder" +"lendEr" "" "$" "lYnder" +"lEnder" "" "$" "lYnder" + +// CONSONANTS {z & Z; s & S} are approximately interchangeable +"s" "" "[rmnl]" "z" +"S" "" "[rmnl]" "z" +"s" "[rmnl]" "" "z" +"S" "[rmnl]" "" "z" + +"dS" "" "$" "S" +"dZ" "" "$" "S" +"Z" "" "$" "S" +"S" "" "$" "(S|s)" +"z" "" "$" "(S|s)" + +"S" "" "" "s" +"dZ" "" "" "z" +"Z" "" "" "z" \ 
No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_cyrillic.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_cyrillic.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_cyrillic.txt new file mode 100644 index 0000000..4210173 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_cyrillic.txt @@ -0,0 +1,18 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include ash_approx_russian \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_english.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_english.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_english.txt new file mode 100644 index 0000000..84d8174 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_english.txt @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// VOWELS +"I" "" "[^aEIeiou]e" "(Q|i|D)" // like in "five" +"I" "" "$" "i" +"I" "[aEIeiou]" "" "i" +"I" "" "[^k]$" "i" +"Ik" "[lr]" "$" "(ik|Qk)" +"Ik" "" "$" "ik" +"sIts" "" "$" "(sits|sQts)" +"Its" "" "$" "its" +"I" "" "" "(i|Q)" + +"lE" "[bdfgkmnprsStvzZ]" "" "(il|li|lY)" // Applebaum < Appelbaum + +"au" "" "" "(D|a|u)" +"ou" "" "" "(D|o|u)" +"ai" "" "" "(D|a|i)" +"oi" "" "" "(D|o|i)" +"ui" "" "" "(D|u|i)" + +"E" "D[^aeiEIou]" "" "(i|)" // Weinberg, Shaneberg (shaneberg/shejneberg) --> shejnberg +"e" "D[^aeiEIou]" "" "(i|)" + +"e" "" "" "i" +"E" "" "[fklmnprsStv]$" "i" +"E" "" "ts$" "i" +"E" "[DaoiEuQY]" "" "i" +"E" "" "[aoQY]" "i" +"E" "" "" "(Y|i)" + +"a" "" "" "(a|o)" http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1ee3a9cc/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_french.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_french.txt b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_french.txt new file mode 100644 index 0000000..fa8ee99 --- /dev/null +++ b/src/Lucene.Net.Analysis.Phonetic/Language/Bm/ash_approx_french.txt @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +"I" "" "$" "i" +"I" "[aEIeiou]" "" "i" +"I" "" "[^k]$" "i" +"Ik" "[lr]" "$" "(ik|Qk)" +"Ik" "" "$" "ik" +"sIts" "" "$" "(sits|sQts)" +"Its" "" "$" "its" +"I" "" "" "(i|Q)" + +"au" "" "" "(D|a|u)" +"ou" "" "" "(D|o|u)" +"ai" "" "" "(D|a|i)" +"oi" "" "" "(D|o|i)" +"ui" "" "" "(D|u|i)" + +"a" "" "" "(a|o)" +"e" "" "" "i" + +"E" "" "[fklmnprsStv]$" "i" +"E" "" "ts$" "i" +"E" "[aoiuQ]" "" "i" +"E" "" "[aoQ]" "i" +"E" "" "" "(Y|i)" \ No newline at end of file
