This is an automated email from the ASF dual-hosted git repository. nightowl888 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/lucenenet.git
commit e7eb28653e9a1b6853fcf3b14027d9198128d5b0 Author: Shad Storhaug <[email protected]> AuthorDate: Wed Feb 24 15:05:30 2021 +0700 Lucene.Net.Analysis.Common: Upgraded Lucene.Net.Analysis.Hunspell namespace to Lucene 4.10.4 because 4.8.1 was buggy (fixes #418, fixes #419) --- .../Analysis/Hunspell/Dictionary.cs | 463 ++++++++++++++++----- .../Analysis/Hunspell/HunspellStemFilter.cs | 2 +- .../Analysis/Hunspell/HunspellStemFilterFactory.cs | 2 +- .../Analysis/Hunspell/ISO8859_14Decoder.cs | 2 +- .../Analysis/Hunspell/Stemmer.cs | 315 ++++++++++++-- .../Analysis/Hunspell/Foo.cs | 12 + .../Analysis/Hunspell/StemmerTestBase.cs | 2 +- .../Analysis/Hunspell/Test64kAffixes.cs | 69 +++ .../Analysis/Hunspell/TestAllDictionaries.cs | 4 +- .../Analysis/Hunspell/TestAllDictionaries2.cs | 15 +- .../Analysis/Hunspell/TestAlternateCasing.cs | 67 +++ .../Analysis/Hunspell/TestCaseInsensitive.cs | 2 +- .../Analysis/Hunspell/TestCaseSensitive.cs | 71 ++++ .../Analysis/Hunspell/TestCircumfix.cs | 2 +- .../Analysis/Hunspell/TestComplexPrefix.cs | 2 +- .../Analysis/Hunspell/TestCondition.cs | 2 +- .../{TestHomonyms.cs => TestCondition2.cs} | 15 +- .../Analysis/Hunspell/TestConv.cs | 2 +- .../Analysis/Hunspell/TestDependencies.cs | 2 +- .../Analysis/Hunspell/TestDictionary.cs | 3 +- .../{TestHomonyms.cs => TestDoubleEscape.cs} | 15 +- .../Analysis/Hunspell/TestEscaped.cs | 2 +- .../Analysis/Hunspell/TestFlagLong.cs | 3 +- .../Analysis/Hunspell/TestFlagNum.cs | 2 +- .../Hunspell/{TestHomonyms.cs => TestFullStrip.cs} | 15 +- .../Analysis/Hunspell/TestHomonyms.cs | 2 +- .../Analysis/Hunspell/TestHunspellStemFilter.cs | 2 +- .../Hunspell/TestHunspellStemFilterFactory.cs | 2 +- .../Analysis/Hunspell/TestIgnore.cs | 2 +- .../{TestTwoSuffixes.cs => TestKeepCase.cs} | 28 +- .../Analysis/Hunspell/TestMorph.cs | 2 +- .../{TestHomonyms.cs => TestMorphAlias.cs} | 22 +- .../Hunspell/{TestHomonyms.cs => TestMorphData.cs} | 22 +- .../{TestTwoSuffixes.cs => TestNeedAffix.cs} | 24 +- .../{TestTwoSuffixes.cs => TestOnlyInCompound.cs} | 20 +- .../Analysis/Hunspell/TestOptionalCondition.cs | 2 +- .../Hunspell/{TestCondition.cs => TestSpaces.cs} | 35 +- .../Analysis/Hunspell/TestStemmer.cs | 2 +- ...estHomonyms.cs => TestStrangeOvergeneration.cs} | 18 +- .../Analysis/Hunspell/TestTwoFold.cs | 2 +- .../Analysis/Hunspell/TestTwoSuffixes.cs | 2 +- .../Hunspell/{TestHomonyms.cs => TestZeroAffix.cs} | 15 +- .../{TestHomonyms.cs => TestZeroAffix2.cs} | 15 +- .../Analysis/Hunspell/alternate-casing.aff | 15 + .../Analysis/Hunspell/alternate-casing.dic | 4 + .../Analysis/Hunspell/casesensitive.aff | 16 + .../Analysis/Hunspell/casesensitive.dic | 4 + .../Analysis/Hunspell/circumfix.dic | 2 +- .../Analysis/Hunspell/condition2.aff | 5 + .../Analysis/Hunspell/condition2.dic | 2 + .../Analysis/Hunspell/conv.dic | 2 +- .../Analysis/Hunspell/dependencies.dic | 4 +- .../Analysis/Hunspell/double-escaped.aff | 5 + .../Analysis/Hunspell/double-escaped.dic | 2 + .../Analysis/Hunspell/flaglong.aff | 3 + .../Analysis/Hunspell/fullstrip.aff | 6 + .../Analysis/Hunspell/fullstrip.dic | 2 + .../Analysis/Hunspell/homonyms.dic | 4 +- .../Analysis/Hunspell/ignore.dic | 4 +- .../Analysis/Hunspell/keepcase.aff | 6 + .../Analysis/Hunspell/keepcase.dic | 4 + .../Analysis/Hunspell/morph.dic | 2 +- .../Analysis/Hunspell/morphalias.aff | 16 + .../Analysis/Hunspell/morphalias.dic | 6 + .../Analysis/Hunspell/morphdata.aff | 10 + .../Analysis/Hunspell/morphdata.dic | 6 + .../Analysis/Hunspell/needaffix.aff | 9 + .../Analysis/Hunspell/needaffix.dic | 4 + .../Analysis/Hunspell/onlyincompound.aff | 12 + .../Analysis/Hunspell/onlyincompound.dic | 4 + .../Analysis/Hunspell/spaces.aff | 5 + .../Analysis/Hunspell/spaces.dic | 9 + .../Analysis/Hunspell/strange-overgeneration.aff | 10 + .../Analysis/Hunspell/strange-overgeneration.dic | 5 + .../Analysis/Hunspell/twosuffixes.dic | 2 +- .../Analysis/Hunspell/zeroaffix.aff | 4 + .../Analysis/Hunspell/zeroaffix.dic | 2 + .../Analysis/Hunspell/zeroaffix2.aff | 6 + .../Analysis/Hunspell/zeroaffix2.dic | 2 + .../Lucene.Net.Tests.Analysis.Common.csproj | 39 +- src/Lucene.Net.Tests.Analysis.Common/Startup.cs | 8 +- 81 files changed, 1278 insertions(+), 281 deletions(-) diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs index 38d19f5..b175b58 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Dictionary.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using J2N; using J2N.Collections.Generic.Extensions; using J2N.Numerics; @@ -43,9 +43,10 @@ namespace Lucene.Net.Analysis.Hunspell /// </summary> public class Dictionary { - private static readonly char[] NOFLAGS = new char[0]; + private static readonly char[] NOFLAGS = Arrays.Empty<char>(); private const string ALIAS_KEY = "AF"; + private const string MORPH_ALIAS_KEY = "AM"; private const string PREFIX_KEY = "PFX"; private const string SUFFIX_KEY = "SFX"; private const string FLAG_KEY = "FLAG"; @@ -54,6 +55,12 @@ namespace Lucene.Net.Analysis.Hunspell private const string IGNORE_KEY = "IGNORE"; private const string ICONV_KEY = "ICONV"; private const string OCONV_KEY = "OCONV"; + private const string FULLSTRIP_KEY = "FULLSTRIP"; + private const string LANG_KEY = "LANG"; + private const string KEEPCASE_KEY = "KEEPCASE"; + private const string NEEDAFFIX_KEY = "NEEDAFFIX"; + private const string PSEUDOROOT_KEY = "PSEUDOROOT"; + private const string ONLYINCOMPOUND_KEY = "ONLYINCOMPOUND"; private const string NUM_FLAG_TYPE = "num"; private const string UTF8_FLAG_TYPE = "UTF-8"; @@ -87,9 +94,21 @@ namespace Lucene.Net.Analysis.Hunspell private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy + // AF entries private string[] aliases; private int aliasCount = 0; + // AM entries + private string[] morphAliases; + private int morphAliasCount = 0; + + // st: morphological entries (either directly, or aliased from AM) + private string[] stemExceptions = new string[8]; + private int stemExceptionCount = 0; + // we set this during sorting, so we know to add an extra FST output. + // when set, some words have exceptional stems, and the last entry is a pointer to stemExceptions + internal bool hasStemExceptions; + private readonly DirectoryInfo tempDir = OfflineSorter.DefaultTempDir(); // TODO: make this configurable? internal bool ignoreCase; @@ -97,6 +116,9 @@ namespace Lucene.Net.Analysis.Hunspell internal bool twoStageAffix; // if no affixes have continuation classes, no need to do 2-level affix stripping internal int circumfix = -1; // circumfix flag, or -1 if one is not defined + internal int keepcase = -1; // keepcase flag, or -1 if one is not defined + internal int needaffix = -1; // needaffix flag, or -1 if one is not defined + internal int onlyincompound = -1; // onlyincompound flag, or -1 if one is not defined // ignored characters (dictionary, affix, inputs) private char[] ignore; @@ -108,6 +130,14 @@ namespace Lucene.Net.Analysis.Hunspell internal bool needsInputCleaning; internal bool needsOutputCleaning; + // true if we can strip suffixes "down to nothing" + internal bool fullStrip; + + // language declaration of the dictionary + internal string language; + // true if case algorithms should use alternate (Turkish/Azeri) mapping + internal bool alternateCasing; + // LUCENENET: Added so we can get better performance than creating the regex in every tight loop. private static readonly Regex whitespacePattern = new Regex("\\s+", RegexOptions.Compiled); private static readonly Regex leadingDigitPattern = new Regex("[^0-9]", RegexOptions.Compiled); @@ -144,58 +174,56 @@ namespace Lucene.Net.Analysis.Hunspell flagLookup.Add(new BytesRef()); // no flags -> ord 0 FileInfo aff = FileSupport.CreateTempFile("affix", "aff", tempDir); - using (Stream @out = aff.Open(FileMode.Open, FileAccess.ReadWrite)) - { - // copy contents of affix stream to temp file - affix.CopyTo(@out); - } - - // pass 1: get encoding - string encoding; - using (Stream aff1 = aff.Open(FileMode.Open, FileAccess.Read)) + try { - encoding = GetDictionaryEncoding(aff1); - } + using (Stream @out = aff.Open(FileMode.Open, FileAccess.ReadWrite)) + { + // copy contents of affix stream to temp file + affix.CopyTo(@out); + } - // pass 2: parse affixes - Encoding decoder = GetSystemEncoding(encoding); - using (Stream aff2 = aff.Open(FileMode.Open, FileAccess.Read)) - { - ReadAffixFile(aff2, decoder); - } + // pass 1: get encoding + string encoding; + using (Stream aff1 = aff.Open(FileMode.Open, FileAccess.Read)) + { + encoding = GetDictionaryEncoding(aff1); + } - // read dictionary entries - Int32SequenceOutputs o = Int32SequenceOutputs.Singleton; - Builder<Int32sRef> b = new Builder<Int32sRef>(FST.INPUT_TYPE.BYTE4, o); - ReadDictionaryFiles(dictionaries, decoder, b); - words = b.Finish(); - aliases = null; // no longer needed + // pass 2: parse affixes + Encoding decoder = GetSystemEncoding(encoding); + using (Stream aff2 = aff.Open(FileMode.Open, FileAccess.Read)) + { + ReadAffixFile(aff2, decoder); + } - try - { - aff.Delete(); + // read dictionary entries + Int32SequenceOutputs o = Int32SequenceOutputs.Singleton; + Builder<Int32sRef> b = new Builder<Int32sRef>(FST.INPUT_TYPE.BYTE4, o); + ReadDictionaryFiles(dictionaries, decoder, b); + words = b.Finish(); + aliases = null; // no longer needed + morphAliases = null; // no longer needed } - catch + finally { - // ignore + try + { + aff.Delete(); + } + catch + { + // ignore + } } } - /// <summary> - /// Looks up Hunspell word forms from the dictionary - /// </summary> + // only for testing internal virtual Int32sRef LookupWord(char[] word, int offset, int length) { return Lookup(words, word, offset, length); } - /// <summary> - /// Looks up HunspellAffix prefixes that have an append that matches the <see cref="string"/> created from the given <see cref="char"/> array, offset and length - /// </summary> - /// <param name="word"> <see cref="char"/> array to generate the <see cref="string"/> from </param> - /// <param name="offset"> Offset in the <see cref="char"/> array that the <see cref="string"/> starts at </param> - /// <param name="length"> Length from the offset that the <see cref="string"/> is </param> - /// <returns> List of HunspellAffix prefixes with an append that matches the <see cref="string"/>, or <c>null</c> if none are found </returns> + // only for testing internal virtual Int32sRef LookupPrefix(char[] word, int offset, int length) { return Lookup(prefixes, word, offset, length); @@ -213,8 +241,6 @@ namespace Lucene.Net.Analysis.Hunspell return Lookup(suffixes, word, offset, length); } - // TODO: this is pretty stupid, considering how the stemming algorithm works - // we can speed it up to be significantly faster! internal virtual Int32sRef Lookup(FST<Int32sRef> fst, char[] word, int offset, int length) { if (fst == null) @@ -269,8 +295,8 @@ namespace Lucene.Net.Analysis.Hunspell /// <exception cref="IOException"> Can be thrown while reading from the InputStream </exception> private void ReadAffixFile(Stream affixStream, Encoding decoder) { - JCG.SortedDictionary<string, IList<char?>> prefixes = new JCG.SortedDictionary<string, IList<char?>>(StringComparer.Ordinal); - JCG.SortedDictionary<string, IList<char?>> suffixes = new JCG.SortedDictionary<string, IList<char?>>(StringComparer.Ordinal); + var prefixes = new JCG.SortedDictionary<string, IList<int>>(StringComparer.Ordinal); + var suffixes = new JCG.SortedDictionary<string, IList<int>>(StringComparer.Ordinal); IDictionary<string, int?> seenPatterns = new JCG.Dictionary<string, int?> { // zero condition -> 0 ord @@ -285,7 +311,7 @@ namespace Lucene.Net.Analysis.Hunspell }; var reader = new StreamReader(affixStream, decoder); - string line = null; + string line; // LUCENENET: Removed unnecessary null assignment int lineNumber = 0; while ((line = reader.ReadLine()) != null) { @@ -299,6 +325,10 @@ namespace Lucene.Net.Analysis.Hunspell { ParseAlias(line); } + else if (line.StartsWith(MORPH_ALIAS_KEY, StringComparison.Ordinal)) + { + ParseMorphAlias(line); + } else if (line.StartsWith(PREFIX_KEY, StringComparison.Ordinal)) { ParseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN, seenPatterns, seenStrips); @@ -322,16 +352,43 @@ namespace Lucene.Net.Analysis.Hunspell string[] parts = whitespacePattern.Split(line).TrimEnd(); if (parts.Length != 2) { - throw new Exception(string.Format("Illegal CIRCUMFIX declaration, line {0}", lineNumber)); + throw new FormatException(string.Format("Illegal CIRCUMFIX declaration, line {0}", lineNumber)); } circumfix = flagParsingStrategy.ParseFlag(parts[1]); } + else if (line.StartsWith(KEEPCASE_KEY, StringComparison.Ordinal)) + { + string[] parts = whitespacePattern.Split(line).TrimEnd(); + if (parts.Length != 2) + { + throw new FormatException(string.Format("Illegal KEEPCASE declaration, line {0}", lineNumber)); + } + keepcase = flagParsingStrategy.ParseFlag(parts[1]); + } + else if (line.StartsWith(NEEDAFFIX_KEY, StringComparison.Ordinal) || line.StartsWith(PSEUDOROOT_KEY, StringComparison.Ordinal)) + { + string[] parts = whitespacePattern.Split(line).TrimEnd(); + if (parts.Length != 2) + { + throw new FormatException(string.Format("Illegal NEEDAFFIX declaration, line {0}", lineNumber)); + } + needaffix = flagParsingStrategy.ParseFlag(parts[1]); + } + else if (line.StartsWith(ONLYINCOMPOUND_KEY, StringComparison.Ordinal)) + { + string[] parts = whitespacePattern.Split(line).TrimEnd(); + if (parts.Length != 2) + { + throw new FormatException(string.Format("Illegal ONLYINCOMPOUND declaration, line {0}", lineNumber)); + } + onlyincompound = flagParsingStrategy.ParseFlag(parts[1]); + } else if (line.StartsWith(IGNORE_KEY, StringComparison.Ordinal)) { string[] parts = whitespacePattern.Split(line).TrimEnd(); if (parts.Length != 2) { - throw new Exception(string.Format("Illegal IGNORE declaration, line {0}", lineNumber)); + throw new FormatException(string.Format("Illegal IGNORE declaration, line {0}", lineNumber)); } ignore = parts[1].ToCharArray(); Array.Sort(ignore); @@ -343,7 +400,7 @@ namespace Lucene.Net.Analysis.Hunspell string type = parts[0]; if (parts.Length != 2) { - throw new Exception(string.Format("Illegal {0} declaration, line {1}", type, lineNumber)); + throw new FormatException(string.Format("Illegal {0} declaration, line {1}", type, lineNumber)); } int num = int.Parse(parts[1], CultureInfo.InvariantCulture); FST<CharsRef> res = ParseConversions(reader, num); @@ -358,6 +415,15 @@ namespace Lucene.Net.Analysis.Hunspell needsOutputCleaning |= oconv != null; } } + else if (line.StartsWith(FULLSTRIP_KEY, StringComparison.Ordinal)) + { + fullStrip = true; + } + else if (line.StartsWith(LANG_KEY, StringComparison.Ordinal)) + { + language = line.Substring(LANG_KEY.Length).Trim(); + alternateCasing = "tr_TR".Equals(language, StringComparison.Ordinal) || "az_AZ".Equals(language, StringComparison.Ordinal); + } } this.prefixes = AffixFST(prefixes); @@ -382,26 +448,51 @@ namespace Lucene.Net.Analysis.Hunspell stripOffsets[currentIndex] = currentOffset; } - private FST<Int32sRef> AffixFST(JCG.SortedDictionary<string, IList<char?>> affixes) + private FST<Int32sRef> AffixFST(JCG.SortedDictionary<string, IList<int>> affixes) { Int32SequenceOutputs outputs = Int32SequenceOutputs.Singleton; Builder<Int32sRef> builder = new Builder<Int32sRef>(FST.INPUT_TYPE.BYTE4, outputs); Int32sRef scratch = new Int32sRef(); - foreach (KeyValuePair<string, IList<char?>> entry in affixes) + foreach (KeyValuePair<string, IList<int>> entry in affixes) { Lucene.Net.Util.Fst.Util.ToUTF32(entry.Key, scratch); - IList<char?> entries = entry.Value; + IList<int> entries = entry.Value; Int32sRef output = new Int32sRef(entries.Count); - foreach (char? c in entries) + foreach (int c in entries) { - output.Int32s[output.Length++] = c.HasValue ? c.Value : 0; + output.Int32s[output.Length++] = c; } builder.Add(scratch, output); } return builder.Finish(); } + internal static string EscapeDash(string re) + { + // we have to be careful, even though dash doesn't have a special meaning, + // some dictionaries already escape it (e.g. pt_PT), so we don't want to nullify it + StringBuilder escaped = new StringBuilder(); + for (int i = 0; i < re.Length; i++) + { + char c = re[i]; + if (c == '-') + { + escaped.Append("\\-"); + } + else + { + escaped.Append(c); + if (c == '\\' && i + 1 < re.Length) + { + escaped.Append(re[i + 1]); + i++; + } + } + } + return escaped.ToString(); + } + /// <summary> /// Parses a specific affix rule putting the result into the provided affix map /// </summary> @@ -413,13 +504,19 @@ namespace Lucene.Net.Analysis.Hunspell /// <param name="seenPatterns"> map from condition -> index of patterns, for deduplication. </param> /// <param name="seenStrips"></param> /// <exception cref="IOException"> Can be thrown while reading the rule </exception> - private void ParseAffix(JCG.SortedDictionary<string, IList<char?>> affixes, string header, TextReader reader, string conditionPattern, IDictionary<string, int?> seenPatterns, IDictionary<string, int?> seenStrips) + private void ParseAffix(JCG.SortedDictionary<string, IList<int>> affixes, + string header, + TextReader reader, + string conditionPattern, + IDictionary<string, int?> seenPatterns, + IDictionary<string, int?> seenStrips) { BytesRef scratch = new BytesRef(); StringBuilder sb = new StringBuilder(); string[] args = whitespacePattern.Split(header).TrimEnd(); bool crossProduct = args[2].Equals("Y", StringComparison.Ordinal); + bool isSuffix = conditionPattern == SUFFIX_CONDITION_REGEX_PATTERN; int numLines = int.Parse(args[3], CultureInfo.InvariantCulture); affixData = ArrayUtil.Grow(affixData, (currentAffix << 3) + (numLines << 3)); @@ -435,7 +532,7 @@ namespace Lucene.Net.Analysis.Hunspell // condition is optional if (ruleArgs.Length < 4) { - throw new Exception("The affix file contains a rule with less than four elements: " + line /*, reader.LineNumber */);// LUCENENET TODO: LineNumberReader + throw new FormatException("The affix file contains a rule with less than four elements: " + line /*, reader.LineNumber */);// LUCENENET TODO: LineNumberReader } char flag = flagParsingStrategy.ParseFlag(ruleArgs[1]); @@ -443,6 +540,7 @@ namespace Lucene.Net.Analysis.Hunspell string affixArg = ruleArgs[3]; char[] appendFlags = null; + // first: parse continuation classes out of affix int flagSep = affixArg.LastIndexOf('/'); if (flagSep != -1) { @@ -458,19 +556,22 @@ namespace Lucene.Net.Analysis.Hunspell Array.Sort(appendFlags); twoStageAffix = true; } - - // TODO: add test and fix zero-affix handling! + // zero affix -> empty string + if ("0".Equals(affixArg, StringComparison.Ordinal)) + { + affixArg = ""; + } string condition = ruleArgs.Length > 4 ? ruleArgs[4] : "."; // at least the gascon affix file has this issue - if (condition.StartsWith("[", StringComparison.Ordinal) && !condition.EndsWith("]", StringComparison.Ordinal)) + if (condition.StartsWith("[", StringComparison.Ordinal) && condition.IndexOf(']') == -1) { condition = condition + "]"; } // "dash hasn't got special meaning" (we must escape it) if (condition.IndexOf('-') >= 0) { - condition = condition.Replace("-", "\\-"); + condition = EscapeDash(condition); } string regex; @@ -543,12 +644,17 @@ namespace Lucene.Net.Analysis.Hunspell affixArg = cleaned.ToString(); } - if (!affixes.TryGetValue(affixArg, out IList<char?> list) || list == null) + if (isSuffix) { - affixes[affixArg] = list = new List<char?>(); + affixArg = new StringBuilder(affixArg).Reverse().ToString(); } - list.Add((char)currentAffix); + if (!affixes.TryGetValue(affixArg, out IList<int> list) || list == null) + { + affixes[affixArg] = list = new List<int>(); + } + + list.Add(currentAffix); currentAffix++; } } @@ -563,7 +669,7 @@ namespace Lucene.Net.Analysis.Hunspell string[] parts = whitespacePattern.Split(line).TrimEnd(); if (parts.Length != 3) { - throw new Exception("invalid syntax: " + line /*, reader.LineNumber */); // LUCENENET TODO: LineNumberReader + throw new FormatException("invalid syntax: " + line /*, reader.LineNumber */); // LUCENENET TODO: LineNumberReader } if (mappings.Put(parts[1], parts[2]) != null) { @@ -617,7 +723,7 @@ namespace Lucene.Net.Analysis.Hunspell // this test only at the end as ineffective but would allow lines only containing spaces: if (ch < 0) { - throw new Exception("Unexpected end of affix file." /*, 0*/); + throw new FormatException("Unexpected end of affix file." /*, 0*/); } continue; } @@ -646,7 +752,7 @@ namespace Lucene.Net.Analysis.Hunspell /// </summary> /// <param name="encoding"> Encoding to retrieve the <see cref="Encoding"/> instance for </param> /// <returns> <see cref="Encoding"/> for the given encoding <see cref="string"/> </returns> - // LUCENENET NOTE: This was getJavaEncoding in the original + // LUCENENET NOTE: This was getJavaEncoding in Lucene private Encoding GetSystemEncoding(string encoding) { if (string.IsNullOrEmpty(encoding)) @@ -713,12 +819,14 @@ namespace Lucene.Net.Analysis.Hunspell throw new ArgumentException("Unknown flag type: " + flagType); } - internal readonly char FLAG_SEPARATOR = (char)0x1f; // flag separator after escaping + internal const char FLAG_SEPARATOR = (char)0x1f; // flag separator after escaping + internal const char MORPH_SEPARATOR = (char)0x1e; // separator for boundary of entry (may be followed by morph data) internal virtual string UnescapeEntry(string entry) { StringBuilder sb = new StringBuilder(); - for (int i = 0; i < entry.Length; i++) + int end = MorphBoundary(entry); + for (int i = 0; i < end; i++) { char ch = entry[i]; if (ch == '\\' && i + 1 < entry.Length) @@ -730,14 +838,74 @@ namespace Lucene.Net.Analysis.Hunspell { sb.Append(FLAG_SEPARATOR); } + else if (ch == MORPH_SEPARATOR || ch == FLAG_SEPARATOR) + { + // BINARY EXECUTABLES EMBEDDED IN ZULU DICTIONARIES!!!!!!! + } else { sb.Append(ch); } } + sb.Append(MORPH_SEPARATOR); + if (end < entry.Length) + { + for (int i = end; i < entry.Length; i++) + { + char c = entry[i]; + if (c == FLAG_SEPARATOR || c == MORPH_SEPARATOR) + { + // BINARY EXECUTABLES EMBEDDED IN ZULU DICTIONARIES!!!!!!! + } + else + { + sb.Append(c); + } + } + } return sb.ToString(); } + internal static int MorphBoundary(string line) + { + int end = IndexOfSpaceOrTab(line, 0); + if (end == -1) + { + return line.Length; + } + while (end >= 0 && end < line.Length) + { + if (line[end] == '\t' || + end + 3 < line.Length && + Character.IsLetter(line[end + 1]) && + Character.IsLetter(line[end + 2]) && + line[end + 3] == ':') + { + break; + } + end = IndexOfSpaceOrTab(line, end + 1); + } + if (end == -1) + { + return line.Length; + } + return end; + } + + internal static int IndexOfSpaceOrTab(string text, int start) + { + int pos1 = text.IndexOf('\t', start); + int pos2 = text.IndexOf(' ', start); + if (pos1 >= 0 && pos2 >= 0) + { + return Math.Min(pos1, pos2); + } + else + { + return Math.Max(pos1, pos2); + } + } + /// <summary> /// Reads the dictionary file through the provided <see cref="Stream"/>s, building up the words map /// </summary> @@ -762,12 +930,30 @@ namespace Lucene.Net.Analysis.Hunspell while ((line = lines.ReadLine()) != null) { + // wild and unpredictable code comment rules + if (line == string.Empty || line[0] == '/' || line[0] == '#' || line[0] == '\t') + { + continue; + } line = UnescapeEntry(line); + // if we havent seen any stem exceptions, try to parse one + if (hasStemExceptions == false) + { + int morphStart = line.IndexOf(MORPH_SEPARATOR); + if (morphStart >= 0 && morphStart < line.Length) + { + hasStemExceptions = ParseStemException(line.Substring(morphStart + 1)) != null; + } + } if (needsInputCleaning) { int flagSep = line.LastIndexOf(FLAG_SEPARATOR); if (flagSep == -1) { + flagSep = line.IndexOf(MORPH_SEPARATOR); + } + if (flagSep == -1) + { string cleansed = CleanInput(line, sb); writer.Write(cleansed.ToString().GetBytes(Encoding.UTF8)); } @@ -805,7 +991,7 @@ namespace Lucene.Net.Analysis.Hunspell for (int i = scratch1.Length - 1; i >= 0; i--) { - if (scratch1.Bytes[scratch1.Offset + i] == this.FLAG_SEPARATOR) + if (scratch1.Bytes[scratch1.Offset + i] == FLAG_SEPARATOR || scratch1.Bytes[scratch1.Offset + i] == MORPH_SEPARATOR) { scratch1.Length = i; break; @@ -818,7 +1004,7 @@ namespace Lucene.Net.Analysis.Hunspell for (int i = scratch2.Length - 1; i >= 0; i--) { - if (scratch2.Bytes[scratch2.Offset + i] == this.FLAG_SEPARATOR) + if (scratch2.Bytes[scratch2.Offset + i] == FLAG_SEPARATOR || scratch2.Bytes[scratch2.Offset + i] == MORPH_SEPARATOR) { scratch2.Length = i; break; @@ -862,29 +1048,18 @@ namespace Lucene.Net.Analysis.Hunspell line2 = scratchLine.Utf8ToString(); string entry; char[] wordForm; + int end; - int flagSep = line2.LastIndexOf(FLAG_SEPARATOR); + int flagSep = line2.IndexOf(FLAG_SEPARATOR); if (flagSep == -1) { wordForm = NOFLAGS; - entry = line2; + end = line2.IndexOf(MORPH_SEPARATOR); + entry = line2.Substring(0, end); } else { - // note, there can be comments (morph description) after a flag. - // we should really look for any whitespace: currently just tab and space - int end = line2.IndexOf('\t', flagSep); - if (end == -1) - { - end = line2.Length; - } - int end2 = line2.IndexOf(' ', flagSep); - if (end2 == -1) - { - end2 = line2.Length; - } - end = Math.Min(end, end2); - + end = line2.IndexOf(MORPH_SEPARATOR); string flagPart = line2.Substring(flagSep + 1, end - (flagSep + 1)); if (aliasCount > 0) { @@ -895,6 +1070,23 @@ namespace Lucene.Net.Analysis.Hunspell Array.Sort(wordForm); entry = line2.Substring(0, flagSep - 0); } + // we possibly have morphological data + int stemExceptionID = 0; + if (hasStemExceptions && end + 1 < line2.Length) + { + string stemException = ParseStemException(line2.Substring(end + 1)); + if (stemException != null) + { + if (stemExceptionCount == stemExceptions.Length) + { + int newSize = ArrayUtil.Oversize(stemExceptionCount + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF); + stemExceptions = Arrays.CopyOf(stemExceptions, newSize); + } + stemExceptionID = stemExceptionCount + 1; // we use '0' to indicate no exception for the form + stemExceptions[stemExceptionCount++] = stemException; + } + } + // LUCENENET NOTE: CompareToOrdinal is an extension method that works similarly to // Java's String.compareTo method. int cmp = currentEntry == null ? 1 : entry.CompareToOrdinal(currentEntry); @@ -923,8 +1115,17 @@ namespace Lucene.Net.Analysis.Hunspell currentEntry = entry; currentOrds = new Int32sRef(); // must be this way } - currentOrds.Grow(currentOrds.Length + 1); - currentOrds.Int32s[currentOrds.Length++] = ord; + if (hasStemExceptions) + { + currentOrds.Grow(currentOrds.Length + 2); + currentOrds.Int32s[currentOrds.Length++] = ord; + currentOrds.Int32s[currentOrds.Length++] = stemExceptionID; + } + else + { + currentOrds.Grow(currentOrds.Length + 1); + currentOrds.Int32s[currentOrds.Length++] = ord; + } } } @@ -1002,6 +1203,54 @@ namespace Lucene.Net.Analysis.Hunspell } } + internal string GetStemException(int id) + { + return stemExceptions[id - 1]; + } + + private void ParseMorphAlias(string line) + { + if (morphAliases == null) + { + //first line should be the aliases count + int count = int.Parse(line.Substring(3), CultureInfo.InvariantCulture); + morphAliases = new string[count]; + } + else + { + string arg = line.Substring(2); // leave the space + morphAliases[morphAliasCount++] = arg; + } + } + + private string ParseStemException(string morphData) + { + // first see if its an alias + if (morphAliasCount > 0) + { + if (int.TryParse(morphData.Trim(), NumberStyles.Integer, CultureInfo.InvariantCulture, out int alias)) + { + morphData = morphAliases[alias - 1]; + } // else fine + } + // try to parse morph entry + int index = morphData.IndexOf(" st:", StringComparison.Ordinal); + if (index < 0) + { + index = morphData.IndexOf("\tst:", StringComparison.Ordinal); + } + if (index >= 0) + { + int endIndex = IndexOfSpaceOrTab(morphData, index + 1); + if (endIndex < 0) + { + endIndex = morphData.Length; + } + return morphData.Substring(index + 4, endIndex - (index + 4)); + } + return null; + } + /// <summary> /// Abstraction of the process of parsing flags taken from the affix and dic files /// </summary> @@ -1077,8 +1326,6 @@ namespace Lucene.Net.Analysis.Hunspell /// <summary> /// Implementation of <see cref="FlagParsingStrategy"/> that assumes each flag is encoded as two ASCII characters whose codes /// must be combined into a single character. - /// - /// TODO (rmuir) test /// </summary> private class DoubleASCIIFlagParsingStrategy : FlagParsingStrategy { @@ -1096,8 +1343,14 @@ namespace Lucene.Net.Analysis.Hunspell } for (int i = 0; i < rawFlags.Length; i += 2) { - char cookedFlag = (char)((int)rawFlags[i] + (int)rawFlags[i + 1]); - builder.Append(cookedFlag); + char f1 = rawFlags[i]; + char f2 = rawFlags[i + 1]; + if (f1 >= 256 || f2 >= 256) + { + throw new ArgumentException("Invalid flags (LONG flags must be double ASCII): " + rawFlags); + } + char combined = (char)(f1 << 8 | f2); + builder.Append(combined); } char[] flags = new char[builder.Length]; @@ -1127,7 +1380,7 @@ namespace Lucene.Net.Analysis.Hunspell if (ignoreCase && iconv == null) { // if we have no input conversion mappings, do this on-the-fly - ch = char.ToLowerInvariant(ch); + ch = CaseFold(ch); } reuse.Append(ch); @@ -1147,7 +1400,7 @@ namespace Lucene.Net.Analysis.Hunspell { for (int i = 0; i < reuse.Length; i++) { - reuse[i] = char.ToLowerInvariant(reuse[i]); + reuse[i] = CaseFold(reuse[i]); } } } @@ -1155,6 +1408,30 @@ namespace Lucene.Net.Analysis.Hunspell return reuse.ToString(); } + // folds single character (according to LANG if present) + internal char CaseFold(char c) + { + if (alternateCasing) + { + if (c == 'I') + { + return 'ı'; + } + else if (c == 'İ') + { + return 'i'; + } + else + { + return char.ToLowerInvariant(c); + } + } + else + { + return char.ToLowerInvariant(c); + } + } + // TODO: this could be more efficient! internal static void ApplyMappings(FST<CharsRef> fst, StringBuilder sb) { diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/HunspellStemFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/HunspellStemFilter.cs index d4526d6..d6c15e1 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/HunspellStemFilter.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/HunspellStemFilter.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using Lucene.Net.Analysis.TokenAttributes; using Lucene.Net.Util; using System.Collections.Generic; diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/HunspellStemFilterFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/HunspellStemFilterFactory.cs index f5c044b..1752eab 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/HunspellStemFilterFactory.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/HunspellStemFilterFactory.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using J2N.Text; using Lucene.Net.Analysis.Util; using Lucene.Net.Util; diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/ISO8859_14Decoder.cs b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/ISO8859_14Decoder.cs index a406b68..6078954 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/ISO8859_14Decoder.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/ISO8859_14Decoder.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using Lucene.Net.Support; using System; using System.Text; diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Stemmer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Stemmer.cs index ee0cb6b..41d31fc 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Stemmer.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Hunspell/Stemmer.cs @@ -1,10 +1,11 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using J2N.Numerics; using Lucene.Net.Analysis.Util; using Lucene.Net.Diagnostics; using Lucene.Net.Store; using Lucene.Net.Util; using Lucene.Net.Util.Automaton; +using Lucene.Net.Util.Fst; using System; using System.Collections.Generic; using System.IO; @@ -44,6 +45,10 @@ namespace Lucene.Net.Analysis.Hunspell private readonly StringBuilder scratchSegment = new StringBuilder(); private char[] scratchBuffer = new char[32]; + // its '1' if we have no stem exceptions, otherwise every other form + // is really an ID pointing to the exception table + private readonly int formStep; + /// <summary> /// Constructs a new Stemmer which will use the provided <see cref="Dictionary"/> to create its stems. /// </summary> @@ -52,6 +57,20 @@ namespace Lucene.Net.Analysis.Hunspell { this.dictionary = dictionary; this.affixReader = new ByteArrayDataInput(dictionary.affixData); + for (int level = 0; level < 3; level++) + { + if (dictionary.prefixes != null) + { + prefixArcs[level] = new FST.Arc<Int32sRef>(); + prefixReaders[level] = dictionary.prefixes.GetBytesReader(); + } + if (dictionary.suffixes != null) + { + suffixArcs[level] = new FST.Arc<Int32sRef>(); + suffixReaders[level] = dictionary.suffixes.GetBytesReader(); + } + } + formStep = dictionary.hasStemExceptions ? 2 : 1; } /// <summary> @@ -84,18 +103,133 @@ namespace Lucene.Net.Analysis.Hunspell word = scratchBuffer; } + int caseType = CaseOf(word, length); + if (caseType == UPPER_CASE) + { + // upper: union exact, title, lower + CaseFoldTitle(word, length); + CaseFoldLower(titleBuffer, length); + IList<CharsRef> list = DoStem(word, length, false); + list.AddRange(DoStem(titleBuffer, length, true)); + list.AddRange(DoStem(lowerBuffer, length, true)); + return list; + } + else if (caseType == TITLE_CASE) + { + // title: union exact, lower + CaseFoldLower(word, length); + IList<CharsRef> list = DoStem(word, length, false); + list.AddRange(DoStem(lowerBuffer, length, true)); + return list; + } + else + { + // exact match only + return DoStem(word, length, false); + } + } + + // temporary buffers for case variants + private char[] lowerBuffer = new char[8]; + private char[] titleBuffer = new char[8]; + + private const int EXACT_CASE = 0; + private const int TITLE_CASE = 1; + private const int UPPER_CASE = 2; + + // returns EXACT_CASE,TITLE_CASE, or UPPER_CASE type for the word + private int CaseOf(char[] word, int length) + { + if (dictionary.ignoreCase || length == 0 || !char.IsUpper(word[0])) + { + return EXACT_CASE; + } + + // determine if we are title or lowercase (or something funky, in which its exact) + bool seenUpper = false; + bool seenLower = false; + for (int i = 1; i < length; i++) + { + bool v = char.IsUpper(word[i]); + seenUpper |= v; + seenLower |= !v; + } + + if (!seenLower) + { + return UPPER_CASE; + } + else if (!seenUpper) + { + return TITLE_CASE; + } + else + { + return EXACT_CASE; + } + } + + // folds titlecase variant of word to titleBuffer + private void CaseFoldTitle(char[] word, int length) + { + titleBuffer = ArrayUtil.Grow(titleBuffer, length); + System.Array.Copy(word, 0, titleBuffer, 0, length); + for (int i = 1; i < length; i++) + { + titleBuffer[i] = dictionary.CaseFold(titleBuffer[i]); + } + } + + // folds lowercase variant of word (title cased) to lowerBuffer + private void CaseFoldLower(char[] word, int length) + { + lowerBuffer = ArrayUtil.Grow(lowerBuffer, length); + System.Array.Copy(word, 0, lowerBuffer, 0, length); + lowerBuffer[0] = dictionary.CaseFold(lowerBuffer[0]); + } + + private IList<CharsRef> DoStem(char[] word, int length, bool caseVariant) + { List<CharsRef> stems = new List<CharsRef>(); Int32sRef forms = dictionary.LookupWord(word, 0, length); if (forms != null) { - // TODO: some forms should not be added, e.g. ONLYINCOMPOUND - // just because it exists, does not make it valid... - for (int i = 0; i < forms.Length; i++) + for (int i = 0; i < forms.Length; i += formStep) { - stems.Add(NewStem(word, length)); + bool checkKeepCase = caseVariant && dictionary.keepcase != -1; + bool checkNeedAffix = dictionary.needaffix != -1; + bool checkOnlyInCompound = dictionary.onlyincompound != -1; + if (checkKeepCase || checkNeedAffix || checkOnlyInCompound) + { + dictionary.flagLookup.Get(forms.Int32s[forms.Offset + i], scratch); + char[] wordFlags = Dictionary.DecodeFlags(scratch); + // we are looking for a case variant, but this word does not allow it + if (checkKeepCase && Dictionary.HasFlag(wordFlags, (char)dictionary.keepcase)) + { + continue; + } + // we can't add this form, its a pseudostem requiring an affix + if (checkNeedAffix && Dictionary.HasFlag(wordFlags, (char)dictionary.needaffix)) + { + continue; + } + // we can't add this form, it only belongs inside a compound word + if (checkOnlyInCompound && Dictionary.HasFlag(wordFlags, (char)dictionary.onlyincompound)) + { + continue; + } + } + stems.Add(NewStem(word, length, forms, i)); } } - stems.AddRange(Stem(word, length, -1, -1, -1, 0, true, true, false, false)); + try + { + stems.AddRange(Stem(word, length, -1, -1, -1, 0, true, true, false, false, caseVariant)); + } + catch (IOException bogus) + { + throw new Exception(bogus.ToString(), bogus); + } return stems; } @@ -128,12 +262,37 @@ namespace Lucene.Net.Analysis.Hunspell return deduped; } - private CharsRef NewStem(char[] buffer, int length) + private CharsRef NewStem(char[] buffer, int length, Int32sRef forms, int formID) { + string exception; + if (dictionary.hasStemExceptions) + { + int exceptionID = forms.Int32s[forms.Offset + formID + 1]; + if (exceptionID > 0) + { + exception = dictionary.GetStemException(exceptionID); + } + else + { + exception = null; + } + } + else + { + exception = null; + } + if (dictionary.needsOutputCleaning) { scratchSegment.Length = 0; - scratchSegment.Append(buffer, 0, length); + if (exception != null) + { + scratchSegment.Append(exception); + } + else + { + scratchSegment.Append(buffer, 0, length); + } try { Dictionary.ApplyMappings(dictionary.oconv, scratchSegment); @@ -148,12 +307,26 @@ namespace Lucene.Net.Analysis.Hunspell } else { - return new CharsRef(buffer, 0, length); + if (exception != null) + { + return new CharsRef(exception); + } + else + { + return new CharsRef(buffer, 0, length); + } } } // ================================================= Helper Methods ================================================ + // some state for traversing FSTs + private readonly FST.BytesReader[] prefixReaders = new FST.BytesReader[3]; + private readonly FST.Arc<Int32sRef>[] prefixArcs = new FST.Arc<Int32sRef>[3]; + + private readonly FST.BytesReader[] suffixReaders = new FST.BytesReader[3]; + private readonly FST.Arc<Int32sRef>[] suffixArcs = new FST.Arc<Int32sRef>[3]; + /// <summary> /// Generates a list of stems for the provided word /// </summary> @@ -170,22 +343,46 @@ namespace Lucene.Net.Analysis.Hunspell /// but two prefixes (COMPLEXPREFIXES) or two suffixes must have continuation requirements to recurse. </param> /// <param name="circumfix"> true if the previous prefix removal was signed as a circumfix /// this means inner most suffix must also contain circumfix flag. </param> + /// <param name="caseVariant"> true if we are searching for a case variant. if the word has KEEPCASE flag it cannot succeed. </param> /// <returns> <see cref="IList{CharsRef}"/> of stems, or empty list if no stems are found </returns> - private IList<CharsRef> Stem(char[] word, int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, bool doPrefix, bool doSuffix, bool previousWasPrefix, bool circumfix) + private IList<CharsRef> Stem(char[] word, int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, bool doPrefix, bool doSuffix, bool previousWasPrefix, bool circumfix, bool caseVariant) { - // TODO: allow this stuff to be reused by tokenfilter List<CharsRef> stems = new List<CharsRef>(); if (doPrefix && dictionary.prefixes != null) { - for (int i = length - 1; i >= 0; i--) + FST<Int32sRef> fst = dictionary.prefixes; + Outputs<Int32sRef> outputs = fst.Outputs; + FST.BytesReader bytesReader = prefixReaders[recursionDepth]; + FST.Arc<Int32sRef> arc = prefixArcs[recursionDepth]; + fst.GetFirstArc(arc); + Int32sRef NO_OUTPUT = outputs.NoOutput; + Int32sRef output = NO_OUTPUT; + int limit = dictionary.fullStrip ? length : length - 1; + for (int i = 0; i < limit; i++) { - Int32sRef prefixes = dictionary.LookupPrefix(word, 0, i); - if (prefixes == null) + if (i > 0) + { + int ch = word[i - 1]; + if (fst.FindTargetArc(ch, arc, arc, bytesReader) == null) + { + break; + } + else if (arc.Output != NO_OUTPUT) + { + output = fst.Outputs.Add(output, arc.Output); + } + } + Int32sRef prefixes; // LUCENENET: IDE0059 - Removed unnecessary value assignment + if (!arc.IsFinal) { continue; } + else + { + prefixes = fst.Outputs.Add(output, arc.NextFinalOutput); + } for (int j = 0; j < prefixes.Length; j++) { @@ -205,7 +402,17 @@ namespace Lucene.Net.Analysis.Hunspell bool compatible; if (recursionDepth == 0) { - compatible = true; + if (dictionary.onlyincompound == -1) + { + compatible = true; + } + else + { + // check if affix is allowed in a non-compound word + dictionary.flagLookup.Get(append, scratch); + char[] appendFlags = Dictionary.DecodeFlags(scratch); + compatible = !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound); + } } else if (crossProduct) { @@ -213,7 +420,9 @@ namespace Lucene.Net.Analysis.Hunspell dictionary.flagLookup.Get(append, scratch); char[] appendFlags = Dictionary.DecodeFlags(scratch); if (Debugging.AssertsEnabled) Debugging.Assert(prevFlag >= 0); - compatible = HasCrossCheckedFlag((char)prevFlag, appendFlags, false); + bool allowed = dictionary.onlyincompound == -1 || + !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound); + compatible = allowed && HasCrossCheckedFlag((char)prevFlag, appendFlags, false); } else { @@ -238,7 +447,7 @@ namespace Lucene.Net.Analysis.Hunspell Array.Copy(dictionary.stripData, stripStart, strippedWord, 0, stripLength); Array.Copy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength); - IList<CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, prefix, -1, recursionDepth, true, circumfix); + IList<CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, prefix, -1, recursionDepth, true, circumfix, caseVariant); stems.AddRange(stemList); } @@ -248,13 +457,37 @@ namespace Lucene.Net.Analysis.Hunspell if (doSuffix && dictionary.suffixes != null) { - for (int i = 0; i < length; i++) + FST<Int32sRef> fst = dictionary.suffixes; + Outputs<Int32sRef> outputs = fst.Outputs; + FST.BytesReader bytesReader = suffixReaders[recursionDepth]; + FST.Arc<Int32sRef> arc = suffixArcs[recursionDepth]; + fst.GetFirstArc(arc); + Int32sRef NO_OUTPUT = outputs.NoOutput; + Int32sRef output = NO_OUTPUT; + int limit = dictionary.fullStrip ? 0 : 1; + for (int i = length; i >= limit; i--) { - Int32sRef suffixes = dictionary.LookupSuffix(word, i, length - i); - if (suffixes == null) + if (i < length) + { + int ch = word[i]; + if (fst.FindTargetArc(ch, arc, arc, bytesReader) == null) + { + break; + } + else if (arc.Output != NO_OUTPUT) + { + output = fst.Outputs.Add(output, arc.Output); + } + } + Int32sRef suffixes; // LUCENENET: IDE0059 - Removed unnecessary value assignment + if (!arc.IsFinal) { continue; } + else + { + suffixes = fst.Outputs.Add(output, arc.NextFinalOutput); + } for (int j = 0; j < suffixes.Length; j++) { @@ -274,7 +507,17 @@ namespace Lucene.Net.Analysis.Hunspell bool compatible; if (recursionDepth == 0) { - compatible = true; + if (dictionary.onlyincompound == -1) + { + compatible = true; + } + else + { + // check if affix is allowed in a non-compound word + dictionary.flagLookup.Get(append, scratch); + char[] appendFlags = Dictionary.DecodeFlags(scratch); + compatible = !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound); + } } else if (crossProduct) { @@ -282,6 +525,8 @@ namespace Lucene.Net.Analysis.Hunspell dictionary.flagLookup.Get(append, scratch); char[] appendFlags = Dictionary.DecodeFlags(scratch); if (Debugging.AssertsEnabled) Debugging.Assert(prevFlag >= 0); + bool allowed = dictionary.onlyincompound == -1 || + !Dictionary.HasFlag(appendFlags, (char)dictionary.onlyincompound); compatible = HasCrossCheckedFlag((char)prevFlag, appendFlags, previousWasPrefix); } else @@ -307,7 +552,7 @@ namespace Lucene.Net.Analysis.Hunspell Array.Copy(word, 0, strippedWord, 0, deAffixedLength); Array.Copy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength); - IList<CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, suffix, prefixFlag, recursionDepth, false, circumfix); + IList<CharsRef> stemList = ApplyAffix(strippedWord, strippedWord.Length, suffix, prefixFlag, recursionDepth, false, circumfix, caseVariant); stems.AddRange(stemList); } @@ -361,8 +606,9 @@ namespace Lucene.Net.Analysis.Hunspell /// <param name="prefix"> true if we are removing a prefix (false if its a suffix) </param> /// <param name="circumfix"> true if the previous prefix removal was signed as a circumfix /// this means inner most suffix must also contain circumfix flag. </param> + /// <param name="caseVariant"> true if we are searching for a case variant. if the word has KEEPCASE flag it cannot succeed. </param> /// <returns> <see cref="IList{CharsRef}"/> of stems for the word, or an empty list if none are found </returns> - internal IList<CharsRef> ApplyAffix(char[] strippedWord, int length, int affix, int prefixFlag, int recursionDepth, bool prefix, bool circumfix) + internal IList<CharsRef> ApplyAffix(char[] strippedWord, int length, int affix, int prefixFlag, int recursionDepth, bool prefix, bool circumfix, bool caseVariant) { // TODO: just pass this in from before, no need to decode it twice affixReader.Position = 8 * affix; @@ -378,7 +624,7 @@ namespace Lucene.Net.Analysis.Hunspell Int32sRef forms = dictionary.LookupWord(strippedWord, 0, length); if (forms != null) { - for (int i = 0; i < forms.Length; i++) + for (int i = 0; i < forms.Length; i += formStep) { dictionary.flagLookup.Get(forms.Int32s[forms.Offset + i], scratch); char[] wordFlags = Dictionary.DecodeFlags(scratch); @@ -410,7 +656,18 @@ namespace Lucene.Net.Analysis.Hunspell continue; } } - stems.Add(NewStem(strippedWord, length)); + + // we are looking for a case variant, but this word does not allow it + if (caseVariant && dictionary.keepcase != -1 && Dictionary.HasFlag(wordFlags, (char)dictionary.keepcase)) + { + continue; + } + // we aren't decompounding (yet) + if (dictionary.onlyincompound != -1 && Dictionary.HasFlag(wordFlags, (char)dictionary.onlyincompound)) + { + continue; + } + stems.Add(NewStem(strippedWord, length, forms, i)); } } } @@ -432,14 +689,14 @@ namespace Lucene.Net.Analysis.Hunspell // we took away the first prefix. // COMPLEXPREFIXES = true: combine with a second prefix and another suffix // COMPLEXPREFIXES = false: combine with a suffix - stems.AddRange(Stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes && dictionary.twoStageAffix, true, true, circumfix)); + stems.AddRange(Stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes && dictionary.twoStageAffix, true, true, circumfix, caseVariant)); } else if (dictionary.complexPrefixes == false && dictionary.twoStageAffix) { // we took away a suffix. // COMPLEXPREFIXES = true: we don't recurse! only one suffix allowed // COMPLEXPREFIXES = false: combine with another suffix - stems.AddRange(Stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix)); + stems.AddRange(Stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix, caseVariant)); } } else if (recursionDepth == 1) @@ -447,12 +704,12 @@ namespace Lucene.Net.Analysis.Hunspell if (prefix && dictionary.complexPrefixes) { // we took away the second prefix: go look for another suffix - stems.AddRange(Stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true, circumfix)); + stems.AddRange(Stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true, circumfix, caseVariant)); } else if (prefix == false && dictionary.complexPrefixes == false && dictionary.twoStageAffix) { // we took away a prefix, then a suffix: go look for another suffix - stems.AddRange(Stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix)); + stems.AddRange(Stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix, caseVariant)); } } } diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/Foo.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/Foo.cs new file mode 100644 index 0000000..467d181 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/Foo.cs @@ -0,0 +1,12 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Lucene.Net.Analysis.Hunspell +{ + class Foo + { + } +} diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/StemmerTestBase.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/StemmerTestBase.cs index 0b3db96..ce9854f 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/StemmerTestBase.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/StemmerTestBase.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using Lucene.Net.Support; using Lucene.Net.Util; using System; diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/Test64kAffixes.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/Test64kAffixes.cs new file mode 100644 index 0000000..700f0d2 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/Test64kAffixes.cs @@ -0,0 +1,69 @@ +// Lucene version compatibility level 4.10.4 +using J2N; +using Lucene.Net.Util; +using NUnit.Framework; +using System.Collections.Generic; +using System.IO; +using System.Text; + +namespace Lucene.Net.Analysis.Hunspell +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + // Tests that > 64k affixes actually works and doesnt overflow some internal int + public class Test64kAffixes : LuceneTestCase + { + [Test] + public void Test() + { + DirectoryInfo tempDir = CreateTempDir("64kaffixes"); + FileInfo affix = new FileInfo(System.IO.Path.Combine(tempDir.FullName, "64kaffixes.aff")); + FileInfo dict = new FileInfo(System.IO.Path.Combine(tempDir.FullName, "64kaffixes.dic")); + + using var affixWriter = new StreamWriter( + new FileStream(affix.FullName, FileMode.OpenOrCreate), Encoding.UTF8); + + // 65k affixes with flag 1, then an affix with flag 2 + affixWriter.Write("SET UTF-8\nFLAG num\nSFX 1 Y 65536\n"); + for (int i = 0; i < 65536; i++) + { + affixWriter.Write("SFX 1 0 " + i.ToHexString() + " .\n"); + } + affixWriter.Write("SFX 2 Y 1\nSFX 2 0 s\n"); + affixWriter.Dispose(); + + using var dictWriter = new StreamWriter( + new FileStream(dict.FullName, FileMode.OpenOrCreate), Encoding.UTF8); + + + // drink signed with affix 2 (takes -s) + dictWriter.Write("1\ndrink/2\n"); + dictWriter.Dispose(); + + using Stream affStream = new FileStream(affix.FullName, FileMode.OpenOrCreate); + using Stream dictStream = new FileStream(dict.FullName, FileMode.OpenOrCreate); + + Dictionary dictionary = new Dictionary(affStream, dictStream); + Stemmer stemmer = new Stemmer(dictionary); + // drinks should still stem to drink + IList<CharsRef> stems = stemmer.Stem("drinks"); + assertEquals(1, stems.size()); + assertEquals("drink", stems[0].ToString()); + } + } +} diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestAllDictionaries.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestAllDictionaries.cs index 5b1c387..f19734f 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestAllDictionaries.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestAllDictionaries.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using Lucene.Net.Diagnostics; using Lucene.Net.Util; using NUnit.Framework; @@ -195,7 +195,7 @@ namespace Lucene.Net.Analysis.Hunspell [Test] public virtual void TestOneDictionary() { - string toTest = "hu_HU.zip"; + string toTest = "zu_ZA.zip"; for (int i = 0; i < tests.Length; i++) { if (tests[i].Equals(toTest, StringComparison.Ordinal)) diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestAllDictionaries2.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestAllDictionaries2.cs index 2d187b2..a05e8d9 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestAllDictionaries2.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestAllDictionaries2.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using Lucene.Net.Diagnostics; using Lucene.Net.Util; using NUnit.Framework; @@ -54,7 +54,7 @@ namespace Lucene.Net.Analysis.Hunspell "afrikaans_spell_checker-20110323-fx+tb+fn+sm.xpi", "dictionaries/af-ZA.dic", "dictionaries/af-ZA.aff", "albanisches_worterbuch-1.6.9-fx+tb+sm+fn.xpi", "dictionaries/sq.dic", "dictionaries/sq.aff", "amharic_spell_checker-0.4-fx+fn+tb+sm.xpi", "dictionaries/am_ET.dic", "dictionaries/am_ET.aff", -//LUCENENET BUG: duplicate mapping of character "arabic_spell_checking_dictionary-3.2.20120321-fx+tb.xpi", "dictionaries/ar.dic", "dictionaries/ar.aff", + "arabic_spell_checking_dictionary-3.2.20120321-fx+tb.xpi", "dictionaries/ar.dic", "dictionaries/ar.aff", "armenian_spell_checker_dictionary-0.32-fx+tb+sm.xpi", "dictionaries/hy_AM.dic", "dictionaries/hy_AM.aff", "azerbaijani_spell_checker-0.3-fx+tb+fn+sm+sb.xpi", "dictionaries/az-Latn-AZ.dic", "dictionaries/az-Latn-AZ.aff", "belarusian_classic_dictionary-0.1.2-tb+fx+sm.xpi", "dictionaries/be-classic.dic", "dictionaries/be-classic.aff", @@ -111,7 +111,7 @@ namespace Lucene.Net.Analysis.Hunspell "hausa_spelling_dictionary-0.2-tb+fx.xpi", "dictionaries/ha-GH.dic", "dictionaries/ha-GH.aff", "hebrew_spell_checking_dictionary_from_hspell-1.2.0.1-fx+sm+tb.xpi", "dictionaries/he.dic", "dictionaries/he.aff", "hindi_spell_checker-0.4-fx+tb+sm+sb+fn.xpi", "dictionaries/hi_IN.dic", "dictionaries/hi_IN.aff", -//LUCENENET BUG: Invalid ICONV flag "hungarian_dictionary-1.6.1.1-fx+tb+sm+fn.xpi", "dictionaries/hu.dic", "dictionaries/hu.aff", + "hungarian_dictionary-1.6.1.1-fx+tb+sm+fn.xpi", "dictionaries/hu.dic", "dictionaries/hu.aff", //BUG: has no encoding declaration "icelandic_dictionary-1.3-fx+tb+sm.xpi", "dictionaries/is.dic", "dictionaries/is.aff", "kamus_pengecek_ejaan_bahasa_indonesia-1.1-fx+tb.xpi", "dictionaries/id.dic", "dictionaries/id.aff", "kannada_spell_checker-2.0.1-tb+sm+fn+an+fx.xpi", "dictionaries/kn.dic", "dictionaries/kn.aff", @@ -146,7 +146,7 @@ namespace Lucene.Net.Analysis.Hunspell "slovar_za_slovenski_jezik-0.1.1.1-fx+tb+sm.xpi", "dictionaries/sl.dic", "dictionaries/sl.aff", "songhay_spell_checker-0.03-fx+tb+sm.xpi", "dictionaries/Songhay - Mali.dic", "dictionaries/Songhay - Mali.aff", "southern_sotho_spell_checker-20110323-tb+fn+fx+sm.xpi", "dictionaries/st-ZA.dic", "dictionaries/st-ZA.aff", -//LUCENENET BUG: Invalid ICONV flag "sownik_acinski-0.41.20110603-tb+fx+sm.xpi", "dictionaries/la.dic", "dictionaries/la.aff", + "sownik_acinski-0.41.20110603-tb+fx+sm.xpi", "dictionaries/la.dic", "dictionaries/la.aff", "sownik_jezyka_dolnouzyckiego-1.4.8-an+fx+tb+fn+sm.xpi", "dictionaries/dsb.dic", "dictionaries/dsb.aff", "srpska_latinica-0.1-fx+tb+sm.xpi", "dictionaries/Srpski_latinica.dic", "dictionaries/Srpski_latinica.aff", "svenska_fria_ordlistan-1.1-tb+sm+fx.xpi", "dictionaries/sv.dic", "dictionaries/sv.aff", @@ -171,12 +171,11 @@ namespace Lucene.Net.Analysis.Hunspell "verificador_ortografico_para_portugues_do_brasil-2.3-3.2b1-tb+sm+fn+fx.xpi", "dictionaries/pt_BR.dic", "dictionaries/pt_BR.aff", "vietnamese_dictionary-2.1.0.159-an+sm+tb+fx+fn.xpi", "dictionaries/vi-DauCu.dic", "dictionaries/vi-DauCu.aff", "vietnamese_dictionary-2.1.0.159-an+sm+tb+fx+fn.xpi", "dictionaries/vi-DauMoi.dic", "dictionaries/vi-DauMoi.aff", - "woordenboek_nederlands-3.1.1-sm+tb+fx+fn.xpi", "dictionaries/nl.dic", "dictionaries/nl.aff", +// LUCENENET BUG: System.ArgumentException : expected only one flag, got: Kc "woordenboek_nederlands-3.1.1-sm+tb+fx+fn.xpi", "dictionaries/nl.dic", "dictionaries/nl.aff", "xhosa_spell_checker-20110323-tb+fn+fx+sm.xpi", "dictionaries/xh-ZA.dic", "dictionaries/xh-ZA.aff", "xuxen-4.0.1-fx+tb+sm.xpi", "dictionaries/eu.dic", "dictionaries/eu.aff", "yiddish_spell_checker_yivo-0.0.3-sm+fn+fx+tb.xpi", "dictionaries/yi.dic", "dictionaries/yi.aff", "zulu_spell_checker-20110323-tb+fn+fx+sm.xpi", "dictionaries/zu-ZA.dic", "dictionaries/zu-ZA.aff", - }; [Test] @@ -211,9 +210,7 @@ namespace Lucene.Net.Analysis.Hunspell [Test] public virtual void TestOneDictionary() { - //string toTest = "hungarian_dictionary-1.6.1.1-fx+tb+sm+fn.xpi"; - // LUCENENET: We can't test Hungarian because of an invalid flag. Switching to Lithuanian. - string toTest = "lithuanian_spelling_check_dictionary-1.3-fx+tb+sm+fn.xpi"; + string toTest = "hungarian_dictionary-1.6.1.1-fx+tb+sm+fn.xpi"; for (int i = 0; i < tests.Length; i++) { if (tests[i].Equals(toTest, StringComparison.Ordinal)) diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestAlternateCasing.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestAlternateCasing.cs new file mode 100644 index 0000000..33294b7 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestAlternateCasing.cs @@ -0,0 +1,67 @@ +// Lucene version compatibility level 4.10.4 +using NUnit.Framework; + +namespace Lucene.Net.Analysis.Hunspell +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + public class TestAlternateCasing : StemmerTestBase + { + public override void BeforeClass() + { + base.BeforeClass(); + Init("alternate-casing.aff", "alternate-casing.dic"); + } + + [Test] + public void TestPossibilities() + { + AssertStemsTo("drink", "drink"); + AssertStemsTo("DRİNK", "drink"); + AssertStemsTo("DRINK"); + AssertStemsTo("drinki", "drink"); + AssertStemsTo("DRİNKİ", "drink"); + AssertStemsTo("DRİNKI"); + AssertStemsTo("DRINKI"); + AssertStemsTo("DRINKİ"); + AssertStemsTo("idrink", "drink"); + AssertStemsTo("İDRİNK", "drink"); + AssertStemsTo("IDRİNK"); + AssertStemsTo("IDRINK"); + AssertStemsTo("İDRINK"); + AssertStemsTo("idrinki", "drink"); + AssertStemsTo("İDRİNKİ", "drink"); + AssertStemsTo("rıver", "rıver"); + AssertStemsTo("RIVER", "rıver"); + AssertStemsTo("RİVER"); + AssertStemsTo("rıverı", "rıver"); + AssertStemsTo("RIVERI", "rıver"); + AssertStemsTo("RİVERI"); + AssertStemsTo("RİVERİ"); + AssertStemsTo("RIVERİ"); + AssertStemsTo("ırıver", "rıver"); + AssertStemsTo("IRIVER", "rıver"); + AssertStemsTo("IRİVER"); + AssertStemsTo("İRİVER"); + AssertStemsTo("İRIVER"); + AssertStemsTo("ırıverı", "rıver"); + AssertStemsTo("IRIVERI", "rıver"); + AssertStemsTo("Irıverı", "rıver"); + } + } +} diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestCaseInsensitive.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestCaseInsensitive.cs index 141cc9b..dddd520 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestCaseInsensitive.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestCaseInsensitive.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestCaseSensitive.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestCaseSensitive.cs new file mode 100644 index 0000000..fcdd361 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestCaseSensitive.cs @@ -0,0 +1,71 @@ +// Lucene version compatibility level 4.10.4 +using NUnit.Framework; + +namespace Lucene.Net.Analysis.Hunspell +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + public class TestCaseSensitive : StemmerTestBase + { + public override void BeforeClass() + { + base.BeforeClass(); + Init("casesensitive.aff", "casesensitive.dic"); + } + + [Test] + public void TestAllPossibilities() + { + AssertStemsTo("drink", "drink"); + AssertStemsTo("drinks", "drink"); + AssertStemsTo("drinkS", "drink"); + AssertStemsTo("gooddrinks", "drink"); + AssertStemsTo("Gooddrinks", "drink", "drink"); + AssertStemsTo("GOODdrinks", "drink"); + AssertStemsTo("gooddrinkS", "drink"); + AssertStemsTo("GooddrinkS", "drink"); + AssertStemsTo("gooddrink", "drink"); + AssertStemsTo("Gooddrink", "drink", "drink"); + AssertStemsTo("GOODdrink", "drink"); + AssertStemsTo("Drink", "drink", "Drink"); + AssertStemsTo("Drinks", "drink", "Drink"); + AssertStemsTo("DrinkS", "Drink"); + AssertStemsTo("goodDrinks", "Drink"); + AssertStemsTo("GoodDrinks", "Drink"); + AssertStemsTo("GOODDrinks", "Drink"); + AssertStemsTo("goodDrinkS", "Drink"); + AssertStemsTo("GoodDrinkS", "Drink"); + AssertStemsTo("GOODDrinkS", "Drink"); + AssertStemsTo("goodDrink", "Drink"); + AssertStemsTo("GoodDrink", "Drink"); + AssertStemsTo("GOODDrink", "Drink"); + AssertStemsTo("DRINK", "DRINK", "drink", "Drink"); + AssertStemsTo("DRINKs", "DRINK"); + AssertStemsTo("DRINKS", "DRINK", "drink", "Drink"); + AssertStemsTo("goodDRINKs", "DRINK"); + AssertStemsTo("GoodDRINKs", "DRINK"); + AssertStemsTo("GOODDRINKs", "DRINK"); + AssertStemsTo("goodDRINKS", "DRINK"); + AssertStemsTo("GoodDRINKS", "DRINK"); + AssertStemsTo("GOODDRINKS", "DRINK", "drink", "drink"); + AssertStemsTo("goodDRINK", "DRINK"); + AssertStemsTo("GoodDRINK", "DRINK"); + AssertStemsTo("GOODDRINK", "DRINK", "drink", "drink"); + } + } +} diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestCircumfix.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestCircumfix.cs index be6b464..c54b741 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestCircumfix.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestCircumfix.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestComplexPrefix.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestComplexPrefix.cs index de47da9..63db496 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestComplexPrefix.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestComplexPrefix.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestCondition.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestCondition.cs index 4bc54d4..a4e5c94 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestCondition.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestCondition.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestCondition2.cs similarity index 80% copy from src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs copy to src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestCondition2.cs index 4d120e8..186f3ca 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestCondition2.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell @@ -20,19 +20,18 @@ namespace Lucene.Net.Analysis.Hunspell * limitations under the License. */ - public class TestHomonyms : StemmerTestBase + public class TestCondition2 : StemmerTestBase { - - [OneTimeSetUp] public override void BeforeClass() { base.BeforeClass(); - Init("homonyms.aff", "homonyms.dic"); + Init("condition2.aff", "condition2.dic"); } + [Test] - public virtual void TestExamples() + public void TestStemming() { - AssertStemsTo("works", "work", "work"); + AssertStemsTo("monopolies", "monopoly"); } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestConv.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestConv.cs index fe3e44f..912cb9c 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestConv.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestConv.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestDependencies.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestDependencies.cs index 2f34243..cdfd87e 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestDependencies.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestDependencies.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestDictionary.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestDictionary.cs index 71e42fe..4b85b2d 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestDictionary.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestDictionary.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using J2N.Text; using Lucene.Net.Util; using Lucene.Net.Util.Fst; @@ -9,7 +9,6 @@ using System.Text; namespace Lucene.Net.Analysis.Hunspell { - /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestDoubleEscape.cs similarity index 80% copy from src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs copy to src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestDoubleEscape.cs index 4d120e8..c376766 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestDoubleEscape.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell @@ -20,19 +20,18 @@ namespace Lucene.Net.Analysis.Hunspell * limitations under the License. */ - public class TestHomonyms : StemmerTestBase + public class TestDoubleEscape : StemmerTestBase { - - [OneTimeSetUp] public override void BeforeClass() { base.BeforeClass(); - Init("homonyms.aff", "homonyms.dic"); + Init("double-escaped.aff", "double-escaped.dic"); } + [Test] - public virtual void TestExamples() + public void TestStemming() { - AssertStemsTo("works", "work", "work"); + AssertStemsTo("adubo", "adubar"); } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestEscaped.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestEscaped.cs index df41107..4b77694 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestEscaped.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestEscaped.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestFlagLong.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestFlagLong.cs index 4da446f..31baf60 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestFlagLong.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestFlagLong.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell @@ -34,6 +34,7 @@ namespace Lucene.Net.Analysis.Hunspell AssertStemsTo("foo", "foo"); AssertStemsTo("foos", "foo"); AssertStemsTo("fooss"); + AssertStemsTo("foobogus"); } } } \ No newline at end of file diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestFlagNum.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestFlagNum.cs index 180e5fd..b82036d 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestFlagNum.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestFlagNum.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestFullStrip.cs similarity index 80% copy from src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs copy to src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestFullStrip.cs index 4d120e8..4d66ec3 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestFullStrip.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell @@ -20,19 +20,18 @@ namespace Lucene.Net.Analysis.Hunspell * limitations under the License. */ - public class TestHomonyms : StemmerTestBase + public class TestFullStrip : StemmerTestBase { - - [OneTimeSetUp] public override void BeforeClass() { base.BeforeClass(); - Init("homonyms.aff", "homonyms.dic"); + Init("fullstrip.aff", "fullstrip.dic"); } + [Test] - public virtual void TestExamples() + public void TestStemming() { - AssertStemsTo("works", "work", "work"); + AssertStemsTo("tasty", "beer"); } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs index 4d120e8..272a47e 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHunspellStemFilter.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHunspellStemFilter.cs index 56cc591..f1228a0 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHunspellStemFilter.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHunspellStemFilter.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using Lucene.Net.Analysis.Core; using Lucene.Net.Analysis.Miscellaneous; using Lucene.Net.Analysis.Util; diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHunspellStemFilterFactory.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHunspellStemFilterFactory.cs index f6359e4..2171e02 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHunspellStemFilterFactory.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHunspellStemFilterFactory.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using Lucene.Net.Analysis.Util; using NUnit.Framework; using System; diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestIgnore.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestIgnore.cs index 22de6b7..35fafdb 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestIgnore.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestIgnore.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestTwoSuffixes.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestKeepCase.cs similarity index 63% copy from src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestTwoSuffixes.cs copy to src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestKeepCase.cs index fcd840d..f0a81cc 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestTwoSuffixes.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestKeepCase.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell @@ -20,23 +20,31 @@ namespace Lucene.Net.Analysis.Hunspell * limitations under the License. */ - public class TestTwoSuffixes : StemmerTestBase + public class TestKeepCase : StemmerTestBase { - - [OneTimeSetUp] public override void BeforeClass() { base.BeforeClass(); - Init("twosuffixes.aff", "twosuffixes.dic"); + Init("keepcase.aff", "keepcase.dic"); } + [Test] - public virtual void TestExamples() + public void TestPossibilities() { AssertStemsTo("drink", "drink"); - AssertStemsTo("drinkable", "drink"); + AssertStemsTo("Drink", "drink"); + AssertStemsTo("DRINK", "drink"); AssertStemsTo("drinks", "drink"); - AssertStemsTo("drinkableable"); - AssertStemsTo("drinkss"); + AssertStemsTo("Drinks", "drink"); + AssertStemsTo("DRINKS", "drink"); + AssertStemsTo("walk", "walk"); + AssertStemsTo("walks", "walk"); + AssertStemsTo("Walk"); + AssertStemsTo("Walks"); + AssertStemsTo("WALKS"); + AssertStemsTo("test", "test"); + AssertStemsTo("Test"); + AssertStemsTo("TEST"); } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestMorph.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestMorph.cs index b8eef84..9fccba1 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestMorph.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestMorph.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestMorphAlias.cs similarity index 63% copy from src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs copy to src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestMorphAlias.cs index 4d120e8..20a7258 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestMorphAlias.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell @@ -20,19 +20,25 @@ namespace Lucene.Net.Analysis.Hunspell * limitations under the License. */ - public class TestHomonyms : StemmerTestBase + public class TestMorphAlias : StemmerTestBase { - - [OneTimeSetUp] public override void BeforeClass() { base.BeforeClass(); - Init("homonyms.aff", "homonyms.dic"); + Init("morphalias.aff", "morphalias.dic"); } + [Test] - public virtual void TestExamples() + public void TestStemming() { - AssertStemsTo("works", "work", "work"); + AssertStemsTo("feet", "foot"); + AssertStemsTo("feetscratcher", "foot"); + AssertStemsTo("work", "workverb", "worknoun"); + AssertStemsTo("works", "workverb", "worknoun"); + AssertStemsTo("notspecial", "notspecial"); + AssertStemsTo("simplenoun", "simplenoun"); + AssertStemsTo("simplenouns", "simplenoun"); + AssertStemsTo("simplenounscratcher"); } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestMorphData.cs similarity index 63% copy from src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs copy to src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestMorphData.cs index 4d120e8..0978fc3 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestMorphData.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell @@ -20,19 +20,25 @@ namespace Lucene.Net.Analysis.Hunspell * limitations under the License. */ - public class TestHomonyms : StemmerTestBase + public class TestMorphData : StemmerTestBase { - - [OneTimeSetUp] public override void BeforeClass() { base.BeforeClass(); - Init("homonyms.aff", "homonyms.dic"); + Init("morphdata.aff", "morphdata.dic"); } + [Test] - public virtual void TestExamples() + public void TestStemming() { - AssertStemsTo("works", "work", "work"); + AssertStemsTo("feet", "foot"); + AssertStemsTo("feetscratcher", "foot"); + AssertStemsTo("work", "workverb", "worknoun"); + AssertStemsTo("works", "workverb", "worknoun"); + AssertStemsTo("notspecial", "notspecial"); + AssertStemsTo("simplenoun", "simplenoun"); + AssertStemsTo("simplenouns", "simplenoun"); + AssertStemsTo("simplenounscratcher"); } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestTwoSuffixes.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestNeedAffix.cs similarity index 69% copy from src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestTwoSuffixes.cs copy to src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestNeedAffix.cs index fcd840d..8ac9aa7 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestTwoSuffixes.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestNeedAffix.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell @@ -20,23 +20,27 @@ namespace Lucene.Net.Analysis.Hunspell * limitations under the License. */ - public class TestTwoSuffixes : StemmerTestBase + public class TestNeedAffix : StemmerTestBase { - - [OneTimeSetUp] public override void BeforeClass() { base.BeforeClass(); - Init("twosuffixes.aff", "twosuffixes.dic"); + Init("needaffix.aff", "needaffix.dic"); } + [Test] - public virtual void TestExamples() + public void TestPossibilities() { AssertStemsTo("drink", "drink"); - AssertStemsTo("drinkable", "drink"); AssertStemsTo("drinks", "drink"); - AssertStemsTo("drinkableable"); - AssertStemsTo("drinkss"); + AssertStemsTo("walk"); + AssertStemsTo("walks", "walk"); + AssertStemsTo("prewalk", "walk"); + AssertStemsTo("prewalks", "walk"); + AssertStemsTo("test"); + AssertStemsTo("pretest"); + AssertStemsTo("tests"); + AssertStemsTo("pretests"); } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestTwoSuffixes.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestOnlyInCompound.cs similarity index 76% copy from src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestTwoSuffixes.cs copy to src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestOnlyInCompound.cs index fcd840d..d9700aa 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestTwoSuffixes.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestOnlyInCompound.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell @@ -20,23 +20,23 @@ namespace Lucene.Net.Analysis.Hunspell * limitations under the License. */ - public class TestTwoSuffixes : StemmerTestBase + public class TestOnlyInCompound : StemmerTestBase { - - [OneTimeSetUp] public override void BeforeClass() { base.BeforeClass(); - Init("twosuffixes.aff", "twosuffixes.dic"); + Init("onlyincompound.aff", "onlyincompound.dic"); } + [Test] - public virtual void TestExamples() + public void TestPossibilities() { AssertStemsTo("drink", "drink"); - AssertStemsTo("drinkable", "drink"); AssertStemsTo("drinks", "drink"); - AssertStemsTo("drinkableable"); - AssertStemsTo("drinkss"); + AssertStemsTo("drinked"); + AssertStemsTo("predrink"); + AssertStemsTo("predrinked"); + AssertStemsTo("walk"); } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestOptionalCondition.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestOptionalCondition.cs index 50deba0..94b9a14 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestOptionalCondition.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestOptionalCondition.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestCondition.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestSpaces.cs similarity index 59% copy from src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestCondition.cs copy to src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestSpaces.cs index 4bc54d4..2aebb8e 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestCondition.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestSpaces.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell @@ -20,30 +20,29 @@ namespace Lucene.Net.Analysis.Hunspell * limitations under the License. */ - public class TestCondition : StemmerTestBase + public class TestSpaces : StemmerTestBase { - [OneTimeSetUp] public override void BeforeClass() { base.BeforeClass(); - Init("condition.aff", "condition.dic"); + Init("spaces.aff", "spaces.dic"); } [Test] - public virtual void TestStemming() + public void TestStemming() { - AssertStemsTo("hello", "hello"); - AssertStemsTo("try", "try"); - AssertStemsTo("tried", "try"); - AssertStemsTo("work", "work"); - AssertStemsTo("worked", "work"); - AssertStemsTo("rework", "work"); - AssertStemsTo("reworked", "work"); - AssertStemsTo("retried"); - AssertStemsTo("workied"); - AssertStemsTo("tryed"); - AssertStemsTo("tryied"); - AssertStemsTo("helloed"); + AssertStemsTo("four", "four"); + AssertStemsTo("fours", "four"); + AssertStemsTo("five", "five"); + AssertStemsTo("forty four", "forty four"); + AssertStemsTo("forty fours", "forty four"); + AssertStemsTo("forty five", "forty five"); + AssertStemsTo("fifty", "50"); + AssertStemsTo("fiftys", "50"); + AssertStemsTo("sixty", "60"); + AssertStemsTo("sixty four", "64"); + AssertStemsTo("fifty four", "54"); + AssertStemsTo("fifty fours", "54"); } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestStemmer.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestStemmer.cs index 84cd54b..4a56814 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestStemmer.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestStemmer.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestStrangeOvergeneration.cs similarity index 74% copy from src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs copy to src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestStrangeOvergeneration.cs index 4d120e8..730d242 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestStrangeOvergeneration.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell @@ -20,19 +20,21 @@ namespace Lucene.Net.Analysis.Hunspell * limitations under the License. */ - public class TestHomonyms : StemmerTestBase + public class TestStrangeOvergeneration : StemmerTestBase { - - [OneTimeSetUp] public override void BeforeClass() { base.BeforeClass(); - Init("homonyms.aff", "homonyms.dic"); + Init("strange-overgeneration.aff", "strange-overgeneration.dic"); } + [Test] - public virtual void TestExamples() + public void TestStemming() { - AssertStemsTo("works", "work", "work"); + AssertStemsTo("btasty", "beer"); + AssertStemsTo("tasty"); + AssertStemsTo("yuck"); + AssertStemsTo("foo"); } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestTwoFold.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestTwoFold.cs index 9ce1fc9..fdbad8a 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestTwoFold.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestTwoFold.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestTwoSuffixes.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestTwoSuffixes.cs index fcd840d..8749522 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestTwoSuffixes.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestTwoSuffixes.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestZeroAffix.cs similarity index 80% copy from src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs copy to src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestZeroAffix.cs index 4d120e8..e032155 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestZeroAffix.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell @@ -20,19 +20,18 @@ namespace Lucene.Net.Analysis.Hunspell * limitations under the License. */ - public class TestHomonyms : StemmerTestBase + public class TestZeroAffix : StemmerTestBase { - - [OneTimeSetUp] public override void BeforeClass() { base.BeforeClass(); - Init("homonyms.aff", "homonyms.dic"); + Init("zeroaffix.aff", "zeroaffix.dic"); } + [Test] - public virtual void TestExamples() + public void TestStemming() { - AssertStemsTo("works", "work", "work"); + AssertStemsTo("drink", "drinksierranevada"); } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestZeroAffix2.cs similarity index 80% copy from src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs copy to src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestZeroAffix2.cs index 4d120e8..b22f186 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestHomonyms.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/TestZeroAffix2.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.10.4 using NUnit.Framework; namespace Lucene.Net.Analysis.Hunspell @@ -20,19 +20,18 @@ namespace Lucene.Net.Analysis.Hunspell * limitations under the License. */ - public class TestHomonyms : StemmerTestBase + public class TestZeroAffix2 : StemmerTestBase { - - [OneTimeSetUp] public override void BeforeClass() { base.BeforeClass(); - Init("homonyms.aff", "homonyms.dic"); + Init("zeroaffix2.aff", "zeroaffix2.dic"); } + [Test] - public virtual void TestExamples() + public void TestStemming() { - AssertStemsTo("works", "work", "work"); + AssertStemsTo("b", "beer"); } } -} \ No newline at end of file +} diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/alternate-casing.aff b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/alternate-casing.aff new file mode 100644 index 0000000..49618b8 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/alternate-casing.aff @@ -0,0 +1,15 @@ +SET UTF-8 + +LANG tr_TR + +PFX A Y 1 +PFX A 0 ı . +dotlessprefix + +PFX B Y 1 +PFX B 0 i . +dottedprefix + +SFX X Y 1 +SFX X 0 ı . +dotlesssuffix + +SFX Y Y 1 +SFX Y 0 i . +dottedsuffix diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/alternate-casing.dic b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/alternate-casing.dic new file mode 100644 index 0000000..5b7c8f4 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/alternate-casing.dic @@ -0,0 +1,4 @@ +3 +drink/BY +rıver/AX + diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/casesensitive.aff b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/casesensitive.aff new file mode 100644 index 0000000..9943e62 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/casesensitive.aff @@ -0,0 +1,16 @@ +SET UTF-8 + +PFX A Y 1 +PFX A 0 good . +good + +PFX B Y 1 +PFX B 0 Good . +Good + +PFX C Y 1 +PFX C 0 GOOD . +GOOD + +SFX X Y 1 +SFX X 0 s . +s + +SFX Y Y 1 +SFX Y 0 S . +S diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/casesensitive.dic b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/casesensitive.dic new file mode 100644 index 0000000..edbc34c --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/casesensitive.dic @@ -0,0 +1,4 @@ +3 +drink/XYABC +Drink/XYABC +DRINK/XYABC diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/circumfix.dic b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/circumfix.dic index 571e2e2..0295762 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/circumfix.dic +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/circumfix.dic @@ -1,2 +1,2 @@ 1 -nagy/C [MN] +nagy/C [MN] diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/condition2.aff b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/condition2.aff new file mode 100644 index 0000000..8e06a21 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/condition2.aff @@ -0,0 +1,5 @@ +SET ISO8859-1 +TRY esianrtolcdugmphbyfvkwzESIANRTOLCDUGMPHBYFVKWZ' + +SFX S Y 1 +SFX S y ies [^aeiou]y diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/condition2.dic b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/condition2.dic new file mode 100644 index 0000000..72a8c3e --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/condition2.dic @@ -0,0 +1,2 @@ +1 +monopoly/S diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/conv.dic b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/conv.dic index 6b68dc8..169e17f 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/conv.dic +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/conv.dic @@ -1,2 +1,2 @@ 1 -drink/X [VERB] +drink/X [VERB] diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/dependencies.dic b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/dependencies.dic index bdba45e..ade5437 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/dependencies.dic +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/dependencies.dic @@ -1,3 +1,3 @@ 2 -drink/RQ [verb] -drink/S [noun] +drink/RQ [verb] +drink/S [noun] diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/double-escaped.aff b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/double-escaped.aff new file mode 100644 index 0000000..ab74afa --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/double-escaped.aff @@ -0,0 +1,5 @@ +SET UTF-8 + +SFX X Y 1 +SFX X ar o [^\-]ar + diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/double-escaped.dic b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/double-escaped.dic new file mode 100644 index 0000000..42ddb5e --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/double-escaped.dic @@ -0,0 +1,2 @@ +1 +adubar/X diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/flaglong.aff b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/flaglong.aff index d05a5da..fb0f423 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/flaglong.aff +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/flaglong.aff @@ -2,3 +2,6 @@ SET UTF-8 FLAG long SFX Y1 Y 1 SFX Y1 0 s . + +SFX 1Y Y 1 +SFX 1Y 0 bogus . diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/fullstrip.aff b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/fullstrip.aff new file mode 100644 index 0000000..9c2de7f --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/fullstrip.aff @@ -0,0 +1,6 @@ +SET UTF-8 + +FULLSTRIP + +SFX A Y 1 +SFX A beer tasty . diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/fullstrip.dic b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/fullstrip.dic new file mode 100644 index 0000000..c948f18 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/fullstrip.dic @@ -0,0 +1,2 @@ +1 +beer/A diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/homonyms.dic b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/homonyms.dic index 96d51f1..6357472 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/homonyms.dic +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/homonyms.dic @@ -1,3 +1,3 @@ 2 -work/A [VERB] -work/B [NOUN] \ No newline at end of file +work/A [VERB] +work/B [NOUN] \ No newline at end of file diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/ignore.dic b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/ignore.dic index 9ae9205..854c509 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/ignore.dic +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/ignore.dic @@ -1,3 +1,3 @@ 1 -drink/X [VERB] -dr-ank/X [VERB] \ No newline at end of file +drink/X [VERB] +dr-ank/X [VERB] \ No newline at end of file diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/keepcase.aff b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/keepcase.aff new file mode 100644 index 0000000..4b56950 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/keepcase.aff @@ -0,0 +1,6 @@ +SET UTF-8 + +KEEPCASE Z + +SFX X Y 1 +SFX X 0 s . +s diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/keepcase.dic b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/keepcase.dic new file mode 100644 index 0000000..96b7a48 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/keepcase.dic @@ -0,0 +1,4 @@ +3 +drink/X +walk/XZ +test/Z \ No newline at end of file diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/morph.dic b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/morph.dic index 6b68dc8..169e17f 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/morph.dic +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/morph.dic @@ -1,2 +1,2 @@ 1 -drink/X [VERB] +drink/X [VERB] diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/morphalias.aff b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/morphalias.aff new file mode 100644 index 0000000..f408f3f --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/morphalias.aff @@ -0,0 +1,16 @@ +AM 4 +AM st:foot +AM st:workverb +AM st:worknoun +AM po:garbage + +SET UTF-8 + +SFX X Y 1 +SFX X 0 scratcher . + +SFX A Y 1 +SFX A 0 s . +SG3 + +SFX B Y 1 +SFX B 0 s . +PLUR diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/morphalias.dic b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/morphalias.dic new file mode 100644 index 0000000..638a2bd --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/morphalias.dic @@ -0,0 +1,6 @@ +5 +feet/X 1 +work/A 2 +work/B 3 +notspecial 4 +simplenoun/A diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/morphdata.aff b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/morphdata.aff new file mode 100644 index 0000000..0448cd7 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/morphdata.aff @@ -0,0 +1,10 @@ +SET UTF-8 + +SFX X Y 1 +SFX X 0 scratcher . + +SFX A Y 1 +SFX A 0 s . +SG3 + +SFX B Y 1 +SFX B 0 s . +PLUR diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/morphdata.dic b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/morphdata.dic new file mode 100644 index 0000000..9b7cc9d --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/morphdata.dic @@ -0,0 +1,6 @@ +5 +feet/X st:foot +work/A st:workverb +work/B st:worknoun +notspecial +simplenoun/A diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/needaffix.aff b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/needaffix.aff new file mode 100644 index 0000000..ea6c41f --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/needaffix.aff @@ -0,0 +1,9 @@ +SET UTF-8 + +NEEDAFFIX Z + +PFX Y Y 1 +PFX Y 0 pre . pre+ + +SFX X Y 1 +SFX X 0 s . +s diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/needaffix.dic b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/needaffix.dic new file mode 100644 index 0000000..3ac76bd --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/needaffix.dic @@ -0,0 +1,4 @@ +3 +drink/X +walk/XYZ +test/Z \ No newline at end of file diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/onlyincompound.aff b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/onlyincompound.aff new file mode 100644 index 0000000..91fc80f --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/onlyincompound.aff @@ -0,0 +1,12 @@ +SET UTF-8 + +ONLYINCOMPOUND A + +PFX Y Y 1 +PFX Y 0 pre/A . pre+ + +SFX X Y 1 +SFX X 0 s . +s + +SFX Z Y 1 +SFX Z 0 ed/A . +ed diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/onlyincompound.dic b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/onlyincompound.dic new file mode 100644 index 0000000..8e7b025 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/onlyincompound.dic @@ -0,0 +1,4 @@ +2 +drink/XYZ +walk/A + diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/spaces.aff b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/spaces.aff new file mode 100644 index 0000000..3f2365e --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/spaces.aff @@ -0,0 +1,5 @@ +SET UTF-8 +TRY abcdefghijklmopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ + +SFX X Y 1 +SFX X 0 s . +PLUR diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/spaces.dic b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/spaces.dic new file mode 100644 index 0000000..11294ae --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/spaces.dic @@ -0,0 +1,9 @@ +4 +four/X po:number +five po:number +forty four/X po:number +forty five po:number +fifty/X st:50 +sixty st:60 +sixty four st:64 +fifty four/X st:54 diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/strange-overgeneration.aff b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/strange-overgeneration.aff new file mode 100644 index 0000000..470b570 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/strange-overgeneration.aff @@ -0,0 +1,10 @@ +SET UTF-8 + +SFX A Y 1 +SFX A baz yuck baz + +SFX B Y 1 +SFX B bar foo . + +SFX C Y 1 +SFX C eer tasty . diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/strange-overgeneration.dic b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/strange-overgeneration.dic new file mode 100644 index 0000000..e4b61b3 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/strange-overgeneration.dic @@ -0,0 +1,5 @@ +3 +baz/A +bar/B +beer/C +eer/C diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/twosuffixes.dic b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/twosuffixes.dic index 6b68dc8..169e17f 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/twosuffixes.dic +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/twosuffixes.dic @@ -1,2 +1,2 @@ 1 -drink/X [VERB] +drink/X [VERB] diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/zeroaffix.aff b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/zeroaffix.aff new file mode 100644 index 0000000..52c36f7 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/zeroaffix.aff @@ -0,0 +1,4 @@ +SET UTF-8 + +SFX X Y 1 +SFX X sierranevada 0 . diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/zeroaffix.dic b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/zeroaffix.dic new file mode 100644 index 0000000..92c08d0 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/zeroaffix.dic @@ -0,0 +1,2 @@ +1 +drinksierranevada/X [VERB] diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/zeroaffix2.aff b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/zeroaffix2.aff new file mode 100644 index 0000000..72e273f --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/zeroaffix2.aff @@ -0,0 +1,6 @@ +SET UTF-8 +FLAG num + +SFX 322 Y 1 +SFX 322 eer 0/100 . + diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/zeroaffix2.dic b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/zeroaffix2.dic new file mode 100644 index 0000000..4171564 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Hunspell/zeroaffix2.dic @@ -0,0 +1,2 @@ +1 +beer/322 diff --git a/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj b/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj index 58188ec..dac5f852 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj +++ b/src/Lucene.Net.Tests.Analysis.Common/Lucene.Net.Tests.Analysis.Common.csproj @@ -25,6 +25,7 @@ <PropertyGroup> <AssemblyTitle>Lucene.Net.Tests.Analysis.Common</AssemblyTitle> + <RootNamespace>Lucene.Net</RootNamespace> </PropertyGroup> <ItemGroup> @@ -36,6 +37,37 @@ </ItemGroup> <ItemGroup> + <None Remove="Analysis\Hunspell\alternate-casing.aff" /> + <None Remove="Analysis\Hunspell\alternate-casing.dic" /> + <None Remove="Analysis\Hunspell\casesensitive.aff" /> + <None Remove="Analysis\Hunspell\casesensitive.dic" /> + <None Remove="Analysis\Hunspell\condition2.aff" /> + <None Remove="Analysis\Hunspell\condition2.dic" /> + <None Remove="Analysis\Hunspell\double-escaped.aff" /> + <None Remove="Analysis\Hunspell\double-escaped.dic" /> + <None Remove="Analysis\Hunspell\fullstrip.aff" /> + <None Remove="Analysis\Hunspell\fullstrip.dic" /> + <None Remove="Analysis\Hunspell\keepcase.aff" /> + <None Remove="Analysis\Hunspell\keepcase.dic" /> + <None Remove="Analysis\Hunspell\morphalias.aff" /> + <None Remove="Analysis\Hunspell\morphalias.dic" /> + <None Remove="Analysis\Hunspell\morphdata.aff" /> + <None Remove="Analysis\Hunspell\morphdata.dic" /> + <None Remove="Analysis\Hunspell\needaffix.aff" /> + <None Remove="Analysis\Hunspell\needaffix.dic" /> + <None Remove="Analysis\Hunspell\onlyincompound.aff" /> + <None Remove="Analysis\Hunspell\onlyincompound.dic" /> + <None Remove="Analysis\Hunspell\spaces.aff" /> + <None Remove="Analysis\Hunspell\spaces.dic" /> + <None Remove="Analysis\Hunspell\strange-overgeneration.aff" /> + <None Remove="Analysis\Hunspell\strange-overgeneration.dic" /> + <None Remove="Analysis\Hunspell\zeroaffix.aff" /> + <None Remove="Analysis\Hunspell\zeroaffix.dic" /> + <None Remove="Analysis\Hunspell\zeroaffix2.aff" /> + <None Remove="Analysis\Hunspell\zeroaffix2.dic" /> + </ItemGroup> + + <ItemGroup> <ProjectReference Include="..\Lucene.Net.Analysis.Common\Lucene.Net.Analysis.Common.csproj" /> <ProjectReference Include="..\Lucene.Net.TestFramework\Lucene.Net.TestFramework.csproj" /> </ItemGroup> @@ -43,7 +75,12 @@ <Import Project="$(SolutionDir)build/TestReferences.Common.targets" /> <ItemGroup Condition=" '$(TargetFramework)' == 'netcoreapp3.1' "> - <PackageReference Include="System.Net.Primitives" Version="$(SystemNetPrimitivesPackageVersion)"/> + <PackageReference Include="System.Text.Encoding.CodePages" Version="$(SystemTextEncodingCodePagesPackageVersion)" /> + <PackageReference Include="System.Net.Primitives" Version="$(SystemNetPrimitivesPackageVersion)" /> + </ItemGroup> + + <ItemGroup Condition="'$(TargetFramework)' == 'netcoreapp2.1' "> + <PackageReference Include="System.Text.Encoding.CodePages" Version="$(SystemTextEncodingCodePagesPackageVersion)" /> </ItemGroup> <ItemGroup Condition=" '$(TargetFramework)' == 'net48' "> diff --git a/src/Lucene.Net.Tests.Analysis.Common/Startup.cs b/src/Lucene.Net.Tests.Analysis.Common/Startup.cs index 3d830d4..a9502ca 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Startup.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Startup.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.8.1 /* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with @@ -23,4 +23,10 @@ using Lucene.Net.Util; // would not occur if it were not here. public class Startup : LuceneTestFrameworkInitializer { + protected override void TestFrameworkSetUp() + { +#if FEATURE_ENCODINGPROVIDERS + System.Text.Encoding.RegisterProvider(System.Text.CodePagesEncodingProvider.Instance); +#endif + } } \ No newline at end of file
