This is an automated email from the ASF dual-hosted git repository. nightowl888 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/lucenenet.git
commit 3c4cfa4cd2f5f4399751fe799abfc1ddf56e522b Author: Shad Storhaug <[email protected]> AuthorDate: Wed Jul 29 20:52:13 2020 +0700 BUG: Fixed surrogate pair and culture-sensitivity issues with many analyzers. (see #296) --- .../Analysis/Core/LetterTokenizer.cs | 2 +- .../Analysis/Core/WhitespaceTokenizer.cs | 5 +-- .../Analysis/El/GreekLowerCaseFilter.cs | 5 ++- .../Analysis/En/PorterStemmer.cs | 4 +-- .../Analysis/Ga/IrishLowerCaseFilter.cs | 5 ++- .../Miscellaneous/WordDelimiterIterator.cs | 17 +++++----- .../Analysis/Th/ThaiTokenizer.cs | 2 +- .../Analysis/Tr/TurkishLowerCaseFilter.cs | 39 ++++------------------ .../Analysis/Util/CharacterUtils.cs | 4 +-- .../Morfologik/MorfologikFilter.cs | 5 ++- .../Miscellaneous/TestStemmerOverrideFilter.cs | 2 +- .../Payloads/TypeAsPayloadTokenFilterTest.cs | 5 +-- .../TestPostingsHighlighterRanking.cs | 2 +- .../Support/TestWeakDictionaryBehavior.cs | 2 +- .../TestICUPostingsHighlighterRanking.cs | 2 +- 15 files changed, 43 insertions(+), 58 deletions(-) diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs index c15c327..29d5c4a 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs @@ -25,7 +25,7 @@ namespace Lucene.Net.Analysis.Core /// <summary> /// A <see cref="LetterTokenizer"/> is a tokenizer that divides text at non-letters. That's to /// say, it defines tokens as maximal strings of adjacent letters, as defined by - /// <see cref="char.IsLetter(char)"/> predicate. + /// <see cref="Character.IsLetter(int)"/> predicate. /// <para> /// Note: this does a decent job for most European languages, but does a terrible /// job for some Asian languages, where words are not separated by spaces. diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizer.cs index 8381dce..b6bb740 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizer.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizer.cs @@ -1,4 +1,5 @@ -using Lucene.Net.Analysis.Util; +using J2N; +using Lucene.Net.Analysis.Util; using Lucene.Net.Util; using System.IO; @@ -65,7 +66,7 @@ namespace Lucene.Net.Analysis.Core /// </summary> protected override bool IsTokenChar(int c) { - return !char.IsWhiteSpace((char)c); + return !Character.IsWhiteSpace(c); } } } \ No newline at end of file diff --git a/src/Lucene.Net.Analysis.Common/Analysis/El/GreekLowerCaseFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/El/GreekLowerCaseFilter.cs index 1997ade..40a0d6d 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/El/GreekLowerCaseFilter.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/El/GreekLowerCaseFilter.cs @@ -2,6 +2,7 @@ using Lucene.Net.Analysis.TokenAttributes; using Lucene.Net.Analysis.Util; using Lucene.Net.Util; +using System.Globalization; namespace Lucene.Net.Analysis.El { @@ -37,6 +38,8 @@ namespace Lucene.Net.Analysis.El private readonly ICharTermAttribute termAtt; private readonly CharacterUtils charUtils; + private static readonly CultureInfo culture = new CultureInfo("el"); // LUCENENET specific - use Greek culture when lowercasing. + /// <summary> /// Create a <see cref="GreekLowerCaseFilter"/> that normalizes Greek token text. /// </summary> @@ -127,7 +130,7 @@ namespace Lucene.Net.Analysis.El return '\u03C2'; // small final sigma default: - return Character.ToLower(codepoint); + return Character.ToLower(codepoint, culture); // LUCENENET specific - need to use specific culture to override current thread } } } diff --git a/src/Lucene.Net.Analysis.Common/Analysis/En/PorterStemmer.cs b/src/Lucene.Net.Analysis.Common/Analysis/En/PorterStemmer.cs index 9ac8ea1..7d81a3c 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/En/PorterStemmer.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/En/PorterStemmer.cs @@ -841,9 +841,9 @@ namespace Lucene.Net.Analysis.En // ch = buffer[offset++]; // } - // if (char.IsLetter((char)ch)) + // if (Character.IsLetter(ch)) // { - // s.Add(char.ToLowerInvariant((char)ch)); + // s.Add(Character.ToLower(ch, CultureInfo.InvariantCulture)); // } // else // { diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ga/IrishLowerCaseFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ga/IrishLowerCaseFilter.cs index 7caa55b..7299804 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Ga/IrishLowerCaseFilter.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Ga/IrishLowerCaseFilter.cs @@ -1,5 +1,6 @@ using J2N; using Lucene.Net.Analysis.TokenAttributes; +using System.Globalization; namespace Lucene.Net.Analysis.Ga { @@ -28,6 +29,8 @@ namespace Lucene.Net.Analysis.Ga { private readonly ICharTermAttribute termAtt; + private static readonly CultureInfo culture = new CultureInfo("ga"); // LUCENENET specific - use Irish culture when lowercasing. + /// <summary> /// Create an <see cref="IrishLowerCaseFilter"/> that normalises Irish token text. /// </summary> @@ -60,7 +63,7 @@ namespace Lucene.Net.Analysis.Ga for (int i = idx; i < chLen;) { - i += Character.ToChars(Character.ToLower(chArray[i]), chArray, i); + i += Character.ToChars(Character.ToLower(chArray[i], culture), chArray, i); // LUCENENET specific - use Irish culture when lowercasing } return true; } diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/WordDelimiterIterator.cs b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/WordDelimiterIterator.cs index 012ab7d..0a0a3ef 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/WordDelimiterIterator.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/WordDelimiterIterator.cs @@ -1,4 +1,5 @@ -using System.Globalization; +using J2N; +using System.Globalization; namespace Lucene.Net.Analysis.Miscellaneous { @@ -84,17 +85,17 @@ namespace Lucene.Net.Analysis.Miscellaneous for (int i = 0; i < 256; i++) { byte code = 0; - if (char.IsLower((char)i)) + if (Character.IsLower(i)) { - code |= (byte)WordDelimiterFilter.LOWER; + code |= WordDelimiterFilter.LOWER; } - else if (char.IsUpper((char)i)) + else if (Character.IsUpper(i)) { - code |= (byte)WordDelimiterFilter.UPPER; + code |= WordDelimiterFilter.UPPER; } - else if (char.IsDigit((char)i)) + else if (Character.IsDigit(i)) { - code |= (byte)WordDelimiterFilter.DIGIT; + code |= WordDelimiterFilter.DIGIT; } if (code == 0) { @@ -318,7 +319,7 @@ namespace Lucene.Net.Analysis.Miscellaneous /// <returns> Type of the character </returns> public static byte GetType(int ch) { - switch (CharUnicodeInfo.GetUnicodeCategory((char)ch)) + switch (Character.GetType(ch)) { case UnicodeCategory.UppercaseLetter: return WordDelimiterFilter.UPPER; diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs index f7d4097..6d045ad 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs @@ -95,7 +95,7 @@ namespace Lucene.Net.Analysis.Th // find the next set of boundaries, skipping over non-tokens int end = wordBreaker.Next(); - while (end != BreakIterator.Done && !char.IsLetterOrDigit((char)Character.CodePointAt(m_buffer, sentenceStart + start, sentenceEnd))) + while (end != BreakIterator.Done && !Character.IsLetterOrDigit(Character.CodePointAt(m_buffer, sentenceStart + start, sentenceEnd))) { start = end; end = wordBreaker.Next(); diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Tr/TurkishLowerCaseFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Tr/TurkishLowerCaseFilter.cs index 0cd0464..c771754 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Tr/TurkishLowerCaseFilter.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Tr/TurkishLowerCaseFilter.cs @@ -1,5 +1,4 @@ using J2N; -using J2N.Globalization; using Lucene.Net.Analysis.TokenAttributes; using System; using System.Globalization; @@ -35,12 +34,13 @@ namespace Lucene.Net.Analysis.Tr public sealed class TurkishLowerCaseFilter : TokenFilter { private const int LATIN_CAPITAL_LETTER_I = '\u0049'; - private const int LATIN_CAPITAL_LETTER_DOTTED_I = '\u0130'; private const int LATIN_SMALL_LETTER_I = '\u0069'; private const int LATIN_SMALL_LETTER_DOTLESS_I = '\u0131'; private const int COMBINING_DOT_ABOVE = '\u0307'; private readonly ICharTermAttribute termAtt; + private static readonly CultureInfo culture = new CultureInfo("tr"); // LUCENENET specific - we need to do a culture-sensitive lowercase operation in Turkish + /// <summary> /// Create a new <see cref="TurkishLowerCaseFilter"/>, that normalizes Turkish token text /// to lower case. @@ -64,7 +64,7 @@ namespace Lucene.Net.Analysis.Tr { int ch = Character.CodePointAt(buffer, i, length); - iOrAfter = (ch == LATIN_CAPITAL_LETTER_I || (iOrAfter && CharUnicodeInfo.GetUnicodeCategory((char)ch) == UnicodeCategory.NonSpacingMark)); + iOrAfter = (ch == LATIN_CAPITAL_LETTER_I || (iOrAfter && Character.GetType(ch) == UnicodeCategory.NonSpacingMark)); if (iOrAfter) // all the special I turkish handling happens here. { @@ -93,32 +93,8 @@ namespace Lucene.Net.Analysis.Tr } } - using (var culture = new CultureContext("tr")) - { - switch (ch) - { - // LUCENENET: The .NET char.ToLower() function works correctly in - // Turkish as long as the current thread is set to tr-TR (well, technically the - // culture change is only required for the LATIN_CAPITAL_LETTER_I case). .NET does - // not split these characters into separate letter/non-spacing mark characters, - // but the user might still input them that way so we still need the above - // block to handle that case. - // - // LUCENENET TODO: Oddly, the Character.ToLowerCase() function below does not work right - // for Turkish. Which begs the question, should this special case be there so Turkish works - // everywhere? Or should we leave it a special case here because that is the way it works in Java? - // - // References: - // http://haacked.com/archive/2012/07/05/turkish-i-problem-and-why-you-should-care.aspx/ - // http://www.i18nguy.com/unicode/turkish-i18n.html - case LATIN_CAPITAL_LETTER_I: - case LATIN_CAPITAL_LETTER_DOTTED_I: - i += Character.ToChars(char.ToLower((char)ch), buffer, i); - continue; - } - } - - i += Character.ToChars(Character.ToLower(ch), buffer, i); + // LUCENENET specific - need to pass Turkish culture to get the correct lowercase results + i += Character.ToChars(Character.ToLower(ch, culture), buffer, i); } termAtt.Length = length; @@ -139,8 +115,7 @@ namespace Lucene.Net.Analysis.Tr for (int i = pos; i < len;) { int ch = Character.CodePointAt(s, i, len); - //if (char.getType(ch) != char.NON_SPACING_MARK) - if (CharUnicodeInfo.GetUnicodeCategory((char)ch) != UnicodeCategory.NonSpacingMark) + if (Character.GetType(ch) != UnicodeCategory.NonSpacingMark) { return false; } @@ -161,9 +136,7 @@ namespace Lucene.Net.Analysis.Tr private int Delete(char[] s, int pos, int len) { if (pos < len) - { Array.Copy(s, pos + 1, s, pos, len - pos - 1); - } return len - 1; } diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Util/CharacterUtils.cs b/src/Lucene.Net.Analysis.Common/Analysis/Util/CharacterUtils.cs index 67a4438..aa22177 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Util/CharacterUtils.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Util/CharacterUtils.cs @@ -168,7 +168,7 @@ namespace Lucene.Net.Analysis.Util /// <summary> - /// Converts each unicode codepoint to lowerCase via <see cref="Character.ToLower(int)"/> starting + /// Converts each unicode codepoint to lowerCase via <see cref="TextInfo.ToLower(string)"/> in the invariant culture starting /// at the given offset. </summary> /// <param name="buffer"> the char buffer to lowercase </param> /// <param name="offset"> the offset to start at </param> @@ -199,7 +199,7 @@ namespace Lucene.Net.Analysis.Util } /// <summary> - /// Converts each unicode codepoint to UpperCase via <see cref="Character.ToUpper(int)"/> starting + /// Converts each unicode codepoint to UpperCase via <see cref="TextInfo.ToUpper(string)"/> in the invariant culture starting /// at the given offset. </summary> /// <param name="buffer"> the char buffer to UPPERCASE </param> /// <param name="offset"> the offset to start at </param> diff --git a/src/Lucene.Net.Analysis.Morfologik/Morfologik/MorfologikFilter.cs b/src/Lucene.Net.Analysis.Morfologik/Morfologik/MorfologikFilter.cs index 238c88d..96071c1 100644 --- a/src/Lucene.Net.Analysis.Morfologik/Morfologik/MorfologikFilter.cs +++ b/src/Lucene.Net.Analysis.Morfologik/Morfologik/MorfologikFilter.cs @@ -8,6 +8,7 @@ using Morfologik.Stemming; using Morfologik.Stemming.Polish; using System; using System.Collections.Generic; +using System.Globalization; using System.Text; using System.Text.RegularExpressions; @@ -55,6 +56,8 @@ namespace Lucene.Net.Analysis.Morfologik private int lemmaListIndex; + private static readonly CultureInfo culture = new CultureInfo("pl"); // LUCENENET specific - do lowercasing in Polish culture + /// <summary> /// Creates a filter with the default (Polish) dictionary. /// </summary> @@ -166,7 +169,7 @@ namespace Lucene.Net.Analysis.Morfologik for (int i = 0; i < length;) { i += Character.ToChars( - Character.ToLower(Character.CodePointAt(chs, i)), buffer, i); + Character.ToLower(Character.CodePointAt(chs, i), culture), buffer, i); // LUCENENET specific - need to use explicit culture to override current thread } return scratch.ToString(); diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/TestStemmerOverrideFilter.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/TestStemmerOverrideFilter.cs index dbf1573..df2463a 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/TestStemmerOverrideFilter.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/TestStemmerOverrideFilter.cs @@ -79,7 +79,7 @@ namespace Lucene.Net.Analysis.Miscellaneous for (int j = 0; j < charArray.Length;) { int cp = Character.CodePointAt(charArray, j, charArray.Length); - if (!char.IsWhiteSpace((char)cp)) + if (!Character.IsWhiteSpace(cp)) { sb.AppendCodePoint(cp); } diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Payloads/TypeAsPayloadTokenFilterTest.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Payloads/TypeAsPayloadTokenFilterTest.cs index 607d82c..223f04f 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Payloads/TypeAsPayloadTokenFilterTest.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Payloads/TypeAsPayloadTokenFilterTest.cs @@ -1,6 +1,7 @@ using Lucene.Net.Analysis.TokenAttributes; using NUnit.Framework; using System; +using System.Globalization; using System.IO; namespace Lucene.Net.Analysis.Payloads @@ -39,7 +40,7 @@ namespace Lucene.Net.Analysis.Payloads nptf.Reset(); while (nptf.IncrementToken()) { - assertTrue(typeAtt.Type + " is not null and it should be", typeAtt.Type.Equals(char.ToUpper(termAtt.Buffer[0]).ToString(), StringComparison.Ordinal)); + assertTrue(typeAtt.Type + " is not null and it should be", typeAtt.Type.Equals(char.ToUpper(termAtt.Buffer[0]).ToString(), StringComparison.Ordinal)); // LUCENENET specific - intentionally using current culture assertTrue("nextToken.getPayload() is null and it shouldn't be", payloadAtt.Payload != null); string type = payloadAtt.Payload.Utf8ToString(); assertTrue(type + " is not equal to " + typeAtt.Type, type.Equals(typeAtt.Type, StringComparison.Ordinal)); @@ -67,7 +68,7 @@ namespace Lucene.Net.Analysis.Payloads { if (m_input.IncrementToken()) { - typeAtt.Type = char.ToUpper(termAtt.Buffer[0]).ToString(); + typeAtt.Type = char.ToUpper(termAtt.Buffer[0]).ToString(); // LUCENENET specific - intentionally using current culture return true; } else diff --git a/src/Lucene.Net.Tests.Highlighter/PostingsHighlight/TestPostingsHighlighterRanking.cs b/src/Lucene.Net.Tests.Highlighter/PostingsHighlight/TestPostingsHighlighterRanking.cs index 5cbf8af..654d2f9 100644 --- a/src/Lucene.Net.Tests.Highlighter/PostingsHighlight/TestPostingsHighlighterRanking.cs +++ b/src/Lucene.Net.Tests.Highlighter/PostingsHighlight/TestPostingsHighlighterRanking.cs @@ -213,7 +213,7 @@ namespace Lucene.Net.Search.PostingsHighlight assertEquals(matchStart + 1, matchEnd); // and the offsets must be correct... assertEquals(1, term.Length); - assertEquals((char)term.Bytes[term.Offset], Character.ToLower(content[matchStart])); + assertEquals((char)term.Bytes[term.Offset], Character.ToLower(content[matchStart], CultureInfo.InvariantCulture)); } // record just the start/end offset for simplicity seen.Add(new Pair(p.StartOffset, p.EndOffset)); diff --git a/src/Lucene.Net.Tests/Support/TestWeakDictionaryBehavior.cs b/src/Lucene.Net.Tests/Support/TestWeakDictionaryBehavior.cs index e1cc76d..40dbfba 100644 --- a/src/Lucene.Net.Tests/Support/TestWeakDictionaryBehavior.cs +++ b/src/Lucene.Net.Tests/Support/TestWeakDictionaryBehavior.cs @@ -110,7 +110,7 @@ namespace Lucene.Net.Support public void Test_Dictionary_AddReplace() { string key = "A"; - string key2 = "a".ToUpper(); + string key2 = "a".ToUpperInvariant(); dictionary.Add(key, "value"); dictionary[key2] = "value2"; diff --git a/src/dotnet/Lucene.Net.Tests.ICU/Search/PostingsHighlight/TestICUPostingsHighlighterRanking.cs b/src/dotnet/Lucene.Net.Tests.ICU/Search/PostingsHighlight/TestICUPostingsHighlighterRanking.cs index 0a1731b..5ac7eb4 100644 --- a/src/dotnet/Lucene.Net.Tests.ICU/Search/PostingsHighlight/TestICUPostingsHighlighterRanking.cs +++ b/src/dotnet/Lucene.Net.Tests.ICU/Search/PostingsHighlight/TestICUPostingsHighlighterRanking.cs @@ -211,7 +211,7 @@ namespace Lucene.Net.Search.PostingsHighlight assertEquals(matchStart + 1, matchEnd); // and the offsets must be correct... assertEquals(1, term.Length); - assertEquals((char)term.Bytes[term.Offset], Character.ToLower(content[matchStart])); + assertEquals((char)term.Bytes[term.Offset], Character.ToLower(content[matchStart], CultureInfo.InvariantCulture)); // LUCENENET specific - need to use invariant culture to match Java } // record just the start/end offset for simplicity seen.Add(new Pair(p.StartOffset, p.EndOffset));
