This is an automated email from the ASF dual-hosted git repository.

nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git

commit 3c4cfa4cd2f5f4399751fe799abfc1ddf56e522b
Author: Shad Storhaug <[email protected]>
AuthorDate: Wed Jul 29 20:52:13 2020 +0700

    BUG: Fixed surrogate pair and culture-sensitivity issues with many analyzers. (see #296)
---
 .../Analysis/Core/LetterTokenizer.cs               |  2 +-
 .../Analysis/Core/WhitespaceTokenizer.cs           |  5 +--
 .../Analysis/El/GreekLowerCaseFilter.cs            |  5 ++-
 .../Analysis/En/PorterStemmer.cs                   |  4 +--
 .../Analysis/Ga/IrishLowerCaseFilter.cs            |  5 ++-
 .../Miscellaneous/WordDelimiterIterator.cs         | 17 +++++-----
 .../Analysis/Th/ThaiTokenizer.cs                   |  2 +-
 .../Analysis/Tr/TurkishLowerCaseFilter.cs          | 39 ++++------------------
 .../Analysis/Util/CharacterUtils.cs                |  4 +--
 .../Morfologik/MorfologikFilter.cs                 |  5 ++-
 .../Miscellaneous/TestStemmerOverrideFilter.cs     |  2 +-
 .../Payloads/TypeAsPayloadTokenFilterTest.cs       |  5 +--
 .../TestPostingsHighlighterRanking.cs              |  2 +-
 .../Support/TestWeakDictionaryBehavior.cs          |  2 +-
 .../TestICUPostingsHighlighterRanking.cs           |  2 +-
 15 files changed, 43 insertions(+), 58 deletions(-)

diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs 
b/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs
index c15c327..29d5c4a 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/LetterTokenizer.cs
@@ -25,7 +25,7 @@ namespace Lucene.Net.Analysis.Core
     /// <summary>
     /// A <see cref="LetterTokenizer"/> is a tokenizer that divides text at 
non-letters. That's to
     /// say, it defines tokens as maximal strings of adjacent letters, as 
defined by
-    /// <see cref="char.IsLetter(char)"/> predicate.
+    /// <see cref="Character.IsLetter(int)"/> predicate.
     /// <para>
     /// Note: this does a decent job for most European languages, but does a 
terrible
     /// job for some Asian languages, where words are not separated by spaces.
diff --git 
a/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizer.cs 
b/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizer.cs
index 8381dce..b6bb740 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Core/WhitespaceTokenizer.cs
@@ -1,4 +1,5 @@
-using Lucene.Net.Analysis.Util;
+using J2N;
+using Lucene.Net.Analysis.Util;
 using Lucene.Net.Util;
 using System.IO;
 
@@ -65,7 +66,7 @@ namespace Lucene.Net.Analysis.Core
         /// </summary>
         protected override bool IsTokenChar(int c)
         {
-            return !char.IsWhiteSpace((char)c);
+            return !Character.IsWhiteSpace(c);
         }
     }
 }
\ No newline at end of file
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/El/GreekLowerCaseFilter.cs 
b/src/Lucene.Net.Analysis.Common/Analysis/El/GreekLowerCaseFilter.cs
index 1997ade..40a0d6d 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/El/GreekLowerCaseFilter.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/El/GreekLowerCaseFilter.cs
@@ -2,6 +2,7 @@
 using Lucene.Net.Analysis.TokenAttributes;
 using Lucene.Net.Analysis.Util;
 using Lucene.Net.Util;
+using System.Globalization;
 
 namespace Lucene.Net.Analysis.El
 {
@@ -37,6 +38,8 @@ namespace Lucene.Net.Analysis.El
         private readonly ICharTermAttribute termAtt;
         private readonly CharacterUtils charUtils;
 
+        private static readonly CultureInfo culture = new CultureInfo("el"); 
// LUCENENET specific - use Greek culture when lowercasing.
+
         /// <summary>
         /// Create a <see cref="GreekLowerCaseFilter"/> that normalizes Greek 
token text.
         /// </summary>
@@ -127,7 +130,7 @@ namespace Lucene.Net.Analysis.El
                     return '\u03C2'; // small final sigma
 
                 default:
-                    return Character.ToLower(codepoint);
+                    return Character.ToLower(codepoint, culture); // LUCENENET 
specific - need to use specific culture to override current thread
             }
         }
     }
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/En/PorterStemmer.cs 
b/src/Lucene.Net.Analysis.Common/Analysis/En/PorterStemmer.cs
index 9ac8ea1..7d81a3c 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/En/PorterStemmer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/En/PorterStemmer.cs
@@ -841,9 +841,9 @@ namespace Lucene.Net.Analysis.En
         //                            ch = buffer[offset++];
         //                    }
 
-        //                    if (char.IsLetter((char)ch))
+        //                    if (Character.IsLetter(ch))
         //                    {
-        //                        s.Add(char.ToLowerInvariant((char)ch));
+        //                        s.Add(Character.ToLower(ch, 
CultureInfo.InvariantCulture));
         //                    }
         //                    else
         //                    {
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ga/IrishLowerCaseFilter.cs 
b/src/Lucene.Net.Analysis.Common/Analysis/Ga/IrishLowerCaseFilter.cs
index 7caa55b..7299804 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Ga/IrishLowerCaseFilter.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Ga/IrishLowerCaseFilter.cs
@@ -1,5 +1,6 @@
 using J2N;
 using Lucene.Net.Analysis.TokenAttributes;
+using System.Globalization;
 
 namespace Lucene.Net.Analysis.Ga
 {
@@ -28,6 +29,8 @@ namespace Lucene.Net.Analysis.Ga
     {
         private readonly ICharTermAttribute termAtt;
 
+        private static readonly CultureInfo culture = new CultureInfo("ga"); 
// LUCENENET specific - use Irish culture when lowercasing.
+
         /// <summary>
         /// Create an <see cref="IrishLowerCaseFilter"/> that normalises Irish 
token text.
         /// </summary>
@@ -60,7 +63,7 @@ namespace Lucene.Net.Analysis.Ga
 
                 for (int i = idx; i < chLen;)
                 {
-                    i += Character.ToChars(Character.ToLower(chArray[i]), 
chArray, i);
+                    i += Character.ToChars(Character.ToLower(chArray[i], 
culture), chArray, i); // LUCENENET specific - use Irish culture when 
lowercasing
                 }
                 return true;
             }
diff --git 
a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/WordDelimiterIterator.cs
 
b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/WordDelimiterIterator.cs
index 012ab7d..0a0a3ef 100644
--- 
a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/WordDelimiterIterator.cs
+++ 
b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/WordDelimiterIterator.cs
@@ -1,4 +1,5 @@
-using System.Globalization;
+using J2N;
+using System.Globalization;
 
 namespace Lucene.Net.Analysis.Miscellaneous
 {
@@ -84,17 +85,17 @@ namespace Lucene.Net.Analysis.Miscellaneous
             for (int i = 0; i < 256; i++)
             {
                 byte code = 0;
-                if (char.IsLower((char)i))
+                if (Character.IsLower(i))
                 {
-                    code |= (byte)WordDelimiterFilter.LOWER;
+                    code |= WordDelimiterFilter.LOWER;
                 }
-                else if (char.IsUpper((char)i))
+                else if (Character.IsUpper(i))
                 {
-                    code |= (byte)WordDelimiterFilter.UPPER;
+                    code |= WordDelimiterFilter.UPPER;
                 }
-                else if (char.IsDigit((char)i))
+                else if (Character.IsDigit(i))
                 {
-                    code |= (byte)WordDelimiterFilter.DIGIT;
+                    code |= WordDelimiterFilter.DIGIT;
                 }
                 if (code == 0)
                 {
@@ -318,7 +319,7 @@ namespace Lucene.Net.Analysis.Miscellaneous
         /// <returns> Type of the character </returns>
         public static byte GetType(int ch)
         {
-            switch (CharUnicodeInfo.GetUnicodeCategory((char)ch))
+            switch (Character.GetType(ch))
             {
                 case UnicodeCategory.UppercaseLetter:
                     return WordDelimiterFilter.UPPER;
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs 
b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
index f7d4097..6d045ad 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
@@ -95,7 +95,7 @@ namespace Lucene.Net.Analysis.Th
 
             // find the next set of boundaries, skipping over non-tokens
             int end = wordBreaker.Next();
-            while (end != BreakIterator.Done && 
!char.IsLetterOrDigit((char)Character.CodePointAt(m_buffer, sentenceStart + 
start, sentenceEnd)))
+            while (end != BreakIterator.Done && 
!Character.IsLetterOrDigit(Character.CodePointAt(m_buffer, sentenceStart + 
start, sentenceEnd)))
             {
                 start = end;
                 end = wordBreaker.Next();
diff --git 
a/src/Lucene.Net.Analysis.Common/Analysis/Tr/TurkishLowerCaseFilter.cs 
b/src/Lucene.Net.Analysis.Common/Analysis/Tr/TurkishLowerCaseFilter.cs
index 0cd0464..c771754 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Tr/TurkishLowerCaseFilter.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Tr/TurkishLowerCaseFilter.cs
@@ -1,5 +1,4 @@
 using J2N;
-using J2N.Globalization;
 using Lucene.Net.Analysis.TokenAttributes;
 using System;
 using System.Globalization;
@@ -35,12 +34,13 @@ namespace Lucene.Net.Analysis.Tr
     public sealed class TurkishLowerCaseFilter : TokenFilter
     {
         private const int LATIN_CAPITAL_LETTER_I = '\u0049';
-        private const int LATIN_CAPITAL_LETTER_DOTTED_I = '\u0130';
         private const int LATIN_SMALL_LETTER_I = '\u0069';
         private const int LATIN_SMALL_LETTER_DOTLESS_I = '\u0131';
         private const int COMBINING_DOT_ABOVE = '\u0307';
         private readonly ICharTermAttribute termAtt;
 
+        private static readonly CultureInfo culture = new CultureInfo("tr"); 
// LUCENENET specific - we need to do a culture-sensitive lowercase operation 
in Turkish
+
         /// <summary>
         /// Create a new <see cref="TurkishLowerCaseFilter"/>, that normalizes 
Turkish token text 
         /// to lower case.
@@ -64,7 +64,7 @@ namespace Lucene.Net.Analysis.Tr
                 {
                     int ch = Character.CodePointAt(buffer, i, length);
 
-                    iOrAfter = (ch == LATIN_CAPITAL_LETTER_I || (iOrAfter && 
CharUnicodeInfo.GetUnicodeCategory((char)ch) == 
UnicodeCategory.NonSpacingMark));
+                    iOrAfter = (ch == LATIN_CAPITAL_LETTER_I || (iOrAfter && 
Character.GetType(ch) == UnicodeCategory.NonSpacingMark));
 
                     if (iOrAfter) // all the special I turkish handling 
happens here.
                     {
@@ -93,32 +93,8 @@ namespace Lucene.Net.Analysis.Tr
                         }
                     }
 
-                    using (var culture = new CultureContext("tr"))
-                    {
-                        switch (ch)
-                        {
-                            // LUCENENET: The .NET char.ToLower() function 
works correctly in 
-                            // Turkish as long as the current thread is set to 
tr-TR (well, technically the 
-                            // culture change is only required for the 
LATIN_CAPITAL_LETTER_I case). .NET does 
-                            // not split these characters into separate 
letter/non-spacing mark characters,
-                            // but the user might still input them that way so 
we still need the above
-                            // block to handle that case.
-                            //
-                            // LUCENENET TODO: Oddly, the 
Character.ToLowerCase() function below does not work right
-                            // for Turkish. Which begs the question, should 
this special case be there so Turkish works
-                            // everywhere? Or should we leave it a special 
case here because that is the way it works in Java?
-                            //
-                            // References:
-                            // 
http://haacked.com/archive/2012/07/05/turkish-i-problem-and-why-you-should-care.aspx/
-                            // http://www.i18nguy.com/unicode/turkish-i18n.html
-                            case LATIN_CAPITAL_LETTER_I:
-                            case LATIN_CAPITAL_LETTER_DOTTED_I:
-                                i += Character.ToChars(char.ToLower((char)ch), 
buffer, i);
-                                continue;
-                        }
-                    }
-
-                    i += Character.ToChars(Character.ToLower(ch), buffer, i);
+                    // LUCENENET specific - need to pass Turkish culture to 
get the correct lowercase results
+                    i += Character.ToChars(Character.ToLower(ch, culture), 
buffer, i);
                 }
 
                 termAtt.Length = length;
@@ -139,8 +115,7 @@ namespace Lucene.Net.Analysis.Tr
             for (int i = pos; i < len;)
             {
                 int ch = Character.CodePointAt(s, i, len);
-                //if (char.getType(ch) != char.NON_SPACING_MARK)
-                if (CharUnicodeInfo.GetUnicodeCategory((char)ch) != 
UnicodeCategory.NonSpacingMark)
+                if (Character.GetType(ch) != UnicodeCategory.NonSpacingMark)
                 {
                     return false;
                 }
@@ -161,9 +136,7 @@ namespace Lucene.Net.Analysis.Tr
         private int Delete(char[] s, int pos, int len)
         {
             if (pos < len)
-            {
                 Array.Copy(s, pos + 1, s, pos, len - pos - 1);
-            }
 
             return len - 1;
         }
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Util/CharacterUtils.cs 
b/src/Lucene.Net.Analysis.Common/Analysis/Util/CharacterUtils.cs
index 67a4438..aa22177 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Util/CharacterUtils.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Util/CharacterUtils.cs
@@ -168,7 +168,7 @@ namespace Lucene.Net.Analysis.Util
 
 
         /// <summary>
-        /// Converts each unicode codepoint to lowerCase via <see 
cref="Character.ToLower(int)"/> starting 
+        /// Converts each unicode codepoint to lowerCase via <see 
cref="TextInfo.ToLower(string)"/> in the invariant culture starting 
         /// at the given offset. </summary>
         /// <param name="buffer"> the char buffer to lowercase </param>
         /// <param name="offset"> the offset to start at </param>
@@ -199,7 +199,7 @@ namespace Lucene.Net.Analysis.Util
         }
 
         /// <summary>
-        /// Converts each unicode codepoint to UpperCase via <see 
cref="Character.ToUpper(int)"/> starting 
+        /// Converts each unicode codepoint to UpperCase via <see 
cref="TextInfo.ToUpper(string)"/> in the invariant culture starting 
         /// at the given offset. </summary>
         /// <param name="buffer"> the char buffer to UPPERCASE </param>
         /// <param name="offset"> the offset to start at </param>
diff --git a/src/Lucene.Net.Analysis.Morfologik/Morfologik/MorfologikFilter.cs 
b/src/Lucene.Net.Analysis.Morfologik/Morfologik/MorfologikFilter.cs
index 238c88d..96071c1 100644
--- a/src/Lucene.Net.Analysis.Morfologik/Morfologik/MorfologikFilter.cs
+++ b/src/Lucene.Net.Analysis.Morfologik/Morfologik/MorfologikFilter.cs
@@ -8,6 +8,7 @@ using Morfologik.Stemming;
 using Morfologik.Stemming.Polish;
 using System;
 using System.Collections.Generic;
+using System.Globalization;
 using System.Text;
 using System.Text.RegularExpressions;
 
@@ -55,6 +56,8 @@ namespace Lucene.Net.Analysis.Morfologik
 
         private int lemmaListIndex;
 
+        private static readonly CultureInfo culture = new CultureInfo("pl"); 
// LUCENENET specific - do lowercasing in Polish culture
+
         /// <summary>
         /// Creates a filter with the default (Polish) dictionary.
         /// </summary>
@@ -166,7 +169,7 @@ namespace Lucene.Net.Analysis.Morfologik
             for (int i = 0; i < length;)
             {
                 i += Character.ToChars(
-                    Character.ToLower(Character.CodePointAt(chs, i)), buffer, 
i);
+                    Character.ToLower(Character.CodePointAt(chs, i), culture), 
buffer, i); // LUCENENET specific - need to use explicit culture to override 
current thread
             }
 
             return scratch.ToString();
diff --git 
a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/TestStemmerOverrideFilter.cs
 
b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/TestStemmerOverrideFilter.cs
index dbf1573..df2463a 100644
--- 
a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/TestStemmerOverrideFilter.cs
+++ 
b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/TestStemmerOverrideFilter.cs
@@ -79,7 +79,7 @@ namespace Lucene.Net.Analysis.Miscellaneous
                 for (int j = 0; j < charArray.Length;)
                 {
                     int cp = Character.CodePointAt(charArray, j, 
charArray.Length);
-                    if (!char.IsWhiteSpace((char)cp))
+                    if (!Character.IsWhiteSpace(cp))
                     {
                         sb.AppendCodePoint(cp);
                     }
diff --git 
a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Payloads/TypeAsPayloadTokenFilterTest.cs
 
b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Payloads/TypeAsPayloadTokenFilterTest.cs
index 607d82c..223f04f 100644
--- 
a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Payloads/TypeAsPayloadTokenFilterTest.cs
+++ 
b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Payloads/TypeAsPayloadTokenFilterTest.cs
@@ -1,6 +1,7 @@
 using Lucene.Net.Analysis.TokenAttributes;
 using NUnit.Framework;
 using System;
+using System.Globalization;
 using System.IO;
 
 namespace Lucene.Net.Analysis.Payloads
@@ -39,7 +40,7 @@ namespace Lucene.Net.Analysis.Payloads
             nptf.Reset();
             while (nptf.IncrementToken())
             {
-                assertTrue(typeAtt.Type + " is not null and it should be", 
typeAtt.Type.Equals(char.ToUpper(termAtt.Buffer[0]).ToString(), 
StringComparison.Ordinal));
+                assertTrue(typeAtt.Type + " is not null and it should be", 
typeAtt.Type.Equals(char.ToUpper(termAtt.Buffer[0]).ToString(), 
StringComparison.Ordinal)); // LUCENENET specific - intentionally using current 
culture
                 assertTrue("nextToken.getPayload() is null and it shouldn't 
be", payloadAtt.Payload != null);
                 string type = payloadAtt.Payload.Utf8ToString();
                 assertTrue(type + " is not equal to " + typeAtt.Type, 
type.Equals(typeAtt.Type, StringComparison.Ordinal));
@@ -67,7 +68,7 @@ namespace Lucene.Net.Analysis.Payloads
             {
                 if (m_input.IncrementToken())
                 {
-                    typeAtt.Type = char.ToUpper(termAtt.Buffer[0]).ToString();
+                    typeAtt.Type = char.ToUpper(termAtt.Buffer[0]).ToString(); 
// LUCENENET specific - intentionally using current culture
                     return true;
                 }
                 else
diff --git 
a/src/Lucene.Net.Tests.Highlighter/PostingsHighlight/TestPostingsHighlighterRanking.cs
 
b/src/Lucene.Net.Tests.Highlighter/PostingsHighlight/TestPostingsHighlighterRanking.cs
index 5cbf8af..654d2f9 100644
--- 
a/src/Lucene.Net.Tests.Highlighter/PostingsHighlight/TestPostingsHighlighterRanking.cs
+++ 
b/src/Lucene.Net.Tests.Highlighter/PostingsHighlight/TestPostingsHighlighterRanking.cs
@@ -213,7 +213,7 @@ namespace Lucene.Net.Search.PostingsHighlight
                         assertEquals(matchStart + 1, matchEnd);
                         // and the offsets must be correct...
                         assertEquals(1, term.Length);
-                        assertEquals((char)term.Bytes[term.Offset], 
Character.ToLower(content[matchStart]));
+                        assertEquals((char)term.Bytes[term.Offset], 
Character.ToLower(content[matchStart], CultureInfo.InvariantCulture));
                     }
                     // record just the start/end offset for simplicity
                     seen.Add(new Pair(p.StartOffset, p.EndOffset));
diff --git a/src/Lucene.Net.Tests/Support/TestWeakDictionaryBehavior.cs 
b/src/Lucene.Net.Tests/Support/TestWeakDictionaryBehavior.cs
index e1cc76d..40dbfba 100644
--- a/src/Lucene.Net.Tests/Support/TestWeakDictionaryBehavior.cs
+++ b/src/Lucene.Net.Tests/Support/TestWeakDictionaryBehavior.cs
@@ -110,7 +110,7 @@ namespace Lucene.Net.Support
         public void Test_Dictionary_AddReplace()
         {
             string key = "A";
-            string key2 = "a".ToUpper();
+            string key2 = "a".ToUpperInvariant();
 
             dictionary.Add(key, "value");
             dictionary[key2] = "value2";
diff --git 
a/src/dotnet/Lucene.Net.Tests.ICU/Search/PostingsHighlight/TestICUPostingsHighlighterRanking.cs
 
b/src/dotnet/Lucene.Net.Tests.ICU/Search/PostingsHighlight/TestICUPostingsHighlighterRanking.cs
index 0a1731b..5ac7eb4 100644
--- 
a/src/dotnet/Lucene.Net.Tests.ICU/Search/PostingsHighlight/TestICUPostingsHighlighterRanking.cs
+++ 
b/src/dotnet/Lucene.Net.Tests.ICU/Search/PostingsHighlight/TestICUPostingsHighlighterRanking.cs
@@ -211,7 +211,7 @@ namespace Lucene.Net.Search.PostingsHighlight
                         assertEquals(matchStart + 1, matchEnd);
                         // and the offsets must be correct...
                         assertEquals(1, term.Length);
-                        assertEquals((char)term.Bytes[term.Offset], 
Character.ToLower(content[matchStart]));
+                        assertEquals((char)term.Bytes[term.Offset], 
Character.ToLower(content[matchStart], CultureInfo.InvariantCulture)); // 
LUCENENET specific - need to use invariant culture to match Java
                     }
                     // record just the start/end offset for simplicity
                     seen.Add(new Pair(p.StartOffset, p.EndOffset));

Reply via email to