This is an automated email from the ASF dual-hosted git repository.

nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git

commit 9821215f7bbc904d66a985f4edc23b6d118fb4f3
Author: Shad Storhaug <[email protected]>
AuthorDate: Wed Jul 22 18:08:20 2020 +0700

    Lucene.Net.Analysis.Common: Removed cast from NGramTokenizerAnonymousInnerClassHelper.IsTokenChar(int) that was causing surrogate pairs to fail in the TestUTF8FullRange() tests of NGramTokenizerTest and EdgeNGramTokenizerTest (see #269)
---
 .../Analysis/NGram/NGramTokenizerTest.cs | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/NGramTokenizerTest.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/NGramTokenizerTest.cs
index 2841ecb..e55e26b 100644
--- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/NGramTokenizerTest.cs
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/NGramTokenizerTest.cs
@@ -1,4 +1,5 @@
 using J2N;
+using J2N.Text;
 using Lucene.Net.Analysis.TokenAttributes;
 using Lucene.Net.Support;
 using Lucene.Net.Util;
@@ -165,11 +166,22 @@ namespace Lucene.Net.Analysis.NGram
             return codePoints;
         }
 
+        internal static int[] toCodePoints(ICharSequence s)
+        {
+            int[] codePoints = new int[Character.CodePointCount(s, 0, s.Length)];
+            for (int i = 0, j = 0; i < s.Length; ++j)
+            {
+                codePoints[j] = Character.CodePointAt(s, i);
+                i += Character.CharCount(codePoints[j]);
+            }
+            return codePoints;
+        }
+
         internal static bool isTokenChar(string nonTokenChars, int codePoint)
         {
             for (int i = 0; i < nonTokenChars.Length;)
             {
-                int cp = char.ConvertToUtf32(nonTokenChars, i);
+                int cp = nonTokenChars.CodePointAt(i);
                 if (cp == codePoint)
                 {
                     return false;
@@ -211,8 +223,7 @@ namespace Lucene.Net.Analysis.NGram
                         }
                     }
                     assertTrue(grams.IncrementToken());
-
-                    assertArrayEquals(Arrays.CopyOfRange(codePoints, start, end), toCodePoints(termAtt.ToString()));
+                    assertArrayEquals(Arrays.CopyOfRange(codePoints, start, end), toCodePoints(termAtt));
                     assertEquals(1, posIncAtt.PositionIncrement);
                     assertEquals(1, posLenAtt.PositionLength);
                     assertEquals(offsets[start], offsetAtt.StartOffset);
@@ -229,7 +240,7 @@ namespace Lucene.Net.Analysis.NGram
 
         private class NGramTokenizerAnonymousInnerClassHelper : NGramTokenizer
         {
-            private string nonTokenChars;
+            private readonly string nonTokenChars;
 
             public NGramTokenizerAnonymousInnerClassHelper(LuceneVersion TEST_VERSION_CURRENT, StringReader java, int minGram, int maxGram, bool edgesOnly, string nonTokenChars)
                 : base(TEST_VERSION_CURRENT, java, minGram, maxGram, edgesOnly)
@@ -239,7 +250,7 @@ namespace Lucene.Net.Analysis.NGram
 
             protected override bool IsTokenChar(int chr)
             {
-                return nonTokenChars.IndexOf((char)chr) < 0;
+                return nonTokenChars.IndexOf(chr) < 0;
             }
         }
 
