This is an automated email from the ASF dual-hosted git repository. nightowl888 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/lucenenet.git
commit 3abf2dbfefa23fa97269ee8695a52c387ffcc353 Author: Shad Storhaug <[email protected]> AuthorDate: Wed Oct 26 17:51:24 2022 +0700 PERFORMANCE: Lucene.Net.Analysis.Miscellaneous.StemmerOverrideFilter: Added overloads to Add for ICharSequence and char[] to reduce allocations. Added guard clauses. --- .../Miscellaneous/StemmerOverrideFilter.cs | 116 +++++++++++++++- .../Analysis/Nl/DutchAnalyzer.cs | 2 +- .../Miscellaneous/TestStemmerOverrideFilter.cs | 27 ++++ src/Lucene.Net/Lucene.Net.csproj | 4 +- src/Lucene.Net/Util/UnicodeUtil.cs | 149 ++++++++++++++++++++- 5 files changed, 284 insertions(+), 14 deletions(-) diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilter.cs index b14af8ecb..eb8772c48 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilter.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/StemmerOverrideFilter.cs @@ -1,8 +1,10 @@ // Lucene version compatibility level 4.8.1 using J2N; +using J2N.Text; using Lucene.Net.Analysis.TokenAttributes; using Lucene.Net.Util; using Lucene.Net.Util.Fst; +using System; using System.Collections.Generic; using System.Globalization; using System.IO; @@ -185,23 +187,101 @@ namespace Lucene.Net.Analysis.Miscellaneous /// <param name="input"> the input char sequence </param> /// <param name="output"> the stemmer override output char sequence </param> /// <returns> <c>false</c> if the input has already been added to this builder otherwise <c>true</c>. </returns> + /// <exception cref="ArgumentNullException"><paramref name="input"/> or <paramref name="output"/> is <c>null</c>.</exception> + // LUCENENET specific overload of ICharSequence public virtual bool Add(string input, string output) { + // LUCENENET: Added guard clauses + if (input is null) + throw new ArgumentNullException(nameof(input)); + if (output is null) + throw new ArgumentNullException(nameof(output)); + + int length = input.Length; + if (ignoreCase) + { + // convert on the fly to lowercase + + // LUCENENET: Reduce allocations/improve throughput by using stack and spans + var source = input.AsSpan(); + if (length * sizeof(char) <= Constants.MaxStackByteLimit) + { + // Fast path - use the stack + Span<char> buffer = stackalloc char[length]; + source.ToLowerInvariant(buffer); + + UnicodeUtil.UTF16toUTF8(buffer, spare); + } + else + { + // Slow path - use the heap + charsSpare.Grow(length); + char[] buffer = charsSpare.Chars; + + var destination = buffer.AsSpan(0, length); + source.ToLowerInvariant(destination); + + UnicodeUtil.UTF16toUTF8(buffer, 0, length, spare); + } + } + else + { + UnicodeUtil.UTF16toUTF8(input, 0, length, spare); + } + if (hash.Add(spare) >= 0) + { + outputValues.Add(output); + return true; + } + return false; + } + + /// <summary> + /// Adds an input string and it's stemmer override output to this builder. + /// </summary> + /// <param name="input"> the input char sequence </param> + /// <param name="output"> the stemmer override output char sequence </param> + /// <returns> <c>false</c> if the input has already been added to this builder otherwise <c>true</c>. </returns> + /// <exception cref="ArgumentNullException"><paramref name="input"/> or <paramref name="output"/> is <c>null</c>.</exception> + // LUCENENET specific overload of ICharSequence + public virtual bool Add(char[] input, string output) + { + // LUCENENET: Added guard clauses + if (input is null) + throw new ArgumentNullException(nameof(input)); + if (output is null) + throw new ArgumentNullException(nameof(output)); + int length = input.Length; if (ignoreCase) { // convert on the fly to lowercase - charsSpare.Grow(length); - char[] buffer = charsSpare.Chars; - for (int i = 0; i < length;) + + // LUCENENET: Reduce allocations/improve throughput by using stack and spans + var source = new ReadOnlySpan<char>(input); + if (length * sizeof(char) <= Constants.MaxStackByteLimit) { - i += Character.ToChars(Character.ToLower(Character.CodePointAt(input, i), CultureInfo.InvariantCulture), buffer, i); + // Fast path - use the stack + Span<char> buffer = stackalloc char[length]; + source.ToLowerInvariant(buffer); + + UnicodeUtil.UTF16toUTF8(buffer, spare); + } + else + { + // Slow path - use the heap + charsSpare.Grow(length); + char[] buffer = charsSpare.Chars; + + var destination = buffer.AsSpan(0, length); + source.ToLowerInvariant(destination); + + UnicodeUtil.UTF16toUTF8(buffer, 0, length, spare); } - UnicodeUtil.UTF16toUTF8(buffer, 0, length, spare); } else { - UnicodeUtil.UTF16toUTF8(input.ToCharArray(), 0, length, spare); + UnicodeUtil.UTF16toUTF8(input, 0, length, spare); } if (hash.Add(spare) >= 0) { @@ -211,6 +291,30 @@ namespace Lucene.Net.Analysis.Miscellaneous return false; } + /// <summary> + /// Adds an input string and it's stemmer override output to this builder. + /// </summary> + /// <param name="input"> the input char sequence </param> + /// <param name="output"> the stemmer override output char sequence </param> + /// <returns> <c>false</c> if the input has already been added to this builder otherwise <c>true</c>. </returns> + /// <exception cref="ArgumentNullException"><paramref name="input"/> or <paramref name="output"/> is <c>null</c>.</exception> + // LUCENENET specific overload of ICharSequence + public virtual bool Add(ICharSequence input, string output) + { + // LUCENENET: Added guard clauses + if (input is null) + throw new ArgumentNullException(nameof(input)); + if (output is null) + throw new ArgumentNullException(nameof(output)); + + if (input is CharArrayCharSequence charArrayCharSequence && charArrayCharSequence.HasValue) + return Add(charArrayCharSequence.Value, output); + + // LUCENENET: In .NET, the indexer for StringBuilder is slow, so we are better off + // converting to a string in all other cases. + return Add(input.ToString(), output); + } + /// <summary> /// Returns a <see cref="StemmerOverrideMap"/> to be used with the <see cref="StemmerOverrideFilter"/> </summary> /// <returns> a <see cref="StemmerOverrideMap"/> to be used with the <see cref="StemmerOverrideFilter"/> </returns> diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Nl/DutchAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Nl/DutchAnalyzer.cs index 6200e49ba..08579e941 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Nl/DutchAnalyzer.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Nl/DutchAnalyzer.cs @@ -176,7 +176,7 @@ namespace Lucene.Net.Analysis.Nl { char[] nextKey = iter.NextKey(); spare.CopyChars(nextKey, 0, nextKey.Length); - builder.Add(new string(spare.Chars), iter.CurrentValue); + builder.Add(spare.Chars, iter.CurrentValue); } } try diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/TestStemmerOverrideFilter.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/TestStemmerOverrideFilter.cs index fe045a64d..699d86fa8 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/TestStemmerOverrideFilter.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/Miscellaneous/TestStemmerOverrideFilter.cs @@ -4,6 +4,7 @@ using J2N.Collections.Generic.Extensions; using J2N.Text; using Lucene.Net.Analysis.Core; using Lucene.Net.Analysis.En; +using Lucene.Net.Attributes; using Lucene.Net.Util; using NUnit.Framework; using System.Collections.Generic; @@ -59,6 +60,32 @@ namespace Lucene.Net.Analysis.Miscellaneous AssertTokenStreamContents(stream, new string[] { "books" }); } + [Test, LuceneNetSpecific] + public virtual void TestIgnoreCase_CharArray() + { + // lets make booked stem to books + // the override filter will convert "booked" to "books", + // but also mark it with KeywordAttribute so Porter will not change it. + StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(true); + builder.Add("boOkEd".ToCharArray(), "books"); + Tokenizer tokenizer = new KeywordTokenizer(new StringReader("BooKeD")); + TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.Build())); + AssertTokenStreamContents(stream, new string[] { "books" }); + } + + [Test, LuceneNetSpecific] + public virtual void TestIgnoreCase_CharSequence() + { + // lets make booked stem to books + // the override filter will convert "booked" to "books", + // but also mark it with KeywordAttribute so Porter will not change it. + StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(true); + builder.Add("boOkEd".AsCharSequence(), "books"); + Tokenizer tokenizer = new KeywordTokenizer(new StringReader("BooKeD")); + TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.Build())); + AssertTokenStreamContents(stream, new string[] { "books" }); + } + [Test] public virtual void TestNoOverrides() { diff --git a/src/Lucene.Net/Lucene.Net.csproj b/src/Lucene.Net/Lucene.Net.csproj index 3e70f21c6..5c7e3802a 100644 --- a/src/Lucene.Net/Lucene.Net.csproj +++ b/src/Lucene.Net/Lucene.Net.csproj @@ -66,13 +66,15 @@ <ItemGroup Condition=" '$(TargetFramework)' == 'netstandard2.0' "> <PackageReference Include="Microsoft.Extensions.Configuration.Abstractions" Version="$(MicrosoftExtensionsConfigurationAbstractionsPackageVersion)" /> <PackageReference Include="Prism.Core" Version="$(PrismCorePackageVersion)" /> + <PackageReference Include="System.Memory" Version="$(SystemMemoryPackageVersion)" /> </ItemGroup> <ItemGroup Condition=" '$(TargetFramework)' == 'net462' "> <PackageReference Include="Microsoft.Extensions.Configuration.Abstractions" Version="$(MicrosoftExtensionsConfigurationAbstractionsPackageVersion)" /> <PackageReference Include="Prism.Core" Version="$(PrismCorePackageVersion)" /> + <PackageReference Include="System.Memory" Version="$(SystemMemoryPackageVersion)" /> </ItemGroup> - + <ItemGroup> <InternalsVisibleTo Include="Lucene.Net.Analysis.Common" /> <InternalsVisibleTo Include="Lucene.Net.Analysis.Kuromoji" /> diff --git a/src/Lucene.Net/Util/UnicodeUtil.cs b/src/Lucene.Net/Util/UnicodeUtil.cs index 20dd3ea95..b6a6b8d30 100644 --- a/src/Lucene.Net/Util/UnicodeUtil.cs +++ b/src/Lucene.Net/Util/UnicodeUtil.cs @@ -121,13 +121,110 @@ namespace Lucene.Net.Util private const int SURROGATE_OFFSET = Character.MinSupplementaryCodePoint - (UNI_SUR_HIGH_START << (int)HALF_SHIFT) - UNI_SUR_LOW_START; + /// <summary> + /// Encode characters from a <see cref="T:char[]"/> <paramref name="source"/>, starting at + /// and ending at <paramref name="result"/>. After encoding, <c>result.Offset</c> will always be 0. + /// </summary> + /// <exception cref="ArgumentNullException"><paramref name="result"/> is <c>null</c>.</exception> + // TODO: broken if incoming result.offset != 0 + // LUCENENET specific overload + public static void UTF16toUTF8(Span<char> source, BytesRef result) + { + // LUCENENET: Added guard clause + if (result is null) + throw new ArgumentNullException(nameof(result)); + + int length = source.Length; + + int upto = 0; + int i = 0; + int end = source.Length; + var @out = result.Bytes; + + // Pre-allocate for worst case 4-for-1 + int maxLen = length * 4; + if (@out.Length < maxLen) + { + @out = result.Bytes = new byte[maxLen]; + } + result.Offset = 0; + + while (i < end) + { + int code = (int)source[i++]; + + if (code < 0x80) + { + @out[upto++] = (byte)code; + } + else if (code < 0x800) + { + @out[upto++] = (byte)(0xC0 | (code >> 6)); + @out[upto++] = (byte)(0x80 | (code & 0x3F)); + } + else if (code < 0xD800 || code > 0xDFFF) + { + @out[upto++] = (byte)(0xE0 | (code >> 12)); + @out[upto++] = (byte)(0x80 | ((code >> 6) & 0x3F)); + @out[upto++] = (byte)(0x80 | (code & 0x3F)); + } + else + { + // surrogate pair + // confirm valid high surrogate + if (code < 0xDC00 && i < end) + { + var utf32 = (int)source[i]; + // confirm valid low surrogate and write pair + if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) + { + utf32 = (code << 10) + utf32 + SURROGATE_OFFSET; + i++; + @out[upto++] = (byte)(0xF0 | (utf32 >> 18)); + @out[upto++] = (byte)(0x80 | ((utf32 >> 12) & 0x3F)); + @out[upto++] = (byte)(0x80 | ((utf32 >> 6) & 0x3F)); + @out[upto++] = (byte)(0x80 | (utf32 & 0x3F)); + continue; + } + } + // replace unpaired surrogate or out-of-order low surrogate + // with substitution character + @out[upto++] = 0xEF; + @out[upto++] = 0xBF; + @out[upto++] = 0xBD; + } + } + //assert matches(source, offset, length, out, upto); + result.Length = upto; + } + /// <summary> /// Encode characters from a <see cref="T:char[]"/> <paramref name="source"/>, starting at /// <paramref name="offset"/> for <paramref name="length"/> chars. After encoding, <c>result.Offset</c> will always be 0. /// </summary> + /// <exception cref="ArgumentNullException"><paramref name="source"/> or <paramref name="result"/> is <c>null</c>.</exception> + /// <exception cref="ArgumentOutOfRangeException"> + /// <paramref name="offset"/> or <paramref name="length"/> is less than zero. + /// <para/> + /// -or- + /// <para/> + /// <paramref name="offset"/> and <paramref name="length"/> refer to a location outside of <paramref name="source"/>. + /// </exception> // TODO: broken if incoming result.offset != 0 public static void UTF16toUTF8(char[] source, int offset, int length, BytesRef result) { + // LUCENENET: Added guard clauses + if (source is null) + throw new ArgumentNullException(nameof(source)); + if (result is null) + throw new ArgumentNullException(nameof(result)); + if (offset < 0) + throw new ArgumentOutOfRangeException(nameof(offset), $"{nameof(offset)} must not be negative."); + if (length < 0) + throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative."); + if (offset > source.Length - length) // Checks for int overflow + throw new ArgumentOutOfRangeException(nameof(length), $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}."); + int upto = 0; int i = offset; int end = offset + length; @@ -193,9 +290,29 @@ namespace Lucene.Net.Util /// Encode characters from this <see cref="ICharSequence"/>, starting at <paramref name="offset"/> /// for <paramref name="length"/> characters. After encoding, <c>result.Offset</c> will always be 0. /// </summary> + /// <exception cref="ArgumentNullException"><paramref name="source"/> or <paramref name="result"/> is <c>null</c>.</exception> + /// <exception cref="ArgumentOutOfRangeException"> + /// <paramref name="offset"/> or <paramref name="length"/> is less than zero. + /// <para/> + /// -or- + /// <para/> + /// <paramref name="offset"/> and <paramref name="length"/> refer to a location outside of <paramref name="source"/>. + /// </exception> // TODO: broken if incoming result.offset != 0 - public static void UTF16toUTF8(ICharSequence s, int offset, int length, BytesRef result) + public static void UTF16toUTF8(ICharSequence source, int offset, int length, BytesRef result) { + // LUCENENET: Added guard clauses + if (source is null) + throw new ArgumentNullException(nameof(source)); + if (result is null) + throw new ArgumentNullException(nameof(result)); + if (offset < 0) + throw new ArgumentOutOfRangeException(nameof(offset), $"{nameof(offset)} must not be negative."); + if (length < 0) + throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative."); + if (offset > source.Length - length) // Checks for int overflow + throw new ArgumentOutOfRangeException(nameof(length), $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}."); + int end = offset + length; var @out = result.Bytes; @@ -210,7 +327,7 @@ namespace Lucene.Net.Util int upto = 0; for (int i = offset; i < end; i++) { - var code = (int)s[i]; + var code = (int)source[i]; if (code < 0x80) { @out[upto++] = (byte)code; @@ -232,7 +349,7 @@ namespace Lucene.Net.Util // confirm valid high surrogate if (code < 0xDC00 && (i < end - 1)) { - int utf32 = (int)s[i + 1]; + int utf32 = (int)source[i + 1]; // confirm valid low surrogate and write pair if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) { @@ -262,9 +379,29 @@ namespace Lucene.Net.Util /// <para/> /// LUCENENET specific. /// </summary> + /// <exception cref="ArgumentNullException"><paramref name="source"/> or <paramref name="result"/> is <c>null</c>.</exception> + /// <exception cref="ArgumentOutOfRangeException"> + /// <paramref name="offset"/> or <paramref name="length"/> is less than zero. + /// <para/> + /// -or- + /// <para/> + /// <paramref name="offset"/> and <paramref name="length"/> refer to a location outside of <paramref name="source"/>. + /// </exception> // TODO: broken if incoming result.offset != 0 - public static void UTF16toUTF8(string s, int offset, int length, BytesRef result) + public static void UTF16toUTF8(string source, int offset, int length, BytesRef result) { + // LUCENENET: Added guard clauses + if (source is null) + throw new ArgumentNullException(nameof(source)); + if (result is null) + throw new ArgumentNullException(nameof(result)); + if (offset < 0) + throw new ArgumentOutOfRangeException(nameof(offset), $"{nameof(offset)} must not be negative."); + if (length < 0) + throw new ArgumentOutOfRangeException(nameof(length), $"{nameof(length)} must not be negative."); + if (offset > source.Length - length) // Checks for int overflow + throw new ArgumentOutOfRangeException(nameof(length), $"Index and length must refer to a location within the string. For example {nameof(offset)} + {nameof(length)} <= source.{nameof(source.Length)}."); + int end = offset + length; var @out = result.Bytes; @@ -279,7 +416,7 @@ namespace Lucene.Net.Util int upto = 0; for (int i = offset; i < end; i++) { - var code = (int)s[i]; + var code = (int)source[i]; if (code < 0x80) { @out[upto++] = (byte)code; @@ -301,7 +438,7 @@ namespace Lucene.Net.Util // confirm valid high surrogate if (code < 0xDC00 && (i < end - 1)) { - int utf32 = (int)s[i + 1]; + int utf32 = (int)source[i + 1]; // confirm valid low surrogate and write pair if (utf32 >= 0xDC00 && utf32 <= 0xDFFF) {
