This is an automated email from the ASF dual-hosted git repository. nightowl888 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/lucenenet.git
commit 3fbee37edede7ac1f4c864bb24a0be3227ec2cca Author: Shad Storhaug <[email protected]> AuthorDate: Fri Oct 28 12:25:14 2022 +0700 PERFORMANCE: Lucene.Net.Analysis.Util.HTMLStripCharFilter: Refactored to remove YyText property (method) which allocates a string every time it is called. Instead, we pass the underlying array to J2N.Numerics.TryParse() and OpenStringBuilder.Append() with the calculated startIndex and length to directly copy the characters without allocating substrings. --- .../Analysis/CharFilter/HTMLStripCharFilter.cs | 216 ++++++++++++--------- 1 file changed, 125 insertions(+), 91 deletions(-) diff --git a/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/HTMLStripCharFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/HTMLStripCharFilter.cs index e1103fdc9..217a76807 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/HTMLStripCharFilter.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/HTMLStripCharFilter.cs @@ -31095,15 +31095,12 @@ namespace Lucene.Net.Analysis.CharFilters zzLexicalState = newState; } - - /// <summary> - /// Returns the text matched by the current regular expression. - /// </summary> - /// <returns>Returns the text matched by the current regular expression.</returns> - private string YyText() - { - return new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead); - } + // LUCENENET: Not used - refactored to read the array directly to avoid allocations + ///// <summary> + ///// Returns the text matched by the current regular expression. 
+ ///// </summary> + ///// <returns>Returns the text matched by the current regular expression.</returns> + //private string YyText => new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead); /// <summary> /// Returns the character at position <tt>pos</tt> from the @@ -31372,9 +31369,9 @@ namespace Lucene.Net.Analysis.CharFilters inputSegment.Write(zzBuffer, zzStartRead, matchLength); if (matchLength <= 7) { // 0x10FFFF = 1114111: max 7 decimal chars - // LUCENENET: Originally, we got the value of YyText(), which allocates..so we can eliminate the allocation - // by grabbing the values YyText() converts to a string: new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead); - if (!J2N.Numerics.Int32.TryParse(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead, radix: 10, out int codePoint)) + // LUCENENET: Originally, we got the value of YyText property, which allocates. We can eliminate the allocation + // by grabbing the values YyText converts to a string: new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead); + if (!Integer.TryParse(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead, radix: 10, out int codePoint)) { if (Debugging.AssertsEnabled) Debugging.Assert(false, "Exception parsing code point '{0}'", new CharArrayFormatter(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead)); } @@ -31627,7 +31624,9 @@ namespace Lucene.Net.Analysis.CharFilters inputSegment.Write(zzBuffer, zzStartRead, matchLength); if (matchLength <= 6) { // 10FFFF: max 6 hex chars - if (!J2N.Numerics.Int32.TryParse(zzBuffer, zzStartRead + 1, matchLength - 1, radix: 16, out int codePoint)) + // LUCENENET: Originally, we got the value of new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead), which allocates. + // We can eliminate the allocation by grabbing the values YyText converts to a string via index and length. 
+ if (!Integer.TryParse(zzBuffer, zzStartRead + 1, matchLength - 1, radix: 16, out int codePoint)) { if (Debugging.AssertsEnabled) Debugging.Assert(false, "Exception parsing hex code point '{0}'", new CharArrayFormatter(zzBuffer, zzStartRead + 1, matchLength - 1)); } @@ -31666,7 +31665,9 @@ namespace Lucene.Net.Analysis.CharFilters { if (inputSegment.Length > 2) { // Chars between "<!" and "--" - this is not a comment - inputSegment.Append(YyText()); + // LUCENENET: Originally, we got the value of YyText property, which allocates. We can eliminate the allocation + // by grabbing the values YyText converts to a string: new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead); + inputSegment.Append(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead); } else { @@ -31812,7 +31813,9 @@ namespace Lucene.Net.Analysis.CharFilters { if (inputSegment.Length > 2) { // Chars between "<!" and "[CDATA[" - this is not a CDATA section - inputSegment.Append(YyText()); + // LUCENENET: Originally, we got the value of YyText property, which allocates. We can eliminate the allocation + // by grabbing the values YyText converts to a string: new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead); + inputSegment.Append(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead); } else { @@ -31884,27 +31887,33 @@ namespace Lucene.Net.Analysis.CharFilters { // Handle paired UTF-16 surrogates. outputSegment = entitySegment; outputSegment.Clear(); - string surrogatePair = YyText(); - char highSurrogate = '\u0000'; - // LUCENENET: Optimized parse so we don't allocate a substring. - if (Integer.TryParse(surrogatePair, 2, 6 - 2, 16, out int highSurrogateInt32)) - { - highSurrogate = (char)highSurrogateInt32; - } - else // should never happen + //string surrogatePair = YyText; // LUCENENET: Refactored to use the underlying array directly instead of allocating substrings + int highSurrogate = '\u0000'; // LUCENENET: Use int to allow out parameters to use without casting. 
+ + // LUCENENET: Originally, we got the value of YyText property, which allocates. We can eliminate the allocation + // by grabbing the values YyText converts to a string: new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead); + int startIndex = zzStartRead + 2; + int length = 4; // (6 - 2) + + // High surrogates are in decimal range [55296, 56319] + if (!Integer.TryParse(zzBuffer, startIndex, length, radix: 16, out highSurrogate)) { - if (Debugging.AssertsEnabled) Debugging.Assert(false, "Exception parsing high surrogate '{0}'", surrogatePair.Substring(2, 6 - 2)); + // should never happen + if (Debugging.AssertsEnabled) Debugging.Assert(false, "Exception parsing high surrogate '{0}'", new CharArrayFormatter(zzBuffer, startIndex, length)); } - try + + // LUCENENET: Originally, we got the value of YyText property, which allocates. We can eliminate the allocation + // by grabbing the values YyText converts to a string: new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead); + startIndex = zzStartRead + 10; + length = 4; // (14 - 10) + + // Low surrogates are in decimal range [56320, 57343] + if (!Integer.TryParse(zzBuffer, startIndex, length, radix: 16, out int lowSurrogate)) { - // LUCENENET: Optimized parse so we don't allocate a substring - outputSegment.UnsafeWrite((char)Integer.Parse(surrogatePair, 10, 14 - 10, 16)); + // should never happen + if (Debugging.AssertsEnabled) Debugging.Assert(false, "Exception parsing low surrogate '{0}'", new CharArrayFormatter(zzBuffer, startIndex, length)); } - catch (Exception e) when (e.IsException()) - { // should never happen - if (Debugging.AssertsEnabled) Debugging.Assert(false, "Exception parsing low surrogate '{0}'", surrogatePair.Substring(10, 14 - 10)); - } - + outputSegment.UnsafeWrite((char)lowSurrogate); // add (previously matched input length) + (this match length) - (substitution length) cumulativeDiff += inputSegment.Length + YyLength - 2; // position the correction at (already output length) + 
(substitution length) @@ -31916,32 +31925,38 @@ namespace Lucene.Net.Analysis.CharFilters case 103: break; case 51: { // Handle paired UTF-16 surrogates. - string surrogatePair = YyText(); - char highSurrogate = '\u0000'; - char lowSurrogate = '\u0000'; - // LUCENENET: Optimized parse so we don't allocate a substring. - if (Integer.TryParse(surrogatePair, 2, 6 - 2, 16, out int highSurrogateInt32)) + // string surrogatePair = YyText; // LUCENENET: Refactored to use the underlying array directly instead of allocating substrings + int highSurrogate = '\u0000'; // LUCENENET: Use int to allow out parameters to use without casting. + int lowSurrogate = '\u0000'; // LUCENENET: Use int to allow out parameters to use without casting. + + // LUCENENET: Originally, we got the value of YyText property, which allocates. We can eliminate the allocation + // by grabbing the values YyText converts to a string: new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead); + int startIndex = zzStartRead + 2; + int length = 4; // (6 - 2) + + // High surrogates are in decimal range [55296, 56319] + if (!Integer.TryParse(zzBuffer, startIndex, length, radix: 16, out highSurrogate)) { - highSurrogate = (char)highSurrogateInt32; + // should never happen + if (Debugging.AssertsEnabled) Debugging.Assert(false, "Exception parsing high surrogate '{0}'", new CharArrayFormatter(zzBuffer, startIndex, length)); } - else // should never happen + + // LUCENENET: Originally, we got the value of YyText property, which allocates. 
We can eliminate the allocation + // by grabbing the values YyText converts to a string: new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead); + startIndex = zzStartRead + 9; + length = 5; // (14 - 9) + + // Low surrogates are in decimal range [56320, 57343] + if (!Integer.TryParse(zzBuffer, startIndex, length, radix: 10, out lowSurrogate)) { - if (Debugging.AssertsEnabled) Debugging.Assert(false, "Exception parsing high surrogate '{0}'", surrogatePair.Substring(2, 6 - 2)); - } - try - { // Low surrogates are in decimal range [56320, 57343] - // LUCENENET: Optimized parse so we don't allocate a substring - lowSurrogate = (char)Integer.Parse(surrogatePair, 9, 14 - 9, 10); - } - catch (Exception e) when (e.IsException()) - { // should never happen - if (Debugging.AssertsEnabled) Debugging.Assert(false, "Exception parsing low surrogate '{0}'", surrogatePair.Substring(9, 14 - 9)); + // should never happen + if (Debugging.AssertsEnabled) Debugging.Assert(false, "Exception parsing low surrogate '{0}'", new CharArrayFormatter(zzBuffer, startIndex, length)); } - if (char.IsLowSurrogate(lowSurrogate)) + if (char.IsLowSurrogate((char)lowSurrogate)) { outputSegment = entitySegment; outputSegment.Clear(); - outputSegment.UnsafeWrite(lowSurrogate); + outputSegment.UnsafeWrite((char)lowSurrogate); // add (previously matched input length) + (this match length) - (substitution length) cumulativeDiff += inputSegment.Length + YyLength - 2; // position the correction at (already output length) + (substitution length) @@ -31950,7 +31965,9 @@ namespace Lucene.Net.Analysis.CharFilters YyBegin(YYINITIAL); return highSurrogate; } - YyPushBack(surrogatePair.Length - 1); // Consume only '#' + // LUCENENET: Using the underlying array to parse, so need to calculate surrogatePair.Length as (zzMarkedPos - zzStartRead) + // which would be the length of YyText, if allocated. 
+ YyPushBack((zzMarkedPos - zzStartRead) - 1); // Consume only '#' inputSegment.Append('#'); YyBegin(NUMERIC_CHARACTER); } @@ -31958,30 +31975,37 @@ namespace Lucene.Net.Analysis.CharFilters case 104: break; case 52: { // Handle paired UTF-16 surrogates. - string surrogatePair = YyText(); - char highSurrogate = '\u0000'; - // LUCENENET: Optimized parse so we don't allocate a substring. - if (Integer.TryParse(surrogatePair, 1, 6 - 1, 10, out int highSurrogateInt32)) - { // High surrogates are in decimal range [55296, 56319] - highSurrogate = (char)highSurrogateInt32; - } - else // should never happen + //string surrogatePair = YyText; // LUCENENET: Refactored to use the underlying array directly instead of allocating substrings + int highSurrogate = '\u0000'; // LUCENENET: Use int to allow out parameters to use without casting. + + // LUCENENET: Originally, we got the value of YyText property, which allocates. We can eliminate the allocation + // by grabbing the values YyText converts to a string: new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead); + int startIndex = zzStartRead + 1; + int length = 5; // (6 - 1) + + // High surrogates are in decimal range [55296, 56319] + if (!Integer.TryParse(zzBuffer, startIndex, length, radix: 10, out highSurrogate)) { - if (Debugging.AssertsEnabled) Debugging.Assert(false, "Exception parsing high surrogate '{0}'", surrogatePair.Substring(1, 6 - 1)); + // should never happen + if (Debugging.AssertsEnabled) Debugging.Assert(false, "Exception parsing high surrogate '{0}'", new CharArrayFormatter(zzBuffer, startIndex, length)); } - if (char.IsHighSurrogate(highSurrogate)) + if (char.IsHighSurrogate((char)highSurrogate)) { outputSegment = entitySegment; outputSegment.Clear(); - try + + // LUCENENET: Originally, we got the value of YyText property, which allocates. 
We can eliminate the allocation + // by grabbing the values YyText converts to a string: new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead); + startIndex = zzStartRead + 10; + length = 4; // (14 - 10) + + // Low surrogates are in decimal range [56320, 57343] + if (!Integer.TryParse(zzBuffer, startIndex, length, radix: 16, out int lowSurrogate)) { - // LUCENENET: Optimized parse so we don't allocate a substring. - outputSegment.UnsafeWrite((char)Integer.Parse(surrogatePair, 10, 14 - 10, 16)); - } - catch (Exception e) when (e.IsException()) - { // should never happen - if (Debugging.AssertsEnabled) Debugging.Assert(false, "Exception parsing low surrogate '{0}'", surrogatePair.Substring(10, 14 - 10)); + // should never happen + if (Debugging.AssertsEnabled) Debugging.Assert(false, "Exception parsing low surrogate '{0}'", new CharArrayFormatter(zzBuffer, startIndex, length)); } + outputSegment.UnsafeWrite((char)lowSurrogate); // add (previously matched input length) + (this match length) - (substitution length) cumulativeDiff += inputSegment.Length + YyLength - 2; // position the correction at (already output length) + (substitution length) @@ -31990,7 +32014,9 @@ namespace Lucene.Net.Analysis.CharFilters YyBegin(YYINITIAL); return highSurrogate; } - YyPushBack(surrogatePair.Length - 1); // Consume only '#' + // LUCENENET: Using the underlying array to parse, so need to calculate surrogatePair.Length as (zzMarkedPos - zzStartRead) + // which would be the length of YyText, if allocated. + YyPushBack((zzMarkedPos - zzStartRead) - 1); // Consume only '#' inputSegment.Append('#'); YyBegin(NUMERIC_CHARACTER); } @@ -31998,34 +32024,40 @@ namespace Lucene.Net.Analysis.CharFilters case 105: break; case 53: { // Handle paired UTF-16 surrogates. - string surrogatePair = YyText(); - char highSurrogate = '\u0000'; - // LUCENENET: Optimized parse so we don't allocate a substring. 
- if (Integer.TryParse(surrogatePair, 1, 6 - 1, 10, out int highSurrogateInt32)) - { // High surrogates are in decimal range [55296, 56319] - highSurrogate = (char)highSurrogateInt32; - } - else // should never happen + //string surrogatePair = YyText(); // LUCENENET: Refactored to use the underlying array directly instead of allocating substrings + int highSurrogate = '\u0000'; // LUCENENET: Use int to allow out parameters to use without casting. + + // LUCENENET: Originally, we got the value of YyText property, which allocates. We can eliminate the allocation + // by grabbing the values YyText converts to a string: new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead); + int startIndex = zzStartRead + 1; + int length = 5; // (6 - 1) + + // High surrogates are in decimal range [55296, 56319] + if (!Integer.TryParse(zzBuffer, startIndex, length, radix: 10, out highSurrogate)) { - if (Debugging.AssertsEnabled) Debugging.Assert(false, "Exception parsing high surrogate '{0}'", surrogatePair.Substring(1, 6 - 1)); + // should never happen + if (Debugging.AssertsEnabled) Debugging.Assert(false, "Exception parsing high surrogate '{0}'", new CharArrayFormatter(zzBuffer, startIndex, length)); } - if (char.IsHighSurrogate(highSurrogate)) + if (char.IsHighSurrogate((char)highSurrogate)) { - char lowSurrogate = '\u0000'; - // LUCENENET: Optimized parse so we don't allocate a substring. - if (Integer.TryParse(surrogatePair, 9, 14 - 9, 10, out int lowSurrogateInt32)) - { // Low surrogates are in decimal range [56320, 57343] - lowSurrogate = (char)lowSurrogateInt32; - } - else // should never happen + int lowSurrogate = '\u0000'; // LUCENENET: Use int to allow out parameters to use without casting. + + // LUCENENET: Originally, we got the value of YyText property, which allocates. 
We can eliminate the allocation + // by grabbing the values YyText converts to a string: new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead); + startIndex = zzStartRead + 9; + length = 5; // (14 - 9) + + // Low surrogates are in decimal range [56320, 57343] + if (!Integer.TryParse(zzBuffer, startIndex, length, radix: 10, out lowSurrogate)) { - if (Debugging.AssertsEnabled) Debugging.Assert(false, "Exception parsing low surrogate '{0}'", surrogatePair.Substring(9, 14 - 9)); + // should never happen + if (Debugging.AssertsEnabled) Debugging.Assert(false, "Exception parsing low surrogate '{0}'", new CharArrayFormatter(zzBuffer, startIndex, length)); } - if (char.IsLowSurrogate(lowSurrogate)) + if (char.IsLowSurrogate((char)lowSurrogate)) { outputSegment = entitySegment; outputSegment.Clear(); - outputSegment.UnsafeWrite(lowSurrogate); + outputSegment.UnsafeWrite((char)lowSurrogate); // add (previously matched input length) + (this match length) - (substitution length) cumulativeDiff += inputSegment.Length + YyLength - 2; // position the correction at (already output length) + (substitution length) @@ -32035,7 +32067,9 @@ namespace Lucene.Net.Analysis.CharFilters return highSurrogate; } } - YyPushBack(surrogatePair.Length - 1); // Consume only '#' + // LUCENENET: Using the underlying array to parse, so need to calculate surrogatePair.Length as (zzMarkedPos - zzStartRead) + // which would be the length of YyText, if allocated. + YyPushBack((zzMarkedPos - zzStartRead) - 1); // Consume only '#' inputSegment.Append('#'); YyBegin(NUMERIC_CHARACTER); }
