This is an automated email from the ASF dual-hosted git repository.

nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git

commit 3fbee37edede7ac1f4c864bb24a0be3227ec2cca
Author: Shad Storhaug <[email protected]>
AuthorDate: Fri Oct 28 12:25:14 2022 +0700

    PERFORMANCE: Lucene.Net.Analysis.Util.HTMLStripCharFilter: Refactored to 
remove YyText property (method) which allocates a string every time it is 
called. Instead, we pass the underlying array to J2N.Numerics.TryParse() and 
OpenStringBuilder.Append() with the calculated startIndex and length to 
directly copy the characters without allocating substrings.
---
 .../Analysis/CharFilter/HTMLStripCharFilter.cs     | 216 ++++++++++++---------
 1 file changed, 125 insertions(+), 91 deletions(-)

diff --git 
a/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/HTMLStripCharFilter.cs 
b/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/HTMLStripCharFilter.cs
index e1103fdc9..217a76807 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/HTMLStripCharFilter.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/CharFilter/HTMLStripCharFilter.cs
@@ -31095,15 +31095,12 @@ namespace Lucene.Net.Analysis.CharFilters
             zzLexicalState = newState;
         }
 
-
-        /// <summary>
-        /// Returns the text matched by the current regular expression.
-        /// </summary>
-        /// <returns>Returns the text matched by the current regular 
expression.</returns>
-        private string YyText()
-        {
-            return new string(zzBuffer, zzStartRead, zzMarkedPos - 
zzStartRead);
-        }
+        // LUCENENET: Not used - refactored to read the array directly to 
avoid allocations
+        ///// <summary>
+        ///// Returns the text matched by the current regular expression.
+        ///// </summary>
+        ///// <returns>Returns the text matched by the current regular 
expression.</returns>
+        //private string YyText => new string(zzBuffer, zzStartRead, 
zzMarkedPos - zzStartRead);
 
         /// <summary>
         /// Returns the character at position <tt>pos</tt> from the 
@@ -31372,9 +31369,9 @@ namespace Lucene.Net.Analysis.CharFilters
                             inputSegment.Write(zzBuffer, zzStartRead, 
matchLength);
                             if (matchLength <= 7)
                             { // 0x10FFFF = 1114111: max 7 decimal chars
-                                // LUCENENET: Originally, we got the value of 
YyText(), which allocates..so we can eliminate the allocation
-                                // by grabbing the values YyText() converts to 
a string: new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
-                                if (!J2N.Numerics.Int32.TryParse(zzBuffer, 
zzStartRead, zzMarkedPos - zzStartRead, radix: 10, out int codePoint))
+                                // LUCENENET: Originally, we got the value of 
YyText property, which allocates. We can eliminate the allocation
+                                // by grabbing the values YyText converts to a 
string: new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
+                                if (!Integer.TryParse(zzBuffer, zzStartRead, 
zzMarkedPos - zzStartRead, radix: 10, out int codePoint))
                                 {
                                     if (Debugging.AssertsEnabled) 
Debugging.Assert(false, "Exception parsing code point '{0}'", new 
CharArrayFormatter(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead));
                                 }
@@ -31627,7 +31624,9 @@ namespace Lucene.Net.Analysis.CharFilters
                             inputSegment.Write(zzBuffer, zzStartRead, 
matchLength);
                             if (matchLength <= 6)
                             { // 10FFFF: max 6 hex chars
-                                if (!J2N.Numerics.Int32.TryParse(zzBuffer, 
zzStartRead + 1, matchLength - 1, radix: 16, out int codePoint))
+                                // LUCENENET: Originally, we got the value of 
new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead), which allocates.
+                                // We can eliminate the allocation by grabbing 
the values YyText converts to a string via index and length.
+                                if (!Integer.TryParse(zzBuffer, zzStartRead + 
1, matchLength - 1, radix: 16, out int codePoint))
                                 {
                                     if (Debugging.AssertsEnabled) 
Debugging.Assert(false, "Exception parsing hex code point '{0}'", new 
CharArrayFormatter(zzBuffer, zzStartRead + 1, matchLength - 1));
                                 }
@@ -31666,7 +31665,9 @@ namespace Lucene.Net.Analysis.CharFilters
                         {
                             if (inputSegment.Length > 2)
                             { // Chars between "<!" and "--" - this is not a 
comment
-                                inputSegment.Append(YyText());
+                                // LUCENENET: Originally, we got the value of 
YyText property, which allocates. We can eliminate the allocation
+                                // by grabbing the values YyText converts to a 
string: new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
+                                inputSegment.Append(zzBuffer, zzStartRead, 
zzMarkedPos - zzStartRead);
                             }
                             else
                             {
@@ -31812,7 +31813,9 @@ namespace Lucene.Net.Analysis.CharFilters
                         {
                             if (inputSegment.Length > 2)
                             { // Chars between "<!" and "[CDATA[" - this is 
not a CDATA section
-                                inputSegment.Append(YyText());
+                                // LUCENENET: Originally, we got the value of 
YyText property, which allocates. We can eliminate the allocation
+                                // by grabbing the values YyText converts to a 
string: new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
+                                inputSegment.Append(zzBuffer, zzStartRead, 
zzMarkedPos - zzStartRead);
                             }
                             else
                             {
@@ -31884,27 +31887,33 @@ namespace Lucene.Net.Analysis.CharFilters
                         { // Handle paired UTF-16 surrogates.
                             outputSegment = entitySegment;
                             outputSegment.Clear();
-                            string surrogatePair = YyText();
-                            char highSurrogate = '\u0000';
-                            // LUCENENET: Optimized parse so we don't allocate 
a substring.
-                            if (Integer.TryParse(surrogatePair, 2, 6 - 2, 16, 
out int highSurrogateInt32))
-                            {
-                                highSurrogate = (char)highSurrogateInt32;
-                            }
-                            else // should never happen
+                            //string surrogatePair = YyText; // LUCENENET: 
Refactored to use the underlying array directly instead of allocating substrings
+                            int highSurrogate = '\u0000'; // LUCENENET: Use 
int to allow out parameters to use without casting.
+
+                            // LUCENENET: Originally, we got the value of 
YyText property, which allocates. We can eliminate the allocation
+                            // by grabbing the values YyText converts to a 
string: new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
+                            int startIndex = zzStartRead + 2;
+                            int length = 4; // (6 - 2)
+
+                            // High surrogates are in decimal range [55296, 
56319]
+                            if (!Integer.TryParse(zzBuffer, startIndex, 
length, radix: 16, out highSurrogate))
                             {
-                                if (Debugging.AssertsEnabled) 
Debugging.Assert(false, "Exception parsing high surrogate '{0}'", 
surrogatePair.Substring(2, 6 - 2));
+                                // should never happen
+                                if (Debugging.AssertsEnabled) 
Debugging.Assert(false, "Exception parsing high surrogate '{0}'", new 
CharArrayFormatter(zzBuffer, startIndex, length));
                             }
-                            try
+
+                            // LUCENENET: Originally, we got the value of 
YyText property, which allocates. We can eliminate the allocation
+                            // by grabbing the values YyText converts to a 
string: new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
+                            startIndex = zzStartRead + 10;
+                            length = 4; // (14 - 10)
+
+                            // Low surrogates are in decimal range [56320, 
57343]
+                            if (!Integer.TryParse(zzBuffer, startIndex, 
length, radix: 16, out int lowSurrogate))
                             {
-                                // LUCENENET: Optimized parse so we don't 
allocate a substring
-                                
outputSegment.UnsafeWrite((char)Integer.Parse(surrogatePair, 10, 14 - 10, 16));
+                                // should never happen
+                                if (Debugging.AssertsEnabled) 
Debugging.Assert(false, "Exception parsing low surrogate '{0}'", new 
CharArrayFormatter(zzBuffer, startIndex, length));
                             }
-                            catch (Exception e) when (e.IsException())
-                            { // should never happen
-                                if (Debugging.AssertsEnabled) 
Debugging.Assert(false, "Exception parsing low surrogate '{0}'", 
surrogatePair.Substring(10, 14 - 10));
-                            }
-
+                            outputSegment.UnsafeWrite((char)lowSurrogate);
                             // add (previously matched input length) + (this 
match length) - (substitution length)
                             cumulativeDiff += inputSegment.Length + YyLength - 
2;
                             // position the correction at (already output 
length) + (substitution length)
@@ -31916,32 +31925,38 @@ namespace Lucene.Net.Analysis.CharFilters
                     case 103: break;
                     case 51:
                         { // Handle paired UTF-16 surrogates.
-                            string surrogatePair = YyText();
-                            char highSurrogate = '\u0000';
-                            char lowSurrogate = '\u0000';
-                            // LUCENENET: Optimized parse so we don't allocate 
a substring.
-                            if (Integer.TryParse(surrogatePair, 2, 6 - 2, 16, 
out int highSurrogateInt32))
+                            // string surrogatePair = YyText; // LUCENENET: 
Refactored to use the underlying array directly instead of allocating substrings
+                            int highSurrogate = '\u0000'; // LUCENENET: Use 
int to allow out parameters to use without casting.
+                            int lowSurrogate = '\u0000'; // LUCENENET: Use int 
to allow out parameters to use without casting.
+
+                            // LUCENENET: Originally, we got the value of 
YyText property, which allocates. We can eliminate the allocation
+                            // by grabbing the values YyText converts to a 
string: new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
+                            int startIndex = zzStartRead + 2;
+                            int length = 4; // (6 - 2)
+
+                            // High surrogates are in decimal range [55296, 
56319]
+                            if (!Integer.TryParse(zzBuffer, startIndex, 
length, radix: 16, out highSurrogate))
                             {
-                                highSurrogate = (char)highSurrogateInt32;
+                                // should never happen
+                                if (Debugging.AssertsEnabled) 
Debugging.Assert(false, "Exception parsing high surrogate '{0}'", new 
CharArrayFormatter(zzBuffer, startIndex, length));
                             }
-                            else // should never happen
+
+                            // LUCENENET: Originally, we got the value of 
YyText property, which allocates. We can eliminate the allocation
+                            // by grabbing the values YyText converts to a 
string: new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
+                            startIndex = zzStartRead + 9;
+                            length = 5; // (14 - 9)
+
+                            // Low surrogates are in decimal range [56320, 
57343]
+                            if (!Integer.TryParse(zzBuffer, startIndex, 
length, radix: 10, out lowSurrogate))
                             {
-                                if (Debugging.AssertsEnabled) 
Debugging.Assert(false, "Exception parsing high surrogate '{0}'", 
surrogatePair.Substring(2, 6 - 2));
-                            }
-                            try
-                            { // Low surrogates are in decimal range [56320, 
57343]
-                                // LUCENENET: Optimized parse so we don't 
allocate a substring
-                                lowSurrogate = 
(char)Integer.Parse(surrogatePair, 9, 14 - 9, 10);
-                            }
-                            catch (Exception e) when (e.IsException())
-                            { // should never happen
-                                if (Debugging.AssertsEnabled) 
Debugging.Assert(false, "Exception parsing low surrogate '{0}'", 
surrogatePair.Substring(9, 14 - 9));
+                                // should never happen
+                                if (Debugging.AssertsEnabled) 
Debugging.Assert(false, "Exception parsing low surrogate '{0}'", new 
CharArrayFormatter(zzBuffer, startIndex, length));
                             }
-                            if (char.IsLowSurrogate(lowSurrogate))
+                            if (char.IsLowSurrogate((char)lowSurrogate))
                             {
                                 outputSegment = entitySegment;
                                 outputSegment.Clear();
-                                outputSegment.UnsafeWrite(lowSurrogate);
+                                outputSegment.UnsafeWrite((char)lowSurrogate);
                                 // add (previously matched input length) + 
(this match length) - (substitution length)
                                 cumulativeDiff += inputSegment.Length + 
YyLength - 2;
                                 // position the correction at (already output 
length) + (substitution length)
@@ -31950,7 +31965,9 @@ namespace Lucene.Net.Analysis.CharFilters
                                 YyBegin(YYINITIAL);
                                 return highSurrogate;
                             }
-                            YyPushBack(surrogatePair.Length - 1); // Consume 
only '#'
+                            // LUCENENET: Using the underlying array to parse, 
so need to calculate surrogatePair.Length as (zzMarkedPos - zzStartRead)
+                            // which would be the length of YyText, if 
allocated.
+                            YyPushBack((zzMarkedPos - zzStartRead) - 1); // 
Consume only '#'
                             inputSegment.Append('#');
                             YyBegin(NUMERIC_CHARACTER);
                         }
@@ -31958,30 +31975,37 @@ namespace Lucene.Net.Analysis.CharFilters
                     case 104: break;
                     case 52:
                         { // Handle paired UTF-16 surrogates.
-                            string surrogatePair = YyText();
-                            char highSurrogate = '\u0000';
-                            // LUCENENET: Optimized parse so we don't allocate 
a substring.
-                            if (Integer.TryParse(surrogatePair, 1, 6 - 1, 10, 
out int highSurrogateInt32))
-                            { // High surrogates are in decimal range [55296, 
56319]
-                                highSurrogate = (char)highSurrogateInt32;
-                            }
-                            else // should never happen
+                            //string surrogatePair = YyText; // LUCENENET: 
Refactored to use the underlying array directly instead of allocating substrings
+                            int highSurrogate = '\u0000'; // LUCENENET: Use 
int to allow out parameters to use without casting.
+
+                            // LUCENENET: Originally, we got the value of 
YyText property, which allocates. We can eliminate the allocation
+                            // by grabbing the values YyText converts to a 
string: new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
+                            int startIndex = zzStartRead + 1;
+                            int length = 5; // (6 - 1)
+
+                            // High surrogates are in decimal range [55296, 
56319]
+                            if (!Integer.TryParse(zzBuffer, startIndex, 
length, radix: 10, out highSurrogate))
                             {
-                                if (Debugging.AssertsEnabled) 
Debugging.Assert(false, "Exception parsing high surrogate '{0}'", 
surrogatePair.Substring(1, 6 - 1));
+                                // should never happen
+                                if (Debugging.AssertsEnabled) 
Debugging.Assert(false, "Exception parsing high surrogate '{0}'", new 
CharArrayFormatter(zzBuffer, startIndex, length));
                             }
-                            if (char.IsHighSurrogate(highSurrogate))
+                            if (char.IsHighSurrogate((char)highSurrogate))
                             {
                                 outputSegment = entitySegment;
                                 outputSegment.Clear();
-                                try
+
+                                // LUCENENET: Originally, we got the value of 
YyText property, which allocates. We can eliminate the allocation
+                                // by grabbing the values YyText converts to a 
string: new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
+                                startIndex = zzStartRead + 10;
+                                length = 4; // (14 - 10)
+
+                                // Low surrogates are in decimal range [56320, 
57343]
+                                if (!Integer.TryParse(zzBuffer, startIndex, 
length, radix: 16, out int lowSurrogate))
                                 {
-                                    // LUCENENET: Optimized parse so we don't 
allocate a substring.
-                                    
outputSegment.UnsafeWrite((char)Integer.Parse(surrogatePair, 10, 14 - 10, 16));
-                                }
-                                catch (Exception e) when (e.IsException())
-                                { // should never happen
-                                    if (Debugging.AssertsEnabled) 
Debugging.Assert(false, "Exception parsing low surrogate '{0}'", 
surrogatePair.Substring(10, 14 - 10));
+                                    // should never happen
+                                    if (Debugging.AssertsEnabled) 
Debugging.Assert(false, "Exception parsing low surrogate '{0}'", new 
CharArrayFormatter(zzBuffer, startIndex, length));
                                 }
+                                outputSegment.UnsafeWrite((char)lowSurrogate);
                                 // add (previously matched input length) + 
(this match length) - (substitution length)
                                 cumulativeDiff += inputSegment.Length + 
YyLength - 2;
                                 // position the correction at (already output 
length) + (substitution length)
@@ -31990,7 +32014,9 @@ namespace Lucene.Net.Analysis.CharFilters
                                 YyBegin(YYINITIAL);
                                 return highSurrogate;
                             }
-                            YyPushBack(surrogatePair.Length - 1); // Consume 
only '#'
+                            // LUCENENET: Using the underlying array to parse, 
so need to calculate surrogatePair.Length as (zzMarkedPos - zzStartRead)
+                            // which would be the length of YyText, if 
allocated.
+                            YyPushBack((zzMarkedPos - zzStartRead) - 1); // 
Consume only '#'
                             inputSegment.Append('#');
                             YyBegin(NUMERIC_CHARACTER);
                         }
@@ -31998,34 +32024,40 @@ namespace Lucene.Net.Analysis.CharFilters
                     case 105: break;
                     case 53:
                         { // Handle paired UTF-16 surrogates.
-                            string surrogatePair = YyText();
-                            char highSurrogate = '\u0000';
-                            // LUCENENET: Optimized parse so we don't allocate 
a substring.
-                            if (Integer.TryParse(surrogatePair, 1, 6 - 1, 10, 
out int highSurrogateInt32))
-                            { // High surrogates are in decimal range [55296, 
56319]
-                                highSurrogate = (char)highSurrogateInt32;
-                            }
-                            else // should never happen
+                            //string surrogatePair = YyText(); // LUCENENET: 
Refactored to use the underlying array directly instead of allocating substrings
+                            int highSurrogate = '\u0000'; // LUCENENET: Use 
int to allow out parameters to use without casting.
+
+                            // LUCENENET: Originally, we got the value of 
YyText property, which allocates. We can eliminate the allocation
+                            // by grabbing the values YyText converts to a 
string: new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
+                            int startIndex = zzStartRead + 1;
+                            int length = 5; // (6 - 1)
+
+                            // High surrogates are in decimal range [55296, 
56319]
+                            if (!Integer.TryParse(zzBuffer, startIndex, 
length, radix: 10, out highSurrogate))
                             {
-                                if (Debugging.AssertsEnabled) 
Debugging.Assert(false, "Exception parsing high surrogate '{0}'", 
surrogatePair.Substring(1, 6 - 1));
+                                // should never happen
+                                if (Debugging.AssertsEnabled) 
Debugging.Assert(false, "Exception parsing high surrogate '{0}'", new 
CharArrayFormatter(zzBuffer, startIndex, length));
                             }
-                            if (char.IsHighSurrogate(highSurrogate))
+                            if (char.IsHighSurrogate((char)highSurrogate))
                             {
-                                char lowSurrogate = '\u0000';
-                                // LUCENENET: Optimized parse so we don't 
allocate a substring.
-                                if (Integer.TryParse(surrogatePair, 9, 14 - 9, 
10, out int lowSurrogateInt32))
-                                { // Low surrogates are in decimal range 
[56320, 57343]
-                                    lowSurrogate = (char)lowSurrogateInt32;
-                                }
-                                else // should never happen
+                                int lowSurrogate = '\u0000'; // LUCENENET: Use 
int to allow out parameters to use without casting.
+
+                                // LUCENENET: Originally, we got the value of 
YyText property, which allocates. We can eliminate the allocation
+                                // by grabbing the values YyText converts to a 
string: new string(zzBuffer, zzStartRead, zzMarkedPos - zzStartRead);
+                                startIndex = zzStartRead + 9;
+                                length = 5; // (14 - 9)
+
+                                // Low surrogates are in decimal range [56320, 
57343]
+                                if (!Integer.TryParse(zzBuffer, startIndex, 
length, radix: 10, out lowSurrogate))
                                 {
-                                    if (Debugging.AssertsEnabled) 
Debugging.Assert(false, "Exception parsing low surrogate '{0}'", 
surrogatePair.Substring(9, 14 - 9));
+                                    // should never happen
+                                    if (Debugging.AssertsEnabled) 
Debugging.Assert(false, "Exception parsing low surrogate '{0}'", new 
CharArrayFormatter(zzBuffer, startIndex, length));
                                 }
-                                if (char.IsLowSurrogate(lowSurrogate))
+                                if (char.IsLowSurrogate((char)lowSurrogate))
                                 {
                                     outputSegment = entitySegment;
                                     outputSegment.Clear();
-                                    outputSegment.UnsafeWrite(lowSurrogate);
+                                    
outputSegment.UnsafeWrite((char)lowSurrogate);
                                     // add (previously matched input length) + 
(this match length) - (substitution length)
                                     cumulativeDiff += inputSegment.Length + 
YyLength - 2;
                                     // position the correction at (already 
output length) + (substitution length)
@@ -32035,7 +32067,9 @@ namespace Lucene.Net.Analysis.CharFilters
                                     return highSurrogate;
                                 }
                             }
-                            YyPushBack(surrogatePair.Length - 1); // Consume 
only '#'
+                            // LUCENENET: Using the underlying array to parse, 
so need to calculate surrogatePair.Length as (zzMarkedPos - zzStartRead)
+                            // which would be the length of YyText, if 
allocated.
+                            YyPushBack((zzMarkedPos - zzStartRead) - 1); // 
Consume only '#'
                             inputSegment.Append('#');
                             YyBegin(NUMERIC_CHARACTER);
                         }

Reply via email to