[lucenenet] 10/14: PERFORMANCE: Lucene.Net.Analysis.In.IndicNormalizer: Refactored ScriptData to change Dictionary<Regex, ScriptData> to List<ScriptData> and eliminated unnecessary hashtable lookup. Use static fields for unknownScript and [ThreadStatic] previousScriptData to optimize character script matching.

nightowl888 Sun, 30 Oct 2022 23:19:20 -0700

This is an automated email from the ASF dual-hosted git repository.

nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git


commit d660b9d518c038eed8f28cbc6157421bc49c71a5
Author: Shad Storhaug <[email protected]>
AuthorDate: Tue Oct 25 07:56:47 2022 +0700

    PERFORMANCE: Lucene.Net.Analysis.In.IndicNormalizer: Refactored ScriptData
    to change Dictionary<Regex, ScriptData> to List<ScriptData> and eliminated
    unnecessary hashtable lookup. Use static fields for unknownScript and
    [ThreadStatic] previousScriptData to optimize character script matching.
---
 .../Analysis/In/IndicNormalizer.cs                 | 82 +++++++++++++++-------
 .../Analysis/In/TestIndicNormalizer.cs             | 10 ++-
 2 files changed, 64 insertions(+), 28 deletions(-)

diff --git a/src/Lucene.Net.Analysis.Common/Analysis/In/IndicNormalizer.cs 
b/src/Lucene.Net.Analysis.Common/Analysis/In/IndicNormalizer.cs
index 10dc257a6..02c723f9f 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/In/IndicNormalizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/In/IndicNormalizer.cs
@@ -42,12 +42,14 @@ namespace Lucene.Net.Analysis.In
 
         private class ScriptData
         {
+            internal readonly Regex block;
             internal readonly UnicodeBlock flag;
             internal readonly int @base;
             internal OpenBitSet decompMask;
 
-            internal ScriptData(UnicodeBlock flag, int @base)
+            internal ScriptData(Regex block, UnicodeBlock flag, int @base)
             {
+                this.block = block;
                 this.flag = flag;
                 this.@base = @base;
             }
@@ -232,24 +234,24 @@ namespace Lucene.Net.Analysis.In
             new int[] { 0x73, 0x4B,   -1, 0x13, (int)UnicodeBlock.GURMUKHI }
         };
 
-        private static readonly IDictionary<Regex, ScriptData> scripts = 
LoadScripts(); // LUCENENET: Avoid static constructors (see 
https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
+        private static readonly IList<ScriptData> scripts = LoadScripts(); // 
LUCENENET: Avoid static constructors (see 
https://github.com/apache/lucenenet/pull/224#issuecomment-469284006)
 
-        private static IDictionary<Regex, ScriptData> LoadScripts()
+        private static IList<ScriptData> LoadScripts()
         {
-            IDictionary<Regex, ScriptData> result = new Dictionary<Regex, 
ScriptData>(capacity: 9)
+            IList<ScriptData> result = new List<ScriptData>(capacity: 9)
             {
-                { new Regex(@"\p{IsDevanagari}", RegexOptions.Compiled), new 
ScriptData(UnicodeBlock.DEVANAGARI, 0x0900) },
-                { new Regex(@"\p{IsBengali}", RegexOptions.Compiled), new 
ScriptData(UnicodeBlock.BENGALI, 0x0980) },
-                { new Regex(@"\p{IsGurmukhi}", RegexOptions.Compiled), new 
ScriptData(UnicodeBlock.GURMUKHI, 0x0A00) },
-                { new Regex(@"\p{IsGujarati}", RegexOptions.Compiled), new 
ScriptData(UnicodeBlock.GUJARATI, 0x0A80) },
-                { new Regex(@"\p{IsOriya}", RegexOptions.Compiled), new 
ScriptData(UnicodeBlock.ORIYA, 0x0B00) },
-                { new Regex(@"\p{IsTamil}", RegexOptions.Compiled), new 
ScriptData(UnicodeBlock.TAMIL, 0x0B80) },
-                { new Regex(@"\p{IsTelugu}", RegexOptions.Compiled), new 
ScriptData(UnicodeBlock.TELUGU, 0x0C00) },
-                { new Regex(@"\p{IsKannada}", RegexOptions.Compiled), new 
ScriptData(UnicodeBlock.KANNADA, 0x0C80) },
-                { new Regex(@"\p{IsMalayalam}", RegexOptions.Compiled), new 
ScriptData(UnicodeBlock.MALAYALAM, 0x0D00) },
+                new ScriptData(new Regex(@"\p{IsDevanagari}",  
RegexOptions.Compiled),  UnicodeBlock.DEVANAGARI,  0x0900),
+                new ScriptData(new Regex(@"\p{IsBengali}",     
RegexOptions.Compiled),  UnicodeBlock.BENGALI,     0x0980),
+                new ScriptData(new Regex(@"\p{IsGurmukhi}",    
RegexOptions.Compiled),  UnicodeBlock.GURMUKHI,    0x0A00),
+                new ScriptData(new Regex(@"\p{IsGujarati}",    
RegexOptions.Compiled),  UnicodeBlock.GUJARATI,    0x0A80),
+                new ScriptData(new Regex(@"\p{IsOriya}",       
RegexOptions.Compiled),  UnicodeBlock.ORIYA,       0x0B00),
+                new ScriptData(new Regex(@"\p{IsTamil}",       
RegexOptions.Compiled),  UnicodeBlock.TAMIL,       0x0B80),
+                new ScriptData(new Regex(@"\p{IsTelugu}",      
RegexOptions.Compiled),  UnicodeBlock.TELUGU,      0x0C00),
+                new ScriptData(new Regex(@"\p{IsKannada}",     
RegexOptions.Compiled),  UnicodeBlock.KANNADA,     0x0C80),
+                new ScriptData(new Regex(@"\p{IsMalayalam}",   
RegexOptions.Compiled),  UnicodeBlock.MALAYALAM,   0x0D00),
             };
 
-            foreach (ScriptData sd in result.Values)
+            foreach (ScriptData sd in result)
             {
                 sd.decompMask = new OpenBitSet(0x7F);
                 for (int i = 0; i < decompositions.Length; i++)
@@ -277,9 +279,8 @@ namespace Lucene.Net.Analysis.In
         {
             for (int i = 0; i < len; i++)
             {
-                var block = GetBlockForChar(text[i]);
-                ScriptData sd;
-                if (scripts.TryGetValue(block, out sd) && sd != null)
+                Regex block;
+                if ((block = GetBlockForChar(text[i], out ScriptData sd)) != 
unknownScript)
                 {
                     int ch = text[i] - sd.@base;
                     if (sd.decompMask.Get(ch))
@@ -302,7 +303,7 @@ namespace Lucene.Net.Analysis.In
             }
 
             int ch1 = text[pos + 1] - sd.@base;
-            var block1 = GetBlockForChar(text[pos + 1]);
+            var block1 = GetBlockForChar(text[pos + 1], out _);
             if (block1 != block0) // needs to be the same writing system
             {
                 return len;
@@ -313,7 +314,7 @@ namespace Lucene.Net.Analysis.In
             if (pos + 2 < len)
             {
                 ch2 = text[pos + 2] - sd.@base;
-                var block2 = GetBlockForChar(text[pos + 2]);
+                var block2 = GetBlockForChar(text[pos + 2], out _);
                 if (text[pos + 2] == '\u200D') // ZWJ
                 {
                     ch2 = 0xFF;
@@ -344,22 +345,49 @@ namespace Lucene.Net.Analysis.In
             return len;
         }
 
+        // LUCENENET: Never matches - we just use this as a placeholder
+        private static readonly Regex unknownScript = new Regex(@"[^\S\s]", 
RegexOptions.Compiled);
+        [ThreadStatic]
+        private static ScriptData previousScriptData;
+
         /// <summary>
-        /// LUCENENET: Returns the unicode block for the specified character
+        /// LUCENENET: Returns the unicode block for the specified character. 
Caches the
+        /// last script and script data used on the current thread to optimize 
performance
+        /// when not switching between scripts.
         /// </summary>
-        private static Regex GetBlockForChar(char c) // LUCENENET: CA1822: 
Mark members as static
+        private static Regex GetBlockForChar(char c, out ScriptData 
scriptData) // LUCENENET: CA1822: Mark members as static
         {
             string charAsString = c.ToString();
-            foreach (var block in scripts.Keys)
+            // Store reference locally to avoid threading issues
+            ScriptData previousScriptDataLocal = previousScriptData;
+            Regex previousScript = previousScriptDataLocal?.block;
+
+            // Optimize to try the most recent script first.
+            if (previousScript?.IsMatch(charAsString) ?? false)
             {
-                if (block.IsMatch(charAsString))
+                scriptData = previousScriptDataLocal;
+                return previousScript;
+            }
+
+            return GetBlockForCharSlow(previousScript, charAsString, out 
scriptData);
+
+            static Regex GetBlockForCharSlow(Regex previousScript, string 
charAsString, out ScriptData scriptData)
+            { 
+                foreach (var script in scripts)
                 {
-                    return block;
+                    Regex block = script.block;
+                    if (block != previousScript && block.IsMatch(charAsString))
+                    {
+                        previousScriptData = script;
+                        scriptData = script;
+                        return block;
+                    }
                 }
-            }
 
-            // return a regex that never matches, nor is in our scripts 
dictionary
-            return new Regex(@"[^\S\s]");
+                scriptData = null;
+                // return a regex that never matches, nor is in our scripts 
list
+                return unknownScript;
+            }
         }
     }
 }
\ No newline at end of file
diff --git 
a/src/Lucene.Net.Tests.Analysis.Common/Analysis/In/TestIndicNormalizer.cs 
b/src/Lucene.Net.Tests.Analysis.Common/Analysis/In/TestIndicNormalizer.cs
index 6eabd72cc..62ab0f84e 100644
--- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/In/TestIndicNormalizer.cs
+++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/In/TestIndicNormalizer.cs
@@ -1,5 +1,6 @@
-// Lucene version compatibility level 4.8.1
+// Lucene version compatibility level 4.8.1
 using Lucene.Net.Analysis.Core;
+using Lucene.Net.Attributes;
 using NUnit.Framework;
 using System.IO;
 
@@ -60,5 +61,12 @@ namespace Lucene.Net.Analysis.In
             });
             CheckOneTerm(a, "", "");
         }
+
+        [Test, LuceneNetSpecific]
+        public virtual void TestUnknownScript()
+        {
+            check("foo", "foo");
+            check("bar", "bar");
+        }
     }
 }
\ No newline at end of file

[lucenenet] 10/14: PERFORMANCE: Lucene.Net.Analysis.In.IndicNormalizer: Refactored ScriptData to change Dictionary<Regex, ScriptData> to List<ScriptData> and eliminated unnecessary hashtable lookup. Use static fields for unknownScript and [ThreadStatic] previousScriptData to optimize character script matching.

Reply via email to