This is an automated email from the ASF dual-hosted git repository. nightowl888 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/lucenenet.git
commit d660b9d518c038eed8f28cbc6157421bc49c71a5 Author: Shad Storhaug <[email protected]> AuthorDate: Tue Oct 25 07:56:47 2022 +0700 PERFORMANCE: Lucene.Net.Analysis.In.IndicNormalizer: Refactored ScriptData to change Dictionary<Regex, ScriptData> to List<ScriptData> and eliminated unnecessary hashtable lookup. Use static fields for unknownScript and [ThreadStatic] previousScriptData to optimize character script matching. --- .../Analysis/In/IndicNormalizer.cs | 82 +++++++++++++++------- .../Analysis/In/TestIndicNormalizer.cs | 10 ++- 2 files changed, 64 insertions(+), 28 deletions(-) diff --git a/src/Lucene.Net.Analysis.Common/Analysis/In/IndicNormalizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/In/IndicNormalizer.cs index 10dc257a6..02c723f9f 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/In/IndicNormalizer.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/In/IndicNormalizer.cs @@ -42,12 +42,14 @@ namespace Lucene.Net.Analysis.In private class ScriptData { + internal readonly Regex block; internal readonly UnicodeBlock flag; internal readonly int @base; internal OpenBitSet decompMask; - internal ScriptData(UnicodeBlock flag, int @base) + internal ScriptData(Regex block, UnicodeBlock flag, int @base) { + this.block = block; this.flag = flag; this.@base = @base; } @@ -232,24 +234,24 @@ namespace Lucene.Net.Analysis.In new int[] { 0x73, 0x4B, -1, 0x13, (int)UnicodeBlock.GURMUKHI } }; - private static readonly IDictionary<Regex, ScriptData> scripts = LoadScripts(); // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006) + private static readonly IList<ScriptData> scripts = LoadScripts(); // LUCENENET: Avoid static constructors (see https://github.com/apache/lucenenet/pull/224#issuecomment-469284006) - private static IDictionary<Regex, ScriptData> LoadScripts() + private static IList<ScriptData> LoadScripts() { - IDictionary<Regex, ScriptData> result = new Dictionary<Regex, ScriptData>(capacity: 9) + IList<ScriptData> result = new List<ScriptData>(capacity: 9) { - { new Regex(@"\p{IsDevanagari}", RegexOptions.Compiled), new ScriptData(UnicodeBlock.DEVANAGARI, 0x0900) }, - { new Regex(@"\p{IsBengali}", RegexOptions.Compiled), new ScriptData(UnicodeBlock.BENGALI, 0x0980) }, - { new Regex(@"\p{IsGurmukhi}", RegexOptions.Compiled), new ScriptData(UnicodeBlock.GURMUKHI, 0x0A00) }, - { new Regex(@"\p{IsGujarati}", RegexOptions.Compiled), new ScriptData(UnicodeBlock.GUJARATI, 0x0A80) }, - { new Regex(@"\p{IsOriya}", RegexOptions.Compiled), new ScriptData(UnicodeBlock.ORIYA, 0x0B00) }, - { new Regex(@"\p{IsTamil}", RegexOptions.Compiled), new ScriptData(UnicodeBlock.TAMIL, 0x0B80) }, - { new Regex(@"\p{IsTelugu}", RegexOptions.Compiled), new ScriptData(UnicodeBlock.TELUGU, 0x0C00) }, - { new Regex(@"\p{IsKannada}", RegexOptions.Compiled), new ScriptData(UnicodeBlock.KANNADA, 0x0C80) }, - { new Regex(@"\p{IsMalayalam}", RegexOptions.Compiled), new ScriptData(UnicodeBlock.MALAYALAM, 0x0D00) }, + new ScriptData(new Regex(@"\p{IsDevanagari}", RegexOptions.Compiled), UnicodeBlock.DEVANAGARI, 0x0900), + new ScriptData(new Regex(@"\p{IsBengali}", RegexOptions.Compiled), UnicodeBlock.BENGALI, 0x0980), + new ScriptData(new Regex(@"\p{IsGurmukhi}", RegexOptions.Compiled), UnicodeBlock.GURMUKHI, 0x0A00), + new ScriptData(new Regex(@"\p{IsGujarati}", RegexOptions.Compiled), UnicodeBlock.GUJARATI, 0x0A80), + new ScriptData(new Regex(@"\p{IsOriya}", RegexOptions.Compiled), UnicodeBlock.ORIYA, 0x0B00), + new ScriptData(new Regex(@"\p{IsTamil}", RegexOptions.Compiled), UnicodeBlock.TAMIL, 0x0B80), + new ScriptData(new Regex(@"\p{IsTelugu}", RegexOptions.Compiled), UnicodeBlock.TELUGU, 0x0C00), + new ScriptData(new Regex(@"\p{IsKannada}", RegexOptions.Compiled), UnicodeBlock.KANNADA, 0x0C80), + new ScriptData(new Regex(@"\p{IsMalayalam}", RegexOptions.Compiled), UnicodeBlock.MALAYALAM, 0x0D00), }; - foreach (ScriptData sd in result.Values) + foreach (ScriptData sd in result) { sd.decompMask = new OpenBitSet(0x7F); for (int i = 0; i < decompositions.Length; i++) @@ -277,9 +279,8 @@ namespace Lucene.Net.Analysis.In { for (int i = 0; i < len; i++) { - var block = GetBlockForChar(text[i]); - ScriptData sd; - if (scripts.TryGetValue(block, out sd) && sd != null) + Regex block; + if ((block = GetBlockForChar(text[i], out ScriptData sd)) != unknownScript) { int ch = text[i] - sd.@base; if (sd.decompMask.Get(ch)) @@ -302,7 +303,7 @@ namespace Lucene.Net.Analysis.In } int ch1 = text[pos + 1] - sd.@base; - var block1 = GetBlockForChar(text[pos + 1]); + var block1 = GetBlockForChar(text[pos + 1], out _); if (block1 != block0) // needs to be the same writing system { return len; @@ -313,7 +314,7 @@ namespace Lucene.Net.Analysis.In if (pos + 2 < len) { ch2 = text[pos + 2] - sd.@base; - var block2 = GetBlockForChar(text[pos + 2]); + var block2 = GetBlockForChar(text[pos + 2], out _); if (text[pos + 2] == '\u200D') // ZWJ { ch2 = 0xFF; @@ -344,22 +345,49 @@ namespace Lucene.Net.Analysis.In return len; } + // LUCENENET: Never matches - we just use this as a placeholder + private static readonly Regex unknownScript = new Regex(@"[^\S\s]", RegexOptions.Compiled); + [ThreadStatic] + private static ScriptData previousScriptData; + /// <summary> - /// LUCENENET: Returns the unicode block for the specified character + /// LUCENENET: Returns the unicode block for the specified character. Caches the + /// last script and script data used on the current thread to optimize performance + /// when not switching between scripts. /// </summary> - private static Regex GetBlockForChar(char c) // LUCENENET: CA1822: Mark members as static + private static Regex GetBlockForChar(char c, out ScriptData scriptData) // LUCENENET: CA1822: Mark members as static { string charAsString = c.ToString(); - foreach (var block in scripts.Keys) + // Store reference locally to avoid threading issues + ScriptData previousScriptDataLocal = previousScriptData; + Regex previousScript = previousScriptDataLocal?.block; + + // Optimize to try the most recent script first. + if (previousScript?.IsMatch(charAsString) ?? false) { - if (block.IsMatch(charAsString)) + scriptData = previousScriptDataLocal; + return previousScript; + } + + return GetBlockForCharSlow(previousScript, charAsString, out scriptData); + + static Regex GetBlockForCharSlow(Regex previousScript, string charAsString, out ScriptData scriptData) + { + foreach (var script in scripts) { - return block; + Regex block = script.block; + if (block != previousScript && block.IsMatch(charAsString)) + { + previousScriptData = script; + scriptData = script; + return block; + } } - } - // return a regex that never matches, nor is in our scripts dictionary - return new Regex(@"[^\S\s]"); + scriptData = null; + // return a regex that never matches, nor is in our scripts dictionary + return unknownScript; + } } } } \ No newline at end of file diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/In/TestIndicNormalizer.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/In/TestIndicNormalizer.cs index 6eabd72cc..62ab0f84e 100644 --- a/src/Lucene.Net.Tests.Analysis.Common/Analysis/In/TestIndicNormalizer.cs +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/In/TestIndicNormalizer.cs @@ -1,5 +1,6 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 4.8.1 using Lucene.Net.Analysis.Core; +using Lucene.Net.Attributes; using NUnit.Framework; using System.IO; @@ -60,5 +61,12 @@ namespace Lucene.Net.Analysis.In }); CheckOneTerm(a, "", ""); } + + [Test, LuceneNetSpecific] + public virtual void TestUnknownScript() + { + check("foo", "foo"); + check("bar", "bar"); + } } } \ No newline at end of file
