This is an automated email from the ASF dual-hosted git repository.

nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git

commit e72315a75009854483c979462eb2406f41311796
Author: Shad Storhaug <[email protected]>
AuthorDate: Thu Oct 27 08:10:47 2022 +0700

    PERFORMANCE: Lucene.Net.Analysis.Th.ThaiWordBreaker: Removed unnecessary string allocations and concatenation. Use CharsRef to reuse the same memory. Removed Regex and replaced with UnicodeSet to detect Thai code points.
---
 .../Analysis/Th/ThaiTokenizer.cs | 24 ++++++++++++----------
 .../Analysis/Th/ThaiWordFilter.cs | 4 ++--
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
index 3bc1a8314..aba8a4328 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
@@ -1,10 +1,12 @@
 // Lucene version compatibility level 4.8.1
 #if FEATURE_BREAKITERATOR
+using ICU4N.Support.Text;
 using ICU4N.Text;
 using J2N;
 using Lucene.Net.Analysis.TokenAttributes;
 using Lucene.Net.Analysis.Util;
 using Lucene.Net.Support.Threading;
+using Lucene.Net.Util;
 using System;
 using System.Collections.Generic;
 using System.Globalization;
@@ -80,7 +82,7 @@ namespace Lucene.Net.Analysis.Th
         }
 
         private readonly ThaiWordBreaker wordBreaker;
-        private readonly CharArrayIterator wrapper = Analysis.Util.CharArrayIterator.NewWordInstance();
+        private readonly CharArrayIterator wrapper = CharArrayIterator.NewWordInstance();
 
         private int sentenceStart;
         private int sentenceEnd;
@@ -162,7 +164,7 @@ namespace Lucene.Net.Analysis.Th
                 this.sentenceStart = sentenceStart;
                 this.sentenceEnd = sentenceEnd;
                 wrapper.SetText(m_buffer, sentenceStart, sentenceEnd - sentenceStart);
-                wordBreaker.SetText(new string(wrapper.Text, wrapper.Start, wrapper.Length));
+                wordBreaker.SetText(wrapper);
             }
             finally
             {
@@ -215,18 +217,18 @@ namespace Lucene.Net.Analysis.Th
     internal class ThaiWordBreaker
     {
         private readonly BreakIterator wordBreaker;
-        private string text;
+        private readonly CharsRef text = new CharsRef();
         private readonly Queue<int> transitions = new Queue<int>();
-        private static readonly Regex thaiPattern = new Regex(@"\p{IsThai}+", RegexOptions.Compiled | RegexOptions.CultureInvariant);
+        private static readonly UnicodeSet thai = new UnicodeSet("[:Thai:]").Freeze();
 
         public ThaiWordBreaker(BreakIterator wordBreaker)
         {
             this.wordBreaker = wordBreaker ?? throw new ArgumentNullException(nameof(wordBreaker));
         }
 
-        public void SetText(string text)
+        public void SetText(CharArrayIterator text)
         {
-            this.text = text;
+            this.text.CopyChars(text.Text, text.Start, text.Length);
             wordBreaker.SetText(text);
         }
 
@@ -262,20 +264,20 @@ namespace Lucene.Net.Analysis.Th
             if (current != BreakIterator.Done && current - prev > 0)
             {
                 int length = text.Length;
-                string toMatch;
+                int codePoint;
                 // Find all of the transitions between Thai and non-Thai characters and digits
                 for (int i = prev; i < current; i++)
                 {
                     char high = text[i];
                     // Account for surrogate pairs
                     if (char.IsHighSurrogate(high) && i < length && i + 1 < current && char.IsLowSurrogate(text[i + 1]))
-                        toMatch = string.Empty + high + text[++i];
+                        codePoint = Character.ToCodePoint(high, text[++i]);
                     else
-                        toMatch = string.Empty + high;
+                        codePoint = high;
 
-                    if (char.IsLetter(toMatch, 0)) // Always break letters apart from digits to match the JDK
+                    if (Character.IsLetter(codePoint)) // Always break letters apart from digits to match the JDK
                     {
-                        isThai = thaiPattern.IsMatch(toMatch);
+                        isThai = thai.Contains(codePoint);
                         isNonThai = !isThai;
                     }
                     else
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
index 0c060cfd7..ea90dbd5d 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
@@ -1,4 +1,4 @@
-// Lucene version compatibility level 4.8.1
+// Lucene version compatibility level 4.8.1
 #if FEATURE_BREAKITERATOR
 using ICU4N.Text;
 using Lucene.Net.Analysis.Core;
@@ -138,7 +138,7 @@ namespace Lucene.Net.Analysis.Th
 
                 // reinit CharacterIterator
                 charIterator.SetText(clonedTermAtt.Buffer, 0, clonedTermAtt.Length);
-                breaker.SetText(new string(charIterator.Text, charIterator.Start, charIterator.Length));
+                breaker.SetText(charIterator);
                 int end2 = breaker.Next();
                 if (end2 != BreakIterator.Done)
                 {
