[lucenenet] 08/09: Lucene.Net.ICU: Added locking to ThaiTokenizer to only allow a single thread to manipulate the BreakIterator at a time. This helps, but is only a partial fix.

nightowl888 Mon, 24 Aug 2020 14:20:04 -0700

This is an automated email from the ASF dual-hosted git repository.

nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git


commit 5ff92583f219fa851375c9be12ae1b6bf52383c1
Author: Shad Storhaug <[email protected]>
AuthorDate: Mon Aug 24 15:09:37 2020 +0700

    Lucene.Net.ICU: Added locking to ThaiTokenizer to only allow a single thread to manipulate the BreakIterator at a time. This helps, but is only a partial fix.
---
 .../Analysis/Th/ThaiTokenizer.cs                   | 92 ++++++++++++++++------
 1 file changed, 66 insertions(+), 26 deletions(-)

diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
index 7e0754c..d1f80f1 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
@@ -41,13 +41,27 @@ namespace Lucene.Net.Analysis.Th
     /// </summary>
     public class ThaiTokenizer : SegmentingTokenizerBase
     {
+        private static readonly object syncLock = new object(); // LUCENENET specific - workaround until BreakIterator is made thread safe  (LUCENENET TODO: TO REVERT)
+
         // LUCENENET specific - DBBI_AVAILABLE removed because ICU always has a dictionary-based BreakIterator
-        private static readonly BreakIterator proto = 
BreakIterator.GetWordInstance(new CultureInfo("th"));
+        private static readonly BreakIterator proto = LoadProto();
 
         /// <summary>
         /// used for breaking the text into sentences
         /// </summary>
-        private static readonly BreakIterator sentenceProto = 
BreakIterator.GetSentenceInstance(CultureInfo.InvariantCulture);
+        private static readonly BreakIterator sentenceProto = 
LoadSentenceProto();
+
+        private static BreakIterator LoadProto()
+        {
+            lock (syncLock)
+                return BreakIterator.GetWordInstance(new CultureInfo("th"));
+        }
+
+        private static BreakIterator LoadSentenceProto()
+        {
+            lock (syncLock)
+                return 
BreakIterator.GetSentenceInstance(CultureInfo.InvariantCulture);
+        }
 
         private readonly ThaiWordBreaker wordBreaker;
         private readonly CharArrayIterator wrapper = 
Analysis.Util.CharArrayIterator.NewWordInstance();
@@ -68,48 +82,74 @@ namespace Lucene.Net.Analysis.Th
         /// <summary>
         /// Creates a new <see cref="ThaiTokenizer"/>, supplying the <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/> </summary>
         public ThaiTokenizer(AttributeFactory factory, TextReader reader)
-            : base(factory, reader, (BreakIterator)sentenceProto.Clone())
+            : base(factory, reader, CreateSentenceClone())
         {
             // LUCENENET specific - DBBI_AVAILABLE removed because ICU always has a dictionary-based BreakIterator
 
-            wordBreaker = new ThaiWordBreaker((BreakIterator)proto.Clone());
+            lock (syncLock)
+                wordBreaker = new 
ThaiWordBreaker((BreakIterator)proto.Clone());
             termAtt = AddAttribute<ICharTermAttribute>();
             offsetAtt = AddAttribute<IOffsetAttribute>();
         }
 
-        protected override void SetNextSentence(int sentenceStart, int 
sentenceEnd)
+        private static BreakIterator CreateSentenceClone()
         {
-            this.sentenceStart = sentenceStart;
-            this.sentenceEnd = sentenceEnd;
-            wrapper.SetText(m_buffer, sentenceStart, sentenceEnd - 
sentenceStart);
-            wordBreaker.SetText(new string(wrapper.Text, wrapper.Start, 
wrapper.Length));
+            lock (syncLock)
+                return (BreakIterator)sentenceProto.Clone();
         }
 
-        protected override bool IncrementWord()
+        public override void Reset()
+        {
+            lock (syncLock)
+                base.Reset();
+        }
+
+        public override State CaptureState()
+        {
+            lock (syncLock)
+                return base.CaptureState();
+        }
+
+        protected override void SetNextSentence(int sentenceStart, int 
sentenceEnd)
         {
-            int start = wordBreaker.Current;
-            if (start == BreakIterator.Done)
+            lock (syncLock)
             {
-                return false; // BreakIterator exhausted
+                this.sentenceStart = sentenceStart;
+                this.sentenceEnd = sentenceEnd;
+                wrapper.SetText(m_buffer, sentenceStart, sentenceEnd - 
sentenceStart);
+                wordBreaker.SetText(new string(wrapper.Text, wrapper.Start, 
wrapper.Length));
             }
+        }
 
-            // find the next set of boundaries, skipping over non-tokens
-            int end = wordBreaker.Next();
-            while (end != BreakIterator.Done && 
!Character.IsLetterOrDigit(Character.CodePointAt(m_buffer, sentenceStart + 
start, sentenceEnd)))
+        protected override bool IncrementWord()
+        {
+            int start, end;
+            lock (syncLock)
             {
-                start = end;
+                start = wordBreaker.Current;
+                if (start == BreakIterator.Done)
+                {
+                    return false; // BreakIterator exhausted
+                }
+
+                // find the next set of boundaries, skipping over non-tokens
                 end = wordBreaker.Next();
-            }
+                while (end != BreakIterator.Done && 
!Character.IsLetterOrDigit(Character.CodePointAt(m_buffer, sentenceStart + 
start, sentenceEnd)))
+                {
+                    start = end;
+                    end = wordBreaker.Next();
+                }
 
-            if (end == BreakIterator.Done)
-            {
-                return false; // BreakIterator exhausted
-            }
+                if (end == BreakIterator.Done)
+                {
+                    return false; // BreakIterator exhausted
+                }
 
-            ClearAttributes();
-            termAtt.CopyBuffer(m_buffer, sentenceStart + start, end - start);
-            offsetAtt.SetOffset(CorrectOffset(m_offset + sentenceStart + 
start), CorrectOffset(m_offset + sentenceStart + end));
-            return true;
+                ClearAttributes();
+                termAtt.CopyBuffer(m_buffer, sentenceStart + start, end - 
start);
+                offsetAtt.SetOffset(CorrectOffset(m_offset + sentenceStart + 
start), CorrectOffset(m_offset + sentenceStart + end));
+                return true;
+            }
         }
     }

[lucenenet] 08/09: Lucene.Net.ICU: Added locking to ThaiTokenizer to only allow a single thread to manipulate the BreakIterator at a time. This helps, but is only a partial fix.

Reply via email to