This is an automated email from the ASF dual-hosted git repository.
nightowl888 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/lucenenet.git
The following commit(s) were added to refs/heads/master by this push:
new 0114fb0 BUG: Lucene.Net.ICU: Fixed ThaiWordBreaker to account for
surrogate pairs. Also added locking to help with thread safety.
0114fb0 is described below
commit 0114fb03be910e831277c870b862a73a22f6eb18
Author: Shad Storhaug <[email protected]>
AuthorDate: Sun Aug 2 19:38:10 2020 +0700
BUG: Lucene.Net.ICU: Fixed ThaiWordBreaker to account for surrogate pairs.
Also added locking to help with thread safety.
---
.../Analysis/Th/ThaiTokenizer.cs | 121 ++++++++++++---------
.../Analysis/Th/ThaiWordFilter.cs | 1 -
2 files changed, 72 insertions(+), 50 deletions(-)
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
index 6d045ad..283256f 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs
@@ -42,12 +42,12 @@ namespace Lucene.Net.Analysis.Th
public class ThaiTokenizer : SegmentingTokenizerBase
{
// LUCENENET specific - DBBI_AVAILABLE removed because ICU always has
a dictionary-based BreakIterator
- private static readonly BreakIterator proto =
BreakIterator.GetWordInstance(new CultureInfo("th"));
+ private static readonly BreakIterator proto =
(BreakIterator)BreakIterator.GetWordInstance(new CultureInfo("th")).Clone();
/// <summary>
/// used for breaking the text into sentences
/// </summary>
- private static readonly BreakIterator sentenceProto =
BreakIterator.GetSentenceInstance(CultureInfo.InvariantCulture);
+ private static readonly BreakIterator sentenceProto =
(BreakIterator)BreakIterator.GetSentenceInstance(CultureInfo.InvariantCulture).Clone();
private readonly ThaiWordBreaker wordBreaker;
private readonly CharArrayIterator wrapper =
Analysis.Util.CharArrayIterator.NewWordInstance();
@@ -58,6 +58,8 @@ namespace Lucene.Net.Analysis.Th
private readonly ICharTermAttribute termAtt;
private readonly IOffsetAttribute offsetAtt;
+ private readonly object syncLock = new object();
+
/// <summary>
/// Creates a new <see cref="ThaiTokenizer"/> </summary>
public ThaiTokenizer(TextReader reader)
@@ -79,42 +81,54 @@ namespace Lucene.Net.Analysis.Th
protected override void SetNextSentence(int sentenceStart, int
sentenceEnd)
{
- this.sentenceStart = sentenceStart;
- this.sentenceEnd = sentenceEnd;
- wrapper.SetText(m_buffer, sentenceStart, sentenceEnd -
sentenceStart);
- wordBreaker.SetText(new string(wrapper.Text, wrapper.Start,
wrapper.Length));
+ // LUCENENET TODO: This class isn't passing thread safety checks.
+ // Adding locking and extra cloning of BreakIterator seems to
help, but
+ // it is not a complete fix.
+ lock (syncLock)
+ {
+ this.sentenceStart = sentenceStart;
+ this.sentenceEnd = sentenceEnd;
+ wrapper.SetText(m_buffer, sentenceStart, sentenceEnd -
sentenceStart);
+ wordBreaker.SetText(new string(wrapper.Text, wrapper.Start,
wrapper.Length));
+ }
}
protected override bool IncrementWord()
{
- int start = wordBreaker.Current;
- if (start == BreakIterator.Done)
+ // LUCENENET TODO: This class isn't passing thread safety checks.
+ // Adding locking and extra cloning of BreakIterator seems to
help, but
+ // it is not a complete fix.
+ lock (syncLock)
{
- return false; // BreakIterator exhausted
- }
+ int start = wordBreaker.Current;
+ if (start == BreakIterator.Done)
+ {
+ return false; // BreakIterator exhausted
+ }
- // find the next set of boundaries, skipping over non-tokens
- int end = wordBreaker.Next();
- while (end != BreakIterator.Done &&
!Character.IsLetterOrDigit(Character.CodePointAt(m_buffer, sentenceStart +
start, sentenceEnd)))
- {
- start = end;
- end = wordBreaker.Next();
- }
+ // find the next set of boundaries, skipping over non-tokens
+ int end = wordBreaker.Next();
+ while (end != BreakIterator.Done &&
!Character.IsLetterOrDigit(Character.CodePointAt(m_buffer, sentenceStart +
start, sentenceEnd)))
+ {
+ start = end;
+ end = wordBreaker.Next();
+ }
- if (end == BreakIterator.Done)
- {
- return false; // BreakIterator exhausted
- }
+ if (end == BreakIterator.Done)
+ {
+ return false; // BreakIterator exhausted
+ }
- ClearAttributes();
- termAtt.CopyBuffer(m_buffer, sentenceStart + start, end - start);
- offsetAtt.SetOffset(CorrectOffset(m_offset + sentenceStart +
start), CorrectOffset(m_offset + sentenceStart + end));
- return true;
+ ClearAttributes();
+ termAtt.CopyBuffer(m_buffer, sentenceStart + start, end -
start);
+ offsetAtt.SetOffset(CorrectOffset(m_offset + sentenceStart +
start), CorrectOffset(m_offset + sentenceStart + end));
+ return true;
+ }
}
}
/// <summary>
- /// LUCENENET specific class to patch the behavior of the ICU
BreakIterator.
+ /// LUCENENET specific class to patch the behavior of the ICU
BreakIterator to match the behavior of the JDK.
/// Corrects the breaking of words by finding transitions between Thai and
non-Thai
/// characters.
/// </summary>
@@ -122,16 +136,12 @@ namespace Lucene.Net.Analysis.Th
{
private readonly BreakIterator wordBreaker;
private string text;
- private readonly IList<int> transitions = new List<int>();
- private readonly static Regex thaiPattern = new Regex(@"\p{IsThai}",
RegexOptions.Compiled | RegexOptions.CultureInvariant);
+ private readonly Queue<int> transitions = new Queue<int>();
+ private static readonly Regex thaiPattern = new Regex(@"\p{IsThai}+",
RegexOptions.Compiled | RegexOptions.CultureInvariant);
public ThaiWordBreaker(BreakIterator wordBreaker)
{
- if (wordBreaker == null)
- {
- throw new ArgumentNullException("wordBreaker");
- }
- this.wordBreaker = wordBreaker;
+ this.wordBreaker = wordBreaker ?? throw new
ArgumentNullException(nameof(wordBreaker));
}
public void SetText(string text)
@@ -145,9 +155,8 @@ namespace Lucene.Net.Analysis.Th
get
{
if (transitions.Count > 0)
- {
- return transitions[0];
- }
+ return transitions.Peek();
+
return wordBreaker.Current;
}
}
@@ -155,36 +164,50 @@ namespace Lucene.Net.Analysis.Th
public int Next()
{
if (transitions.Count > 0)
- {
- transitions.RemoveAt(0);
- }
+ transitions.Dequeue();
+
if (transitions.Count > 0)
- {
- return transitions[0];
- }
+ return transitions.Peek();
+
return GetNext();
}
private int GetNext()
{
- bool isThai = false, isNonThai = false;
+ bool isThai, isNonThai;
bool prevWasThai = false, prevWasNonThai = false;
int prev = wordBreaker.Current;
int current = wordBreaker.Next();
if (current != BreakIterator.Done && current - prev > 0)
{
+ int length = text.Length;
+ string toMatch;
// Find all of the transitions between Thai and non-Thai
characters and digits
for (int i = prev; i < current; i++)
{
- char c = text[i];
- isThai = char.IsLetter(c) &&
thaiPattern.IsMatch(c.ToString());
- isNonThai = char.IsLetter(c) && !isThai;
+ char high = text[i];
+ // Account for surrogate pairs
+ if (char.IsHighSurrogate(high) && i < length && i + 1 <
current && char.IsLowSurrogate(text[i + 1]))
+ toMatch = string.Empty + high + text[++i];
+ else
+ toMatch = string.Empty + high;
+
+ if (char.IsLetter(toMatch, 0)) // Always break letters
apart from digits to match the JDK
+ {
+ isThai = thaiPattern.IsMatch(toMatch);
+ isNonThai = !isThai;
+ }
+ else
+ {
+ isThai = false;
+ isNonThai = false;
+ }
if ((prevWasThai && isNonThai) ||
(prevWasNonThai && isThai))
{
- transitions.Add(i);
+ transitions.Enqueue(i);
}
// record the values for comparison with the next loop
@@ -194,8 +217,8 @@ namespace Lucene.Net.Analysis.Th
if (transitions.Count > 0)
{
- transitions.Add(current);
- return transitions[0];
+ transitions.Enqueue(current);
+ return transitions.Peek();
}
}
diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
index baa7356..03f5cce 100644
--- a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
+++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs
@@ -3,7 +3,6 @@ using ICU4N.Text;
using Lucene.Net.Analysis.Core;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.Util;
-using Lucene.Net.Support;
using Lucene.Net.Util;
using System;
using System.Globalization;