This is an automated email from the ASF dual-hosted git repository. nightowl888 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/lucenenet.git
commit 672c5a94abfc179121146cae4fbb13379f7b7203 Author: Shad Storhaug <[email protected]> AuthorDate: Sat Aug 22 21:08:10 2020 +0700 Lucene.Net.Analysis.ICU: Updated Segmentation files to Lucene 8.6.1 to account for the latest features of ICU --- .../Icu/Segmentation/BreakIteratorWrapper.cs | 179 +++++++-------------- .../Analysis/Icu/Segmentation/CharArrayIterator.cs | 2 +- .../Icu/Segmentation/CompositeBreakIterator.cs | 6 +- .../Icu/Segmentation/DefaultICUTokenizerConfig.cs | 16 +- .../Analysis/Icu/Segmentation/ICUTokenizer.cs | 19 +-- .../Icu/Segmentation/ICUTokenizerConfig.cs | 8 +- .../Icu/Segmentation/ICUTokenizerFactory.cs | 11 +- .../Analysis/Icu/Segmentation/ScriptIterator.cs | 2 +- 8 files changed, 93 insertions(+), 150 deletions(-) diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/BreakIteratorWrapper.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/BreakIteratorWrapper.cs index af50927..d01aacc 100644 --- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/BreakIteratorWrapper.cs +++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/BreakIteratorWrapper.cs @@ -1,7 +1,6 @@ -// Lucene version compatibility level 7.1.0 -using ICU4N; -using ICU4N.Support.Text; +// Lucene version compatibility level 8.6.1 using ICU4N.Text; +using J2N; namespace Lucene.Net.Analysis.Icu.Segmentation { @@ -23,146 +22,88 @@ namespace Lucene.Net.Analysis.Icu.Segmentation */ /// <summary> - /// Contain all the issues surrounding BreakIterators in ICU in one place. - /// Basically this boils down to the fact that they aren't very friendly to any - /// sort of OO design. - /// <para/> - /// http://bugs.icu-project.org/trac/ticket/5901: RBBI.getRuleStatus(), hoist to - /// BreakIterator from <see cref="RuleBasedBreakIterator"/> - /// <para/> - /// DictionaryBasedBreakIterator is a subclass of <see cref="RuleBasedBreakIterator"/>, but - /// doesn't actually behave as a subclass: it always returns 0 for - /// getRuleStatus(): - /// http://bugs.icu-project.org/trac/ticket/4730: Thai RBBI, no boundary type - /// tags + /// Wraps <see cref="RuleBasedBreakIterator"/>, making object reuse convenient and + /// emitting a rule status for emoji sequences. /// <para/> /// @lucene.experimental /// </summary> - internal abstract class BreakIteratorWrapper + internal sealed class BreakIteratorWrapper { - protected readonly CharArrayIterator m_textIterator = new CharArrayIterator(); - protected char[] m_text; - protected int m_start; - protected int m_length; - - public abstract int Next(); - public abstract int Current { get; } - public abstract int RuleStatus { get; } - public abstract void SetText(CharacterIterator text); + private readonly CharArrayIterator textIterator = new CharArrayIterator(); + private readonly RuleBasedBreakIterator rbbi; + private char[] text; + private int start; + private int status; - public void SetText(char[] text, int start, int length) + internal BreakIteratorWrapper(RuleBasedBreakIterator rbbi) { - this.m_text = text; - this.m_start = start; - this.m_length = length; - m_textIterator.SetText(text, start, length); - SetText(m_textIterator); + this.rbbi = rbbi; } - /// <summary> - /// If its a <see cref="RuleBasedBreakIterator"/>, the rule status can be used for token type. If it's - /// any other <see cref="BreakIterator"/>, the rulestatus method is not available, so treat - /// it like a generic <see cref="BreakIterator"/>. - /// </summary> - /// <param name="breakIterator"></param> - /// <returns></returns> - public static BreakIteratorWrapper Wrap(BreakIterator breakIterator) + public int Current => rbbi.Current; + + public int RuleStatus => status; + + public int Next() { - if (breakIterator is RuleBasedBreakIterator) - return new RBBIWrapper((RuleBasedBreakIterator)breakIterator); - else - return new BIWrapper(breakIterator); + int current = rbbi.Current; + int next = rbbi.Next(); + status = CalcStatus(current, next); + return next; } - /// <summary> - /// <see cref="RuleBasedBreakIterator"/> wrapper: <see cref="RuleBasedBreakIterator"/> (as long as it's not - /// a DictionaryBasedBreakIterator) behaves correctly. - /// </summary> - private sealed class RBBIWrapper : BreakIteratorWrapper + /// <summary>Returns current rule status for the text between breaks. (determines token type)</summary> + private int CalcStatus(int current, int next) { - private readonly RuleBasedBreakIterator rbbi; - - internal RBBIWrapper(RuleBasedBreakIterator rbbi) - { - this.rbbi = rbbi; - } - - public override int Current => rbbi.Current; - - public override int RuleStatus => rbbi.RuleStatus; - - public override int Next() + // to support presentation selectors, we need to handle alphanum, num, and none at least, so currently not worth optimizing. + // https://unicode.org/cldr/utility/list-unicodeset.jsp?a=%5B%3AEmoji%3A%5D-%5B%3AEmoji_Presentation%3A%5D&g=Word_Break&i= + if (next != BreakIterator.Done && IsEmoji(current, next)) { - return rbbi.Next(); + return ICUTokenizerConfig.EMOJI_SEQUENCE_STATUS; } - - public override void SetText(CharacterIterator text) + else { - rbbi.SetText(text); + return rbbi.RuleStatus; } } - /// <summary> - /// Generic <see cref="BreakIterator"/> wrapper: Either the rulestatus method is not - /// available or always returns 0. Calculate a rulestatus here so it behaves - /// like <see cref="RuleBasedBreakIterator"/>. - /// </summary> - /// <remarks> - /// Note: This is slower than <see cref="RuleBasedBreakIterator"/>. - /// </remarks> - private sealed class BIWrapper : BreakIteratorWrapper - { - private readonly BreakIterator bi; - private int status; - - internal BIWrapper(BreakIterator bi) - { - this.bi = bi; - } - - public override int Current => bi.Current; - - public override int RuleStatus => status; + // See unicode doc L2/16-315 for rationale. + // basically for us the ambiguous cases (keycap/etc) as far as types go. + internal static readonly UnicodeSet EMOJI_RK = new UnicodeSet("[\u002a\u00230-9©®™〰〽]").Freeze(); + // faster than doing hasBinaryProperty() checks, at the cost of 1KB ram + //internal static readonly UnicodeSet EMOJI = new UnicodeSet("[[:Emoji:][:Extended_Pictographic:]]").Freeze(); // LUCENENET: Extended_Pictographic wasn't added until ICU 62 + internal static readonly UnicodeSet EMOJI = new UnicodeSet("[[:Emoji:]]").Freeze(); - public override int Next() - { - int current = bi.Current; - int next = bi.Next(); - status = CalcStatus(current, next); - return next; - } - - private int CalcStatus(int current, int next) + /// <summary>Returns <c>true</c> if the current text represents emoji character or sequence.</summary> + private bool IsEmoji(int current, int next) + { + int begin = start + current; + int end = start + next; + int codepoint = UTF16.CharAt(text, 0, end, begin); + if (EMOJI.Contains(codepoint)) { - if (current == BreakIterator.Done || next == BreakIterator.Done) - return BreakIterator.WordNone; - - int begin = m_start + current; - int end = m_start + next; - - int codepoint; - for (int i = begin; i < end; i += UTF16.GetCharCount(codepoint)) + if (EMOJI_RK.Contains(codepoint)) { - codepoint = UTF16.CharAt(m_text, 0, end, begin); - - if (UChar.IsDigit(codepoint)) - return BreakIterator.WordNumber; - else if (UChar.IsLetter(codepoint)) - { - // TODO: try to separately specify ideographic, kana? - // [currently all bundled as letter for this case] - return BreakIterator.WordLetter; - } + // if its in EmojiRK, we don't treat it as emoji unless there is evidence it forms emoji sequence, + // an emoji presentation selector or keycap follows. + int trailer = begin + Character.CharCount(codepoint); + return trailer < end && (text[trailer] == 0xFE0F || text[trailer] == 0x20E3); + } + else + { + return true; } - - return BreakIterator.WordNone; } + return false; + } - public override void SetText(CharacterIterator text) - { - bi.SetText(text); - status = BreakIterator.WordNone; - } + public void SetText(char[] text, int start, int length) + { + this.text = text; + this.start = start; + textIterator.SetText(text, start, length); + rbbi.SetText(textIterator); + status = RuleBasedBreakIterator.WordNone; } } } diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CharArrayIterator.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CharArrayIterator.cs index 064604f..b4eef83 100644 --- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CharArrayIterator.cs +++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CharArrayIterator.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 7.1.0 +// Lucene version compatibility level 8.6.1 #if FEATURE_BREAKITERATOR using ICU4N.Support.Text; using Lucene.Net.Support; diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CompositeBreakIterator.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CompositeBreakIterator.cs index d697ae1..f628e81 100644 --- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CompositeBreakIterator.cs +++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CompositeBreakIterator.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 7.1.0 +// Lucene version compatibility level 8.6.1 using ICU4N; using ICU4N.Globalization; using ICU4N.Text; @@ -124,8 +124,8 @@ namespace Lucene.Net.Analysis.Icu.Segmentation private BreakIteratorWrapper GetBreakIterator(int scriptCode) { - if (wordBreakers[scriptCode] == null) - wordBreakers[scriptCode] = BreakIteratorWrapper.Wrap(config.GetBreakIterator(scriptCode)); + if (wordBreakers[scriptCode] is null) + wordBreakers[scriptCode] = new BreakIteratorWrapper(config.GetBreakIterator(scriptCode)); return wordBreakers[scriptCode]; } } diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/DefaultICUTokenizerConfig.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/DefaultICUTokenizerConfig.cs index 447567c..b6093cb 100644 --- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/DefaultICUTokenizerConfig.cs +++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/DefaultICUTokenizerConfig.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 7.1.0 +// Lucene version compatibility level 8.6.1 using ICU4N.Globalization; using ICU4N.Text; using J2N; @@ -53,6 +53,8 @@ namespace Lucene.Net.Analysis.Icu.Segmentation public static readonly string WORD_LETTER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM]; /// <summary>Token type for words that appear to be numbers</summary> public static readonly string WORD_NUMBER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM]; + /// <summary>Token type for words that appear to be emoji sequences</summary> + public static readonly string WORD_EMOJI = "<EMOJI>"; //StandardTokenizer.TOKEN_TYPES[StandardTokenizer.EMOJI]; // LUCENENET: 4.8.1 StandardTokenizer doesn't contain EMOJI /// <summary> /// the default breakiterators in use. these can be expensive to @@ -90,21 +92,21 @@ namespace Lucene.Net.Analysis.Icu.Segmentation public override bool CombineCJ => cjkAsWords; - public override BreakIterator GetBreakIterator(int script) + public override RuleBasedBreakIterator GetBreakIterator(int script) { switch (script) { - case UScript.Japanese: return (BreakIterator)cjkBreakIterator.Clone(); + case UScript.Japanese: return (RuleBasedBreakIterator)cjkBreakIterator.Clone(); case UScript.Myanmar: if (myanmarAsWords) { - return (BreakIterator)defaultBreakIterator.Clone(); + return (RuleBasedBreakIterator)defaultBreakIterator.Clone(); } else { - return (BreakIterator)myanmarSyllableIterator.Clone(); + return (RuleBasedBreakIterator)myanmarSyllableIterator.Clone(); } - default: return (BreakIterator)defaultBreakIterator.Clone(); + default: return (RuleBasedBreakIterator)defaultBreakIterator.Clone(); } } @@ -120,6 +122,8 @@ namespace Lucene.Net.Analysis.Icu.Segmentation return script == UScript.Hangul ? WORD_HANGUL : WORD_LETTER; case BreakIterator.WordNumber: //RuleBasedBreakIterator.WORD_NUMBER: return WORD_NUMBER; + case EMOJI_SEQUENCE_STATUS: + return WORD_EMOJI; default: /* some other custom code */ return "<OTHER>"; } diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizer.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizer.cs index 1afbfc1..2b37cde 100644 --- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizer.cs +++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizer.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 4.8.1 +// Lucene version compatibility level 8.6.1 using ICU4N; using ICU4N.Text; using Lucene.Net.Analysis.Icu.TokenAttributes; @@ -27,6 +27,7 @@ namespace Lucene.Net.Analysis.Icu.Segmentation * See the License for the specific language governing permissions and * limitations under the License. */ + /// <summary> /// Breaks text into words according to UAX #29: Unicode Text Segmentation /// (http://www.unicode.org/reports/tr29/) @@ -211,9 +212,9 @@ namespace Lucene.Net.Analysis.Icu.Segmentation } /// <summary> - /// Returns true if there is a token from the buffer, or null if it is exhausted. + /// Returns <c>true</c> if there is a token from the buffer, or <c>false</c> if it is exhausted. /// </summary> - /// <returns>true if there is a token from the buffer, or null if it is exhausted.</returns> + /// <returns><c>true</c> if there is a token from the buffer, or <c>false</c> if it is exhausted.</returns> private bool IncrementTokenBuffer() { int start = breaker.Current; @@ -222,21 +223,13 @@ namespace Lucene.Net.Analysis.Icu.Segmentation // find the next set of boundaries, skipping over non-tokens (rule status 0) int end = breaker.Next(); - - // LUCENENET specific - ICU 60.1 does not set the rule status back to 0, - // so we need to explicitly check whether we went out of bounds. - // This is more efficient anyway, since we don't call Next() twice in - // this case. - if (end == BreakIterator.Done) - return false; // BreakIterator exhausted - - while (start != BreakIterator.Done && breaker.RuleStatus == 0) + while (end != BreakIterator.Done && breaker.RuleStatus == 0) { start = end; end = breaker.Next(); } - if (start == BreakIterator.Done) + if (end == BreakIterator.Done) return false; // BreakIterator exhausted termAtt.CopyBuffer(buffer, start, end - start); diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerConfig.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerConfig.cs index e8014f5..1d3ece1 100644 --- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerConfig.cs +++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerConfig.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 7.1.0 +// Lucene version compatibility level 8.6.1 using ICU4N.Text; using Lucene.Net.Support; @@ -30,15 +30,17 @@ namespace Lucene.Net.Analysis.Icu.Segmentation [ExceptionToClassNameConvention] public abstract class ICUTokenizerConfig { + public const int EMOJI_SEQUENCE_STATUS = 299; + /// <summary> /// Sole constructor. (For invocation by subclass /// constructors, typically implicit.) /// </summary> - public ICUTokenizerConfig() { } + protected ICUTokenizerConfig() { } // LUCENENET specific - marked protected instead of public /// <summary> /// Return a breakiterator capable of processing a given script. /// </summary> - public abstract BreakIterator GetBreakIterator(int script); + public abstract RuleBasedBreakIterator GetBreakIterator(int script); /// <summary> /// Return a token type value for a given script and BreakIterator rule status. /// </summary> diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerFactory.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerFactory.cs index 823e8a6..fe38e72 100644 --- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerFactory.cs +++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerFactory.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level < 7.1.0 +// Lucene version compatibility level 8.6.1 using ICU4N; using ICU4N.Globalization; using ICU4N.Text; @@ -69,7 +69,10 @@ namespace Lucene.Net.Analysis.Icu.Segmentation [ExceptionToClassNameConvention] public class ICUTokenizerFactory : TokenizerFactory, IResourceLoaderAware { - internal static readonly string RULEFILES = "rulefiles"; + // SPI Name + //public const string NAME = "icu"; + + internal const string RULEFILES = "rulefiles"; private readonly IDictionary<int, string> tailored; private ICUTokenizerConfig config; private readonly bool cjkAsWords; @@ -128,11 +131,11 @@ namespace Lucene.Net.Analysis.Icu.Segmentation } } - public override BreakIterator GetBreakIterator(int script) + public override RuleBasedBreakIterator GetBreakIterator(int script) { if (breakers[script] != null) { - return (BreakIterator)breakers[script].Clone(); + return (RuleBasedBreakIterator)breakers[script].Clone(); } else { diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ScriptIterator.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ScriptIterator.cs index ceda09c..1228c5d 100644 --- a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ScriptIterator.cs +++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ScriptIterator.cs @@ -1,4 +1,4 @@ -// Lucene version compatibility level 7.1.0 +// Lucene version compatibility level 8.6.1 using ICU4N; using ICU4N.Globalization; using ICU4N.Text;
