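IcuBreakIterator: Added an EnableHacks setting so that the text-replacement hacks (substituting "a" for hyphens before word breaking, and spaces for newlines before sentence breaking) can be enabled on demand. They are not required for Analysis.Common, so they are disabled by default. They are only used for Highlighter, where PostingsHighlighter turns them on.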
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/fc7b5b52 Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/fc7b5b52 Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/fc7b5b52 Branch: refs/heads/api-work Commit: fc7b5b52dd64877d5d63498b3d2df4e54c569bd8 Parents: 506f55a Author: Shad Storhaug <[email protected]> Authored: Thu Feb 2 18:02:48 2017 +0700 Committer: Shad Storhaug <[email protected]> Committed: Thu Feb 2 18:30:37 2017 +0700 ---------------------------------------------------------------------- src/IcuBreakIterator.cs | 23 +++++++++++++++----- .../PostingsHighlight/PostingsHighlighter.cs | 5 ++++- 2 files changed, 21 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucenenet/blob/fc7b5b52/src/IcuBreakIterator.cs ---------------------------------------------------------------------- diff --git a/src/IcuBreakIterator.cs b/src/IcuBreakIterator.cs index 0bf6007..6028ba1 100644 --- a/src/IcuBreakIterator.cs +++ b/src/IcuBreakIterator.cs @@ -50,6 +50,8 @@ namespace Lucene.Net /// </summary> protected int m_end; + private bool enableHacks = false; + public IcuBreakIterator(Icu.BreakIterator.UBreakIteratorType type) : this(type, CultureInfo.CurrentCulture) { @@ -63,6 +65,13 @@ namespace Lucene.Net this.type = type; } + + public virtual bool EnableHacks + { + get { return enableHacks; } + set { enableHacks = value; } + } + /// <summary> /// Sets the current iteration position to the beginning of the text. 
/// </summary> @@ -280,20 +289,22 @@ namespace Lucene.Net private void LoadBoundaries(int start, int end) { - //boundaries = new List<int>(); - IEnumerable<Icu.Boundary> icuBoundaries; string offsetText = text.Substring(start, end - start); - if (type == Icu.BreakIterator.UBreakIteratorType.WORD) { - // LUCENENET TODO: HACK - replacing hyphen with "a" so hyphenated words aren't broken - icuBoundaries = Icu.BreakIterator.GetWordBoundaries(locale, offsetText.Replace("-", "a"), true); + if (enableHacks) + { + // LUCENENET TODO: HACK - replacing hyphen with "a" so hyphenated words aren't broken + offsetText = offsetText.Replace("-", "a"); + } + + icuBoundaries = Icu.BreakIterator.GetWordBoundaries(locale, offsetText, true); } else { - if (type == Icu.BreakIterator.UBreakIteratorType.SENTENCE) + if (enableHacks && type == Icu.BreakIterator.UBreakIteratorType.SENTENCE) { // LUCENENET TODO: HACK - newline character causes incorrect sentence breaking. offsetText = offsetText.Replace("\n", " "); http://git-wip-us.apache.org/repos/asf/lucenenet/blob/fc7b5b52/src/Lucene.Net.Highlighter/PostingsHighlight/PostingsHighlighter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Highlighter/PostingsHighlight/PostingsHighlighter.cs b/src/Lucene.Net.Highlighter/PostingsHighlight/PostingsHighlighter.cs index 63c48bc..db04ee1 100644 --- a/src/Lucene.Net.Highlighter/PostingsHighlight/PostingsHighlighter.cs +++ b/src/Lucene.Net.Highlighter/PostingsHighlight/PostingsHighlighter.cs @@ -131,7 +131,10 @@ namespace Lucene.Net.Search.PostingsHighlight /// </summary> protected virtual BreakIterator GetBreakIterator(string field) { - return new IcuBreakIterator(Icu.BreakIterator.UBreakIteratorType.SENTENCE, CultureInfo.InvariantCulture); + return new IcuBreakIterator(Icu.BreakIterator.UBreakIteratorType.SENTENCE, CultureInfo.InvariantCulture) + { + EnableHacks = true + }; } /// <summary>
