SWEEP: Moved BreakIterator-dependent functionality to a common Lucene.Net.Icu library so we can manage the icu.net dependency from one place and not make the majority of the users deal with it when they don't need to
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/b1fdcca3 Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/b1fdcca3 Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/b1fdcca3 Branch: refs/heads/master Commit: b1fdcca3b3c3f418dfe37aafeda6f4dab75fb6d4 Parents: 63c599e Author: Shad Storhaug <[email protected]> Authored: Mon Apr 17 01:38:10 2017 +0700 Committer: Shad Storhaug <[email protected]> Committed: Mon Apr 17 01:38:10 2017 +0700 ---------------------------------------------------------------------- Lucene.Net.Portable.sln | 20 + Lucene.Net.sln | 52 +++ NuGet.config | 1 + src/IcuBreakIterator.cs | 394 ----------------- .../Analysis/Th/ThaiAnalyzer.cs | 2 +- .../Lucene.Net.Analysis.Common.csproj | 3 - src/Lucene.Net.Analysis.Common/project.json | 6 +- .../Lucene.Net.Highlighter.csproj | 5 +- .../DefaultPassageFormatter.cs | 4 +- .../PostingsHighlight/MultiTermHighlighting.cs | 4 +- .../PostingsHighlight/Passage.cs | 4 +- .../PostingsHighlight/PassageFormatter.cs | 4 +- .../PostingsHighlight/PassageScorer.cs | 4 +- .../Properties/AssemblyInfo.cs | 2 + src/Lucene.Net.Highlighter/project.json | 6 +- src/Lucene.Net.Icu/Analysis/Th/stopwords.txt | 119 ++++++ src/Lucene.Net.Icu/Lucene.Net.Icu.csproj | 124 ++++++ src/Lucene.Net.Icu/Lucene.Net.Icu.project.json | 11 + src/Lucene.Net.Icu/Lucene.Net.Icu.xproj | 19 + src/Lucene.Net.Icu/Properties/AssemblyInfo.cs | 31 ++ src/Lucene.Net.Icu/Support/BreakIterator.cs | 231 ++++++++++ src/Lucene.Net.Icu/Support/CharacterIterator.cs | 50 +++ src/Lucene.Net.Icu/Support/IcuBreakIterator.cs | 394 +++++++++++++++++ .../Support/StringCharacterIterator.cs | 232 ++++++++++ src/Lucene.Net.Icu/project.json | 63 +++ .../Lucene.Net.Tests.Highlighter.csproj | 3 +- .../TestBreakIterator.cs | 421 ------------------- src/Lucene.Net.Tests.Highlighter/project.json | 4 +- .../Lucene.Net.Tests.Icu.csproj | 121 ++++++ 
.../Lucene.Net.Tests.Icu.project.json | 12 + .../Lucene.Net.Tests.Icu.xproj | 22 + .../Properties/AssemblyInfo.cs | 21 + .../Search/PostingsHighlight/CambridgeMA.utf8 | 1 + .../Support/TestApiConsistency.cs | 126 ++++++ .../Support/TestExceptionSerialization.cs | 54 +++ .../Support/TestIcuBreakIterator.cs | 421 +++++++++++++++++++ src/Lucene.Net.Tests.Icu/project.json | 67 +++ src/Lucene.Net/Lucene.Net.csproj | 3 - src/Lucene.Net/Properties/AssemblyInfo.cs | 2 + src/Lucene.Net/Support/BreakIterator.cs | 231 ---------- src/Lucene.Net/Support/CharacterIterator.cs | 50 --- .../Support/StringCharacterIterator.cs | 232 ---------- 42 files changed, 2220 insertions(+), 1356 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/Lucene.Net.Portable.sln ---------------------------------------------------------------------- diff --git a/Lucene.Net.Portable.sln b/Lucene.Net.Portable.sln index 8044aed..7f4edad 100644 --- a/Lucene.Net.Portable.sln +++ b/Lucene.Net.Portable.sln @@ -79,6 +79,10 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "build", "build", "{EFA10A77 build\build.ps1 = build\build.ps1 EndProjectSection EndProject +Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "Lucene.Net.Icu", "src\Lucene.Net.Icu\Lucene.Net.Icu.xproj", "{44A5341B-0F52-429D-977A-C35E10ECCADF}" +EndProject +Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "Lucene.Net.Tests.Icu", "src\Lucene.Net.Tests.Icu\Lucene.Net.Tests.Icu.xproj", "{32FD3471-E862-4055-B969-79C12A656366}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -367,6 +371,22 @@ Global {C708701D-4318-469F-9822-49A80386CFEA}.Release|Any CPU.Build.0 = Release|Any CPU {C708701D-4318-469F-9822-49A80386CFEA}.Release|x86.ActiveCfg = Release|Any CPU {C708701D-4318-469F-9822-49A80386CFEA}.Release|x86.Build.0 = Release|Any CPU + {44A5341B-0F52-429D-977A-C35E10ECCADF}.Debug|Any 
CPU.ActiveCfg = Debug|Any CPU + {44A5341B-0F52-429D-977A-C35E10ECCADF}.Debug|Any CPU.Build.0 = Debug|Any CPU + {44A5341B-0F52-429D-977A-C35E10ECCADF}.Debug|x86.ActiveCfg = Debug|Any CPU + {44A5341B-0F52-429D-977A-C35E10ECCADF}.Debug|x86.Build.0 = Debug|Any CPU + {44A5341B-0F52-429D-977A-C35E10ECCADF}.Release|Any CPU.ActiveCfg = Release|Any CPU + {44A5341B-0F52-429D-977A-C35E10ECCADF}.Release|Any CPU.Build.0 = Release|Any CPU + {44A5341B-0F52-429D-977A-C35E10ECCADF}.Release|x86.ActiveCfg = Release|Any CPU + {44A5341B-0F52-429D-977A-C35E10ECCADF}.Release|x86.Build.0 = Release|Any CPU + {32FD3471-E862-4055-B969-79C12A656366}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {32FD3471-E862-4055-B969-79C12A656366}.Debug|Any CPU.Build.0 = Debug|Any CPU + {32FD3471-E862-4055-B969-79C12A656366}.Debug|x86.ActiveCfg = Debug|Any CPU + {32FD3471-E862-4055-B969-79C12A656366}.Debug|x86.Build.0 = Debug|Any CPU + {32FD3471-E862-4055-B969-79C12A656366}.Release|Any CPU.ActiveCfg = Release|Any CPU + {32FD3471-E862-4055-B969-79C12A656366}.Release|Any CPU.Build.0 = Release|Any CPU + {32FD3471-E862-4055-B969-79C12A656366}.Release|x86.ActiveCfg = Release|Any CPU + {32FD3471-E862-4055-B969-79C12A656366}.Release|x86.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/Lucene.Net.sln ---------------------------------------------------------------------- diff --git a/Lucene.Net.sln b/Lucene.Net.sln index b218f0d..66e91a6 100644 --- a/Lucene.Net.sln +++ b/Lucene.Net.sln @@ -88,6 +88,10 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "build", "build", "{9811D53E build\build.ps1 = build\build.ps1 EndProjectSection EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Icu", "src\Lucene.Net.Icu\Lucene.Net.Icu.csproj", "{349CB7C9-7534-4E1D-9B0A-5521441AF0AE}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Tests.Icu", 
"src\Lucene.Net.Tests.Icu\Lucene.Net.Tests.Icu.csproj", "{D5AA1A22-1B28-4DF6-BFDA-02519A189839}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -849,6 +853,54 @@ Global {FBCD6AFE-0A5C-4399-8044-99C58D2912D1}.Release35|Mixed Platforms.Build.0 = Release|Any CPU {FBCD6AFE-0A5C-4399-8044-99C58D2912D1}.Release35|x86.ActiveCfg = Release|Any CPU {FBCD6AFE-0A5C-4399-8044-99C58D2912D1}.Release35|x86.Build.0 = Release|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Debug|Any CPU.Build.0 = Debug|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Debug|x86.ActiveCfg = Debug|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Debug|x86.Build.0 = Debug|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Debug35|Any CPU.ActiveCfg = Debug|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Debug35|Any CPU.Build.0 = Debug|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Debug35|Mixed Platforms.ActiveCfg = Debug|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Debug35|Mixed Platforms.Build.0 = Debug|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Debug35|x86.ActiveCfg = Debug|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Debug35|x86.Build.0 = Debug|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Release|Any CPU.ActiveCfg = Release|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Release|Any CPU.Build.0 = Release|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Release|Mixed Platforms.Build.0 = Release|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Release|x86.ActiveCfg = Release|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Release|x86.Build.0 = Release|Any CPU + 
{349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Release35|Any CPU.ActiveCfg = Release|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Release35|Any CPU.Build.0 = Release|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Release35|Mixed Platforms.ActiveCfg = Release|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Release35|Mixed Platforms.Build.0 = Release|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Release35|x86.ActiveCfg = Release|Any CPU + {349CB7C9-7534-4E1D-9B0A-5521441AF0AE}.Release35|x86.Build.0 = Release|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Debug|Any CPU.Build.0 = Debug|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Debug|x86.ActiveCfg = Debug|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Debug|x86.Build.0 = Debug|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Debug35|Any CPU.ActiveCfg = Debug|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Debug35|Any CPU.Build.0 = Debug|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Debug35|Mixed Platforms.ActiveCfg = Debug|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Debug35|Mixed Platforms.Build.0 = Debug|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Debug35|x86.ActiveCfg = Debug|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Debug35|x86.Build.0 = Debug|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release|Any CPU.ActiveCfg = Release|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release|Any CPU.Build.0 = Release|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release|Mixed Platforms.Build.0 = Release|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release|x86.ActiveCfg = Release|Any CPU + 
{D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release|x86.Build.0 = Release|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release35|Any CPU.ActiveCfg = Release|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release35|Any CPU.Build.0 = Release|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release35|Mixed Platforms.ActiveCfg = Release|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release35|Mixed Platforms.Build.0 = Release|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release35|x86.ActiveCfg = Release|Any CPU + {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release35|x86.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/NuGet.config ---------------------------------------------------------------------- diff --git a/NuGet.config b/NuGet.config index 8df6c0f..e0c6211 100644 --- a/NuGet.config +++ b/NuGet.config @@ -2,6 +2,7 @@ <configuration> <packageSources> <clear /> + <add key="icunet" value="https://www.myget.org/F/icu-dotnet/api/v2" /> <add key="dotnet-cat" value="https://www.myget.org/F/dotnetcat/api/v2" /> <add key="spatial4n" value="https://www.myget.org/F/spatial4n/api/v2" /> <add key="nugetorg" value="https://www.nuget.org/api/v2" /> http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/IcuBreakIterator.cs ---------------------------------------------------------------------- diff --git a/src/IcuBreakIterator.cs b/src/IcuBreakIterator.cs deleted file mode 100644 index cc0f7cd..0000000 --- a/src/IcuBreakIterator.cs +++ /dev/null @@ -1,394 +0,0 @@ -#if FEATURE_BREAKITERATOR -using Lucene.Net.Support; -using System; -using System.Collections.Generic; -using System.Globalization; -using System.Linq; -using System.Text; - -namespace Lucene.Net -{ - /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - /// <summary> - /// A <see cref="BreakIterator"/> implementation that encapsulates the functionality - /// of icu.net's <see cref="Icu.BreakIterator"/> static class. A <see cref="BreakIterator"/> - /// provides methods to move forward, reverse, and randomly through a set of text breaks - /// defined by the <see cref="Icu.BreakIterator.UBreakIteratorType"/> enumeration. 
- /// </summary> - // LUCENENET specific type - internal class IcuBreakIterator : BreakIterator - { - private readonly Icu.Locale locale; - private readonly Icu.BreakIterator.UBreakIteratorType type; - - private List<int> boundaries = new List<int>(); - private int currentBoundaryIndex; // Index (not the value) of the current boundary in boundaries - private string text; - - /// <summary> - /// The start offset for the string, if supplied by a <see cref="CharacterIterator"/> - /// </summary> - protected int m_start; - - /// <summary> - /// The end offset for the string, if supplied by a <see cref="CharacterIterator"/> - /// </summary> - protected int m_end; - - private bool enableHacks = false; - - public IcuBreakIterator(Icu.BreakIterator.UBreakIteratorType type) - : this(type, CultureInfo.CurrentCulture) - { - } - - public IcuBreakIterator(Icu.BreakIterator.UBreakIteratorType type, CultureInfo locale) - { - if (locale == null) - throw new ArgumentNullException("locale"); - this.locale = new Icu.Locale(locale.Name); - this.type = type; - } - - - public virtual bool EnableHacks - { - get { return enableHacks; } - set { enableHacks = value; } - } - - /// <summary> - /// Sets the current iteration position to the beginning of the text. - /// </summary> - /// <returns>The offset of the beginning of the text.</returns> - public override int First() - { - currentBoundaryIndex = 0; - return ReturnCurrent(); - } - - /// <summary> - /// Sets the current iteration position to the end of the text. - /// </summary> - /// <returns>The text's past-the-end offset.</returns> - public override int Last() - { - currentBoundaryIndex = boundaries.Count - 1; - return ReturnCurrent(); - } - - /// <summary> - /// Advances the iterator either forward or backward the specified number of steps. - /// Negative values move backward, and positive values move forward. This is - /// equivalent to repeatedly calling <see cref="Next()"/> or <see cref="Previous()"/>. 
- /// </summary> - /// <param name="n">The number of steps to move. The sign indicates the direction - /// (negative is backwards, and positive is forwards).</param> - /// <returns>The character offset of the boundary position n boundaries away from - /// the current one.</returns> - public override int Next(int n) - { - int result = Current; - while (n > 0) - { - result = Next(); - --n; - } - while (n < 0) - { - result = Previous(); - ++n; - } - return result; - } - - /// <summary> - /// Advances the iterator to the next boundary position. - /// </summary> - /// <returns>The position of the first boundary after this one.</returns> - public override int Next() - { - if (currentBoundaryIndex >= boundaries.Count - 1 || boundaries.Count == 0) - { - return DONE; - } - currentBoundaryIndex++; - return ReturnCurrent(); - } - - /// <summary> - /// Advances the iterator backwards, to the last boundary preceding this one. - /// </summary> - /// <returns>The position of the last boundary position preceding this one.</returns> - public override int Previous() - { - if (currentBoundaryIndex == 0 || boundaries.Count == 0) - { - return DONE; - } - currentBoundaryIndex--; - return ReturnCurrent(); - } - - /// <summary> - /// Throw <see cref="ArgumentException"/> unless begin <= offset < end. - /// </summary> - /// <param name="offset"></param> - private void CheckOffset(int offset) - { - if (offset < m_start || offset > m_end) - { - throw new ArgumentException("offset out of bounds"); - } - } - - /// <summary> - /// Sets the iterator to refer to the first boundary position following - /// the specified position. 
- /// </summary> - /// <param name="offset">The position from which to begin searching for a break position.</param> - /// <returns>The position of the first break after the current position.</returns> - public override int Following(int offset) - { - CheckOffset(offset); - - if (boundaries.Count == 0) - { - return DONE; - } - - int following = GetLowestIndexGreaterThan(offset); - if (following == -1) - { - currentBoundaryIndex = boundaries.Count - 1; - return DONE; - } - else - { - currentBoundaryIndex = following; - } - return ReturnCurrent(); - } - - private int GetLowestIndexGreaterThan(int offset) - { - int index = boundaries.BinarySearch(offset); - if (index < 0) - { - return ~index; - } - else if (index + 1 < boundaries.Count) - { - return index + 1; - } - - return -1; - } - - /// <summary> - /// Sets the iterator to refer to the last boundary position before the - /// specified position. - /// </summary> - /// <param name="offset">The position to begin searching for a break from.</param> - /// <returns>The position of the last boundary before the starting position.</returns> - public override int Preceding(int offset) - { - CheckOffset(offset); - - if (boundaries.Count == 0) - { - return DONE; - } - - int preceeding = GetHighestIndexLessThan(offset); - if (preceeding == -1) - { - currentBoundaryIndex = 0; - return DONE; - } - else - { - currentBoundaryIndex = preceeding; - } - return ReturnCurrent(); - } - - private int GetHighestIndexLessThan(int offset) - { - int index = boundaries.BinarySearch(offset); - if (index < 0) - { - return ~index - 1; - } - else - { - // NOTE: This is intentionally allowed to return -1 in the case - // where index == 0. This state indicates we are before the first boundary. - return index - 1; - } - } - - /// <summary> - /// Returns the current iteration position. - /// </summary> - public override int Current - { - get { return ReturnCurrent(); } - } - - /// <summary> - /// Gets the text being analyzed. 
- /// </summary> - public override string Text - { - get - { - return text; - } - } - - /// <summary> - /// Set the iterator to analyze a new piece of text. This function resets - /// the current iteration position to the beginning of the text. - /// </summary> - /// <param name="newText">The text to analyze.</param> - public override void SetText(string newText) - { - text = newText; - currentBoundaryIndex = 0; - m_start = 0; - m_end = newText.Length; - - LoadBoundaries(m_start, m_end); - } - - public override void SetText(CharacterIterator newText) - { - text = newText.GetTextAsString(); - currentBoundaryIndex = 0; - m_start = newText.BeginIndex; - m_end = newText.EndIndex; - - LoadBoundaries(m_start, m_end); - } - - private void LoadBoundaries(int start, int end) - { - IEnumerable<Icu.Boundary> icuBoundaries; - string offsetText = text.Substring(start, end - start); - -#if !NETSTANDARD - try - { -#endif - if (type == Icu.BreakIterator.UBreakIteratorType.WORD) - { - if (enableHacks) - { - // LUCENENET TODO: HACK - replacing hyphen with "a" so hyphenated words aren't broken - offsetText = offsetText.Replace("-", "a"); - } - - icuBoundaries = Icu.BreakIterator.GetWordBoundaries(locale, offsetText, true); - } - else - { - if (enableHacks && type == Icu.BreakIterator.UBreakIteratorType.SENTENCE) - { - // LUCENENET TODO: HACK - newline character causes incorrect sentence breaking. - offsetText = offsetText.Replace("\n", " "); - // LUCENENET TODO: HACK - the ICU sentence logic doesn't work (in English anyway) when sentences don't - // begin with capital letters. 
- offsetText = CapitalizeFirst(offsetText); - } - - icuBoundaries = Icu.BreakIterator.GetBoundaries(type, locale, offsetText); - } -#if !NETSTANDARD - } - catch (AccessViolationException ace) - { - // LUCENENET TODO: Find a reliable way to reproduce and report the - // AccessViolationException that happens here to the icu-dotnet project team - throw new Exception("Hit AccessViolationException: " + ace.ToString(), ace); - } -#endif - - boundaries = icuBoundaries - .Select(t => new[] { t.Start + start, t.End + start }) - .SelectMany(b => b) - .Distinct() - .ToList(); - } - - /// <summary> - /// Returns true if the specified character offset is a text boundary. - /// </summary> - /// <param name="offset">the character offset to check.</param> - /// <returns><c>true</c> if "offset" is a boundary position, <c>false</c> otherwise.</returns> - public override bool IsBoundary(int offset) - { - CheckOffset(offset); - return boundaries.Contains(offset); - } - - private int ReturnCurrent() - { - if (boundaries.Count > 0) - { - return currentBoundaryIndex < boundaries.Count && currentBoundaryIndex > -1 - ? boundaries[currentBoundaryIndex] - : DONE; - } - - // If there are no boundaries, we must return the start offset - return m_start; - } - - /// <summary> - /// LUCENENET TODO: This is a temporary workaround for an issue with icu-dotnet - /// where it doesn't correctly break sentences unless they begin with a capital letter. - /// If/when ICU is fixed, this method should be deleted and the IcuBreakIterator - /// code changed to remove calls to this method. - /// </summary> - public static string CapitalizeFirst(string s) - { - bool isNewSentence = true; - var result = new StringBuilder(s.Length); - for (int i = 0; i < s.Length; i++) - { - if (isNewSentence && char.IsLetter(s[i])) - { - result.Append(char.ToUpper(s[i])); - isNewSentence = false; - } - else - result.Append(s[i]); - - if (s[i] == '!' || s[i] == '?' 
|| s[i] == '.') - { - isNewSentence = true; - } - } - - return result.ToString(); - } - } -} -#endif http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiAnalyzer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiAnalyzer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiAnalyzer.cs index aa6e1d7..0885069 100644 --- a/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiAnalyzer.cs +++ b/src/Lucene.Net.Analysis.Common/Analysis/Th/ThaiAnalyzer.cs @@ -111,7 +111,7 @@ namespace Lucene.Net.Analysis.Th /// built from a <see cref="StandardTokenizer"/> filtered with /// <see cref="StandardFilter"/>, <see cref="LowerCaseFilter"/>, <see cref="ThaiWordFilter"/>, and /// <see cref="StopFilter"/> </returns> - protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) + protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) { if (m_matchVersion.OnOrAfter(LuceneVersion.LUCENE_48)) { http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj b/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj index fb403aa..02545b2 100644 --- a/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj +++ b/src/Lucene.Net.Analysis.Common/Lucene.Net.Analysis.Common.csproj @@ -41,9 +41,6 @@ <Reference Include="System.XML" /> </ItemGroup> <ItemGroup> - <Compile Include="..\IcuBreakIterator.cs"> - <Link>IcuBreakIterator.cs</Link> - </Compile> <Compile Include="Analysis\Bg\BulgarianAnalyzer.cs" /> <Compile Include="Analysis\Bg\BulgarianStemFilter.cs" /> <Compile Include="Analysis\Bg\BulgarianStemFilterFactory.cs" /> 
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Analysis.Common/project.json ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/project.json b/src/Lucene.Net.Analysis.Common/project.json index ca771a3..556a89e 100644 --- a/src/Lucene.Net.Analysis.Common/project.json +++ b/src/Lucene.Net.Analysis.Common/project.json @@ -26,8 +26,7 @@ "define": [ "NETSTANDARD" ], "compile": { "includeFiles": [ - "../CommonAssemblyInfo.cs", - "../IcuBreakIterator.cs" + "../CommonAssemblyInfo.cs" ] }, "embed": { @@ -52,8 +51,7 @@ "define": [ "FEATURE_CLONEABLE", "FEATURE_DTD_PROCESSING", "FEATURE_SERIALIZABLE" ], "compile": { "includeFiles": [ - "../CommonAssemblyInfo.cs", - "../IcuBreakIterator.cs" + "../CommonAssemblyInfo.cs" ] }, "embed": { http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Highlighter/Lucene.Net.Highlighter.csproj ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Highlighter/Lucene.Net.Highlighter.csproj b/src/Lucene.Net.Highlighter/Lucene.Net.Highlighter.csproj index 31ac251..9c885d4 100644 --- a/src/Lucene.Net.Highlighter/Lucene.Net.Highlighter.csproj +++ b/src/Lucene.Net.Highlighter/Lucene.Net.Highlighter.csproj @@ -44,9 +44,6 @@ <Reference Include="System.Xml" /> </ItemGroup> <ItemGroup> - <Compile Include="..\IcuBreakIterator.cs"> - <Link>IcuBreakIterator.cs</Link> - </Compile> <Compile Include="Highlight\DefaultEncoder.cs" /> <Compile Include="Highlight\GradientFormatter.cs" /> <Compile Include="Highlight\Highlighter.cs" /> @@ -101,7 +98,7 @@ <Compile Include="VectorHighlight\SingleFragListBuilder.cs" /> <Compile Include="VectorHighlight\WeightedFieldFragList.cs" /> <Compile Include="VectorHighlight\WeightedFragListBuilder.cs" /> - <Compile Include="..\CommonAssemblyInfo.cs"> + <Compile Include="..\CommonAssemblyInfo.cs"> <Link>Properties\CommonAssemblyInfo.cs</Link> </Compile> </ItemGroup> 
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Highlighter/PostingsHighlight/DefaultPassageFormatter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Highlighter/PostingsHighlight/DefaultPassageFormatter.cs b/src/Lucene.Net.Highlighter/PostingsHighlight/DefaultPassageFormatter.cs index 4538d46..6a38bec 100644 --- a/src/Lucene.Net.Highlighter/PostingsHighlight/DefaultPassageFormatter.cs +++ b/src/Lucene.Net.Highlighter/PostingsHighlight/DefaultPassageFormatter.cs @@ -1,4 +1,5 @@ -using System; +#if FEATURE_BREAKITERATOR +using System; using System.Text; namespace Lucene.Net.Search.PostingsHighlight @@ -161,3 +162,4 @@ namespace Lucene.Net.Search.PostingsHighlight } } } +#endif \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Highlighter/PostingsHighlight/MultiTermHighlighting.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Highlighter/PostingsHighlight/MultiTermHighlighting.cs b/src/Lucene.Net.Highlighter/PostingsHighlight/MultiTermHighlighting.cs index e5a5bcd..bd79c80 100644 --- a/src/Lucene.Net.Highlighter/PostingsHighlight/MultiTermHighlighting.cs +++ b/src/Lucene.Net.Highlighter/PostingsHighlight/MultiTermHighlighting.cs @@ -1,4 +1,5 @@ -using Lucene.Net.Analysis; +#if FEATURE_BREAKITERATOR +using Lucene.Net.Analysis; using Lucene.Net.Analysis.TokenAttributes; using Lucene.Net.Index; using Lucene.Net.Search.Spans; @@ -344,3 +345,4 @@ namespace Lucene.Net.Search.PostingsHighlight } } } +#endif \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Highlighter/PostingsHighlight/Passage.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Highlighter/PostingsHighlight/Passage.cs b/src/Lucene.Net.Highlighter/PostingsHighlight/Passage.cs index 54a2446..b9a664f 100644 --- 
a/src/Lucene.Net.Highlighter/PostingsHighlight/Passage.cs +++ b/src/Lucene.Net.Highlighter/PostingsHighlight/Passage.cs @@ -1,4 +1,5 @@ -using Lucene.Net.Util; +#if FEATURE_BREAKITERATOR +using Lucene.Net.Util; using System.Collections.Generic; using System.Diagnostics; @@ -183,3 +184,4 @@ namespace Lucene.Net.Search.PostingsHighlight } } } +#endif \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Highlighter/PostingsHighlight/PassageFormatter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Highlighter/PostingsHighlight/PassageFormatter.cs b/src/Lucene.Net.Highlighter/PostingsHighlight/PassageFormatter.cs index ce367a6..770a6fa 100644 --- a/src/Lucene.Net.Highlighter/PostingsHighlight/PassageFormatter.cs +++ b/src/Lucene.Net.Highlighter/PostingsHighlight/PassageFormatter.cs @@ -1,4 +1,5 @@ -namespace Lucene.Net.Search.PostingsHighlight +#if FEATURE_BREAKITERATOR +namespace Lucene.Net.Search.PostingsHighlight { /* * Licensed to the Apache Software Foundation (ASF) under one or more @@ -42,3 +43,4 @@ public abstract object Format(Passage[] passages, string content); // LUCENENET TODO: Make return type generic? 
} } +#endif http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Highlighter/PostingsHighlight/PassageScorer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Highlighter/PostingsHighlight/PassageScorer.cs b/src/Lucene.Net.Highlighter/PostingsHighlight/PassageScorer.cs index af398da..de0fd45 100644 --- a/src/Lucene.Net.Highlighter/PostingsHighlight/PassageScorer.cs +++ b/src/Lucene.Net.Highlighter/PostingsHighlight/PassageScorer.cs @@ -1,4 +1,5 @@ -using System; +#if FEATURE_BREAKITERATOR +using System; namespace Lucene.Net.Search.PostingsHighlight { @@ -110,3 +111,4 @@ namespace Lucene.Net.Search.PostingsHighlight } } } +#endif \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Highlighter/Properties/AssemblyInfo.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Highlighter/Properties/AssemblyInfo.cs b/src/Lucene.Net.Highlighter/Properties/AssemblyInfo.cs index 6d2eedf..8969ff6 100644 --- a/src/Lucene.Net.Highlighter/Properties/AssemblyInfo.cs +++ b/src/Lucene.Net.Highlighter/Properties/AssemblyInfo.cs @@ -24,7 +24,9 @@ using System.Runtime.InteropServices; // The following GUID is for the ID of the typelib if this project is exposed to COM [assembly: Guid("e9e769ea-8504-44bc-8dc9-ccf958765f8f")] +[assembly: InternalsVisibleTo("Lucene.Net.Icu")] // for testing [assembly: InternalsVisibleTo("Lucene.Net.Tests.Highlighter")] +[assembly: InternalsVisibleTo("Lucene.Net.Tests.Icu")] // NOTE: Version information is in CommonAssemblyInfo.cs http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Highlighter/project.json ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Highlighter/project.json b/src/Lucene.Net.Highlighter/project.json index 5016f93..ce4b726 100644 --- a/src/Lucene.Net.Highlighter/project.json +++ 
b/src/Lucene.Net.Highlighter/project.json @@ -25,8 +25,7 @@ "define": [ "NETSTANDARD" ], "compile": { "includeFiles": [ - "../CommonAssemblyInfo.cs", - "../IcuBreakIterator.cs" + "../CommonAssemblyInfo.cs" ] } }, @@ -40,8 +39,7 @@ "define": [ "FEATURE_SERIALIZABLE" ], "compile": { "includeFiles": [ - "../CommonAssemblyInfo.cs", - "../IcuBreakIterator.cs" + "../CommonAssemblyInfo.cs" ] } } http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Icu/Analysis/Th/stopwords.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Icu/Analysis/Th/stopwords.txt b/src/Lucene.Net.Icu/Analysis/Th/stopwords.txt new file mode 100644 index 0000000..07f0fab --- /dev/null +++ b/src/Lucene.Net.Icu/Analysis/Th/stopwords.txt @@ -0,0 +1,119 @@ +# Thai stopwords from: +# "Opinion Detection in Thai Political News Columns +# Based on Subjectivity Analysis" +# Khampol Sukhum, Supot Nitsuwat, and Choochart Haruechaiyasak +à¹à¸§à¹ +à¹à¸¡à¹ +à¹à¸ +à¹à¸à¹ +à¹à¸«à¹ +à¹à¸ +à¹à¸à¸¢ +à¹à¸«à¹à¸ +à¹à¸¥à¹à¸§ +à¹à¸¥à¸° +à¹à¸£à¸ +à¹à¸à¸ +à¹à¸à¹ +à¹à¸à¸ +à¹à¸«à¹à¸ +à¹à¸¥à¸¢ +à¹à¸£à¸´à¹à¸¡ +à¹à¸£à¸² +à¹à¸¡à¸·à¹à¸ +à¹à¸à¸·à¹à¸ +à¹à¸à¸£à¸²à¸° +à¹à¸à¹à¸à¸à¸²à¸£ +à¹à¸à¹à¸ +à¹à¸à¸´à¸à¹à¸à¸¢ +à¹à¸à¸´à¸ +à¹à¸à¸·à¹à¸à¸à¸à¸²à¸ +à¹à¸à¸µà¸¢à¸§à¸à¸±à¸ +à¹à¸à¸µà¸¢à¸§ +à¹à¸à¹à¸ +à¹à¸à¸à¸²à¸° +à¹à¸à¸¢ +à¹à¸à¹à¸² +à¹à¸à¸² +à¸à¸µà¸ +à¸à¸²à¸ +à¸à¸°à¹à¸£ +à¸à¸à¸ +à¸à¸¢à¹à¸²à¸ +à¸à¸¢à¸¹à¹ +à¸à¸¢à¸²à¸ +หาภ+หลาย +หลัà¸à¸à¸²à¸ +หลัภ+หรืภ+หà¸à¸¶à¹à¸ +สà¹à¸§à¸ +สà¹à¸ +สุภ+สà¹à¸²à¸«à¸£à¸±à¸ +วà¹à¸² +วัภ+ลภ+รà¹à¸§à¸¡ +ราย +รัภ+ระหวà¹à¸²à¸ +รวม +ยัภ+มี +มาภ+มา +à¸à¸£à¹à¸à¸¡ +à¸à¸ +à¸à¹à¸²à¸ +à¸à¸¥ +à¸à¸²à¸ +à¸à¹à¸² +à¸à¸µà¹ +à¸à¹à¸² +à¸à¸±à¹à¸ +à¸à¸±à¸ +à¸à¸à¸à¸à¸²à¸ +à¸à¸¸à¸ +à¸à¸µà¹à¸ªà¸¸à¸ +à¸à¸µà¹ +à¸à¹à¸²à¹à¸«à¹ +à¸à¹à¸² +à¸à¸²à¸ +à¸à¸±à¹à¸à¸à¸µà¹ +à¸à¸±à¹à¸ +à¸à¹à¸² +à¸à¸¹à¸ +à¸à¸¶à¸ +à¸à¹à¸à¸ +à¸à¹à¸²à¸à¹ +à¸à¹à¸²à¸ +à¸à¹à¸ +à¸à¸²à¸¡ +à¸à¸±à¹à¸à¹à¸à¹ +à¸à¸±à¹à¸ +à¸à¹à¸²à¸ +à¸à¹à¸§à¸¢ +à¸à¸±à¸ +à¸à¸¶à¹à¸ +à¸à¹à¸§à¸ +à¸à¸¶à¸ +à¸à¸²à¸ +à¸à¸±à¸ +à¸à¸° +à¸à¸·à¸ 
+ความ +ครั้ง +คง +ขึ้น +ของ +ขอ +ขณะ +ก่อน +ก็ +การ +กับ +กัน +กว่า +กล่าว http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Icu/Lucene.Net.Icu.csproj ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Icu/Lucene.Net.Icu.csproj b/src/Lucene.Net.Icu/Lucene.Net.Icu.csproj new file mode 100644 index 0000000..267132e --- /dev/null +++ b/src/Lucene.Net.Icu/Lucene.Net.Icu.csproj @@ -0,0 +1,124 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" /> + <PropertyGroup> + <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration> + <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform> + <ProjectGuid>{349CB7C9-7534-4E1D-9B0A-5521441AF0AE}</ProjectGuid> + <OutputType>Library</OutputType> + <AppDesignerFolder>Properties</AppDesignerFolder> + <RootNamespace>Lucene.Net</RootNamespace> + <AssemblyName>Lucene.Net.Icu</AssemblyName> + <TargetFrameworkVersion>v4.5.1</TargetFrameworkVersion> + <FileAlignment>512</FileAlignment> + </PropertyGroup> + <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "> + <DebugSymbols>true</DebugSymbols> + <DebugType>full</DebugType> + <Optimize>false</Optimize> + <OutputPath>bin\Debug\</OutputPath> + <DefineConstants>DEBUG;TRACE</DefineConstants> + <ErrorReport>prompt</ErrorReport> + <WarningLevel>4</WarningLevel> + </PropertyGroup> + <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' "> + <DebugType>pdbonly</DebugType> + <Optimize>true</Optimize> + <OutputPath>bin\Release\</OutputPath> + <DefineConstants>TRACE</DefineConstants> + 
<ErrorReport>prompt</ErrorReport> + <WarningLevel>4</WarningLevel> + </PropertyGroup> + <PropertyGroup> + <DefineConstants>$(DefineConstants);FEATURE_BREAKITERATOR;FEATURE_SERIALIZABLE</DefineConstants> + </PropertyGroup> + <ItemGroup> + <Reference Include="System" /> + <Reference Include="System.Core" /> + <Reference Include="Microsoft.CSharp" /> + <Reference Include="System.Data" /> + </ItemGroup> + <ItemGroup> + <Compile Include="..\Lucene.Net.Analysis.Common\Analysis\Th\ThaiAnalyzer.cs"> + <Link>Analysis\Th\ThaiAnalyzer.cs</Link> + </Compile> + <Compile Include="..\Lucene.Net.Analysis.Common\Analysis\Th\ThaiTokenizer.cs"> + <Link>Analysis\Th\ThaiTokenizer.cs</Link> + </Compile> + <Compile Include="..\Lucene.Net.Analysis.Common\Analysis\Th\ThaiTokenizerFactory.cs"> + <Link>Analysis\Th\ThaiTokenizerFactory.cs</Link> + </Compile> + <Compile Include="..\Lucene.Net.Analysis.Common\Analysis\Th\ThaiWordFilter.cs"> + <Link>Analysis\Th\ThaiWordFilter.cs</Link> + </Compile> + <Compile Include="..\Lucene.Net.Analysis.Common\Analysis\Th\ThaiWordFilterFactory.cs"> + <Link>Analysis\Th\ThaiWordFilterFactory.cs</Link> + </Compile> + <Compile Include="..\Lucene.Net.Analysis.Common\Analysis\Util\CharArrayIterator.cs"> + <Link>Analysis\Util\CharArrayIterator.cs</Link> + </Compile> + <Compile Include="..\Lucene.Net.Analysis.Common\Analysis\Util\SegmentingTokenizerBase.cs"> + <Link>Analysis\Util\SegmentingTokenizerBase.cs</Link> + </Compile> + <Compile Include="..\Lucene.Net.Highlighter\PostingsHighlight\DefaultPassageFormatter.cs"> + <Link>Search\PostingsHighlight\DefaultPassageFormatter.cs</Link> + </Compile> + <Compile Include="..\Lucene.Net.Highlighter\PostingsHighlight\MultiTermHighlighting.cs"> + <Link>Search\PostingsHighlight\MultiTermHighlighting.cs</Link> + </Compile> + <Compile Include="..\Lucene.Net.Highlighter\PostingsHighlight\Passage.cs"> + <Link>Search\PostingsHighlight\Passage.cs</Link> + </Compile> + <Compile 
Include="..\Lucene.Net.Highlighter\PostingsHighlight\PassageFormatter.cs"> + <Link>Search\PostingsHighlight\PassageFormatter.cs</Link> + </Compile> + <Compile Include="..\Lucene.Net.Highlighter\PostingsHighlight\PassageScorer.cs"> + <Link>Search\PostingsHighlight\PassageScorer.cs</Link> + </Compile> + <Compile Include="..\Lucene.Net.Highlighter\PostingsHighlight\PostingsHighlighter.cs"> + <Link>Search\PostingsHighlight\PostingsHighlighter.cs</Link> + </Compile> + <Compile Include="..\Lucene.Net.Highlighter\PostingsHighlight\WholeBreakIterator.cs"> + <Link>Search\PostingsHighlight\WholeBreakIterator.cs</Link> + </Compile> + <Compile Include="..\Lucene.Net.Highlighter\VectorHighlight\BreakIteratorBoundaryScanner.cs"> + <Link>Search\VectorHighlight\BreakIteratorBoundaryScanner.cs</Link> + </Compile> + <Compile Include="Support\BreakIterator.cs" /> + <Compile Include="Support\CharacterIterator.cs" /> + <Compile Include="Support\IcuBreakIterator.cs" /> + <Compile Include="Properties\AssemblyInfo.cs" /> + <Compile Include="..\CommonAssemblyInfo.cs"> + <Link>Properties\CommonAssemblyInfo.cs</Link> + </Compile> + <Compile Include="Support\StringCharacterIterator.cs" /> + </ItemGroup> + <ItemGroup> + <ProjectReference Include="..\Lucene.Net.Analysis.Common\Lucene.Net.Analysis.Common.csproj"> + <Project>{4add0bbc-b900-4715-9526-d871de8eea64}</Project> + <Name>Lucene.Net.Analysis.Common</Name> + </ProjectReference> + <ProjectReference Include="..\Lucene.Net.Highlighter\Lucene.Net.Highlighter.csproj"> + <Project>{e9e769ea-8504-44bc-8dc9-ccf958765f8f}</Project> + <Name>Lucene.Net.Highlighter</Name> + </ProjectReference> + <ProjectReference Include="..\Lucene.Net\Lucene.Net.csproj"> + <Project>{5d4ad9be-1ffb-41ab-9943-25737971bf57}</Project> + <Name>Lucene.Net</Name> + </ProjectReference> + </ItemGroup> + <ItemGroup> + <None Include="Lucene.Net.Icu.project.json" /> + </ItemGroup> + <ItemGroup> + <EmbeddedResource Include="Analysis\Th\stopwords.txt" /> + </ItemGroup> + <Import 
Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" /> + <!-- To modify your build process, add your task inside one of the targets below and uncomment it. + Other similar extension points exist, see Microsoft.Common.targets. + <Target Name="BeforeBuild"> + </Target> + <Target Name="AfterBuild"> + </Target> + --> +</Project> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Icu/Lucene.Net.Icu.project.json ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Icu/Lucene.Net.Icu.project.json b/src/Lucene.Net.Icu/Lucene.Net.Icu.project.json new file mode 100644 index 0000000..af28fc8 --- /dev/null +++ b/src/Lucene.Net.Icu/Lucene.Net.Icu.project.json @@ -0,0 +1,11 @@ +{ + "runtimes": { + "win": {} + }, + "dependencies": { + "icu.net": "54.1.1-alpha" + }, + "frameworks": { + "net451": {} + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Icu/Lucene.Net.Icu.xproj ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Icu/Lucene.Net.Icu.xproj b/src/Lucene.Net.Icu/Lucene.Net.Icu.xproj new file mode 100644 index 0000000..dd48901 --- /dev/null +++ b/src/Lucene.Net.Icu/Lucene.Net.Icu.xproj @@ -0,0 +1,19 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup> + <VisualStudioVersion Condition="'$(VisualStudioVersion)' == ''">14.0</VisualStudioVersion> + <VSToolsPath Condition="'$(VSToolsPath)' == ''">$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)</VSToolsPath> + </PropertyGroup> + <Import Project="$(VSToolsPath)\DotNet\Microsoft.DotNet.Props" Condition="'$(VSToolsPath)' != ''" /> + <PropertyGroup Label="Globals"> + <ProjectGuid>44a5341b-0f52-429d-977a-c35e10eccadf</ProjectGuid> + <RootNamespace>Lucene.Net.Search</RootNamespace> 
+ <BaseIntermediateOutputPath Condition="'$(BaseIntermediateOutputPath)'=='' ">.\obj</BaseIntermediateOutputPath> + <OutputPath Condition="'$(OutputPath)'=='' ">.\bin\</OutputPath> + <TargetFrameworkVersion>v4.5.1</TargetFrameworkVersion> + </PropertyGroup> + <PropertyGroup> + <SchemaVersion>2.0</SchemaVersion> + </PropertyGroup> + <Import Project="$(VSToolsPath)\DotNet\Microsoft.DotNet.targets" Condition="'$(VSToolsPath)' != ''" /> +</Project> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Icu/Properties/AssemblyInfo.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Icu/Properties/AssemblyInfo.cs b/src/Lucene.Net.Icu/Properties/AssemblyInfo.cs new file mode 100644 index 0000000..3cdd9b2 --- /dev/null +++ b/src/Lucene.Net.Icu/Properties/AssemblyInfo.cs @@ -0,0 +1,31 @@ +using System; +using System.Reflection; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +// General Information about an assembly is controlled through the following +// set of attributes. Change these attribute values to modify the information +// associated with an assembly. +[assembly: AssemblyTitle("Lucene.Net.Icu")] +[assembly: AssemblyDescription( + "International Components for Unicode-based features including Thai analyzer support, " + + "an international postings highlighter, and BreakIterator support for the vector highlighter in Lucene.Net.Highlighter " + + "for the Lucene.Net full-text search engine library from The Apache Software Foundation.")] +[assembly: AssemblyConfiguration("")] +[assembly: AssemblyDefaultAlias("Lucene.Net.Icu")] +[assembly: AssemblyCulture("")] + +[assembly: CLSCompliant(true)] + +// Setting ComVisible to false makes the types in this assembly not visible +// to COM components. If you need to access a type in this assembly from +// COM, set the ComVisible attribute to true on that type. 
+[assembly: ComVisible(false)] + +// The following GUID is for the ID of the typelib if this project is exposed to COM +[assembly: Guid("349cb7c9-7534-4e1d-9b0a-5521441af0ae")] + +// for testing +[assembly: InternalsVisibleTo("Lucene.Net.Tests.Icu")] + +// NOTE: Version information is in CommonAssemblyInfo.cs http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Icu/Support/BreakIterator.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Icu/Support/BreakIterator.cs b/src/Lucene.Net.Icu/Support/BreakIterator.cs new file mode 100644 index 0000000..ded1c9c --- /dev/null +++ b/src/Lucene.Net.Icu/Support/BreakIterator.cs @@ -0,0 +1,231 @@ +#if FEATURE_BREAKITERATOR +using System; + +namespace Lucene.Net.Support +{ + /// <summary> + /// The <code>BreakIterator</code> class implements methods for finding + /// the location of boundaries in text. Instances of <code>BreakIterator</code> + /// maintain a current position and scan over text + /// returning the index of characters where boundaries occur. + /// </summary> + public abstract class BreakIterator +#if FEATURE_CLONEABLE + : ICloneable +#endif + { + /// <summary> + /// Constructor. BreakIterator is stateless and has no default behavior. + /// </summary> + protected BreakIterator() + { + } + + /// <summary> + /// Create a copy of this iterator + /// </summary> + /// <returns>A member-wise copy of this</returns> + public object Clone() + { + return MemberwiseClone(); + } + + /// <summary> + /// DONE is returned by Previous(), Next(), Next(int), Preceding(int) + /// and Following(int) when either the first or last text boundary has been + /// reached. + /// </summary> + public static readonly int DONE = -1; + + /// <summary> + /// Returns the first boundary. The iterator's current position is set + /// to the first text boundary. 
+ /// </summary> + /// <returns>The character index of the first text boundary</returns> + public abstract int First(); + + /// <summary> + /// Returns the last boundary. The iterator's current position is set + /// to the last text boundary. + /// </summary> + /// <returns>The character index of the last text boundary.</returns> + public abstract int Last(); + + /// <summary> + /// Returns the nth boundary from the current boundary. If either + /// the first or last text boundary has been reached, it returns + /// <see cref="BreakIterator.DONE"/> and the current position is set to either + /// the first or last text boundary depending on which one is reached. Otherwise, + /// the iterator's current position is set to the new boundary. + /// For example, if the iterator's current position is the mth text boundary + /// and three more boundaries exist from the current boundary to the last text + /// boundary, the Next(2) call will return m + 2. The new text position is set + /// to the (m + 2)th text boundary. A Next(4) call would return + /// <see cref="BreakIterator.DONE"/> and the last text boundary would become the + /// new text position. + /// </summary> + /// <param name="n"> + /// which boundary to return. A value of 0 + /// does nothing. Negative values move to previous boundaries + /// and positive values move to later boundaries. + /// </param> + /// <returns> + /// The character index of the nth boundary from the current position + /// or <see cref="BreakIterator.DONE"/> if either first or last text boundary + /// has been reached. + /// </returns> + public abstract int Next(int n); + + /// <summary> + /// Returns the boundary following the current boundary. If the current boundary + /// is the last text boundary, it returns <c>BreakIterator.DONE</c> and + /// the iterator's current position is unchanged. Otherwise, the iterator's + /// current position is set to the boundary following the current boundary. 
+ /// </summary> + /// <returns> + /// The character index of the next text boundary or + /// <see cref="BreakIterator.DONE"/> if the current boundary is the last text + /// boundary. + /// Equivalent to Next(1). + /// </returns> + /// <seealso cref="Next(int)"/> + public abstract int Next(); + + /// <summary> + /// Returns the boundary preceding the current boundary. If the current boundary + /// is the first text boundary, it returns <code>BreakIterator.DONE</code> and + /// the iterator's current position is unchanged. Otherwise, the iterator's + /// current position is set to the boundary preceding the current boundary. + /// </summary> + /// <returns> + /// The character index of the previous text boundary or + /// <see cref="BreakIterator.DONE"/> if the current boundary is the first text + /// boundary. + /// </returns> + public abstract int Previous(); + + /// <summary> + /// Returns the first boundary following the specified character offset. If the + /// specified offset equals to the last text boundary, it returns + /// <see cref="BreakIterator.DONE"/> and the iterator's current position is unchanged. + /// Otherwise, the iterator's current position is set to the returned boundary. + /// The value returned is always greater than the offset or the value + /// <see cref="BreakIterator.DONE"/>. + /// </summary> + /// <param name="offset">the character offset to begin scanning.</param> + /// <returns> + /// The first boundary after the specified offset or + /// <see cref="BreakIterator.DONE"/> if the last text boundary is passed in + /// as the offset. + /// </returns> + /// <exception cref="ArgumentException"> + /// if the specified offset is less than + /// the first text boundary or greater than the last text boundary. + /// </exception> + public abstract int Following(int offset); + + /// <summary> + /// Returns the last boundary preceding the specified character offset. 
If the + /// specified offset equals to the first text boundary, it returns + /// <see cref="BreakIterator.DONE"/> and the iterator's current position is unchanged. + /// Otherwise, the iterator's current position is set to the returned boundary. + /// The value returned is always less than the offset or the value + /// <see cref="BreakIterator.DONE"/>. + /// </summary> + /// <param name="offset">the character offset to begin scanning.</param> + /// <returns> + /// The last boundary before the specified offset or + /// <see cref="BreakIterator.DONE"/> if the first text boundary is passed in + /// as the offset. + /// </returns> + public abstract int Preceding(int offset); + //{ + // // NOTE: This implementation is here solely because we can't add new + // // abstract methods to an existing class. There is almost ALWAYS a + // // better, faster way to do this. + // int pos = Following(offset); + // while (pos >= offset && pos != DONE) + // { + // pos = Previous(); + // } + // return pos; + //} + + /// <summary> + /// Returns true if the specified character offset is a text boundary. + /// </summary> + /// <param name="offset">the character offset to check.</param> + /// <returns><c>true</c> if "offset" is a boundary position, <c>false</c> otherwise.</returns> + /// <exception cref="ArgumentException"> + /// if the specified offset is less than + /// the first text boundary or greater than the last text boundary. + /// </exception> + public abstract bool IsBoundary(int offset); + //{ + // // NOTE: This implementation probably is wrong for most situations + // // because it fails to take into account the possibility that a + // // CharacterIterator passed to setText() may not have a begin offset + // // of 0. But since the abstract BreakIterator doesn't have that + // // knowledge, it assumes the begin offset is 0. If you subclass + // // BreakIterator, copy the SimpleTextBoundary implementation of this + // // function into your subclass. 
[This should have been abstract at + // // this level, but it's too late to fix that now.] + // if (offset == 0) + // { + // return true; + // } + // int boundary = Following(offset - 1); + // if (boundary == DONE) + // { + // throw new ArgumentException(); + // } + // return boundary == offset; + //} + + /// <summary> + /// Returns character index of the text boundary that was most + /// recently returned by Next(), Next(int), Previous(), First(), Last(), + /// Following(int) or Preceding(int). If any of these methods returns + /// <see cref="BreakIterator.DONE"/> because either first or last text boundary + /// has been reached, it returns the first or last text boundary depending on + /// which one is reached. + /// </summary> + /// <returns> + /// The text boundary returned from the above methods, first or last + /// text boundary. + /// </returns> + /// <seealso cref="Next()"/> + /// <seealso cref="Next(int)"/> + /// <seealso cref="Previous()"/> + /// <seealso cref="First()"/> + /// <seealso cref="Last()"/> + /// <seealso cref="Following(int)"/> + /// <seealso cref="Preceding(int)"/> + public abstract int Current { get; } + + /// <summary> + /// Get the text being scanned + /// </summary> + /// <returns>the text being scanned</returns> + //public abstract CharacterIterator GetText(); + public abstract string Text { get; } + + /// <summary> + /// Set a new text string to be scanned. The current scan + /// position is reset to First(). + /// </summary> + /// <param name="newText">new text to scan.</param> + public virtual void SetText(string newText) + { + SetText(new StringCharacterIterator(newText)); + } + + /// <summary> + /// Set a new text string to be scanned. The current scan + /// position is reset to First(). 
+ /// </summary> + /// <param name="newText">new text to scan.</param> + public abstract void SetText(CharacterIterator newText); + } +} +#endif \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Icu/Support/CharacterIterator.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Icu/Support/CharacterIterator.cs b/src/Lucene.Net.Icu/Support/CharacterIterator.cs new file mode 100644 index 0000000..0c81629 --- /dev/null +++ b/src/Lucene.Net.Icu/Support/CharacterIterator.cs @@ -0,0 +1,50 @@ +#if FEATURE_BREAKITERATOR +using System; + +namespace Lucene.Net.Support +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + public abstract class CharacterIterator + { + public static readonly char DONE = '\uFFFF'; + + public abstract char Current { get; } + + public abstract char First(); + + public abstract char Last(); + + public abstract char Next(); + + public abstract char Previous(); + + public abstract char SetIndex(int position); + + public abstract int BeginIndex { get; } + + public abstract int EndIndex { get; } + + public abstract int Index { get; } + + public abstract object Clone(); + + public abstract string GetTextAsString(); + } +} +#endif \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Icu/Support/IcuBreakIterator.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Icu/Support/IcuBreakIterator.cs b/src/Lucene.Net.Icu/Support/IcuBreakIterator.cs new file mode 100644 index 0000000..79819ed --- /dev/null +++ b/src/Lucene.Net.Icu/Support/IcuBreakIterator.cs @@ -0,0 +1,394 @@ +#if FEATURE_BREAKITERATOR +using Lucene.Net.Support; +using System; +using System.Collections.Generic; +using System.Globalization; +using System.Linq; +using System.Text; + +namespace Lucene.Net +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// A <see cref="BreakIterator"/> implementation that encapsulates the functionality + /// of icu.net's <see cref="Icu.BreakIterator"/> static class. A <see cref="BreakIterator"/> + /// provides methods to move forward, reverse, and randomly through a set of text breaks + /// defined by the <see cref="Icu.BreakIterator.UBreakIteratorType"/> enumeration. + /// </summary> + // LUCENENET specific type + public class IcuBreakIterator : BreakIterator + { + private readonly Icu.Locale locale; + private readonly Icu.BreakIterator.UBreakIteratorType type; + + private List<int> boundaries = new List<int>(); + private int currentBoundaryIndex; // Index (not the value) of the current boundary in boundaries + private string text; + + /// <summary> + /// The start offset for the string, if supplied by a <see cref="CharacterIterator"/> + /// </summary> + protected int m_start; + + /// <summary> + /// The end offset for the string, if supplied by a <see cref="CharacterIterator"/> + /// </summary> + protected int m_end; + + private bool enableHacks = false; + + public IcuBreakIterator(Icu.BreakIterator.UBreakIteratorType type) + : this(type, CultureInfo.CurrentCulture) + { + } + + public IcuBreakIterator(Icu.BreakIterator.UBreakIteratorType type, CultureInfo locale) + { + if (locale == null) + throw new ArgumentNullException("locale"); + this.locale = new Icu.Locale(locale.Name); + this.type = type; + } + + + public virtual bool EnableHacks + { + get { return enableHacks; } + set { enableHacks = value; } + } + + /// <summary> + /// Sets the current iteration position to the beginning of the text. + /// </summary> + /// <returns>The offset of the beginning of the text.</returns> + public override int First() + { + currentBoundaryIndex = 0; + return ReturnCurrent(); + } + + /// <summary> + /// Sets the current iteration position to the end of the text. 
+        /// </summary>
+        /// <returns>The text's past-the-end offset.</returns>
+        public override int Last()
+        {
+            currentBoundaryIndex = boundaries.Count - 1;
+            return ReturnCurrent();
+        }
+
+        /// <summary>
+        /// Advances the iterator either forward or backward the specified number of steps.
+        /// Negative values move backward, and positive values move forward. This is
+        /// equivalent to repeatedly calling <see cref="Next()"/> or <see cref="Previous()"/>.
+        /// </summary>
+        /// <param name="n">The number of steps to move. The sign indicates the direction
+        /// (negative is backwards, and positive is forwards). Zero leaves the position unchanged.</param>
+        /// <returns>The character offset of the boundary position n boundaries away from
+        /// the current one, or <see cref="BreakIterator.DONE"/> if the first or last boundary
+        /// was reached before all steps could be taken.</returns>
+        public override int Next(int n)
+        {
+            int result = Current;
+
+            // Once Next()/Previous() report DONE the position no longer moves, so every
+            // remaining step would also return DONE. Stop right there instead of looping
+            // |n| times (which is ~2 billion iterations for int.MaxValue / int.MinValue).
+            while (n > 0)
+            {
+                result = Next();
+                if (result == DONE)
+                {
+                    break;
+                }
+                --n;
+            }
+            while (n < 0)
+            {
+                result = Previous();
+                if (result == DONE)
+                {
+                    break;
+                }
+                ++n;
+            }
+            return result;
+        }
+
+        /// <summary>
+        /// Advances the iterator to the next boundary position.
+        /// </summary>
+        /// <returns>The position of the first boundary after this one, or
+        /// <see cref="BreakIterator.DONE"/> (position unchanged) when the iterator is already
+        /// on the last boundary or there are no boundaries.</returns>
+        public override int Next()
+        {
+            if (currentBoundaryIndex >= boundaries.Count - 1 || boundaries.Count == 0)
+            {
+                return DONE;
+            }
+            currentBoundaryIndex++;
+            return ReturnCurrent();
+        }
+
+        /// <summary>
+        /// Advances the iterator backwards, to the last boundary preceding this one.
+        /// </summary>
+        /// <returns>The position of the last boundary position preceding this one, or
+        /// <see cref="BreakIterator.DONE"/> (position unchanged) when the iterator is already
+        /// on the first boundary or there are no boundaries.</returns>
+        public override int Previous()
+        {
+            if (currentBoundaryIndex == 0 || boundaries.Count == 0)
+            {
+                return DONE;
+            }
+            currentBoundaryIndex--;
+            return ReturnCurrent();
+        }
+
+        /// <summary>
+        /// Throw <see cref="ArgumentException"/> unless begin &lt;= offset &lt;= end.
+        /// </summary>
+        /// <param name="offset">The character offset to validate against the current text range
+        /// (both the start and the end offset are accepted).</param>
+        private void CheckOffset(int offset)
+        {
+            if (offset < m_start || offset > m_end)
+            {
+                throw new ArgumentException("offset out of bounds");
+            }
+        }
+
+        /// <summary>
+        /// Sets the iterator to refer to the first boundary position following
+        /// the specified position.
+        /// </summary>
+        /// <param name="offset">The position from which to begin searching for a break position.</param>
+        /// <returns>The position of the first break after <paramref name="offset"/>, or
+        /// <see cref="BreakIterator.DONE"/> if there is no boundary after it (the iterator is
+        /// then left on the last boundary).</returns>
+        /// <exception cref="ArgumentException">If <paramref name="offset"/> lies outside the text range.</exception>
+        public override int Following(int offset)
+        {
+            CheckOffset(offset);
+
+            if (boundaries.Count == 0)
+            {
+                return DONE;
+            }
+
+            int following = GetLowestIndexGreaterThan(offset);
+            if (following == -1)
+            {
+                // Nothing lies beyond the offset: park on the last boundary and report DONE.
+                currentBoundaryIndex = boundaries.Count - 1;
+                return DONE;
+            }
+
+            currentBoundaryIndex = following;
+            return ReturnCurrent();
+        }
+
+        /// <summary>
+        /// Finds the index (into the boundary list, not the text) of the first boundary
+        /// whose value is strictly greater than <paramref name="offset"/>.
+        /// </summary>
+        /// <param name="offset">The character offset to search from.</param>
+        /// <returns>The index of the first boundary greater than <paramref name="offset"/>,
+        /// or -1 if no such boundary exists.</returns>
+        private int GetLowestIndexGreaterThan(int offset)
+        {
+            int index = boundaries.BinarySearch(offset);
+
+            // Exact hit: the next element is the first greater one.
+            // Miss: ~index is the insertion point, i.e. the index of the first greater
+            // element, or boundaries.Count when every boundary is smaller than the offset.
+            int candidate = index >= 0 ? index + 1 : ~index;
+
+            // Never hand back an index past the end of the list; callers treat -1 as "none".
+            return candidate < boundaries.Count ? candidate : -1;
+        }
+
+        /// <summary>
+        /// Sets the iterator to refer to the last boundary position before the
+        /// specified position.
+        /// </summary>
+        /// <param name="offset">The position to begin searching for a break from.</param>
+        /// <returns>The position of the last boundary before the starting position, or
+        /// <see cref="BreakIterator.DONE"/> when no boundary lies before it.</returns>
+        public override int Preceding(int offset)
+        {
+            CheckOffset(offset);
+
+            if (boundaries.Count == 0)
+            {
+                return DONE;
+            }
+
+            int precedingIndex = GetHighestIndexLessThan(offset);
+            if (precedingIndex == -1)
+            {
+                // Nothing lies before the offset: park on the first boundary and report DONE.
+                currentBoundaryIndex = 0;
+                return DONE;
+            }
+
+            currentBoundaryIndex = precedingIndex;
+            return ReturnCurrent();
+        }
+
+        /// <summary>
+        /// Finds the index (into the boundary list, not the text) of the last boundary
+        /// whose value is strictly less than <paramref name="offset"/>.
+        /// </summary>
+        /// <returns>The index of that boundary, or -1 when the offset is at or before
+        /// the first boundary.</returns>
+        private int GetHighestIndexLessThan(int offset)
+        {
+            int found = boundaries.BinarySearch(offset);
+
+            // On a miss, ~found is the insertion point; on a hit, found is the match itself.
+            // Either way the element just before that slot is the highest one below the offset.
+            // NOTE: This is intentionally allowed to yield -1 (slot 0), which indicates
+            // we are before the first boundary.
+            int slot = found < 0 ? ~found : found;
+            return slot - 1;
+        }
+
+        /// <summary>
+        /// Gets the boundary the iterator is currently positioned on.
+        /// </summary>
+        public override int Current
+        {
+            get
+            {
+                return ReturnCurrent();
+            }
+        }
+
+        /// <summary>
+        /// Gets the text most recently supplied to this iterator for analysis.
+        /// </summary>
+        public override string Text
+        {
+            get { return text; }
+        }
+
+        /// <summary>
+        /// Set the iterator to analyze a new piece of text. This function resets
+        /// the current iteration position to the beginning of the text.
+        /// </summary>
+        /// <param name="newText">The text to analyze.</param>
+        /// <exception cref="ArgumentNullException">If <paramref name="newText"/> is <c>null</c>.</exception>
+        public override void SetText(string newText)
+        {
+            // Validate before touching any state so a bad call leaves the iterator usable.
+            if (newText == null)
+                throw new ArgumentNullException("newText");
+
+            text = newText;
+            currentBoundaryIndex = 0;
+            m_start = 0;
+            m_end = newText.Length;
+
+            LoadBoundaries(m_start, m_end);
+        }
+
+        /// <summary>
+        /// Set the iterator to analyze the text exposed by a <see cref="CharacterIterator"/>.
+        /// Only the range between the iterator's begin and end index is analyzed; the
+        /// reported boundaries are offsets into the iterator's full text. The current
+        /// iteration position is reset to the first boundary.
+        /// </summary>
+        /// <param name="newText">The character iterator supplying the text and range to analyze.</param>
+        /// <exception cref="ArgumentNullException">If <paramref name="newText"/> is <c>null</c>.</exception>
+        public override void SetText(CharacterIterator newText)
+        {
+            if (newText == null)
+                throw new ArgumentNullException("newText");
+
+            text = newText.GetTextAsString();
+            currentBoundaryIndex = 0;
+            m_start = newText.BeginIndex;
+            m_end = newText.EndIndex;
+
+            LoadBoundaries(m_start, m_end);
+        }
+
+        /// <summary>
+        /// Asks ICU for the breaks in <c>text[start, end)</c> and caches them, shifted by
+        /// <paramref name="start"/>, as the list of boundary offsets used by all navigation methods.
+        /// </summary>
+        /// <param name="start">Inclusive start offset into the current text.</param>
+        /// <param name="end">Exclusive end offset into the current text.</param>
+        private void LoadBoundaries(int start, int end)
+        {
+            IEnumerable<Icu.Boundary> icuBoundaries;
+            string offsetText = text.Substring(start, end - start);
+
+#if !NETSTANDARD
+            try
+            {
+#endif
+                if (type == Icu.BreakIterator.UBreakIteratorType.WORD)
+                {
+                    if (enableHacks)
+                    {
+                        // LUCENENET TODO: HACK - replacing hyphen with "a" so hyphenated words aren't broken
+                        offsetText = offsetText.Replace("-", "a");
+                    }
+
+                    icuBoundaries = Icu.BreakIterator.GetWordBoundaries(locale, offsetText, true);
+                }
+                else
+                {
+                    if (enableHacks && type == Icu.BreakIterator.UBreakIteratorType.SENTENCE)
+                    {
+                        // LUCENENET TODO: HACK - newline character causes incorrect sentence breaking.
+                        offsetText = offsetText.Replace("\n", " ");
+                        // LUCENENET TODO: HACK - the ICU sentence logic doesn't work (in English anyway) when sentences don't
+                        // begin with capital letters.
+                        offsetText = CapitalizeFirst(offsetText);
+                    }
+
+                    icuBoundaries = Icu.BreakIterator.GetBoundaries(type, locale, offsetText);
+                }
+#if !NETSTANDARD
+            }
+            catch (AccessViolationException ace)
+            {
+                // LUCENENET TODO: Find a reliable way to reproduce and report the
+                // AccessViolationException that happens here to the icu-dotnet project team
+                throw new Exception("Hit AccessViolationException: " + ace.ToString(), ace);
+            }
+#endif
+
+            // Each ICU segment contributes its start and end; adjacent segments share an
+            // offset, so duplicates are dropped. Offsets are shifted back into the
+            // coordinate space of the full text.
+            boundaries = icuBoundaries
+                .SelectMany(b => new[] { b.Start + start, b.End + start })
+                .Distinct()
+                .ToList();
+        }
+
+        /// <summary>
+        /// Returns true if the specified character offset is a text boundary.
+        /// </summary>
+        /// <param name="offset">the character offset to check.</param>
+        /// <returns><c>true</c> if "offset" is a boundary position, <c>false</c> otherwise.</returns>
+        /// <exception cref="ArgumentException">If <paramref name="offset"/> lies outside the text range.</exception>
+        public override bool IsBoundary(int offset)
+        {
+            CheckOffset(offset);
+            return boundaries.Contains(offset);
+        }
+
+        /// <summary>
+        /// Resolves the current boundary index to a character offset.
+        /// </summary>
+        /// <returns>The offset of the current boundary; <see cref="BreakIterator.DONE"/> if the
+        /// index is outside the boundary list; or the start offset when there are no boundaries.</returns>
+        private int ReturnCurrent()
+        {
+            if (boundaries.Count > 0)
+            {
+                return currentBoundaryIndex < boundaries.Count && currentBoundaryIndex > -1
+                    ? boundaries[currentBoundaryIndex]
+                    : DONE;
+            }
+
+            // If there are no boundaries, we must return the start offset
+            return m_start;
+        }
+
+        /// <summary>
+        /// LUCENENET TODO: This is a temporary workaround for an issue with icu-dotnet
+        /// where it doesn't correctly break sentences unless they begin with a capital letter.
+        /// If/when ICU is fixed, this method should be deleted and the IcuBreakIterator
+        /// code changed to remove calls to this method.
+        /// </summary>
+        public static string CapitalizeFirst(string s)
+        {
+            bool isNewSentence = true;
+            var result = new StringBuilder(s.Length);
+            for (int i = 0; i < s.Length; i++)
+            {
+                if (isNewSentence && char.IsLetter(s[i]))
+                {
+                    result.Append(char.ToUpper(s[i]));
+                    isNewSentence = false;
+                }
+                else
+                    result.Append(s[i]);
+
+                if (s[i] == '!' || s[i] == '?' 
|| s[i] == '.') + { + isNewSentence = true; + } + } + + return result.ToString(); + } + } +} +#endif http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Icu/Support/StringCharacterIterator.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Icu/Support/StringCharacterIterator.cs b/src/Lucene.Net.Icu/Support/StringCharacterIterator.cs new file mode 100644 index 0000000..a91e49a --- /dev/null +++ b/src/Lucene.Net.Icu/Support/StringCharacterIterator.cs @@ -0,0 +1,232 @@ +#if FEATURE_BREAKITERATOR +/* + * Copyright (c) 1996, 2013, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + * (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved + * (C) Copyright IBM Corp. 
1996 - 1998 - All Rights Reserved
 *
 * The original version of this source code and documentation
 * is copyrighted and owned by Taligent, Inc., a wholly-owned
 * subsidiary of IBM. These materials are provided under terms
 * of a License Agreement between Taligent and Sun. This technology
 * is protected by multiple US and International patents.
 *
 * This notice and attribution to Taligent may not be removed.
 * Taligent is a registered trademark of Taligent, Inc.
 *
 */

using System;
using System.Collections.Generic;
using System.Globalization;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Lucene.Net.Support
{
    /// <summary>
    /// <see cref="StringCharacterIterator"/> implements the
    /// <see cref="CharacterIterator"/> protocol for a <see cref="string"/>.
    /// The <see cref="StringCharacterIterator"/> class iterates over the
    /// entire <see cref="string"/>.
    /// </summary>
    /// <seealso cref="CharacterIterator"/>
    public class StringCharacterIterator : CharacterIterator
    {
        private string text;
        private int begin;
        private int end;
        // invariant: begin <= pos <= end
        private int pos;


        /// <summary>
        /// Creates an iterator over the whole of <paramref name="text"/>, positioned at index 0.
        /// </summary>
        /// <param name="text">The string to iterate over.</param>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is <c>null</c>.</exception>
        public StringCharacterIterator(string text)
            : this(text, 0)
        {
        }

        /// <summary>
        /// Creates an iterator over the whole of <paramref name="text"/>, positioned at
        /// <paramref name="pos"/>.
        /// </summary>
        /// <param name="text">The string to iterate over.</param>
        /// <param name="pos">The initial position.</param>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is <c>null</c>.</exception>
        /// <exception cref="ArgumentException"><paramref name="pos"/> is outside the text.</exception>
        public StringCharacterIterator(string text, int pos)
            // Do not dereference a null text here; let the four-argument constructor
            // report it with ArgumentNullException, as it does for its own callers.
            : this(text, 0, text == null ? 0 : text.Length, pos)
        {
        }

        /// <summary>
        /// Creates an iterator restricted to the range <paramref name="begin"/> to
        /// <paramref name="end"/> of <paramref name="text"/>, positioned at <paramref name="pos"/>.
        /// </summary>
        /// <param name="text">The string to iterate over.</param>
        /// <param name="begin">Index of the first character of the range.</param>
        /// <param name="end">Index just past the last character of the range.</param>
        /// <param name="pos">The initial position; must satisfy begin &lt;= pos &lt;= end.</param>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is <c>null</c>.</exception>
        /// <exception cref="ArgumentException">The range or the position is invalid.</exception>
        public StringCharacterIterator(string text, int begin, int end, int pos)
        {
            if (text == null)
                throw new ArgumentNullException("text");
            this.text = text;

            if (begin < 0 || begin > end || end > text.Length)
                throw new ArgumentException("Invalid substring range");

            if (pos < begin || pos > end)
                throw new ArgumentException("Invalid position");

            this.begin = begin;
            this.end = end;
            this.pos = pos;
        }

        /// <summary>
        /// Replaces the text of this iterator; the range becomes the whole new string and the
        /// position is reset to 0.
        /// </summary>
        /// <param name="text">The new string to iterate over.</param>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is <c>null</c>.</exception>
        public void SetText(string text)
        {
            if (text == null)
                throw new ArgumentNullException("text");
            this.text = text;
            this.begin = 0;
            this.end = text.Length;
            this.pos = 0;
        }

        /// <summary>
        /// Moves to the beginning of the range and returns the character there
        /// (<c>DONE</c> when the range is empty).
        /// </summary>
        public override char First()
        {
            pos = begin;
            return Current;
        }

        /// <summary>
        /// Moves to the last character of the range and returns it. For an empty range the
        /// position is set to the end index and <c>DONE</c> is returned.
        /// </summary>
        public override char Last()
        {
            if (end != begin)
            {
                pos = end - 1;
            }
            else
            {
                pos = end;
            }
            return Current;
        }

        /// <summary>
        /// Moves to <paramref name="position"/> and returns the character there.
        /// </summary>
        /// <param name="position">The new position; must satisfy begin &lt;= position &lt;= end.</param>
        /// <exception cref="ArgumentException"><paramref name="position"/> is outside the range.</exception>
        public override char SetIndex(int position)
        {
            if (position < begin || position > end)
                throw new ArgumentException("Invalid index");
            pos = position;
            return Current;
        }

        /// <summary>
        /// Gets the character at the current position, or <c>DONE</c> when the position is
        /// not on a character of the range (for example when it equals the end index).
        /// </summary>
        public override char Current
        {
            get
            {
                if (pos >= begin && pos < end)
                {
                    return text[pos];
                }
                else
                {
                    return DONE;
                }
            }
        }

        /// <summary>
        /// Advances by one character and returns it. When there is no further character the
        /// position is set to the end index and <c>DONE</c> is returned.
        /// </summary>
        public override char Next()
        {
            if (pos < end - 1)
            {
                pos++;
                return text[pos];
            }
            else
            {
                pos = end;
                return DONE;
            }
        }

        /// <summary>
        /// Steps back by one character and returns it. At the beginning of the range the
        /// position is left unchanged and <c>DONE</c> is returned.
        /// </summary>
        public override char Previous()
        {
            if (pos > begin)
            {
                pos--;
                return text[pos];
            }
            else
            {
                return DONE;
            }
        }


        /// <summary>Gets the index at which the range begins.</summary>
        public override int BeginIndex
        {
            get
            {
                return begin;
            }
        }

        /// <summary>Gets the index just past the end of the range.</summary>
        public override int EndIndex
        {
            get
            {
                return end;
            }
        }

        /// <summary>Gets the current position.</summary>
        public override int Index
        {
            get
            {
                return pos;
            }
        }

        /// <summary>
        /// Returns the complete underlying string, not only the part between the begin and
        /// end indexes.
        /// </summary>
        public override string GetTextAsString()
        {
            return text;
        }

        /// <summary>
        /// Two iterators are equal when they have the same text (ordinal comparison), the same
        /// range and the same current position.
        /// </summary>
        /// <param name="obj">The object to compare with.</param>
        /// <returns><c>true</c> if <paramref name="obj"/> is a <see cref="StringCharacterIterator"/>
        /// with identical state; otherwise <c>false</c>.</returns>
        public override bool Equals(object obj)
        {
            if (ReferenceEquals(this, obj))
                return true;

            StringCharacterIterator that = obj as StringCharacterIterator;
            if (ReferenceEquals(that, null))
                return false;

            // Compare the state directly, cheap integer fields first. A hash-code pre-check
            // is not used as a shortcut because a string's hash must be computed from its content.
            return pos == that.pos
                && begin == that.begin
                && end == that.end
                && text.Equals(that.text, StringComparison.Ordinal);
        }

        /// <summary>
        /// Computes a hash code from the text, the range and the current position, so that
        /// iterators that are equal according to <see cref="Equals(object)"/> hash alike.
        /// The value changes when the position changes.
        /// </summary>
        public override int GetHashCode()
        {
            // Hash the text itself: a hash that ignores the text would give equal
            // iterators different hash codes.
            return text.GetHashCode() ^ pos ^ begin ^ end;
        }

        /// <summary>
        /// Creates a copy of this iterator with the same text, range and position.
        /// </summary>
        public override object Clone()
        {
            return MemberwiseClone();
        }
    }
}
#endif
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Icu/project.json
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Icu/project.json b/src/Lucene.Net.Icu/project.json
new file mode 100644
index 0000000..2e8f212
--- /dev/null
+++ 
b/src/Lucene.Net.Icu/project.json @@ -0,0 +1,63 @@ +{ + "version": "4.8.0", + "dependencies": { + "icu.net": "54.1.1-alpha", + "Lucene.Net": "4.8.0", + "Lucene.Net.Analysis.Common": "4.8.0", + "Lucene.Net.Highlighter": "4.8.0" + }, + "buildOptions": { + "debugType": "portable", + "compile": { + "includeFiles": [ + "../CommonAssemblyInfo.cs", + "../Lucene.Net.Analysis.Common/Analysis/Th/ThaiAnalyzer.cs", + "../Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizer.cs", + "../Lucene.Net.Analysis.Common/Analysis/Th/ThaiTokenizerFactory.cs", + "../Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilter.cs", + "../Lucene.Net.Analysis.Common/Analysis/Th/ThaiWordFilterFactory.cs", + "../Lucene.Net.Analysis.Common/Analysis/Util/CharArrayIterator.cs", + "../Lucene.Net.Analysis.Common/Analysis/Util/SegmentingTokenizerBase.cs", + "../Lucene.Net.Highlighter/PostingsHighlight/DefaultPassageFormatter.cs", + "../Lucene.Net.Highlighter/PostingsHighlight/MultiTermHighlighting.cs", + "../Lucene.Net.Highlighter/PostingsHighlight/Passage.cs", + "../Lucene.Net.Highlighter/PostingsHighlight/PassageFormatter.cs", + "../Lucene.Net.Highlighter/PostingsHighlight/PassageScorer.cs", + "../Lucene.Net.Highlighter/PostingsHighlight/PostingsHighlighter.cs", + "../Lucene.Net.Highlighter/PostingsHighlight/WholeBreakIterator.cs", + "../Lucene.Net.Highlighter/VectorHighlight/BreakIteratorBoundaryScanner.cs" + ] + }, + "embed": { + "includeFiles": [ "Analysis/Th/stopwords.txt" ] + } + }, + "packOptions": { + "summary": "<Added from AssemblyDescriptionAttribute by the build script - do not remove this>", + "licenseUrl": "https://github.com/apache/lucenenet/blob/master/LICENSE.txt", + "iconUrl": "https://github.com/apache/lucenenet/blob/master/branding/logo/lucene-net-icon-128x128.png?raw=true", + "owners": [ + "The Apache Software Foundation" + ], + "repository": { + "url": "https://github.com/apache/lucenenet" + }, + "tags": [ "lucene.net", "core", "text", "search", "information", "retrieval", "lucene", 
"apache", "analysis", "index", "query" ] + }, + "frameworks": { + "netstandard1.5": { + "imports": "dnxcore50", + "buildOptions": { + "define": [ "NETSTANDARD", "FEATURE_BREAKITERATOR" ] + }, + "dependencies": { + "NETStandard.Library": "1.6.0" + } + }, + "net451": { + "buildOptions": { + "define": [ "FEATURE_BREAKITERATOR", "FEATURE_SERIALIZABLE" ] + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/b1fdcca3/src/Lucene.Net.Tests.Highlighter/Lucene.Net.Tests.Highlighter.csproj ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Highlighter/Lucene.Net.Tests.Highlighter.csproj b/src/Lucene.Net.Tests.Highlighter/Lucene.Net.Tests.Highlighter.csproj index 3ed7239..d87e43d 100644 --- a/src/Lucene.Net.Tests.Highlighter/Lucene.Net.Tests.Highlighter.csproj +++ b/src/Lucene.Net.Tests.Highlighter/Lucene.Net.Tests.Highlighter.csproj @@ -56,7 +56,6 @@ <Compile Include="Properties\AssemblyInfo.cs" /> <Compile Include="Support\TestExceptionSerialization.cs" /> <Compile Include="Support\TestApiConsistency.cs" /> - <Compile Include="TestBreakIterator.cs" /> <Compile Include="VectorHighlight\AbstractTestCase.cs" /> <Compile Include="VectorHighlight\BreakIteratorBoundaryScannerTest.cs" /> <Compile Include="VectorHighlight\FastVectorHighlighterTest.cs" /> @@ -70,7 +69,7 @@ <Compile Include="VectorHighlight\SimpleFragmentsBuilderTest.cs" /> <Compile Include="VectorHighlight\SingleFragListBuilderTest.cs" /> <Compile Include="VectorHighlight\WeightedFragListBuilderTest.cs" /> - <Compile Include="..\CommonAssemblyInfo.cs"> + <Compile Include="..\CommonAssemblyInfo.cs"> <Link>Properties\CommonAssemblyInfo.cs</Link> </Compile> </ItemGroup>
