Initial work on BreakIterator and PostingsHighlight
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/f2a35194 Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/f2a35194 Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/f2a35194 Branch: refs/heads/branch_4x Commit: f2a351943a36e0907c48b48d8659c8243a2d2441 Parents: b0b647b Author: Paul Irwin <[email protected]> Authored: Tue Oct 29 10:16:40 2013 -0400 Committer: Paul Irwin <[email protected]> Committed: Tue Oct 29 10:16:40 2013 -0400 ---------------------------------------------------------------------- .../Highlighter/Contrib.Highlighter.csproj | 8 +- .../DefaultPassageFormatter.cs | 139 ++++++++++++++ .../Highlighter/PostingsHighlight/Passage.cs | 176 +++++++++++++++++ .../PostingsHighlight/PassageFormatter.cs | 29 +++ .../PostingsHighlight/PassageScorer.cs | 60 ++++++ src/core/Lucene.Net.csproj | 7 + src/core/Support/BreakIterator.cs | 191 +++++++++++++++++++ .../Support/BreakIterators/BreakIteratorBase.cs | 120 ++++++++++++ .../BreakIterators/EnglishBreakIteratorBase.cs | 26 +++ .../EnglishCharacterBreakIterator.cs | 16 ++ .../BreakIterators/EnglishLineBreakIterator.cs | 21 ++ .../EnglishSentenceBreakIterator.cs | 24 +++ .../BreakIterators/EnglishWordBreakIterator.cs | 25 +++ 13 files changed, 839 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucenenet/blob/f2a35194/src/contrib/Highlighter/Contrib.Highlighter.csproj ---------------------------------------------------------------------- diff --git a/src/contrib/Highlighter/Contrib.Highlighter.csproj b/src/contrib/Highlighter/Contrib.Highlighter.csproj index 51861c1..a509181 100644 --- a/src/contrib/Highlighter/Contrib.Highlighter.csproj +++ b/src/contrib/Highlighter/Contrib.Highlighter.csproj @@ -235,6 +235,10 @@ <Compile Include="Highlight\WeightedTerm.cs"> <SubType>Code</SubType> </Compile> + <Compile 
Include="PostingsHighlight\DefaultPassageFormatter.cs" /> + <Compile Include="PostingsHighlight\Passage.cs" /> + <Compile Include="PostingsHighlight\PassageFormatter.cs" /> + <Compile Include="PostingsHighlight\PassageScorer.cs" /> <Compile Include="VectorHighlight\BaseFragListBuilder.cs" /> <Compile Include="VectorHighlight\BaseFragmentsBuilder.cs" /> <Compile Include="VectorHighlight\BreakIteratorBoundaryScanner.cs" /> @@ -294,9 +298,7 @@ <ItemGroup> <None Include="Lucene.Net.snk" /> </ItemGroup> - <ItemGroup> - <Folder Include="PostingsHighlight\" /> - </ItemGroup> + <ItemGroup /> <Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" /> <PropertyGroup> <PreBuildEvent> http://git-wip-us.apache.org/repos/asf/lucenenet/blob/f2a35194/src/contrib/Highlighter/PostingsHighlight/DefaultPassageFormatter.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Highlighter/PostingsHighlight/DefaultPassageFormatter.cs b/src/contrib/Highlighter/PostingsHighlight/DefaultPassageFormatter.cs new file mode 100644 index 0000000..2600c2f --- /dev/null +++ b/src/contrib/Highlighter/PostingsHighlight/DefaultPassageFormatter.cs @@ -0,0 +1,139 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace Lucene.Net.Search.PostingsHighlight +{ + public class DefaultPassageFormatter : PassageFormatter + { + protected readonly string preTag; + protected readonly string postTag; + protected readonly string ellipsis; + protected readonly bool escape; + + public DefaultPassageFormatter() + : this (@"<b>", @"</b>", @"... ", false) + { + } + + public DefaultPassageFormatter(string preTag, string postTag, string ellipsis, bool escape) + { + if (preTag == null || postTag == null || ellipsis == null) + { + throw new NullReferenceException(); + } + + this.preTag = preTag; + this.postTag = postTag; + this.ellipsis = ellipsis; + this.escape = escape; + } + + public override string Format(Passage[] passages, string content) + { + StringBuilder sb = new StringBuilder(); + int pos = 0; + foreach (Passage passage in passages) + { + if (passage.StartOffset > pos && pos > 0) + { + sb.Append(ellipsis); + } + + pos = passage.StartOffset; + for (int i = 0; i < passage.NumMatches; i++) + { + int start = passage.MatchStarts[i]; + int end = passage.MatchEnds[i]; + if (start > pos) + { + Append(sb, content, pos, start); + } + + if (end > pos) + { + sb.Append(preTag); + Append(sb, content, Math.Max(pos, start), end); + sb.Append(postTag); + pos = end; + } + } + + Append(sb, content, pos, Math.Max(pos, passage.EndOffset)); + pos = passage.EndOffset; + } + + return sb.ToString(); + } + + protected virtual void Append(StringBuilder dest, string content, int start, int end) + { + if (escape) + { + for (int i = start; i < end; i++) + { + char ch = content[i]; + switch (ch) + { + case '&': + dest.Append(@"&amp;"); + break; + case '<': + dest.Append(@"&lt;"); + break; + case '>': + dest.Append(@"&gt;"); + break; + case '"': + dest.Append(@"&quot;"); + break; + case '\\': + dest.Append(@"&#x27;"); + break; 
+ case '/': + dest.Append(@"&#x2F;"); + break; + default: + if (ch >= 0x30 && ch <= 0x39 || ch >= 0x41 && ch <= 0x5A || ch >= 0x61 && ch <= 0x7A) + { + dest.Append(ch); + } + else if (ch < 0xff) + { + dest.Append(@"&#"); + dest.Append((int)ch); + dest.Append(@";"); + } + else + { + dest.Append(ch); + } + break; + } + } + } + else + { + dest.Append(content, start, end); + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/f2a35194/src/contrib/Highlighter/PostingsHighlight/Passage.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Highlighter/PostingsHighlight/Passage.cs b/src/contrib/Highlighter/PostingsHighlight/Passage.cs new file mode 100644 index 0000000..b1fb280 --- /dev/null +++ b/src/contrib/Highlighter/PostingsHighlight/Passage.cs @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +using Lucene.Net.Support; +using Lucene.Net.Util; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace Lucene.Net.Search.PostingsHighlight +{ + public sealed class Passage + { + int startOffset = -1; + int endOffset = -1; + float score = 0F; + int[] matchStarts = new int[8]; + int[] matchEnds = new int[8]; + BytesRef[] matchTerms = new BytesRef[8]; + int numMatches = 0; + + internal void AddMatch(int startOffset, int endOffset, BytesRef term) + { + if (numMatches == matchStarts.Length) + { + int newLength = ArrayUtil.Oversize(numMatches + 1, RamUsageEstimator.NUM_BYTES_OBJECT_REF); + int[] newMatchStarts = new int[newLength]; + int[] newMatchEnds = new int[newLength]; + BytesRef[] newMatchTerms = new BytesRef[newLength]; + Array.Copy(matchStarts, 0, newMatchStarts, 0, numMatches); + Array.Copy(matchEnds, 0, newMatchEnds, 0, numMatches); + Array.Copy(matchTerms, 0, newMatchTerms, 0, numMatches); + matchStarts = newMatchStarts; + matchEnds = newMatchEnds; + matchTerms = newMatchTerms; + } + + matchStarts[numMatches] = startOffset; + matchEnds[numMatches] = endOffset; + matchTerms[numMatches] = term; + numMatches++; + } + + internal void Sort() + { + int[] starts = matchStarts; + int[] ends = matchEnds; + BytesRef[] terms = matchTerms; + new AnonymousSorterTemplate(this, starts, ends, terms).MergeSort(0, numMatches - 1); + } + + private sealed class AnonymousSorterTemplate : SorterTemplate + { + public AnonymousSorterTemplate(Passage parent, int[] starts, int[] ends, BytesRef[] terms) + { + this.parent = parent; + this.starts = starts; + this.ends = ends; + this.terms = terms; + } + + private readonly Passage parent; + private readonly int[] starts; + private readonly int[] ends; + private readonly BytesRef[] terms; + + protected override void Swap(int i, int j) + { + int temp = starts[i]; + starts[i] = starts[j]; + starts[j] = temp; + temp = ends[i]; + ends[i] = ends[j]; + ends[j] = temp; + BytesRef tempTerm = 
terms[i]; + terms[i] = terms[j]; + terms[j] = tempTerm; + } + + protected override int Compare(int i, int j) + { + return (((long)starts[i]) - starts[j]).Signum(); + } + + protected override void SetPivot(int i) + { + pivot = starts[i]; + } + + protected override int ComparePivot(int j) + { + return (((long)pivot) - starts[j]).Signum(); + } + + int pivot; + } + + internal void Reset() + { + startOffset = endOffset = -1; + score = 0F; + numMatches = 0; + } + + public int StartOffset + { + get + { + return startOffset; + } + } + + public int EndOffset + { + get + { + return endOffset; + } + } + + public float Score + { + get + { + return score; + } + } + + public int NumMatches + { + get + { + return numMatches; + } + } + + public int[] MatchStarts + { + get + { + return matchStarts; + } + } + + public int[] MatchEnds + { + get + { + return matchEnds; + } + } + + public BytesRef[] MatchTerms + { + get + { + return matchTerms; + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/f2a35194/src/contrib/Highlighter/PostingsHighlight/PassageFormatter.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Highlighter/PostingsHighlight/PassageFormatter.cs b/src/contrib/Highlighter/PostingsHighlight/PassageFormatter.cs new file mode 100644 index 0000000..a44ca81 --- /dev/null +++ b/src/contrib/Highlighter/PostingsHighlight/PassageFormatter.cs @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace Lucene.Net.Search.PostingsHighlight +{ + public abstract class PassageFormatter + { + public abstract string Format(Passage[] passages, string content); + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/f2a35194/src/contrib/Highlighter/PostingsHighlight/PassageScorer.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Highlighter/PostingsHighlight/PassageScorer.cs b/src/contrib/Highlighter/PostingsHighlight/PassageScorer.cs new file mode 100644 index 0000000..ad977e7 --- /dev/null +++ b/src/contrib/Highlighter/PostingsHighlight/PassageScorer.cs @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace Lucene.Net.Search.PostingsHighlight +{ + public class PassageScorer + { + readonly float k1; + readonly float b; + readonly float pivot; + + public PassageScorer() + : this(1.2F, 0.75F, 87F) + { + } + + public PassageScorer(float k1, float b, float pivot) + { + this.k1 = k1; + this.b = b; + this.pivot = pivot; + } + + public virtual float Weight(int contentLength, int totalTermFreq) + { + float numDocs = 1 + contentLength / pivot; + return (k1 + 1) * (float)Math.Log(1 + (numDocs + 0.5) / (totalTermFreq + 0.5)); + } + + public virtual float Tf(int freq, int passageLen) + { + float norm = k1 * ((1 - b) + b * (passageLen / pivot)); + return freq / (freq + norm); + } + + public virtual float Norm(int passageStart) + { + return 1 + 1 / (float)Math.Log(pivot + passageStart); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/f2a35194/src/core/Lucene.Net.csproj ---------------------------------------------------------------------- diff --git a/src/core/Lucene.Net.csproj b/src/core/Lucene.Net.csproj index a6f3de8..47b4cd3 100644 --- a/src/core/Lucene.Net.csproj +++ b/src/core/Lucene.Net.csproj @@ -869,6 +869,13 @@ <Compile Include="Support\AtomicBoolean.cs" /> <Compile Include="Support\AttributeImplItem.cs" /> <Compile Include="Support\BitSetSupport.cs" /> + <Compile Include="Support\BreakIterator.cs" /> + <Compile Include="Support\BreakIterators\BreakIteratorBase.cs" /> + <Compile Include="Support\BreakIterators\EnglishBreakIteratorBase.cs" /> + <Compile Include="Support\BreakIterators\EnglishCharacterBreakIterator.cs" /> + <Compile Include="Support\BreakIterators\EnglishLineBreakIterator.cs" /> + <Compile Include="Support\BreakIterators\EnglishSentenceBreakIterator.cs" /> + <Compile Include="Support\BreakIterators\EnglishWordBreakIterator.cs" /> <Compile Include="Support\Buffer.cs" /> <Compile Include="Support\BufferUnderflowException.cs" /> 
<Compile Include="Support\BuildType.cs" /> http://git-wip-us.apache.org/repos/asf/lucenenet/blob/f2a35194/src/core/Support/BreakIterator.cs ---------------------------------------------------------------------- diff --git a/src/core/Support/BreakIterator.cs b/src/core/Support/BreakIterator.cs new file mode 100644 index 0000000..2f8387f --- /dev/null +++ b/src/core/Support/BreakIterator.cs @@ -0,0 +1,191 @@ +using Lucene.Net.Support.BreakIterators; +using System; +using System.Collections.Generic; +using System.Globalization; +using System.Linq; +using System.Text; + +namespace Lucene.Net.Support +{ + /// <summary> + /// A backfill for BreakIterator support, since .NET doesn't have it. This implementation + /// certainly will need improving. + /// </summary> + public abstract class BreakIterator : ICloneable + { + public const int DONE = -1; + + private static ISet<CultureInfo> _allLocales = new HashSet<CultureInfo>(); + private static IDictionary<CultureInfo, Type> _lineTypes = new Dictionary<CultureInfo, Type>(); + private static IDictionary<CultureInfo, Type> _characterTypes = new Dictionary<CultureInfo, Type>(); + private static IDictionary<CultureInfo, Type> _sentenceTypes = new Dictionary<CultureInfo, Type>(); + private static IDictionary<CultureInfo, Type> _wordTypes = new Dictionary<CultureInfo, Type>(); + + static BreakIterator() + { + // HACK HACK HACK HACK HACK + CultureInfo invariant = CultureInfo.InvariantCulture; + CultureInfo english = CultureInfo.GetCultureInfoByIetfLanguageTag("en"); + CultureInfo englishUs = CultureInfo.GetCultureInfoByIetfLanguageTag("en-US"); + + RegisterCharacterBreakIterator<EnglishCharacterBreakIterator>(invariant); + RegisterCharacterBreakIterator<EnglishCharacterBreakIterator>(english); + RegisterCharacterBreakIterator<EnglishCharacterBreakIterator>(englishUs); + RegisterLineBreakIterator<EnglishLineBreakIterator>(invariant); + RegisterLineBreakIterator<EnglishLineBreakIterator>(english); + 
RegisterLineBreakIterator<EnglishLineBreakIterator>(englishUs); + RegisterSentenceBreakIterator<EnglishSentenceBreakIterator>(invariant); + RegisterSentenceBreakIterator<EnglishSentenceBreakIterator>(english); + RegisterSentenceBreakIterator<EnglishSentenceBreakIterator>(englishUs); + RegisterWordBreakIterator<EnglishWordBreakIterator>(invariant); + RegisterWordBreakIterator<EnglishWordBreakIterator>(english); + RegisterWordBreakIterator<EnglishWordBreakIterator>(englishUs); + } + + protected BreakIterator() + { + } + + public virtual object Clone() + { + return this.MemberwiseClone(); + } + + public abstract int Current(); + + public abstract int First(); + + public abstract int Following(int offset); + + public abstract string Text { get; set; } + + public virtual bool IsBoundary(int offset) + { + return false; + } + + public abstract int Last(); + + public abstract int Next(); + + public abstract int Next(int n); + + public virtual int Preceding(int offset) + { + // goal here is to get the last boundary before the offset. + // so we start at the boundary Following() the offset, + // then go Previous() to find the next one before the offset. 
+ + var indexAfter = Following(offset); + + // check for no more boundaries + if (indexAfter == DONE) + return DONE; + + return Previous(); // doesn't matter if DONE or not DONE, caller will decide what to do + } + + public abstract int Previous(); + + public static CultureInfo[] GetAvailableLocales() + { + return _allLocales.ToArray(); + } + + public static BreakIterator GetCharacterInstance() + { + return GetCharacterInstance(CultureInfo.InvariantCulture); + } + + public static BreakIterator GetCharacterInstance(CultureInfo locale) + { + Type it; + + if (_characterTypes.TryGetValue(locale, out it)) + return (BreakIterator)Activator.CreateInstance(it); + + it = _characterTypes[CultureInfo.InvariantCulture]; + + return (BreakIterator)Activator.CreateInstance(it); + } + + public static BreakIterator GetLineInstance() + { + return GetLineInstance(CultureInfo.InvariantCulture); + } + + public static BreakIterator GetLineInstance(CultureInfo locale) + { + Type it; + + if (_lineTypes.TryGetValue(locale, out it)) + return (BreakIterator)Activator.CreateInstance(it); + + it = _lineTypes[CultureInfo.InvariantCulture]; + + return (BreakIterator)Activator.CreateInstance(it); + } + + public static BreakIterator GetSentenceInstance() + { + return GetSentenceInstance(CultureInfo.InvariantCulture); + } + + public static BreakIterator GetSentenceInstance(CultureInfo locale) + { + Type it; + + if (_sentenceTypes.TryGetValue(locale, out it)) + return (BreakIterator)Activator.CreateInstance(it); + + it = _sentenceTypes[CultureInfo.InvariantCulture]; + + return (BreakIterator)Activator.CreateInstance(it); + } + + public static BreakIterator GetWordInstance() + { + return GetWordInstance(CultureInfo.InvariantCulture); + } + + public static BreakIterator GetWordInstance(CultureInfo locale) + { + Type it; + + if (_wordTypes.TryGetValue(locale, out it)) + return (BreakIterator)Activator.CreateInstance(it); + + it = _wordTypes[CultureInfo.InvariantCulture]; + + return 
(BreakIterator)Activator.CreateInstance(it); + } + + public static void RegisterCharacterBreakIterator<T>(CultureInfo locale) + where T : BreakIterator, new() + { + _allLocales.Add(locale); + _characterTypes[locale] = typeof(T); + } + + public static void RegisterLineBreakIterator<T>(CultureInfo locale) + where T : BreakIterator, new() + { + _allLocales.Add(locale); + _lineTypes[locale] = typeof(T); + } + + public static void RegisterSentenceBreakIterator<T>(CultureInfo locale) + where T : BreakIterator, new() + { + _allLocales.Add(locale); + _sentenceTypes[locale] = typeof(T); + } + + public static void RegisterWordBreakIterator<T>(CultureInfo locale) + where T : BreakIterator, new() + { + _allLocales.Add(locale); + _wordTypes[locale] = typeof(T); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/f2a35194/src/core/Support/BreakIterators/BreakIteratorBase.cs ---------------------------------------------------------------------- diff --git a/src/core/Support/BreakIterators/BreakIteratorBase.cs b/src/core/Support/BreakIterators/BreakIteratorBase.cs new file mode 100644 index 0000000..a149907 --- /dev/null +++ b/src/core/Support/BreakIterators/BreakIteratorBase.cs @@ -0,0 +1,120 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace Lucene.Net.Support.BreakIterators +{ + /// <summary> + /// A base implementation of BreakIterator to make some operations easier, particularly for + /// english or latin-based languages. + /// </summary> + // HACK: someone please improve this! 
+ public abstract class BreakIteratorBase : BreakIterator + { + protected const char ENDINPUT = '\0'; + + protected string _text; + protected int _position = DONE; + + public override int Current() + { + if (_position == DONE) + return First(); + + return _position; + } + + public override int First() + { + _position = DONE; + + return Following(DONE); + } + + public override int Following(int offset) + { + _position = offset; + + do + { + _position++; + + if (_position == _text.Length) + return DONE; + } + while (!IsBoundary(_position)); + + return _position; + } + + public override string Text + { + get + { + return _text; + } + set + { + _text = value; + } + } + + public override int Last() + { + _position = _text.Length; + + return Previous(); + } + + public override int Next() + { + if (_position == DONE) + return First(); + + if (_position == _text.Length - 1) + return DONE; + + return Following(++_position); + } + + public override int Next(int n) + { + if (_position == DONE) + return n == 0 ? 
First() : Following(n - 1); + + if (n == 0) + return Current(); + + if (_position + n >= _text.Length) + return DONE; + + _position += (n - 1); + return Following(_position); + } + + public override int Previous() + { + do + { + _position--; + + if (_position == DONE) + return DONE; + } + while (!IsBoundary(_position)); + + return _position; + } + + public abstract override bool IsBoundary(int offset); + + public virtual char Peek(int offset) + { + if (offset < 0 || offset >= _text.Length) + return ENDINPUT; + + return _text[offset]; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/f2a35194/src/core/Support/BreakIterators/EnglishBreakIteratorBase.cs ---------------------------------------------------------------------- diff --git a/src/core/Support/BreakIterators/EnglishBreakIteratorBase.cs b/src/core/Support/BreakIterators/EnglishBreakIteratorBase.cs new file mode 100644 index 0000000..7429a9f --- /dev/null +++ b/src/core/Support/BreakIterators/EnglishBreakIteratorBase.cs @@ -0,0 +1,26 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace Lucene.Net.Support.BreakIterators +{ + // HACK: someone please improve this! + public abstract class EnglishBreakIteratorBase : BreakIteratorBase + { + private static ISet<char> _sentenceDelims = new HashSet<char>() { '.', '!', '?' 
}; + private static ISet<char> _validLineDelims = new HashSet<char>() { ' ', '\t', '\r', '\n', '-' }; + + protected static bool IsSentenceDelim(char c) + { + return _sentenceDelims.Contains(c); + } + + protected static bool IsValidLineDelim(char c) + { + return _validLineDelims.Contains(c); + } + + public abstract override bool IsBoundary(int offset); + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/f2a35194/src/core/Support/BreakIterators/EnglishCharacterBreakIterator.cs ---------------------------------------------------------------------- diff --git a/src/core/Support/BreakIterators/EnglishCharacterBreakIterator.cs b/src/core/Support/BreakIterators/EnglishCharacterBreakIterator.cs new file mode 100644 index 0000000..4148f76 --- /dev/null +++ b/src/core/Support/BreakIterators/EnglishCharacterBreakIterator.cs @@ -0,0 +1,16 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace Lucene.Net.Support.BreakIterators +{ + // HACK: someone please improve this! + public class EnglishCharacterBreakIterator : EnglishBreakIteratorBase + { + public override bool IsBoundary(int offset) + { + return true; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/f2a35194/src/core/Support/BreakIterators/EnglishLineBreakIterator.cs ---------------------------------------------------------------------- diff --git a/src/core/Support/BreakIterators/EnglishLineBreakIterator.cs b/src/core/Support/BreakIterators/EnglishLineBreakIterator.cs new file mode 100644 index 0000000..a0a1242 --- /dev/null +++ b/src/core/Support/BreakIterators/EnglishLineBreakIterator.cs @@ -0,0 +1,21 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace Lucene.Net.Support.BreakIterators +{ + // HACK: someone please improve this! 
+ public class EnglishLineBreakIterator : EnglishBreakIteratorBase + { + public override bool IsBoundary(int offset) + { + if (offset == _text.Length - 1) + return true; + + char c = Peek(offset); + + return IsValidLineDelim(c); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/f2a35194/src/core/Support/BreakIterators/EnglishSentenceBreakIterator.cs ---------------------------------------------------------------------- diff --git a/src/core/Support/BreakIterators/EnglishSentenceBreakIterator.cs b/src/core/Support/BreakIterators/EnglishSentenceBreakIterator.cs new file mode 100644 index 0000000..4b12b48 --- /dev/null +++ b/src/core/Support/BreakIterators/EnglishSentenceBreakIterator.cs @@ -0,0 +1,24 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace Lucene.Net.Support.BreakIterators +{ + // HACK: someone please improve this! + public class EnglishSentenceBreakIterator : EnglishBreakIteratorBase + { + public override bool IsBoundary(int offset) + { + if (offset == _text.Length - 1) + return true; + + char c = Peek(offset); + + if (!IsSentenceDelim(c)) + return false; + + return char.IsWhiteSpace(Peek(offset + 1)); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/f2a35194/src/core/Support/BreakIterators/EnglishWordBreakIterator.cs ---------------------------------------------------------------------- diff --git a/src/core/Support/BreakIterators/EnglishWordBreakIterator.cs b/src/core/Support/BreakIterators/EnglishWordBreakIterator.cs new file mode 100644 index 0000000..0fbb39f --- /dev/null +++ b/src/core/Support/BreakIterators/EnglishWordBreakIterator.cs @@ -0,0 +1,25 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; + +namespace Lucene.Net.Support.BreakIterators +{ + // HACK: someone please improve this! 
+ public class EnglishWordBreakIterator : EnglishBreakIteratorBase + { + public override bool IsBoundary(int offset) + { + char c = Peek(offset); + char cplus = Peek(offset + 1); + + if (char.IsLetterOrDigit(c)) + return false; + + if (cplus != ENDINPUT && char.IsLetterOrDigit(cplus)) + return false; + + return true; + } + } +}
