[20/53] [abbrv] git commit: Initial work on BreakIterator and PostingsHighlight

paulirwin Thu, 07 Nov 2013 05:57:03 -0800

Initial work on BreakIterator and PostingsHighlight


Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/f2a35194
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/f2a35194
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/f2a35194

Branch: refs/heads/branch_4x
Commit: f2a351943a36e0907c48b48d8659c8243a2d2441
Parents: b0b647b
Author: Paul Irwin <[email protected]>
Authored: Tue Oct 29 10:16:40 2013 -0400
Committer: Paul Irwin <[email protected]>
Committed: Tue Oct 29 10:16:40 2013 -0400

----------------------------------------------------------------------
 .../Highlighter/Contrib.Highlighter.csproj      |   8 +-
 .../DefaultPassageFormatter.cs                  | 139 ++++++++++++++
 .../Highlighter/PostingsHighlight/Passage.cs    | 176 +++++++++++++++++
 .../PostingsHighlight/PassageFormatter.cs       |  29 +++
 .../PostingsHighlight/PassageScorer.cs          |  60 ++++++
 src/core/Lucene.Net.csproj                      |   7 +
 src/core/Support/BreakIterator.cs               | 191 +++++++++++++++++++
 .../Support/BreakIterators/BreakIteratorBase.cs | 120 ++++++++++++
 .../BreakIterators/EnglishBreakIteratorBase.cs  |  26 +++
 .../EnglishCharacterBreakIterator.cs            |  16 ++
 .../BreakIterators/EnglishLineBreakIterator.cs  |  21 ++
 .../EnglishSentenceBreakIterator.cs             |  24 +++
 .../BreakIterators/EnglishWordBreakIterator.cs  |  25 +++
 13 files changed, 839 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/f2a35194/src/contrib/Highlighter/Contrib.Highlighter.csproj
----------------------------------------------------------------------
diff --git a/src/contrib/Highlighter/Contrib.Highlighter.csproj 
b/src/contrib/Highlighter/Contrib.Highlighter.csproj
index 51861c1..a509181 100644
--- a/src/contrib/Highlighter/Contrib.Highlighter.csproj
+++ b/src/contrib/Highlighter/Contrib.Highlighter.csproj
@@ -235,6 +235,10 @@
     <Compile Include="Highlight\WeightedTerm.cs">
       <SubType>Code</SubType>
     </Compile>
+    <Compile Include="PostingsHighlight\DefaultPassageFormatter.cs" />
+    <Compile Include="PostingsHighlight\Passage.cs" />
+    <Compile Include="PostingsHighlight\PassageFormatter.cs" />
+    <Compile Include="PostingsHighlight\PassageScorer.cs" />
     <Compile Include="VectorHighlight\BaseFragListBuilder.cs" />
     <Compile Include="VectorHighlight\BaseFragmentsBuilder.cs" />
     <Compile Include="VectorHighlight\BreakIteratorBoundaryScanner.cs" />
@@ -294,9 +298,7 @@
   <ItemGroup>
     <None Include="Lucene.Net.snk" />
   </ItemGroup>
-  <ItemGroup>
-    <Folder Include="PostingsHighlight\" />
-  </ItemGroup>
+  <ItemGroup />
   <Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" />
   <PropertyGroup>
     <PreBuildEvent>

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/f2a35194/src/contrib/Highlighter/PostingsHighlight/DefaultPassageFormatter.cs
----------------------------------------------------------------------
diff --git 
a/src/contrib/Highlighter/PostingsHighlight/DefaultPassageFormatter.cs 
b/src/contrib/Highlighter/PostingsHighlight/DefaultPassageFormatter.cs
new file mode 100644
index 0000000..2600c2f
--- /dev/null
+++ b/src/contrib/Highlighter/PostingsHighlight/DefaultPassageFormatter.cs
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Search.PostingsHighlight
+{
+    /// <summary>
+    /// Default <see cref="PassageFormatter"/>: concatenates the given passages into one
+    /// snippet, wrapping every match in <c>preTag</c>/<c>postTag</c> and inserting
+    /// <c>ellipsis</c> between passages that are not contiguous. Content can optionally
+    /// be HTML-escaped while it is copied.
+    /// </summary>
+    public class DefaultPassageFormatter : PassageFormatter
+    {
+        /// <summary>Text appended before each highlighted match.</summary>
+        protected readonly string preTag;
+        /// <summary>Text appended after each highlighted match.</summary>
+        protected readonly string postTag;
+        /// <summary>Text appended between passages that are not contiguous.</summary>
+        protected readonly string ellipsis;
+        /// <summary>Whether content is HTML-escaped as it is appended.</summary>
+        protected readonly bool escape;
+
+        /// <summary>
+        /// Creates a formatter that wraps matches in <c>&lt;b&gt;</c>/<c>&lt;/b&gt;</c>, joins
+        /// passages with <c>"... "</c> and does not escape content.
+        /// </summary>
+        public DefaultPassageFormatter()
+            : this (@"<b>", @"</b>", @"... ", false)
+        {
+        }
+
+        /// <summary>
+        /// Creates a formatter with custom markup.
+        /// </summary>
+        /// <param name="preTag">Text appended before each match.</param>
+        /// <param name="postTag">Text appended after each match.</param>
+        /// <param name="ellipsis">Text appended between non-contiguous passages.</param>
+        /// <param name="escape">True to HTML-escape content as it is appended.</param>
+        /// <exception cref="ArgumentNullException">
+        /// <paramref name="preTag"/>, <paramref name="postTag"/> or
+        /// <paramref name="ellipsis"/> is null.
+        /// </exception>
+        public DefaultPassageFormatter(string preTag, string postTag, string ellipsis, bool escape)
+        {
+            // Report the offending argument by name instead of raising a bare
+            // NullReferenceException, which .NET reserves for the runtime.
+            if (preTag == null)
+            {
+                throw new ArgumentNullException("preTag");
+            }
+
+            if (postTag == null)
+            {
+                throw new ArgumentNullException("postTag");
+            }
+
+            if (ellipsis == null)
+            {
+                throw new ArgumentNullException("ellipsis");
+            }
+
+            this.preTag = preTag;
+            this.postTag = postTag;
+            this.ellipsis = ellipsis;
+            this.escape = escape;
+        }
+
+        /// <summary>
+        /// Builds the snippet for <paramref name="passages"/>, processed in array order.
+        /// </summary>
+        /// <param name="passages">Passages to render.</param>
+        /// <param name="content">Text that the passage and match offsets index into.</param>
+        /// <returns>The formatted snippet.</returns>
+        public override string Format(Passage[] passages, string content)
+        {
+            StringBuilder sb = new StringBuilder();
+            int pos = 0;
+            foreach (Passage passage in passages)
+            {
+                // No ellipsis before the first passage, nor when this passage
+                // starts where the previous one ended.
+                if (passage.StartOffset > pos && pos > 0)
+                {
+                    sb.Append(ellipsis);
+                }
+
+                pos = passage.StartOffset;
+                for (int i = 0; i < passage.NumMatches; i++)
+                {
+                    int start = passage.MatchStarts[i];
+                    int end = passage.MatchEnds[i];
+
+                    // Plain text between the previous position and this match.
+                    if (start > pos)
+                    {
+                        Append(sb, content, pos, start);
+                    }
+
+                    // Matches may overlap: only the part beyond 'pos' is emitted, and a
+                    // match that ends at or before 'pos' is skipped entirely.
+                    if (end > pos)
+                    {
+                        sb.Append(preTag);
+                        Append(sb, content, Math.Max(pos, start), end);
+                        sb.Append(postTag);
+                        pos = end;
+                    }
+                }
+
+                // Remainder of the passage; Math.Max covers a match that ran past EndOffset.
+                Append(sb, content, pos, Math.Max(pos, passage.EndOffset));
+                pos = passage.EndOffset;
+            }
+
+            return sb.ToString();
+        }
+
+        /// <summary>
+        /// Appends <c>content[start, end)</c> to <paramref name="dest"/>, HTML-escaping it
+        /// when escaping is enabled. Nothing is appended when <paramref name="end"/> is not
+        /// greater than <paramref name="start"/>.
+        /// </summary>
+        /// <param name="dest">Builder receiving the text.</param>
+        /// <param name="content">Source text.</param>
+        /// <param name="start">Index of the first character to append (inclusive).</param>
+        /// <param name="end">Index one past the last character to append (exclusive).</param>
+        protected virtual void Append(StringBuilder dest, string content, int start, int end)
+        {
+            if (!escape)
+            {
+                // StringBuilder.Append(string, int, int) takes (startIndex, count),
+                // not (start, end), so the length has to be computed here.
+                if (end > start)
+                {
+                    dest.Append(content, start, end - start);
+                }
+
+                return;
+            }
+
+            for (int i = start; i < end; i++)
+            {
+                char ch = content[i];
+                switch (ch)
+                {
+                    case '&':
+                        dest.Append(@"&amp;");
+                        break;
+                    case '<':
+                        dest.Append(@"&lt;");
+                        break;
+                    case '>':
+                        dest.Append(@"&gt;");
+                        break;
+                    case '"':
+                        dest.Append(@"&quot;");
+                        break;
+                    case '\'':
+                        // &#x27; is the apostrophe; it must be escaped for attribute safety.
+                        dest.Append(@"&#x27;");
+                        break;
+                    case '/':
+                        dest.Append(@"&#x2F;");
+                        break;
+                    default:
+                        if (ch >= 0x30 && ch <= 0x39 || ch >= 0x41 && ch <= 0x5A || ch >= 0x61 && ch <= 0x7A)
+                        {
+                            // ASCII digits and letters pass through unchanged.
+                            dest.Append(ch);
+                        }
+                        else if (ch < 0xff)
+                        {
+                            // Any other character below 0xff becomes a decimal character reference.
+                            dest.Append(@"&#");
+                            dest.Append((int)ch);
+                            dest.Append(@";");
+                        }
+                        else
+                        {
+                            dest.Append(ch);
+                        }
+                        break;
+                }
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/f2a35194/src/contrib/Highlighter/PostingsHighlight/Passage.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Highlighter/PostingsHighlight/Passage.cs 
b/src/contrib/Highlighter/PostingsHighlight/Passage.cs
new file mode 100644
index 0000000..b1fb280
--- /dev/null
+++ b/src/contrib/Highlighter/PostingsHighlight/Passage.cs
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using Lucene.Net.Support;
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Search.PostingsHighlight
+{
+    /// <summary>
+    /// Holds a start/end offset pair, a score, and three parallel arrays describing the
+    /// matches recorded through <see cref="AddMatch"/> (start offset, end offset, term).
+    /// </summary>
+    // NOTE(review): nothing in this class assigns startOffset, endOffset or score apart from
+    // Reset(); presumably a highlighter in the same assembly is meant to set them - confirm.
+    public sealed class Passage
+    {
+        // -1 marks "not set"; Reset() restores these values.
+        int startOffset = -1;
+        int endOffset = -1;
+        float score = 0F;
+        // Parallel arrays: entry i of each array describes match i. Only the first
+        // numMatches entries are meaningful; the arrays grow in AddMatch.
+        int[] matchStarts = new int[8];
+        int[] matchEnds = new int[8];
+        BytesRef[] matchTerms = new BytesRef[8];
+        int numMatches = 0;
+        
+        /// <summary>
+        /// Records one match, growing the three parallel arrays first when they are full.
+        /// </summary>
+        /// <param name="startOffset">Start offset of the match.</param>
+        /// <param name="endOffset">End offset of the match.</param>
+        /// <param name="term">Term stored alongside the offsets.</param>
+        internal void AddMatch(int startOffset, int endOffset, BytesRef term)
+        {
+            if (numMatches == matchStarts.Length)
+            {
+                // All three arrays always have the same length, so one new size serves them all.
+                int newLength = ArrayUtil.Oversize(numMatches + 1, 
RamUsageEstimator.NUM_BYTES_OBJECT_REF);
+                int[] newMatchStarts = new int[newLength];
+                int[] newMatchEnds = new int[newLength];
+                BytesRef[] newMatchTerms = new BytesRef[newLength];
+                Array.Copy(matchStarts, 0, newMatchStarts, 0, numMatches);
+                Array.Copy(matchEnds, 0, newMatchEnds, 0, numMatches);
+                Array.Copy(matchTerms, 0, newMatchTerms, 0, numMatches);
+                matchStarts = newMatchStarts;
+                matchEnds = newMatchEnds;
+                matchTerms = newMatchTerms;
+            }
+
+            matchStarts[numMatches] = startOffset;
+            matchEnds[numMatches] = endOffset;
+            matchTerms[numMatches] = term;
+            numMatches++;
+        }
+
+        /// <summary>
+        /// Sorts the recorded matches by start offset, keeping the three arrays in step.
+        /// </summary>
+        internal void Sort()
+        {
+            int[] starts = matchStarts;
+            int[] ends = matchEnds;
+            BytesRef[] terms = matchTerms;
+            // NOTE(review): assumes SorterTemplate.MergeSort takes inclusive (lo, hi) bounds - confirm.
+            new AnonymousSorterTemplate(this, starts, ends, 
terms).MergeSort(0, numMatches - 1);
+        }
+
+        /// <summary>
+        /// SorterTemplate callbacks that order entries by start offset and swap the
+        /// start, end and term arrays together.
+        /// </summary>
+        private sealed class AnonymousSorterTemplate : SorterTemplate
+        {
+            public AnonymousSorterTemplate(Passage parent, int[] starts, int[] 
ends, BytesRef[] terms)
+            {
+                this.parent = parent;
+                this.starts = starts;
+                this.ends = ends;
+                this.terms = terms;
+            }
+
+            // Stored by the constructor but not read anywhere in this class.
+            private readonly Passage parent;
+            private readonly int[] starts;
+            private readonly int[] ends;
+            private readonly BytesRef[] terms;
+
+            // Swaps entries i and j in all three arrays so they stay parallel.
+            protected override void Swap(int i, int j)
+            {
+                int temp = starts[i];
+                starts[i] = starts[j];
+                starts[j] = temp;
+                temp = ends[i];
+                ends[i] = ends[j];
+                ends[j] = temp;
+                BytesRef tempTerm = terms[i];
+                terms[i] = terms[j];
+                terms[j] = tempTerm;
+            }
+
+            // Compares by start offset; widening to long keeps the subtraction from overflowing.
+            // NOTE(review): Signum() is presumably a Lucene.Net.Support extension returning -1/0/1 - confirm.
+            protected override int Compare(int i, int j)
+            {
+                return (((long)starts[i]) - starts[j]).Signum();
+            }
+
+            // Remembers the start offset of entry i for later ComparePivot calls.
+            protected override void SetPivot(int i)
+            {
+                pivot = starts[i];
+            }
+
+            // Compares the remembered pivot start offset against entry j.
+            protected override int ComparePivot(int j)
+            {
+                return (((long)pivot) - starts[j]).Signum();
+            }
+
+            int pivot;
+        }
+
+        /// <summary>
+        /// Resets offsets to -1, the score to 0 and the match count to 0. The match arrays
+        /// are kept (not cleared or shrunk) so they can be reused.
+        /// </summary>
+        internal void Reset()
+        {
+            startOffset = endOffset = -1;
+            score = 0F;
+            numMatches = 0;
+        }
+
+        /// <summary>Start offset of the passage, or -1 when unset.</summary>
+        public int StartOffset
+        {
+            get
+            {
+                return startOffset;
+            }
+        }
+
+        /// <summary>End offset of the passage, or -1 when unset.</summary>
+        public int EndOffset
+        {
+            get
+            {
+                return endOffset;
+            }
+        }
+
+        /// <summary>Score of the passage; 0 after construction or Reset.</summary>
+        public float Score
+        {
+            get
+            {
+                return score;
+            }
+        }
+
+        /// <summary>Number of matches recorded since construction or the last Reset.</summary>
+        public int NumMatches
+        {
+            get
+            {
+                return numMatches;
+            }
+        }
+
+        /// <summary>
+        /// Start offsets of the matches. This is the internal array (not a copy) and may be
+        /// longer than <see cref="NumMatches"/>; only the first NumMatches entries are valid.
+        /// </summary>
+        public int[] MatchStarts
+        {
+            get
+            {
+                return matchStarts;
+            }
+        }
+
+        /// <summary>
+        /// End offsets of the matches. This is the internal array (not a copy) and may be
+        /// longer than <see cref="NumMatches"/>; only the first NumMatches entries are valid.
+        /// </summary>
+        public int[] MatchEnds
+        {
+            get
+            {
+                return matchEnds;
+            }
+        }
+
+        /// <summary>
+        /// Terms of the matches. This is the internal array (not a copy) and may be
+        /// longer than <see cref="NumMatches"/>; only the first NumMatches entries are valid.
+        /// </summary>
+        public BytesRef[] MatchTerms
+        {
+            get
+            {
+                return matchTerms;
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/f2a35194/src/contrib/Highlighter/PostingsHighlight/PassageFormatter.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Highlighter/PostingsHighlight/PassageFormatter.cs 
b/src/contrib/Highlighter/PostingsHighlight/PassageFormatter.cs
new file mode 100644
index 0000000..a44ca81
--- /dev/null
+++ b/src/contrib/Highlighter/PostingsHighlight/PassageFormatter.cs
@@ -0,0 +1,29 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Search.PostingsHighlight
+{
+    /// <summary>
+    /// Base class for types that turn a set of passages into a single display string.
+    /// </summary>
+    public abstract class PassageFormatter
+    {
+        /// <summary>
+        /// Formats the given passages.
+        /// </summary>
+        /// <param name="passages">Passages to format.</param>
+        /// <param name="content">Content text supplied alongside the passages.</param>
+        /// <returns>The formatted string.</returns>
+        public abstract string Format(Passage[] passages, string content);
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/f2a35194/src/contrib/Highlighter/PostingsHighlight/PassageScorer.cs
----------------------------------------------------------------------
diff --git a/src/contrib/Highlighter/PostingsHighlight/PassageScorer.cs 
b/src/contrib/Highlighter/PostingsHighlight/PassageScorer.cs
new file mode 100644
index 0000000..ad977e7
--- /dev/null
+++ b/src/contrib/Highlighter/PostingsHighlight/PassageScorer.cs
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Search.PostingsHighlight
+{
+    /// <summary>
+    /// Supplies the three scoring components for a passage: a per-term weight, a
+    /// saturating term-frequency factor and a position-based norm. The parameter names
+    /// (<c>k1</c>, <c>b</c>) follow BM25 conventions.
+    /// </summary>
+    public class PassageScorer
+    {
+        // Term-frequency saturation parameter.
+        private readonly float k1;
+        // Strength of the passage-length normalization inside Tf.
+        private readonly float b;
+        // Reference length: content and passage lengths are divided by this value.
+        private readonly float pivot;
+
+        /// <summary>
+        /// Creates a scorer with k1 = 1.2, b = 0.75 and pivot = 87.
+        /// </summary>
+        public PassageScorer()
+            : this(1.2F, 0.75F, 87F)
+        {
+        }
+
+        /// <summary>
+        /// Creates a scorer with explicit parameters.
+        /// </summary>
+        /// <param name="k1">Term-frequency saturation parameter.</param>
+        /// <param name="b">Length-normalization parameter.</param>
+        /// <param name="pivot">Reference length used to normalize lengths.</param>
+        public PassageScorer(float k1, float b, float pivot)
+        {
+            this.pivot = pivot;
+            this.b = b;
+            this.k1 = k1;
+        }
+
+        /// <summary>
+        /// Computes the weight of a term from the content length and the term's total frequency.
+        /// </summary>
+        /// <param name="contentLength">Length of the content.</param>
+        /// <param name="totalTermFreq">Total frequency of the term.</param>
+        /// <returns><c>(k1 + 1) * log(1 + (n + 0.5) / (totalTermFreq + 0.5))</c> with <c>n = 1 + contentLength / pivot</c>.</returns>
+        public virtual float Weight(int contentLength, int totalTermFreq)
+        {
+            // The content is treated as 1 + contentLength / pivot pseudo-documents.
+            float numDocs = 1 + contentLength / pivot;
+            double ratio = (numDocs + 0.5) / (totalTermFreq + 0.5);
+            float logTerm = (float)Math.Log(1 + ratio);
+            return (k1 + 1) * logTerm;
+        }
+
+        /// <summary>
+        /// Computes the saturating term-frequency factor for a passage.
+        /// </summary>
+        /// <param name="freq">Frequency of the term.</param>
+        /// <param name="passageLen">Length of the passage.</param>
+        /// <returns><c>freq / (freq + k1 * ((1 - b) + b * passageLen / pivot))</c>.</returns>
+        public virtual float Tf(int freq, int passageLen)
+        {
+            float norm = k1 * ((1 - b) + b * (passageLen / pivot));
+            float denominator = freq + norm;
+            return freq / denominator;
+        }
+
+        /// <summary>
+        /// Computes the position norm; the value shrinks as <paramref name="passageStart"/> grows.
+        /// </summary>
+        /// <param name="passageStart">Start offset of the passage.</param>
+        /// <returns><c>1 + 1 / log(pivot + passageStart)</c>.</returns>
+        public virtual float Norm(int passageStart)
+        {
+            float logPosition = (float)Math.Log(pivot + passageStart);
+            return 1 + 1 / logPosition;
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/f2a35194/src/core/Lucene.Net.csproj
----------------------------------------------------------------------
diff --git a/src/core/Lucene.Net.csproj b/src/core/Lucene.Net.csproj
index a6f3de8..47b4cd3 100644
--- a/src/core/Lucene.Net.csproj
+++ b/src/core/Lucene.Net.csproj
@@ -869,6 +869,13 @@
     <Compile Include="Support\AtomicBoolean.cs" />
     <Compile Include="Support\AttributeImplItem.cs" />
     <Compile Include="Support\BitSetSupport.cs" />
+    <Compile Include="Support\BreakIterator.cs" />
+    <Compile Include="Support\BreakIterators\BreakIteratorBase.cs" />
+    <Compile Include="Support\BreakIterators\EnglishBreakIteratorBase.cs" />
+    <Compile Include="Support\BreakIterators\EnglishCharacterBreakIterator.cs" 
/>
+    <Compile Include="Support\BreakIterators\EnglishLineBreakIterator.cs" />
+    <Compile Include="Support\BreakIterators\EnglishSentenceBreakIterator.cs" 
/>
+    <Compile Include="Support\BreakIterators\EnglishWordBreakIterator.cs" />
     <Compile Include="Support\Buffer.cs" />
     <Compile Include="Support\BufferUnderflowException.cs" />
     <Compile Include="Support\BuildType.cs" />

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/f2a35194/src/core/Support/BreakIterator.cs
----------------------------------------------------------------------
diff --git a/src/core/Support/BreakIterator.cs 
b/src/core/Support/BreakIterator.cs
new file mode 100644
index 0000000..2f8387f
--- /dev/null
+++ b/src/core/Support/BreakIterator.cs
@@ -0,0 +1,191 @@
+using Lucene.Net.Support.BreakIterators;
+using System;
+using System.Collections.Generic;
+using System.Globalization;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Support
+{
+    /// <summary>
+    /// A backfill for BreakIterator support, since .NET doesn't have it. This implementation
+    /// certainly will need improving.
+    /// <para>
+    /// Concrete iterator types are registered per <see cref="CultureInfo"/> and per kind
+    /// (character, line, sentence, word). The <c>Get*Instance</c> factories instantiate the
+    /// type registered for the requested locale and fall back to the invariant-culture
+    /// registration when the locale has none.
+    /// </para>
+    /// <para>
+    /// The static registries are plain collections with no synchronization.
+    /// </para>
+    /// </summary>
+    public abstract class BreakIterator : ICloneable
+    {
+        /// <summary>Value returned by the navigation methods when there is no boundary to return.</summary>
+        public const int DONE = -1;
+
+        private static readonly ISet<CultureInfo> _allLocales = new HashSet<CultureInfo>();
+        private static readonly IDictionary<CultureInfo, Type> _lineTypes = new Dictionary<CultureInfo, Type>();
+        private static readonly IDictionary<CultureInfo, Type> _characterTypes = new Dictionary<CultureInfo, Type>();
+        private static readonly IDictionary<CultureInfo, Type> _sentenceTypes = new Dictionary<CultureInfo, Type>();
+        private static readonly IDictionary<CultureInfo, Type> _wordTypes = new Dictionary<CultureInfo, Type>();
+
+        static BreakIterator()
+        {
+            // HACK HACK HACK HACK HACK
+            // Only the English iterators exist so far; they are registered for the invariant
+            // culture as well, which makes them the fallback for every other locale.
+            CultureInfo invariant = CultureInfo.InvariantCulture;
+            CultureInfo english = CultureInfo.GetCultureInfoByIetfLanguageTag("en");
+            CultureInfo englishUs = CultureInfo.GetCultureInfoByIetfLanguageTag("en-US");
+
+            RegisterCharacterBreakIterator<EnglishCharacterBreakIterator>(invariant);
+            RegisterCharacterBreakIterator<EnglishCharacterBreakIterator>(english);
+            RegisterCharacterBreakIterator<EnglishCharacterBreakIterator>(englishUs);
+            RegisterLineBreakIterator<EnglishLineBreakIterator>(invariant);
+            RegisterLineBreakIterator<EnglishLineBreakIterator>(english);
+            RegisterLineBreakIterator<EnglishLineBreakIterator>(englishUs);
+            RegisterSentenceBreakIterator<EnglishSentenceBreakIterator>(invariant);
+            RegisterSentenceBreakIterator<EnglishSentenceBreakIterator>(english);
+            RegisterSentenceBreakIterator<EnglishSentenceBreakIterator>(englishUs);
+            RegisterWordBreakIterator<EnglishWordBreakIterator>(invariant);
+            RegisterWordBreakIterator<EnglishWordBreakIterator>(english);
+            RegisterWordBreakIterator<EnglishWordBreakIterator>(englishUs);
+        }
+
+        protected BreakIterator()
+        {
+        }
+
+        /// <summary>Returns a shallow (memberwise) copy of this iterator.</summary>
+        public virtual object Clone()
+        {
+            return this.MemberwiseClone();
+        }
+
+        /// <summary>Returns the iterator's current position.</summary>
+        public abstract int Current();
+
+        /// <summary>Moves to the first boundary and returns it.</summary>
+        public abstract int First();
+
+        /// <summary>Moves to the first boundary after <paramref name="offset"/> and returns it, or <see cref="DONE"/>.</summary>
+        public abstract int Following(int offset);
+
+        /// <summary>The text being scanned.</summary>
+        public abstract string Text { get; set;  }
+
+        /// <summary>
+        /// Reports whether <paramref name="offset"/> is a boundary. This base implementation
+        /// always returns false; subclasses are expected to override it.
+        /// </summary>
+        public virtual bool IsBoundary(int offset)
+        {
+            return false;
+        }
+
+        /// <summary>Moves to the last boundary and returns it.</summary>
+        public abstract int Last();
+
+        /// <summary>Moves to the boundary after the current one and returns it, or <see cref="DONE"/>.</summary>
+        public abstract int Next();
+
+        /// <summary>Moves relative to the current position by <paramref name="n"/>; exact semantics are defined by the subclass.</summary>
+        public abstract int Next(int n);
+
+        /// <summary>
+        /// Moves to the last boundary strictly before <paramref name="offset"/> and returns it.
+        /// </summary>
+        /// <param name="offset">Offset to search backwards from.</param>
+        /// <returns>The boundary found, or <see cref="DONE"/> when no boundary precedes the offset.</returns>
+        public virtual int Preceding(int offset)
+        {
+            // Pick a starting point close to the offset: one step back from the first
+            // boundary after it, or the last boundary of the text when nothing follows the
+            // offset (earlier boundaries may still exist in that case).
+            int candidate = Following(offset) == DONE ? Last() : Previous();
+
+            // That step can land exactly on the offset when the offset is itself a
+            // boundary; keep walking back until the result is strictly before it.
+            while (candidate != DONE && candidate >= offset)
+                candidate = Previous();
+
+            return candidate;
+        }
+
+        /// <summary>Moves to the boundary before the current one and returns it, or <see cref="DONE"/>.</summary>
+        public abstract int Previous();
+
+        /// <summary>Returns every locale that has had at least one iterator type registered.</summary>
+        public static CultureInfo[] GetAvailableLocales()
+        {
+            return _allLocales.ToArray();
+        }
+
+        /// <summary>Creates a character iterator for the invariant culture.</summary>
+        public static BreakIterator GetCharacterInstance()
+        {
+            return GetCharacterInstance(CultureInfo.InvariantCulture);
+        }
+
+        /// <summary>Creates a character iterator for <paramref name="locale"/>, falling back to the invariant culture.</summary>
+        public static BreakIterator GetCharacterInstance(CultureInfo locale)
+        {
+            return CreateInstance(_characterTypes, locale);
+        }
+
+        /// <summary>Creates a line iterator for the invariant culture.</summary>
+        public static BreakIterator GetLineInstance()
+        {
+            return GetLineInstance(CultureInfo.InvariantCulture);
+        }
+
+        /// <summary>Creates a line iterator for <paramref name="locale"/>, falling back to the invariant culture.</summary>
+        public static BreakIterator GetLineInstance(CultureInfo locale)
+        {
+            return CreateInstance(_lineTypes, locale);
+        }
+
+        /// <summary>Creates a sentence iterator for the invariant culture.</summary>
+        public static BreakIterator GetSentenceInstance()
+        {
+            return GetSentenceInstance(CultureInfo.InvariantCulture);
+        }
+
+        /// <summary>Creates a sentence iterator for <paramref name="locale"/>, falling back to the invariant culture.</summary>
+        public static BreakIterator GetSentenceInstance(CultureInfo locale)
+        {
+            return CreateInstance(_sentenceTypes, locale);
+        }
+
+        /// <summary>Creates a word iterator for the invariant culture.</summary>
+        public static BreakIterator GetWordInstance()
+        {
+            return GetWordInstance(CultureInfo.InvariantCulture);
+        }
+
+        /// <summary>Creates a word iterator for <paramref name="locale"/>, falling back to the invariant culture.</summary>
+        public static BreakIterator GetWordInstance(CultureInfo locale)
+        {
+            return CreateInstance(_wordTypes, locale);
+        }
+
+        /// <summary>Registers <typeparamref name="T"/> as the character iterator for <paramref name="locale"/>, replacing any earlier registration.</summary>
+        public static void RegisterCharacterBreakIterator<T>(CultureInfo locale)
+            where T : BreakIterator, new()
+        {
+            _allLocales.Add(locale);
+            _characterTypes[locale] = typeof(T);
+        }
+
+        /// <summary>Registers <typeparamref name="T"/> as the line iterator for <paramref name="locale"/>, replacing any earlier registration.</summary>
+        public static void RegisterLineBreakIterator<T>(CultureInfo locale)
+            where T : BreakIterator, new()
+        {
+            _allLocales.Add(locale);
+            _lineTypes[locale] = typeof(T);
+        }
+
+        /// <summary>Registers <typeparamref name="T"/> as the sentence iterator for <paramref name="locale"/>, replacing any earlier registration.</summary>
+        public static void RegisterSentenceBreakIterator<T>(CultureInfo locale)
+            where T : BreakIterator, new()
+        {
+            _allLocales.Add(locale);
+            _sentenceTypes[locale] = typeof(T);
+        }
+
+        /// <summary>Registers <typeparamref name="T"/> as the word iterator for <paramref name="locale"/>, replacing any earlier registration.</summary>
+        public static void RegisterWordBreakIterator<T>(CultureInfo locale)
+            where T : BreakIterator, new()
+        {
+            _allLocales.Add(locale);
+            _wordTypes[locale] = typeof(T);
+        }
+
+        // Shared lookup behind the Get*Instance factories: use the type registered for the
+        // locale, otherwise the invariant-culture registration, and instantiate it.
+        private static BreakIterator CreateInstance(IDictionary<CultureInfo, Type> registry, CultureInfo locale)
+        {
+            Type it;
+
+            if (!registry.TryGetValue(locale, out it))
+                it = registry[CultureInfo.InvariantCulture];
+
+            return (BreakIterator)Activator.CreateInstance(it);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/f2a35194/src/core/Support/BreakIterators/BreakIteratorBase.cs
----------------------------------------------------------------------
diff --git a/src/core/Support/BreakIterators/BreakIteratorBase.cs 
b/src/core/Support/BreakIterators/BreakIteratorBase.cs
new file mode 100644
index 0000000..a149907
--- /dev/null
+++ b/src/core/Support/BreakIterators/BreakIteratorBase.cs
@@ -0,0 +1,120 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Support.BreakIterators
+{
+    /// <summary>
+    /// A base implementation of BreakIterator to make some operations easier, particularly for
+    /// english or latin-based languages.
+    /// <para>
+    /// Subclasses only decide whether a given offset is a boundary; every navigation method
+    /// here scans the text one character at a time and asks <see cref="IsBoundary"/>.
+    /// <see cref="Text"/> must be set before any navigation method is called.
+    /// </para>
+    /// </summary>
+    // HACK: someone please improve this!
+    public abstract class BreakIteratorBase : BreakIterator
+    {
+        /// <summary>Character returned by <see cref="Peek"/> for offsets outside the text.</summary>
+        protected const char ENDINPUT = '\0';
+
+        protected string _text;
+        // DONE means "not positioned yet"; otherwise an offset into _text (or _text.Length
+        // after a forward scan ran off the end).
+        protected int _position = DONE;
+
+        /// <summary>
+        /// Returns the current position. When the iterator is not positioned yet this moves
+        /// to the first boundary and returns that instead.
+        /// </summary>
+        public override int Current()
+        {
+            if (_position == DONE)
+                return First();
+
+            return _position;
+        }
+
+        /// <summary>Moves to the first boundary in the text and returns it, or DONE.</summary>
+        public override int First()
+        {
+            _position = DONE;
+
+            return Following(DONE);
+        }
+
+        /// <summary>
+        /// Moves to the first boundary strictly after <paramref name="offset"/>.
+        /// </summary>
+        /// <param name="offset">Offset to start after; values below DONE are treated as DONE.</param>
+        /// <returns>
+        /// The boundary found, or DONE when none exists. On DONE the position is left at the
+        /// end of the text.
+        /// </returns>
+        public override int Following(int offset)
+        {
+            // Never start scanning from before the "not positioned" sentinel.
+            _position = offset < DONE ? DONE : offset;
+
+            while (true)
+            {
+                // Check before incrementing and with ">=" so that an offset at or past the
+                // end of the text (or int.MaxValue) terminates instead of running away.
+                if (_position >= _text.Length - 1)
+                {
+                    _position = _text.Length;
+                    return DONE;
+                }
+
+                _position++;
+
+                if (IsBoundary(_position))
+                    return _position;
+            }
+        }
+
+        /// <summary>
+        /// The text being scanned. Assigning it makes the iterator unpositioned again, so a
+        /// position from the previous text is never applied to the new one.
+        /// </summary>
+        public override string Text
+        {
+            get
+            {
+                return _text;
+            }
+            set
+            {
+                _text = value;
+                _position = DONE;
+            }
+        }
+
+        /// <summary>Moves to the last boundary in the text and returns it, or DONE.</summary>
+        public override int Last()
+        {
+            _position = _text.Length;
+
+            return Previous();
+        }
+
+        /// <summary>
+        /// Moves to the next boundary after the current position and returns it, or DONE.
+        /// When the iterator is not positioned yet this behaves like <see cref="First"/>.
+        /// </summary>
+        public override int Next()
+        {
+            if (_position == DONE)
+                return First();
+
+            // Already at (or beyond) the last character: nothing left to scan.
+            if (_position >= _text.Length - 1)
+                return DONE;
+
+            // Following(offset) begins scanning at offset + 1, so the current position is
+            // passed as-is; incrementing it first would skip a candidate boundary.
+            return Following(_position);
+        }
+
+        /// <summary>
+        /// Moves forward relative to the current position by <paramref name="n"/> characters
+        /// and then to the first boundary at or after that point.
+        /// </summary>
+        // NOTE(review): n is applied as a character distance, not as a count of boundaries;
+        // java.text.BreakIterator.next(int) counts boundaries - confirm which is intended.
+        public override int Next(int n)
+        {
+            if (_position == DONE)
+                return n == 0 ? First() : Following(n - 1);
+
+            if (n == 0)
+                return Current();
+
+            if (_position + n >= _text.Length)
+                return DONE;
+
+            _position += (n - 1);
+            return Following(_position);
+        }
+
+        /// <summary>
+        /// Moves to the closest boundary strictly before the current position and returns it,
+        /// or DONE when the scan reaches the start of the text without finding one.
+        /// </summary>
+        public override int Previous()
+        {
+            do
+            {
+                _position--;
+
+                if (_position == DONE)
+                    return DONE;
+            }
+            while (!IsBoundary(_position));
+
+            return _position;
+        }
+
+        /// <summary>Reports whether <paramref name="offset"/> is a boundary in the current text.</summary>
+        public abstract override bool IsBoundary(int offset);
+
+        /// <summary>
+        /// Returns the character at <paramref name="offset"/>, or <see cref="ENDINPUT"/> when
+        /// the offset lies outside the text.
+        /// </summary>
+        public virtual char Peek(int offset)
+        {
+            if (offset < 0 || offset >= _text.Length)
+                return ENDINPUT;
+
+            return _text[offset];
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/f2a35194/src/core/Support/BreakIterators/EnglishBreakIteratorBase.cs
----------------------------------------------------------------------
diff --git a/src/core/Support/BreakIterators/EnglishBreakIteratorBase.cs 
b/src/core/Support/BreakIterators/EnglishBreakIteratorBase.cs
new file mode 100644
index 0000000..7429a9f
--- /dev/null
+++ b/src/core/Support/BreakIterators/EnglishBreakIteratorBase.cs
@@ -0,0 +1,26 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Support.BreakIterators
+{
+    // HACK: someone please improve this!
+    /// <summary>
+    /// Shared character classification for the English break iterators.
+    /// </summary>
+    public abstract class EnglishBreakIteratorBase : BreakIteratorBase
+    {
+        /// <summary>
+        /// Returns true when <paramref name="c"/> is '.', '!' or '?'.
+        /// </summary>
+        protected static bool IsSentenceDelim(char c)
+        {
+            switch (c)
+            {
+                case '.':
+                case '!':
+                case '?':
+                    return true;
+                default:
+                    return false;
+            }
+        }
+
+        /// <summary>
+        /// Returns true when <paramref name="c"/> is a space, tab, carriage return,
+        /// line feed or hyphen.
+        /// </summary>
+        protected static bool IsValidLineDelim(char c)
+        {
+            switch (c)
+            {
+                case ' ':
+                case '\t':
+                case '\r':
+                case '\n':
+                case '-':
+                    return true;
+                default:
+                    return false;
+            }
+        }
+
+        public abstract override bool IsBoundary(int offset);
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/f2a35194/src/core/Support/BreakIterators/EnglishCharacterBreakIterator.cs
----------------------------------------------------------------------
diff --git a/src/core/Support/BreakIterators/EnglishCharacterBreakIterator.cs 
b/src/core/Support/BreakIterators/EnglishCharacterBreakIterator.cs
new file mode 100644
index 0000000..4148f76
--- /dev/null
+++ b/src/core/Support/BreakIterators/EnglishCharacterBreakIterator.cs
@@ -0,0 +1,16 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Support.BreakIterators
+{
+    // HACK: someone please improve this!
+    /// <summary>
+    /// Character-level break iterator: every offset is reported as a boundary.
+    /// </summary>
+    public class EnglishCharacterBreakIterator : EnglishBreakIteratorBase
+    {
+        /// <summary>
+        /// Always returns true; <paramref name="offset"/> is not inspected.
+        /// </summary>
+        public override bool IsBoundary(int offset)
+        {
+            return true;
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/f2a35194/src/core/Support/BreakIterators/EnglishLineBreakIterator.cs
----------------------------------------------------------------------
diff --git a/src/core/Support/BreakIterators/EnglishLineBreakIterator.cs 
b/src/core/Support/BreakIterators/EnglishLineBreakIterator.cs
new file mode 100644
index 0000000..a0a1242
--- /dev/null
+++ b/src/core/Support/BreakIterators/EnglishLineBreakIterator.cs
@@ -0,0 +1,21 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Support.BreakIterators
+{
+    // HACK: someone please improve this!
+    /// <summary>
+    /// Line break iterator: a boundary is the last character of the text or any
+    /// character accepted by <c>IsValidLineDelim</c>.
+    /// </summary>
+    public class EnglishLineBreakIterator : EnglishBreakIteratorBase
+    {
+        /// <summary>
+        /// Returns true when <paramref name="offset"/> is the index of the last character
+        /// or the character there is a valid line delimiter.
+        /// </summary>
+        public override bool IsBoundary(int offset)
+        {
+            int lastIndex = _text.Length - 1;
+
+            // The final character always closes a line; otherwise classify the character.
+            return offset == lastIndex || IsValidLineDelim(Peek(offset));
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/f2a35194/src/core/Support/BreakIterators/EnglishSentenceBreakIterator.cs
----------------------------------------------------------------------
diff --git a/src/core/Support/BreakIterators/EnglishSentenceBreakIterator.cs 
b/src/core/Support/BreakIterators/EnglishSentenceBreakIterator.cs
new file mode 100644
index 0000000..4b12b48
--- /dev/null
+++ b/src/core/Support/BreakIterators/EnglishSentenceBreakIterator.cs
@@ -0,0 +1,24 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Support.BreakIterators
+{
+    // HACK: someone please improve this!
+    /// <summary>
+    /// Sentence break iterator: a boundary is the last character of the text, or a
+    /// sentence delimiter that is immediately followed by whitespace.
+    /// </summary>
+    public class EnglishSentenceBreakIterator : EnglishBreakIteratorBase
+    {
+        /// <summary>
+        /// Returns true when <paramref name="offset"/> is the index of the last character,
+        /// or when the character there is a sentence delimiter and the next one is whitespace.
+        /// </summary>
+        public override bool IsBoundary(int offset)
+        {
+            bool isLastCharacter = offset == _text.Length - 1;
+            if (isLastCharacter)
+                return true;
+
+            // The following character is only examined once a delimiter has been found.
+            return IsSentenceDelim(Peek(offset)) && char.IsWhiteSpace(Peek(offset + 1));
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/f2a35194/src/core/Support/BreakIterators/EnglishWordBreakIterator.cs
----------------------------------------------------------------------
diff --git a/src/core/Support/BreakIterators/EnglishWordBreakIterator.cs 
b/src/core/Support/BreakIterators/EnglishWordBreakIterator.cs
new file mode 100644
index 0000000..0fbb39f
--- /dev/null
+++ b/src/core/Support/BreakIterators/EnglishWordBreakIterator.cs
@@ -0,0 +1,25 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+
+namespace Lucene.Net.Support.BreakIterators
+{
+    // HACK: someone please improve this!
+    /// <summary>
+    /// Word break iterator: a boundary is a character that is not a letter or digit and
+    /// is not directly followed by one.
+    /// </summary>
+    public class EnglishWordBreakIterator : EnglishBreakIteratorBase
+    {
+        /// <summary>
+        /// Returns true when neither the character at <paramref name="offset"/> nor the
+        /// character after it is a letter or digit.
+        /// </summary>
+        public override bool IsBoundary(int offset)
+        {
+            char here = Peek(offset);
+            char next = Peek(offset + 1);
+
+            bool insideWord = char.IsLetterOrDigit(here);
+            bool wordStartsNext = next != ENDINPUT && char.IsLetterOrDigit(next);
+
+            return !(insideWord || wordStartsNext);
+        }
+    }
+    }
+}

[20/53] [abbrv] git commit: Initial work on BreakIterator and PostingsHighlight

Reply via email to