Repository: lucenenet Updated Branches: refs/heads/master 60e812525 -> 84fdac04c
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUNormalizer2CharFilter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUNormalizer2CharFilter.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUNormalizer2CharFilter.cs new file mode 100644 index 0000000..4254298 --- /dev/null +++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUNormalizer2CharFilter.cs @@ -0,0 +1,235 @@ +// LUCENENET TODO: Port issues - missing Normalizer2 dependency from icu.net + +//using Lucene.Net.Analysis.CharFilters; +//using Lucene.Net.Support; +//using System; +//using System.Collections.Generic; +//using System.IO; +//using System.Linq; +//using System.Text; +//using System.Threading.Tasks; + +//namespace Lucene.Net.Analysis.ICU +//{ +// /// <summary> +// /// Normalize token text with ICU's <see cref="Normalizer2"/>. +// /// </summary> +// public sealed class ICUNormalizer2CharFilter : BaseCharFilter +// { +// private static readonly int IO_BUFFER_SIZE = 128; + +// private readonly Normalizer2 normalizer; +// private readonly StringBuilder inputBuffer = new StringBuilder(); +// private readonly StringBuilder resultBuffer = new StringBuilder(); + +// private bool inputFinished; +// private bool afterQuickCheckYes; +// private int checkedInputBoundary; +// private int charCount; + + +// /** +// * Create a new Normalizer2CharFilter that combines NFKC normalization, Case +// * Folding, and removes Default Ignorables (NFKC_Casefold) +// */ +// public ICUNormalizer2CharFilter(TextReader input) +// : this(input, new Normalizer2(Icu.Normalizer.UNormalizationMode.UNORM_NFKC) /*Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE)*/) +// { +// } + +// /** +// * Create a new Normalizer2CharFilter with the specified Normalizer2 +// * @param in text +// * @param normalizer normalizer to use +// */ +// public ICUNormalizer2CharFilter(TextReader input, 
Normalizer2 normalizer) +// : base(input) +// { +// if (normalizer == null) +// { +// throw new ArgumentNullException("normalizer"); +// } +// this.normalizer = normalizer; +// } + +// public override int Read(char[] cbuf, int off, int len) +// { +// if (off < 0) throw new ArgumentException("off < 0"); +// if (off >= cbuf.Length) throw new ArgumentException("off >= cbuf.length"); +// if (len <= 0) throw new ArgumentException("len <= 0"); + +// while (!inputFinished || inputBuffer.Length > 0 || resultBuffer.Length > 0) +// { +// int retLen; + +// if (resultBuffer.Length > 0) +// { +// retLen = OutputFromResultBuffer(cbuf, off, len); +// if (retLen > 0) +// { +// return retLen; +// } +// } + +// int resLen = ReadAndNormalizeFromInput(); +// if (resLen > 0) +// { +// retLen = OutputFromResultBuffer(cbuf, off, len); +// if (retLen > 0) +// { +// return retLen; +// } +// } + +// ReadInputToBuffer(); +// } + +// return -1; +// } + +// private readonly char[] tmpBuffer = new char[IO_BUFFER_SIZE]; + +// private int ReadInputToBuffer() +// { +// int len = m_input.Read(tmpBuffer, 0, tmpBuffer.Length); +// if (len == -1) +// { +// inputFinished = true; +// return 0; +// } +// inputBuffer.Append(tmpBuffer, 0, len); + +// // if checkedInputBoundary was at the end of a buffer, we need to check that char again +// checkedInputBoundary = Math.Max(checkedInputBoundary - 1, 0); +// // this loop depends on 'isInert' (changes under normalization) but looks only at characters. 
+// // so we treat all surrogates as non-inert for simplicity +// if (normalizer.IsInert(tmpBuffer[len - 1]) && !char.IsSurrogate(tmpBuffer[len - 1])) +// { +// return len; +// } +// else return len + ReadInputToBuffer(); +// } + +// private int ReadAndNormalizeFromInput() +// { +// if (inputBuffer.Length <= 0) +// { +// afterQuickCheckYes = false; +// return 0; +// } +// if (!afterQuickCheckYes) +// { +// int resLen2 = ReadFromInputWhileSpanQuickCheckYes(); +// afterQuickCheckYes = true; +// if (resLen2 > 0) return resLen2; +// } +// int resLen = ReadFromIoNormalizeUptoBoundary(); +// if (resLen > 0) +// { +// afterQuickCheckYes = false; +// } +// return resLen; +// } + +// private int ReadFromInputWhileSpanQuickCheckYes() +// { +// int end = normalizer.spanQuickCheckYes(inputBuffer); +// if (end > 0) +// { +// //resultBuffer.Append(inputBuffer.subSequence(0, end)); +// resultBuffer.Append(inputBuffer.ToString(0, end)); +// //inputBuffer.delete(0, end); +// inputBuffer.Remove(0, end); +// checkedInputBoundary = Math.Max(checkedInputBoundary - end, 0); +// charCount += end; +// } +// return end; +// } + +// private int ReadFromIoNormalizeUptoBoundary() +// { +// // if there's no buffer to normalize, return 0 +// if (inputBuffer.Length <= 0) +// { +// return 0; +// } + +// bool foundBoundary = false; +// int bufLen = inputBuffer.Length; + +// while (checkedInputBoundary <= bufLen - 1) +// { +// int charLen = Character.CharCount(inputBuffer.CodePointAt(checkedInputBoundary)); +// checkedInputBoundary += charLen; +// if (checkedInputBoundary < bufLen && normalizer.HasBoundaryBefore(inputBuffer +// .CodePointAt(checkedInputBoundary))) +// { +// foundBoundary = true; +// break; +// } +// } +// if (!foundBoundary && checkedInputBoundary >= bufLen && inputFinished) +// { +// foundBoundary = true; +// checkedInputBoundary = bufLen; +// } + +// if (!foundBoundary) +// { +// return 0; +// } + +// return NormalizeInputUpto(checkedInputBoundary); +// } + +// private int 
NormalizeInputUpto(int length) +// { +// int destOrigLen = resultBuffer.Length; +// normalizer.NormalizeSecondAndAppend(resultBuffer, inputBuffer.ToString(0, length)); +// //inputBuffer.SubSequence(0, length)); + +// //inputBuffer.Delete(0, length); +// inputBuffer.Remove(0, length); +// checkedInputBoundary = Math.Max(checkedInputBoundary - length, 0); +// int resultLength = resultBuffer.Length - destOrigLen; +// RecordOffsetDiff(length, resultLength); +// return resultLength; +// } + +// private void RecordOffsetDiff(int inputLength, int outputLength) +// { +// if (inputLength == outputLength) +// { +// charCount += outputLength; +// return; +// } +// int diff = inputLength - outputLength; +// int cumuDiff = LastCumulativeDiff; +// if (diff < 0) +// { +// for (int i = 1; i <= -diff; ++i) +// { +// AddOffCorrectMap(charCount + i, cumuDiff - i); +// } +// } +// else +// { +// AddOffCorrectMap(charCount + outputLength, cumuDiff + diff); +// } +// charCount += outputLength; +// } + +// private int OutputFromResultBuffer(char[] cbuf, int begin, int len) +// { +// len = Math.Min(resultBuffer.Length, len); +// //resultBuffer.GetChars(0, len, cbuf, begin); +// resultBuffer.CopyTo(0, cbuf, begin, len); +// if (len > 0) +// { +// //resultBuffer.delete(0, len); +// resultBuffer.Remove(0, len); +// } +// return len; +// } +// } +//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUNormalizer2CharFilterFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUNormalizer2CharFilterFactory.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUNormalizer2CharFilterFactory.cs new file mode 100644 index 0000000..bd4cbe5 --- /dev/null +++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUNormalizer2CharFilterFactory.cs @@ -0,0 +1,60 @@ +// LUCENENET TODO: Port issues - missing Normalizer2 dependency from icu.net + +//using Icu; +//using 
Lucene.Net.Analysis.Util; +//using Lucene.Net.Support; +//using System; +//using System.Collections.Generic; +//using System.IO; +//using System.Linq; +//using System.Text; +//using System.Threading.Tasks; + +//namespace Lucene.Net.Analysis.ICU +//{ +// public class ICUNormalizer2CharFilterFactory : CharFilterFactory, IMultiTermAwareComponent +// { +// private readonly Normalizer2 normalizer; + +// /// <summary>Creates a new ICUNormalizer2CharFilterFactory</summary> +// public ICUNormalizer2CharFilterFactory(IDictionary<string, string> args) +// : base(args) +// { +// string name = Get(args, "name", "NFKC"); +// //string name = Get(args, "name", "nfkc_cf"); +// //string mode = Get(args, "mode", new string[] { "compose", "decompose" }, "compose"); +// //Normalizer2 normalizer = Normalizer2.getInstance +// // (null, name, "compose".Equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE); + +// var mode = (Icu.Normalizer.UNormalizationMode)Enum.Parse(typeof(Icu.Normalizer.UNormalizationMode), "UNORM_" + name); +// Normalizer2 normalizer = new Normalizer2(mode); + +// string filter = Get(args, "filter"); +// if (filter != null) +// { +// //UnicodeSet set = new UnicodeSet(filter); +// var set = UnicodeSet.ToCharacters(filter); +// if (set.Any()) +// { +// //set.freeze(); +// normalizer = new FilteredNormalizer2(normalizer, set); +// } +// } +// if (args.Count != 0) +// { +// throw new ArgumentException("Unknown parameters: " + args); +// } +// this.normalizer = normalizer; +// } + +// public override TextReader Create(TextReader input) +// { +// return new ICUNormalizer2CharFilter(input, normalizer); +// } + +// public virtual AbstractAnalysisFactory GetMultiTermComponent() +// { +// return this; +// } +// } +//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUNormalizer2Filter.cs ---------------------------------------------------------------------- diff --git 
a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUNormalizer2Filter.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUNormalizer2Filter.cs new file mode 100644 index 0000000..bca3d24 --- /dev/null +++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUNormalizer2Filter.cs @@ -0,0 +1,60 @@ +// LUCENENET TODO: Port issues - missing Normalizer2 dependency from icu.net + +//using Icu; +//using Lucene.Net.Analysis.TokenAttributes; +//using Lucene.Net.Support; + +//namespace Lucene.Net.Analysis.ICU +//{ +// public class ICUNormalizer2Filter : TokenFilter +// { +// private readonly ICharTermAttribute termAtt; +// private readonly Normalizer2 normalizer; + +// /// <summary> +// /// Create a new <see cref="Normalizer2Filter"/> that combines NFKC normalization, Case +// /// Folding, and removes Default Ignorables (NFKC_Casefold) +// /// </summary> +// /// <param name="input"></param> +// public ICUNormalizer2Filter(TokenStream input) +// : this(input, new Normalizer2(Normalizer.UNormalizationMode.UNORM_NFKC) /*Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE)*/) +// { +// } + +// /// <summary> +// /// Create a new <see cref="Normalizer2Filter"/> with the specified <see cref="Normalizer2"/> +// /// </summary> +// /// <param name="input">stream</param> +// /// <param name="normalizer">normalizer to use</param> +// public ICUNormalizer2Filter(TokenStream input, Normalizer2 normalizer) +// : base(input) +// { +// this.normalizer = normalizer; +// this.termAtt = AddAttribute<ICharTermAttribute>(); +// } + +// public override sealed bool IncrementToken() +// { +// if (m_input.IncrementToken()) +// { +// var term = termAtt.ToString(); +// try +// { +// if (!normalizer.IsNormalized(term)) +// { +// termAtt.SetEmpty().Append(normalizer.Normalize(term)); +// } +// } +// catch (System.Exception ex) +// { + +// } +// return true; +// } +// else +// { +// return false; +// } +// } +// } +//} 
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUNormalizer2FilterFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUNormalizer2FilterFactory.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUNormalizer2FilterFactory.cs new file mode 100644 index 0000000..c17fb7f --- /dev/null +++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUNormalizer2FilterFactory.cs @@ -0,0 +1,59 @@ +// LUCENENET TODO: Port issues - missing Normalizer2 dependency from icu.net + +//using Icu; +//using Lucene.Net.Analysis.Util; +//using Lucene.Net.Support; +//using System; +//using System.Collections.Generic; +//using System.Linq; + +//namespace Lucene.Net.Analysis.ICU +//{ +// public class ICUNormalizer2FilterFactory : TokenFilterFactory, IMultiTermAwareComponent +// { +// private readonly Normalizer2 normalizer; + +// /// <summary>Creates a new ICUNormalizer2FilterFactory</summary> +// public ICUNormalizer2FilterFactory(IDictionary<string, string> args) +// : base(args) +// { +// string name = Get(args, "name", "NFKC"); +// //string name = Get(args, "name", "nfkc_cf"); +// //string mode = Get(args, "mode", new string[] { "compose", "decompose" }, "compose"); + +// var mode = (Normalizer.UNormalizationMode)Enum.Parse(typeof(Normalizer.UNormalizationMode), "UNORM_" + name); +// Normalizer2 normalizer = new Normalizer2(mode); + +// //Normalizer2 normalizer = Normalizer2.getInstance +// // (null, name, "compose".Equals(mode) ? 
Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE); + +// string filter = Get(args, "filter"); +// if (filter != null) +// { +// //UnicodeSet set = new UnicodeSet(filter); +// var set = UnicodeSet.ToCharacters(filter); +// if (set.Any()) +// { +// //set.freeze(); +// normalizer = new FilteredNormalizer2(normalizer, set); +// } +// } +// if (args.Count != 0) +// { +// throw new ArgumentException("Unknown parameters: " + args); +// } +// this.normalizer = normalizer; +// } + +// // TODO: support custom normalization +// public override TokenStream Create(TokenStream input) +// { +// return new ICUNormalizer2Filter(input, normalizer); +// } + +// public virtual AbstractAnalysisFactory GetMultiTermComponent() +// { +// return this; +// } +// } +//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUTransformFilter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUTransformFilter.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUTransformFilter.cs new file mode 100644 index 0000000..7f22c3d --- /dev/null +++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUTransformFilter.cs @@ -0,0 +1,152 @@ +// LUCENENET TODO: Port issues - missing Transliterator dependency from icu.net + +//using Lucene.Net.Analysis.TokenAttributes; + +//namespace Lucene.Net.Analysis.ICU +//{ +// public sealed class ICUTransformFilter : TokenFilter +// { +// // Transliterator to transform the text +// private readonly Transliterator transform; + +// // Reusable position object +// private readonly Transliterator.Position position = new Transliterator.Position(); + +// // term attribute, will be updated with transformed text. +// private readonly ICharTermAttribute termAtt; + +// // Wraps a termAttribute around the replaceable interface. 
+// private readonly ReplaceableTermAttribute replaceableAttribute = new ReplaceableTermAttribute(); + +// /// <summary> +// /// Create a new ICUTransformFilter that transforms text on the given stream. +// /// </summary> +// /// <param name="input"><see cref="TokenStream"/> to filter.</param> +// /// <param name="transform">Transliterator to transform the text.</param> +// public ICUTransformFilter(TokenStream input, Transliterator transform) +// : base(input) +// { +// this.transform = transform; +// this.termAtt = AddAttribute<ICharTermAttribute>(); + +// /* +// * This is cheating, but speeds things up a lot. +// * If we wanted to use pkg-private APIs we could probably do better. +// */ +// if (transform.getFilter() == null && transform is com.ibm.icu.text.RuleBasedTransliterator) +// { +// UnicodeSet sourceSet = transform.getSourceSet(); +// if (sourceSet != null && !sourceSet.isEmpty()) +// transform.setFilter(sourceSet); +// } +// } + +// public override bool IncrementToken() +// { +// /* +// * Wrap around replaceable. clear the positions, and transliterate. +// */ +// if (m_input.IncrementToken()) +// { +// replaceableAttribute.SetText(termAtt); + +// int length = termAtt.Length; +// position.start = 0; +// position.limit = length; +// position.contextStart = 0; +// position.contextLimit = length; + +// transform.FilteredTransliterate(replaceableAttribute, position, false); +// return true; +// } +// else +// { +// return false; +// } +// } + +// /// <summary> +// /// Wrap a <see cref="ICharTermAttribute"/> with the Replaceable API. 
+// /// </summary> +// private sealed class ReplaceableTermAttribute //: IReplaceable +// { +// private char[] buffer; +// private int length; +// private ICharTermAttribute token; + +// public void SetText(ICharTermAttribute token) +// { +// this.token = token; +// this.buffer = token.Buffer; +// this.length = token.Length; +// } + +// public int Char32At(int pos) +// { +// return UTF16.charAt(buffer, 0, length, pos); +// } + +// public char CharAt(int pos) +// { +// return buffer[pos]; +// } + +// public void Copy(int start, int limit, int dest) +// { +// char[] text = new char[limit - start]; +// GetChars(start, limit, text, 0); +// Replace(dest, dest, text, 0, limit - start); +// } + +// public void GetChars(int srcStart, int srcLimit, char[] dst, int dstStart) +// { +// System.Array.Copy(buffer, srcStart, dst, dstStart, srcLimit - srcStart); +// } + +// public bool HasMetaData +// { +// get { return false; } +// } + +// public int Length +// { +// get { return length; } +// } + +// public void Replace(int start, int limit, string text) +// { +// int charsLen = text.Length; +// int newLength = ShiftForReplace(start, limit, charsLen); +// // insert the replacement text +// //text.getChars(0, charsLen, buffer, start); +// text.CopyTo(0, buffer, start, charsLen); +// token.Length = (length = newLength); +// } + +// public void Replace(int start, int limit, char[] text, int charsStart, +// int charsLen) +// { +// // shift text if necessary for the replacement +// int newLength = ShiftForReplace(start, limit, charsLen); +// // insert the replacement text +// System.Array.Copy(text, charsStart, buffer, start, charsLen); +// token.Length = (length = newLength); +// } + +// /// <summary>shift text (if necessary) for a replacement operation</summary> +// private int ShiftForReplace(int start, int limit, int charsLen) +// { +// int replacementLength = limit - start; +// int newLength = length - replacementLength + charsLen; +// // resize if necessary +// if (newLength > 
length) +// buffer = token.ResizeBuffer(newLength); +// // if the substring being replaced is longer or shorter than the +// // replacement, need to shift things around +// if (replacementLength != charsLen && limit < length) +// System.Array.Copy(buffer, limit, buffer, start + charsLen, length - limit); +// return newLength; +// } +// } +// } +//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUTransformFilterFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUTransformFilterFactory.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUTransformFilterFactory.cs new file mode 100644 index 0000000..081ebf5 --- /dev/null +++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUTransformFilterFactory.cs @@ -0,0 +1,38 @@ +// LUCENENET TODO: Port issues - missing Transliterator dependency from icu.net + +//using Lucene.Net.Analysis.Util; +//using System; +//using System.Collections.Generic; + +//namespace Lucene.Net.Analysis.ICU +//{ +// public class ICUTransformFilterFactory : TokenFilterFactory, IMultiTermAwareComponent +// { +// private readonly Transliterator transliterator; + +// // TODO: add support for custom rules +// /// <summary>Creates a new ICUTransformFilterFactory</summary> +// public ICUTransformFilterFactory(IDictionary<string, string> args) +// : base(args) +// { +// string id = Require(args, "id"); +// string direction = Get(args, "direction", new string[] { "forward", "reverse" }, "forward", false); +// int dir = "forward".Equals(direction) ? 
Transliterator.FORWARD : Transliterator.REVERSE; +// transliterator = Transliterator.getInstance(id, dir); +// if (args.Count != 0) +// { +// throw new ArgumentException("Unknown parameters: " + args); +// } +// } + +// public override TokenStream Create(TokenStream input) +// { +// return new ICUTransformFilter(input, transliterator); +// } + +// public virtual AbstractAnalysisFactory GetMultiTermComponent() +// { +// return this; +// } +// } +//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/BreakIteratorWrapper.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/BreakIteratorWrapper.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/BreakIteratorWrapper.cs new file mode 100644 index 0000000..c124a88 --- /dev/null +++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/BreakIteratorWrapper.cs @@ -0,0 +1,166 @@ +// LUCENENET TODO: Port issues - missing dependencies + +//using Icu; +//using Lucene.Net.Analysis.Util; +//using Lucene.Net.Support; +//using System; +//using System.Collections.Generic; +//using System.Linq; +//using System.Text; +//using System.Threading.Tasks; + +//namespace Lucene.Net.Analysis.ICU.Segmentation +//{ +// /// <summary> +// /// Contain all the issues surrounding BreakIterators in ICU in one place. +// /// Basically this boils down to the fact that they aren't very friendly to any +// /// sort of OO design. 
+// /// <para/> +// /// http://bugs.icu-project.org/trac/ticket/5901: RBBI.getRuleStatus(), hoist to +// /// BreakIterator from RuleBasedBreakIterator +// /// <para/> +// /// DictionaryBasedBreakIterator is a subclass of RuleBasedBreakIterator, but +// /// doesn't actually behave as a subclass: it always returns 0 for +// /// getRuleStatus(): +// /// http://bugs.icu-project.org/trac/ticket/4730: Thai RBBI, no boundary type +// /// tags +// /// <para/> +// /// @lucene.experimental +// /// </summary> +// internal abstract class BreakIteratorWrapper +// { +// protected readonly CharArrayIterator textIterator = new CharArrayIterator(); +// protected char[] text; +// protected int start; +// protected int length; + +// public abstract int Next(); +// public abstract int Current { get; } +// public abstract int GetRuleStatus(); +// public abstract void SetText(CharacterIterator text); + +// public void SetText(char[] text, int start, int length) +// { +// this.text = text; +// this.start = start; +// this.length = length; +// textIterator.SetText(text, start, length); +// SetText(textIterator); +// } + +// /** +// * If its a RuleBasedBreakIterator, the rule status can be used for token type. If its +// * any other BreakIterator, the rulestatus method is not available, so treat +// * it like a generic BreakIterator. +// */ +// public static BreakIteratorWrapper Wrap(Icu.BreakIterator breakIterator) +// { +// if (breakIterator is Icu.RuleBasedBreakIterator) +// return new RBBIWrapper((Icu.RuleBasedBreakIterator)breakIterator); +// else +// return new BIWrapper(breakIterator); +// } + +// /** +// * RuleBasedBreakIterator wrapper: RuleBasedBreakIterator (as long as its not +// * a DictionaryBasedBreakIterator) behaves correctly. 
+// */ +// private sealed class RBBIWrapper : BreakIteratorWrapper +// { +// private readonly Icu.RuleBasedBreakIterator rbbi; + +// internal RBBIWrapper(Icu.RuleBasedBreakIterator rbbi) +// { +// this.rbbi = rbbi; +// } + +// public override int Current +// { +// get { return rbbi.Current; } +// } + +// public override int GetRuleStatus() +// { +// return rbbi.GetRuleStatus(); +// } + +// public override int Next() +// { +// return rbbi.Next(); +// } + +// public override void SetText(CharacterIterator text) +// { +// rbbi.SetText(text); +// } +// } + +// /** +// * Generic BreakIterator wrapper: Either the rulestatus method is not +// * available or always returns 0. Calculate a rulestatus here so it behaves +// * like RuleBasedBreakIterator. +// * +// * Note: This is slower than RuleBasedBreakIterator. +// */ +// private sealed class BIWrapper : BreakIteratorWrapper +// { +// private readonly Support.BreakIterator bi; +// private int status; + +// internal BIWrapper(Support.BreakIterator bi) +// { +// this.bi = bi; +// } + +// public override int Current +// { +// get { return bi.Current; } +// } + +// public override int GetRuleStatus() +// { +// return status; +// } + +// public override int Next() +// { +// int current = bi.Current; +// int next = bi.Next(); +// status = CalcStatus(current, next); +// return next; +// } + +// private int CalcStatus(int current, int next) +// { +// if (current == Support.BreakIterator.DONE || next == Support.BreakIterator.DONE) +// return RuleBasedBreakIterator.WORD_NONE; + +// int begin = start + current; +// int end = start + next; + +// int codepoint; +// for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) +// { +// codepoint = UTF16.charAt(text, 0, end, begin); + +// if (UCharacter.isDigit(codepoint)) +// return RuleBasedBreakIterator.WORD_NUMBER; +// else if (UCharacter.isLetter(codepoint)) +// { +// // TODO: try to separately specify ideographic, kana? 
+// // [currently all bundled as letter for this case] +// return RuleBasedBreakIterator.WORD_LETTER; +// } +// } + +// return RuleBasedBreakIterator.WORD_NONE; +// } + +// public override void SetText(CharacterIterator text) +// { +// bi.SetText(text); +// status = RuleBasedBreakIterator.WORD_NONE; +// } +// } +// } +//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CharArrayIterator.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CharArrayIterator.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CharArrayIterator.cs new file mode 100644 index 0000000..209d583 --- /dev/null +++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CharArrayIterator.cs @@ -0,0 +1,134 @@
using Lucene.Net.Support;
using System;
using System.Diagnostics.CodeAnalysis;

namespace Lucene.Net.Analysis.Icu.Segmentation
{
    /// <summary>
    /// Wraps a region of a char[] as a <see cref="CharacterIterator"/> for processing with a BreakIterator.
    /// <para/>
    /// The iterator exposes the region <c>[start, start + length)</c> of the backing array
    /// using region-relative positions: <see cref="BeginIndex"/> is always 0,
    /// <see cref="EndIndex"/> is the region length, and <see cref="Index"/> is the
    /// offset of the cursor from the region start.
    /// <para/>
    /// @lucene.experimental
    /// </summary>
    internal sealed class CharArrayIterator : CharacterIterator
    {
        // Backing buffer supplied through SetText; it is referenced, not copied.
        private char[] array;
        // Offset of the first character of the region inside 'array'.
        private int start;
        // Absolute cursor position inside 'array' (start <= index <= limit).
        private int index;
        // Number of characters in the region.
        private int length;
        // Absolute position one past the last character of the region (start + length).
        private int limit;

        /// <summary>
        /// The backing buffer most recently passed to <see cref="SetText(char[], int, int)"/>
        /// (<c>null</c> until then). The same array instance is returned, not a copy.
        /// </summary>
        [WritableArray]
        [SuppressMessage("Microsoft.Performance", "CA1819", Justification = "Lucene's design requires some writable array properties")]
        public char[] Text
        {
            get
            {
                return array;
            }
        }

        /// <summary>
        /// Offset into <see cref="Text"/> at which the examined region begins.
        /// </summary>
        public int Start
        {
            get { return start; }
        }

        /// <summary>
        /// Number of characters in the examined region.
        /// </summary>
        public int Length
        {
            get { return length; }
        }

        /// <summary>
        /// Set a new region of text to be examined by this iterator
        /// and move the cursor to the beginning of that region.
        /// </summary>
        /// <param name="array">text buffer to examine</param>
        /// <param name="start">offset into buffer</param>
        /// <param name="length"> maximum length to examine</param>
        public void SetText(char[] array, int start, int length)
        {
            // No validation is performed here; the arguments are stored as given.
            this.array = array;
            this.start = start;
            this.index = start;
            this.length = length;
            this.limit = start + length;
        }

        /// <summary>
        /// The character under the cursor, or <c>DONE</c> when the cursor sits at the end of the region.
        /// </summary>
        public override char Current
        {
            get { return (index == limit) ? DONE : array[index]; }
        }

        /// <summary>
        /// Moves the cursor to the first character of the region and returns it
        /// (<c>DONE</c> if the region is empty).
        /// </summary>
        public override char First()
        {
            index = start;
            return Current;
        }

        /// <summary>
        /// First valid region-relative position; always 0.
        /// </summary>
        public override int BeginIndex
        {
            get { return 0; }
        }

        /// <summary>
        /// Region-relative position one past the last character; equals the region length.
        /// </summary>
        public override int EndIndex
        {
            get { return length; }
        }

        /// <summary>
        /// Current cursor position, relative to the start of the region.
        /// </summary>
        public override int Index
        {
            get { return index - start; }
        }

        /// <summary>
        /// Moves the cursor to the last character of the region and returns it
        /// (<c>DONE</c> if the region is empty).
        /// </summary>
        public override char Last()
        {
            // For an empty region there is no last character: park the cursor at the limit.
            index = (limit == start) ? limit : limit - 1;
            return Current;
        }

        /// <summary>
        /// Advances the cursor by one character and returns the character there.
        /// When the end of the region is reached the cursor is clamped to the end and <c>DONE</c> is returned.
        /// </summary>
        public override char Next()
        {
            if (++index >= limit)
            {
                index = limit;
                return DONE;
            }
            else
            {
                return Current;
            }
        }

        /// <summary>
        /// Moves the cursor back by one character and returns the character there.
        /// When the cursor is already at the start of the region it stays there and <c>DONE</c> is returned.
        /// </summary>
        public override char Previous()
        {
            if (--index < start)
            {
                index = start;
                return DONE;
            }
            else
            {
                return Current;
            }
        }

        /// <summary>
        /// Moves the cursor to the given region-relative position and returns the character there
        /// (<c>DONE</c> when <paramref name="position"/> equals <see cref="EndIndex"/>).
        /// </summary>
        /// <param name="position">position in the range <see cref="BeginIndex"/>..<see cref="EndIndex"/> (inclusive)</param>
        /// <exception cref="ArgumentException">if <paramref name="position"/> lies outside that range</exception>
        public override char SetIndex(int position)
        {
            if (position < BeginIndex || position > EndIndex)
                throw new ArgumentException("Illegal Position: " + position);
            index = start + position;
            return Current;
        }

        /// <summary>
        /// Returns the examined region as a string.
        /// <para/>
        /// Only the characters in <c>[Start, Start + Length)</c> are returned, so that positions
        /// reported by <see cref="BeginIndex"/>, <see cref="EndIndex"/> and <see cref="Index"/>
        /// address the same characters in the returned string. (Returning the whole backing
        /// buffer would expose characters outside the region and shift every position
        /// whenever <see cref="Start"/> is non-zero.)
        /// </summary>
        /// <returns>the region text, or an empty string if no buffer has been set</returns>
        public override string GetTextAsString()
        {
            // new string((char[])null) yields ""; keep that behavior for an unset buffer.
            if (array == null)
            {
                return string.Empty;
            }
            return new string(array, start, length);
        }

        /// <summary>
        /// Creates a new iterator over the same backing buffer and region, with the cursor at the same position.
        /// The buffer itself is shared, not copied.
        /// </summary>
        public override object Clone()
        {
            CharArrayIterator clone = new CharArrayIterator();
            clone.SetText(array, start, length);
            clone.index = index;
            return clone;
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CompositeBreakIterator.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CompositeBreakIterator.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CompositeBreakIterator.cs new file mode 100644 index 0000000..a004193 --- /dev/null +++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/CompositeBreakIterator.cs @@ -0,0 +1,132 @@ +// LUCENENET TODO: Port issues - missing dependencies + +//using System; +//using System.Collections.Generic;
+//using System.Linq; +//using System.Text; +//using System.Threading.Tasks; + +//namespace Lucene.Net.Analysis.ICU.Segmentation +//{ +// /// <summary> +// /// An internal BreakIterator for multilingual text, following recommendations +// /// from: UAX #29: Unicode Text Segmentation. (http://unicode.org/reports/tr29/) +// /// <para/> +// /// See http://unicode.org/reports/tr29/#Tailoring for the motivation of this +// /// design. +// /// <para/> +// /// Text is first divided into script boundaries. The processing is then +// /// delegated to the appropriate break iterator for that specific script. +// /// <para/> +// /// This break iterator also allows you to retrieve the ISO 15924 script code +// /// associated with a piece of text. +// /// <para/> +// /// See also UAX #29, UTR #24 +// /// <para/> +// /// @lucene.experimental +// /// </summary> +// internal sealed class CompositeBreakIterator +// { +// private readonly ICUTokenizerConfig config; +// private readonly BreakIteratorWrapper[] wordBreakers = new BreakIteratorWrapper[UScript.CODE_LIMIT]; + +// private BreakIteratorWrapper rbbi; +// private readonly ScriptIterator scriptIterator; + +// private char[] text; + +// public CompositeBreakIterator(ICUTokenizerConfig config) +// { +// this.config = config; +// this.scriptIterator = new ScriptIterator(config.CombineCJ); +// } + +// /** +// * Retrieve the next break position. If the RBBI range is exhausted within the +// * script boundary, examine the next script boundary. +// * +// * @return the next break position or BreakIterator.DONE +// */ +// public int Next() +// { +// int next = rbbi.Next(); +// while (next == Support.BreakIterator.DONE && scriptIterator.Next()) +// { +// rbbi = GetBreakIterator(scriptIterator.GetScriptCode()); +// rbbi.SetText(text, scriptIterator.GetScriptStart(), +// scriptIterator.GetScriptLimit() - scriptIterator.GetScriptStart()); +// next = rbbi.Next(); +// } +// return (next == Support.BreakIterator.DONE) ? 
Support.BreakIterator.DONE : next +// + scriptIterator.GetScriptStart(); +// } + +// /** +// * Retrieve the current break position. +// * +// * @return the current break position or BreakIterator.DONE +// */ +// public int Current +// { +// get +// { +// int current = rbbi.Current; +// return (current == Support.BreakIterator.DONE) ? Support.BreakIterator.DONE : current +// + scriptIterator.GetScriptStart(); +// } +// } + +// /** +// * Retrieve the rule status code (token type) from the underlying break +// * iterator +// * +// * @return rule status code (see RuleBasedBreakIterator constants) +// */ +// public int GetRuleStatus() +// { +// return rbbi.GetRuleStatus(); +// } + +// /** +// * Retrieve the UScript script code for the current token. This code can be +// * decoded with UScript into a name or ISO 15924 code. +// * +// * @return UScript script code for the current token. +// */ +// public int GetScriptCode() +// { +// return scriptIterator.GetScriptCode(); +// } + +// /** +// * Set a new region of text to be examined by this iterator +// * +// * @param text buffer of text +// * @param start offset into buffer +// * @param length maximum length to examine +// */ +// public void SetText(char[] text, int start, int length) +// { +// this.text = text; +// scriptIterator.SetText(text, start, length); +// if (scriptIterator.Next()) +// { +// rbbi = GetBreakIterator(scriptIterator.GetScriptCode()); +// rbbi.SetText(text, scriptIterator.GetScriptStart(), +// scriptIterator.GetScriptLimit() - scriptIterator.GetScriptStart()); +// } +// else +// { +// rbbi = GetBreakIterator(UScript.COMMON); +// rbbi.SetText(text, 0, 0); +// } +// } + +// private BreakIteratorWrapper GetBreakIterator(int scriptCode) +// { +// if (wordBreakers[scriptCode] == null) +// wordBreakers[scriptCode] = BreakIteratorWrapper.Wrap(config.GetBreakIterator(scriptCode)); +// return wordBreakers[scriptCode]; +// } +// } +//} 
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/DefaultICUTokenizerConfig.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/DefaultICUTokenizerConfig.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/DefaultICUTokenizerConfig.cs new file mode 100644 index 0000000..fc2a989 --- /dev/null +++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/DefaultICUTokenizerConfig.cs @@ -0,0 +1,127 @@ +// LUCENENET TODO: Port issues - missing dependencies + +//using Icu; +//using Lucene.Net.Analysis.Standard; +//using Lucene.Net.Support; +//using System; +//using System.Collections.Generic; +//using System.Globalization; +//using System.IO; +//using System.Linq; +//using System.Text; +//using System.Threading.Tasks; + +//namespace Lucene.Net.Analysis.ICU.Segmentation +//{ +// /// <summary> +// /// Default <see cref="ICUTokenizerConfig"/> that is generally applicable +// /// to many languages. 
+// /// </summary> +// /// <remarks> +// /// Generally tokenizes Unicode text according to UAX#29 +// /// ({@link BreakIterator#getWordInstance(ULocale) BreakIterator.getWordInstance(ULocale.ROOT)}), +// /// but with the following tailorings: +// /// <list type="bullet"> +// /// <item><description>Thai, Lao, and CJK text is broken into words with a dictionary.</description></item> +// /// <item><description>Myanmar, and Khmer text is broken into syllables based on custom BreakIterator rules.</description></item> +// /// </list> +// /// <para/> +// /// @lucene.experimental +// /// </remarks> +// public class DefaultICUTokenizerConfig : ICUTokenizerConfig +// { +// /** Token type for words containing ideographic characters */ +// public static readonly string WORD_IDEO = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC]; +// /** Token type for words containing Japanese hiragana */ +// public static readonly string WORD_HIRAGANA = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA]; +// /** Token type for words containing Japanese katakana */ +// public static readonly string WORD_KATAKANA = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA]; +// /** Token type for words containing Korean hangul */ +// public static readonly string WORD_HANGUL = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL]; +// /** Token type for words that contain letters */ +// public static readonly string WORD_LETTER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM]; +// /** Token type for words that appear to be numbers */ +// public static readonly string WORD_NUMBER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM]; + +// /* +// * the default breakiterators in use. these can be expensive to +// * instantiate, cheap to clone. 
+// */ +// // we keep the cjk breaking separate, that's because it cannot be customized (because dictionary +// // is only triggered when kind = WORD, but kind = LINE by default and we have no non-evil way to change it) +// private static readonly Icu.BreakIterator cjkBreakIterator = new Icu.RuleBasedBreakIterator(Icu.BreakIterator.UBreakIteratorType.WORD, new Locale()); //BreakIterator.getWordInstance(ULocale.ROOT); +// // the same as ROOT, except no dictionary segmentation for cjk +// private static readonly Icu.BreakIterator defaultBreakIterator = +// ReadBreakIterator("Default.brk"); +// private static readonly Icu.BreakIterator khmerBreakIterator = +// ReadBreakIterator("Khmer.brk"); +// private static readonly Icu.BreakIterator myanmarBreakIterator = +// ReadBreakIterator("Myanmar.brk"); + +// // TODO: deprecate this boolean? you only care if you are doing super-expert stuff... +// private readonly bool cjkAsWords; + +// /** +// * Creates a new config. This object is lightweight, but the first +// * time the class is referenced, breakiterators will be initialized. +// * @param cjkAsWords true if cjk text should undergo dictionary-based segmentation, +// * otherwise text will be segmented according to UAX#29 defaults. +// * If this is true, all Han+Hiragana+Katakana words will be tagged as +// * IDEOGRAPHIC. 
+// */ +// public DefaultICUTokenizerConfig(bool cjkAsWords) +// { +// this.cjkAsWords = cjkAsWords; +// } + +// public override bool CombineCJ +// { +// get { return cjkAsWords; } +// } + +// public override Icu.BreakIterator GetBreakIterator(int script) +// { +// switch (script) +// { +// case UScript.KHMER: return (Icu.BreakIterator)khmerBreakIterator.Clone(); +// case UScript.MYANMAR: return (Icu.BreakIterator)myanmarBreakIterator.Clone(); +// case UScript.JAPANESE: return (Icu.BreakIterator)cjkBreakIterator.Clone(); +// default: return (Icu.BreakIterator)defaultBreakIterator.Clone(); +// } +// } + +// public override string GetType(int script, int ruleStatus) +// { +// switch (ruleStatus) +// { +// case RuleBasedBreakIterator.WORD_IDEO: +// return WORD_IDEO; +// case RuleBasedBreakIterator.WORD_KANA: +// return script == UScript.HIRAGANA ? WORD_HIRAGANA : WORD_KATAKANA; +// case RuleBasedBreakIterator.WORD_LETTER: +// return script == UScript.HANGUL ? WORD_HANGUL : WORD_LETTER; +// case RuleBasedBreakIterator.WORD_NUMBER: +// return WORD_NUMBER; +// default: /* some other custom code */ +// return "<OTHER>"; +// } +// } + +// private static RuleBasedBreakIterator ReadBreakIterator(string filename) +// { +// Stream @is = +// typeof(DefaultICUTokenizerConfig).Assembly.FindAndGetManifestResourceStream(typeof(DefaultICUTokenizerConfig), filename); +// try +// { +// RuleBasedBreakIterator bi = +// RuleBasedBreakIterator.GetInstanceFromCompiledRules(@is); +// @is.Dispose(); +// return bi; +// } +// catch (IOException e) +// { +// throw new Exception(e.ToString(), e); +// } +// } +// } +//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizer.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizer.cs new file mode 100644 
index 0000000..7677c0c --- /dev/null +++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizer.cs @@ -0,0 +1,229 @@ +// LUCENENET TODO: Port issues - missing dependencies + +//using Lucene.Net.Analysis.ICU.TokenAttributes; +//using Lucene.Net.Analysis.TokenAttributes; +//using System; +//using System.Collections.Generic; +//using System.Diagnostics; +//using System.IO; +//using System.Linq; +//using System.Text; +//using System.Threading.Tasks; + +//namespace Lucene.Net.Analysis.ICU.Segmentation +//{ +// /// <summary> +// /// Breaks text into words according to UAX #29: Unicode Text Segmentation +// /// (http://www.unicode.org/reports/tr29/) +// /// <para/> +// /// Words are broken across script boundaries, then segmented according to +// /// the BreakIterator and typing provided by the <see cref="ICUTokenizerConfig"/> +// /// <para/> +// /// @lucene.experimental +// /// </summary> +// /// <seealso cref="ICUTokenizerConfig"/> +// public sealed class ICUTokenizer : Tokenizer +// { +// private static readonly int IOBUFFER = 4096; +// private readonly char[] buffer = new char[IOBUFFER]; +// /** true length of text in the buffer */ +// private int length = 0; +// /** length in buffer that can be evaluated safely, up to a safe end point */ +// private int usableLength = 0; +// /** accumulated offset of previous buffers for this reader, for offsetAtt */ +// private int offset = 0; + +// private readonly CompositeBreakIterator breaker; /* tokenizes a char[] of text */ +// private readonly ICUTokenizerConfig config; +// private readonly IOffsetAttribute offsetAtt; +// private readonly ICharTermAttribute termAtt; +// private readonly ITypeAttribute typeAtt; +// private readonly IScriptAttribute scriptAtt; + +// /** +// * Construct a new ICUTokenizer that breaks text into words from the given +// * Reader. +// * <p> +// * The default script-specific handling is used. +// * <p> +// * The default attribute factory is used. 
+// * +// * @param input Reader containing text to tokenize. +// * @see DefaultICUTokenizerConfig +// */ +// public ICUTokenizer(TextReader input) +// : this(input, new DefaultICUTokenizerConfig(true)) +// { +// } + +// /** +// * Construct a new ICUTokenizer that breaks text into words from the given +// * Reader, using a tailored BreakIterator configuration. +// * <p> +// * The default attribute factory is used. +// * +// * @param input Reader containing text to tokenize. +// * @param config Tailored BreakIterator configuration +// */ +// public ICUTokenizer(TextReader input, ICUTokenizerConfig config) +// : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, config) +// { +// } + +// /** +// * Construct a new ICUTokenizer that breaks text into words from the given +// * Reader, using a tailored BreakIterator configuration. +// * +// * @param factory AttributeFactory to use +// * @param input Reader containing text to tokenize. +// * @param config Tailored BreakIterator configuration +// */ +// public ICUTokenizer(AttributeFactory factory, TextReader input, ICUTokenizerConfig config) +// : base(factory, input) +// { +// this.config = config; +// breaker = new CompositeBreakIterator(config); + +// this.offsetAtt = AddAttribute<IOffsetAttribute>(); +// this.termAtt = AddAttribute<ICharTermAttribute>(); +// this.typeAtt = AddAttribute<ITypeAttribute>(); +// this.scriptAtt = AddAttribute<IScriptAttribute>(); +// } + + +// public override bool IncrementToken() +// { +// ClearAttributes(); +// if (length == 0) +// Refill(); +// while (!IncrementTokenBuffer()) +// { +// Refill(); +// if (length <= 0) // no more bytes to read; +// return false; +// } +// return true; +// } + + +// public override void Reset() +// { +// base.Reset(); +// breaker.SetText(buffer, 0, 0); +// length = usableLength = offset = 0; +// } + +// public override void End() +// { +// base.End(); +// int finalOffset = (length < 0) ? 
offset : offset + length; +// offsetAtt.SetOffset(CorrectOffset(finalOffset), CorrectOffset(finalOffset)); +// } + +// /* +// * This tokenizes text based upon the longest matching rule, and because of +// * this, isn't friendly to a Reader. +// * +// * Text is read from the input stream in 4kB chunks. Within a 4kB chunk of +// * text, the last unambiguous break point is found (in this implementation: +// * white space character) Any remaining characters represent possible partial +// * words, so are appended to the front of the next chunk. +// * +// * There is the possibility that there are no unambiguous break points within +// * an entire 4kB chunk of text (binary data). So there is a maximum word limit +// * of 4kB since it will not try to grow the buffer in this case. +// */ + +// /** +// * Returns the last unambiguous break position in the text. +// * +// * @return position of character, or -1 if one does not exist +// */ +// private int FindSafeEnd() +// { +// for (int i = length - 1; i >= 0; i--) +// if (char.IsWhiteSpace(buffer[i])) +// return i + 1; +// return -1; +// } + +// /** +// * Refill the buffer, accumulating the offset and setting usableLength to the +// * last unambiguous break position +// * +// * @throws IOException If there is a low-level I/O error. 
+// */ +// private void Refill() +// { +// offset += usableLength; +// int leftover = length - usableLength; +// System.Array.Copy(buffer, usableLength, buffer, 0, leftover); +// int requested = buffer.Length - leftover; +// int returned = Read(m_input, buffer, leftover, requested); +// length = returned + leftover; +// if (returned < requested) /* reader has been emptied, process the rest */ +// usableLength = length; +// else +// { /* still more data to be read, find a safe-stopping place */ +// usableLength = FindSafeEnd(); +// if (usableLength < 0) +// usableLength = length; /* +// * more than IOBUFFER of text without space, +// * gonna possibly truncate tokens +// */ +// } + +// breaker.SetText(buffer, 0, Math.Max(0, usableLength)); +// } + +// // TODO: refactor to a shared readFully somewhere +// // (NGramTokenizer does this too): +// /** commons-io's readFully, but without bugs if offset != 0 */ +// private static int Read(TextReader input, char[] buffer, int offset, int length) +// { +// Debug.Assert(length >= 0, "length must not be negative: " + length); + +// int remaining = length; +// while (remaining > 0) +// { +// int location = length - remaining; +// int count = input.Read(buffer, offset + location, remaining); +// if (-1 == count) +// { // EOF +// break; +// } +// remaining -= count; +// } +// return length - remaining; +// } + +// /* +// * return true if there is a token from the buffer, or false if it is +// * exhausted. 
+// */ +// private bool IncrementTokenBuffer() +// { +// int start = breaker.Current; +// if (start == Support.BreakIterator.DONE) +// return false; // BreakIterator exhausted + +// // find the next set of boundaries, skipping over non-tokens (rule status 0) +// int end = breaker.Next(); +// while (start != Support.BreakIterator.DONE && breaker.GetRuleStatus() == 0) +// { +// start = end; +// end = breaker.Next(); +// } + +// if (start == Support.BreakIterator.DONE) +// return false; // BreakIterator exhausted + +// termAtt.CopyBuffer(buffer, start, end - start); +// offsetAtt.SetOffset(CorrectOffset(offset + start), CorrectOffset(offset + end)); +// typeAtt.Type = config.GetType(breaker.GetScriptCode(), breaker.GetRuleStatus()); +// scriptAtt.Code = breaker.GetScriptCode(); + +// return true; +// } +// } +//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerConfig.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerConfig.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerConfig.cs new file mode 100644 index 0000000..0c13316 --- /dev/null +++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerConfig.cs @@ -0,0 +1,33 @@ +// LUCENENET TODO: Port issues - missing dependencies + +//using Lucene.Net.Support; + +//namespace Lucene.Net.Analysis.ICU.Segmentation +//{ +// /// <summary> +// /// Class that allows for tailored Unicode Text Segmentation on +// /// a per-writing system basis. +// /// <para/> +// /// @lucene.experimental +// /// </summary> +// public abstract class ICUTokenizerConfig +// { +// /// <summary> +// /// Sole constructor. (For invocation by subclass +// /// constructors, typically implicit.) +// /// </summary> +// public ICUTokenizerConfig() { } +// /// <summary> +// /// Return a breakiterator capable of processing a given script. 
+// /// </summary> +// public abstract Icu.BreakIterator GetBreakIterator(int script); +// /// <summary> +// /// Return a token type value for a given script and BreakIterator rule status. +// /// </summary> +// public abstract string GetType(int script, int ruleStatus); +// /// <summary> +// /// true if Han, Hiragana, and Katakana scripts should all be returned as Japanese +// /// </summary> +// public abstract bool CombineCJ { get; } +// } +//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerFactory.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerFactory.cs new file mode 100644 index 0000000..14aa9c0 --- /dev/null +++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ICUTokenizerFactory.cs @@ -0,0 +1,139 @@ +// LUCENENET TODO: Port issues - missing dependencies + +//using Icu; +//using Lucene.Net.Analysis.Util; +//using Lucene.Net.Support; +//using Lucene.Net.Util; +//using System; +//using System.Collections.Generic; +//using System.Diagnostics; +//using System.IO; +//using System.Linq; +//using System.Text; +//using System.Threading.Tasks; + +//namespace Lucene.Net.Analysis.ICU.Segmentation +//{ + +// public class ICUTokenizerFactory : TokenizerFactory, IResourceLoaderAware +// { +// internal static readonly string RULEFILES = "rulefiles"; +// private readonly IDictionary<int, string> tailored; +// private ICUTokenizerConfig config; +// private readonly bool cjkAsWords; + +// /// <summary>Creates a new ICUTokenizerFactory</summary> +// public ICUTokenizerFactory(IDictionary<string, string> args) +// : base(args) +// { +// tailored = new Dictionary<int, string>(); +// string rulefilesArg = Get(args, RULEFILES); +// if (rulefilesArg != null) +// { +// IList<string> scriptAndResourcePaths = 
SplitFileNames(rulefilesArg); +// foreach (string scriptAndResourcePath in scriptAndResourcePaths) +// { +// int colonPos = scriptAndResourcePath.IndexOf(":"); +// string scriptCode = scriptAndResourcePath.Substring(0, colonPos - 0).Trim(); +// string resourcePath = scriptAndResourcePath.Substring(colonPos + 1).Trim(); +// tailored[UCharacter.getPropertyValueEnum(UProperty.SCRIPT, scriptCode)] = resourcePath; +// } +// } +// cjkAsWords = GetBoolean(args, "cjkAsWords", true); +// if (args.Count != 0) +// { +// throw new ArgumentException("Unknown parameters: " + args); +// } +// } + +// public virtual void Inform(IResourceLoader loader) +// { +// Debug.Assert(tailored != null, "init must be called first!"); +// if (tailored.Count == 0) +// { +// config = new DefaultICUTokenizerConfig(cjkAsWords); +// } +// else +// { +// config = new DefaultICUTokenizerConfigAnonymousHelper(cjkAsWords, tailored, loader); + +// //BreakIterator[] breakers = new BreakIterator[UScript.CODE_LIMIT]; +// //foreach (var entry in tailored) +// //{ +// // int code = entry.Key; +// // string resourcePath = entry.Value; +// // breakers[code] = ParseRules(resourcePath, loader); +// //} +// // config = new DefaultICUTokenizerConfig(cjkAsWords) +// // { + +// // public override BreakIterator GetBreakIterator(int script) +// // { +// // if (breakers[script] != null) +// // { +// // return (BreakIterator)breakers[script].clone(); +// // } +// // else +// // { +// // return base.GetBreakIterator(script); +// // } +// // } +// // // TODO: we could also allow codes->types mapping +// //}; +// } +// } + +// private class DefaultICUTokenizerConfigAnonymousHelper : DefaultICUTokenizerConfig +// { +// private readonly Icu.BreakIterator[] breakers; +// public DefaultICUTokenizerConfigAnonymousHelper(bool cjkAsWords, IDictionary<int, string> tailored, IResourceLoader loader) +// : base(cjkAsWords) +// { +// breakers = new Icu.BreakIterator[UScript.CODE_LIMIT]; +// foreach (var entry in tailored) +// { +// 
int code = entry.Key; +// string resourcePath = entry.Value; +// breakers[code] = ParseRules(resourcePath, loader); +// } +// } + +// public override Icu.BreakIterator GetBreakIterator(int script) +// { +// if (breakers[script] != null) +// { +// return (Icu.BreakIterator)breakers[script].Clone(); +// } +// else +// { +// return base.GetBreakIterator(script); +// } +// } + +// private Icu.BreakIterator ParseRules(string filename, IResourceLoader loader) +// { +// StringBuilder rules = new StringBuilder(); +// Stream rulesStream = loader.OpenResource(filename); +// using (TextReader reader = IOUtils.GetDecodingReader(rulesStream, Encoding.UTF8)) +// { +// string line = null; +// while ((line = reader.ReadLine()) != null) +// { +// if (!line.StartsWith("#", StringComparison.Ordinal)) +// { +// rules.Append(line); +// } +// rules.Append('\n'); +// } +// } +// return new RuleBasedBreakIterator(rules.ToString()); +// } +// } + +// public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader input) +// { +// Debug.Assert(config != null, "inform must be called first!"); +// return new ICUTokenizer(factory, input, config); +// } +// } +//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ScriptIterator.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ScriptIterator.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ScriptIterator.cs new file mode 100644 index 0000000..f328851 --- /dev/null +++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/Segmentation/ScriptIterator.cs @@ -0,0 +1,206 @@ +// LUCENENET TODO: Port issues - missing dependencies + +//using System; +//using System.Collections.Generic; +//using System.Linq; +//using System.Text; +//using System.Text.RegularExpressions; +//using System.Threading.Tasks; + +//namespace Lucene.Net.Analysis.ICU.Segmentation +//{ +// /// <summary> 
+// /// An iterator that locates ISO 15924 script boundaries in text. +// /// </summary> +// /// <remarks> +// /// This is not the same as simply looking at the Unicode block, or even the +// /// Script property. Some characters are 'common' across multiple scripts, and +// /// some 'inherit' the script value of text surrounding them. +// /// <para/> +// /// This is similar to ICU (internal-only) UScriptRun, with the following +// /// differences: +// /// <list type="bullet"> +// /// <item><description> +// /// Doesn't attempt to match paired punctuation. For tokenization purposes, this +// /// is not necessary. It's also quite expensive. +// /// </description></item> +// /// <item><description> +// /// Non-spacing marks inherit the script of their base character, following +// /// recommendations from UTR #24. +// /// </description></item> +// /// </list> +// /// <para/> +// /// @lucene.experimental +// /// </remarks> +// internal sealed class ScriptIterator +// { +// private char[] text; +// private int start; +// private int limit; +// private int index; + +// private int scriptStart; +// private int scriptLimit; +// private int scriptCode; + +// private readonly bool combineCJ; + +// /** +// * @param combineCJ if true: Han,Hiragana,Katakana will all return as {@link UScript#JAPANESE} +// */ +// internal ScriptIterator(bool combineCJ) +// { +// this.combineCJ = combineCJ; +// } + +// /** +// * Get the start of this script run +// * +// * @return start position of script run +// */ +// public int ScriptStart +// { +// get { return scriptStart; } +// } + +// /** +// * Get the index of the first character after the end of this script run +// * +// * @return position of the first character after this script run +// */ +// public int ScriptLimit +// { +// get { return scriptLimit; } +// } + +// /** +// * Get the UScript script code for this script run +// * +// * @return code for the script of the current run +// */ +// public int ScriptCode +// { +// get { return 
scriptCode; } +// } + +// /** +// * Iterates to the next script run, returning true if one exists. +// * +// * @return true if there is another script run, false otherwise. +// */ +// public bool Next() +// { +// if (scriptLimit >= limit) +// return false; + +// scriptCode = UScript.COMMON; +// scriptStart = scriptLimit; + +// while (index < limit) +// { +// //int ch = UTF16.charAt(text, start, limit, index - start); +// int ch = Encoding.Unicode.(text, start, limit); +// int sc = GetScript(ch); + +// /* +// * From UTR #24: Implementations that determine the boundaries between +// * characters of given scripts should never break between a non-spacing +// * mark and its base character. Thus for boundary determinations and +// * similar sorts of processing, a non-spacing mark — whatever its script +// * value — should inherit the script value of its base character. +// */ +// if (isSameScript(scriptCode, sc) +// || UCharacter.getType(ch) == ECharacterCategory.NON_SPACING_MARK) +// { +// //index += UTF16.getCharCount(ch); +// index += Encoding.Unicode.GetCharCount() + +// /* +// * Inherited or Common becomes the script code of the surrounding text. +// */ +// if (scriptCode <= UScript.INHERITED && sc > UScript.INHERITED) +// { +// scriptCode = sc; +// } + +// } +// else +// { +// break; +// } +// } + +// scriptLimit = index; +// return true; +// } + +// /** Determine if two scripts are compatible. 
*/ +// private static bool IsSameScript(int scriptOne, int scriptTwo) +// { +// return scriptOne <= UScript.INHERITED || scriptTwo <= UScript.INHERITED +// || scriptOne == scriptTwo; +// } + +// /** +// * Set a new region of text to be examined by this iterator +// * +// * @param text text buffer to examine +// * @param start offset into buffer +// * @param length maximum length to examine +// */ +// public void SetText(char[] text, int start, int length) +// { +// this.text = text; +// this.start = start; +// this.index = start; +// this.limit = start + length; +// this.scriptStart = start; +// this.scriptLimit = start; +// this.scriptCode = UScript.INVALID_CODE; +// } + +// /** linear fast-path for basic latin case */ +// private static readonly int[] basicLatin = new int[128]; + +// static ScriptIterator() +// { +// for (int i = 0; i < basicLatin.Length; i++) +// basicLatin[i] = UScript.GetScript(i); +// } + +// /** fast version of UScript.getScript(). Basic Latin is an array lookup */ +// private int GetScript(int codepoint) +// { +// if (0 <= codepoint && codepoint < basicLatin.Length) +// { +// return basicLatin[codepoint]; +// } +// else +// { +// //int script = UScript.GetScript(codepoint); +// if (combineCJ) +// { +// if (Regex.IsMatch(new string(Support.Character.ToChars(codepoint)), @"\p{IsHangulCompatibilityJamo}+|\p{IsHiragana}+|\p{IsKatakana}+")) +// //if (script == UScript.HAN || script == UScript.HIRAGANA || script == UScript.KATAKANA) +// { +// return UScript.JAPANESE; +// } +// else if (codepoint >= 0xFF10 && codepoint <= 0xFF19) +// { +// // when using CJK dictionary breaking, don't let full width numbers go to it, otherwise +// // they are treated as punctuation. we currently have no cleaner way to fix this! 
+// return UScript.LATIN; +// } +// else +// { +// return script; +// } +// } +// else +// { +// return script; +// } +// } +// } +// } +//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/Icu/TokenAttributes/ScriptAttribute.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/TokenAttributes/ScriptAttribute.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/TokenAttributes/ScriptAttribute.cs new file mode 100644 index 0000000..abc1ae2 --- /dev/null +++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/TokenAttributes/ScriptAttribute.cs @@ -0,0 +1,42 @@ +// LUCENENET TODO: Port issues - missing dependencies + +//using Lucene.Net.Util; +//using System; +//using System.Collections.Generic; +//using System.Linq; +//using System.Text; +//using System.Threading.Tasks; + +//namespace Lucene.Net.Analysis.ICU.TokenAttributes +//{ +// /// <summary> +// /// This attribute stores the UTR #24 script value for a token of text. +// /// <para/> +// /// @lucene.experimental +// /// </summary> +// public interface IScriptAttribute : IAttribute +// { +// /** +// * Get the numeric code for this script value. +// * This is the constant value from {@link UScript}. +// * @return numeric code +// */ +// int Code { get; set; } +// ///** +// // * Set the numeric code for this script value. +// // * This is the constant value from {@link UScript}. +// // * @param code numeric code +// // */ +// //public void setCode(int code); +// /** +// * Get the full name. +// * @return UTR #24 full name. +// */ +// string GetName(); +// /** +// * Get the abbreviated name. +// * @return UTR #24 abbreviated name. 
+// */ +// string GetShortName(); +// } +//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/Icu/TokenAttributes/ScriptAttributeImpl.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/TokenAttributes/ScriptAttributeImpl.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/TokenAttributes/ScriptAttributeImpl.cs new file mode 100644 index 0000000..f97ccf1 --- /dev/null +++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/TokenAttributes/ScriptAttributeImpl.cs @@ -0,0 +1,80 @@ +// LUCENENET TODO: Port issues - missing dependencies + +//using Lucene.Net.Util; +//using System.Collections.Generic; +//using System.Linq; +//using System.Text; +//using System.Threading.Tasks; + +//namespace Lucene.Net.Analysis.ICU.TokenAttributes +//{ +// /// <summary> +// /// Implementation of <see cref="IScriptAttribute"/> that stores the script +// /// as an integer. +// /// <para/> +// /// @lucene.experimental +// /// </summary> +// public class ScriptAttribute : Attribute, IScriptAttribute, System.ICloneable +// { +// private int code = UScript.COMMON; + +// /** Initializes this attribute with <code>UScript.COMMON</code> */ +// public ScriptAttribute() { } + +// public virtual int Code +// { +// get { return code; } +// set { code = value; } +// } + +// public virtual string GetName() +// { +// return UScript.GetName(code); +// } + +// public virtual string GetShortName() +// { +// return UScript.GetShortName(code); +// } + +// public override void Clear() +// { +// code = UScript.COMMON; +// } + +// public override void CopyTo(IAttribute target) +// { +// ScriptAttribute t = (ScriptAttribute)target; +// t.Code = code; +// } + +// public override bool Equals(object other) +// { +// if (this == other) +// { +// return true; +// } + +// if (other is ScriptAttribute) +// { +// return ((ScriptAttribute)other).code == code; +// } + +// return false; +// } + +// public override 
int GetHashCode() +// { +// return code; +// } + +// public override void ReflectWith(IAttributeReflector reflector) +// { +// // when wordbreaking CJK, we use the 15924 code Japanese (Han+Hiragana+Katakana) to +// // mark runs of Chinese/Japanese. our use is correct (as for chinese Han is a subset), +// // but this is just to help prevent confusion. +// string name = code == UScript.JAPANESE ? "Chinese/Japanese" : GetName(); +// reflector.Reflect<IScriptAttribute>("script", name); +// } +// } +//}
