Lucene.Net.Analysis.ICU: Renamed ICU directory to Icu to match namespace conventions
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/1191c20d Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/1191c20d Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/1191c20d Branch: refs/heads/master Commit: 1191c20dfb3761f7fd5205b3708f3479fbfa8b01 Parents: 60e8125 Author: Shad Storhaug <[email protected]> Authored: Mon Sep 11 03:56:06 2017 +0700 Committer: Shad Storhaug <[email protected]> Committed: Mon Sep 11 05:00:23 2017 +0700 ---------------------------------------------------------------------- .../Analysis/ICU/ICUFoldingFilter.cs | 32 --- .../Analysis/ICU/ICUFoldingFilterFactory.cs | 31 --- .../Analysis/ICU/ICUNormalizer2CharFilter.cs | 235 ------------------- .../ICU/ICUNormalizer2CharFilterFactory.cs | 60 ----- .../Analysis/ICU/ICUNormalizer2Filter.cs | 60 ----- .../Analysis/ICU/ICUNormalizer2FilterFactory.cs | 59 ----- .../Analysis/ICU/ICUTransformFilter.cs | 152 ------------ .../Analysis/ICU/ICUTransformFilterFactory.cs | 38 --- .../ICU/Segmentation/BreakIteratorWrapper.cs | 166 ------------- .../ICU/Segmentation/CharArrayIterator.cs | 134 ----------- .../ICU/Segmentation/CompositeBreakIterator.cs | 132 ----------- .../Segmentation/DefaultICUTokenizerConfig.cs | 127 ---------- .../Analysis/ICU/Segmentation/ICUTokenizer.cs | 229 ------------------ .../ICU/Segmentation/ICUTokenizerConfig.cs | 33 --- .../ICU/Segmentation/ICUTokenizerFactory.cs | 139 ----------- .../Analysis/ICU/Segmentation/ScriptIterator.cs | 206 ---------------- .../ICU/TokenAttributes/ScriptAttribute.cs | 42 ---- .../ICU/TokenAttributes/ScriptAttributeImpl.cs | 80 ------- .../Analysis/Icu/ICUFoldingFilter.cs | 32 +++ .../Analysis/Icu/ICUFoldingFilterFactory.cs | 31 +++ .../Analysis/Icu/ICUNormalizer2CharFilter.cs | 235 +++++++++++++++++++ .../Icu/ICUNormalizer2CharFilterFactory.cs | 60 +++++ .../Analysis/Icu/ICUNormalizer2Filter.cs | 60 +++++ 
.../Analysis/Icu/ICUNormalizer2FilterFactory.cs | 59 +++++ .../Analysis/Icu/ICUTransformFilter.cs | 152 ++++++++++++ .../Analysis/Icu/ICUTransformFilterFactory.cs | 38 +++ .../Icu/Segmentation/BreakIteratorWrapper.cs | 166 +++++++++++++ .../Icu/Segmentation/CharArrayIterator.cs | 134 +++++++++++ .../Icu/Segmentation/CompositeBreakIterator.cs | 132 +++++++++++ .../Segmentation/DefaultICUTokenizerConfig.cs | 127 ++++++++++ .../Analysis/Icu/Segmentation/ICUTokenizer.cs | 229 ++++++++++++++++++ .../Icu/Segmentation/ICUTokenizerConfig.cs | 33 +++ .../Icu/Segmentation/ICUTokenizerFactory.cs | 139 +++++++++++ .../Analysis/Icu/Segmentation/ScriptIterator.cs | 206 ++++++++++++++++ .../Icu/TokenAttributes/ScriptAttribute.cs | 42 ++++ .../Icu/TokenAttributes/ScriptAttributeImpl.cs | 80 +++++++ 36 files changed, 1955 insertions(+), 1955 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUFoldingFilter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUFoldingFilter.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUFoldingFilter.cs deleted file mode 100644 index 4ca8278..0000000 --- a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUFoldingFilter.cs +++ /dev/null @@ -1,32 +0,0 @@ -// LUCENENET TODO: Port issues - missing Normalizer2 dependency from icu.net - -//using Icu; -//using Lucene.Net.Support; -//using System; -//using System.Collections.Generic; -//using System.Linq; -//using System.Text; -//using System.Threading.Tasks; - -//namespace Lucene.Net.Analysis.ICU -//{ -// public sealed class ICUFoldingFilter : ICUNormalizer2Filter -// { -// private static readonly Normalizer2 normalizer; - -// /// <summary> -// /// Create a new ICUFoldingFilter on the specified input -// /// </summary> -// public ICUFoldingFilter(TokenStream input) -// : base(input, normalizer) -// 
{ -// } - -// static ICUFoldingFilter() -// { -// normalizer = Normalizer2.GetInstance( -// typeof(ICUFoldingFilter).Assembly.FindAndGetManifestResourceStream(typeof(ICUFoldingFilter), "utr30.nrm"), -// "utr30", Normalizer2.Mode.COMPOSE); -// } -// } -//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUFoldingFilterFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUFoldingFilterFactory.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUFoldingFilterFactory.cs deleted file mode 100644 index c25cf93..0000000 --- a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUFoldingFilterFactory.cs +++ /dev/null @@ -1,31 +0,0 @@ -// LUCENENET TODO: Port issues - missing Normalizer2 dependency from icu.net - -//using Lucene.Net.Analysis.Util; -//using System; -//using System.Collections.Generic; - -//namespace Lucene.Net.Analysis.ICU -//{ -// public class ICUFoldingFilterFactory : TokenFilterFactory, IMultiTermAwareComponent -// { -// /// <summary>Creates a new ICUFoldingFilterFactory</summary> -// public ICUFoldingFilterFactory(IDictionary<string, string> args) -// : base(args) -// { -// if (args.Count != 0) -// { -// throw new ArgumentException("Unknown parameters: " + args); -// } -// } - -// public override TokenStream Create(TokenStream input) -// { -// return new ICUFoldingFilter(input); -// } - -// public virtual AbstractAnalysisFactory GetMultiTermComponent() -// { -// return this; -// } -// } -//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2CharFilter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2CharFilter.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2CharFilter.cs deleted file mode 100644 index 4254298..0000000 --- 
a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2CharFilter.cs +++ /dev/null @@ -1,235 +0,0 @@ -// LUCENENET TODO: Port issues - missing Normalizer2 dependency from icu.net - -//using Lucene.Net.Analysis.CharFilters; -//using Lucene.Net.Support; -//using System; -//using System.Collections.Generic; -//using System.IO; -//using System.Linq; -//using System.Text; -//using System.Threading.Tasks; - -//namespace Lucene.Net.Analysis.ICU -//{ -// /// <summary> -// /// Normalize token text with ICU's <see cref="Normalizer2"/>. -// /// </summary> -// public sealed class ICUNormalizer2CharFilter : BaseCharFilter -// { -// private static readonly int IO_BUFFER_SIZE = 128; - -// private readonly Normalizer2 normalizer; -// private readonly StringBuilder inputBuffer = new StringBuilder(); -// private readonly StringBuilder resultBuffer = new StringBuilder(); - -// private bool inputFinished; -// private bool afterQuickCheckYes; -// private int checkedInputBoundary; -// private int charCount; - - -// /** -// * Create a new Normalizer2CharFilter that combines NFKC normalization, Case -// * Folding, and removes Default Ignorables (NFKC_Casefold) -// */ -// public ICUNormalizer2CharFilter(TextReader input) -// : this(input, new Normalizer2(Icu.Normalizer.UNormalizationMode.UNORM_NFKC) /*Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE)*/) -// { -// } - -// /** -// * Create a new Normalizer2CharFilter with the specified Normalizer2 -// * @param in text -// * @param normalizer normalizer to use -// */ -// public ICUNormalizer2CharFilter(TextReader input, Normalizer2 normalizer) -// : base(input) -// { -// if (normalizer == null) -// { -// throw new ArgumentNullException("normalizer"); -// } -// this.normalizer = normalizer; -// } - -// public override int Read(char[] cbuf, int off, int len) -// { -// if (off < 0) throw new ArgumentException("off < 0"); -// if (off >= cbuf.Length) throw new ArgumentException("off >= cbuf.length"); -// if (len <= 0) throw new 
ArgumentException("len <= 0"); - -// while (!inputFinished || inputBuffer.Length > 0 || resultBuffer.Length > 0) -// { -// int retLen; - -// if (resultBuffer.Length > 0) -// { -// retLen = OutputFromResultBuffer(cbuf, off, len); -// if (retLen > 0) -// { -// return retLen; -// } -// } - -// int resLen = ReadAndNormalizeFromInput(); -// if (resLen > 0) -// { -// retLen = OutputFromResultBuffer(cbuf, off, len); -// if (retLen > 0) -// { -// return retLen; -// } -// } - -// ReadInputToBuffer(); -// } - -// return -1; -// } - -// private readonly char[] tmpBuffer = new char[IO_BUFFER_SIZE]; - -// private int ReadInputToBuffer() -// { -// int len = m_input.Read(tmpBuffer, 0, tmpBuffer.Length); -// if (len == -1) -// { -// inputFinished = true; -// return 0; -// } -// inputBuffer.Append(tmpBuffer, 0, len); - -// // if checkedInputBoundary was at the end of a buffer, we need to check that char again -// checkedInputBoundary = Math.Max(checkedInputBoundary - 1, 0); -// // this loop depends on 'isInert' (changes under normalization) but looks only at characters. 
-// // so we treat all surrogates as non-inert for simplicity -// if (normalizer.IsInert(tmpBuffer[len - 1]) && !char.IsSurrogate(tmpBuffer[len - 1])) -// { -// return len; -// } -// else return len + ReadInputToBuffer(); -// } - -// private int ReadAndNormalizeFromInput() -// { -// if (inputBuffer.Length <= 0) -// { -// afterQuickCheckYes = false; -// return 0; -// } -// if (!afterQuickCheckYes) -// { -// int resLen2 = ReadFromInputWhileSpanQuickCheckYes(); -// afterQuickCheckYes = true; -// if (resLen2 > 0) return resLen2; -// } -// int resLen = ReadFromIoNormalizeUptoBoundary(); -// if (resLen > 0) -// { -// afterQuickCheckYes = false; -// } -// return resLen; -// } - -// private int ReadFromInputWhileSpanQuickCheckYes() -// { -// int end = normalizer.spanQuickCheckYes(inputBuffer); -// if (end > 0) -// { -// //resultBuffer.Append(inputBuffer.subSequence(0, end)); -// resultBuffer.Append(inputBuffer.ToString(0, end)); -// //inputBuffer.delete(0, end); -// inputBuffer.Remove(0, end); -// checkedInputBoundary = Math.Max(checkedInputBoundary - end, 0); -// charCount += end; -// } -// return end; -// } - -// private int ReadFromIoNormalizeUptoBoundary() -// { -// // if there's no buffer to normalize, return 0 -// if (inputBuffer.Length <= 0) -// { -// return 0; -// } - -// bool foundBoundary = false; -// int bufLen = inputBuffer.Length; - -// while (checkedInputBoundary <= bufLen - 1) -// { -// int charLen = Character.CharCount(inputBuffer.CodePointAt(checkedInputBoundary)); -// checkedInputBoundary += charLen; -// if (checkedInputBoundary < bufLen && normalizer.HasBoundaryBefore(inputBuffer -// .CodePointAt(checkedInputBoundary))) -// { -// foundBoundary = true; -// break; -// } -// } -// if (!foundBoundary && checkedInputBoundary >= bufLen && inputFinished) -// { -// foundBoundary = true; -// checkedInputBoundary = bufLen; -// } - -// if (!foundBoundary) -// { -// return 0; -// } - -// return NormalizeInputUpto(checkedInputBoundary); -// } - -// private int 
NormalizeInputUpto(int length) -// { -// int destOrigLen = resultBuffer.Length; -// normalizer.NormalizeSecondAndAppend(resultBuffer, inputBuffer.ToString(0, length)); -// //inputBuffer.SubSequence(0, length)); - -// //inputBuffer.Delete(0, length); -// inputBuffer.Remove(0, length); -// checkedInputBoundary = Math.Max(checkedInputBoundary - length, 0); -// int resultLength = resultBuffer.Length - destOrigLen; -// RecordOffsetDiff(length, resultLength); -// return resultLength; -// } - -// private void RecordOffsetDiff(int inputLength, int outputLength) -// { -// if (inputLength == outputLength) -// { -// charCount += outputLength; -// return; -// } -// int diff = inputLength - outputLength; -// int cumuDiff = LastCumulativeDiff; -// if (diff < 0) -// { -// for (int i = 1; i <= -diff; ++i) -// { -// AddOffCorrectMap(charCount + i, cumuDiff - i); -// } -// } -// else -// { -// AddOffCorrectMap(charCount + outputLength, cumuDiff + diff); -// } -// charCount += outputLength; -// } - -// private int OutputFromResultBuffer(char[] cbuf, int begin, int len) -// { -// len = Math.Min(resultBuffer.Length, len); -// //resultBuffer.GetChars(0, len, cbuf, begin); -// resultBuffer.CopyTo(0, cbuf, begin, len); -// if (len > 0) -// { -// //resultBuffer.delete(0, len); -// resultBuffer.Remove(0, len); -// } -// return len; -// } -// } -//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2CharFilterFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2CharFilterFactory.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2CharFilterFactory.cs deleted file mode 100644 index bd4cbe5..0000000 --- a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2CharFilterFactory.cs +++ /dev/null @@ -1,60 +0,0 @@ -// LUCENENET TODO: Port issues - missing Normalizer2 dependency from icu.net - -//using Icu; -//using 
Lucene.Net.Analysis.Util; -//using Lucene.Net.Support; -//using System; -//using System.Collections.Generic; -//using System.IO; -//using System.Linq; -//using System.Text; -//using System.Threading.Tasks; - -//namespace Lucene.Net.Analysis.ICU -//{ -// public class ICUNormalizer2CharFilterFactory : CharFilterFactory, IMultiTermAwareComponent -// { -// private readonly Normalizer2 normalizer; - -// /// <summary>Creates a new ICUNormalizer2CharFilterFactory</summary> -// public ICUNormalizer2CharFilterFactory(IDictionary<string, string> args) -// : base(args) -// { -// string name = Get(args, "name", "NFKC"); -// //string name = Get(args, "name", "nfkc_cf"); -// //string mode = Get(args, "mode", new string[] { "compose", "decompose" }, "compose"); -// //Normalizer2 normalizer = Normalizer2.getInstance -// // (null, name, "compose".Equals(mode) ? Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE); - -// var mode = (Icu.Normalizer.UNormalizationMode)Enum.Parse(typeof(Icu.Normalizer.UNormalizationMode), "UNORM_" + name); -// Normalizer2 normalizer = new Normalizer2(mode); - -// string filter = Get(args, "filter"); -// if (filter != null) -// { -// //UnicodeSet set = new UnicodeSet(filter); -// var set = UnicodeSet.ToCharacters(filter); -// if (set.Any()) -// { -// //set.freeze(); -// normalizer = new FilteredNormalizer2(normalizer, set); -// } -// } -// if (args.Count != 0) -// { -// throw new ArgumentException("Unknown parameters: " + args); -// } -// this.normalizer = normalizer; -// } - -// public override TextReader Create(TextReader input) -// { -// return new ICUNormalizer2CharFilter(input, normalizer); -// } - -// public virtual AbstractAnalysisFactory GetMultiTermComponent() -// { -// return this; -// } -// } -//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2Filter.cs ---------------------------------------------------------------------- diff --git 
a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2Filter.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2Filter.cs deleted file mode 100644 index bca3d24..0000000 --- a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2Filter.cs +++ /dev/null @@ -1,60 +0,0 @@ -// LUCENENET TODO: Port issues - missing Normalizer2 dependency from icu.net - -//using Icu; -//using Lucene.Net.Analysis.TokenAttributes; -//using Lucene.Net.Support; - -//namespace Lucene.Net.Analysis.ICU -//{ -// public class ICUNormalizer2Filter : TokenFilter -// { -// private readonly ICharTermAttribute termAtt; -// private readonly Normalizer2 normalizer; - -// /// <summary> -// /// Create a new <see cref="Normalizer2Filter"/> that combines NFKC normalization, Case -// /// Folding, and removes Default Ignorables (NFKC_Casefold) -// /// </summary> -// /// <param name="input"></param> -// public ICUNormalizer2Filter(TokenStream input) -// : this(input, new Normalizer2(Normalizer.UNormalizationMode.UNORM_NFKC) /*Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE)*/) -// { -// } - -// /// <summary> -// /// Create a new <see cref="Normalizer2Filter"/> with the specified <see cref="Normalizer2"/> -// /// </summary> -// /// <param name="input">stream</param> -// /// <param name="normalizer">normalizer to use</param> -// public ICUNormalizer2Filter(TokenStream input, Normalizer2 normalizer) -// : base(input) -// { -// this.normalizer = normalizer; -// this.termAtt = AddAttribute<ICharTermAttribute>(); -// } - -// public override sealed bool IncrementToken() -// { -// if (m_input.IncrementToken()) -// { -// var term = termAtt.ToString(); -// try -// { -// if (!normalizer.IsNormalized(term)) -// { -// termAtt.SetEmpty().Append(normalizer.Normalize(term)); -// } -// } -// catch (System.Exception ex) -// { - -// } -// return true; -// } -// else -// { -// return false; -// } -// } -// } -//} 
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2FilterFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2FilterFactory.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2FilterFactory.cs deleted file mode 100644 index c17fb7f..0000000 --- a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUNormalizer2FilterFactory.cs +++ /dev/null @@ -1,59 +0,0 @@ -// LUCENENET TODO: Port issues - missing Normalizer2 dependency from icu.net - -//using Icu; -//using Lucene.Net.Analysis.Util; -//using Lucene.Net.Support; -//using System; -//using System.Collections.Generic; -//using System.Linq; - -//namespace Lucene.Net.Analysis.ICU -//{ -// public class ICUNormalizer2FilterFactory : TokenFilterFactory, IMultiTermAwareComponent -// { -// private readonly Normalizer2 normalizer; - -// /// <summary>Creates a new ICUNormalizer2FilterFactory</summary> -// public ICUNormalizer2FilterFactory(IDictionary<string, string> args) -// : base(args) -// { -// string name = Get(args, "name", "NFKC"); -// //string name = Get(args, "name", "nfkc_cf"); -// //string mode = Get(args, "mode", new string[] { "compose", "decompose" }, "compose"); - -// var mode = (Normalizer.UNormalizationMode)Enum.Parse(typeof(Normalizer.UNormalizationMode), "UNORM_" + name); -// Normalizer2 normalizer = new Normalizer2(mode); - -// //Normalizer2 normalizer = Normalizer2.getInstance -// // (null, name, "compose".Equals(mode) ? 
Normalizer2.Mode.COMPOSE : Normalizer2.Mode.DECOMPOSE); - -// string filter = Get(args, "filter"); -// if (filter != null) -// { -// //UnicodeSet set = new UnicodeSet(filter); -// var set = UnicodeSet.ToCharacters(filter); -// if (set.Any()) -// { -// //set.freeze(); -// normalizer = new FilteredNormalizer2(normalizer, set); -// } -// } -// if (args.Count != 0) -// { -// throw new ArgumentException("Unknown parameters: " + args); -// } -// this.normalizer = normalizer; -// } - -// // TODO: support custom normalization -// public override TokenStream Create(TokenStream input) -// { -// return new ICUNormalizer2Filter(input, normalizer); -// } - -// public virtual AbstractAnalysisFactory GetMultiTermComponent() -// { -// return this; -// } -// } -//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUTransformFilter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUTransformFilter.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUTransformFilter.cs deleted file mode 100644 index 7f22c3d..0000000 --- a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUTransformFilter.cs +++ /dev/null @@ -1,152 +0,0 @@ -// LUCENENET TODO: Port issues - missing Transliterator dependency from icu.net - -//using Lucene.Net.Analysis.TokenAttributes; - -//namespace Lucene.Net.Analysis.ICU -//{ -// public sealed class ICUTransformFilter : TokenFilter -// { -// // Transliterator to transform the text -// private readonly Transliterator transform; - -// // Reusable position object -// private readonly Transliterator.Position position = new Transliterator.Position(); - -// // term attribute, will be updated with transformed text. -// private readonly ICharTermAttribute termAtt; - -// // Wraps a termAttribute around the replaceable interface. 
-// private readonly ReplaceableTermAttribute replaceableAttribute = new ReplaceableTermAttribute(); - -// /// <summary> -// /// Create a new ICUTransformFilter that transforms text on the given stream. -// /// </summary> -// /// <param name="input"><see cref="TokenStream"/> to filter.</param> -// /// <param name="transform">Transliterator to transform the text.</param> -// public ICUTransformFilter(TokenStream input, Transliterator transform) -// : base(input) -// { -// this.transform = transform; -// this.termAtt = AddAttribute<ICharTermAttribute>(); - -// /* -// * This is cheating, but speeds things up a lot. -// * If we wanted to use pkg-private APIs we could probably do better. -// */ -// if (transform.getFilter() == null && transform is com.ibm.icu.text.RuleBasedTransliterator) -// { -// UnicodeSet sourceSet = transform.getSourceSet(); -// if (sourceSet != null && !sourceSet.isEmpty()) -// transform.setFilter(sourceSet); -// } -// } - -// public override bool IncrementToken() -// { -// /* -// * Wrap around replaceable. clear the positions, and transliterate. -// */ -// if (m_input.IncrementToken()) -// { -// replaceableAttribute.SetText(termAtt); - -// int length = termAtt.Length; -// position.start = 0; -// position.limit = length; -// position.contextStart = 0; -// position.contextLimit = length; - -// transform.FilteredTransliterate(replaceableAttribute, position, false); -// return true; -// } -// else -// { -// return false; -// } -// } - -// /// <summary> -// /// Wrap a <see cref="ICharTermAttribute"/> with the Replaceable API. 
-// /// </summary> -// private sealed class ReplaceableTermAttribute //: IReplaceable -// { -// private char[] buffer; -// private int length; -// private ICharTermAttribute token; - -// public void SetText(ICharTermAttribute token) -// { -// this.token = token; -// this.buffer = token.Buffer; -// this.length = token.Length; -// } - -// public int Char32At(int pos) -// { -// return UTF16.charAt(buffer, 0, length, pos); -// } - -// public char CharAt(int pos) -// { -// return buffer[pos]; -// } - -// public void Copy(int start, int limit, int dest) -// { -// char[] text = new char[limit - start]; -// GetChars(start, limit, text, 0); -// Replace(dest, dest, text, 0, limit - start); -// } - -// public void GetChars(int srcStart, int srcLimit, char[] dst, int dstStart) -// { -// System.Array.Copy(buffer, srcStart, dst, dstStart, srcLimit - srcStart); -// } - -// public bool HasMetaData -// { -// get { return false; } -// } - -// public int Length -// { -// get { return length; } -// } - -// public void Replace(int start, int limit, string text) -// { -// int charsLen = text.Length; -// int newLength = ShiftForReplace(start, limit, charsLen); -// // insert the replacement text -// //text.getChars(0, charsLen, buffer, start); -// text.CopyTo(0, buffer, start, charsLen); -// token.Length = (length = newLength); -// } - -// public void Replace(int start, int limit, char[] text, int charsStart, -// int charsLen) -// { -// // shift text if necessary for the replacement -// int newLength = ShiftForReplace(start, limit, charsLen); -// // insert the replacement text -// System.Array.Copy(text, charsStart, buffer, start, charsLen); -// token.Length = (length = newLength); -// } - -// /// <summary>shift text (if necessary) for a replacement operation</summary> -// private int ShiftForReplace(int start, int limit, int charsLen) -// { -// int replacementLength = limit - start; -// int newLength = length - replacementLength + charsLen; -// // resize if necessary -// if (newLength > 
length) -// buffer = token.ResizeBuffer(newLength); -// // if the substring being replaced is longer or shorter than the -// // replacement, need to shift things around -// if (replacementLength != charsLen && limit < length) -// System.Array.Copy(buffer, limit, buffer, start + charsLen, length - limit); -// return newLength; -// } -// } -// } -//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUTransformFilterFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUTransformFilterFactory.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUTransformFilterFactory.cs deleted file mode 100644 index 081ebf5..0000000 --- a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/ICUTransformFilterFactory.cs +++ /dev/null @@ -1,38 +0,0 @@ -// LUCENENET TODO: Port issues - missing Transliterator dependency from icu.net - -//using Lucene.Net.Analysis.Util; -//using System; -//using System.Collections.Generic; - -//namespace Lucene.Net.Analysis.ICU -//{ -// public class ICUTransformFilterFactory : TokenFilterFactory, IMultiTermAwareComponent -// { -// private readonly Transliterator transliterator; - -// // TODO: add support for custom rules -// /// <summary>Creates a new ICUTransformFilterFactory</summary> -// public ICUTransformFilterFactory(IDictionary<string, string> args) -// : base(args) -// { -// string id = Require(args, "id"); -// string direction = Get(args, "direction", new string[] { "forward", "reverse" }, "forward", false); -// int dir = "forward".Equals(direction) ? 
Transliterator.FORWARD : Transliterator.REVERSE; -// transliterator = Transliterator.getInstance(id, dir); -// if (args.Count != 0) -// { -// throw new ArgumentException("Unknown parameters: " + args); -// } -// } - -// public override TokenStream Create(TokenStream input) -// { -// return new ICUTransformFilter(input, transliterator); -// } - -// public virtual AbstractAnalysisFactory GetMultiTermComponent() -// { -// return this; -// } -// } -//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/BreakIteratorWrapper.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/BreakIteratorWrapper.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/BreakIteratorWrapper.cs deleted file mode 100644 index c124a88..0000000 --- a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/BreakIteratorWrapper.cs +++ /dev/null @@ -1,166 +0,0 @@ -// LUCENENET TODO: Port issues - missing dependencies - -//using Icu; -//using Lucene.Net.Analysis.Util; -//using Lucene.Net.Support; -//using System; -//using System.Collections.Generic; -//using System.Linq; -//using System.Text; -//using System.Threading.Tasks; - -//namespace Lucene.Net.Analysis.ICU.Segmentation -//{ -// /// <summary> -// /// Contain all the issues surrounding BreakIterators in ICU in one place. -// /// Basically this boils down to the fact that they aren't very friendly to any -// /// sort of OO design. 
-// /// <para/> -// /// http://bugs.icu-project.org/trac/ticket/5901: RBBI.getRuleStatus(), hoist to -// /// BreakIterator from RuleBasedBreakIterator -// /// <para/> -// /// DictionaryBasedBreakIterator is a subclass of RuleBasedBreakIterator, but -// /// doesn't actually behave as a subclass: it always returns 0 for -// /// getRuleStatus(): -// /// http://bugs.icu-project.org/trac/ticket/4730: Thai RBBI, no boundary type -// /// tags -// /// <para/> -// /// @lucene.experimental -// /// </summary> -// internal abstract class BreakIteratorWrapper -// { -// protected readonly CharArrayIterator textIterator = new CharArrayIterator(); -// protected char[] text; -// protected int start; -// protected int length; - -// public abstract int Next(); -// public abstract int Current { get; } -// public abstract int GetRuleStatus(); -// public abstract void SetText(CharacterIterator text); - -// public void SetText(char[] text, int start, int length) -// { -// this.text = text; -// this.start = start; -// this.length = length; -// textIterator.SetText(text, start, length); -// SetText(textIterator); -// } - -// /** -// * If its a RuleBasedBreakIterator, the rule status can be used for token type. If its -// * any other BreakIterator, the rulestatus method is not available, so treat -// * it like a generic BreakIterator. -// */ -// public static BreakIteratorWrapper Wrap(Icu.BreakIterator breakIterator) -// { -// if (breakIterator is Icu.RuleBasedBreakIterator) -// return new RBBIWrapper((Icu.RuleBasedBreakIterator)breakIterator); -// else -// return new BIWrapper(breakIterator); -// } - -// /** -// * RuleBasedBreakIterator wrapper: RuleBasedBreakIterator (as long as its not -// * a DictionaryBasedBreakIterator) behaves correctly. 
-// */ -// private sealed class RBBIWrapper : BreakIteratorWrapper -// { -// private readonly Icu.RuleBasedBreakIterator rbbi; - -// internal RBBIWrapper(Icu.RuleBasedBreakIterator rbbi) -// { -// this.rbbi = rbbi; -// } - -// public override int Current -// { -// get { return rbbi.Current; } -// } - -// public override int GetRuleStatus() -// { -// return rbbi.GetRuleStatus(); -// } - -// public override int Next() -// { -// return rbbi.Next(); -// } - -// public override void SetText(CharacterIterator text) -// { -// rbbi.SetText(text); -// } -// } - -// /** -// * Generic BreakIterator wrapper: Either the rulestatus method is not -// * available or always returns 0. Calculate a rulestatus here so it behaves -// * like RuleBasedBreakIterator. -// * -// * Note: This is slower than RuleBasedBreakIterator. -// */ -// private sealed class BIWrapper : BreakIteratorWrapper -// { -// private readonly Support.BreakIterator bi; -// private int status; - -// internal BIWrapper(Support.BreakIterator bi) -// { -// this.bi = bi; -// } - -// public override int Current -// { -// get { return bi.Current; } -// } - -// public override int GetRuleStatus() -// { -// return status; -// } - -// public override int Next() -// { -// int current = bi.Current; -// int next = bi.Next(); -// status = CalcStatus(current, next); -// return next; -// } - -// private int CalcStatus(int current, int next) -// { -// if (current == Support.BreakIterator.DONE || next == Support.BreakIterator.DONE) -// return RuleBasedBreakIterator.WORD_NONE; - -// int begin = start + current; -// int end = start + next; - -// int codepoint; -// for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) -// { -// codepoint = UTF16.charAt(text, 0, end, begin); - -// if (UCharacter.isDigit(codepoint)) -// return RuleBasedBreakIterator.WORD_NUMBER; -// else if (UCharacter.isLetter(codepoint)) -// { -// // TODO: try to separately specify ideographic, kana? 
-// // [currently all bundled as letter for this case] -// return RuleBasedBreakIterator.WORD_LETTER; -// } -// } - -// return RuleBasedBreakIterator.WORD_NONE; -// } - -// public override void SetText(CharacterIterator text) -// { -// bi.SetText(text); -// status = RuleBasedBreakIterator.WORD_NONE; -// } -// } -// } -//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/CharArrayIterator.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/CharArrayIterator.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/CharArrayIterator.cs deleted file mode 100644 index 209d583..0000000 --- a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/CharArrayIterator.cs +++ /dev/null @@ -1,134 +0,0 @@ -using Lucene.Net.Support; -using System; -using System.Diagnostics.CodeAnalysis; - -namespace Lucene.Net.Analysis.Icu.Segmentation -{ - /// <summary> - /// Wraps a char[] as CharacterIterator for processing with a BreakIterator - /// <para/> - /// @lucene.experimental - /// </summary> - internal sealed class CharArrayIterator : CharacterIterator - { - private char[] array; - private int start; - private int index; - private int length; - private int limit; - - [WritableArray] - [SuppressMessage("Microsoft.Performance", "CA1819", Justification = "Lucene's design requires some writable array properties")] - public char[] Text - { - get - { - return array; - } - } - - public int Start - { - get { return start; } - } - - public int Length - { - get { return length; } - } - - /// <summary> - /// Set a new region of text to be examined by this iterator - /// </summary> - /// <param name="array">text buffer to examine</param> - /// <param name="start">offset into buffer</param> - /// <param name="length"> maximum length to examine</param> - public void SetText(char[] array, int start, int length) - { - this.array = array; - 
this.start = start; - this.index = start; - this.length = length; - this.limit = start + length; - } - - public override char Current - { - get { return (index == limit) ? DONE : array[index]; } - } - - public override char First() - { - index = start; - return Current; - } - - public override int BeginIndex - { - get { return 0; } - } - - public override int EndIndex - { - get { return length; } - } - - public override int Index - { - get { return index - start; } - } - - public override char Last() - { - index = (limit == start) ? limit : limit - 1; - return Current; - } - - public override char Next() - { - if (++index >= limit) - { - index = limit; - return DONE; - } - else - { - return Current; - } - } - - public override char Previous() - { - if (--index < start) - { - index = start; - return DONE; - } - else - { - return Current; - } - } - - public override char SetIndex(int position) - { - if (position < BeginIndex || position > EndIndex) - throw new ArgumentException("Illegal Position: " + position); - index = start + position; - return Current; - } - - public override string GetTextAsString() - { - return new string(array); - } - - public override object Clone() - { - CharArrayIterator clone = new CharArrayIterator(); - clone.SetText(array, start, length); - clone.index = index; - return clone; - } - } -} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/CompositeBreakIterator.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/CompositeBreakIterator.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/CompositeBreakIterator.cs deleted file mode 100644 index a004193..0000000 --- a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/CompositeBreakIterator.cs +++ /dev/null @@ -1,132 +0,0 @@ -// LUCENENET TODO: Port issues - missing dependencies - -//using System; -//using 
System.Collections.Generic; -//using System.Linq; -//using System.Text; -//using System.Threading.Tasks; - -//namespace Lucene.Net.Analysis.ICU.Segmentation -//{ -// /// <summary> -// /// An internal BreakIterator for multilingual text, following recommendations -// /// from: UAX #29: Unicode Text Segmentation. (http://unicode.org/reports/tr29/) -// /// <para/> -// /// See http://unicode.org/reports/tr29/#Tailoring for the motivation of this -// /// design. -// /// <para/> -// /// Text is first divided into script boundaries. The processing is then -// /// delegated to the appropriate break iterator for that specific script. -// /// <para/> -// /// This break iterator also allows you to retrieve the ISO 15924 script code -// /// associated with a piece of text. -// /// <para/> -// /// See also UAX #29, UTR #24 -// /// <para/> -// /// @lucene.experimental -// /// </summary> -// internal sealed class CompositeBreakIterator -// { -// private readonly ICUTokenizerConfig config; -// private readonly BreakIteratorWrapper[] wordBreakers = new BreakIteratorWrapper[UScript.CODE_LIMIT]; - -// private BreakIteratorWrapper rbbi; -// private readonly ScriptIterator scriptIterator; - -// private char[] text; - -// public CompositeBreakIterator(ICUTokenizerConfig config) -// { -// this.config = config; -// this.scriptIterator = new ScriptIterator(config.CombineCJ); -// } - -// /** -// * Retrieve the next break position. If the RBBI range is exhausted within the -// * script boundary, examine the next script boundary. 
-// * -// * @return the next break position or BreakIterator.DONE -// */ -// public int Next() -// { -// int next = rbbi.Next(); -// while (next == Support.BreakIterator.DONE && scriptIterator.Next()) -// { -// rbbi = GetBreakIterator(scriptIterator.GetScriptCode()); -// rbbi.SetText(text, scriptIterator.GetScriptStart(), -// scriptIterator.GetScriptLimit() - scriptIterator.GetScriptStart()); -// next = rbbi.Next(); -// } -// return (next == Support.BreakIterator.DONE) ? Support.BreakIterator.DONE : next -// + scriptIterator.GetScriptStart(); -// } - -// /** -// * Retrieve the current break position. -// * -// * @return the current break position or BreakIterator.DONE -// */ -// public int Current -// { -// get -// { -// int current = rbbi.Current; -// return (current == Support.BreakIterator.DONE) ? Support.BreakIterator.DONE : current -// + scriptIterator.GetScriptStart(); -// } -// } - -// /** -// * Retrieve the rule status code (token type) from the underlying break -// * iterator -// * -// * @return rule status code (see RuleBasedBreakIterator constants) -// */ -// public int GetRuleStatus() -// { -// return rbbi.GetRuleStatus(); -// } - -// /** -// * Retrieve the UScript script code for the current token. This code can be -// * decoded with UScript into a name or ISO 15924 code. -// * -// * @return UScript script code for the current token. 
-// */ -// public int GetScriptCode() -// { -// return scriptIterator.GetScriptCode(); -// } - -// /** -// * Set a new region of text to be examined by this iterator -// * -// * @param text buffer of text -// * @param start offset into buffer -// * @param length maximum length to examine -// */ -// public void SetText(char[] text, int start, int length) -// { -// this.text = text; -// scriptIterator.SetText(text, start, length); -// if (scriptIterator.Next()) -// { -// rbbi = GetBreakIterator(scriptIterator.GetScriptCode()); -// rbbi.SetText(text, scriptIterator.GetScriptStart(), -// scriptIterator.GetScriptLimit() - scriptIterator.GetScriptStart()); -// } -// else -// { -// rbbi = GetBreakIterator(UScript.COMMON); -// rbbi.SetText(text, 0, 0); -// } -// } - -// private BreakIteratorWrapper GetBreakIterator(int scriptCode) -// { -// if (wordBreakers[scriptCode] == null) -// wordBreakers[scriptCode] = BreakIteratorWrapper.Wrap(config.GetBreakIterator(scriptCode)); -// return wordBreakers[scriptCode]; -// } -// } -//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/DefaultICUTokenizerConfig.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/DefaultICUTokenizerConfig.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/DefaultICUTokenizerConfig.cs deleted file mode 100644 index fc2a989..0000000 --- a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/DefaultICUTokenizerConfig.cs +++ /dev/null @@ -1,127 +0,0 @@ -// LUCENENET TODO: Port issues - missing dependencies - -//using Icu; -//using Lucene.Net.Analysis.Standard; -//using Lucene.Net.Support; -//using System; -//using System.Collections.Generic; -//using System.Globalization; -//using System.IO; -//using System.Linq; -//using System.Text; -//using System.Threading.Tasks; - -//namespace Lucene.Net.Analysis.ICU.Segmentation -//{ -// /// <summary> -// 
/// Default <see cref="ICUTokenizerConfig"/> that is generally applicable -// /// to many languages. -// /// </summary> -// /// <remarks> -// /// Generally tokenizes Unicode text according to UAX#29 -// /// ({@link BreakIterator#getWordInstance(ULocale) BreakIterator.getWordInstance(ULocale.ROOT)}), -// /// but with the following tailorings: -// /// <list type="bullet"> -// /// <item><description>Thai, Lao, and CJK text is broken into words with a dictionary.</description></item> -// /// <item><description>Myanmar, and Khmer text is broken into syllables based on custom BreakIterator rules.</description></item> -// /// </list> -// /// <para/> -// /// @lucene.experimental -// /// </remarks> -// public class DefaultICUTokenizerConfig : ICUTokenizerConfig -// { -// /** Token type for words containing ideographic characters */ -// public static readonly string WORD_IDEO = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC]; -// /** Token type for words containing Japanese hiragana */ -// public static readonly string WORD_HIRAGANA = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA]; -// /** Token type for words containing Japanese katakana */ -// public static readonly string WORD_KATAKANA = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA]; -// /** Token type for words containing Korean hangul */ -// public static readonly string WORD_HANGUL = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL]; -// /** Token type for words that contain letters */ -// public static readonly string WORD_LETTER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM]; -// /** Token type for words that appear to be numbers */ -// public static readonly string WORD_NUMBER = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM]; - -// /* -// * the default breakiterators in use. these can be expensive to -// * instantiate, cheap to clone. 
-// */ -// // we keep the cjk breaking separate, thats because it cannot be customized (because dictionary -// // is only triggered when kind = WORD, but kind = LINE by default and we have no non-evil way to change it) -// private static readonly Icu.BreakIterator cjkBreakIterator = new Icu.RuleBasedBreakIterator(Icu.BreakIterator.UBreakIteratorType.WORD, new Locale()); //BreakIterator.getWordInstance(ULocale.ROOT); -// // the same as ROOT, except no dictionary segmentation for cjk -// private static readonly Icu.BreakIterator defaultBreakIterator = -// ReadBreakIterator("Default.brk"); -// private static readonly Icu.BreakIterator khmerBreakIterator = -// ReadBreakIterator("Khmer.brk"); -// private static readonly Icu.BreakIterator myanmarBreakIterator = -// ReadBreakIterator("Myanmar.brk"); - -// // TODO: deprecate this boolean? you only care if you are doing super-expert stuff... -// private readonly bool cjkAsWords; - -// /** -// * Creates a new config. This object is lightweight, but the first -// * time the class is referenced, breakiterators will be initialized. -// * @param cjkAsWords true if cjk text should undergo dictionary-based segmentation, -// * otherwise text will be segmented according to UAX#29 defaults. -// * If this is true, all Han+Hiragana+Katakana words will be tagged as -// * IDEOGRAPHIC. 
-// */ -// public DefaultICUTokenizerConfig(bool cjkAsWords) -// { -// this.cjkAsWords = cjkAsWords; -// } - -// public override bool CombineCJ -// { -// get { return cjkAsWords; } -// } - -// public override Icu.BreakIterator GetBreakIterator(int script) -// { -// switch (script) -// { -// case UScript.KHMER: return (Icu.BreakIterator)khmerBreakIterator.Clone(); -// case UScript.MYANMAR: return (Icu.BreakIterator)myanmarBreakIterator.Clone(); -// case UScript.JAPANESE: return (Icu.BreakIterator)cjkBreakIterator.Clone(); -// default: return (Icu.BreakIterator)defaultBreakIterator.Clone(); -// } -// } - -// public override string GetType(int script, int ruleStatus) -// { -// switch (ruleStatus) -// { -// case RuleBasedBreakIterator.WORD_IDEO: -// return WORD_IDEO; -// case RuleBasedBreakIterator.WORD_KANA: -// return script == UScript.HIRAGANA ? WORD_HIRAGANA : WORD_KATAKANA; -// case RuleBasedBreakIterator.WORD_LETTER: -// return script == UScript.HANGUL ? WORD_HANGUL : WORD_LETTER; -// case RuleBasedBreakIterator.WORD_NUMBER: -// return WORD_NUMBER; -// default: /* some other custom code */ -// return "<OTHER>"; -// } -// } - -// private static RuleBasedBreakIterator ReadBreakIterator(string filename) -// { -// Stream @is = -// typeof(DefaultICUTokenizerConfig).Assembly.FindAndGetManifestResourceStream(typeof(DefaultICUTokenizerConfig), filename); -// try -// { -// RuleBasedBreakIterator bi = -// RuleBasedBreakIterator.GetInstanceFromCompiledRules(@is); -// @is.Dispose(); -// return bi; -// } -// catch (IOException e) -// { -// throw new Exception(e.ToString(), e); -// } -// } -// } -//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ICUTokenizer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ICUTokenizer.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ICUTokenizer.cs deleted file mode 100644 
index 7677c0c..0000000 --- a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ICUTokenizer.cs +++ /dev/null @@ -1,229 +0,0 @@ -// LUCENENET TODO: Port issues - missing dependencies - -//using Lucene.Net.Analysis.ICU.TokenAttributes; -//using Lucene.Net.Analysis.TokenAttributes; -//using System; -//using System.Collections.Generic; -//using System.Diagnostics; -//using System.IO; -//using System.Linq; -//using System.Text; -//using System.Threading.Tasks; - -//namespace Lucene.Net.Analysis.ICU.Segmentation -//{ -// /// <summary> -// /// Breaks text into words according to UAX #29: Unicode Text Segmentation -// /// (http://www.unicode.org/reports/tr29/) -// /// <para/> -// /// Words are broken across script boundaries, then segmented according to -// /// the BreakIterator and typing provided by the <see cref="ICUTokenizerConfig"/> -// /// <para/> -// /// @lucene.experimental -// /// </summary> -// /// <seealso cref="ICUTokenizerConfig"/> -// public sealed class ICUTokenizer : Tokenizer -// { -// private static readonly int IOBUFFER = 4096; -// private readonly char[] buffer = new char[IOBUFFER]; -// /** true length of text in the buffer */ -// private int length = 0; -// /** length in buffer that can be evaluated safely, up to a safe end point */ -// private int usableLength = 0; -// /** accumulated offset of previous buffers for this reader, for offsetAtt */ -// private int offset = 0; - -// private readonly CompositeBreakIterator breaker; /* tokenizes a char[] of text */ -// private readonly ICUTokenizerConfig config; -// private readonly IOffsetAttribute offsetAtt; -// private readonly ICharTermAttribute termAtt; -// private readonly ITypeAttribute typeAtt; -// private readonly IScriptAttribute scriptAtt; - -// /** -// * Construct a new ICUTokenizer that breaks text into words from the given -// * Reader. -// * <p> -// * The default script-specific handling is used. -// * <p> -// * The default attribute factory is used. 
-// * -// * @param input Reader containing text to tokenize. -// * @see DefaultICUTokenizerConfig -// */ -// public ICUTokenizer(TextReader input) -// : this(input, new DefaultICUTokenizerConfig(true)) -// { -// } - -// /** -// * Construct a new ICUTokenizer that breaks text into words from the given -// * Reader, using a tailored BreakIterator configuration. -// * <p> -// * The default attribute factory is used. -// * -// * @param input Reader containing text to tokenize. -// * @param config Tailored BreakIterator configuration -// */ -// public ICUTokenizer(TextReader input, ICUTokenizerConfig config) -// : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, config) -// { -// } - -// /** -// * Construct a new ICUTokenizer that breaks text into words from the given -// * Reader, using a tailored BreakIterator configuration. -// * -// * @param factory AttributeFactory to use -// * @param input Reader containing text to tokenize. -// * @param config Tailored BreakIterator configuration -// */ -// public ICUTokenizer(AttributeFactory factory, TextReader input, ICUTokenizerConfig config) -// : base(factory, input) -// { -// this.config = config; -// breaker = new CompositeBreakIterator(config); - -// this.offsetAtt = AddAttribute<IOffsetAttribute>(); -// this.termAtt = AddAttribute<ICharTermAttribute>(); -// this.typeAtt = AddAttribute<ITypeAttribute>(); -// this.scriptAtt = AddAttribute<IScriptAttribute>(); -// } - - -// public override bool IncrementToken() -// { -// ClearAttributes(); -// if (length == 0) -// Refill(); -// while (!IncrementTokenBuffer()) -// { -// Refill(); -// if (length <= 0) // no more bytes to read; -// return false; -// } -// return true; -// } - - -// public override void Reset() -// { -// base.Reset(); -// breaker.SetText(buffer, 0, 0); -// length = usableLength = offset = 0; -// } - -// public override void End() -// { -// base.End(); -// int finalOffset = (length < 0) ? 
offset : offset + length; -// offsetAtt.SetOffset(CorrectOffset(finalOffset), CorrectOffset(finalOffset)); -// } - -// /* -// * This tokenizes text based upon the longest matching rule, and because of -// * this, isn't friendly to a Reader. -// * -// * Text is read from the input stream in 4kB chunks. Within a 4kB chunk of -// * text, the last unambiguous break point is found (in this implementation: -// * white space character) Any remaining characters represent possible partial -// * words, so are appended to the front of the next chunk. -// * -// * There is the possibility that there are no unambiguous break points within -// * an entire 4kB chunk of text (binary data). So there is a maximum word limit -// * of 4kB since it will not try to grow the buffer in this case. -// */ - -// /** -// * Returns the last unambiguous break position in the text. -// * -// * @return position of character, or -1 if one does not exist -// */ -// private int FindSafeEnd() -// { -// for (int i = length - 1; i >= 0; i--) -// if (char.IsWhiteSpace(buffer[i])) -// return i + 1; -// return -1; -// } - -// /** -// * Refill the buffer, accumulating the offset and setting usableLength to the -// * last unambiguous break position -// * -// * @throws IOException If there is a low-level I/O error. 
-// */ -// private void Refill() -// { -// offset += usableLength; -// int leftover = length - usableLength; -// System.Array.Copy(buffer, usableLength, buffer, 0, leftover); -// int requested = buffer.Length - leftover; -// int returned = Read(m_input, buffer, leftover, requested); -// length = returned + leftover; -// if (returned < requested) /* reader has been emptied, process the rest */ -// usableLength = length; -// else -// { /* still more data to be read, find a safe-stopping place */ -// usableLength = FindSafeEnd(); -// if (usableLength < 0) -// usableLength = length; /* -// * more than IOBUFFER of text without space, -// * gonna possibly truncate tokens -// */ -// } - -// breaker.SetText(buffer, 0, Math.Max(0, usableLength)); -// } - -// // TODO: refactor to a shared readFully somewhere -// // (NGramTokenizer does this too): -// /** commons-io's readFully, but without bugs if offset != 0 */ -// private static int Read(TextReader input, char[] buffer, int offset, int length) -// { -// Debug.Assert(length >= 0, "length must not be negative: " + length); - -// int remaining = length; -// while (remaining > 0) -// { -// int location = length - remaining; -// int count = input.Read(buffer, offset + location, remaining); -// if (-1 == count) -// { // EOF -// break; -// } -// remaining -= count; -// } -// return length - remaining; -// } - -// /* -// * return true if there is a token from the buffer, or null if it is -// * exhausted. 
-// */ -// private bool IncrementTokenBuffer() -// { -// int start = breaker.Current; -// if (start == Support.BreakIterator.DONE) -// return false; // BreakIterator exhausted - -// // find the next set of boundaries, skipping over non-tokens (rule status 0) -// int end = breaker.Next(); -// while (start != Support.BreakIterator.DONE && breaker.GetRuleStatus() == 0) -// { -// start = end; -// end = breaker.Next(); -// } - -// if (start == Support.BreakIterator.DONE) -// return false; // BreakIterator exhausted - -// termAtt.CopyBuffer(buffer, start, end - start); -// offsetAtt.SetOffset(CorrectOffset(offset + start), CorrectOffset(offset + end)); -// typeAtt.Type = config.GetType(breaker.GetScriptCode(), breaker.GetRuleStatus()); -// scriptAtt.Code = breaker.GetScriptCode(); - -// return true; -// } -// } -//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ICUTokenizerConfig.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ICUTokenizerConfig.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ICUTokenizerConfig.cs deleted file mode 100644 index 0c13316..0000000 --- a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ICUTokenizerConfig.cs +++ /dev/null @@ -1,33 +0,0 @@ -// LUCENENET TODO: Port issues - missing dependencies - -//using Lucene.Net.Support; - -//namespace Lucene.Net.Analysis.ICU.Segmentation -//{ -// /// <summary> -// /// Class that allows for tailored Unicode Text Segmentation on -// /// a per-writing system basis. -// /// <para/> -// /// @lucene.experimental -// /// </summary> -// public abstract class ICUTokenizerConfig -// { -// /// <summary> -// /// Sole constructor. (For invocation by subclass -// /// constructors, typically implicit.) -// /// </summary> -// public ICUTokenizerConfig() { } -// /// <summary> -// /// Return a breakiterator capable of processing a given script. 
-// /// </summary> -// public abstract Icu.BreakIterator GetBreakIterator(int script); -// /// <summary> -// /// Return a token type value for a given script and BreakIterator rule status. -// /// </summary> -// public abstract string GetType(int script, int ruleStatus); -// /// <summary> -// /// true if Han, Hiragana, and Katakana scripts should all be returned as Japanese -// /// </summary> -// public abstract bool CombineCJ { get; } -// } -//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ICUTokenizerFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ICUTokenizerFactory.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ICUTokenizerFactory.cs deleted file mode 100644 index 14aa9c0..0000000 --- a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ICUTokenizerFactory.cs +++ /dev/null @@ -1,139 +0,0 @@ -// LUCENENET TODO: Port issues - missing dependencies - -//using Icu; -//using Lucene.Net.Analysis.Util; -//using Lucene.Net.Support; -//using Lucene.Net.Util; -//using System; -//using System.Collections.Generic; -//using System.Diagnostics; -//using System.IO; -//using System.Linq; -//using System.Text; -//using System.Threading.Tasks; - -//namespace Lucene.Net.Analysis.ICU.Segmentation -//{ - -// public class ICUTokenizerFactory : TokenizerFactory, IResourceLoaderAware -// { -// internal static readonly string RULEFILES = "rulefiles"; -// private readonly IDictionary<int, string> tailored; -// private ICUTokenizerConfig config; -// private readonly bool cjkAsWords; - -// /// <summary>Creates a new ICUTokenizerFactory</summary> -// public ICUTokenizerFactory(IDictionary<string, string> args) -// : base(args) -// { -// tailored = new Dictionary<int, string>(); -// string rulefilesArg = Get(args, RULEFILES); -// if (rulefilesArg != null) -// { -// IList<string> scriptAndResourcePaths = 
SplitFileNames(rulefilesArg); -// foreach (string scriptAndResourcePath in scriptAndResourcePaths) -// { -// int colonPos = scriptAndResourcePath.IndexOf(":"); -// string scriptCode = scriptAndResourcePath.Substring(0, colonPos - 0).Trim(); -// string resourcePath = scriptAndResourcePath.Substring(colonPos + 1).Trim(); -// tailored[UCharacter.getPropertyValueEnum(UProperty.SCRIPT, scriptCode)] = resourcePath; -// } -// } -// cjkAsWords = GetBoolean(args, "cjkAsWords", true); -// if (args.Count != 0) -// { -// throw new ArgumentException("Unknown parameters: " + args); -// } -// } - -// public virtual void Inform(IResourceLoader loader) -// { -// Debug.Assert(tailored != null, "init must be called first!"); -// if (tailored.Count == 0) -// { -// config = new DefaultICUTokenizerConfig(cjkAsWords); -// } -// else -// { -// config = new DefaultICUTokenizerConfigAnonymousHelper(cjkAsWords, tailored, loader); - -// //BreakIterator[] breakers = new BreakIterator[UScript.CODE_LIMIT]; -// //foreach (var entry in tailored) -// //{ -// // int code = entry.Key; -// // string resourcePath = entry.Value; -// // breakers[code] = ParseRules(resourcePath, loader); -// //} -// // config = new DefaultICUTokenizerConfig(cjkAsWords) -// // { - -// // public override BreakIterator GetBreakIterator(int script) -// // { -// // if (breakers[script] != null) -// // { -// // return (BreakIterator)breakers[script].clone(); -// // } -// // else -// // { -// // return base.GetBreakIterator(script); -// // } -// // } -// // // TODO: we could also allow codes->types mapping -// //}; -// } -// } - -// private class DefaultICUTokenizerConfigAnonymousHelper : DefaultICUTokenizerConfig -// { -// private readonly Icu.BreakIterator[] breakers; -// public DefaultICUTokenizerConfigAnonymousHelper(bool cjkAsWords, IDictionary<int, string> tailored, IResourceLoader loader) -// : base(cjkAsWords) -// { -// breakers = new Icu.BreakIterator[UScript.CODE_LIMIT]; -// foreach (var entry in tailored) -// { -// 
int code = entry.Key; -// string resourcePath = entry.Value; -// breakers[code] = ParseRules(resourcePath, loader); -// } -// } - -// public override Icu.BreakIterator GetBreakIterator(int script) -// { -// if (breakers[script] != null) -// { -// return (Icu.BreakIterator)breakers[script].Clone(); -// } -// else -// { -// return base.GetBreakIterator(script); -// } -// } - -// private Icu.BreakIterator ParseRules(string filename, IResourceLoader loader) -// { -// StringBuilder rules = new StringBuilder(); -// Stream rulesStream = loader.OpenResource(filename); -// using (TextReader reader = IOUtils.GetDecodingReader(rulesStream, Encoding.UTF8)) -// { -// string line = null; -// while ((line = reader.ReadLine()) != null) -// { -// if (!line.StartsWith("#", StringComparison.Ordinal)) -// { -// rules.Append(line); -// } -// rules.Append('\n'); -// } -// } -// return new RuleBasedBreakIterator(rules.ToString()); -// } -// } - -// public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader input) -// { -// Debug.Assert(config != null, "inform must be called first!"); -// return new ICUTokenizer(factory, input, config); -// } -// } -//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ScriptIterator.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ScriptIterator.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ScriptIterator.cs deleted file mode 100644 index f328851..0000000 --- a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/Segmentation/ScriptIterator.cs +++ /dev/null @@ -1,206 +0,0 @@ -// LUCENENET TODO: Port issues - missing dependencies - -//using System; -//using System.Collections.Generic; -//using System.Linq; -//using System.Text; -//using System.Text.RegularExpressions; -//using System.Threading.Tasks; - -//namespace Lucene.Net.Analysis.ICU.Segmentation -//{ -// /// 
<summary> -// /// An iterator that locates ISO 15924 script boundaries in text. -// /// </summary> -// /// <remarks> -// /// This is not the same as simply looking at the Unicode block, or even the -// /// Script property. Some characters are 'common' across multiple scripts, and -// /// some 'inherit' the script value of text surrounding them. -// /// <para/> -// /// This is similar to ICU (internal-only) UScriptRun, with the following -// /// differences: -// /// <list type="bullet"> -// /// <item><description> -// /// Doesn't attempt to match paired punctuation. For tokenization purposes, this -// /// is not necessary. Its also quite expensive. -// /// </description></item> -// /// <item><description> -// /// Non-spacing marks inherit the script of their base character, following -// /// recommendations from UTR #24. -// /// </description></item> -// /// </list> -// /// <para/> -// /// @lucene.experimental -// /// </remarks> -// internal sealed class ScriptIterator -// { -// private char[] text; -// private int start; -// private int limit; -// private int index; - -// private int scriptStart; -// private int scriptLimit; -// private int scriptCode; - -// private readonly bool combineCJ; - -// /** -// * @param combineCJ if true: Han,Hiragana,Katakana will all return as {@link UScript#JAPANESE} -// */ -// internal ScriptIterator(bool combineCJ) -// { -// this.combineCJ = combineCJ; -// } - -// /** -// * Get the start of this script run -// * -// * @return start position of script run -// */ -// public int ScriptStart -// { -// get { return scriptStart; } -// } - -// /** -// * Get the index of the first character after the end of this script run -// * -// * @return position of the first character after this script run -// */ -// public int ScriptLimit -// { -// get { return scriptLimit; } -// } - -// /** -// * Get the UScript script code for this script run -// * -// * @return code for the script of the current run -// */ -// public int ScriptCode -// { -// get { 
return scriptCode; } -// } - -// /** -// * Iterates to the next script run, returning true if one exists. -// * -// * @return true if there is another script run, false otherwise. -// */ -// public bool Next() -// { -// if (scriptLimit >= limit) -// return false; - -// scriptCode = UScript.COMMON; -// scriptStart = scriptLimit; - -// while (index < limit) -// { -// //int ch = UTF16.charAt(text, start, limit, index - start); -// int ch = Encoding.Unicode.(text, start, limit); -// int sc = GetScript(ch); - -// /* -// * From UTR #24: Implementations that determine the boundaries between -// * characters of given scripts should never break between a non-spacing -// * mark and its base character. Thus for boundary determinations and -// * similar sorts of processing, a non-spacing mark â whatever its script -// * value â should inherit the script value of its base character. -// */ -// if (isSameScript(scriptCode, sc) -// || UCharacter.getType(ch) == ECharacterCategory.NON_SPACING_MARK) -// { -// //index += UTF16.getCharCount(ch); -// index += Encoding.Unicode.GetCharCount() - -// /* -// * Inherited or Common becomes the script code of the surrounding text. -// */ -// if (scriptCode <= UScript.INHERITED && sc > UScript.INHERITED) -// { -// scriptCode = sc; -// } - -// } -// else -// { -// break; -// } -// } - -// scriptLimit = index; -// return true; -// } - -// /** Determine if two scripts are compatible. 
*/ -// private static bool IsSameScript(int scriptOne, int scriptTwo) -// { -// return scriptOne <= UScript.INHERITED || scriptTwo <= UScript.INHERITED -// || scriptOne == scriptTwo; -// } - -// /** -// * Set a new region of text to be examined by this iterator -// * -// * @param text text buffer to examine -// * @param start offset into buffer -// * @param length maximum length to examine -// */ -// public void SetText(char[] text, int start, int length) -// { -// this.text = text; -// this.start = start; -// this.index = start; -// this.limit = start + length; -// this.scriptStart = start; -// this.scriptLimit = start; -// this.scriptCode = UScript.INVALID_CODE; -// } - -// /** linear fast-path for basic latin case */ -// private static readonly int[] basicLatin = new int[128]; - -// static ScriptIterator() -// { -// for (int i = 0; i < basicLatin.Length; i++) -// basicLatin[i] = UScript.GetScript(i); -// } - -// /** fast version of UScript.getScript(). Basic Latin is an array lookup */ -// private int GetScript(int codepoint) -// { -// if (0 <= codepoint && codepoint < basicLatin.Length) -// { -// return basicLatin[codepoint]; -// } -// else -// { -// //int script = UScript.GetScript(codepoint); -// if (combineCJ) -// { -// if (Regex.IsMatch(new string(Support.Character.ToChars(codepoint)), @"\p{IsHangulCompatibilityJamo}+|\p{IsHiragana}+|\p{IsKatakana}+")) -// //if (script == UScript.HAN || script == UScript.HIRAGANA || script == UScript.KATAKANA) -// { -// return UScript.JAPANESE; -// } -// else if (codepoint >= 0xFF10 && codepoint <= 0xFF19) -// { -// // when using CJK dictionary breaking, don't let full width numbers go to it, otherwise -// // they are treated as punctuation. we currently have no cleaner way to fix this! 
-// return UScript.LATIN; -// } -// else -// { -// return script; -// } -// } -// else -// { -// return script; -// } -// } -// } -// } -//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/ICU/TokenAttributes/ScriptAttribute.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/TokenAttributes/ScriptAttribute.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/TokenAttributes/ScriptAttribute.cs deleted file mode 100644 index abc1ae2..0000000 --- a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/TokenAttributes/ScriptAttribute.cs +++ /dev/null @@ -1,42 +0,0 @@ -// LUCENENET TODO: Port issues - missing dependencies - -//using Lucene.Net.Util; -//using System; -//using System.Collections.Generic; -//using System.Linq; -//using System.Text; -//using System.Threading.Tasks; - -//namespace Lucene.Net.Analysis.ICU.TokenAttributes -//{ -// /// <summary> -// /// This attribute stores the UTR #24 script value for a token of text. -// /// <para/> -// /// @lucene.experimental -// /// </summary> -// public interface IScriptAttribute : IAttribute -// { -// /** -// * Get the numeric code for this script value. -// * This is the constant value from {@link UScript}. -// * @return numeric code -// */ -// int Code { get; set; } -// ///** -// // * Set the numeric code for this script value. -// // * This is the constant value from {@link UScript}. -// // * @param code numeric code -// // */ -// //public void setCode(int code); -// /** -// * Get the full name. -// * @return UTR #24 full name. -// */ -// string GetName(); -// /** -// * Get the abbreviated name. -// * @return UTR #24 abbreviated name. 
-// */ -// string GetShortName(); -// } -//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/ICU/TokenAttributes/ScriptAttributeImpl.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/TokenAttributes/ScriptAttributeImpl.cs b/src/Lucene.Net.Analysis.ICU/Analysis/ICU/TokenAttributes/ScriptAttributeImpl.cs deleted file mode 100644 index f97ccf1..0000000 --- a/src/Lucene.Net.Analysis.ICU/Analysis/ICU/TokenAttributes/ScriptAttributeImpl.cs +++ /dev/null @@ -1,80 +0,0 @@ -// LUCENENET TODO: Port issues - missing dependencies - -//using Lucene.Net.Util; -//using System.Collections.Generic; -//using System.Linq; -//using System.Text; -//using System.Threading.Tasks; - -//namespace Lucene.Net.Analysis.ICU.TokenAttributes -//{ -// /// <summary> -// /// Implementation of <see cref="IScriptAttribute"/> that stores the script -// /// as an integer. -// /// <para/> -// /// @lucene.experimental -// /// </summary> -// public class ScriptAttribute : Attribute, IScriptAttribute, System.ICloneable -// { -// private int code = UScript.COMMON; - -// /** Initializes this attribute with <code>UScript.COMMON</code> */ -// public ScriptAttribute() { } - -// public virtual int Code -// { -// get { return code; } -// set { code = value; } -// } - -// public virtual string GetName() -// { -// return UScript.GetName(code); -// } - -// public virtual string GetShortName() -// { -// return UScript.GetShortName(code); -// } - -// public override void Clear() -// { -// code = UScript.COMMON; -// } - -// public override void CopyTo(IAttribute target) -// { -// ScriptAttribute t = (ScriptAttribute)target; -// t.Code = code; -// } - -// public override bool Equals(object other) -// { -// if (this == other) -// { -// return true; -// } - -// if (other is ScriptAttribute) -// { -// return ((ScriptAttribute)other).code == code; -// } - -// return false; -// } - -// public 
override int GetHashCode() -// { -// return code; -// } - -// public override void ReflectWith(IAttributeReflector reflector) -// { -// // when wordbreaking CJK, we use the 15924 code Japanese (Han+Hiragana+Katakana) to -// // mark runs of Chinese/Japanese. our use is correct (as for chinese Han is a subset), -// // but this is just to help prevent confusion. -// string name = code == UScript.JAPANESE ? "Chinese/Japanese" : GetName(); -// reflector.Reflect<IScriptAttribute>("script", name); -// } -// } -//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUFoldingFilter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUFoldingFilter.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUFoldingFilter.cs new file mode 100644 index 0000000..4ca8278 --- /dev/null +++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUFoldingFilter.cs @@ -0,0 +1,32 @@ +// LUCENENET TODO: Port issues - missing Normalizer2 dependency from icu.net + +//using Icu; +//using Lucene.Net.Support; +//using System; +//using System.Collections.Generic; +//using System.Linq; +//using System.Text; +//using System.Threading.Tasks; + +//namespace Lucene.Net.Analysis.ICU +//{ +// public sealed class ICUFoldingFilter : ICUNormalizer2Filter +// { +// private static readonly Normalizer2 normalizer; + +// /// <summary> +// /// Create a new ICUFoldingFilter on the specified input +// /// </summary> +// public ICUFoldingFilter(TokenStream input) +// : base(input, normalizer) +// { +// } + +// static ICUFoldingFilter() +// { +// normalizer = Normalizer2.GetInstance( +// typeof(ICUFoldingFilter).Assembly.FindAndGetManifestResourceStream(typeof(ICUFoldingFilter), "utr30.nrm"), +// "utr30", Normalizer2.Mode.COMPOSE); +// } +// } +//} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/1191c20d/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUFoldingFilterFactory.cs 
---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUFoldingFilterFactory.cs b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUFoldingFilterFactory.cs new file mode 100644 index 0000000..c25cf93 --- /dev/null +++ b/src/Lucene.Net.Analysis.ICU/Analysis/Icu/ICUFoldingFilterFactory.cs @@ -0,0 +1,31 @@ +// LUCENENET TODO: Port issues - missing Normalizer2 dependency from icu.net + +//using Lucene.Net.Analysis.Util; +//using System; +//using System.Collections.Generic; + +//namespace Lucene.Net.Analysis.ICU +//{ +// public class ICUFoldingFilterFactory : TokenFilterFactory, IMultiTermAwareComponent +// { +// /// <summary>Creates a new ICUFoldingFilterFactory</summary> +// public ICUFoldingFilterFactory(IDictionary<string, string> args) +// : base(args) +// { +// if (args.Count != 0) +// { +// throw new ArgumentException("Unknown parameters: " + args); +// } +// } + +// public override TokenStream Create(TokenStream input) +// { +// return new ICUFoldingFilter(input); +// } + +// public virtual AbstractAnalysisFactory GetMultiTermComponent() +// { +// return this; +// } +// } +//}
