// File: src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenFilter.cs
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;
using System;

namespace Lucene.Net.Analysis.NGram
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements. See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License. You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Turns every token of the wrapped stream into a series of n-grams that are
    /// anchored at one edge of the token, for each size from <c>minGram</c> to <c>maxGram</c>.
    /// <para>
    /// As of Lucene 4.4 this filter rejects <see cref="Side.BACK"/> (wrap the stream in
    /// <see cref="Reverse.ReverseStringFilter"/> before and after to get the same result),
    /// counts gram sizes in code points so supplementary characters are handled
    /// correctly, and no longer rewrites offsets.
    /// </para>
    /// </summary>
    public sealed class EdgeNGramTokenFilter : TokenFilter
    {
        public const Side DEFAULT_SIDE = Side.FRONT;
        public const int DEFAULT_MAX_GRAM_SIZE = 1;
        public const int DEFAULT_MIN_GRAM_SIZE = 1;

        /// <summary>
        /// The edge of the input token at which the n-grams are anchored.
        /// </summary>
        public enum Side
        {
            /// <summary>
            /// N-grams are taken from the start of the input. </summary>
            FRONT,

            /// <summary>
            /// N-grams are taken from the end of the input. </summary>
            [System.Obsolete]
            BACK,
        }

        /// <summary>
        /// Maps a side name (case-insensitive) to a <see cref="Side"/>.
        /// A name that cannot be parsed yields <see cref="Side.FRONT"/>.
        /// </summary>
        /// <param name="sideName"> textual name of the side </param>
        /// <returns> the parsed side, or <see cref="Side.FRONT"/> when parsing fails </returns>
        public static Side GetSide(string sideName)
        {
            Side parsed;
            return Enum.TryParse(sideName, true, out parsed) ? parsed : Side.FRONT;
        }

        private readonly LuceneVersion version;
        private readonly CharacterUtils charUtils;
        private readonly int minGram;
        private readonly int maxGram;
        private Side side;
        private char[] curTermBuffer;   // copy of the term currently being expanded; null = fetch the next token
        private int curTermLength;      // length of the current term in chars
        private int curCodePointCount;  // length of the current term in code points
        private int curGramSize;        // size (in code points) of the next gram to emit
        private int tokStart;
        private int tokEnd; // only used if the length changed before this filter
        private bool updateOffsets; // never if the length changed before this filter
        private int savePosIncr;        // position increment carried over to the first gram of a token
        private int savePosLen;

        private readonly ICharTermAttribute termAtt;
        private readonly IOffsetAttribute offsetAtt;
        private readonly IPositionIncrementAttribute posIncrAtt;
        private readonly IPositionLengthAttribute posLenAtt;

        /// <summary>
        /// Builds an <see cref="EdgeNGramTokenFilter"/> producing n-grams whose sizes lie in the given range.
        /// </summary>
        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
        /// <param name="input"> <see cref="TokenStream"/> supplying the tokens to expand </param>
        /// <param name="side"> the <see cref="Side"/> the n-grams are cut from </param>
        /// <param name="minGram"> smallest n-gram size to produce </param>
        /// <param name="maxGram"> largest n-gram size to produce </param>
        /// <exception cref="ArgumentException"> if <paramref name="side"/> is <see cref="Side.BACK"/> with a
        /// match version of 4.4 or later, if <paramref name="side"/> is not a defined value, if
        /// <paramref name="minGram"/> is below 1, or if it exceeds <paramref name="maxGram"/> </exception>
        [Obsolete]
        public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, Side side, int minGram, int maxGram)
            : base(input)
        {
            bool isLucene44OrLater = version.OnOrAfter(LuceneVersion.LUCENE_44);

            // The checks run in this exact order so that callers see the same exception as before.
            if (isLucene44OrLater && side == Side.BACK)
            {
                throw new ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4, use ReverseStringFilter up-front and afterward");
            }
            if (!Enum.IsDefined(typeof(Side), side))
            {
                throw new ArgumentException("sideLabel must be either front or back");
            }
            if (minGram < 1)
            {
                throw new ArgumentException("minGram must be greater than zero");
            }
            if (minGram > maxGram)
            {
                throw new ArgumentException("minGram must not be greater than maxGram");
            }

            this.version = version;
            this.charUtils = isLucene44OrLater
                ? CharacterUtils.GetInstance(version)
                : CharacterUtils.Java4Instance;
            this.minGram = minGram;
            this.maxGram = maxGram;
            this.side = side;

            this.termAtt = AddAttribute<ICharTermAttribute>();
            this.offsetAtt = AddAttribute<IOffsetAttribute>();
            this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            this.posLenAtt = AddAttribute<IPositionLengthAttribute>();
        }

        /// <summary>
        /// Builds an <see cref="EdgeNGramTokenFilter"/> producing n-grams whose sizes lie in the given range.
        /// </summary>
        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
        /// <param name="input"> <see cref="TokenStream"/> supplying the tokens to expand </param>
        /// <param name="sideLabel"> name of the <see cref="Side"/> the n-grams are cut from; resolved through <see cref="GetSide(string)"/> </param>
        /// <param name="minGram"> smallest n-gram size to produce </param>
        /// <param name="maxGram"> largest n-gram size to produce </param>
        [Obsolete]
        public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, string sideLabel, int minGram, int maxGram)
            : this(version, input, GetSide(sideLabel), minGram, maxGram)
        {
        }

        /// <summary>
        /// Builds a front-edge <see cref="EdgeNGramTokenFilter"/> producing n-grams whose sizes lie in the given range.
        /// </summary>
        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
        /// <param name="input"> <see cref="TokenStream"/> supplying the tokens to expand </param>
        /// <param name="minGram"> smallest n-gram size to produce </param>
        /// <param name="maxGram"> largest n-gram size to produce </param>
        public EdgeNGramTokenFilter(LuceneVersion version, TokenStream input, int minGram, int maxGram)
#pragma warning disable 612, 618
            : this(version, input, Side.FRONT, minGram, maxGram)
#pragma warning restore 612, 618
        {
        }

        /// <summary>
        /// Emits the next n-gram. Grams of one input token come out in increasing size;
        /// once the size range or the token length is exhausted the next input token is pulled.
        /// </summary>
        /// <returns> <c>true</c> if a gram was emitted, <c>false</c> once the input stream is exhausted </returns>
        public override sealed bool IncrementToken()
        {
            while (true)
            {
                if (curTermBuffer == null)
                {
                    if (!m_input.IncrementToken())
                    {
                        return false;
                    }
                    CaptureInputToken();
                }

                // A gram is only possible while the size is inside the configured range
                // and the token has at least that many code points.
                if (curGramSize <= maxGram && curGramSize <= curCodePointCount)
                {
                    EmitCurrentGram();
                    return true;
                }

                // Nothing more to produce from this token: move on to the next one.
                curTermBuffer = null;
            }
        }

        /// <summary>
        /// Snapshots the attributes of the token just read from the input so that
        /// grams can be generated from it after the attributes are cleared.
        /// </summary>
        private void CaptureInputToken()
        {
            curTermBuffer = (char[])termAtt.Buffer.Clone();
            curTermLength = termAtt.Length;
            curCodePointCount = charUtils.CodePointCount(termAtt.ToString());
            curGramSize = minGram;
            tokStart = offsetAtt.StartOffset;
            tokEnd = offsetAtt.EndOffset;

            // From 4.4 on offsets are never rewritten. Before that they are rewritten only when
            // start + length matches the end offset; otherwise the token is assumed to be a
            // synonym and its offsets are left alone.
#pragma warning disable 612, 618
            bool neverUpdateOffsets = version.OnOrAfter(LuceneVersion.LUCENE_44);
#pragma warning restore 612, 618
            updateOffsets = !neverUpdateOffsets && (tokStart + curTermLength) == tokEnd;

            // Accumulate (not assign): increments of tokens that produced no gram must not be lost.
            savePosIncr += posIncrAtt.PositionIncrement;
            savePosLen = posLenAtt.PositionLength;
        }

        /// <summary>
        /// Writes the gram of size <c>curGramSize</c> into the attributes and advances the size.
        /// </summary>
        private void EmitCurrentGram()
        {
            // Locate the gram's char range at the front or the back of the term.
            int start;
            if (side == Side.FRONT)
            {
                start = 0;
            }
            else
            {
                start = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, curTermLength, -curGramSize);
            }
            int end = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize);

            ClearAttributes();

            int startOffset = updateOffsets ? tokStart + start : tokStart;
            int endOffset = updateOffsets ? tokStart + end : tokEnd;
            offsetAtt.SetOffset(startOffset, endOffset);

            // Only the first gram of a token carries the position increment; the rest stack on it.
            bool isFirstGram = curGramSize == minGram;
            posIncrAtt.PositionIncrement = isFirstGram ? savePosIncr : 0;
            if (isFirstGram)
            {
                savePosIncr = 0;
            }

            posLenAtt.PositionLength = savePosLen;
            termAtt.CopyBuffer(curTermBuffer, start, end - start);
            curGramSize++;
        }

        /// <summary>
        /// Resets the wrapped stream and forgets the token in progress and any pending position increment.
        /// </summary>
        public override void Reset()
        {
            base.Reset();
            curTermBuffer = null;
            savePosIncr = 0;
        }
    }
}
// File: src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizer.cs
using Lucene.Net.Util;
using System.IO;

namespace Lucene.Net.Analysis.NGram
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements. See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License. You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Tokenizes the input from an edge into n-grams of given size(s).
    /// <para>
    /// This <see cref="Tokenizer"/> creates n-grams from the beginning edge or ending edge of an input token.
    /// </para>
    /// <para>As of Lucene 4.4, this tokenizer
    /// <list type="bullet">
    ///     <item>can handle <code>maxGram</code> larger than 1024 chars, but beware that this will result in increased memory usage</item>
    ///     <item>doesn't trim the input,</item>
    ///     <item>sets position increments equal to 1 instead of 1 for the first token and 0 for all other ones</item>
    ///     <item>doesn't support backward n-grams anymore.</item>
    ///     <item>supports <see cref="Util.CharTokenizer.IsTokenChar(int)"/> pre-tokenization,</item>
    ///     <item>correctly handles supplementary characters.</item>
    /// </list>
    /// </para>
    /// <para>Although <b style="color:red">highly</b> discouraged, it is still possible
    /// to use the old behavior through <see cref="Lucene43EdgeNGramTokenizer"/>.
    /// </para>
    /// </summary>
    public class EdgeNGramTokenizer : NGramTokenizer
    {
        public const int DEFAULT_MAX_GRAM_SIZE = 1;
        public const int DEFAULT_MIN_GRAM_SIZE = 1;

        /// <summary>
        /// Creates <see cref="EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
        /// </summary>
        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
        /// <param name="minGram"> the smallest n-gram to generate </param>
        /// <param name="maxGram"> the largest n-gram to generate </param>
        public EdgeNGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram)
            // NOTE(review): the trailing 'true' presumably switches NGramTokenizer to edge-only mode - confirm against the base class.
            : base(version, input, minGram, maxGram, true)
        {
        }

        /// <summary>
        /// Creates <see cref="EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
        /// </summary>
        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
        /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
        /// <param name="minGram"> the smallest n-gram to generate </param>
        /// <param name="maxGram"> the largest n-gram to generate </param>
        public EdgeNGramTokenizer(LuceneVersion version, AttributeSource.AttributeFactory factory, TextReader input, int minGram, int maxGram)
            : base(version, factory, input, minGram, maxGram, true)
        {
        }
    }
}

// File: src/Lucene.Net.Analysis.Common/Analysis/Ngram/EdgeNGramTokenizerFactory.cs
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
using System.IO;

namespace Lucene.Net.Analysis.NGram
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements. See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License. You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Creates new instances of <see cref="EdgeNGramTokenizer"/>.
    /// <code>
    /// <fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100">
    ///   <analyzer>
    ///     <tokenizer class="solr.EdgeNGramTokenizerFactory" minGramSize="1" maxGramSize="1"/>
    ///   </analyzer>
    /// </fieldType></code>
    /// </summary>
    public class EdgeNGramTokenizerFactory : TokenizerFactory
    {
        private readonly int maxGramSize;
        private readonly int minGramSize;
        private readonly string side;

        /// <summary>
        /// Creates a new <see cref="EdgeNGramTokenizerFactory"/> from the <c>minGramSize</c>,
        /// <c>maxGramSize</c> and <c>side</c> arguments.
        /// </summary>
        /// <param name="args"> configuration arguments </param>
        /// <exception cref="ArgumentException"> if arguments remain after the known ones were read </exception>
        public EdgeNGramTokenizerFactory(IDictionary<string, string> args) : base(args)
        {
            minGramSize = GetInt(args, "minGramSize", EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE);
            maxGramSize = GetInt(args, "maxGramSize", EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
            side = Get(args, "side", EdgeNGramTokenFilter.Side.FRONT.ToString());
            // NOTE(review): assumes GetInt/Get remove the keys they consume, so whatever is left is unrecognized - confirm in the base class.
            if (args.Count > 0)
            {
                // Name the leftover keys; concatenating the dictionary itself would only print its type name.
                throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
            }
        }

        /// <summary>
        /// Creates the tokenizer matching the configured Lucene match version: an
        /// <see cref="EdgeNGramTokenizer"/> from 4.4 on, a <see cref="Lucene43EdgeNGramTokenizer"/> before.
        /// </summary>
        /// <param name="factory"> attribute factory handed to the created tokenizer </param>
        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
        /// <exception cref="ArgumentException"> if the match version is 4.4 or later and the configured side is anything but front </exception>
        public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader input)
        {
#pragma warning disable 612, 618
            if (m_luceneMatchVersion.OnOrAfter(LuceneVersion.LUCENE_44))
#pragma warning restore 612, 618
            {
                // Only the front side is valid here. A successful parse is not enough:
                // "back" parses fine and must still be rejected.
                EdgeNGramTokenFilter.Side sideEnum;
                if (!Enum.TryParse(this.side, true, out sideEnum) || sideEnum != EdgeNGramTokenFilter.Side.FRONT)
                {
                    throw new System.ArgumentException(typeof(EdgeNGramTokenizer).Name + " does not support backward n-grams as of Lucene 4.4");
                }
                // Pass the caller's attribute factory through instead of dropping it.
                return new EdgeNGramTokenizer(m_luceneMatchVersion, factory, input, minGramSize, maxGramSize);
            }
            else
            {
#pragma warning disable 612, 618
                return new Lucene43EdgeNGramTokenizer(m_luceneMatchVersion, factory, input, side, minGramSize, maxGramSize);
#pragma warning restore 612, 618
            }
        }
    }
}

// File: src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43EdgeNGramTokenizer.cs
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;
using System;
using System.IO;

namespace Lucene.Net.Analysis.NGram
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements. See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License. You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Old version of <see cref="EdgeNGramTokenizer"/> which doesn't correctly handle
    /// supplementary characters.
    /// </summary>
    [Obsolete]
    public sealed class Lucene43EdgeNGramTokenizer : Tokenizer
    {
        public const Side DEFAULT_SIDE = Side.FRONT;
        public const int DEFAULT_MAX_GRAM_SIZE = 1;
        public const int DEFAULT_MIN_GRAM_SIZE = 1;

        private ICharTermAttribute termAtt;
        private IOffsetAttribute offsetAtt;
        private IPositionIncrementAttribute posIncrAtt;

        /// <summary>
        /// Specifies which side of the input the n-gram should be generated from </summary>
        public enum Side
        {
            /// <summary>
            /// Get the n-gram from the front of the input </summary>
            FRONT,

            /// <summary>
            /// Get the n-gram from the end of the input </summary>
            BACK,
        }

        /// <summary>
        /// Get the appropriate <see cref="Side"/> from a string (case-insensitive).
        /// A name that cannot be parsed yields <see cref="Side.FRONT"/>.
        /// </summary>
        public static Side GetSide(string sideName)
        {
            Side result;
            if (!Enum.TryParse(sideName, true, out result))
            {
                result = Side.FRONT;
            }
            return result;
        }

        private int minGram;
        private int maxGram;
        private int gramSize;
        private Side side;
        private bool started;
        private int inLen; // length of the input AFTER trim()
        private int charsRead; // length of the input
        private string inStr;


        /// <summary>
        /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
        /// </summary>
        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
        /// <param name="side"> the <see cref="Side"/> from which to chop off an n-gram </param>
        /// <param name="minGram"> the smallest n-gram to generate </param>
        /// <param name="maxGram"> the largest n-gram to generate </param>
        [Obsolete]
        public Lucene43EdgeNGramTokenizer(LuceneVersion version, TextReader input, Side side, int minGram, int maxGram)
            : base(input)
        {
            Init(version, side, minGram, maxGram);
        }

        /// <summary>
        /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
        /// </summary>
        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
        /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
        /// <param name="side"> the <see cref="Side"/> from which to chop off an n-gram </param>
        /// <param name="minGram"> the smallest n-gram to generate </param>
        /// <param name="maxGram"> the largest n-gram to generate </param>
        [Obsolete]
        public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, Side side, int minGram, int maxGram)
            : base(factory, input)
        {
            Init(version, side, minGram, maxGram);
        }

        /// <summary>
        /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
        /// </summary>
        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
        /// <param name="sideLabel"> the name of the <see cref="Side"/> from which to chop off an n-gram </param>
        /// <param name="minGram"> the smallest n-gram to generate </param>
        /// <param name="maxGram"> the largest n-gram to generate </param>
        [Obsolete]
        public Lucene43EdgeNGramTokenizer(LuceneVersion version, TextReader input, string sideLabel, int minGram, int maxGram)
            : this(version, input, GetSide(sideLabel), minGram, maxGram)
        {
        }

        /// <summary>
        /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
        /// </summary>
        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
        /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
        /// <param name="sideLabel"> the name of the <see cref="Side"/> from which to chop off an n-gram </param>
        /// <param name="minGram"> the smallest n-gram to generate </param>
        /// <param name="maxGram"> the largest n-gram to generate </param>
        [Obsolete]
        public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, string sideLabel, int minGram, int maxGram)
            : this(version, factory, input, GetSide(sideLabel), minGram, maxGram)
        {
        }

        /// <summary>
        /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
        /// </summary>
        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
        /// <param name="minGram"> the smallest n-gram to generate </param>
        /// <param name="maxGram"> the largest n-gram to generate </param>
        public Lucene43EdgeNGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram)
            : this(version, input, Side.FRONT, minGram, maxGram)
        {
        }

        /// <summary>
        /// Creates <see cref="Lucene43EdgeNGramTokenizer"/> that can generate n-grams in the sizes of the given range
        /// </summary>
        /// <param name="version"> the Lucene match version - See <see cref="LuceneVersion"/> </param>
        /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param>
        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
        /// <param name="minGram"> the smallest n-gram to generate </param>
        /// <param name="maxGram"> the largest n-gram to generate </param>
        public Lucene43EdgeNGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, int minGram, int maxGram)
            : this(version, factory, input, Side.FRONT, minGram, maxGram)
        {
        }

        /// <summary>
        /// Validates the arguments, stores the configuration and registers the attributes.
        /// Shared by all constructors.
        /// </summary>
        /// <exception cref="ArgumentException"> if <paramref name="side"/> is not a defined value,
        /// <paramref name="minGram"/> is below 1 or exceeds <paramref name="maxGram"/>, or
        /// <paramref name="side"/> is <see cref="Side.BACK"/> with a match version of 4.4 or later </exception>
        private void Init(LuceneVersion version, Side side, int minGram, int maxGram)
        {
            if (!Enum.IsDefined(typeof(Side), side))
            {
                throw new System.ArgumentException("sideLabel must be either front or back");
            }

            if (minGram < 1)
            {
                throw new System.ArgumentException("minGram must be greater than zero");
            }

            if (minGram > maxGram)
            {
                throw new System.ArgumentException("minGram must not be greater than maxGram");
            }

            if (version.OnOrAfter(LuceneVersion.LUCENE_44))
            {
                if (side == Side.BACK)
                {
                    throw new System.ArgumentException("Side.BACK is not supported anymore as of Lucene 4.4");
                }
            }
            else
            {
                // Before 4.4 the gram size is capped at 1024 chars.
                maxGram = Math.Min(maxGram, 1024);
            }

            this.minGram = minGram;
            this.maxGram = maxGram;
            this.side = side;
            this.termAtt = AddAttribute<ICharTermAttribute>();
            this.offsetAtt = AddAttribute<IOffsetAttribute>();
            this.posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
        }

        /// <summary>
        /// Advances to the next n-gram. The first call reads and trims the input; every call
        /// then emits the gram of the current size from the configured side.
        /// </summary>
        /// <returns> <c>true</c> if a gram was emitted, <c>false</c> at end of stream </returns>
        public override bool IncrementToken()
        {
            ClearAttributes();
            // if we are just starting, read the whole input
            if (!started)
            {
                started = true;
                gramSize = minGram;
                // Front grams never need more than maxGram chars; back grams need the (capped) whole input.
                int limit = side == Side.FRONT ? maxGram : 1024;
                char[] chars = new char[Math.Min(1024, limit)];
                charsRead = 0;
                // TODO: refactor to a shared readFully somewhere:
                bool exhausted = false;
                while (charsRead < limit)
                {
                    int inc = m_input.Read(chars, charsRead, chars.Length - charsRead);
                    if (inc <= 0)
                    {
                        exhausted = true;
                        break;
                    }
                    charsRead += inc;
                    if (charsRead == chars.Length && charsRead < limit)
                    {
                        chars = ArrayUtil.Grow(chars);
                    }
                }

                inStr = new string(chars, 0, charsRead);
                inStr = inStr.Trim();

                if (!exhausted)
                {
                    // Read extra throwaway chars so that on end() we
                    // report the correct offset:
                    var throwaway = new char[1024];
                    while (true)
                    {
                        int inc = m_input.Read(throwaway, 0, throwaway.Length);
                        if (inc <= 0)
                        {
                            break;
                        }
                        charsRead += inc;
                    }
                }

                inLen = inStr.Length;
                if (inLen == 0)
                {
                    return false;
                }
                posIncrAtt.PositionIncrement = 1;
            }
            else
            {
                posIncrAtt.PositionIncrement = 0;
            }

            // Stop once the size range is exhausted or the remaining input is too short for another gram.
            if (gramSize > maxGram || gramSize > inLen)
            {
                return false;
            }

            // grab gramSize chars from front or back
            int start = side == Side.FRONT ? 0 : inLen - gramSize;
            int end = start + gramSize;
            termAtt.SetEmpty().Append(inStr, start, end);
            offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(end));
            gramSize++;
            return true;
        }

        /// <summary>
        /// Sets the final offset to the corrected number of chars read from the input.
        /// </summary>
        public override void End()
        {
            base.End();
            // set final offset
            int finalOffset = CorrectOffset(charsRead);
            this.offsetAtt.SetOffset(finalOffset, finalOffset);
        }

        /// <summary>
        /// Resets the tokenizer so that the next <see cref="IncrementToken"/> call re-reads the input.
        /// </summary>
        public override void Reset()
        {
            base.Reset();
            started = false;
        }
    }
}

// File: src/Lucene.Net.Analysis.Common/Analysis/Ngram/Lucene43NGramTokenizer.cs
using Lucene.Net.Analysis.TokenAttributes;
using System;
using System.IO;

namespace Lucene.Net.Analysis.NGram
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements. See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License. You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Old broken version of <see cref="NGramTokenizer"/>.
    /// </summary>
    [Obsolete]
    public sealed class Lucene43NGramTokenizer : Tokenizer
    {
        public const int DEFAULT_MIN_NGRAM_SIZE = 1;
        public const int DEFAULT_MAX_NGRAM_SIZE = 2;

        private int minGram, maxGram;
        private int gramSize;
        private int pos;
        private int inLen; // length of the input AFTER trim()
        private int charsRead; // length of the input
        private string inStr;
        private bool started;

        private ICharTermAttribute termAtt;
        private IOffsetAttribute offsetAtt;

        /// <summary>
        /// Creates <see cref="Lucene43NGramTokenizer"/> with given min and max n-grams. </summary>
        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
        /// <param name="minGram"> the smallest n-gram to generate </param>
        /// <param name="maxGram"> the largest n-gram to generate </param>
        public Lucene43NGramTokenizer(TextReader input, int minGram, int maxGram)
            : base(input)
        {
            Init(minGram, maxGram);
        }

        /// <summary>
        /// Creates <see cref="Lucene43NGramTokenizer"/> with given min and max n-grams. </summary>
        /// <param name="factory"> <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/> to use </param>
        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
        /// <param name="minGram"> the smallest n-gram to generate </param>
        /// <param name="maxGram"> the largest n-gram to generate </param>
        public Lucene43NGramTokenizer(AttributeFactory factory, TextReader input, int minGram, int maxGram)
            : base(factory, input)
        {
            Init(minGram, maxGram);
        }

        /// <summary>
        /// Creates <see cref="Lucene43NGramTokenizer"/> with default min and max n-grams. </summary>
        /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param>
        public Lucene43NGramTokenizer(TextReader input)
            : this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE)
        {
        }

        /// <summary>
        /// Validates the gram range, stores it and registers the attributes. Shared by all constructors.
        /// </summary>
        /// <exception cref="ArgumentException"> if <paramref name="minGram"/> is below 1 or exceeds <paramref name="maxGram"/> </exception>
        private void Init(int minGram, int maxGram)
        {
            if (minGram < 1)
            {
                throw new System.ArgumentException("minGram must be greater than zero");
            }
            if (minGram > maxGram)
            {
                throw new System.ArgumentException("minGram must not be greater than maxGram");
            }
            this.minGram = minGram;
            this.maxGram = maxGram;
            termAtt = AddAttribute<ICharTermAttribute>();
            offsetAtt = AddAttribute<IOffsetAttribute>();
        }

        /// <summary>
        /// Advances to the next n-gram. The first call reads up to 1024 chars of input and trims
        /// them; grams are then emitted position by position for each size from minGram to maxGram.
        /// </summary>
        /// <returns> <c>true</c> if a gram was emitted, <c>false</c> at end of stream </returns>
        public override bool IncrementToken()
        {
            ClearAttributes();
            if (!started)
            {
                started = true;
                gramSize = minGram;
                char[] chars = new char[1024];
                charsRead = 0;
                // TODO: refactor to a shared readFully somewhere:
                while (charsRead < chars.Length)
                {
                    int inc = m_input.Read(chars, charsRead, chars.Length - charsRead);
                    // TextReader.Read returns 0 (not -1) at end of input; testing for -1 only
                    // would loop forever on any input shorter than the buffer.
                    if (inc <= 0)
                    {
                        break;
                    }
                    charsRead += inc;
                }
                inStr = (new string(chars, 0, charsRead)).Trim(); // remove any leading/trailing whitespace

                if (charsRead == chars.Length)
                {
                    // Read extra throwaway chars so that on end() we
                    // report the correct offset:
                    var throwaway = new char[1024];
                    while (true)
                    {
                        int inc = m_input.Read(throwaway, 0, throwaway.Length);
                        if (inc <= 0) // end of input, see above
                        {
                            break;
                        }
                        charsRead += inc;
                    }
                }

                inLen = inStr.Length;
                if (inLen == 0)
                {
                    return false;
                }
            }

            if (pos + gramSize > inLen) // if we hit the end of the string
            {
                pos = 0; // reset to beginning of string
                gramSize++; // increase n-gram size
                if (gramSize > maxGram) // we are done
                {
                    return false;
                }
                if (pos + gramSize > inLen)
                {
                    return false;
                }
            }

            int oldPos = pos;
            pos++;
            termAtt.SetEmpty().Append(inStr, oldPos, oldPos + gramSize);
            offsetAtt.SetOffset(CorrectOffset(oldPos), CorrectOffset(oldPos + gramSize));
            return true;
        }

        /// <summary>
        /// Sets the final offset to the corrected number of chars read from the input.
        /// </summary>
        public override void End()
        {
            base.End();
            // set final offset
            int finalOffset = CorrectOffset(charsRead);
            this.offsetAtt.SetOffset(finalOffset, finalOffset);
        }

        /// <summary>
        /// Resets the tokenizer so that the next <see cref="IncrementToken"/> call re-reads the input
        /// and starts again at position 0.
        /// </summary>
        public override void Reset()
        {
            base.Reset();
            started = false;
            pos = 0;
        }
    }
}

// File: src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramFilterFactory.cs
using Lucene.Net.Analysis.Util;
using System.Collections.Generic;

namespace Lucene.Net.Analysis.NGram
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements. See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License. You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Factory for <see cref="NGramTokenFilter"/>.
    /// <code>
    /// <fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100">
    ///   <analyzer>
    ///     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    ///     <filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="2"/>
    ///   </analyzer>
    /// </fieldType></code>
    /// </summary>
    public class NGramFilterFactory : TokenFilterFactory
    {
        private readonly int maxGramSize;
        private readonly int minGramSize;

        /// <summary>
        /// Creates a new <see cref="NGramFilterFactory"/> from the <c>minGramSize</c> and
        /// <c>maxGramSize</c> arguments.
        /// </summary>
        /// <param name="args"> configuration arguments </param>
        /// <exception cref="System.ArgumentException"> if arguments remain after the known ones were read </exception>
        public NGramFilterFactory(IDictionary<string, string> args)
            : base(args)
        {
            minGramSize = GetInt(args, "minGramSize", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE);
            maxGramSize = GetInt(args, "maxGramSize", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE);
            // NOTE(review): assumes GetInt removes the keys it consumes, so whatever is left is unrecognized - confirm in the base class.
            if (args.Count > 0)
            {
                // Name the leftover keys; concatenating the dictionary itself would only print its type name.
                throw new System.ArgumentException("Unknown parameters: " + string.Join(", ", args.Keys));
            }
        }

        /// <summary>
        /// Wraps <paramref name="input"/> in an <see cref="NGramTokenFilter"/> using the configured gram sizes.
        /// </summary>
        public override TokenStream Create(TokenStream input)
        {
            return new NGramTokenFilter(m_luceneMatchVersion, input, minGramSize, maxGramSize);
        }
    }
}

// File: src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenFilter.cs
using Lucene.Net.Analysis.Miscellaneous;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;

namespace Lucene.Net.Analysis.NGram
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements. See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - /// <summary> - /// Tokenizes the input into n-grams of the given size(s). - /// <para>You must specify the required <see cref="LuceneVersion"/> compatibility when - /// creating a <see cref="NGramTokenFilter"/>. As of Lucene 4.4, this token filters: - /// <list type="bullet"> - /// <item>handles supplementary characters correctly,</item> - /// <item>emits all n-grams for the same token at the same position,</item> - /// <item>does not modify offsets,</item> - /// <item>sorts n-grams by their offset in the original token first, then - /// increasing length (meaning that "abc" will give "a", "ab", "abc", "b", "bc", - /// "c").</item> - /// </list> - /// </para> - /// <para>You can make this filter use the old behavior by providing a version < - /// <see cref="LuceneVersion.LUCENE_44"/> in the constructor but this is not recommended as - /// it will lead to broken <see cref="TokenStream"/>s that will cause highlighting - /// bugs. - /// </para> - /// <para>If you were using this <see cref="TokenFilter"/> to perform partial highlighting, - /// this won't work anymore since this filter doesn't update offsets. You should - /// modify your analysis chain to use <see cref="NGramTokenizer"/>, and potentially - /// override <see cref="NGramTokenizer.IsTokenChar(int)"/> to perform pre-tokenization. 
- /// </para> - /// </summary> - public sealed class NGramTokenFilter : TokenFilter - { - public const int DEFAULT_MIN_NGRAM_SIZE = 1; - public const int DEFAULT_MAX_NGRAM_SIZE = 2; - - private readonly int minGram, maxGram; - - private char[] curTermBuffer; - private int curTermLength; - private int curCodePointCount; - private int curGramSize; - private int curPos; - private int curPosInc, curPosLen; - private int tokStart; - private int tokEnd; - private bool hasIllegalOffsets; // only if the length changed before this filter - - private readonly LuceneVersion version; - private readonly CharacterUtils charUtils; - private readonly ICharTermAttribute termAtt; - private readonly IPositionIncrementAttribute posIncAtt; - private readonly IPositionLengthAttribute posLenAtt; - private readonly IOffsetAttribute offsetAtt; - - /// <summary> - /// Creates <see cref="NGramTokenFilter"/> with given min and max n-grams. </summary> - /// <param name="version"> Lucene version to enable correct position increments. - /// See <see cref="NGramTokenFilter"/> for details. </param> - /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param> - /// <param name="minGram"> the smallest n-gram to generate </param> - /// <param name="maxGram"> the largest n-gram to generate </param> - public NGramTokenFilter(LuceneVersion version, TokenStream input, int minGram, int maxGram) - : base(new CodepointCountFilter(version, input, minGram, int.MaxValue)) - { - this.version = version; - this.charUtils = version.OnOrAfter( -#pragma warning disable 612, 618 - LuceneVersion.LUCENE_44) ? 
-#pragma warning restore 612, 618 - CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance; - if (minGram < 1) - { - throw new System.ArgumentException("minGram must be greater than zero"); - } - if (minGram > maxGram) - { - throw new System.ArgumentException("minGram must not be greater than maxGram"); - } - this.minGram = minGram; - this.maxGram = maxGram; -#pragma warning disable 612, 618 - if (version.OnOrAfter(LuceneVersion.LUCENE_44)) -#pragma warning restore 612, 618 - { - posIncAtt = AddAttribute<IPositionIncrementAttribute>(); - posLenAtt = AddAttribute<IPositionLengthAttribute>(); - } - else - { - posIncAtt = new PositionIncrementAttributeAnonymousInnerClassHelper(this); - posLenAtt = new PositionLengthAttributeAnonymousInnerClassHelper(this); - } - termAtt = AddAttribute<ICharTermAttribute>(); - offsetAtt = AddAttribute<IOffsetAttribute>(); - } - - private class PositionIncrementAttributeAnonymousInnerClassHelper : PositionIncrementAttribute - { - private readonly NGramTokenFilter outerInstance; - - public PositionIncrementAttributeAnonymousInnerClassHelper(NGramTokenFilter outerInstance) - { - this.outerInstance = outerInstance; - } - - public override int PositionIncrement - { - set - { - } - get - { - return 0; - } - } - } - - private class PositionLengthAttributeAnonymousInnerClassHelper : PositionLengthAttribute - { - private readonly NGramTokenFilter outerInstance; - - public PositionLengthAttributeAnonymousInnerClassHelper(NGramTokenFilter outerInstance) - { - this.outerInstance = outerInstance; - } - - public override int PositionLength - { - set - { - } - get - { - return 0; - } - } - } - - /// <summary> - /// Creates <see cref="NGramTokenFilter"/> with default min and max n-grams. </summary> - /// <param name="version"> Lucene version to enable correct position increments. - /// See <see cref="NGramTokenFilter"/> for details. 
</param> - /// <param name="input"> <see cref="TokenStream"/> holding the input to be tokenized </param> - public NGramTokenFilter(LuceneVersion version, TokenStream input) - : this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE) - { - } - - /// <summary> - /// Returns the next token in the stream, or null at EOS. - /// </summary> - public override sealed bool IncrementToken() - { - while (true) - { - if (curTermBuffer == null) - { - if (!m_input.IncrementToken()) - { - return false; - } - else - { - curTermBuffer = (char[])termAtt.Buffer.Clone(); - curTermLength = termAtt.Length; - curCodePointCount = charUtils.CodePointCount(termAtt.ToString()); - curGramSize = minGram; - curPos = 0; - curPosInc = posIncAtt.PositionIncrement; - curPosLen = posLenAtt.PositionLength; - tokStart = offsetAtt.StartOffset; - tokEnd = offsetAtt.EndOffset; - // if length by start + end offsets doesn't match the term text then assume - // this is a synonym and don't adjust the offsets. - hasIllegalOffsets = (tokStart + curTermLength) != tokEnd; - } - } -#pragma warning disable 612, 618 - if (version.OnOrAfter(LuceneVersion.LUCENE_44)) -#pragma warning restore 612, 618 - { - if (curGramSize > maxGram || (curPos + curGramSize) > curCodePointCount) - { - ++curPos; - curGramSize = minGram; - } - if ((curPos + curGramSize) <= curCodePointCount) - { - ClearAttributes(); - int start = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curPos); - int end = charUtils.OffsetByCodePoints(curTermBuffer, 0, curTermLength, start, curGramSize); - termAtt.CopyBuffer(curTermBuffer, start, end - start); - posIncAtt.PositionIncrement = curPosInc; - curPosInc = 0; - posLenAtt.PositionLength = curPosLen; - offsetAtt.SetOffset(tokStart, tokEnd); - curGramSize++; - return true; - } - } - else - { - while (curGramSize <= maxGram) - { - while (curPos + curGramSize <= curTermLength) // while there is input - { - ClearAttributes(); - termAtt.CopyBuffer(curTermBuffer, curPos, 
curGramSize); - if (hasIllegalOffsets) - { - offsetAtt.SetOffset(tokStart, tokEnd); - } - else - { - offsetAtt.SetOffset(tokStart + curPos, tokStart + curPos + curGramSize); - } - curPos++; - return true; - } - curGramSize++; // increase n-gram size - curPos = 0; - } - } - curTermBuffer = null; - } - } - - public override void Reset() - { - base.Reset(); - curTermBuffer = null; - } - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizer.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizer.cs deleted file mode 100644 index b1845c8..0000000 --- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizer.cs +++ /dev/null @@ -1,319 +0,0 @@ -using Lucene.Net.Analysis.TokenAttributes; -using Lucene.Net.Analysis.Util; -using Lucene.Net.Support; -using Lucene.Net.Util; -using System; -using System.Diagnostics; -using System.IO; - -namespace Lucene.Net.Analysis.NGram -{ - /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - /// <summary> - /// Tokenizes the input into n-grams of the given size(s). - /// <para>On the contrary to <see cref="NGramTokenFilter"/>, this class sets offsets so - /// that characters between startOffset and endOffset in the original stream are - /// the same as the term chars. - /// </para> - /// <para>For example, "abcde" would be tokenized as (minGram=2, maxGram=3): - /// <list type="table"> - /// <listheader> - /// <term>Term</term> - /// <term>Position increment</term> - /// <term>Position length</term> - /// <term>Offsets</term> - /// </listheader> - /// <item> - /// <term>ab</term> - /// <term>1</term> - /// <term>1</term> - /// <term>[0,2[</term> - /// </item> - /// <item> - /// <term>abc</term> - /// <term>1</term> - /// <term>1</term> - /// <term>[0,3[</term> - /// </item> - /// <item> - /// <term>bc</term> - /// <term>1</term> - /// <term>1</term> - /// <term>[1,3[</term> - /// </item> - /// <item> - /// <term>bcd</term> - /// <term>1</term> - /// <term>1</term> - /// <term>[1,4[</term> - /// </item> - /// <item> - /// <term>cd</term> - /// <term>1</term> - /// <term>1</term> - /// <term>[2,4[</term> - /// </item> - /// <item> - /// <term>cde</term> - /// <term>1</term> - /// <term>1</term> - /// <term>[2,5[</term> - /// </item> - /// <item> - /// <term>de</term> - /// <term>1</term> - /// <term>1</term> - /// <term>[3,5[</term> - /// </item> - /// </list> - /// </para> - /// <para>This tokenizer changed a lot in Lucene 4.4 in order to: - /// <list type="bullet"> - /// <item>tokenize in a streaming fashion to support streams which are larger - /// than 1024 chars (limit of the previous version),</item> - /// <item>count grams based on unicode code points instead of java chars (and - /// never split in the middle of surrogate pairs),</item> - /// <item>give the ability to pre-tokenize the stream (<see cref="IsTokenChar(int)"/>) - /// before computing n-grams.</item> - /// </list> - /// </para> - /// <para>Additionally, this class doesn't trim 
trailing whitespaces and emits - /// tokens in a different order, tokens are now emitted by increasing start - /// offsets while they used to be emitted by increasing lengths (which prevented - /// from supporting large input streams). - /// </para> - /// <para>Although <b style="color:red">highly</b> discouraged, it is still possible - /// to use the old behavior through <see cref="Lucene43NGramTokenizer"/>. - /// </para> - /// </summary> - // non-sealed to allow for overriding IsTokenChar, but all other methods should be sealed - public class NGramTokenizer : Tokenizer - { - public const int DEFAULT_MIN_NGRAM_SIZE = 1; - public const int DEFAULT_MAX_NGRAM_SIZE = 2; - - private CharacterUtils charUtils; - private CharacterUtils.CharacterBuffer charBuffer; - private int[] buffer; // like charBuffer, but converted to code points - private int bufferStart, bufferEnd; // remaining slice in buffer - private int offset; - private int gramSize; - private int minGram, maxGram; - private bool exhausted; - private int lastCheckedChar; // last offset in the buffer that we checked - private int lastNonTokenChar; // last offset that we found to not be a token char - private bool edgesOnly; // leading edges n-grams only - - private ICharTermAttribute termAtt; - private IPositionIncrementAttribute posIncAtt; - private IPositionLengthAttribute posLenAtt; - private IOffsetAttribute offsetAtt; - - internal NGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram, bool edgesOnly) - : base(input) - { - Init(version, minGram, maxGram, edgesOnly); - } - - /// <summary> - /// Creates <see cref="NGramTokenizer"/> with given min and max n-grams. 
</summary> - /// <param name="version"> the lucene compatibility version </param> - /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param> - /// <param name="minGram"> the smallest n-gram to generate </param> - /// <param name="maxGram"> the largest n-gram to generate </param> - public NGramTokenizer(LuceneVersion version, TextReader input, int minGram, int maxGram) - : this(version, input, minGram, maxGram, false) - { - } - - internal NGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, int minGram, int maxGram, bool edgesOnly) - : base(factory, input) - { - Init(version, minGram, maxGram, edgesOnly); - } - - /// <summary> - /// Creates <see cref="NGramTokenizer"/> with given min and max n-grams. </summary> - /// <param name="version"> the lucene compatibility version </param> - /// <param name="factory"> <see cref="AttributeSource.AttributeFactory"/> to use </param> - /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param> - /// <param name="minGram"> the smallest n-gram to generate </param> - /// <param name="maxGram"> the largest n-gram to generate </param> - public NGramTokenizer(LuceneVersion version, AttributeFactory factory, TextReader input, int minGram, int maxGram) - : this(version, factory, input, minGram, maxGram, false) - { - } - - /// <summary> - /// Creates <see cref="NGramTokenizer"/> with default min and max n-grams. 
</summary> - /// <param name="version"> the lucene compatibility version </param> - /// <param name="input"> <see cref="TextReader"/> holding the input to be tokenized </param> - public NGramTokenizer(LuceneVersion version, TextReader input) - : this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE) - { - } - - private void Init(LuceneVersion version, int minGram, int maxGram, bool edgesOnly) - { -#pragma warning disable 612, 618 - if (!version.OnOrAfter(LuceneVersion.LUCENE_44)) -#pragma warning restore 612, 618 - { - throw new System.ArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer/Lucene43EdgeNGramTokenizer"); - } -#pragma warning disable 612, 618 - charUtils = version.OnOrAfter(LuceneVersion.LUCENE_44) ? -#pragma warning restore 612, 618 - CharacterUtils.GetInstance(version) : CharacterUtils.Java4Instance; - if (minGram < 1) - { - throw new System.ArgumentException("minGram must be greater than zero"); - } - if (minGram > maxGram) - { - throw new System.ArgumentException("minGram must not be greater than maxGram"); - } - termAtt = AddAttribute<ICharTermAttribute>(); - posIncAtt = AddAttribute<IPositionIncrementAttribute>(); - posLenAtt = AddAttribute<IPositionLengthAttribute>(); - offsetAtt = AddAttribute<IOffsetAttribute>(); - this.minGram = minGram; - this.maxGram = maxGram; - this.edgesOnly = edgesOnly; - charBuffer = CharacterUtils.NewCharacterBuffer(2 * maxGram + 1024); // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering to not keep polling the Reader - buffer = new int[charBuffer.Buffer.Length]; - - // Make the term att large enough - termAtt.ResizeBuffer(2 * maxGram); - } - - public override sealed bool IncrementToken() - { - ClearAttributes(); - - // termination of this loop is guaranteed by the fact that every iteration - // either advances the buffer (calls consumes()) or increases gramSize - while (true) - { - // 
compact - if (bufferStart >= bufferEnd - maxGram - 1 && !exhausted) - { - Array.Copy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart); - bufferEnd -= bufferStart; - lastCheckedChar -= bufferStart; - lastNonTokenChar -= bufferStart; - bufferStart = 0; - - // fill in remaining space - exhausted = !charUtils.Fill(charBuffer, m_input, buffer.Length - bufferEnd); - // convert to code points - bufferEnd += charUtils.ToCodePoints(charBuffer.Buffer, 0, charBuffer.Length, buffer, bufferEnd); - } - - // should we go to the next offset? - if (gramSize > maxGram || (bufferStart + gramSize) > bufferEnd) - { - if (bufferStart + 1 + minGram > bufferEnd) - { - Debug.Assert(exhausted); - return false; - } - Consume(); - gramSize = minGram; - } - - UpdateLastNonTokenChar(); - - // retry if the token to be emitted was going to not only contain token chars - bool termContainsNonTokenChar = lastNonTokenChar >= bufferStart && lastNonTokenChar < (bufferStart + gramSize); - bool isEdgeAndPreviousCharIsTokenChar = edgesOnly && lastNonTokenChar != bufferStart - 1; - if (termContainsNonTokenChar || isEdgeAndPreviousCharIsTokenChar) - { - Consume(); - gramSize = minGram; - continue; - } - - int length = charUtils.ToChars(buffer, bufferStart, gramSize, termAtt.Buffer, 0); - termAtt.Length = length; - posIncAtt.PositionIncrement = 1; - posLenAtt.PositionLength = 1; - offsetAtt.SetOffset(CorrectOffset(offset), CorrectOffset(offset + length)); - ++gramSize; - return true; - } - } - - private void UpdateLastNonTokenChar() - { - int termEnd = bufferStart + gramSize - 1; - if (termEnd > lastCheckedChar) - { - for (int i = termEnd; i > lastCheckedChar; --i) - { - if (!IsTokenChar(buffer[i])) - { - lastNonTokenChar = i; - break; - } - } - lastCheckedChar = termEnd; - } - } - - /// <summary> - /// Consume one code point. 
</summary> - private void Consume() - { - offset += Character.CharCount(buffer[bufferStart++]); - } - - /// <summary> - /// Only collect characters which satisfy this condition. </summary> - protected virtual bool IsTokenChar(int chr) - { - return true; - } - - public override sealed void End() - { - base.End(); - Debug.Assert(bufferStart <= bufferEnd); - int endOffset = offset; - for (int i = bufferStart; i < bufferEnd; ++i) - { - endOffset += Character.CharCount(buffer[i]); - } - endOffset = CorrectOffset(endOffset); - // set final offset - offsetAtt.SetOffset(endOffset, endOffset); - } - - public override sealed void Reset() - { - base.Reset(); - bufferStart = bufferEnd = buffer.Length; - lastNonTokenChar = lastCheckedChar = bufferStart - 1; - offset = 0; - gramSize = minGram; - exhausted = false; - charBuffer.Reset(); - } - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizerFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizerFactory.cs b/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizerFactory.cs deleted file mode 100644 index cf25b65..0000000 --- a/src/Lucene.Net.Analysis.Common/Analysis/Ngram/NGramTokenizerFactory.cs +++ /dev/null @@ -1,70 +0,0 @@ -using Lucene.Net.Analysis.Util; -using Lucene.Net.Util; -using System.Collections.Generic; -using System.IO; - -namespace Lucene.Net.Analysis.NGram -{ - /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - /// <summary> - /// Factory for <see cref="NGramTokenizer"/>. - /// <code> - /// <fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100"> - /// <analyzer> - /// <tokenizer class="solr.NGramTokenizerFactory" minGramSize="1" maxGramSize="2"/> - /// </analyzer> - /// </fieldType></code> - /// </summary> - public class NGramTokenizerFactory : TokenizerFactory - { - private readonly int maxGramSize; - private readonly int minGramSize; - - /// <summary> - /// Creates a new <see cref="NGramTokenizerFactory"/> </summary> - public NGramTokenizerFactory(IDictionary<string, string> args) - : base(args) - { - minGramSize = GetInt(args, "minGramSize", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE); - maxGramSize = GetInt(args, "maxGramSize", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE); - if (args.Count > 0) - { - throw new System.ArgumentException("Unknown parameters: " + args); - } - } - - /// <summary> - /// Creates the <see cref="TokenStream"/> of n-grams from the given <see cref="TextReader"/> and <see cref="AttributeSource.AttributeFactory"/>. 
</summary> - public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader input) - { -#pragma warning disable 612, 618 - if (m_luceneMatchVersion.OnOrAfter(LuceneVersion.LUCENE_44)) -#pragma warning restore 612, 618 - { - return new NGramTokenizer(m_luceneMatchVersion, factory, input, minGramSize, maxGramSize); - } - else - { -#pragma warning disable 612, 618 - return new Lucene43NGramTokenizer(factory, input, minGramSize, maxGramSize); -#pragma warning restore 612, 618 - } - } - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/ab81d913/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/EdgeNGramTokenFilterTest.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/EdgeNGramTokenFilterTest.cs b/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/EdgeNGramTokenFilterTest.cs new file mode 100644 index 0000000..ea6fbd7 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.Common/Analysis/NGram/EdgeNGramTokenFilterTest.cs @@ -0,0 +1,390 @@ +using Lucene.Net.Analysis.Core; +using Lucene.Net.Analysis.Miscellaneous; +using Lucene.Net.Analysis.Shingle; +using Lucene.Net.Analysis.TokenAttributes; +using Lucene.Net.Support; +using Lucene.Net.Util; +using NUnit.Framework; +using System; +using System.IO; + +namespace Lucene.Net.Analysis.NGram +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Tests <seealso cref="EdgeNGramTokenFilter"/> for correctness. + /// </summary> + public class EdgeNGramTokenFilterTest : BaseTokenStreamTestCase + { + private TokenStream input; + + public override void SetUp() + { + base.SetUp(); + input = new MockTokenizer(new StringReader("abcde"), MockTokenizer.WHITESPACE, false); + } + + [Test] + public virtual void TestInvalidInput() + { + bool gotException = false; + try + { +#pragma warning disable 612, 618 + new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 0, 0); +#pragma warning restore 612, 618 + } + catch (System.ArgumentException) + { + gotException = true; + } + assertTrue(gotException); + } + + [Test] + public virtual void TestInvalidInput2() + { + bool gotException = false; + try + { +#pragma warning disable 612, 618 + new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 2, 1); +#pragma warning restore 612, 618 + } + catch (System.ArgumentException) + { + gotException = true; + } + assertTrue(gotException); + } + + [Test] + public virtual void TestInvalidInput3() + { + bool gotException = false; + try + { +#pragma warning disable 612, 618 + new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, -1, 2); +#pragma warning restore 612, 618 + } + catch (System.ArgumentException) + { + gotException = true; + } + assertTrue(gotException); + } + + [Test] + public virtual void TestFrontUnigram() + { +#pragma warning disable 612, 618 + EdgeNGramTokenFilter tokenizer = new 
EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 1, 1);
#pragma warning restore 612, 618
            AssertTokenStreamContents(tokenizer, new string[] { "a" }, new int[] { 0 }, new int[] { 5 });
        }

        /// <summary>
        /// A single 1-gram taken from the back edge of the shared <c>input</c> stream.
        /// Uses <see cref="LuceneVersion.LUCENE_43"/> because, per the filter's own documentation,
        /// <see cref="EdgeNGramTokenFilter.Side.BACK"/> is not supported as of Lucene 4.4.
        /// </summary>
        [Test]
        public virtual void TestBackUnigram()
        {
#pragma warning disable 612, 618
            EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, input, EdgeNGramTokenFilter.Side.BACK, 1, 1);
#pragma warning restore 612, 618
            AssertTokenStreamContents(tokenizer, new string[] { "e" }, new int[] { 4 }, new int[] { 5 });
        }

        /// <summary>
        /// With min and max gram size both set to 6, no tokens are expected from the shared <c>input</c> stream.
        /// </summary>
        [Test]
        public virtual void TestOversizedNgrams()
        {
#pragma warning disable 612, 618
            EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 6, 6);
#pragma warning restore 612, 618
            AssertTokenStreamContents(tokenizer, new string[0], new int[0], new int[0]);
        }

        /// <summary>
        /// Front-edge grams of sizes 1..3; every gram is expected to carry the offsets 0..5.
        /// </summary>
        [Test]
        public virtual void TestFrontRangeOfNgrams()
        {
#pragma warning disable 612, 618
            EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
#pragma warning restore 612, 618
            AssertTokenStreamContents(tokenizer, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 5, 5, 5 });
        }

        /// <summary>
        /// Back-edge grams of sizes 1..3 under <see cref="LuceneVersion.LUCENE_43"/>;
        /// here the expected start offsets move (4, 3, 2) while the end offset stays at 5.
        /// </summary>
        [Test]
        public virtual void TestBackRangeOfNgrams()
        {
#pragma warning disable 612, 618
            EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, input, EdgeNGramTokenFilter.Side.BACK, 1, 3);
#pragma warning restore 612, 618
            AssertTokenStreamContents(tokenizer, new string[] { "e", "de", "cde" }, new int[] { 4, 3, 2 }, new int[] { 5, 5, 5 }, null, null, null, null, false);
        }

        /// <summary>
        /// Two whitespace-separated input tokens: the first gram of each token is expected to have
        /// position increment 1 and the remaining grams of the same token position increment 0.
        /// </summary>
        [Test]
        public virtual void TestFilterPositions()
        {
            TokenStream ts = new MockTokenizer(new StringReader("abcde vwxyz"), MockTokenizer.WHITESPACE, false);
#pragma warning disable 612, 618
            EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, ts, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
#pragma warning restore 612, 618
            AssertTokenStreamContents(tokenizer, new string[] { "a", "ab", "abc", "v", "vw", "vwx" }, new int[] { 0, 0, 0, 6, 6, 6 }, new int[] { 5, 5, 5, 11, 11, 11 }, null, new int[] { 1, 0, 0, 1, 0, 0 }, null, null, false);
        }

        /// <summary>
        /// Test helper filter: leaves the first token of the stream untouched and forces the
        /// position increment of every following token to 0.
        /// </summary>
        private class PositionFilter : TokenFilter
        {

            internal readonly IPositionIncrementAttribute posIncrAtt;

            // True once the first token since the last Reset() has been passed through.
            internal bool started;

            internal PositionFilter(TokenStream input) : base(input)
            {
                posIncrAtt = AddAttribute<IPositionIncrementAttribute>();
            }

            /// <summary>
            /// Advances the wrapped stream; for every token except the first one the
            /// position increment is overwritten with 0.
            /// </summary>
            /// <returns><c>true</c> if the wrapped stream produced a token, otherwise <c>false</c>.</returns>
            public override sealed bool IncrementToken()
            {
                if (m_input.IncrementToken())
                {
                    if (started)
                    {
                        posIncrAtt.PositionIncrement = 0;
                    }
                    else
                    {
                        started = true;
                    }
                    return true;
                }
                else
                {
                    return false;
                }
            }

            /// <summary>
            /// Resets the base filter and clears <see cref="started"/> so the next token is treated as the first again.
            /// </summary>
            public override void Reset()
            {
                base.Reset();
                started = false;
            }
        }

        /// <summary>
        /// The first emitted gram must have a position increment of 1 even when the input token
        /// it was derived from arrived with a position increment of 0.
        /// </summary>
        [Test]
        public virtual void TestFirstTokenPositionIncrement()
        {
            TokenStream ts = new MockTokenizer(new StringReader("a abc"), MockTokenizer.WHITESPACE, false);
            ts = new PositionFilter(ts); // All but first token will get 0 position increment
#pragma warning disable 612, 618
            EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, ts, EdgeNGramTokenFilter.Side.FRONT, 2, 3);
#pragma warning restore 612, 618
            // The first token "a" will not be output, since it's smaller than the mingram size of 2.
            // The second token on input to EdgeNGramTokenFilter will have position increment of 0,
            // which should be increased to 1, since this is the first output token in the stream.
            AssertTokenStreamContents(filter, new string[] { "ab", "abc" }, new int[] { 2, 2 }, new int[] { 5, 5 }, new int[] { 1, 0 });
        }

        /// <summary>
        /// A token shorter than the minimum gram size ("de" with min 3) in the middle of the stream
        /// yields no gram; the tokens around it are still expected in the output.
        /// </summary>
        [Test]
        public virtual void TestSmallTokenInStream()
        {
            input = new MockTokenizer(new StringReader("abc de fgh"), MockTokenizer.WHITESPACE, false);
#pragma warning disable 612, 618
            EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, input, EdgeNGramTokenFilter.Side.FRONT, 3, 3);
#pragma warning restore 612, 618
            AssertTokenStreamContents(tokenizer, new string[] { "abc", "fgh" }, new int[] { 0, 7 }, new int[] { 3, 10 });
        }

        /// <summary>
        /// The same filter instance must produce identical output after the underlying tokenizer
        /// has been given a fresh reader.
        /// </summary>
        [Test]
        public virtual void TestReset()
        {
            WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
#pragma warning disable 612, 618
            EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
#pragma warning restore 612, 618
            AssertTokenStreamContents(filter, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 5, 5, 5 });
            tokenizer.SetReader(new StringReader("abcde"));
            AssertTokenStreamContents(filter, new string[] { "a", "ab", "abc" }, new int[] { 0, 0, 0 }, new int[] { 5, 5, 5 });
        }

        // LUCENE-3642
        // EdgeNgram blindly adds term length to offset, but this can take things out of bounds
        // wrt original text if a previous filter increases the length of the word (in this case æ -> ae)
        // so in this case we behave like WDF, and preserve any modified offsets
        [Test]
        public virtual void TestInvalidOffsets()
        {
            Analyzer analyzer = new AnalyzerAnonymousInnerClassHelper(this);
            AssertAnalyzesTo(analyzer, "mosfellsbær", new string[] { "mo", "mos", "mosf", "mosfe", "mosfel", "mosfell", "mosfells", "mosfellsb", "mosfellsba", "mosfellsbae", "mosfellsbaer" }, new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 });
        }

        /// <summary>
        /// Analyzer for <see cref="TestInvalidOffsets"/>: whitespace tokenizer, then ASCII folding,
        /// then front-edge grams of sizes 2..15 under <see cref="LuceneVersion.LUCENE_43"/>.
        /// </summary>
        private class AnalyzerAnonymousInnerClassHelper : Analyzer
        {
            private readonly EdgeNGramTokenFilterTest outerInstance;

            public AnalyzerAnonymousInnerClassHelper(EdgeNGramTokenFilterTest outerInstance)
            {
                this.outerInstance = outerInstance;
            }

            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
                TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
#pragma warning disable 612, 618
                filters = new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, filters, EdgeNGramTokenFilter.Side.FRONT, 2, 15);
#pragma warning restore 612, 618
                return new TokenStreamComponents(tokenizer, filters);
            }
        }

        /// <summary>
        /// blast some random strings through the analyzer </summary>
        [Test]
        public virtual void TestRandomStrings()
        {
            for (int i = 0; i < 10; i++)
            {
                int min = TestUtil.NextInt(Random(), 2, 10);
                int max = TestUtil.NextInt(Random(), min, 20);

                Analyzer a = new AnalyzerAnonymousInnerClassHelper2(this, min, max);
                CheckRandomData(Random(), a, 100 * RANDOM_MULTIPLIER);
            }

            Analyzer b = new AnalyzerAnonymousInnerClassHelper3(this);
            CheckRandomData(Random(), b, 1000 * RANDOM_MULTIPLIER, 20, false, false);
        }

        /// <summary>
        /// Analyzer for <see cref="TestRandomStrings"/>: whitespace tokenizer followed by an
        /// <see cref="EdgeNGramTokenFilter"/> with the gram-size range passed to the constructor.
        /// </summary>
        private class AnalyzerAnonymousInnerClassHelper2 : Analyzer
        {
            private readonly EdgeNGramTokenFilterTest outerInstance;

            // Gram-size bounds; assigned once in the constructor, hence readonly.
            private readonly int min;
            private readonly int max;

            public AnalyzerAnonymousInnerClassHelper2(EdgeNGramTokenFilterTest outerInstance, int min, int max)
            {
                this.outerInstance = outerInstance;
                this.min = min;
                this.max = max;
            }

            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
                return new TokenStreamComponents(tokenizer, new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, min, max));
            }
        }

        /// <summary>
        /// Analyzer for <see cref="TestRandomStrings"/>: whitespace tokenizer followed by back-edge
        /// grams of sizes 2..4 under <see cref="LuceneVersion.LUCENE_43"/>.
        /// </summary>
        private class AnalyzerAnonymousInnerClassHelper3 : Analyzer
        {
            private readonly EdgeNGramTokenFilterTest outerInstance;

            public AnalyzerAnonymousInnerClassHelper3(EdgeNGramTokenFilterTest outerInstance)
            {
                this.outerInstance = outerInstance;
            }

            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
#pragma warning disable 612, 618
                return new TokenStreamComponents(tokenizer, new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, tokenizer, EdgeNGramTokenFilter.Side.BACK, 2, 4));
#pragma warning restore 612, 618
            }
        }

        /// <summary>
        /// Runs the consistency check on the empty string for both a front-edge and a back-edge analyzer.
        /// </summary>
        [Test]
        public virtual void TestEmptyTerm()
        {
            Random random = Random();
            Analyzer a = new AnalyzerAnonymousInnerClassHelper4(this);
            CheckAnalysisConsistency(random, a, random.nextBoolean(), "");

            Analyzer b = new AnalyzerAnonymousInnerClassHelper5(this);
            CheckAnalysisConsistency(random, b, random.nextBoolean(), "");
        }

        /// <summary>
        /// Analyzer for <see cref="TestEmptyTerm"/>: keyword tokenizer followed by front-edge grams of sizes 2..15.
        /// </summary>
        private class AnalyzerAnonymousInnerClassHelper4 : Analyzer
        {
            private readonly EdgeNGramTokenFilterTest outerInstance;

            public AnalyzerAnonymousInnerClassHelper4(EdgeNGramTokenFilterTest outerInstance)
            {
                this.outerInstance = outerInstance;
            }

            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer tokenizer = new KeywordTokenizer(reader);
#pragma warning disable 612, 618
                return new TokenStreamComponents(tokenizer, new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, EdgeNGramTokenFilter.Side.FRONT, 2, 15));
#pragma warning restore 612, 618
            }
        }

        /// <summary>
        /// Analyzer for <see cref="TestEmptyTerm"/>: keyword tokenizer followed by back-edge grams of
        /// sizes 2..15 under <see cref="LuceneVersion.LUCENE_43"/>.
        /// </summary>
        private class AnalyzerAnonymousInnerClassHelper5 : Analyzer
        {
            private readonly EdgeNGramTokenFilterTest outerInstance;

            public AnalyzerAnonymousInnerClassHelper5(EdgeNGramTokenFilterTest outerInstance)
            {
                this.outerInstance = outerInstance;
            }

            protected internal override TokenStreamComponents CreateComponents(string fieldName, TextReader reader)
            {
                Tokenizer tokenizer = new KeywordTokenizer(reader);
#pragma warning disable 612, 618
                return new TokenStreamComponents(tokenizer, new EdgeNGramTokenFilter(LuceneVersion.LUCENE_43, tokenizer, EdgeNGramTokenFilter.Side.BACK, 2, 15));
#pragma warning restore 612, 618
            }
        }

        /// <summary>
        /// Feeds the output of a <see cref="ShingleFilter"/> into the edge n-gram filter (sizes 7..10)
        /// and checks terms, offsets, position increments, position lengths and the final offset.
        /// </summary>
        [Test]
        public virtual void TestGraphs()
        {
            TokenStream tk = new LetterTokenizer(TEST_VERSION_CURRENT, new StringReader("abc d efgh ij klmno p q"));
            tk = new ShingleFilter(tk);
            tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, 7, 10);
            AssertTokenStreamContents(tk, new string[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" }, new int[] { 6, 11, 11, 14 }, new int[] { 13, 19, 19, 21 }, new int[] { 3, 1, 0, 1 }, new int[] { 2, 2, 2, 2 }, 23);
        }

        /// <summary>
        /// For a random Unicode string, every emitted gram must be the prefix of the input that
        /// contains exactly <c>i</c> code points, and must carry the offsets of the whole input.
        /// </summary>
        [Test]
        public virtual void TestSupplementaryCharacters()
        {
            string s = TestUtil.RandomUnicodeString(Random(), 10);
            int codePointCount = s.CodePointCount(0, s.Length);
            int minGram = TestUtil.NextInt(Random(), 1, 3);
            int maxGram = TestUtil.NextInt(Random(), minGram, 10);
            TokenStream tk = new KeywordTokenizer(new StringReader(s));
            tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
            try
            {
                ICharTermAttribute termAtt = tk.AddAttribute<ICharTermAttribute>();
                IOffsetAttribute offsetAtt = tk.AddAttribute<IOffsetAttribute>();
                tk.Reset();
                for (int i = minGram; i <= Math.Min(codePointCount, maxGram); ++i)
                {
                    assertTrue(tk.IncrementToken());
                    assertEquals(0, offsetAtt.StartOffset);
                    assertEquals(s.Length, offsetAtt.EndOffset);
                    // Translate "i code points" into a UTF-16 index so surrogate pairs are never split.
                    int end = Character.OffsetByCodePoints(s, 0, i);
                    assertEquals(s.Substring(0, end), termAtt.ToString());
                }
                assertFalse(tk.IncrementToken());
                // Complete the consumer workflow: signal end-of-stream before releasing the stream.
                tk.End();
            }
            finally
            {
                // This stream is consumed by hand here rather than through AssertTokenStreamContents,
                // so it has to be released explicitly.
                tk.Dispose();
            }
        }
    }
}
