/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace Lucene.Net.Analysis.Hunspell {
    /// <summary>
    ///   HunspellStemmer uses the affix rules declared in the HunspellDictionary to generate one or
    ///   more stems for a word. It conforms to the algorithm in the original hunspell algorithm,
    ///   including recursive suffix stripping.
    /// </summary>
    /// <author>Chris Male</author>
    public class HunspellStemmer {
        // ApplyAffix only recurses back into Stem while the current depth is below this cap.
        private const Int32 RECURSION_CAP = 2;
        private readonly HunspellDictionary _dictionary;

        /// <summary>
        ///   Constructs a new HunspellStemmer which will use the provided HunspellDictionary
        ///   to create its stems.
        /// </summary>
        /// <param name="dictionary">HunspellDictionary that will be used to create the stems.</param>
        /// <exception cref="ArgumentNullException"><paramref name="dictionary"/> is <c>null</c>.</exception>
        public HunspellStemmer(HunspellDictionary dictionary) {
            if (dictionary == null) throw new ArgumentNullException("dictionary");
            _dictionary = dictionary;
        }

        /// <summary>
        ///   Find the stem(s) of the provided word. The word itself is reported first when the
        ///   dictionary contains it, followed by every stem produced by affix stripping.
        /// </summary>
        /// <param name="word">Word to find the stems for.</param>
        /// <returns>List of stems for the word.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="word"/> is <c>null</c>.</exception>
        public IEnumerable<HunspellStem> Stem(String word) {
            if (word == null) throw new ArgumentNullException("word");

            var stems = new List<HunspellStem>();
            if (_dictionary.LookupWord(word) != null)
                stems.Add(new HunspellStem(word));

            stems.AddRange(Stem(word, null, 0));
            return stems;
        }

        /// <summary>
        ///   Find the unique stem(s) of the provided word. Stems whose text was already
        ///   reported are dropped; the first occurrence wins.
        /// </summary>
        /// <param name="word">Word to find the stems for.</param>
        /// <returns>List of stems for the word.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="word"/> is <c>null</c>.</exception>
        public IEnumerable<HunspellStem> UniqueStems(String word) {
            if (word == null) throw new ArgumentNullException("word");

            var stems = new List<HunspellStem>();
            // Tracks the stem texts already emitted.
            var terms = new CharArraySet(8, false);
            if (_dictionary.LookupWord(word) != null) {
                stems.Add(new HunspellStem(word));
                terms.Add(word);
            }

            var otherStems = Stem(word, null, 0);
            foreach (var s in otherStems) {
                if (!terms.Contains(s.Stem)) {
                    stems.Add(s);
                    terms.Add(s.Stem);
                }
            }

            return stems;
        }

        /// <summary>
        ///   Generates a list of stems for the provided word.
        /// </summary>
        /// <param name="word">Word to generate the stems for.</param>
        /// <param name="flags">Flags from a previous stemming step that need to be cross-checked with any affixes in this recursive step.</param>
        /// <param name="recursionDepth">Level of recursion this stemming step is at.</param>
        /// <returns>List of stems, or an empty list if no stems are found.</returns>
        private IEnumerable<HunspellStem> Stem(String word, Char[] flags, Int32 recursionDepth) {
            if (word == null) throw new ArgumentNullException("word");

            var stems = new List<HunspellStem>();
            var chars = word.ToCharArray();
            var length = word.Length;

            // Suffix pass: look up every trailing slice of the word, longest slice first.
            for (var i = 0; i < length; i++) {
                var suffixes = _dictionary.LookupSuffix(chars, i, length - i);
                if (suffixes != null) {
                    foreach (var suffix in suffixes) {
                        if (HasCrossCheckedFlag(suffix.Flag, flags)) {
                            var deAffixedLength = length - suffix.Append.Length;

                            // Remove the appended text and restore what the rule stripped.
                            // TODO: can we do this in-place?
                            var strippedWord = new StringBuilder()
                                .Append(word, 0, deAffixedLength)
                                .Append(suffix.Strip)
                                .ToString();

                            var stemList = ApplyAffix(strippedWord, suffix, recursionDepth);
                            foreach (var stem in stemList) {
                                stem.AddSuffix(suffix);
                            }

                            stems.AddRange(stemList);
                        }
                    }
                }
            }

            // Prefix pass: look up every leading slice of the word, longest slice first.
            for (var i = length - 1; i >= 0; i--) {
                var prefixes = _dictionary.LookupPrefix(chars, 0, i);
                if (prefixes != null) {
                    foreach (var prefix in prefixes) {
                        if (HasCrossCheckedFlag(prefix.Flag, flags)) {
                            var deAffixedStart = prefix.Append.Length;
                            var deAffixedLength = length - deAffixedStart;

                            var strippedWord = new StringBuilder()
                                .Append(prefix.Strip)
                                .Append(word, deAffixedStart, deAffixedLength)
                                .ToString();

                            var stemList = ApplyAffix(strippedWord, prefix, recursionDepth);
                            foreach (var stem in stemList) {
                                stem.AddPrefix(prefix);
                            }

                            stems.AddRange(stemList);
                        }
                    }
                }
            }

            return stems;
        }

        /// <summary>
        ///   Applies the affix rule to the given word, producing a list of stems if any are found.
        /// </summary>
        /// <param name="strippedWord">Word the affix has been removed and the strip added.</param>
        /// <param name="affix">HunspellAffix representing the affix rule itself.</param>
        /// <param name="recursionDepth">Level of recursion this stemming step is at.</param>
        /// <returns>List of stems for the word, or an empty list if none are found.</returns>
        /// <exception cref="ArgumentNullException">
        ///   <paramref name="strippedWord"/> or <paramref name="affix"/> is <c>null</c>.
        /// </exception>
        public IEnumerable<HunspellStem> ApplyAffix(String strippedWord, HunspellAffix affix, Int32 recursionDepth) {
            if (strippedWord == null) throw new ArgumentNullException("strippedWord");
            if (affix == null) throw new ArgumentNullException("affix");

            if (!affix.CheckCondition(strippedWord)) {
                return new List<HunspellStem>();
            }

            var words = _dictionary.LookupWord(strippedWord);
            if (words == null) {
                return new List<HunspellStem>();
            }

            var stems = new List<HunspellStem>();

            foreach (var hunspellWord in words) {
                if (hunspellWord.HasFlag(affix.Flag)) {
                    if (affix.IsCrossProduct && recursionDepth < RECURSION_CAP) {
                        // Pass depth + 1 without mutating the parameter: every dictionary entry
                        // in this loop must be evaluated at the same recursion depth.
                        var recursiveStems = Stem(strippedWord, affix.AppendFlags, recursionDepth + 1);
                        if (recursiveStems.Any()) {
                            stems.AddRange(recursiveStems);
                        } else {
                            stems.Add(new HunspellStem(strippedWord));
                        }
                    } else {
                        stems.Add(new HunspellStem(strippedWord));
                    }
                }
            }

            return stems;
        }

        /// <summary>
        ///   Checks if the given flag cross checks with the given array of flags.
        /// </summary>
        /// <param name="flag">Flag to cross check with the array of flags.</param>
        /// <param name="flags">Array of flags to cross check against.  Can be <c>null</c>.</param>
        /// <returns><c>true</c> if the flag is found in the array or the array is <c>null</c>, <c>false</c> otherwise.</returns>
        private static Boolean HasCrossCheckedFlag(Char flag, Char[] flags) {
            // NOTE(review): Array.BinarySearch requires a sorted array; this presumes the
            // dictionary stores affix append flags in ascending order - confirm.
            return flags == null || Array.BinarySearch(flags, flag) >= 0;
        }
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.Linq;

namespace Lucene.Net.Analysis.Hunspell {
    /// <summary>
    ///   A dictionary word, represented here only by the set of flags attached to it.
    /// </summary>
    public class HunspellWord {
        // The array handed to the constructor is stored as-is (not copied).
        private readonly Char[] _flags;

        /// <summary>
        ///   Creates a new HunspellWord with no associated flags.
        /// </summary>
        public HunspellWord() : this(new Char[0]) {
        }

        /// <summary>
        ///   Constructs a new HunspellWord with the given flags.
        /// </summary>
        /// <param name="flags">Flags to associate with the word.</param>
        /// <exception cref="ArgumentNullException"><paramref name="flags"/> is <c>null</c>.</exception>
        public HunspellWord(Char[] flags) {
            if (flags == null) {
                throw new ArgumentNullException("flags");
            }

            _flags = flags;
        }

        /// <summary>
        ///   Checks whether the word has the given flag associated with it.
        /// </summary>
        /// <param name="flag">Flag to check whether it is associated with the word.</param>
        /// <returns><c>true</c> if the flag is associated, <c>false</c> otherwise</returns>
        public Boolean HasFlag(Char flag) {
            var position = Array.IndexOf(_flags, flag);
            return position >= 0;
        }
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

namespace Lucene.Net.Analysis.Miscellaneous
{
    /// <summary>
    /// A token stream that never produces a token.
    /// </summary>
    public sealed class EmptyTokenStream : TokenStream
    {
        /// <summary>
        /// Always reports that the stream is exhausted.
        /// </summary>
        /// <returns>Always <c>false</c>.</returns>
        public override sealed bool IncrementToken()
        {
            return false;
        }

        /// <summary>
        /// Intentionally empty: this class declares no state of its own to release.
        /// </summary>
        protected override void Dispose(bool disposing)
        {
        }
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;

namespace Lucene.Net.Analysis.Miscellaneous
{
    /// <summary>
    /// A <see cref="PrefixAwareTokenFilter"/> whose suffix-token update step is supplied
    /// by the caller as a delegate instead of by subclassing.
    /// </summary>
    public class InjectablePrefixAwareTokenFilter : PrefixAwareTokenFilter
    {
        /// <summary>
        /// Creates the filter over the given prefix and suffix streams.
        /// </summary>
        /// <param name="prefix">Stream passed to the base class as the prefix.</param>
        /// <param name="suffix">Stream passed to the base class as the suffix.</param>
        public InjectablePrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) : base(prefix, suffix)
        {
        }

        /// <summary>
        /// Delegate invoked as <c>UpdateAction(suffixToken, lastPrefixToken)</c> to produce the
        /// updated suffix token. When left <c>null</c>, the base class behaviour is used.
        /// </summary>
        public Func<Token, Token, Token> UpdateAction { get; set; }

        /// <summary>
        /// Updates a suffix token using <see cref="UpdateAction"/>, or the base implementation
        /// when no action has been assigned.
        /// </summary>
        /// <param name="suffixToken">Token from the suffix stream.</param>
        /// <param name="lastPrefixToken">Last token seen on the prefix stream.</param>
        /// <returns>The token returned by the action (or by the base implementation).</returns>
        public override Token UpdateSuffixToken(Token suffixToken, Token lastPrefixToken)
        {
            // Read the property once so a concurrent reassignment cannot slip in between
            // the null check and the invocation.
            var action = UpdateAction;
            if (action == null)
            {
                // Previously this dereferenced a null delegate and threw NullReferenceException.
                return base.UpdateSuffixToken(suffixToken, lastPrefixToken);
            }

            return action(suffixToken, lastPrefixToken);
        }
    }
}
/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
*/

using System;
using System.Collections.Generic;
using System.IO;
using System.Text.RegularExpressions;
using Lucene.Net.Analysis.Tokenattributes;
using Version = Lucene.Net.Util.Version;

namespace Lucene.Net.Analysis.Miscellaneous
{
    /// <summary>
    /// Analyzer that splits a string into terms using a regular expression as the
    /// delimiter, optionally lower-casing the terms and dropping stop words.
    /// <para>
    /// It prefers to operate on a String rather than a TextReader: the reader overload
    /// first reads the whole input into a string. Two delimiter patterns,
    /// <see cref="NON_WORD_PATTERN"/> and <see cref="WHITESPACE_PATTERN"/>, are handled by
    /// a hand-written character scanner instead of the regex engine.
    /// </para>
    /// <para>
    /// The resulting stream can be wrapped by further token filters, for example:
    /// </para>
    /// <code>
    /// PatternAnalyzer pat = ...
    /// TokenStream tokenStream = new SnowballFilter(
    ///     pat.TokenStream("content", "James is running round in the woods"),
    ///     "English");
    /// </code>
    /// </summary>
    public class PatternAnalyzer : Analyzer
    {
        /// <summary>
        /// <c>"\\W+"</c>; divides text at non-word characters. Note that the fast path used
        /// for this pattern keeps only characters for which <c>char.IsLetter</c> is true.
        /// </summary>
        public static readonly Regex NON_WORD_PATTERN = new Regex("\\W+", RegexOptions.Compiled);

        /// <summary>
        /// <c>"\\s+"</c>; divides text at whitespace (fast path: <c>char.IsWhiteSpace</c>).
        /// </summary>
        public static readonly Regex WHITESPACE_PATTERN = new Regex("\\s+", RegexOptions.Compiled);

        // NOTE: the static fields below depend on textual initialization order; the two
        // patterns above and this set must stay declared before the analyzers that use them.
        private static readonly CharArraySet EXTENDED_ENGLISH_STOP_WORDS =
            CharArraySet.UnmodifiableSet(new CharArraySet((IEnumerable<string>)new[]{
                "a", "about", "above", "across", "adj", "after", "afterwards",
                "again", "against", "albeit", "all", "almost", "alone", "along",
                "already", "also", "although", "always", "among", "amongst", "an",
                "and", "another", "any", "anyhow", "anyone", "anything",
                "anywhere", "are", "around", "as", "at", "be", "became", "because",
                "become", "becomes", "becoming", "been", "before", "beforehand",
                "behind", "being", "below", "beside", "besides", "between",
                "beyond", "both", "but", "by", "can", "cannot", "co", "could",
                "down", "during", "each", "eg", "either", "else", "elsewhere",
                "enough", "etc", "even", "ever", "every", "everyone", "everything",
                "everywhere", "except", "few", "first", "for", "former",
                "formerly", "from", "further", "had", "has", "have", "he", "hence",
                "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers",
                "herself", "him", "himself", "his", "how", "however", "i", "ie", "if",
                "in", "inc", "indeed", "into", "is", "it", "its", "itself", "last",
                "latter", "latterly", "least", "less", "ltd", "many", "may", "me",
                "meanwhile", "might", "more", "moreover", "most", "mostly", "much",
                "must", "my", "myself", "namely", "neither", "never",
                "nevertheless", "next", "no", "nobody", "none", "noone", "nor",
                "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
                // Was the single entry "once one" (missing comma), which no token could match.
                "once", "one", "only", "onto", "or", "other", "others", "otherwise",
                "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps",
                "rather", "s", "same", "seem", "seemed", "seeming", "seems",
                "several", "she", "should", "since", "so", "some", "somehow",
                "someone", "something", "sometime", "sometimes", "somewhere",
                "still", "such", "t", "than", "that", "the", "their", "them",
                "themselves", "then", "thence", "there", "thereafter", "thereby",
                "therefor", "therein", "thereupon", "these", "they", "this",
                "those", "though", "through", "throughout", "thru", "thus", "to",
                "together", "too", "toward", "towards", "under", "until", "up",
                "upon", "us", "very", "via", "was", "we", "well", "were", "what",
                "whatever", "whatsoever", "when", "whence", "whenever",
                "whensoever", "where", "whereafter", "whereas", "whereat",
                "whereby", "wherefrom", "wherein", "whereinto", "whereof",
                "whereon", "whereto", "whereunto", "whereupon", "wherever",
                "wherewith", "whether", "which", "whichever", "whichsoever",
                "while", "whilst", "whither", "who", "whoever", "whole", "whom",
                "whomever", "whomsoever", "whose", "whosoever", "why", "will",
                "with", "within", "without", "would", "xsubj", "xcal", "xauthor",
                // Was "xother " with a trailing space, which no token could match.
                "xother", "xnote", "yet", "you", "your", "yours", "yourself",
                "yourselves"
            }, true));

        /// <summary>
        /// A lower-casing word analyzer using <c>StopAnalyzer.ENGLISH_STOP_WORDS_SET</c>.
        /// </summary>
        public static readonly PatternAnalyzer DEFAULT_ANALYZER = new PatternAnalyzer(
            Version.LUCENE_CURRENT, NON_WORD_PATTERN, true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);

        /// <summary>
        /// A lower-casing word analyzer with <b>extended</b> English stop words.
        /// The stop words are borrowed from
        /// http://thomas.loc.gov/home/stopwords.html, see
        /// http://thomas.loc.gov/home/all.about.inquery.html
        /// </summary>
        public static readonly PatternAnalyzer EXTENDED_ANALYZER = new PatternAnalyzer(
            Version.LUCENE_CURRENT, NON_WORD_PATTERN, true, EXTENDED_ENGLISH_STOP_WORDS);

        private readonly Regex Regex;
        private readonly bool toLowerCase;
        private readonly ISet<string> stopWords;

        private readonly Version matchVersion;

        /// <summary>
        /// Constructs a new instance with the given parameters.
        /// </summary>
        /// <param name="matchVersion">
        /// Passed to <c>StopFilter.GetEnablePositionIncrementsVersionDefault</c> when a
        /// stop filter has to be created.
        /// </param>
        /// <param name="Regex">A regular expression delimiting tokens.</param>
        /// <param name="toLowerCase">If <c>true</c>, tokens are lower-cased.</param>
        /// <param name="stopWords">
        /// If non-null and non-empty, tokens contained in this set are dropped (after
        /// lower-casing, if applicable). An empty set is treated like <c>null</c>.
        /// </param>
        /// <exception cref="ArgumentNullException"><paramref name="Regex"/> is <c>null</c>.</exception>
        public PatternAnalyzer(Version matchVersion, Regex Regex, bool toLowerCase, ISet<string> stopWords)
        {
            if (Regex == null)
                throw new ArgumentNullException("Regex", "Regex must not be null");

            // Canonicalize equivalent patterns to the shared instances so that
            // TokenStream(String, String) can pick its fast path by reference comparison.
            if (EqRegex(NON_WORD_PATTERN, Regex)) Regex = NON_WORD_PATTERN;
            else if (EqRegex(WHITESPACE_PATTERN, Regex)) Regex = WHITESPACE_PATTERN;

            if (stopWords != null && stopWords.Count == 0) stopWords = null;

            this.Regex = Regex;
            this.toLowerCase = toLowerCase;
            this.stopWords = stopWords;
            this.matchVersion = matchVersion;
        }

        /// <summary>
        /// Creates a token stream that tokenizes the given string into token terms
        /// (aka words).
        /// </summary>
        /// <param name="fieldName">The name of the field to tokenize (currently ignored).</param>
        /// <param name="text">The string to tokenize.</param>
        /// <returns>A new token stream.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="text"/> is <c>null</c>.</exception>
        public TokenStream TokenStream(String fieldName, String text)
        {
            // Ideally the Analyzer superclass should have a method with the same signature,
            // with a default impl that simply delegates to the StringReader flavour.
            if (text == null)
                throw new ArgumentNullException("text", "text must not be null");

            TokenStream stream;
            if (Regex == NON_WORD_PATTERN)
            { // fast path
                stream = new FastStringTokenizer(text, true, toLowerCase, stopWords);
            }
            else if (Regex == WHITESPACE_PATTERN)
            { // fast path
                stream = new FastStringTokenizer(text, false, toLowerCase, stopWords);
            }
            else
            {
                stream = new RegexTokenizer(text, Regex, toLowerCase);
                if (stopWords != null) stream = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), stream, stopWords);
            }

            return stream;
        }

        /// <summary>
        /// Creates a token stream that tokenizes all the text in the given reader.
        /// This implementation reads the whole reader into a string and forwards to
        /// <c>TokenStream(String, String)</c>; the reader is disposed in the process,
        /// except for the internal string-backed reader whose string is used directly.
        /// </summary>
        /// <param name="fieldName">The name of the field to tokenize (currently ignored).</param>
        /// <param name="reader">The reader delivering the text.</param>
        /// <returns>A new token stream.</returns>
        public override TokenStream TokenStream(String fieldName, TextReader reader)
        {
            var fastReader = reader as FastStringReader;
            if (fastReader != null)
            { // fast path
                return TokenStream(fieldName, fastReader.GetString());
            }

            try
            {
                String text = ToString(reader);
                return TokenStream(fieldName, text);
            }
            catch (IOException e)
            {
                // Exception type and message kept as-is for callers that already handle them.
                throw new Exception("Wrapped Exception", e);
            }
        }

        /// <summary>
        /// Indicates whether some other object is "equal to" this one: same lower-casing
        /// setting, equivalent pattern (text and options) and equal stop word sets.
        /// </summary>
        /// <param name="other">The reference object with which to compare.</param>
        /// <returns><c>true</c> if equal, <c>false</c> otherwise.</returns>
        public override bool Equals(Object other)
        {
            if (ReferenceEquals(this, other)) return true;
            if (ReferenceEquals(this, DEFAULT_ANALYZER) && ReferenceEquals(other, EXTENDED_ANALYZER)) return false;
            if (ReferenceEquals(other, DEFAULT_ANALYZER) && ReferenceEquals(this, EXTENDED_ANALYZER)) return false;

            var p2 = other as PatternAnalyzer;
            if (p2 == null) return false;

            return
                toLowerCase == p2.toLowerCase &&
                EqRegex(Regex, p2.Regex) &&
                Eq(stopWords, p2.stopWords);
        }

        /// <summary>
        /// Returns a hash code value for the object.
        /// </summary>
        /// <returns>The hash code.</returns>
        public override int GetHashCode()
        {
            if (ReferenceEquals(this, DEFAULT_ANALYZER)) return -1218418418; // fast path
            if (ReferenceEquals(this, EXTENDED_ANALYZER)) return 1303507063; // fast path

            int h = 1;
            // Hash the pattern text, not the Regex instance: Equals compares pattern text
            // and options, so two equal analyzers must not hash by Regex object identity.
            h = 31 * h + Regex.ToString().GetHashCode();
            h = 31 * h + (int)Regex.Options;
            h = 31 * h + (toLowerCase ? 1231 : 1237);
            h = 31 * h + (stopWords != null ? stopWords.GetHashCode() : 0);
            return h;
        }

        /// <summary>Equality where o1 and/or o2 can be null.</summary>
        private static bool Eq(Object o1, Object o2)
        {
            return (o1 == o2) || (o1 != null ? o1.Equals(o2) : false);
        }

        /// <summary>
        /// Pattern equivalence: same instance, or same options and same pattern text.
        /// Assumes p1 and p2 are not null.
        /// </summary>
        private static bool EqRegex(Regex p1, Regex p2)
        {
            return p1 == p2 || (p1.Options == p2.Options && p1.ToString() == p2.ToString());
        }

        /// <summary>
        /// Reads until end-of-stream and returns all read chars, finally disposes the reader.
        /// </summary>
        /// <param name="input">The reader to drain.</param>
        /// <returns>Everything that was read from <paramref name="input"/>.</returns>
        private static String ToString(TextReader input)
        {
            try
            {
                return input.ReadToEnd();
            }
            finally
            {
                if (input != null) input.Dispose();
            }
        }


        ///////////////////////////////////////////////////////////////////////////////
        // Nested classes:
        ///////////////////////////////////////////////////////////////////////////////

        /// <summary>
        /// General-purpose tokenizer: emits the text segments found between successive
        /// matches of the delimiter regex, skipping empty segments.
        /// </summary>
        private sealed class RegexTokenizer : TokenStream
        {

            private readonly String str;
            private readonly bool toLowerCase;
            private Match matcher; // current delimiter match; null once the input is exhausted
            private int pos = 0;   // start of the next candidate token
            private static readonly System.Globalization.CultureInfo locale = System.Globalization.CultureInfo.CurrentCulture;
            private ITermAttribute termAtt;
            private IOffsetAttribute offsetAtt;

            public RegexTokenizer(String str, Regex regex, bool toLowerCase)
            {
                this.str = str;
                this.matcher = regex.Match(str);
                this.toLowerCase = toLowerCase;
                this.termAtt = AddAttribute<ITermAttribute>();
                this.offsetAtt = AddAttribute<IOffsetAttribute>();
            }

            /// <summary>
            /// Advances to the next non-empty segment between delimiter matches.
            /// </summary>
            /// <returns><c>true</c> if a token was produced, <c>false</c> at end of input.</returns>
            public sealed override bool IncrementToken()
            {
                if (matcher == null) return false;
                ClearAttributes();
                while (true)
                { // loop takes care of leading and trailing boundary cases
                    int start = pos;
                    int end;
                    bool isMatch = matcher.Success;
                    if (isMatch)
                    {
                        end = matcher.Index;
                        pos = matcher.Index + matcher.Length;
                        matcher = matcher.NextMatch();
                    }
                    else
                    {
                        end = str.Length;
                        matcher = null; // we're finished
                    }

                    if (start != end)
                    { // non-empty match (header/trailer)
                        String text = str.Substring(start, end - start);
                        if (toLowerCase) text = text.ToLower(locale);
                        termAtt.SetTermBuffer(text);
                        offsetAtt.SetOffset(start, end);
                        return true;
                    }

                    // Empty segment: only stop when there are no more delimiter matches.
                    // Otherwise (e.g. text that starts with a delimiter) keep scanning;
                    // returning false here would end the stream before any token.
                    if (!isMatch) return false;
                }
            }

            /// <summary>Sets the final offset to the end of the input string.</summary>
            public override sealed void End()
            {
                // set final offset
                int finalOffset = str.Length;
                this.offsetAtt.SetOffset(finalOffset, finalOffset);
            }

            protected override void Dispose(bool disposing)
            {
                // Do Nothing
            }
        }


        ///////////////////////////////////////////////////////////////////////////////
        // Nested classes:
        ///////////////////////////////////////////////////////////////////////////////

        /// <summary>
        /// Special-case tokenizer for the two shared patterns: scans characters directly
        /// instead of running a regex, and filters stop words itself.
        /// </summary>
        private sealed class FastStringTokenizer : TokenStream
        {

            private readonly String str;
            private int pos;
            private readonly bool isLetter; // true: tokens are letter runs; false: non-whitespace runs
            private readonly bool toLowerCase;
            private readonly ISet<string> stopWords;
            private static readonly System.Globalization.CultureInfo locale = System.Globalization.CultureInfo.CurrentCulture;
            private ITermAttribute termAtt;
            private IOffsetAttribute offsetAtt;

            public FastStringTokenizer(String str, bool isLetter, bool toLowerCase, ISet<string> stopWords)
            {
                this.str = str;
                this.isLetter = isLetter;
                this.toLowerCase = toLowerCase;
                this.stopWords = stopWords;
                this.termAtt = AddAttribute<ITermAttribute>();
                this.offsetAtt = AddAttribute<IOffsetAttribute>();
            }

            /// <summary>
            /// Advances to the next run of token characters that is not a stop word.
            /// </summary>
            /// <returns><c>true</c> if a token was produced, <c>false</c> at end of input.</returns>
            public override bool IncrementToken()
            {
                ClearAttributes();
                // cache loop instance vars (performance)
                String s = str;
                int len = s.Length;
                int i = pos;
                bool letter = isLetter;

                int start = 0;
                String text;
                do
                {
                    // find beginning of token
                    text = null;
                    while (i < len && !IsTokenChar(s[i], letter))
                    {
                        i++;
                    }

                    if (i < len)
                    { // found beginning; now find end of token
                        start = i;
                        while (i < len && IsTokenChar(s[i], letter))
                        {
                            i++;
                        }

                        text = s.Substring(start, i - start);
                        if (toLowerCase) text = text.ToLower(locale);
                    }
                } while (text != null && IsStopWord(text));

                pos = i;
                if (text == null)
                {
                    return false;
                }
                termAtt.SetTermBuffer(text);
                offsetAtt.SetOffset(start, i);
                return true;
            }

            /// <summary>Sets the final offset to the end of the input string.</summary>
            public override sealed void End()
            {
                // set final offset
                int finalOffset = str.Length;
                this.offsetAtt.SetOffset(finalOffset, finalOffset);
            }

            protected override void Dispose(bool disposing)
            {
                // Do Nothing
            }

            private bool IsTokenChar(char c, bool isLetter)
            {
                return isLetter ? char.IsLetter(c) : !char.IsWhiteSpace(c);
            }

            private bool IsStopWord(string text)
            {
                return stopWords != null && stopWords.Contains(text);
            }

        }


        ///////////////////////////////////////////////////////////////////////////////
        // Nested classes:
        ///////////////////////////////////////////////////////////////////////////////

        /// <summary>
        /// A StringReader that exposes its contained string for fast direct access.
        /// Might make sense to generalize this and make it public?
        /// </summary>
        internal sealed class FastStringReader : StringReader
        {

            private readonly string s;

            // "protected" is meaningless on a sealed class; internal is the effective access.
            internal FastStringReader(string s)
                : base(s)
            {
                this.s = s;
            }

            internal string GetString()
            {
                return s;
            }
        }

    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

namespace Lucene.Net.Analysis.Miscellaneous
{
    /// <summary>
    /// Links two PrefixAwareTokenFilter.
    /// <p/>
    /// <b>NOTE:</b> This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than
    /// the ones located in Lucene.Net.Analysis.Tokenattributes.
    /// </summary>
    public class PrefixAndSuffixAwareTokenFilter : TokenStream
    {
        // Outer filter of the chain: (prefix + input) acting as the prefix of suffix.
        private readonly PrefixAwareTokenFilter _suffix;

        /// <summary>
        /// Chains the three streams: <paramref name="prefix"/> and <paramref name="input"/> are
        /// joined by an inner filter, which is then joined with <paramref name="suffix"/>.
        /// </summary>
        /// <param name="prefix">First stream of the chain.</param>
        /// <param name="input">Middle stream; its tokens are updated by <see cref="UpdateInputToken"/>.</param>
        /// <param name="suffix">Last stream; its tokens are updated by <see cref="UpdateSuffixToken"/>.</param>
        public PrefixAndSuffixAwareTokenFilter(TokenStream prefix, TokenStream input, TokenStream suffix) : base(suffix)
        {
            var prefixAndInput = new InjectablePrefixAwareTokenFilter(prefix, input)
                                     {
                                         UpdateAction = UpdateInputToken
                                     };

            _suffix = new InjectablePrefixAwareTokenFilter(prefixAndInput, suffix)
                          {
                              UpdateAction = UpdateSuffixToken
                          };
        }

        /// <summary>
        /// Shifts the offsets of an input token by the end offset of the last prefix token.
        /// </summary>
        /// <param name="inputToken">Token to adjust; modified in place.</param>
        /// <param name="lastPrefixToken">Last token of the prefix stream.</param>
        /// <returns><paramref name="inputToken"/>, after adjustment.</returns>
        public Token UpdateInputToken(Token inputToken, Token lastPrefixToken)
        {
            inputToken.StartOffset = lastPrefixToken.EndOffset + inputToken.StartOffset;
            inputToken.EndOffset = lastPrefixToken.EndOffset + inputToken.EndOffset;
            return inputToken;
        }

        /// <summary>
        /// Shifts the offsets of a suffix token by the end offset of the last input token.
        /// </summary>
        /// <param name="suffixToken">Token to adjust; modified in place.</param>
        /// <param name="lastInputToken">Last token preceding the suffix stream.</param>
        /// <returns><paramref name="suffixToken"/>, after adjustment.</returns>
        public Token UpdateSuffixToken(Token suffixToken, Token lastInputToken)
        {
            suffixToken.StartOffset = lastInputToken.EndOffset + suffixToken.StartOffset;
            suffixToken.EndOffset = lastInputToken.EndOffset + suffixToken.EndOffset;
            return suffixToken;
        }


        /// <summary>Delegates to the chained filter.</summary>
        public override sealed bool IncrementToken()
        {
            return _suffix.IncrementToken();
        }

        /// <summary>Resets the chained filter.</summary>
        public override void Reset()
        {
            _suffix.Reset();
        }

        /// <summary>
        /// Disposes the chained filter when called from an explicit dispose.
        /// </summary>
        /// <param name="disposing">
        /// <c>true</c> when called from <c>Dispose()</c>; managed objects such as the chained
        /// filter must only be touched in that case.
        /// </param>
        protected override void Dispose(bool disposing)
        {
            if (disposing)
            {
                _suffix.Dispose();
            }
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Analyzers/Miscellaneous/PrefixAwareTokenStream.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Analyzers/Miscellaneous/PrefixAwareTokenStream.cs b/src/contrib/Analyzers/Miscellaneous/PrefixAwareTokenStream.cs deleted file mode 100644 index 127a503..0000000 --- a/src/contrib/Analyzers/Miscellaneous/PrefixAwareTokenStream.cs +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -using Lucene.Net.Analysis.Tokenattributes; -using Lucene.Net.Index; - -namespace Lucene.Net.Analysis.Miscellaneous -{ - /// <summary> - /// Joins two token streams and leaves the last token of the first stream available - /// to be used when updating the token values in the second stream based on that token. - /// - /// The default implementation adds last prefix token end offset to the suffix token start and end offsets. - /// <p/> - /// <b>NOTE:</b> This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than - /// the ones located in Lucene.Net.Analysis.TokenAttributes. 
- /// </summary> - public class PrefixAwareTokenFilter : TokenStream - { - private readonly IFlagsAttribute _flagsAtt; - private readonly IOffsetAttribute _offsetAtt; - private readonly IFlagsAttribute _pFlagsAtt; - - private readonly IOffsetAttribute _pOffsetAtt; - private readonly IPayloadAttribute _pPayloadAtt; - private readonly IPositionIncrementAttribute _pPosIncrAtt; - private readonly ITermAttribute _pTermAtt; - private readonly ITypeAttribute _pTypeAtt; - private readonly IPayloadAttribute _payloadAtt; - private readonly IPositionIncrementAttribute _posIncrAtt; - - private readonly Token _previousPrefixToken = new Token(); - private readonly Token _reusableToken = new Token(); - private readonly ITermAttribute _termAtt; - private readonly ITypeAttribute _typeAtt; - - private bool _prefixExhausted; - - public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) : base(suffix) - { - Suffix = suffix; - Prefix = prefix; - _prefixExhausted = false; - - // ReSharper disable DoNotCallOverridableMethodsInConstructor - _termAtt = AddAttribute<ITermAttribute>(); - _posIncrAtt = AddAttribute<IPositionIncrementAttribute>(); - _payloadAtt = AddAttribute<IPayloadAttribute>(); - _offsetAtt = AddAttribute<IOffsetAttribute>(); - _typeAtt = AddAttribute<ITypeAttribute>(); - _flagsAtt = AddAttribute<IFlagsAttribute>(); - // ReSharper restore DoNotCallOverridableMethodsInConstructor - - _pTermAtt = prefix.AddAttribute<ITermAttribute>(); - _pPosIncrAtt = prefix.AddAttribute<IPositionIncrementAttribute>(); - _pPayloadAtt = prefix.AddAttribute<IPayloadAttribute>(); - _pOffsetAtt = prefix.AddAttribute<IOffsetAttribute>(); - _pTypeAtt = prefix.AddAttribute<ITypeAttribute>(); - _pFlagsAtt = prefix.AddAttribute<IFlagsAttribute>(); - } - - public TokenStream Prefix { get; set; } - - public TokenStream Suffix { get; set; } - - public override sealed bool IncrementToken() - { - if (!_prefixExhausted) - { - Token nextToken = GetNextPrefixInputToken(_reusableToken); - if 
(nextToken == null) - { - _prefixExhausted = true; - } - else - { - _previousPrefixToken.Reinit(nextToken); - // Make it a deep copy - Payload p = _previousPrefixToken.Payload; - if (p != null) - { - _previousPrefixToken.Payload = (Payload) p.Clone(); - } - SetCurrentToken(nextToken); - return true; - } - } - - Token nextSuffixToken = GetNextSuffixInputToken(_reusableToken); - if (nextSuffixToken == null) - { - return false; - } - - nextSuffixToken = UpdateSuffixToken(nextSuffixToken, _previousPrefixToken); - SetCurrentToken(nextSuffixToken); - return true; - } - - private void SetCurrentToken(Token token) - { - if (token == null) return; - ClearAttributes(); - _termAtt.SetTermBuffer(token.TermBuffer(), 0, token.TermLength()); - _posIncrAtt.PositionIncrement = token.PositionIncrement; - _flagsAtt.Flags =token.Flags; - _offsetAtt.SetOffset(token.StartOffset, token.EndOffset); - _typeAtt.Type = token.Type; - _payloadAtt.Payload = token.Payload; - } - - private Token GetNextPrefixInputToken(Token token) - { - if (!Prefix.IncrementToken()) return null; - token.SetTermBuffer(_pTermAtt.TermBuffer(), 0, _pTermAtt.TermLength()); - token.PositionIncrement = _pPosIncrAtt.PositionIncrement; - token.Flags = _pFlagsAtt.Flags; - token.SetOffset(_pOffsetAtt.StartOffset, _pOffsetAtt.EndOffset); - token.Type = _pTypeAtt.Type; - token.Payload = _pPayloadAtt.Payload; - return token; - } - - private Token GetNextSuffixInputToken(Token token) - { - if (!Suffix.IncrementToken()) return null; - token.SetTermBuffer(_termAtt.TermBuffer(), 0, _termAtt.TermLength()); - token.PositionIncrement = _posIncrAtt.PositionIncrement; - token.Flags = _flagsAtt.Flags; - token.SetOffset(_offsetAtt.StartOffset, _offsetAtt.EndOffset); - token.Type = _typeAtt.Type; - token.Payload = _payloadAtt.Payload; - return token; - } - - /// <summary> - /// The default implementation adds last prefix token end offset to the suffix token start and end offsets. 
- /// </summary> - /// <param name="suffixToken">a token from the suffix stream</param> - /// <param name="lastPrefixToken">the last token from the prefix stream</param> - /// <returns>consumer token</returns> - public virtual Token UpdateSuffixToken(Token suffixToken, Token lastPrefixToken) - { - suffixToken.StartOffset = lastPrefixToken.EndOffset + suffixToken.StartOffset; - suffixToken.EndOffset = lastPrefixToken.EndOffset + suffixToken.EndOffset; - return suffixToken; - } - - protected override void Dispose(bool disposing) - { - Prefix.Dispose(); - Suffix.Dispose(); - } - - public override void Reset() - { - base.Reset(); - - if (Prefix != null) - { - _prefixExhausted = false; - Prefix.Reset(); - } - - if (Suffix != null) - Suffix.Reset(); - } - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Analyzers/Miscellaneous/SingleTokenTokenStream.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Analyzers/Miscellaneous/SingleTokenTokenStream.cs b/src/contrib/Analyzers/Miscellaneous/SingleTokenTokenStream.cs deleted file mode 100644 index 232e326..0000000 --- a/src/contrib/Analyzers/Miscellaneous/SingleTokenTokenStream.cs +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -using System.Diagnostics; -using Lucene.Net.Analysis.Tokenattributes; -using Attribute = Lucene.Net.Util.Attribute; - -namespace Lucene.Net.Analysis.Miscellaneous -{ - /// <summary> - /// A TokenStream containing a single token. - /// </summary> - public sealed class SingleTokenTokenStream : TokenStream - { - private bool _exhausted; - - // The token needs to be immutable, so work with clones! - private Token _singleToken; - private readonly Attribute _tokenAtt; - - public SingleTokenTokenStream(Token token) - : base(Token.TOKEN_ATTRIBUTE_FACTORY) - { - Debug.Assert(token != null, "Token was null!"); - _singleToken = (Token) token.Clone(); - - _tokenAtt = (Attribute)AddAttribute<ITermAttribute>(); - - Debug.Assert(_tokenAtt is Token); - } - - public override sealed bool IncrementToken() - { - if (_exhausted) - return false; - - ClearAttributes(); - _singleToken.CopyTo(_tokenAtt); - _exhausted = true; - - return true; - } - - public override void Reset() - { - _exhausted = false; - } - - protected override void Dispose(bool disposing) - { - // Do nothing - } - - public Token GetToken() - { - return (Token) _singleToken.Clone(); - } - - public void SetToken(Token token) - { - _singleToken = (Token) token.Clone(); - } - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Analyzers/NGram/EdgeNGramTokenFilter.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Analyzers/NGram/EdgeNGramTokenFilter.cs b/src/contrib/Analyzers/NGram/EdgeNGramTokenFilter.cs deleted file mode 100644 index a4a027e..0000000 --- a/src/contrib/Analyzers/NGram/EdgeNGramTokenFilter.cs +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -using System; -using System.IO; -using System.Collections; - -using Lucene.Net.Analysis; -using Lucene.Net.Analysis.Tokenattributes; -using Lucene.Net.Util; - -namespace Lucene.Net.Analysis.NGram -{ - public static class SideExtensions - { - public static string GetLabel(this Side theSide) - { - switch(theSide) - { - case Side.FRONT: - return "front"; - case Side.BACK: - return "back"; - default: - throw new ArgumentException(string.Format("{0} is not a valid value for EdgeNGramTokenFilter.Side", theSide)); - } - } - - public static Side GetSide(string sideName) - { - if (Side.FRONT.GetLabel() == sideName) - { - return Side.FRONT; - } - - if (Side.BACK.GetLabel() == sideName) - { - return Side.BACK; - } - - return (Side)(-1); // TODO: returning null instead of null? Should an exception be thrown instead? - } - } - - /// <summary> - /// Specifies which side of the input the n-gram should be generated from - /// </summary> - public enum Side - { - FRONT, - BACK - } - - /* - * Tokenizes the given token into n-grams of given size(s). - * <p> - * This <see cref="TokenFilter"/> create n-grams from the beginning edge or ending edge of a input token. 
- * </p> - */ - public sealed class EdgeNGramTokenFilter : TokenFilter - { - public static Side DEFAULT_SIDE = Side.FRONT; - public static int DEFAULT_MAX_GRAM_SIZE = 1; - public static int DEFAULT_MIN_GRAM_SIZE = 1; - - private int minGram; - private int maxGram; - private Side side; - private char[] curTermBuffer; - private int curTermLength; - private int curGramSize; - private int tokStart; - - private ITermAttribute termAtt; - private IOffsetAttribute offsetAtt; - - - protected EdgeNGramTokenFilter(TokenStream input) : base(input) - { - this.termAtt = AddAttribute<ITermAttribute>(); - this.offsetAtt = AddAttribute<IOffsetAttribute>(); - } - - /* - * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range - * - * <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param> - * <param name="side">the <see cref="Side"/> from which to chop off an n-gram</param> - * <param name="minGram">the smallest n-gram to generate</param> - * <param name="maxGram">the largest n-gram to generate</param> - */ - public EdgeNGramTokenFilter(TokenStream input, Side side, int minGram, int maxGram) - : base(input) - { - - - if (side != Side.FRONT && side != Side.BACK) - { - throw new System.ArgumentException("sideLabel must be either front or back"); - } - - if (minGram < 1) - { - throw new System.ArgumentException("minGram must be greater than zero"); - } - - if (minGram > maxGram) - { - throw new System.ArgumentException("minGram must not be greater than maxGram"); - } - - this.minGram = minGram; - this.maxGram = maxGram; - this.side = side; - this.termAtt = AddAttribute<ITermAttribute>(); - this.offsetAtt = AddAttribute<IOffsetAttribute>(); - } - - /* - * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range - * - * <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param> - * <param name="sideLabel">the name of the <see cref="Side"/> from which to chop off an 
n-gram</param> - * <param name="minGram">the smallest n-gram to generate</param> - * <param name="maxGram">the largest n-gram to generate</param> - */ - public EdgeNGramTokenFilter(TokenStream input, string sideLabel, int minGram, int maxGram) - : this(input, SideExtensions.GetSide(sideLabel), minGram, maxGram) - { - } - - public override bool IncrementToken() - { - while (true) - { - if (curTermBuffer == null) - { - if (!input.IncrementToken()) - { - return false; - } - else - { - curTermBuffer = (char[])termAtt.TermBuffer().Clone(); - curTermLength = termAtt.TermLength(); - curGramSize = minGram; - tokStart = offsetAtt.StartOffset; - } - } - if (curGramSize <= maxGram) - { - if (!(curGramSize > curTermLength // if the remaining input is too short, we can't generate any n-grams - || curGramSize > maxGram)) - { // if we have hit the end of our n-gram size range, quit - // grab gramSize chars from front or back - int start = side == Side.FRONT ? 0 : curTermLength - curGramSize; - int end = start + curGramSize; - ClearAttributes(); - offsetAtt.SetOffset(tokStart + start, tokStart + end); - termAtt.SetTermBuffer(curTermBuffer, start, curGramSize); - curGramSize++; - return true; - } - } - curTermBuffer = null; - } - } - - public override void Reset() - { - base.Reset(); - curTermBuffer = null; - } - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Analyzers/NGram/EdgeNGramTokenizer.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Analyzers/NGram/EdgeNGramTokenizer.cs b/src/contrib/Analyzers/NGram/EdgeNGramTokenizer.cs deleted file mode 100644 index c174ff9..0000000 --- a/src/contrib/Analyzers/NGram/EdgeNGramTokenizer.cs +++ /dev/null @@ -1,225 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -using System.IO; -using System.Collections; - -using Lucene.Net.Analysis; -using Lucene.Net.Analysis.Tokenattributes; -using Lucene.Net.Util; - -namespace Lucene.Net.Analysis.NGram -{ - - /* - * Tokenizes the input from an edge into n-grams of given size(s). - * <p> - * This <see cref="Tokenizer"/> create n-grams from the beginning edge or ending edge of a input token. - * MaxGram can't be larger than 1024 because of limitation. 
- * </p> - */ - public sealed class EdgeNGramTokenizer : Tokenizer - { - public static Side DEFAULT_SIDE = Side.FRONT; - public static int DEFAULT_MAX_GRAM_SIZE = 1; - public static int DEFAULT_MIN_GRAM_SIZE = 1; - - private ITermAttribute termAtt; - private IOffsetAttribute offsetAtt; - - /* Specifies which side of the input the n-gram should be generated from */ - // Moved Side enum from this class to external definition - - private int minGram; - private int maxGram; - private int gramSize; - private Side side; - private bool started = false; - private int inLen; - private string inStr; - - - /* - * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range - * - * <param name="input"><see cref="TextReader"/> holding the input to be tokenized</param> - * <param name="side">the <see cref="Side"/> from which to chop off an n-gram</param> - * <param name="minGram">the smallest n-gram to generate</param> - * <param name="maxGram">the largest n-gram to generate</param> - */ - public EdgeNGramTokenizer(TextReader input, Side side, int minGram, int maxGram) - : base(input) - { - init(side, minGram, maxGram); - } - - /* - * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range - * - * <param name="source"><see cref="AttributeSource"/> to use</param> - * <param name="input"><see cref="TextReader"/> holding the input to be tokenized</param> - * <param name="side">the <see cref="Side"/> from which to chop off an n-gram</param> - * <param name="minGram">the smallest n-gram to generate</param> - * <param name="maxGram">the largest n-gram to generate</param> - */ - public EdgeNGramTokenizer(AttributeSource source, TextReader input, Side side, int minGram, int maxGram) - : base(source, input) - { - - init(side, minGram, maxGram); - } - - /* - * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range - * - * <param name="factory"><see cref="AttributeSource.AttributeFactory"/> to use</param> - 
* <param name="input"><see cref="TextReader"/> holding the input to be tokenized</param> - * <param name="side">the <see cref="Side"/> from which to chop off an n-gram</param> - * <param name="minGram">the smallest n-gram to generate</param> - * <param name="maxGram">the largest n-gram to generate</param> - */ - public EdgeNGramTokenizer(AttributeFactory factory, TextReader input, Side side, int minGram, int maxGram) - : base(factory, input) - { - - init(side, minGram, maxGram); - } - - /* - * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range - * - * <param name="input"><see cref="TextReader"/> holding the input to be tokenized</param> - * <param name="sideLabel">the name of the <see cref="Side"/> from which to chop off an n-gram</param> - * <param name="minGram">the smallest n-gram to generate</param> - * <param name="maxGram">the largest n-gram to generate</param> - */ - public EdgeNGramTokenizer(TextReader input, string sideLabel, int minGram, int maxGram) - : this(input, SideExtensions.GetSide(sideLabel), minGram, maxGram) - { - - } - - /* - * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range - * - * <param name="source"><see cref="AttributeSource"/> to use</param> - * <param name="input"><see cref="TextReader"/> holding the input to be tokenized</param> - * <param name="sideLabel">the name of the <see cref="Side"/> from which to chop off an n-gram</param> - * <param name="minGram">the smallest n-gram to generate</param> - * <param name="maxGram">the largest n-gram to generate</param> - */ - public EdgeNGramTokenizer(AttributeSource source, TextReader input, string sideLabel, int minGram, int maxGram) - : this(source, input, SideExtensions.GetSide(sideLabel), minGram, maxGram) - { - - } - - /* - * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range - * - * <param name="factory"><see cref="AttributeSource.AttributeFactory"/> to use</param> - * <param 
name="input"><see cref="TextReader"/> holding the input to be tokenized</param> - * <param name="sideLabel">the name of the <see cref="Side"/> from which to chop off an n-gram</param> - * <param name="minGram">the smallest n-gram to generate</param> - * <param name="maxGram">the largest n-gram to generate</param> - */ - public EdgeNGramTokenizer(AttributeFactory factory, TextReader input, string sideLabel, int minGram, int maxGram) : - this(factory, input, SideExtensions.GetSide(sideLabel), minGram, maxGram) - { - } - - private void init(Side side, int minGram, int maxGram) - { - if (side == null) - { - throw new System.ArgumentException("sideLabel must be either front or back"); - } - - if (minGram < 1) - { - throw new System.ArgumentException("minGram must be greater than zero"); - } - - if (minGram > maxGram) - { - throw new System.ArgumentException("minGram must not be greater than maxGram"); - } - - this.minGram = minGram; - this.maxGram = maxGram; - this.side = side; - - this.termAtt = AddAttribute<ITermAttribute>(); - this.offsetAtt = AddAttribute<IOffsetAttribute>(); - - } - - /* Returns the next token in the stream, or null at EOS. */ - public override bool IncrementToken() - { - ClearAttributes(); - // if we are just starting, read the whole input - if (!started) - { - started = true; - char[] chars = new char[1024]; - inStr = input.ReadToEnd().Trim(); // remove any leading or trailing spaces - inLen = inStr.Length; - gramSize = minGram; - } - - // if the remaining input is too short, we can't generate any n-grams - if (gramSize > inLen) - { - return false; - } - - // if we have hit the end of our n-gram size range, quit - if (gramSize > maxGram) - { - return false; - } - - // grab gramSize chars from front or back - int start = side == Side.FRONT ? 
0 : inLen - gramSize; - int end = start + gramSize; - termAtt.SetTermBuffer(inStr, start, gramSize); - offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(end)); - gramSize++; - return true; - } - - public override void End() - { - // set offset - int finalOffset = inLen; - this.offsetAtt.SetOffset(finalOffset, finalOffset); - } - - public override void Reset(TextReader input) - { - base.Reset(input); - Reset(); - } - - public override void Reset() - { - base.Reset(); - started = false; - } - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Analyzers/NGram/NGramTokenFilter.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Analyzers/NGram/NGramTokenFilter.cs b/src/contrib/Analyzers/NGram/NGramTokenFilter.cs deleted file mode 100644 index 8bb5707..0000000 --- a/src/contrib/Analyzers/NGram/NGramTokenFilter.cs +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -using System.IO; -using System.Collections; - -using Lucene.Net.Analysis; -using Lucene.Net.Analysis.Tokenattributes; -using Lucene.Net.Util; - -namespace Lucene.Net.Analysis.NGram -{ - /* - * Tokenizes the input into n-grams of the given size(s). - */ - public sealed class NGramTokenFilter : TokenFilter - { - public static int DEFAULT_MIN_NGRAM_SIZE = 1; - public static int DEFAULT_MAX_NGRAM_SIZE = 2; - - private int minGram, maxGram; - - private char[] curTermBuffer; - private int curTermLength; - private int curGramSize; - private int curPos; - private int tokStart; - - private ITermAttribute termAtt; - private IOffsetAttribute offsetAtt; - - /* - * Creates NGramTokenFilter with given min and max n-grams. - * <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param> - * <param name="minGram">the smallest n-gram to generate</param> - * <param name="maxGram">the largest n-gram to generate</param> - */ - public NGramTokenFilter(TokenStream input, int minGram, int maxGram) - : base(input) - { - - if (minGram < 1) - { - throw new System.ArgumentException("minGram must be greater than zero"); - } - if (minGram > maxGram) - { - throw new System.ArgumentException("minGram must not be greater than maxGram"); - } - this.minGram = minGram; - this.maxGram = maxGram; - - this.termAtt = AddAttribute<ITermAttribute>(); - this.offsetAtt = AddAttribute<IOffsetAttribute>(); - } - - /* - * Creates NGramTokenFilter with default min and max n-grams. - * <param name="input"><see cref="TokenStream"/> holding the input to be tokenized</param> - */ - public NGramTokenFilter(TokenStream input) - : this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE) - { - - } - - /* Returns the next token in the stream, or null at EOS. 
*/ - public override bool IncrementToken() - { - while (true) - { - if (curTermBuffer == null) - { - if (!input.IncrementToken()) - { - return false; - } - else - { - curTermBuffer = (char[])termAtt.TermBuffer().Clone(); - curTermLength = termAtt.TermLength(); - curGramSize = minGram; - curPos = 0; - tokStart = offsetAtt.StartOffset; - } - } - while (curGramSize <= maxGram) - { - while (curPos + curGramSize <= curTermLength) - { // while there is input - ClearAttributes(); - termAtt.SetTermBuffer(curTermBuffer, curPos, curGramSize); - offsetAtt.SetOffset(tokStart + curPos, tokStart + curPos + curGramSize); - curPos++; - return true; - } - curGramSize++; // increase n-gram size - curPos = 0; - } - curTermBuffer = null; - } - } - - public override void Reset() - { - base.Reset(); - curTermBuffer = null; - } - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Analyzers/NGram/NGramTokenizer.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Analyzers/NGram/NGramTokenizer.cs b/src/contrib/Analyzers/NGram/NGramTokenizer.cs deleted file mode 100644 index 9616a22..0000000 --- a/src/contrib/Analyzers/NGram/NGramTokenizer.cs +++ /dev/null @@ -1,161 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -using System.IO; -using System.Collections; - -using Lucene.Net.Analysis; -using Lucene.Net.Analysis.Tokenattributes; -using Lucene.Net.Util; - -namespace Lucene.Net.Analysis.NGram -{ - - /* - * Tokenizes the input into n-grams of the given size(s). - */ - public sealed class NGramTokenizer : Tokenizer - { - public static int DEFAULT_MIN_NGRAM_SIZE = 1; - public static int DEFAULT_MAX_NGRAM_SIZE = 2; - - private int minGram, maxGram; - private int gramSize; - private int pos = 0; - private int inLen; - private string inStr; - private bool started = false; - - private ITermAttribute termAtt; - private IOffsetAttribute offsetAtt; - - /* - * Creates NGramTokenizer with given min and max n-grams. - * <param name="input"><see cref="TextReader"/> holding the input to be tokenized</param> - * <param name="minGram">the smallest n-gram to generate</param> - * <param name="maxGram">the largest n-gram to generate</param> - */ - public NGramTokenizer(TextReader input, int minGram, int maxGram) - : base(input) - { - init(minGram, maxGram); - } - - /* - * Creates NGramTokenizer with given min and max n-grams. - * <param name="source"><see cref="AttributeSource"/> to use</param> - * <param name="input"><see cref="TextReader"/> holding the input to be tokenized</param> - * <param name="minGram">the smallest n-gram to generate</param> - * <param name="maxGram">the largest n-gram to generate</param> - */ - public NGramTokenizer(AttributeSource source, TextReader input, int minGram, int maxGram) - : base(source, input) - { - init(minGram, maxGram); - } - - /* - * Creates NGramTokenizer with given min and max n-grams. 
- * <param name="factory"><see cref="AttributeSource.AttributeFactory"/> to use</param> - * <param name="input"><see cref="TextReader"/> holding the input to be tokenized</param> - * <param name="minGram">the smallest n-gram to generate</param> - * <param name="maxGram">the largest n-gram to generate</param> - */ - public NGramTokenizer(AttributeFactory factory, TextReader input, int minGram, int maxGram) - : base(factory, input) - { - init(minGram, maxGram); - } - - /* - * Creates NGramTokenizer with default min and max n-grams. - * <param name="input"><see cref="TextReader"/> holding the input to be tokenized</param> - */ - public NGramTokenizer(TextReader input) - : this(input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE) - { - - } - - private void init(int minGram, int maxGram) - { - if (minGram < 1) - { - throw new System.ArgumentException("minGram must be greater than zero"); - } - if (minGram > maxGram) - { - throw new System.ArgumentException("minGram must not be greater than maxGram"); - } - this.minGram = minGram; - this.maxGram = maxGram; - - this.termAtt = AddAttribute<ITermAttribute>(); - this.offsetAtt = AddAttribute<IOffsetAttribute>(); - } - - /* Returns the next token in the stream, or null at EOS. 
*/ - public override bool IncrementToken() - { - ClearAttributes(); - if (!started) - { - started = true; - gramSize = minGram; - char[] chars = new char[1024]; - inStr = input.ReadToEnd(); // remove any trailing empty strings - inLen = inStr.Length; - } - - if (pos + gramSize > inLen) - { // if we hit the end of the string - pos = 0; // reset to beginning of string - gramSize++; // increase n-gram size - if (gramSize > maxGram) // we are done - return false; - if (pos + gramSize > inLen) - return false; - } - - int oldPos = pos; - pos++; - termAtt.SetTermBuffer(inStr, oldPos, gramSize); - offsetAtt.SetOffset(CorrectOffset(oldPos), CorrectOffset(oldPos + gramSize)); - return true; - } - - public override void End() - { - // set offset - int finalOffset = inLen; - this.offsetAtt.SetOffset(finalOffset, finalOffset); - } - - public override void Reset(TextReader input) - { - base.Reset(input); - Reset(); - } - - public override void Reset() - { - base.Reset(); - started = false; - pos = 0; - } - } -} \ No newline at end of file
