http://git-wip-us.apache.org/repos/asf/lucenenet/blob/d6717ed8/src/contrib/Analyzers/Miscellaneous/PatternAnalyzer.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Analyzers/Miscellaneous/PatternAnalyzer.cs b/src/contrib/Analyzers/Miscellaneous/PatternAnalyzer.cs index 8a6caee..47c971b 100644 --- a/src/contrib/Analyzers/Miscellaneous/PatternAnalyzer.cs +++ b/src/contrib/Analyzers/Miscellaneous/PatternAnalyzer.cs @@ -21,8 +21,10 @@ using System; using System.Collections.Generic; +using System.Globalization; using System.IO; using System.Text.RegularExpressions; +using Lucene.Net.Analysis.Core; using Lucene.Net.Analysis.Tokenattributes; using Lucene.Net.Analysis.Util; using Version = Lucene.Net.Util.Version; @@ -68,49 +70,49 @@ namespace Lucene.Net.Analysis.Miscellaneous public static readonly Regex WHITESPACE_PATTERN = new Regex("\\s+", RegexOptions.Compiled); private static readonly CharArraySet EXTENDED_ENGLISH_STOP_WORDS = - CharArraySet.UnmodifiableSet(new CharArraySet((IEnumerable<string>)new[]{ - "a", "about", "above", "across", "adj", "after", "afterwards", - "again", "against", "albeit", "all", "almost", "alone", "along", - "already", "also", "although", "always", "among", "amongst", "an", - "and", "another", "any", "anyhow", "anyone", "anything", - "anywhere", "are", "around", "as", "at", "be", "became", "because", - "become", "becomes", "becoming", "been", "before", "beforehand", - "behind", "being", "below", "beside", "besides", "between", - "beyond", "both", "but", "by", "can", "cannot", "co", "could", - "down", "during", "each", "eg", "either", "else", "elsewhere", - "enough", "etc", "even", "ever", "every", "everyone", "everything", - "everywhere", "except", "few", "first", "for", "former", - "formerly", "from", "further", "had", "has", "have", "he", "hence", - "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", - "herself", "him", "himself", "his", "how", "however", "i", "ie", "if", - 
"in", "inc", "indeed", "into", "is", "it", "its", "itself", "last", - "latter", "latterly", "least", "less", "ltd", "many", "may", "me", - "meanwhile", "might", "more", "moreover", "most", "mostly", "much", - "must", "my", "myself", "namely", "neither", "never", - "nevertheless", "next", "no", "nobody", "none", "noone", "nor", - "not", "nothing", "now", "nowhere", "of", "off", "often", "on", - "once one", "only", "onto", "or", "other", "others", "otherwise", - "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps", - "rather", "s", "same", "seem", "seemed", "seeming", "seems", - "several", "she", "should", "since", "so", "some", "somehow", - "someone", "something", "sometime", "sometimes", "somewhere", - "still", "such", "t", "than", "that", "the", "their", "them", - "themselves", "then", "thence", "there", "thereafter", "thereby", - "therefor", "therein", "thereupon", "these", "they", "this", - "those", "though", "through", "throughout", "thru", "thus", "to", - "together", "too", "toward", "towards", "under", "until", "up", - "upon", "us", "very", "via", "was", "we", "well", "were", "what", - "whatever", "whatsoever", "when", "whence", "whenever", - "whensoever", "where", "whereafter", "whereas", "whereat", - "whereby", "wherefrom", "wherein", "whereinto", "whereof", - "whereon", "whereto", "whereunto", "whereupon", "wherever", - "wherewith", "whether", "which", "whichever", "whichsoever", - "while", "whilst", "whither", "who", "whoever", "whole", "whom", - "whomever", "whomsoever", "whose", "whosoever", "why", "will", - "with", "within", "without", "would", "xsubj", "xcal", "xauthor", - "xother ", "xnote", "yet", "you", "your", "yours", "yourself", - "yourselves" - }, true)); + CharArraySet.UnmodifiableSet(new CharArraySet(Version.LUCENE_CURRENT, new[]{ + "a", "about", "above", "across", "adj", "after", "afterwards", + "again", "against", "albeit", "all", "almost", "alone", "along", + "already", "also", "although", "always", "among", "amongst", "an", 
+ "and", "another", "any", "anyhow", "anyone", "anything", + "anywhere", "are", "around", "as", "at", "be", "became", "because", + "become", "becomes", "becoming", "been", "before", "beforehand", + "behind", "being", "below", "beside", "besides", "between", + "beyond", "both", "but", "by", "can", "cannot", "co", "could", + "down", "during", "each", "eg", "either", "else", "elsewhere", + "enough", "etc", "even", "ever", "every", "everyone", "everything", + "everywhere", "except", "few", "first", "for", "former", + "formerly", "from", "further", "had", "has", "have", "he", "hence", + "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", + "herself", "him", "himself", "his", "how", "however", "i", "ie", "if", + "in", "inc", "indeed", "into", "is", "it", "its", "itself", "last", + "latter", "latterly", "least", "less", "ltd", "many", "may", "me", + "meanwhile", "might", "more", "moreover", "most", "mostly", "much", + "must", "my", "myself", "namely", "neither", "never", + "nevertheless", "next", "no", "nobody", "none", "noone", "nor", + "not", "nothing", "now", "nowhere", "of", "off", "often", "on", + "once one", "only", "onto", "or", "other", "others", "otherwise", + "our", "ours", "ourselves", "out", "over", "own", "per", "perhaps", + "rather", "s", "same", "seem", "seemed", "seeming", "seems", + "several", "she", "should", "since", "so", "some", "somehow", + "someone", "something", "sometime", "sometimes", "somewhere", + "still", "such", "t", "than", "that", "the", "their", "them", + "themselves", "then", "thence", "there", "thereafter", "thereby", + "therefor", "therein", "thereupon", "these", "they", "this", + "those", "though", "through", "throughout", "thru", "thus", "to", + "together", "too", "toward", "towards", "under", "until", "up", + "upon", "us", "very", "via", "was", "we", "well", "were", "what", + "whatever", "whatsoever", "when", "whence", "whenever", + "whensoever", "where", "whereafter", "whereas", "whereat", + "whereby", "wherefrom", 
"wherein", "whereinto", "whereof", + "whereon", "whereto", "whereunto", "whereupon", "wherever", + "wherewith", "whether", "which", "whichever", "whichsoever", + "while", "whilst", "whither", "who", "whoever", "whole", "whom", + "whomever", "whomsoever", "whose", "whosoever", "why", "will", + "with", "within", "without", "would", "xsubj", "xcal", "xauthor", + "xother ", "xnote", "yet", "you", "your", "yours", "yourself", + "yourselves" + }, true)); /* * A lower-casing word analyzer with English stop words (can be shared @@ -180,30 +182,30 @@ namespace Lucene.Net.Analysis.Miscellaneous * the string to tokenize * @return a new token stream */ - public TokenStream TokenStream(String fieldName, String text) - { - // Ideally the Analyzer superclass should have a method with the same signature, - // with a default impl that simply delegates to the StringReader flavour. - if (text == null) - throw new ArgumentException("text must not be null"); - - TokenStream stream; - if (Regex == NON_WORD_PATTERN) - { // fast path - stream = new FastStringTokenizer(text, true, toLowerCase, stopWords); - } - else if (Regex == WHITESPACE_PATTERN) - { // fast path - stream = new FastStringTokenizer(text, false, toLowerCase, stopWords); - } - else - { - stream = new RegexTokenizer(text, Regex, toLowerCase); - if (stopWords != null) stream = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), stream, stopWords); - } - - return stream; - } + //public TokenStream TokenStream(String fieldName, String text) + //{ + // // Ideally the Analyzer superclass should have a method with the same signature, + // // with a default impl that simply delegates to the StringReader flavour. 
+ // if (text == null) + // throw new ArgumentException("text must not be null"); + + // TokenStream stream; + // if (Regex == NON_WORD_PATTERN) + // { // fast path + // stream = new FastStringTokenizer(text, true, toLowerCase, stopWords); + // } + // else if (Regex == WHITESPACE_PATTERN) + // { // fast path + // stream = new FastStringTokenizer(text, false, toLowerCase, stopWords); + // } + // else + // { + // stream = new RegexTokenizer(text, Regex, toLowerCase); + // if (stopWords != null) stream = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), stream, stopWords); + // } + + // return stream; + //} /* * Creates a token stream that tokenizes all the text in the given Reader; @@ -216,24 +218,51 @@ namespace Lucene.Net.Analysis.Miscellaneous * the reader delivering the text * @return a new token stream */ - public override TokenStream TokenStream(String fieldName, TextReader reader) + //public override TokenStream TokenStream(String fieldName, TextReader reader) + //{ + // if (reader is FastStringReader) + // { // fast path + // return TokenStream(fieldName, ((FastStringReader)reader).GetString()); + // } + + // try + // { + // String text = ToString(reader); + // return TokenStream(fieldName, text); + // } + // catch (IOException e) + // { + // throw new Exception("Wrapped Exception", e); + // } + //} + + + public TokenStreamComponents CreateComponents(string fieldName, TextReader reader, string text) { - if (reader is FastStringReader) - { // fast path - return TokenStream(fieldName, ((FastStringReader)reader).GetString()); - } + if (reader == null) + reader = new FastStringReader(text); - try - { - String text = ToString(reader); - return TokenStream(fieldName, text); + if (Regex == NON_WORD_PATTERN) + { // fast path + return new TokenStreamComponents(new FastStringTokenizer(reader, true, toLowerCase, stopWords)); } - catch (IOException e) - { - throw new Exception("Wrapped Exception", e); + else if (Regex == 
WHITESPACE_PATTERN) + { // fast path + return new TokenStreamComponents(new FastStringTokenizer(reader, false, toLowerCase, stopWords)); } + + Tokenizer tokenizer = new RegexTokenizer(reader, Regex, toLowerCase); + TokenStream result = (stopWords != null) ? new StopFilter(matchVersion, tokenizer, stopWords) : tokenizer; + return new TokenStreamComponents(tokenizer, result); } + + public override Analyzer.TokenStreamComponents CreateComponents(string fieldName, TextReader reader) + { + return CreateComponents(fieldName, reader, null); + } + + /* * Indicates whether some other object is "equal to" this one. * @@ -249,7 +278,7 @@ namespace Lucene.Net.Analysis.Miscellaneous if (other is PatternAnalyzer) { - PatternAnalyzer p2 = (PatternAnalyzer)other; + var p2 = (PatternAnalyzer)other; return toLowerCase == p2.toLowerCase && EqRegex(Regex, p2.Regex) && @@ -296,6 +325,11 @@ namespace Lucene.Net.Analysis.Miscellaneous */ private static String ToString(TextReader input) { + if (input is FastStringReader) // fast path + { + return ((FastStringReader) input).GetString(); + } + try { int len = 256; @@ -304,7 +338,7 @@ namespace Lucene.Net.Analysis.Miscellaneous len = 0; int n; - while ((n = input.Read(buffer, 0, buffer.Length)) != 0) + while ((n = input.Read(buffer, 0, buffer.Length)) > 0) // TextReader.Read returns 0 at end of input, never -1 { if (len + n > output.Length) { // grow capacity @@ -337,23 +371,24 @@ namespace Lucene.Net.Analysis.Miscellaneous * The work horse; performance isn't fantastic, but it's not nearly as bad * as one might think - kudos to the Sun regex developers. 
*/ - private sealed class RegexTokenizer : TokenStream + private sealed class RegexTokenizer : Tokenizer { - - private readonly String str; + private readonly Regex regex; + private String str; private readonly bool toLowerCase; private Match matcher; private int pos = 0; - private static readonly System.Globalization.CultureInfo locale = System.Globalization.CultureInfo.CurrentCulture; - private ITermAttribute termAtt; + private static readonly CultureInfo locale = CultureInfo.CurrentCulture; + private ICharTermAttribute termAtt; private IOffsetAttribute offsetAtt; - public RegexTokenizer(String str, Regex regex, bool toLowerCase) + public RegexTokenizer(TextReader input, Regex regex, bool toLowerCase) + :base(input) { - this.str = str; - this.matcher = regex.Match(str); + this.regex = regex; // Reset() re-creates the matcher from this field + this.matcher = regex.Match(""); this.toLowerCase = toLowerCase; - this.termAtt = AddAttribute<ITermAttribute>(); + this.termAtt = AddAttribute<ICharTermAttribute>(); this.offsetAtt = AddAttribute<IOffsetAttribute>(); } @@ -380,23 +415,31 @@ namespace Lucene.Net.Analysis.Miscellaneous if (start != end) { // non-empty match (header/trailer) - String text = str.Substring(start, end - start); + var text = str.Substring(start, end - start); // Substring takes (startIndex, length), not (start, end) if (toLowerCase) text = text.ToLower(locale); - termAtt.SetTermBuffer(text); - offsetAtt.SetOffset(start, end); + termAtt.SetEmpty().Append(text); + offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(end)); return true; } - return false; + if (!isMatch) return false; } } public override sealed void End() { // set final offset - int finalOffset = str.Length; + int finalOffset = CorrectOffset(str.Length); this.offsetAtt.SetOffset(finalOffset, finalOffset); } + public override void Reset() + { + base.Reset(); + this.str = PatternAnalyzer.ToString(input); + this.matcher = regex.Match(this.str); + this.pos = 0; + } + protected override void 
Dispose(bool disposing) { // Do Nothing @@ -411,25 +454,25 @@ namespace Lucene.Net.Analysis.Miscellaneous * Special-case class for 
best performance in common cases; this class is * otherwise unnecessary. */ - private sealed class FastStringTokenizer : TokenStream + private sealed class FastStringTokenizer : Tokenizer { private readonly String str; private int pos; private readonly bool isLetter; private readonly bool toLowerCase; - private readonly ISet<string> stopWords; - private static readonly System.Globalization.CultureInfo locale = System.Globalization.CultureInfo.CurrentCulture; - private ITermAttribute termAtt; + private readonly CharArraySet stopWords; + private static readonly CultureInfo locale = CultureInfo.CurrentCulture; + private ICharTermAttribute termAtt; private IOffsetAttribute offsetAtt; - public FastStringTokenizer(String str, bool isLetter, bool toLowerCase, ISet<string> stopWords) + public FastStringTokenizer(TextReader input, bool isLetter, bool toLowerCase, CharArraySet stopWords) + :base(input) { - this.str = str; this.isLetter = isLetter; this.toLowerCase = toLowerCase; this.stopWords = stopWords; - this.termAtt = AddAttribute<ITermAttribute>(); + this.termAtt = AddAttribute<ICharTermAttribute>(); this.offsetAtt = AddAttribute<IOffsetAttribute>(); }
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/d6717ed8/src/contrib/Analyzers/Miscellaneous/PrefixAwareTokenStream.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Analyzers/Miscellaneous/PrefixAwareTokenStream.cs b/src/contrib/Analyzers/Miscellaneous/PrefixAwareTokenStream.cs index 45e1d19..4dabfa3 100644 --- a/src/contrib/Analyzers/Miscellaneous/PrefixAwareTokenStream.cs +++ b/src/contrib/Analyzers/Miscellaneous/PrefixAwareTokenStream.cs @@ -16,168 +16,206 @@ */ using Lucene.Net.Analysis.Tokenattributes; -using Lucene.Net.Index; +using Lucene.Net.Util; namespace Lucene.Net.Analysis.Miscellaneous { /// <summary> /// Joins two token streams and leaves the last token of the first stream available /// to be used when updating the token values in the second stream based on that token. - /// + /// /// The default implementation adds last prefix token end offset to the suffix token start and end offsets. /// <p/> /// <b>NOTE:</b> This filter might not behave correctly if used with custom Attributes, i.e. Attributes other than - /// the ones located in Lucene.Net.Analysis.TokenAttributes. + /// the ones located in Lucene.Net.Analysis.Tokenattributes. 
/// </summary> public class PrefixAwareTokenFilter : TokenStream { - private readonly IFlagsAttribute _flagsAtt; - private readonly IOffsetAttribute _offsetAtt; - private readonly IFlagsAttribute _pFlagsAtt; - - private readonly IOffsetAttribute _pOffsetAtt; - private readonly IPayloadAttribute _pPayloadAtt; - private readonly IPositionIncrementAttribute _pPosIncrAtt; - private readonly ITermAttribute _pTermAtt; - private readonly ITypeAttribute _pTypeAtt; - private readonly IPayloadAttribute _payloadAtt; - private readonly IPositionIncrementAttribute _posIncrAtt; - - private readonly Token _previousPrefixToken = new Token(); - private readonly Token _reusableToken = new Token(); - private readonly ITermAttribute _termAtt; - private readonly ITypeAttribute _typeAtt; - - private bool _prefixExhausted; - - public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) : base(suffix) + private TokenStream prefix; + private TokenStream suffix; + + private ICharTermAttribute termAtt; + private IPositionIncrementAttribute posIncrAtt; + private IPayloadAttribute payloadAtt; + private IOffsetAttribute offsetAtt; + private ITypeAttribute typeAtt; + private IFlagsAttribute flagsAtt; + + private ICharTermAttribute p_termAtt; + private IPositionIncrementAttribute p_posIncrAtt; + private IPayloadAttribute p_payloadAtt; + private IOffsetAttribute p_offsetAtt; + private ITypeAttribute p_typeAtt; + private IFlagsAttribute p_flagsAtt; + + public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) + : base(suffix) { - Suffix = suffix; - Prefix = prefix; - _prefixExhausted = false; - - // ReSharper disable DoNotCallOverridableMethodsInConstructor - _termAtt = AddAttribute<ITermAttribute>(); - _posIncrAtt = AddAttribute<IPositionIncrementAttribute>(); - _payloadAtt = AddAttribute<IPayloadAttribute>(); - _offsetAtt = AddAttribute<IOffsetAttribute>(); - _typeAtt = AddAttribute<ITypeAttribute>(); - _flagsAtt = AddAttribute<IFlagsAttribute>(); - // ReSharper restore 
DoNotCallOverridableMethodsInConstructor - - _pTermAtt = prefix.AddAttribute<ITermAttribute>(); - _pPosIncrAtt = prefix.AddAttribute<IPositionIncrementAttribute>(); - _pPayloadAtt = prefix.AddAttribute<IPayloadAttribute>(); - _pOffsetAtt = prefix.AddAttribute<IOffsetAttribute>(); - _pTypeAtt = prefix.AddAttribute<ITypeAttribute>(); - _pFlagsAtt = prefix.AddAttribute<IFlagsAttribute>(); + this.suffix = suffix; + this.prefix = prefix; + prefixExhausted = false; + + termAtt = AddAttribute<ICharTermAttribute>(); + posIncrAtt = AddAttribute<IPositionIncrementAttribute>(); + payloadAtt = AddAttribute<IPayloadAttribute>(); + offsetAtt = AddAttribute<IOffsetAttribute>(); + typeAtt = AddAttribute<ITypeAttribute>(); + flagsAtt = AddAttribute<IFlagsAttribute>(); + + p_termAtt = prefix.AddAttribute<ICharTermAttribute>(); + p_posIncrAtt = prefix.AddAttribute<IPositionIncrementAttribute>(); + p_payloadAtt = prefix.AddAttribute<IPayloadAttribute>(); + p_offsetAtt = prefix.AddAttribute<IOffsetAttribute>(); + p_typeAtt = prefix.AddAttribute<ITypeAttribute>(); + p_flagsAtt = prefix.AddAttribute<IFlagsAttribute>(); } - public TokenStream Prefix { get; set; } + private Token previousPrefixToken = new Token(); + private Token reusableToken = new Token(); - public TokenStream Suffix { get; set; } + private bool prefixExhausted; - public override sealed bool IncrementToken() + public override bool IncrementToken() { - if (!_prefixExhausted) + Token nextToken; + if (!prefixExhausted) { - Token nextToken = GetNextPrefixInputToken(_reusableToken); + nextToken = GetNextPrefixInputToken(reusableToken); if (nextToken == null) { - _prefixExhausted = true; + prefixExhausted = true; } else { - _previousPrefixToken.Reinit(nextToken); + previousPrefixToken.Reinit(nextToken); // Make it a deep copy - Payload p = _previousPrefixToken.Payload; + var p = previousPrefixToken.Payload; if (p != null) { - _previousPrefixToken.Payload = (Payload) p.Clone(); + previousPrefixToken.Payload = 
(BytesRef)p.Clone(); } SetCurrentToken(nextToken); return true; } } - Token nextSuffixToken = GetNextSuffixInputToken(_reusableToken); - if (nextSuffixToken == null) + nextToken = GetNextSuffixInputToken(reusableToken); + if (nextToken == null) { return false; } - nextSuffixToken = UpdateSuffixToken(nextSuffixToken, _previousPrefixToken); - SetCurrentToken(nextSuffixToken); + nextToken = UpdateSuffixToken(nextToken, previousPrefixToken); + SetCurrentToken(nextToken); return true; } private void SetCurrentToken(Token token) { if (token == null) return; + ClearAttributes(); - _termAtt.SetTermBuffer(token.TermBuffer(), 0, token.TermLength()); - _posIncrAtt.PositionIncrement = token.PositionIncrement; - _flagsAtt.Flags =token.Flags; - _offsetAtt.SetOffset(token.StartOffset, token.EndOffset); - _typeAtt.Type = token.Type; - _payloadAtt.Payload = token.Payload; + termAtt.CopyBuffer(token.Buffer, 0, token.Length); + posIncrAtt.PositionIncrement = token.PositionIncrement; + flagsAtt.Flags = token.Flags; + offsetAtt.SetOffset(token.StartOffset, token.EndOffset); + typeAtt.Type = token.Type; + payloadAtt.Payload = token.Payload; } private Token GetNextPrefixInputToken(Token token) { - if (!Prefix.IncrementToken()) return null; - token.SetTermBuffer(_pTermAtt.TermBuffer(), 0, _pTermAtt.TermLength()); - token.PositionIncrement = _pPosIncrAtt.PositionIncrement; - token.Flags = _pFlagsAtt.Flags; - token.SetOffset(_pOffsetAtt.StartOffset, _pOffsetAtt.EndOffset); - token.Type = _pTypeAtt.Type; - token.Payload = _pPayloadAtt.Payload; + if (!prefix.IncrementToken()) return null; + + token.CopyBuffer(p_termAtt.Buffer, 0, p_termAtt.Length); + token.PositionIncrement = p_posIncrAtt.PositionIncrement; + token.Flags = p_flagsAtt.Flags; + token.SetOffset(p_offsetAtt.StartOffset, p_offsetAtt.EndOffset); + token.Type = p_typeAtt.Type; + token.Payload = p_payloadAtt.Payload; return token; } private Token GetNextSuffixInputToken(Token token) { - if (!Suffix.IncrementToken()) return null; - 
token.SetTermBuffer(_termAtt.TermBuffer(), 0, _termAtt.TermLength()); - token.PositionIncrement = _posIncrAtt.PositionIncrement; - token.Flags = _flagsAtt.Flags; - token.SetOffset(_offsetAtt.StartOffset, _offsetAtt.EndOffset); - token.Type = _typeAtt.Type; - token.Payload = _payloadAtt.Payload; + if (!suffix.IncrementToken()) return null; + + token.CopyBuffer(termAtt.Buffer, 0, termAtt.Length); + token.PositionIncrement = posIncrAtt.PositionIncrement; + token.Flags = flagsAtt.Flags; + token.SetOffset(offsetAtt.StartOffset, offsetAtt.EndOffset); + token.Type = typeAtt.Type; + token.Payload = payloadAtt.Payload; return token; } /// <summary> - /// The default implementation adds last prefix token end offset to the suffix token start and end offsets. + /// The default implementation adds last prefix token end offset + /// to the suffix token start and end offsets. /// </summary> - /// <param name="suffixToken">a token from the suffix stream</param> - /// <param name="lastPrefixToken">the last token from the prefix stream</param> - /// <returns>consumer token</returns> + /// <param name="suffixToken">A token from the suffix stream.</param> + /// <param name="lastPrefixToken">The last token from the prefix stream.</param> + /// <returns>Consumer token.</returns> public virtual Token UpdateSuffixToken(Token suffixToken, Token lastPrefixToken) { - suffixToken.StartOffset = lastPrefixToken.EndOffset + suffixToken.StartOffset; - suffixToken.EndOffset = lastPrefixToken.EndOffset + suffixToken.EndOffset; + suffixToken.SetOffset(lastPrefixToken.EndOffset + suffixToken.StartOffset, + lastPrefixToken.EndOffset + suffixToken.EndOffset); return suffixToken; } + + public override void End() + { + prefix.End(); + suffix.End(); + } + + + // was public override void Dispose + // changed to follow standard .NET dispose pattern protected override void Dispose(bool disposing) { - Prefix.Dispose(); - Suffix.Dispose(); + prefix.Dispose(); + suffix.Dispose(); } public override void Reset() 
{ base.Reset(); + if (prefix != null) + { + prefixExhausted = false; + prefix.Reset(); + } + if (suffix != null) + { + suffix.Reset(); + } + } - if (Prefix != null) + + public TokenStream Prefix + { + get { - _prefixExhausted = false; - Prefix.Reset(); + return prefix; } + set + { + prefix = value; + } + } - if (Suffix != null) - Suffix.Reset(); + public TokenStream Suffix + { + get + { + return suffix; + } + set + { + suffix = value; + } } } } \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/d6717ed8/src/contrib/Analyzers/Miscellaneous/SingleTokenTokenStream.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Analyzers/Miscellaneous/SingleTokenTokenStream.cs b/src/contrib/Analyzers/Miscellaneous/SingleTokenTokenStream.cs index b24c0f3..35b2e71 100644 --- a/src/contrib/Analyzers/Miscellaneous/SingleTokenTokenStream.cs +++ b/src/contrib/Analyzers/Miscellaneous/SingleTokenTokenStream.cs @@ -38,7 +38,7 @@ namespace Lucene.Net.Analysis.Miscellaneous Debug.Assert(token != null, "Token was null!"); _singleToken = (Token) token.Clone(); - _tokenAtt = (Attribute)AddAttribute<ITermAttribute>(); + _tokenAtt = (Attribute)AddAttribute<ICharTermAttribute>(); Debug.Assert(_tokenAtt is Token); } http://git-wip-us.apache.org/repos/asf/lucenenet/blob/d6717ed8/src/contrib/Analyzers/Support/StreamReaderExtensions.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Analyzers/Support/StreamReaderExtensions.cs b/src/contrib/Analyzers/Support/StreamReaderExtensions.cs new file mode 100644 index 0000000..f215e45 --- /dev/null +++ b/src/contrib/Analyzers/Support/StreamReaderExtensions.cs @@ -0,0 +1,13 @@ +using System.IO; + +namespace Lucene.Net.Analysis.Support +{ + public static class StreamReaderExtensions + { + public static void Reset(this StreamReader sr) + { + sr.BaseStream.Position = 0; + sr.DiscardBufferedData(); + } + } +} 
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/d6717ed8/src/contrib/Analyzers/Util/CharFilterFactory.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Analyzers/Util/CharFilterFactory.cs b/src/contrib/Analyzers/Util/CharFilterFactory.cs new file mode 100644 index 0000000..89688e2 --- /dev/null +++ b/src/contrib/Analyzers/Util/CharFilterFactory.cs @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +using System; +using System.Collections.Generic; +using System.IO; + +namespace Lucene.Net.Analysis.Util +{ + /// <summary> + /// Abstract parent class for analysis factories that create + /// {@link CharFilter} instances. + /// </summary> + public abstract class CharFilterFactory : AbstractAnalysisFactory + { + private static readonly AnalysisSPILoader<CharFilterFactory> Loader = + new AnalysisSPILoader<CharFilterFactory>(typeof (CharFilterFactory)); + + /// <summary> + /// Looks up a CharFilter by name from context classpath. 
+ /// </summary> + /// <param name="name"></param> + /// <param name="args"></param> + /// <returns>Returns an instance of the looked up CharFilter.</returns> + public static CharFilterFactory ForName(string name, IDictionary<string, string> args) + { + return Loader.NewInstance(name, args); + } + + /// <summary> + /// Looks up a CharFilter class by name from context classpath. + /// </summary> + /// <param name="name"></param> + /// <returns>Returns the type of the looked up CharFilter.</returns> + public static Type LookupType(string name) + { + return Loader.LookupClass(name); + } + + /// <summary> + /// Returns a list of all available CharFilter names. + /// </summary> + /// <returns>Returns a list of all available CharFilter names.</returns> + public static ICollection<string> AvailableCharFilters() + { + return Loader.AvailableServices; + } + + /// <summary> + /// Reloads the factory list from the given {@link ClassLoader}. + /// Changes to the factories are visible after the method ends, all + /// iterators ({@link #availableCharFilters()},...) stay consistent. + /// + /// <p><b>NOTE:</b> Only new factories are added, existing ones are + /// never removed or replaced. + /// + /// <p><em>This method is expensive and should only be called for discovery + /// of new factories on the given classpath/classloader!</em></p></p> + /// </summary> + public static void ReloadCharFilters() + { + Loader.Reload(); + } + + /// <summary> + /// Initialize this factory via a set of key-value pairs. + /// </summary> + /// <param name="args"></param> + protected CharFilterFactory(IDictionary<string, string> args) + : base(args) + { + + } + + /// <summary> + /// Wraps the given TextReader with a CharFilter. 
+ /// </summary> + /// <param name="input"></param> + /// <returns></returns> + public abstract StreamReader Create(StreamReader input); + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/d6717ed8/src/contrib/Analyzers/Util/RollingCharBuffer.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Analyzers/Util/RollingCharBuffer.cs b/src/contrib/Analyzers/Util/RollingCharBuffer.cs new file mode 100644 index 0000000..caa4355 --- /dev/null +++ b/src/contrib/Analyzers/Util/RollingCharBuffer.cs @@ -0,0 +1,195 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +using System; +using System.IO; +using Lucene.Net.Util; + +namespace Lucene.Net.Analysis.Util +{ + /// <summary> + /// Acts like a forever growing char[] as you read + /// characters into it from the provided reader, but + /// internally it uses a circular buffer to only hold the + /// characters that haven't been freed yet. This is like a + /// PushbackReader, except you don't have to specify + /// up-front the max size of the buffer, but you do have to + /// periodically call {@link #freeBefore}. 
+ /// </summary> + public sealed class RollingCharBuffer + { + private TextReader reader; + + private char[] buffer = new char[512]; + + // Next array index to write to in buffer: + private int nextWrite; + + // Next absolute position to read from reader: + private int nextPos; + + // How many valid chars (wrapped) are in the buffer: + private int count; + + // True if we hit EOF + private bool end; + + /// <summary> + /// Clear array and switch to new reader. + /// </summary> + /// <param name="reader"></param> + public void Reset(TextReader reader) + { + this.reader = reader; + nextPos = 0; + nextWrite = 0; + count = 0; + end = false; + } + + /// <summary> + /// Absolute position read. NOTE: pos must not jump + /// ahead by more than 1! I.e., it's OK to read arbitrarily + /// far back (just not prior to the last {@link + /// #freeBefore}), but NOT ok to read arbitrarily far + /// ahead. Returns -1 if you hit EOF. + /// </summary> + /// <param name="pos"></param> + /// <returns></returns> + public int Get(int pos) + { + if (pos == nextPos) + { + if (end) + { + return -1; + } + if (count == buffer.Length) + { + // Grow + var newBuffer = new char[ArrayUtil.Oversize(1 + count, RamUsageEstimator.NUM_BYTES_CHAR)]; + Array.Copy(buffer, nextWrite, newBuffer, 0, buffer.Length - nextWrite); + Array.Copy(buffer, 0, newBuffer, buffer.Length - nextWrite, nextWrite); + nextWrite = buffer.Length; + buffer = newBuffer; + } + if (nextWrite == buffer.Length) + { + nextWrite = 0; + } + + var toRead = buffer.Length - Math.Max(count, nextWrite); + var readCount = reader.Read(buffer, nextWrite, toRead); + if (readCount <= 0) // .NET TextReader.Read signals EOF with 0 (Java's Reader.read uses -1) + { + end = true; + return -1; + } + var ch = buffer[nextWrite]; + nextWrite += readCount; + count += readCount; + nextPos += readCount; + return ch; + } + else + { + // Cannot read from future (except by 1): + // assert pos < nextPos; + if (pos >= nextPos) + throw new InvalidOperationException("Cannot read 
from future (except by 1)."); + + // Cannot read from 
already freed past: + // assert nextPos - pos <= count: "nextPos=" + nextPos + " pos=" + pos + " count=" + count; + if (nextPos - pos > count) + throw new InvalidOperationException("nextPos=" + nextPos + " pos=" + pos + " count=" + count); + + return buffer[GetIndex(pos)]; + } + } + + // For assert: + private bool InBounds(int pos) + { + return pos >= 0 && pos < nextPos && pos >= nextPos - count; + } + + private int GetIndex(int pos) + { + var index = nextWrite - (nextPos - pos); + if (index < 0) + { + // Wrap: + index += buffer.Length; + //assert index >= 0; + if (index < 0) + throw new InvalidOperationException(); + } + return index; + } + + + public char[] Get(int posStart, int length) + { + if (length <= 0) + throw new ArgumentException("Must be greater than zero.", "length"); + + if (!InBounds(posStart)) + throw new ArgumentException("posStart=" + posStart + " length=" + length, "posStart"); + + var startIndex = GetIndex(posStart); + var endIndex = GetIndex(posStart + length); + + var result = new char[length]; + if (endIndex >= startIndex && length < buffer.Length) + { + Array.Copy(buffer, startIndex, result, 0, endIndex - startIndex); + } + else + { + // wrapped: + var part1 = buffer.Length - startIndex; + Array.Copy(buffer, startIndex, result, 0, part1); + Array.Copy(buffer, 0, result, buffer.Length-startIndex, length-part1); + } + return result; + } + + /// <summary> + /// Call this to notify us that no chars before this + /// absolute position are needed anymore. 
+ /// </summary> + /// <param name="pos"></param> + public void FreeBefore(int pos) + { + if (pos < 0) + throw new ArgumentException("Must be greater than or equal to zero.", "pos"); + + if (pos > nextPos) + throw new ArgumentException("Must be less than or equal to nextPos", "pos"); + + var newCount = nextPos - pos; + + if (newCount > count) + throw new InvalidOperationException("newCount=" + newCount + " count=" + count); + + if (newCount > buffer.Length) + throw new InvalidOperationException("newCount=" + newCount + " buf.length=" + buffer.Length); + + count = newCount; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/d6717ed8/src/core/Analysis/CharFilter.cs ---------------------------------------------------------------------- diff --git a/src/core/Analysis/CharFilter.cs b/src/core/Analysis/CharFilter.cs index b8a4332..1d1ecb7 100644 --- a/src/core/Analysis/CharFilter.cs +++ b/src/core/Analysis/CharFilter.cs @@ -15,6 +15,8 @@ * limitations under the License. */ +using System.IO; + namespace Lucene.Net.Analysis { @@ -27,11 +29,12 @@ namespace Lucene.Net.Analysis /// <version> $Id$ /// /// </version> - public abstract class CharFilter : System.IO.TextReader + public abstract class CharFilter : StreamReader { - protected readonly System.IO.TextReader input; - - public CharFilter(System.IO.TextReader input) + protected readonly StreamReader input; + + public CharFilter(StreamReader input) + : base(input.BaseStream) { this.input = input; }
