http://git-wip-us.apache.org/repos/asf/lucenenet/blob/8a97bfcf/src/Lucene.Net.Core/Analysis/Standard/StandardTokenizerImpl.jflex ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Core/Analysis/Standard/StandardTokenizerImpl.jflex b/src/Lucene.Net.Core/Analysis/Standard/StandardTokenizerImpl.jflex deleted file mode 100644 index 9308713..0000000 --- a/src/Lucene.Net.Core/Analysis/Standard/StandardTokenizerImpl.jflex +++ /dev/null @@ -1,156 +0,0 @@ -package org.apache.lucene.analysis.standard; - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - -WARNING: if you change StandardTokenizerImpl.jflex and need to regenerate - the tokenizer, only use Java 1.4 !!! - This grammar currently uses constructs (eg :digit:, :letter:) whose - meaning can vary according to the JRE used to run jflex. See - https://issues.apache.org/jira/browse/LUCENE-1126 for details. - For current backwards compatibility it is needed to support - only Java 1.4 - this will change in Lucene 3.1. 
- -*/ - -import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.tokenattributes.TermAttribute; - -%% - -%class StandardTokenizerImpl -%unicode -%integer -%function getNextToken -%pack -%char - -%{ - -public static final int ALPHANUM = StandardTokenizer.ALPHANUM; -public static final int APOSTROPHE = StandardTokenizer.APOSTROPHE; -public static final int ACRONYM = StandardTokenizer.ACRONYM; -public static final int COMPANY = StandardTokenizer.COMPANY; -public static final int EMAIL = StandardTokenizer.EMAIL; -public static final int HOST = StandardTokenizer.HOST; -public static final int NUM = StandardTokenizer.NUM; -public static final int CJ = StandardTokenizer.CJ; -/* - * @deprecated this solves a bug where HOSTs that end with '.' are identified - * as ACRONYMs. - */ -public static final int ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP; - -public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES; - -public final int yychar() -{ - return yychar; -} - -/* - * Resets the Tokenizer to a new Reader. - */ -final void reset(java.io.Reader r) { - // reset to default buffer size, if buffer has grown - if (zzBuffer.length > ZZ_BUFFERSIZE) { - zzBuffer = new char[ZZ_BUFFERSIZE]; - } - yyreset(r); -} - -/* - * Fills Lucene token with the current token text. - */ -final void getText(Token t) { - t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); -} - -/* - * Fills TermAttribute with the current token text. - */ -final void getText(TermAttribute t) { - t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); -} - -%} - -THAI = [\u0E00-\u0E59] - -// basic word: a sequence of digits & letters (includes Thai to enable ThaiAnalyzer to function) -ALPHANUM = ({LETTER}|{THAI}|[:digit:])+ - -// internal apostrophes: O'Reilly, you're, O'Reilly's -// use a post-filter to remove possessives -APOSTROPHE = {ALPHA} ("'" {ALPHA})+ - -// acronyms: U.S.A., I.B.M., etc. -// use a post-filter to remove dots -ACRONYM = {LETTER} "." 
({LETTER} ".")+ - -ACRONYM_DEP = {ALPHANUM} "." ({ALPHANUM} ".")+ - -// company names like AT&T and Excite@Home. -COMPANY = {ALPHA} ("&"|"@") {ALPHA} - -// email addresses -EMAIL = {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+ - -// hostname -HOST = {ALPHANUM} ((".") {ALPHANUM})+ - -// floating point, serial, model numbers, ip addresses, etc. -// every other segment must have at least one digit -NUM = ({ALPHANUM} {P} {HAS_DIGIT} - | {HAS_DIGIT} {P} {ALPHANUM} - | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+ - | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+ - | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+ - | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+) - -// punctuation -P = ("_"|"-"|"/"|"."|",") - -// at least one digit -HAS_DIGIT = ({LETTER}|[:digit:])* [:digit:] ({LETTER}|[:digit:])* - -ALPHA = ({LETTER})+ - -// From the JFlex manual: "the expression that matches everything of <a> not matched by <b> is !(!<a>|<b>)" -LETTER = !(![:letter:]|{CJ}) - -// Chinese and Japanese (but NOT Korean, which is included in [:letter:]) -CJ = [\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f] - -WHITESPACE = \r\n | [ \r\n\t\f] - -%% - -{ALPHANUM} { return ALPHANUM; } -{APOSTROPHE} { return APOSTROPHE; } -{ACRONYM} { return ACRONYM; } -{COMPANY} { return COMPANY; } -{EMAIL} { return EMAIL; } -{HOST} { return HOST; } -{NUM} { return NUM; } -{CJ} { return CJ; } -{ACRONYM_DEP} { return ACRONYM_DEP; } - -/* Ignore the rest */ -. | {WHITESPACE} { /* ignore */ }
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/8a97bfcf/src/Lucene.Net.Core/Analysis/StopAnalyzer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Core/Analysis/StopAnalyzer.cs b/src/Lucene.Net.Core/Analysis/StopAnalyzer.cs deleted file mode 100644 index 96a673d..0000000 --- a/src/Lucene.Net.Core/Analysis/StopAnalyzer.cs +++ /dev/null @@ -1,141 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -using System.Collections.Generic; -using Version = Lucene.Net.Util.Version; - -namespace Lucene.Net.Analysis -{ - - /// <summary> Filters <see cref="LetterTokenizer" /> with <see cref="LowerCaseFilter" /> and - /// <see cref="StopFilter" />. 
- /// - /// <a name="version"/> - /// <p/> - /// You must specify the required <see cref="Version" /> compatibility when creating - /// StopAnalyzer: - /// <list type="bullet"> - /// <item>As of 2.9, position increments are preserved</item> - /// </list> - /// </summary> - - public sealed class StopAnalyzer:Analyzer - { - private readonly ISet<string> stopWords; - private readonly bool enablePositionIncrements; - - /// <summary>An unmodifiable set containing some common English words that are not usually useful - /// for searching. - /// </summary> - public static ISet<string> ENGLISH_STOP_WORDS_SET; - - /// <summary> Builds an analyzer which removes words in ENGLISH_STOP_WORDS.</summary> - public StopAnalyzer(Version matchVersion) - { - stopWords = ENGLISH_STOP_WORDS_SET; - enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion); - } - - /// <summary>Builds an analyzer with the stop words from the given set.</summary> - public StopAnalyzer(Version matchVersion, ISet<string> stopWords) - { - this.stopWords = stopWords; - enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion); - } - - /// <summary> Builds an analyzer with the stop words from the given file. - /// - /// </summary> - /// <seealso cref="WordlistLoader.GetWordSet(System.IO.FileInfo)"> - /// </seealso> - /// <param name="matchVersion">See <a href="#version">above</a> - /// </param> - /// <param name="stopwordsFile">File to load stop words from - /// </param> - public StopAnalyzer(Version matchVersion, System.IO.FileInfo stopwordsFile) - { - stopWords = WordlistLoader.GetWordSet(stopwordsFile); - enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion); - } - - /// <summary>Builds an analyzer with the stop words from the given reader. 
</summary> - /// <seealso cref="WordlistLoader.GetWordSet(System.IO.TextReader)"> - /// </seealso> - /// <param name="matchVersion">See <a href="#Version">above</a> - /// </param> - /// <param name="stopwords">Reader to load stop words from - /// </param> - public StopAnalyzer(Version matchVersion, System.IO.TextReader stopwords) - { - stopWords = WordlistLoader.GetWordSet(stopwords); - enablePositionIncrements = StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion); - } - - /// <summary>Filters LowerCaseTokenizer with StopFilter. </summary> - public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader) - { - return new StopFilter(enablePositionIncrements, new LowerCaseTokenizer(reader), stopWords); - } - - /// <summary>Filters LowerCaseTokenizer with StopFilter. </summary> - private class SavedStreams - { - public SavedStreams(StopAnalyzer enclosingInstance) - { - InitBlock(enclosingInstance); - } - private void InitBlock(StopAnalyzer enclosingInstance) - { - this.enclosingInstance = enclosingInstance; - } - private StopAnalyzer enclosingInstance; - public StopAnalyzer Enclosing_Instance - { - get - { - return enclosingInstance; - } - - } - internal Tokenizer source; - internal TokenStream result; - } - - public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader) - { - var streams = (SavedStreams) PreviousTokenStream; - if (streams == null) - { - streams = new SavedStreams(this) {source = new LowerCaseTokenizer(reader)}; - streams.result = new StopFilter(enablePositionIncrements, streams.source, stopWords); - PreviousTokenStream = streams; - } - else - streams.source.Reset(reader); - return streams.result; - } - static StopAnalyzer() - { - { - var stopWords = new System.String[]{"a", "an", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", 
"to", "was", "will", "with"}; - var stopSet = new CharArraySet(stopWords.Length, false); - stopSet.AddAll(stopWords); - ENGLISH_STOP_WORDS_SET = CharArraySet.UnmodifiableSet(stopSet); - } - } - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/8a97bfcf/src/Lucene.Net.Core/Analysis/StopFilter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Core/Analysis/StopFilter.cs b/src/Lucene.Net.Core/Analysis/StopFilter.cs deleted file mode 100644 index 722faaf..0000000 --- a/src/Lucene.Net.Core/Analysis/StopFilter.cs +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -using System; -using System.Collections.Generic; -using Lucene.Net.Analysis.Tokenattributes; -using Lucene.Net.Util; -using QueryParser = Lucene.Net.QueryParsers.QueryParser; -using Version = Lucene.Net.Util.Version; - -namespace Lucene.Net.Analysis -{ - - /// <summary> Removes stop words from a token stream.</summary> - - public sealed class StopFilter:TokenFilter - { - private readonly CharArraySet stopWords; - private bool enablePositionIncrements = false; - - private readonly ITermAttribute termAtt; - private readonly IPositionIncrementAttribute posIncrAtt; - - /// <summary> Construct a token stream filtering the given input. - /// If <c>stopWords</c> is an instance of <see cref="CharArraySet" /> (true if - /// <c>makeStopSet()</c> was used to construct the set) it will be directly used - /// and <c>ignoreCase</c> will be ignored since <c>CharArraySet</c> - /// directly controls case sensitivity. - /// <p/> - /// If <c>stopWords</c> is not an instance of <see cref="CharArraySet" />, - /// a new CharArraySet will be constructed and <c>ignoreCase</c> will be - /// used to specify the case sensitivity of that set. 
- /// </summary> - /// <param name="enablePositionIncrements">true if token positions should record the removed stop words</param> - /// <param name="input">Input TokenStream</param> - /// <param name="stopWords">A Set of strings or strings or char[] or any other ToString()-able set representing the stopwords</param> - /// <param name="ignoreCase">if true, all words are lower cased first</param> - public StopFilter(bool enablePositionIncrements, TokenStream input, ISet<string> stopWords, bool ignoreCase) - : base(input) - { - if (stopWords is CharArraySet) - { - this.stopWords = (CharArraySet) stopWords; - } - else - { - this.stopWords = new CharArraySet(stopWords.Count, ignoreCase); - this.stopWords.AddAll(stopWords); - } - this.enablePositionIncrements = enablePositionIncrements; - termAtt = AddAttribute<ITermAttribute>(); - posIncrAtt = AddAttribute<IPositionIncrementAttribute>(); - } - - /// <summary> Constructs a filter which removes words from the input - /// TokenStream that are named in the Set. - /// </summary> - /// <param name="enablePositionIncrements">true if token positions should record the removed stop words</param> - /// <param name="in">Input stream</param> - /// <param name="stopWords">A Set of strings or char[] or any other ToString()-able set representing the stopwords</param> - /// <seealso cref="MakeStopSet(String[])"/> - public StopFilter(bool enablePositionIncrements, TokenStream @in, ISet<string> stopWords) - : this(enablePositionIncrements, @in, stopWords, false) - { } - - /// <summary> Builds a Set from an array of stop words, - /// appropriate for passing into the StopFilter constructor. - /// This permits this stopWords construction to be cached once when - /// an Analyzer is constructed. 
- /// - /// </summary> - /// <seealso cref="MakeStopSet(String[], bool)">passing false to ignoreCase</seealso> - public static ISet<string> MakeStopSet(params string[] stopWords) - { - return MakeStopSet(stopWords, false); - } - - /// <summary> Builds a Set from an array of stop words, - /// appropriate for passing into the StopFilter constructor. - /// This permits this stopWords construction to be cached once when - /// an Analyzer is constructed. - /// </summary> - /// <param name="stopWords">A list of strings or char[] or any other ToString()-able list representing the stop words</param> - /// <seealso cref="MakeStopSet(String[], bool)">passing false to ignoreCase</seealso> - public static ISet<string> MakeStopSet(IList<object> stopWords) - { - return MakeStopSet(stopWords, false); - } - - /// <summary></summary> - /// <param name="stopWords">An array of stopwords</param> - /// <param name="ignoreCase">If true, all words are lower cased first.</param> - /// <returns> a Set containing the words</returns> - public static ISet<string> MakeStopSet(string[] stopWords, bool ignoreCase) - { - var stopSet = new CharArraySet(stopWords.Length, ignoreCase); - stopSet.AddAll(stopWords); - return stopSet; - } - - /// <summary> </summary> - /// <param name="stopWords">A List of Strings or char[] or any other toString()-able list representing the stopwords </param> - /// <param name="ignoreCase">if true, all words are lower cased first</param> - /// <returns>A Set (<see cref="CharArraySet"/>)containing the words</returns> - public static ISet<string> MakeStopSet(IList<object> stopWords, bool ignoreCase) - { - var stopSet = new CharArraySet(stopWords.Count, ignoreCase); - foreach(var word in stopWords) - stopSet.Add(word.ToString()); - return stopSet; - } - - /// <summary> Returns the next input Token whose term() is not a stop word.</summary> - public override bool IncrementToken() - { - // return the first non-stop word found - int skippedPositions = 0; - while 
(input.IncrementToken()) - { - if (!stopWords.Contains(termAtt.TermBuffer(), 0, termAtt.TermLength())) - { - if (enablePositionIncrements) - { - posIncrAtt.PositionIncrement = posIncrAtt.PositionIncrement + skippedPositions; - } - return true; - } - skippedPositions += posIncrAtt.PositionIncrement; - } - // reached EOS -- return false - return false; - } - - /// <summary> Returns version-dependent default for enablePositionIncrements. Analyzers - /// that embed StopFilter use this method when creating the StopFilter. Prior - /// to 2.9, this returns false. On 2.9 or later, it returns true. - /// </summary> - public static bool GetEnablePositionIncrementsVersionDefault(Version matchVersion) - { - return matchVersion.OnOrAfter(Version.LUCENE_29); - } - - /// <summary> If <c>true</c>, this StopFilter will preserve - /// positions of the incoming tokens (ie, accumulate and - /// set position increments of the removed stop tokens). - /// Generally, <c>true</c> is best as it does not - /// lose information (positions of the original tokens) - /// during indexing. - /// - /// <p/> When set, when a token is stopped - /// (omitted), the position increment of the following - /// token is incremented. - /// - /// <p/> <b>NOTE</b>: be sure to also - /// set <see cref="QueryParser.EnablePositionIncrements" /> if - /// you use QueryParser to create queries. 
- /// </summary> - public bool EnablePositionIncrements - { - get { return enablePositionIncrements; } - set { enablePositionIncrements = value; } - } - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/8a97bfcf/src/Lucene.Net.Core/Analysis/TeeSinkTokenFilter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Core/Analysis/TeeSinkTokenFilter.cs b/src/Lucene.Net.Core/Analysis/TeeSinkTokenFilter.cs deleted file mode 100644 index 6eb217f..0000000 --- a/src/Lucene.Net.Core/Analysis/TeeSinkTokenFilter.cs +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -using System; -using System.Collections.Generic; -using Attribute = Lucene.Net.Util.Attribute; -using AttributeSource = Lucene.Net.Util.AttributeSource; - -namespace Lucene.Net.Analysis -{ - - /// <summary> This TokenFilter provides the ability to set aside attribute states - /// that have already been analyzed. This is useful in situations where multiple fields share - /// many common analysis steps and then go their separate ways. 
- /// <p/> - /// It is also useful for doing things like entity extraction or proper noun analysis as - /// part of the analysis workflow and saving off those tokens for use in another field. - /// - /// <code> - /// TeeSinkTokenFilter source1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader1)); - /// TeeSinkTokenFilter.SinkTokenStream sink1 = source1.newSinkTokenStream(); - /// TeeSinkTokenFilter.SinkTokenStream sink2 = source1.newSinkTokenStream(); - /// TeeSinkTokenFilter source2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader2)); - /// source2.addSinkTokenStream(sink1); - /// source2.addSinkTokenStream(sink2); - /// TokenStream final1 = new LowerCaseFilter(source1); - /// TokenStream final2 = source2; - /// TokenStream final3 = new EntityDetect(sink1); - /// TokenStream final4 = new URLDetect(sink2); - /// d.add(new Field("f1", final1)); - /// d.add(new Field("f2", final2)); - /// d.add(new Field("f3", final3)); - /// d.add(new Field("f4", final4)); - /// </code> - /// In this example, <c>sink1</c> and <c>sink2</c> will both get tokens from both - /// <c>reader1</c> and <c>reader2</c> after whitespace tokenizer - /// and now we can further wrap any of these in extra analysis, and more "sources" can be inserted if desired. - /// It is important, that tees are consumed before sinks (in the above example, the field names must be - /// less the sink's field names). If you are not sure, which stream is consumed first, you can simply - /// add another sink and then pass all tokens to the sinks at once using <see cref="ConsumeAllTokens" />. - /// This TokenFilter is exhausted after this. In the above example, change - /// the example above to: - /// <code> - /// ... - /// TokenStream final1 = new LowerCaseFilter(source1.newSinkTokenStream()); - /// TokenStream final2 = source2.newSinkTokenStream(); - /// sink1.consumeAllTokens(); - /// sink2.consumeAllTokens(); - /// ... 
- /// </code> - /// In this case, the fields can be added in any order, because the sources are not used anymore and all sinks are ready. - /// <p/>Note, the EntityDetect and URLDetect TokenStreams are for the example and do not currently exist in Lucene. - /// </summary> - public sealed class TeeSinkTokenFilter:TokenFilter - { - public class AnonymousClassSinkFilter:SinkFilter - { - public override bool Accept(AttributeSource source) - { - return true; - } - } - private readonly LinkedList<WeakReference> sinks = new LinkedList<WeakReference>(); - - /// <summary> Instantiates a new TeeSinkTokenFilter.</summary> - public TeeSinkTokenFilter(TokenStream input):base(input) - { - } - - /// <summary> Returns a new <see cref="SinkTokenStream" /> that receives all tokens consumed by this stream.</summary> - public SinkTokenStream NewSinkTokenStream() - { - return NewSinkTokenStream(ACCEPT_ALL_FILTER); - } - - /// <summary> Returns a new <see cref="SinkTokenStream" /> that receives all tokens consumed by this stream - /// that pass the supplied filter. - /// </summary> - /// <seealso cref="SinkFilter"> - /// </seealso> - public SinkTokenStream NewSinkTokenStream(SinkFilter filter) - { - var sink = new SinkTokenStream(this.CloneAttributes(), filter); - sinks.AddLast(new WeakReference(sink)); - return sink; - } - - /// <summary> Adds a <see cref="SinkTokenStream" /> created by another <c>TeeSinkTokenFilter</c> - /// to this one. The supplied stream will also receive all consumed tokens. - /// This method can be used to pass tokens from two different tees to one sink. 
- /// </summary> - public void AddSinkTokenStream(SinkTokenStream sink) - { - // check that sink has correct factory - if (!this.Factory.Equals(sink.Factory)) - { - throw new System.ArgumentException("The supplied sink is not compatible to this tee"); - } - // add eventually missing attribute impls to the existing sink - foreach (var impl in this.CloneAttributes().GetAttributeImplsIterator()) - { - sink.AddAttributeImpl(impl); - } - sinks.AddLast(new WeakReference(sink)); - } - - /// <summary> <c>TeeSinkTokenFilter</c> passes all tokens to the added sinks - /// when itself is consumed. To be sure, that all tokens from the input - /// stream are passed to the sinks, you can call this methods. - /// This instance is exhausted after this, but all sinks are instant available. - /// </summary> - public void ConsumeAllTokens() - { - while (IncrementToken()) - { - } - } - - public override bool IncrementToken() - { - if (input.IncrementToken()) - { - // capture state lazily - maybe no SinkFilter accepts this state - State state = null; - foreach(WeakReference wr in sinks) - { - var sink = (SinkTokenStream)wr.Target; - if (sink != null) - { - if (sink.Accept(this)) - { - if (state == null) - { - state = this.CaptureState(); - } - sink.AddState(state); - } - } - } - return true; - } - - return false; - } - - public override void End() - { - base.End(); - State finalState = CaptureState(); - foreach(WeakReference wr in sinks) - { - var sink = (SinkTokenStream)wr.Target; - if (sink != null) - { - sink.SetFinalState(finalState); - } - } - } - - /// <summary> A filter that decides which <see cref="AttributeSource" /> states to store in the sink.</summary> - public abstract class SinkFilter - { - /// <summary> Returns true, iff the current state of the passed-in <see cref="AttributeSource" /> shall be stored - /// in the sink. - /// </summary> - public abstract bool Accept(AttributeSource source); - - /// <summary> Called by <see cref="SinkTokenStream.Reset()" />. 
This method does nothing by default - /// and can optionally be overridden. - /// </summary> - public virtual void Reset() - { - // nothing to do; can be overridden - } - } - - public sealed class SinkTokenStream : TokenStream - { - private readonly LinkedList<State> cachedStates = new LinkedList<State>(); - private State finalState; - private IEnumerator<AttributeSource.State> it = null; - private readonly SinkFilter filter; - - internal SinkTokenStream(AttributeSource source, SinkFilter filter) - : base(source) - { - this.filter = filter; - } - - internal /*private*/ bool Accept(AttributeSource source) - { - return filter.Accept(source); - } - - internal /*private*/ void AddState(AttributeSource.State state) - { - if (it != null) - { - throw new System.SystemException("The tee must be consumed before sinks are consumed."); - } - cachedStates.AddLast(state); - } - - internal /*private*/ void SetFinalState(AttributeSource.State finalState) - { - this.finalState = finalState; - } - - public override bool IncrementToken() - { - // lazy init the iterator - if (it == null) - { - it = cachedStates.GetEnumerator(); - } - - if (!it.MoveNext()) - { - return false; - } - - State state = it.Current; - RestoreState(state); - return true; - } - - public override void End() - { - if (finalState != null) - { - RestoreState(finalState); - } - } - - public override void Reset() - { - it = cachedStates.GetEnumerator(); - } - - protected override void Dispose(bool disposing) - { - // Do nothing. 
- } - } - - private static readonly SinkFilter ACCEPT_ALL_FILTER; - static TeeSinkTokenFilter() - { - ACCEPT_ALL_FILTER = new AnonymousClassSinkFilter(); - } - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/8a97bfcf/src/Lucene.Net.Core/Analysis/WhitespaceAnalyzer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Core/Analysis/WhitespaceAnalyzer.cs b/src/Lucene.Net.Core/Analysis/WhitespaceAnalyzer.cs deleted file mode 100644 index ae94c44..0000000 --- a/src/Lucene.Net.Core/Analysis/WhitespaceAnalyzer.cs +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -namespace Lucene.Net.Analysis -{ - - /// <summary>An Analyzer that uses <see cref="WhitespaceTokenizer" />. 
</summary> - - public sealed class WhitespaceAnalyzer:Analyzer - { - public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader) - { - return new WhitespaceTokenizer(reader); - } - - public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader) - { - var tokenizer = (Tokenizer) PreviousTokenStream; - if (tokenizer == null) - { - tokenizer = new WhitespaceTokenizer(reader); - PreviousTokenStream = tokenizer; - } - else - tokenizer.Reset(reader); - return tokenizer; - } - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/8a97bfcf/src/Lucene.Net.Core/Analysis/WhitespaceTokenizer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Core/Analysis/WhitespaceTokenizer.cs b/src/Lucene.Net.Core/Analysis/WhitespaceTokenizer.cs deleted file mode 100644 index ba19da9..0000000 --- a/src/Lucene.Net.Core/Analysis/WhitespaceTokenizer.cs +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -using AttributeSource = Lucene.Net.Util.AttributeSource; - -namespace Lucene.Net.Analysis -{ - - /// <summary>A WhitespaceTokenizer is a tokenizer that divides text at whitespace. - /// Adjacent sequences of non-Whitespace characters form tokens. - /// </summary> - - public class WhitespaceTokenizer:CharTokenizer - { - /// <summary>Construct a new WhitespaceTokenizer. </summary> - public WhitespaceTokenizer(System.IO.TextReader @in) - : base(@in) - { - } - - /// <summary>Construct a new WhitespaceTokenizer using a given <see cref="AttributeSource" />. </summary> - public WhitespaceTokenizer(AttributeSource source, System.IO.TextReader @in) - : base(source, @in) - { - } - - /// <summary>Construct a new WhitespaceTokenizer using a given <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory" />. </summary> - public WhitespaceTokenizer(AttributeFactory factory, System.IO.TextReader @in) - : base(factory, @in) - { - } - - /// <summary>Collects only characters which do not satisfy - /// <see cref="char.IsWhiteSpace(char)" />. - /// </summary> - protected internal override bool IsTokenChar(char c) - { - return !System.Char.IsWhiteSpace(c); - } - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/8a97bfcf/src/Lucene.Net.Core/Analysis/WordlistLoader.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Core/Analysis/WordlistLoader.cs b/src/Lucene.Net.Core/Analysis/WordlistLoader.cs deleted file mode 100644 index d3abfe6..0000000 --- a/src/Lucene.Net.Core/Analysis/WordlistLoader.cs +++ /dev/null @@ -1,146 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
using System.Collections.Generic;

namespace Lucene.Net.Analysis
{
    /// <summary>Loader for text files that represent a list of stopwords.</summary>
    public class WordlistLoader
    {
        /// <summary>
        /// Loads a text file and adds every line as an entry to a HashSet (omitting
        /// leading and trailing whitespace). Every line of the file should contain only
        /// one word. The words need to be in lowercase if you make use of an
        /// Analyzer which uses LowerCaseFilter (like StandardAnalyzer).
        /// </summary>
        /// <param name="wordfile">File containing the wordlist</param>
        /// <returns>A HashSet with the file's words</returns>
        public static ISet<string> GetWordSet(System.IO.FileInfo wordfile)
        {
            using (var reader = new System.IO.StreamReader(wordfile.FullName, System.Text.Encoding.Default))
            {
                return GetWordSet(reader);
            }
        }

        /// <summary>
        /// Loads a text file and adds every non-comment line as an entry to a HashSet
        /// (omitting leading and trailing whitespace). Every line of the file should
        /// contain only one word.
        /// </summary>
        /// <param name="wordfile">File containing the wordlist</param>
        /// <param name="comment">The comment string to ignore</param>
        /// <returns>A HashSet with the file's words</returns>
        public static ISet<string> GetWordSet(System.IO.FileInfo wordfile, System.String comment)
        {
            using (var reader = new System.IO.StreamReader(wordfile.FullName, System.Text.Encoding.Default))
            {
                return GetWordSet(reader, comment);
            }
        }

        /// <summary>
        /// Reads lines from a Reader and adds every line as an entry to a HashSet
        /// (omitting leading and trailing whitespace). Every line of the Reader should
        /// contain only one word.
        /// </summary>
        /// <param name="reader">Reader containing the wordlist</param>
        /// <returns>A HashSet with the reader's words</returns>
        public static ISet<string> GetWordSet(System.IO.TextReader reader)
        {
            var result = Support.Compatibility.SetFactory.CreateHashSet<string>();

            System.String word;
            while ((word = reader.ReadLine()) != null)
            {
                result.Add(word.Trim());
            }

            return result;
        }

        /// <summary>
        /// Reads lines from a Reader and adds every non-comment line as an entry to a
        /// HashSet (omitting leading and trailing whitespace). Every line of the Reader
        /// should contain only one word.
        /// </summary>
        /// <param name="reader">Reader containing the wordlist</param>
        /// <param name="comment">The string representing a comment; lines starting with it are skipped</param>
        /// <returns>A HashSet with the reader's words</returns>
        public static ISet<string> GetWordSet(System.IO.TextReader reader, System.String comment)
        {
            var result = Support.Compatibility.SetFactory.CreateHashSet<string>();

            System.String word;
            while ((word = reader.ReadLine()) != null)
            {
                // Comment check is done on the raw line (before trimming),
                // matching the historical behavior of this loader.
                if (word.StartsWith(comment) == false)
                {
                    result.Add(word.Trim());
                }
            }

            return result;
        }

        /// <summary>
        /// Reads a stem dictionary. Each line contains:
        /// <c>word<b>\t</b>stem</c>
        /// (i.e. two tab separated words). Lines that do not contain a tab
        /// (e.g. blank lines) are skipped instead of crashing.
        /// </summary>
        /// <param name="wordstemfile">File containing the tab-separated dictionary</param>
        /// <returns>stem dictionary that overrules the stemming algorithm</returns>
        /// <throws>IOException</throws>
        public static Dictionary<string, string> GetStemDict(System.IO.FileInfo wordstemfile)
        {
            if (wordstemfile == null)
                // NOTE(review): ArgumentNullException would be more idiomatic, but the
                // thrown type is kept so existing callers catching it are unaffected.
                throw new System.NullReferenceException("wordstemfile may not be null");

            var result = new Dictionary<string, string>();
            char[] tab = { '\t' };

            // Single StreamReader with a using-statement replaces the previous
            // pattern of two stacked readers (one over the other's BaseStream)
            // closed via manual try/finally — same encoding, no leak on throw,
            // and consistent with the GetWordSet overloads above.
            using (var reader = new System.IO.StreamReader(wordstemfile.FullName, System.Text.Encoding.Default))
            {
                System.String line;
                while ((line = reader.ReadLine()) != null)
                {
                    System.String[] wordstem = line.Split(tab, 2);
                    // Guard against malformed input: the old code indexed [1]
                    // unconditionally and threw IndexOutOfRangeException on any
                    // line without a tab (including trailing blank lines).
                    if (wordstem.Length == 2)
                    {
                        result[wordstem[0]] = wordstem[1];
                    }
                }
            }

            return result;
        }
    }
}
"Analysis/Standard/*", - "Analysis/Keyword*.cs", - "Analysis/LowerCase*.cs", - "Analysis/Porter*.cs", - "Analysis/Stop*.cs", - "Analysis/Whitespace*.cs", "Document/FieldSelector*.cs", "Index/CompoundFile*.cs", "Index/DefaultSkipList*.cs", @@ -39,22 +33,6 @@ ], "excludeFiles": [ "RectangularArrays.cs", - "Analysis/ASCIIFoldingFilter.cs", - "Analysis/BaseCharFilter.cs", - "Analysis/CharArraySet.cs", - "Analysis/CharReader.cs", - "Analysis/CharStream.cs", - "Analysis/CharTokenizer.cs", - "Analysis/ISOLatin1AccentFilter.cs", - "Analysis/LengthFilter.cs", - "Analysis/LetterTokenizer.cs", - "Analysis/MappingCharFilter.cs", - "Analysis/NormalizeCharMap.cs", - "Analysis/PerFieldAnalyzerWrapper.cs", - "Analysis/SimpleAnalyzer.cs", - "Analysis/TeeSinkTokenFilter.cs", - "Analysis/Tokenattributes/TermAttribute.cs", - "Analysis/WordlistLoader.cs", "Document/AbstractField.cs", "Document/DateField.cs", "Document/Fieldable.cs", @@ -178,12 +156,6 @@ ], "compile": { "exclude": [ - "Analysis/Standard/*", - "Analysis/Keyword*.cs", - "Analysis/LowerCase*.cs", - "Analysis/Porter*.cs", - "Analysis/Stop*.cs", - "Analysis/Whitespace*.cs", "Document/FieldSelector*.cs", "Index/CompoundFile*.cs", "Index/DefaultSkipList*.cs", @@ -203,22 +175,6 @@ ], "excludeFiles": [ "RectangularArrays.cs", - "Analysis/ASCIIFoldingFilter.cs", - "Analysis/BaseCharFilter.cs", - "Analysis/CharArraySet.cs", - "Analysis/CharReader.cs", - "Analysis/CharStream.cs", - "Analysis/CharTokenizer.cs", - "Analysis/ISOLatin1AccentFilter.cs", - "Analysis/LengthFilter.cs", - "Analysis/LetterTokenizer.cs", - "Analysis/MappingCharFilter.cs", - "Analysis/NormalizeCharMap.cs", - "Analysis/PerFieldAnalyzerWrapper.cs", - "Analysis/SimpleAnalyzer.cs", - "Analysis/TeeSinkTokenFilter.cs", - "Analysis/Tokenattributes/TermAttribute.cs", - "Analysis/WordlistLoader.cs", "Document/AbstractField.cs", "Document/DateField.cs", "Document/Fieldable.cs",
