// Archived from src/contrib/Analyzers/Filters/ChainedFilter.cs
// (file removed at lucenenet revision 02362804, blob 0fa4e69).

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

using Lucene.Net.Search;
using Lucene.Net.Index;
using Lucene.Net.Util;

namespace Lucene.Net.Analysis
{
    /// <summary>
    /// Allows multiple <see cref="Filter"/>s to be chained. Logical operations
    /// such as <b>NOT</b> and <b>XOR</b> are applied between filters. One
    /// operation can be used for all filters, or a specific operation can be
    /// declared for each filter.
    /// </summary>
    /// <remarks>
    /// The order in which filters are called depends on the position of the
    /// filter in the chain. It is probably more efficient to place the most
    /// restrictive / least computationally intensive filters first.
    /// </remarks>
    public class ChainedFilter : Filter
    {
        /// <summary>Logical operation applied between two filters of the chain.</summary>
        public enum Logic
        {
            NONE = -1,
            OR = 0,
            AND = 1,
            ANDNOT = 2,
            XOR = 3
        }

        /// <summary>Logical operation when none is declared. Defaults to OR.</summary>
        public const Logic DEFAULT = Logic.OR;

        /// <summary>The filter chain.</summary>
        private readonly Filter[] chain;

        /// <summary>Per-filter operations, or <c>null</c> when a single operation is used.</summary>
        private readonly Logic[] logicArray;

        /// <summary>Operation applied to every filter, or <see cref="Logic.NONE"/> when unset.</summary>
        private readonly Logic logic = Logic.NONE;

        /// <summary>Creates a chain that combines its filters with <see cref="DEFAULT"/>.</summary>
        /// <param name="chain">The chain of filters.</param>
        /// <exception cref="ArgumentNullException"><paramref name="chain"/> is <c>null</c>.</exception>
        public ChainedFilter(Filter[] chain)
        {
            if (chain == null)
            {
                throw new ArgumentNullException("chain");
            }

            this.chain = chain;
        }

        /// <summary>Creates a chain with one logical operation per filter.</summary>
        /// <param name="chain">The chain of filters.</param>
        /// <param name="logicArray">
        /// Logical operations to apply between filters. Must have the same
        /// length as <paramref name="chain"/>; this is checked when the doc id
        /// set is computed. When <c>null</c>, <see cref="DEFAULT"/> is used.
        /// </param>
        /// <exception cref="ArgumentNullException"><paramref name="chain"/> is <c>null</c>.</exception>
        public ChainedFilter(Filter[] chain, Logic[] logicArray)
            : this(chain)
        {
            this.logicArray = logicArray;
        }

        /// <summary>Creates a chain that applies the same logical operation to all filters.</summary>
        /// <param name="chain">The chain of filters.</param>
        /// <param name="logic">Logical operation to apply to ALL filters.</param>
        /// <exception cref="ArgumentNullException"><paramref name="chain"/> is <c>null</c>.</exception>
        public ChainedFilter(Filter[] chain, Logic logic)
            : this(chain)
        {
            this.logic = logic;
        }

        /// <summary>
        /// Evaluates every filter of the chain against <paramref name="reader"/>
        /// and combines the results with the configured logical operation(s).
        /// </summary>
        /// <param name="reader">The index reader handed to each chained filter.</param>
        /// <returns>The combined set of matching documents.</returns>
        /// <exception cref="ArgumentException">
        /// A logic array was supplied whose length differs from the chain length.
        /// </exception>
        public override DocIdSet GetDocIdSet(IndexReader reader)
        {
            // A single explicit operation wins over a per-filter array; with
            // neither configured the default operation is used.
            if (logic != Logic.NONE)
            {
                return CombineUniform(reader, logic);
            }

            if (logicArray != null)
            {
                return CombinePerFilter(reader, logicArray);
            }

            return CombineUniform(reader, DEFAULT);
        }

        /// <summary>
        /// Turns a possibly-null doc id set (or one with a null iterator) into
        /// an iterator, substituting the empty set when nothing is available.
        /// </summary>
        private static DocIdSetIterator IteratorOrEmpty(DocIdSet docIdSet)
        {
            DocIdSetIterator iterator = docIdSet == null ? null : docIdSet.Iterator();
            return iterator ?? DocIdSet.EMPTY_DOCIDSET.Iterator();
        }

        /// <summary>Returns the iterator of <paramref name="filter"/> for <paramref name="reader"/>, never null.</summary>
        private DocIdSetIterator GetDISI(Filter filter, IndexReader reader)
        {
            return IteratorOrEmpty(filter.GetDocIdSet(reader));
        }

        /// <summary>
        /// Builds the bit set the rest of the chain is folded into.
        /// </summary>
        /// <param name="reader">The index reader.</param>
        /// <param name="firstLogic">Operation declared for the first filter.</param>
        /// <param name="index">
        /// Position of the next filter to process; advanced by one when the
        /// first filter is consumed to seed the result.
        /// </param>
        private OpenBitSetDISI InitialResult(IndexReader reader, Logic firstLogic, ref int index)
        {
            int maxDoc = reader.MaxDoc();

            // A first AND / ANDNOT against an all-false bit set would always
            // yield zero results, so the first filter seeds the result instead.
            // With an empty chain there is nothing to seed from.
            bool seedFromFirstFilter =
                (firstLogic == Logic.AND || firstLogic == Logic.ANDNOT) && index < chain.Length;

            if (!seedFromFirstFilter)
            {
                return new OpenBitSetDISI(maxDoc);
            }

            OpenBitSetDISI result = new OpenBitSetDISI(GetDISI(chain[index], reader), maxDoc);
            if (firstLogic == Logic.ANDNOT)
            {
                result.Flip(0, maxDoc); // NOTE: may set bits for deleted docs.
            }

            index++;
            return result;
        }

        /// <summary>
        /// Hook applied to the combined bit set before it is returned.
        /// The current implementation returns <paramref name="result"/> unchanged.
        /// </summary>
        /// <remarks>
        /// Deprecated: either use CachingWrapperFilter, or switch to a different
        /// DocIdSet implementation yourself. This method will be removed in
        /// Lucene 4.0.
        /// </remarks>
        /// <param name="result">The combined bit set.</param>
        /// <param name="maxDocs">Number of documents in the reader (unused).</param>
        /// <returns>The doc id set handed back to the caller.</returns>
        protected DocIdSet FinalResult(OpenBitSetDISI result, int maxDocs)
        {
            return result;
        }

        /// <summary>Delegates to each filter in the chain, applying one operation to all of them.</summary>
        /// <param name="reader">The index reader.</param>
        /// <param name="uniformLogic">Logical operation applied to every filter.</param>
        private DocIdSet CombineUniform(IndexReader reader, Logic uniformLogic)
        {
            int index = 0;
            OpenBitSetDISI result = InitialResult(reader, uniformLogic, ref index);
            for (; index < chain.Length; index++)
            {
                DoChain(result, uniformLogic, chain[index].GetDocIdSet(reader));
            }

            return FinalResult(result, reader.MaxDoc());
        }

        /// <summary>Delegates to each filter in the chain, applying the operation declared for it.</summary>
        /// <param name="reader">The index reader.</param>
        /// <param name="perFilterLogic">One logical operation per filter.</param>
        /// <exception cref="ArgumentException">The array length differs from the chain length.</exception>
        private DocIdSet CombinePerFilter(IndexReader reader, Logic[] perFilterLogic)
        {
            if (perFilterLogic.Length != chain.Length)
            {
                throw new ArgumentException("Invalid number of elements in logic array");
            }

            int index = 0;

            // Both arrays may legitimately be empty; there is then no first
            // operation to look at and the result is simply the empty set.
            Logic firstLogic = perFilterLogic.Length > 0 ? perFilterLogic[0] : DEFAULT;
            OpenBitSetDISI result = InitialResult(reader, firstLogic, ref index);
            for (; index < chain.Length; index++)
            {
                DoChain(result, perFilterLogic[index], chain[index].GetDocIdSet(reader));
            }

            return FinalResult(result, reader.MaxDoc());
        }

        /// <summary>Returns "ChainedFilter: [" followed by each filter and a space, then "]".</summary>
        public override String ToString()
        {
            StringBuilder sb = new StringBuilder();
            sb.Append("ChainedFilter: [");
            for (int i = 0; i < chain.Length; i++)
            {
                sb.Append(chain[i]);
                sb.Append(' ');
            }
            sb.Append(']');
            return sb.ToString();
        }

        /// <summary>Maps <see cref="Logic.NONE"/> and any undefined value to <see cref="DEFAULT"/>.</summary>
        private static Logic Normalize(Logic requested)
        {
            switch (requested)
            {
                case Logic.OR:
                case Logic.AND:
                case Logic.ANDNOT:
                case Logic.XOR:
                    return requested;
                default:
                    return DEFAULT;
            }
        }

        /// <summary>Folds one filter's doc id set into <paramref name="result"/> in place.</summary>
        /// <param name="result">Accumulated bit set; modified by this call.</param>
        /// <param name="logic">Operation to apply; unknown values fall back to <see cref="DEFAULT"/>.</param>
        /// <param name="dis">The filter's doc id set; <c>null</c> is treated as empty.</param>
        private void DoChain(OpenBitSetDISI result, Logic logic, DocIdSet dis)
        {
            Logic operation = Normalize(logic);

            OpenBitSet bits = dis as OpenBitSet;
            if (bits != null)
            {
                // Optimized case for OpenBitSets: combine word by word.
                switch (operation)
                {
                    case Logic.OR:
                        result.Or(bits);
                        break;
                    case Logic.AND:
                        result.And(bits);
                        break;
                    case Logic.ANDNOT:
                        result.AndNot(bits);
                        break;
                    case Logic.XOR:
                        result.Xor(bits);
                        break;
                }

                return;
            }

            DocIdSetIterator disi = IteratorOrEmpty(dis);
            switch (operation)
            {
                case Logic.OR:
                    result.InPlaceOr(disi);
                    break;
                case Logic.AND:
                    result.InPlaceAnd(disi);
                    break;
                case Logic.ANDNOT:
                    result.InPlaceNot(disi);
                    break;
                case Logic.XOR:
                    result.InPlaceXor(disi);
                    break;
            }
        }
    }
}
// Archived from src/contrib/Analyzers/Fr/ElisionFilter.cs
// (file removed at lucenenet revision 02362804, blob 630b29d).

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using System.Collections.Generic;
using Lucene.Net.Analysis.Tokenattributes;

namespace Lucene.Net.Analysis.Fr
{
    /// <summary>
    /// Removes elisions from a <see cref="TokenStream"/>. For example, "l'avion"
    /// (the plane) will be tokenized as "avion" (plane).
    /// </summary>
    /// <remarks>
    /// Note that StandardTokenizer sees " ' " as a space, and cuts it out.
    /// See <a href="http://fr.wikipedia.org/wiki/%C3%89lision">Elision in Wikipedia</a>.
    /// </remarks>
    public sealed class ElisionFilter : TokenFilter
    {
        /// <summary>
        /// Characters treated as the elision mark: the ASCII apostrophe and the
        /// typographic right single quotation mark (U+2019).
        /// </summary>
        private static readonly char[] apostrophes = { '\'', '\u2019' };

        private CharArraySet articles = null;
        private readonly ITermAttribute termAtt;

        /// <summary>
        /// Replaces the set of articles that are stripped when they precede an apostrophe.
        /// </summary>
        /// <param name="articles">
        /// The articles. A <see cref="CharArraySet"/> is used as is; any other
        /// set is copied into a case-insensitive <see cref="CharArraySet"/>.
        /// </param>
        public void SetArticles(ISet<string> articles)
        {
            CharArraySet charArraySet = articles as CharArraySet;
            this.articles = charArraySet ?? new CharArraySet(articles, true);
        }

        /// <summary>Constructs an elision filter with the standard set of articles.</summary>
        /// <param name="input">The token stream to filter.</param>
        internal ElisionFilter(TokenStream input)
            : this(input, new[] { "l", "m", "t", "qu", "n", "s", "j" })
        { }

        /// <summary>Constructs an elision filter with a set of articles.</summary>
        /// <param name="input">The token stream to filter.</param>
        /// <param name="articles">The articles to strip.</param>
        public ElisionFilter(TokenStream input, ISet<string> articles)
            : base(input)
        {
            SetArticles(articles);
            termAtt = AddAttribute<ITermAttribute>();
        }

        /// <summary>Constructs an elision filter with a sequence of articles.</summary>
        /// <param name="input">The token stream to filter.</param>
        /// <param name="articles">The articles to strip; matched case-insensitively.</param>
        public ElisionFilter(TokenStream input, IEnumerable<string> articles)
            : base(input)
        {
            this.articles = new CharArraySet(articles, true);
            termAtt = AddAttribute<ITermAttribute>();
        }

        /// <summary>
        /// Advances the underlying stream and, when the term starts with a known
        /// article followed by an apostrophe, drops that prefix from the term.
        /// </summary>
        /// <returns><c>true</c> for the next token, <c>false</c> at end of stream.</returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            char[] termBuffer = termAtt.TermBuffer();
            int termLength = termAtt.TermLength();

            // Position of the earliest apostrophe of any kind, or -1 when the
            // term holds none. Only the first termLength chars are valid.
            int firstApostrophe = -1;
            foreach (char apostrophe in apostrophes)
            {
                int position = Array.IndexOf(termBuffer, apostrophe, 0, termLength);
                if (position >= 0 && (firstApostrophe < 0 || position < firstApostrophe))
                {
                    firstApostrophe = position;
                }
            }

            // An apostrophe has been found. If the prefix is an article strip it off.
            if (firstApostrophe >= 0 && articles.Contains(termBuffer, 0, firstApostrophe))
            {
                int start = firstApostrophe + 1;
                termAtt.SetTermBuffer(termBuffer, start, termLength - start);
            }

            return true;
        }
    }
}

// Archived from src/contrib/Analyzers/Fr/FrenchAnalyzer.cs
// (file removed at lucenenet revision 02362804, blob 43bd1f9).

/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
*/

using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Collections;

using Lucene.Net.Analysis;
using Lucene.Net.Analysis.De;
using Lucene.Net.Analysis.Standard;
using Version = Lucene.Net.Util.Version;

namespace Lucene.Net.Analysis.Fr
{
    /// <summary>
    /// <see cref="Analyzer"/> for the French language.
    /// </summary>
    /// <remarks>
    /// <para>
    /// Supports an external list of stopwords (words that will not be indexed
    /// at all) and an external list of exclusions (words that will not be
    /// stemmed, but indexed). A default set of stopwords is used unless an
    /// alternative list is specified, but the exclusion list is empty by default.
    /// </para>
    /// <para>
    /// You must specify the required <see cref="Version"/> compatibility when
    /// creating FrenchAnalyzer: as of 2.9, StopFilter preserves position increments.
    /// </para>
    /// <para>
    /// <b>NOTE</b>: This class uses the same <see cref="Version"/> dependent
    /// settings as <see cref="StandardAnalyzer"/>.
    /// </para>
    /// </remarks>
    public sealed class FrenchAnalyzer : Analyzer
    {
        /// <summary>
        /// Extended list of typical French stopwords.
        /// Deprecated: use <see cref="GetDefaultStopSet"/> instead.
        /// </summary>
        // TODO make this private in 3.1
        public static readonly String[] FRENCH_STOP_WORDS = {
            "a", "afin", "ai", "ainsi", "après", "attendu", "au", "aujourd", "auquel", "aussi",
            "autre", "autres", "aux", "auxquelles", "auxquels", "avait", "avant", "avec", "avoir",
            "c", "car", "ce", "ceci", "cela", "celle", "celles", "celui", "cependant", "certain",
            "certaine", "certaines", "certains", "ces", "cet", "cette", "ceux", "chez", "ci",
            "combien", "comme", "comment", "concernant", "contre", "d", "dans", "de", "debout",
            "dedans", "dehors", "delà", "depuis", "derrière", "des", "désormais", "desquelles",
            "desquels", "dessous", "dessus", "devant", "devers", "devra", "divers", "diverse",
            "diverses", "doit", "donc", "dont", "du", "duquel", "durant", "dès", "elle", "elles",
            "en", "entre", "environ", "est", "et", "etc", "etre", "eu", "eux", "excepté", "hormis",
            "hors", "hélas", "hui", "il", "ils", "j", "je", "jusqu", "jusque", "l", "la", "laquelle",
            "le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "lorsque", "lui", "là",
            "ma", "mais", "malgré", "me", "merci", "mes", "mien", "mienne", "miennes", "miens", "moi",
            "moins", "mon", "moyennant", "même", "mêmes", "n", "ne", "ni", "non", "nos", "notre",
            "nous", "néanmoins", "nôtre", "nôtres", "on", "ont", "ou", "outre", "où", "par", "parmi",
            "partant", "pas", "passé", "pendant", "plein", "plus", "plusieurs", "pour", "pourquoi",
            "proche", "près", "puisque", "qu", "quand", "que", "quel", "quelle", "quelles", "quels",
            "qui", "quoi", "quoique", "revoici", "revoilà", "s", "sa", "sans", "sauf", "se", "selon",
            "seront", "ses", "si", "sien", "sienne", "siennes", "siens", "sinon", "soi", "soit",
            "son", "sont", "sous", "suivant", "sur", "ta", "te", "tes", "tien", "tienne", "tiennes",
            "tiens", "toi", "ton", "tous", "tout", "toute", "toutes", "tu", "un", "une", "va", "vers",
            "voici", "voilà", "vos", "votre", "vous", "vu", "vôtre", "vôtres", "y", "à", "ça", "ès",
            "été", "être", "ô"
        };

        /// <summary>Contains the stopwords used with the <see cref="StopFilter"/>.</summary>
        private readonly ISet<string> stoptable;

        /// <summary>Contains words that should be indexed but not stemmed.</summary>
        // TODO make this final in 3.0 (still reassigned by the deprecated SetStemExclusionTable overloads)
        private ISet<string> excltable = Support.Compatibility.SetFactory.CreateHashSet<string>();

        private readonly Version matchVersion;

        /// <summary>Returns an unmodifiable instance of the default stop-words set.</summary>
        /// <returns>An unmodifiable instance of the default stop-words set.</returns>
        public static ISet<string> GetDefaultStopSet()
        {
            return DefaultSetHolder.DEFAULT_STOP_SET;
        }

        /// <summary>Holds the default stop set so that it is only built when first requested.</summary>
        static class DefaultSetHolder
        {
            internal static readonly ISet<string> DEFAULT_STOP_SET =
                CharArraySet.UnmodifiableSet(new CharArraySet((IEnumerable<string>)FRENCH_STOP_WORDS, false));
        }

        /// <summary>
        /// Builds an analyzer with the default stop words (<see cref="FRENCH_STOP_WORDS"/>).
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version.</param>
        public FrenchAnalyzer(Version matchVersion)
            : this(matchVersion, DefaultSetHolder.DEFAULT_STOP_SET)
        {
        }

        /// <summary>Builds an analyzer with the given stop words.</summary>
        /// <param name="matchVersion">Lucene compatibility version.</param>
        /// <param name="stopwords">A stopword set.</param>
        public FrenchAnalyzer(Version matchVersion, ISet<string> stopwords)
            : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
        {
        }

        /// <summary>Builds an analyzer with the given stop words and stemming exclusions.</summary>
        /// <param name="matchVersion">Lucene compatibility version.</param>
        /// <param name="stopwords">A stopword set.</param>
        /// <param name="stemExclutionSet">A stemming exclusion set.</param>
        public FrenchAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclutionSet)
        {
            this.matchVersion = matchVersion;
            // Both sets are copied so later changes by the caller do not leak in.
            this.stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
            this.excltable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stemExclutionSet));
        }

        /// <summary>
        /// Builds an analyzer with the given stop words.
        /// Deprecated: use <see cref="FrenchAnalyzer(Version, ISet{string})"/> instead.
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version.</param>
        /// <param name="stopwords">The stop words.</param>
        public FrenchAnalyzer(Version matchVersion, params string[] stopwords)
            : this(matchVersion, StopFilter.MakeStopSet(stopwords))
        {
        }

        /// <summary>
        /// Builds an analyzer with the stop words read from the given file.
        /// Deprecated: use <see cref="FrenchAnalyzer(Version, ISet{string})"/> instead.
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version.</param>
        /// <param name="stopwords">File passed to <c>WordlistLoader.GetWordSet</c>.</param>
        public FrenchAnalyzer(Version matchVersion, FileInfo stopwords)
            : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
        {
        }

        /// <summary>
        /// Builds an exclusion list from an array of strings.
        /// Deprecated: use <see cref="FrenchAnalyzer(Version, ISet{string}, ISet{string})"/> instead.
        /// </summary>
        /// <param name="exclusionlist">Words that must not be stemmed.</param>
        public void SetStemExclusionTable(params string[] exclusionlist)
        {
            excltable = StopFilter.MakeStopSet(exclusionlist);
            PreviousTokenStream = null; // force a new stemmer to be created
        }

        /// <summary>
        /// Builds an exclusion list from the keys of a dictionary.
        /// Deprecated: use <see cref="FrenchAnalyzer(Version, ISet{string}, ISet{string})"/> instead.
        /// </summary>
        /// <param name="exclusionlist">Dictionary whose keys must not be stemmed.</param>
        public void SetStemExclusionTable(IDictionary<string, string> exclusionlist)
        {
            excltable = Support.Compatibility.SetFactory.CreateHashSet(exclusionlist.Keys);
            PreviousTokenStream = null; // force a new stemmer to be created
        }

        /// <summary>
        /// Builds an exclusion list from the words contained in the given file.
        /// Deprecated: use <see cref="FrenchAnalyzer(Version, ISet{string}, ISet{string})"/> instead.
        /// </summary>
        /// <param name="exclusionlist">File passed to <c>WordlistLoader.GetWordSet</c>.</param>
        public void SetStemExclusionTable(FileInfo exclusionlist)
        {
            excltable = WordlistLoader.GetWordSet(exclusionlist);
            PreviousTokenStream = null; // force a new stemmer to be created
        }

        /// <summary>
        /// Wraps a tokenizer in the analyzer's filter chain: StandardFilter,
        /// StopFilter, FrenchStemFilter and finally LowerCaseFilter.
        /// Shared by <see cref="TokenStream"/> and <see cref="ReusableTokenStream"/>
        /// so both always build the same chain.
        /// </summary>
        private TokenStream BuildFilterChain(TokenStream source)
        {
            TokenStream result = new StandardFilter(source);
            result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                    result, stoptable);
            result = new FrenchStemFilter(result, excltable);
            // Convert to lowercase after stemming!
            result = new LowerCaseFilter(result);
            return result;
        }

        /// <summary>
        /// Creates a <see cref="TokenStream"/> which tokenizes all the text in the
        /// provided <see cref="TextReader"/>.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed (not used here).</param>
        /// <param name="reader">The text to tokenize.</param>
        /// <returns>
        /// A token stream built from a <see cref="StandardTokenizer"/> filtered with
        /// <see cref="StandardFilter"/>, <see cref="StopFilter"/>,
        /// <see cref="FrenchStemFilter"/> and <see cref="LowerCaseFilter"/>.
        /// </returns>
        public override sealed TokenStream TokenStream(String fieldName, TextReader reader)
        {
            return BuildFilterChain(new StandardTokenizer(matchVersion, reader));
        }

        /// <summary>Tokenizer and filter chain kept in <c>PreviousTokenStream</c> for reuse.</summary>
        class SavedStreams
        {
            protected internal Tokenizer source;
            protected internal TokenStream result;
        }

        /// <summary>
        /// Returns a (possibly reused) <see cref="TokenStream"/> which tokenizes all
        /// the text in the provided <see cref="TextReader"/>.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed (not used here).</param>
        /// <param name="reader">The text to tokenize.</param>
        /// <returns>
        /// A token stream built from a <see cref="StandardTokenizer"/> filtered with
        /// <see cref="StandardFilter"/>, <see cref="StopFilter"/>,
        /// <see cref="FrenchStemFilter"/> and <see cref="LowerCaseFilter"/>.
        /// </returns>
        public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
        {
            SavedStreams streams = (SavedStreams)PreviousTokenStream;
            if (streams != null)
            {
                // Chain already built: just point the tokenizer at the new text.
                streams.source.Reset(reader);
                return streams.result;
            }

            streams = new SavedStreams();
            streams.source = new StandardTokenizer(matchVersion, reader);
            streams.result = BuildFilterChain(streams.source);
            PreviousTokenStream = streams;
            return streams.result;
        }
    }
}

// Archived from src/contrib/Analyzers/Fr/FrenchStemFilter.cs
// (file removed at lucenenet revision 02362804, blob 198e967).

/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 *
*/

using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Collections;

using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Tokenattributes;

namespace Lucene.Net.Analysis.Fr
{
    /// <summary>
    /// A <see cref="TokenFilter"/> that stems French words.
    /// </summary>
    /// <remarks>
    /// <para>
    /// It supports a table of words that should not be stemmed at all. The
    /// stemmer in use can be changed at runtime after the filter object is
    /// created (as long as it is a <see cref="FrenchStemmer"/>).
    /// </para>
    /// <para>
    /// NOTE: This stemmer does not implement the Snowball algorithm correctly,
    /// especially involving case problems. It is recommended that you consider
    /// using the "French" stemmer in the snowball package instead. This stemmer
    /// will likely be deprecated in a future release.
    /// </para>
    /// </remarks>
    public sealed class FrenchStemFilter : TokenFilter
    {
        /// <summary>The stemmer applied to each term; replaceable via <see cref="SetStemmer"/>.</summary>
        private FrenchStemmer stemmer = null;

        /// <summary>Terms that are passed through unstemmed, or <c>null</c> for none.</summary>
        private ISet<string> exclusions = null;

        private readonly ITermAttribute termAtt;

        /// <summary>Creates a stem filter without an exclusion table.</summary>
        /// <param name="_in">The token stream to filter.</param>
        public FrenchStemFilter(TokenStream _in)
            : base(_in)
        {
            stemmer = new FrenchStemmer();
            termAtt = AddAttribute<ITermAttribute>();
        }

        /// <summary>Creates a stem filter with an exclusion table.</summary>
        /// <param name="_in">The token stream to filter.</param>
        /// <param name="exclusiontable">Terms that must not be stemmed.</param>
        public FrenchStemFilter(TokenStream _in, ISet<string> exclusiontable)
            : this(_in)
        {
            exclusions = exclusiontable;
        }

        /// <summary>
        /// Advances the underlying stream and replaces the term with its stem
        /// unless the term is listed in the exclusion table.
        /// </summary>
        /// <returns><c>true</c> for the next token in the stream, or <c>false</c> at EOS.</returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            String term = termAtt.Term;

            // Check the exclusion table
            bool excluded = exclusions != null && exclusions.Contains(term);
            if (!excluded)
            {
                String stemmed = stemmer.Stem(term);
                // If not stemmed, don't waste the time adjusting the token.
                if (stemmed != null && !stemmed.Equals(term))
                {
                    termAtt.SetTermBuffer(stemmed);
                }
            }

            return true;
        }

        /// <summary>
        /// Sets an alternative/custom <see cref="FrenchStemmer"/> for this filter.
        /// A <c>null</c> argument is ignored and the current stemmer is kept.
        /// </summary>
        /// <param name="stemmer">The stemmer to use from now on.</param>
        public void SetStemmer(FrenchStemmer stemmer)
        {
            if (stemmer != null)
            {
                this.stemmer = stemmer;
            }
        }

        /// <summary>Sets an alternative exclusion list for this filter.</summary>
        /// <param name="exclusiontable">Dictionary whose keys must not be stemmed.</param>
        public void SetExclusionTable(IDictionary<string, string> exclusiontable)
        {
            exclusions = Support.Compatibility.SetFactory.CreateHashSet(exclusiontable.Keys);
        }
    }
}

// Archived from src/contrib/Analyzers/Fr/FrenchStemmer.cs
// (file removed at lucenenet revision 02362804, blob 2dc3a1c).

/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - * -*/ - -using System; -using System.Text; - -namespace Lucene.Net.Analysis.Fr -{ - -/* - * A stemmer for French words. - * <p> - * The algorithm is based on the work of - * Dr Martin Porter on his snowball project<br> - * refer to http://snowball.sourceforge.net/french/stemmer.html<br> - * (French stemming algorithm) for details - * </p> - */ - -public class FrenchStemmer { - - /* - * Buffer for the terms while stemming them. - */ - private StringBuilder sb = new StringBuilder(); - - /* - * A temporary buffer, used to reconstruct R2 - */ - private StringBuilder tb = new StringBuilder(); - - /* - * Region R0 is equal to the whole buffer - */ - private String R0; - - /* - * Region RV - * "If the word begins with two vowels, RV is the region after the third letter, - * otherwise the region after the first vowel not at the beginning of the word, - * or the end of the word if these positions cannot be found." 
- */ - private String RV; - - /* - * Region R1 - * "R1 is the region after the first non-vowel following a vowel - * or is the null region at the end of the word if there is no such non-vowel" - */ - private String R1; - - /* - * Region R2 - * "R2 is the region after the first non-vowel in R1 following a vowel - * or is the null region at the end of the word if there is no such non-vowel" - */ - private String R2; - - - /* - * Set to true if we need to perform step 2 - */ - private bool suite; - - /* - * Set to true if the buffer was modified - */ - private bool modified; - - - /* - * Stems the given term to a unique <tt>discriminator</tt>. - * - * @param term java.langString The term that should be stemmed - * @return java.lang.String Discriminator for <tt>term</tt> - */ - protected internal String Stem( String term ) { - if ( !IsStemmable( term ) ) { - return term; - } - - // Use lowercase for medium stemming. - term = term.ToLower(); - - // Reset the StringBuilder. - sb.Length = 0; - sb.Insert( 0, term ); - - // reset the bools - modified = false; - suite = false; - - sb = TreatVowels( sb ); - - SetStrings(); - - Step1(); - - if (!modified || suite) - { - if (RV != null) - { - suite = Step2A(); - if (!suite) - Step2B(); - } - } - - if (modified || suite) - Step3(); - else - Step4(); - - Step5(); - - Step6(); - - return sb.ToString(); - } - - /* - * Sets the search region Strings<br> - * it needs to be done each time the buffer was modified - */ - private void SetStrings() { - // set the strings - R0 = sb.ToString(); - RV = RetrieveRV( sb ); - R1 = RetrieveR( sb ); - if ( R1 != null ) - { - tb.Length = 0; - tb.Insert( 0, R1 ); - R2 = RetrieveR( tb ); - } - else - R2 = null; - } - - /* - * First step of the Porter Algorithm<br> - * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation - */ - private void Step1( ) { - String[] suffix = { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" }; - DeleteFrom( 
R2, suffix ); - - ReplaceFrom( R2, new String[] { "logies", "logie" }, "log" ); - ReplaceFrom( R2, new String[] { "usions", "utions", "usion", "ution" }, "u" ); - ReplaceFrom( R2, new String[] { "ences", "ence" }, "ent" ); - - String[] search = { "atrices", "ateurs", "ations", "atrice", "ateur", "ation"}; - DeleteButSuffixFromElseReplace( R2, search, "ic", true, R0, "iqU" ); - - DeleteButSuffixFromElseReplace( R2, new String[] { "ements", "ement" }, "eus", false, R0, "eux" ); - DeleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "ativ", false ); - DeleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iv", false ); - DeleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "abl", false ); - DeleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iqU", false ); - - DeleteFromIfTestVowelBeforeIn( R1, new String[] { "issements", "issement" }, false, R0 ); - DeleteFrom( RV, new String[] { "ements", "ement" } ); - - DeleteButSuffixFromElseReplace(R2, new [] { "it\u00e9s", "it\u00e9" }, "abil", false, R0, "abl"); - DeleteButSuffixFromElseReplace(R2, new [] { "it\u00e9s", "it\u00e9" }, "ic", false, R0, "iqU"); - DeleteButSuffixFrom(R2, new [] { "it\u00e9s", "it\u00e9" }, "iv", true); - - String[] autre = { "ifs", "ives", "if", "ive" }; - DeleteButSuffixFromElseReplace( R2, autre, "icat", false, R0, "iqU" ); - DeleteButSuffixFromElseReplace( R2, autre, "at", true, R2, "iqU" ); - - ReplaceFrom( R0, new String[] { "eaux" }, "eau" ); - - ReplaceFrom( R1, new String[] { "aux" }, "al" ); - - DeleteButSuffixFromElseReplace( R2, new String[] { "euses", "euse" }, "", true, R1, "eux" ); - - DeleteFrom( R2, new String[] { "eux" } ); - - // if one of the next steps is performed, we will need to perform step2a - bool temp = false; - temp = ReplaceFrom( RV, new String[] { "amment" }, "ant" ); - if (temp == true) - suite = true; - temp = ReplaceFrom( RV, new String[] { "emment" }, "ent" ); - if (temp == true) - suite = true; - temp = 
DeleteFromIfTestVowelBeforeIn( RV, new String[] { "ments", "ment" }, true, RV ); - if (temp == true) - suite = true; - - } - - /* - * Second step (A) of the Porter Algorithm<br> - * Will be performed if nothing changed from the first step - * or changed were done in the amment, emment, ments or ment suffixes<br> - * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation - * - * @return bool - true if something changed in the StringBuilder - */ - private bool Step2A() { - String[] search = { "\u00eemes", "\u00eetes", "iraIent", "irait", "irais", "irai", "iras", "ira", - "irent", "iriez", "irez", "irions", "irons", "iront", - "issaIent", "issais", "issantes", "issante", "issants", "issant", - "issait", "issais", "issions", "issons", "issiez", "issez", "issent", - "isses", "isse", "ir", "is", "\u00eet", "it", "ies", "ie", "i" }; - return DeleteFromIfTestVowelBeforeIn( RV, search, false, RV ); - } - - /* - * Second step (B) of the Porter Algorithm<br> - * Will be performed if step 2 A was performed unsuccessfully<br> - * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation - */ - private void Step2B() { - String[] suffix = { "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez", - "erons", "eront","erez", "\u00e8rent", "era", "\u00e9es", "iez", - "\u00e9e", "\u00e9s", "er", "ez", "\u00e9" }; - DeleteFrom( RV, suffix ); - - String[] search = { "assions", "assiez", "assent", "asses", "asse", "aIent", - "antes", "aIent", "Aient", "ante", "\u00e2mes", "\u00e2tes", "ants", "ant", - "ait", "a\u00eet", "ais", "Ait", "A\u00eet", "Ais", "\u00e2t", "as", "ai", "Ai", "a" }; - DeleteButSuffixFrom( RV, search, "e", true ); - - DeleteFrom( R2, new String[] { "ions" } ); - } - - /* - * Third step of the Porter Algorithm<br> - * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation - */ - private void Step3() { - if (sb.Length>0) - { - char ch = sb[ sb.Length -1]; - if (ch == 'Y') - { - sb[sb.Length 
/*
 * Fifth step of the French stemmer<br>
 * When R0 ends with "enn", "onn", "ett", "ell" or "eill", the last character
 * of the buffer is removed.<br>
 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
 */
private void Step5() {
    if (R0 == null)
        return;

    String[] doubledEndings = { "enn", "onn", "ett", "ell", "eill" };
    for (int i = 0; i < doubledEndings.Length; i++) {
        if (R0.EndsWith( doubledEndings[i] ))
        {
            // At most one character is dropped, whichever ending matched.
            sb.Length = sb.Length - 1;
            SetStrings();
            return;
        }
    }
}
/*
 * Sixth (and last) step of the French stemmer<br>
 * Looks at the last vowel of R0: when it is an e-acute or e-grave and at least one
 * non-vowel follows it, the character at that position becomes a plain 'e'.<br>
 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation
 */
private void Step6() {
    if (R0 == null || R0.Length == 0)
        return;

    // Walk backwards over the trailing non-vowels.
    int idx = R0.Length - 1;
    while (idx > -1 && !IsVowel(R0[idx]))
        idx--;

    if (idx < 0)
        return; // no vowel at all

    bool followedByConsonant = idx < R0.Length - 1;
    char lastVowel = R0[idx];

    // NOTE(review): the position found in R0 is used as an index into sb, which
    // presumably relies on R0 spanning the whole buffer - confirm against SetStrings.
    if (followedByConsonant && (lastVowel == 'é' || lastVowel == 'è'))
        sb[idx] = 'e';
}
/*
 * Removes the first suffix of "search" found at the end of zone "source".
 * When the suffix is preceded by "prefix", the prefix is removed together with it.
 *
 * @param source the zone whose end is tested; nothing happens when it is null
 * @param search the candidate suffixes, tried in order
 * @param prefix the prefix to look for in front of the suffix
 * @param without bool - true if the suffix is also removed when the prefix is absent
 */
private void DeleteButSuffixFrom( String source, String[] search, String prefix, bool without ) {
    if (source == null)
        return;

    foreach (String candidate in search) {
        int cut;
        if (source.EndsWith( prefix + candidate ))
            cut = prefix.Length + candidate.Length;
        else if (without && source.EndsWith( candidate ))
            cut = candidate.Length;
        else
            continue;

        // Only the first matching candidate is applied.
        sb.Length = sb.Length - cut;
        modified = true;
        SetStrings();
        return;
    }
}
/*
 * Test if a char is a French vowel, including accented ones.
 * Only lower-case letters are recognised; the upper-case markers 'U', 'I' and 'Y'
 * produced by TreatVowels are deliberately NOT vowels here.
 *
 * @param ch the char to test
 * @return bool - true if the char is a vowel
 */
private bool IsVowel(char ch) {
    // Accented letters are written as escapes so the table survives any
    // re-encoding of the source file (the a-grave label had been corrupted
    // into a two-character literal).
    switch (ch)
    {
        case 'a':
        case 'e':
        case 'i':
        case 'o':
        case 'u':
        case 'y':
        case '\u00e2': // a circumflex
        case '\u00e0': // a grave
        case '\u00eb': // e diaeresis
        case '\u00e9': // e acute
        case '\u00ea': // e circumflex
        case '\u00e8': // e grave
        case '\u00ef': // i diaeresis
        case '\u00ee': // i circumflex
        case '\u00f4': // o circumflex
        case '\u00fc': // u diaeresis
        case '\u00f9': // u grave
        case '\u00fb': // u circumflex
            return true;
        default:
            return false;
    }
}
/*
 * Marks the vowels that must be treated as consonants by turning them to upper case:<br>
 * u and i preceded AND followed by a vowel<br>
 * y preceded OR followed by a vowel<br>
 * u preceded by q<br>
 * The buffer is modified in place and keeps its length.
 *
 * @param buffer the buffer to treat
 * @return the same buffer instance, after treatment
 */
private StringBuilder TreatVowels( StringBuilder buffer ) {
    for ( int pos = 0; pos < buffer.Length; pos++ ) {
        bool hasPrev = pos > 0;
        bool hasNext = pos < buffer.Length - 1;

        switch (buffer[pos])
        {
            case 'u':
                // A leading u is never marked; a trailing u only after q.
                if (hasPrev
                    && (buffer[pos - 1] == 'q'
                        || (hasNext && IsVowel(buffer[pos - 1]) && IsVowel(buffer[pos + 1]))))
                    buffer[pos] = 'U';
                break;

            case 'i':
                // Needs a vowel on both sides, so never at either end of the word.
                if (hasPrev && hasNext && IsVowel(buffer[pos - 1]) && IsVowel(buffer[pos + 1]))
                    buffer[pos] = 'I';
                break;

            case 'y':
                // One vowel neighbour on either side is enough.
                if ((hasPrev && IsVowel(buffer[pos - 1])) || (hasNext && IsVowel(buffer[pos + 1])))
                    buffer[pos] = 'Y';
                break;
        }
    }

    return buffer;
}
/// <summary>
///   Data holder for a single hunspell affix rule: what is stripped, what is appended,
///   which flags apply and under which condition the rule may be used.
/// </summary>
[DebuggerDisplay("{Condition}")]
public class HunspellAffix {
    private Regex _conditionPattern;
    private string _condition;

    /// <summary>
    ///   The affix flag.
    /// </summary>
    public char Flag { get; set; }

    /// <summary>
    ///   The stripping characters defined for the affix.
    /// </summary>
    public string Strip { get; set; }

    /// <summary>
    ///   The append defined for the affix.
    /// </summary>
    public string Append { get; set; }

    /// <summary>
    ///   The flags defined for the affix append.
    /// </summary>
    public char[] AppendFlags { get; set; }

    /// <summary>
    ///   Whether the affix is defined as cross product.
    /// </summary>
    public bool IsCrossProduct { get; set; }

    /// <summary>
    ///   The condition that must be met before the affix can be applied, exactly as it was
    ///   passed to <see cref="SetCondition"/>.
    /// </summary>
    public string Condition {
        get { return _condition; }
    }

    /// <summary>
    ///   Sets the condition that must be met before the affix can be applied.
    /// </summary>
    /// <param name="condition">Condition to be met before affix application.</param>
    /// <param name="pattern">Condition as a regular expression pattern.</param>
    public void SetCondition(string condition, string pattern) {
        if (condition == null) {
            throw new ArgumentNullException("condition");
        }
        if (pattern == null) {
            throw new ArgumentNullException("pattern");
        }

        // The raw condition is stored before the pattern is compiled.
        _condition = condition;
        _conditionPattern = new Regex(pattern);
    }

    /// <summary>
    ///   Checks whether the given text matches the regular expression that was installed by
    ///   <see cref="SetCondition"/>.
    /// </summary>
    /// <param name="text">Text to test against the condition pattern.</param>
    /// <returns>
    ///   <c>true</c> if the text meets the condition, <c>false</c> otherwise.
    /// </returns>
    public bool CheckCondition(string text) {
        if (text == null) {
            throw new ArgumentNullException("text");
        }

        return _conditionPattern.IsMatch(text);
    }
}
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -using System; -using System.Collections.Generic; -using System.Globalization; -using System.IO; -using System.Text; -using System.Text.RegularExpressions; - -namespace Lucene.Net.Analysis.Hunspell { - public class HunspellDictionary { - private static readonly HunspellWord NoFlags = new HunspellWord(); - - private static readonly String PREFIX_KEY = "PFX"; - private static readonly String SUFFIX_KEY = "SFX"; - private static readonly String FLAG_KEY = "FLAG"; - private static readonly String AF_KEY = "AF"; - - private static readonly String NUM_FLAG_TYPE = "num"; - private static readonly String UTF8_FLAG_TYPE = "UTF-8"; - private static readonly String LONG_FLAG_TYPE = "long"; - - private static readonly String PREFIX_CONDITION_REGEX_PATTERN = @"^{0}"; - private static readonly String SUFFIX_CONDITION_REGEX_PATTERN = @"{0}$"; - - private readonly Dictionary<String, List<HunspellAffix>> _prefixes = new Dictionary<String, List<HunspellAffix>>(); - private readonly Dictionary<String, List<HunspellAffix>> _suffixes = new Dictionary<String, List<HunspellAffix>>(); - private readonly Dictionary<String, List<HunspellWord>> _words = new Dictionary<String, List<HunspellWord>>(); - private readonly Dictionary<String, Char[]> _aliases = new Dictionary<String, Char[]>(); - private 
/// <summary>
///   Creates a new HunspellDictionary from one affix stream and any number of dictionary streams.
///   The encoding declared in the affix stream is used to decode the affix file and every dictionary.
/// </summary>
/// <param name = "affix">Stream for reading the hunspell affix file.</param>
/// <param name = "dictionaries">Streams for reading the hunspell dictionary files.</param>
/// <exception cref = "IOException">Can be thrown while reading from the streams.</exception>
/// <exception cref = "InvalidDataException">Can be thrown if the content of the files does not meet expected formats.</exception>
public HunspellDictionary(Stream affix, IEnumerable<Stream> dictionaries) {
    if (affix == null) throw new ArgumentNullException("affix");
    if (dictionaries == null) throw new ArgumentNullException("dictionaries");

    // The encoding name is read first, from the raw bytes of the affix stream.
    var encoding = Encoding.GetEncoding(ReadDictionaryEncoding(affix));

    // Affix rules are loaded before any dictionary is read.
    ReadAffixFile(affix, encoding);

    foreach (var wordStream in dictionaries) {
        ReadDictionaryFile(wordStream, encoding);
    }
}
/// <summary>
///   Looks up the HunspellWords registered for the given word.
/// </summary>
/// <param name="word">Word to look up.</param>
/// <returns>The HunspellWords stored for the word, or <c>null</c> if the word is unknown.</returns>
public IEnumerable<HunspellWord> LookupWord(String word) {
    if (word == null) throw new ArgumentNullException("word");

    List<HunspellWord> matches;
    var known = _words.TryGetValue(word, out matches);
    return known ? matches : null;
}
/// <summary>
///   Looks up the HunspellAffix suffixes whose append equals the String built from the given
///   char array, offset and length.
/// </summary>
/// <param name="word">Char array to generate the String from.</param>
/// <param name="offset">Offset in the char array that the String starts at.</param>
/// <param name="length">Length from the offset that the String is.</param>
/// <returns>List of HunspellAffix suffixes with an append that matches the String, or <c>null</c> if none are found.</returns>
public IEnumerable<HunspellAffix> LookupSuffix(char[] word, int offset, int length) {
    if (word == null) throw new ArgumentNullException("word");

    List<HunspellAffix> matches;
    var known = _suffixes.TryGetValue(new String(word, offset, length), out matches);
    return known ? matches : null;
}
/// <summary>
///   Parses an alias-flag (<c>AF</c>) table and stores every alias in the alias map.
/// </summary>
/// <remarks>
///   The second field of the header line is the number of alias lines that follow. Each of
///   those lines must itself start with <c>AF</c>; its second field is parsed with the current
///   flag parsing strategy. Aliases are keyed by their 1-based position in the table.
/// </remarks>
/// <param name="line">Header line of the alias table.</param>
/// <param name="reader">Reader positioned right after the header line.</param>
/// <exception cref="ArgumentNullException">
///   <paramref name="reader"/> or <paramref name="line"/> is <c>null</c>.
/// </exception>
/// <exception cref="InvalidDataException">
///   The header carries no valid count, the file ends before all announced lines were read,
///   or one of the lines is not a complete <c>AF</c> directive.
/// </exception>
private void ParseAliasFlag(String line, TextReader reader) {
    if (reader == null) throw new ArgumentNullException("reader");
    if (line == null) throw new ArgumentNullException("line");

    var args = Regex.Split(line, "\\s+");

    // The count is file data, not user input: parse it culture-independently and report
    // a malformed header as a format problem of the file.
    Int32 numLines;
    if (args.Length < 2 || !Int32.TryParse(args[1], NumberStyles.Integer, CultureInfo.InvariantCulture, out numLines))
        throw new InvalidDataException("File corrupted, AF header should carry the number of aliases : " + line);

    for (var i = 0; i < numLines; i++) {
        line = reader.ReadLine();
        if (line == null)
            throw new InvalidDataException("File corrupted, unexpected end of file inside the AF table");

        var ruleArgs = Regex.Split(line, "\\s+");

        if (ruleArgs.Length < 2 || ruleArgs[0] != "AF")
            throw new InvalidDataException("File corrupted, should be AF directive : " + line);

        var appendFlags = _flagParsingStrategy.ParseFlags(ruleArgs[1]);
        _aliases.Add((i + 1).ToString(CultureInfo.InvariantCulture), appendFlags);
    }
}
= (ruleArgs[2] == "0") ? "" : ruleArgs[2]; - - var affixArg = ruleArgs[3]; - - var flagSep = affixArg.LastIndexOf('/'); - if (flagSep != -1) { - var cflag = affixArg.Substring(flagSep + 1); - var appendFlags = hasAliases ? _aliases[cflag] : _flagParsingStrategy.ParseFlags(cflag); - Array.Sort(appendFlags); - affix.AppendFlags = appendFlags; - affix.Append = affixArg.Substring(0, flagSep); - } else { - affix.Append = affixArg; - } - - var condition = ruleArgs[4]; - affix.SetCondition(condition, String.Format(conditionPattern, condition)); - affix.IsCrossProduct = crossProduct; - - List<HunspellAffix> list; - if (!affixes.TryGetValue(affix.Append, out list)) - affixes.Add(affix.Append, list = new List<HunspellAffix>()); - - list.Add(affix); - } - } - - /// <summary> - /// Parses the encoding specificed in the affix file readable through the provided Stream. - /// </summary> - /// <param name="affix">Stream for reading the affix file.</param> - /// <returns>Encoding specified in the affix file.</returns> - /// <exception cref="InvalidDataException"> - /// Thrown if the first non-empty non-comment line read from the file does not - /// adhere to the format <c>SET encoding</c>. 
/// <summary>
///   Reads the encoding name declared in the affix file readable through the provided Stream.
/// </summary>
/// <remarks>
///   The stream is read byte by byte (each byte taken as one character) up to and including
///   the end of the <c>SET</c> line, so the caller can continue reading the stream from there.
///   Empty lines, whitespace-only lines and lines starting with <c>#</c> are skipped.
/// </remarks>
/// <param name="affix">Stream for reading the affix file.</param>
/// <returns>Encoding name specified in the affix file, with surrounding whitespace removed.</returns>
/// <exception cref="ArgumentNullException"><paramref name="affix"/> is <c>null</c>.</exception>
/// <exception cref="InvalidDataException">
///   Thrown if the stream ends before a declaration is found, or if the first non-empty
///   non-comment line does not adhere to the format <c>SET encoding</c>.
/// </exception>
private static String ReadDictionaryEncoding(Stream affix) {
    if (affix == null) throw new ArgumentNullException("affix");

    var builder = new StringBuilder();
    for (; ; ) {
        builder.Length = 0;
        int ch;
        while ((ch = affix.ReadByte()) >= 0) {
            if (ch == '\n') {
                break;
            }
            if (ch != '\r') {
                builder.Append((char)ch);
            }
        }

        var line = builder.ToString();

        if (line.Length == 0 || line[0] == '#' || line.Trim().Length == 0) {
            if (ch < 0)
                throw new InvalidDataException("Unexpected end of affix file.");

            continue;
        }

        // StartsWith also copes with lines shorter than the "SET " marker, which used to
        // escape as an ArgumentOutOfRangeException from the substring extraction.
        if (line.StartsWith("SET ", StringComparison.Ordinal)) {
            // cleanup the encoding string, too (whitespace)
            return line.Substring(4).Trim();
        }

        throw new InvalidDataException("The first non-comment line in the affix file must " +
                                       "be a 'SET charset', was: '" + line + "'");
    }
}
/// <summary>
///   Reads one dictionary file through the provided Stream and adds its entries to the words map.
/// </summary>
/// <remarks>
///   Every line after the first is one entry. The text after the last <c>/</c>, up to the next
///   tab if there is one, is the flag part; it is resolved through the alias table when aliases
///   were defined, otherwise through the current flag parsing strategy. Lines without
///   <c>/</c> are stored with the shared flag-less word.
/// </remarks>
/// <param name="dictionary">Stream to read the dictionary file through.</param>
/// <param name="encoding">Encoding used to decode the contents of the file.</param>
/// <exception cref="IOException">Can be thrown while reading from the file.</exception>
private void ReadDictionaryFile(Stream dictionary, Encoding encoding) {
    if (dictionary == null) throw new ArgumentNullException("dictionary");
    if (encoding == null) throw new ArgumentNullException("encoding");

    var reader = new StreamReader(dictionary, encoding);

    // The first line is the number of entries. The value itself is not used, but parsing
    // it still rejects a file that does not start with a number.
    Int32.Parse(reader.ReadLine());

    var useAliases = _aliases.Count > 0;

    // nocommit, don't create millions of strings.
    // nocommit, the flags themselves can be double-chars (long) or also numeric
    // either way the trick is to encode them as char... but they must be parsed differently
    for (var line = reader.ReadLine(); line != null; line = reader.ReadLine()) {
        var slash = line.LastIndexOf('/');

        String word;
        HunspellWord form;
        if (slash != -1) {
            // note, there can be comments (morph description) after a flag.
            // we should really look for any whitespace
            var tab = line.IndexOf('\t', slash);
            var rawFlags = tab == -1
                               ? line.Substring(slash + 1)
                               : line.Substring(slash + 1, tab - slash - 1);

            form = new HunspellWord(useAliases ? _aliases[rawFlags] : _flagParsingStrategy.ParseFlags(rawFlags));
            word = line.Substring(0, slash);
        } else {
            form = NoFlags;
            word = line;
        }

        List<HunspellWord> forms;
        if (!_words.TryGetValue(word, out forms)) {
            forms = new List<HunspellWord>();
            _words.Add(word, forms);
        }

        forms.Add(form);
    }
}
/// <summary>
///   Implementation of <see cref="FlagParsingStrategy"/> that assumes each flag is encoded as
///   two ASCII characters whose codes are combined into a single character.
/// </summary>
private class DoubleASCIIFlagParsingStrategy : FlagParsingStrategy {
    /// <summary>
    ///   Splits the raw text into pairs of characters and turns each pair into one flag.
    /// </summary>
    /// <param name="rawFlags">Flag text made of two-character flags; may be empty.</param>
    /// <returns>One flag per character pair, in input order; an empty array for empty input.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="rawFlags"/> is <c>null</c>.</exception>
    /// <exception cref="ArgumentException">
    ///   <paramref name="rawFlags"/> has an odd length, so its last flag is incomplete.
    /// </exception>
    public override Char[] ParseFlags(String rawFlags) {
        if (rawFlags == null)
            throw new ArgumentNullException("rawFlags");

        // An unpaired trailing character used to run past the end of the string.
        if (rawFlags.Length % 2 != 0)
            throw new ArgumentException("Long flags must be pairs of characters, was: '" + rawFlags + "'", "rawFlags");

        var flags = new Char[rawFlags.Length / 2];
        for (var i = 0; i < flags.Length; i++) {
            // NOTE(review): the two codes are ADDED, so "ab" and "ba" yield the same flag.
            // Kept as is because every flag in the dictionary goes through the same mapping.
            flags[i] = (Char)(rawFlags[2 * i] + rawFlags[2 * i + 1]);
        }

        return flags;
    }
}
/// <summary>
///   Simple implementation of <see cref="FlagParsingStrategy"/> in which every char of the
///   raw text is one flag. Can be used with both the ASCII and UTF-8 flag types.
/// </summary>
private class SimpleFlagParsingStrategy : FlagParsingStrategy {
    /// <summary>
    ///   Returns the characters of the raw text as individual flags.
    /// </summary>
    /// <param name="rawFlags">Flag text, one flag per character.</param>
    /// <returns>A new array holding the characters of <paramref name="rawFlags"/> in order.</returns>
    public override Char[] ParseFlags(String rawFlags) {
        var flags = new Char[rawFlags.Length];
        for (var i = 0; i < flags.Length; i++) {
            flags[i] = rawFlags[i];
        }

        return flags;
    }
}
- */ - -using System; -using System.Collections.Generic; - -namespace Lucene.Net.Analysis.Hunspell { - public class HunspellStem { - private readonly List<HunspellAffix> _prefixes = new List<HunspellAffix>(); - private readonly List<HunspellAffix> _suffixes = new List<HunspellAffix>(); - private readonly String _stem; - - /// <summary> - /// the actual word stem itself. - /// </summary> - public String Stem { - get { return _stem; } - } - - /// <summary> - /// The stem length. - /// </summary> - public Int32 StemLength { - get { return _stem.Length; } - } - - /// <summary> - /// The list of prefixes used to generate the stem. - /// </summary> - public IEnumerable<HunspellAffix> Prefixes { - get { return _prefixes; } - } - - /// <summary> - /// The list of suffixes used to generate the stem. - /// </summary> - public IEnumerable<HunspellAffix> Suffixes { - get { return _suffixes; } - } - - /// <summary> - /// Creates a new Stem wrapping the given word stem. - /// </summary> - public HunspellStem(String stem) { - if (stem == null) throw new ArgumentNullException("stem"); - - _stem = stem; - } - - /// <summary> - /// Adds a prefix to the list of prefixes used to generate this stem. Because it is - /// assumed that prefixes are added depth first, the prefix is added to the front of - /// the list. - /// </summary> - /// <param name="prefix">Prefix to add to the list of prefixes for this stem.</param> - public void AddPrefix(HunspellAffix prefix) { - _prefixes.Insert(0, prefix); - } - - /// <summary> - /// Adds a suffix to the list of suffixes used to generate this stem. Because it - /// is assumed that suffixes are added depth first, the suffix is added to the end - /// of the list. 
- /// </summary> - /// <param name="suffix">Suffix to add to the list of suffixes for this stem.</param> - public void AddSuffix(HunspellAffix suffix) { - _suffixes.Add(suffix); - } - } -} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Analyzers/Hunspell/HunspellStemFilter.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Analyzers/Hunspell/HunspellStemFilter.cs b/src/contrib/Analyzers/Hunspell/HunspellStemFilter.cs deleted file mode 100644 index c59df84..0000000 --- a/src/contrib/Analyzers/Hunspell/HunspellStemFilter.cs +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -using System; -using System.Collections.Generic; -using System.Linq; -using Lucene.Net.Analysis.Tokenattributes; - -namespace Lucene.Net.Analysis.Hunspell { - /// <summary> - /// TokenFilter that uses hunspell affix rules and words to stem tokens. Since hunspell supports a - /// word having multiple stems, this filter can emit multiple tokens for each consumed token. 
/// </summary>
/// <remarks>
/// When a token produces several stems, the first stem replaces the token's term and the others are
/// queued. Each queued stem is emitted by a later call to <see cref="IncrementToken"/> with a
/// position increment of 0, on top of the attribute state captured after the first stem was written.
/// </remarks>
public class HunspellStemFilter : TokenFilter {
    private readonly HunspellStemmer _stemmer;
    private readonly Boolean _dedup;

    private readonly ITermAttribute _termAtt;
    private readonly IPositionIncrementAttribute _posIncAtt;

    // Stems of the most recently consumed token that have not been emitted yet.
    private readonly Queue<HunspellStem> _buffer = new Queue<HunspellStem>();

    // Attribute state captured once the first stem is in place; restored before each queued stem.
    private State _savedState;

    /// <summary>
    /// Creates a new HunspellStemFilter that stems the tokens of the given TokenStream with a
    /// HunspellStemmer built from the provided HunspellDictionary.
    /// </summary>
    /// <param name="input">TokenStream whose tokens will be stemmed.</param>
    /// <param name="dictionary">HunspellDictionary handed to the HunspellStemmer used for stemming.</param>
    /// <param name="dedup">true if only unique terms should be output.</param>
    public HunspellStemFilter(TokenStream input, HunspellDictionary dictionary, Boolean dedup = true)
        : base(input) {
        _posIncAtt = AddAttribute<IPositionIncrementAttribute>();
        _termAtt = AddAttribute<ITermAttribute>();

        _stemmer = new HunspellStemmer(dictionary);
        _dedup = dedup;
    }

    /// <summary>
    /// Emits the next queued stem if there is one; otherwise advances the wrapped stream and stems
    /// the token it produced.
    /// </summary>
    /// <returns>false once the wrapped stream is exhausted and no stems are queued; true otherwise.</returns>
    public override Boolean IncrementToken() {
        // Stems left over from the previous token are emitted before the input is advanced.
        if (_buffer.Count > 0) {
            var pending = _buffer.Dequeue();

            RestoreState(_savedState);
            _posIncAtt.PositionIncrement = 0;
            _termAtt.SetTermBuffer(pending.Stem, 0, pending.StemLength);
            return true;
        }

        if (!input.IncrementToken()) {
            return false;
        }

        var term = _termAtt.Term;
        foreach (var candidate in _dedup ? _stemmer.UniqueStems(term) : _stemmer.Stem(term)) {
            _buffer.Enqueue(candidate);
        }

        // With no stems at all we do not know this word and return it unchanged.
        if (_buffer.Count > 0) {
            var first = _buffer.Dequeue();
            _termAtt.SetTermBuffer(first.Stem, 0, first.StemLength);

            // Only worth capturing when further stems still have to be emitted for this token.
            if (_buffer.Count > 0) {
                _savedState = CaptureState();
            }
        }

        return true;
    }

    /// <summary>
    /// Resets the base filter and discards any stems that are still queued.
    /// </summary>
    public override void Reset() {
        base.Reset();

        _buffer.Clear();
    }
}
}
