http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Analyzers/Compound/Hyphenation/hyphenation.dtd ---------------------------------------------------------------------- diff --git a/src/contrib/Analyzers/Compound/Hyphenation/hyphenation.dtd b/src/contrib/Analyzers/Compound/Hyphenation/hyphenation.dtd deleted file mode 100644 index 4f5fee8..0000000 --- a/src/contrib/Analyzers/Compound/Hyphenation/hyphenation.dtd +++ /dev/null @@ -1,68 +0,0 @@ -<?xml version="1.0" encoding="us-ascii"?> -<!-- - Copyright 1999-2004 The Apache Software Foundation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<!-- $Id: hyphenation.dtd,v 1.3 2004/02/27 18:34:59 jeremias Exp $ --> - -<!ELEMENT hyphenation-info (hyphen-char?, hyphen-min?, - classes, exceptions?, patterns)> - -<!-- Hyphen character to be used in the exception list as shortcut for - <hyphen pre-break="-"/>. Defaults to '-' ---> -<!ELEMENT hyphen-char EMPTY> -<!ATTLIST hyphen-char value CDATA #REQUIRED> - -<!-- Default minimun length in characters of hyphenated word fragments - before and after the line break. For some languages this is not - only for aesthetic purposes, wrong hyphens may be generated if this - is not accounted for. 
---> -<!ELEMENT hyphen-min EMPTY> -<!ATTLIST hyphen-min before CDATA #REQUIRED> -<!ATTLIST hyphen-min after CDATA #REQUIRED> - -<!-- Character equivalent classes: space separated list of character groups, all - characters in a group are to be treated equivalent as far as - the hyphenation algorithm is concerned. The first character in a group - is the group's equivalent character. Patterns should only contain - first characters. It also defines word characters, i.e. a word that - contains characters not present in any of the classes is not hyphenated. ---> -<!ELEMENT classes (#PCDATA)> - -<!-- Hyphenation exceptions: space separated list of hyphenated words. - A hyphen is indicated by the hyphen tag, but you can use the - hyphen-char defined previously as shortcut. This is in cases - when the algorithm procedure finds wrong hyphens or you want - to provide your own hyphenation for some words. ---> -<!ELEMENT exceptions (#PCDATA|hyphen)* > - -<!-- The hyphenation patterns, space separated. A pattern is made of 'equivalent' - characters as described before, between any two word characters a digit - in the range 0 to 9 may be specified. The absence of a digit is equivalent - to zero. The '.' character is reserved to indicate begining or ending - of words. --> -<!ELEMENT patterns (#PCDATA)> - -<!-- A "full hyphen" equivalent to TeX's \discretionary - with pre-break, post-break and no-break attributes. - To be used in the exceptions list, the hyphen character is not - automatically added --> -<!ELEMENT hyphen EMPTY> -<!ATTLIST hyphen pre CDATA #IMPLIED> -<!ATTLIST hyphen no CDATA #IMPLIED> -<!ATTLIST hyphen post CDATA #IMPLIED>
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Analyzers/Compound/HyphenationCompoundWordTokenFilter.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Analyzers/Compound/HyphenationCompoundWordTokenFilter.cs b/src/contrib/Analyzers/Compound/HyphenationCompoundWordTokenFilter.cs deleted file mode 100644 index 6ca528b..0000000 --- a/src/contrib/Analyzers/Compound/HyphenationCompoundWordTokenFilter.cs +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -//using System; -//using System.Collections.Generic; -//using System.IO; -//using System.Linq; -//using Lucene.Net.Analysis.Compound.Hyphenation; - -//namespace Lucene.Net.Analysis.Compound -//{ -// /* -// * A {@link TokenFilter} that decomposes compound words found in many Germanic languages. -// * <p> -// * "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find -// * "Donaudampfschiff" even when you only enter "schiff". It uses a hyphenation -// * grammar and a word dictionary to achieve this. 
-// * </p> -// */ -//public class HyphenationCompoundWordTokenFilter : CompoundWordTokenFilterBase -//{ -// private HyphenationTree hyphenator; - -// /* -// * -// * @param input the {@link TokenStream} to process -// * @param hyphenator the hyphenation pattern tree to use for hyphenation -// * @param dictionary the word dictionary to match against -// * @param minWordSize only words longer than this get processed -// * @param minSubwordSize only subwords longer than this get to the output -// * stream -// * @param maxSubwordSize only subwords shorter than this get to the output -// * stream -// * @param onlyLongestMatch Add only the longest matching subword to the stream -// */ -// public HyphenationCompoundWordTokenFilter(TokenStream input, HyphenationTree hyphenator, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch) -// : this(input, hyphenator, MakeDictionary(dictionary), minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch) -// { -// } - -// /* -// * -// * @param input the {@link TokenStream} to process -// * @param hyphenator the hyphenation pattern tree to use for hyphenation -// * @param dictionary the word dictionary to match against -// */ -// public HyphenationCompoundWordTokenFilter(TokenStream input, HyphenationTree hyphenator, String[] dictionary) -// : this(input, hyphenator, MakeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE, -// DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false) -// { - -// } - -// /* -// * -// * @param input the {@link TokenStream} to process -// * @param hyphenator the hyphenation pattern tree to use for hyphenation -// * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain -// * lower case strings. 
-// */ -// public HyphenationCompoundWordTokenFilter(TokenStream input, -// HyphenationTree hyphenator, ISet<string> dictionary) -// : this(input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false) -// { - -// } - -// /* -// * -// * @param input the {@link TokenStream} to process -// * @param hyphenator the hyphenation pattern tree to use for hyphenation -// * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain -// * lower case strings. -// * @param minWordSize only words longer than this get processed -// * @param minSubwordSize only subwords longer than this get to the output -// * stream -// * @param maxSubwordSize only subwords shorter than this get to the output -// * stream -// * @param onlyLongestMatch Add only the longest matching subword to the stream -// */ -// public HyphenationCompoundWordTokenFilter(TokenStream input, -// HyphenationTree hyphenator, ISet<string> dictionary, int minWordSize, -// int minSubwordSize, int maxSubwordSize, bool onlyLongestMatch) -// : base(input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, -// onlyLongestMatch) -// { - - -// this.hyphenator = hyphenator; -// } - -// /* -// * Create a hyphenator tree -// * -// * @param hyphenationFilename the filename of the XML grammar to load -// * @return An object representing the hyphenation patterns -// * @throws Exception -// */ -// public static HyphenationTree GetHyphenationTree(String hyphenationFilename) -// { -// return GetHyphenationTree(new InputSource(hyphenationFilename)); -// } - -// /* -// * Create a hyphenator tree -// * -// * @param hyphenationFile the file of the XML grammar to load -// * @return An object representing the hyphenation patterns -// * @throws Exception -// */ -// public static HyphenationTree GetHyphenationTree(FileInfo hyphenationFile) -// { -// return 
GetHyphenationTree(new InputSource(hyphenationFile.toURL().toExternalForm())); -// } - -// /* -// * Create a hyphenator tree -// * -// * @param hyphenationReader the reader of the XML grammar to load from -// * @return An object representing the hyphenation patterns -// * @throws Exception -// */ -// public static HyphenationTree GetHyphenationTree(TextReader hyphenationReader) -// { -// InputSource _is = new InputSource(hyphenationReader); -// // we need this to load the DTD in very old parsers (like the one in JDK 1.4). -// // The DTD itsself is provided via EntityResolver, so it should always load, but -// // some parsers still want to have a base URL (Crimson). -// _is.setSystemId("urn:java:" + HyphenationTree.class.getName()); -// return getHyphenationTree(is); -// } - -// /* -// * Create a hyphenator tree -// * -// * @param hyphenationSource the InputSource pointing to the XML grammar -// * @return An object representing the hyphenation patterns -// * @throws Exception -// */ -// public static HyphenationTree GetHyphenationTree(InputSource hyphenationSource) -//{ -// HyphenationTree tree = new HyphenationTree(); -// tree.loadPatterns(hyphenationSource); -// return tree; -// } - -// protected override void DecomposeInternal(Token token) -//{ -// // get the hyphenation points -// Hyphenation.Hyphenation hyphens = hyphenator.hyphenate(token.TermBuffer(), 0, token.TermLength(), 1, 1); -// // No hyphen points found -> exit -// if (hyphens == null) { -// return; -// } - -// int[] hyp = hyphens.GetHyphenationPoints(); -// char[] lowerCaseTermBuffer=MakeLowerCaseCopy(token.TermBuffer()); - -// for (int i = 0; i < hyp.Length; ++i) { -// int remaining = hyp.Length - i; -// int start = hyp[i]; -// Token longestMatchToken = null; -// for (int j = 1; j < remaining; j++) { -// int partLength = hyp[i + j] - start; - -// // if the part is longer than maxSubwordSize we -// // are done with this round -// if (partLength > this.maxSubwordSize) { -// break; -// } - -// // we 
only put subwords to the token stream -// // that are longer than minPartSize -// if (partLength < this.minSubwordSize) { -// continue; -// } - -// // check the dictionary -// if (dictionary.Contains(lowerCaseTermBuffer, start, partLength)) { -// if (this.onlyLongestMatch) { -// if (longestMatchToken != null) { -// if (longestMatchToken.TermLength() < partLength) { -// longestMatchToken = CreateToken(start, partLength, token); -// } -// } else { -// longestMatchToken = CreateToken(start, partLength, token); -// } -// } else { -// tokens.AddLast(CreateToken(start, partLength, token)); -// } -// } else if (dictionary.Contains(lowerCaseTermBuffer, start, -// partLength - 1)) { -// // check the dictionary again with a word that is one character -// // shorter -// // to avoid problems with genitive 's characters and other binding -// // characters -// if (this.onlyLongestMatch) { -// if (longestMatchToken != null) { -// if (longestMatchToken.TermLength() < partLength - 1) { -// longestMatchToken = CreateToken(start, partLength - 1, token); -// } -// } else { -// longestMatchToken = CreateToken(start, partLength - 1, token); -// } -// } else { -// tokens.AddLast(CreateToken(start, partLength - 1, token)); -// } -// } -// } -// if (this.onlyLongestMatch && longestMatchToken!=null) { -// tokens.AddLast(longestMatchToken); -// } -// } -// } -//} -//} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Analyzers/Contrib.Analyzers.csproj ---------------------------------------------------------------------- diff --git a/src/contrib/Analyzers/Contrib.Analyzers.csproj b/src/contrib/Analyzers/Contrib.Analyzers.csproj deleted file mode 100644 index dfe0099..0000000 --- a/src/contrib/Analyzers/Contrib.Analyzers.csproj +++ /dev/null @@ -1,234 +0,0 @@ -<?xml version="1.0" encoding="utf-8"?> -<!-- - - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. 
See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - ---> -<Project ToolsVersion="4.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> - <PropertyGroup> - <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration> - <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform> - <ProductVersion>9.0.21022</ProductVersion> - <SchemaVersion>2.0</SchemaVersion> - <ProjectGuid>{4286E961-9143-4821-B46D-3D39D3736386}</ProjectGuid> - <AppDesignerFolder>Properties</AppDesignerFolder> - <RootNamespace>Lucene.Net.Analysis</RootNamespace> - <AssemblyName>Lucene.Net.Contrib.Analyzers</AssemblyName> - <FileAlignment>512</FileAlignment> - <FileUpgradeFlags> - </FileUpgradeFlags> - <OldToolsVersion>3.5</OldToolsVersion> - <UpgradeBackupLocation /> - <TargetFrameworkProfile /> - </PropertyGroup> - <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "> - <TargetFrameworkVersion>v4.0</TargetFrameworkVersion> - <Framework>$(TargetFrameworkVersion.Replace("v", "NET").Replace(".", ""))</Framework> - <DebugSymbols>true</DebugSymbols> - <DebugType>full</DebugType> - <Optimize>false</Optimize> - <OutputPath>..\..\..\build\bin\contrib\Analyzers\$(Configuration.Replace("35", ""))\$(Framework)\</OutputPath> - <DefineConstants>DEBUG;TRACE;$(Framework)</DefineConstants> - <ErrorReport>prompt</ErrorReport> 
- <WarningLevel>4</WarningLevel> - <AllowUnsafeBlocks>false</AllowUnsafeBlocks> - <NoWarn>618</NoWarn> - <OutputType>Library</OutputType> - </PropertyGroup> - <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug35|AnyCPU' "> - <TargetFrameworkVersion>v3.5</TargetFrameworkVersion> - <Framework>$(TargetFrameworkVersion.Replace("v", "NET").Replace(".", ""))</Framework> - <DebugSymbols>true</DebugSymbols> - <DebugType>full</DebugType> - <Optimize>false</Optimize> - <OutputPath>..\..\..\build\bin\contrib\Analyzers\$(Configuration.Replace("35", ""))\$(Framework)\</OutputPath> - <DefineConstants>DEBUG;TRACE;$(Framework)</DefineConstants> - <ErrorReport>prompt</ErrorReport> - <WarningLevel>4</WarningLevel> - <AllowUnsafeBlocks>false</AllowUnsafeBlocks> - <NoWarn>618</NoWarn> - <OutputType>Library</OutputType> - </PropertyGroup> - <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' "> - <TargetFrameworkVersion>v4.0</TargetFrameworkVersion> - <Framework>$(TargetFrameworkVersion.Replace("v", "NET").Replace(".", ""))</Framework> - <DebugType>pdbonly</DebugType> - <Optimize>true</Optimize> - <OutputPath>..\..\..\build\bin\contrib\Analyzers\$(Configuration.Replace("35", ""))\$(Framework)\</OutputPath> - <DefineConstants>TRACE;$(Framework)</DefineConstants> - <ErrorReport>prompt</ErrorReport> - <WarningLevel>4</WarningLevel> - <DocumentationFile>..\..\..\build\bin\contrib\Analyzers\$(Configuration.Replace("35", ""))\$(Framework)\Lucene.Net.Contrib.Analyzers.XML</DocumentationFile> - <NoWarn>618</NoWarn> - <DebugSymbols>true</DebugSymbols> - <OutputType>Library</OutputType> - </PropertyGroup> - <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release35|AnyCPU' "> - <TargetFrameworkVersion>v3.5</TargetFrameworkVersion> - <Framework>$(TargetFrameworkVersion.Replace("v", "NET").Replace(".", ""))</Framework> - <DebugType>pdbonly</DebugType> - <Optimize>true</Optimize> - 
<OutputPath>..\..\..\build\bin\contrib\Analyzers\$(Configuration.Replace("35", ""))\$(Framework)\</OutputPath> - <DefineConstants>TRACE;$(Framework)</DefineConstants> - <ErrorReport>prompt</ErrorReport> - <WarningLevel>4</WarningLevel> - <DocumentationFile>..\..\..\build\bin\contrib\Analyzers\$(Configuration.Replace("35", ""))\$(Framework)\Lucene.Net.Contrib.Analyzers.XML</DocumentationFile> - <NoWarn>618</NoWarn> - <DebugSymbols>true</DebugSymbols> - <OutputType>Library</OutputType> - </PropertyGroup> - <PropertyGroup> - <SignAssembly>true</SignAssembly> - </PropertyGroup> - <PropertyGroup> - <AssemblyOriginatorKeyFile>Lucene.Net.snk</AssemblyOriginatorKeyFile> - </PropertyGroup> - <ItemGroup> - <Reference Include="System" /> - <Reference Condition="'$(Framework)' == 'NET35'" Include="System.Core" /> - </ItemGroup> - <ItemGroup> - <Compile Include="AR\ArabicAnalyzer.cs" /> - <Compile Include="AR\ArabicLetterTokenizer.cs" /> - <Compile Include="AR\ArabicNormalizationFilter.cs" /> - <Compile Include="AR\ArabicNormalizer.cs" /> - <Compile Include="AR\ArabicStemFilter.cs" /> - <Compile Include="AR\ArabicStemmer.cs" /> - <Compile Include="BR\BrazilianAnalyzer.cs" /> - <Compile Include="BR\BrazilianStemFilter.cs" /> - <Compile Include="BR\BrazilianStemmer.cs" /> - <Compile Include="CJK\CJKAnalyzer.cs" /> - <Compile Include="CJK\CJKTokenizer.cs" /> - <Compile Include="Cn\ChineseAnalyzer.cs" /> - <Compile Include="Cn\ChineseFilter.cs" /> - <Compile Include="Cn\ChineseTokenizer.cs" /> - <Compile Include="Compound\CompoundWordTokenFilterBase.cs" /> - <Compile Include="Compound\DictionaryCompoundWordTokenFilter.cs" /> - <Compile Include="Compound\HyphenationCompoundWordTokenFilter.cs" /> - <Compile Include="Compound\Hyphenation\ByteVector.cs" /> - <Compile Include="Compound\Hyphenation\CharVector.cs" /> - <Compile Include="Compound\Hyphenation\Hyphen.cs" /> - <Compile Include="Compound\Hyphenation\Hyphenation.cs" /> - <Compile 
Include="Compound\Hyphenation\HyphenationException.cs" /> - <Compile Include="Compound\Hyphenation\HyphenationTree.cs" /> - <Compile Include="Compound\Hyphenation\PatternConsumer.cs" /> - <Compile Include="Compound\Hyphenation\PatternParser.cs" /> - <Compile Include="Compound\Hyphenation\TernaryTree.cs" /> - <Compile Include="Cz\CzechAnalyzer.cs" /> - <Compile Include="De\GermanAnalyzer.cs" /> - <Compile Include="De\GermanStemFilter.cs" /> - <Compile Include="De\GermanStemmer.cs" /> - <Compile Include="De\GermanDIN2Stemmer.cs" /> - <Compile Include="El\GreekAnalyzer.cs" /> - <Compile Include="El\GreekLowerCaseFilter.cs" /> - <Compile Include="En\KStemData1.cs" /> - <Compile Include="En\KStemData2.cs" /> - <Compile Include="En\KStemData3.cs" /> - <Compile Include="En\KStemData4.cs" /> - <Compile Include="En\KStemData5.cs" /> - <Compile Include="En\KStemData6.cs" /> - <Compile Include="En\KStemData7.cs" /> - <Compile Include="En\KStemData8.cs" /> - <Compile Include="En\KStemFilter.cs" /> - <Compile Include="En\KStemmer.cs" /> - <Compile Include="Fa\PersianAnalyzer.cs" /> - <Compile Include="Fa\PersianNormalizationFilter.cs" /> - <Compile Include="Fa\PersianNormalizer.cs" /> - <Compile Include="Fr\ElisionFilter.cs" /> - <Compile Include="Fr\FrenchAnalyzer.cs" /> - <Compile Include="Fr\FrenchStemFilter.cs" /> - <Compile Include="Fr\FrenchStemmer.cs" /> - <Compile Include="Hunspell\HunspellAffix.cs" /> - <Compile Include="Hunspell\HunspellDictionary.cs" /> - <Compile Include="Hunspell\HunspellStem.cs" /> - <Compile Include="Hunspell\HunspellStemFilter.cs" /> - <Compile Include="Hunspell\HunspellStemmer.cs" /> - <Compile Include="Hunspell\HunspellWord.cs" /> - <Compile Include="Miscellaneous\EmptyTokenStream.cs" /> - <Compile Include="Miscellaneous\InjectablePrefixAwareTokenFilter.cs" /> - <Compile Include="Miscellaneous\PatternAnalyzer.cs" /> - <Compile Include="Miscellaneous\PrefixAndSuffixAwareTokenFilter.cs" /> - <Compile 
Include="Miscellaneous\PrefixAwareTokenStream.cs" /> - <Compile Include="Miscellaneous\SingleTokenTokenStream.cs" /> - <Compile Include="NGram\EdgeNGramTokenFilter.cs" /> - <Compile Include="NGram\EdgeNGramTokenizer.cs" /> - <Compile Include="NGram\NGramTokenFilter.cs" /> - <Compile Include="NGram\NGramTokenizer.cs" /> - <Compile Include="Nl\DutchAnalyzer.cs" /> - <Compile Include="Nl\DutchStemFilter.cs" /> - <Compile Include="Nl\DutchStemmer.cs" /> - <Compile Include="Payloads\AbstractEncoder.cs" /> - <Compile Include="Payloads\DelimitedPayloadTokenFilter.cs" /> - <Compile Include="Payloads\FloatEncoder.cs" /> - <Compile Include="Payloads\IdentityEncoder.cs" /> - <Compile Include="Payloads\IntegerEncoder.cs" /> - <Compile Include="Payloads\NumericPayloadTokenFilter.cs" /> - <Compile Include="Payloads\PayloadEncoder.cs" /> - <Compile Include="Payloads\PayloadHelper.cs" /> - <Compile Include="Payloads\TokenOffsetPayloadTokenFilter.cs" /> - <Compile Include="Payloads\TypeAsPayloadTokenFilter.cs" /> - <Compile Include="Position\PositionFilter.cs" /> - <Compile Include="Query\QueryAutoStopWordAnalyzer.cs" /> - <Compile Include="Reverse\ReverseStringFilter.cs" /> - <Compile Include="Ru\RussianAnalyzer.cs" /> - <Compile Include="Ru\RussianLetterTokenizer.cs" /> - <Compile Include="Ru\RussianLowerCaseFilter.cs" /> - <Compile Include="Ru\RussianStemFilter.cs" /> - <Compile Include="Ru\RussianStemmer.cs" /> - <Compile Include="Properties\AssemblyInfo.cs" /> - <Compile Include="Shingle\Matrix\Column.cs" /> - <Compile Include="Shingle\Matrix\Matrix.cs" /> - <Compile Include="Shingle\Matrix\MatrixPermutationIterator.cs" /> - <Compile Include="Shingle\Matrix\Row.cs" /> - <Compile Include="Shingle\ShingleAnalyzerWrapper.cs" /> - <Compile Include="Shingle\ShingleFilter.cs" /> - <Compile Include="Shingle\ShingleMatrixFilter.cs" /> - <Compile Include="Shingle\TokenPositioner.cs" /> - <Compile Include="Shingle\Codec\OneDimensionalNonWeightedTokenSettingsCodec.cs" /> - <Compile 
Include="Shingle\Codec\SimpleThreeDimensionalTokenSettingsCodec.cs" /> - <Compile Include="Shingle\Codec\TokenSettingsCodec.cs" /> - <Compile Include="Shingle\Codec\TwoDimensionalNonWeightedSynonymTokenSettingsCodec.cs" /> - <Compile Include="Sinks\DateRecognizerSinkFilter.cs" /> - <Compile Include="Sinks\TokenRangeSinkFilter.cs" /> - <Compile Include="Sinks\TokenTypeSinkFilter.cs" /> - <Compile Include="Th\ThaiAnalyzer.cs" /> - <Compile Include="Th\ThaiWordFilter.cs" /> - <Compile Include="WordlistLoader.cs" /> - </ItemGroup> - <ItemGroup> - <EmbeddedResource Include="AR\ArabicStopWords.txt" /> - </ItemGroup> - <ItemGroup> - <ProjectReference Include="..\..\core\Lucene.Net.csproj"> - <Project>{5D4AD9BE-1FFB-41AB-9943-25737971BF57}</Project> - <Name>Lucene.Net</Name> - </ProjectReference> - </ItemGroup> - <ItemGroup> - <None Include="Compound\Hyphenation\hyphenation.dtd" /> - <None Include="Lucene.Net.snk" /> - </ItemGroup> - <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" /> - <!-- To modify your build process, add your task inside one of the targets below and uncomment it. - Other similar extension points exist, see Microsoft.Common.targets. - <Target Name="BeforeBuild"> - </Target> - <Target Name="AfterBuild"> - </Target> - --> -</Project> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Analyzers/Cz/CzechAnalyzer.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Analyzers/Cz/CzechAnalyzer.cs b/src/contrib/Analyzers/Cz/CzechAnalyzer.cs deleted file mode 100644 index 48f5aa9..0000000 --- a/src/contrib/Analyzers/Cz/CzechAnalyzer.cs +++ /dev/null @@ -1,222 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -using System; -using System.Collections.Generic; -using System.IO; -using System.Text; -using System.Collections; - -using Lucene.Net.Analysis; -using Lucene.Net.Analysis.De; -using Lucene.Net.Analysis.Standard; -using Version = Lucene.Net.Util.Version; - -namespace Lucene.Net.Analysis.Cz -{ -/* - * {@link Analyzer} for Czech language. - * <p> - * Supports an external list of stopwords (words that - * will not be indexed at all). - * A default set of stopwords is used unless an alternative list is specified. - * </p> - * - * <p><b>NOTE</b>: This class uses the same {@link Version} - * dependent settings as {@link StandardAnalyzer}.</p> - */ -public sealed class CzechAnalyzer : Analyzer { - - /* - * List of typical stopwords. 
- * @deprecated use {@link #getDefaultStopSet()} instead - */ - // TODO make this private in 3.1 - public static readonly String[] CZECH_STOP_WORDS = { - "a","s","k","o","i","u","v","z","dnes","cz","t\u00edmto","bude\u0161","budem", - "byli","jse\u0161","m\u016fj","sv\u00fdm","ta","tomto","tohle","tuto","tyto", - "jej","zda","pro\u010d","m\u00e1te","tato","kam","tohoto","kdo","kte\u0159\u00ed", - "mi","n\u00e1m","tom","tomuto","m\u00edt","nic","proto","kterou","byla", - "toho","proto\u017ee","asi","ho","na\u0161i","napi\u0161te","re","co\u017e","t\u00edm", - "tak\u017ee","sv\u00fdch","jej\u00ed","sv\u00fdmi","jste","aj","tu","tedy","teto", - "bylo","kde","ke","prav\u00e9","ji","nad","nejsou","\u010di","pod","t\u00e9ma", - "mezi","p\u0159es","ty","pak","v\u00e1m","ani","kdy\u017e","v\u0161ak","neg","jsem", - "tento","\u010dl\u00e1nku","\u010dl\u00e1nky","aby","jsme","p\u0159ed","pta","jejich", - "byl","je\u0161t\u011b","a\u017e","bez","tak\u00e9","pouze","prvn\u00ed","va\u0161e","kter\u00e1", - "n\u00e1s","nov\u00fd","tipy","pokud","m\u016f\u017ee","strana","jeho","sv\u00e9","jin\u00e9", - "zpr\u00e1vy","nov\u00e9","nen\u00ed","v\u00e1s","jen","podle","zde","u\u017e","b\u00fdt","v\u00edce", - "bude","ji\u017e","ne\u017e","kter\u00fd","by","kter\u00e9","co","nebo","ten","tak", - "m\u00e1","p\u0159i","od","po","jsou","jak","dal\u0161\u00ed","ale","si","se","ve", - "to","jako","za","zp\u011bt","ze","do","pro","je","na","atd","atp", - "jakmile","p\u0159i\u010dem\u017e","j\u00e1","on","ona","ono","oni","ony","my","vy", - "j\u00ed","ji","m\u011b","mne","jemu","tomu","t\u011bm","t\u011bmu","n\u011bmu","n\u011bmu\u017e", - "jeho\u017e","j\u00ed\u017e","jeliko\u017e","je\u017e","jako\u017e","na\u010de\u017e", - }; - - /* - * Returns a set of default Czech-stopwords - * @return a set of default Czech-stopwords - */ - public static ISet<string> getDefaultStopSet(){ - return DefaultSetHolder.DEFAULT_SET; - } - - private static class DefaultSetHolder { - internal static 
ISet<string> DEFAULT_SET = CharArraySet.UnmodifiableSet(new CharArraySet( - (IEnumerable<string>)CZECH_STOP_WORDS, false)); - } - - /* - * Contains the stopwords used with the {@link StopFilter}. - */ - // TODO make this final in 3.1 - private ISet<string> stoptable; - private readonly Version matchVersion; - - /* - * Builds an analyzer with the default stop words ({@link #CZECH_STOP_WORDS}). - */ - public CzechAnalyzer(Version matchVersion) - : this(matchVersion, DefaultSetHolder.DEFAULT_SET) - { - - } - - /* - * Builds an analyzer with the given stop words and stemming exclusion words - * - * @param matchVersion - * lucene compatibility version - * @param stopwords - * a stopword set - */ - public CzechAnalyzer(Version matchVersion, ISet<string> stopwords) { - this.matchVersion = matchVersion; - this.stoptable = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords)); - } - - - /* - * Builds an analyzer with the given stop words. - * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead - */ - public CzechAnalyzer(Version matchVersion, params string[] stopwords) - : this(matchVersion, StopFilter.MakeStopSet( stopwords )) - { - - } - - /* - * Builds an analyzer with the given stop words. - * - * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead - */ - public CzechAnalyzer(Version matchVersion, HashSet<string> stopwords) - : this(matchVersion, (ISet<string>)stopwords) - { - - } - - /* - * Builds an analyzer with the given stop words. - * @deprecated use {@link #CzechAnalyzer(Version, Set)} instead - */ - public CzechAnalyzer(Version matchVersion, FileInfo stopwords ) - : this(matchVersion, WordlistLoader.GetWordSet( stopwords )) - { - - } - - /* - * Loads stopwords hash from resource stream (file, database...). 
- * @param wordfile File containing the wordlist - * @param encoding Encoding used (win-1250, iso-8859-2, ...), null for default system encoding - * @deprecated use {@link WordlistLoader#getWordSet(Reader, String) } - * and {@link #CzechAnalyzer(Version, Set)} instead - */ - public void LoadStopWords( Stream wordfile, System.Text.Encoding encoding ) { - PreviousTokenStream = null; // force a new stopfilter to be created - if ( wordfile == null ) - { - stoptable = Support.Compatibility.SetFactory.CreateHashSet<string>(); - return; - } - try { - // clear any previous table (if present) - stoptable = Support.Compatibility.SetFactory.CreateHashSet<string>(); - - StreamReader isr; - if (encoding == null) - isr = new StreamReader(wordfile); - else - isr = new StreamReader(wordfile, encoding); - - stoptable = WordlistLoader.GetWordSet(isr); - } catch ( IOException) { - // clear any previous table (if present) - // TODO: throw IOException - stoptable = Support.Compatibility.SetFactory.CreateHashSet<string>(); - } - } - - /* - * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}. - * - * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with - * {@link StandardFilter}, {@link LowerCaseFilter}, and {@link StopFilter} - */ - public override sealed TokenStream TokenStream( String fieldName, TextReader reader ) { - TokenStream result = new StandardTokenizer( matchVersion, reader ); - result = new StandardFilter( result ); - result = new LowerCaseFilter( result ); - result = new StopFilter( StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), - result, stoptable ); - return result; - } - - private class SavedStreams { - protected internal Tokenizer source; - protected internal TokenStream result; - }; - - /* - * Returns a (possibly reused) {@link TokenStream} which tokenizes all the text in - * the provided {@link Reader}. 
- * - * @return A {@link TokenStream} built from a {@link StandardTokenizer} filtered with - * {@link StandardFilter}, {@link LowerCaseFilter}, and {@link StopFilter} - */ - public override TokenStream ReusableTokenStream(String fieldName, TextReader reader) - { - SavedStreams streams = (SavedStreams) PreviousTokenStream; - if (streams == null) { - streams = new SavedStreams(); - streams.source = new StandardTokenizer(matchVersion, reader); - streams.result = new StandardFilter(streams.source); - streams.result = new LowerCaseFilter(streams.result); - streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), - streams.result, stoptable); - PreviousTokenStream = streams; - } else { - streams.source.Reset(reader); - } - return streams.result; - } -} - - -} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Analyzers/De/GermanAnalyzer.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Analyzers/De/GermanAnalyzer.cs b/src/contrib/Analyzers/De/GermanAnalyzer.cs deleted file mode 100644 index 5f068e1..0000000 --- a/src/contrib/Analyzers/De/GermanAnalyzer.cs +++ /dev/null @@ -1,250 +0,0 @@ -/* - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - * -*/

using System;
using System.Collections.Generic;
using System.IO;
using System.Collections;
using System.Linq;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Analysis;
using Version = Lucene.Net.Util.Version;

namespace Lucene.Net.Analysis.De
{
    /// <summary>
    /// Analyzer for the German language. Supports an external list of stopwords (words that
    /// will not be indexed at all) and an external list of exclusions (words that will
    /// not be stemmed, but indexed).
    /// A default set of stopwords is used unless an alternative list is specified; the
    /// exclusion list is empty by default.
    /// </summary>
    public class GermanAnalyzer : Analyzer
    {
        /// <summary>
        /// List of typical German stopwords.
        /// </summary>
        private static readonly String[] GERMAN_STOP_WORDS =
        {
            "einer", "eine", "eines", "einem", "einen",
            "der", "die", "das", "dass", "daß",
            "du", "er", "sie", "es",
            "was", "wer", "wie", "wir",
            "und", "oder", "ohne", "mit",
            "am", "im", "in", "aus", "auf",
            "ist", "sein", "war", "wird",
            "ihr", "ihre", "ihres",
            "als", "für", "von",
            "dich", "dir", "mich", "mir",
            "mein", "kein",
            "durch", "wegen"
        };

        /// <summary>
        /// Returns the unmodifiable set of default German stopwords.
        /// </summary>
        public static ISet<string> GetDefaultStopSet()
        {
            return DefaultSetHolder.DEFAULT_SET;
        }

        /// <summary>
        /// Holder class so the default set is only built when it is first requested.
        /// </summary>
        private static class DefaultSetHolder
        {
            internal static readonly ISet<string> DEFAULT_SET = CharArraySet.UnmodifiableSet(new CharArraySet(
                                                                                                 (IEnumerable<string>)GERMAN_STOP_WORDS,
                                                                                                 false));
        }

        /// <summary>
        /// Contains the stopwords used with the StopFilter. Assigned once, in the constructor.
        /// </summary>
        private readonly ISet<string> stopSet;

        /// <summary>
        /// Contains words that should be indexed but not stemmed.
        /// Not readonly: the obsolete SetStemExclusionTable overloads replace it.
        /// </summary>
        private ISet<string> exclusionSet;

        private readonly Version matchVersion;
        private readonly bool _normalizeDin2;

        /// <summary>
        /// Builds an analyzer with the default stop words:
        /// <see cref="GetDefaultStopSet"/>
        /// </summary>
        [Obsolete("Use GermanAnalyzer(Version) instead")]
        public GermanAnalyzer()
            : this(Version.LUCENE_CURRENT)
        {
        }

        /// <summary>
        /// Builds an analyzer with the default stop words:
        /// <see cref="GetDefaultStopSet"/>
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version</param>
        public GermanAnalyzer(Version matchVersion)
            : this(matchVersion, DefaultSetHolder.DEFAULT_SET)
        { }

        /// <summary>
        /// Builds an analyzer with the default stop words:
        /// <see cref="GetDefaultStopSet"/>
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version</param>
        /// <param name="normalizeDin2">Specifies if the DIN-5007-2 style stemmer should be used in addition to DIN1. This
        /// will cause words with 'ae', 'ue', or 'oe' in them (expanded umlauts) to be first converted to 'a', 'u', and 'o'
        /// respectively, before the DIN1 stemmer is invoked.</param>
        public GermanAnalyzer(Version matchVersion, bool normalizeDin2)
            : this(matchVersion, DefaultSetHolder.DEFAULT_SET, normalizeDin2)
        { }

        /// <summary>
        /// Builds an analyzer with the given stop words, using the default DIN-5007-1 stemmer
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version</param>
        /// <param name="stopwords">a stopword set</param>
        public GermanAnalyzer(Version matchVersion, ISet<string> stopwords)
            : this(matchVersion, stopwords, CharArraySet.EMPTY_SET)
        {
        }

        /// <summary>
        /// Builds an analyzer with the given stop words
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version</param>
        /// <param name="stopwords">a stopword set</param>
        /// <param name="normalizeDin2">Specifies if the DIN-5007-2 style stemmer should be used in addition to DIN1. This
        /// will cause words with 'ae', 'ue', or 'oe' in them (expanded umlauts) to be first converted to 'a', 'u', and 'o'
        /// respectively, before the DIN1 stemmer is invoked.</param>
        public GermanAnalyzer(Version matchVersion, ISet<string> stopwords, bool normalizeDin2)
            : this(matchVersion, stopwords, CharArraySet.EMPTY_SET, normalizeDin2)
        {
        }

        /// <summary>
        /// Builds an analyzer with the given stop words, using the default DIN-5007-1 stemmer
        /// </summary>
        /// <param name="matchVersion">lucene compatibility version</param>
        /// <param name="stopwords">a stopword set</param>
        /// <param name="stemExclusionSet">a stemming exclusion set</param>
        public GermanAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclusionSet)
            : this(matchVersion, stopwords, stemExclusionSet, false)
        { }

        /// <summary>
        /// Builds an analyzer with the given stop words. Both sets are copied and wrapped
        /// as unmodifiable, so later changes to the caller's sets do not affect this analyzer.
        /// </summary>
        /// <param name="matchVersion">lucene compatibility version</param>
        /// <param name="stopwords">a stopword set</param>
        /// <param name="stemExclusionSet">a stemming exclusion set</param>
        /// <param name="normalizeDin2">Specifies if the DIN-5007-2 style stemmer should be used in addition to DIN1. This
        /// will cause words with 'ae', 'ue', or 'oe' in them (expanded umlauts) to be first converted to 'a', 'u', and 'o'
        /// respectively, before the DIN1 stemmer is invoked.</param>
        public GermanAnalyzer(Version matchVersion, ISet<string> stopwords, ISet<string> stemExclusionSet, bool normalizeDin2)
        {
            stopSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
            exclusionSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stemExclusionSet));
            this.matchVersion = matchVersion;
            _normalizeDin2 = normalizeDin2;
            SetOverridesTokenStreamMethod<GermanAnalyzer>();
        }

        /// <summary>
        /// Builds an analyzer with the given stop words.
        /// </summary>
        /// <param name="matchVersion">Lucene compatibility version</param>
        /// <param name="stopwords">stop words, passed through StopFilter.MakeStopSet</param>
        [Obsolete("use GermanAnalyzer(Version, Set) instead")]
        public GermanAnalyzer(Version matchVersion, params string[] stopwords)
            : this(matchVersion, StopFilter.MakeStopSet(stopwords))
        {
        }

        /// <summary>
        /// Builds an analyzer with the given stop words; only the dictionary keys are used.
        /// </summary>
        [Obsolete("Use GermanAnalyzer(Version, ISet)")]
        public GermanAnalyzer(Version matchVersion, IDictionary<string, string> stopwords)
            : this(matchVersion, stopwords.Keys.ToArray())
        {
        }

        /// <summary>
        /// Builds an analyzer with the stop words loaded from the given file via WordlistLoader.GetWordSet.
        /// </summary>
        [Obsolete("Use GermanAnalyzer(Version, ISet)")]
        public GermanAnalyzer(Version matchVersion, FileInfo stopwords)
            : this(matchVersion, WordlistLoader.GetWordSet(stopwords))
        {
        }

        /// <summary>
        /// Builds an exclusionlist from an array of Strings.
        /// </summary>
        [Obsolete("Use GermanAnalyzer(Version, ISet, ISet) instead")]
        public void SetStemExclusionTable(String[] exclusionlist)
        {
            exclusionSet = StopFilter.MakeStopSet(exclusionlist);
            // Drop the cached stream so it is not reused with the old exclusion set.
            PreviousTokenStream = null;
        }

        /// <summary>
        /// Builds an exclusionlist from the keys of an IDictionary.
        /// </summary>
        [Obsolete("Use GermanAnalyzer(Version, ISet, ISet) instead")]
        public void SetStemExclusionTable(IDictionary<string, string> exclusionlist)
        {
            exclusionSet = Support.Compatibility.SetFactory.CreateHashSet(exclusionlist.Keys);
            PreviousTokenStream = null;
        }

        /// <summary>
        /// Builds an exclusionlist from the words contained in the given file.
        /// </summary>
        [Obsolete("Use GermanAnalyzer(Version, ISet, ISet) instead")]
        public void SetStemExclusionTable(FileInfo exclusionlist)
        {
            exclusionSet = WordlistLoader.GetWordSet(exclusionlist);
            PreviousTokenStream = null;
        }

        /// <summary>
        /// Creates a TokenStream which tokenizes all the text in the provided TextReader.
        /// </summary>
        /// <param name="fieldName"></param>
        /// <param name="reader"></param>
        /// <returns>A TokenStream built from a StandardTokenizer filtered with StandardFilter,
        /// LowerCaseFilter, StopFilter and GermanStemFilter</returns>
        public override TokenStream TokenStream(String fieldName, TextReader reader)
        {
            TokenStream result = new StandardTokenizer(matchVersion, reader);
            result = new StandardFilter(result);
            result = new LowerCaseFilter(result);
            result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion), result, stopSet);
            result = new GermanStemFilter(result, exclusionSet, _normalizeDin2);
            return result;
        }
    }
}
\ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Analyzers/De/GermanDIN2Stemmer.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Analyzers/De/GermanDIN2Stemmer.cs b/src/contrib/Analyzers/De/GermanDIN2Stemmer.cs deleted file mode 100644 index b29538b..0000000 --- a/src/contrib/Analyzers/De/GermanDIN2Stemmer.cs +++ /dev/null @@ -1,55 +0,0 @@ -/* - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License.
- * -*/

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace Lucene.Net.Analysis.De
{
    /// <summary>
    /// German stemmer variant that applies the DIN-5007-2 ("phone book") treatment
    /// of expanded umlauts before running the regular substitution step.
    /// </summary>
    public sealed class GermanDIN2Stemmer : GermanStemmer
    {
        /// <summary>
        /// Drops an 'e' that directly follows 'a', 'o' or 'u' ("ae" -> "a", "oe" -> "o",
        /// "ue" -> "u"), then delegates to the base class substitution.
        /// </summary>
        /// <param name="buffer">The term being stemmed; modified in place.</param>
        protected override void Substitute(StringBuilder buffer)
        {
            int pos = 1;
            while (pos < buffer.Length)
            {
                char previous = buffer[pos - 1];
                bool afterExpandableVowel = previous == 'a' || previous == 'o' || previous == 'u';
                if (buffer[pos] == 'e' && afterExpandableVowel)
                {
                    buffer.Remove(pos, 1);
                }
                // The position advances even after a removal, so the character that
                // moved into this slot is not compared against the same vowel.
                pos++;
            }

            base.Substitute(buffer);
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Analyzers/De/GermanStemFilter.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Analyzers/De/GermanStemFilter.cs b/src/contrib/Analyzers/De/GermanStemFilter.cs deleted file mode 100644 index 2bb4cd9..0000000 --- a/src/contrib/Analyzers/De/GermanStemFilter.cs +++ /dev/null @@ -1,123 +0,0 @@ -/* - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License.
- * -*/

using System;
using System.Collections.Generic;
using System.IO;
using System.Collections;
using Lucene.Net.Analysis.Tokenattributes;

namespace Lucene.Net.Analysis.De
{
    /// <summary>
    /// Token filter that replaces German terms by their stems. Terms found in the
    /// exclusion set are passed through unchanged. Both the stemmer and the exclusion
    /// set can be swapped after construction.
    /// </summary>
    public sealed class GermanStemFilter : TokenFilter
    {
        /// <summary>
        /// Stemmer applied to every term that is not excluded.
        /// </summary>
        private GermanStemmer stemmer = null;

        /// <summary>
        /// Terms that must not be stemmed; null means nothing is excluded.
        /// </summary>
        private ISet<string> exclusionSet = null;

        private ITermAttribute termAtt;

        /// <summary>
        /// Builds a filter without exclusions, using the plain GermanStemmer.
        /// </summary>
        public GermanStemFilter(TokenStream _in)
            : this(_in, false)
        { }

        /// <summary>
        /// Builds a filter without exclusions.
        /// </summary>
        /// <param name="_in">the stream to filter</param>
        /// <param name="useDin2Stemmer">true to stem with GermanDIN2Stemmer instead of GermanStemmer</param>
        public GermanStemFilter(TokenStream _in, bool useDin2Stemmer)
            : this(_in, null, useDin2Stemmer)
        { }

        /// <summary>
        /// Builds a GermanStemFilter that uses an exclusion table and the plain GermanStemmer.
        /// </summary>
        /// <param name="_in">the stream to filter</param>
        /// <param name="exclusiontable">terms that must not be stemmed</param>
        public GermanStemFilter(TokenStream _in, ISet<string> exclusiontable)
            : this(_in, exclusiontable, false)
        { }

        /// <summary>
        /// Builds a GermanStemFilter that uses an exclusion table.
        /// </summary>
        /// <param name="_in">the stream to filter</param>
        /// <param name="exclusiontable">terms that must not be stemmed</param>
        /// <param name="normalizeDin2">true to stem with GermanDIN2Stemmer (DIN-5007-2 handling of
        /// the expanded umlauts 'ae', 'oe', 'ue'), false to use the plain GermanStemmer.</param>
        public GermanStemFilter(TokenStream _in, ISet<string> exclusiontable, bool normalizeDin2)
            : base(_in)
        {
            exclusionSet = exclusiontable;
            if (normalizeDin2)
            {
                stemmer = new GermanDIN2Stemmer();
            }
            else
            {
                stemmer = new GermanStemmer();
            }
            termAtt = AddAttribute<ITermAttribute>();
        }

        /// <returns>
        /// Returns true for next token in the stream, or false at EOS
        /// </returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            String current = termAtt.Term;
            bool excluded = exclusionSet != null && exclusionSet.Contains(current);
            if (!excluded)
            {
                String stemmed = stemmer.Stem(current);
                // Only touch the term buffer when stemming actually changed the term.
                if (stemmed != null && !stemmed.Equals(current))
                {
                    termAtt.SetTermBuffer(stemmed);
                }
            }

            return true;
        }

        /// <summary>
        /// Sets an alternative/custom GermanStemmer for this filter. A null argument is ignored.
        /// </summary>
        /// <param name="stemmer">the stemmer to use from now on</param>
        public void SetStemmer(GermanStemmer stemmer)
        {
            if (stemmer == null)
            {
                return;
            }

            this.stemmer = stemmer;
        }

        /// <summary>
        /// Sets an alternative exclusion list for this filter.
        /// </summary>
        /// <param name="exclusiontable">terms that must not be stemmed; null disables exclusion</param>
        public void SetExclusionTable(ISet<string> exclusiontable)
        {
            exclusionSet = exclusiontable;
        }
    }
}
\ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Analyzers/De/GermanStemmer.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Analyzers/De/GermanStemmer.cs b/src/contrib/Analyzers/De/GermanStemmer.cs deleted file mode 100644 index 4dc80e3..0000000 --- a/src/contrib/Analyzers/De/GermanStemmer.cs +++ /dev/null @@ -1,308 +0,0 @@ -/* - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License.
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - * -*/

using System;
using System.IO;
using System.Text;
using System.Collections;

namespace Lucene.Net.Analysis.De
{
    /// <summary>
    /// A stemmer for German words. The algorithm is based on the report
    /// "A Fast and Simple Stemming Algorithm for German Words" by Jörg
    /// Caumanns ([email protected]).
    /// </summary>
    public class GermanStemmer
    {
        /// <summary>
        /// Buffer for the terms while stemming them.
        /// </summary>
        private StringBuilder sb = new StringBuilder();

        /// <summary>
        /// Running count maintained by <c>Substitute()</c>: increased by the number of
        /// characters removed for each masked combination, and by one for each 'ß' that
        /// is expanded. <c>Strip()</c> adds it to the buffer length for its length checks.
        /// </summary>
        protected int substCount = 0;

        /// <summary>
        /// Stems the given term to a unique <c>discriminator</c>.
        /// </summary>
        /// <param name="term">The term that should be stemmed.</param>
        /// <returns>Discriminator for <c>term</c>. Terms containing a non-letter are returned
        /// lower-cased but otherwise unchanged; the empty string is returned as is.</returns>
        internal String Stem(String term)
        {
            // Use lowercase for medium stemming. Invariant casing keeps the result
            // independent of the current thread culture.
            term = term.ToLowerInvariant();
            if (!IsStemmable(term))
                return term;
            // Reset the StringBuilder.
            sb.Length = 0;
            sb.Append(term);
            // Stemming starts here...
            Substitute(sb);
            Strip(sb);
            Optimize(sb);
            Resubstitute(sb);
            RemoveParticleDenotion(sb);
            return sb.ToString();
        }

        /// <summary>
        /// Checks if a term could be stemmed.
        /// </summary>
        /// <param name="term">The (already lower-cased) term.</param>
        /// <returns>true if, and only if, the given term consists of letters only.</returns>
        private bool IsStemmable(String term)
        {
            for (int c = 0; c < term.Length; c++)
            {
                if (!Char.IsLetter(term[c])) return false;
            }
            return true;
        }

        /// <summary>
        /// Tells whether the buffer ends with the given suffix, without allocating a string.
        /// </summary>
        private static bool EndsWith(StringBuilder buffer, String suffix)
        {
            int offset = buffer.Length - suffix.Length;
            if (offset < 0) return false;
            for (int i = 0; i < suffix.Length; i++)
            {
                if (buffer[offset + i] != suffix[i]) return false;
            }
            return true;
        }

        /// <summary>
        /// Suffix stripping (stemming) on the current term. The stripping is reduced
        /// to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and "nd",
        /// from which all regular suffixes are built. The simplification causes
        /// some overstemming, and way more irregular stems, but still provides unique
        /// discriminators in most of those cases.
        /// The algorithm is context free, except for the length restrictions.
        /// </summary>
        /// <param name="buffer">The term being stemmed; modified in place.</param>
        private void Strip(StringBuilder buffer)
        {
            bool doMore = true;
            while (doMore && buffer.Length > 3)
            {
                if ((buffer.Length + substCount > 5) && EndsWith(buffer, "nd"))
                {
                    buffer.Remove(buffer.Length - 2, 2);
                }
                else if ((buffer.Length + substCount > 4) && EndsWith(buffer, "em"))
                {
                    buffer.Remove(buffer.Length - 2, 2);
                }
                else if ((buffer.Length + substCount > 4) && EndsWith(buffer, "er"))
                {
                    buffer.Remove(buffer.Length - 2, 2);
                }
                else if (buffer[buffer.Length - 1] == 'e')
                {
                    buffer.Remove(buffer.Length - 1, 1);
                }
                else if (buffer[buffer.Length - 1] == 's')
                {
                    buffer.Remove(buffer.Length - 1, 1);
                }
                else if (buffer[buffer.Length - 1] == 'n')
                {
                    buffer.Remove(buffer.Length - 1, 1);
                }
                // "t" occurs only as suffix of verbs.
                else if (buffer[buffer.Length - 1] == 't')
                {
                    buffer.Remove(buffer.Length - 1, 1);
                }
                else
                {
                    doMore = false;
                }
            }
        }

        /// <summary>
        /// Does some optimizations on the term. These optimizations are contextual.
        /// </summary>
        /// <param name="buffer">The term being stemmed; modified in place.</param>
        private void Optimize(StringBuilder buffer)
        {
            // Additional step for female plurals of professions and inhabitants.
            if (buffer.Length > 5 && EndsWith(buffer, "erin*"))
            {
                buffer.Remove(buffer.Length - 1, 1);
                Strip(buffer);
            }
            // Additional step for irregular plural nouns like "Matrizen -> Matrix".
            // The length check keeps an empty term from indexing position -1.
            if (buffer.Length > 0 && buffer[buffer.Length - 1] == 'z')
            {
                buffer[buffer.Length - 1] = 'x';
            }
        }

        /// <summary>
        /// Removes a particle denotion ("ge") from a term: the first occurrence of
        /// "gege" is shortened to "ge".
        /// </summary>
        /// <param name="buffer">The term being stemmed; modified in place.</param>
        private void RemoveParticleDenotion(StringBuilder buffer)
        {
            if (buffer.Length > 4)
            {
                int index = buffer.ToString().IndexOf("gege", StringComparison.Ordinal);
                if (index >= 0)
                {
                    buffer.Remove(index, 2);
                }
            }
        }

        /// <summary>
        /// Do some substitutions for the term to reduce overstemming:
        ///
        /// - Substitute Umlauts with their corresponding vowel: äöü -> aou,
        ///   "ß" is substituted by "ss"
        /// - Substitute a second char of a pair of equal characters with
        ///   an asterisk: ?? -> ?*
        /// - Substitute some common character combinations with a token:
        ///   sch/ch/ei/ie/ig/st -> $/§/%/&amp;/#/!
        /// </summary>
        /// <param name="buffer">The term being stemmed; modified in place.</param>
        protected virtual void Substitute(StringBuilder buffer)
        {
            substCount = 0;
            for (int c = 0; c < buffer.Length; c++)
            {
                // Replace the second char of a pair of equal characters with an asterisk.
                if (c > 0 && buffer[c] == buffer[c - 1])
                {
                    buffer[c] = '*';
                }
                // Substitute Umlauts.
                else if (buffer[c] == 'ä')
                {
                    buffer[c] = 'a';
                }
                else if (buffer[c] == 'ö')
                {
                    buffer[c] = 'o';
                }
                else if (buffer[c] == 'ü')
                {
                    buffer[c] = 'u';
                }
                // Fix bug so that 'ß' at the end of a word is replaced.
                else if (buffer[c] == 'ß')
                {
                    buffer[c] = 's';
                    buffer.Insert(c + 1, 's');
                    substCount++;
                }

                // Take care that at least one character is left to the right of the current one.
                if (c < buffer.Length - 1)
                {
                    // Masking several common character combinations with a token.
                    if ((c < buffer.Length - 2) && buffer[c] == 's' &&
                        buffer[c + 1] == 'c' && buffer[c + 2] == 'h')
                    {
                        buffer[c] = '$';
                        buffer.Remove(c + 1, 2);
                        // Two characters were removed; accumulate instead of overwriting the count.
                        substCount += 2;
                    }
                    else if (buffer[c] == 'c' && buffer[c + 1] == 'h')
                    {
                        buffer[c] = '§';
                        buffer.Remove(c + 1, 1);
                        substCount++;
                    }
                    else if (buffer[c] == 'e' && buffer[c + 1] == 'i')
                    {
                        buffer[c] = '%';
                        buffer.Remove(c + 1, 1);
                        substCount++;
                    }
                    else if (buffer[c] == 'i' && buffer[c + 1] == 'e')
                    {
                        buffer[c] = '&';
                        buffer.Remove(c + 1, 1);
                        substCount++;
                    }
                    else if (buffer[c] == 'i' && buffer[c + 1] == 'g')
                    {
                        buffer[c] = '#';
                        buffer.Remove(c + 1, 1);
                        substCount++;
                    }
                    else if (buffer[c] == 's' && buffer[c + 1] == 't')
                    {
                        buffer[c] = '!';
                        buffer.Remove(c + 1, 1);
                        substCount++;
                    }
                }
            }
        }

        /// <summary>
        /// Undoes the changes made by Substitute() for character pairs and
        /// character combinations. Umlauts remain as their corresponding vowel,
        /// and "ß" remains as "ss".
        /// </summary>
        /// <param name="buffer">The term being stemmed; modified in place.</param>
        private void Resubstitute(StringBuilder buffer)
        {
            for (int c = 0; c < buffer.Length; c++)
            {
                switch (buffer[c])
                {
                    case '*':
                        // Substitute() only writes '*' at positions > 0, so c - 1 is valid here.
                        buffer[c] = buffer[c - 1];
                        break;
                    case '$':
                        buffer[c] = 's';
                        buffer.Insert(c + 1, new char[] { 'c', 'h' }, 0, 2);
                        break;
                    case '§':
                        buffer[c] = 'c';
                        buffer.Insert(c + 1, 'h');
                        break;
                    case '%':
                        buffer[c] = 'e';
                        buffer.Insert(c + 1, 'i');
                        break;
                    case '&':
                        buffer[c] = 'i';
                        buffer.Insert(c + 1, 'e');
                        break;
                    case '#':
                        buffer[c] = 'i';
                        buffer.Insert(c + 1, 'g');
                        break;
                    case '!':
                        buffer[c] = 's';
                        buffer.Insert(c + 1, 't');
                        break;
                }
            }
        }
    }
}
\ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Analyzers/El/GreekAnalyzer.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Analyzers/El/GreekAnalyzer.cs b/src/contrib/Analyzers/El/GreekAnalyzer.cs deleted file mode 100644 index 354bc0f..0000000 --- a/src/contrib/Analyzers/El/GreekAnalyzer.cs +++ /dev/null @@ -1,174 +0,0 @@ -/* - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied.
See the License for the - * specific language governing permissions and limitations - * under the License. - * -*/

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Lucene.Net.Analysis.Standard;
using Version = Lucene.Net.Util.Version;

namespace Lucene.Net.Analysis.El
{
    /// <summary>
    /// <see cref="Analyzer"/> for the Greek language.
    /// <para>
    /// Supports an external list of stopwords (words
    /// that will not be indexed at all).
    /// A default set of stopwords is used unless an alternative list is specified.
    /// </para>
    /// <para><b>NOTE</b>: This class uses the same <see cref="Version"/>
    /// dependent settings as StandardAnalyzer.</para>
    /// </summary>
    public sealed class GreekAnalyzer : Analyzer
    {
        /// <summary>
        /// List of typical Greek stopwords.
        /// The stop filter runs after <see cref="GreekLowerCaseFilter"/>, which folds the
        /// final sigma 'ς' to 'σ' and strips diacritics, so the entries are written
        /// without accents and with 'σ' in final position.
        /// </summary>
        private static readonly String[] GREEK_STOP_WORDS = {
                                                                "ο", "η", "το", "οι", "τα", "του", "τησ", "των", "τον",
                                                                "την", "και",
                                                                "κι", "κ", "ειμαι", "εισαι", "ειναι", "ειμαστε", "ειστε"
                                                                , "στο", "στον",
                                                                "στη", "στην", "μα", "αλλα", "απο", "για", "προσ", "με",
                                                                "σε", "ωσ",
                                                                "παρα", "αντι", "κατα", "μετα", "θα", "να", "δε", "δεν",
                                                                "μη", "μην",
                                                                "επι", "ενω", "εαν", "αν", "τοτε", "που", "πωσ", "ποιοσ"
                                                                , "ποια", "ποιο",
                                                                "ποιοι", "ποιεσ", "ποιων", "ποιουσ", "αυτοσ", "αυτη",
                                                                "αυτο", "αυτοι",
                                                                "αυτων", "αυτουσ", "αυτεσ", "αυτα", "εκεινοσ", "εκεινη",
                                                                "εκεινο",
                                                                "εκεινοι", "εκεινεσ", "εκεινα", "εκεινων", "εκεινουσ",
                                                                "οπωσ", "ομωσ",
                                                                "ισωσ", "οσο", "οτι"
                                                            };

        /// <summary>
        /// Returns the unmodifiable set of default Greek stopwords.
        /// </summary>
        /// <returns>a set of default Greek stopwords</returns>
        public static ISet<string> GetDefaultStopSet()
        {
            return DefaultSetHolder.DEFAULT_SET;
        }

        /// <summary>
        /// Holder class so the default set is only built when it is first requested.
        /// </summary>
        private static class DefaultSetHolder
        {
            internal static readonly ISet<string> DEFAULT_SET = CharArraySet.UnmodifiableSet(new CharArraySet((IEnumerable<string>)GREEK_STOP_WORDS, false));
        }

        /// <summary>
        /// Contains the stopwords used with the StopFilter.
        /// </summary>
        private readonly ISet<string> stopSet;

        private readonly Version matchVersion;

        /// <summary>
        /// Builds an analyzer with the default stop words.
        /// </summary>
        /// <param name="matchVersion">lucene compatibility version</param>
        public GreekAnalyzer(Version matchVersion)
            : this(matchVersion, DefaultSetHolder.DEFAULT_SET)
        {
        }

        /// <summary>
        /// Builds an analyzer with the given stop words. The set is copied and wrapped
        /// as unmodifiable.
        /// </summary>
        /// <param name="matchVersion">lucene compatibility version</param>
        /// <param name="stopwords">a stopword set</param>
        public GreekAnalyzer(Version matchVersion, ISet<string> stopwords)
        {
            stopSet = CharArraySet.UnmodifiableSet(CharArraySet.Copy(stopwords));
            this.matchVersion = matchVersion;
        }

        /// <summary>
        /// Builds an analyzer with the given stop words.
        /// Deprecated: use <c>GreekAnalyzer(Version, ISet)</c> instead.
        /// </summary>
        /// <param name="matchVersion">lucene compatibility version</param>
        /// <param name="stopwords">Array of stopwords to use.</param>
        public GreekAnalyzer(Version matchVersion, params string[] stopwords)
            : this(matchVersion, StopFilter.MakeStopSet(stopwords))
        {
        }

        /// <summary>
        /// Builds an analyzer with the given stop words; only the dictionary keys are used.
        /// Deprecated: use <c>GreekAnalyzer(Version, ISet)</c> instead.
        /// </summary>
        public GreekAnalyzer(Version matchVersion, IDictionary<string, string> stopwords)
            : this(matchVersion, stopwords.Keys.ToArray())
        {
        }

        /// <summary>
        /// Creates a <see cref="TokenStream"/> which tokenizes all the text in the provided reader.
        /// </summary>
        /// <returns>A <see cref="TokenStream"/> built from a StandardTokenizer filtered with
        /// <see cref="GreekLowerCaseFilter"/> and StopFilter</returns>
        public override TokenStream TokenStream(String fieldName, TextReader reader)
        {
            TokenStream result = new StandardTokenizer(matchVersion, reader);
            result = new GreekLowerCaseFilter(result);
            result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                    result, stopSet);
            return result;
        }

        /// <summary>
        /// Tokenizer/filter chain cached in PreviousTokenStream for reuse.
        /// </summary>
        private class SavedStreams
        {
            protected internal Tokenizer source;
            protected internal TokenStream result;
        };

        /// <summary>
        /// Returns a (possibly reused) <see cref="TokenStream"/> which tokenizes all the text
        /// in the provided reader. The chain is built on first use and stored in
        /// PreviousTokenStream; later calls only reset the tokenizer onto the new reader.
        /// </summary>
        /// <returns>A <see cref="TokenStream"/> built from a StandardTokenizer filtered with
        /// <see cref="GreekLowerCaseFilter"/> and StopFilter</returns>
        public override TokenStream ReusableTokenStream(String fieldName, TextReader reader)
        {
            SavedStreams streams = (SavedStreams)PreviousTokenStream;
            if (streams == null)
            {
                streams = new SavedStreams();
                streams.source = new StandardTokenizer(matchVersion, reader);
                streams.result = new GreekLowerCaseFilter(streams.source);
                streams.result = new StopFilter(StopFilter.GetEnablePositionIncrementsVersionDefault(matchVersion),
                                                streams.result, stopSet);
                PreviousTokenStream = streams;
            }
            else
            {
                streams.source.Reset(reader);
            }
            return streams.result;
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/02362804/src/contrib/Analyzers/El/GreekLowerCaseFilter.cs ---------------------------------------------------------------------- diff --git a/src/contrib/Analyzers/El/GreekLowerCaseFilter.cs b/src/contrib/Analyzers/El/GreekLowerCaseFilter.cs deleted file mode 100644 index 5c157be..0000000 --- a/src/contrib/Analyzers/El/GreekLowerCaseFilter.cs +++ /dev/null @@ -1,123 +0,0 @@ -/* - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License.
- * -*/

using Lucene.Net.Analysis.Tokenattributes;

namespace Lucene.Net.Analysis.El
{
    /// <summary>
    /// Normalizes token text to lower case, removes some Greek diacritics,
    /// and standardizes final sigma to sigma.
    /// </summary>
    public sealed class GreekLowerCaseFilter : TokenFilter
    {
        private ITermAttribute termAtt;

        /// <summary>
        /// Wraps the given stream and registers the term attribute this filter rewrites.
        /// </summary>
        public GreekLowerCaseFilter(TokenStream _in)
            : base(_in)
        {
            termAtt = AddAttribute<ITermAttribute>();
        }

        /// <summary>
        /// Advances the wrapped stream and normalizes the term buffer in place,
        /// one UTF-16 code unit at a time.
        /// </summary>
        /// <returns>true if a token was produced, false at end of stream</returns>
        public override bool IncrementToken()
        {
            if (!input.IncrementToken())
            {
                return false;
            }

            char[] buffer = termAtt.TermBuffer();
            int length = termAtt.TermLength();
            // TODO: iterate codepoints to support supp. characters
            for (int pos = 0; pos < length; pos++)
            {
                buffer[pos] = (char)lowerCase(buffer[pos]);
            }

            return true;
        }

        /// <summary>
        /// Maps one character to its normalized form: accented Greek vowels go to the
        /// unaccented lower-case vowel, final sigma goes to sigma, anything else is
        /// lower-cased with <c>char.ToLower</c>.
        /// </summary>
        private int lowerCase(int codepoint)
        {
            switch (codepoint)
            {
                // Two lower-case sigmas exist: U+03C2 (final, end of word) and
                // U+03C3 (everywhere else). Both are standardized to U+03C3.
                case '\u03C2': /* small final sigma */
                    return '\u03C3'; /* small sigma */

                // Characters carrying diacritics are reduced to the plain
                // lower-case base letter.

                case '\u0386': /* capital alpha with tonos */
                case '\u03AC': /* small alpha with tonos */
                    return '\u03B1'; /* small alpha */

                case '\u0388': /* capital epsilon with tonos */
                case '\u03AD': /* small epsilon with tonos */
                    return '\u03B5'; /* small epsilon */

                case '\u0389': /* capital eta with tonos */
                case '\u03AE': /* small eta with tonos */
                    return '\u03B7'; /* small eta */

                case '\u038A': /* capital iota with tonos */
                case '\u03AA': /* capital iota with dialytika */
                case '\u03AF': /* small iota with tonos */
                case '\u03CA': /* small iota with dialytika */
                case '\u0390': /* small iota with dialytika and tonos */
                    return '\u03B9'; /* small iota */

                case '\u038E': /* capital upsilon with tonos */
                case '\u03AB': /* capital upsilon with dialytika */
                case '\u03CD': /* small upsilon with tonos */
                case '\u03CB': /* small upsilon with dialytika */
                case '\u03B0': /* small upsilon with dialytika and tonos */
                    return '\u03C5'; /* small upsilon */

                case '\u038C': /* capital omicron with tonos */
                case '\u03CC': /* small omicron with tonos */
                    return '\u03BF'; /* small omicron */

                case '\u038F': /* capital omega with tonos */
                case '\u03CE': /* small omega with tonos */
                    return '\u03C9'; /* small omega */

                // Conversion kept from the previous implementation, only for
                // backwards compatibility with old indexes.
                case '\u03A2': /* reserved */
                    return '\u03C2'; /* small final sigma */

                default:
                    return char.ToLower((char)codepoint);
            }
        }
    }
}
