Repository: lucenenet Updated Branches: refs/heads/master 468199e3f -> 46613e4c3
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/WordSegmenter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/WordSegmenter.cs b/src/Lucene.Net.Analysis.SmartCn/WordSegmenter.cs new file mode 100644 index 0000000..fed2b0b --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/WordSegmenter.cs @@ -0,0 +1,89 @@ +using Lucene.Net.Analysis.Cn.Smart.HHMM; +using Lucene.Net.Support; +using System.Collections.Generic; + +namespace Lucene.Net.Analysis.Cn.Smart +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Segment a sentence of Chinese text into words. 
+ /// <para/> + /// @lucene.experimental + /// </summary> + internal class WordSegmenter + { + private HHMMSegmenter hhmmSegmenter = new HHMMSegmenter(); + + private SegTokenFilter tokenFilter = new SegTokenFilter(); + + /// <summary> + /// Segment a sentence into words with <see cref="HHMMSegmenter"/> + /// </summary> + /// <param name="sentence">input sentence</param> + /// <param name="startOffset"> start offset of sentence</param> + /// <returns><see cref="IList{T}"/> of <see cref="SegToken"/>.</returns> + public virtual IList<SegToken> SegmentSentence(string sentence, int startOffset) + { + + IList<SegToken> segTokenList = hhmmSegmenter.Process(sentence); + // tokens from sentence, excluding WordType.SENTENCE_BEGIN and WordType.SENTENCE_END + IList<SegToken> result = Collections.EmptyList<SegToken>(); + + if (segTokenList.Count > 2) // if its not an empty sentence + result = segTokenList.SubList(1, segTokenList.Count - 1); + + foreach (SegToken st in result) + { + ConvertSegToken(st, sentence, startOffset); + } + + return result; + } + + /// <summary> + /// Process a <see cref="SegToken"/> so that it is ready for indexing. 
+ /// </summary> + /// <param name="st">st input <see cref="SegToken"/></param> + /// <param name="sentence">associated Sentence</param> + /// <param name="sentenceStartOffset">offset into sentence</param> + /// <returns>Lucene <see cref="SegToken"/></returns> + public virtual SegToken ConvertSegToken(SegToken st, string sentence, + int sentenceStartOffset) + { + + switch (st.WordType) + { + case WordType.STRING: + case WordType.NUMBER: + case WordType.FULLWIDTH_NUMBER: + case WordType.FULLWIDTH_STRING: + st.CharArray = sentence.Substring(st.StartOffset, st.EndOffset - st.StartOffset) + .ToCharArray(); + break; + default: + break; + } + + st = tokenFilter.Filter(st); + st.StartOffset += sentenceStartOffset; + st.EndOffset += sentenceStartOffset; + return st; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/WordTokenFilter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/WordTokenFilter.cs b/src/Lucene.Net.Analysis.SmartCn/WordTokenFilter.cs new file mode 100644 index 0000000..d7a419d --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/WordTokenFilter.cs @@ -0,0 +1,114 @@ +using Lucene.Net.Analysis.Cn.Smart.HHMM; +using Lucene.Net.Analysis.TokenAttributes; +using System; +using System.Collections.Generic; + +namespace Lucene.Net.Analysis.Cn.Smart +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// A <see cref="TokenFilter"/> that breaks sentences into words. + /// <para/> + /// @lucene.experimental + /// </summary> + [Obsolete("Use HMMChineseTokenizer instead.")] + public sealed class WordTokenFilter : TokenFilter + { + private WordSegmenter wordSegmenter; + + private IEnumerator<SegToken> tokenIter; + + private IList<SegToken> tokenBuffer; + + private readonly ICharTermAttribute termAtt; + private readonly IOffsetAttribute offsetAtt; + private readonly ITypeAttribute typeAtt; + + private int tokStart; // only used if the length changed before this filter + private int tokEnd; // only used if the length changed before this filter + private bool hasIllegalOffsets; // only if the length changed before this filter + + /// <summary> + /// Construct a new <see cref="WordTokenFilter"/>. + /// </summary> + /// <param name="input"><see cref="TokenStream"/> of sentences.</param> + public WordTokenFilter(TokenStream input) + : base(input) + { + this.wordSegmenter = new WordSegmenter(); + this.termAtt = AddAttribute<ICharTermAttribute>(); + this.offsetAtt = AddAttribute<IOffsetAttribute>(); + this.typeAtt = AddAttribute<ITypeAttribute>(); + } + + public override bool IncrementToken() + { + if (tokenIter == null || !tokenIter.MoveNext()) + { + // there are no remaining tokens from the current sentence... are there more sentences? 
+ if (m_input.IncrementToken()) + { + tokStart = offsetAtt.StartOffset; + tokEnd = offsetAtt.EndOffset; + // if length by start + end offsets doesn't match the term text then assume + // this is a synonym and don't adjust the offsets. + hasIllegalOffsets = (tokStart + termAtt.Length) != tokEnd; + // a new sentence is available: process it. + tokenBuffer = wordSegmenter.SegmentSentence(termAtt.ToString(), offsetAtt.StartOffset); + tokenIter = tokenBuffer.GetEnumerator(); + /* + * it should not be possible to have a sentence with 0 words, check just in case. + * returning EOS isn't the best either, but its the behavior of the original code. + */ + if (!tokenIter.MoveNext()) + { + return false; + } + } + else + { + return false; // no more sentences, end of stream! + } + } + + // WordTokenFilter must clear attributes, as it is creating new tokens. + ClearAttributes(); + // There are remaining tokens from the current sentence, return the next one. + SegToken nextWord = tokenIter.Current; + + termAtt.CopyBuffer(nextWord.CharArray, 0, nextWord.CharArray.Length); + if (hasIllegalOffsets) + { + offsetAtt.SetOffset(tokStart, tokEnd); + } + else + { + offsetAtt.SetOffset(nextWord.StartOffset, nextWord.EndOffset); + } + typeAtt.Type = "word"; + return true; + } + + public override void Reset() + { + base.Reset(); + tokenIter = null; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/WordType.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/WordType.cs b/src/Lucene.Net.Analysis.SmartCn/WordType.cs new file mode 100644 index 0000000..0eb4948 --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/WordType.cs @@ -0,0 +1,67 @@ +namespace Lucene.Net.Analysis.Cn.Smart +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Internal <see cref="SmartChineseAnalyzer"/> token type constants + /// <para/> + /// @lucene.experimental + /// </summary> + public enum WordType + { + /// <summary> + /// Start of a Sentence + /// </summary> + SENTENCE_BEGIN = 0, + + /// <summary> + /// End of a Sentence + /// </summary> + SENTENCE_END = 1, + + /// <summary> + /// Chinese Word + /// </summary> + CHINESE_WORD = 2, + + /// <summary> + /// ASCII String + /// </summary> + STRING = 3, + + /// <summary> + /// ASCII Alphanumeric + /// </summary> + NUMBER = 4, + + /// <summary> + /// Punctuation Symbol + /// </summary> + DELIMITER = 5, + + /// <summary> + /// Full-Width String + /// </summary> + FULLWIDTH_STRING = 6, + + /// <summary> + /// Full-Width Alphanumeric + /// </summary> + FULLWIDTH_NUMBER = 7 + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/project.json ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/project.json b/src/Lucene.Net.Analysis.SmartCn/project.json new file mode 100644 index 0000000..0fa0612 --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/project.json @@ -0,0 +1,53 @@ +{ + "version": "4.8.0", + "title": "Lucene.Net.Analysis.SmartCn", + "description": 
"Analyzer for indexing Chinese for the Lucene.Net full-text search engine library from The Apache Software Foundation.", + "authors": [ "The Apache Software Foundation" ], + "packOptions": { + "projectUrl": "http://lucenenet.apache.org/", + "licenseUrl": "https://github.com/apache/lucenenet/blob/master/LICENSE.txt", + "iconUrl": "https://github.com/apache/lucenenet/blob/master/branding/logo/lucene-net-icon-128x128.png?raw=true", + "owners": [ "The Apache Software Foundation" ], + "repository": { "url": "https://github.com/apache/lucenenet" }, + "tags": [ "lucene.net", "core", "text", "search", "information", "retrieval", "lucene", "apache", "analysis", "index", "query" ], + "releaseNotes": "This package depends on a temporary version of icu.net hosted on MyGet until official .NET Core support is added. To install, copy the NuGet.config file from https://github.com/apache/lucenenet/blob/master/NuGet.config into your project and then install this package via Package Manager Console as usual." 
+ }, + "buildOptions": { + "compile": { + "includeFiles": [ "../CommonAssemblyInfo.cs" ] + }, + "embed": { + "includeFiles": [ + "stopwords.txt", + "HHMM/bigramdict.mem", + "HHMM/coredict.mem" + ] + }, + "nowarn": [ "1591", "1573" ] + }, + "dependencies": { + "icu.net": "54.1.1-alpha", + "Lucene.Net": "4.8.0", + "Lucene.Net.Analysis.Common": "4.8.0", + "Lucene.Net.ICU": "4.8.0" + }, + "frameworks": { + "netstandard1.5": { + "imports": "dnxcore50", + "buildOptions": { + "debugType": "portable", + "define": [ "NETSTANDARD" ] + }, + "dependencies": { + "NETStandard.Library": "1.6.0", + "System.Text.Encoding.CodePages": "4.4.0-preview1-25305-02" + } + }, + "net451": { + "buildOptions": { + "debugType": "full", + "define": [ "FEATURE_SERIALIZABLE" ] + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/stopwords.txt ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/stopwords.txt b/src/Lucene.Net.Analysis.SmartCn/stopwords.txt new file mode 100644 index 0000000..fb0d71a --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/stopwords.txt @@ -0,0 +1,59 @@ +////////// Punctuation tokens to remove //////////////// +, +. +` +- +_ += +? +' +| +" +( +) +{ +} +[ +] +< +> +* +# +& +^ +$ +@ +! 
+~ +: +; ++ +/ +\ +《 +》 +— +－ +， +。 +、 +： +； +！ +· +？ +“ +” +） +（ +【 +】 +［ +］ +● +// the line below contains an IDEOGRAPHIC SPACE character (Used as a space in Chinese) +　 + +//////////////// English Stop Words //////////////// + +//////////////// Chinese Stop Words //////////////// http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.csproj ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.csproj b/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.csproj new file mode 100644 index 0000000..cc14e95 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.csproj @@ -0,0 +1,105 @@ +<?xml version="1.0" encoding="utf-8"?> +<!-- + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
+ +--> +<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" /> + <PropertyGroup> + <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration> + <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform> + <ProjectGuid>{8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}</ProjectGuid> + <OutputType>Library</OutputType> + <AppDesignerFolder>Properties</AppDesignerFolder> + <RootNamespace>Lucene.Net.Analysis.Cn.Smart</RootNamespace> + <AssemblyName>Lucene.Net.Tests.Analysis.SmartCn</AssemblyName> + <TargetFrameworkVersion>v4.5.1</TargetFrameworkVersion> + <FileAlignment>512</FileAlignment> + </PropertyGroup> + <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "> + <DebugSymbols>true</DebugSymbols> + <DebugType>full</DebugType> + <Optimize>false</Optimize> + <OutputPath>bin\Debug\</OutputPath> + <DefineConstants>DEBUG;TRACE</DefineConstants> + <ErrorReport>prompt</ErrorReport> + <WarningLevel>4</WarningLevel> + </PropertyGroup> + <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' "> + <DebugType>pdbonly</DebugType> + <Optimize>true</Optimize> + <OutputPath>bin\Release\</OutputPath> + <DefineConstants>TRACE</DefineConstants> + <ErrorReport>prompt</ErrorReport> + <WarningLevel>4</WarningLevel> + </PropertyGroup> + <PropertyGroup> + <DefineConstants>$(DefineConstants);FEATURE_SERIALIZABLE</DefineConstants> + </PropertyGroup> + <ItemGroup> + <Reference Include="System" /> + <Reference Include="System.Core" /> + <Reference Include="System.Xml.Linq" /> + <Reference Include="System.Data.DataSetExtensions" /> + <Reference Include="Microsoft.CSharp" /> + <Reference Include="System.Data" /> + <Reference Include="System.Net.Http" /> + <Reference Include="System.Xml" /> + 
</ItemGroup> + <ItemGroup> + <Compile Include="Properties\AssemblyInfo.cs" /> + <Compile Include="Support\TestApiConsistency.cs" /> + <Compile Include="Support\TestExceptionSerialization.cs" /> + <Compile Include="TestHMMChineseTokenizerFactory.cs" /> + <Compile Include="TestSmartChineseAnalyzer.cs" /> + <Compile Include="TestSmartChineseFactories.cs" /> + <Compile Include="..\CommonAssemblyInfo.cs"> + <Link>Properties\CommonAssemblyInfo.cs</Link> + </Compile> + </ItemGroup> + <ItemGroup> + <ProjectReference Include="..\Lucene.Net.Analysis.Common\Lucene.Net.Analysis.Common.csproj"> + <Project>{4ADD0BBC-B900-4715-9526-D871DE8EEA64}</Project> + <Name>Lucene.Net.Analysis.Common</Name> + </ProjectReference> + <ProjectReference Include="..\Lucene.Net.Analysis.SmartCn\Lucene.Net.Analysis.SmartCn.csproj"> + <Project>{DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}</Project> + <Name>Lucene.Net.Analysis.SmartCn</Name> + </ProjectReference> + <ProjectReference Include="..\Lucene.Net.TestFramework\Lucene.Net.TestFramework.csproj"> + <Project>{B2C0D749-CE34-4F62-A15E-00CB2FF5DDB3}</Project> + <Name>Lucene.Net.TestFramework</Name> + </ProjectReference> + <ProjectReference Include="..\Lucene.Net\Lucene.Net.csproj"> + <Project>{5d4ad9be-1ffb-41ab-9943-25737971bf57}</Project> + <Name>Lucene.Net</Name> + </ProjectReference> + </ItemGroup> + <ItemGroup> + <Service Include="{82A7F48D-3B50-4B1E-B82E-3ADA8210C358}" /> + </ItemGroup> + <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" /> + <!-- To modify your build process, add your task inside one of the targets below and uncomment it. + Other similar extension points exist, see Microsoft.Common.targets. 
+ <Target Name="BeforeBuild"> + </Target> + <Target Name="AfterBuild"> + </Target> + --> +</Project> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.project.json ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.project.json b/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.project.json new file mode 100644 index 0000000..8c631ab --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.project.json @@ -0,0 +1,11 @@ +{ + "runtimes": { + "win": {} + }, + "dependencies": { + "NUnit": "3.5.0" + }, + "frameworks": { + "net451": {} + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.xproj ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.xproj b/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.xproj new file mode 100644 index 0000000..79885d1 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.xproj @@ -0,0 +1,42 @@ +<?xml version="1.0" encoding="utf-8"?> +<!-- + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +--> + +<Project ToolsVersion="14.0.25420" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup> + <VisualStudioVersion Condition="'$(VisualStudioVersion)' == ''">14.0.25420</VisualStudioVersion> + <VSToolsPath Condition="'$(VSToolsPath)' == ''">$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)</VSToolsPath> + </PropertyGroup> + <Import Project="$(VSToolsPath)\DotNet\Microsoft.DotNet.Props" Condition="'$(VSToolsPath)' != ''" /> + <PropertyGroup Label="Globals"> + <ProjectGuid>2870fb52-1239-493f-a0be-951660194a66</ProjectGuid> + <RootNamespace>Lucene.Net.Analysis.Cn.Smart</RootNamespace> + <BaseIntermediateOutputPath Condition="'$(BaseIntermediateOutputPath)'=='' ">.\obj</BaseIntermediateOutputPath> + <OutputPath Condition="'$(OutputPath)'=='' ">.\bin\</OutputPath> + </PropertyGroup> + <PropertyGroup> + <SchemaVersion>2.0</SchemaVersion> + </PropertyGroup> + <ItemGroup> + <Service Include="{82a7f48d-3b50-4b1e-b82e-3ada8210c358}" /> + </ItemGroup> + <Import Project="$(VSToolsPath)\DotNet\Microsoft.DotNet.targets" Condition="'$(VSToolsPath)' != ''" /> +</Project> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Tests.Analysis.SmartCn/Properties/AssemblyInfo.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.SmartCn/Properties/AssemblyInfo.cs b/src/Lucene.Net.Tests.Analysis.SmartCn/Properties/AssemblyInfo.cs new file mode 100644 index 0000000..b3135b2 --- /dev/null +++ 
b/src/Lucene.Net.Tests.Analysis.SmartCn/Properties/AssemblyInfo.cs @@ -0,0 +1,37 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +using System.Reflection; +using System.Runtime.InteropServices; + +// General Information about an assembly is controlled through the following +// set of attributes. Change these attribute values to modify the information +// associated with an assembly. +[assembly: AssemblyTitle("Lucene.Net.Tests.Analysis.SmartCn")] +[assembly: AssemblyDescription("")] +[assembly: AssemblyConfiguration("")] +[assembly: AssemblyCulture("")] + +// Setting ComVisible to false makes the types in this assembly not visible +// to COM components. If you need to access a type in this assembly from +// COM, set the ComVisible attribute to true on that type. 
+[assembly: ComVisible(false)] + +// The following GUID is for the ID of the typelib if this project is exposed to COM +[assembly: Guid("8c8d78d3-bffd-4301-953b-fe5350b2aeeb")] + +// NOTE: Version information is in CommonAssemblyInfo.cs http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Tests.Analysis.SmartCn/Support/TestApiConsistency.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.SmartCn/Support/TestApiConsistency.cs b/src/Lucene.Net.Tests.Analysis.SmartCn/Support/TestApiConsistency.cs new file mode 100644 index 0000000..0943448 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.SmartCn/Support/TestApiConsistency.cs @@ -0,0 +1,148 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ * +*/ + +using Lucene.Net.Attributes; +using Lucene.Net.Support; +using NUnit.Framework; +using System; + +namespace Lucene.Net.Analysis.Cn.Smart.Support +{ + /// <summary> + /// LUCENENET specific tests for ensuring API conventions are followed + /// </summary> + public class TestApiConsistency : ApiScanTestBase + { + [Test, LuceneNetSpecific] + [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))] + public override void TestProtectedFieldNames(Type typeFromTargetAssembly) + { + base.TestProtectedFieldNames(typeFromTargetAssembly); + } + + [Test, LuceneNetSpecific] + [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))] + public override void TestPrivateFieldNames(Type typeFromTargetAssembly) + { + base.TestPrivateFieldNames(typeFromTargetAssembly); + } + + [Test, LuceneNetSpecific] + [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))] + public override void TestPublicFields(Type typeFromTargetAssembly) + { + base.TestPublicFields(typeFromTargetAssembly); + } + + [Test, LuceneNetSpecific] + [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))] + public override void TestMethodParameterNames(Type typeFromTargetAssembly) + { + base.TestMethodParameterNames(typeFromTargetAssembly); + } + + [Test, LuceneNetSpecific] + [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))] + public override void TestInterfaceNames(Type typeFromTargetAssembly) + { + base.TestInterfaceNames(typeFromTargetAssembly); + } + + [Test, LuceneNetSpecific] + [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))] + public override void TestClassNames(Type typeFromTargetAssembly) + { + base.TestClassNames(typeFromTargetAssembly); + } + + [Test, LuceneNetSpecific] + [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))] + public override void TestForPropertiesWithNoGetter(Type typeFromTargetAssembly) + { + base.TestForPropertiesWithNoGetter(typeFromTargetAssembly); + } + + [Test, LuceneNetSpecific] + 
[TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))] + public override void TestForPropertiesThatReturnArray(Type typeFromTargetAssembly) + { + base.TestForPropertiesThatReturnArray(typeFromTargetAssembly); + } + + [Test, LuceneNetSpecific] + [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))] + public override void TestForMethodsThatReturnWritableArray(Type typeFromTargetAssembly) + { + base.TestForMethodsThatReturnWritableArray(typeFromTargetAssembly); + } + + [Test, LuceneNetSpecific] + [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))] + public override void TestForPublicMembersContainingComparer(Type typeFromTargetAssembly) + { + base.TestForPublicMembersContainingComparer(typeFromTargetAssembly); + } + + [Test, LuceneNetSpecific] + [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))] + public override void TestForPublicMembersNamedSize(Type typeFromTargetAssembly) + { + base.TestForPublicMembersNamedSize(typeFromTargetAssembly); + } + + [Test, LuceneNetSpecific] + [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))] + public override void TestForPublicMembersContainingNonNetNumeric(Type typeFromTargetAssembly) + { + base.TestForPublicMembersContainingNonNetNumeric(typeFromTargetAssembly); + } + + [Test, LuceneNetSpecific] + [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))] + public override void TestForTypesContainingNonNetNumeric(Type typeFromTargetAssembly) + { + base.TestForTypesContainingNonNetNumeric(typeFromTargetAssembly); + } + + [Test, LuceneNetSpecific] + [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))] + public override void TestForPublicMembersWithNullableEnum(Type typeFromTargetAssembly) + { + base.TestForPublicMembersWithNullableEnum(typeFromTargetAssembly); + } + + // LUCENENET NOTE: This test is only for identifying members who were changed from + // ICollection, IList or ISet to IEnumerable during the port (that should be changed back) + //[Test, 
LuceneNetSpecific] + //[TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))] + //public override void TestForMembersAcceptingOrReturningIEnumerable(Type typeFromTargetAssembly) + //{ + // base.TestForMembersAcceptingOrReturningIEnumerable(typeFromTargetAssembly); + //} + + [Test, LuceneNetSpecific] + [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))] + public override void TestForMembersAcceptingOrReturningListOrDictionary(Type typeFromTargetAssembly) + { + base.TestForMembersAcceptingOrReturningListOrDictionary(typeFromTargetAssembly); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Tests.Analysis.SmartCn/Support/TestExceptionSerialization.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.SmartCn/Support/TestExceptionSerialization.cs b/src/Lucene.Net.Tests.Analysis.SmartCn/Support/TestExceptionSerialization.cs new file mode 100644 index 0000000..8c8d6a2 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.SmartCn/Support/TestExceptionSerialization.cs @@ -0,0 +1,54 @@ +#if FEATURE_SERIALIZABLE +using Lucene.Net.Attributes; +using NUnit.Framework; +using System; +using System.Collections.Generic; +using System.Linq; + +namespace Lucene.Net.Support +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + + [TestFixture] + public class TestExceptionSerialization : ExceptionSerializationTestBase + { + public static IEnumerable<object> ExceptionTestData + { + get + { + var exceptionTypes = typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile).Assembly.GetTypes().Where(t => typeof(Exception).IsAssignableFrom(t)).Cast<object>(); + + // If the assembly has no exceptions, just provide Exception so the test will pass + if (!exceptionTypes.Any()) + { + return new Type[] { typeof(Exception) }; + } + + return exceptionTypes; + } + } + + [Test, LuceneNetSpecific] + public void AllExceptionsInLuceneNamespaceCanSerialize([ValueSource("ExceptionTestData")]Type luceneException) + { + var instance = TryInstantiate(luceneException); + Assert.That(TypeCanSerialize(instance), string.Format("Unable to serialize {0}", luceneException.FullName)); + } + } +} +#endif \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Tests.Analysis.SmartCn/TestHMMChineseTokenizerFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.SmartCn/TestHMMChineseTokenizerFactory.cs b/src/Lucene.Net.Tests.Analysis.SmartCn/TestHMMChineseTokenizerFactory.cs new file mode 100644 index 0000000..264ab38 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.SmartCn/TestHMMChineseTokenizerFactory.cs @@ -0,0 +1,72 @@ +using Lucene.Net.Analysis.Util; +using NUnit.Framework; +using System; +using System.Collections.Generic; +using System.IO; + +namespace Lucene.Net.Analysis.Cn.Smart +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Tests for <see cref="HMMChineseTokenizerFactory"/> + /// </summary> + public class TestHMMChineseTokenizerFactory : BaseTokenStreamTestCase + { + [Test] + public void TestHHMMSegmenter() + { + var x = new HHMM.HHMMSegmenter(); + } + + /// <summary> + /// Test showing the behavior + /// </summary> + [Test] + public void TestSimple() + { + TextReader reader = new StringReader("我购买了道具和服装。"); + TokenizerFactory factory = new HMMChineseTokenizerFactory(new Dictionary<string, string>()); + Tokenizer tokenizer = factory.Create(reader); + tokenizer.SetReader(reader); + // TODO: fix smart chinese to not emit punctuation tokens + // at the moment: you have to clean up with WDF, or use the stoplist, etc + AssertTokenStreamContents(tokenizer, + new String[] { "我", "购买", "了", "道具", "和", "服装", "," }); + } + + /// <summary> + /// Test that bogus arguments result in exception + /// </summary> + [Test] + public void TestBogusArguments() + { + try + { + new HMMChineseTokenizerFactory(new Dictionary<string, string>() { + { "bogusArg", "bogusValue" } + }); + fail(); + } + catch (ArgumentException expected) + { + assertTrue(expected.Message.Contains("Unknown parameters")); + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Tests.Analysis.SmartCn/TestSmartChineseAnalyzer.cs 
---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.SmartCn/TestSmartChineseAnalyzer.cs b/src/Lucene.Net.Tests.Analysis.SmartCn/TestSmartChineseAnalyzer.cs new file mode 100644 index 0000000..bc4f3aa --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.SmartCn/TestSmartChineseAnalyzer.cs @@ -0,0 +1,354 @@ +using Lucene.Net.Analysis.Core; +using Lucene.Net.Analysis.Miscellaneous; +using Lucene.Net.Util; +using NUnit.Framework; +using System; +using System.Text; + +namespace Lucene.Net.Analysis.Cn.Smart +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + public class TestSmartChineseAnalyzer : BaseTokenStreamTestCase + { + [Test] + public void TestChineseStopWordsDefault() + { +#pragma warning disable 612, 618 + Analyzer ca = new SmartChineseAnalyzer(LuceneVersion.LUCENE_CURRENT); /* will load stopwords */ +#pragma warning restore 612, 618 + String sentence = "我购买了道具和服装。"; + String[] result = { "我", "购买", "了", "道具", "和", "服装" }; + AssertAnalyzesTo(ca, sentence, result); + // set stop-words from the outer world - must yield same behavior + ca = new SmartChineseAnalyzer( +#pragma warning disable 612, 618 + LuceneVersion.LUCENE_CURRENT, +#pragma warning restore 612, 618 + SmartChineseAnalyzer.GetDefaultStopSet()); + AssertAnalyzesTo(ca, sentence, result); + } + + /* + * This test is the same as the above, except with two phrases. + * This tests to ensure the SentenceTokenizer->WordTokenFilter chain works correctly. + */ + [Test] + public void TestChineseStopWordsDefaultTwoPhrases() + { +#pragma warning disable 612, 618 + Analyzer ca = new SmartChineseAnalyzer(LuceneVersion.LUCENE_CURRENT); /* will load stopwords */ +#pragma warning restore 612, 618 + String sentence = "我购买了道具和服装。 我购买了道具和服装。"; + String[] result = { "我", "购买", "了", "道具", "和", "服装", "我", "购买", "了", "道具", "和", "服装" }; + AssertAnalyzesTo(ca, sentence, result); + } + + /* + * This test is the same as the above, except using an ideographic space as a separator. + * This tests to ensure the stopwords are working correctly. 
+ */ + [Test] + public void TestChineseStopWordsDefaultTwoPhrasesIdeoSpace() + { +#pragma warning disable 612, 618 + Analyzer ca = new SmartChineseAnalyzer(LuceneVersion.LUCENE_CURRENT); /* will load stopwords */ +#pragma warning restore 612, 618 + String sentence = "我购买了道具和服装　我购买了道具和服装。"; + String[] result = { "我", "购买", "了", "道具", "和", "服装", "我", "购买", "了", "道具", "和", "服装" }; + AssertAnalyzesTo(ca, sentence, result); + } + + /* + * Punctuation is handled in a strange way if you disable stopwords + * In this example the IDEOGRAPHIC FULL STOP is converted into a comma. + * if you don't supply (true) to the constructor, or use a different stopwords list, + * then punctuation is indexed. + */ + [Test] + public void TestChineseStopWordsOff() + { + Analyzer[] + analyzers = new Analyzer[] { +#pragma warning disable 612, 618 + new SmartChineseAnalyzer(LuceneVersion.LUCENE_CURRENT, false),/* doesn't load stopwords */ + new SmartChineseAnalyzer(LuceneVersion.LUCENE_CURRENT, null) /* sets stopwords to empty set */}; +#pragma warning restore 612, 618 + String sentence = "我购买了道具和服装。"; + String[] result = { "我", "购买", "了", "道具", "和", "服装", "," }; + foreach (Analyzer analyzer in analyzers) + { + AssertAnalyzesTo(analyzer, sentence, result); + AssertAnalyzesTo(analyzer, sentence, result); + } + } + + /* + * Check that position increments after stopwords are correct, + * when stopfilter is configured with enablePositionIncrements + */ + [Test] + public void TestChineseStopWords2() + { +#pragma warning disable 612, 618 + Analyzer ca = new SmartChineseAnalyzer(LuceneVersion.LUCENE_CURRENT); /* will load stopwords */ +#pragma warning restore 612, 618 + String sentence = "Title:San"; // : is a stopword + String[] result = { "titl", "san" }; + int[] startOffsets = { 0, 6 }; + int[] endOffsets = { 5, 9 }; + int[] posIncr = { 1, 2 }; + AssertAnalyzesTo(ca, sentence, result, startOffsets, endOffsets, posIncr); + } + + [Test] + public void 
TestChineseAnalyzer() + { +#pragma warning disable 612, 618 + Analyzer ca = new SmartChineseAnalyzer(LuceneVersion.LUCENE_CURRENT, true); +#pragma warning restore 612, 618 + String sentence = "我购买了道具和服装。"; + String[] result = { "我", "购买", "了", "道具", "和", "服装" }; + AssertAnalyzesTo(ca, sentence, result); + } + + /* + * English words are lowercased and porter-stemmed. + */ + [Test] + public void TestMixedLatinChinese() + { + AssertAnalyzesTo(new SmartChineseAnalyzer( +#pragma warning disable 612, 618 + LuceneVersion.LUCENE_CURRENT +#pragma warning restore 612, 618 + , true), "我购买 Tests 了道具和服装", + new String[] { "我", "购买", "test", "了", "道具", "和", "服装" }); + } + + /* + * Numerics are parsed as their own tokens + */ + [Test] + public void TestNumerics() + { + AssertAnalyzesTo(new SmartChineseAnalyzer( +#pragma warning disable 612, 618 + LuceneVersion.LUCENE_CURRENT +#pragma warning restore 612, 618 + , true), "我购买 Tests 了道具和服装1234", + new String[] { "我", "购买", "test", "了", "道具", "和", "服装", "1234" }); + } + + /* + * Full width alphas and numerics are folded to half-width + */ + [Test] + public void TestFullWidth() + { + AssertAnalyzesTo(new SmartChineseAnalyzer( +#pragma warning disable 612, 618 + LuceneVersion.LUCENE_CURRENT +#pragma warning restore 612, 618 + , true), "我购买 Ｔｅｓｔｓ 了道具和服装１２３４", + new String[] { "我", "购买", "test", "了", "道具", "和", "服装", "1234" }); + } + + /* + * Presentation form delimiters are removed + */ + [Test] + public void TestDelimiters() + { + AssertAnalyzesTo(new SmartChineseAnalyzer( +#pragma warning disable 612, 618 + LuceneVersion.LUCENE_CURRENT +#pragma warning restore 612, 618 + , true), "我购买︱ Tests 了道具和服装", + new String[] { "我", "购买", "test", "了", "道具", "和", "服装" }); + } + + /* + * Text from writing systems other than Chinese and Latin are parsed as individual characters. 
+ * (regardless of Unicode category) + */ + [Test] + public void TestNonChinese() + { + AssertAnalyzesTo(new SmartChineseAnalyzer( +#pragma warning disable 612, 618 + LuceneVersion.LUCENE_CURRENT +#pragma warning restore 612, 618 + , true), "我购买 روبرتTests 了道具和服装", + new String[] { "我", "购买", "ر", "و", "ب", "ر", "ت", "test", "了", "道具", "和", "服装" }); + } + + /* + * Test what the analyzer does with out-of-vocabulary words. + * In this case the name is Yousaf Raza Gillani. + * Currently it is being analyzed into single characters... + */ + [Test] + public void TestOOV() + { + AssertAnalyzesTo(new SmartChineseAnalyzer( +#pragma warning disable 612, 618 + LuceneVersion.LUCENE_CURRENT +#pragma warning restore 612, 618 + , true), "优素福·拉扎·吉拉尼", + new String[] { "优", "素", "福", "拉", "扎", "吉", "拉", "尼" }); + + + AssertAnalyzesTo(new SmartChineseAnalyzer( +#pragma warning disable 612, 618 + LuceneVersion.LUCENE_CURRENT +#pragma warning restore 612, 618 + , true), "优素福拉扎吉拉尼", + new String[] { "优", "素", "福", "拉", "扎", "吉", "拉", "尼" }); + } + + [Test] + public void TestOffsets() + { + AssertAnalyzesTo(new SmartChineseAnalyzer( +#pragma warning disable 612, 618 + LuceneVersion.LUCENE_CURRENT +#pragma warning restore 612, 618 + , true), "我购买了道具和服装", + new String[] { "我", "购买", "了", "道具", "和", "服装" }, + new int[] { 0, 1, 3, 4, 6, 7 }, + new int[] { 1, 3, 4, 6, 7, 9 }); + } + + [Test] + public void TestReusableTokenStream() + { + Analyzer a = new SmartChineseAnalyzer( +#pragma warning disable 612, 618 + LuceneVersion.LUCENE_CURRENT); +#pragma warning restore 612, 618 + AssertAnalyzesTo(a, "我购买 Tests 了道具和服装", + new String[] { "我", "购买", "test", "了", "道具", "和", "服装" }, + new int[] { 0, 1, 4, 10, 11, 13, 14 }, + new int[] { 1, 3, 9, 11, 13, 14, 16 }); + AssertAnalyzesTo(a, "我购买了道具和服装。", + new String[] { "我", "购买", "了", "道具", "和", "服装" }, + new int[] { 0, 1, 3, 4, 6, 7 }, + new int[] { 1, 3, 4, 
6, 7, 9 }); + } + + // LUCENE-3026 + [Test] + public void TestLargeDocument() + { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < 5000; i++) + { + sb.append("我购买了道具和服装。"); + } + Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT); + TokenStream stream = analyzer.GetTokenStream("", sb.toString()); + try + { + stream.Reset(); + while (stream.IncrementToken()) + { + } + stream.End(); + } + finally + { + IOUtils.DisposeWhileHandlingException(stream); + } + } + + // LUCENE-3026 + [Test] + public void TestLargeSentence() + { + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < 5000; i++) + { + sb.append("我购买了道具和服装"); + } + Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT); + TokenStream stream = analyzer.GetTokenStream("", sb.toString()); + try + { + stream.Reset(); + while (stream.IncrementToken()) + { + } + stream.End(); + } + finally + { + IOUtils.DisposeWhileHandlingException(stream); + } + } + + // LUCENE-3642 + [Test] + public void TestInvalidOffset() + { + Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => + { + Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + TokenFilter filters = new ASCIIFoldingFilter(tokenizer); +#pragma warning disable 612, 618 + filters = new WordTokenFilter(filters); +#pragma warning restore 612, 618 + return new TokenStreamComponents(tokenizer, filters); + }); + + AssertAnalyzesTo(analyzer, "mosfellsbær", + new string[] { "mosfellsbaer" }, + new int[] { 0 }, + new int[] { 11 }); + } + + /** blast some random strings through the analyzer */ + [Test] + public void TestRandomStrings() + { + CheckRandomData(Random(), new SmartChineseAnalyzer(TEST_VERSION_CURRENT), 1000 * RANDOM_MULTIPLIER); + } + + /** blast some random large strings through the analyzer */ + [Test] + public void TestRandomHugeStrings() + { + Random random = Random(); + CheckRandomData(random, new 
SmartChineseAnalyzer(TEST_VERSION_CURRENT), 100 * RANDOM_MULTIPLIER, 8192); + } + + [Test] + public void TestEmptyTerm() + { + Random random = Random(); + Analyzer a = Analyzer.NewAnonymous(createComponents: (fieldName, reader) => + { + Tokenizer tokenizer = new KeywordTokenizer(reader); +#pragma warning disable 612, 618 + return new TokenStreamComponents(tokenizer, new WordTokenFilter(tokenizer)); +#pragma warning restore 612, 618 + }); + CheckAnalysisConsistency(random, a, random.nextBoolean(), ""); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Tests.Analysis.SmartCn/TestSmartChineseFactories.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.SmartCn/TestSmartChineseFactories.cs b/src/Lucene.Net.Tests.Analysis.SmartCn/TestSmartChineseFactories.cs new file mode 100644 index 0000000..4b23ec1 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.SmartCn/TestSmartChineseFactories.cs @@ -0,0 +1,98 @@ +using NUnit.Framework; +using System; +using System.Collections.Generic; +using System.IO; + +namespace Lucene.Net.Analysis.Cn.Smart +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + /// <summary> + /// Tests for <see cref="SmartChineseSentenceTokenizerFactory"/> and + /// <see cref="SmartChineseWordTokenFilterFactory"/> + /// </summary> + [Obsolete] + public class TestSmartChineseFactories : BaseTokenStreamTestCase + { + /// <summary> + /// Test showing the behavior with whitespace + /// </summary> + [Test] + public void TestSimple() + { + TextReader reader = new StringReader("我购买了道具和服装。"); + TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false); + SmartChineseWordTokenFilterFactory factory = new SmartChineseWordTokenFilterFactory(new Dictionary<string, string>()); + stream = factory.Create(stream); + // TODO: fix smart chinese to not emit punctuation tokens + // at the moment: you have to clean up with WDF, or use the stoplist, etc + AssertTokenStreamContents(stream, + new String[] { "我", "购买", "了", "道具", "和", "服装", "," }); + } + + /// <summary> + /// Test showing the behavior with whitespace + /// </summary> + [Test] + public void TestTokenizer() + { + TextReader reader = new StringReader("我购买了道具和服装。我购买了道具和服装。"); + SmartChineseSentenceTokenizerFactory tokenizerFactory = new SmartChineseSentenceTokenizerFactory(new Dictionary<string, string>()); + TokenStream stream = tokenizerFactory.Create(reader); + SmartChineseWordTokenFilterFactory factory = new SmartChineseWordTokenFilterFactory(new Dictionary<string, string>()); + stream = factory.Create(stream); + // TODO: fix smart chinese to not emit punctuation tokens + // at the moment: you have to clean up with WDF, or use the stoplist, etc + AssertTokenStreamContents(stream, + new String[] { "我", "购买", "了", "道具", "和", "服装", ",", + "我", "购买", "了", "道具", "和", "服装", "," + }); + } + + /// <summary> + /// Test that bogus arguments result in exception + /// </summary> + [Test] + public void TestBogusArguments() + { + try + { + new SmartChineseSentenceTokenizerFactory(new Dictionary<string, string>() { + { "bogusArg", 
"bogusValue" } + }); + fail(); + } + catch (ArgumentException expected) + { + assertTrue(expected.Message.Contains("Unknown parameters")); + } + + try + { + new SmartChineseWordTokenFilterFactory(new Dictionary<string, string>() { + { "bogusArg", "bogusValue" } + }); + fail(); + } + catch (ArgumentException expected) + { + assertTrue(expected.Message.Contains("Unknown parameters")); + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Tests.Analysis.SmartCn/project.json ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.SmartCn/project.json b/src/Lucene.Net.Tests.Analysis.SmartCn/project.json new file mode 100644 index 0000000..d7196a3 --- /dev/null +++ b/src/Lucene.Net.Tests.Analysis.SmartCn/project.json @@ -0,0 +1,45 @@ +{ + "version": "4.8.0", + "title": "Lucene.Net.Tests.Analysis.SmartCn", + "buildOptions": { + "compile": { + "includeFiles": [ "../CommonAssemblyInfo.cs" ] + }, + "embed": { + "includeFiles": [ + ] + } + }, + "dependencies": { + "dotnet-test-nunit-teamcity": "3.4.0-beta-3", + "Lucene.Net.Analysis.SmartCn": "4.8.0", + "Lucene.Net.TestFramework": "4.8.0", + "NUnit": "3.5.0" + }, + "testRunner": "nunit-teamcity", + "frameworks": { + "netcoreapp1.0": { + "imports": "dnxcore50", + "buildOptions": { + "debugType": "portable", + "define": [ "NETSTANDARD" ], + "compile": { + "excludeFiles": [ + "Support/TestApiConsistency.cs" + ] + } + } + }, + "net451": { + "buildOptions": { + "debugType": "full", + "define": [ "FEATURE_SERIALIZABLE" ] + } + } + }, + + "runtimes": { + "win7-x86": {}, + "win7-x64": {} + } +}
