[1/4] lucenenet git commit: Ported Lucene.Net.Analysis.SmartCn + tests

nightowl888 Sat, 24 Jun 2017 12:47:34 -0700

Repository: lucenenet
Updated Branches:
  refs/heads/master 468199e3f -> 46613e4c3



http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/WordSegmenter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/WordSegmenter.cs 
b/src/Lucene.Net.Analysis.SmartCn/WordSegmenter.cs
new file mode 100644
index 0000000..fed2b0b
--- /dev/null
+++ b/src/Lucene.Net.Analysis.SmartCn/WordSegmenter.cs
@@ -0,0 +1,89 @@
+ï»¿using Lucene.Net.Analysis.Cn.Smart.HHMM;
+using Lucene.Net.Support;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Analysis.Cn.Smart
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Segment a sentence of Chinese text into words.
+    /// <para/>
+    /// @lucene.experimental
+    /// </summary>
+    internal class WordSegmenter
+    {
+        private readonly HHMMSegmenter hhmmSegmenter = new HHMMSegmenter();
+
+        private readonly SegTokenFilter tokenFilter = new SegTokenFilter();
+
+        /// <summary>
+        /// Segment a sentence into words with <see cref="HHMMSegmenter"/>
+        /// </summary>
+        /// <param name="sentence">input sentence</param>
+        /// <param name="startOffset">start offset of sentence, forwarded to <see cref="ConvertSegToken"/> for each token</param>
+        /// <returns><see cref="IList{T}"/> of <see cref="SegToken"/>; empty when the segmenter yields two tokens or fewer.</returns>
+        public virtual IList<SegToken> SegmentSentence(string sentence, int startOffset)
+        {
+            IList<SegToken> segTokenList = hhmmSegmenter.Process(sentence);
+            // tokens from sentence, excluding WordType.SENTENCE_BEGIN and WordType.SENTENCE_END
+            IList<SegToken> result = Collections.EmptyList<SegToken>();
+
+            if (segTokenList.Count > 2) // if its not an empty sentence
+            {
+                result = segTokenList.SubList(1, segTokenList.Count - 1);
+            }
+
+            foreach (SegToken st in result)
+            {
+                ConvertSegToken(st, sentence, startOffset);
+            }
+
+            return result;
+        }
+
+        /// <summary>
+        /// Process a <see cref="SegToken"/> so that it is ready for indexing.
+        /// </summary>
+        /// <param name="st">input <see cref="SegToken"/>; its offsets are read relative to <paramref name="sentence"/></param>
+        /// <param name="sentence">associated sentence; source of the characters for string and number tokens</param>
+        /// <param name="sentenceStartOffset">offset of the sentence start; added to the token's start and end offsets</param>
+        /// <returns>the <see cref="SegToken"/> returned by the token filter, with both offsets shifted</returns>
+        public virtual SegToken ConvertSegToken(SegToken st, string sentence,
+            int sentenceStartOffset)
+        {
+            switch (st.WordType)
+            {
+                case WordType.STRING:
+                case WordType.NUMBER:
+                case WordType.FULLWIDTH_NUMBER:
+                case WordType.FULLWIDTH_STRING:
+                    st.CharArray = sentence.Substring(st.StartOffset, st.EndOffset - st.StartOffset)
+                        .ToCharArray();
+                    break;
+                default:
+                    break;
+            }
+
+            st = tokenFilter.Filter(st);
+            st.StartOffset += sentenceStartOffset;
+            st.EndOffset += sentenceStartOffset;
+            return st;
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/WordTokenFilter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/WordTokenFilter.cs 
b/src/Lucene.Net.Analysis.SmartCn/WordTokenFilter.cs
new file mode 100644
index 0000000..d7a419d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.SmartCn/WordTokenFilter.cs
@@ -0,0 +1,114 @@
+ï»¿using Lucene.Net.Analysis.Cn.Smart.HHMM;
+using Lucene.Net.Analysis.TokenAttributes;
+using System;
+using System.Collections.Generic;
+
+namespace Lucene.Net.Analysis.Cn.Smart
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// A <see cref="TokenFilter"/> that breaks sentences into words.
+    /// <para/>
+    /// @lucene.experimental
+    /// </summary>
+    [Obsolete("Use HMMChineseTokenizer instead.")]
+    public sealed class WordTokenFilter : TokenFilter
+    {
+        private readonly WordSegmenter wordSegmenter;
+        private IEnumerator<SegToken> tokenIter; // enumerator over tokenBuffer; null until a sentence has been read
+        private IList<SegToken> tokenBuffer; // words of the sentence currently being emitted
+
+        private readonly ICharTermAttribute termAtt;
+        private readonly IOffsetAttribute offsetAtt;
+        private readonly ITypeAttribute typeAtt;
+
+        private int tokStart; // only used if the length changed before this filter
+        private int tokEnd; // only used if the length changed before this filter
+        private bool hasIllegalOffsets; // only if the length changed before this filter
+
+        /// <summary>Construct a new <see cref="WordTokenFilter"/>.</summary>
+        /// <param name="input"><see cref="TokenStream"/> of sentences.</param>
+        public WordTokenFilter(TokenStream input)
+            : base(input)
+        {
+            this.wordSegmenter = new WordSegmenter();
+            this.termAtt = AddAttribute<ICharTermAttribute>();
+            this.offsetAtt = AddAttribute<IOffsetAttribute>();
+            this.typeAtt = AddAttribute<ITypeAttribute>();
+        }
+
+        /// <summary>Emits the next word of the current sentence, segmenting a new sentence from the input when none are left.</summary>
+        public override bool IncrementToken()
+        {
+            if (tokenIter == null || !tokenIter.MoveNext())
+            {
+                // there are no remaining tokens from the current sentence... are there more sentences?
+                if (!m_input.IncrementToken())
+                {
+                    return false; // no more sentences, end of stream!
+                }
+                tokStart = offsetAtt.StartOffset;
+                tokEnd = offsetAtt.EndOffset;
+                // if start offset + term length doesn't match the end offset then assume this is a synonym and don't adjust the offsets.
+                hasIllegalOffsets = (tokStart + termAtt.Length) != tokEnd;
+                // a new sentence is available: process it.
+                tokenBuffer = wordSegmenter.SegmentSentence(termAtt.ToString(), offsetAtt.StartOffset);
+                if (tokenIter != null)
+                {
+                    tokenIter.Dispose(); // IEnumerator<T> is IDisposable: release the exhausted enumerator before replacing it
+                }
+                tokenIter = tokenBuffer.GetEnumerator();
+                // it should not be possible to have a sentence with 0 words, check just in case.
+                // returning EOS isn't the best either, but its the behavior of the original code.
+                if (!tokenIter.MoveNext())
+                {
+                    return false;
+                }
+            }
+
+            // WordTokenFilter must clear attributes, as it is creating new tokens.
+            ClearAttributes();
+            // There are remaining tokens from the current sentence, return the next one.
+            SegToken nextWord = tokenIter.Current;
+
+            termAtt.CopyBuffer(nextWord.CharArray, 0, nextWord.CharArray.Length);
+            if (hasIllegalOffsets)
+            {
+                offsetAtt.SetOffset(tokStart, tokEnd);
+            }
+            else
+            {
+                offsetAtt.SetOffset(nextWord.StartOffset, nextWord.EndOffset);
+            }
+            typeAtt.Type = "word";
+            return true;
+        }
+
+        /// <summary>Calls the base reset and discards the current enumerator, so the next <see cref="IncrementToken"/> reads a new sentence from the input.</summary>
+        public override void Reset()
+        {
+            base.Reset();
+            if (tokenIter != null)
+            {
+                tokenIter.Dispose();
+                tokenIter = null;
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/WordType.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/WordType.cs 
b/src/Lucene.Net.Analysis.SmartCn/WordType.cs
new file mode 100644
index 0000000..0eb4948
--- /dev/null
+++ b/src/Lucene.Net.Analysis.SmartCn/WordType.cs
@@ -0,0 +1,67 @@
+ï»¿namespace Lucene.Net.Analysis.Cn.Smart
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Internal <see cref="SmartChineseAnalyzer"/> token type constants
+    /// <para/>
+    /// @lucene.experimental
+    /// </summary>
+    public enum WordType
+    {
+        /// <summary>
+        /// Start of a Sentence (marker token; WordSegmenter.SegmentSentence leaves it out of its result)
+        /// </summary>
+        SENTENCE_BEGIN = 0,
+
+        /// <summary>
+        /// End of a Sentence (marker token; WordSegmenter.SegmentSentence leaves it out of its result)
+        /// </summary>
+        SENTENCE_END = 1,
+
+        /// <summary>
+        /// Chinese Word 
+        /// </summary>
+        CHINESE_WORD = 2,
+
+        /// <summary>
+        /// ASCII String (WordSegmenter.ConvertSegToken re-reads this token's characters from the sentence text)
+        /// </summary>
+        STRING = 3,
+
+        /// <summary>
+        /// ASCII Alphanumeric (WordSegmenter.ConvertSegToken re-reads this token's characters from the sentence text)
+        /// </summary>
+        NUMBER = 4,
+
+        /// <summary>
+        /// Punctuation Symbol
+        /// </summary>
+        DELIMITER = 5,
+
+        /// <summary>
+        /// Full-Width String (WordSegmenter.ConvertSegToken re-reads this token's characters from the sentence text)
+        /// </summary>
+        FULLWIDTH_STRING = 6,
+
+        /// <summary>
+        /// Full-Width Alphanumeric (WordSegmenter.ConvertSegToken re-reads this token's characters from the sentence text)
+        /// </summary>
+        FULLWIDTH_NUMBER = 7
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/project.json
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/project.json 
b/src/Lucene.Net.Analysis.SmartCn/project.json
new file mode 100644
index 0000000..0fa0612
--- /dev/null
+++ b/src/Lucene.Net.Analysis.SmartCn/project.json
@@ -0,0 +1,53 @@
+{
+  "version": "4.8.0",
+  "title": "Lucene.Net.Analysis.SmartCn",
+  "description": "Analyzer for indexing Chinese for the Lucene.Net full-text search engine library from The Apache Software Foundation.",
+  "authors": [ "The Apache Software Foundation" ],
+  "packOptions": {
+    "projectUrl": "http://lucenenet.apache.org/",
+    "licenseUrl": "https://github.com/apache/lucenenet/blob/master/LICENSE.txt",
+    "iconUrl": "https://github.com/apache/lucenenet/blob/master/branding/logo/lucene-net-icon-128x128.png?raw=true",
+    "owners": [ "The Apache Software Foundation" ],
+    "repository": { "url": "https://github.com/apache/lucenenet" },
+    "tags": [ "lucene.net", "core", "text", "search", "information", "retrieval", "lucene", "apache", "analysis", "index", "query" ],
+    "releaseNotes": "This package depends on a temporary version of icu.net hosted on MyGet until official .NET Core support is added. To install, copy the NuGet.config file from https://github.com/apache/lucenenet/blob/master/NuGet.config into your project and then install this package via Package Manager Console as usual."
+  },
+  "buildOptions": {
+  "compile": {
+      "includeFiles": [ "../CommonAssemblyInfo.cs" ]
+    },
+    "embed": {
+      "includeFiles": [
+        "stopwords.txt",
+        "HHMM/bigramdict.mem",
+        "HHMM/coredict.mem"
+      ]
+    },
+    "nowarn": [ "1591", "1573" ]
+  },
+  "dependencies": {
+    "icu.net": "54.1.1-alpha",
+    "Lucene.Net": "4.8.0",
+    "Lucene.Net.Analysis.Common": "4.8.0",
+    "Lucene.Net.ICU": "4.8.0"
+  },
+  "frameworks": {
+    "netstandard1.5": {
+      "imports": "dnxcore50",
+      "buildOptions": {
+        "debugType": "portable",
+        "define": [ "NETSTANDARD" ]
+      },
+      "dependencies": {
+        "NETStandard.Library": "1.6.0",
+        "System.Text.Encoding.CodePages": "4.4.0-preview1-25305-02"
+      }
+    },
+    "net451": {
+      "buildOptions": {
+        "debugType": "full",
+        "define": [ "FEATURE_SERIALIZABLE" ]
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/stopwords.txt
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/stopwords.txt 
b/src/Lucene.Net.Analysis.SmartCn/stopwords.txt
new file mode 100644
index 0000000..fb0d71a
--- /dev/null
+++ b/src/Lucene.Net.Analysis.SmartCn/stopwords.txt
@@ -0,0 +1,59 @@
+////////// Punctuation tokens to remove ////////////////
+,
+.
+`
+-
+_
+=
+?
+'
+|
+"
+(
+)
+{
+}
+[
+]
+<
+>
+*
+#
+&
+^
+$
+@
+!
+~
+:
+;
++
+/
+\
+《
+》
+—
+－
+，
+。
+、
+：
+；
+！
+·
+？
+“
+”
+）
+（
+【
+】
+［
+］
+●
+// the line below contains an IDEOGRAPHIC SPACE character (Used as a space in Chinese)
+　
+
+//////////////// English Stop Words ////////////////
+
+//////////////// Chinese Stop Words ////////////////

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.csproj
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.csproj
 
b/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.csproj
new file mode 100644
index 0000000..cc14e95
--- /dev/null
+++ 
b/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.csproj
@@ -0,0 +1,105 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!--
+
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+-->
+<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <Import 
Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props"
 
Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')"
 />
+  <PropertyGroup>
+    <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+    <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+    <ProjectGuid>{8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}</ProjectGuid>
+    <OutputType>Library</OutputType>
+    <AppDesignerFolder>Properties</AppDesignerFolder>
+    <RootNamespace>Lucene.Net.Analysis.Cn.Smart</RootNamespace>
+    <AssemblyName>Lucene.Net.Tests.Analysis.SmartCn</AssemblyName>
+    <TargetFrameworkVersion>v4.5.1</TargetFrameworkVersion>
+    <FileAlignment>512</FileAlignment>
+  </PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' 
">
+    <DebugSymbols>true</DebugSymbols>
+    <DebugType>full</DebugType>
+    <Optimize>false</Optimize>
+    <OutputPath>bin\Debug\</OutputPath>
+    <DefineConstants>DEBUG;TRACE</DefineConstants>
+    <ErrorReport>prompt</ErrorReport>
+    <WarningLevel>4</WarningLevel>
+  </PropertyGroup>
+  <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 
'Release|AnyCPU' ">
+    <DebugType>pdbonly</DebugType>
+    <Optimize>true</Optimize>
+    <OutputPath>bin\Release\</OutputPath>
+    <DefineConstants>TRACE</DefineConstants>
+    <ErrorReport>prompt</ErrorReport>
+    <WarningLevel>4</WarningLevel>
+  </PropertyGroup>
+  <PropertyGroup>
+    <DefineConstants>$(DefineConstants);FEATURE_SERIALIZABLE</DefineConstants>
+  </PropertyGroup>
+  <ItemGroup>
+    <Reference Include="System" />
+    <Reference Include="System.Core" />
+    <Reference Include="System.Xml.Linq" />
+    <Reference Include="System.Data.DataSetExtensions" />
+    <Reference Include="Microsoft.CSharp" />
+    <Reference Include="System.Data" />
+    <Reference Include="System.Net.Http" />
+    <Reference Include="System.Xml" />
+  </ItemGroup>
+  <ItemGroup>
+    <Compile Include="Properties\AssemblyInfo.cs" />
+    <Compile Include="Support\TestApiConsistency.cs" />
+    <Compile Include="Support\TestExceptionSerialization.cs" />
+    <Compile Include="TestHMMChineseTokenizerFactory.cs" />
+    <Compile Include="TestSmartChineseAnalyzer.cs" />
+    <Compile Include="TestSmartChineseFactories.cs" />
+    <Compile Include="..\CommonAssemblyInfo.cs">
+      <Link>Properties\CommonAssemblyInfo.cs</Link>
+    </Compile>
+  </ItemGroup>
+  <ItemGroup>
+    <ProjectReference 
Include="..\Lucene.Net.Analysis.Common\Lucene.Net.Analysis.Common.csproj">
+      <Project>{4ADD0BBC-B900-4715-9526-D871DE8EEA64}</Project>
+      <Name>Lucene.Net.Analysis.Common</Name>
+    </ProjectReference>
+    <ProjectReference 
Include="..\Lucene.Net.Analysis.SmartCn\Lucene.Net.Analysis.SmartCn.csproj">
+      <Project>{DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}</Project>
+      <Name>Lucene.Net.Analysis.SmartCn</Name>
+    </ProjectReference>
+    <ProjectReference 
Include="..\Lucene.Net.TestFramework\Lucene.Net.TestFramework.csproj">
+      <Project>{B2C0D749-CE34-4F62-A15E-00CB2FF5DDB3}</Project>
+      <Name>Lucene.Net.TestFramework</Name>
+    </ProjectReference>
+    <ProjectReference Include="..\Lucene.Net\Lucene.Net.csproj">
+      <Project>{5d4ad9be-1ffb-41ab-9943-25737971bf57}</Project>
+      <Name>Lucene.Net</Name>
+    </ProjectReference>
+  </ItemGroup>
+  <ItemGroup>
+    <Service Include="{82A7F48D-3B50-4B1E-B82E-3ADA8210C358}" />
+  </ItemGroup>
+  <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
+  <!-- To modify your build process, add your task inside one of the targets 
below and uncomment it. 
+       Other similar extension points exist, see Microsoft.Common.targets.
+  <Target Name="BeforeBuild">
+  </Target>
+  <Target Name="AfterBuild">
+  </Target>
+  -->
+</Project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.project.json
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.project.json
 
b/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.project.json
new file mode 100644
index 0000000..8c631ab
--- /dev/null
+++ 
b/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.project.json
@@ -0,0 +1,11 @@
+{
+  "runtimes": {
+    "win": {}
+  },
+  "dependencies": {
+    "NUnit": "3.5.0"
+  },
+  "frameworks": {
+    "net451": {}
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.xproj
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.xproj 
b/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.xproj
new file mode 100644
index 0000000..79885d1
--- /dev/null
+++ 
b/src/Lucene.Net.Tests.Analysis.SmartCn/Lucene.Net.Tests.Analysis.SmartCn.xproj
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="utf-8"?>
+<!--
+
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements.  See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership.  The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied.  See the License for the
+ specific language governing permissions and limitations
+ under the License.
+
+-->
+
+<Project ToolsVersion="14.0.25420" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <PropertyGroup>
+    <VisualStudioVersion Condition="'$(VisualStudioVersion)' == 
''">14.0.25420</VisualStudioVersion>
+    <VSToolsPath Condition="'$(VSToolsPath)' == 
''">$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)</VSToolsPath>
+  </PropertyGroup>
+  <Import Project="$(VSToolsPath)\DotNet\Microsoft.DotNet.Props" 
Condition="'$(VSToolsPath)' != ''" />
+  <PropertyGroup Label="Globals">
+    <ProjectGuid>2870fb52-1239-493f-a0be-951660194a66</ProjectGuid>
+    <RootNamespace>Lucene.Net.Analysis.Cn.Smart</RootNamespace>
+    <BaseIntermediateOutputPath Condition="'$(BaseIntermediateOutputPath)'=='' 
">.\obj</BaseIntermediateOutputPath>
+    <OutputPath Condition="'$(OutputPath)'=='' ">.\bin\</OutputPath>
+  </PropertyGroup>
+  <PropertyGroup>
+    <SchemaVersion>2.0</SchemaVersion>
+  </PropertyGroup>
+  <ItemGroup>
+    <Service Include="{82a7f48d-3b50-4b1e-b82e-3ada8210c358}" />
+  </ItemGroup>
+  <Import Project="$(VSToolsPath)\DotNet\Microsoft.DotNet.targets" 
Condition="'$(VSToolsPath)' != ''" />
+</Project>
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Tests.Analysis.SmartCn/Properties/AssemblyInfo.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.SmartCn/Properties/AssemblyInfo.cs 
b/src/Lucene.Net.Tests.Analysis.SmartCn/Properties/AssemblyInfo.cs
new file mode 100644
index 0000000..b3135b2
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.SmartCn/Properties/AssemblyInfo.cs
@@ -0,0 +1,37 @@
+ï»¿/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+using System.Reflection;
+using System.Runtime.InteropServices;
+
+// General Information about an assembly is controlled through the following 
+// set of attributes. Change these attribute values to modify the information
+// associated with an assembly.
+[assembly: AssemblyTitle("Lucene.Net.Tests.Analysis.SmartCn")]
+[assembly: AssemblyDescription("")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyCulture("")]
+
+// Setting ComVisible to false makes the types in this assembly not visible 
+// to COM components.  If you need to access a type in this assembly from 
+// COM, set the ComVisible attribute to true on that type.
+[assembly: ComVisible(false)]
+
+// The following GUID is for the ID of the typelib if this project is exposed to COM
+[assembly: Guid("8c8d78d3-bffd-4301-953b-fe5350b2aeeb")]
+
+// NOTE: Version information is in CommonAssemblyInfo.cs

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Tests.Analysis.SmartCn/Support/TestApiConsistency.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Tests.Analysis.SmartCn/Support/TestApiConsistency.cs 
b/src/Lucene.Net.Tests.Analysis.SmartCn/Support/TestApiConsistency.cs
new file mode 100644
index 0000000..0943448
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.SmartCn/Support/TestApiConsistency.cs
@@ -0,0 +1,148 @@
+ï»¿/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+*/
+
+using Lucene.Net.Attributes;
+using Lucene.Net.Support;
+using NUnit.Framework;
+using System;
+
+namespace Lucene.Net.Analysis.Cn.Smart.Support
+{
+    /// <summary>
+    /// LUCENENET specific tests for ensuring API conventions are followed. Each override only forwards its test-case type to <see cref="ApiScanTestBase"/>.
+    /// </summary>
+    public class TestApiConsistency : ApiScanTestBase
+    {
+        [Test, LuceneNetSpecific]
+        [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))]
+        public override void TestProtectedFieldNames(Type typeFromTargetAssembly)
+        {
+            base.TestProtectedFieldNames(typeFromTargetAssembly);
+        }
+
+        [Test, LuceneNetSpecific]
+        [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))]
+        public override void TestPrivateFieldNames(Type typeFromTargetAssembly)
+        {
+            base.TestPrivateFieldNames(typeFromTargetAssembly);
+        }
+
+        [Test, LuceneNetSpecific]
+        [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))]
+        public override void TestPublicFields(Type typeFromTargetAssembly)
+        {
+            base.TestPublicFields(typeFromTargetAssembly);
+        }
+
+        [Test, LuceneNetSpecific]
+        [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))]
+        public override void TestMethodParameterNames(Type typeFromTargetAssembly)
+        {
+            base.TestMethodParameterNames(typeFromTargetAssembly);
+        }
+
+        [Test, LuceneNetSpecific]
+        [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))]
+        public override void TestInterfaceNames(Type typeFromTargetAssembly)
+        {
+            base.TestInterfaceNames(typeFromTargetAssembly);
+        }
+
+        [Test, LuceneNetSpecific]
+        [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))]
+        public override void TestClassNames(Type typeFromTargetAssembly)
+        {
+            base.TestClassNames(typeFromTargetAssembly);
+        }
+
+        [Test, LuceneNetSpecific]
+        [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))]
+        public override void TestForPropertiesWithNoGetter(Type typeFromTargetAssembly)
+        {
+            base.TestForPropertiesWithNoGetter(typeFromTargetAssembly);
+        }
+
+        [Test, LuceneNetSpecific]
+        [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))]
+        public override void TestForPropertiesThatReturnArray(Type typeFromTargetAssembly)
+        {
+            base.TestForPropertiesThatReturnArray(typeFromTargetAssembly);
+        }
+
+        [Test, LuceneNetSpecific]
+        [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))]
+        public override void TestForMethodsThatReturnWritableArray(Type typeFromTargetAssembly)
+        {
+            base.TestForMethodsThatReturnWritableArray(typeFromTargetAssembly);
+        }
+
+        [Test, LuceneNetSpecific]
+        [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))]
+        public override void TestForPublicMembersContainingComparer(Type typeFromTargetAssembly)
+        {
+            base.TestForPublicMembersContainingComparer(typeFromTargetAssembly);
+        }
+
+        [Test, LuceneNetSpecific]
+        [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))]
+        public override void TestForPublicMembersNamedSize(Type typeFromTargetAssembly)
+        {
+            base.TestForPublicMembersNamedSize(typeFromTargetAssembly);
+        }
+
+        [Test, LuceneNetSpecific]
+        [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))]
+        public override void TestForPublicMembersContainingNonNetNumeric(Type typeFromTargetAssembly)
+        {
+            base.TestForPublicMembersContainingNonNetNumeric(typeFromTargetAssembly);
+        }
+
+        [Test, LuceneNetSpecific]
+        [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))]
+        public override void TestForTypesContainingNonNetNumeric(Type typeFromTargetAssembly)
+        {
+            base.TestForTypesContainingNonNetNumeric(typeFromTargetAssembly);
+        }
+
+        [Test, LuceneNetSpecific]
+        [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))]
+        public override void TestForPublicMembersWithNullableEnum(Type typeFromTargetAssembly)
+        {
+            base.TestForPublicMembersWithNullableEnum(typeFromTargetAssembly);
+        }
+
+        // LUCENENET NOTE: This test is only for identifying members who were changed from
+        // ICollection, IList or ISet to IEnumerable during the port (that should be changed back)
+        //[Test, LuceneNetSpecific]
+        //[TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))]
+        //public override void TestForMembersAcceptingOrReturningIEnumerable(Type typeFromTargetAssembly)
+        //{
+        //    base.TestForMembersAcceptingOrReturningIEnumerable(typeFromTargetAssembly);
+        //}
+
+        [Test, LuceneNetSpecific]
+        [TestCase(typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile))]
+        public override void TestForMembersAcceptingOrReturningListOrDictionary(Type typeFromTargetAssembly)
+        {
+            base.TestForMembersAcceptingOrReturningListOrDictionary(typeFromTargetAssembly);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Tests.Analysis.SmartCn/Support/TestExceptionSerialization.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Tests.Analysis.SmartCn/Support/TestExceptionSerialization.cs 
b/src/Lucene.Net.Tests.Analysis.SmartCn/Support/TestExceptionSerialization.cs
new file mode 100644
index 0000000..8c8d6a2
--- /dev/null
+++ 
b/src/Lucene.Net.Tests.Analysis.SmartCn/Support/TestExceptionSerialization.cs
@@ -0,0 +1,54 @@
+#if FEATURE_SERIALIZABLE
+using Lucene.Net.Attributes;
+using NUnit.Framework;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace Lucene.Net.Support
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Checks that the exception types found in the assembly that declares
+    /// <c>Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile</c> can be serialized.
+    /// </summary>
+    [TestFixture]
+    public class TestExceptionSerialization : ExceptionSerializationTestBase
+    {
+        /// <summary>
+        /// Value source for <see cref="AllExceptionsInLuceneNamespaceCanSerialize(Type)"/>:
+        /// every type in the target assembly that is assignable to <see cref="Exception"/>,
+        /// or <see cref="Exception"/> itself when the assembly declares none.
+        /// </summary>
+        public static IEnumerable<object> ExceptionTestData
+        {
+            get
+            {
+                // Materialize once: the query would otherwise be re-run (a second
+                // reflection scan) by the emptiness check and again by the consumer.
+                var exceptionTypes = typeof(Lucene.Net.Analysis.Cn.Smart.AnalyzerProfile).Assembly.GetTypes()
+                    .Where(t => typeof(Exception).IsAssignableFrom(t))
+                    .Cast<object>()
+                    .ToArray();
+
+                // If the assembly has no exceptions, just provide Exception so the test will pass
+                if (exceptionTypes.Length == 0)
+                {
+                    return new Type[] { typeof(Exception) };
+                }
+
+                return exceptionTypes;
+            }
+        }
+
+        /// <summary>
+        /// Instantiates the given exception type via <c>TryInstantiate</c> and asserts
+        /// that <c>TypeCanSerialize</c> reports success for the instance.
+        /// </summary>
+        /// <param name="luceneException">Exception type drawn from <see cref="ExceptionTestData"/>.</param>
+        [Test, LuceneNetSpecific]
+        public void AllExceptionsInLuceneNamespaceCanSerialize([ValueSource("ExceptionTestData")]Type luceneException)
+        {
+            var instance = TryInstantiate(luceneException);
+            Assert.That(TypeCanSerialize(instance), string.Format("Unable to serialize {0}", luceneException.FullName));
+        }
+    }
+}
+#endif
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Tests.Analysis.SmartCn/TestHMMChineseTokenizerFactory.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Tests.Analysis.SmartCn/TestHMMChineseTokenizerFactory.cs 
b/src/Lucene.Net.Tests.Analysis.SmartCn/TestHMMChineseTokenizerFactory.cs
new file mode 100644
index 0000000..264ab38
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.SmartCn/TestHMMChineseTokenizerFactory.cs
@@ -0,0 +1,72 @@
+using Lucene.Net.Analysis.Util;
+using NUnit.Framework;
+using System;
+using System.Collections.Generic;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Cn.Smart
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Tests for <see cref="HMMChineseTokenizerFactory"/>
+    /// </summary>
+    public class TestHMMChineseTokenizerFactory : BaseTokenStreamTestCase
+    {
+        /// <summary>
+        /// Smoke test: constructing an <c>HHMMSegmenter</c> must complete without throwing.
+        /// </summary>
+        [Test]
+        public void TestHHMMSegmenter()
+        {
+            var x = new HHMM.HHMMSegmenter();
+        }
+
+        /// <summary>
+        /// Test showing the behavior
+        /// </summary>
+        [Test]
+        public void TestSimple()
+        {
+            TextReader reader = new StringReader("我购买了道具和服装。");
+            TokenizerFactory factory = new HMMChineseTokenizerFactory(new Dictionary<string, string>());
+            Tokenizer tokenizer = factory.Create(reader);
+            tokenizer.SetReader(reader);
+            // TODO: fix smart chinese to not emit punctuation tokens
+            // at the moment: you have to clean up with WDF, or use the stoplist, etc
+            AssertTokenStreamContents(tokenizer,
+               new String[] { "我", "购买", "了", "道具", "和", "服装", "," });
+        }
+
+        /// <summary>
+        /// Test that bogus arguments result in exception
+        /// </summary>
+        [Test]
+        public void TestBogusArguments()
+        {
+            try
+            {
+                new HMMChineseTokenizerFactory(new Dictionary<string, string>() {
+                    { "bogusArg", "bogusValue" }
+                });
+                // Reaching this line means the constructor accepted the unknown argument.
+                fail();
+            }
+            catch (ArgumentException expected)
+            {
+                assertTrue(expected.Message.Contains("Unknown parameters"));
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Tests.Analysis.SmartCn/TestSmartChineseAnalyzer.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.SmartCn/TestSmartChineseAnalyzer.cs 
b/src/Lucene.Net.Tests.Analysis.SmartCn/TestSmartChineseAnalyzer.cs
new file mode 100644
index 0000000..bc4f3aa
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.SmartCn/TestSmartChineseAnalyzer.cs
@@ -0,0 +1,354 @@
+using Lucene.Net.Analysis.Core;
+using Lucene.Net.Analysis.Miscellaneous;
+using Lucene.Net.Util;
+using NUnit.Framework;
+using System;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Cn.Smart
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    public class TestSmartChineseAnalyzer : BaseTokenStreamTestCase
+    {
+        /*
+         * The analyzer built with its default stopwords and the analyzer given
+         * SmartChineseAnalyzer.GetDefaultStopSet() explicitly must produce the
+         * same tokens for the same sentence.
+         */
+        [Test]
+        public void TestChineseStopWordsDefault()
+        {
+#pragma warning disable 612, 618
+            Analyzer ca = new SmartChineseAnalyzer(LuceneVersion.LUCENE_CURRENT); /* will load stopwords */
+#pragma warning restore 612, 618
+            String sentence = "我购买了道具和服装。";
+            String[] result = { "我", "购买", "了", "道具", "和", "服装" };
+            AssertAnalyzesTo(ca, sentence, result);
+            // set stop-words from the outer world - must yield same behavior
+            ca = new SmartChineseAnalyzer(
+#pragma warning disable 612, 618
+                LuceneVersion.LUCENE_CURRENT,
+#pragma warning restore 612, 618
+                SmartChineseAnalyzer.GetDefaultStopSet());
+            AssertAnalyzesTo(ca, sentence, result);
+        }
+
+        /*
+         * This test is the same as TestChineseStopWordsDefault, except with two phrases.
+         * This tests to ensure the SentenceTokenizer->WordTokenFilter chain works correctly.
+         */
+        [Test]
+        public void TestChineseStopWordsDefaultTwoPhrases()
+        {
+#pragma warning disable 612, 618
+            Analyzer ca = new SmartChineseAnalyzer(LuceneVersion.LUCENE_CURRENT); /* will load stopwords */
+#pragma warning restore 612, 618
+            String sentence = "我购买了道具和服装。 我购买了道具和服装。";
+            String[] result = { "我", "购买", "了", "道具", "和", "服装", "我", "购买", "了", "道具", "和", "服装" };
+            AssertAnalyzesTo(ca, sentence, result);
+        }
+
+        /*
+         * This test is the same as TestChineseStopWordsDefaultTwoPhrases, except using an
+         * ideographic space as a separator.
+         * This tests to ensure the stopwords are working correctly.
+         */
+        [Test]
+        public void TestChineseStopWordsDefaultTwoPhrasesIdeoSpace()
+        {
+#pragma warning disable 612, 618
+            Analyzer ca = new SmartChineseAnalyzer(LuceneVersion.LUCENE_CURRENT); /* will load stopwords */
+#pragma warning restore 612, 618
+            // \u3000 is IDEOGRAPHIC SPACE; written as an escape so the separator stays visible.
+            String sentence = "我购买了道具和服装\u3000我购买了道具和服装。";
+            String[] result = { "我", "购买", "了", "道具", "和", "服装", "我", "购买", "了", "道具", "和", "服装" };
+            AssertAnalyzesTo(ca, sentence, result);
+        }
+
+        /*
+         * Punctuation is handled in a strange way if you disable stopwords
+         * In this example the IDEOGRAPHIC FULL STOP is converted into a comma.
+         * if you don't supply (true) to the constructor, or use a different stopwords list,
+         * then punctuation is indexed.
+         */
+        [Test]
+        public void TestChineseStopWordsOff()
+        {
+            Analyzer[] analyzers = new Analyzer[] {
+#pragma warning disable 612, 618
+                new SmartChineseAnalyzer(LuceneVersion.LUCENE_CURRENT, false), /* doesn't load stopwords */
+                new SmartChineseAnalyzer(LuceneVersion.LUCENE_CURRENT, null) /* sets stopwords to empty set */
+            };
+#pragma warning restore 612, 618
+            String sentence = "我购买了道具和服装。";
+            String[] result = { "我", "购买", "了", "道具", "和", "服装", "," };
+            foreach (Analyzer analyzer in analyzers)
+            {
+                // The assertion is deliberately issued twice per analyzer
+                // (presumably to cover a second use of the same instance).
+                AssertAnalyzesTo(analyzer, sentence, result);
+                AssertAnalyzesTo(analyzer, sentence, result);
+            }
+        }
+
+        /*
+         * Verifies the position increments reported after a stopword has been
+         * removed, when the stop filter is configured with enablePositionIncrements.
+         */
+        [Test]
+        public void TestChineseStopWords2()
+        {
+#pragma warning disable 612, 618
+            Analyzer analyzer = new SmartChineseAnalyzer(LuceneVersion.LUCENE_CURRENT); /* will load stopwords */
+#pragma warning restore 612, 618
+            AssertAnalyzesTo(
+                analyzer,
+                "Title:San", // : is a stopword
+                new String[] { "titl", "san" },
+                new int[] { 0, 6 },  // start offsets
+                new int[] { 5, 9 },  // end offsets
+                new int[] { 1, 2 }); // position increments
+        }
+
+        /*
+         * Analyzes a single sentence with the analyzer constructed with (true);
+         * the trailing IDEOGRAPHIC FULL STOP is not expected in the output.
+         */
+        [Test]
+        public void TestChineseAnalyzer()
+        {
+#pragma warning disable 612, 618
+            Analyzer ca = new SmartChineseAnalyzer(LuceneVersion.LUCENE_CURRENT, true);
+#pragma warning restore 612, 618
+            String sentence = "我购买了道具和服装。";
+            String[] result = { "我", "购买", "了", "道具", "和", "服装" };
+            AssertAnalyzesTo(ca, sentence, result);
+        }
+
+        /*
+         * English words are lowercased and porter-stemmed.
+         */
+        [Test]
+        public void TestMixedLatinChinese()
+        {
+            AssertAnalyzesTo(new SmartChineseAnalyzer(
+#pragma warning disable 612, 618
+                LuceneVersion.LUCENE_CURRENT
+#pragma warning restore 612, 618
+                , true), "我购买 Tests 了道具和服装",
+                new String[] { "我", "购买", "test", "了", "道具", "和", "服装" });
+        }
+
+        /*
+         * Numerics are parsed as their own tokens
+         */
+        [Test]
+        public void TestNumerics()
+        {
+            AssertAnalyzesTo(new SmartChineseAnalyzer(
+#pragma warning disable 612, 618
+                LuceneVersion.LUCENE_CURRENT
+#pragma warning restore 612, 618
+                , true), "我购买 Tests 了道具和服装1234",
+              new String[] { "我", "购买", "test", "了", "道具", "和", "服装", "1234" });
+        }
+
+        /*
+         * Full width alphas and numerics are folded to half-width
+         */
+        [Test]
+        public void TestFullWidth()
+        {
+            AssertAnalyzesTo(new SmartChineseAnalyzer(
+#pragma warning disable 612, 618
+                LuceneVersion.LUCENE_CURRENT
+#pragma warning restore 612, 618
+                , true), "我购买 Ｔｅｓｔｓ 了道具和服装１２３４",
+                new String[] { "我", "购买", "test", "了", "道具", "和", "服装", "1234" });
+        }
+
+        /*
+         * Presentation form delimiters are removed
+         */
+        [Test]
+        public void TestDelimiters()
+        {
+            AssertAnalyzesTo(new SmartChineseAnalyzer(
+#pragma warning disable 612, 618
+                LuceneVersion.LUCENE_CURRENT
+#pragma warning restore 612, 618
+                , true), "我购买︱ Tests 了道具和服装",
+                new String[] { "我", "购买", "test", "了", "道具", "和", "服装" });
+        }
+
+        /*
+         * Text from writing systems other than Chinese and Latin are parsed as individual characters.
+         * (regardless of Unicode category)
+         */
+        [Test]
+        public void TestNonChinese()
+        {
+            AssertAnalyzesTo(new SmartChineseAnalyzer(
+#pragma warning disable 612, 618
+                LuceneVersion.LUCENE_CURRENT
+#pragma warning restore 612, 618
+                , true), "我购买 روبرتTests 了道具和服装",
+                new String[] { "我", "购买", "ر", "و", "ب", "ر", "ت", "test", "了", "道具", "和", "服装" });
+        }
+
+        /*
+         * Test what the analyzer does with out-of-vocabulary words.
+         * In this case the name is Yousaf Raza Gillani.
+         * Currently it is being analyzed into single characters...
+         */
+        [Test]
+        public void TestOOV()
+        {
+            // Name written with MIDDLE DOT separators between its parts.
+            AssertAnalyzesTo(new SmartChineseAnalyzer(
+#pragma warning disable 612, 618
+                LuceneVersion.LUCENE_CURRENT
+#pragma warning restore 612, 618
+                , true), "优素福·拉扎·吉拉尼",
+              new String[] { "优", "素", "福", "拉", "扎", "吉", "拉", "尼" });
+
+
+            // Same name without separators; the expected tokens are identical.
+            AssertAnalyzesTo(new SmartChineseAnalyzer(
+#pragma warning disable 612, 618
+                LuceneVersion.LUCENE_CURRENT
+#pragma warning restore 612, 618
+                , true), "优素福拉扎吉拉尼",
+              new String[] { "优", "素", "福", "拉", "扎", "吉", "拉", "尼" });
+        }
+
+        /*
+         * Checks the start and end offsets reported for each token of a sentence.
+         */
+        [Test]
+        public void TestOffsets()
+        {
+            AssertAnalyzesTo(new SmartChineseAnalyzer(
+#pragma warning disable 612, 618
+                LuceneVersion.LUCENE_CURRENT
+#pragma warning restore 612, 618
+                , true), "我购买了道具和服装",
+                new String[] { "我", "购买", "了", "道具", "和", "服装" },
+                new int[] { 0, 1, 3, 4, 6, 7 },
+                new int[] { 1, 3, 4, 6, 7, 9 });
+        }
+
+        /*
+         * Runs two different inputs through the same analyzer instance and
+         * checks the tokens and offsets produced for each.
+         */
+        [Test]
+        public void TestReusableTokenStream()
+        {
+            Analyzer a = new SmartChineseAnalyzer(
+#pragma warning disable 612, 618
+                LuceneVersion.LUCENE_CURRENT);
+#pragma warning restore 612, 618
+            AssertAnalyzesTo(a, "我购买 Tests 了道具和服装",
+                new String[] { "我", "购买", "test", "了", "道具", "和", "服装" },
+                new int[] { 0, 1, 4, 10, 11, 13, 14 },
+                new int[] { 1, 3, 9, 11, 13, 14, 16 });
+            AssertAnalyzesTo(a, "我购买了道具和服装。",
+                new String[] { "我", "购买", "了", "道具", "和", "服装" },
+                new int[] { 0, 1, 3, 4, 6, 7 },
+                new int[] { 1, 3, 4, 6, 7, 9 });
+        }
+
+        // LUCENE-3026
+        // Consumes the token stream for a document made of 5000 repetitions of a
+        // full-stop-terminated sentence; the test only requires that this completes.
+        [Test]
+        public void TestLargeDocument()
+        {
+            StringBuilder sb = new StringBuilder();
+            for (int i = 0; i < 5000; i++)
+            {
+                sb.Append("我购买了道具和服装。");
+            }
+            Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT);
+            TokenStream stream = analyzer.GetTokenStream("", sb.ToString());
+            try
+            {
+                stream.Reset();
+                while (stream.IncrementToken())
+                {
+                    // drain the stream; token contents are not inspected
+                }
+                stream.End();
+            }
+            finally
+            {
+                IOUtils.DisposeWhileHandlingException(stream);
+            }
+        }
+
+        // LUCENE-3026
+        // Consumes the token stream for one very long input built from 5000
+        // repetitions of a phrase with no sentence-ending punctuation; the test
+        // only requires that this completes.
+        [Test]
+        public void TestLargeSentence()
+        {
+            StringBuilder sb = new StringBuilder();
+            for (int i = 0; i < 5000; i++)
+            {
+                sb.Append("我购买了道具和服装");
+            }
+            Analyzer analyzer = new SmartChineseAnalyzer(TEST_VERSION_CURRENT);
+            TokenStream stream = analyzer.GetTokenStream("", sb.ToString());
+            try
+            {
+                stream.Reset();
+                while (stream.IncrementToken())
+                {
+                    // drain the stream; token contents are not inspected
+                }
+                stream.End();
+            }
+            finally
+            {
+                IOUtils.DisposeWhileHandlingException(stream);
+            }
+        }
+
+        // LUCENE-3642
+        // Places WordTokenFilter after ASCIIFoldingFilter (the expected term is the
+        // folded "mosfellsbaer") and checks that the offsets still span the
+        // original 11-character input.
+        [Test]
+        public void TestInvalidOffset()
+        {
+            Analyzer analyzer = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+            {
+                Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+                TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
+#pragma warning disable 612, 618
+                filters = new WordTokenFilter(filters);
+#pragma warning restore 612, 618
+                return new TokenStreamComponents(tokenizer, filters);
+            });
+
+            AssertAnalyzesTo(analyzer, "mosfellsbær",
+                new string[] { "mosfellsbaer" },
+                new int[] { 0 },
+                new int[] { 11 });
+        }
+
+        /// <summary>
+        /// Blast some random strings through the analyzer.
+        /// </summary>
+        [Test]
+        public void TestRandomStrings()
+        {
+            CheckRandomData(
+                Random(),
+                new SmartChineseAnalyzer(TEST_VERSION_CURRENT),
+                1000 * RANDOM_MULTIPLIER);
+        }
+
+        /// <summary>
+        /// Blast some random large strings through the analyzer.
+        /// </summary>
+        [Test]
+        public void TestRandomHugeStrings()
+        {
+            CheckRandomData(
+                Random(),
+                new SmartChineseAnalyzer(TEST_VERSION_CURRENT),
+                100 * RANDOM_MULTIPLIER,
+                8192);
+        }
+
+        /// <summary>
+        /// Runs the empty string through a <c>KeywordTokenizer</c> followed by a
+        /// <c>WordTokenFilter</c> and checks analysis consistency.
+        /// </summary>
+        [Test]
+        public void TestEmptyTerm()
+        {
+            Random rnd = Random();
+            Analyzer keywordThenWords = Analyzer.NewAnonymous(createComponents: (fieldName, reader) =>
+            {
+                Tokenizer source = new KeywordTokenizer(reader);
+#pragma warning disable 612, 618
+                TokenFilter sink = new WordTokenFilter(source);
+#pragma warning restore 612, 618
+                return new TokenStreamComponents(source, sink);
+            });
+            CheckAnalysisConsistency(rnd, keywordThenWords, rnd.nextBoolean(), "");
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Tests.Analysis.SmartCn/TestSmartChineseFactories.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.SmartCn/TestSmartChineseFactories.cs 
b/src/Lucene.Net.Tests.Analysis.SmartCn/TestSmartChineseFactories.cs
new file mode 100644
index 0000000..4b23ec1
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.SmartCn/TestSmartChineseFactories.cs
@@ -0,0 +1,98 @@
+using NUnit.Framework;
+using System;
+using System.Collections.Generic;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Cn.Smart
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Tests for <see cref="SmartChineseSentenceTokenizerFactory"/> and
+    /// <see cref="SmartChineseWordTokenFilterFactory"/>
+    /// </summary>
+    [Obsolete]
+    public class TestSmartChineseFactories : BaseTokenStreamTestCase
+    {
+        /// <summary>
+        /// Test showing the behavior with whitespace
+        /// </summary>
+        [Test]
+        public void TestSimple()
+        {
+            TextReader reader = new StringReader("我购买了道具和服装。");
+            TokenStream stream = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
+            SmartChineseWordTokenFilterFactory factory = new SmartChineseWordTokenFilterFactory(new Dictionary<string, string>());
+            stream = factory.Create(stream);
+            // TODO: fix smart chinese to not emit punctuation tokens
+            // at the moment: you have to clean up with WDF, or use the stoplist, etc
+            AssertTokenStreamContents(stream,
+               new String[] { "我", "购买", "了", "道具", "和", "服装", "," });
+        }
+
+        /// <summary>
+        /// Test showing the behavior with whitespace
+        /// </summary>
+        [Test]
+        public void TestTokenizer()
+        {
+            TextReader reader = new StringReader("我购买了道具和服装。我购买了道具和服装。");
+            SmartChineseSentenceTokenizerFactory tokenizerFactory = new SmartChineseSentenceTokenizerFactory(new Dictionary<string, string>());
+            TokenStream stream = tokenizerFactory.Create(reader);
+            SmartChineseWordTokenFilterFactory factory = new SmartChineseWordTokenFilterFactory(new Dictionary<string, string>());
+            stream = factory.Create(stream);
+            // TODO: fix smart chinese to not emit punctuation tokens
+            // at the moment: you have to clean up with WDF, or use the stoplist, etc
+            AssertTokenStreamContents(stream,
+               new String[] { "我", "购买", "了", "道具", "和", "服装", ",",
+                    "我", "购买", "了", "道具", "和", "服装", ","
+                });
+        }
+
+        /// <summary>
+        /// Test that bogus arguments result in exception
+        /// </summary>
+        [Test]
+        public void TestBogusArguments()
+        {
+            try
+            {
+                new SmartChineseSentenceTokenizerFactory(new Dictionary<string, string>() {
+                    { "bogusArg", "bogusValue" }
+                });
+                // Reaching this line means the constructor accepted the unknown argument.
+                fail();
+            }
+            catch (ArgumentException expected)
+            {
+                assertTrue(expected.Message.Contains("Unknown parameters"));
+            }
+
+            try
+            {
+                new SmartChineseWordTokenFilterFactory(new Dictionary<string, string>() {
+                    { "bogusArg", "bogusValue" }
+                });
+                // Reaching this line means the constructor accepted the unknown argument.
+                fail();
+            }
+            catch (ArgumentException expected)
+            {
+                assertTrue(expected.Message.Contains("Unknown parameters"));
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Tests.Analysis.SmartCn/project.json
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Tests.Analysis.SmartCn/project.json 
b/src/Lucene.Net.Tests.Analysis.SmartCn/project.json
new file mode 100644
index 0000000..d7196a3
--- /dev/null
+++ b/src/Lucene.Net.Tests.Analysis.SmartCn/project.json
@@ -0,0 +1,45 @@
+{
+  "version": "4.8.0",
+  "title": "Lucene.Net.Tests.Analysis.SmartCn",
+  "buildOptions": {
+    "compile": {
+      "includeFiles": [ "../CommonAssemblyInfo.cs" ]
+    },
+    "embed": {
+      "includeFiles": [
+      ]
+    }
+  },
+  "dependencies": {
+    "dotnet-test-nunit-teamcity": "3.4.0-beta-3",
+    "Lucene.Net.Analysis.SmartCn": "4.8.0",
+    "Lucene.Net.TestFramework": "4.8.0",
+    "NUnit": "3.5.0"
+  },
+  "testRunner": "nunit-teamcity",
+  "frameworks": {
+    "netcoreapp1.0": {
+      "imports": "dnxcore50",
+      "buildOptions": {
+        "debugType": "portable",
+        "define": [ "NETSTANDARD" ],
+        "compile": {
+          "excludeFiles": [
+            "Support/TestApiConsistency.cs"
+          ]
+        }
+      }
+    },
+    "net451": {
+      "buildOptions": {
+        "debugType": "full",
+        "define": [ "FEATURE_SERIALIZABLE" ]
+      }
+    }
+  },
+
+  "runtimes": {
+    "win7-x86": {},
+    "win7-x64": {}
+  }
+}

[1/4] lucenenet git commit: Ported Lucene.Net.Analysis.SmartCn + tests

Reply via email to