Ported Lucene.Net.Analysis.SmartCn + tests
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/2f5d89b4 Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/2f5d89b4 Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/2f5d89b4 Branch: refs/heads/master Commit: 2f5d89b4ae979d376f2ada22b2fb2e775b6e1608 Parents: 468199e Author: Shad Storhaug <[email protected]> Authored: Sun Jun 25 01:26:30 2017 +0700 Committer: Shad Storhaug <[email protected]> Committed: Sun Jun 25 02:28:26 2017 +0700 ---------------------------------------------------------------------- Lucene.Net.Portable.sln | 20 + Lucene.Net.sln | 52 ++ .../AnalyzerProfile.cs | 183 +++++ src/Lucene.Net.Analysis.SmartCn/CharType.cs | 67 ++ .../HHMM/AbstractDictionary.cs | 224 ++++++ .../HHMM/BiSegGraph.cs | 256 ++++++ .../HHMM/BigramDictionary.cs | 431 ++++++++++ .../HHMM/HHMMSegmenter.cs | 252 ++++++ .../HHMM/PathNode.cs | 80 ++ .../HHMM/SegGraph.cs | 160 ++++ .../HHMM/SegToken.cs | 123 +++ .../HHMM/SegTokenFilter.cs | 75 ++ .../HHMM/SegTokenPair.cs | 95 +++ .../HHMM/WordDictionary.cs | 778 +++++++++++++++++++ .../HHMM/bigramdict.mem | Bin 0 -> 4825652 bytes .../HHMM/coredict.mem | Bin 0 -> 1178248 bytes .../HMMChineseTokenizer.cs | 94 +++ .../HMMChineseTokenizerFactory.cs | 56 ++ .../Lucene.Net.Analysis.SmartCn.csproj | 124 +++ .../Lucene.Net.Analysis.SmartCn.project.json | 11 + .../Lucene.Net.Analysis.SmartCn.xproj | 40 + .../Properties/AssemblyInfo.cs | 42 + .../SentenceTokenizer.cs | 142 ++++ .../SmartChineseAnalyzer.cs | 171 ++++ .../SmartChineseSentenceTokenizerFactory.cs | 52 ++ .../SmartChineseWordTokenFilterFactory.cs | 55 ++ src/Lucene.Net.Analysis.SmartCn/Utility.cs | 196 +++++ .../WordSegmenter.cs | 89 +++ .../WordTokenFilter.cs | 114 +++ src/Lucene.Net.Analysis.SmartCn/WordType.cs | 67 ++ src/Lucene.Net.Analysis.SmartCn/project.json | 53 ++ src/Lucene.Net.Analysis.SmartCn/stopwords.txt | 59 ++ .../Lucene.Net.Tests.Analysis.SmartCn.csproj | 105 +++ ...cene.Net.Tests.Analysis.SmartCn.project.json | 11 + .../Lucene.Net.Tests.Analysis.SmartCn.xproj | 42 + .../Properties/AssemblyInfo.cs | 37 + .../Support/TestApiConsistency.cs | 148 ++++ .../Support/TestExceptionSerialization.cs | 54 ++ .../TestHMMChineseTokenizerFactory.cs | 72 ++ .../TestSmartChineseAnalyzer.cs | 354 +++++++++ .../TestSmartChineseFactories.cs | 98 +++ .../project.json | 45 ++ 42 files changed, 5127 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/Lucene.Net.Portable.sln ---------------------------------------------------------------------- diff --git a/Lucene.Net.Portable.sln b/Lucene.Net.Portable.sln index 9a49572..d3678ee 100644 --- a/Lucene.Net.Portable.sln +++ b/Lucene.Net.Portable.sln @@ -85,6 +85,10 @@ Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "Lucene.Net.ICU", "src\Lucen EndProject Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "Lucene.Net.Tests.ICU", "src\Lucene.Net.Tests.ICU\Lucene.Net.Tests.ICU.xproj", "{32FD3471-E862-4055-B969-79C12A656366}" EndProject +Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "Lucene.Net.Analysis.SmartCn", "src\Lucene.Net.Analysis.SmartCn\Lucene.Net.Analysis.SmartCn.xproj", "{A400916E-DCB8-4A16-BE83-91891C05191F}" +EndProject +Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "Lucene.Net.Tests.Analysis.SmartCn", "src\Lucene.Net.Tests.Analysis.SmartCn\Lucene.Net.Tests.Analysis.SmartCn.xproj", "{2870FB52-1239-493F-A0BE-951660194A66}" +EndProject Global 
GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -389,6 +393,22 @@ Global {32FD3471-E862-4055-B969-79C12A656366}.Release|Any CPU.Build.0 = Release|Any CPU {32FD3471-E862-4055-B969-79C12A656366}.Release|x86.ActiveCfg = Release|Any CPU {32FD3471-E862-4055-B969-79C12A656366}.Release|x86.Build.0 = Release|Any CPU + {2870FB52-1239-493F-A0BE-951660194A66}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {2870FB52-1239-493F-A0BE-951660194A66}.Debug|Any CPU.Build.0 = Debug|Any CPU + {2870FB52-1239-493F-A0BE-951660194A66}.Debug|x86.ActiveCfg = Debug|Any CPU + {2870FB52-1239-493F-A0BE-951660194A66}.Debug|x86.Build.0 = Debug|Any CPU + {2870FB52-1239-493F-A0BE-951660194A66}.Release|Any CPU.ActiveCfg = Release|Any CPU + {2870FB52-1239-493F-A0BE-951660194A66}.Release|Any CPU.Build.0 = Release|Any CPU + {2870FB52-1239-493F-A0BE-951660194A66}.Release|x86.ActiveCfg = Release|Any CPU + {2870FB52-1239-493F-A0BE-951660194A66}.Release|x86.Build.0 = Release|Any CPU + {A400916E-DCB8-4A16-BE83-91891C05191F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {A400916E-DCB8-4A16-BE83-91891C05191F}.Debug|Any CPU.Build.0 = Debug|Any CPU + {A400916E-DCB8-4A16-BE83-91891C05191F}.Debug|x86.ActiveCfg = Debug|Any CPU + {A400916E-DCB8-4A16-BE83-91891C05191F}.Debug|x86.Build.0 = Debug|Any CPU + {A400916E-DCB8-4A16-BE83-91891C05191F}.Release|Any CPU.ActiveCfg = Release|Any CPU + {A400916E-DCB8-4A16-BE83-91891C05191F}.Release|Any CPU.Build.0 = Release|Any CPU + {A400916E-DCB8-4A16-BE83-91891C05191F}.Release|x86.ActiveCfg = Release|Any CPU + {A400916E-DCB8-4A16-BE83-91891C05191F}.Release|x86.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/Lucene.Net.sln ---------------------------------------------------------------------- diff --git a/Lucene.Net.sln b/Lucene.Net.sln index 669a57d..be5b2b9 100644 --- a/Lucene.Net.sln +++ b/Lucene.Net.sln @@ -94,6 +94,10 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.ICU", "src\Lucen EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Tests.ICU", "src\Lucene.Net.Tests.ICU\Lucene.Net.Tests.ICU.csproj", "{D5AA1A22-1B28-4DF6-BFDA-02519A189839}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Analysis.SmartCn", "src\Lucene.Net.Analysis.SmartCn\Lucene.Net.Analysis.SmartCn.csproj", "{DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Tests.Analysis.SmartCn", "src\Lucene.Net.Tests.Analysis.SmartCn\Lucene.Net.Tests.Analysis.SmartCn.csproj", "{8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -903,6 +907,54 @@ Global {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release35|Mixed Platforms.Build.0 = Release|Any CPU {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release35|x86.ActiveCfg = Release|Any CPU {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release35|x86.Build.0 = Release|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug|Any CPU.Build.0 = Debug|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug|x86.ActiveCfg = Debug|Any CPU + 
{DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug|x86.Build.0 = Debug|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug35|Any CPU.ActiveCfg = Debug|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug35|Any CPU.Build.0 = Debug|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug35|Mixed Platforms.ActiveCfg = Debug|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug35|Mixed Platforms.Build.0 = Debug|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug35|x86.ActiveCfg = Debug|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug35|x86.Build.0 = Debug|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release|Any CPU.ActiveCfg = Release|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release|Any CPU.Build.0 = Release|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release|Mixed Platforms.Build.0 = Release|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release|x86.ActiveCfg = Release|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release|x86.Build.0 = Release|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release35|Any CPU.ActiveCfg = Release|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release35|Any CPU.Build.0 = Release|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release35|Mixed Platforms.ActiveCfg = Release|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release35|Mixed Platforms.Build.0 = Release|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release35|x86.ActiveCfg = Release|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release35|x86.Build.0 = Release|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug|Any CPU.Build.0 = Debug|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug|x86.ActiveCfg = Debug|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug|x86.Build.0 = Debug|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug35|Any CPU.ActiveCfg = Debug|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug35|Any CPU.Build.0 = Debug|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug35|Mixed Platforms.ActiveCfg = Debug|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug35|Mixed Platforms.Build.0 = Debug|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug35|x86.ActiveCfg = Debug|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug35|x86.Build.0 = Debug|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release|Any CPU.ActiveCfg = Release|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release|Any CPU.Build.0 = Release|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release|Mixed Platforms.Build.0 = Release|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release|x86.ActiveCfg = Release|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release|x86.Build.0 = Release|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release35|Any CPU.ActiveCfg = Release|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release35|Any CPU.Build.0 = Release|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release35|Mixed Platforms.ActiveCfg = Release|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release35|Mixed Platforms.Build.0 = Release|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release35|x86.ActiveCfg = Release|Any CPU + 
{8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release35|x86.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/AnalyzerProfile.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/AnalyzerProfile.cs b/src/Lucene.Net.Analysis.SmartCn/AnalyzerProfile.cs new file mode 100644 index 0000000..88c6c27 --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/AnalyzerProfile.cs @@ -0,0 +1,183 @@ +using System; +using System.IO; +using System.Security; + +namespace Lucene.Net.Analysis.Cn.Smart +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Manages analysis data configuration for <see cref="SmartChineseAnalyzer"/> + /// <para/> + /// <see cref="SmartChineseAnalyzer"/> has a built-in dictionary and stopword list out of the box. + /// <para/> + /// NOTE: To use an alternative dictionary to the built-in one, put the "bigramdict.dct" and + /// "coredict.dct" files in a subdirectory of your application named "analysis-data". This subdirectory + /// can be placed in any directory up to and including the root directory (if OS permissions allow). + /// To place the files in an alternate location, set an environment variable named "analysis.data.dir" + /// to the directory containing the "bigramdict.dct" and "coredict.dct" files. + /// <para/> + /// The default "bigramdict.dct" and "coredict.dct" files can be found at: + /// <a href="https://issues.apache.org/jira/browse/LUCENE-1629">https://issues.apache.org/jira/browse/LUCENE-1629</a>. + /// <para/> + /// @lucene.experimental + /// </summary> + public class AnalyzerProfile + { + /// <summary> + /// Global variable indicating the configured analysis data directory + /// </summary> + public static string ANALYSIS_DATA_DIR = ""; + + static AnalyzerProfile() + { + Init(); + } + + // LUCENENET specific - changed the logic here to leave the + // ANALYSIS_DATA_DIR an empty string if it is not found. This + // allows us to skip loading files from disk if there are no files + // to load (and fixes LUCENE-1817 that prevents the on-disk files + // from ever being loaded). + private static void Init() + { +#if NETSTANDARD + // Support for GB2312 encoding.
See: https://docs.microsoft.com/en-us/dotnet/api/system.text.codepagesencodingprovider?view=netcore-2.0 + var encodingProvider = System.Text.CodePagesEncodingProvider.Instance; + System.Text.Encoding.RegisterProvider(encodingProvider); +#endif + + string dirName = "analysis-data"; + //string propName = "analysis.properties"; + + // Try the system property: -Danalysis.data.dir=/path/to/analysis-data + //ANALYSIS_DATA_DIR = System.getProperty("analysis.data.dir", ""); + ANALYSIS_DATA_DIR = GetSystemProperty("analysis.data.dir", ""); + if (ANALYSIS_DATA_DIR.Length != 0) + return; + +#if NETSTANDARD + string currentPath = System.AppContext.BaseDirectory; +#else + string currentPath = AppDomain.CurrentDomain.BaseDirectory; +#endif + + //FileInfo[] candidateFiles = new FileInfo[] { new FileInfo(currentPath + "/" + dirName), + // new FileInfo(currentPath + "/bin/" + dirName)/*, new FileInfo("./" + propName), + // new FileInfo("./lib/" + propName)*/ }; + //for (int i = 0; i < candidateFiles.Length; i++) + //{ + // FileInfo file = candidateFiles[i]; + // if (file.Exists) + // { + // ANALYSIS_DATA_DIR = file.FullName; + + // //if (file.isDirectory()) + // //{ + // // ANALYSIS_DATA_DIR = file.getAbsolutePath(); + // //} + // //else if (file.isFile() && GetAnalysisDataDir(file).Length != 0) + // //{ + // // ANALYSIS_DATA_DIR = GetAnalysisDataDir(file); + // //} + // break; + // } + //} + + string candidatePath = System.IO.Path.Combine(currentPath, dirName); + if (Directory.Exists(candidatePath)) + { + ANALYSIS_DATA_DIR = candidatePath; + return; + } + + + try + { + while (new DirectoryInfo(currentPath).Parent != null) + { + candidatePath = System.IO.Path.Combine(new DirectoryInfo(currentPath).Parent.FullName, dirName); + if (Directory.Exists(candidatePath)) + { + ANALYSIS_DATA_DIR = candidatePath; + return; + } + currentPath = new DirectoryInfo(currentPath).Parent.FullName; + } + } + catch (SecurityException) + { + // ignore security errors + } + + + //for (int i = 0; i < candidateDirectories.Count; i++) + //{ + // DirectoryInfo dir = candidateDirectories[i]; + // if (dir.Exists) + // { + // ANALYSIS_DATA_DIR = dir.FullName; + // break; + // } + //} + + //if (ANALYSIS_DATA_DIR.Length == 0) + //{ + // // Dictionary directory cannot be found. + // throw new Exception("WARNING: Can not find lexical dictionary directory!" + // + " This will cause unpredictable exceptions in your application!" + // + " Please refer to the manual to download the dictionaries."); + //} + + } + + //private static string GetAnalysisDataDir(FileInfo propFile) + //{ + // Properties prop = new Properties(); + // try + // { + // string dir; + // using (FileStream input = new FileStream(propFile.FullName, FileMode.Open, FileAccess.Read)) + // { + // prop.load(new StreamReader(input, Encoding.UTF8)); + // dir = prop.getProperty("analysis.data.dir", ""); + // } + // return dir; + // } + // catch (IOException e) + // { + // return ""; + // } + //} + + private static string GetSystemProperty(string property, string defaultValue) + { + string setting; + try + { + setting = Environment.GetEnvironmentVariable(property); + } + catch (SecurityException) + { + setting = null; + } + + return (setting == null) ? 
defaultValue : setting; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/CharType.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/CharType.cs b/src/Lucene.Net.Analysis.SmartCn/CharType.cs new file mode 100644 index 0000000..8360802 --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/CharType.cs @@ -0,0 +1,67 @@ +namespace Lucene.Net.Analysis.Cn.Smart +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Internal <see cref="SmartChineseAnalyzer"/> character type constants. + /// <para/> + /// @lucene.experimental + /// </summary> + public enum CharType + { + /// <summary> + /// Punctuation Characters + /// </summary> + DELIMITER = 0, + + /// <summary> + /// Letters + /// </summary> + LETTER = 1, + + /// <summary> + /// Numeric Digits + /// </summary> + DIGIT = 2, + + /// <summary> + /// Han Ideographs + /// </summary> + HANZI = 3, + + /// <summary> + /// Characters that act as a space + /// </summary> + SPACE_LIKE = 4, + + /// <summary> + /// Full-Width letters + /// </summary> + FULLWIDTH_LETTER = 5, + + /// <summary> + /// Full-Width alphanumeric characters + /// </summary> + FULLWIDTH_DIGIT = 6, + + /// <summary> + /// Other (not fitting any of the other categories) + /// </summary> + OTHER = 7 + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/AbstractDictionary.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/AbstractDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/AbstractDictionary.cs new file mode 100644 index 0000000..efac7d0 --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/HHMM/AbstractDictionary.cs @@ -0,0 +1,224 @@ +using System; +using System.Text; + +namespace Lucene.Net.Analysis.Cn.Smart.HHMM +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + /// <summary> + /// <para> + /// <see cref="SmartChineseAnalyzer"/> abstract dictionary implementation. + /// </para> + /// <para> + /// Contains methods for dealing with GB2312 encoding. + /// </para> + /// @lucene.experimental + /// </summary> + internal abstract class AbstractDictionary + { + /// <summary> + /// First Chinese Character in GB2312 (15 * 94) + /// Characters in GB2312 are arranged in a grid of 94 * 94, 0-14 are unassigned or punctuation. + /// </summary> + public static readonly int GB2312_FIRST_CHAR = 1410; + + /// <summary> + /// Last Chinese Character in GB2312 (87 * 94). + /// Characters in GB2312 are arranged in a grid of 94 * 94, 88-94 are unassigned. + /// </summary> + public static readonly int GB2312_CHAR_NUM = 87 * 94; + + /// <summary> + /// Dictionary data contains 6768 Chinese characters with frequency statistics. + /// </summary> + public static readonly int CHAR_NUM_IN_FILE = 6768; + + // ===================================================== + // code +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +A +B +C +D +E +F + // B0A0 啊 阿 埃 挨 哎 唉 哀 皑 癌 蔼 矮 艾 碍 爱 隘 + // B0B0 鞍 氨 安 俺 按 暗 岸 胺 案 肮 昂 盎 凹 敖 熬 翱 + // B0C0 袄 傲 奥 懊 澳 芭 捌 扒 叭 吧 笆 八 疤 巴 拔 跋 + // B0D0 靶 把 耙 坝 霸 罢 爸 白 柏 百 摆 佰 败 拜 稗 斑 + // B0E0 班 搬 扳 般 颁 板 版 扮 拌 伴 瓣 半 办 绊 邦 帮 + // B0F0 梆 榜 膀 绑 棒 磅 蚌 镑 傍 谤 苞 胞 包 褒 剥 + // ===================================================== + // + // GB2312 character set: + // 01 94 Symbols + // 02 72 Numbers + // 03 94 Latin + // 04 83 Kana + // 05 86 Katakana + // 06 48 Greek + // 07 66 Cyrillic + // 08 63 Phonetic Symbols + // 09 76 Drawing Symbols + // 10-15 Unassigned + // 16-55 3755 Plane 1, in pinyin order + // 56-87 3008 Plane 2, in radical/stroke order + // 88-94 Unassigned + // ====================================================== + + /// <summary> + /// <para> + /// Transcode from GB2312 ID to Unicode + /// </para> + /// <para> + /// GB2312 is divided into a 94 * 94 grid, containing 7445 characters consisting of 6763 Chinese characters and 682 symbols. + /// Some regions are unassigned (reserved). + /// </para> + /// </summary> + /// <param name="ccid">GB2312 id</param> + /// <returns>unicode String</returns> + public virtual string GetCCByGB2312Id(int ccid) + { + if (ccid < 0 || ccid > AbstractDictionary.GB2312_CHAR_NUM) + return ""; + int cc1 = ccid / 94 + 161; + int cc2 = ccid % 94 + 161; + byte[] buffer = new byte[2]; + buffer[0] = (byte)cc1; + buffer[1] = (byte)cc2; + try + { + //String cchar = new String(buffer, "GB2312"); + string cchar = Encoding.GetEncoding("GB2312").GetString(buffer); + return cchar; + } + catch (ArgumentException) // Encoding is not supported by the platform + { + return ""; + } + } + + /// <summary> + /// Transcode from Unicode to GB2312 + /// </summary> + /// <param name="ch">input character in Unicode, or character in Basic Latin range.</param> + /// <returns>position in GB2312</returns> + public virtual short GetGB2312Id(char ch) + { + try + { + //byte[] buffer = Character.ToString(ch).getBytes("GB2312"); + byte[] buffer = Encoding.GetEncoding("GB2312").GetBytes(ch.ToString()); + //byte[] buffer = Encoding.GetEncoding("hz-gb-2312").GetBytes(ch.ToString()); + if (buffer.Length != 2) + { + // Should be a two-byte character + return -1; + } + int b0 = (buffer[0] & 0x0FF) - 161; // Code starts from A1, therefore subtract 0xA1=161 + int b1 = (buffer[1] & 0x0FF) - 161; // There is no Chinese char for the first and last symbol.
+ // Therefore, each code page only has 16*6-2=94 characters. + return (short)(b0 * 94 + b1); + } + catch (ArgumentException e) // Encoding is not supported by the platform + { + throw new Exception(e.ToString(), e); + } + } + + /// <summary> + /// 64-bit FNV-1 Hash Function (the constants below are the 64-bit FNV prime and offset basis) + /// </summary> + /// <param name="c">input character</param> + /// <returns>hashcode</returns> + public virtual long Hash1(char c) + { + long p = 1099511628211L; + long hash = unchecked((long)0xcbf29ce484222325L); + hash = (hash ^ (c & 0x00FF)) * p; + hash = (hash ^ (c >> 8)) * p; + hash += hash << 13; + hash ^= hash >> 7; + hash += hash << 3; + hash ^= hash >> 17; + hash += hash << 5; + return hash; + } + + /// <summary> + /// 64-bit FNV-1 Hash Function + /// </summary> + /// <param name="carray">character array</param> + /// <returns>hashcode</returns> + public virtual long Hash1(char[] carray) + { + long p = 1099511628211L; + long hash = unchecked((long)0xcbf29ce484222325L); + for (int i = 0; i < carray.Length; i++) + { + char d = carray[i]; + hash = (hash ^ (d & 0x00FF)) * p; + hash = (hash ^ (d >> 8)) * p; + } + + // hash += hash << 13; + // hash ^= hash >> 7; + // hash += hash << 3; + // hash ^= hash >> 17; + // hash += hash << 5; + return hash; + } + + /// <summary> + /// djb2 hash algorithm: this algorithm (k=33) was first reported by dan + /// bernstein many years ago in comp.lang.c. another version of this algorithm + /// (now favored by bernstein) uses xor: hash(i) = hash(i - 1) * 33 ^ str[i]; + /// the magic of number 33 (why it works better than many other constants, + /// prime or not) has never been adequately explained. + /// </summary> + /// <param name="c">character</param> + /// <returns>hashcode</returns> + public virtual int Hash2(char c) + { + int hash = 5381; + + /* hash 33 + c */ + hash = ((hash << 5) + hash) + c & 0x00FF; + hash = ((hash << 5) + hash) + c >> 8; + + return hash; + } + + /// <summary> + /// djb2 hash algorithm: this algorithm (k=33) was first reported by dan + /// bernstein many years ago in comp.lang.c. another version of this algorithm + /// (now favored by bernstein) uses xor: hash(i) = hash(i - 1) * 33 ^ str[i]; + /// the magic of number 33 (why it works better than many other constants, + /// prime or not) has never been adequately explained. + /// </summary> + /// <param name="carray">character array</param> + /// <returns>hashcode</returns> + public virtual int Hash2(char[] carray) + { + int hash = 5381; + + /* hash 33 + c */ + for (int i = 0; i < carray.Length; i++) + { + char d = carray[i]; + hash = ((hash << 5) + hash) + d & 0x00FF; + hash = ((hash << 5) + hash) + d >> 8; + } + + return hash; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/BiSegGraph.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/BiSegGraph.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/BiSegGraph.cs new file mode 100644 index 0000000..adeef2a --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/HHMM/BiSegGraph.cs @@ -0,0 +1,256 @@ +using Lucene.Net.Support; +using System; +using System.Collections.Generic; +using System.Text; + +namespace Lucene.Net.Analysis.Cn.Smart.HHMM +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Graph representing possible token pairs (bigrams) at each start offset in the sentence. + /// <para> + /// For each start offset, a list of possible token pairs is stored. + /// </para> + /// @lucene.experimental + /// </summary> + internal class BiSegGraph + { + private IDictionary<int, IList<SegTokenPair>> tokenPairListTable = new Dictionary<int, IList<SegTokenPair>>(); + + private IList<SegToken> segTokenList; + + private static BigramDictionary bigramDict = BigramDictionary.GetInstance(); + + public BiSegGraph(SegGraph segGraph) + { + segTokenList = segGraph.MakeIndex(); + GenerateBiSegGraph(segGraph); + } + + /// <summary> + /// Generate a <see cref="BiSegGraph"/> based upon a <see cref="SegGraph"/> + /// </summary> + private void GenerateBiSegGraph(SegGraph segGraph) + { + double smooth = 0.1; + int wordPairFreq = 0; + int maxStart = segGraph.MaxStart; + double oneWordFreq, weight, tinyDouble = 1.0 / Utility.MAX_FREQUENCE; + + int next; + char[] idBuffer; + // get the list of tokens ordered and indexed + segTokenList = segGraph.MakeIndex(); + // Because the beginning position of startToken is -1, therefore startToken can be obtained when key = -1 + int key = -1; + IList<SegToken> nextTokens = null; + while (key < maxStart) + { + if (segGraph.IsStartExist(key)) + { + + IList<SegToken> tokenList = segGraph.GetStartList(key); + + // Calculate all tokens for a given key. + foreach (SegToken t1 in tokenList) + { + oneWordFreq = t1.Weight; + next = t1.EndOffset; + nextTokens = null; + // Find the next corresponding Token. + // For example: "Sunny seashore", the present Token is "sunny", next one should be "sea" or "seashore". + // If we cannot find the next Token, then go to the end and repeat the same cycle. + while (next <= maxStart) + { + // Because the beginning position of endToken is sentenceLen, so equal to sentenceLen can find endToken. 
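+ // SPACE_LIKE characters produce no tokens of their own, so some start offsets have no token list; scan forward until one is found.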
+ if (segGraph.IsStartExist(next)) + { + nextTokens = segGraph.GetStartList(next); + break; + } + next++; + } + if (nextTokens == null) + { + break; + } + foreach (SegToken t2 in nextTokens) + { + idBuffer = new char[t1.CharArray.Length + t2.CharArray.Length + 1]; + System.Array.Copy(t1.CharArray, 0, idBuffer, 0, t1.CharArray.Length); + idBuffer[t1.CharArray.Length] = BigramDictionary.WORD_SEGMENT_CHAR; + System.Array.Copy(t2.CharArray, 0, idBuffer, + t1.CharArray.Length + 1, t2.CharArray.Length); + + // Two linked Words frequency + wordPairFreq = bigramDict.GetFrequency(idBuffer); + + // Smoothing + + // -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1 + weight = -Math + .Log(smooth + * (1.0 + oneWordFreq) + / (Utility.MAX_FREQUENCE + 0.0) + + (1.0 - smooth) + * ((1.0 - tinyDouble) * wordPairFreq / (1.0 + oneWordFreq) + tinyDouble)); + + SegTokenPair tokenPair = new SegTokenPair(idBuffer, t1.Index, + t2.Index, weight); + this.AddSegTokenPair(tokenPair); + } + } + } + key++; + } + + } + + /// <summary> + /// Returns <c>true</c> if there is a list of token pairs at this offset (index of the second token) + /// </summary> + /// <param name="to">index of the second token in the token pair</param> + /// <returns><c>true</c> if a token pair exists</returns> + public virtual bool IsToExist(int to) + { + //return tokenPairListTable.get(Integer.valueOf(to)) != null; + //return tokenPairListTable.ContainsKey(to) && tokenPairListTable[to] != null; + IList<SegTokenPair> result; + return tokenPairListTable.TryGetValue(to, out result) && result != null; + } + + /// <summary> + /// Return a <see cref="T:IList{SegTokenPair}"/> of all token pairs at this offset (index of the second token) + /// </summary> + /// <param name="to">index of the second token in the token pair</param> + /// <returns><see cref="T:IList{SegTokenPair}"/> of token pairs. </returns> + public virtual IList<SegTokenPair> GetToList(int to) + { + IList<SegTokenPair> result; + tokenPairListTable.TryGetValue(to, out result); + return result; + } + + /// <summary> + /// Add a <see cref="SegTokenPair"/> + /// </summary> + /// <param name="tokenPair"><see cref="SegTokenPair"/></param> + public virtual void AddSegTokenPair(SegTokenPair tokenPair) + { + int to = tokenPair.To; + if (!IsToExist(to)) + { + List<SegTokenPair> newlist = new List<SegTokenPair>(); + newlist.Add(tokenPair); + tokenPairListTable[to] = newlist; + } + else + { + IList<SegTokenPair> tokenPairList = tokenPairListTable[to]; + tokenPairList.Add(tokenPair); + } + } + + /// <summary> + /// Get the number of <see cref="SegTokenPair"/> entries in the table. + /// </summary> + /// <returns>number of <see cref="SegTokenPair"/> entries</returns> + public virtual int ToCount + { + get { return tokenPairListTable.Count; } + } + + /// <summary> + /// Find the shortest path with the Viterbi algorithm.
+ /// </summary> + /// <returns><see cref="T:IList{SegToken}"/></returns> + [ExceptionToNetNumericConvention] + public virtual IList<SegToken> GetShortPath() + { + int current; + int nodeCount = ToCount; + IList<PathNode> path = new List<PathNode>(); + PathNode zeroPath = new PathNode(); + zeroPath.Weight = 0; + zeroPath.PreNode = 0; + path.Add(zeroPath); + for (current = 1; current <= nodeCount; current++) + { + double weight; + IList<SegTokenPair> edges = GetToList(current); + + double minWeight = double.MaxValue; + SegTokenPair minEdge = null; + foreach (SegTokenPair edge in edges) + { + weight = edge.Weight; + PathNode preNode2 = path[edge.From]; + if (preNode2.Weight + weight < minWeight) + { + minWeight = preNode2.Weight + weight; + minEdge = edge; + } + } + PathNode newNode = new PathNode(); + newNode.Weight = minWeight; + newNode.PreNode = minEdge.From; + path.Add(newNode); + } + + // Calculate PathNodes + int preNode, lastNode; + lastNode = path.Count - 1; + current = lastNode; + IList<int> rpath = new List<int>(); + IList<SegToken> resultPath = new List<SegToken>(); + + rpath.Add(current); + while (current != 0) + { + PathNode currentPathNode = path[current]; + preNode = currentPathNode.PreNode; + rpath.Add(preNode); + current = preNode; + } + for (int j = rpath.Count - 1; j >= 0; j--) + { + //int idInteger = rpath.get(j); + //int id = idInteger.intValue(); + int id = rpath[j]; + SegToken t = segTokenList[id]; + resultPath.Add(t); + } + return resultPath; + } + + public override string ToString() + { + StringBuilder sb = new StringBuilder(); + ICollection<IList<SegTokenPair>> values = tokenPairListTable.Values; + foreach (IList<SegTokenPair> segList in values) + { + foreach (SegTokenPair pair in segList) + { + sb.Append(pair + "\n"); + } + } + return sb.ToString(); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/BigramDictionary.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/BigramDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/BigramDictionary.cs new file mode 100644 index 0000000..cc87ceb --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/HHMM/BigramDictionary.cs @@ -0,0 +1,431 @@ +using Lucene.Net.Support; +using Lucene.Net.Support.IO; +using System; +using System.IO; +using System.Reflection; +using System.Text; + +namespace Lucene.Net.Analysis.Cn.Smart.HHMM +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// SmartChineseAnalyzer Bigram dictionary. 
+ /// <para/> + /// @lucene.experimental + /// </summary> + internal class BigramDictionary : AbstractDictionary + { + private BigramDictionary() + { + } + + public static readonly char WORD_SEGMENT_CHAR = '@'; + + private static BigramDictionary singleInstance; + + public static readonly int PRIME_BIGRAM_LENGTH = 402137; + + /// <summary> + /// The word associations are stored as FNV1 hashcodes, which have a small probability of collision, but save memory. + /// </summary> + private long[] bigramHashTable; + + private int[] frequencyTable; + + private int max = 0; + + private int repeat = 0; + + // static Logger log = Logger.getLogger(BigramDictionary.class); + + private static object syncLock = new object(); + + public static BigramDictionary GetInstance() + { + lock (syncLock) + { + if (singleInstance == null) + { + singleInstance = new BigramDictionary(); + + // LUCENENET specific + // LUCENE-1817: https://issues.apache.org/jira/browse/LUCENE-1817 + // This issue still existed as of 4.8.0. Here is the fix - we only + // load from a directory if the actual directory exists (AnalyzerProfile + // ensures it is an empty string if it is not available). + string dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR; + if (string.IsNullOrEmpty(dictRoot)) + { + singleInstance.Load(); + } + else + { + singleInstance.Load(dictRoot); + } + + + //try + //{ + // singleInstance.Load(); + //} + //catch (IOException e) + //{ + // string dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR; + // singleInstance.Load(dictRoot); + //} + //catch (TypeLoadException e) + //{ + // throw new Exception(e.ToString(), e); + //} + } + return singleInstance; + } + } + + private bool LoadFromObj(FileInfo serialObj) + { + try + { + using (Stream input = new FileStream(serialObj.FullName, FileMode.Open, FileAccess.Read)) + LoadFromInputStream(input); + return true; + } + catch (Exception e) + { + throw new Exception(e.ToString(), e); + } + } + + // LUCENENET conversion note: + // The data in Lucene is stored in a proprietary binary format (similar to + // .NET's BinarySerializer) that cannot be read back in .NET. Therefore, the + // data was extracted using Java's DataOutputStream using the following Java code. + // It can then be read in using the LoadFromInputStream method below + // (using a DataInputStream instead of a BinaryReader), and saved + // in the correct (BinaryWriter) format by calling the SaveToObj method. + // Alternatively, the data can be loaded from disk using the files + // here(https://issues.apache.org/jira/browse/LUCENE-1629) in the analysis.data.zip file, + // which will automatically produce the .mem files. + + //public void saveToOutputStream(java.io.DataOutputStream stream) throws IOException + //{ + // // save wordIndexTable + // int wiLen = wordIndexTable.length; + // stream.writeInt(wiLen); + // for (int i = 0; i<wiLen; i++) + // { + // stream.writeShort(wordIndexTable[i]); + // } + + // // save charIndexTable + // int ciLen = charIndexTable.length; + // stream.writeInt(ciLen); + // for (int i = 0; i<ciLen; i++) + // { + // stream.writeChar(charIndexTable[i]); + // } + + // int caDim1 = wordItem_charArrayTable == null ? -1 : wordItem_charArrayTable.length; + // stream.writeInt(caDim1); + // for (int i = 0; i<caDim1; i++) + // { + // int caDim2 = wordItem_charArrayTable[i] == null ? -1 : wordItem_charArrayTable[i].length; + // stream.writeInt(caDim2); + // for (int j = 0; j<caDim2; j++) + // { + // int caDim3 = wordItem_charArrayTable[i][j] == null ? 
-1 : wordItem_charArrayTable[i][j].length; + // stream.writeInt(caDim3); + // for (int k = 0; k<caDim3; k++) + // { + // stream.writeChar(wordItem_charArrayTable[i][j][k]); + // } + // } + // } + + // int fDim1 = wordItem_frequencyTable == null ? -1 : wordItem_frequencyTable.length; + // stream.writeInt(fDim1); + // for (int i = 0; i<fDim1; i++) + // { + // int fDim2 = wordItem_frequencyTable[i] == null ? -1 : wordItem_frequencyTable[i].length; + // stream.writeInt(fDim2); + // for (int j = 0; j<fDim2; j++) + // { + // stream.writeInt(wordItem_frequencyTable[i][j]); + // } + // } + //} + + private void LoadFromInputStream(Stream serialObjectInputStream) + { + //ObjectInputStream input = new ObjectInputStream(serialObjectInputStream); + //bigramHashTable = (long[])input.readObject(); + //frequencyTable = (int[])input.readObject(); + //// log.info("load bigram dict from serialization."); + //input.close(); + + using (var reader = new BinaryReader(serialObjectInputStream)) + //using (var reader = new DataInputStream(serialObjectInputStream)) + { + // Read bigramHashTable + int bhLen = reader.ReadInt32(); + bigramHashTable = new long[bhLen]; + for (int i = 0; i < bhLen; i++) + { + bigramHashTable[i] = reader.ReadInt64(); + } + + // Read frequencyTable + int fLen = reader.ReadInt32(); + frequencyTable = new int[fLen]; + for (int i = 0; i < fLen; i++) + { + frequencyTable[i] = reader.ReadInt32(); + } + } + + // log.info("load bigram dict from serialization."); + } + + private void SaveToObj(FileInfo serialObj) + { + try + { + //ObjectOutputStream output = new ObjectOutputStream(new FileStream( + // serialObj.FullName, FileMode.Create, FileAccess.Write)); + //output.writeObject(bigramHashTable); + //output.writeObject(frequencyTable); + //output.close(); + + using (Stream output = new FileStream(serialObj.FullName, FileMode.Create, FileAccess.Write)) + { + using (BinaryWriter writer = new BinaryWriter(output)) + { + int bhLen = bigramHashTable.Length; + writer.Write(bhLen); + for (int i = 0; i < bhLen; i++) + { + writer.Write(bigramHashTable[i]); + } + + int fLen = frequencyTable.Length; + writer.Write(fLen); + for (int i = 0; i < fLen; i++) + { + writer.Write(frequencyTable[i]); + } + } + } + // log.info("serialize bigram dict."); + } +#pragma warning disable 168 + catch (Exception e) +#pragma warning restore 168 + { + // log.warn(e.getMessage()); + } + } + + private void Load() + { + using (Stream input = this.GetType().GetTypeInfo().Assembly.FindAndGetManifestResourceStream(this.GetType(), "bigramdict.mem")) + { + LoadFromInputStream(input); + } + } + + private void Load(string dictRoot) + { + string bigramDictPath = System.IO.Path.Combine(dictRoot, "bigramdict.dct"); + + FileInfo serialObj = new FileInfo(System.IO.Path.Combine(dictRoot, "bigramdict.mem")); + + if (serialObj.Exists && LoadFromObj(serialObj)) + { + + } + else + { + try + { + bigramHashTable = new long[PRIME_BIGRAM_LENGTH]; + frequencyTable = new int[PRIME_BIGRAM_LENGTH]; + for (int i = 0; i < PRIME_BIGRAM_LENGTH; i++) + { + // it is possible for a value to hash to 0, but the probability is extremely low + bigramHashTable[i] = 0; + frequencyTable[i] = 0; + } + LoadFromFile(bigramDictPath); + } + catch (IOException e) + { + throw new Exception(e.ToString(), e); + } + SaveToObj(serialObj); + } + } + + /// <summary> + /// Load the data file into this <see cref="BigramDictionary"/> + /// </summary> + /// <param name="dctFilePath">path to the bigram dictionary file (bigramdict.dct)</param> + /// <exception cref="IOException">If 
there is a low-level I/O error</exception> + public virtual void LoadFromFile(string dctFilePath) + { + int i, cnt, length, total = 0; + // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + // The 3756th is used (as a header) to store information. + int[] + buffer = new int[3]; + byte[] intBuffer = new byte[4]; + string tmpword; + //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r")) + using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + { + + // GB2312 characters 0 - 6768 + for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + { + string currentStr = GetCCByGB2312Id(i); + // if (i == 5231) + // System.out.println(i); + + dctFile.Read(intBuffer, 0, intBuffer.Length); + // the dictionary was developed for C, and byte order must be converted to work with Java + cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN).GetInt32(); + if (cnt <= 0) + { + continue; + } + total += cnt; + int j = 0; + while (j < cnt) + { + dctFile.Read(intBuffer, 0, intBuffer.Length); + buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN) + .GetInt32();// frequency + dctFile.Read(intBuffer, 0, intBuffer.Length); + buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN) + .GetInt32();// length + dctFile.Read(intBuffer, 0, intBuffer.Length); + // buffer[2] = ByteBuffer.wrap(intBuffer).order( + // ByteOrder.LITTLE_ENDIAN).getInt();// handle + + length = buffer[1]; + if (length > 0) + { + byte[] lchBuffer = new byte[length]; + dctFile.Read(lchBuffer, 0, lchBuffer.Length); + //tmpword = new String(lchBuffer, "GB2312"); + tmpword = Encoding.GetEncoding("GB2312").GetString(lchBuffer); + //tmpword = Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer); + if (i != 3755 + GB2312_FIRST_CHAR) + { + tmpword = currentStr + tmpword; + } + char[] carray = tmpword.ToCharArray(); + long hashId = Hash1(carray); + int index = GetAvaliableIndex(hashId, carray); + if (index != -1) + { + if (bigramHashTable[index] == 0) + { + bigramHashTable[index] = hashId; + // bigramStringTable[index] = tmpword; + } + frequencyTable[index] += buffer[0]; + } + } + j++; + } + } + } + // log.info("load dictionary done! " + dctFilePath + " total:" + total); + } + + private int GetAvaliableIndex(long hashId, char[] carray) + { + int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH); + int hash2 = Hash2(carray) % PRIME_BIGRAM_LENGTH; + if (hash1 < 0) + hash1 = PRIME_BIGRAM_LENGTH + hash1; + if (hash2 < 0) + hash2 = PRIME_BIGRAM_LENGTH + hash2; + int index = hash1; + int i = 1; + while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId + && i < PRIME_BIGRAM_LENGTH) + { + index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH; + i++; + } + // System.out.println(i - 1); + + if (i < PRIME_BIGRAM_LENGTH + && (bigramHashTable[index] == 0 || bigramHashTable[index] == hashId)) + { + return index; + } + else + return -1; + } + + /// <summary> + /// lookup the index into the frequency array. 
+ /// </summary> + private int GetBigramItemIndex(char[] carray) + { + long hashId = Hash1(carray); + int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH); + int hash2 = Hash2(carray) % PRIME_BIGRAM_LENGTH; + if (hash1 < 0) + hash1 = PRIME_BIGRAM_LENGTH + hash1; + if (hash2 < 0) + hash2 = PRIME_BIGRAM_LENGTH + hash2; + int index = hash1; + int i = 1; + repeat++; + while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId + && i < PRIME_BIGRAM_LENGTH) + { + index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH; + i++; + repeat++; + if (i > max) + max = i; + } + // System.out.println(i - 1); + + if (i < PRIME_BIGRAM_LENGTH && bigramHashTable[index] == hashId) + { + return index; + } + else + return -1; + } + + public int GetFrequency(char[] carray) + { + int index = GetBigramItemIndex(carray); + if (index != -1) + return frequencyTable[index]; + return 0; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/HHMMSegmenter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/HHMMSegmenter.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/HHMMSegmenter.cs new file mode 100644 index 0000000..5d6ee55 --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/HHMM/HHMMSegmenter.cs @@ -0,0 +1,252 @@ +using System.Collections.Generic; +using System.Text; + +namespace Lucene.Net.Analysis.Cn.Smart.HHMM +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Finds the optimal segmentation of a sentence into Chinese words + /// <para/> + /// @lucene.experimental + /// </summary> + public class HHMMSegmenter + { + private static WordDictionary wordDict = WordDictionary.GetInstance(); + + /// <summary> + /// Create the <see cref="SegGraph"/> for a sentence. + /// </summary> + /// <param name="sentence">input sentence, without start and end markers</param> + /// <returns><see cref="SegGraph"/> corresponding to the input sentence.</returns> + private SegGraph CreateSegGraph(string sentence) + { + int i = 0, j; + int length = sentence.Length; + int foundIndex; + CharType[] charTypeArray = GetCharTypes(sentence); + StringBuilder wordBuf = new StringBuilder(); + SegToken token; + int frequency = 0; // the number of times word appears. 
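+ // The switch below makes one left-to-right pass over the sentence: a HANZI run emits a candidate token for every dictionary prefix match, letter/digit runs collapse into a single generic token, and each delimiter becomes its own token.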
+ bool hasFullWidth; + WordType wordType; + char[] charArray; + + SegGraph segGraph = new SegGraph(); + while (i < length) + { + hasFullWidth = false; + switch (charTypeArray[i]) + { + case CharType.SPACE_LIKE: + i++; + break; + case CharType.HANZI: + j = i + 1; + //wordBuf.delete(0, wordBuf.length()); + wordBuf.Remove(0, wordBuf.Length); + // It doesn't matter if a single Chinese character (Hanzi) can form a phrase or not, + // it will store that single Chinese character (Hanzi) in the SegGraph. Otherwise, it will + // cause word division. + wordBuf.Append(sentence[i]); + charArray = new char[] { sentence[i] }; + frequency = wordDict.GetFrequency(charArray); + token = new SegToken(charArray, i, j, WordType.CHINESE_WORD, + frequency); + segGraph.AddToken(token); + + foundIndex = wordDict.GetPrefixMatch(charArray); + while (j <= length && foundIndex != -1) + { + if (wordDict.IsEqual(charArray, foundIndex) && charArray.Length > 1) + { + // It is the phrase we are looking for; In other words, we have found a phrase SegToken + // from i to j. It is not a monosyllabic word (single word). + frequency = wordDict.GetFrequency(charArray); + token = new SegToken(charArray, i, j, WordType.CHINESE_WORD, + frequency); + segGraph.AddToken(token); + } + + while (j < length && charTypeArray[j] == CharType.SPACE_LIKE) + j++; + + if (j < length && charTypeArray[j] == CharType.HANZI) + { + wordBuf.Append(sentence[j]); + charArray = new char[wordBuf.Length]; + //wordBuf.GetChars(0, charArray.Length, charArray, 0); + wordBuf.CopyTo(0, charArray, 0, charArray.Length); + // idArray has been found (foundWordIndex!=-1) as a prefix before. + // Therefore, idArray after it has been lengthened can only appear after foundWordIndex. + // So start searching after foundWordIndex. + foundIndex = wordDict.GetPrefixMatch(charArray, foundIndex); + j++; + } + else + { + break; + } + } + i++; + break; + case CharType.FULLWIDTH_LETTER: + hasFullWidth = true; /* intentional fallthrough */ + + j = i + 1; + while (j < length + && (charTypeArray[j] == CharType.LETTER || charTypeArray[j] == CharType.FULLWIDTH_LETTER)) + { + if (charTypeArray[j] == CharType.FULLWIDTH_LETTER) + hasFullWidth = true; + j++; + } + // Found a Token from i to j. Type is LETTER char string. + charArray = Utility.STRING_CHAR_ARRAY; + frequency = wordDict.GetFrequency(charArray); + wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING; + token = new SegToken(charArray, i, j, wordType, frequency); + segGraph.AddToken(token); + i = j; + break; + + case CharType.LETTER: + j = i + 1; + while (j < length + && (charTypeArray[j] == CharType.LETTER || charTypeArray[j] == CharType.FULLWIDTH_LETTER)) + { + if (charTypeArray[j] == CharType.FULLWIDTH_LETTER) + hasFullWidth = true; + j++; + } + // Found a Token from i to j. Type is LETTER char string. + charArray = Utility.STRING_CHAR_ARRAY; + frequency = wordDict.GetFrequency(charArray); + wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING; + token = new SegToken(charArray, i, j, wordType, frequency); + segGraph.AddToken(token); + i = j; + break; + case CharType.FULLWIDTH_DIGIT: + hasFullWidth = true; /* intentional fallthrough */ + + j = i + 1; + while (j < length + && (charTypeArray[j] == CharType.DIGIT || charTypeArray[j] == CharType.FULLWIDTH_DIGIT)) + { + if (charTypeArray[j] == CharType.FULLWIDTH_DIGIT) + hasFullWidth = true; + j++; + } + // Found a Token from i to j. Type is NUMBER char string. 
+ charArray = Utility.NUMBER_CHAR_ARRAY; + frequency = wordDict.GetFrequency(charArray); + wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER; + token = new SegToken(charArray, i, j, wordType, frequency); + segGraph.AddToken(token); + i = j; + break; + + case CharType.DIGIT: + j = i + 1; + while (j < length + && (charTypeArray[j] == CharType.DIGIT || charTypeArray[j] == CharType.FULLWIDTH_DIGIT)) + { + if (charTypeArray[j] == CharType.FULLWIDTH_DIGIT) + hasFullWidth = true; + j++; + } + // Found a Token from i to j. Type is NUMBER char string. + charArray = Utility.NUMBER_CHAR_ARRAY; + frequency = wordDict.GetFrequency(charArray); + wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER; + token = new SegToken(charArray, i, j, wordType, frequency); + segGraph.AddToken(token); + i = j; + break; + case CharType.DELIMITER: + j = i + 1; + // No need to search the weight for the punctuation. Picking the highest frequency will work. + frequency = Utility.MAX_FREQUENCE; + charArray = new char[] { sentence[i] }; + token = new SegToken(charArray, i, j, WordType.DELIMITER, frequency); + segGraph.AddToken(token); + i = j; + break; + default: + j = i + 1; + // Treat the unrecognized char symbol as unknown string. + // For example, any symbol not in GB2312 is treated as one of these. + charArray = Utility.STRING_CHAR_ARRAY; + frequency = wordDict.GetFrequency(charArray); + token = new SegToken(charArray, i, j, WordType.STRING, frequency); + segGraph.AddToken(token); + i = j; + break; + } + } + + // Add two more Tokens: "beginning xx beginning" + charArray = Utility.START_CHAR_ARRAY; + frequency = wordDict.GetFrequency(charArray); + token = new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency); + segGraph.AddToken(token); + + // "end xx end" + charArray = Utility.END_CHAR_ARRAY; + frequency = wordDict.GetFrequency(charArray); + token = new SegToken(charArray, length, length + 1, WordType.SENTENCE_END, + frequency); + segGraph.AddToken(token); + + return segGraph; + } + + /// <summary> + /// Get the character types for every character in a sentence. 
+ /// </summary> + /// <param name="sentence">input sentence</param> + /// <returns>array of character types corresponding to character positions in the sentence</returns> + /// <seealso cref="Utility.GetCharType(char)"/> + private static CharType[] GetCharTypes(string sentence) + { + int length = sentence.Length; + CharType[] charTypeArray = new CharType[length]; + // the type of each character by position + for (int i = 0; i < length; i++) + { + charTypeArray[i] = Utility.GetCharType(sentence[i]); + } + + return charTypeArray; + } + + /// <summary> + /// Return a list of <see cref="SegToken"/> representing the best segmentation of a sentence + /// </summary> + /// <param name="sentence">input sentence</param> + /// <returns>best segmentation as a <see cref="T:IList{SegToken}"/></returns> + public virtual IList<SegToken> Process(string sentence) + { + SegGraph segGraph = CreateSegGraph(sentence); + BiSegGraph biSegGraph = new BiSegGraph(segGraph); + IList<SegToken> shortPath = biSegGraph.GetShortPath(); + return shortPath; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/PathNode.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/PathNode.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/PathNode.cs new file mode 100644 index 0000000..11387ad --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/HHMM/PathNode.cs @@ -0,0 +1,80 @@ +using Lucene.Net.Support; +using System; + +namespace Lucene.Net.Analysis.Cn.Smart.HHMM +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// SmartChineseAnalyzer internal node representation + /// <para> + /// Used by <see cref="BiSegGraph"/> to maximize the segmentation with the Viterbi algorithm. 
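+ /// Each node records the cumulative path weight and the index of its predecessor (PreNode), from which the final path is reconstructed.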
+    /// </para>
+    /// @lucene.experimental
+    /// </summary>
+    internal class PathNode : IComparable<PathNode>
+    {
+        public double Weight { get; set; }
+
+        public int PreNode { get; set; }
+
+        public virtual int CompareTo(PathNode pn)
+        {
+            if (Weight < pn.Weight)
+                return -1;
+            else if (Weight == pn.Weight)
+                return 0;
+            else
+                return 1;
+        }
+
+        /// <summary>
+        /// <see cref="object.GetHashCode()"/>
+        /// </summary>
+        public override int GetHashCode()
+        {
+            int prime = 31;
+            int result = 1;
+            result = prime * result + PreNode;
+            long temp = Number.DoubleToInt64Bits(Weight);
+            // Fold the high 32 bits of the double's bit image into the hash,
+            // like Java's (int)(temp ^ (temp >>> 32)). Casting through uint
+            // would shift a 32-bit value by 32 (a no-op in C#) and lose the
+            // high bits entirely.
+            result = prime * result + (int)(temp ^ (long)((ulong)temp >> 32));
+            return result;
+        }
+
+        /// <summary>
+        /// <see cref="object.Equals(object)"/>
+        /// </summary>
+        public override bool Equals(object obj)
+        {
+            if (this == obj)
+                return true;
+            if (obj == null)
+                return false;
+            if (GetType() != obj.GetType())
+                return false;
+            PathNode other = (PathNode)obj;
+            if (PreNode != other.PreNode)
+                return false;
+            if (Number.DoubleToInt64Bits(Weight) != Number.DoubleToInt64Bits(other.Weight))
+                return false;
+            return true;
+        }
+    }
+}
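[Editor's note: the shift fix above matters because Java's >>> operator has no direct C# counterpart for signed types. A tiny self-contained illustration of the mixing step, using the BCL's BitConverter.DoubleToInt64Bits in place of the Number helper; the values are chosen arbitrarily:]

    using System;

    internal static class HashMixDemo
    {
        // Fold a double's 64-bit pattern into 32 bits, mirroring
        // Java's (int)(temp ^ (temp >>> 32)).
        private static int MixDouble(double value)
        {
            long temp = BitConverter.DoubleToInt64Bits(value);
            return (int)(temp ^ (long)((ulong)temp >> 32));
        }

        private static void Main()
        {
            Console.WriteLine(MixDouble(12.5));
            Console.WriteLine(MixDouble(-12.5)); // differs: the sign bit lives in the high word
        }
    }

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/SegGraph.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/SegGraph.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegGraph.cs
new file mode 100644
index 0000000..e0138c1
--- /dev/null
+++ b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegGraph.cs
@@ -0,0 +1,160 @@
+using System.Collections.Generic;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Cn.Smart.HHMM
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements. See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License. You may obtain a copy of the License at
+     *
+     * http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Graph representing possible tokens at each start offset in the sentence.
+    /// <para>
+    /// For each start offset, a list of possible tokens is stored.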
+ /// </para> + /// @lucene.experimental + /// </summary> + internal class SegGraph + { + /// <summary> + /// Map of start offsets to <see cref="T:IList{SegToken}"/> of tokens at that position + /// </summary> + private IDictionary<int, IList<SegToken>> tokenListTable = new Dictionary<int, IList<SegToken>>(); + + private int maxStart = -1; + + /// <summary> + /// Returns <c>true</c> if a mapping for the specified start offset exists + /// </summary> + /// <param name="s">startOffset</param> + /// <returns><c>true</c> if there are tokens for the startOffset</returns> + public virtual bool IsStartExist(int s) + { + //return tokenListTable.get(s) != null; + IList<SegToken> result; + return tokenListTable.TryGetValue(s, out result) && result != null; + } + + /// <summary> + /// Get the list of tokens at the specified start offset + /// </summary> + /// <param name="s">startOffset</param> + /// <returns><see cref="T:IList{SegToken}"/> of tokens at the specified start offset.</returns> + public virtual IList<SegToken> GetStartList(int s) + { + IList<SegToken> result; + tokenListTable.TryGetValue(s, out result); + return result; + } + + /// <summary> + /// Get the highest start offset in the map. Returns maximum start offset, or -1 if the map is empty. + /// </summary> + public virtual int MaxStart + { + get { return maxStart; } + } + + /// <summary> + /// Set the <see cref="SegToken.Index"/> for each token, based upon its order by startOffset. + /// </summary> + /// <returns>a <see cref="T:IList{SegToken}"/> of these ordered tokens.</returns> + public virtual IList<SegToken> MakeIndex() + { + IList<SegToken> result = new List<SegToken>(); + int s = -1, count = 0, size = tokenListTable.Count; + IList<SegToken> tokenList; + int index = 0; + while (count < size) + { + if (IsStartExist(s)) + { + tokenList = tokenListTable[s]; + foreach (SegToken st in tokenList) + { + st.Index = index; + result.Add(st); + index++; + } + count++; + } + s++; + } + return result; + } + + /// <summary> + /// Add a <see cref="SegToken"/> to the mapping, creating a new mapping at the token's startOffset if one does not exist. + /// </summary> + /// <param name="token">token <see cref="SegToken"/>.</param> + public virtual void AddToken(SegToken token) + { + int s = token.StartOffset; + if (!IsStartExist(s)) + { + List<SegToken> newlist = new List<SegToken>(); + newlist.Add(token); + tokenListTable[s] = newlist; + } + else + { + IList<SegToken> tokenList = tokenListTable[s]; + tokenList.Add(token); + } + if (s > maxStart) + { + maxStart = s; + } + } + + /// <summary> + /// Return a <see cref="T:IList{SegToken}"/> of all tokens in the map, ordered by startOffset. 
+ /// </summary> + /// <returns><see cref="T:IList{SegToken}"/> of all tokens in the map.</returns> + public virtual IList<SegToken> ToTokenList() + { + IList<SegToken> result = new List<SegToken>(); + int s = -1, count = 0, size = tokenListTable.Count; + IList<SegToken> tokenList; + + while (count < size) + { + if (IsStartExist(s)) + { + tokenList = tokenListTable[s]; + foreach (SegToken st in tokenList) + { + result.Add(st); + } + count++; + } + s++; + } + return result; + } + + public override string ToString() + { + IList<SegToken> tokenList = this.ToTokenList(); + StringBuilder sb = new StringBuilder(); + foreach (SegToken t in tokenList) + { + sb.Append(t + "\n"); + } + return sb.ToString(); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/SegToken.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/SegToken.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegToken.cs new file mode 100644 index 0000000..48ba8ce --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegToken.cs @@ -0,0 +1,123 @@ +using Lucene.Net.Support; + +namespace Lucene.Net.Analysis.Cn.Smart.HHMM +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// SmartChineseAnalyzer internal token + /// <para/> + /// @lucene.experimental + /// </summary> + public class SegToken + { + /// <summary> + /// Character array containing token text + /// </summary> + [WritableArray] + public char[] CharArray { get; set; } + + /// <summary> + /// start offset into original sentence + /// </summary> + public int StartOffset { get; set; } + + /// <summary> + /// end offset into original sentence + /// </summary> + public int EndOffset { get; set; } + + /// <summary> + /// <see cref="Smart.WordType"/> of the text + /// </summary> + public WordType WordType { get; set; } + + /// <summary> + /// word frequency + /// </summary> + public int Weight { get; set; } + + /// <summary> + /// during segmentation, this is used to store the index of the token in the token list table + /// </summary> + public int Index { get; set; } + + /// <summary> + /// Create a new <see cref="SegToken"/> from a character array. 
+ /// </summary> + /// <param name="idArray">character array containing text</param> + /// <param name="start">start offset of <see cref="SegToken"/> in original sentence</param> + /// <param name="end">end offset of <see cref="SegToken"/> in original sentence</param> + /// <param name="wordType"><see cref="Smart.WordType"/> of the text</param> + /// <param name="weight">word frequency</param> + public SegToken(char[] idArray, int start, int end, WordType wordType, int weight) + { + this.CharArray = idArray; + this.StartOffset = start; + this.EndOffset = end; + this.WordType = wordType; + this.Weight = weight; + } + + /// <summary> + /// <see cref="object.GetHashCode()"/> + /// </summary> + public override int GetHashCode() + { + int prime = 31; + int result = 1; + for (int i = 0; i < CharArray.Length; i++) + { + result = prime * result + CharArray[i]; + } + result = prime * result + EndOffset; + result = prime * result + Index; + result = prime * result + StartOffset; + result = prime * result + Weight; + result = prime * result + (int)WordType; + return result; + } + + /// <summary> + /// <see cref="object.Equals(object)"/> + /// </summary> + public override bool Equals(object obj) + { + if (this == obj) + return true; + if (obj == null) + return false; + if (GetType() != obj.GetType()) + return false; + SegToken other = (SegToken)obj; + if (!Arrays.Equals(CharArray, other.CharArray)) + return false; + if (EndOffset != other.EndOffset) + return false; + if (Index != other.Index) + return false; + if (StartOffset != other.StartOffset) + return false; + if (Weight != other.Weight) + return false; + if (WordType != other.WordType) + return false; + return true; + } + } +}
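[Editor's note: to show how SegToken and SegGraph (above) cooperate, here is a hedged sketch that builds a tiny lattice by hand. SegGraph is internal to the assembly, so this only compiles from inside it (or from a test assembly granted InternalsVisibleTo); the characters, offsets, and weights are invented for illustration:]

    using System;
    using System.Collections.Generic;
    using Lucene.Net.Analysis.Cn.Smart;
    using Lucene.Net.Analysis.Cn.Smart.HHMM;

    internal static class SegGraphDemo
    {
        internal static void Run()
        {
            SegGraph graph = new SegGraph();

            // Two candidates starting at offset 0 and one at offset 1,
            // with made-up weights standing in for dictionary frequencies.
            graph.AddToken(new SegToken("中".ToCharArray(), 0, 1, WordType.CHINESE_WORD, 10));
            graph.AddToken(new SegToken("中国".ToCharArray(), 0, 2, WordType.CHINESE_WORD, 50));
            graph.AddToken(new SegToken("国".ToCharArray(), 1, 2, WordType.CHINESE_WORD, 10));

            // MakeIndex() assigns each token its position in startOffset order
            // and returns the flattened list.
            foreach (SegToken token in graph.MakeIndex())
            {
                Console.WriteLine("#{0} [{1},{2}) {3}",
                    token.Index, token.StartOffset, token.EndOffset, new string(token.CharArray));
            }
        }
    }

Note that SegToken overrides Equals/GetHashCode with full value semantics: every field, including Index and Weight, participates, and the hash computation above mirrors the equality check field by field.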
