Ported Lucene.Net.Analysis.SmartCn + tests
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/2f5d89b4 Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/2f5d89b4 Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/2f5d89b4 Branch: refs/heads/master Commit: 2f5d89b4ae979d376f2ada22b2fb2e775b6e1608 Parents: 468199e Author: Shad Storhaug <[email protected]> Authored: Sun Jun 25 01:26:30 2017 +0700 Committer: Shad Storhaug <[email protected]> Committed: Sun Jun 25 02:28:26 2017 +0700 ---------------------------------------------------------------------- Lucene.Net.Portable.sln | 20 + Lucene.Net.sln | 52 ++ .../AnalyzerProfile.cs | 183 +++++ src/Lucene.Net.Analysis.SmartCn/CharType.cs | 67 ++ .../HHMM/AbstractDictionary.cs | 224 ++++++ .../HHMM/BiSegGraph.cs | 256 ++++++ .../HHMM/BigramDictionary.cs | 431 ++++++++++ .../HHMM/HHMMSegmenter.cs | 252 ++++++ .../HHMM/PathNode.cs | 80 ++ .../HHMM/SegGraph.cs | 160 ++++ .../HHMM/SegToken.cs | 123 +++ .../HHMM/SegTokenFilter.cs | 75 ++ .../HHMM/SegTokenPair.cs | 95 +++ .../HHMM/WordDictionary.cs | 778 +++++++++++++++++++ .../HHMM/bigramdict.mem | Bin 0 -> 4825652 bytes .../HHMM/coredict.mem | Bin 0 -> 1178248 bytes .../HMMChineseTokenizer.cs | 94 +++ .../HMMChineseTokenizerFactory.cs | 56 ++ .../Lucene.Net.Analysis.SmartCn.csproj | 124 +++ .../Lucene.Net.Analysis.SmartCn.project.json | 11 + .../Lucene.Net.Analysis.SmartCn.xproj | 40 + .../Properties/AssemblyInfo.cs | 42 + .../SentenceTokenizer.cs | 142 ++++ .../SmartChineseAnalyzer.cs | 171 ++++ .../SmartChineseSentenceTokenizerFactory.cs | 52 ++ .../SmartChineseWordTokenFilterFactory.cs | 55 ++ src/Lucene.Net.Analysis.SmartCn/Utility.cs | 196 +++++ .../WordSegmenter.cs | 89 +++ .../WordTokenFilter.cs | 114 +++ src/Lucene.Net.Analysis.SmartCn/WordType.cs | 67 ++ src/Lucene.Net.Analysis.SmartCn/project.json | 53 ++ src/Lucene.Net.Analysis.SmartCn/stopwords.txt | 59 ++ .../Lucene.Net.Tests.Analysis.SmartCn.csproj | 105 +++ ...cene.Net.Tests.Analysis.SmartCn.project.json | 11 + .../Lucene.Net.Tests.Analysis.SmartCn.xproj | 42 + .../Properties/AssemblyInfo.cs | 37 + .../Support/TestApiConsistency.cs | 148 ++++ .../Support/TestExceptionSerialization.cs | 54 ++ .../TestHMMChineseTokenizerFactory.cs | 72 ++ .../TestSmartChineseAnalyzer.cs | 354 +++++++++ .../TestSmartChineseFactories.cs | 98 +++ .../project.json | 45 ++ 42 files changed, 5127 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/Lucene.Net.Portable.sln ---------------------------------------------------------------------- diff --git a/Lucene.Net.Portable.sln b/Lucene.Net.Portable.sln index 9a49572..d3678ee 100644 --- a/Lucene.Net.Portable.sln +++ b/Lucene.Net.Portable.sln @@ -85,6 +85,10 @@ Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "Lucene.Net.ICU", "src\Lucen EndProject Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "Lucene.Net.Tests.ICU", "src\Lucene.Net.Tests.ICU\Lucene.Net.Tests.ICU.xproj", "{32FD3471-E862-4055-B969-79C12A656366}" EndProject +Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "Lucene.Net.Analysis.SmartCn", "src\Lucene.Net.Analysis.SmartCn\Lucene.Net.Analysis.SmartCn.xproj", "{A400916E-DCB8-4A16-BE83-91891C05191F}" +EndProject +Project("{8BB2217D-0F2D-49D1-97BC-3654ED321F3B}") = "Lucene.Net.Tests.Analysis.SmartCn", "src\Lucene.Net.Tests.Analysis.SmartCn\Lucene.Net.Tests.Analysis.SmartCn.xproj", "{2870FB52-1239-493F-A0BE-951660194A66}" +EndProject Global 
GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -389,6 +393,22 @@ Global {32FD3471-E862-4055-B969-79C12A656366}.Release|Any CPU.Build.0 = Release|Any CPU {32FD3471-E862-4055-B969-79C12A656366}.Release|x86.ActiveCfg = Release|Any CPU {32FD3471-E862-4055-B969-79C12A656366}.Release|x86.Build.0 = Release|Any CPU + {2870FB52-1239-493F-A0BE-951660194A66}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {2870FB52-1239-493F-A0BE-951660194A66}.Debug|Any CPU.Build.0 = Debug|Any CPU + {2870FB52-1239-493F-A0BE-951660194A66}.Debug|x86.ActiveCfg = Debug|Any CPU + {2870FB52-1239-493F-A0BE-951660194A66}.Debug|x86.Build.0 = Debug|Any CPU + {2870FB52-1239-493F-A0BE-951660194A66}.Release|Any CPU.ActiveCfg = Release|Any CPU + {2870FB52-1239-493F-A0BE-951660194A66}.Release|Any CPU.Build.0 = Release|Any CPU + {2870FB52-1239-493F-A0BE-951660194A66}.Release|x86.ActiveCfg = Release|Any CPU + {2870FB52-1239-493F-A0BE-951660194A66}.Release|x86.Build.0 = Release|Any CPU + {A400916E-DCB8-4A16-BE83-91891C05191F}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {A400916E-DCB8-4A16-BE83-91891C05191F}.Debug|Any CPU.Build.0 = Debug|Any CPU + {A400916E-DCB8-4A16-BE83-91891C05191F}.Debug|x86.ActiveCfg = Debug|Any CPU + {A400916E-DCB8-4A16-BE83-91891C05191F}.Debug|x86.Build.0 = Debug|Any CPU + {A400916E-DCB8-4A16-BE83-91891C05191F}.Release|Any CPU.ActiveCfg = Release|Any CPU + {A400916E-DCB8-4A16-BE83-91891C05191F}.Release|Any CPU.Build.0 = Release|Any CPU + {A400916E-DCB8-4A16-BE83-91891C05191F}.Release|x86.ActiveCfg = Release|Any CPU + {A400916E-DCB8-4A16-BE83-91891C05191F}.Release|x86.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/Lucene.Net.sln ---------------------------------------------------------------------- diff --git a/Lucene.Net.sln b/Lucene.Net.sln index 669a57d..be5b2b9 100644 --- a/Lucene.Net.sln +++ b/Lucene.Net.sln @@ -94,6 +94,10 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.ICU", "src\Lucen EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Tests.ICU", "src\Lucene.Net.Tests.ICU\Lucene.Net.Tests.ICU.csproj", "{D5AA1A22-1B28-4DF6-BFDA-02519A189839}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Analysis.SmartCn", "src\Lucene.Net.Analysis.SmartCn\Lucene.Net.Analysis.SmartCn.csproj", "{DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Lucene.Net.Tests.Analysis.SmartCn", "src\Lucene.Net.Tests.Analysis.SmartCn\Lucene.Net.Tests.Analysis.SmartCn.csproj", "{8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|Any CPU = Debug|Any CPU @@ -903,6 +907,54 @@ Global {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release35|Mixed Platforms.Build.0 = Release|Any CPU {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release35|x86.ActiveCfg = Release|Any CPU {D5AA1A22-1B28-4DF6-BFDA-02519A189839}.Release35|x86.Build.0 = Release|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug|Any CPU.Build.0 = Debug|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug|x86.ActiveCfg = Debug|Any CPU + 
{DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug|x86.Build.0 = Debug|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug35|Any CPU.ActiveCfg = Debug|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug35|Any CPU.Build.0 = Debug|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug35|Mixed Platforms.ActiveCfg = Debug|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug35|Mixed Platforms.Build.0 = Debug|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug35|x86.ActiveCfg = Debug|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Debug35|x86.Build.0 = Debug|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release|Any CPU.ActiveCfg = Release|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release|Any CPU.Build.0 = Release|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release|Mixed Platforms.Build.0 = Release|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release|x86.ActiveCfg = Release|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release|x86.Build.0 = Release|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release35|Any CPU.ActiveCfg = Release|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release35|Any CPU.Build.0 = Release|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release35|Mixed Platforms.ActiveCfg = Release|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release35|Mixed Platforms.Build.0 = Release|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release35|x86.ActiveCfg = Release|Any CPU + {DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}.Release35|x86.Build.0 = Release|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug|Any CPU.Build.0 = Debug|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug|Mixed Platforms.ActiveCfg = Debug|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug|Mixed Platforms.Build.0 = Debug|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug|x86.ActiveCfg = Debug|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug|x86.Build.0 = Debug|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug35|Any CPU.ActiveCfg = Debug|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug35|Any CPU.Build.0 = Debug|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug35|Mixed Platforms.ActiveCfg = Debug|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug35|Mixed Platforms.Build.0 = Debug|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug35|x86.ActiveCfg = Debug|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Debug35|x86.Build.0 = Debug|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release|Any CPU.ActiveCfg = Release|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release|Any CPU.Build.0 = Release|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release|Mixed Platforms.ActiveCfg = Release|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release|Mixed Platforms.Build.0 = Release|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release|x86.ActiveCfg = Release|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release|x86.Build.0 = Release|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release35|Any CPU.ActiveCfg = Release|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release35|Any CPU.Build.0 = Release|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release35|Mixed Platforms.ActiveCfg = Release|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release35|Mixed Platforms.Build.0 = Release|Any CPU + {8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release35|x86.ActiveCfg = Release|Any CPU + 
{8C8D78D3-BFFD-4301-953B-FE5350B2AEEB}.Release35|x86.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/AnalyzerProfile.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/AnalyzerProfile.cs b/src/Lucene.Net.Analysis.SmartCn/AnalyzerProfile.cs new file mode 100644 index 0000000..88c6c27 --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/AnalyzerProfile.cs @@ -0,0 +1,183 @@ +using System; +using System.IO; +using System.Security; + +namespace Lucene.Net.Analysis.Cn.Smart +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Manages analysis data configuration for <see cref="SmartChineseAnalyzer"/> + /// <para/> + /// <see cref="SmartChineseAnalyzer"/> has a built-in dictionary and stopword list out of the box. + /// <para/> + /// NOTE: To use an alternative dictionary to the built-in one, put the "bigramdict.dct" and + /// "coredict.dct" files in a subdirectory of your application named "analysis-data". This subdirectory + /// can be placed in any directory up to and including the root directory (if OS permissions allow). + /// To place the files in an alternate location, set an environment variable named "analysis.data.dir" + /// to the directory containing the "bigramdict.dct" and "coredict.dct" files. + /// <para/> + /// The default "bigramdict.dct" and "coredict.dct" files can be found at: + /// <a href="https://issues.apache.org/jira/browse/LUCENE-1629">https://issues.apache.org/jira/browse/LUCENE-1629</a>. + /// <para/> + /// @lucene.experimental + /// </summary> + public class AnalyzerProfile + { + /// <summary> + /// Global variable indicating the configured analysis data directory + /// </summary> + public static string ANALYSIS_DATA_DIR = ""; + + static AnalyzerProfile() + { + Init(); + } + + // LUCENENET specific - changed the logic here to leave the + // ANALYSIS_DATA_DIR an empty string if it is not found. This + // allows us to skip loading files from disk if there are no files + // to load (and fixes LUCENE-1817 that prevents the on-disk files + // from ever being loaded). + private static void Init() + { +#if NETSTANDARD + // Support for GB2312 encoding.
See: https://docs.microsoft.com/en-us/dotnet/api/system.text.codepagesencodingprovider?view=netcore-2.0 + var encodingProvider = System.Text.CodePagesEncodingProvider.Instance; + System.Text.Encoding.RegisterProvider(encodingProvider); +#endif + + string dirName = "analysis-data"; + //string propName = "analysis.properties"; + + // Try the system property: -Danalysis.data.dir=/path/to/analysis-data + //ANALYSIS_DATA_DIR = System.getProperty("analysis.data.dir", ""); + ANALYSIS_DATA_DIR = GetSystemProperty("analysis.data.dir", ""); + if (ANALYSIS_DATA_DIR.Length != 0) + return; + +#if NETSTANDARD + string currentPath = System.AppContext.BaseDirectory; +#else + string currentPath = AppDomain.CurrentDomain.BaseDirectory; +#endif + + //FileInfo[] candidateFiles = new FileInfo[] { new FileInfo(currentPath + "/" + dirName), + // new FileInfo(currentPath + "/bin/" + dirName)/*, new FileInfo("./" + propName), + // new FileInfo("./lib/" + propName)*/ }; + //for (int i = 0; i < candidateFiles.Length; i++) + //{ + // FileInfo file = candidateFiles[i]; + // if (file.Exists) + // { + // ANALYSIS_DATA_DIR = file.FullName; + + // //if (file.isDirectory()) + // //{ + // // ANALYSIS_DATA_DIR = file.getAbsolutePath(); + // //} + // //else if (file.isFile() && GetAnalysisDataDir(file).Length != 0) + // //{ + // // ANALYSIS_DATA_DIR = GetAnalysisDataDir(file); + // //} + // break; + // } + //} + + string candidatePath = System.IO.Path.Combine(currentPath, dirName); + if (Directory.Exists(candidatePath)) + { + ANALYSIS_DATA_DIR = candidatePath; + return; + } + + + try + { + while (new DirectoryInfo(currentPath).Parent != null) + { + candidatePath = System.IO.Path.Combine(new DirectoryInfo(currentPath).Parent.FullName, dirName); + if (Directory.Exists(candidatePath)) + { + ANALYSIS_DATA_DIR = candidatePath; + return; + } + currentPath = new DirectoryInfo(currentPath).Parent.FullName; + } + } + catch (SecurityException) + { + // ignore security errors + } + + + //for (int i = 0; i < candidateDirectories.Count; i++) + //{ + // DirectoryInfo dir = candidateDirectories[i]; + // if (dir.Exists) + // { + // ANALYSIS_DATA_DIR = dir.FullName; + // break; + // } + //} + + //if (ANALYSIS_DATA_DIR.Length == 0) + //{ + // // Dictionary directory cannot be found. + // throw new Exception("WARNING: Can not find lexical dictionary directory!" + // + " This will cause unpredictable exceptions in your application!" + // + " Please refer to the manual to download the dictionaries."); + //} + + } + + //private static string GetAnalysisDataDir(FileInfo propFile) + //{ + // Properties prop = new Properties(); + // try + // { + // string dir; + // using (FileStream input = new FileStream(propFile.FullName, FileMode.Open, FileAccess.Read)) + // { + // prop.load(new StreamReader(input, Encoding.UTF8)); + // dir = prop.getProperty("analysis.data.dir", ""); + // } + // return dir; + // } + // catch (IOException e) + // { + // return ""; + // } + //} + + private static string GetSystemProperty(string property, string defaultValue) + { + string setting; + try + { + setting = Environment.GetEnvironmentVariable(property); + } + catch (SecurityException) + { + setting = null; + } + + return (setting == null) ? 
defaultValue : setting; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/CharType.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/CharType.cs b/src/Lucene.Net.Analysis.SmartCn/CharType.cs new file mode 100644 index 0000000..8360802 --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/CharType.cs @@ -0,0 +1,67 @@ +namespace Lucene.Net.Analysis.Cn.Smart +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Internal <see cref="SmartChineseAnalyzer"/> character type constants. + /// <para/> + /// @lucene.experimental + /// </summary> + public enum CharType + { + /// <summary> + /// Punctuation Characters + /// </summary> + DELIMITER = 0, + + /// <summary> + /// Letters + /// </summary> + LETTER = 1, + + /// <summary> + /// Numeric Digits + /// </summary> + DIGIT = 2, + + /// <summary> + /// Han Ideographs + /// </summary> + HANZI = 3, + + /// <summary> + /// Characters that act as a space + /// </summary> + SPACE_LIKE = 4, + + /// <summary> + /// Full-Width letters + /// </summary> + FULLWIDTH_LETTER = 5, + + /// <summary> + /// Full-Width alphanumeric characters + /// </summary> + FULLWIDTH_DIGIT = 6, + + /// <summary> + /// Other (not fitting any of the other categories) + /// </summary> + OTHER = 7 + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/AbstractDictionary.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/AbstractDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/AbstractDictionary.cs new file mode 100644 index 0000000..efac7d0 --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/HHMM/AbstractDictionary.cs @@ -0,0 +1,224 @@ +using System; +using System.Text; + +namespace Lucene.Net.Analysis.Cn.Smart.HHMM +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + /// <summary> + /// <para> + /// <see cref="SmartChineseAnalyzer"/> abstract dictionary implementation. + /// </para> + /// <para> + /// Contains methods for dealing with GB2312 encoding. + /// </para> + /// @lucene.experimental + /// </summary> + internal abstract class AbstractDictionary + { + /// <summary> + /// First Chinese Character in GB2312 (15 * 94) + /// Characters in GB2312 are arranged in a grid of 94 * 94, 0-14 are unassigned or punctuation. + /// </summary> + public static readonly int GB2312_FIRST_CHAR = 1410; + + /// <summary> + /// Last Chinese Character in GB2312 (87 * 94). + /// Characters in GB2312 are arranged in a grid of 94 * 94, 88-94 are unassigned. + /// </summary> + public static readonly int GB2312_CHAR_NUM = 87 * 94; + + /// <summary> + /// Dictionary data contains 6768 Chinese characters with frequency statistics. + /// </summary> + public static readonly int CHAR_NUM_IN_FILE = 6768; + + // ===================================================== + // code +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +A +B +C +D +E +F + // B0A0 啊 阿 埃 挨 哎 唉 哀 皑 癌 蔼 矮 艾 碍 爱 隘 + // B0B0 鞍 氨 安 俺 按 暗 岸 胺 案 肮 昂 盎 凹 敖 熬 翱 + // B0C0 袄 傲 奥 懊 澳 芭 捌 扒 叭 吧 笆 八 疤 巴 拔 跋 + // B0D0 靶 把 耙 坝 霸 罢 爸 白 柏 百 摆 佰 败 拜 稗 斑 + // B0E0 班 搬 扳 般 颁 板 版 扮 拌 伴 瓣 半 办 绊 邦 帮 + // B0F0 梆 榜 膀 绑 棒 磅 蚌 镑 傍 谤 苞 胞 包 褒 剥 + // ===================================================== + // + // GB2312 character set: + // 01 94 Symbols + // 02 72 Numbers + // 03 94 Latin + // 04 83 Kana + // 05 86 Katakana + // 06 48 Greek + // 07 66 Cyrillic + // 08 63 Phonetic Symbols + // 09 76 Drawing Symbols + // 10-15 Unassigned + // 16-55 3755 Plane 1, in pinyin order + // 56-87 3008 Plane 2, in radical/stroke order + // 88-94 Unassigned + // ====================================================== + + /// <summary> + /// <para> + /// Transcode from GB2312 ID to Unicode + /// </para> + /// <para> + /// GB2312 is divided into a 94 * 94 grid, containing 7445 characters consisting of 6763 Chinese characters and 682 symbols. + /// Some regions are unassigned (reserved). + /// </para> + /// </summary> + /// <param name="ccid">GB2312 id</param> + /// <returns>unicode String</returns> + public virtual string GetCCByGB2312Id(int ccid) + { + if (ccid < 0 || ccid > AbstractDictionary.GB2312_CHAR_NUM) + return ""; + int cc1 = ccid / 94 + 161; + int cc2 = ccid % 94 + 161; + byte[] buffer = new byte[2]; + buffer[0] = (byte)cc1; + buffer[1] = (byte)cc2; + try + { + //String cchar = new String(buffer, "GB2312"); + string cchar = Encoding.GetEncoding("GB2312").GetString(buffer); + return cchar; + } + catch (ArgumentException) // Encoding is not supported by the platform + { + return ""; + } + } + + /// <summary> + /// Transcode from Unicode to GB2312 + /// </summary> + /// <param name="ch">input character in Unicode, or character in Basic Latin range.</param> + /// <returns>position in GB2312</returns> + public virtual short GetGB2312Id(char ch) + { + try + { + //byte[] buffer = Character.ToString(ch).getBytes("GB2312"); + byte[] buffer = Encoding.GetEncoding("GB2312").GetBytes(ch.ToString()); + //byte[] buffer = Encoding.GetEncoding("hz-gb-2312").GetBytes(ch.ToString()); + if (buffer.Length != 2) + { + // Should be a two-byte character + return -1; + } + int b0 = (buffer[0] & 0x0FF) - 161; // Code starts from A1, therefore subtract 0xA1=161 + int b1 = (buffer[1] & 0x0FF) - 161; // There is no Chinese char for the first and last symbol.
+ // Therefore, each code page only has 16*6-2=94 characters. + return (short)(b0 * 94 + b1); + } + catch (ArgumentException e) // Encoding is not supported by the platform + { + throw new Exception(e.ToString(), e); + } + } + + /// <summary> + /// 64-bit FNV-1 Hash Function (the constants below are the 64-bit FNV prime and offset basis) + /// </summary> + /// <param name="c">input character</param> + /// <returns>hashcode</returns> + public virtual long Hash1(char c) + { + long p = 1099511628211L; + long hash = unchecked((long)0xcbf29ce484222325L); + hash = (hash ^ (c & 0x00FF)) * p; + hash = (hash ^ (c >> 8)) * p; + hash += hash << 13; + hash ^= hash >> 7; + hash += hash << 3; + hash ^= hash >> 17; + hash += hash << 5; + return hash; + } + + /// <summary> + /// 64-bit FNV-1 Hash Function + /// </summary> + /// <param name="carray">character array</param> + /// <returns>hashcode</returns> + public virtual long Hash1(char[] carray) + { + long p = 1099511628211L; + long hash = unchecked((long)0xcbf29ce484222325L); + for (int i = 0; i < carray.Length; i++) + { + char d = carray[i]; + hash = (hash ^ (d & 0x00FF)) * p; + hash = (hash ^ (d >> 8)) * p; + } + + // hash += hash << 13; + // hash ^= hash >> 7; + // hash += hash << 3; + // hash ^= hash >> 17; + // hash += hash << 5; + return hash; + } + + /// <summary> + /// djb2 hash algorithm: this algorithm (k=33) was first reported by dan + /// bernstein many years ago in comp.lang.c. another version of this algorithm + /// (now favored by bernstein) uses xor: hash(i) = hash(i - 1) * 33 ^ str[i]; + /// the magic of number 33 (why it works better than many other constants, + /// prime or not) has never been adequately explained. + /// </summary> + /// <param name="c">character</param> + /// <returns>hashcode</returns> + public virtual int Hash2(char c) + { + int hash = 5381; + + /* hash 33 + c */ + hash = ((hash << 5) + hash) + c & 0x00FF; + hash = ((hash << 5) + hash) + c >> 8; + + return hash; + } + + /// <summary> + /// djb2 hash algorithm: this algorithm (k=33) was first reported by dan + /// bernstein many years ago in comp.lang.c. another version of this algorithm + /// (now favored by bernstein) uses xor: hash(i) = hash(i - 1) * 33 ^ str[i]; + /// the magic of number 33 (why it works better than many other constants, + /// prime or not) has never been adequately explained. + /// </summary> + /// <param name="carray">character array</param> + /// <returns>hashcode</returns> + public virtual int Hash2(char[] carray) + { + int hash = 5381; + + /* hash 33 + c */ + for (int i = 0; i < carray.Length; i++) + { + char d = carray[i]; + hash = ((hash << 5) + hash) + d & 0x00FF; + hash = ((hash << 5) + hash) + d >> 8; + } + + return hash; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/BiSegGraph.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/BiSegGraph.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/BiSegGraph.cs new file mode 100644 index 0000000..adeef2a --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/HHMM/BiSegGraph.cs @@ -0,0 +1,256 @@ +using Lucene.Net.Support; +using System; +using System.Collections.Generic; +using System.Text; + +namespace Lucene.Net.Analysis.Cn.Smart.HHMM +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Graph representing possible token pairs (bigrams) at each start offset in the sentence. + /// <para> + /// For each start offset, a list of possible token pairs is stored. + /// </para> + /// @lucene.experimental + /// </summary> + internal class BiSegGraph + { + private IDictionary<int, IList<SegTokenPair>> tokenPairListTable = new Dictionary<int, IList<SegTokenPair>>(); + + private IList<SegToken> segTokenList; + + private static BigramDictionary bigramDict = BigramDictionary.GetInstance(); + + public BiSegGraph(SegGraph segGraph) + { + segTokenList = segGraph.MakeIndex(); + GenerateBiSegGraph(segGraph); + } + + /// <summary> + /// Generate a <see cref="BiSegGraph"/> based upon a <see cref="SegGraph"/> + /// </summary> + private void GenerateBiSegGraph(SegGraph segGraph) + { + double smooth = 0.1; + int wordPairFreq = 0; + int maxStart = segGraph.MaxStart; + double oneWordFreq, weight, tinyDouble = 1.0 / Utility.MAX_FREQUENCE; + + int next; + char[] idBuffer; + // get the list of tokens ordered and indexed + segTokenList = segGraph.MakeIndex(); + // Because the beginning position of startToken is -1, therefore startToken can be obtained when key = -1 + int key = -1; + IList<SegToken> nextTokens = null; + while (key < maxStart) + { + if (segGraph.IsStartExist(key)) + { + + IList<SegToken> tokenList = segGraph.GetStartList(key); + + // Calculate all tokens for a given key. + foreach (SegToken t1 in tokenList) + { + oneWordFreq = t1.Weight; + next = t1.EndOffset; + nextTokens = null; + // Find the next corresponding Token. + // For example: "Sunny seashore", the present Token is "sunny", next one should be "sea" or "seashore". + // If we cannot find the next Token, then go to the end and repeat the same cycle. + while (next <= maxStart) + { + // Because the beginning position of endToken is sentenceLen, so equal to sentenceLen can find endToken. 
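+ // SPACE_LIKE characters produce no tokens of their own, so some start offsets have no token list; scan forward until one is found.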
+ if (segGraph.IsStartExist(next)) + { + nextTokens = segGraph.GetStartList(next); + break; + } + next++; + } + if (nextTokens == null) + { + break; + } + foreach (SegToken t2 in nextTokens) + { + idBuffer = new char[t1.CharArray.Length + t2.CharArray.Length + 1]; + System.Array.Copy(t1.CharArray, 0, idBuffer, 0, t1.CharArray.Length); + idBuffer[t1.CharArray.Length] = BigramDictionary.WORD_SEGMENT_CHAR; + System.Array.Copy(t2.CharArray, 0, idBuffer, + t1.CharArray.Length + 1, t2.CharArray.Length); + + // Two linked Words frequency + wordPairFreq = bigramDict.GetFrequency(idBuffer); + + // Smoothing + + // -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1 + weight = -Math + .Log(smooth + * (1.0 + oneWordFreq) + / (Utility.MAX_FREQUENCE + 0.0) + + (1.0 - smooth) + * ((1.0 - tinyDouble) * wordPairFreq / (1.0 + oneWordFreq) + tinyDouble)); + + SegTokenPair tokenPair = new SegTokenPair(idBuffer, t1.Index, + t2.Index, weight); + this.AddSegTokenPair(tokenPair); + } + } + } + key++; + } + + } + + /// <summary> + /// Returns <c>true</c> if there is a list of token pairs at this offset (index of the second token) + /// </summary> + /// <param name="to">index of the second token in the token pair</param> + /// <returns><c>true</c> if a token pair exists</returns> + public virtual bool IsToExist(int to) + { + //return tokenPairListTable.get(Integer.valueOf(to)) != null; + //return tokenPairListTable.ContainsKey(to) && tokenPairListTable[to] != null; + IList<SegTokenPair> result; + return tokenPairListTable.TryGetValue(to, out result) && result != null; + } + + /// <summary> + /// Return a <see cref="T:IList{SegTokenPair}"/> of all token pairs at this offset (index of the second token) + /// </summary> + /// <param name="to">index of the second token in the token pair</param> + /// <returns><see cref="T:IList{SegTokenPair}"/> of token pairs. </returns> + public virtual IList<SegTokenPair> GetToList(int to) + { + IList<SegTokenPair> result; + tokenPairListTable.TryGetValue(to, out result); + return result; + } + + /// <summary> + /// Add a <see cref="SegTokenPair"/> + /// </summary> + /// <param name="tokenPair"><see cref="SegTokenPair"/></param> + public virtual void AddSegTokenPair(SegTokenPair tokenPair) + { + int to = tokenPair.To; + if (!IsToExist(to)) + { + List<SegTokenPair> newlist = new List<SegTokenPair>(); + newlist.Add(tokenPair); + tokenPairListTable[to] = newlist; + } + else + { + IList<SegTokenPair> tokenPairList = tokenPairListTable[to]; + tokenPairList.Add(tokenPair); + } + } + + /// <summary> + /// Get the number of <see cref="SegTokenPair"/> entries in the table. + /// </summary> + /// <returns>number of <see cref="SegTokenPair"/> entries</returns> + public virtual int ToCount + { + get { return tokenPairListTable.Count; } + } + + /// <summary> + /// Find the shortest path with the Viterbi algorithm.
+ /// </summary> + /// <returns><see cref="T:IList{SegToken}"/></returns> + [ExceptionToNetNumericConvention] + public virtual IList<SegToken> GetShortPath() + { + int current; + int nodeCount = ToCount; + IList<PathNode> path = new List<PathNode>(); + PathNode zeroPath = new PathNode(); + zeroPath.Weight = 0; + zeroPath.PreNode = 0; + path.Add(zeroPath); + for (current = 1; current <= nodeCount; current++) + { + double weight; + IList<SegTokenPair> edges = GetToList(current); + + double minWeight = double.MaxValue; + SegTokenPair minEdge = null; + foreach (SegTokenPair edge in edges) + { + weight = edge.Weight; + PathNode preNode2 = path[edge.From]; + if (preNode2.Weight + weight < minWeight) + { + minWeight = preNode2.Weight + weight; + minEdge = edge; + } + } + PathNode newNode = new PathNode(); + newNode.Weight = minWeight; + newNode.PreNode = minEdge.From; + path.Add(newNode); + } + + // Calculate PathNodes + int preNode, lastNode; + lastNode = path.Count - 1; + current = lastNode; + IList<int> rpath = new List<int>(); + IList<SegToken> resultPath = new List<SegToken>(); + + rpath.Add(current); + while (current != 0) + { + PathNode currentPathNode = path[current]; + preNode = currentPathNode.PreNode; + rpath.Add(preNode); + current = preNode; + } + for (int j = rpath.Count - 1; j >= 0; j--) + { + //int idInteger = rpath.get(j); + //int id = idInteger.intValue(); + int id = rpath[j]; + SegToken t = segTokenList[id]; + resultPath.Add(t); + } + return resultPath; + } + + public override string ToString() + { + StringBuilder sb = new StringBuilder(); + ICollection<IList<SegTokenPair>> values = tokenPairListTable.Values; + foreach (IList<SegTokenPair> segList in values) + { + foreach (SegTokenPair pair in segList) + { + sb.Append(pair + "\n"); + } + } + return sb.ToString(); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/BigramDictionary.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/BigramDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/BigramDictionary.cs new file mode 100644 index 0000000..cc87ceb --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/HHMM/BigramDictionary.cs @@ -0,0 +1,431 @@ +using Lucene.Net.Support; +using Lucene.Net.Support.IO; +using System; +using System.IO; +using System.Reflection; +using System.Text; + +namespace Lucene.Net.Analysis.Cn.Smart.HHMM +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// SmartChineseAnalyzer Bigram dictionary. 
+ /// <para/> + /// @lucene.experimental + /// </summary> + internal class BigramDictionary : AbstractDictionary + { + private BigramDictionary() + { + } + + public static readonly char WORD_SEGMENT_CHAR = '@'; + + private static BigramDictionary singleInstance; + + public static readonly int PRIME_BIGRAM_LENGTH = 402137; + + /// <summary> + /// The word associations are stored as FNV1 hashcodes, which have a small probability of collision, but save memory. + /// </summary> + private long[] bigramHashTable; + + private int[] frequencyTable; + + private int max = 0; + + private int repeat = 0; + + // static Logger log = Logger.getLogger(BigramDictionary.class); + + private static object syncLock = new object(); + + public static BigramDictionary GetInstance() + { + lock (syncLock) + { + if (singleInstance == null) + { + singleInstance = new BigramDictionary(); + + // LUCENENET specific + // LUCENE-1817: https://issues.apache.org/jira/browse/LUCENE-1817 + // This issue still existed as of 4.8.0. Here is the fix - we only + // load from a directory if the actual directory exists (AnalyzerProfile + // ensures it is an empty string if it is not available). + string dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR; + if (string.IsNullOrEmpty(dictRoot)) + { + singleInstance.Load(); + } + else + { + singleInstance.Load(dictRoot); + } + + + //try + //{ + // singleInstance.Load(); + //} + //catch (IOException e) + //{ + // string dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR; + // singleInstance.Load(dictRoot); + //} + //catch (TypeLoadException e) + //{ + // throw new Exception(e.ToString(), e); + //} + } + return singleInstance; + } + } + + private bool LoadFromObj(FileInfo serialObj) + { + try + { + using (Stream input = new FileStream(serialObj.FullName, FileMode.Open, FileAccess.Read)) + LoadFromInputStream(input); + return true; + } + catch (Exception e) + { + throw new Exception(e.ToString(), e); + } + } + + // LUCENENET conversion note: + // The data in Lucene is stored in a proprietary binary format (similar to + // .NET's BinarySerializer) that cannot be read back in .NET. Therefore, the + // data was extracted using Java's DataOutputStream using the following Java code. + // It can then be read in using the LoadFromInputStream method below + // (using a DataInputStream instead of a BinaryReader), and saved + // in the correct (BinaryWriter) format by calling the SaveToObj method. + // Alternatively, the data can be loaded from disk using the files + // here(https://issues.apache.org/jira/browse/LUCENE-1629) in the analysis.data.zip file, + // which will automatically produce the .mem files. + + //public void saveToOutputStream(java.io.DataOutputStream stream) throws IOException + //{ + // // save wordIndexTable + // int wiLen = wordIndexTable.length; + // stream.writeInt(wiLen); + // for (int i = 0; i<wiLen; i++) + // { + // stream.writeShort(wordIndexTable[i]); + // } + + // // save charIndexTable + // int ciLen = charIndexTable.length; + // stream.writeInt(ciLen); + // for (int i = 0; i<ciLen; i++) + // { + // stream.writeChar(charIndexTable[i]); + // } + + // int caDim1 = wordItem_charArrayTable == null ? -1 : wordItem_charArrayTable.length; + // stream.writeInt(caDim1); + // for (int i = 0; i<caDim1; i++) + // { + // int caDim2 = wordItem_charArrayTable[i] == null ? -1 : wordItem_charArrayTable[i].length; + // stream.writeInt(caDim2); + // for (int j = 0; j<caDim2; j++) + // { + // int caDim3 = wordItem_charArrayTable[i][j] == null ? 
-1 : wordItem_charArrayTable[i][j].length; + // stream.writeInt(caDim3); + // for (int k = 0; k<caDim3; k++) + // { + // stream.writeChar(wordItem_charArrayTable[i][j][k]); + // } + // } + // } + + // int fDim1 = wordItem_frequencyTable == null ? -1 : wordItem_frequencyTable.length; + // stream.writeInt(fDim1); + // for (int i = 0; i<fDim1; i++) + // { + // int fDim2 = wordItem_frequencyTable[i] == null ? -1 : wordItem_frequencyTable[i].length; + // stream.writeInt(fDim2); + // for (int j = 0; j<fDim2; j++) + // { + // stream.writeInt(wordItem_frequencyTable[i][j]); + // } + // } + //} + + private void LoadFromInputStream(Stream serialObjectInputStream) + { + //ObjectInputStream input = new ObjectInputStream(serialObjectInputStream); + //bigramHashTable = (long[])input.readObject(); + //frequencyTable = (int[])input.readObject(); + //// log.info("load bigram dict from serialization."); + //input.close(); + + using (var reader = new BinaryReader(serialObjectInputStream)) + //using (var reader = new DataInputStream(serialObjectInputStream)) + { + // Read bigramHashTable + int bhLen = reader.ReadInt32(); + bigramHashTable = new long[bhLen]; + for (int i = 0; i < bhLen; i++) + { + bigramHashTable[i] = reader.ReadInt64(); + } + + // Read frequencyTable + int fLen = reader.ReadInt32(); + frequencyTable = new int[fLen]; + for (int i = 0; i < fLen; i++) + { + frequencyTable[i] = reader.ReadInt32(); + } + } + + // log.info("load bigram dict from serialization."); + } + + private void SaveToObj(FileInfo serialObj) + { + try + { + //ObjectOutputStream output = new ObjectOutputStream(new FileStream( + // serialObj.FullName, FileMode.Create, FileAccess.Write)); + //output.writeObject(bigramHashTable); + //output.writeObject(frequencyTable); + //output.close(); + + using (Stream output = new FileStream(serialObj.FullName, FileMode.Create, FileAccess.Write)) + { + using (BinaryWriter writer = new BinaryWriter(output)) + { + int bhLen = bigramHashTable.Length; + writer.Write(bhLen); + for (int i = 0; i < bhLen; i++) + { + writer.Write(bigramHashTable[i]); + } + + int fLen = frequencyTable.Length; + writer.Write(fLen); + for (int i = 0; i < fLen; i++) + { + writer.Write(frequencyTable[i]); + } + } + } + // log.info("serialize bigram dict."); + } +#pragma warning disable 168 + catch (Exception e) +#pragma warning restore 168 + { + // log.warn(e.getMessage()); + } + } + + private void Load() + { + using (Stream input = this.GetType().GetTypeInfo().Assembly.FindAndGetManifestResourceStream(this.GetType(), "bigramdict.mem")) + { + LoadFromInputStream(input); + } + } + + private void Load(string dictRoot) + { + string bigramDictPath = System.IO.Path.Combine(dictRoot, "bigramdict.dct"); + + FileInfo serialObj = new FileInfo(System.IO.Path.Combine(dictRoot, "bigramdict.mem")); + + if (serialObj.Exists && LoadFromObj(serialObj)) + { + + } + else + { + try + { + bigramHashTable = new long[PRIME_BIGRAM_LENGTH]; + frequencyTable = new int[PRIME_BIGRAM_LENGTH]; + for (int i = 0; i < PRIME_BIGRAM_LENGTH; i++) + { + // it is possible for a value to hash to 0, but the probability is extremely low + bigramHashTable[i] = 0; + frequencyTable[i] = 0; + } + LoadFromFile(bigramDictPath); + } + catch (IOException e) + { + throw new Exception(e.ToString(), e); + } + SaveToObj(serialObj); + } + } + + /// <summary> + /// Load the data file into this <see cref="BigramDictionary"/> + /// </summary> + /// <param name="dctFilePath">path to the bigram dictionary file (bigramdict.dct)</param> + /// <exception cref="IOException">If 
there is a low-level I/O error</exception> + public virtual void LoadFromFile(string dctFilePath) + { + int i, cnt, length, total = 0; + // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + // The 3756th is used (as a header) to store information. + int[] + buffer = new int[3]; + byte[] intBuffer = new byte[4]; + string tmpword; + //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r")) + using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + { + + // GB2312 characters 0 - 6768 + for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + { + string currentStr = GetCCByGB2312Id(i); + // if (i == 5231) + // System.out.println(i); + + dctFile.Read(intBuffer, 0, intBuffer.Length); + // the dictionary was developed for C, and byte order must be converted to work with Java + cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN).GetInt32(); + if (cnt <= 0) + { + continue; + } + total += cnt; + int j = 0; + while (j < cnt) + { + dctFile.Read(intBuffer, 0, intBuffer.Length); + buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN) + .GetInt32();// frequency + dctFile.Read(intBuffer, 0, intBuffer.Length); + buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN) + .GetInt32();// length + dctFile.Read(intBuffer, 0, intBuffer.Length); + // buffer[2] = ByteBuffer.wrap(intBuffer).order( + // ByteOrder.LITTLE_ENDIAN).getInt();// handle + + length = buffer[1]; + if (length > 0) + { + byte[] lchBuffer = new byte[length]; + dctFile.Read(lchBuffer, 0, lchBuffer.Length); + //tmpword = new String(lchBuffer, "GB2312"); + tmpword = Encoding.GetEncoding("GB2312").GetString(lchBuffer); + //tmpword = Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer); + if (i != 3755 + GB2312_FIRST_CHAR) + { + tmpword = currentStr + tmpword; + } + char[] carray = tmpword.ToCharArray(); + long hashId = Hash1(carray); + int index = GetAvaliableIndex(hashId, carray); + if (index != -1) + { + if (bigramHashTable[index] == 0) + { + bigramHashTable[index] = hashId; + // bigramStringTable[index] = tmpword; + } + frequencyTable[index] += buffer[0]; + } + } + j++; + } + } + } + // log.info("load dictionary done! " + dctFilePath + " total:" + total); + } + + private int GetAvaliableIndex(long hashId, char[] carray) + { + int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH); + int hash2 = Hash2(carray) % PRIME_BIGRAM_LENGTH; + if (hash1 < 0) + hash1 = PRIME_BIGRAM_LENGTH + hash1; + if (hash2 < 0) + hash2 = PRIME_BIGRAM_LENGTH + hash2; + int index = hash1; + int i = 1; + while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId + && i < PRIME_BIGRAM_LENGTH) + { + index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH; + i++; + } + // System.out.println(i - 1); + + if (i < PRIME_BIGRAM_LENGTH + && (bigramHashTable[index] == 0 || bigramHashTable[index] == hashId)) + { + return index; + } + else + return -1; + } + + /// <summary> + /// lookup the index into the frequency array. 
+ /// </summary> + private int GetBigramItemIndex(char[] carray) + { + long hashId = Hash1(carray); + int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH); + int hash2 = Hash2(carray) % PRIME_BIGRAM_LENGTH; + if (hash1 < 0) + hash1 = PRIME_BIGRAM_LENGTH + hash1; + if (hash2 < 0) + hash2 = PRIME_BIGRAM_LENGTH + hash2; + int index = hash1; + int i = 1; + repeat++; + while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId + && i < PRIME_BIGRAM_LENGTH) + { + index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH; + i++; + repeat++; + if (i > max) + max = i; + } + // System.out.println(i - 1); + + if (i < PRIME_BIGRAM_LENGTH && bigramHashTable[index] == hashId) + { + return index; + } + else + return -1; + } + + public int GetFrequency(char[] carray) + { + int index = GetBigramItemIndex(carray); + if (index != -1) + return frequencyTable[index]; + return 0; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/HHMMSegmenter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/HHMMSegmenter.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/HHMMSegmenter.cs new file mode 100644 index 0000000..5d6ee55 --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/HHMM/HHMMSegmenter.cs @@ -0,0 +1,252 @@ +using System.Collections.Generic; +using System.Text; + +namespace Lucene.Net.Analysis.Cn.Smart.HHMM +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Finds the optimal segmentation of a sentence into Chinese words + /// <para/> + /// @lucene.experimental + /// </summary> + public class HHMMSegmenter + { + private static WordDictionary wordDict = WordDictionary.GetInstance(); + + /// <summary> + /// Create the <see cref="SegGraph"/> for a sentence. + /// </summary> + /// <param name="sentence">input sentence, without start and end markers</param> + /// <returns><see cref="SegGraph"/> corresponding to the input sentence.</returns> + private SegGraph CreateSegGraph(string sentence) + { + int i = 0, j; + int length = sentence.Length; + int foundIndex; + CharType[] charTypeArray = GetCharTypes(sentence); + StringBuilder wordBuf = new StringBuilder(); + SegToken token; + int frequency = 0; // the number of times word appears. 
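+ // The switch below makes one left-to-right pass over the sentence: a HANZI run emits a candidate token for every dictionary prefix match, letter/digit runs collapse into a single generic token, and each delimiter becomes its own token.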
+ bool hasFullWidth; + WordType wordType; + char[] charArray; + + SegGraph segGraph = new SegGraph(); + while (i < length) + { + hasFullWidth = false; + switch (charTypeArray[i]) + { + case CharType.SPACE_LIKE: + i++; + break; + case CharType.HANZI: + j = i + 1; + //wordBuf.delete(0, wordBuf.length()); + wordBuf.Remove(0, wordBuf.Length); + // It doesn't matter if a single Chinese character (Hanzi) can form a phrase or not, + // it will store that single Chinese character (Hanzi) in the SegGraph. Otherwise, it will + // cause word division. + wordBuf.Append(sentence[i]); + charArray = new char[] { sentence[i] }; + frequency = wordDict.GetFrequency(charArray); + token = new SegToken(charArray, i, j, WordType.CHINESE_WORD, + frequency); + segGraph.AddToken(token); + + foundIndex = wordDict.GetPrefixMatch(charArray); + while (j <= length && foundIndex != -1) + { + if (wordDict.IsEqual(charArray, foundIndex) && charArray.Length > 1) + { + // It is the phrase we are looking for; In other words, we have found a phrase SegToken + // from i to j. It is not a monosyllabic word (single word). + frequency = wordDict.GetFrequency(charArray); + token = new SegToken(charArray, i, j, WordType.CHINESE_WORD, + frequency); + segGraph.AddToken(token); + } + + while (j < length && charTypeArray[j] == CharType.SPACE_LIKE) + j++; + + if (j < length && charTypeArray[j] == CharType.HANZI) + { + wordBuf.Append(sentence[j]); + charArray = new char[wordBuf.Length]; + //wordBuf.GetChars(0, charArray.Length, charArray, 0); + wordBuf.CopyTo(0, charArray, 0, charArray.Length); + // idArray has been found (foundWordIndex!=-1) as a prefix before. + // Therefore, idArray after it has been lengthened can only appear after foundWordIndex. + // So start searching after foundWordIndex. + foundIndex = wordDict.GetPrefixMatch(charArray, foundIndex); + j++; + } + else + { + break; + } + } + i++; + break; + case CharType.FULLWIDTH_LETTER: + hasFullWidth = true; /* intentional fallthrough */ + + j = i + 1; + while (j < length + && (charTypeArray[j] == CharType.LETTER || charTypeArray[j] == CharType.FULLWIDTH_LETTER)) + { + if (charTypeArray[j] == CharType.FULLWIDTH_LETTER) + hasFullWidth = true; + j++; + } + // Found a Token from i to j. Type is LETTER char string. + charArray = Utility.STRING_CHAR_ARRAY; + frequency = wordDict.GetFrequency(charArray); + wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING; + token = new SegToken(charArray, i, j, wordType, frequency); + segGraph.AddToken(token); + i = j; + break; + + case CharType.LETTER: + j = i + 1; + while (j < length + && (charTypeArray[j] == CharType.LETTER || charTypeArray[j] == CharType.FULLWIDTH_LETTER)) + { + if (charTypeArray[j] == CharType.FULLWIDTH_LETTER) + hasFullWidth = true; + j++; + } + // Found a Token from i to j. Type is LETTER char string. + charArray = Utility.STRING_CHAR_ARRAY; + frequency = wordDict.GetFrequency(charArray); + wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING; + token = new SegToken(charArray, i, j, wordType, frequency); + segGraph.AddToken(token); + i = j; + break; + case CharType.FULLWIDTH_DIGIT: + hasFullWidth = true; /* intentional fallthrough */ + + j = i + 1; + while (j < length + && (charTypeArray[j] == CharType.DIGIT || charTypeArray[j] == CharType.FULLWIDTH_DIGIT)) + { + if (charTypeArray[j] == CharType.FULLWIDTH_DIGIT) + hasFullWidth = true; + j++; + } + // Found a Token from i to j. Type is NUMBER char string. 
+ charArray = Utility.NUMBER_CHAR_ARRAY; + frequency = wordDict.GetFrequency(charArray); + wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER; + token = new SegToken(charArray, i, j, wordType, frequency); + segGraph.AddToken(token); + i = j; + break; + + case CharType.DIGIT: + j = i + 1; + while (j < length + && (charTypeArray[j] == CharType.DIGIT || charTypeArray[j] == CharType.FULLWIDTH_DIGIT)) + { + if (charTypeArray[j] == CharType.FULLWIDTH_DIGIT) + hasFullWidth = true; + j++; + } + // Found a Token from i to j. Type is NUMBER char string. + charArray = Utility.NUMBER_CHAR_ARRAY; + frequency = wordDict.GetFrequency(charArray); + wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER; + token = new SegToken(charArray, i, j, wordType, frequency); + segGraph.AddToken(token); + i = j; + break; + case CharType.DELIMITER: + j = i + 1; + // No need to search the weight for the punctuation. Picking the highest frequency will work. + frequency = Utility.MAX_FREQUENCE; + charArray = new char[] { sentence[i] }; + token = new SegToken(charArray, i, j, WordType.DELIMITER, frequency); + segGraph.AddToken(token); + i = j; + break; + default: + j = i + 1; + // Treat the unrecognized char symbol as unknown string. + // For example, any symbol not in GB2312 is treated as one of these. + charArray = Utility.STRING_CHAR_ARRAY; + frequency = wordDict.GetFrequency(charArray); + token = new SegToken(charArray, i, j, WordType.STRING, frequency); + segGraph.AddToken(token); + i = j; + break; + } + } + + // Add two more Tokens: "beginning xx beginning" + charArray = Utility.START_CHAR_ARRAY; + frequency = wordDict.GetFrequency(charArray); + token = new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency); + segGraph.AddToken(token); + + // "end xx end" + charArray = Utility.END_CHAR_ARRAY; + frequency = wordDict.GetFrequency(charArray); + token = new SegToken(charArray, length, length + 1, WordType.SENTENCE_END, + frequency); + segGraph.AddToken(token); + + return segGraph; + } + + /// <summary> + /// Get the character types for every character in a sentence. 
+ /// </summary> + /// <param name="sentence">input sentence</param> + /// <returns>array of character types corresponding to character positions in the sentence</returns> + /// <seealso cref="Utility.GetCharType(char)"/> + private static CharType[] GetCharTypes(string sentence) + { + int length = sentence.Length; + CharType[] charTypeArray = new CharType[length]; + // the type of each character by position + for (int i = 0; i < length; i++) + { + charTypeArray[i] = Utility.GetCharType(sentence[i]); + } + + return charTypeArray; + } + + /// <summary> + /// Return a list of <see cref="SegToken"/> representing the best segmentation of a sentence + /// </summary> + /// <param name="sentence">input sentence</param> + /// <returns>best segmentation as a <see cref="T:IList{SegToken}"/></returns> + public virtual IList<SegToken> Process(string sentence) + { + SegGraph segGraph = CreateSegGraph(sentence); + BiSegGraph biSegGraph = new BiSegGraph(segGraph); + IList<SegToken> shortPath = biSegGraph.GetShortPath(); + return shortPath; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/PathNode.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/PathNode.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/PathNode.cs new file mode 100644 index 0000000..11387ad --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/HHMM/PathNode.cs @@ -0,0 +1,80 @@ +using Lucene.Net.Support; +using System; + +namespace Lucene.Net.Analysis.Cn.Smart.HHMM +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// SmartChineseAnalyzer internal node representation + /// <para> + /// Used by <see cref="BiSegGraph"/> to maximize the segmentation with the Viterbi algorithm. 
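+ /// Each node records the cumulative path weight and the index of its predecessor (PreNode), from which the final path is reconstructed.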
+    /// </para>
+    /// @lucene.experimental
+    /// </summary>
+    internal class PathNode : IComparable<PathNode>
+    {
+        public double Weight { get; set; }
+
+        public int PreNode { get; set; }
+
+        public virtual int CompareTo(PathNode pn)
+        {
+            if (Weight < pn.Weight)
+                return -1;
+            else if (Weight == pn.Weight)
+                return 0;
+            else
+                return 1;
+        }
+
+        /// <summary>
+        /// <see cref="object.GetHashCode()"/>
+        /// </summary>
+        public override int GetHashCode()
+        {
+            int prime = 31;
+            int result = 1;
+            result = prime * result + PreNode;
+            long temp = Number.DoubleToInt64Bits(Weight);
+            // Fold the high 32 bits of the double's bit image into the hash,
+            // like Java's (int)(temp ^ (temp >>> 32)). Casting through uint
+            // would shift a 32-bit value by 32 (a no-op in C#) and lose the
+            // high bits entirely.
+            result = prime * result + (int)(temp ^ (long)((ulong)temp >> 32));
+            return result;
+        }
+
+        /// <summary>
+        /// <see cref="object.Equals(object)"/>
+        /// </summary>
+        public override bool Equals(object obj)
+        {
+            if (this == obj)
+                return true;
+            if (obj == null)
+                return false;
+            if (GetType() != obj.GetType())
+                return false;
+            PathNode other = (PathNode)obj;
+            if (PreNode != other.PreNode)
+                return false;
+            if (Number.DoubleToInt64Bits(Weight) != Number.DoubleToInt64Bits(other.Weight))
+                return false;
+            return true;
+        }
+    }
+}
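[Editor's note: the shift fix above matters because Java's >>> operator has no direct C# counterpart for signed types. A tiny self-contained illustration of the mixing step, using the BCL's BitConverter.DoubleToInt64Bits in place of the Number helper; the values are chosen arbitrarily:]

    using System;

    internal static class HashMixDemo
    {
        // Fold a double's 64-bit pattern into 32 bits, mirroring
        // Java's (int)(temp ^ (temp >>> 32)).
        private static int MixDouble(double value)
        {
            long temp = BitConverter.DoubleToInt64Bits(value);
            return (int)(temp ^ (long)((ulong)temp >> 32));
        }

        private static void Main()
        {
            Console.WriteLine(MixDouble(12.5));
            Console.WriteLine(MixDouble(-12.5)); // differs: the sign bit lives in the high word
        }
    }

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/SegGraph.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/SegGraph.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegGraph.cs
new file mode 100644
index 0000000..e0138c1
--- /dev/null
+++ b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegGraph.cs
@@ -0,0 +1,160 @@
+using System.Collections.Generic;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Cn.Smart.HHMM
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements. See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License. You may obtain a copy of the License at
+     *
+     * http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Graph representing possible tokens at each start offset in the sentence.
+    /// <para>
+    /// For each start offset, a list of possible tokens is stored.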
+ /// </para> + /// @lucene.experimental + /// </summary> + internal class SegGraph + { + /// <summary> + /// Map of start offsets to <see cref="T:IList{SegToken}"/> of tokens at that position + /// </summary> + private IDictionary<int, IList<SegToken>> tokenListTable = new Dictionary<int, IList<SegToken>>(); + + private int maxStart = -1; + + /// <summary> + /// Returns <c>true</c> if a mapping for the specified start offset exists + /// </summary> + /// <param name="s">startOffset</param> + /// <returns><c>true</c> if there are tokens for the startOffset</returns> + public virtual bool IsStartExist(int s) + { + //return tokenListTable.get(s) != null; + IList<SegToken> result; + return tokenListTable.TryGetValue(s, out result) && result != null; + } + + /// <summary> + /// Get the list of tokens at the specified start offset + /// </summary> + /// <param name="s">startOffset</param> + /// <returns><see cref="T:IList{SegToken}"/> of tokens at the specified start offset.</returns> + public virtual IList<SegToken> GetStartList(int s) + { + IList<SegToken> result; + tokenListTable.TryGetValue(s, out result); + return result; + } + + /// <summary> + /// Get the highest start offset in the map. Returns maximum start offset, or -1 if the map is empty. + /// </summary> + public virtual int MaxStart + { + get { return maxStart; } + } + + /// <summary> + /// Set the <see cref="SegToken.Index"/> for each token, based upon its order by startOffset. + /// </summary> + /// <returns>a <see cref="T:IList{SegToken}"/> of these ordered tokens.</returns> + public virtual IList<SegToken> MakeIndex() + { + IList<SegToken> result = new List<SegToken>(); + int s = -1, count = 0, size = tokenListTable.Count; + IList<SegToken> tokenList; + int index = 0; + while (count < size) + { + if (IsStartExist(s)) + { + tokenList = tokenListTable[s]; + foreach (SegToken st in tokenList) + { + st.Index = index; + result.Add(st); + index++; + } + count++; + } + s++; + } + return result; + } + + /// <summary> + /// Add a <see cref="SegToken"/> to the mapping, creating a new mapping at the token's startOffset if one does not exist. + /// </summary> + /// <param name="token">token <see cref="SegToken"/>.</param> + public virtual void AddToken(SegToken token) + { + int s = token.StartOffset; + if (!IsStartExist(s)) + { + List<SegToken> newlist = new List<SegToken>(); + newlist.Add(token); + tokenListTable[s] = newlist; + } + else + { + IList<SegToken> tokenList = tokenListTable[s]; + tokenList.Add(token); + } + if (s > maxStart) + { + maxStart = s; + } + } + + /// <summary> + /// Return a <see cref="T:IList{SegToken}"/> of all tokens in the map, ordered by startOffset. 
+ /// </summary> + /// <returns><see cref="T:IList{SegToken}"/> of all tokens in the map.</returns> + public virtual IList<SegToken> ToTokenList() + { + IList<SegToken> result = new List<SegToken>(); + int s = -1, count = 0, size = tokenListTable.Count; + IList<SegToken> tokenList; + + while (count < size) + { + if (IsStartExist(s)) + { + tokenList = tokenListTable[s]; + foreach (SegToken st in tokenList) + { + result.Add(st); + } + count++; + } + s++; + } + return result; + } + + public override string ToString() + { + IList<SegToken> tokenList = this.ToTokenList(); + StringBuilder sb = new StringBuilder(); + foreach (SegToken t in tokenList) + { + sb.Append(t + "\n"); + } + return sb.ToString(); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/SegToken.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/SegToken.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegToken.cs new file mode 100644 index 0000000..48ba8ce --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegToken.cs @@ -0,0 +1,123 @@ +using Lucene.Net.Support; + +namespace Lucene.Net.Analysis.Cn.Smart.HHMM +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// SmartChineseAnalyzer internal token + /// <para/> + /// @lucene.experimental + /// </summary> + public class SegToken + { + /// <summary> + /// Character array containing token text + /// </summary> + [WritableArray] + public char[] CharArray { get; set; } + + /// <summary> + /// start offset into original sentence + /// </summary> + public int StartOffset { get; set; } + + /// <summary> + /// end offset into original sentence + /// </summary> + public int EndOffset { get; set; } + + /// <summary> + /// <see cref="Smart.WordType"/> of the text + /// </summary> + public WordType WordType { get; set; } + + /// <summary> + /// word frequency + /// </summary> + public int Weight { get; set; } + + /// <summary> + /// during segmentation, this is used to store the index of the token in the token list table + /// </summary> + public int Index { get; set; } + + /// <summary> + /// Create a new <see cref="SegToken"/> from a character array. 
+ /// </summary> + /// <param name="idArray">character array containing text</param> + /// <param name="start">start offset of <see cref="SegToken"/> in original sentence</param> + /// <param name="end">end offset of <see cref="SegToken"/> in original sentence</param> + /// <param name="wordType"><see cref="Smart.WordType"/> of the text</param> + /// <param name="weight">word frequency</param> + public SegToken(char[] idArray, int start, int end, WordType wordType, int weight) + { + this.CharArray = idArray; + this.StartOffset = start; + this.EndOffset = end; + this.WordType = wordType; + this.Weight = weight; + } + + /// <summary> + /// <see cref="object.GetHashCode()"/> + /// </summary> + public override int GetHashCode() + { + int prime = 31; + int result = 1; + for (int i = 0; i < CharArray.Length; i++) + { + result = prime * result + CharArray[i]; + } + result = prime * result + EndOffset; + result = prime * result + Index; + result = prime * result + StartOffset; + result = prime * result + Weight; + result = prime * result + (int)WordType; + return result; + } + + /// <summary> + /// <see cref="object.Equals(object)"/> + /// </summary> + public override bool Equals(object obj) + { + if (this == obj) + return true; + if (obj == null) + return false; + if (GetType() != obj.GetType()) + return false; + SegToken other = (SegToken)obj; + if (!Arrays.Equals(CharArray, other.CharArray)) + return false; + if (EndOffset != other.EndOffset) + return false; + if (Index != other.Index) + return false; + if (StartOffset != other.StartOffset) + return false; + if (Weight != other.Weight) + return false; + if (WordType != other.WordType) + return false; + return true; + } + } +}
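[Editor's note: to show how SegToken and SegGraph (above) cooperate, here is a hedged sketch that builds a tiny lattice by hand. SegGraph is internal to the assembly, so this only compiles from inside it (or from a test assembly granted InternalsVisibleTo); the characters, offsets, and weights are invented for illustration:]

    using System;
    using System.Collections.Generic;
    using Lucene.Net.Analysis.Cn.Smart;
    using Lucene.Net.Analysis.Cn.Smart.HHMM;

    internal static class SegGraphDemo
    {
        internal static void Run()
        {
            SegGraph graph = new SegGraph();

            // Two candidates starting at offset 0 and one at offset 1,
            // with made-up weights standing in for dictionary frequencies.
            graph.AddToken(new SegToken("中".ToCharArray(), 0, 1, WordType.CHINESE_WORD, 10));
            graph.AddToken(new SegToken("中国".ToCharArray(), 0, 2, WordType.CHINESE_WORD, 50));
            graph.AddToken(new SegToken("国".ToCharArray(), 1, 2, WordType.CHINESE_WORD, 10));

            // MakeIndex() assigns each token its position in startOffset order
            // and returns the flattened list.
            foreach (SegToken token in graph.MakeIndex())
            {
                Console.WriteLine("#{0} [{1},{2}) {3}",
                    token.Index, token.StartOffset, token.EndOffset, new string(token.CharArray));
            }
        }
    }

Note that SegToken overrides Equals/GetHashCode with full value semantics: every field, including Index and Weight, participates, and the hash computation above mirrors the equality check field by field.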
