Lucene.Net.Analysis.SmartCn: Renamed HHMM namespace to Hhmm to better follow .NET naming conventions
Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/056353d4 Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/056353d4 Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/056353d4 Branch: refs/heads/master Commit: 056353d47fbdad6f0379f959be90752ad5081bc4 Parents: 94d0291 Author: Shad Storhaug <[email protected]> Authored: Sat Sep 9 03:40:08 2017 +0700 Committer: Shad Storhaug <[email protected]> Committed: Sat Sep 9 03:41:22 2017 +0700 ---------------------------------------------------------------------- .../HHMM/AbstractDictionary.cs | 225 ------ .../HHMM/BiSegGraph.cs | 257 ------ .../HHMM/BigramDictionary.cs | 432 ---------- .../HHMM/HHMMSegmenter.cs | 253 ------ .../HHMM/PathNode.cs | 81 -- .../HHMM/SegGraph.cs | 161 ---- .../HHMM/SegToken.cs | 124 --- .../HHMM/SegTokenFilter.cs | 76 -- .../HHMM/SegTokenPair.cs | 96 --- .../HHMM/WordDictionary.cs | 779 ------------------- .../HMMChineseTokenizer.cs | 2 +- .../Hhmm/AbstractDictionary.cs | 225 ++++++ .../Hhmm/BiSegGraph.cs | 257 ++++++ .../Hhmm/BigramDictionary.cs | 432 ++++++++++ .../Hhmm/HHMMSegmenter.cs | 253 ++++++ .../Hhmm/PathNode.cs | 81 ++ .../Hhmm/SegGraph.cs | 161 ++++ .../Hhmm/SegToken.cs | 124 +++ .../Hhmm/SegTokenFilter.cs | 76 ++ .../Hhmm/SegTokenPair.cs | 96 +++ .../Hhmm/WordDictionary.cs | 779 +++++++++++++++++++ .../Lucene.Net.Analysis.SmartCn.csproj | 4 +- .../WordSegmenter.cs | 2 +- .../WordTokenFilter.cs | 2 +- .../TestHMMChineseTokenizerFactory.cs | 2 +- 25 files changed, 2491 insertions(+), 2489 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/HHMM/AbstractDictionary.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/AbstractDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/AbstractDictionary.cs 
deleted file mode 100644 index 370056a..0000000 --- a/src/Lucene.Net.Analysis.SmartCn/HHMM/AbstractDictionary.cs +++ /dev/null @@ -1,225 +0,0 @@ -// lucene version compatibility level: 4.8.1 -using System; -using System.Text; - -namespace Lucene.Net.Analysis.Cn.Smart.HHMM -{ - /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - /// <summary> - /// <para> - /// <see cref="SmartChineseAnalyzer"/> abstract dictionary implementation. - /// </para> - /// <para> - /// Contains methods for dealing with GB2312 encoding. - /// </para> - /// @lucene.experimental - /// </summary> - internal abstract class AbstractDictionary - { - /// <summary> - /// First Chinese Character in GB2312 (15 * 94) - /// Characters in GB2312 are arranged in a grid of 94 * 94, 0-14 are unassigned or punctuation. - /// </summary> - public static readonly int GB2312_FIRST_CHAR = 1410; - - /// <summary> - /// Last Chinese Character in GB2312 (87 * 94). - /// Characters in GB2312 are arranged in a grid of 94 * 94, 88-94 are unassigned. - /// </summary> - public static readonly int GB2312_CHAR_NUM = 87 * 94; - - /// <summary> - /// Dictionary data contains 6768 Chinese characters with frequency statistics. 
- /// </summary> - public static readonly int CHAR_NUM_IN_FILE = 6768; - - // ===================================================== - // code +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +A +B +C +D +E +F - // B0A0 å é¿ å æ¨ å å å ç ç è¼ ç® è¾ ç¢ ç± é - // B0B0 é æ°¨ å® ä¿º æ æ 岸 èº æ¡ è® æ ç å¹ æ ç¬ ç¿± - // B0C0 è¢ å² å¥¥ æ æ¾³ è æ æ å å§ ç¬ å « ç¤ å·´ æ è· - // B0D0 é¶ æ è å é¸ ç½¢ ç¸ ç½ æ ç¾ æ ä½° è´¥ æ ç¨ æ - // B0E0 ç æ¬ æ³ è¬ é¢ æ¿ ç æ® æ ä¼´ ç£ å å ç» é¦ å¸® - // B0F0 æ¢ æ¦ è ç» æ£ ç£ è é å è°¤ è è å è¤ å¥ - // ===================================================== - // - // GB2312 character setï¼ - // 01 94 Symbols - // 02 72 Numbers - // 03 94 Latin - // 04 83 Kana - // 05 86 Katakana - // 06 48 Greek - // 07 66 Cyrillic - // 08 63 Phonetic Symbols - // 09 76 Drawing Symbols - // 10-15 Unassigned - // 16-55 3755 Plane 1, in pinyin order - // 56-87 3008 Plane 2, in radical/stroke order - // 88-94 Unassigned - // ====================================================== - - /// <summary> - /// <para> - /// Transcode from GB2312 ID to Unicode - /// </para> - /// <para> - /// GB2312 is divided into a 94 * 94 grid, containing 7445 characters consisting of 6763 Chinese characters and 682 symbols. - /// Some regions are unassigned (reserved). 
- /// </para> - /// </summary> - /// <param name="ccid">GB2312 id</param> - /// <returns>unicode String</returns> - public virtual string GetCCByGB2312Id(int ccid) - { - if (ccid < 0 || ccid > AbstractDictionary.GB2312_CHAR_NUM) - return ""; - int cc1 = ccid / 94 + 161; - int cc2 = ccid % 94 + 161; - byte[] buffer = new byte[2]; - buffer[0] = (byte)cc1; - buffer[1] = (byte)cc2; - try - { - //String cchar = new String(buffer, "GB2312"); - string cchar = Encoding.GetEncoding("GB2312").GetString(buffer); - return cchar; - } - catch (ArgumentException) // Encoding is not supported by the platform - { - return ""; - } - } - - /// <summary> - /// Transcode from Unicode to GB2312 - /// </summary> - /// <param name="ch">input character in Unicode, or character in Basic Latin range.</param> - /// <returns>position in GB2312</returns> - public virtual short GetGB2312Id(char ch) - { - try - { - //byte[] buffer = Character.ToString(ch).getBytes("GB2312"); - byte[] buffer = Encoding.GetEncoding("GB2312").GetBytes(ch.ToString()); - //byte[] buffer = Encoding.GetEncoding("hz-gb-2312").GetBytes(ch.ToString()); - if (buffer.Length != 2) - { - // Should be a two-byte character - return -1; - } - int b0 = (buffer[0] & 0x0FF) - 161; // Code starts from A1, therefore subtract 0xA1=161 - int b1 = (buffer[1] & 0x0FF) - 161; // There is no Chinese char for the first and last symbol. - // Therefore, each code page only has 16*6-2=94 characters. 
- return (short)(b0 * 94 + b1); - } - catch (ArgumentException e) // Encoding is not supported by the platform - { - throw new Exception(e.ToString(), e); - } - } - - /// <summary> - /// 32-bit FNV Hash Function - /// </summary> - /// <param name="c">input character</param> - /// <returns>hashcode</returns> - public virtual long Hash1(char c) - { - long p = 1099511628211L; - long hash = unchecked((long)0xcbf29ce484222325L); - hash = (hash ^ (c & 0x00FF)) * p; - hash = (hash ^ (c >> 8)) * p; - hash += hash << 13; - hash ^= hash >> 7; - hash += hash << 3; - hash ^= hash >> 17; - hash += hash << 5; - return hash; - } - - /// <summary> - /// 32-bit FNV Hash Function - /// </summary> - /// <param name="carray">character array</param> - /// <returns>hashcode</returns> - public virtual long Hash1(char[] carray) - { - long p = 1099511628211L; - long hash = unchecked((long)0xcbf29ce484222325L); - for (int i = 0; i < carray.Length; i++) - { - char d = carray[i]; - hash = (hash ^ (d & 0x00FF)) * p; - hash = (hash ^ (d >> 8)) * p; - } - - // hash += hash << 13; - // hash ^= hash >> 7; - // hash += hash << 3; - // hash ^= hash >> 17; - // hash += hash << 5; - return hash; - } - - /// <summary> - /// djb2 hash algorithmï¼this algorithm (k=33) was first reported by dan - /// bernstein many years ago in comp.lang.c. another version of this algorithm - /// (now favored by bernstein) uses xor: hash(i) = hash(i - 1) * 33 ^ str[i]; - /// the magic of number 33 (why it works better than many other constants, - /// prime or not) has never been adequately explained. - /// </summary> - /// <param name="c">character</param> - /// <returns>hashcode</returns> - public virtual int Hash2(char c) - { - int hash = 5381; - - /* hash 33 + c */ - hash = ((hash << 5) + hash) + c & 0x00FF; - hash = ((hash << 5) + hash) + c >> 8; - - return hash; - } - - /// <summary> - /// djb2 hash algorithmï¼this algorithm (k=33) was first reported by dan - /// bernstein many years ago in comp.lang.c. 
another version of this algorithm - /// (now favored by bernstein) uses xor: hash(i) = hash(i - 1) * 33 ^ str[i]; - /// the magic of number 33 (why it works better than many other constants, - /// prime or not) has never been adequately explained. - /// </summary> - /// <param name="carray">character array</param> - /// <returns>hashcode</returns> - public virtual int Hash2(char[] carray) - { - int hash = 5381; - - /* hash 33 + c */ - for (int i = 0; i < carray.Length; i++) - { - char d = carray[i]; - hash = ((hash << 5) + hash) + d & 0x00FF; - hash = ((hash << 5) + hash) + d >> 8; - } - - return hash; - } - } -} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/HHMM/BiSegGraph.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/BiSegGraph.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/BiSegGraph.cs deleted file mode 100644 index c32c8d5..0000000 --- a/src/Lucene.Net.Analysis.SmartCn/HHMM/BiSegGraph.cs +++ /dev/null @@ -1,257 +0,0 @@ -// lucene version compatibility level: 4.8.1 -using Lucene.Net.Support; -using System; -using System.Collections.Generic; -using System.Text; - -namespace Lucene.Net.Analysis.Cn.Smart.HHMM -{ - /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - - /// <summary> - /// Graph representing possible token pairs (bigrams) at each start offset in the sentence. - /// <para> - /// For each start offset, a list of possible token pairs is stored. - /// </para> - /// @lucene.experimental - /// </summary> - internal class BiSegGraph - { - private IDictionary<int, IList<SegTokenPair>> tokenPairListTable = new Dictionary<int, IList<SegTokenPair>>(); - - private IList<SegToken> segTokenList; - - private static BigramDictionary bigramDict = BigramDictionary.GetInstance(); - - public BiSegGraph(SegGraph segGraph) - { - segTokenList = segGraph.MakeIndex(); - GenerateBiSegGraph(segGraph); - } - - /// <summary> - /// Generate a <see cref="BiSegGraph"/> based upon a <see cref="SegGraph"/> - /// </summary> - private void GenerateBiSegGraph(SegGraph segGraph) - { - double smooth = 0.1; - int wordPairFreq = 0; - int maxStart = segGraph.MaxStart; - double oneWordFreq, weight, tinyDouble = 1.0 / Utility.MAX_FREQUENCE; - - int next; - char[] idBuffer; - // get the list of tokens ordered and indexed - segTokenList = segGraph.MakeIndex(); - // Because the beginning position of startToken is -1, therefore startToken can be obtained when key = -1 - int key = -1; - IList<SegToken> nextTokens = null; - while (key < maxStart) - { - if (segGraph.IsStartExist(key)) - { - - IList<SegToken> tokenList = segGraph.GetStartList(key); - - // Calculate all tokens for a given key. - foreach (SegToken t1 in tokenList) - { - oneWordFreq = t1.Weight; - next = t1.EndOffset; - nextTokens = null; - // Find the next corresponding Token. - // For example: "Sunny seashore", the present Token is "sunny", next one should be "sea" or "seashore". - // If we cannot find the next Token, then go to the end and repeat the same cycle. 
- while (next <= maxStart) - { - // Because the beginning position of endToken is sentenceLen, so equal to sentenceLen can find endToken. - if (segGraph.IsStartExist(next)) - { - nextTokens = segGraph.GetStartList(next); - break; - } - next++; - } - if (nextTokens == null) - { - break; - } - foreach (SegToken t2 in nextTokens) - { - idBuffer = new char[t1.CharArray.Length + t2.CharArray.Length + 1]; - System.Array.Copy(t1.CharArray, 0, idBuffer, 0, t1.CharArray.Length); - idBuffer[t1.CharArray.Length] = BigramDictionary.WORD_SEGMENT_CHAR; - System.Array.Copy(t2.CharArray, 0, idBuffer, - t1.CharArray.Length + 1, t2.CharArray.Length); - - // Two linked Words frequency - wordPairFreq = bigramDict.GetFrequency(idBuffer); - - // Smoothing - - // -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1 - weight = -Math - .Log(smooth - * (1.0 + oneWordFreq) - / (Utility.MAX_FREQUENCE + 0.0) - + (1.0 - smooth) - * ((1.0 - tinyDouble) * wordPairFreq / (1.0 + oneWordFreq) + tinyDouble)); - - SegTokenPair tokenPair = new SegTokenPair(idBuffer, t1.Index, - t2.Index, weight); - this.AddSegTokenPair(tokenPair); - } - } - } - key++; - } - - } - - /// <summary> - /// Returns <c>true</c> if their is a list of token pairs at this offset (index of the second token) - /// </summary> - /// <param name="to">index of the second token in the token pair</param> - /// <returns><c>true</c> if a token pair exists</returns> - public virtual bool IsToExist(int to) - { - //return tokenPairListTable.get(Integer.valueOf(to)) != null; - //return tokenPairListTable.ContainsKey(to) && tokenPairListTable[to] != null; - IList<SegTokenPair> result; - return tokenPairListTable.TryGetValue(to, out result) && result != null; - } - - /// <summary> - /// Return a <see cref="T:IList{SegTokenPair}"/> of all token pairs at this offset (index of the second token) - /// </summary> - /// <param name="to">index of the second token in the token pair</param> - /// <returns><see cref="T:IList{SegTokenPair}"/> of token pairs. 
</returns> - public virtual IList<SegTokenPair> GetToList(int to) - { - IList<SegTokenPair> result; - tokenPairListTable.TryGetValue(to, out result); - return result; - } - - /// <summary> - /// Add a <see cref="SegTokenPair"/> - /// </summary> - /// <param name="tokenPair"><see cref="SegTokenPair"/></param> - public virtual void AddSegTokenPair(SegTokenPair tokenPair) - { - int to = tokenPair.To; - if (!IsToExist(to)) - { - List<SegTokenPair> newlist = new List<SegTokenPair>(); - newlist.Add(tokenPair); - tokenPairListTable[to] = newlist; - } - else - { - IList<SegTokenPair> tokenPairList = tokenPairListTable[to]; - tokenPairList.Add(tokenPair); - } - } - - /// <summary> - /// Get the number of <see cref="SegTokenPair"/> entries in the table. - /// </summary> - /// <returns>number of <see cref="SegTokenPair"/> entries</returns> - public virtual int ToCount - { - get { return tokenPairListTable.Count; } - } - - /// <summary> - /// Find the shortest path with the Viterbi algorithm. - /// </summary> - /// <returns><see cref="T:IList{SegToken}"/></returns> - [ExceptionToNetNumericConvention] - public virtual IList<SegToken> GetShortPath() - { - int current; - int nodeCount = ToCount; - IList<PathNode> path = new List<PathNode>(); - PathNode zeroPath = new PathNode(); - zeroPath.Weight = 0; - zeroPath.PreNode = 0; - path.Add(zeroPath); - for (current = 1; current <= nodeCount; current++) - { - double weight; - IList<SegTokenPair> edges = GetToList(current); - - double minWeight = double.MaxValue; - SegTokenPair minEdge = null; - foreach (SegTokenPair edge in edges) - { - weight = edge.Weight; - PathNode preNode2 = path[edge.From]; - if (preNode2.Weight + weight < minWeight) - { - minWeight = preNode2.Weight + weight; - minEdge = edge; - } - } - PathNode newNode = new PathNode(); - newNode.Weight = minWeight; - newNode.PreNode = minEdge.From; - path.Add(newNode); - } - - // Calculate PathNodes - int preNode, lastNode; - lastNode = path.Count - 1; - current = lastNode; - 
IList<int> rpath = new List<int>(); - IList<SegToken> resultPath = new List<SegToken>(); - - rpath.Add(current); - while (current != 0) - { - PathNode currentPathNode = path[current]; - preNode = currentPathNode.PreNode; - rpath.Add(preNode); - current = preNode; - } - for (int j = rpath.Count - 1; j >= 0; j--) - { - //int idInteger = rpath.get(j); - //int id = idInteger.intValue(); - int id = rpath[j]; - SegToken t = segTokenList[id]; - resultPath.Add(t); - } - return resultPath; - } - - public override string ToString() - { - StringBuilder sb = new StringBuilder(); - ICollection<IList<SegTokenPair>> values = tokenPairListTable.Values; - foreach (IList<SegTokenPair> segList in values) - { - foreach (SegTokenPair pair in segList) - { - sb.Append(pair + "\n"); - } - } - return sb.ToString(); - } - } -} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/HHMM/BigramDictionary.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/BigramDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/BigramDictionary.cs deleted file mode 100644 index 72e5f1f..0000000 --- a/src/Lucene.Net.Analysis.SmartCn/HHMM/BigramDictionary.cs +++ /dev/null @@ -1,432 +0,0 @@ -// lucene version compatibility level: 4.8.1 -using Lucene.Net.Support; -using Lucene.Net.Support.IO; -using System; -using System.IO; -using System.Reflection; -using System.Text; - -namespace Lucene.Net.Analysis.Cn.Smart.HHMM -{ - /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - /// <summary> - /// SmartChineseAnalyzer Bigram dictionary. - /// <para/> - /// @lucene.experimental - /// </summary> - internal class BigramDictionary : AbstractDictionary - { - private BigramDictionary() - { - } - - public static readonly char WORD_SEGMENT_CHAR = '@'; - - private static BigramDictionary singleInstance; - - public static readonly int PRIME_BIGRAM_LENGTH = 402137; - - /// <summary> - /// The word associations are stored as FNV1 hashcodes, which have a small probability of collision, but save memory. - /// </summary> - private long[] bigramHashTable; - - private int[] frequencyTable; - - private int max = 0; - - private int repeat = 0; - - // static Logger log = Logger.getLogger(BigramDictionary.class); - - private static object syncLock = new object(); - - public static BigramDictionary GetInstance() - { - lock (syncLock) - { - if (singleInstance == null) - { - singleInstance = new BigramDictionary(); - - // LUCENENET specific - // LUCENE-1817: https://issues.apache.org/jira/browse/LUCENE-1817 - // This issue still existed as of 4.8.0. Here is the fix - we only - // load from a directory if the actual directory exists (AnalyzerProfile - // ensures it is an empty string if it is not available). 
- string dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR; - if (string.IsNullOrEmpty(dictRoot)) - { - singleInstance.Load(); - } - else - { - singleInstance.Load(dictRoot); - } - - - //try - //{ - // singleInstance.Load(); - //} - //catch (IOException e) - //{ - // string dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR; - // singleInstance.Load(dictRoot); - //} - //catch (TypeLoadException e) - //{ - // throw new Exception(e.ToString(), e); - //} - } - return singleInstance; - } - } - - private bool LoadFromObj(FileInfo serialObj) - { - try - { - using (Stream input = new FileStream(serialObj.FullName, FileMode.Open, FileAccess.Read)) - LoadFromInputStream(input); - return true; - } - catch (Exception e) - { - throw new Exception(e.ToString(), e); - } - } - - // LUCENENET conversion note: - // The data in Lucene is stored in a proprietary binary format (similar to - // .NET's BinarySerializer) that cannot be read back in .NET. Therefore, the - // data was extracted using Java's DataOutputStream using the following Java code. - // It can then be read in using the LoadFromInputStream method below - // (using a DataInputStream instead of a BinaryReader), and saved - // in the correct (BinaryWriter) format by calling the SaveToObj method. - // Alternatively, the data can be loaded from disk using the files - // here(https://issues.apache.org/jira/browse/LUCENE-1629) in the analysis.data.zip file, - // which will automatically produce the .mem files. - - //public void saveToOutputStream(java.io.DataOutputStream stream) throws IOException - //{ - // // save wordIndexTable - // int wiLen = wordIndexTable.length; - // stream.writeInt(wiLen); - // for (int i = 0; i<wiLen; i++) - // { - // stream.writeShort(wordIndexTable[i]); - // } - - // // save charIndexTable - // int ciLen = charIndexTable.length; - // stream.writeInt(ciLen); - // for (int i = 0; i<ciLen; i++) - // { - // stream.writeChar(charIndexTable[i]); - // } - - // int caDim1 = wordItem_charArrayTable == null ? 
-1 : wordItem_charArrayTable.length; - // stream.writeInt(caDim1); - // for (int i = 0; i<caDim1; i++) - // { - // int caDim2 = wordItem_charArrayTable[i] == null ? -1 : wordItem_charArrayTable[i].length; - // stream.writeInt(caDim2); - // for (int j = 0; j<caDim2; j++) - // { - // int caDim3 = wordItem_charArrayTable[i][j] == null ? -1 : wordItem_charArrayTable[i][j].length; - // stream.writeInt(caDim3); - // for (int k = 0; k<caDim3; k++) - // { - // stream.writeChar(wordItem_charArrayTable[i][j][k]); - // } - // } - // } - - // int fDim1 = wordItem_frequencyTable == null ? -1 : wordItem_frequencyTable.length; - // stream.writeInt(fDim1); - // for (int i = 0; i<fDim1; i++) - // { - // int fDim2 = wordItem_frequencyTable[i] == null ? -1 : wordItem_frequencyTable[i].length; - // stream.writeInt(fDim2); - // for (int j = 0; j<fDim2; j++) - // { - // stream.writeInt(wordItem_frequencyTable[i][j]); - // } - // } - //} - - private void LoadFromInputStream(Stream serialObjectInputStream) - { - //ObjectInputStream input = new ObjectInputStream(serialObjectInputStream); - //bigramHashTable = (long[])input.readObject(); - //frequencyTable = (int[])input.readObject(); - //// log.info("load bigram dict from serialization."); - //input.close(); - - using (var reader = new BinaryReader(serialObjectInputStream)) - //using (var reader = new DataInputStream(serialObjectInputStream)) - { - // Read bigramHashTable - int bhLen = reader.ReadInt32(); - bigramHashTable = new long[bhLen]; - for (int i = 0; i < bhLen; i++) - { - bigramHashTable[i] = reader.ReadInt64(); - } - - // Read frequencyTable - int fLen = reader.ReadInt32(); - frequencyTable = new int[fLen]; - for (int i = 0; i < fLen; i++) - { - frequencyTable[i] = reader.ReadInt32(); - } - } - - // log.info("load bigram dict from serialization."); - } - - private void SaveToObj(FileInfo serialObj) - { - try - { - //ObjectOutputStream output = new ObjectOutputStream(new FileStream( - // serialObj.FullName, FileMode.Create, 
FileAccess.Write)); - //output.writeObject(bigramHashTable); - //output.writeObject(frequencyTable); - //output.close(); - - using (Stream output = new FileStream(serialObj.FullName, FileMode.Create, FileAccess.Write)) - { - using (BinaryWriter writer = new BinaryWriter(output)) - { - int bhLen = bigramHashTable.Length; - writer.Write(bhLen); - for (int i = 0; i < bhLen; i++) - { - writer.Write(bigramHashTable[i]); - } - - int fLen = frequencyTable.Length; - writer.Write(fLen); - for (int i = 0; i < fLen; i++) - { - writer.Write(frequencyTable[i]); - } - } - } - // log.info("serialize bigram dict."); - } -#pragma warning disable 168 - catch (Exception e) -#pragma warning restore 168 - { - // log.warn(e.getMessage()); - } - } - - private void Load() - { - using (Stream input = this.GetType().GetTypeInfo().Assembly.FindAndGetManifestResourceStream(this.GetType(), "bigramdict.mem")) - { - LoadFromInputStream(input); - } - } - - private void Load(string dictRoot) - { - string bigramDictPath = System.IO.Path.Combine(dictRoot, "bigramdict.dct"); - - FileInfo serialObj = new FileInfo(System.IO.Path.Combine(dictRoot, "bigramdict.mem")); - - if (serialObj.Exists && LoadFromObj(serialObj)) - { - - } - else - { - try - { - bigramHashTable = new long[PRIME_BIGRAM_LENGTH]; - frequencyTable = new int[PRIME_BIGRAM_LENGTH]; - for (int i = 0; i < PRIME_BIGRAM_LENGTH; i++) - { - // it is possible for a value to hash to 0, but the probability is extremely low - bigramHashTable[i] = 0; - frequencyTable[i] = 0; - } - LoadFromFile(bigramDictPath); - } - catch (IOException e) - { - throw new Exception(e.ToString(), e); - } - SaveToObj(serialObj); - } - } - - /// <summary> - /// Load the datafile into this <see cref="BigramDictionary"/> - /// </summary> - /// <param name="dctFilePath">dctFilePath path to the Bigramdictionary (bigramdict.dct)</param> - /// <exception cref="IOException">If there is a low-level I/O error</exception> - public virtual void LoadFromFile(string dctFilePath) - { 
- int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. - // The 3756th is used (as a header) to store information. - int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; - //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r")) - using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) - { - - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) - { - string currentStr = GetCCByGB2312Id(i); - // if (i == 5231) - // System.out.println(i); - - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN).GetInt32(); - if (cnt <= 0) - { - continue; - } - total += cnt; - int j = 0; - while (j < cnt) - { - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN) - .GetInt32();// frequency - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN) - .GetInt32();// length - dctFile.Read(intBuffer, 0, intBuffer.Length); - // buffer[2] = ByteBuffer.wrap(intBuffer).order( - // ByteOrder.LITTLE_ENDIAN).getInt();// handle - - length = buffer[1]; - if (length > 0) - { - byte[] lchBuffer = new byte[length]; - dctFile.Read(lchBuffer, 0, lchBuffer.Length); - //tmpword = new String(lchBuffer, "GB2312"); - tmpword = Encoding.GetEncoding("GB2312").GetString(lchBuffer); - //tmpword = Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer); - if (i != 3755 + GB2312_FIRST_CHAR) - { - tmpword = currentStr + tmpword; - } - char[] carray = tmpword.ToCharArray(); - long hashId = Hash1(carray); - int index = GetAvaliableIndex(hashId, carray); - if (index != -1) - { - if (bigramHashTable[index] == 0) - { - bigramHashTable[index] = hashId; 
- // bigramStringTable[index] = tmpword; - } - frequencyTable[index] += buffer[0]; - } - } - j++; - } - } - } - // log.info("load dictionary done! " + dctFilePath + " total:" + total); - } - - private int GetAvaliableIndex(long hashId, char[] carray) - { - int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH); - int hash2 = Hash2(carray) % PRIME_BIGRAM_LENGTH; - if (hash1 < 0) - hash1 = PRIME_BIGRAM_LENGTH + hash1; - if (hash2 < 0) - hash2 = PRIME_BIGRAM_LENGTH + hash2; - int index = hash1; - int i = 1; - while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId - && i < PRIME_BIGRAM_LENGTH) - { - index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH; - i++; - } - // System.out.println(i - 1); - - if (i < PRIME_BIGRAM_LENGTH - && (bigramHashTable[index] == 0 || bigramHashTable[index] == hashId)) - { - return index; - } - else - return -1; - } - - /// <summary> - /// lookup the index into the frequency array. - /// </summary> - private int GetBigramItemIndex(char[] carray) - { - long hashId = Hash1(carray); - int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH); - int hash2 = Hash2(carray) % PRIME_BIGRAM_LENGTH; - if (hash1 < 0) - hash1 = PRIME_BIGRAM_LENGTH + hash1; - if (hash2 < 0) - hash2 = PRIME_BIGRAM_LENGTH + hash2; - int index = hash1; - int i = 1; - repeat++; - while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId - && i < PRIME_BIGRAM_LENGTH) - { - index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH; - i++; - repeat++; - if (i > max) - max = i; - } - // System.out.println(i - 1); - - if (i < PRIME_BIGRAM_LENGTH && bigramHashTable[index] == hashId) - { - return index; - } - else - return -1; - } - - public int GetFrequency(char[] carray) - { - int index = GetBigramItemIndex(carray); - if (index != -1) - return frequencyTable[index]; - return 0; - } - } -} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/HHMM/HHMMSegmenter.cs ---------------------------------------------------------------------- diff 
--git a/src/Lucene.Net.Analysis.SmartCn/HHMM/HHMMSegmenter.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/HHMMSegmenter.cs deleted file mode 100644 index e2ef365..0000000 --- a/src/Lucene.Net.Analysis.SmartCn/HHMM/HHMMSegmenter.cs +++ /dev/null @@ -1,253 +0,0 @@ -// lucene version compatibility level: 4.8.1 -using System.Collections.Generic; -using System.Text; - -namespace Lucene.Net.Analysis.Cn.Smart.HHMM -{ - /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - /// <summary> - /// Finds the optimal segmentation of a sentence into Chinese words - /// <para/> - /// @lucene.experimental - /// </summary> - public class HHMMSegmenter - { - private static WordDictionary wordDict = WordDictionary.GetInstance(); - - /// <summary> - /// Create the <see cref="SegGraph"/> for a sentence. 
- /// </summary> - /// <param name="sentence">input sentence, without start and end markers</param> - /// <returns><see cref="SegGraph"/> corresponding to the input sentence.</returns> - private SegGraph CreateSegGraph(string sentence) - { - int i = 0, j; - int length = sentence.Length; - int foundIndex; - CharType[] charTypeArray = GetCharTypes(sentence); - StringBuilder wordBuf = new StringBuilder(); - SegToken token; - int frequency = 0; // the number of times word appears. - bool hasFullWidth; - WordType wordType; - char[] charArray; - - SegGraph segGraph = new SegGraph(); - while (i < length) - { - hasFullWidth = false; - switch (charTypeArray[i]) - { - case CharType.SPACE_LIKE: - i++; - break; - case CharType.HANZI: - j = i + 1; - //wordBuf.delete(0, wordBuf.length()); - wordBuf.Remove(0, wordBuf.Length); - // It doesn't matter if a single Chinese character (Hanzi) can form a phrase or not, - // it will store that single Chinese character (Hanzi) in the SegGraph. Otherwise, it will - // cause word division. - wordBuf.Append(sentence[i]); - charArray = new char[] { sentence[i] }; - frequency = wordDict.GetFrequency(charArray); - token = new SegToken(charArray, i, j, WordType.CHINESE_WORD, - frequency); - segGraph.AddToken(token); - - foundIndex = wordDict.GetPrefixMatch(charArray); - while (j <= length && foundIndex != -1) - { - if (wordDict.IsEqual(charArray, foundIndex) && charArray.Length > 1) - { - // It is the phrase we are looking for; In other words, we have found a phrase SegToken - // from i to j. It is not a monosyllabic word (single word). 
- frequency = wordDict.GetFrequency(charArray); - token = new SegToken(charArray, i, j, WordType.CHINESE_WORD, - frequency); - segGraph.AddToken(token); - } - - while (j < length && charTypeArray[j] == CharType.SPACE_LIKE) - j++; - - if (j < length && charTypeArray[j] == CharType.HANZI) - { - wordBuf.Append(sentence[j]); - charArray = new char[wordBuf.Length]; - //wordBuf.GetChars(0, charArray.Length, charArray, 0); - wordBuf.CopyTo(0, charArray, 0, charArray.Length); - // idArray has been found (foundWordIndex!=-1) as a prefix before. - // Therefore, idArray after it has been lengthened can only appear after foundWordIndex. - // So start searching after foundWordIndex. - foundIndex = wordDict.GetPrefixMatch(charArray, foundIndex); - j++; - } - else - { - break; - } - } - i++; - break; - case CharType.FULLWIDTH_LETTER: - hasFullWidth = true; /* intentional fallthrough */ - - j = i + 1; - while (j < length - && (charTypeArray[j] == CharType.LETTER || charTypeArray[j] == CharType.FULLWIDTH_LETTER)) - { - if (charTypeArray[j] == CharType.FULLWIDTH_LETTER) - hasFullWidth = true; - j++; - } - // Found a Token from i to j. Type is LETTER char string. - charArray = Utility.STRING_CHAR_ARRAY; - frequency = wordDict.GetFrequency(charArray); - wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING; - token = new SegToken(charArray, i, j, wordType, frequency); - segGraph.AddToken(token); - i = j; - break; - - case CharType.LETTER: - j = i + 1; - while (j < length - && (charTypeArray[j] == CharType.LETTER || charTypeArray[j] == CharType.FULLWIDTH_LETTER)) - { - if (charTypeArray[j] == CharType.FULLWIDTH_LETTER) - hasFullWidth = true; - j++; - } - // Found a Token from i to j. Type is LETTER char string. - charArray = Utility.STRING_CHAR_ARRAY; - frequency = wordDict.GetFrequency(charArray); - wordType = hasFullWidth ? 
WordType.FULLWIDTH_STRING : WordType.STRING; - token = new SegToken(charArray, i, j, wordType, frequency); - segGraph.AddToken(token); - i = j; - break; - case CharType.FULLWIDTH_DIGIT: - hasFullWidth = true; /* intentional fallthrough */ - - j = i + 1; - while (j < length - && (charTypeArray[j] == CharType.DIGIT || charTypeArray[j] == CharType.FULLWIDTH_DIGIT)) - { - if (charTypeArray[j] == CharType.FULLWIDTH_DIGIT) - hasFullWidth = true; - j++; - } - // Found a Token from i to j. Type is NUMBER char string. - charArray = Utility.NUMBER_CHAR_ARRAY; - frequency = wordDict.GetFrequency(charArray); - wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER; - token = new SegToken(charArray, i, j, wordType, frequency); - segGraph.AddToken(token); - i = j; - break; - - case CharType.DIGIT: - j = i + 1; - while (j < length - && (charTypeArray[j] == CharType.DIGIT || charTypeArray[j] == CharType.FULLWIDTH_DIGIT)) - { - if (charTypeArray[j] == CharType.FULLWIDTH_DIGIT) - hasFullWidth = true; - j++; - } - // Found a Token from i to j. Type is NUMBER char string. - charArray = Utility.NUMBER_CHAR_ARRAY; - frequency = wordDict.GetFrequency(charArray); - wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER; - token = new SegToken(charArray, i, j, wordType, frequency); - segGraph.AddToken(token); - i = j; - break; - case CharType.DELIMITER: - j = i + 1; - // No need to search the weight for the punctuation. Picking the highest frequency will work. - frequency = Utility.MAX_FREQUENCE; - charArray = new char[] { sentence[i] }; - token = new SegToken(charArray, i, j, WordType.DELIMITER, frequency); - segGraph.AddToken(token); - i = j; - break; - default: - j = i + 1; - // Treat the unrecognized char symbol as unknown string. - // For example, any symbol not in GB2312 is treated as one of these. 
- charArray = Utility.STRING_CHAR_ARRAY; - frequency = wordDict.GetFrequency(charArray); - token = new SegToken(charArray, i, j, WordType.STRING, frequency); - segGraph.AddToken(token); - i = j; - break; - } - } - - // Add two more Tokens: "beginning xx beginning" - charArray = Utility.START_CHAR_ARRAY; - frequency = wordDict.GetFrequency(charArray); - token = new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency); - segGraph.AddToken(token); - - // "end xx end" - charArray = Utility.END_CHAR_ARRAY; - frequency = wordDict.GetFrequency(charArray); - token = new SegToken(charArray, length, length + 1, WordType.SENTENCE_END, - frequency); - segGraph.AddToken(token); - - return segGraph; - } - - /// <summary> - /// Get the character types for every character in a sentence. - /// </summary> - /// <param name="sentence">input sentence</param> - /// <returns>array of character types corresponding to character positions in the sentence</returns> - /// <seealso cref="Utility.GetCharType(char)"/> - private static CharType[] GetCharTypes(string sentence) - { - int length = sentence.Length; - CharType[] charTypeArray = new CharType[length]; - // the type of each character by position - for (int i = 0; i < length; i++) - { - charTypeArray[i] = Utility.GetCharType(sentence[i]); - } - - return charTypeArray; - } - - /// <summary> - /// Return a list of <see cref="SegToken"/> representing the best segmentation of a sentence - /// </summary> - /// <param name="sentence">input sentence</param> - /// <returns>best segmentation as a <see cref="T:IList{SegToken}"/></returns> - public virtual IList<SegToken> Process(string sentence) - { - SegGraph segGraph = CreateSegGraph(sentence); - BiSegGraph biSegGraph = new BiSegGraph(segGraph); - IList<SegToken> shortPath = biSegGraph.GetShortPath(); - return shortPath; - } - } -} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/HHMM/PathNode.cs 
---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/PathNode.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/PathNode.cs deleted file mode 100644 index b8de5fb..0000000 --- a/src/Lucene.Net.Analysis.SmartCn/HHMM/PathNode.cs +++ /dev/null @@ -1,81 +0,0 @@ -// lucene version compatibility level: 4.8.1 -using Lucene.Net.Support; -using System; - -namespace Lucene.Net.Analysis.Cn.Smart.HHMM -{ - /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - /// <summary> - /// SmartChineseAnalyzer internal node representation - /// <para> - /// Used by <see cref="BiSegGraph"/> to maximize the segmentation with the Viterbi algorithm. 
- /// </para> - /// @lucene.experimental - /// </summary> - internal class PathNode : IComparable<PathNode> - { - public double Weight { get; set; } - - public int PreNode { get; set; } - - public virtual int CompareTo(PathNode pn) - { - if (Weight < pn.Weight) - return -1; - else if (Weight == pn.Weight) - return 0; - else - return 1; - } - - /// <summary> - /// <see cref="object.GetHashCode()"/> - /// </summary> - public override int GetHashCode() - { - int prime = 31; - int result = 1; - result = prime * result + PreNode; - long temp; - temp = Number.DoubleToInt64Bits(Weight); - result = prime * result + (int)(temp ^ (int)((uint)temp >> 32)); - return result; - } - - /// <summary> - /// <see cref="object.Equals(object)"/> - /// </summary> - public override bool Equals(object obj) - { - if (this == obj) - return true; - if (obj == null) - return false; - if (GetType() != obj.GetType()) - return false; - PathNode other = (PathNode)obj; - if (PreNode != other.PreNode) - return false; - if (Number.DoubleToInt64Bits(Weight) != Number - .DoubleToInt64Bits(other.Weight)) - return false; - return true; - } - } -} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/HHMM/SegGraph.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/SegGraph.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegGraph.cs deleted file mode 100644 index f3643eb..0000000 --- a/src/Lucene.Net.Analysis.SmartCn/HHMM/SegGraph.cs +++ /dev/null @@ -1,161 +0,0 @@ -// lucene version compatibility level: 4.8.1 -using System.Collections.Generic; -using System.Text; - -namespace Lucene.Net.Analysis.Cn.Smart.HHMM -{ - /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - /// <summary> - /// Graph representing possible tokens at each start offset in the sentence. - /// <para> - /// For each start offset, a list of possible tokens is stored. - /// </para> - /// @lucene.experimental - /// </summary> - internal class SegGraph - { - /// <summary> - /// Map of start offsets to <see cref="T:IList{SegToken}"/> of tokens at that position - /// </summary> - private IDictionary<int, IList<SegToken>> tokenListTable = new Dictionary<int, IList<SegToken>>(); - - private int maxStart = -1; - - /// <summary> - /// Returns <c>true</c> if a mapping for the specified start offset exists - /// </summary> - /// <param name="s">startOffset</param> - /// <returns><c>true</c> if there are tokens for the startOffset</returns> - public virtual bool IsStartExist(int s) - { - //return tokenListTable.get(s) != null; - IList<SegToken> result; - return tokenListTable.TryGetValue(s, out result) && result != null; - } - - /// <summary> - /// Get the list of tokens at the specified start offset - /// </summary> - /// <param name="s">startOffset</param> - /// <returns><see cref="T:IList{SegToken}"/> of tokens at the specified start offset.</returns> - public virtual IList<SegToken> GetStartList(int s) - { - IList<SegToken> result; - tokenListTable.TryGetValue(s, out result); - return result; - } - - /// <summary> - /// Get the highest start offset in the map. 
Returns maximum start offset, or -1 if the map is empty. - /// </summary> - public virtual int MaxStart - { - get { return maxStart; } - } - - /// <summary> - /// Set the <see cref="SegToken.Index"/> for each token, based upon its order by startOffset. - /// </summary> - /// <returns>a <see cref="T:IList{SegToken}"/> of these ordered tokens.</returns> - public virtual IList<SegToken> MakeIndex() - { - IList<SegToken> result = new List<SegToken>(); - int s = -1, count = 0, size = tokenListTable.Count; - IList<SegToken> tokenList; - int index = 0; - while (count < size) - { - if (IsStartExist(s)) - { - tokenList = tokenListTable[s]; - foreach (SegToken st in tokenList) - { - st.Index = index; - result.Add(st); - index++; - } - count++; - } - s++; - } - return result; - } - - /// <summary> - /// Add a <see cref="SegToken"/> to the mapping, creating a new mapping at the token's startOffset if one does not exist. - /// </summary> - /// <param name="token">token <see cref="SegToken"/>.</param> - public virtual void AddToken(SegToken token) - { - int s = token.StartOffset; - if (!IsStartExist(s)) - { - List<SegToken> newlist = new List<SegToken>(); - newlist.Add(token); - tokenListTable[s] = newlist; - } - else - { - IList<SegToken> tokenList = tokenListTable[s]; - tokenList.Add(token); - } - if (s > maxStart) - { - maxStart = s; - } - } - - /// <summary> - /// Return a <see cref="T:IList{SegToken}"/> of all tokens in the map, ordered by startOffset. 
- /// </summary> - /// <returns><see cref="T:IList{SegToken}"/> of all tokens in the map.</returns> - public virtual IList<SegToken> ToTokenList() - { - IList<SegToken> result = new List<SegToken>(); - int s = -1, count = 0, size = tokenListTable.Count; - IList<SegToken> tokenList; - - while (count < size) - { - if (IsStartExist(s)) - { - tokenList = tokenListTable[s]; - foreach (SegToken st in tokenList) - { - result.Add(st); - } - count++; - } - s++; - } - return result; - } - - public override string ToString() - { - IList<SegToken> tokenList = this.ToTokenList(); - StringBuilder sb = new StringBuilder(); - foreach (SegToken t in tokenList) - { - sb.Append(t + "\n"); - } - return sb.ToString(); - } - } -} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/HHMM/SegToken.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/SegToken.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegToken.cs deleted file mode 100644 index f557cbe..0000000 --- a/src/Lucene.Net.Analysis.SmartCn/HHMM/SegToken.cs +++ /dev/null @@ -1,124 +0,0 @@ -// lucene version compatibility level: 4.8.1 -using Lucene.Net.Support; - -namespace Lucene.Net.Analysis.Cn.Smart.HHMM -{ - /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - - /// <summary> - /// SmartChineseAnalyzer internal token - /// <para/> - /// @lucene.experimental - /// </summary> - public class SegToken - { - /// <summary> - /// Character array containing token text - /// </summary> - [WritableArray] - public char[] CharArray { get; set; } - - /// <summary> - /// start offset into original sentence - /// </summary> - public int StartOffset { get; set; } - - /// <summary> - /// end offset into original sentence - /// </summary> - public int EndOffset { get; set; } - - /// <summary> - /// <see cref="Smart.WordType"/> of the text - /// </summary> - public WordType WordType { get; set; } - - /// <summary> - /// word frequency - /// </summary> - public int Weight { get; set; } - - /// <summary> - /// during segmentation, this is used to store the index of the token in the token list table - /// </summary> - public int Index { get; set; } - - /// <summary> - /// Create a new <see cref="SegToken"/> from a character array. 
- /// </summary> - /// <param name="idArray">character array containing text</param> - /// <param name="start">start offset of <see cref="SegToken"/> in original sentence</param> - /// <param name="end">end offset of <see cref="SegToken"/> in original sentence</param> - /// <param name="wordType"><see cref="Smart.WordType"/> of the text</param> - /// <param name="weight">word frequency</param> - public SegToken(char[] idArray, int start, int end, WordType wordType, int weight) - { - this.CharArray = idArray; - this.StartOffset = start; - this.EndOffset = end; - this.WordType = wordType; - this.Weight = weight; - } - - /// <summary> - /// <see cref="object.GetHashCode()"/> - /// </summary> - public override int GetHashCode() - { - int prime = 31; - int result = 1; - for (int i = 0; i < CharArray.Length; i++) - { - result = prime * result + CharArray[i]; - } - result = prime * result + EndOffset; - result = prime * result + Index; - result = prime * result + StartOffset; - result = prime * result + Weight; - result = prime * result + (int)WordType; - return result; - } - - /// <summary> - /// <see cref="object.Equals(object)"/> - /// </summary> - public override bool Equals(object obj) - { - if (this == obj) - return true; - if (obj == null) - return false; - if (GetType() != obj.GetType()) - return false; - SegToken other = (SegToken)obj; - if (!Arrays.Equals(CharArray, other.CharArray)) - return false; - if (EndOffset != other.EndOffset) - return false; - if (Index != other.Index) - return false; - if (StartOffset != other.StartOffset) - return false; - if (Weight != other.Weight) - return false; - if (WordType != other.WordType) - return false; - return true; - } - } -} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenFilter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenFilter.cs 
b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenFilter.cs deleted file mode 100644 index 5b61cff..0000000 --- a/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenFilter.cs +++ /dev/null @@ -1,76 +0,0 @@ -// lucene version compatibility level: 4.8.1 -namespace Lucene.Net.Analysis.Cn.Smart.HHMM -{ - /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - /// <summary> - /// <para> - /// Filters a <see cref="SegToken"/> by converting full-width latin to half-width, then lowercasing latin. - /// Additionally, all punctuation is converted into <see cref="Utility.COMMON_DELIMITER"/> - /// </para> - /// @lucene.experimental - /// </summary> - public class SegTokenFilter - { - /// <summary> - /// Filter an input <see cref="SegToken"/> - /// <para> - /// Full-width latin will be converted to half-width, then all latin will be lowercased. 
- /// All punctuation is converted into <see cref="Utility.COMMON_DELIMITER"/> - /// </para> - /// </summary> - /// <param name="token">Input <see cref="SegToken"/>.</param> - /// <returns>Normalized <see cref="SegToken"/>.</returns> - public virtual SegToken Filter(SegToken token) - { - switch (token.WordType) - { - case WordType.FULLWIDTH_NUMBER: - case WordType.FULLWIDTH_STRING: /* first convert full-width -> half-width */ - for (int i = 0; i < token.CharArray.Length; i++) - { - if (token.CharArray[i] >= 0xFF10) - { - token.CharArray[i] = (char)(token.CharArray[i] - 0xFEE0); - } - - if (token.CharArray[i] >= 0x0041 && token.CharArray[i] <= 0x005A) /* lowercase latin */ - { - token.CharArray[i] = (char)(token.CharArray[i] + 0x0020); - } - } - break; - case WordType.STRING: - for (int i = 0; i < token.CharArray.Length; i++) - { - if (token.CharArray[i] >= 0x0041 && token.CharArray[i] <= 0x005A) /* lowercase latin */ - { - token.CharArray[i] = (char)(token.CharArray[i] + 0x0020); - } - } - break; - case WordType.DELIMITER: /* convert all punctuation to Utility.COMMON_DELIMITER */ - token.CharArray = Utility.COMMON_DELIMITER; - break; - default: - break; - } - return token; - } - } -} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenPair.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenPair.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenPair.cs deleted file mode 100644 index b5ceecd..0000000 --- a/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenPair.cs +++ /dev/null @@ -1,96 +0,0 @@ -// lucene version compatibility level: 4.8.1 -using Lucene.Net.Support; - -namespace Lucene.Net.Analysis.Cn.Smart.HHMM -{ - /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - - /// <summary> - /// A pair of tokens in <see cref="SegGraph"/> - /// <para/> - /// @lucene.experimental - /// </summary> - internal class SegTokenPair - { - [WritableArray] - public char[] CharArray { get; set; } - - /// <summary> - /// index of the first token in <see cref="SegGraph"/> - /// </summary> - public int From { get; set; } - - /// <summary> - /// index of the second token in <see cref="SegGraph"/> - /// </summary> - public int To { get; set; } - - public double Weight { get; set; } - - public SegTokenPair(char[] idArray, int from, int to, double weight) - { - this.CharArray = idArray; - this.From = from; - this.To = to; - this.Weight = weight; - } - - /// <summary> - /// <see cref="object.GetHashCode()"/> - /// </summary> - public override int GetHashCode() - { - int prime = 31; - int result = 1; - for (int i = 0; i < CharArray.Length; i++) - { - result = prime * result + CharArray[i]; - } - result = prime * result + From; - result = prime * result + To; - long temp; - temp = Number.DoubleToInt64Bits(Weight); - result = prime * result + (int)(temp ^ (int)((uint)temp >> 32)); - return result; - } - - /// <summary> - /// <see cref="object.Equals(object)"/> - /// </summary> - public override bool Equals(object obj) - { - if (this == obj) - return true; - if (obj == null) - return false; - if (GetType() != obj.GetType()) - return false; - SegTokenPair other = 
(SegTokenPair)obj; - if (!Arrays.Equals(CharArray, other.CharArray)) - return false; - if (From != other.From) - return false; - if (To != other.To) - return false; - if (Number.DoubleToInt64Bits(Weight) != Number - .DoubleToInt64Bits(other.Weight)) - return false; - return true; - } - } -}
