/// <summary>
/// Holds the candidate tokens of a sentence, grouped by the offset at which
/// each token starts.
/// <para>
/// Every start offset maps to the list of tokens that begin there.
/// </para>
/// @lucene.experimental
/// </summary>
internal class SegGraph
{
    /// <summary>
    /// Start offset -> tokens beginning at that offset.
    /// </summary>
    private IDictionary<int, IList<SegToken>> tokenListTable = new Dictionary<int, IList<SegToken>>();

    private int maxStart = -1;

    /// <summary>
    /// Tells whether a non-null token list is registered for the given start offset.
    /// </summary>
    /// <param name="s">startOffset</param>
    /// <returns><c>true</c> if there are tokens for the startOffset</returns>
    public virtual bool IsStartExist(int s)
    {
        IList<SegToken> found;
        if (!tokenListTable.TryGetValue(s, out found))
        {
            return false;
        }
        return found != null;
    }

    /// <summary>
    /// Looks up the tokens registered at the given start offset.
    /// </summary>
    /// <param name="s">startOffset</param>
    /// <returns>the token list for that offset, or <c>null</c> when none is registered.</returns>
    public virtual IList<SegToken> GetStartList(int s)
    {
        IList<SegToken> found;
        tokenListTable.TryGetValue(s, out found);
        return found;
    }

    /// <summary>
    /// Largest start offset seen by <see cref="AddToken(SegToken)"/>, or -1 when nothing was added.
    /// </summary>
    public virtual int MaxStart
    {
        get { return maxStart; }
    }

    /// <summary>
    /// Walks the tokens in ascending start-offset order and stores each token's
    /// position in that order into <see cref="SegToken.Index"/>.
    /// </summary>
    /// <returns>the tokens in the order in which they were numbered.</returns>
    public virtual IList<SegToken> MakeIndex()
    {
        return CollectByStartOffset(true);
    }

    /// <summary>
    /// Registers a token under its start offset, creating the list for that
    /// offset on first use, and keeps <see cref="MaxStart"/> up to date.
    /// </summary>
    /// <param name="token">token <see cref="SegToken"/>.</param>
    public virtual void AddToken(SegToken token)
    {
        int offset = token.StartOffset;
        if (IsStartExist(offset))
        {
            tokenListTable[offset].Add(token);
        }
        else
        {
            tokenListTable[offset] = new List<SegToken> { token };
        }

        if (maxStart < offset)
        {
            maxStart = offset;
        }
    }

    /// <summary>
    /// Flattens the map into a single list ordered by start offset.
    /// </summary>
    /// <returns>all tokens in the map.</returns>
    public virtual IList<SegToken> ToTokenList()
    {
        return CollectByStartOffset(false);
    }

    /// <summary>
    /// Renders every token (ordered by start offset) on its own line.
    /// </summary>
    public override string ToString()
    {
        StringBuilder text = new StringBuilder();
        foreach (SegToken token in this.ToTokenList())
        {
            text.Append(token).Append("\n");
        }
        return text.ToString();
    }

    /// <summary>
    /// Shared walker: scans offsets upward from -1 until as many offsets as the
    /// map has entries have been visited, appending their tokens in order.
    /// </summary>
    /// <param name="assignIndex">when <c>true</c>, each token's Index is set to its running position.</param>
    private IList<SegToken> CollectByStartOffset(bool assignIndex)
    {
        IList<SegToken> ordered = new List<SegToken>();
        int remaining = tokenListTable.Count; // snapshot: number of offsets still to visit
        int nextIndex = 0;

        for (int offset = -1; remaining > 0; offset++)
        {
            if (!IsStartExist(offset))
            {
                continue;
            }

            foreach (SegToken token in tokenListTable[offset])
            {
                if (assignIndex)
                {
                    token.Index = nextIndex;
                    nextIndex++;
                }
                ordered.Add(token);
            }
            remaining--;
        }
        return ordered;
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/Hhmm/SegToken.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/SegToken.cs b/src/Lucene.Net.Analysis.SmartCn/Hhmm/SegToken.cs new file mode 100644 index 0000000..f0bdea4 --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/Hhmm/SegToken.cs @@ -0,0 +1,124 @@ +// lucene version compatibility level: 4.8.1 +using Lucene.Net.Support; + +namespace Lucene.Net.Analysis.Cn.Smart.Hhmm +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/// <summary>
/// SmartChineseAnalyzer internal token
/// <para/>
/// @lucene.experimental
/// </summary>
public class SegToken
{
    /// <summary>
    /// Character array containing token text
    /// </summary>
    [WritableArray]
    public char[] CharArray { get; set; }

    /// <summary>
    /// start offset into original sentence
    /// </summary>
    public int StartOffset { get; set; }

    /// <summary>
    /// end offset into original sentence
    /// </summary>
    public int EndOffset { get; set; }

    /// <summary>
    /// <see cref="Smart.WordType"/> of the text
    /// </summary>
    public WordType WordType { get; set; }

    /// <summary>
    /// word frequency
    /// </summary>
    public int Weight { get; set; }

    /// <summary>
    /// during segmentation, this is used to store the index of the token in the token list table
    /// </summary>
    public int Index { get; set; }

    /// <summary>
    /// Create a new <see cref="SegToken"/> from a character array.
    /// </summary>
    /// <param name="idArray">character array containing text</param>
    /// <param name="start">start offset of <see cref="SegToken"/> in original sentence</param>
    /// <param name="end">end offset of <see cref="SegToken"/> in original sentence</param>
    /// <param name="wordType"><see cref="Smart.WordType"/> of the text</param>
    /// <param name="weight">word frequency</param>
    public SegToken(char[] idArray, int start, int end, WordType wordType, int weight)
    {
        this.CharArray = idArray;
        this.StartOffset = start;
        this.EndOffset = end;
        this.WordType = wordType;
        this.Weight = weight;
    }

    /// <summary>
    /// Combines the token text and every numeric field into a 31-based polynomial hash.
    /// </summary>
    /// <returns>hash code consistent with <see cref="Equals(object)"/>.</returns>
    public override int GetHashCode()
    {
        const int prime = 31;

        // The polynomial is expected to wrap around; make that explicit so the
        // method cannot throw OverflowException in a checked build.
        unchecked
        {
            int result = 1;

            // CharArray has a public setter, so it may legitimately be null;
            // a null array simply contributes nothing to the hash.
            char[] chars = CharArray;
            if (chars != null)
            {
                for (int i = 0; i < chars.Length; i++)
                {
                    result = prime * result + chars[i];
                }
            }

            result = prime * result + EndOffset;
            result = prime * result + Index;
            result = prime * result + StartOffset;
            result = prime * result + Weight;
            result = prime * result + (int)WordType;
            return result;
        }
    }

    /// <summary>
    /// Two tokens are equal when they have the same runtime type, the same text
    /// and identical offsets, index, weight and word type.
    /// </summary>
    /// <param name="obj">object to compare with</param>
    /// <returns><c>true</c> if <paramref name="obj"/> is an equal token.</returns>
    public override bool Equals(object obj)
    {
        if (ReferenceEquals(this, obj))
            return true;
        if (obj == null)
            return false;
        if (GetType() != obj.GetType())
            return false;
        SegToken other = (SegToken)obj;
        if (!Arrays.Equals(CharArray, other.CharArray))
            return false;
        if (EndOffset != other.EndOffset)
            return false;
        if (Index != other.Index)
            return false;
        if (StartOffset != other.StartOffset)
            return false;
        if (Weight != other.Weight)
            return false;
        if (WordType != other.WordType)
            return false;
        return true;
    }
}
/// <summary>
/// <para>
/// Normalizes a <see cref="SegToken"/>: full-width latin becomes half-width, latin
/// letters are lowercased, and every punctuation token is replaced by
/// <see cref="Utility.COMMON_DELIMITER"/>.
/// </para>
/// @lucene.experimental
/// </summary>
public class SegTokenFilter
{
    /// <summary>
    /// Normalize an input <see cref="SegToken"/> in place.
    /// <para>
    /// Full-width number/string tokens are shifted to half-width and then lowercased;
    /// plain string tokens are lowercased; delimiter tokens get their text replaced by
    /// <see cref="Utility.COMMON_DELIMITER"/>. Other word types are left untouched.
    /// </para>
    /// </summary>
    /// <param name="token">Input <see cref="SegToken"/>.</param>
    /// <returns>The same <see cref="SegToken"/> instance, normalized.</returns>
    public virtual SegToken Filter(SegToken token)
    {
        WordType type = token.WordType;

        if (type == WordType.FULLWIDTH_NUMBER || type == WordType.FULLWIDTH_STRING)
        {
            char[] chars = token.CharArray;
            for (int pos = 0; pos < chars.Length; pos++)
            {
                /* first convert full-width -> half-width */
                if (chars[pos] >= 0xFF10)
                {
                    chars[pos] = (char)(chars[pos] - 0xFEE0);
                }
                LowercaseLatinAt(chars, pos);
            }
        }
        else if (type == WordType.STRING)
        {
            char[] chars = token.CharArray;
            for (int pos = 0; pos < chars.Length; pos++)
            {
                LowercaseLatinAt(chars, pos);
            }
        }
        else if (type == WordType.DELIMITER)
        {
            /* convert all punctuation to Utility.COMMON_DELIMITER */
            token.CharArray = Utility.COMMON_DELIMITER;
        }

        return token;
    }

    /// <summary>
    /// Lowercases the character at <paramref name="pos"/> when it is in the range 'A'..'Z'.
    /// </summary>
    private static void LowercaseLatinAt(char[] chars, int pos)
    {
        char current = chars[pos];
        if (current >= 0x0041 && current <= 0x005A)
        {
            chars[pos] = (char)(current + 0x0020);
        }
    }
}
--git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/SegTokenPair.cs b/src/Lucene.Net.Analysis.SmartCn/Hhmm/SegTokenPair.cs new file mode 100644 index 0000000..f454ba9 --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/Hhmm/SegTokenPair.cs @@ -0,0 +1,96 @@ +// lucene version compatibility level: 4.8.1 +using Lucene.Net.Support; + +namespace Lucene.Net.Analysis.Cn.Smart.Hhmm +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/// <summary>
/// A pair of tokens in <see cref="SegGraph"/>
/// <para/>
/// @lucene.experimental
/// </summary>
internal class SegTokenPair
{
    /// <summary>
    /// Character array associated with this pair.
    /// </summary>
    [WritableArray]
    public char[] CharArray { get; set; }

    /// <summary>
    /// index of the first token in <see cref="SegGraph"/>
    /// </summary>
    public int From { get; set; }

    /// <summary>
    /// index of the second token in <see cref="SegGraph"/>
    /// </summary>
    public int To { get; set; }

    /// <summary>
    /// Weight assigned to this pair.
    /// </summary>
    public double Weight { get; set; }

    /// <summary>
    /// Create a new pair.
    /// </summary>
    /// <param name="idArray">character array stored in <see cref="CharArray"/></param>
    /// <param name="from">index of the first token</param>
    /// <param name="to">index of the second token</param>
    /// <param name="weight">weight of the pair</param>
    public SegTokenPair(char[] idArray, int from, int to, double weight)
    {
        this.CharArray = idArray;
        this.From = from;
        this.To = to;
        this.Weight = weight;
    }

    /// <summary>
    /// Combines the text, both indexes and the bit pattern of <see cref="Weight"/>
    /// into a 31-based polynomial hash.
    /// </summary>
    /// <returns>hash code consistent with <see cref="Equals(object)"/>.</returns>
    public override int GetHashCode()
    {
        const int prime = 31;

        // Wrap-around is intended; keep it safe in checked builds.
        unchecked
        {
            int result = 1;

            // CharArray has a public setter and may be null.
            char[] chars = CharArray;
            if (chars != null)
            {
                for (int i = 0; i < chars.Length; i++)
                {
                    result = prime * result + chars[i];
                }
            }

            result = prime * result + From;
            result = prime * result + To;

            // Fold the 64-bit pattern of the weight into 32 bits (high ^ low).
            // The shift must be done on a 64-bit unsigned value: shifting a uint
            // by 32 is a no-op in C# (the count is masked to 5 bits), which made
            // the previous expression always evaluate to 0.
            long bits = Number.DoubleToInt64Bits(Weight);
            result = prime * result + (int)(bits ^ (long)((ulong)bits >> 32));
            return result;
        }
    }

    /// <summary>
    /// Two pairs are equal when they have the same runtime type, the same text,
    /// the same indexes and bit-identical weights.
    /// </summary>
    /// <param name="obj">object to compare with</param>
    /// <returns><c>true</c> if <paramref name="obj"/> is an equal pair.</returns>
    public override bool Equals(object obj)
    {
        if (ReferenceEquals(this, obj))
            return true;
        if (obj == null)
            return false;
        if (GetType() != obj.GetType())
            return false;
        SegTokenPair other = (SegTokenPair)obj;
        if (!Arrays.Equals(CharArray, other.CharArray))
            return false;
        if (From != other.From)
            return false;
        if (To != other.To)
            return false;
        if (Number.DoubleToInt64Bits(Weight) != Number.DoubleToInt64Bits(other.Weight))
            return false;
        return true;
    }
}
b/src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs new file mode 100644 index 0000000..0f5d3db --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/Hhmm/WordDictionary.cs @@ -0,0 +1,779 @@ +// lucene version compatibility level: 4.8.1 +using Lucene.Net.Support; +using Lucene.Net.Support.IO; +using System; +using System.IO; +using System.Reflection; +using System.Text; + +namespace Lucene.Net.Analysis.Cn.Smart.Hhmm +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// SmartChineseAnalyzer Word Dictionary + /// <para/> + /// @lucene.experimental + /// </summary> + internal class WordDictionary : AbstractDictionary + { + private WordDictionary() + { + } + + private static WordDictionary singleInstance; + + /// <summary> + /// Large prime number for hash function + /// </summary> + public static readonly int PRIME_INDEX_LENGTH = 12071; + + /// <summary> + /// wordIndexTable guarantees to hash all Chinese characters in Unicode into + /// PRIME_INDEX_LENGTH array. There will be conflict, but in reality this + /// program only handles the 6768 characters found in GB2312 plus some + /// ASCII characters. 
/// <summary>
/// Get the singleton dictionary instance, loading the dictionary data the
/// first time it is requested.
/// </summary>
/// <returns>singleton</returns>
public static WordDictionary GetInstance()
{
    lock (syncLock)
    {
        if (singleInstance == null)
        {
            // Load into a local and publish only after loading succeeded.
            // If the field were assigned first, a failed load would leave an
            // empty dictionary cached and returned by every later call.
            WordDictionary loading = new WordDictionary();

            // LUCENENET specific
            // LUCENE-1817: https://issues.apache.org/jira/browse/LUCENE-1817
            // This issue still existed as of 4.8.0. Here is the fix - we only
            // load from a directory if the actual directory exists (AnalyzerProfile
            // ensures it is an empty string if it is not available).
            string dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
            if (string.IsNullOrEmpty(dictRoot))
            {
                loading.Load();
            }
            else
            {
                loading.Load(dictRoot);
            }

            singleInstance = loading;
        }
        return singleInstance;
    }
}
/// <summary>
/// Opens the given serialized dictionary file for reading and loads the
/// tables from it. Any failure is rethrown wrapped in an <see cref="Exception"/>.
/// </summary>
/// <param name="serialObj">serialized dictionary file</param>
/// <returns><c>true</c> once the file has been read.</returns>
private bool LoadFromObj(FileInfo serialObj)
{
    try
    {
        Stream memFile = new FileStream(serialObj.FullName, FileMode.Open, FileAccess.Read);
        using (memFile)
        {
            LoadFromObjectInputStream(memFile);
        }
    }
    catch (Exception cause)
    {
        throw new Exception(cause.ToString(), cause);
    }
    return true;
}
/// <summary>
/// Reads the four dictionary tables from a binary stream in the layout written by
/// SaveToObj: every array is preceded by its Int32 length, and a length of -1
/// marks a null array.
/// </summary>
/// <param name="serialObjectInputStream">stream positioned at the start of the serialized tables</param>
private void LoadFromObjectInputStream(Stream serialObjectInputStream)
{
    using (BinaryReader input = new BinaryReader(serialObjectInputStream))
    {
        // wordIndexTable: length, then one Int16 per slot
        wordIndexTable = new short[input.ReadInt32()];
        for (int slot = 0; slot < wordIndexTable.Length; slot++)
        {
            wordIndexTable[slot] = input.ReadInt16();
        }

        // charIndexTable: length, then one char per slot
        charIndexTable = new char[input.ReadInt32()];
        for (int slot = 0; slot < charIndexTable.Length; slot++)
        {
            charIndexTable[slot] = input.ReadChar();
        }

        // wordItem_charArrayTable: three nested length-prefixed levels
        int wordRows = input.ReadInt32();
        if (wordRows > -1)
        {
            wordItem_charArrayTable = new char[wordRows][][];
            for (int row = 0; row < wordRows; row++)
            {
                int wordCount = input.ReadInt32();
                if (wordCount < 0)
                {
                    continue; // null row
                }

                char[][] words = new char[wordCount][];
                wordItem_charArrayTable[row] = words;
                for (int w = 0; w < wordCount; w++)
                {
                    int charCount = input.ReadInt32();
                    if (charCount < 0)
                    {
                        continue; // null word
                    }

                    char[] word = new char[charCount];
                    words[w] = word;
                    for (int c = 0; c < charCount; c++)
                    {
                        word[c] = input.ReadChar();
                    }
                }
            }
        }

        // wordItem_frequencyTable: two nested length-prefixed levels
        int freqRows = input.ReadInt32();
        if (freqRows > -1)
        {
            wordItem_frequencyTable = new int[freqRows][];
            for (int row = 0; row < freqRows; row++)
            {
                int freqCount = input.ReadInt32();
                if (freqCount < 0)
                {
                    continue; // null row
                }

                int[] freqs = new int[freqCount];
                wordItem_frequencyTable[row] = freqs;
                for (int f = 0; f < freqCount; f++)
                {
                    freqs[f] = input.ReadInt32();
                }
            }
        }
    }

    // log.info("load core dict from serialization.");
}
/// <summary>
/// Load the datafile into this <see cref="WordDictionary"/>
/// </summary>
/// <remarks>
/// For every GB2312 id the file holds a little-endian Int32 word count followed by
/// that many records of (frequency, byte length, handle) Int32 values and the
/// GB2312-encoded word bytes. The handle value is read but not used.
/// </remarks>
/// <param name="dctFilePath">path to word dictionary (coredict.dct)</param>
/// <returns>number of words read</returns>
/// <exception cref="IOException">If there is a low-level I/O error, including a truncated file.</exception>
private int LoadMainDataFromFile(string dctFilePath)
{
    int total = 0;
    byte[] intBuffer = new byte[4];
    Encoding gb2312 = null; // resolved on first use, then reused for every word

    // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
    // The 3756th is used (as a header) to store information.
    using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read))
    {
        // GB2312 characters 0 - 6768
        for (int i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
        {
            // the dictionary was developed for C, so integers are stored little-endian
            int cnt = ReadLittleEndianInt32(dctFile, intBuffer);
            if (cnt <= 0)
            {
                wordItem_charArrayTable[i] = null;
                wordItem_frequencyTable[i] = null;
                continue;
            }

            wordItem_charArrayTable[i] = new char[cnt][];
            wordItem_frequencyTable[i] = new int[cnt];
            total += cnt;

            for (int j = 0; j < cnt; j++)
            {
                int frequency = ReadLittleEndianInt32(dctFile, intBuffer);
                int length = ReadLittleEndianInt32(dctFile, intBuffer);
                ReadLittleEndianInt32(dctFile, intBuffer); // handle: unused, but must be consumed

                wordItem_frequencyTable[i][j] = frequency;

                if (length > 0)
                {
                    byte[] lchBuffer = new byte[length];
                    ReadFullyOrThrow(dctFile, lchBuffer);
                    if (gb2312 == null)
                    {
                        gb2312 = Encoding.GetEncoding("GB2312");
                    }
                    wordItem_charArrayTable[i][j] = gb2312.GetString(lchBuffer).ToCharArray();
                }
                else
                {
                    wordItem_charArrayTable[i][j] = null;
                }
            }

            string str = GetCCByGB2312Id(i);
            SetTableIndex(str[0], i);
        }
    }
    return total;
}

/// <summary>
/// Fills <paramref name="buffer"/> completely from <paramref name="input"/>.
/// <see cref="Stream.Read(byte[], int, int)"/> may return fewer bytes than requested,
/// so it is called until the buffer is full.
/// </summary>
/// <exception cref="EndOfStreamException">If the stream ends before the buffer is full.</exception>
private static void ReadFullyOrThrow(Stream input, byte[] buffer)
{
    int filled = 0;
    while (filled < buffer.Length)
    {
        int read = input.Read(buffer, filled, buffer.Length - filled);
        if (read <= 0)
        {
            throw new EndOfStreamException("Unexpected end of dictionary file.");
        }
        filled += read;
    }
}

/// <summary>
/// Reads four bytes into <paramref name="scratch"/> and decodes them as a
/// little-endian 32-bit integer, independent of the host byte order.
/// </summary>
private static int ReadLittleEndianInt32(Stream input, byte[] scratch)
{
    ReadFullyOrThrow(input, scratch);
    return scratch[0] | (scratch[1] << 8) | (scratch[2] << 16) | (scratch[3] << 24);
}
/// <summary>
/// Since no POS-tagging is done, entries for the same word are collapsed: within each
/// row, runs of adjacent entries that compare equal are replaced by a single entry
/// whose frequency is the sum of the run.
/// </summary>
private void MergeSameWords()
{
    for (int row = 0; row < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; row++)
    {
        char[][] words = wordItem_charArrayTable[row];
        if (words == null)
        {
            continue;
        }

        // Count how many distinct runs the row contains.
        int distinct = 1;
        for (int j = 1; j < words.Length; j++)
        {
            if (Utility.CompareArray(words[j], 0, words[j - 1], 0) != 0)
            {
                distinct++;
            }
        }

        if (distinct >= words.Length)
        {
            continue; // nothing to merge in this row
        }

        int[] freqs = wordItem_frequencyTable[row];
        char[][] mergedWords = new char[distinct][];
        int[] mergedFreqs = new int[distinct];
        mergedWords[0] = words[0];
        mergedFreqs[0] = freqs[0];

        int last = 0; // index of the entry currently being accumulated
        for (int j = 1; j < words.Length; j++)
        {
            if (Utility.CompareArray(words[j], 0, mergedWords[last], 0) == 0)
            {
                mergedFreqs[last] += freqs[j];
            }
            else
            {
                last++;
                mergedWords[last] = words[j];
                mergedFreqs[last] = freqs[j];
            }
        }

        wordItem_charArrayTable[row] = mergedWords;
        wordItem_frequencyTable[row] = mergedFreqs;
    }
}
wordItem_charArrayTable[i][j2]; + wordItem_frequencyTable[i][j] = wordItem_frequencyTable[i][j2]; + wordItem_charArrayTable[i][j2] = tmpArray; + wordItem_frequencyTable[i][j2] = tmpFreq; + } + } + } + } + } + } + + /// <summary> + /// Calculate character <paramref name="c"/>'s position in hash table, + /// then initialize the value of that position in the address table. + /// </summary> + private bool SetTableIndex(char c, int j) + { + int index = GetAvaliableTableIndex(c); + if (index != -1) + { + charIndexTable[index] = c; + wordIndexTable[index] = (short)j; + return true; + } + else + return false; + } + + private short GetAvaliableTableIndex(char c) + { + int hash1 = (int)(Hash1(c) % PRIME_INDEX_LENGTH); + int hash2 = Hash2(c) % PRIME_INDEX_LENGTH; + if (hash1 < 0) + hash1 = PRIME_INDEX_LENGTH + hash1; + if (hash2 < 0) + hash2 = PRIME_INDEX_LENGTH + hash2; + int index = hash1; + int i = 1; + while (charIndexTable[index] != 0 && charIndexTable[index] != c + && i < PRIME_INDEX_LENGTH) + { + index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH; + i++; + } + // System.out.println(i - 1); + + if (i < PRIME_INDEX_LENGTH + && (charIndexTable[index] == 0 || charIndexTable[index] == c)) + { + return (short)index; + } + else + { + return -1; + } + } + + private short GetWordItemTableIndex(char c) + { + int hash1 = (int)(Hash1(c) % PRIME_INDEX_LENGTH); + int hash2 = Hash2(c) % PRIME_INDEX_LENGTH; + if (hash1 < 0) + hash1 = PRIME_INDEX_LENGTH + hash1; + if (hash2 < 0) + hash2 = PRIME_INDEX_LENGTH + hash2; + int index = hash1; + int i = 1; + while (charIndexTable[index] != 0 && charIndexTable[index] != c + && i < PRIME_INDEX_LENGTH) + { + index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH; + i++; + } + + if (i < PRIME_INDEX_LENGTH && charIndexTable[index] == c) + { + return (short)index; + } + else + return -1; + } + + /// <summary> + /// Look up the text string corresponding with the word char array, + /// and return the position of the word list. 
+ /// </summary> + /// <param name="knownHashIndex"> + /// already figure out position of the first word + /// symbol charArray[0] in hash table. If not calculated yet, can be + /// replaced with function int findInTable(char[] charArray). + /// </param> + /// <param name="charArray">look up the char array corresponding with the word.</param> + /// <returns>word location in word array. If not found, then return -1.</returns> + private int FindInTable(short knownHashIndex, char[] charArray) + { + if (charArray == null || charArray.Length == 0) + return -1; + + char[][] items = wordItem_charArrayTable[wordIndexTable[knownHashIndex]]; + int start = 0, end = items.Length - 1; + int mid = (start + end) / 2, cmpResult; + + // Binary search for the index of idArray + while (start <= end) + { + cmpResult = Utility.CompareArray(items[mid], 0, charArray, 1); + + if (cmpResult == 0) + return mid;// find it + else if (cmpResult < 0) + start = mid + 1; + else if (cmpResult > 0) + end = mid - 1; + + mid = (start + end) / 2; + } + return -1; + } + + /// <summary> + /// Find the first word in the dictionary that starts with the supplied prefix + /// </summary> + /// <param name="charArray">input prefix</param> + /// <returns>index of word, or -1 if not found</returns> + /// <seealso cref="GetPrefixMatch(char[], int)"/> + public virtual int GetPrefixMatch(char[] charArray) + { + return GetPrefixMatch(charArray, 0); + } + + /// <summary> + /// Find the nth word in the dictionary that starts with the supplied prefix + /// </summary> + /// <param name="charArray">input prefix</param> + /// <param name="knownStart">relative position in the dictionary to start</param> + /// <returns>index of word, or -1 if not found</returns> + /// <seealso cref="GetPrefixMatch(char[])"/> + public virtual int GetPrefixMatch(char[] charArray, int knownStart) + { + short index = GetWordItemTableIndex(charArray[0]); + if (index == -1) + return -1; + char[][] items = 
wordItem_charArrayTable[wordIndexTable[index]]; + int start = knownStart, end = items.Length - 1; + + int mid = (start + end) / 2, cmpResult; + + // Binary search for the index of idArray + while (start <= end) + { + cmpResult = Utility.CompareArrayByPrefix(charArray, 1, items[mid], 0); + if (cmpResult == 0) + { + // Get the first item which match the current word + while (mid >= 0 + && Utility.CompareArrayByPrefix(charArray, 1, items[mid], 0) == 0) + mid--; + mid++; + return mid;// Find the first word that uses charArray as prefix. + } + else if (cmpResult < 0) + end = mid - 1; + else + start = mid + 1; + mid = (start + end) / 2; + } + return -1; + } + + /// <summary> + /// Get the frequency of a word from the dictionary + /// </summary> + /// <param name="charArray">input word</param> + /// <returns>word frequency, or zero if the word is not found</returns> + public virtual int GetFrequency(char[] charArray) + { + short hashIndex = GetWordItemTableIndex(charArray[0]); + if (hashIndex == -1) + { + return 0; + } + int itemIndex = FindInTable(hashIndex, charArray); + if (itemIndex != -1) + { + return wordItem_frequencyTable[wordIndexTable[hashIndex]][itemIndex]; + } + return 0; + } + + /// <summary> + /// Return <c>true</c> if the dictionary entry at itemIndex for table charArray[0] is charArray + /// </summary> + /// <param name="charArray">input word</param> + /// <param name="itemIndex">item index for table charArray[0]</param> + /// <returns><c>true</c> if the entry exists</returns> + public virtual bool IsEqual(char[] charArray, int itemIndex) + { + short hashIndex = GetWordItemTableIndex(charArray[0]); + return Utility.CompareArray(charArray, 1, + wordItem_charArrayTable[wordIndexTable[hashIndex]][itemIndex], 0) == 0; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.csproj ---------------------------------------------------------------------- diff --git 
a/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.csproj b/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.csproj index 37604c0..c735c33 100644 --- a/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.csproj +++ b/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.csproj @@ -46,7 +46,9 @@ <ItemGroup> <Compile Include="..\CommonAssemblyInfo.cs" Link="Properties\CommonAssemblyInfo.cs" /> - <EmbeddedResource Include="stopwords.txt;HHMM\bigramdict.mem;HHMM\coredict.mem" /> + <EmbeddedResource Include="stopwords.txt" /> + <EmbeddedResource Include="Hhmm\bigramdict.mem" /> + <EmbeddedResource Include="Hhmm\coredict.mem" /> </ItemGroup> <ItemGroup> http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/WordSegmenter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/WordSegmenter.cs b/src/Lucene.Net.Analysis.SmartCn/WordSegmenter.cs index 9917509..6ad831d 100644 --- a/src/Lucene.Net.Analysis.SmartCn/WordSegmenter.cs +++ b/src/Lucene.Net.Analysis.SmartCn/WordSegmenter.cs @@ -1,5 +1,5 @@ // lucene version compatibility level: 4.8.1 -using Lucene.Net.Analysis.Cn.Smart.HHMM; +using Lucene.Net.Analysis.Cn.Smart.Hhmm; using Lucene.Net.Support; using System.Collections.Generic; http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/WordTokenFilter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/WordTokenFilter.cs b/src/Lucene.Net.Analysis.SmartCn/WordTokenFilter.cs index c8a706c..5af9a4e 100644 --- a/src/Lucene.Net.Analysis.SmartCn/WordTokenFilter.cs +++ b/src/Lucene.Net.Analysis.SmartCn/WordTokenFilter.cs @@ -1,5 +1,5 @@ // lucene version compatibility level: 4.8.1 -using Lucene.Net.Analysis.Cn.Smart.HHMM; +using Lucene.Net.Analysis.Cn.Smart.Hhmm; using Lucene.Net.Analysis.TokenAttributes; using System; using 
System.Collections.Generic; http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Tests.Analysis.SmartCn/TestHMMChineseTokenizerFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Tests.Analysis.SmartCn/TestHMMChineseTokenizerFactory.cs b/src/Lucene.Net.Tests.Analysis.SmartCn/TestHMMChineseTokenizerFactory.cs index 264ab38..60f24c9 100644 --- a/src/Lucene.Net.Tests.Analysis.SmartCn/TestHMMChineseTokenizerFactory.cs +++ b/src/Lucene.Net.Tests.Analysis.SmartCn/TestHMMChineseTokenizerFactory.cs @@ -31,7 +31,7 @@ namespace Lucene.Net.Analysis.Cn.Smart [Test] public void TestHHMMSegmenter() { - var x = new HHMM.HHMMSegmenter(); + var x = new Hhmm.HHMMSegmenter(); } /// <summary>
