http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/HHMM/WordDictionary.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/WordDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/WordDictionary.cs deleted file mode 100644 index c0cd331..0000000 --- a/src/Lucene.Net.Analysis.SmartCn/HHMM/WordDictionary.cs +++ /dev/null @@ -1,779 +0,0 @@ -// lucene version compatibility level: 4.8.1 -using Lucene.Net.Support; -using Lucene.Net.Support.IO; -using System; -using System.IO; -using System.Reflection; -using System.Text; - -namespace Lucene.Net.Analysis.Cn.Smart.HHMM -{ - /* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - - /// <summary> - /// SmartChineseAnalyzer Word Dictionary - /// <para/> - /// @lucene.experimental - /// </summary> - internal class WordDictionary : AbstractDictionary - { - private WordDictionary() - { - } - - private static WordDictionary singleInstance; - - /// <summary> - /// Large prime number for hash function - /// </summary> - public static readonly int PRIME_INDEX_LENGTH = 12071; - - /// <summary> - /// wordIndexTable guarantees to hash all Chinese characters in Unicode into - /// PRIME_INDEX_LENGTH array. There will be conflict, but in reality this - /// program only handles the 6768 characters found in GB2312 plus some - /// ASCII characters. Therefore in order to guarantee better precision, it is - /// necessary to retain the original symbol in the charIndexTable. - /// </summary> - private short[] wordIndexTable; - - private char[] charIndexTable; - - /// <summary> - /// To avoid taking too much space, the data structure needed to store the - /// lexicon requires two multidimensional arrays to store word and frequency. - /// Each word is placed in a char[]. Each char represents a Chinese char or - /// other symbol. Each frequency is put into an int. These two arrays - /// correspond to each other one-to-one. Therefore, one can use - /// wordItem_charArrayTable[i][j] to look up word from lexicon, and - /// wordItem_frequencyTable[i][j] to look up the corresponding frequency. - /// </summary> - private char[][][] wordItem_charArrayTable; - - private int[][] wordItem_frequencyTable; - - // static Logger log = Logger.getLogger(WordDictionary.class); - - private static object syncLock = new object(); - - /// <summary> - /// Get the singleton dictionary instance. 
- /// </summary> - /// <returns>singleton</returns> - public static WordDictionary GetInstance() - { - lock (syncLock) - { - if (singleInstance == null) - { - singleInstance = new WordDictionary(); - - // LUCENENET specific - // LUCENE-1817: https://issues.apache.org/jira/browse/LUCENE-1817 - // This issue still existed as of 4.8.0. Here is the fix - we only - // load from a directory if the actual directory exists (AnalyzerProfile - // ensures it is an empty string if it is not available). - string dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR; - if (string.IsNullOrEmpty(dictRoot)) - { - singleInstance.Load(); - } - else - { - singleInstance.Load(dictRoot); - } - - - //try - //{ - // singleInstance.Load(); - //} - //catch (IOException e) - //{ - // string wordDictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR; - // singleInstance.Load(wordDictRoot); - //} - //catch (TypeLoadException e) - //{ - // throw new Exception(e.ToString(), e); - //} - } - return singleInstance; - } - } - - /// <summary> - /// Attempt to load dictionary from provided directory, first trying coredict.mem, failing back on coredict.dct - /// </summary> - /// <param name="dctFileRoot">path to dictionary directory</param> - public virtual void Load(string dctFileRoot) - { - string dctFilePath = System.IO.Path.Combine(dctFileRoot, "coredict.dct"); - FileInfo serialObj = new FileInfo(System.IO.Path.Combine(dctFileRoot, "coredict.mem")); - - if (serialObj.Exists && LoadFromObj(serialObj)) - { - - } - else - { - try - { - wordIndexTable = new short[PRIME_INDEX_LENGTH]; - charIndexTable = new char[PRIME_INDEX_LENGTH]; - for (int i = 0; i < PRIME_INDEX_LENGTH; i++) - { - charIndexTable[i] = (char)0; - wordIndexTable[i] = -1; - } - wordItem_charArrayTable = new char[GB2312_CHAR_NUM][][]; - wordItem_frequencyTable = new int[GB2312_CHAR_NUM][]; - // int total = - LoadMainDataFromFile(dctFilePath); - ExpandDelimiterData(); - MergeSameWords(); - SortEachItems(); - // log.info("load dictionary: " + dctFilePath + " 
total:" + total); - } - catch (IOException e) - { - throw new Exception(e.ToString(), e); - } - - SaveToObj(serialObj); - } - - } - - /// <summary> - /// Load coredict.mem internally from the jar file. - /// </summary> - /// <exception cref="IOException">If there is a low-level I/O error.</exception> - public virtual void Load() - { - using (Stream input = this.GetType().GetTypeInfo().Assembly.FindAndGetManifestResourceStream(this.GetType(), "coredict.mem")) - { - LoadFromObjectInputStream(input); - } - } - - private bool LoadFromObj(FileInfo serialObj) - { - try - { - using (Stream input = new FileStream(serialObj.FullName, FileMode.Open, FileAccess.Read)) - LoadFromObjectInputStream(input); - return true; - } - catch (Exception e) - { - throw new Exception(e.ToString(), e); - } - } - - // LUCENENET conversion note: - // The data in Lucene is stored in a proprietary binary format (similar to - // .NET's BinarySerializer) that cannot be read back in .NET. Therefore, the - // data was extracted using Java's DataOutputStream using the following Java code. - // It can then be read in using the LoadFromInputStream method below - // (using a DataInputStream instead of a BinaryReader), and saved - // in the correct (BinaryWriter) format by calling the SaveToObj method. - // Alternatively, the data can be loaded from disk using the files - // here(https://issues.apache.org/jira/browse/LUCENE-1629) in the analysis.data.zip file, - // which will automatically produce the .mem files. 
- - //public void saveToOutputStream(java.io.DataOutputStream stream) throws IOException - //{ - // // save bigramHashTable - // int bhLen = bigramHashTable.length; - // stream.writeInt(bhLen); - // for (int i = 0; i<bhLen; i++) - // { - // stream.writeLong(bigramHashTable[i]); - // } - - // // save frequencyTable - // int fLen = frequencyTable.length; - // stream.writeInt(fLen); - // for (int i = 0; i<fLen; i++) - // { - // stream.writeInt(frequencyTable[i]); - // } - //} - - private void LoadFromObjectInputStream(Stream serialObjectInputStream) - { - //ObjectInputStream input = new ObjectInputStream(serialObjectInputStream); - //wordIndexTable = (short[])input.ReadObject(); - //charIndexTable = (char[])input.ReadObject(); - //wordItem_charArrayTable = (char[][][])input.ReadObject(); - //wordItem_frequencyTable = (int[][])input.ReadObject(); - //// log.info("load core dict from serialization."); - //input.close(); - - using (var reader = new BinaryReader(serialObjectInputStream)) - //using (var reader = new DataInputStream(serialObjectInputStream)) - { - - // Read wordIndexTable - int wiLen = reader.ReadInt32(); - wordIndexTable = new short[wiLen]; - for (int i = 0; i < wiLen; i++) - { - wordIndexTable[i] = reader.ReadInt16(); - } - - // Read charIndexTable - int ciLen = reader.ReadInt32(); - charIndexTable = new char[ciLen]; - for (int i = 0; i < ciLen; i++) - { - charIndexTable[i] = reader.ReadChar(); - } - - // Read wordItem_charArrayTable - int caDim1 = reader.ReadInt32(); - if (caDim1 > -1) - { - wordItem_charArrayTable = new char[caDim1][][]; - for (int i = 0; i < caDim1; i++) - { - int caDim2 = reader.ReadInt32(); - if (caDim2 > -1) - { - wordItem_charArrayTable[i] = new char[caDim2][]; - for (int j = 0; j < caDim2; j++) - { - int caDim3 = reader.ReadInt32(); - if (caDim3 > -1) - { - wordItem_charArrayTable[i][j] = new char[caDim3]; - for (int k = 0; k < caDim3; k++) - { - wordItem_charArrayTable[i][j][k] = reader.ReadChar(); - } - } - } - } - } - } - - // 
Read wordItem_frequencyTable - int fDim1 = reader.ReadInt32(); - if (fDim1 > -1) - { - wordItem_frequencyTable = new int[fDim1][]; - for (int i = 0; i < fDim1; i++) - { - int fDim2 = reader.ReadInt32(); - if (fDim2 > -1) - { - wordItem_frequencyTable[i] = new int[fDim2]; - for (int j = 0; j < fDim2; j++) - { - wordItem_frequencyTable[i][j] = reader.ReadInt32(); - } - } - } - } - } - - // log.info("load core dict from serialization."); - } - - private void SaveToObj(FileInfo serialObj) - { - try - { - //ObjectOutputStream output = new ObjectOutputStream(new FileStream( - // serialObj.FullName, FileMode.Create, FileAccess.Write)); - //output.writeObject(wordIndexTable); - //output.writeObject(charIndexTable); - //output.writeObject(wordItem_charArrayTable); - //output.writeObject(wordItem_frequencyTable); - //output.close(); - //// log.info("serialize core dict."); - - using (Stream stream = new FileStream(serialObj.FullName, FileMode.Create, FileAccess.Write)) - { - using (var writer = new BinaryWriter(stream)) - { - // Write wordIndexTable - int wiLen = wordIndexTable.Length; - writer.Write(wiLen); - for (int i = 0; i < wiLen; i++) - { - writer.Write(wordIndexTable[i]); - } - - // Write charIndexTable - int ciLen = charIndexTable.Length; - writer.Write(ciLen); - for (int i = 0; i < ciLen; i++) - { - writer.Write(charIndexTable[i]); - } - - // Write wordItem_charArrayTable - int caDim1 = wordItem_charArrayTable == null ? -1 : wordItem_charArrayTable.Length; - writer.Write(caDim1); - for (int i = 0; i < caDim1; i++) - { - int caDim2 = wordItem_charArrayTable[i] == null ? -1 : wordItem_charArrayTable[i].Length; - writer.Write(caDim2); - for (int j = 0; j < caDim2; j++) - { - int caDim3 = wordItem_charArrayTable[i][j] == null ? 
-1 : wordItem_charArrayTable[i][j].Length; - writer.Write(caDim3); - for (int k = 0; k < caDim3; k++) - { - writer.Write(wordItem_charArrayTable[i][j][k]); - } - } - } - - // Write wordItem_frequencyTable - int fDim1 = wordItem_frequencyTable == null ? -1 : wordItem_frequencyTable.Length; - writer.Write(fDim1); - for (int i = 0; i < fDim1; i++) - { - int fDim2 = wordItem_frequencyTable[i] == null ? -1 : wordItem_frequencyTable[i].Length; - writer.Write(fDim2); - for (int j = 0; j < fDim2; j++) - { - writer.Write(wordItem_frequencyTable[i][j]); - } - } - } - } - - // log.info("serialize core dict."); - } -#pragma warning disable 168 - catch (Exception e) -#pragma warning restore 168 - { - // log.warn(e.getMessage()); - } - } - - /// <summary> - /// Load the datafile into this <see cref="WordDictionary"/> - /// </summary> - /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> - /// <returns>number of words read</returns> - /// <exception cref="IOException">If there is a low-level I/O error.</exception> - private int LoadMainDataFromFile(string dctFilePath) - { - int i, cnt, length, total = 0; - // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. - // The 3756th is used (as a header) to store information. 
- int[] - buffer = new int[3]; - byte[] intBuffer = new byte[4]; - string tmpword; - //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r")) - using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) - { - - // GB2312 characters 0 - 6768 - for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) - { - // if (i == 5231) - // System.out.println(i); - - dctFile.Read(intBuffer, 0, intBuffer.Length); - // the dictionary was developed for C, and byte order must be converted to work with Java - cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN).GetInt32(); - if (cnt <= 0) - { - wordItem_charArrayTable[i] = null; - wordItem_frequencyTable[i] = null; - continue; - } - wordItem_charArrayTable[i] = new char[cnt][]; - wordItem_frequencyTable[i] = new int[cnt]; - total += cnt; - int j = 0; - while (j < cnt) - { - // wordItemTable[i][j] = new WordItem(); - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN) - .GetInt32();// frequency - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN) - .GetInt32();// length - dctFile.Read(intBuffer, 0, intBuffer.Length); - buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN) - .GetInt32();// handle - - // wordItemTable[i][j].frequency = buffer[0]; - wordItem_frequencyTable[i][j] = buffer[0]; - - length = buffer[1]; - if (length > 0) - { - byte[] lchBuffer = new byte[length]; - dctFile.Read(lchBuffer, 0, lchBuffer.Length); - //tmpword = new String(lchBuffer, "GB2312"); - tmpword = Encoding.GetEncoding("GB2312").GetString(lchBuffer); - //tmpword = Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer); - // indexTable[i].wordItems[j].word = tmpword; - // wordItemTable[i][j].charArray = tmpword.toCharArray(); - wordItem_charArrayTable[i][j] = tmpword.ToCharArray(); - } - else - { - // 
wordItemTable[i][j].charArray = null; - wordItem_charArrayTable[i][j] = null; - } - // System.out.println(indexTable[i].wordItems[j]); - j++; - } - - string str = GetCCByGB2312Id(i); - SetTableIndex(str[0], i); - } - } - return total; - } - - /// <summary> - /// The original lexicon puts all information with punctuation into a - /// chart (from 1 to 3755). Here it then gets expanded, separately being - /// placed into the chart that has the corresponding symbol. - /// </summary> - private void ExpandDelimiterData() - { - int i; - int cnt; - // Punctuation then treating index 3755 as 1, - // distribute the original punctuation corresponding dictionary into - int delimiterIndex = 3755 + GB2312_FIRST_CHAR; - i = 0; - while (i < wordItem_charArrayTable[delimiterIndex].Length) - { - char c = wordItem_charArrayTable[delimiterIndex][i][0]; - int j = GetGB2312Id(c);// the id value of the punctuation - if (wordItem_charArrayTable[j] == null) - { - - int k = i; - // Starting from i, count the number of the following worditem symbol from j - while (k < wordItem_charArrayTable[delimiterIndex].Length - && wordItem_charArrayTable[delimiterIndex][k][0] == c) - { - k++; - } - // c is the punctuation character, j is the id value of c - // k-1 represents the index of the last punctuation character - cnt = k - i; - if (cnt != 0) - { - wordItem_charArrayTable[j] = new char[cnt][]; - wordItem_frequencyTable[j] = new int[cnt]; - } - - // Assign value for each wordItem. - for (k = 0; k < cnt; k++, i++) - { - // wordItemTable[j][k] = new WordItem(); - wordItem_frequencyTable[j][k] = wordItem_frequencyTable[delimiterIndex][i]; - wordItem_charArrayTable[j][k] = new char[wordItem_charArrayTable[delimiterIndex][i].Length - 1]; - System.Array.Copy(wordItem_charArrayTable[delimiterIndex][i], 1, - wordItem_charArrayTable[j][k], 0, - wordItem_charArrayTable[j][k].Length); - } - SetTableIndex(c, j); - } - } - // Delete the original corresponding symbol array. 
- wordItem_charArrayTable[delimiterIndex] = null; - wordItem_frequencyTable[delimiterIndex] = null; - } - - /// <summary> - /// since we aren't doing POS-tagging, merge the frequencies for entries of the same word (with different POS) - /// </summary> - private void MergeSameWords() - { - int i; - for (i = 0; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) - { - if (wordItem_charArrayTable[i] == null) - continue; - int len = 1; - for (int j = 1; j < wordItem_charArrayTable[i].Length; j++) - { - if (Utility.CompareArray(wordItem_charArrayTable[i][j], 0, - wordItem_charArrayTable[i][j - 1], 0) != 0) - len++; - - } - if (len < wordItem_charArrayTable[i].Length) - { - char[][] tempArray = new char[len][]; - int[] tempFreq = new int[len]; - int k = 0; - tempArray[0] = wordItem_charArrayTable[i][0]; - tempFreq[0] = wordItem_frequencyTable[i][0]; - for (int j = 1; j < wordItem_charArrayTable[i].Length; j++) - { - if (Utility.CompareArray(wordItem_charArrayTable[i][j], 0, - tempArray[k], 0) != 0) - { - k++; - // temp[k] = wordItemTable[i][j]; - tempArray[k] = wordItem_charArrayTable[i][j]; - tempFreq[k] = wordItem_frequencyTable[i][j]; - } - else - { - // temp[k].frequency += wordItemTable[i][j].frequency; - tempFreq[k] += wordItem_frequencyTable[i][j]; - } - } - // wordItemTable[i] = temp; - wordItem_charArrayTable[i] = tempArray; - wordItem_frequencyTable[i] = tempFreq; - } - } - } - - private void SortEachItems() - { - char[] tmpArray; - int tmpFreq; - for (int i = 0; i < wordItem_charArrayTable.Length; i++) - { - if (wordItem_charArrayTable[i] != null - && wordItem_charArrayTable[i].Length > 1) - { - for (int j = 0; j < wordItem_charArrayTable[i].Length - 1; j++) - { - for (int j2 = j + 1; j2 < wordItem_charArrayTable[i].Length; j2++) - { - if (Utility.CompareArray(wordItem_charArrayTable[i][j], 0, - wordItem_charArrayTable[i][j2], 0) > 0) - { - tmpArray = wordItem_charArrayTable[i][j]; - tmpFreq = wordItem_frequencyTable[i][j]; - wordItem_charArrayTable[i][j] = 
wordItem_charArrayTable[i][j2]; - wordItem_frequencyTable[i][j] = wordItem_frequencyTable[i][j2]; - wordItem_charArrayTable[i][j2] = tmpArray; - wordItem_frequencyTable[i][j2] = tmpFreq; - } - } - } - } - } - } - - /// <summary> - /// Calculate character <paramref name="c"/>'s position in hash table, - /// then initialize the value of that position in the address table. - /// </summary> - private bool SetTableIndex(char c, int j) - { - int index = GetAvaliableTableIndex(c); - if (index != -1) - { - charIndexTable[index] = c; - wordIndexTable[index] = (short)j; - return true; - } - else - return false; - } - - private short GetAvaliableTableIndex(char c) - { - int hash1 = (int)(Hash1(c) % PRIME_INDEX_LENGTH); - int hash2 = Hash2(c) % PRIME_INDEX_LENGTH; - if (hash1 < 0) - hash1 = PRIME_INDEX_LENGTH + hash1; - if (hash2 < 0) - hash2 = PRIME_INDEX_LENGTH + hash2; - int index = hash1; - int i = 1; - while (charIndexTable[index] != 0 && charIndexTable[index] != c - && i < PRIME_INDEX_LENGTH) - { - index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH; - i++; - } - // System.out.println(i - 1); - - if (i < PRIME_INDEX_LENGTH - && (charIndexTable[index] == 0 || charIndexTable[index] == c)) - { - return (short)index; - } - else - { - return -1; - } - } - - private short GetWordItemTableIndex(char c) - { - int hash1 = (int)(Hash1(c) % PRIME_INDEX_LENGTH); - int hash2 = Hash2(c) % PRIME_INDEX_LENGTH; - if (hash1 < 0) - hash1 = PRIME_INDEX_LENGTH + hash1; - if (hash2 < 0) - hash2 = PRIME_INDEX_LENGTH + hash2; - int index = hash1; - int i = 1; - while (charIndexTable[index] != 0 && charIndexTable[index] != c - && i < PRIME_INDEX_LENGTH) - { - index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH; - i++; - } - - if (i < PRIME_INDEX_LENGTH && charIndexTable[index] == c) - { - return (short)index; - } - else - return -1; - } - - /// <summary> - /// Look up the text string corresponding with the word char array, - /// and return the position of the word list. 
- /// </summary> - /// <param name="knownHashIndex"> - /// already figure out position of the first word - /// symbol charArray[0] in hash table. If not calculated yet, can be - /// replaced with function int findInTable(char[] charArray). - /// </param> - /// <param name="charArray">look up the char array corresponding with the word.</param> - /// <returns>word location in word array. If not found, then return -1.</returns> - private int FindInTable(short knownHashIndex, char[] charArray) - { - if (charArray == null || charArray.Length == 0) - return -1; - - char[][] items = wordItem_charArrayTable[wordIndexTable[knownHashIndex]]; - int start = 0, end = items.Length - 1; - int mid = (start + end) / 2, cmpResult; - - // Binary search for the index of idArray - while (start <= end) - { - cmpResult = Utility.CompareArray(items[mid], 0, charArray, 1); - - if (cmpResult == 0) - return mid;// find it - else if (cmpResult < 0) - start = mid + 1; - else if (cmpResult > 0) - end = mid - 1; - - mid = (start + end) / 2; - } - return -1; - } - - /// <summary> - /// Find the first word in the dictionary that starts with the supplied prefix - /// </summary> - /// <param name="charArray">input prefix</param> - /// <returns>index of word, or -1 if not found</returns> - /// <seealso cref="GetPrefixMatch(char[], int)"/> - public virtual int GetPrefixMatch(char[] charArray) - { - return GetPrefixMatch(charArray, 0); - } - - /// <summary> - /// Find the nth word in the dictionary that starts with the supplied prefix - /// </summary> - /// <param name="charArray">input prefix</param> - /// <param name="knownStart">relative position in the dictionary to start</param> - /// <returns>index of word, or -1 if not found</returns> - /// <seealso cref="GetPrefixMatch(char[])"/> - public virtual int GetPrefixMatch(char[] charArray, int knownStart) - { - short index = GetWordItemTableIndex(charArray[0]); - if (index == -1) - return -1; - char[][] items = 
wordItem_charArrayTable[wordIndexTable[index]]; - int start = knownStart, end = items.Length - 1; - - int mid = (start + end) / 2, cmpResult; - - // Binary search for the index of idArray - while (start <= end) - { - cmpResult = Utility.CompareArrayByPrefix(charArray, 1, items[mid], 0); - if (cmpResult == 0) - { - // Get the first item which match the current word - while (mid >= 0 - && Utility.CompareArrayByPrefix(charArray, 1, items[mid], 0) == 0) - mid--; - mid++; - return mid;// Find the first word that uses charArray as prefix. - } - else if (cmpResult < 0) - end = mid - 1; - else - start = mid + 1; - mid = (start + end) / 2; - } - return -1; - } - - /// <summary> - /// Get the frequency of a word from the dictionary - /// </summary> - /// <param name="charArray">input word</param> - /// <returns>word frequency, or zero if the word is not found</returns> - public virtual int GetFrequency(char[] charArray) - { - short hashIndex = GetWordItemTableIndex(charArray[0]); - if (hashIndex == -1) - { - return 0; - } - int itemIndex = FindInTable(hashIndex, charArray); - if (itemIndex != -1) - { - return wordItem_frequencyTable[wordIndexTable[hashIndex]][itemIndex]; - } - return 0; - } - - /// <summary> - /// Return <c>true</c> if the dictionary entry at itemIndex for table charArray[0] is charArray - /// </summary> - /// <param name="charArray">input word</param> - /// <param name="itemIndex">item index for table charArray[0]</param> - /// <returns><c>true</c> if the entry exists</returns> - public virtual bool IsEqual(char[] charArray, int itemIndex) - { - short hashIndex = GetWordItemTableIndex(charArray[0]); - return Utility.CompareArray(charArray, 1, - wordItem_charArrayTable[wordIndexTable[hashIndex]][itemIndex], 0) == 0; - } - } -}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/HMMChineseTokenizer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HMMChineseTokenizer.cs b/src/Lucene.Net.Analysis.SmartCn/HMMChineseTokenizer.cs index 27ca17c..61ca00b 100644 --- a/src/Lucene.Net.Analysis.SmartCn/HMMChineseTokenizer.cs +++ b/src/Lucene.Net.Analysis.SmartCn/HMMChineseTokenizer.cs @@ -1,5 +1,5 @@ // lucene version compatibility level: 4.8.1 -using Lucene.Net.Analysis.Cn.Smart.HHMM; +using Lucene.Net.Analysis.Cn.Smart.Hhmm; using Lucene.Net.Analysis.TokenAttributes; using Lucene.Net.Analysis.Util; using Lucene.Net.Support; http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs new file mode 100644 index 0000000..83b4614 --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/Hhmm/AbstractDictionary.cs @@ -0,0 +1,225 @@ +// lucene version compatibility level: 4.8.1 +using System; +using System.Text; + +namespace Lucene.Net.Analysis.Cn.Smart.Hhmm +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// <para> + /// <see cref="SmartChineseAnalyzer"/> abstract dictionary implementation. + /// </para> + /// <para> + /// Contains methods for dealing with GB2312 encoding. + /// </para> + /// @lucene.experimental + /// </summary> + internal abstract class AbstractDictionary + { + /// <summary> + /// First Chinese Character in GB2312 (15 * 94) + /// Characters in GB2312 are arranged in a grid of 94 * 94, 0-14 are unassigned or punctuation. + /// </summary> + public static readonly int GB2312_FIRST_CHAR = 1410; + + /// <summary> + /// Last Chinese Character in GB2312 (87 * 94). + /// Characters in GB2312 are arranged in a grid of 94 * 94, 88-94 are unassigned. + /// </summary> + public static readonly int GB2312_CHAR_NUM = 87 * 94; + + /// <summary> + /// Dictionary data contains 6768 Chinese characters with frequency statistics. 
+ /// </summary> + public static readonly int CHAR_NUM_IN_FILE = 6768; + + // ===================================================== + // code +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +A +B +C +D +E +F + // B0A0 å é¿ å æ¨ å å å ç ç è¼ ç® è¾ ç¢ ç± é + // B0B0 é æ°¨ å® ä¿º æ æ 岸 èº æ¡ è® æ ç å¹ æ ç¬ ç¿± + // B0C0 è¢ å² å¥¥ æ æ¾³ è æ æ å å§ ç¬ å « ç¤ å·´ æ è· + // B0D0 é¶ æ è å é¸ ç½¢ ç¸ ç½ æ ç¾ æ ä½° è´¥ æ ç¨ æ + // B0E0 ç æ¬ æ³ è¬ é¢ æ¿ ç æ® æ ä¼´ ç£ å å ç» é¦ å¸® + // B0F0 æ¢ æ¦ è ç» æ£ ç£ è é å è°¤ è è å è¤ å¥ + // ===================================================== + // + // GB2312 character setï¼ + // 01 94 Symbols + // 02 72 Numbers + // 03 94 Latin + // 04 83 Kana + // 05 86 Katakana + // 06 48 Greek + // 07 66 Cyrillic + // 08 63 Phonetic Symbols + // 09 76 Drawing Symbols + // 10-15 Unassigned + // 16-55 3755 Plane 1, in pinyin order + // 56-87 3008 Plane 2, in radical/stroke order + // 88-94 Unassigned + // ====================================================== + + /// <summary> + /// <para> + /// Transcode from GB2312 ID to Unicode + /// </para> + /// <para> + /// GB2312 is divided into a 94 * 94 grid, containing 7445 characters consisting of 6763 Chinese characters and 682 symbols. + /// Some regions are unassigned (reserved). 
+ /// </para> + /// </summary> + /// <param name="ccid">GB2312 id</param> + /// <returns>unicode String</returns> + public virtual string GetCCByGB2312Id(int ccid) + { + if (ccid < 0 || ccid > AbstractDictionary.GB2312_CHAR_NUM) + return ""; + int cc1 = ccid / 94 + 161; + int cc2 = ccid % 94 + 161; + byte[] buffer = new byte[2]; + buffer[0] = (byte)cc1; + buffer[1] = (byte)cc2; + try + { + //String cchar = new String(buffer, "GB2312"); + string cchar = Encoding.GetEncoding("GB2312").GetString(buffer); + return cchar; + } + catch (ArgumentException) // Encoding is not supported by the platform + { + return ""; + } + } + + /// <summary> + /// Transcode from Unicode to GB2312 + /// </summary> + /// <param name="ch">input character in Unicode, or character in Basic Latin range.</param> + /// <returns>position in GB2312</returns> + public virtual short GetGB2312Id(char ch) + { + try + { + //byte[] buffer = Character.ToString(ch).getBytes("GB2312"); + byte[] buffer = Encoding.GetEncoding("GB2312").GetBytes(ch.ToString()); + //byte[] buffer = Encoding.GetEncoding("hz-gb-2312").GetBytes(ch.ToString()); + if (buffer.Length != 2) + { + // Should be a two-byte character + return -1; + } + int b0 = (buffer[0] & 0x0FF) - 161; // Code starts from A1, therefore subtract 0xA1=161 + int b1 = (buffer[1] & 0x0FF) - 161; // There is no Chinese char for the first and last symbol. + // Therefore, each code page only has 16*6-2=94 characters. 
+ return (short)(b0 * 94 + b1); + } + catch (ArgumentException e) // Encoding is not supported by the platform + { + throw new Exception(e.ToString(), e); + } + } + + /// <summary> + /// 32-bit FNV Hash Function + /// </summary> + /// <param name="c">input character</param> + /// <returns>hashcode</returns> + public virtual long Hash1(char c) + { + long p = 1099511628211L; + long hash = unchecked((long)0xcbf29ce484222325L); + hash = (hash ^ (c & 0x00FF)) * p; + hash = (hash ^ (c >> 8)) * p; + hash += hash << 13; + hash ^= hash >> 7; + hash += hash << 3; + hash ^= hash >> 17; + hash += hash << 5; + return hash; + } + + /// <summary> + /// 32-bit FNV Hash Function + /// </summary> + /// <param name="carray">character array</param> + /// <returns>hashcode</returns> + public virtual long Hash1(char[] carray) + { + long p = 1099511628211L; + long hash = unchecked((long)0xcbf29ce484222325L); + for (int i = 0; i < carray.Length; i++) + { + char d = carray[i]; + hash = (hash ^ (d & 0x00FF)) * p; + hash = (hash ^ (d >> 8)) * p; + } + + // hash += hash << 13; + // hash ^= hash >> 7; + // hash += hash << 3; + // hash ^= hash >> 17; + // hash += hash << 5; + return hash; + } + + /// <summary> + /// djb2 hash algorithmï¼this algorithm (k=33) was first reported by dan + /// bernstein many years ago in comp.lang.c. another version of this algorithm + /// (now favored by bernstein) uses xor: hash(i) = hash(i - 1) * 33 ^ str[i]; + /// the magic of number 33 (why it works better than many other constants, + /// prime or not) has never been adequately explained. + /// </summary> + /// <param name="c">character</param> + /// <returns>hashcode</returns> + public virtual int Hash2(char c) + { + int hash = 5381; + + /* hash 33 + c */ + hash = ((hash << 5) + hash) + c & 0x00FF; + hash = ((hash << 5) + hash) + c >> 8; + + return hash; + } + + /// <summary> + /// djb2 hash algorithmï¼this algorithm (k=33) was first reported by dan + /// bernstein many years ago in comp.lang.c. 
another version of this algorithm + /// (now favored by bernstein) uses xor: hash(i) = hash(i - 1) * 33 ^ str[i]; + /// the magic of number 33 (why it works better than many other constants, + /// prime or not) has never been adequately explained. + /// </summary> + /// <param name="carray">character array</param> + /// <returns>hashcode</returns> + public virtual int Hash2(char[] carray) + { + int hash = 5381; + + /* hash 33 + c */ + for (int i = 0; i < carray.Length; i++) + { + char d = carray[i]; + hash = ((hash << 5) + hash) + d & 0x00FF; + hash = ((hash << 5) + hash) + d >> 8; + } + + return hash; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/056353d4/src/Lucene.Net.Analysis.SmartCn/Hhmm/BiSegGraph.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/BiSegGraph.cs b/src/Lucene.Net.Analysis.SmartCn/Hhmm/BiSegGraph.cs new file mode 100644 index 0000000..6c2923e --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/Hhmm/BiSegGraph.cs @@ -0,0 +1,257 @@ +// lucene version compatibility level: 4.8.1 +using Lucene.Net.Support; +using System; +using System.Collections.Generic; +using System.Text; + +namespace Lucene.Net.Analysis.Cn.Smart.Hhmm +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
/// <summary>
/// Graph representing possible token pairs (bigrams) at each start offset in the sentence.
/// <para>
/// For each start offset, a list of possible token pairs is stored.
/// </para>
/// @lucene.experimental
/// </summary>
internal class BiSegGraph
{
    // Maps the index of the *second* token of a pair to the list of all pairs
    // that end at that token.
    private IDictionary<int, IList<SegTokenPair>> tokenPairListTable = new Dictionary<int, IList<SegTokenPair>>();

    private IList<SegToken> segTokenList;

    private static BigramDictionary bigramDict = BigramDictionary.GetInstance();

    public BiSegGraph(SegGraph segGraph)
    {
        segTokenList = segGraph.MakeIndex();
        GenerateBiSegGraph(segGraph);
    }

    /// <summary>
    /// Generate a <see cref="BiSegGraph"/> based upon a <see cref="SegGraph"/>.
    /// </summary>
    private void GenerateBiSegGraph(SegGraph segGraph)
    {
        double smooth = 0.1;
        int wordPairFreq = 0;
        int maxStart = segGraph.MaxStart;
        double oneWordFreq, weight, tinyDouble = 1.0 / Utility.MAX_FREQUENCE;

        int next;
        char[] idBuffer;
        // Get the list of tokens ordered and indexed.
        // NOTE(review): MakeIndex() was already called in the constructor; the repeated
        // call is kept as-is — it is not verified here that skipping it is side-effect free.
        segTokenList = segGraph.MakeIndex();
        // Because the beginning position of startToken is -1, startToken is obtained at key = -1.
        int key = -1;
        IList<SegToken> nextTokens = null;
        while (key < maxStart)
        {
            if (segGraph.IsStartExist(key))
            {
                IList<SegToken> tokenList = segGraph.GetStartList(key);

                // Calculate all tokens for a given key.
                foreach (SegToken t1 in tokenList)
                {
                    oneWordFreq = t1.Weight;
                    next = t1.EndOffset;
                    nextTokens = null;
                    // Find the next corresponding token.
                    // For example: "Sunny seashore" — if the present token is "sunny",
                    // the next one should be "sea" or "seashore".
                    // If no token starts exactly at 'next', scan forward until one does.
                    while (next <= maxStart)
                    {
                        // Because the beginning position of endToken is sentenceLen,
                        // equality with sentenceLen finds endToken.
                        if (segGraph.IsStartExist(next))
                        {
                            nextTokens = segGraph.GetStartList(next);
                            break;
                        }
                        next++;
                    }
                    if (nextTokens == null)
                    {
                        break;
                    }
                    foreach (SegToken t2 in nextTokens)
                    {
                        // Build the bigram lookup key: <word1> WORD_SEGMENT_CHAR <word2>.
                        idBuffer = new char[t1.CharArray.Length + t2.CharArray.Length + 1];
                        System.Array.Copy(t1.CharArray, 0, idBuffer, 0, t1.CharArray.Length);
                        idBuffer[t1.CharArray.Length] = BigramDictionary.WORD_SEGMENT_CHAR;
                        System.Array.Copy(t2.CharArray, 0, idBuffer,
                            t1.CharArray.Length + 1, t2.CharArray.Length);

                        // Frequency of the two linked words appearing together.
                        wordPairFreq = bigramDict.GetFrequency(idBuffer);

                        // Smoothing:
                        // -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
                        weight = -Math
                            .Log(smooth
                                * (1.0 + oneWordFreq)
                                / (Utility.MAX_FREQUENCE + 0.0)
                                + (1.0 - smooth)
                                * ((1.0 - tinyDouble) * wordPairFreq / (1.0 + oneWordFreq) + tinyDouble));

                        SegTokenPair tokenPair = new SegTokenPair(idBuffer, t1.Index,
                            t2.Index, weight);
                        this.AddSegTokenPair(tokenPair);
                    }
                }
            }
            key++;
        }
    }

    /// <summary>
    /// Returns <c>true</c> if there is a list of token pairs at this offset (index of the second token).
    /// </summary>
    /// <param name="to">index of the second token in the token pair</param>
    /// <returns><c>true</c> if a token pair exists</returns>
    public virtual bool IsToExist(int to)
    {
        IList<SegTokenPair> result;
        return tokenPairListTable.TryGetValue(to, out result) && result != null;
    }

    /// <summary>
    /// Return a <see cref="T:IList{SegTokenPair}"/> of all token pairs at this offset
    /// (index of the second token), or <c>null</c> if none exist.
    /// </summary>
    /// <param name="to">index of the second token in the token pair</param>
    /// <returns><see cref="T:IList{SegTokenPair}"/> of token pairs, or <c>null</c></returns>
    public virtual IList<SegTokenPair> GetToList(int to)
    {
        IList<SegTokenPair> result;
        tokenPairListTable.TryGetValue(to, out result);
        return result;
    }

    /// <summary>
    /// Add a <see cref="SegTokenPair"/>, grouped by the index of its second token.
    /// </summary>
    /// <param name="tokenPair"><see cref="SegTokenPair"/></param>
    public virtual void AddSegTokenPair(SegTokenPair tokenPair)
    {
        int to = tokenPair.To;
        // Single TryGetValue instead of ContainsKey + indexer (avoids a double lookup).
        IList<SegTokenPair> tokenPairList;
        if (tokenPairListTable.TryGetValue(to, out tokenPairList) && tokenPairList != null)
        {
            tokenPairList.Add(tokenPair);
        }
        else
        {
            tokenPairListTable[to] = new List<SegTokenPair> { tokenPair };
        }
    }

    /// <summary>
    /// Gets the number of distinct second-token indexes that have token pairs
    /// (i.e. the number of entries in the table).
    /// </summary>
    public virtual int ToCount
    {
        get { return tokenPairListTable.Count; }
    }

    /// <summary>
    /// Find the shortest path with the Viterbi algorithm.
    /// </summary>
    /// <returns><see cref="T:IList{SegToken}"/> of tokens on the minimal-weight path</returns>
    /// <exception cref="InvalidOperationException">
    /// if the graph is malformed (a node in [1, ToCount] has no incoming edges)
    /// </exception>
    [ExceptionToNetNumericConvention]
    public virtual IList<SegToken> GetShortPath()
    {
        int current;
        int nodeCount = ToCount;
        IList<PathNode> path = new List<PathNode>();
        PathNode zeroPath = new PathNode();
        zeroPath.Weight = 0;
        zeroPath.PreNode = 0;
        path.Add(zeroPath);
        for (current = 1; current <= nodeCount; current++)
        {
            IList<SegTokenPair> edges = GetToList(current);
            if (edges == null)
            {
                // Defensive: every node index in [1, nodeCount] is expected to have
                // incoming pairs; previously a missing entry caused a NullReferenceException.
                throw new InvalidOperationException("No incoming token pairs for node " + current);
            }

            double minWeight = double.MaxValue;
            SegTokenPair minEdge = null;
            foreach (SegTokenPair edge in edges)
            {
                double weight = edge.Weight;
                PathNode preNode2 = path[edge.From];
                if (preNode2.Weight + weight < minWeight)
                {
                    minWeight = preNode2.Weight + weight;
                    minEdge = edge;
                }
            }
            if (minEdge == null)
            {
                // Previously dereferenced below without a check (NullReferenceException).
                throw new InvalidOperationException("No usable incoming edge for node " + current);
            }
            PathNode newNode = new PathNode();
            newNode.Weight = minWeight;
            newNode.PreNode = minEdge.From;
            path.Add(newNode);
        }

        // Walk the predecessor chain backwards from the last node, then reverse
        // it to produce the token sequence in sentence order.
        int preNode, lastNode;
        lastNode = path.Count - 1;
        current = lastNode;
        IList<int> rpath = new List<int>();
        IList<SegToken> resultPath = new List<SegToken>();

        rpath.Add(current);
        while (current != 0)
        {
            PathNode currentPathNode = path[current];
            preNode = currentPathNode.PreNode;
            rpath.Add(preNode);
            current = preNode;
        }
        for (int j = rpath.Count - 1; j >= 0; j--)
        {
            int id = rpath[j];
            SegToken t = segTokenList[id];
            resultPath.Add(t);
        }
        return resultPath;
    }

    public override string ToString()
    {
        StringBuilder sb = new StringBuilder();
        ICollection<IList<SegTokenPair>> values = tokenPairListTable.Values;
        foreach (IList<SegTokenPair> segList in values)
        {
            foreach (SegTokenPair pair in segList)
            {
                sb.Append(pair + "\n");
            }
        }
        return sb.ToString();
    }
}
/// <summary>
/// SmartChineseAnalyzer Bigram dictionary.
/// <para/>
/// @lucene.experimental
/// </summary>
internal class BigramDictionary : AbstractDictionary
{
    private BigramDictionary()
    {
    }

    /// <summary>Separator placed between the two words of a bigram lookup key.</summary>
    public static readonly char WORD_SEGMENT_CHAR = '@';

    private static BigramDictionary singleInstance;

    /// <summary>Prime size of the open-addressed hash table.</summary>
    public static readonly int PRIME_BIGRAM_LENGTH = 402137;

    /// <summary>
    /// The word associations are stored as FNV1 hashcodes, which have a small
    /// probability of collision, but save memory.
    /// </summary>
    private long[] bigramHashTable;

    private int[] frequencyTable;

    // Probe-sequence statistics maintained by GetBigramItemIndex (diagnostics only).
    private int max = 0;

    private int repeat = 0;

    private static object syncLock = new object();

    /// <summary>
    /// Get the singleton instance, loading the dictionary data on first use.
    /// </summary>
    public static BigramDictionary GetInstance()
    {
        lock (syncLock)
        {
            if (singleInstance == null)
            {
                singleInstance = new BigramDictionary();

                // LUCENENET specific
                // LUCENE-1817: https://issues.apache.org/jira/browse/LUCENE-1817
                // This issue still existed as of 4.8.0. Here is the fix - we only
                // load from a directory if the actual directory exists (AnalyzerProfile
                // ensures it is an empty string if it is not available).
                string dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
                if (string.IsNullOrEmpty(dictRoot))
                {
                    singleInstance.Load();
                }
                else
                {
                    singleInstance.Load(dictRoot);
                }
            }
            return singleInstance;
        }
    }

    /// <summary>
    /// Reads exactly <c>buffer.Length</c> bytes from <paramref name="stream"/>.
    /// <see cref="Stream.Read(byte[], int, int)"/> may legally return fewer bytes
    /// than requested, so a single unchecked call can silently corrupt the data.
    /// </summary>
    /// <exception cref="EndOfStreamException">
    /// if the stream ends before the buffer is filled (subclass of <see cref="IOException"/>,
    /// so callers catching <see cref="IOException"/> are unaffected)
    /// </exception>
    private static void ReadFully(Stream stream, byte[] buffer)
    {
        int offset = 0;
        while (offset < buffer.Length)
        {
            int read = stream.Read(buffer, offset, buffer.Length - offset);
            if (read <= 0)
            {
                throw new EndOfStreamException("Unexpected end of dictionary file.");
            }
            offset += read;
        }
    }

    private bool LoadFromObj(FileInfo serialObj)
    {
        try
        {
            using (Stream input = new FileStream(serialObj.FullName, FileMode.Open, FileAccess.Read))
                LoadFromInputStream(input);
            return true;
        }
        catch (Exception e)
        {
            // Kept for compatibility with the original port: any failure reading the
            // cached .mem file is surfaced as a generic exception.
            throw new Exception(e.ToString(), e);
        }
    }

    // LUCENENET conversion note:
    // The data in Lucene is stored in a proprietary Java serialization format that
    // cannot be read back in .NET. The data was extracted with Java's DataOutputStream
    // and is read here with a BinaryReader, then cached in BinaryWriter format by
    // SaveToObj. Alternatively, the data can be loaded from the analysis.data.zip
    // files attached to https://issues.apache.org/jira/browse/LUCENE-1629, which
    // will automatically produce the .mem files.

    /// <summary>
    /// Deserializes <see cref="bigramHashTable"/> and <see cref="frequencyTable"/>
    /// from a pre-built .mem stream (length-prefixed arrays).
    /// </summary>
    private void LoadFromInputStream(Stream serialObjectInputStream)
    {
        using (var reader = new BinaryReader(serialObjectInputStream))
        {
            // Read bigramHashTable
            int bhLen = reader.ReadInt32();
            bigramHashTable = new long[bhLen];
            for (int i = 0; i < bhLen; i++)
            {
                bigramHashTable[i] = reader.ReadInt64();
            }

            // Read frequencyTable
            int fLen = reader.ReadInt32();
            frequencyTable = new int[fLen];
            for (int i = 0; i < fLen; i++)
            {
                frequencyTable[i] = reader.ReadInt32();
            }
        }
    }

    /// <summary>
    /// Serializes the in-memory tables to a .mem cache file. Failures are
    /// deliberately swallowed: the cache is an optimization, not a requirement.
    /// </summary>
    private void SaveToObj(FileInfo serialObj)
    {
        try
        {
            using (Stream output = new FileStream(serialObj.FullName, FileMode.Create, FileAccess.Write))
            {
                using (BinaryWriter writer = new BinaryWriter(output))
                {
                    int bhLen = bigramHashTable.Length;
                    writer.Write(bhLen);
                    for (int i = 0; i < bhLen; i++)
                    {
                        writer.Write(bigramHashTable[i]);
                    }

                    int fLen = frequencyTable.Length;
                    writer.Write(fLen);
                    for (int i = 0; i < fLen; i++)
                    {
                        writer.Write(frequencyTable[i]);
                    }
                }
            }
            // log.info("serialize bigram dict.");
        }
#pragma warning disable 168
        catch (Exception e)
#pragma warning restore 168
        {
            // Best-effort cache write; original Java code only logged a warning here.
            // log.warn(e.getMessage());
        }
    }

    /// <summary>
    /// Loads the pre-built bigramdict.mem embedded resource.
    /// </summary>
    private void Load()
    {
        using (Stream input = this.GetType().GetTypeInfo().Assembly.FindAndGetManifestResourceStream(this.GetType(), "bigramdict.mem"))
        {
            LoadFromInputStream(input);
        }
    }

    /// <summary>
    /// Loads the dictionary from <paramref name="dictRoot"/>, preferring the cached
    /// bigramdict.mem file and falling back to parsing bigramdict.dct (then caching it).
    /// </summary>
    private void Load(string dictRoot)
    {
        string bigramDictPath = System.IO.Path.Combine(dictRoot, "bigramdict.dct");

        FileInfo serialObj = new FileInfo(System.IO.Path.Combine(dictRoot, "bigramdict.mem"));

        if (serialObj.Exists && LoadFromObj(serialObj))
        {
            // Loaded from the serialized cache; nothing more to do.
        }
        else
        {
            try
            {
                bigramHashTable = new long[PRIME_BIGRAM_LENGTH];
                frequencyTable = new int[PRIME_BIGRAM_LENGTH];
                for (int i = 0; i < PRIME_BIGRAM_LENGTH; i++)
                {
                    // it is possible for a value to hash to 0, but the probability is extremely low
                    bigramHashTable[i] = 0;
                    frequencyTable[i] = 0;
                }
                LoadFromFile(bigramDictPath);
            }
            catch (IOException e)
            {
                throw new Exception(e.ToString(), e);
            }
            SaveToObj(serialObj);
        }
    }

    /// <summary>
    /// Load the datafile into this <see cref="BigramDictionary"/>.
    /// </summary>
    /// <param name="dctFilePath">path to the bigram dictionary (bigramdict.dct)</param>
    /// <exception cref="IOException">If there is a low-level I/O error</exception>
    public virtual void LoadFromFile(string dctFilePath)
    {
        int i, cnt, length, total = 0;
        // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
        // The 3756th is used (as a header) to store information.
        int[] buffer = new int[3];
        byte[] intBuffer = new byte[4];
        string tmpword;
        using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read))
        {
            // GB2312 characters 0 - 6768
            for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++)
            {
                string currentStr = GetCCByGB2312Id(i);

                // ReadFully guards against short reads, which FileStream.Read permits.
                ReadFully(dctFile, intBuffer);
                // the dictionary was developed for C, and byte order must be converted to work with .NET
                cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN).GetInt32();
                if (cnt <= 0)
                {
                    continue;
                }
                total += cnt;
                int j = 0;
                while (j < cnt)
                {
                    ReadFully(dctFile, intBuffer);
                    buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN)
                        .GetInt32();// frequency
                    ReadFully(dctFile, intBuffer);
                    buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN)
                        .GetInt32();// length
                    // The third int (the "handle") is read and discarded.
                    ReadFully(dctFile, intBuffer);

                    length = buffer[1];
                    if (length > 0)
                    {
                        byte[] lchBuffer = new byte[length];
                        ReadFully(dctFile, lchBuffer);
                        tmpword = Encoding.GetEncoding("GB2312").GetString(lchBuffer);
                        if (i != 3755 + GB2312_FIRST_CHAR)
                        {
                            tmpword = currentStr + tmpword;
                        }
                        char[] carray = tmpword.ToCharArray();
                        long hashId = Hash1(carray);
                        int index = GetAvaliableIndex(hashId, carray);
                        if (index != -1)
                        {
                            if (bigramHashTable[index] == 0)
                            {
                                bigramHashTable[index] = hashId;
                            }
                            frequencyTable[index] += buffer[0];
                        }
                    }
                    j++;
                }
            }
        }
        // log.info("load dictionary done! " + dctFilePath + " total:" + total);
    }

    /// <summary>
    /// Double-hashing probe for a free (or matching) slot for <paramref name="hashId"/>.
    /// Returns -1 when the table is full or no usable slot is found.
    /// </summary>
    private int GetAvaliableIndex(long hashId, char[] carray)
    {
        int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH);
        int hash2 = Hash2(carray) % PRIME_BIGRAM_LENGTH;
        if (hash1 < 0)
            hash1 = PRIME_BIGRAM_LENGTH + hash1;
        if (hash2 < 0)
            hash2 = PRIME_BIGRAM_LENGTH + hash2;
        int index = hash1;
        int i = 1;
        while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
            && i < PRIME_BIGRAM_LENGTH)
        {
            index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH;
            i++;
        }

        if (i < PRIME_BIGRAM_LENGTH
            && (bigramHashTable[index] == 0 || bigramHashTable[index] == hashId))
        {
            return index;
        }
        else
            return -1;
    }

    /// <summary>
    /// Lookup the index into the frequency array, or -1 when the bigram is absent.
    /// </summary>
    private int GetBigramItemIndex(char[] carray)
    {
        long hashId = Hash1(carray);
        int hash1 = (int)(hashId % PRIME_BIGRAM_LENGTH);
        int hash2 = Hash2(carray) % PRIME_BIGRAM_LENGTH;
        if (hash1 < 0)
            hash1 = PRIME_BIGRAM_LENGTH + hash1;
        if (hash2 < 0)
            hash2 = PRIME_BIGRAM_LENGTH + hash2;
        int index = hash1;
        int i = 1;
        repeat++;
        while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
            && i < PRIME_BIGRAM_LENGTH)
        {
            index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH;
            i++;
            repeat++;
            if (i > max)
                max = i;
        }

        if (i < PRIME_BIGRAM_LENGTH && bigramHashTable[index] == hashId)
        {
            return index;
        }
        else
            return -1;
    }

    /// <summary>
    /// Gets the frequency of the bigram represented by <paramref name="carray"/>,
    /// or 0 when the pair is not in the dictionary.
    /// </summary>
    public int GetFrequency(char[] carray)
    {
        int index = GetBigramItemIndex(carray);
        if (index != -1)
            return frequencyTable[index];
        return 0;
    }
}
/// <summary>
/// Finds the optimal segmentation of a sentence into Chinese words
/// <para/>
/// @lucene.experimental
/// </summary>
public class HHMMSegmenter
{
    private static WordDictionary wordDict = WordDictionary.GetInstance();

    /// <summary>
    /// Advances past a maximal run of characters whose type is either
    /// <paramref name="halfWidthType"/> or <paramref name="fullWidthType"/>,
    /// starting at <paramref name="start"/>. Sets <paramref name="hasFullWidth"/>
    /// when any character of <paramref name="fullWidthType"/> is seen.
    /// </summary>
    /// <returns>index one past the end of the run</returns>
    private static int FindRunEnd(CharType[] charTypeArray, int start, int length,
        CharType halfWidthType, CharType fullWidthType, ref bool hasFullWidth)
    {
        int j = start;
        while (j < length
            && (charTypeArray[j] == halfWidthType || charTypeArray[j] == fullWidthType))
        {
            if (charTypeArray[j] == fullWidthType)
                hasFullWidth = true;
            j++;
        }
        return j;
    }

    /// <summary>
    /// Create the <see cref="SegGraph"/> for a sentence.
    /// </summary>
    /// <param name="sentence">input sentence, without start and end markers</param>
    /// <returns><see cref="SegGraph"/> corresponding to the input sentence.</returns>
    private SegGraph CreateSegGraph(string sentence)
    {
        int i = 0, j;
        int length = sentence.Length;
        int foundIndex;
        CharType[] charTypeArray = GetCharTypes(sentence);
        StringBuilder wordBuf = new StringBuilder();
        SegToken token;
        int frequency = 0; // the number of times word appears.
        bool hasFullWidth;
        WordType wordType;
        char[] charArray;

        SegGraph segGraph = new SegGraph();
        while (i < length)
        {
            hasFullWidth = false;
            switch (charTypeArray[i])
            {
                case CharType.SPACE_LIKE:
                    i++;
                    break;

                case CharType.HANZI:
                    j = i + 1;
                    wordBuf.Remove(0, wordBuf.Length);
                    // Whether or not a single Chinese character (Hanzi) can form a phrase,
                    // it is stored in the SegGraph by itself; otherwise word division
                    // would be broken.
                    wordBuf.Append(sentence[i]);
                    charArray = new char[] { sentence[i] };
                    frequency = wordDict.GetFrequency(charArray);
                    token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
                        frequency);
                    segGraph.AddToken(token);

                    foundIndex = wordDict.GetPrefixMatch(charArray);
                    while (j <= length && foundIndex != -1)
                    {
                        if (wordDict.IsEqual(charArray, foundIndex) && charArray.Length > 1)
                        {
                            // A multi-character phrase from i to j was found in the
                            // dictionary; add it as a SegToken.
                            frequency = wordDict.GetFrequency(charArray);
                            token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
                                frequency);
                            segGraph.AddToken(token);
                        }

                        while (j < length && charTypeArray[j] == CharType.SPACE_LIKE)
                            j++;

                        if (j < length && charTypeArray[j] == CharType.HANZI)
                        {
                            wordBuf.Append(sentence[j]);
                            charArray = new char[wordBuf.Length];
                            wordBuf.CopyTo(0, charArray, 0, charArray.Length);
                            // charArray has already been found (foundIndex != -1) as a prefix,
                            // so the lengthened charArray can only appear at or after foundIndex;
                            // start searching from there.
                            foundIndex = wordDict.GetPrefixMatch(charArray, foundIndex);
                            j++;
                        }
                        else
                        {
                            break;
                        }
                    }
                    i++;
                    break;

                // The half-width and full-width variants share identical scan logic
                // (the original code duplicated it with a misleading "fallthrough"
                // comment — C# has no implicit case fallthrough).
                case CharType.LETTER:
                case CharType.FULLWIDTH_LETTER:
                    hasFullWidth = charTypeArray[i] == CharType.FULLWIDTH_LETTER;
                    j = FindRunEnd(charTypeArray, i + 1, length,
                        CharType.LETTER, CharType.FULLWIDTH_LETTER, ref hasFullWidth);
                    // Found a Token from i to j. Type is LETTER char string.
                    charArray = Utility.STRING_CHAR_ARRAY;
                    frequency = wordDict.GetFrequency(charArray);
                    wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING;
                    token = new SegToken(charArray, i, j, wordType, frequency);
                    segGraph.AddToken(token);
                    i = j;
                    break;

                case CharType.DIGIT:
                case CharType.FULLWIDTH_DIGIT:
                    hasFullWidth = charTypeArray[i] == CharType.FULLWIDTH_DIGIT;
                    j = FindRunEnd(charTypeArray, i + 1, length,
                        CharType.DIGIT, CharType.FULLWIDTH_DIGIT, ref hasFullWidth);
                    // Found a Token from i to j. Type is NUMBER char string.
                    charArray = Utility.NUMBER_CHAR_ARRAY;
                    frequency = wordDict.GetFrequency(charArray);
                    wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER;
                    token = new SegToken(charArray, i, j, wordType, frequency);
                    segGraph.AddToken(token);
                    i = j;
                    break;

                case CharType.DELIMITER:
                    j = i + 1;
                    // No need to search the weight for the punctuation. Picking the highest frequency will work.
                    frequency = Utility.MAX_FREQUENCE;
                    charArray = new char[] { sentence[i] };
                    token = new SegToken(charArray, i, j, WordType.DELIMITER, frequency);
                    segGraph.AddToken(token);
                    i = j;
                    break;

                default:
                    j = i + 1;
                    // Treat the unrecognized char symbol as unknown string.
                    // For example, any symbol not in GB2312 is treated as one of these.
                    charArray = Utility.STRING_CHAR_ARRAY;
                    frequency = wordDict.GetFrequency(charArray);
                    token = new SegToken(charArray, i, j, WordType.STRING, frequency);
                    segGraph.AddToken(token);
                    i = j;
                    break;
            }
        }

        // Add two more Tokens: "beginning xx beginning"
        charArray = Utility.START_CHAR_ARRAY;
        frequency = wordDict.GetFrequency(charArray);
        token = new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency);
        segGraph.AddToken(token);

        // "end xx end"
        charArray = Utility.END_CHAR_ARRAY;
        frequency = wordDict.GetFrequency(charArray);
        token = new SegToken(charArray, length, length + 1, WordType.SENTENCE_END,
            frequency);
        segGraph.AddToken(token);

        return segGraph;
    }

    /// <summary>
    /// Get the character types for every character in a sentence.
    /// </summary>
    /// <param name="sentence">input sentence</param>
    /// <returns>array of character types corresponding to character positions in the sentence</returns>
    /// <seealso cref="Utility.GetCharType(char)"/>
    private static CharType[] GetCharTypes(string sentence)
    {
        int length = sentence.Length;
        CharType[] charTypeArray = new CharType[length];
        // the type of each character by position
        for (int i = 0; i < length; i++)
        {
            charTypeArray[i] = Utility.GetCharType(sentence[i]);
        }

        return charTypeArray;
    }

    /// <summary>
    /// Return a list of <see cref="SegToken"/> representing the best segmentation of a sentence
    /// </summary>
    /// <param name="sentence">input sentence</param>
    /// <returns>best segmentation as a <see cref="T:IList{SegToken}"/></returns>
    public virtual IList<SegToken> Process(string sentence)
    {
        SegGraph segGraph = CreateSegGraph(sentence);
        BiSegGraph biSegGraph = new BiSegGraph(segGraph);
        IList<SegToken> shortPath = biSegGraph.GetShortPath();
        return shortPath;
    }
}
---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/Hhmm/PathNode.cs b/src/Lucene.Net.Analysis.SmartCn/Hhmm/PathNode.cs new file mode 100644 index 0000000..7295f3f --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/Hhmm/PathNode.cs @@ -0,0 +1,81 @@ +// lucene version compatibility level: 4.8.1 +using Lucene.Net.Support; +using System; + +namespace Lucene.Net.Analysis.Cn.Smart.Hhmm +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// SmartChineseAnalyzer internal node representation + /// <para> + /// Used by <see cref="BiSegGraph"/> to maximize the segmentation with the Viterbi algorithm. 
/// <summary>
/// SmartChineseAnalyzer internal node representation
/// <para>
/// Used by <see cref="BiSegGraph"/> to maximize the segmentation with the Viterbi algorithm.
/// </para>
/// @lucene.experimental
/// </summary>
internal class PathNode : IComparable<PathNode>
{
    /// <summary>Accumulated weight of the best path reaching this node.</summary>
    public double Weight { get; set; }

    /// <summary>Index of the predecessor node on the best path.</summary>
    public int PreNode { get; set; }

    /// <summary>
    /// Orders nodes by ascending <see cref="Weight"/>.
    /// </summary>
    public virtual int CompareTo(PathNode pn)
    {
        // Exact floating-point comparison, matching the original Java code.
        if (Weight < pn.Weight)
            return -1;
        else if (Weight == pn.Weight)
            return 0;
        else
            return 1;
    }

    /// <summary>
    /// Equivalent of Java's <c>Double.doubleToLongBits</c>: the raw IEEE-754 bit
    /// pattern, except that every NaN collapses to the single canonical NaN value,
    /// so any two NaN weights compare equal and hash identically.
    /// </summary>
    private static long DoubleToInt64Bits(double value)
    {
        if (double.IsNaN(value))
            return 0x7ff8000000000000L; // canonical NaN, as in Java
        return BitConverter.DoubleToInt64Bits(value);
    }

    /// <summary>
    /// <see cref="object.GetHashCode()"/>
    /// </summary>
    public override int GetHashCode()
    {
        int prime = 31;
        int result = 1;
        result = prime * result + PreNode;
        long temp = DoubleToInt64Bits(Weight);
        // Port of Java's (int)(temp ^ (temp >>> 32)). The previous port computed
        // (int)((uint)temp >> 32): truncating to 32 bits first makes the shift
        // count (32) a no-op in C# (masked to 0), so the high word of the double's
        // bit pattern never contributed to the hash.
        result = prime * result + (int)(temp ^ (long)((ulong)temp >> 32));
        return result;
    }

    /// <summary>
    /// <see cref="object.Equals(object)"/>
    /// </summary>
    public override bool Equals(object obj)
    {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (GetType() != obj.GetType())
            return false;
        PathNode other = (PathNode)obj;
        if (PreNode != other.PreNode)
            return false;
        // Bitwise double comparison (Java doubleToLongBits semantics):
        // NaN == NaN, and +0.0 != -0.0.
        if (DoubleToInt64Bits(Weight) != DoubleToInt64Bits(other.Weight))
            return false;
        return true;
    }
}
