http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenFilter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenFilter.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenFilter.cs new file mode 100644 index 0000000..008460e --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenFilter.cs @@ -0,0 +1,75 @@ +namespace Lucene.Net.Analysis.Cn.Smart.HHMM +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// <para> + /// Filters a <see cref="SegToken"/> by converting full-width latin to half-width, then lowercasing latin. + /// Additionally, all punctuation is converted into <see cref="Utility.COMMON_DELIMITER"/> + /// </para> + /// @lucene.experimental + /// </summary> + public class SegTokenFilter + { + /// <summary> + /// Filter an input <see cref="SegToken"/> + /// <para> + /// Full-width latin will be converted to half-width, then all latin will be lowercased. + /// All punctuation is converted into <see cref="Utility.COMMON_DELIMITER"/> + /// </para> + /// </summary> + /// <param name="token">Input <see cref="SegToken"/>.</param> + /// <returns>Normalized <see cref="SegToken"/>.</returns> + public virtual SegToken Filter(SegToken token) + { + switch (token.WordType) + { + case WordType.FULLWIDTH_NUMBER: + case WordType.FULLWIDTH_STRING: /* first convert full-width -> half-width */ + for (int i = 0; i < token.CharArray.Length; i++) + { + if (token.CharArray[i] >= 0xFF10) + { + token.CharArray[i] = (char)(token.CharArray[i] - 0xFEE0); + } + + if (token.CharArray[i] >= 0x0041 && token.CharArray[i] <= 0x005A) /* lowercase latin */ + { + token.CharArray[i] = (char)(token.CharArray[i] + 0x0020); + } + } + break; + case WordType.STRING: + for (int i = 0; i < token.CharArray.Length; i++) + { + if (token.CharArray[i] >= 0x0041 && token.CharArray[i] <= 0x005A) /* lowercase latin */ + { + token.CharArray[i] = (char)(token.CharArray[i] + 0x0020); + } + } + break; + case WordType.DELIMITER: /* convert all punctuation to Utility.COMMON_DELIMITER */ + token.CharArray = Utility.COMMON_DELIMITER; + break; + default: + break; + } + return token; + } + } +}
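For reference, the two transforms in SegTokenFilter are plain code-point arithmetic: full-width forms (U+FF01–U+FF5E) differ from their ASCII counterparts by a constant offset of 0xFEE0, and ASCII 'A'–'Z' differ from lowercase by 0x20. A minimal standalone sketch of the same rules (the FullWidthDemo name is illustrative and not part of this commit):

using System;

// Mirrors SegTokenFilter's normalization on a plain char[]:
// full-width -> half-width via the 0xFEE0 offset, then ASCII
// uppercase -> lowercase via the 0x20 offset.
public static class FullWidthDemo
{
    public static char[] Normalize(char[] chars)
    {
        for (int i = 0; i < chars.Length; i++)
        {
            if (chars[i] >= 0xFF10)
            {
                chars[i] = (char)(chars[i] - 0xFEE0); // e.g. U+FF21 fullwidth 'A' -> U+0041 'A'
            }
            if (chars[i] >= 0x0041 && chars[i] <= 0x005A)
            {
                chars[i] = (char)(chars[i] + 0x0020); // e.g. 'A' -> 'a'
            }
        }
        return chars;
    }

    public static void Main()
    {
        Console.WriteLine(Normalize("ABC123".ToCharArray())); // prints "abc123"
    }
}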
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenPair.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenPair.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenPair.cs new file mode 100644 index 0000000..b7b697a --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenPair.cs @@ -0,0 +1,95 @@ +using Lucene.Net.Support; + +namespace Lucene.Net.Analysis.Cn.Smart.HHMM +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// A pair of tokens in <see cref="SegGraph"/> + /// <para/> + /// @lucene.experimental + /// </summary> + internal class SegTokenPair + { + [WritableArray] + public char[] CharArray { get; set; } + + /// <summary> + /// index of the first token in <see cref="SegGraph"/> + /// </summary> + public int From { get; set; } + + /// <summary> + /// index of the second token in <see cref="SegGraph"/> + /// </summary> + public int To { get; set; } + + public double Weight { get; set; } + + public SegTokenPair(char[] idArray, int from, int to, double weight) + { + this.CharArray = idArray; + this.From = from; + this.To = to; + this.Weight = weight; + } + + /// <summary> + /// <see cref="object.GetHashCode()"/> + /// </summary> + public override int GetHashCode() + { + int prime = 31; + int result = 1; + for (int i = 0; i < CharArray.Length; i++) + { + result = prime * result + CharArray[i]; + } + result = prime * result + From; + result = prime * result + To; + long temp; + temp = Number.DoubleToInt64Bits(Weight); + result = prime * result + (int)(temp ^ (int)((uint)temp >> 32)); + return result; + } + + /// <summary> + /// <see cref="object.Equals(object)"/> + /// </summary> + public override bool Equals(object obj) + { + if (this == obj) + return true; + if (obj == null) + return false; + if (GetType() != obj.GetType()) + return false; + SegTokenPair other = (SegTokenPair)obj; + if (!Arrays.Equals(CharArray, other.CharArray)) + return false; + if (From != other.From) + return false; + if (To != other.To) + return false; + if (Number.DoubleToInt64Bits(Weight) != Number + .DoubleToInt64Bits(other.Weight)) + return false; + return true; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/WordDictionary.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/WordDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/WordDictionary.cs new file mode 100644 index 0000000..c857380 --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/HHMM/WordDictionary.cs @@ -0,0 +1,778 @@ +using Lucene.Net.Support; +using Lucene.Net.Support.IO; +using System; +using 
System.IO; +using System.Reflection; +using System.Text; + +namespace Lucene.Net.Analysis.Cn.Smart.HHMM +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// SmartChineseAnalyzer Word Dictionary + /// <para/> + /// @lucene.experimental + /// </summary> + internal class WordDictionary : AbstractDictionary + { + private WordDictionary() + { + } + + private static WordDictionary singleInstance; + + /// <summary> + /// Large prime number for hash function + /// </summary> + public static readonly int PRIME_INDEX_LENGTH = 12071; + + /// <summary> + /// wordIndexTable guarantees to hash all Chinese characters in Unicode into + /// PRIME_INDEX_LENGTH array. There will be conflict, but in reality this + /// program only handles the 6768 characters found in GB2312 plus some + /// ASCII characters. Therefore in order to guarantee better precision, it is + /// necessary to retain the original symbol in the charIndexTable. + /// </summary> + private short[] wordIndexTable; + + private char[] charIndexTable; + + /// <summary> + /// To avoid taking too much space, the data structure needed to store the + /// lexicon requires two multidimensional arrays to store word and frequency. + /// Each word is placed in a char[]. Each char represents a Chinese char or + /// other symbol. Each frequency is put into an int. These two arrays + /// correspond to each other one-to-one. Therefore, one can use + /// wordItem_charArrayTable[i][j] to look up word from lexicon, and + /// wordItem_frequencyTable[i][j] to look up the corresponding frequency. + /// </summary> + private char[][][] wordItem_charArrayTable; + + private int[][] wordItem_frequencyTable; + + // static Logger log = Logger.getLogger(WordDictionary.class); + + private static object syncLock = new object(); + + /// <summary> + /// Get the singleton dictionary instance. + /// </summary> + /// <returns>singleton</returns> + public static WordDictionary GetInstance() + { + lock (syncLock) + { + if (singleInstance == null) + { + singleInstance = new WordDictionary(); + + // LUCENENET specific + // LUCENE-1817: https://issues.apache.org/jira/browse/LUCENE-1817 + // This issue still existed as of 4.8.0. Here is the fix - we only + // load from a directory if the actual directory exists (AnalyzerProfile + // ensures it is an empty string if it is not available). 
+ string dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR; + if (string.IsNullOrEmpty(dictRoot)) + { + singleInstance.Load(); + } + else + { + singleInstance.Load(dictRoot); + } + + + //try + //{ + // singleInstance.Load(); + //} + //catch (IOException e) + //{ + // string wordDictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR; + // singleInstance.Load(wordDictRoot); + //} + //catch (TypeLoadException e) + //{ + // throw new Exception(e.ToString(), e); + //} + } + return singleInstance; + } + } + + /// <summary> + /// Attempt to load the dictionary from the provided directory, first trying coredict.mem, falling back to coredict.dct + /// </summary> + /// <param name="dctFileRoot">path to dictionary directory</param> + public virtual void Load(string dctFileRoot) + { + string dctFilePath = System.IO.Path.Combine(dctFileRoot, "coredict.dct"); + FileInfo serialObj = new FileInfo(System.IO.Path.Combine(dctFileRoot, "coredict.mem")); + + if (serialObj.Exists && LoadFromObj(serialObj)) + { + + } + else + { + try + { + wordIndexTable = new short[PRIME_INDEX_LENGTH]; + charIndexTable = new char[PRIME_INDEX_LENGTH]; + for (int i = 0; i < PRIME_INDEX_LENGTH; i++) + { + charIndexTable[i] = (char)0; + wordIndexTable[i] = -1; + } + wordItem_charArrayTable = new char[GB2312_CHAR_NUM][][]; + wordItem_frequencyTable = new int[GB2312_CHAR_NUM][]; + // int total = + LoadMainDataFromFile(dctFilePath); + ExpandDelimiterData(); + MergeSameWords(); + SortEachItems(); + // log.info("load dictionary: " + dctFilePath + " total:" + total); + } + catch (IOException e) + { + throw new Exception(e.ToString(), e); + } + + SaveToObj(serialObj); + } + + } + + /// <summary> + /// Load coredict.mem internally from the jar file. + /// </summary> + /// <exception cref="IOException">If there is a low-level I/O error.</exception> + public virtual void Load() + { + using (Stream input = this.GetType().GetTypeInfo().Assembly.FindAndGetManifestResourceStream(this.GetType(), "coredict.mem")) + { + LoadFromObjectInputStream(input); + } + } + + private bool LoadFromObj(FileInfo serialObj) + { + try + { + using (Stream input = new FileStream(serialObj.FullName, FileMode.Open, FileAccess.Read)) + LoadFromObjectInputStream(input); + return true; + } + catch (Exception e) + { + throw new Exception(e.ToString(), e); + } + } + + // LUCENENET conversion note: + // The data in Lucene is stored in a proprietary binary format (similar to + // .NET's BinaryFormatter) that cannot be read back in .NET. Therefore, the + // data was extracted using Java's DataOutputStream using the following Java code. + // It can then be read in using the LoadFromObjectInputStream method below + // (using a DataInputStream instead of a BinaryReader), and saved + // in the correct (BinaryWriter) format by calling the SaveToObj method. + // Alternatively, the data can be loaded from disk using the files + // here (https://issues.apache.org/jira/browse/LUCENE-1629) in the analysis.data.zip file, + // which will automatically produce the .mem files. 
+ + //public void saveToOutputStream(java.io.DataOutputStream stream) throws IOException + //{ + // // save bigramHashTable + // int bhLen = bigramHashTable.length; + // stream.writeInt(bhLen); + // for (int i = 0; i<bhLen; i++) + // { + // stream.writeLong(bigramHashTable[i]); + // } + + // // save frequencyTable + // int fLen = frequencyTable.length; + // stream.writeInt(fLen); + // for (int i = 0; i<fLen; i++) + // { + // stream.writeInt(frequencyTable[i]); + // } + //} + + private void LoadFromObjectInputStream(Stream serialObjectInputStream) + { + //ObjectInputStream input = new ObjectInputStream(serialObjectInputStream); + //wordIndexTable = (short[])input.ReadObject(); + //charIndexTable = (char[])input.ReadObject(); + //wordItem_charArrayTable = (char[][][])input.ReadObject(); + //wordItem_frequencyTable = (int[][])input.ReadObject(); + //// log.info("load core dict from serialization."); + //input.close(); + + using (var reader = new BinaryReader(serialObjectInputStream)) + //using (var reader = new DataInputStream(serialObjectInputStream)) + { + + // Read wordIndexTable + int wiLen = reader.ReadInt32(); + wordIndexTable = new short[wiLen]; + for (int i = 0; i < wiLen; i++) + { + wordIndexTable[i] = reader.ReadInt16(); + } + + // Read charIndexTable + int ciLen = reader.ReadInt32(); + charIndexTable = new char[ciLen]; + for (int i = 0; i < ciLen; i++) + { + charIndexTable[i] = reader.ReadChar(); + } + + // Read wordItem_charArrayTable + int caDim1 = reader.ReadInt32(); + if (caDim1 > -1) + { + wordItem_charArrayTable = new char[caDim1][][]; + for (int i = 0; i < caDim1; i++) + { + int caDim2 = reader.ReadInt32(); + if (caDim2 > -1) + { + wordItem_charArrayTable[i] = new char[caDim2][]; + for (int j = 0; j < caDim2; j++) + { + int caDim3 = reader.ReadInt32(); + if (caDim3 > -1) + { + wordItem_charArrayTable[i][j] = new char[caDim3]; + for (int k = 0; k < caDim3; k++) + { + wordItem_charArrayTable[i][j][k] = reader.ReadChar(); + } + } + } + } + } + } + + // Read wordItem_frequencyTable + int fDim1 = reader.ReadInt32(); + if (fDim1 > -1) + { + wordItem_frequencyTable = new int[fDim1][]; + for (int i = 0; i < fDim1; i++) + { + int fDim2 = reader.ReadInt32(); + if (fDim2 > -1) + { + wordItem_frequencyTable[i] = new int[fDim2]; + for (int j = 0; j < fDim2; j++) + { + wordItem_frequencyTable[i][j] = reader.ReadInt32(); + } + } + } + } + } + + // log.info("load core dict from serialization."); + } + + private void SaveToObj(FileInfo serialObj) + { + try + { + //ObjectOutputStream output = new ObjectOutputStream(new FileStream( + // serialObj.FullName, FileMode.Create, FileAccess.Write)); + //output.writeObject(wordIndexTable); + //output.writeObject(charIndexTable); + //output.writeObject(wordItem_charArrayTable); + //output.writeObject(wordItem_frequencyTable); + //output.close(); + //// log.info("serialize core dict."); + + using (Stream stream = new FileStream(serialObj.FullName, FileMode.Create, FileAccess.Write)) + { + using (var writer = new BinaryWriter(stream)) + { + // Write wordIndexTable + int wiLen = wordIndexTable.Length; + writer.Write(wiLen); + for (int i = 0; i < wiLen; i++) + { + writer.Write(wordIndexTable[i]); + } + + // Write charIndexTable + int ciLen = charIndexTable.Length; + writer.Write(ciLen); + for (int i = 0; i < ciLen; i++) + { + writer.Write(charIndexTable[i]); + } + + // Write wordItem_charArrayTable + int caDim1 = wordItem_charArrayTable == null ? 
-1 : wordItem_charArrayTable.Length; + writer.Write(caDim1); + for (int i = 0; i < caDim1; i++) + { + int caDim2 = wordItem_charArrayTable[i] == null ? -1 : wordItem_charArrayTable[i].Length; + writer.Write(caDim2); + for (int j = 0; j < caDim2; j++) + { + int caDim3 = wordItem_charArrayTable[i][j] == null ? -1 : wordItem_charArrayTable[i][j].Length; + writer.Write(caDim3); + for (int k = 0; k < caDim3; k++) + { + writer.Write(wordItem_charArrayTable[i][j][k]); + } + } + } + + // Write wordItem_frequencyTable + int fDim1 = wordItem_frequencyTable == null ? -1 : wordItem_frequencyTable.Length; + writer.Write(fDim1); + for (int i = 0; i < fDim1; i++) + { + int fDim2 = wordItem_frequencyTable[i] == null ? -1 : wordItem_frequencyTable[i].Length; + writer.Write(fDim2); + for (int j = 0; j < fDim2; j++) + { + writer.Write(wordItem_frequencyTable[i][j]); + } + } + } + } + + // log.info("serialize core dict."); + } +#pragma warning disable 168 + catch (Exception e) +#pragma warning restore 168 + { + // log.warn(e.getMessage()); + } + } + + /// <summary> + /// Load the datafile into this <see cref="WordDictionary"/> + /// </summary> + /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> + /// <returns>number of words read</returns> + /// <exception cref="IOException">If there is a low-level I/O error.</exception> + private int LoadMainDataFromFile(string dctFilePath) + { + int i, cnt, length, total = 0; + // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + // The 3756th is used (as a header) to store information. + int[] + buffer = new int[3]; + byte[] intBuffer = new byte[4]; + string tmpword; + //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r")) + using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + { + + // GB2312 characters 0 - 6768 + for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + { + // if (i == 5231) + // System.out.println(i); + + dctFile.Read(intBuffer, 0, intBuffer.Length); + // the dictionary was developed for C, and byte order must be converted to work with Java + cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN).GetInt32(); + if (cnt <= 0) + { + wordItem_charArrayTable[i] = null; + wordItem_frequencyTable[i] = null; + continue; + } + wordItem_charArrayTable[i] = new char[cnt][]; + wordItem_frequencyTable[i] = new int[cnt]; + total += cnt; + int j = 0; + while (j < cnt) + { + // wordItemTable[i][j] = new WordItem(); + dctFile.Read(intBuffer, 0, intBuffer.Length); + buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN) + .GetInt32();// frequency + dctFile.Read(intBuffer, 0, intBuffer.Length); + buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN) + .GetInt32();// length + dctFile.Read(intBuffer, 0, intBuffer.Length); + buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN) + .GetInt32();// handle + + // wordItemTable[i][j].frequency = buffer[0]; + wordItem_frequencyTable[i][j] = buffer[0]; + + length = buffer[1]; + if (length > 0) + { + byte[] lchBuffer = new byte[length]; + dctFile.Read(lchBuffer, 0, lchBuffer.Length); + //tmpword = new String(lchBuffer, "GB2312"); + tmpword = Encoding.GetEncoding("GB2312").GetString(lchBuffer); + //tmpword = Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer); + // indexTable[i].wordItems[j].word = tmpword; + // wordItemTable[i][j].charArray = tmpword.toCharArray(); + wordItem_charArrayTable[i][j] = tmpword.ToCharArray(); + } + else + { + 
// wordItemTable[i][j].charArray = null; + wordItem_charArrayTable[i][j] = null; + } + // System.out.println(indexTable[i].wordItems[j]); + j++; + } + + string str = GetCCByGB2312Id(i); + SetTableIndex(str[0], i); + } + } + return total; + } + + /// <summary> + /// The original lexicon puts all information with punctuation into a + /// chart (from 1 to 3755). Here it then gets expanded, separately being + /// placed into the chart that has the corresponding symbol. + /// </summary> + private void ExpandDelimiterData() + { + int i; + int cnt; + // Punctuation then treating index 3755 as 1, + // distribute the original punctuation corresponding dictionary into + int delimiterIndex = 3755 + GB2312_FIRST_CHAR; + i = 0; + while (i < wordItem_charArrayTable[delimiterIndex].Length) + { + char c = wordItem_charArrayTable[delimiterIndex][i][0]; + int j = GetGB2312Id(c);// the id value of the punctuation + if (wordItem_charArrayTable[j] == null) + { + + int k = i; + // Starting from i, count the number of the following worditem symbol from j + while (k < wordItem_charArrayTable[delimiterIndex].Length + && wordItem_charArrayTable[delimiterIndex][k][0] == c) + { + k++; + } + // c is the punctuation character, j is the id value of c + // k-1 represents the index of the last punctuation character + cnt = k - i; + if (cnt != 0) + { + wordItem_charArrayTable[j] = new char[cnt][]; + wordItem_frequencyTable[j] = new int[cnt]; + } + + // Assign value for each wordItem. + for (k = 0; k < cnt; k++, i++) + { + // wordItemTable[j][k] = new WordItem(); + wordItem_frequencyTable[j][k] = wordItem_frequencyTable[delimiterIndex][i]; + wordItem_charArrayTable[j][k] = new char[wordItem_charArrayTable[delimiterIndex][i].Length - 1]; + System.Array.Copy(wordItem_charArrayTable[delimiterIndex][i], 1, + wordItem_charArrayTable[j][k], 0, + wordItem_charArrayTable[j][k].Length); + } + SetTableIndex(c, j); + } + } + // Delete the original corresponding symbol array. 
+ wordItem_charArrayTable[delimiterIndex] = null; + wordItem_frequencyTable[delimiterIndex] = null; + } + + /// <summary> + /// since we aren't doing POS-tagging, merge the frequencies for entries of the same word (with different POS) + /// </summary> + private void MergeSameWords() + { + int i; + for (i = 0; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + { + if (wordItem_charArrayTable[i] == null) + continue; + int len = 1; + for (int j = 1; j < wordItem_charArrayTable[i].Length; j++) + { + if (Utility.CompareArray(wordItem_charArrayTable[i][j], 0, + wordItem_charArrayTable[i][j - 1], 0) != 0) + len++; + + } + if (len < wordItem_charArrayTable[i].Length) + { + char[][] tempArray = new char[len][]; + int[] tempFreq = new int[len]; + int k = 0; + tempArray[0] = wordItem_charArrayTable[i][0]; + tempFreq[0] = wordItem_frequencyTable[i][0]; + for (int j = 1; j < wordItem_charArrayTable[i].Length; j++) + { + if (Utility.CompareArray(wordItem_charArrayTable[i][j], 0, + tempArray[k], 0) != 0) + { + k++; + // temp[k] = wordItemTable[i][j]; + tempArray[k] = wordItem_charArrayTable[i][j]; + tempFreq[k] = wordItem_frequencyTable[i][j]; + } + else + { + // temp[k].frequency += wordItemTable[i][j].frequency; + tempFreq[k] += wordItem_frequencyTable[i][j]; + } + } + // wordItemTable[i] = temp; + wordItem_charArrayTable[i] = tempArray; + wordItem_frequencyTable[i] = tempFreq; + } + } + } + + private void SortEachItems() + { + char[] tmpArray; + int tmpFreq; + for (int i = 0; i < wordItem_charArrayTable.Length; i++) + { + if (wordItem_charArrayTable[i] != null + && wordItem_charArrayTable[i].Length > 1) + { + for (int j = 0; j < wordItem_charArrayTable[i].Length - 1; j++) + { + for (int j2 = j + 1; j2 < wordItem_charArrayTable[i].Length; j2++) + { + if (Utility.CompareArray(wordItem_charArrayTable[i][j], 0, + wordItem_charArrayTable[i][j2], 0) > 0) + { + tmpArray = wordItem_charArrayTable[i][j]; + tmpFreq = wordItem_frequencyTable[i][j]; + wordItem_charArrayTable[i][j] = wordItem_charArrayTable[i][j2]; + wordItem_frequencyTable[i][j] = wordItem_frequencyTable[i][j2]; + wordItem_charArrayTable[i][j2] = tmpArray; + wordItem_frequencyTable[i][j2] = tmpFreq; + } + } + } + } + } + } + + /// <summary> + /// Calculate character <paramref name="c"/>'s position in hash table, + /// then initialize the value of that position in the address table. 
+ /// </summary> + private bool SetTableIndex(char c, int j) + { + int index = GetAvaliableTableIndex(c); + if (index != -1) + { + charIndexTable[index] = c; + wordIndexTable[index] = (short)j; + return true; + } + else + return false; + } + + private short GetAvaliableTableIndex(char c) + { + int hash1 = (int)(Hash1(c) % PRIME_INDEX_LENGTH); + int hash2 = Hash2(c) % PRIME_INDEX_LENGTH; + if (hash1 < 0) + hash1 = PRIME_INDEX_LENGTH + hash1; + if (hash2 < 0) + hash2 = PRIME_INDEX_LENGTH + hash2; + int index = hash1; + int i = 1; + while (charIndexTable[index] != 0 && charIndexTable[index] != c + && i < PRIME_INDEX_LENGTH) + { + index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH; + i++; + } + // System.out.println(i - 1); + + if (i < PRIME_INDEX_LENGTH + && (charIndexTable[index] == 0 || charIndexTable[index] == c)) + { + return (short)index; + } + else + { + return -1; + } + } + + private short GetWordItemTableIndex(char c) + { + int hash1 = (int)(Hash1(c) % PRIME_INDEX_LENGTH); + int hash2 = Hash2(c) % PRIME_INDEX_LENGTH; + if (hash1 < 0) + hash1 = PRIME_INDEX_LENGTH + hash1; + if (hash2 < 0) + hash2 = PRIME_INDEX_LENGTH + hash2; + int index = hash1; + int i = 1; + while (charIndexTable[index] != 0 && charIndexTable[index] != c + && i < PRIME_INDEX_LENGTH) + { + index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH; + i++; + } + + if (i < PRIME_INDEX_LENGTH && charIndexTable[index] == c) + { + return (short)index; + } + else + return -1; + } + + /// <summary> + /// Look up the text string corresponding with the word char array, + /// and return the position of the word list. + /// </summary> + /// <param name="knownHashIndex"> + /// already figure out position of the first word + /// symbol charArray[0] in hash table. If not calculated yet, can be + /// replaced with function int findInTable(char[] charArray). + /// </param> + /// <param name="charArray">look up the char array corresponding with the word.</param> + /// <returns>word location in word array. 
If not found, then return -1.</returns> + private int FindInTable(short knownHashIndex, char[] charArray) + { + if (charArray == null || charArray.Length == 0) + return -1; + + char[][] items = wordItem_charArrayTable[wordIndexTable[knownHashIndex]]; + int start = 0, end = items.Length - 1; + int mid = (start + end) / 2, cmpResult; + + // Binary search for the index of idArray + while (start <= end) + { + cmpResult = Utility.CompareArray(items[mid], 0, charArray, 1); + + if (cmpResult == 0) + return mid;// find it + else if (cmpResult < 0) + start = mid + 1; + else if (cmpResult > 0) + end = mid - 1; + + mid = (start + end) / 2; + } + return -1; + } + + /// <summary> + /// Find the first word in the dictionary that starts with the supplied prefix + /// </summary> + /// <param name="charArray">input prefix</param> + /// <returns>index of word, or -1 if not found</returns> + /// <seealso cref="GetPrefixMatch(char[], int)"/> + public virtual int GetPrefixMatch(char[] charArray) + { + return GetPrefixMatch(charArray, 0); + } + + /// <summary> + /// Find the nth word in the dictionary that starts with the supplied prefix + /// </summary> + /// <param name="charArray">input prefix</param> + /// <param name="knownStart">relative position in the dictionary to start</param> + /// <returns>index of word, or -1 if not found</returns> + /// <seealso cref="GetPrefixMatch(char[])"/> + public virtual int GetPrefixMatch(char[] charArray, int knownStart) + { + short index = GetWordItemTableIndex(charArray[0]); + if (index == -1) + return -1; + char[][] items = wordItem_charArrayTable[wordIndexTable[index]]; + int start = knownStart, end = items.Length - 1; + + int mid = (start + end) / 2, cmpResult; + + // Binary search for the index of idArray + while (start <= end) + { + cmpResult = Utility.CompareArrayByPrefix(charArray, 1, items[mid], 0); + if (cmpResult == 0) + { + // Get the first item which match the current word + while (mid >= 0 + && Utility.CompareArrayByPrefix(charArray, 1, items[mid], 0) == 0) + mid--; + mid++; + return mid;// Find the first word that uses charArray as prefix. 
+ } + else if (cmpResult < 0) + end = mid - 1; + else + start = mid + 1; + mid = (start + end) / 2; + } + return -1; + } + + /// <summary> + /// Get the frequency of a word from the dictionary + /// </summary> + /// <param name="charArray">input word</param> + /// <returns>word frequency, or zero if the word is not found</returns> + public virtual int GetFrequency(char[] charArray) + { + short hashIndex = GetWordItemTableIndex(charArray[0]); + if (hashIndex == -1) + { + return 0; + } + int itemIndex = FindInTable(hashIndex, charArray); + if (itemIndex != -1) + { + return wordItem_frequencyTable[wordIndexTable[hashIndex]][itemIndex]; + } + return 0; + } + + /// <summary> + /// Return <c>true</c> if the dictionary entry at itemIndex for table charArray[0] is charArray + /// </summary> + /// <param name="charArray">input word</param> + /// <param name="itemIndex">item index for table charArray[0]</param> + /// <returns><c>true</c> if the entry exists</returns> + public virtual bool IsEqual(char[] charArray, int itemIndex) + { + short hashIndex = GetWordItemTableIndex(charArray[0]); + return Utility.CompareArray(charArray, 1, + wordItem_charArrayTable[wordIndexTable[hashIndex]][itemIndex], 0) == 0; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/bigramdict.mem ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/bigramdict.mem b/src/Lucene.Net.Analysis.SmartCn/HHMM/bigramdict.mem new file mode 100644 index 0000000..fd561bb Binary files /dev/null and b/src/Lucene.Net.Analysis.SmartCn/HHMM/bigramdict.mem differ http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/coredict.mem ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/coredict.mem b/src/Lucene.Net.Analysis.SmartCn/HHMM/coredict.mem new file mode 100644 index 0000000..2eab465 Binary files /dev/null and b/src/Lucene.Net.Analysis.SmartCn/HHMM/coredict.mem differ http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HMMChineseTokenizer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HMMChineseTokenizer.cs b/src/Lucene.Net.Analysis.SmartCn/HMMChineseTokenizer.cs new file mode 100644 index 0000000..10d6de7 --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/HMMChineseTokenizer.cs @@ -0,0 +1,94 @@ +using Lucene.Net.Analysis.Cn.Smart.HHMM; +using Lucene.Net.Analysis.TokenAttributes; +using Lucene.Net.Analysis.Util; +using Lucene.Net.Support; +using System.Collections.Generic; +using System.Globalization; +using System.IO; + +namespace Lucene.Net.Analysis.Cn.Smart +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Tokenizer for Chinese or mixed Chinese-English text. + /// <para/> + /// The analyzer uses probabilistic knowledge to find the optimal word segmentation for Simplified Chinese text. + /// The text is first broken into sentences, then each sentence is segmented into words. + /// </summary> + public class HMMChineseTokenizer : SegmentingTokenizerBase + { + ///** used for breaking the text into sentences */ + //private static readonly BreakIterator sentenceProto = BreakIterator.getSentenceInstance(Locale.ROOT); + + private readonly ICharTermAttribute termAtt; + private readonly IOffsetAttribute offsetAtt; + private readonly ITypeAttribute typeAtt; + + private readonly WordSegmenter wordSegmenter = new WordSegmenter(); + private IEnumerator<SegToken> tokens; + + /// <summary> + /// Creates a new <see cref="HMMChineseTokenizer"/> + /// </summary> + public HMMChineseTokenizer(TextReader reader) + : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, reader) + { + } + + /// <summary> + /// Creates a new <see cref="HMMChineseTokenizer"/>, supplying the <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/> + /// </summary> + public HMMChineseTokenizer(AttributeFactory factory, TextReader reader) + : base(factory, reader, new IcuBreakIterator(Icu.BreakIterator.UBreakIteratorType.SENTENCE, CultureInfo.InvariantCulture) { EnableHacks = false }) + { + termAtt = AddAttribute<ICharTermAttribute>(); + offsetAtt = AddAttribute<IOffsetAttribute>(); + typeAtt = AddAttribute<ITypeAttribute>(); + } + + protected override void SetNextSentence(int sentenceStart, int sentenceEnd) + { + string sentence = new string(m_buffer, sentenceStart, sentenceEnd - sentenceStart); + tokens = wordSegmenter.SegmentSentence(sentence, m_offset + sentenceStart).GetEnumerator(); + } + + protected override bool IncrementWord() + { + if (tokens == null || !tokens.MoveNext()) + { + return false; + } + else + { + SegToken token = tokens.Current; + ClearAttributes(); + termAtt.CopyBuffer(token.CharArray, 0, token.CharArray.Length); + offsetAtt.SetOffset(CorrectOffset(token.StartOffset), CorrectOffset(token.EndOffset)); + typeAtt.Type = "word"; + return true; + } + } + + public override void Reset() + { + base.Reset(); + tokens = null; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HMMChineseTokenizerFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HMMChineseTokenizerFactory.cs b/src/Lucene.Net.Analysis.SmartCn/HMMChineseTokenizerFactory.cs new file mode 100644 index 0000000..bb2e8a9 --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/HMMChineseTokenizerFactory.cs @@ -0,0 +1,56 @@ +using Lucene.Net.Analysis.Util; +using Lucene.Net.Util; +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; + +namespace Lucene.Net.Analysis.Cn.Smart +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Factory for <see cref="HMMChineseTokenizer"/> + /// <para/> + /// Note: this class will currently emit tokens for punctuation. So you should either add + /// a <see cref="Miscellaneous.WordDelimiterFilter"/> after to remove these (with concatenate off), or use the + /// SmartChinese stoplist with a StopFilterFactory via: + /// <code>words="org/apache/lucene/analysis/cn/smart/stopwords.txt"</code> + /// <para/> + /// @lucene.experimental + /// </summary> + public sealed class HMMChineseTokenizerFactory : TokenizerFactory + { + /// <summary> + /// Creates a new <see cref="HMMChineseTokenizerFactory"/> + /// </summary> + public HMMChineseTokenizerFactory(IDictionary<string, string> args) + : base(args) + { + if (args.Any()) + { + throw new ArgumentException("Unknown parameters: " + args); + } + } + + public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader reader) + { + return new HMMChineseTokenizer(factory, reader); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.csproj ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.csproj b/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.csproj new file mode 100644 index 0000000..91e3e6c --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.csproj @@ -0,0 +1,124 @@ +<?xml version="1.0" encoding="utf-8"?> +<!-- + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
+ +--> + +<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" /> + <PropertyGroup> + <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration> + <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform> + <ProjectGuid>{DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}</ProjectGuid> + <OutputType>Library</OutputType> + <AppDesignerFolder>Properties</AppDesignerFolder> + <RootNamespace>Lucene.Net.Analysis.Cn.Smart</RootNamespace> + <AssemblyName>Lucene.Net.Analysis.SmartCn</AssemblyName> + <TargetFrameworkVersion>v4.5.1</TargetFrameworkVersion> + <FileAlignment>512</FileAlignment> + </PropertyGroup> + <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "> + <DebugSymbols>true</DebugSymbols> + <DebugType>full</DebugType> + <Optimize>false</Optimize> + <OutputPath>bin\Debug\</OutputPath> + <DefineConstants>DEBUG;TRACE</DefineConstants> + <ErrorReport>prompt</ErrorReport> + <WarningLevel>4</WarningLevel> + </PropertyGroup> + <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' "> + <DebugType>pdbonly</DebugType> + <Optimize>true</Optimize> + <OutputPath>bin\Release\</OutputPath> + <DefineConstants>TRACE</DefineConstants> + <ErrorReport>prompt</ErrorReport> + <WarningLevel>4</WarningLevel> + </PropertyGroup> + <PropertyGroup> + <DefineConstants>$(DefineConstants);FEATURE_SERIALIZABLE</DefineConstants> + </PropertyGroup> + <ItemGroup> + <Reference Include="System" /> + <Reference Include="System.Core" /> + <Reference Include="System.Xml.Linq" /> + <Reference Include="System.Data.DataSetExtensions" /> + <Reference Include="Microsoft.CSharp" /> + <Reference Include="System.Data" /> + <Reference Include="System.Net.Http" /> + <Reference Include="System.Xml" /> + </ItemGroup> + <ItemGroup> + <Compile Include="AnalyzerProfile.cs" /> + <Compile Include="CharType.cs" /> + <Compile Include="HHMM\AbstractDictionary.cs" /> + <Compile Include="HHMM\BigramDictionary.cs" /> + <Compile Include="HHMM\BiSegGraph.cs" /> + <Compile Include="HHMM\HHMMSegmenter.cs" /> + <Compile Include="HHMM\PathNode.cs" /> + <Compile Include="HHMM\SegGraph.cs" /> + <Compile Include="HHMM\SegToken.cs" /> + <Compile Include="HHMM\SegTokenFilter.cs" /> + <Compile Include="HHMM\SegTokenPair.cs" /> + <Compile Include="HHMM\WordDictionary.cs" /> + <Compile Include="HMMChineseTokenizer.cs" /> + <Compile Include="HMMChineseTokenizerFactory.cs" /> + <Compile Include="Properties\AssemblyInfo.cs" /> + <Compile Include="SentenceTokenizer.cs" /> + <Compile Include="SmartChineseAnalyzer.cs" /> + <Compile Include="SmartChineseSentenceTokenizerFactory.cs" /> + <Compile Include="SmartChineseWordTokenFilterFactory.cs" /> + <Compile Include="Utility.cs" /> + <Compile Include="WordSegmenter.cs" /> + <Compile Include="WordTokenFilter.cs" /> + <Compile Include="WordType.cs" /> + <Compile Include="..\CommonAssemblyInfo.cs"> + <Link>Properties\CommonAssemblyInfo.cs</Link> + </Compile> + </ItemGroup> + <ItemGroup> + <ProjectReference Include="..\Lucene.Net.Analysis.Common\Lucene.Net.Analysis.Common.csproj"> + <Project>{4ADD0BBC-B900-4715-9526-D871DE8EEA64}</Project> + <Name>Lucene.Net.Analysis.Common</Name> + </ProjectReference> + <ProjectReference Include="..\Lucene.Net.ICU\Lucene.Net.ICU.csproj"> + <Project>{349CB7C9-7534-4E1D-9B0A-5521441AF0AE}</Project> 
+ <Name>Lucene.Net.ICU</Name> + </ProjectReference> + <ProjectReference Include="..\Lucene.Net\Lucene.Net.csproj"> + <Project>{5D4AD9BE-1FFB-41AB-9943-25737971BF57}</Project> + <Name>Lucene.Net</Name> + </ProjectReference> + </ItemGroup> + <ItemGroup> + <EmbeddedResource Include="HHMM\bigramdict.mem" /> + <EmbeddedResource Include="HHMM\coredict.mem" /> + <None Include="Lucene.Net.Analysis.SmartCn.project.json" /> + </ItemGroup> + <ItemGroup> + <EmbeddedResource Include="stopwords.txt" /> + </ItemGroup> + <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" /> + <!-- To modify your build process, add your task inside one of the targets below and uncomment it. + Other similar extension points exist, see Microsoft.Common.targets. + <Target Name="BeforeBuild"> + </Target> + <Target Name="AfterBuild"> + </Target> + --> +</Project> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.project.json ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.project.json b/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.project.json new file mode 100644 index 0000000..74d9d80 --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.project.json @@ -0,0 +1,11 @@ +{ + "runtimes": { + "win": {} + }, + "dependencies": { + "icu.net": "54.1.1-alpha" + }, + "frameworks": { + "net451": {} + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.xproj ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.xproj b/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.xproj new file mode 100644 index 0000000..940e974 --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.xproj @@ -0,0 +1,40 @@ +<?xml version="1.0" encoding="utf-8"?> +<!-- + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
+ +--> + +<Project ToolsVersion="14.0.25420" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup> + <VisualStudioVersion Condition="'$(VisualStudioVersion)' == ''">14.0.25420</VisualStudioVersion> + <VSToolsPath Condition="'$(VSToolsPath)' == ''">$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)</VSToolsPath> + </PropertyGroup> + <Import Project="$(VSToolsPath)\DotNet\Microsoft.DotNet.Props" Condition="'$(VSToolsPath)' != ''" /> + <PropertyGroup Label="Globals"> + <ProjectGuid>a400916e-dcb8-4a16-be83-91891c05191f</ProjectGuid> + <RootNamespace>Lucene.Net.Analysis.Cn.Smart</RootNamespace> + <BaseIntermediateOutputPath Condition="'$(BaseIntermediateOutputPath)'=='' ">.\obj</BaseIntermediateOutputPath> + <OutputPath Condition="'$(OutputPath)'=='' ">.\bin\</OutputPath> + </PropertyGroup> + + <PropertyGroup> + <SchemaVersion>2.0</SchemaVersion> + </PropertyGroup> + <Import Project="$(VSToolsPath)\DotNet\Microsoft.DotNet.targets" Condition="'$(VSToolsPath)' != ''" /> +</Project> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/Properties/AssemblyInfo.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/Properties/AssemblyInfo.cs b/src/Lucene.Net.Analysis.SmartCn/Properties/AssemblyInfo.cs new file mode 100644 index 0000000..ce45fa0 --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/Properties/AssemblyInfo.cs @@ -0,0 +1,42 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +using System; +using System.Reflection; +using System.Runtime.InteropServices; + +// General Information about an assembly is controlled through the following +// set of attributes. Change these attribute values to modify the information +// associated with an assembly. +[assembly: AssemblyTitle("Lucene.Net.Analysis.SmartCn")] +[assembly: AssemblyDescription("Analyzer for indexing Chinese " + + "for the Lucene.Net full-text search engine library from The Apache Software Foundation.")] +[assembly: AssemblyConfiguration("")] +[assembly: AssemblyDefaultAlias("Lucene.Net.Analysis.SmartCn")] +[assembly: AssemblyCulture("")] + +[assembly: CLSCompliant(true)] + +// Setting ComVisible to false makes the types in this assembly not visible +// to COM components. If you need to access a type in this assembly from +// COM, set the ComVisible attribute to true on that type. 
+[assembly: ComVisible(false)] + +// The following GUID is for the ID of the typelib if this project is exposed to COM +[assembly: Guid("dba35edf-a0ff-4df7-ae4f-a103b01cd488")] + +// NOTE: Version information is in CommonAssemblyInfo.cs http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/SentenceTokenizer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/SentenceTokenizer.cs b/src/Lucene.Net.Analysis.SmartCn/SentenceTokenizer.cs new file mode 100644 index 0000000..28e949d --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/SentenceTokenizer.cs @@ -0,0 +1,142 @@ +using Lucene.Net.Analysis.TokenAttributes; +using System; +using System.IO; +using System.Text; + +namespace Lucene.Net.Analysis.Cn.Smart +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Tokenizes input text into sentences. + /// <para> + /// The output tokens can then be broken into words with <see cref="WordTokenFilter"/> + /// </para> + /// @lucene.experimental + /// </summary> + [Obsolete("Use HMMChineseTokenizer instead")] + public sealed class SentenceTokenizer : Tokenizer + { + /// <summary> + /// End of sentence punctuation: 。,!?;,!?; + /// </summary> + private readonly static string PUNCTION = "。,!?;,!?;"; + + private readonly StringBuilder buffer = new StringBuilder(); + + private int tokenStart = 0, tokenEnd = 0; + + private ICharTermAttribute termAtt; + private IOffsetAttribute offsetAtt; + private ITypeAttribute typeAtt; + + public SentenceTokenizer(TextReader reader) + : base(reader) + { + Init(); + } + + public SentenceTokenizer(AttributeFactory factory, TextReader reader) + : base(factory, reader) + { + Init(); + } + + private void Init() + { + termAtt = AddAttribute<ICharTermAttribute>(); + offsetAtt = AddAttribute<IOffsetAttribute>(); + typeAtt = AddAttribute<ITypeAttribute>(); + } + + + public override bool IncrementToken() + { + ClearAttributes(); + buffer.Length = 0; + int ci; + char ch, pch; + bool atBegin = true; + tokenStart = tokenEnd; + ci = m_input.Read(); + ch = (char)ci; + + while (true) + { + if (ci == -1) + { + break; + } + else if (PUNCTION.IndexOf(ch) != -1) + { + // End of a sentence + buffer.Append(ch); + tokenEnd++; + break; + } + else if (atBegin && Utility.SPACES.IndexOf(ch) != -1) + { + tokenStart++; + tokenEnd++; + ci = m_input.Read(); + ch = (char)ci; + } + else + { + buffer.Append(ch); + atBegin = false; + tokenEnd++; + pch = ch; + ci = m_input.Read(); + ch = (char)ci; + // Two spaces, such as CR, LF + if (Utility.SPACES.IndexOf(ch) != -1 + && Utility.SPACES.IndexOf(pch) != -1) + { + // buffer.append(ch); + tokenEnd++; + break; + } + } + } + if (buffer.Length == 0) + return false; + else + { 
termAtt.SetEmpty().Append(buffer); + offsetAtt.SetOffset(CorrectOffset(tokenStart), CorrectOffset(tokenEnd)); + typeAtt.Type = "sentence"; + return true; + } + } + + public override void Reset() + { + base.Reset(); + tokenStart = tokenEnd = 0; + } + + public override void End() + { + base.End(); + // set final offset + int finalOffset = CorrectOffset(tokenEnd); + offsetAtt.SetOffset(finalOffset, finalOffset); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/SmartChineseAnalyzer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/SmartChineseAnalyzer.cs b/src/Lucene.Net.Analysis.SmartCn/SmartChineseAnalyzer.cs new file mode 100644 index 0000000..97c36ee --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/SmartChineseAnalyzer.cs @@ -0,0 +1,171 @@ +using Lucene.Net.Analysis.Core; +using Lucene.Net.Analysis.En; +using Lucene.Net.Analysis.Util; +using Lucene.Net.Util; +using System; +using System.IO; +using System.Linq; +using System.Text; + +namespace Lucene.Net.Analysis.Cn.Smart +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// <para> + /// <see cref="SmartChineseAnalyzer"/> is an analyzer for Chinese or mixed Chinese-English text. + /// The analyzer uses probabilistic knowledge to find the optimal word segmentation for Simplified Chinese text. + /// The text is first broken into sentences, then each sentence is segmented into words. + /// </para> + /// <para> + /// Segmentation is based upon the <a href="http://en.wikipedia.org/wiki/Hidden_Markov_Model">Hidden Markov Model</a>. + /// A large training corpus was used to calculate Chinese word frequency probability. + /// </para> + /// <para> + /// This analyzer requires a dictionary to provide statistical data. + /// <see cref="SmartChineseAnalyzer"/> has an included dictionary out-of-box. + /// </para> + /// <para> + /// The included dictionary data is from <a href="http://www.ictclas.org">ICTCLAS1.0</a>. + /// Thanks to ICTCLAS for their hard work, and for contributing the data under the Apache 2 License! + /// </para> + /// @lucene.experimental + /// </summary> + public sealed class SmartChineseAnalyzer : Analyzer + { + private readonly CharArraySet stopWords; + + private static readonly string DEFAULT_STOPWORD_FILE = "stopwords.txt"; + + private static readonly string STOPWORD_FILE_COMMENT = "//"; + + /// <summary> + /// Returns an unmodifiable instance of the default stop-words set. 
+ /// </summary> + /// <returns>An unmodifiable instance of the default stop-words set.</returns> + public static CharArraySet GetDefaultStopSet() + { + return DefaultSetHolder.DEFAULT_STOP_SET; + } + + /// <summary> + /// Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class + /// accesses the static final set the first time. + /// </summary> + private class DefaultSetHolder + { + internal static readonly CharArraySet DEFAULT_STOP_SET; + + static DefaultSetHolder() + { + try + { + DEFAULT_STOP_SET = LoadDefaultStopWordSet(); + } + catch (IOException ex) + { + // default set should always be present as it is part of the + // distribution (JAR) + throw new Exception("Unable to load default stopword set", ex); + } + } + + internal static CharArraySet LoadDefaultStopWordSet() + { + // make sure it is unmodifiable as we expose it in the outer class + return CharArraySet.UnmodifiableSet(WordlistLoader.GetWordSet(IOUtils + .GetDecodingReader(typeof(SmartChineseAnalyzer), DEFAULT_STOPWORD_FILE, + Encoding.UTF8), STOPWORD_FILE_COMMENT, +#pragma warning disable 612, 618 + LuceneVersion.LUCENE_CURRENT)); +#pragma warning restore 612, 618 + } + } + + private readonly LuceneVersion matchVersion; + + /// <summary> + /// Create a new <see cref="SmartChineseAnalyzer"/>, using the default stopword list. + /// </summary> + public SmartChineseAnalyzer(LuceneVersion matchVersion) + : this(matchVersion, true) + { + } + + /// <summary> + /// <para> + /// Create a new <see cref="SmartChineseAnalyzer"/>, optionally using the default stopword list. + /// </para> + /// <para> + /// The included default stopword list is simply a list of punctuation. + /// If you do not use this list, punctuation will not be removed from the text! + /// </para> + /// </summary> + /// <param name="matchVersion"></param> + /// <param name="useDefaultStopWords"><c>true</c> to use the default stopword list.</param> + public SmartChineseAnalyzer(LuceneVersion matchVersion, bool useDefaultStopWords) + { + stopWords = useDefaultStopWords ? DefaultSetHolder.DEFAULT_STOP_SET + : CharArraySet.EMPTY_SET; + this.matchVersion = matchVersion; + } + + /// <summary> + /// <para> + /// Create a new <see cref="SmartChineseAnalyzer"/>, using the provided <see cref="CharArraySet"/> of stopwords. + /// </para> + /// <para> + /// Note: the set should include punctuation, unless you want to index punctuation! + /// </para> + /// </summary> + /// <param name="matchVersion"></param> + /// <param name="stopWords"><see cref="CharArraySet"/> of stopwords to use.</param> + public SmartChineseAnalyzer(LuceneVersion matchVersion, CharArraySet stopWords) + { + this.stopWords = stopWords == null ? CharArraySet.EMPTY_SET : stopWords; + this.matchVersion = matchVersion; + } + + protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) + { + Tokenizer tokenizer; + TokenStream result; + if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_48)) + { + tokenizer = new HMMChineseTokenizer(reader); + result = tokenizer; + } + else + { +#pragma warning disable 612, 618 + tokenizer = new SentenceTokenizer(reader); + result = new WordTokenFilter(tokenizer); +#pragma warning restore 612, 618 + } + // result = new LowerCaseFilter(result); + // LowerCaseFilter is not needed, as SegTokenFilter lowercases Basic Latin text. 
+            // The Porter stemming is too strict; this is not a bug, it is a feature:)
+            result = new PorterStemFilter(result);
+            if (stopWords.Any())
+            {
+                result = new StopFilter(matchVersion, result, stopWords);
+            }
+            return new TokenStreamComponents(tokenizer, result);
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/SmartChineseSentenceTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/SmartChineseSentenceTokenizerFactory.cs b/src/Lucene.Net.Analysis.SmartCn/SmartChineseSentenceTokenizerFactory.cs
new file mode 100644
index 0000000..498e9fd
--- /dev/null
+++ b/src/Lucene.Net.Analysis.SmartCn/SmartChineseSentenceTokenizerFactory.cs
@@ -0,0 +1,52 @@
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+
+namespace Lucene.Net.Analysis.Cn.Smart
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements. See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License. You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Factory for the <see cref="SmartChineseAnalyzer"/> <see cref="SentenceTokenizer"/>
+    /// <para/>
+    /// @lucene.experimental
+    /// </summary>
+    [Obsolete("Use HMMChineseTokenizerFactory instead")]
+    public class SmartChineseSentenceTokenizerFactory : TokenizerFactory
+    {
+        /// <summary>
+        /// Creates a new <see cref="SmartChineseSentenceTokenizerFactory"/>
+        /// </summary>
+        public SmartChineseSentenceTokenizerFactory(IDictionary<string, string> args)
+            : base(args)
+        {
+            if (args.Any())
+            {
+                throw new ArgumentException("Unknown parameters: " + args);
+            }
+        }
+
+        public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader input)
+        {
+            return new SentenceTokenizer(factory, input);
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/SmartChineseWordTokenFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/SmartChineseWordTokenFilterFactory.cs b/src/Lucene.Net.Analysis.SmartCn/SmartChineseWordTokenFilterFactory.cs
new file mode 100644
index 0000000..79b0ec5
--- /dev/null
+++ b/src/Lucene.Net.Analysis.SmartCn/SmartChineseWordTokenFilterFactory.cs
@@ -0,0 +1,55 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace Lucene.Net.Analysis.Cn.Smart
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements. See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License. You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Factory for the <see cref="SmartChineseAnalyzer"/> <see cref="WordTokenFilter"/>
+    /// <para>
+    /// Note: this class will currently emit tokens for punctuation. So you should either add
+    /// a <see cref="Miscellaneous.WordDelimiterFilter"/> after to remove these (with concatenate off), or use the
+    /// SmartChinese stoplist with a <see cref="Core.StopFilterFactory"/> via:
+    /// <code>words="org/apache/lucene/analysis/cn/smart/stopwords.txt"</code>
+    /// </para>
+    /// @lucene.experimental
+    /// </summary>
+    [Obsolete("Use HMMChineseTokenizerFactory instead")]
+    public class SmartChineseWordTokenFilterFactory : TokenFilterFactory
+    {
+        /// <summary>
+        /// Creates a new <see cref="SmartChineseWordTokenFilterFactory"/>
+        /// </summary>
+        public SmartChineseWordTokenFilterFactory(IDictionary<string, string> args)
+            : base(args)
+        {
+            if (args.Any())
+            {
+                throw new ArgumentException("Unknown parameters: " + args);
+            }
+        }
+
+        public override TokenStream Create(TokenStream input)
+        {
+            return new WordTokenFilter(input);
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/Utility.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/Utility.cs b/src/Lucene.Net.Analysis.SmartCn/Utility.cs
new file mode 100644
index 0000000..8160ecc
--- /dev/null
+++ b/src/Lucene.Net.Analysis.SmartCn/Utility.cs
@@ -0,0 +1,196 @@
+namespace Lucene.Net.Analysis.Cn.Smart
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements. See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License. You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// <see cref="SmartChineseAnalyzer"/> utility constants and methods
+    /// <para/>
+    /// @lucene.experimental
+    /// </summary>
+    public class Utility
+    {
+        public static readonly char[] STRING_CHAR_ARRAY = "未##串".ToCharArray();
+
+        public static readonly char[] NUMBER_CHAR_ARRAY = "未##数".ToCharArray();
+
+        public static readonly char[] START_CHAR_ARRAY = "始##始".ToCharArray();
+
+        public static readonly char[] END_CHAR_ARRAY = "末##末".ToCharArray();
+
+        /// <summary>
+        /// Delimiters will be filtered to this character by <see cref="HHMM.SegTokenFilter"/>
+        /// </summary>
+        public static readonly char[] COMMON_DELIMITER = new char[] { ',' };
+
+        /// <summary>
+        /// Space-like characters that need to be skipped: such as space, full-width space, tab, newline, carriage return.
+        /// </summary>
+        public static readonly string SPACES = " 　\t\r\n";
+
+        /// <summary>
+        /// Maximum bigram frequency (used in the smoothing function).
+        /// </summary>
+        public static readonly int MAX_FREQUENCE = 2079997 + 80000;
+
+        /// <summary>
+        /// Compare two arrays starting at the specified offsets.
+        /// </summary>
+        /// <param name="larray">left array</param>
+        /// <param name="lstartIndex">start offset into <paramref name="larray"/></param>
+        /// <param name="rarray">right array</param>
+        /// <param name="rstartIndex">start offset into <paramref name="rarray"/></param>
+        /// <returns>0 if the arrays are equal, 1 if <paramref name="larray"/> &gt;
+        /// <paramref name="rarray"/>, -1 if <paramref name="larray"/> &lt; <paramref name="rarray"/></returns>
+        public static int CompareArray(char[] larray, int lstartIndex, char[] rarray,
+            int rstartIndex)
+        {
+
+            if (larray == null)
+            {
+                if (rarray == null || rstartIndex >= rarray.Length)
+                    return 0;
+                else
+                    return -1;
+            }
+            else
+            {
+                // larray != null
+                if (rarray == null)
+                {
+                    if (lstartIndex >= larray.Length)
+                        return 0;
+                    else
+                        return 1;
+                }
+            }
+
+            int li = lstartIndex, ri = rstartIndex;
+            while (li < larray.Length && ri < rarray.Length && larray[li] == rarray[ri])
+            {
+                li++;
+                ri++;
+            }
+            if (li == larray.Length)
+            {
+                if (ri == rarray.Length)
+                {
+                    // Both arrays are equivalent, return 0.
+                    return 0;
+                }
+                else
+                {
+                    // larray < rarray because larray has ended first.
+                    return -1;
+                }
+            }
+            else
+            {
+                // differing lengths
+                if (ri == rarray.Length)
+                {
+                    // larray > rarray because rarray has ended first.
+                    return 1;
+                }
+                else
+                {
+                    // determine by comparison
+                    if (larray[li] > rarray[ri])
+                        return 1;
+                    else
+                        return -1;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Compare two arrays, starting at the specified offsets, but treating <paramref name="shortArray"/> as a prefix to <paramref name="longArray"/>.
+        /// As long as <paramref name="shortArray"/> is a prefix of <paramref name="longArray"/>, return 0.
+        /// Otherwise, behave as <see cref="CompareArray(char[], int, char[], int)"/>.
+        /// </summary>
+        /// <param name="shortArray">prefix array</param>
+        /// <param name="shortIndex">offset into <paramref name="shortArray"/></param>
+        /// <param name="longArray">long array (word)</param>
+        /// <param name="longIndex">offset into <paramref name="longArray"/></param>
+        /// <returns>0 if <paramref name="shortArray"/> is a prefix of <paramref name="longArray"/>,
+        /// otherwise acts as <see cref="CompareArray(char[], int, char[], int)"/>.</returns>
+        public static int CompareArrayByPrefix(char[] shortArray, int shortIndex,
+            char[] longArray, int longIndex)
+        {
+
+            // a null prefix is a prefix of longArray
+            if (shortArray == null)
+                return 0;
+            else if (longArray == null)
+                return (shortIndex < shortArray.Length) ? 1 : 0;
+
+            int si = shortIndex, li = longIndex;
+            while (si < shortArray.Length && li < longArray.Length
+                && shortArray[si] == longArray[li])
+            {
+                si++;
+                li++;
+            }
+            if (si == shortArray.Length)
+            {
+                // shortArray is a prefix of longArray
+                return 0;
+            }
+            else
+            {
+                // shortArray > longArray because longArray ended first.
+                if (li == longArray.Length)
+                    return 1;
+                else
+                    // determine by comparison
+                    return (shortArray[si] > longArray[li]) ? 1 : -1;
+            }
+        }
+
+        /// <summary>
+        /// Return the internal <see cref="CharType"/> constant of a given character.
+        /// </summary>
+        /// <param name="ch">input character</param>
+        /// <returns>Constant from <see cref="CharType"/> describing the character type.</returns>
+        /// <seealso cref="CharType"/>
+        public static CharType GetCharType(char ch)
+        {
+            // Most (but not all!) of these are Han Ideographic Characters
+            if (ch >= 0x4E00 && ch <= 0x9FA5)
+                return CharType.HANZI;
+            if ((ch >= 0x0041 && ch <= 0x005A) || (ch >= 0x0061 && ch <= 0x007A))
+                return CharType.LETTER;
+            if (ch >= 0x0030 && ch <= 0x0039)
+                return CharType.DIGIT;
+            if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n' || ch == '　' /* U+3000 full-width space */)
+                return CharType.SPACE_LIKE;
+            // Punctuation Marks
+            if ((ch >= 0x0021 && ch <= 0x00BB) || (ch >= 0x2010 && ch <= 0x2642)
+                || (ch >= 0x3001 && ch <= 0x301E))
+                return CharType.DELIMITER;
+
+            // Full-Width range
+            if ((ch >= 0xFF21 && ch <= 0xFF3A) || (ch >= 0xFF41 && ch <= 0xFF5A))
+                return CharType.FULLWIDTH_LETTER;
+            if (ch >= 0xFF10 && ch <= 0xFF19)
+                return CharType.FULLWIDTH_DIGIT;
+            if (ch >= 0xFE30 && ch <= 0xFF63)
+                return CharType.DELIMITER;
+            return CharType.OTHER;
+        }
+    }
+}
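For reference, the comparison and classification semantics documented above can be exercised as follows. This is a minimal sketch, not part of the committed sources; the UtilityDemo class name and the sample strings are illustrative only.

using System;
using Lucene.Net.Analysis.Cn.Smart;

public static class UtilityDemo
{
    public static void Main()
    {
        char[] word = "中文分词".ToCharArray();
        char[] prefix = "中文".ToCharArray();

        // 0: "中文" is a prefix of "中文分词" when both start at offset 0
        Console.WriteLine(Utility.CompareArrayByPrefix(prefix, 0, word, 0));

        // 1: '分' (U+5206) > '中' (U+4E2D), so the left array sorts after the right
        Console.WriteLine(Utility.CompareArray("分词".ToCharArray(), 0, word, 0));

        Console.WriteLine(Utility.GetCharType('中'));  // HANZI (0x4E00-0x9FA5)
        Console.WriteLine(Utility.GetCharType('A'));   // LETTER
        Console.WriteLine(Utility.GetCharType('Ａ'));  // FULLWIDTH_LETTER (U+FF21)
        Console.WriteLine(Utility.GetCharType('，'));  // DELIMITER (U+FF0C falls in 0xFE30-0xFF63)
    }
}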

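Finally, a minimal end-to-end sketch of the analyzer added in this commit. It is likewise illustrative rather than part of the commit: the SmartCnDemo class name, the field name "body", the sample text, and the choice of LuceneVersion.LUCENE_48 are assumptions to adapt to your own setup.

using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Cn.Smart;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;

public static class SmartCnDemo
{
    public static void Main()
    {
        // Pass the LuceneVersion that matches your index; LUCENE_48 (or later)
        // selects the HMMChineseTokenizer path in CreateComponents above.
        Analyzer analyzer = new SmartChineseAnalyzer(LuceneVersion.LUCENE_48);
        using (TokenStream ts = analyzer.GetTokenStream("body", new StringReader("我是中国人。 Mixed English text.")))
        {
            ICharTermAttribute termAtt = ts.AddAttribute<ICharTermAttribute>();
            ts.Reset();
            while (ts.IncrementToken())
            {
                // One segmented word per line; Latin text is lowercased and stemmed,
                // and punctuation is removed by the default stopword list.
                Console.WriteLine(termAtt.ToString());
            }
            ts.End();
        }
    }
}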