http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenFilter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenFilter.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenFilter.cs new file mode 100644 index 0000000..008460e --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenFilter.cs @@ -0,0 +1,75 @@ +namespace Lucene.Net.Analysis.Cn.Smart.HHMM +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// <para> + /// Filters a <see cref="SegToken"/> by converting full-width latin to half-width, then lowercasing latin. + /// Additionally, all punctuation is converted into <see cref="Utility.COMMON_DELIMITER"/> + /// </para> + /// @lucene.experimental + /// </summary> + public class SegTokenFilter + { + /// <summary> + /// Filter an input <see cref="SegToken"/> + /// <para> + /// Full-width latin will be converted to half-width, then all latin will be lowercased. + /// All punctuation is converted into <see cref="Utility.COMMON_DELIMITER"/> + /// </para> + /// </summary> + /// <param name="token">Input <see cref="SegToken"/>.</param> + /// <returns>Normalized <see cref="SegToken"/>.</returns> + public virtual SegToken Filter(SegToken token) + { + switch (token.WordType) + { + case WordType.FULLWIDTH_NUMBER: + case WordType.FULLWIDTH_STRING: /* first convert full-width -> half-width */ + for (int i = 0; i < token.CharArray.Length; i++) + { + if (token.CharArray[i] >= 0xFF10) + { + token.CharArray[i] = (char)(token.CharArray[i] - 0xFEE0); + } + + if (token.CharArray[i] >= 0x0041 && token.CharArray[i] <= 0x005A) /* lowercase latin */ + { + token.CharArray[i] = (char)(token.CharArray[i] + 0x0020); + } + } + break; + case WordType.STRING: + for (int i = 0; i < token.CharArray.Length; i++) + { + if (token.CharArray[i] >= 0x0041 && token.CharArray[i] <= 0x005A) /* lowercase latin */ + { + token.CharArray[i] = (char)(token.CharArray[i] + 0x0020); + } + } + break; + case WordType.DELIMITER: /* convert all punctuation to Utility.COMMON_DELIMITER */ + token.CharArray = Utility.COMMON_DELIMITER; + break; + default: + break; + } + return token; + } + } +}
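For reference, the two transforms in SegTokenFilter are plain code-point arithmetic: full-width forms (U+FF01–U+FF5E) differ from their ASCII counterparts by a constant offset of 0xFEE0, and ASCII 'A'–'Z' differ from lowercase by 0x20. A minimal standalone sketch of the same rules (the FullWidthDemo name is illustrative and not part of this commit):

using System;

// Mirrors SegTokenFilter's normalization on a plain char[]:
// full-width -> half-width via the 0xFEE0 offset, then ASCII
// uppercase -> lowercase via the 0x20 offset.
public static class FullWidthDemo
{
    public static char[] Normalize(char[] chars)
    {
        for (int i = 0; i < chars.Length; i++)
        {
            if (chars[i] >= 0xFF10)
            {
                chars[i] = (char)(chars[i] - 0xFEE0); // e.g. U+FF21 fullwidth 'A' -> U+0041 'A'
            }
            if (chars[i] >= 0x0041 && chars[i] <= 0x005A)
            {
                chars[i] = (char)(chars[i] + 0x0020); // e.g. 'A' -> 'a'
            }
        }
        return chars;
    }

    public static void Main()
    {
        Console.WriteLine(Normalize("ABC123".ToCharArray())); // prints "abc123"
    }
}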
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenPair.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenPair.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenPair.cs new file mode 100644 index 0000000..b7b697a --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/HHMM/SegTokenPair.cs @@ -0,0 +1,95 @@ +using Lucene.Net.Support; + +namespace Lucene.Net.Analysis.Cn.Smart.HHMM +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// A pair of tokens in <see cref="SegGraph"/> + /// <para/> + /// @lucene.experimental + /// </summary> + internal class SegTokenPair + { + [WritableArray] + public char[] CharArray { get; set; } + + /// <summary> + /// index of the first token in <see cref="SegGraph"/> + /// </summary> + public int From { get; set; } + + /// <summary> + /// index of the second token in <see cref="SegGraph"/> + /// </summary> + public int To { get; set; } + + public double Weight { get; set; } + + public SegTokenPair(char[] idArray, int from, int to, double weight) + { + this.CharArray = idArray; + this.From = from; + this.To = to; + this.Weight = weight; + } + + /// <summary> + /// <see cref="object.GetHashCode()"/> + /// </summary> + public override int GetHashCode() + { + int prime = 31; + int result = 1; + for (int i = 0; i < CharArray.Length; i++) + { + result = prime * result + CharArray[i]; + } + result = prime * result + From; + result = prime * result + To; + long temp; + temp = Number.DoubleToInt64Bits(Weight); + result = prime * result + (int)(temp ^ (int)((uint)temp >> 32)); + return result; + } + + /// <summary> + /// <see cref="object.Equals(object)"/> + /// </summary> + public override bool Equals(object obj) + { + if (this == obj) + return true; + if (obj == null) + return false; + if (GetType() != obj.GetType()) + return false; + SegTokenPair other = (SegTokenPair)obj; + if (!Arrays.Equals(CharArray, other.CharArray)) + return false; + if (From != other.From) + return false; + if (To != other.To) + return false; + if (Number.DoubleToInt64Bits(Weight) != Number + .DoubleToInt64Bits(other.Weight)) + return false; + return true; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/WordDictionary.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/WordDictionary.cs b/src/Lucene.Net.Analysis.SmartCn/HHMM/WordDictionary.cs new file mode 100644 index 0000000..c857380 --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/HHMM/WordDictionary.cs @@ -0,0 +1,778 @@ +using Lucene.Net.Support; +using Lucene.Net.Support.IO; +using System; +using 
System.IO; +using System.Reflection; +using System.Text; + +namespace Lucene.Net.Analysis.Cn.Smart.HHMM +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// SmartChineseAnalyzer Word Dictionary + /// <para/> + /// @lucene.experimental + /// </summary> + internal class WordDictionary : AbstractDictionary + { + private WordDictionary() + { + } + + private static WordDictionary singleInstance; + + /// <summary> + /// Large prime number for hash function + /// </summary> + public static readonly int PRIME_INDEX_LENGTH = 12071; + + /// <summary> + /// wordIndexTable guarantees to hash all Chinese characters in Unicode into + /// PRIME_INDEX_LENGTH array. There will be conflict, but in reality this + /// program only handles the 6768 characters found in GB2312 plus some + /// ASCII characters. Therefore in order to guarantee better precision, it is + /// necessary to retain the original symbol in the charIndexTable. + /// </summary> + private short[] wordIndexTable; + + private char[] charIndexTable; + + /// <summary> + /// To avoid taking too much space, the data structure needed to store the + /// lexicon requires two multidimensional arrays to store word and frequency. + /// Each word is placed in a char[]. Each char represents a Chinese char or + /// other symbol. Each frequency is put into an int. These two arrays + /// correspond to each other one-to-one. Therefore, one can use + /// wordItem_charArrayTable[i][j] to look up word from lexicon, and + /// wordItem_frequencyTable[i][j] to look up the corresponding frequency. + /// </summary> + private char[][][] wordItem_charArrayTable; + + private int[][] wordItem_frequencyTable; + + // static Logger log = Logger.getLogger(WordDictionary.class); + + private static object syncLock = new object(); + + /// <summary> + /// Get the singleton dictionary instance. + /// </summary> + /// <returns>singleton</returns> + public static WordDictionary GetInstance() + { + lock (syncLock) + { + if (singleInstance == null) + { + singleInstance = new WordDictionary(); + + // LUCENENET specific + // LUCENE-1817: https://issues.apache.org/jira/browse/LUCENE-1817 + // This issue still existed as of 4.8.0. Here is the fix - we only + // load from a directory if the actual directory exists (AnalyzerProfile + // ensures it is an empty string if it is not available). 
+ string dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR; + if (string.IsNullOrEmpty(dictRoot)) + { + singleInstance.Load(); + } + else + { + singleInstance.Load(dictRoot); + } + + + //try + //{ + // singleInstance.Load(); + //} + //catch (IOException e) + //{ + // string wordDictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR; + // singleInstance.Load(wordDictRoot); + //} + //catch (TypeLoadException e) + //{ + // throw new Exception(e.ToString(), e); + //} + } + return singleInstance; + } + } + + /// <summary> + /// Attempt to load the dictionary from the provided directory, first trying coredict.mem, falling back to coredict.dct + /// </summary> + /// <param name="dctFileRoot">path to dictionary directory</param> + public virtual void Load(string dctFileRoot) + { + string dctFilePath = System.IO.Path.Combine(dctFileRoot, "coredict.dct"); + FileInfo serialObj = new FileInfo(System.IO.Path.Combine(dctFileRoot, "coredict.mem")); + + if (serialObj.Exists && LoadFromObj(serialObj)) + { + + } + else + { + try + { + wordIndexTable = new short[PRIME_INDEX_LENGTH]; + charIndexTable = new char[PRIME_INDEX_LENGTH]; + for (int i = 0; i < PRIME_INDEX_LENGTH; i++) + { + charIndexTable[i] = (char)0; + wordIndexTable[i] = -1; + } + wordItem_charArrayTable = new char[GB2312_CHAR_NUM][][]; + wordItem_frequencyTable = new int[GB2312_CHAR_NUM][]; + // int total = + LoadMainDataFromFile(dctFilePath); + ExpandDelimiterData(); + MergeSameWords(); + SortEachItems(); + // log.info("load dictionary: " + dctFilePath + " total:" + total); + } + catch (IOException e) + { + throw new Exception(e.ToString(), e); + } + + SaveToObj(serialObj); + } + + } + + /// <summary> + /// Load coredict.mem internally from the jar file. + /// </summary> + /// <exception cref="IOException">If there is a low-level I/O error.</exception> + public virtual void Load() + { + using (Stream input = this.GetType().GetTypeInfo().Assembly.FindAndGetManifestResourceStream(this.GetType(), "coredict.mem")) + { + LoadFromObjectInputStream(input); + } + } + + private bool LoadFromObj(FileInfo serialObj) + { + try + { + using (Stream input = new FileStream(serialObj.FullName, FileMode.Open, FileAccess.Read)) + LoadFromObjectInputStream(input); + return true; + } + catch (Exception e) + { + throw new Exception(e.ToString(), e); + } + } + + // LUCENENET conversion note: + // The data in Lucene is stored in a proprietary binary format (similar to + // .NET's BinaryFormatter) that cannot be read back in .NET. Therefore, the + // data was extracted using Java's DataOutputStream using the following Java code. + // It can then be read in using the LoadFromObjectInputStream method below + // (using a DataInputStream instead of a BinaryReader), and saved + // in the correct (BinaryWriter) format by calling the SaveToObj method. + // Alternatively, the data can be loaded from disk using the files + // here (https://issues.apache.org/jira/browse/LUCENE-1629) in the analysis.data.zip file, + // which will automatically produce the .mem files. 
+ + //public void saveToOutputStream(java.io.DataOutputStream stream) throws IOException + //{ + // // save bigramHashTable + // int bhLen = bigramHashTable.length; + // stream.writeInt(bhLen); + // for (int i = 0; i<bhLen; i++) + // { + // stream.writeLong(bigramHashTable[i]); + // } + + // // save frequencyTable + // int fLen = frequencyTable.length; + // stream.writeInt(fLen); + // for (int i = 0; i<fLen; i++) + // { + // stream.writeInt(frequencyTable[i]); + // } + //} + + private void LoadFromObjectInputStream(Stream serialObjectInputStream) + { + //ObjectInputStream input = new ObjectInputStream(serialObjectInputStream); + //wordIndexTable = (short[])input.ReadObject(); + //charIndexTable = (char[])input.ReadObject(); + //wordItem_charArrayTable = (char[][][])input.ReadObject(); + //wordItem_frequencyTable = (int[][])input.ReadObject(); + //// log.info("load core dict from serialization."); + //input.close(); + + using (var reader = new BinaryReader(serialObjectInputStream)) + //using (var reader = new DataInputStream(serialObjectInputStream)) + { + + // Read wordIndexTable + int wiLen = reader.ReadInt32(); + wordIndexTable = new short[wiLen]; + for (int i = 0; i < wiLen; i++) + { + wordIndexTable[i] = reader.ReadInt16(); + } + + // Read charIndexTable + int ciLen = reader.ReadInt32(); + charIndexTable = new char[ciLen]; + for (int i = 0; i < ciLen; i++) + { + charIndexTable[i] = reader.ReadChar(); + } + + // Read wordItem_charArrayTable + int caDim1 = reader.ReadInt32(); + if (caDim1 > -1) + { + wordItem_charArrayTable = new char[caDim1][][]; + for (int i = 0; i < caDim1; i++) + { + int caDim2 = reader.ReadInt32(); + if (caDim2 > -1) + { + wordItem_charArrayTable[i] = new char[caDim2][]; + for (int j = 0; j < caDim2; j++) + { + int caDim3 = reader.ReadInt32(); + if (caDim3 > -1) + { + wordItem_charArrayTable[i][j] = new char[caDim3]; + for (int k = 0; k < caDim3; k++) + { + wordItem_charArrayTable[i][j][k] = reader.ReadChar(); + } + } + } + } + } + } + + // Read wordItem_frequencyTable + int fDim1 = reader.ReadInt32(); + if (fDim1 > -1) + { + wordItem_frequencyTable = new int[fDim1][]; + for (int i = 0; i < fDim1; i++) + { + int fDim2 = reader.ReadInt32(); + if (fDim2 > -1) + { + wordItem_frequencyTable[i] = new int[fDim2]; + for (int j = 0; j < fDim2; j++) + { + wordItem_frequencyTable[i][j] = reader.ReadInt32(); + } + } + } + } + } + + // log.info("load core dict from serialization."); + } + + private void SaveToObj(FileInfo serialObj) + { + try + { + //ObjectOutputStream output = new ObjectOutputStream(new FileStream( + // serialObj.FullName, FileMode.Create, FileAccess.Write)); + //output.writeObject(wordIndexTable); + //output.writeObject(charIndexTable); + //output.writeObject(wordItem_charArrayTable); + //output.writeObject(wordItem_frequencyTable); + //output.close(); + //// log.info("serialize core dict."); + + using (Stream stream = new FileStream(serialObj.FullName, FileMode.Create, FileAccess.Write)) + { + using (var writer = new BinaryWriter(stream)) + { + // Write wordIndexTable + int wiLen = wordIndexTable.Length; + writer.Write(wiLen); + for (int i = 0; i < wiLen; i++) + { + writer.Write(wordIndexTable[i]); + } + + // Write charIndexTable + int ciLen = charIndexTable.Length; + writer.Write(ciLen); + for (int i = 0; i < ciLen; i++) + { + writer.Write(charIndexTable[i]); + } + + // Write wordItem_charArrayTable + int caDim1 = wordItem_charArrayTable == null ? 
-1 : wordItem_charArrayTable.Length; + writer.Write(caDim1); + for (int i = 0; i < caDim1; i++) + { + int caDim2 = wordItem_charArrayTable[i] == null ? -1 : wordItem_charArrayTable[i].Length; + writer.Write(caDim2); + for (int j = 0; j < caDim2; j++) + { + int caDim3 = wordItem_charArrayTable[i][j] == null ? -1 : wordItem_charArrayTable[i][j].Length; + writer.Write(caDim3); + for (int k = 0; k < caDim3; k++) + { + writer.Write(wordItem_charArrayTable[i][j][k]); + } + } + } + + // Write wordItem_frequencyTable + int fDim1 = wordItem_frequencyTable == null ? -1 : wordItem_frequencyTable.Length; + writer.Write(fDim1); + for (int i = 0; i < fDim1; i++) + { + int fDim2 = wordItem_frequencyTable[i] == null ? -1 : wordItem_frequencyTable[i].Length; + writer.Write(fDim2); + for (int j = 0; j < fDim2; j++) + { + writer.Write(wordItem_frequencyTable[i][j]); + } + } + } + } + + // log.info("serialize core dict."); + } +#pragma warning disable 168 + catch (Exception e) +#pragma warning restore 168 + { + // log.warn(e.getMessage()); + } + } + + /// <summary> + /// Load the datafile into this <see cref="WordDictionary"/> + /// </summary> + /// <param name="dctFilePath">path to word dictionary (coredict.dct)</param> + /// <returns>number of words read</returns> + /// <exception cref="IOException">If there is a low-level I/O error.</exception> + private int LoadMainDataFromFile(string dctFilePath) + { + int i, cnt, length, total = 0; + // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. + // The 3756th is used (as a header) to store information. + int[] + buffer = new int[3]; + byte[] intBuffer = new byte[4]; + string tmpword; + //using (RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r")) + using (var dctFile = new FileStream(dctFilePath, FileMode.Open, FileAccess.Read)) + { + + // GB2312 characters 0 - 6768 + for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + { + // if (i == 5231) + // System.out.println(i); + + dctFile.Read(intBuffer, 0, intBuffer.Length); + // the dictionary was developed for C, and byte order must be converted to work with Java + cnt = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN).GetInt32(); + if (cnt <= 0) + { + wordItem_charArrayTable[i] = null; + wordItem_frequencyTable[i] = null; + continue; + } + wordItem_charArrayTable[i] = new char[cnt][]; + wordItem_frequencyTable[i] = new int[cnt]; + total += cnt; + int j = 0; + while (j < cnt) + { + // wordItemTable[i][j] = new WordItem(); + dctFile.Read(intBuffer, 0, intBuffer.Length); + buffer[0] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN) + .GetInt32();// frequency + dctFile.Read(intBuffer, 0, intBuffer.Length); + buffer[1] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN) + .GetInt32();// length + dctFile.Read(intBuffer, 0, intBuffer.Length); + buffer[2] = ByteBuffer.Wrap(intBuffer).SetOrder(ByteOrder.LITTLE_ENDIAN) + .GetInt32();// handle + + // wordItemTable[i][j].frequency = buffer[0]; + wordItem_frequencyTable[i][j] = buffer[0]; + + length = buffer[1]; + if (length > 0) + { + byte[] lchBuffer = new byte[length]; + dctFile.Read(lchBuffer, 0, lchBuffer.Length); + //tmpword = new String(lchBuffer, "GB2312"); + tmpword = Encoding.GetEncoding("GB2312").GetString(lchBuffer); + //tmpword = Encoding.GetEncoding("hz-gb-2312").GetString(lchBuffer); + // indexTable[i].wordItems[j].word = tmpword; + // wordItemTable[i][j].charArray = tmpword.toCharArray(); + wordItem_charArrayTable[i][j] = tmpword.ToCharArray(); + } + else + { + 
// wordItemTable[i][j].charArray = null; + wordItem_charArrayTable[i][j] = null; + } + // System.out.println(indexTable[i].wordItems[j]); + j++; + } + + string str = GetCCByGB2312Id(i); + SetTableIndex(str[0], i); + } + } + return total; + } + + /// <summary> + /// The original lexicon puts all information with punctuation into a + /// chart (from 1 to 3755). Here it then gets expanded, separately being + /// placed into the chart that has the corresponding symbol. + /// </summary> + private void ExpandDelimiterData() + { + int i; + int cnt; + // Punctuation then treating index 3755 as 1, + // distribute the original punctuation corresponding dictionary into + int delimiterIndex = 3755 + GB2312_FIRST_CHAR; + i = 0; + while (i < wordItem_charArrayTable[delimiterIndex].Length) + { + char c = wordItem_charArrayTable[delimiterIndex][i][0]; + int j = GetGB2312Id(c);// the id value of the punctuation + if (wordItem_charArrayTable[j] == null) + { + + int k = i; + // Starting from i, count the number of the following worditem symbol from j + while (k < wordItem_charArrayTable[delimiterIndex].Length + && wordItem_charArrayTable[delimiterIndex][k][0] == c) + { + k++; + } + // c is the punctuation character, j is the id value of c + // k-1 represents the index of the last punctuation character + cnt = k - i; + if (cnt != 0) + { + wordItem_charArrayTable[j] = new char[cnt][]; + wordItem_frequencyTable[j] = new int[cnt]; + } + + // Assign value for each wordItem. + for (k = 0; k < cnt; k++, i++) + { + // wordItemTable[j][k] = new WordItem(); + wordItem_frequencyTable[j][k] = wordItem_frequencyTable[delimiterIndex][i]; + wordItem_charArrayTable[j][k] = new char[wordItem_charArrayTable[delimiterIndex][i].Length - 1]; + System.Array.Copy(wordItem_charArrayTable[delimiterIndex][i], 1, + wordItem_charArrayTable[j][k], 0, + wordItem_charArrayTable[j][k].Length); + } + SetTableIndex(c, j); + } + } + // Delete the original corresponding symbol array. 
+ wordItem_charArrayTable[delimiterIndex] = null; + wordItem_frequencyTable[delimiterIndex] = null; + } + + /// <summary> + /// since we aren't doing POS-tagging, merge the frequencies for entries of the same word (with different POS) + /// </summary> + private void MergeSameWords() + { + int i; + for (i = 0; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) + { + if (wordItem_charArrayTable[i] == null) + continue; + int len = 1; + for (int j = 1; j < wordItem_charArrayTable[i].Length; j++) + { + if (Utility.CompareArray(wordItem_charArrayTable[i][j], 0, + wordItem_charArrayTable[i][j - 1], 0) != 0) + len++; + + } + if (len < wordItem_charArrayTable[i].Length) + { + char[][] tempArray = new char[len][]; + int[] tempFreq = new int[len]; + int k = 0; + tempArray[0] = wordItem_charArrayTable[i][0]; + tempFreq[0] = wordItem_frequencyTable[i][0]; + for (int j = 1; j < wordItem_charArrayTable[i].Length; j++) + { + if (Utility.CompareArray(wordItem_charArrayTable[i][j], 0, + tempArray[k], 0) != 0) + { + k++; + // temp[k] = wordItemTable[i][j]; + tempArray[k] = wordItem_charArrayTable[i][j]; + tempFreq[k] = wordItem_frequencyTable[i][j]; + } + else + { + // temp[k].frequency += wordItemTable[i][j].frequency; + tempFreq[k] += wordItem_frequencyTable[i][j]; + } + } + // wordItemTable[i] = temp; + wordItem_charArrayTable[i] = tempArray; + wordItem_frequencyTable[i] = tempFreq; + } + } + } + + private void SortEachItems() + { + char[] tmpArray; + int tmpFreq; + for (int i = 0; i < wordItem_charArrayTable.Length; i++) + { + if (wordItem_charArrayTable[i] != null + && wordItem_charArrayTable[i].Length > 1) + { + for (int j = 0; j < wordItem_charArrayTable[i].Length - 1; j++) + { + for (int j2 = j + 1; j2 < wordItem_charArrayTable[i].Length; j2++) + { + if (Utility.CompareArray(wordItem_charArrayTable[i][j], 0, + wordItem_charArrayTable[i][j2], 0) > 0) + { + tmpArray = wordItem_charArrayTable[i][j]; + tmpFreq = wordItem_frequencyTable[i][j]; + wordItem_charArrayTable[i][j] = wordItem_charArrayTable[i][j2]; + wordItem_frequencyTable[i][j] = wordItem_frequencyTable[i][j2]; + wordItem_charArrayTable[i][j2] = tmpArray; + wordItem_frequencyTable[i][j2] = tmpFreq; + } + } + } + } + } + } + + /// <summary> + /// Calculate character <paramref name="c"/>'s position in hash table, + /// then initialize the value of that position in the address table. 
+ /// </summary> + private bool SetTableIndex(char c, int j) + { + int index = GetAvaliableTableIndex(c); + if (index != -1) + { + charIndexTable[index] = c; + wordIndexTable[index] = (short)j; + return true; + } + else + return false; + } + + private short GetAvaliableTableIndex(char c) + { + int hash1 = (int)(Hash1(c) % PRIME_INDEX_LENGTH); + int hash2 = Hash2(c) % PRIME_INDEX_LENGTH; + if (hash1 < 0) + hash1 = PRIME_INDEX_LENGTH + hash1; + if (hash2 < 0) + hash2 = PRIME_INDEX_LENGTH + hash2; + int index = hash1; + int i = 1; + while (charIndexTable[index] != 0 && charIndexTable[index] != c + && i < PRIME_INDEX_LENGTH) + { + index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH; + i++; + } + // System.out.println(i - 1); + + if (i < PRIME_INDEX_LENGTH + && (charIndexTable[index] == 0 || charIndexTable[index] == c)) + { + return (short)index; + } + else + { + return -1; + } + } + + private short GetWordItemTableIndex(char c) + { + int hash1 = (int)(Hash1(c) % PRIME_INDEX_LENGTH); + int hash2 = Hash2(c) % PRIME_INDEX_LENGTH; + if (hash1 < 0) + hash1 = PRIME_INDEX_LENGTH + hash1; + if (hash2 < 0) + hash2 = PRIME_INDEX_LENGTH + hash2; + int index = hash1; + int i = 1; + while (charIndexTable[index] != 0 && charIndexTable[index] != c + && i < PRIME_INDEX_LENGTH) + { + index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH; + i++; + } + + if (i < PRIME_INDEX_LENGTH && charIndexTable[index] == c) + { + return (short)index; + } + else + return -1; + } + + /// <summary> + /// Look up the text string corresponding with the word char array, + /// and return the position of the word list. + /// </summary> + /// <param name="knownHashIndex"> + /// already figure out position of the first word + /// symbol charArray[0] in hash table. If not calculated yet, can be + /// replaced with function int findInTable(char[] charArray). + /// </param> + /// <param name="charArray">look up the char array corresponding with the word.</param> + /// <returns>word location in word array. 
If not found, then return -1.</returns> + private int FindInTable(short knownHashIndex, char[] charArray) + { + if (charArray == null || charArray.Length == 0) + return -1; + + char[][] items = wordItem_charArrayTable[wordIndexTable[knownHashIndex]]; + int start = 0, end = items.Length - 1; + int mid = (start + end) / 2, cmpResult; + + // Binary search for the index of idArray + while (start <= end) + { + cmpResult = Utility.CompareArray(items[mid], 0, charArray, 1); + + if (cmpResult == 0) + return mid;// find it + else if (cmpResult < 0) + start = mid + 1; + else if (cmpResult > 0) + end = mid - 1; + + mid = (start + end) / 2; + } + return -1; + } + + /// <summary> + /// Find the first word in the dictionary that starts with the supplied prefix + /// </summary> + /// <param name="charArray">input prefix</param> + /// <returns>index of word, or -1 if not found</returns> + /// <seealso cref="GetPrefixMatch(char[], int)"/> + public virtual int GetPrefixMatch(char[] charArray) + { + return GetPrefixMatch(charArray, 0); + } + + /// <summary> + /// Find the nth word in the dictionary that starts with the supplied prefix + /// </summary> + /// <param name="charArray">input prefix</param> + /// <param name="knownStart">relative position in the dictionary to start</param> + /// <returns>index of word, or -1 if not found</returns> + /// <seealso cref="GetPrefixMatch(char[])"/> + public virtual int GetPrefixMatch(char[] charArray, int knownStart) + { + short index = GetWordItemTableIndex(charArray[0]); + if (index == -1) + return -1; + char[][] items = wordItem_charArrayTable[wordIndexTable[index]]; + int start = knownStart, end = items.Length - 1; + + int mid = (start + end) / 2, cmpResult; + + // Binary search for the index of idArray + while (start <= end) + { + cmpResult = Utility.CompareArrayByPrefix(charArray, 1, items[mid], 0); + if (cmpResult == 0) + { + // Get the first item which match the current word + while (mid >= 0 + && Utility.CompareArrayByPrefix(charArray, 1, items[mid], 0) == 0) + mid--; + mid++; + return mid;// Find the first word that uses charArray as prefix. 
+ } + else if (cmpResult < 0) + end = mid - 1; + else + start = mid + 1; + mid = (start + end) / 2; + } + return -1; + } + + /// <summary> + /// Get the frequency of a word from the dictionary + /// </summary> + /// <param name="charArray">input word</param> + /// <returns>word frequency, or zero if the word is not found</returns> + public virtual int GetFrequency(char[] charArray) + { + short hashIndex = GetWordItemTableIndex(charArray[0]); + if (hashIndex == -1) + { + return 0; + } + int itemIndex = FindInTable(hashIndex, charArray); + if (itemIndex != -1) + { + return wordItem_frequencyTable[wordIndexTable[hashIndex]][itemIndex]; + } + return 0; + } + + /// <summary> + /// Return <c>true</c> if the dictionary entry at itemIndex for table charArray[0] is charArray + /// </summary> + /// <param name="charArray">input word</param> + /// <param name="itemIndex">item index for table charArray[0]</param> + /// <returns><c>true</c> if the entry exists</returns> + public virtual bool IsEqual(char[] charArray, int itemIndex) + { + short hashIndex = GetWordItemTableIndex(charArray[0]); + return Utility.CompareArray(charArray, 1, + wordItem_charArrayTable[wordIndexTable[hashIndex]][itemIndex], 0) == 0; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/bigramdict.mem ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/bigramdict.mem b/src/Lucene.Net.Analysis.SmartCn/HHMM/bigramdict.mem new file mode 100644 index 0000000..fd561bb Binary files /dev/null and b/src/Lucene.Net.Analysis.SmartCn/HHMM/bigramdict.mem differ http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HHMM/coredict.mem ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HHMM/coredict.mem b/src/Lucene.Net.Analysis.SmartCn/HHMM/coredict.mem new file mode 100644 index 0000000..2eab465 Binary files /dev/null and b/src/Lucene.Net.Analysis.SmartCn/HHMM/coredict.mem differ http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HMMChineseTokenizer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HMMChineseTokenizer.cs b/src/Lucene.Net.Analysis.SmartCn/HMMChineseTokenizer.cs new file mode 100644 index 0000000..10d6de7 --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/HMMChineseTokenizer.cs @@ -0,0 +1,94 @@ +using Lucene.Net.Analysis.Cn.Smart.HHMM; +using Lucene.Net.Analysis.TokenAttributes; +using Lucene.Net.Analysis.Util; +using Lucene.Net.Support; +using System.Collections.Generic; +using System.Globalization; +using System.IO; + +namespace Lucene.Net.Analysis.Cn.Smart +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Tokenizer for Chinese or mixed Chinese-English text. + /// <para/> + /// The analyzer uses probabilistic knowledge to find the optimal word segmentation for Simplified Chinese text. + /// The text is first broken into sentences, then each sentence is segmented into words. + /// </summary> + public class HMMChineseTokenizer : SegmentingTokenizerBase + { + ///** used for breaking the text into sentences */ + //private static readonly BreakIterator sentenceProto = BreakIterator.getSentenceInstance(Locale.ROOT); + + private readonly ICharTermAttribute termAtt; + private readonly IOffsetAttribute offsetAtt; + private readonly ITypeAttribute typeAtt; + + private readonly WordSegmenter wordSegmenter = new WordSegmenter(); + private IEnumerator<SegToken> tokens; + + /// <summary> + /// Creates a new <see cref="HMMChineseTokenizer"/> + /// </summary> + public HMMChineseTokenizer(TextReader reader) + : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, reader) + { + } + + /// <summary> + /// Creates a new <see cref="HMMChineseTokenizer"/>, supplying the <see cref="Lucene.Net.Util.AttributeSource.AttributeFactory"/> + /// </summary> + public HMMChineseTokenizer(AttributeFactory factory, TextReader reader) + : base(factory, reader, new IcuBreakIterator(Icu.BreakIterator.UBreakIteratorType.SENTENCE, CultureInfo.InvariantCulture) { EnableHacks = false }) + { + termAtt = AddAttribute<ICharTermAttribute>(); + offsetAtt = AddAttribute<IOffsetAttribute>(); + typeAtt = AddAttribute<ITypeAttribute>(); + } + + protected override void SetNextSentence(int sentenceStart, int sentenceEnd) + { + string sentence = new string(m_buffer, sentenceStart, sentenceEnd - sentenceStart); + tokens = wordSegmenter.SegmentSentence(sentence, m_offset + sentenceStart).GetEnumerator(); + } + + protected override bool IncrementWord() + { + if (tokens == null || !tokens.MoveNext()) + { + return false; + } + else + { + SegToken token = tokens.Current; + ClearAttributes(); + termAtt.CopyBuffer(token.CharArray, 0, token.CharArray.Length); + offsetAtt.SetOffset(CorrectOffset(token.StartOffset), CorrectOffset(token.EndOffset)); + typeAtt.Type = "word"; + return true; + } + } + + public override void Reset() + { + base.Reset(); + tokens = null; + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/HMMChineseTokenizerFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/HMMChineseTokenizerFactory.cs b/src/Lucene.Net.Analysis.SmartCn/HMMChineseTokenizerFactory.cs new file mode 100644 index 0000000..bb2e8a9 --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/HMMChineseTokenizerFactory.cs @@ -0,0 +1,56 @@ +using Lucene.Net.Analysis.Util; +using Lucene.Net.Util; +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; + +namespace Lucene.Net.Analysis.Cn.Smart +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Factory for <see cref="HMMChineseTokenizer"/> + /// <para/> + /// Note: this class will currently emit tokens for punctuation. So you should either add + /// a <see cref="Miscellaneous.WordDelimiterFilter"/> after to remove these (with concatenate off), or use the + /// SmartChinese stoplist with a StopFilterFactory via: + /// <code>words="org/apache/lucene/analysis/cn/smart/stopwords.txt"</code> + /// <para/> + /// @lucene.experimental + /// </summary> + public sealed class HMMChineseTokenizerFactory : TokenizerFactory + { + /// <summary> + /// Creates a new <see cref="HMMChineseTokenizerFactory"/> + /// </summary> + public HMMChineseTokenizerFactory(IDictionary<string, string> args) + : base(args) + { + if (args.Any()) + { + throw new ArgumentException("Unknown parameters: " + args); + } + } + + public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader reader) + { + return new HMMChineseTokenizer(factory, reader); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.csproj ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.csproj b/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.csproj new file mode 100644 index 0000000..91e3e6c --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.csproj @@ -0,0 +1,124 @@ +<?xml version="1.0" encoding="utf-8"?> +<!-- + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
+ +--> + +<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" /> + <PropertyGroup> + <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration> + <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform> + <ProjectGuid>{DBA35EDF-A0FF-4DF7-AE4F-A103B01CD488}</ProjectGuid> + <OutputType>Library</OutputType> + <AppDesignerFolder>Properties</AppDesignerFolder> + <RootNamespace>Lucene.Net.Analysis.Cn.Smart</RootNamespace> + <AssemblyName>Lucene.Net.Analysis.SmartCn</AssemblyName> + <TargetFrameworkVersion>v4.5.1</TargetFrameworkVersion> + <FileAlignment>512</FileAlignment> + </PropertyGroup> + <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "> + <DebugSymbols>true</DebugSymbols> + <DebugType>full</DebugType> + <Optimize>false</Optimize> + <OutputPath>bin\Debug\</OutputPath> + <DefineConstants>DEBUG;TRACE</DefineConstants> + <ErrorReport>prompt</ErrorReport> + <WarningLevel>4</WarningLevel> + </PropertyGroup> + <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' "> + <DebugType>pdbonly</DebugType> + <Optimize>true</Optimize> + <OutputPath>bin\Release\</OutputPath> + <DefineConstants>TRACE</DefineConstants> + <ErrorReport>prompt</ErrorReport> + <WarningLevel>4</WarningLevel> + </PropertyGroup> + <PropertyGroup> + <DefineConstants>$(DefineConstants);FEATURE_SERIALIZABLE</DefineConstants> + </PropertyGroup> + <ItemGroup> + <Reference Include="System" /> + <Reference Include="System.Core" /> + <Reference Include="System.Xml.Linq" /> + <Reference Include="System.Data.DataSetExtensions" /> + <Reference Include="Microsoft.CSharp" /> + <Reference Include="System.Data" /> + <Reference Include="System.Net.Http" /> + <Reference Include="System.Xml" /> + </ItemGroup> + <ItemGroup> + <Compile Include="AnalyzerProfile.cs" /> + <Compile Include="CharType.cs" /> + <Compile Include="HHMM\AbstractDictionary.cs" /> + <Compile Include="HHMM\BigramDictionary.cs" /> + <Compile Include="HHMM\BiSegGraph.cs" /> + <Compile Include="HHMM\HHMMSegmenter.cs" /> + <Compile Include="HHMM\PathNode.cs" /> + <Compile Include="HHMM\SegGraph.cs" /> + <Compile Include="HHMM\SegToken.cs" /> + <Compile Include="HHMM\SegTokenFilter.cs" /> + <Compile Include="HHMM\SegTokenPair.cs" /> + <Compile Include="HHMM\WordDictionary.cs" /> + <Compile Include="HMMChineseTokenizer.cs" /> + <Compile Include="HMMChineseTokenizerFactory.cs" /> + <Compile Include="Properties\AssemblyInfo.cs" /> + <Compile Include="SentenceTokenizer.cs" /> + <Compile Include="SmartChineseAnalyzer.cs" /> + <Compile Include="SmartChineseSentenceTokenizerFactory.cs" /> + <Compile Include="SmartChineseWordTokenFilterFactory.cs" /> + <Compile Include="Utility.cs" /> + <Compile Include="WordSegmenter.cs" /> + <Compile Include="WordTokenFilter.cs" /> + <Compile Include="WordType.cs" /> + <Compile Include="..\CommonAssemblyInfo.cs"> + <Link>Properties\CommonAssemblyInfo.cs</Link> + </Compile> + </ItemGroup> + <ItemGroup> + <ProjectReference Include="..\Lucene.Net.Analysis.Common\Lucene.Net.Analysis.Common.csproj"> + <Project>{4ADD0BBC-B900-4715-9526-D871DE8EEA64}</Project> + <Name>Lucene.Net.Analysis.Common</Name> + </ProjectReference> + <ProjectReference Include="..\Lucene.Net.ICU\Lucene.Net.ICU.csproj"> + <Project>{349CB7C9-7534-4E1D-9B0A-5521441AF0AE}</Project> 
+ <Name>Lucene.Net.ICU</Name> + </ProjectReference> + <ProjectReference Include="..\Lucene.Net\Lucene.Net.csproj"> + <Project>{5D4AD9BE-1FFB-41AB-9943-25737971BF57}</Project> + <Name>Lucene.Net</Name> + </ProjectReference> + </ItemGroup> + <ItemGroup> + <EmbeddedResource Include="HHMM\bigramdict.mem" /> + <EmbeddedResource Include="HHMM\coredict.mem" /> + <None Include="Lucene.Net.Analysis.SmartCn.project.json" /> + </ItemGroup> + <ItemGroup> + <EmbeddedResource Include="stopwords.txt" /> + </ItemGroup> + <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" /> + <!-- To modify your build process, add your task inside one of the targets below and uncomment it. + Other similar extension points exist, see Microsoft.Common.targets. + <Target Name="BeforeBuild"> + </Target> + <Target Name="AfterBuild"> + </Target> + --> +</Project> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.project.json ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.project.json b/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.project.json new file mode 100644 index 0000000..74d9d80 --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.project.json @@ -0,0 +1,11 @@ +{ + "runtimes": { + "win": {} + }, + "dependencies": { + "icu.net": "54.1.1-alpha" + }, + "frameworks": { + "net451": {} + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.xproj ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.xproj b/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.xproj new file mode 100644 index 0000000..940e974 --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/Lucene.Net.Analysis.SmartCn.xproj @@ -0,0 +1,40 @@ +<?xml version="1.0" encoding="utf-8"?> +<!-- + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
+ +--> + +<Project ToolsVersion="14.0.25420" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup> + <VisualStudioVersion Condition="'$(VisualStudioVersion)' == ''">14.0.25420</VisualStudioVersion> + <VSToolsPath Condition="'$(VSToolsPath)' == ''">$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)</VSToolsPath> + </PropertyGroup> + <Import Project="$(VSToolsPath)\DotNet\Microsoft.DotNet.Props" Condition="'$(VSToolsPath)' != ''" /> + <PropertyGroup Label="Globals"> + <ProjectGuid>a400916e-dcb8-4a16-be83-91891c05191f</ProjectGuid> + <RootNamespace>Lucene.Net.Analysis.Cn.Smart</RootNamespace> + <BaseIntermediateOutputPath Condition="'$(BaseIntermediateOutputPath)'=='' ">.\obj</BaseIntermediateOutputPath> + <OutputPath Condition="'$(OutputPath)'=='' ">.\bin\</OutputPath> + </PropertyGroup> + + <PropertyGroup> + <SchemaVersion>2.0</SchemaVersion> + </PropertyGroup> + <Import Project="$(VSToolsPath)\DotNet\Microsoft.DotNet.targets" Condition="'$(VSToolsPath)' != ''" /> +</Project> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/Properties/AssemblyInfo.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/Properties/AssemblyInfo.cs b/src/Lucene.Net.Analysis.SmartCn/Properties/AssemblyInfo.cs new file mode 100644 index 0000000..ce45fa0 --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/Properties/AssemblyInfo.cs @@ -0,0 +1,42 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +using System; +using System.Reflection; +using System.Runtime.InteropServices; + +// General Information about an assembly is controlled through the following +// set of attributes. Change these attribute values to modify the information +// associated with an assembly. +[assembly: AssemblyTitle("Lucene.Net.Analysis.SmartCn")] +[assembly: AssemblyDescription("Analyzer for indexing Chinese " + + "for the Lucene.Net full-text search engine library from The Apache Software Foundation.")] +[assembly: AssemblyConfiguration("")] +[assembly: AssemblyDefaultAlias("Lucene.Net.Analysis.SmartCn")] +[assembly: AssemblyCulture("")] + +[assembly: CLSCompliant(true)] + +// Setting ComVisible to false makes the types in this assembly not visible +// to COM components. If you need to access a type in this assembly from +// COM, set the ComVisible attribute to true on that type. 
+[assembly: ComVisible(false)] + +// The following GUID is for the ID of the typelib if this project is exposed to COM +[assembly: Guid("dba35edf-a0ff-4df7-ae4f-a103b01cd488")] + +// NOTE: Version information is in CommonAssemblyInfo.cs http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/SentenceTokenizer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/SentenceTokenizer.cs b/src/Lucene.Net.Analysis.SmartCn/SentenceTokenizer.cs new file mode 100644 index 0000000..28e949d --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/SentenceTokenizer.cs @@ -0,0 +1,142 @@ +using Lucene.Net.Analysis.TokenAttributes; +using System; +using System.IO; +using System.Text; + +namespace Lucene.Net.Analysis.Cn.Smart +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Tokenizes input text into sentences. + /// <para> + /// The output tokens can then be broken into words with <see cref="WordTokenFilter"/> + /// </para> + /// @lucene.experimental + /// </summary> + [Obsolete("Use HMMChineseTokenizer instead")] + public sealed class SentenceTokenizer : Tokenizer + { + /// <summary> + /// End of sentence punctuation: 。,!?;,!?; + /// </summary> + private readonly static string PUNCTION = "。,!?;,!?;"; + + private readonly StringBuilder buffer = new StringBuilder(); + + private int tokenStart = 0, tokenEnd = 0; + + private ICharTermAttribute termAtt; + private IOffsetAttribute offsetAtt; + private ITypeAttribute typeAtt; + + public SentenceTokenizer(TextReader reader) + : base(reader) + { + Init(); + } + + public SentenceTokenizer(AttributeFactory factory, TextReader reader) + : base(factory, reader) + { + Init(); + } + + private void Init() + { + termAtt = AddAttribute<ICharTermAttribute>(); + offsetAtt = AddAttribute<IOffsetAttribute>(); + typeAtt = AddAttribute<ITypeAttribute>(); + } + + + public override bool IncrementToken() + { + ClearAttributes(); + buffer.Length = 0; + int ci; + char ch, pch; + bool atBegin = true; + tokenStart = tokenEnd; + ci = m_input.Read(); + ch = (char)ci; + + while (true) + { + if (ci == -1) + { + break; + } + else if (PUNCTION.IndexOf(ch) != -1) + { + // End of a sentence + buffer.Append(ch); + tokenEnd++; + break; + } + else if (atBegin && Utility.SPACES.IndexOf(ch) != -1) + { + tokenStart++; + tokenEnd++; + ci = m_input.Read(); + ch = (char)ci; + } + else + { + buffer.Append(ch); + atBegin = false; + tokenEnd++; + pch = ch; + ci = m_input.Read(); + ch = (char)ci; + // Two spaces, such as CR, LF + if (Utility.SPACES.IndexOf(ch) != -1 + && Utility.SPACES.IndexOf(pch) != -1) + { + // buffer.append(ch); + tokenEnd++; + break; + } + } + } + if (buffer.Length == 0) + return false; + else + { 
termAtt.SetEmpty().Append(buffer); + offsetAtt.SetOffset(CorrectOffset(tokenStart), CorrectOffset(tokenEnd)); + typeAtt.Type = "sentence"; + return true; + } + } + + public override void Reset() + { + base.Reset(); + tokenStart = tokenEnd = 0; + } + + public override void End() + { + base.End(); + // set final offset + int finalOffset = CorrectOffset(tokenEnd); + offsetAtt.SetOffset(finalOffset, finalOffset); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/SmartChineseAnalyzer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.SmartCn/SmartChineseAnalyzer.cs b/src/Lucene.Net.Analysis.SmartCn/SmartChineseAnalyzer.cs new file mode 100644 index 0000000..97c36ee --- /dev/null +++ b/src/Lucene.Net.Analysis.SmartCn/SmartChineseAnalyzer.cs @@ -0,0 +1,171 @@ +using Lucene.Net.Analysis.Core; +using Lucene.Net.Analysis.En; +using Lucene.Net.Analysis.Util; +using Lucene.Net.Util; +using System; +using System.IO; +using System.Linq; +using System.Text; + +namespace Lucene.Net.Analysis.Cn.Smart +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// <para> + /// <see cref="SmartChineseAnalyzer"/> is an analyzer for Chinese or mixed Chinese-English text. + /// The analyzer uses probabilistic knowledge to find the optimal word segmentation for Simplified Chinese text. + /// The text is first broken into sentences, then each sentence is segmented into words. + /// </para> + /// <para> + /// Segmentation is based upon the <a href="http://en.wikipedia.org/wiki/Hidden_Markov_Model">Hidden Markov Model</a>. + /// A large training corpus was used to calculate Chinese word frequency probability. + /// </para> + /// <para> + /// This analyzer requires a dictionary to provide statistical data. + /// <see cref="SmartChineseAnalyzer"/> has an included dictionary out-of-box. + /// </para> + /// <para> + /// The included dictionary data is from <a href="http://www.ictclas.org">ICTCLAS1.0</a>. + /// Thanks to ICTCLAS for their hard work, and for contributing the data under the Apache 2 License! + /// </para> + /// @lucene.experimental + /// </summary> + public sealed class SmartChineseAnalyzer : Analyzer + { + private readonly CharArraySet stopWords; + + private static readonly string DEFAULT_STOPWORD_FILE = "stopwords.txt"; + + private static readonly string STOPWORD_FILE_COMMENT = "//"; + + /// <summary> + /// Returns an unmodifiable instance of the default stop-words set. 
+ /// </summary> + /// <returns>An unmodifiable instance of the default stop-words set.</returns> + public static CharArraySet GetDefaultStopSet() + { + return DefaultSetHolder.DEFAULT_STOP_SET; + } + + /// <summary> + /// Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer class + /// accesses the static final set the first time. + /// </summary> + private class DefaultSetHolder + { + internal static readonly CharArraySet DEFAULT_STOP_SET; + + static DefaultSetHolder() + { + try + { + DEFAULT_STOP_SET = LoadDefaultStopWordSet(); + } + catch (IOException ex) + { + // default set should always be present as it is part of the + // distribution (JAR) + throw new Exception("Unable to load default stopword set", ex); + } + } + + internal static CharArraySet LoadDefaultStopWordSet() + { + // make sure it is unmodifiable as we expose it in the outer class + return CharArraySet.UnmodifiableSet(WordlistLoader.GetWordSet(IOUtils + .GetDecodingReader(typeof(SmartChineseAnalyzer), DEFAULT_STOPWORD_FILE, + Encoding.UTF8), STOPWORD_FILE_COMMENT, +#pragma warning disable 612, 618 + LuceneVersion.LUCENE_CURRENT)); +#pragma warning restore 612, 618 + } + } + + private readonly LuceneVersion matchVersion; + + /// <summary> + /// Create a new <see cref="SmartChineseAnalyzer"/>, using the default stopword list. + /// </summary> + public SmartChineseAnalyzer(LuceneVersion matchVersion) + : this(matchVersion, true) + { + } + + /// <summary> + /// <para> + /// Create a new <see cref="SmartChineseAnalyzer"/>, optionally using the default stopword list. + /// </para> + /// <para> + /// The included default stopword list is simply a list of punctuation. + /// If you do not use this list, punctuation will not be removed from the text! + /// </para> + /// </summary> + /// <param name="matchVersion"></param> + /// <param name="useDefaultStopWords"><c>true</c> to use the default stopword list.</param> + public SmartChineseAnalyzer(LuceneVersion matchVersion, bool useDefaultStopWords) + { + stopWords = useDefaultStopWords ? DefaultSetHolder.DEFAULT_STOP_SET + : CharArraySet.EMPTY_SET; + this.matchVersion = matchVersion; + } + + /// <summary> + /// <para> + /// Create a new <see cref="SmartChineseAnalyzer"/>, using the provided <see cref="CharArraySet"/> of stopwords. + /// </para> + /// <para> + /// Note: the set should include punctuation, unless you want to index punctuation! + /// </para> + /// </summary> + /// <param name="matchVersion"></param> + /// <param name="stopWords"><see cref="CharArraySet"/> of stopwords to use.</param> + public SmartChineseAnalyzer(LuceneVersion matchVersion, CharArraySet stopWords) + { + this.stopWords = stopWords == null ? CharArraySet.EMPTY_SET : stopWords; + this.matchVersion = matchVersion; + } + + protected override TokenStreamComponents CreateComponents(string fieldName, TextReader reader) + { + Tokenizer tokenizer; + TokenStream result; + if (matchVersion.OnOrAfter(LuceneVersion.LUCENE_48)) + { + tokenizer = new HMMChineseTokenizer(reader); + result = tokenizer; + } + else + { +#pragma warning disable 612, 618 + tokenizer = new SentenceTokenizer(reader); + result = new WordTokenFilter(tokenizer); +#pragma warning restore 612, 618 + } + // result = new LowerCaseFilter(result); + // LowerCaseFilter is not needed, as SegTokenFilter lowercases Basic Latin text. 
+            // The Porter stemming is too strict; this is not a bug, it is a feature:)
+            result = new PorterStemFilter(result);
+            if (stopWords.Any())
+            {
+                result = new StopFilter(matchVersion, result, stopWords);
+            }
+            return new TokenStreamComponents(tokenizer, result);
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/SmartChineseSentenceTokenizerFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/SmartChineseSentenceTokenizerFactory.cs b/src/Lucene.Net.Analysis.SmartCn/SmartChineseSentenceTokenizerFactory.cs
new file mode 100644
index 0000000..498e9fd
--- /dev/null
+++ b/src/Lucene.Net.Analysis.SmartCn/SmartChineseSentenceTokenizerFactory.cs
@@ -0,0 +1,52 @@
+using Lucene.Net.Analysis.Util;
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+
+namespace Lucene.Net.Analysis.Cn.Smart
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements. See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License. You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Factory for the <see cref="SmartChineseAnalyzer"/> <see cref="SentenceTokenizer"/>
+    /// <para/>
+    /// @lucene.experimental
+    /// </summary>
+    [Obsolete("Use HMMChineseTokenizerFactory instead")]
+    public class SmartChineseSentenceTokenizerFactory : TokenizerFactory
+    {
+        /// <summary>
+        /// Creates a new <see cref="SmartChineseSentenceTokenizerFactory"/>
+        /// </summary>
+        public SmartChineseSentenceTokenizerFactory(IDictionary<string, string> args)
+            : base(args)
+        {
+            if (args.Any())
+            {
+                throw new ArgumentException("Unknown parameters: " + args);
+            }
+        }
+
+        public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader input)
+        {
+            return new SentenceTokenizer(factory, input);
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/SmartChineseWordTokenFilterFactory.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/SmartChineseWordTokenFilterFactory.cs b/src/Lucene.Net.Analysis.SmartCn/SmartChineseWordTokenFilterFactory.cs
new file mode 100644
index 0000000..79b0ec5
--- /dev/null
+++ b/src/Lucene.Net.Analysis.SmartCn/SmartChineseWordTokenFilterFactory.cs
@@ -0,0 +1,55 @@
+using Lucene.Net.Analysis.Util;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace Lucene.Net.Analysis.Cn.Smart
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements. See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License. You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Factory for the <see cref="SmartChineseAnalyzer"/> <see cref="WordTokenFilter"/>
+    /// <para>
+    /// Note: this class will currently emit tokens for punctuation. So you should either add
+    /// a <see cref="Miscellaneous.WordDelimiterFilter"/> after to remove these (with concatenate off), or use the
+    /// SmartChinese stoplist with a <see cref="Core.StopFilterFactory"/> via:
+    /// <code>words="org/apache/lucene/analysis/cn/smart/stopwords.txt"</code>
+    /// </para>
+    /// @lucene.experimental
+    /// </summary>
+    [Obsolete("Use HMMChineseTokenizerFactory instead")]
+    public class SmartChineseWordTokenFilterFactory : TokenFilterFactory
+    {
+        /// <summary>
+        /// Creates a new <see cref="SmartChineseWordTokenFilterFactory"/>
+        /// </summary>
+        public SmartChineseWordTokenFilterFactory(IDictionary<string, string> args)
+            : base(args)
+        {
+            if (args.Any())
+            {
+                throw new ArgumentException("Unknown parameters: " + args);
+            }
+        }
+
+        public override TokenStream Create(TokenStream input)
+        {
+            return new WordTokenFilter(input);
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2f5d89b4/src/Lucene.Net.Analysis.SmartCn/Utility.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.SmartCn/Utility.cs b/src/Lucene.Net.Analysis.SmartCn/Utility.cs
new file mode 100644
index 0000000..8160ecc
--- /dev/null
+++ b/src/Lucene.Net.Analysis.SmartCn/Utility.cs
@@ -0,0 +1,196 @@
+namespace Lucene.Net.Analysis.Cn.Smart
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements. See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License. You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// <see cref="SmartChineseAnalyzer"/> utility constants and methods
+    /// <para/>
+    /// @lucene.experimental
+    /// </summary>
+    public class Utility
+    {
+        public static readonly char[] STRING_CHAR_ARRAY = "未##串".ToCharArray();
+
+        public static readonly char[] NUMBER_CHAR_ARRAY = "未##数".ToCharArray();
+
+        public static readonly char[] START_CHAR_ARRAY = "始##始".ToCharArray();
+
+        public static readonly char[] END_CHAR_ARRAY = "末##末".ToCharArray();
+
+        /// <summary>
+        /// Delimiters will be filtered to this character by <see cref="HHMM.SegTokenFilter"/>
+        /// </summary>
+        public static readonly char[] COMMON_DELIMITER = new char[] { ',' };
+
+        /// <summary>
+        /// Space-like characters that need to be skipped: such as space, full-width space, tab, newline, carriage return.
+        /// </summary>
+        public static readonly string SPACES = " 　\t\r\n";
+
+        /// <summary>
+        /// Maximum bigram frequency (used in the smoothing function).
+        /// </summary>
+        public static readonly int MAX_FREQUENCE = 2079997 + 80000;
+
+        /// <summary>
+        /// Compare two arrays starting at the specified offsets.
+        /// </summary>
+        /// <param name="larray">left array</param>
+        /// <param name="lstartIndex">start offset into <paramref name="larray"/></param>
+        /// <param name="rarray">right array</param>
+        /// <param name="rstartIndex">start offset into <paramref name="rarray"/></param>
+        /// <returns>0 if the arrays are equal, 1 if <paramref name="larray"/> &gt;
+        /// <paramref name="rarray"/>, -1 if <paramref name="larray"/> &lt; <paramref name="rarray"/></returns>
+        public static int CompareArray(char[] larray, int lstartIndex, char[] rarray,
+            int rstartIndex)
+        {
+
+            if (larray == null)
+            {
+                if (rarray == null || rstartIndex >= rarray.Length)
+                    return 0;
+                else
+                    return -1;
+            }
+            else
+            {
+                // larray != null
+                if (rarray == null)
+                {
+                    if (lstartIndex >= larray.Length)
+                        return 0;
+                    else
+                        return 1;
+                }
+            }
+
+            int li = lstartIndex, ri = rstartIndex;
+            while (li < larray.Length && ri < rarray.Length && larray[li] == rarray[ri])
+            {
+                li++;
+                ri++;
+            }
+            if (li == larray.Length)
+            {
+                if (ri == rarray.Length)
+                {
+                    // Both arrays are equivalent, return 0.
+                    return 0;
+                }
+                else
+                {
+                    // larray < rarray because larray has ended first.
+                    return -1;
+                }
+            }
+            else
+            {
+                // differing lengths
+                if (ri == rarray.Length)
+                {
+                    // larray > rarray because rarray has ended first.
+                    return 1;
+                }
+                else
+                {
+                    // determine by comparison
+                    if (larray[li] > rarray[ri])
+                        return 1;
+                    else
+                        return -1;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Compare two arrays, starting at the specified offsets, but treating <paramref name="shortArray"/> as a prefix to <paramref name="longArray"/>.
+        /// As long as <paramref name="shortArray"/> is a prefix of <paramref name="longArray"/>, return 0.
+        /// Otherwise, behave as <see cref="CompareArray(char[], int, char[], int)"/>.
+        /// </summary>
+        /// <param name="shortArray">prefix array</param>
+        /// <param name="shortIndex">offset into <paramref name="shortArray"/></param>
+        /// <param name="longArray">long array (word)</param>
+        /// <param name="longIndex">offset into <paramref name="longArray"/></param>
+        /// <returns>0 if <paramref name="shortArray"/> is a prefix of <paramref name="longArray"/>,
+        /// otherwise acts as <see cref="CompareArray(char[], int, char[], int)"/>.</returns>
+        public static int CompareArrayByPrefix(char[] shortArray, int shortIndex,
+            char[] longArray, int longIndex)
+        {
+
+            // a null prefix is a prefix of longArray
+            if (shortArray == null)
+                return 0;
+            else if (longArray == null)
+                return (shortIndex < shortArray.Length) ? 1 : 0;
+
+            int si = shortIndex, li = longIndex;
+            while (si < shortArray.Length && li < longArray.Length
+                && shortArray[si] == longArray[li])
+            {
+                si++;
+                li++;
+            }
+            if (si == shortArray.Length)
+            {
+                // shortArray is a prefix of longArray
+                return 0;
+            }
+            else
+            {
+                // shortArray > longArray because longArray ended first.
+                if (li == longArray.Length)
+                    return 1;
+                else
+                    // determine by comparison
+                    return (shortArray[si] > longArray[li]) ? 1 : -1;
+            }
+        }
+
+        /// <summary>
+        /// Return the internal <see cref="CharType"/> constant of a given character.
+        /// </summary>
+        /// <param name="ch">input character</param>
+        /// <returns>Constant from <see cref="CharType"/> describing the character type.</returns>
+        /// <seealso cref="CharType"/>
+        public static CharType GetCharType(char ch)
+        {
+            // Most (but not all!) of these are Han Ideographic Characters
+            if (ch >= 0x4E00 && ch <= 0x9FA5)
+                return CharType.HANZI;
+            if ((ch >= 0x0041 && ch <= 0x005A) || (ch >= 0x0061 && ch <= 0x007A))
+                return CharType.LETTER;
+            if (ch >= 0x0030 && ch <= 0x0039)
+                return CharType.DIGIT;
+            if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n' || ch == '　' /* U+3000 full-width space */)
+                return CharType.SPACE_LIKE;
+            // Punctuation Marks
+            if ((ch >= 0x0021 && ch <= 0x00BB) || (ch >= 0x2010 && ch <= 0x2642)
+                || (ch >= 0x3001 && ch <= 0x301E))
+                return CharType.DELIMITER;
+
+            // Full-Width range
+            if ((ch >= 0xFF21 && ch <= 0xFF3A) || (ch >= 0xFF41 && ch <= 0xFF5A))
+                return CharType.FULLWIDTH_LETTER;
+            if (ch >= 0xFF10 && ch <= 0xFF19)
+                return CharType.FULLWIDTH_DIGIT;
+            if (ch >= 0xFE30 && ch <= 0xFF63)
+                return CharType.DELIMITER;
+            return CharType.OTHER;
+        }
+    }
+}
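For reference, the comparison and classification semantics documented above can be exercised as follows. This is a minimal sketch, not part of the committed sources; the UtilityDemo class name and the sample strings are illustrative only.

using System;
using Lucene.Net.Analysis.Cn.Smart;

public static class UtilityDemo
{
    public static void Main()
    {
        char[] word = "中文分词".ToCharArray();
        char[] prefix = "中文".ToCharArray();

        // 0: "中文" is a prefix of "中文分词" when both start at offset 0
        Console.WriteLine(Utility.CompareArrayByPrefix(prefix, 0, word, 0));

        // 1: '分' (U+5206) > '中' (U+4E2D), so the left array sorts after the right
        Console.WriteLine(Utility.CompareArray("分词".ToCharArray(), 0, word, 0));

        Console.WriteLine(Utility.GetCharType('中'));  // HANZI (0x4E00-0x9FA5)
        Console.WriteLine(Utility.GetCharType('A'));   // LETTER
        Console.WriteLine(Utility.GetCharType('Ａ'));  // FULLWIDTH_LETTER (U+FF21)
        Console.WriteLine(Utility.GetCharType('，'));  // DELIMITER (U+FF0C falls in 0xFE30-0xFF63)
    }
}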

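Finally, a minimal end-to-end sketch of the analyzer added in this commit. It is likewise illustrative rather than part of the commit: the SmartCnDemo class name, the field name "body", the sample text, and the choice of LuceneVersion.LUCENE_48 are assumptions to adapt to your own setup.

using System;
using System.IO;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Cn.Smart;
using Lucene.Net.Analysis.TokenAttributes;
using Lucene.Net.Util;

public static class SmartCnDemo
{
    public static void Main()
    {
        // Pass the LuceneVersion that matches your index; LUCENE_48 (or later)
        // selects the HMMChineseTokenizer path in CreateComponents above.
        Analyzer analyzer = new SmartChineseAnalyzer(LuceneVersion.LUCENE_48);
        using (TokenStream ts = analyzer.GetTokenStream("body", new StringReader("我是中国人。 Mixed English text.")))
        {
            ICharTermAttribute termAtt = ts.AddAttribute<ICharTermAttribute>();
            ts.Reset();
            while (ts.IncrementToken())
            {
                // One segmented word per line; Latin text is lowercased and stemmed,
                // and punctuation is removed by the default stopword list.
                Console.WriteLine(termAtt.ToString());
            }
            ts.End();
        }
    }
}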