[08/13] lucenenet git commit: Ported Lucene.Net.Analysis.Kuromoji + tests

nightowl888 Sun, 23 Jul 2017 10:36:50 -0700

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/BaseFormAttributeImpl.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/BaseFormAttributeImpl.cs 
b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/BaseFormAttributeImpl.cs
new file mode 100644
index 0000000..7121c73
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/BaseFormAttributeImpl.cs
@@ -0,0 +1,55 @@
+ï»¿using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.Ja.TokenAttributes
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Attribute for <see cref="Token.GetBaseForm()"/>.
+    /// <para/>
+    /// Keeps a reference to the current <see cref="Token"/> and reads the base form
+    /// from it on demand.
+    /// </summary>
+    public class BaseFormAttribute : Attribute, IBaseFormAttribute
+    {
+        private Token token;
+
+        /// <summary>
+        /// Gets the base form of the current token.
+        /// </summary>
+        /// <returns>The token's base form, or <c>null</c> if no token is set.</returns>
+        public virtual string GetBaseForm()
+        {
+            return token == null ? null : token.GetBaseForm();
+        }
+
+        /// <summary>
+        /// Sets the token that <see cref="GetBaseForm()"/> reads from.
+        /// </summary>
+        /// <param name="token">The current token; may be <c>null</c>.</param>
+        public virtual void SetToken(Token token)
+        {
+            this.token = token;
+        }
+
+        /// <summary>
+        /// Drops the reference to the current token.
+        /// </summary>
+        public override void Clear()
+        {
+            token = null;
+        }
+
+        /// <summary>
+        /// Hands the current token to <paramref name="target"/>.
+        /// </summary>
+        /// <param name="target">Must be a <see cref="BaseFormAttribute"/>; the cast throws otherwise.</param>
+        public override void CopyTo(IAttribute target)
+        {
+            BaseFormAttribute t = (BaseFormAttribute)target;
+            t.SetToken(token);
+        }
+
+        /// <summary>
+        /// Reports the "baseForm" value to <paramref name="reflector"/>.
+        /// </summary>
+        public override void ReflectWith(IAttributeReflector reflector)
+        {
+            // Report under the attribute interface rather than this implementation
+            // class, matching how the inflection, part-of-speech and reading
+            // attributes in this namespace report their values.
+            reflector.Reflect<IBaseFormAttribute>("baseForm", GetBaseForm());
+        }
+    }
+}


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/InflectionAttribute.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/InflectionAttribute.cs 
b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/InflectionAttribute.cs
new file mode 100644
index 0000000..975d8b0
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/InflectionAttribute.cs
@@ -0,0 +1,34 @@
+ï»¿using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.Ja.TokenAttributes
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Attribute for Kuromoji inflection data: the inflection type and inflection form of the current <see cref="Token"/>.
+    /// <para/>
+    /// Note: in some cases this value may not be applicable,
+    /// and will be null.
+    /// </summary>
+    public interface IInflectionAttribute : IAttribute
+    {
+        string GetInflectionType(); // inflection type of the current token; may be null (see the note in the summary)
+        string GetInflectionForm(); // inflection form of the current token; may be null (see the note in the summary)
+        void SetToken(Token token); // supplies the token that the two getters read from
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/InflectionAttributeImpl.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/InflectionAttributeImpl.cs 
b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/InflectionAttributeImpl.cs
new file mode 100644
index 0000000..db96130
--- /dev/null
+++ 
b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/InflectionAttributeImpl.cs
@@ -0,0 +1,68 @@
+ï»¿using Lucene.Net.Analysis.Ja.Util;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.Ja.TokenAttributes
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Default <see cref="IInflectionAttribute"/> implementation. It exposes the
+    /// inflection type and inflection form of whichever <see cref="Token"/> was
+    /// last supplied through <see cref="SetToken(Token)"/>.
+    /// </summary>
+    public class InflectionAttribute : Attribute, IInflectionAttribute
+    {
+        private Token token;
+
+        /// <summary>
+        /// Gets the inflection type of the current token, or <c>null</c> if no token is set.
+        /// </summary>
+        public virtual string GetInflectionType()
+        {
+            if (token == null)
+            {
+                return null;
+            }
+            return token.GetInflectionType();
+        }
+
+        /// <summary>
+        /// Gets the inflection form of the current token, or <c>null</c> if no token is set.
+        /// </summary>
+        public virtual string GetInflectionForm()
+        {
+            if (token == null)
+            {
+                return null;
+            }
+            return token.GetInflectionForm();
+        }
+
+        /// <summary>
+        /// Sets the token that the getters read from.
+        /// </summary>
+        public virtual void SetToken(Token token)
+        {
+            this.token = token;
+        }
+
+        /// <summary>
+        /// Drops the reference to the current token.
+        /// </summary>
+        public override void Clear()
+        {
+            token = null;
+        }
+
+        /// <summary>
+        /// Hands the current token to <paramref name="target"/>, which must be an <see cref="InflectionAttribute"/>.
+        /// </summary>
+        public override void CopyTo(IAttribute target)
+        {
+            ((InflectionAttribute)target).SetToken(token);
+        }
+
+        /// <summary>
+        /// Reports the inflection type and form, each followed by its translation
+        /// from <see cref="ToStringUtil"/> (or <c>null</c> when the value itself is <c>null</c>).
+        /// </summary>
+        public override void ReflectWith(IAttributeReflector reflector)
+        {
+            string inflectionType = GetInflectionType();
+            string inflectionTypeEN = null;
+            if (inflectionType != null)
+            {
+                inflectionTypeEN = ToStringUtil.GetInflectionTypeTranslation(inflectionType);
+            }
+            reflector.Reflect<IInflectionAttribute>("inflectionType", inflectionType);
+            reflector.Reflect<IInflectionAttribute>("inflectionType (en)", inflectionTypeEN);
+
+            string inflectionForm = GetInflectionForm();
+            string inflectionFormEN = null;
+            if (inflectionForm != null)
+            {
+                inflectionFormEN = ToStringUtil.GetInflectedFormTranslation(inflectionForm);
+            }
+            reflector.Reflect<IInflectionAttribute>("inflectionForm", inflectionForm);
+            reflector.Reflect<IInflectionAttribute>("inflectionForm (en)", inflectionFormEN);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/PartOfSpeechAttribute.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/PartOfSpeechAttribute.cs 
b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/PartOfSpeechAttribute.cs
new file mode 100644
index 0000000..722d203
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/PartOfSpeechAttribute.cs
@@ -0,0 +1,30 @@
+ï»¿using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.Ja.TokenAttributes
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Attribute for <see cref="Token.GetPartOfSpeech()"/>: exposes the part of speech of the current <see cref="Token"/>.
+    /// </summary>
+    public interface IPartOfSpeechAttribute : IAttribute
+    {
+        string GetPartOfSpeech(); // part of speech of the current token; the implementation in this namespace returns null when no token is set
+        void SetToken(Token token); // supplies the token that GetPartOfSpeech() reads from
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/PartOfSpeechAttributeImpl.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/PartOfSpeechAttributeImpl.cs 
b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/PartOfSpeechAttributeImpl.cs
new file mode 100644
index 0000000..9d0451d
--- /dev/null
+++ 
b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/PartOfSpeechAttributeImpl.cs
@@ -0,0 +1,59 @@
+ï»¿using Lucene.Net.Analysis.Ja.Util;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.Ja.TokenAttributes
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Default <see cref="IPartOfSpeechAttribute"/> implementation, backed by
+    /// <see cref="Token.GetPartOfSpeech()"/> of the token last supplied through
+    /// <see cref="SetToken(Token)"/>.
+    /// </summary>
+    public class PartOfSpeechAttribute : Attribute, IPartOfSpeechAttribute
+    {
+        private Token token;
+
+        /// <summary>
+        /// Gets the part of speech of the current token, or <c>null</c> if no token is set.
+        /// </summary>
+        public virtual string GetPartOfSpeech()
+        {
+            if (token == null)
+            {
+                return null;
+            }
+            return token.GetPartOfSpeech();
+        }
+
+        /// <summary>
+        /// Sets the token that <see cref="GetPartOfSpeech()"/> reads from.
+        /// </summary>
+        public virtual void SetToken(Token token)
+        {
+            this.token = token;
+        }
+
+        /// <summary>
+        /// Drops the reference to the current token.
+        /// </summary>
+        public override void Clear()
+        {
+            token = null;
+        }
+
+        /// <summary>
+        /// Hands the current token to <paramref name="target"/>, which must be a <see cref="PartOfSpeechAttribute"/>.
+        /// </summary>
+        public override void CopyTo(IAttribute target)
+        {
+            ((PartOfSpeechAttribute)target).SetToken(token);
+        }
+
+        /// <summary>
+        /// Reports the part of speech followed by its translation from
+        /// <see cref="ToStringUtil"/> (or <c>null</c> when the value itself is <c>null</c>).
+        /// </summary>
+        public override void ReflectWith(IAttributeReflector reflector)
+        {
+            string pos = GetPartOfSpeech();
+            string posEN = null;
+            if (pos != null)
+            {
+                posEN = ToStringUtil.GetPOSTranslation(pos);
+            }
+            reflector.Reflect<IPartOfSpeechAttribute>("partOfSpeech", pos);
+            reflector.Reflect<IPartOfSpeechAttribute>("partOfSpeech (en)", posEN);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/ReadingAttribute.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/ReadingAttribute.cs 
b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/ReadingAttribute.cs
new file mode 100644
index 0000000..2432872
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/ReadingAttribute.cs
@@ -0,0 +1,34 @@
+ï»¿using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.Ja.TokenAttributes
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Attribute for Kuromoji reading data: the reading and pronunciation of the current <see cref="Token"/>.
+    /// <para/>
+    /// Note: in some cases this value may not be applicable,
+    /// and will be null.
+    /// </summary>
+    public interface IReadingAttribute : IAttribute
+    {
+        string GetReading(); // reading of the current token; may be null (see the note in the summary)
+        string GetPronunciation(); // pronunciation of the current token; may be null (see the note in the summary)
+        void SetToken(Token token); // supplies the token that the two getters read from
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/ReadingAttributeImpl.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/ReadingAttributeImpl.cs 
b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/ReadingAttributeImpl.cs
new file mode 100644
index 0000000..bfb8a93
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/ReadingAttributeImpl.cs
@@ -0,0 +1,68 @@
+ï»¿using Lucene.Net.Analysis.Ja.Util;
+using Lucene.Net.Util;
+
+namespace Lucene.Net.Analysis.Ja.TokenAttributes
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Default <see cref="IReadingAttribute"/> implementation. It exposes the
+    /// reading and pronunciation of whichever <see cref="Token"/> was last
+    /// supplied through <see cref="SetToken(Token)"/>.
+    /// </summary>
+    public class ReadingAttribute : Attribute, IReadingAttribute
+    {
+        private Token token;
+
+        /// <summary>
+        /// Gets the reading of the current token, or <c>null</c> if no token is set.
+        /// </summary>
+        public virtual string GetReading()
+        {
+            if (token == null)
+            {
+                return null;
+            }
+            return token.GetReading();
+        }
+
+        /// <summary>
+        /// Gets the pronunciation of the current token, or <c>null</c> if no token is set.
+        /// </summary>
+        public virtual string GetPronunciation()
+        {
+            if (token == null)
+            {
+                return null;
+            }
+            return token.GetPronunciation();
+        }
+
+        /// <summary>
+        /// Sets the token that the getters read from.
+        /// </summary>
+        public virtual void SetToken(Token token)
+        {
+            this.token = token;
+        }
+
+        /// <summary>
+        /// Drops the reference to the current token.
+        /// </summary>
+        public override void Clear()
+        {
+            token = null;
+        }
+
+        /// <summary>
+        /// Hands the current token to <paramref name="target"/>, which must be a <see cref="ReadingAttribute"/>.
+        /// </summary>
+        public override void CopyTo(IAttribute target)
+        {
+            ((ReadingAttribute)target).SetToken(token);
+        }
+
+        /// <summary>
+        /// Reports the reading and pronunciation, each followed by its romanization
+        /// from <see cref="ToStringUtil"/> (or <c>null</c> when the value itself is <c>null</c>).
+        /// </summary>
+        public override void ReflectWith(IAttributeReflector reflector)
+        {
+            string reading = GetReading();
+            string readingEN = null;
+            if (reading != null)
+            {
+                readingEN = ToStringUtil.GetRomanization(reading);
+            }
+
+            string pronunciation = GetPronunciation();
+            string pronunciationEN = null;
+            if (pronunciation != null)
+            {
+                pronunciationEN = ToStringUtil.GetRomanization(pronunciation);
+            }
+
+            reflector.Reflect<IReadingAttribute>("reading", reading);
+            reflector.Reflect<IReadingAttribute>("reading (en)", readingEN);
+            reflector.Reflect<IReadingAttribute>("pronunciation", pronunciation);
+            reflector.Reflect<IReadingAttribute>("pronunciation (en)", pronunciationEN);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Tools/BinaryDictionaryWriter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Tools/BinaryDictionaryWriter.cs 
b/src/Lucene.Net.Analysis.Kuromoji/Tools/BinaryDictionaryWriter.cs
new file mode 100644
index 0000000..39d36eb
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/Tools/BinaryDictionaryWriter.cs
@@ -0,0 +1,370 @@
+ï»¿using Lucene.Net.Analysis.Ja.Dict;
+using Lucene.Net.Codecs;
+using Lucene.Net.Store;
+using Lucene.Net.Support.IO;
+using Lucene.Net.Util;
+using System;
+using System.Collections.Generic;
+using System.Diagnostics;
+using System.Globalization;
+using System.IO;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Ja.Util
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    public abstract class BinaryDictionaryWriter
+    {
+        protected readonly Type m_implClazz;
+        protected ByteBuffer m_buffer;
+        private int targetMapEndOffset = 0, lastWordId = -1, lastSourceId = -1;
+        private int[] targetMap = new int[8192];
+        private int[] targetMapOffsets = new int[8192];
+        private readonly List<string> posDict = new List<string>();
+
+        /// <summary>
+        /// Initializes the writer with the dictionary implementation type and an initial buffer size.
+        /// </summary>
+        /// <param name="implClazz">Type whose full name is used to derive the base name of the output files.</param>
+        /// <param name="size">Initial capacity, in bytes, of the entry buffer.</param>
+        // Protected rather than public: this class is abstract, so only subclasses
+        // can ever invoke the constructor.
+        protected BinaryDictionaryWriter(Type implClazz, int size)
+        {
+            this.m_implClazz = implClazz;
+            m_buffer = ByteBuffer.Allocate(size);
+        }
+
+        /// <summary>
+        /// Encodes one dictionary entry into the in-memory buffer and records its POS data in the POS dictionary under its left id.
+        /// </summary>
+        /// <param name="entry">Fields of one dictionary line; indices 0 through 12 are read. Indices 1-3 are parsed as left id, right id and word cost, 4-7 are POS parts, 8-9 are appended to the POS data, and 10-12 are base form, reading and pronunciation.</param>
+        /// <returns>Current position of buffer, which will be wordId of next 
entry.</returns>
+        public virtual int Put(string[] entry)
+        {
+            short leftId = short.Parse(entry[1], CultureInfo.InvariantCulture);
+            short rightId = short.Parse(entry[2], 
CultureInfo.InvariantCulture);
+            short wordCost = short.Parse(entry[3], 
CultureInfo.InvariantCulture);
+
+            StringBuilder sb = new StringBuilder();
+
+            // build up the POS string: join entry[4..7] with '-', skipping "*" placeholders
+            for (int i = 4; i < 8; i++)
+            {
+                string part = entry[i];
+                Debug.Assert(part.Length > 0);
+                if (!"*".Equals(part, StringComparison.Ordinal))
+                {
+                    if (sb.Length > 0)
+                    {
+                        sb.Append('-');
+                    }
+                    sb.Append(part);
+                }
+            }
+
+            string posData = sb.ToString();
+
+            sb.Length = 0; // reset the builder so it can be reused for the full POS data
+            sb.Append(CSVUtil.QuoteEscape(posData));
+            sb.Append(',');
+            if (!"*".Equals(entry[8]))
+            {
+                sb.Append(CSVUtil.QuoteEscape(entry[8]));
+            }
+            sb.Append(',');
+            if (!"*".Equals(entry[9]))
+            {
+                sb.Append(CSVUtil.QuoteEscape(entry[9]));
+            }
+            string fullPOSData = sb.ToString(); // three comma-separated fields: POS, entry[8], entry[9] (the last two left empty when "*")
+
+            string baseForm = entry[10];
+            string reading = entry[11];
+            string pronunciation = entry[12];
+
+            // extend buffer if necessary
+            int left = m_buffer.Remaining;
+            // worst case: two short, 3 bytes, and features (all as utf-16)
+            int worstCase = 4 + 3 + 2 * (baseForm.Length + reading.Length + 
pronunciation.Length);
+            if (worstCase > left)
+            {
+                ByteBuffer newBuffer = 
ByteBuffer.Allocate(ArrayUtil.Oversize(m_buffer.Limit + worstCase - left, 1));
+                m_buffer.Flip(); // switch the old buffer to reading so its contents can be copied into the new one
+                newBuffer.Put(m_buffer);
+                m_buffer = newBuffer;
+            }
+
+            int flags = 0; // records which optional fields are written after the fixed header
+            if (!("*".Equals(baseForm) || baseForm.Equals(entry[0]))) // base form is stored only when it is not "*" and differs from entry[0]
+            {
+                flags |= BinaryDictionary.HAS_BASEFORM;
+            }
+            if (!reading.Equals(ToKatakana(entry[0]))) // reading is stored only when it is not just entry[0] converted by ToKatakana
+            {
+                flags |= BinaryDictionary.HAS_READING;
+            }
+            if (!pronunciation.Equals(reading)) // pronunciation is stored only when it differs from the reading
+            {
+                flags |= BinaryDictionary.HAS_PRONUNCIATION;
+            }
+
+            Debug.Assert(leftId == rightId);
+            Debug.Assert(leftId < 4096); // there are still unused bits
+                                         // add pos mapping
+            int toFill = 1 + leftId - posDict.Count; // number of null placeholders needed so that index leftId exists
+            for (int i = 0; i < toFill; i++)
+            {
+                posDict.Add(null);
+            }
+
+            string existing = posDict[leftId];
+            Debug.Assert(existing == null || existing.Equals(fullPOSData));
+            posDict[leftId] = fullPOSData;
+
+            m_buffer.PutInt16((short)(leftId << 3 | flags)); // left id shifted left by three bits, OR-ed with the flags
+            m_buffer.PutInt16(wordCost);
+
+            if ((flags & BinaryDictionary.HAS_BASEFORM) != 0)
+            {
+                Debug.Assert(baseForm.Length < 16);
+                int shared = SharedPrefix(entry[0], baseForm);
+                int suffix = baseForm.Length - shared;
+                m_buffer.Put((byte)(shared << 4 | suffix)); // high nibble: shared prefix length, low nibble: remaining suffix length
+                for (int i = shared; i < baseForm.Length; i++)
+                {
+                    m_buffer.PutChar(baseForm[i]);
+                }
+            }
+
+            if ((flags & BinaryDictionary.HAS_READING) != 0)
+            {
+                if (IsKatakana(reading))
+                {
+                    m_buffer.Put((byte)(reading.Length << 1 | 1)); // length shifted left by one; low bit set marks the one-byte-per-char katakana form
+                    WriteKatakana(reading);
+                }
+                else
+                {
+                    m_buffer.Put((byte)(reading.Length << 1)); // low bit clear: the characters follow via PutChar
+                    for (int i = 0; i < reading.Length; i++)
+                    {
+                        m_buffer.PutChar(reading[i]);
+                    }
+                }
+            }
+
+            if ((flags & BinaryDictionary.HAS_PRONUNCIATION) != 0)
+            {
+                // we can save 150KB here, but it makes the reader a little 
complicated.
+                // int shared = sharedPrefix(reading, pronunciation);
+                // buffer.put((byte) shared);
+                // pronunciation = pronunciation.substring(shared);
+                if (IsKatakana(pronunciation))
+                {
+                    m_buffer.Put((byte)(pronunciation.Length << 1 | 1)); // same length/flag byte layout as for the reading
+                    WriteKatakana(pronunciation);
+                }
+                else
+                {
+                    m_buffer.Put((byte)(pronunciation.Length << 1));
+                    for (int i = 0; i < pronunciation.Length; i++)
+                    {
+                        m_buffer.PutChar(pronunciation[i]);
+                    }
+                }
+            }
+
+            return m_buffer.Position;
+        }
+
+        /// <summary>
+        /// Tests whether every character of <paramref name="s"/> lies in the range U+30A0..U+30FF.
+        /// </summary>
+        /// <param name="s">The string to test.</param>
+        /// <returns><c>true</c> if no character falls outside that range (so also for an empty string); otherwise <c>false</c>.</returns>
+        private bool IsKatakana(string s)
+        {
+            foreach (char c in s)
+            {
+                bool inRange = c >= 0x30A0 && c <= 0x30FF;
+                if (!inRange)
+                {
+                    return false;
+                }
+            }
+            return true;
+        }
+
+        /// <summary>
+        /// Appends each character of <paramref name="s"/> to the buffer as a single byte
+        /// holding its offset from U+30A0.
+        /// </summary>
+        /// <param name="s">The string to write; callers in this class check it with IsKatakana first.</param>
+        private void WriteKatakana(string s)
+        {
+            foreach (char c in s)
+            {
+                int offset = c - 0x30A0;
+                m_buffer.Put((byte)offset);
+            }
+        }
+
+        /// <summary>
+        /// Returns a copy of <paramref name="s"/> in which every character strictly between
+        /// U+3040 and U+3097 is shifted up by 0x60; all other characters are kept unchanged.
+        /// </summary>
+        /// <param name="s">The string to convert.</param>
+        /// <returns>The converted string, of the same length as <paramref name="s"/>.</returns>
+        private string ToKatakana(string s)
+        {
+            char[] converted = s.ToCharArray();
+            for (int index = 0; index < converted.Length; index++)
+            {
+                char c = converted[index];
+                if (c > 0x3040 && c < 0x3097)
+                {
+                    converted[index] = (char)(c + 0x60);
+                }
+            }
+            return new string(converted);
+        }
+
+        /// <summary>
+        /// Counts how many leading characters <paramref name="left"/> and <paramref name="right"/> have in common.
+        /// </summary>
+        /// <param name="left">First string.</param>
+        /// <param name="right">Second string.</param>
+        /// <returns>The index of the first differing character, or the length of the shorter string if there is none.</returns>
+        public static int SharedPrefix(string left, string right)
+        {
+            int limit = Math.Min(left.Length, right.Length);
+            int index = 0;
+            while (index < limit && left[index] == right[index])
+            {
+                index++;
+            }
+            return index;
+        }
+
+        /// <summary>
+        /// Appends <paramref name="wordId"/> to the target map and, when <paramref name="sourceId"/>
+        /// is new, records the current end of the target map as the offset for every source id
+        /// since the previous one.
+        /// </summary>
+        /// <param name="sourceId">Source id; asserted to be equal to or greater than the previous one.</param>
+        /// <param name="wordId">Word id; asserted to be greater than the previous one.</param>
+        public virtual void AddMapping(int sourceId, int wordId)
+        {
+            Debug.Assert(wordId > lastWordId, "words out of order: " + wordId + " vs lastID: " + lastWordId);
+
+            if (sourceId <= lastSourceId)
+            {
+                Debug.Assert(sourceId == lastSourceId);
+            }
+            else
+            {
+                Debug.Assert(sourceId > lastSourceId, "source ids out of order: lastSourceId=" + lastSourceId + " vs sourceId=" + sourceId);
+                targetMapOffsets = ArrayUtil.Grow(targetMapOffsets, sourceId + 1);
+                int slot = lastSourceId + 1;
+                while (slot <= sourceId)
+                {
+                    targetMapOffsets[slot] = targetMapEndOffset;
+                    slot++;
+                }
+            }
+
+            targetMap = ArrayUtil.Grow(targetMap, targetMapEndOffset + 1);
+            targetMap[targetMapEndOffset++] = wordId;
+
+            lastWordId = wordId;
+            lastSourceId = sourceId;
+        }
+
+        /// <summary>
+        /// Builds the base path of the output files: <paramref name="baseDir"/>, a directory
+        /// separator, then the implementation type's full name with each '.' replaced by
+        /// the directory separator.
+        /// </summary>
+        /// <param name="baseDir">Directory under which the files are placed.</param>
+        /// <returns>The base path, without any file name suffix.</returns>
+        protected string GetBaseFileName(string baseDir)
+        {
+            char separator = System.IO.Path.DirectorySeparatorChar;
+            string relativePath = m_implClazz.FullName.Replace('.', separator);
+            return baseDir + separator + relativePath;
+        }
+
+        /// <summary>
+        /// Writes the dictionary, the target map and the POS dictionary as three files
+        /// sharing the base name returned by <see cref="GetBaseFileName(string)"/>.
+        /// </summary>
+        /// <remarks>
+        /// NOTE(review): the comment carried over from the original source described the dictionary layout as
+        /// [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...].....
+        /// That does not obviously match what <see cref="Put(string[])"/> writes - confirm before relying on it.
+        /// </remarks>
+        /// <param name="baseDir">Directory under which the files are written.</param>
+        /// <exception cref="IOException">If an I/O error occurs writing the dictionary files.</exception>
+        public virtual void Write(string baseDir)
+        {
+            string prefix = GetBaseFileName(baseDir);
+            string dictionaryFile = prefix + BinaryDictionary.DICT_FILENAME_SUFFIX;
+            string targetMapFile = prefix + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX;
+            string posDictFile = prefix + BinaryDictionary.POSDICT_FILENAME_SUFFIX;
+
+            WriteDictionary(dictionaryFile);
+            WriteTargetMap(targetMapFile);
+            WritePosDict(posDictFile);
+        }
+
+        // TODO: maybe this int[] should instead be the output to the FST...
+        /// <summary>
+        /// Writes the target map to <paramref name="filename"/>: a codec header, the two array
+        /// sizes, then one VInt32 per word id holding its delta from the previous word id
+        /// shifted left by one, with the low bit set on the first word id of each source id.
+        /// </summary>
+        /// <param name="filename">Path of the target map file; its parent directory is created first.</param>
+        protected virtual void WriteTargetMap(string filename)
+        {
+            string directory = System.IO.Path.GetDirectoryName(filename);
+            System.IO.Directory.CreateDirectory(directory);
+            using (Stream stream = new FileStream(filename, FileMode.Create, FileAccess.Write))
+            {
+                DataOutput output = new OutputStreamDataOutput(stream);
+                CodecUtil.WriteHeader(output, BinaryDictionary.TARGETMAP_HEADER, BinaryDictionary.VERSION);
+
+                int numSourceIds = lastSourceId + 1;
+                output.WriteVInt32(targetMapEndOffset); // size of main array
+                output.WriteVInt32(numSourceIds + 1); // size of offset array (+ 1 more entry)
+
+                int previous = 0;
+                int sourceId = 0;
+                for (int index = 0; index < targetMapEndOffset; index++)
+                {
+                    int delta = targetMap[index] - previous;
+                    Debug.Assert(delta >= 0);
+
+                    int encoded = delta << 1;
+                    if (index == targetMapOffsets[sourceId])
+                    {
+                        // this word id starts the run belonging to the next source id
+                        encoded |= 0x01;
+                        sourceId++;
+                    }
+                    output.WriteVInt32(encoded);
+                    previous += delta;
+                }
+                Debug.Assert(sourceId == numSourceIds, "sourceId:" + sourceId + " != numSourceIds:" + numSourceIds);
+            }
+        }
+
+        /// <summary>
+        /// Writes the POS dictionary to <paramref name="filename"/>: a codec header, the number
+        /// of slots, then for each slot either its three CSV fields as strings or, for an
+        /// unused (null) slot, three zero bytes.
+        /// </summary>
+        /// <param name="filename">Path of the POS dictionary file; its parent directory is created first.</param>
+        protected virtual void WritePosDict(string filename)
+        {
+            string directory = System.IO.Path.GetDirectoryName(filename);
+            System.IO.Directory.CreateDirectory(directory);
+            using (Stream stream = new FileStream(filename, FileMode.Create, FileAccess.Write))
+            {
+                DataOutput output = new OutputStreamDataOutput(stream);
+                CodecUtil.WriteHeader(output, BinaryDictionary.POSDICT_HEADER, BinaryDictionary.VERSION);
+                output.WriteVInt32(posDict.Count);
+                foreach (string posEntry in posDict)
+                {
+                    if (posEntry != null)
+                    {
+                        string[] fields = CSVUtil.Parse(posEntry);
+                        Debug.Assert(fields.Length == 3, "malformed pos/inflection: " + posEntry);
+                        output.WriteString(fields[0]);
+                        output.WriteString(fields[1]);
+                        output.WriteString(fields[2]);
+                        continue;
+                    }
+
+                    // unused slot: one zero byte in place of each of the three strings
+                    for (int k = 0; k < 3; k++)
+                    {
+                        output.WriteByte((byte)0);
+                    }
+                }
+            }
+        }
+
+        /// <summary>
+        /// Writes the encoded entry buffer to <paramref name="filename"/>: a codec header,
+        /// the number of buffered bytes as a VInt32, then the bytes themselves.
+        /// </summary>
+        /// <param name="filename">Path of the dictionary file; its parent directory is created first.</param>
+        protected virtual void WriteDictionary(string filename)
+        {
+            System.IO.Directory.CreateDirectory(System.IO.Path.GetDirectoryName(filename));
+            using (Stream os = new FileStream(filename, FileMode.Create, FileAccess.Write))
+            {
+                DataOutput @out = new OutputStreamDataOutput(os);
+                CodecUtil.WriteHeader(@out, BinaryDictionary.DICT_HEADER, BinaryDictionary.VERSION);
+                @out.WriteVInt32(m_buffer.Position);
+
+                // Switch the buffer to reading: position becomes 0 and the limit
+                // becomes the old position.
+                m_buffer.Flip();
+
+                // Drain the buffer with relative reads. Unlike copying from the backing
+                // array, this advances the buffer position (so the assertion below
+                // holds) and does not depend on where the buffer's data starts in
+                // that array.
+                while (m_buffer.Remaining > 0)
+                {
+                    @out.WriteByte(m_buffer.Get());
+                }
+
+                Debug.Assert(m_buffer.Remaining == 0L);
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Tools/CharacterDefinitionWriter.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Analysis.Kuromoji/Tools/CharacterDefinitionWriter.cs 
b/src/Lucene.Net.Analysis.Kuromoji/Tools/CharacterDefinitionWriter.cs
new file mode 100644
index 0000000..6d5526d
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/Tools/CharacterDefinitionWriter.cs
@@ -0,0 +1,91 @@
+ï»¿using Lucene.Net.Analysis.Ja.Dict;
+using Lucene.Net.Codecs;
+using Lucene.Net.Store;
+using Lucene.Net.Support;
+using System;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Ja.Util
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Collects the code unit to character class mapping and the per-class invoke/group flags,
+    /// and writes them to a binary file.
+    /// </summary>
+    public sealed class CharacterDefinitionWriter
+    {
+        // One character class id per index in the range 0x0000-0xFFFF.
+        private readonly byte[] characterCategoryMap = new byte[0x10000];
+
+        private readonly bool[] invokeMap = new bool[CharacterDefinition.CLASS_COUNT];
+        private readonly bool[] groupMap = new bool[CharacterDefinition.CLASS_COUNT];
+
+        /// <summary>
+        /// Constructor for building. TODO: remove write access
+        /// </summary>
+        public CharacterDefinitionWriter()
+        {
+            // Every code point starts out in the default class until a mapping is put.
+            Arrays.Fill(characterCategoryMap, CharacterDefinition.DEFAULT);
+        }
+
+        /// <summary>
+        /// Put mapping from unicode code point to character class.
+        /// </summary>
+        /// <param name="codePoint">Code point; used directly as an index into the 0x10000-entry map.</param>
+        /// <param name="characterClassName">Character class name. When several space separated names are given, only the first is used.</param>
+        public void PutCharacterCategory(int codePoint, string characterClassName)
+        {
+            // use first category class
+            characterClassName = characterClassName.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries)[0];
+
+            // Override Nakaguro
+            if (codePoint == 0x30FB)
+            {
+                characterClassName = "SYMBOL";
+            }
+            characterCategoryMap[codePoint] = CharacterDefinition.LookupCharacterClass(characterClassName);
+        }
+
+        /// <summary>
+        /// Records the invoke and group flags of a character class.
+        /// </summary>
+        /// <param name="characterClassName">Character class name.</param>
+        /// <param name="invoke">The invoke flag is set when this equals 1.</param>
+        /// <param name="group">The group flag is set when this equals 1.</param>
+        /// <param name="length">Currently ignored.</param>
+        public void PutInvokeDefinition(string characterClassName, int invoke, int group, int length)
+        {
+            byte characterClass = CharacterDefinition.LookupCharacterClass(characterClassName);
+            invokeMap[characterClass] = invoke == 1;
+            groupMap[characterClass] = group == 1;
+            // TODO: length def ignored
+        }
+
+        /// <summary>
+        /// Writes the character definition below <paramref name="baseDir"/>: a codec header, the
+        /// complete category map, then one flag byte per character class
+        /// (bit 0 = invoke, bit 1 = group).
+        /// </summary>
+        /// <param name="baseDir">Output base directory. The file path is derived from the full type name of <see cref="CharacterDefinition"/>.</param>
+        public void Write(string baseDir)
+        {
+            string filename = baseDir + System.IO.Path.DirectorySeparatorChar +
+                typeof(CharacterDefinition).FullName.Replace('.', System.IO.Path.DirectorySeparatorChar) + CharacterDefinition.FILENAME_SUFFIX;
+            //new File(filename).getParentFile().mkdirs();
+            // Create the directory that will contain the output file itself; creating only the
+            // parent of baseDir would leave the FileStream below without a target directory.
+            System.IO.Directory.CreateDirectory(System.IO.Path.GetDirectoryName(filename));
+            using (Stream os = new FileStream(filename, FileMode.Create, FileAccess.Write))
+            {
+                DataOutput @out = new OutputStreamDataOutput(os);
+                CodecUtil.WriteHeader(@out, CharacterDefinition.HEADER, CharacterDefinition.VERSION);
+                @out.WriteBytes(characterCategoryMap, 0, characterCategoryMap.Length);
+                for (int i = 0; i < CharacterDefinition.CLASS_COUNT; i++)
+                {
+                    byte b = (byte)(
+                      (invokeMap[i] ? 0x01 : 0x00) |
+                      (groupMap[i] ? 0x02 : 0x00)
+                    );
+                    @out.WriteByte(b);
+                }
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsBuilder.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsBuilder.cs 
b/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsBuilder.cs
new file mode 100644
index 0000000..235c523
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsBuilder.cs
@@ -0,0 +1,68 @@
+ï»¿using System.Diagnostics;
+using System.Globalization;
+using System.IO;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Analysis.Ja.Util
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Reads a whitespace separated connection cost text file into a <see cref="ConnectionCostsWriter"/>.
+    /// </summary>
+    public class ConnectionCostsBuilder
+    {
+        private static readonly Regex whiteSpaceRegex = new Regex("\\s+", RegexOptions.Compiled);
+
+        private ConnectionCostsBuilder()
+        {
+        }
+
+        /// <summary>
+        /// Parses the given file. The first line holds the forward and backward sizes; every
+        /// following line holds a forward id, a backward id and a cost.
+        /// </summary>
+        /// <param name="filename">Path of the file to read (decoded as ASCII).</param>
+        /// <returns>A writer populated with all costs found in the file.</returns>
+        public static ConnectionCostsWriter Build(string filename)
+        {
+            using (Stream input = new FileStream(filename, FileMode.Open, FileAccess.Read))
+            {
+                StreamReader reader = new StreamReader(input, Encoding.ASCII);
+
+                // Header line: "<forwardSize> <backwardSize>"
+                string headerLine = reader.ReadLine();
+                string[] header = whiteSpaceRegex.Split(headerLine);
+                Debug.Assert(header.Length == 2);
+
+                int forwardSize = int.Parse(header[0], CultureInfo.InvariantCulture);
+                int backwardSize = int.Parse(header[1], CultureInfo.InvariantCulture);
+                Debug.Assert(forwardSize > 0 && backwardSize > 0);
+
+                ConnectionCostsWriter result = new ConnectionCostsWriter(forwardSize, backwardSize);
+
+                // Body lines: "<forwardId> <backwardId> <cost>"
+                for (string row = reader.ReadLine(); row != null; row = reader.ReadLine())
+                {
+                    string[] columns = whiteSpaceRegex.Split(row);
+                    Debug.Assert(columns.Length == 3);
+
+                    result.Add(
+                        int.Parse(columns[0], CultureInfo.InvariantCulture),
+                        int.Parse(columns[1], CultureInfo.InvariantCulture),
+                        int.Parse(columns[2], CultureInfo.InvariantCulture));
+                }
+
+                return result;
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsWriter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsWriter.cs 
b/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsWriter.cs
new file mode 100644
index 0000000..bd1376e
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsWriter.cs
@@ -0,0 +1,74 @@
+ï»¿using Lucene.Net.Analysis.Ja.Dict;
+using Lucene.Net.Codecs;
+using Lucene.Net.Store;
+using System.Diagnostics;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Ja.Util
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Holds a matrix of connection costs and writes it to a binary file.
+    /// </summary>
+    public sealed class ConnectionCostsWriter
+    {
+        // Indexed as [backwardId][forwardId]: backward IDs come first since get is called
+        // using the same backward ID consecutively. maybe doesn't matter.
+        private readonly short[][] costs;
+        private readonly int forwardSize;
+        private readonly int backwardSize;
+
+        /// <summary>
+        /// Constructor for building. TODO: remove write access
+        /// </summary>
+        /// <param name="forwardSize">Number of forward ids (length of each row).</param>
+        /// <param name="backwardSize">Number of backward ids (number of rows).</param>
+        public ConnectionCostsWriter(int forwardSize, int backwardSize)
+        {
+            //this.costs = new short[backwardSize][forwardSize];
+            this.costs = Support.RectangularArrays.ReturnRectangularArray<short>(backwardSize, forwardSize);
+            this.forwardSize = forwardSize;
+            this.backwardSize = backwardSize;
+        }
+
+        /// <summary>
+        /// Stores one cost, truncated to a <see cref="short"/>.
+        /// </summary>
+        public void Add(int forwardId, int backwardId, int cost)
+        {
+            short[] row = this.costs[backwardId];
+            row[forwardId] = (short)cost;
+        }
+
+        /// <summary>
+        /// Writes the matrix below <paramref name="baseDir"/>: a codec header, both sizes, then
+        /// every cost as the zig-zag encoded difference to the previously written cost.
+        /// </summary>
+        /// <param name="baseDir">Output base directory. The file path is derived from the full type name of <see cref="ConnectionCosts"/>.</param>
+        public void Write(string baseDir)
+        {
+            string relativeName = typeof(ConnectionCosts).FullName.Replace('.', System.IO.Path.DirectorySeparatorChar)
+                + ConnectionCosts.FILENAME_SUFFIX;
+            string filename = baseDir + System.IO.Path.DirectorySeparatorChar + relativeName;
+
+            //new File(filename).getParentFile().mkdirs();
+            System.IO.Directory.CreateDirectory(System.IO.Path.GetDirectoryName(filename));
+            using (Stream os = new FileStream(filename, FileMode.Create, FileAccess.Write))
+            {
+                DataOutput @out = new OutputStreamDataOutput(os);
+                CodecUtil.WriteHeader(@out, ConnectionCosts.HEADER, ConnectionCosts.VERSION);
+                @out.WriteVInt32(forwardSize);
+                @out.WriteVInt32(backwardSize);
+
+                int previous = 0;
+                Debug.Assert(costs.Length == backwardSize);
+                for (int backwardId = 0; backwardId < costs.Length; backwardId++)
+                {
+                    short[] row = costs[backwardId];
+                    Debug.Assert(row.Length == forwardSize);
+                    for (int forwardId = 0; forwardId < row.Length; forwardId++)
+                    {
+                        int current = row[forwardId];
+                        int delta = current - previous;
+                        // zig-zag encode so that small negative deltas stay small
+                        @out.WriteVInt32((delta >> 31) ^ (delta << 1));
+                        previous = current;
+                    }
+                }
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Tools/DictionaryBuilder.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Tools/DictionaryBuilder.cs 
b/src/Lucene.Net.Analysis.Kuromoji/Tools/DictionaryBuilder.cs
new file mode 100644
index 0000000..c105039
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/Tools/DictionaryBuilder.cs
@@ -0,0 +1,92 @@
+ï»¿using System;
+
+namespace Lucene.Net.Analysis.Ja.Util
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Command line entry point that builds the token info dictionary, the unknown word
+    /// dictionary and the connection costs from a dictionary source directory.
+    /// </summary>
+    public class DictionaryBuilder
+    {
+        /// <summary>Supported source dictionary formats.</summary>
+        public enum DictionaryFormat { IPADIC, UNIDIC }
+
+        private DictionaryBuilder()
+        {
+        }
+
+        /// <summary>
+        /// Builds all three dictionary parts and writes them to <paramref name="outputDirname"/>,
+        /// printing progress to the console.
+        /// </summary>
+        /// <param name="format">Format of the source dictionary.</param>
+        /// <param name="inputDirname">Directory holding the source files; <c>matrix.def</c> is read from it for the connection costs.</param>
+        /// <param name="outputDirname">Directory the binary files are written to.</param>
+        /// <param name="encoding">Name of the text encoding of the source files.</param>
+        /// <param name="normalizeEntry">Passed to the token info builder to enable entry normalization.</param>
+        public static void Build(DictionaryFormat format,
+            string inputDirname,
+            string outputDirname,
+            string encoding,
+            bool normalizeEntry)
+        {
+            Console.WriteLine("building tokeninfo dict...");
+            TokenInfoDictionaryBuilder tokenInfoBuilder = new TokenInfoDictionaryBuilder(format, encoding, normalizeEntry);
+            TokenInfoDictionaryWriter tokenInfoDictionary = tokenInfoBuilder.Build(inputDirname);
+            tokenInfoDictionary.Write(outputDirname);
+            tokenInfoDictionary = null;
+            tokenInfoBuilder = null;
+            Console.WriteLine("done");
+
+            Console.WriteLine("building unknown word dict...");
+            UnknownDictionaryBuilder unkBuilder = new UnknownDictionaryBuilder(encoding);
+            UnknownDictionaryWriter unkDictionary = unkBuilder.Build(inputDirname);
+            unkDictionary.Write(outputDirname);
+            unkDictionary = null;
+            unkBuilder = null;
+            Console.WriteLine("done");
+
+            Console.WriteLine("building connection costs...");
+            ConnectionCostsWriter connectionCosts
+                = ConnectionCostsBuilder.Build(inputDirname + System.IO.Path.DirectorySeparatorChar + "matrix.def");
+            connectionCosts.Write(outputDirname);
+            Console.WriteLine("done");
+        }
+
+        /// <summary>
+        /// Expects five arguments: format (<c>ipadic</c> or <c>unidic</c>, case-insensitive),
+        /// input directory, output directory, input encoding and a boolean for entry normalization.
+        /// </summary>
+        public static void Main(string[] args)
+        {
+            DictionaryFormat format;
+            if (args[0].Equals("ipadic", StringComparison.OrdinalIgnoreCase))
+            {
+                format = DictionaryFormat.IPADIC;
+            }
+            else if (args[0].Equals("unidic", StringComparison.OrdinalIgnoreCase))
+            {
+                format = DictionaryFormat.UNIDIC;
+            }
+            else
+            {
+                // The fallback really is IPADIC, so the message must say so.
+                Console.Error.WriteLine("Illegal format " + args[0] + " using ipadic instead");
+                format = DictionaryFormat.IPADIC;
+            }
+
+            string inputDirname = args[1];
+            string outputDirname = args[2];
+            string inputEncoding = args[3];
+            bool normalizeEntries = bool.Parse(args[4]);
+
+            Console.WriteLine("dictionary builder");
+            Console.WriteLine();
+            Console.WriteLine("dictionary format: " + format);
+            Console.WriteLine("input directory: " + inputDirname);
+            Console.WriteLine("output directory: " + outputDirname);
+            Console.WriteLine("input encoding: " + inputEncoding);
+            Console.WriteLine("normalize entries: " + normalizeEntries);
+            Console.WriteLine();
+            DictionaryBuilder.Build(format, inputDirname, outputDirname, inputEncoding, normalizeEntries);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Tools/TokenInfoDictionaryBuilder.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Analysis.Kuromoji/Tools/TokenInfoDictionaryBuilder.cs 
b/src/Lucene.Net.Analysis.Kuromoji/Tools/TokenInfoDictionaryBuilder.cs
new file mode 100644
index 0000000..0406083
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/Tools/TokenInfoDictionaryBuilder.cs
@@ -0,0 +1,230 @@
+ï»¿using Lucene.Net.Support;
+using Lucene.Net.Util;
+using Lucene.Net.Util.Fst;
+using Lucene.Net.Util.Packed;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Text;
+
+namespace Lucene.Net.Analysis.Ja.Util
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Builds a <see cref="TokenInfoDictionaryWriter"/> (dictionary entries plus an FST over
+    /// their surface forms) from the CSV files of a dictionary source directory.
+    /// </summary>
+    public class TokenInfoDictionaryBuilder
+    {
+        /// <summary>Internal word id - incrementally assigned as entries are read and added. This will be byte offset of dictionary file</summary>
+        private int offset = 0;
+
+        private string encoding = "euc-jp";
+
+        private bool normalizeEntries = false;
+        //private Normalizer2 normalizer;
+
+        private DictionaryBuilder.DictionaryFormat format = DictionaryBuilder.DictionaryFormat.IPADIC;
+
+        /// <summary>
+        /// Creates a builder.
+        /// </summary>
+        /// <param name="format">Format of the source CSV files; decides how <see cref="FormatEntry(string[])"/> rearranges fields.</param>
+        /// <param name="encoding">Name of the text encoding used to read the CSV files.</param>
+        /// <param name="normalizeEntries">When true, an NFKC-normalized copy is added for every entry whose surface form is not already normalized.</param>
+        public TokenInfoDictionaryBuilder(DictionaryBuilder.DictionaryFormat format, string encoding, bool normalizeEntries)
+        {
+            this.format = format;
+            this.encoding = encoding;
+            this.normalizeEntries = normalizeEntries;
+            //this.normalizer = normalizeEntries ? Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE) : null;
+        }
+
+        /// <summary>
+        /// Builds the dictionary from every <c>*.csv</c> file directly inside <paramref name="dirname"/>.
+        /// </summary>
+        public virtual TokenInfoDictionaryWriter Build(string dirname)
+        {
+            List<string> csvFiles = new List<string>();
+            foreach (FileInfo file in new DirectoryInfo(dirname).EnumerateFiles("*.csv"))
+            {
+                csvFiles.Add(file.FullName);
+            }
+            // Ordinal comparison keeps the file order independent of the current culture.
+            csvFiles.Sort(StringComparer.Ordinal);
+            return BuildDictionary(csvFiles);
+        }
+
+        /// <summary>
+        /// Parses the given CSV files, sorts all entries by surface form and adds them to a new
+        /// dictionary writer together with an FST mapping surface forms to ordinals.
+        /// </summary>
+        /// <param name="csvFiles">Paths of the CSV files, in the order they should be read.</param>
+        /// <returns>The populated dictionary writer.</returns>
+        public virtual TokenInfoDictionaryWriter BuildDictionary(IList<string> csvFiles)
+        {
+            TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);
+
+            // all lines in the file
+            Console.WriteLine("  parse...");
+            List<string[]> lines = new List<string[]>(400000);
+            foreach (string file in csvFiles)
+            {
+                using (Stream inputStream = new FileStream(file, FileMode.Open, FileAccess.Read))
+                using (TextReader reader = new StreamReader(inputStream, Encoding.GetEncoding(encoding)))
+                {
+                    string line = null;
+                    while ((line = reader.ReadLine()) != null)
+                    {
+                        string[] entry = CSVUtil.Parse(line);
+
+                        if (entry.Length < 13)
+                        {
+                            Console.WriteLine("Entry in CSV is not valid: " + line);
+                            continue;
+                        }
+
+                        string[] formatted = FormatEntry(entry);
+                        lines.Add(formatted);
+
+                        // NFKC normalize dictionary entry
+                        if (normalizeEntries)
+                        {
+                            //if (normalizer.isNormalized(entry[0])){
+                            if (entry[0].IsNormalized(NormalizationForm.FormKC))
+                            {
+                                continue;
+                            }
+                            string[] normalizedEntry = new string[entry.Length];
+                            for (int i = 0; i < entry.Length; i++)
+                            {
+                                //normalizedEntry[i] = normalizer.normalize(entry[i]);
+                                normalizedEntry[i] = entry[i].Normalize(NormalizationForm.FormKC);
+                            }
+
+                            formatted = FormatEntry(normalizedEntry);
+                            lines.Add(formatted);
+                        }
+                    }
+                }
+            }
+
+            Console.WriteLine("  sort...");
+
+            // sort by term: we sorted the files already and use a stable sort.
+            StableSortByTerm(lines);
+
+            Console.WriteLine("  encode...");
+
+            PositiveInt32Outputs fstOutput = PositiveInt32Outputs.Singleton;
+            Builder<long?> fstBuilder = new Builder<long?>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2, 0, 0, true, true, int.MaxValue, fstOutput, null, true, PackedInt32s.DEFAULT, true, 15);
+            Int32sRef scratch = new Int32sRef();
+            long ord = -1; // first ord will be 0
+            string lastValue = null;
+
+            // build tokeninfo dictionary
+            foreach (string[] entry in lines)
+            {
+                int next = dictionary.Put(entry);
+
+                // An unchanged offset means nothing was written for this entry.
+                if (next == offset)
+                {
+                    Console.WriteLine("Failed to process line: " + Collections.ToString(entry));
+                    continue;
+                }
+
+                string token = entry[0];
+                if (!token.Equals(lastValue, StringComparison.Ordinal))
+                {
+                    // new word to add to fst
+                    ord++;
+                    lastValue = token;
+                    scratch.Grow(token.Length);
+                    scratch.Length = token.Length;
+                    for (int i = 0; i < token.Length; i++)
+                    {
+                        scratch.Int32s[i] = (int)token[i];
+                    }
+                    fstBuilder.Add(scratch, ord);
+                }
+                dictionary.AddMapping((int)ord, offset);
+                offset = next;
+            }
+
+            FST<long?> fst = fstBuilder.Finish();
+
+            Console.WriteLine("  " + fst.NodeCount + " nodes, " + fst.ArcCount + " arcs, " + fst.GetSizeInBytes() + " bytes...  ");
+            dictionary.SetFST(fst);
+            Console.WriteLine(" done");
+
+            return dictionary;
+        }
+
+        /// <summary>
+        /// Sorts <paramref name="lines"/> by surface form (element 0, ordinal comparison) while
+        /// keeping entries with equal surface forms in their original relative order.
+        /// </summary>
+        private static void StableSortByTerm(List<string[]> lines)
+        {
+            // List<T>.Sort is not stable, so each entry carries its original index as a tie-breaker.
+            List<KeyValuePair<int, string[]>> indexed = new List<KeyValuePair<int, string[]>>(lines.Count);
+            for (int i = 0; i < lines.Count; i++)
+            {
+                indexed.Add(new KeyValuePair<int, string[]>(i, lines[i]));
+            }
+
+            indexed.Sort(new StableTermComparer());
+
+            for (int i = 0; i < indexed.Count; i++)
+            {
+                lines[i] = indexed[i].Value;
+            }
+        }
+
+        /// <summary>Orders by surface form first and by original index second.</summary>
+        private class StableTermComparer : IComparer<KeyValuePair<int, string[]>>
+        {
+            public int Compare(KeyValuePair<int, string[]> left, KeyValuePair<int, string[]> right)
+            {
+                int byTerm = left.Value[0].CompareToOrdinal(right.Value[0]);
+                return byTerm != 0 ? byTerm : left.Key.CompareTo(right.Key);
+            }
+        }
+
+        /// <summary>
+        /// Converts a parsed CSV entry into the 13-field IPADIC layout. IPADIC entries are
+        /// returned unchanged; for any other format a new array is built from the UniDic layout.
+        /// <para/>
+        /// IPADIC features
+        /// 
+        /// 0   - surface
+        /// 1   - left cost
+        /// 2   - right cost
+        /// 3   - word cost
+        /// 4-9 - pos
+        /// 10  - base form
+        /// 11  - reading
+        /// 12  - pronunciation
+        /// 
+        /// UniDic features
+        /// 
+        /// 0   - surface
+        /// 1   - left cost
+        /// 2   - right cost
+        /// 3   - word cost
+        /// 4-9 - pos
+        /// 10  - base form reading
+        /// 11  - base form
+        /// 12  - surface form
+        /// 13  - surface reading
+        /// </summary>
+        /// <param name="features">Fields of one CSV entry. The non-IPADIC path reads index 13, so at least 14 fields are needed there.</param>
+        /// <returns>The entry in IPADIC field order.</returns>
+        public virtual string[] FormatEntry(string[] features)
+        {
+            if (this.format == DictionaryBuilder.DictionaryFormat.IPADIC)
+            {
+                return features;
+            }
+            else
+            {
+                string[] features2 = new string[13];
+                features2[0] = features[0];
+                features2[1] = features[1];
+                features2[2] = features[2];
+                features2[3] = features[3];
+                features2[4] = features[4];
+                features2[5] = features[5];
+                features2[6] = features[6];
+                features2[7] = features[7];
+                features2[8] = features[8];
+                features2[9] = features[9];
+                features2[10] = features[11];
+
+                // If the surface reading is non-existent, use surface form for reading and pronunciation.
+                // This happens with punctuation in UniDic and there are possibly other cases as well
+                if (features[13].Length == 0)
+                {
+                    features2[11] = features[0];
+                    features2[12] = features[0];
+                }
+                else
+                {
+                    features2[11] = features[13];
+                    features2[12] = features[13];
+                }
+                return features2;
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Tools/TokenInfoDictionaryWriter.cs
----------------------------------------------------------------------
diff --git 
a/src/Lucene.Net.Analysis.Kuromoji/Tools/TokenInfoDictionaryWriter.cs 
b/src/Lucene.Net.Analysis.Kuromoji/Tools/TokenInfoDictionaryWriter.cs
new file mode 100644
index 0000000..42338f9
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/Tools/TokenInfoDictionaryWriter.cs
@@ -0,0 +1,51 @@
+ï»¿using Lucene.Net.Analysis.Ja.Dict;
+using Lucene.Net.Util.Fst;
+using System.IO;
+
+namespace Lucene.Net.Analysis.Ja.Util
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Dictionary writer that, in addition to the base dictionary files, saves an FST to a
+    /// separate file.
+    /// </summary>
+    public class TokenInfoDictionaryWriter : BinaryDictionaryWriter
+    {
+        private FST<long?> fst;
+
+        /// <summary>
+        /// Creates a writer for <see cref="TokenInfoDictionary"/>.
+        /// </summary>
+        /// <param name="size">Forwarded unchanged to the base class constructor.</param>
+        public TokenInfoDictionaryWriter(int size)
+            : base(typeof(TokenInfoDictionary), size)
+        {
+        }
+
+        /// <summary>
+        /// Sets the FST that <see cref="Write(string)"/> will save.
+        /// </summary>
+        public virtual void SetFST(FST<long?> fst)
+        {
+            this.fst = fst;
+        }
+
+        /// <summary>
+        /// Writes the base dictionary files first, then the FST file next to them.
+        /// </summary>
+        public override void Write(string baseDir)
+        {
+            base.Write(baseDir);
+
+            string fstFilename = GetBaseFileName(baseDir) + TokenInfoDictionary.FST_FILENAME_SUFFIX;
+            WriteFST(fstFilename);
+        }
+
+        /// <summary>
+        /// Saves the FST to <paramref name="filename"/>, creating the parent directory when it
+        /// does not exist yet.
+        /// </summary>
+        protected virtual void WriteFST(string filename)
+        {
+            FileInfo target = new FileInfo(filename);
+            DirectoryInfo parent = target.Directory;
+            if (!parent.Exists)
+            {
+                parent.Create();
+            }
+            fst.Save(target);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Tools/UnknownDictionaryBuilder.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Tools/UnknownDictionaryBuilder.cs 
b/src/Lucene.Net.Analysis.Kuromoji/Tools/UnknownDictionaryBuilder.cs
new file mode 100644
index 0000000..feff4fe
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/Tools/UnknownDictionaryBuilder.cs
@@ -0,0 +1,146 @@
+ï»¿using Lucene.Net.Analysis.Ja.Dict;
+using System;
+using System.Collections.Generic;
+using System.Globalization;
+using System.IO;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Analysis.Ja.Util
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Builds an <see cref="UnknownDictionaryWriter"/> from the <c>unk.def</c> and
+    /// <c>char.def</c> files of a dictionary source directory.
+    /// </summary>
+    public class UnknownDictionaryBuilder
+    {
+        // POS fields are 記号,一般 ("symbol, general").
+        private static readonly string NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*";
+
+        private string encoding = "euc-jp";
+
+        /// <summary>
+        /// Creates a builder.
+        /// </summary>
+        /// <param name="encoding">Name of the text encoding used to read the definition files.</param>
+        public UnknownDictionaryBuilder(string encoding)
+        {
+            this.encoding = encoding;
+        }
+
+        /// <summary>
+        /// Reads <c>unk.def</c> and <c>char.def</c> from <paramref name="dirname"/>.
+        /// </summary>
+        /// <returns>A writer holding the unknown word entries and the character definitions.</returns>
+        public virtual UnknownDictionaryWriter Build(string dirname)
+        {
+            UnknownDictionaryWriter unkDictionary = ReadDictionaryFile(dirname + System.IO.Path.DirectorySeparatorChar + "unk.def");  //Should be only one file
+            ReadCharacterDefinition(dirname + System.IO.Path.DirectorySeparatorChar + "char.def", unkDictionary);
+            return unkDictionary;
+        }
+
+        /// <summary>
+        /// Reads the unknown word definitions using the encoding given to the constructor.
+        /// </summary>
+        public virtual UnknownDictionaryWriter ReadDictionaryFile(string filename)
+        {
+            return ReadDictionaryFile(filename, encoding);
+        }
+
+        /// <summary>
+        /// Reads the unknown word definitions. The built-in NGRAM entry is added first, then all
+        /// entries of the file ordered by the character class id of their first field.
+        /// </summary>
+        /// <param name="filename">Path of the definition file.</param>
+        /// <param name="encoding">Name of the text encoding of the file (takes precedence over the constructor value).</param>
+        public virtual UnknownDictionaryWriter ReadDictionaryFile(string filename, string encoding)
+        {
+            UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);
+
+            List<string[]> lines = new List<string[]>();
+            Encoding decoder = Encoding.GetEncoding(encoding);
+            using (Stream inputStream = new FileStream(filename, FileMode.Open, FileAccess.Read))
+            using (TextReader reader = new StreamReader(inputStream, decoder))
+            {
+
+                dictionary.Put(CSVUtil.Parse(NGRAM_DICTIONARY_ENTRY));
+
+
+                string line = null;
+                while ((line = reader.ReadLine()) != null)
+                {
+                    // note: unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation,
+                    // even though the unknown dictionary returns hardcoded null here.
+                    string[] parsed = CSVUtil.Parse(line + ",*,*"); // Probably we don't need to validate entry
+                    lines.Add(parsed);
+                }
+            }
+
+            lines.Sort(new ComparerAnonymousHelper());
+
+            foreach (string[] entry in lines)
+            {
+                dictionary.Put(entry);
+            }
+
+            return dictionary;
+        }
+
+        /// <summary>Orders entries by the character class id looked up from their first field.</summary>
+        private class ComparerAnonymousHelper : IComparer<string[]>
+        {
+            public int Compare(string[] left, string[] right)
+            {
+                int leftId = CharacterDefinition.LookupCharacterClass(left[0]);
+                int rightId = CharacterDefinition.LookupCharacterClass(right[0]);
+                return leftId - rightId;
+            }
+        }
+
+        /// <summary>
+        /// Reads a character definition file into <paramref name="dictionary"/>. Lines starting
+        /// with <c>0x</c> map a code point (or a <c>from..to</c> range) to character classes;
+        /// all other non-empty lines define the invoke/group/length values of a class.
+        /// Text after <c>#</c> is treated as a comment.
+        /// </summary>
+        public virtual void ReadCharacterDefinition(string filename, UnknownDictionaryWriter dictionary)
+        {
+            using (Stream inputStream = new FileStream(filename, FileMode.Open, FileAccess.Read))
+            using (TextReader reader = new StreamReader(inputStream, Encoding.GetEncoding(encoding)))
+            {
+                string line = null;
+
+                while ((line = reader.ReadLine()) != null)
+                {
+                    line = Regex.Replace(line, "^\\s", "");
+                    line = Regex.Replace(line, "\\s*#.*", "");
+                    line = Regex.Replace(line, "\\s+", " ");
+
+                    // Skip empty line or comment line
+                    if (line.Length == 0)
+                    {
+                        continue;
+                    }
+
+                    if (line.StartsWith("0x", StringComparison.Ordinal))
+                    {  // Category mapping
+                        string[] values = new Regex(" ").Split(line, 2);  // Split only first space
+
+                        // Code points are hexadecimal with a "0x" prefix. Convert.ToInt32 accepts
+                        // that prefix only when base 16 is requested explicitly; the single
+                        // argument overload would throw a FormatException.
+                        if (!values[0].Contains(".."))
+                        {
+                            int cp = Convert.ToInt32(values[0], 16);
+                            dictionary.PutCharacterCategory(cp, values[1]);
+                        }
+                        else
+                        {
+                            string[] codePoints = Regex.Split(values[0], "\\.\\.");
+                            int cpFrom = Convert.ToInt32(codePoints[0], 16);
+                            int cpTo = Convert.ToInt32(codePoints[1], 16);
+
+                            for (int i = cpFrom; i <= cpTo; i++)
+                            {
+                                dictionary.PutCharacterCategory(i, values[1]);
+                            }
+                        }
+                    }
+                    else
+                    {  // Invoke definition
+                        string[] values = line.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries); // Consecutive space is merged above
+                        string characterClassName = values[0];
+                        int invoke = int.Parse(values[1], CultureInfo.InvariantCulture);
+                        int group = int.Parse(values[2], CultureInfo.InvariantCulture);
+                        int length = int.Parse(values[3], CultureInfo.InvariantCulture);
+                        dictionary.PutInvokeDefinition(characterClassName, invoke, group, length);
+                    }
+                }
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Tools/UnknownDictionaryWriter.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Tools/UnknownDictionaryWriter.cs 
b/src/Lucene.Net.Analysis.Kuromoji/Tools/UnknownDictionaryWriter.cs
new file mode 100644
index 0000000..7d4a982
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/Tools/UnknownDictionaryWriter.cs
@@ -0,0 +1,66 @@
+using Lucene.Net.Analysis.Ja.Dict;
+
+namespace Lucene.Net.Analysis.Ja.Util
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// A <see cref="BinaryDictionaryWriter"/> for <see cref="UnknownDictionary"/> data that also
+    /// owns a <see cref="CharacterDefinitionWriter"/> and writes it together with the dictionary.
+    /// </summary>
+    public class UnknownDictionaryWriter : BinaryDictionaryWriter
+    {
+        private readonly CharacterDefinitionWriter characterDefinition = new CharacterDefinitionWriter();
+
+        public UnknownDictionaryWriter(int size)
+            : base(typeof(UnknownDictionary), size)
+        {
+        }
+
+        /// <summary>
+        /// Stores an entry through the base writer and maps the character class
+        /// named by <c>entry[0]</c> to the word id of that entry.
+        /// </summary>
+        /// <param name="entry">Entry fields; the first field is looked up as a character class name.</param>
+        /// <returns>The value returned by the base <c>Put</c> call.</returns>
+        public override int Put(string[] entry)
+        {
+            // The word id is the buffer position captured before the base writer stores the entry.
+            int entryWordId = m_buffer.Position;
+            int baseResult = base.Put(entry);
+
+            // The class lookup and the mapping happen only after the entry has been stored.
+            AddMapping(CharacterDefinition.LookupCharacterClass(entry[0]), entryWordId);
+            return baseResult;
+        }
+
+        /// <summary>
+        /// Put mapping from unicode code point to character class.
+        /// Forwards to the owned <see cref="CharacterDefinitionWriter"/>.
+        /// </summary>
+        /// <param name="codePoint">Code point.</param>
+        /// <param name="characterClassName">Character class name.</param>
+        public virtual void PutCharacterCategory(int codePoint, string characterClassName)
+        {
+            characterDefinition.PutCharacterCategory(codePoint, characterClassName);
+        }
+
+        /// <summary>
+        /// Forwards an invoke definition for a character class to the owned
+        /// <see cref="CharacterDefinitionWriter"/>.
+        /// </summary>
+        /// <param name="characterClassName">Character class name.</param>
+        /// <param name="invoke">Invoke value, passed through unchanged.</param>
+        /// <param name="group">Group value, passed through unchanged.</param>
+        /// <param name="length">Length value, passed through unchanged.</param>
+        public virtual void PutInvokeDefinition(string characterClassName, int invoke, int group, int length)
+        {
+            characterDefinition.PutInvokeDefinition(characterClassName, invoke, group, length);
+        }
+
+        /// <summary>
+        /// Writes the dictionary via the base writer first, then the character definition,
+        /// both using the same <paramref name="baseDir"/>.
+        /// </summary>
+        /// <param name="baseDir">Directory argument handed to both writers.</param>
+        public override void Write(string baseDir)
+        {
+            base.Write(baseDir);
+            characterDefinition.Write(baseDir);
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Util/CSVUtil.cs
----------------------------------------------------------------------
diff --git a/src/Lucene.Net.Analysis.Kuromoji/Util/CSVUtil.cs 
b/src/Lucene.Net.Analysis.Kuromoji/Util/CSVUtil.cs
new file mode 100644
index 0000000..9d86aed
--- /dev/null
+++ b/src/Lucene.Net.Analysis.Kuromoji/Util/CSVUtil.cs
@@ -0,0 +1,124 @@
+using System.Collections.Generic;
+using System.Text;
+using System.Text.RegularExpressions;
+
+namespace Lucene.Net.Analysis.Ja.Util
+{
+    /*
+     * Licensed to the Apache Software Foundation (ASF) under one or more
+     * contributor license agreements.  See the NOTICE file distributed with
+     * this work for additional information regarding copyright ownership.
+     * The ASF licenses this file to You under the Apache License, Version 2.0
+     * (the "License"); you may not use this file except in compliance with
+     * the License.  You may obtain a copy of the License at
+     *
+     *     http://www.apache.org/licenses/LICENSE-2.0
+     *
+     * Unless required by applicable law or agreed to in writing, software
+     * distributed under the License is distributed on an "AS IS" BASIS,
+     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+     * See the License for the specific language governing permissions and
+     * limitations under the License.
+     */
+
+    /// <summary>
+    /// Utility class for parsing CSV lines and for quoting/escaping values written as CSV.
+    /// </summary>
+    public sealed class CSVUtil
+    {
+        private static readonly char QUOTE = '"';
+
+        private static readonly char COMMA = ',';
+
+        // Matches a value wrapped in a single pair of double quotes with no quote character inside.
+        private static readonly Regex QUOTE_REPLACE_PATTERN = new Regex("^\"([^\"]+)\"$", RegexOptions.Compiled);
+
+        private static readonly string ESCAPED_QUOTE = "\"\"";
+
+        private CSVUtil() { } // no instance!!!
+
+        /// <summary>
+        /// Parse CSV line.
+        /// <para/>
+        /// The line is split on commas that are not inside double quotes. Every value that is
+        /// terminated by a comma is unquoted and unescaped; the text after the last comma is
+        /// added exactly as it appears in the line.
+        /// </summary>
+        /// <param name="line">line containing csv-encoded data</param>
+        /// <returns>
+        /// Array of values, or an empty array when the line contains an odd number of
+        /// double-quote characters.
+        /// </returns>
+        public static string[] Parse(string line)
+        {
+            bool insideQuote = false;
+            List<string> result = new List<string>();
+            int quoteCount = 0;
+            StringBuilder sb = new StringBuilder();
+            for (int i = 0; i < line.Length; i++)
+            {
+                char c = line[i];
+
+                if (c == QUOTE)
+                {
+                    // Every quote toggles the state, so an escaped quote ("") toggles twice
+                    // and leaves the state unchanged.
+                    insideQuote = !insideQuote;
+                    quoteCount++;
+                }
+
+                if (c == COMMA && !insideQuote)
+                {
+                    // A comma outside quotes ends the current value.
+                    result.Add(UnQuoteUnEscape(sb.ToString()));
+                    sb.Length = 0;
+                    continue;
+                }
+
+                sb.Append(c);
+            }
+
+            // NOTE(review): the trailing value is not passed through UnQuoteUnEscape, unlike the
+            // comma-terminated ones. Kept as-is to preserve existing behavior.
+            result.Add(sb.ToString());
+
+            // Validate: unbalanced quotes make the whole line invalid.
+            if (quoteCount % 2 != 0)
+            {
+                return new string[0];
+            }
+
+            return result.ToArray();
+        }
+
+        /// <summary>
+        /// Removes one pair of enclosing quotes (when the value has no inner quote) and
+        /// turns every doubled quote into a single quote. Values without any quote
+        /// character are returned unchanged.
+        /// </summary>
+        private static string UnQuoteUnEscape(string original)
+        {
+            // Nothing to do when the value has no quote at all.
+            if (original.IndexOf('\"') < 0)
+            {
+                return original;
+            }
+
+            string result = original;
+
+            // Unquote
+            Match m = QUOTE_REPLACE_PATTERN.Match(original);
+            if (m.Success)
+            {
+                result = m.Groups[1].Value;
+            }
+
+            // Unescape (Replace returns the input unchanged when there is nothing to replace)
+            return result.Replace(ESCAPED_QUOTE, "\"");
+        }
+
+        /// <summary>
+        /// Quote and escape input value for CSV.
+        /// <para/>
+        /// Every double quote in the value is doubled, and the value is wrapped in double
+        /// quotes when it contains a comma.
+        /// </summary>
+        /// <param name="original">Value to encode.</param>
+        /// <returns>The encoded value.</returns>
+        public static string QuoteEscape(string original)
+        {
+            // Strings are immutable: the result of Replace must be assigned, otherwise the
+            // quotes are never escaped.
+            string result = original.Replace("\"", ESCAPED_QUOTE);
+
+            if (result.IndexOf(COMMA) >= 0)
+            {
+                result = "\"" + result + "\"";
+            }
+            return result;
+        }
+    }
+}

[08/13] lucenenet git commit: Ported Lucene.Net.Analysis.Kuromoji + tests

Reply via email to