// Source: http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/BaseFormAttributeImpl.cs
using Lucene.Net.Util;

namespace Lucene.Net.Analysis.Ja.TokenAttributes
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Attribute for <see cref="Token.GetBaseForm()"/>.
    /// <para/>
    /// The attribute only holds a reference to the current <see cref="Token"/>;
    /// the base form is looked up from that token on every call.
    /// </summary>
    public class BaseFormAttribute : Attribute, IBaseFormAttribute
    {
        // Token the value is read from; null until SetToken is called and after Clear.
        private Token token;

        /// <summary>
        /// Gets the base form of the current token.
        /// </summary>
        /// <returns>
        /// The result of <see cref="Token.GetBaseForm()"/>, or <c>null</c> when no token is set.
        /// </returns>
        public virtual string GetBaseForm()
        {
            return token == null ? null : token.GetBaseForm();
        }

        /// <summary>
        /// Sets the token this attribute reads its value from.
        /// </summary>
        /// <param name="token">The current token; may be <c>null</c>.</param>
        public virtual void SetToken(Token token)
        {
            this.token = token;
        }

        /// <summary>
        /// Drops the token reference, so <see cref="GetBaseForm()"/> returns <c>null</c>.
        /// </summary>
        public override void Clear()
        {
            token = null;
        }

        /// <summary>
        /// Copies the token reference to <paramref name="target"/>.
        /// </summary>
        /// <param name="target">Must be a <see cref="BaseFormAttribute"/>; it is cast without a check.</param>
        public override void CopyTo(IAttribute target)
        {
            BaseFormAttribute t = (BaseFormAttribute)target;
            t.SetToken(token);
        }

        /// <summary>
        /// Reports the "baseForm" value to <paramref name="reflector"/>.
        /// </summary>
        /// <param name="reflector">Receiver of the key/value pair.</param>
        public override void ReflectWith(IAttributeReflector reflector)
        {
            // Report against the attribute interface, as the other Kuromoji
            // attributes (inflection, part-of-speech, reading) do, rather than
            // against this implementation class.
            reflector.Reflect<IBaseFormAttribute>("baseForm", GetBaseForm());
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/InflectionAttribute.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/InflectionAttribute.cs b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/InflectionAttribute.cs new file mode 100644 index 0000000..975d8b0 --- /dev/null +++ b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/InflectionAttribute.cs @@ -0,0 +1,34 @@ +using Lucene.Net.Util; + +namespace Lucene.Net.Analysis.Ja.TokenAttributes +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Attribute for Kuromoji inflection data. + /// <para/> + /// Note: in some cases this value may not be applicable, + /// and will be null. 
+ /// </summary> + public interface IInflectionAttribute : IAttribute + { + string GetInflectionType(); + string GetInflectionForm(); + void SetToken(Token token); + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/InflectionAttributeImpl.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/InflectionAttributeImpl.cs b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/InflectionAttributeImpl.cs new file mode 100644 index 0000000..db96130 --- /dev/null +++ b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/InflectionAttributeImpl.cs @@ -0,0 +1,68 @@ +using Lucene.Net.Analysis.Ja.Util; +using Lucene.Net.Util; + +namespace Lucene.Net.Analysis.Ja.TokenAttributes +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Attribute for Kuromoji inflection data. + /// </summary> + public class InflectionAttribute : Attribute, IInflectionAttribute + { + private Token token; + + public virtual string GetInflectionType() + { + return token == null ? null : token.GetInflectionType(); + } + + public virtual string GetInflectionForm() + { + return token == null ? 
null : token.GetInflectionForm(); + } + + public virtual void SetToken(Token token) + { + this.token = token; + } + + public override void Clear() + { + token = null; + } + + public override void CopyTo(IAttribute target) + { + InflectionAttribute t = (InflectionAttribute)target; + t.SetToken(token); + } + + public override void ReflectWith(IAttributeReflector reflector) + { + string type = GetInflectionType(); + string typeEN = type == null ? null : ToStringUtil.GetInflectionTypeTranslation(type); + reflector.Reflect<IInflectionAttribute>("inflectionType", type); + reflector.Reflect<IInflectionAttribute>("inflectionType (en)", typeEN); + string form = GetInflectionForm(); + string formEN = form == null ? null : ToStringUtil.GetInflectedFormTranslation(form); + reflector.Reflect<IInflectionAttribute>("inflectionForm", form); + reflector.Reflect<IInflectionAttribute>("inflectionForm (en)", formEN); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/PartOfSpeechAttribute.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/PartOfSpeechAttribute.cs b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/PartOfSpeechAttribute.cs new file mode 100644 index 0000000..722d203 --- /dev/null +++ b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/PartOfSpeechAttribute.cs @@ -0,0 +1,30 @@ +using Lucene.Net.Util; + +namespace Lucene.Net.Analysis.Ja.TokenAttributes +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Attribute for <see cref="Token.GetPartOfSpeech()"/>. + /// </summary> + public interface IPartOfSpeechAttribute : IAttribute + { + string GetPartOfSpeech(); + void SetToken(Token token); + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/PartOfSpeechAttributeImpl.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/PartOfSpeechAttributeImpl.cs b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/PartOfSpeechAttributeImpl.cs new file mode 100644 index 0000000..9d0451d --- /dev/null +++ b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/PartOfSpeechAttributeImpl.cs @@ -0,0 +1,59 @@ +using Lucene.Net.Analysis.Ja.Util; +using Lucene.Net.Util; + +namespace Lucene.Net.Analysis.Ja.TokenAttributes +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Attribute for <see cref="Token.GetPartOfSpeech()"/>. + /// </summary> + public class PartOfSpeechAttribute : Attribute, IPartOfSpeechAttribute + { + private Token token; + + public virtual string GetPartOfSpeech() + { + return token == null ? null : token.GetPartOfSpeech(); + } + + public virtual void SetToken(Token token) + { + this.token = token; + } + + public override void Clear() + { + token = null; + } + + public override void CopyTo(IAttribute target) + { + PartOfSpeechAttribute t = (PartOfSpeechAttribute)target; + t.SetToken(token); + } + + public override void ReflectWith(IAttributeReflector reflector) + { + string partOfSpeech = GetPartOfSpeech(); + string partOfSpeechEN = partOfSpeech == null ? 
null : ToStringUtil.GetPOSTranslation(partOfSpeech); + reflector.Reflect<IPartOfSpeechAttribute>("partOfSpeech", partOfSpeech); + reflector.Reflect<IPartOfSpeechAttribute>("partOfSpeech (en)", partOfSpeechEN); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/ReadingAttribute.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/ReadingAttribute.cs b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/ReadingAttribute.cs new file mode 100644 index 0000000..2432872 --- /dev/null +++ b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/ReadingAttribute.cs @@ -0,0 +1,34 @@ +using Lucene.Net.Util; + +namespace Lucene.Net.Analysis.Ja.TokenAttributes +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Attribute for Kuromoji reading data + /// <para/> + /// Note: in some cases this value may not be applicable, + /// and will be null. 
+ /// </summary> + public interface IReadingAttribute : IAttribute + { + string GetReading(); + string GetPronunciation(); + void SetToken(Token token); + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/ReadingAttributeImpl.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/ReadingAttributeImpl.cs b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/ReadingAttributeImpl.cs new file mode 100644 index 0000000..bfb8a93 --- /dev/null +++ b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/ReadingAttributeImpl.cs @@ -0,0 +1,68 @@ +using Lucene.Net.Analysis.Ja.Util; +using Lucene.Net.Util; + +namespace Lucene.Net.Analysis.Ja.TokenAttributes +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Attribute for Kuromoji reading data + /// </summary> + public class ReadingAttribute : Attribute, IReadingAttribute + { + private Token token; + + public virtual string GetReading() + { + return token == null ? null : token.GetReading(); + } + + public virtual string GetPronunciation() + { + return token == null ? 
null : token.GetPronunciation(); + } + + public virtual void SetToken(Token token) + { + this.token = token; + } + + public override void Clear() + { + token = null; + } + + public override void CopyTo(IAttribute target) + { + ReadingAttribute t = (ReadingAttribute)target; + t.SetToken(token); + } + + public override void ReflectWith(IAttributeReflector reflector) + { + string reading = GetReading(); + string readingEN = reading == null ? null : ToStringUtil.GetRomanization(reading); + string pronunciation = GetPronunciation(); + string pronunciationEN = pronunciation == null ? null : ToStringUtil.GetRomanization(pronunciation); + reflector.Reflect<IReadingAttribute>("reading", reading); + reflector.Reflect<IReadingAttribute>("reading (en)", readingEN); + reflector.Reflect<IReadingAttribute>("pronunciation", pronunciation); + reflector.Reflect<IReadingAttribute>("pronunciation (en)", pronunciationEN); + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Tools/BinaryDictionaryWriter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Kuromoji/Tools/BinaryDictionaryWriter.cs b/src/Lucene.Net.Analysis.Kuromoji/Tools/BinaryDictionaryWriter.cs new file mode 100644 index 0000000..39d36eb --- /dev/null +++ b/src/Lucene.Net.Analysis.Kuromoji/Tools/BinaryDictionaryWriter.cs @@ -0,0 +1,370 @@ +using Lucene.Net.Analysis.Ja.Dict; +using Lucene.Net.Codecs; +using Lucene.Net.Store; +using Lucene.Net.Support.IO; +using Lucene.Net.Util; +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Globalization; +using System.IO; +using System.Text; + +namespace Lucene.Net.Analysis.Ja.Util +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + public abstract class BinaryDictionaryWriter + { + protected readonly Type m_implClazz; + protected ByteBuffer m_buffer; + private int targetMapEndOffset = 0, lastWordId = -1, lastSourceId = -1; + private int[] targetMap = new int[8192]; + private int[] targetMapOffsets = new int[8192]; + private readonly List<string> posDict = new List<string>(); + + public BinaryDictionaryWriter(Type implClazz, int size) + { + this.m_implClazz = implClazz; + m_buffer = ByteBuffer.Allocate(size); + } + + /// <summary> + /// Put the entry in map. 
+ /// </summary> + /// <param name="entry"></param> + /// <returns>Current position of buffer, which will be wordId of next entry.</returns> + public virtual int Put(string[] entry) + { + short leftId = short.Parse(entry[1], CultureInfo.InvariantCulture); + short rightId = short.Parse(entry[2], CultureInfo.InvariantCulture); + short wordCost = short.Parse(entry[3], CultureInfo.InvariantCulture); + + StringBuilder sb = new StringBuilder(); + + // build up the POS string + for (int i = 4; i < 8; i++) + { + string part = entry[i]; + Debug.Assert(part.Length > 0); + if (!"*".Equals(part, StringComparison.Ordinal)) + { + if (sb.Length > 0) + { + sb.Append('-'); + } + sb.Append(part); + } + } + + string posData = sb.ToString(); + + sb.Length = 0; + sb.Append(CSVUtil.QuoteEscape(posData)); + sb.Append(','); + if (!"*".Equals(entry[8])) + { + sb.Append(CSVUtil.QuoteEscape(entry[8])); + } + sb.Append(','); + if (!"*".Equals(entry[9])) + { + sb.Append(CSVUtil.QuoteEscape(entry[9])); + } + string fullPOSData = sb.ToString(); + + string baseForm = entry[10]; + string reading = entry[11]; + string pronunciation = entry[12]; + + // extend buffer if necessary + int left = m_buffer.Remaining; + // worst case: two short, 3 bytes, and features (all as utf-16) + int worstCase = 4 + 3 + 2 * (baseForm.Length + reading.Length + pronunciation.Length); + if (worstCase > left) + { + ByteBuffer newBuffer = ByteBuffer.Allocate(ArrayUtil.Oversize(m_buffer.Limit + worstCase - left, 1)); + m_buffer.Flip(); + newBuffer.Put(m_buffer); + m_buffer = newBuffer; + } + + int flags = 0; + if (!("*".Equals(baseForm) || baseForm.Equals(entry[0]))) + { + flags |= BinaryDictionary.HAS_BASEFORM; + } + if (!reading.Equals(ToKatakana(entry[0]))) + { + flags |= BinaryDictionary.HAS_READING; + } + if (!pronunciation.Equals(reading)) + { + flags |= BinaryDictionary.HAS_PRONUNCIATION; + } + + Debug.Assert(leftId == rightId); + Debug.Assert(leftId < 4096); // there are still unused bits + // add pos mapping + int 
toFill = 1 + leftId - posDict.Count; + for (int i = 0; i < toFill; i++) + { + posDict.Add(null); + } + + string existing = posDict[leftId]; + Debug.Assert(existing == null || existing.Equals(fullPOSData)); + posDict[leftId] = fullPOSData; + + m_buffer.PutInt16((short)(leftId << 3 | flags)); + m_buffer.PutInt16(wordCost); + + if ((flags & BinaryDictionary.HAS_BASEFORM) != 0) + { + Debug.Assert(baseForm.Length < 16); + int shared = SharedPrefix(entry[0], baseForm); + int suffix = baseForm.Length - shared; + m_buffer.Put((byte)(shared << 4 | suffix)); + for (int i = shared; i < baseForm.Length; i++) + { + m_buffer.PutChar(baseForm[i]); + } + } + + if ((flags & BinaryDictionary.HAS_READING) != 0) + { + if (IsKatakana(reading)) + { + m_buffer.Put((byte)(reading.Length << 1 | 1)); + WriteKatakana(reading); + } + else + { + m_buffer.Put((byte)(reading.Length << 1)); + for (int i = 0; i < reading.Length; i++) + { + m_buffer.PutChar(reading[i]); + } + } + } + + if ((flags & BinaryDictionary.HAS_PRONUNCIATION) != 0) + { + // we can save 150KB here, but it makes the reader a little complicated. 
+ // int shared = sharedPrefix(reading, pronunciation); + // buffer.put((byte) shared); + // pronunciation = pronunciation.substring(shared); + if (IsKatakana(pronunciation)) + { + m_buffer.Put((byte)(pronunciation.Length << 1 | 1)); + WriteKatakana(pronunciation); + } + else + { + m_buffer.Put((byte)(pronunciation.Length << 1)); + for (int i = 0; i < pronunciation.Length; i++) + { + m_buffer.PutChar(pronunciation[i]); + } + } + } + + return m_buffer.Position; + } + + private bool IsKatakana(string s) + { + for (int i = 0; i < s.Length; i++) + { + char ch = s[i]; + if (ch < 0x30A0 || ch > 0x30FF) + { + return false; + } + } + return true; + } + + private void WriteKatakana(string s) + { + for (int i = 0; i < s.Length; i++) + { + m_buffer.Put((byte)(s[i] - 0x30A0)); + } + } + + private string ToKatakana(string s) + { + char[] text = new char[s.Length]; + for (int i = 0; i < s.Length; i++) + { + char ch = s[i]; + if (ch > 0x3040 && ch < 0x3097) + { + text[i] = (char)(ch + 0x60); + } + else + { + text[i] = ch; + } + } + return new string(text); + } + + public static int SharedPrefix(string left, string right) + { + int len = left.Length < right.Length ? 
left.Length : right.Length; + for (int i = 0; i < len; i++) + if (left[i] != right[i]) + return i; + return len; + } + + public virtual void AddMapping(int sourceId, int wordId) + { + Debug.Assert(wordId > lastWordId, "words out of order: " + wordId + " vs lastID: " + lastWordId); + + if (sourceId > lastSourceId) + { + Debug.Assert(sourceId > lastSourceId, "source ids out of order: lastSourceId=" + lastSourceId + " vs sourceId=" + sourceId); + targetMapOffsets = ArrayUtil.Grow(targetMapOffsets, sourceId + 1); + for (int i = lastSourceId + 1; i <= sourceId; i++) + { + targetMapOffsets[i] = targetMapEndOffset; + } + } + else + { + Debug.Assert(sourceId == lastSourceId); + } + + targetMap = ArrayUtil.Grow(targetMap, targetMapEndOffset + 1); + targetMap[targetMapEndOffset] = wordId; + targetMapEndOffset++; + + lastSourceId = sourceId; + lastWordId = wordId; + } + + protected string GetBaseFileName(string baseDir) + { + return baseDir + System.IO.Path.DirectorySeparatorChar + m_implClazz.FullName.Replace('.', System.IO.Path.DirectorySeparatorChar); + } + + /// <summary> + /// Write dictionary in file + /// </summary> + /// <remarks> + /// Dictionary format is: + /// [Size of dictionary(int)], [entry:{left id(short)}{right id(short)}{word cost(short)}{length of pos info(short)}{pos info(char)}], [entry...], [entry...]..... + /// </remarks> + /// <param name="baseDir"></param> + /// <exception cref="IOException">If an I/O error occurs writing the dictionary files.</exception> + public virtual void Write(string baseDir) + { + string baseName = GetBaseFileName(baseDir); + WriteDictionary(baseName + BinaryDictionary.DICT_FILENAME_SUFFIX); + WriteTargetMap(baseName + BinaryDictionary.TARGETMAP_FILENAME_SUFFIX); + WritePosDict(baseName + BinaryDictionary.POSDICT_FILENAME_SUFFIX); + } + + // TODO: maybe this int[] should instead be the output to the FST... 
+ protected virtual void WriteTargetMap(string filename) + { + //new File(filename).getParentFile().mkdirs(); + System.IO.Directory.CreateDirectory(System.IO.Path.GetDirectoryName(filename)); + using (Stream os = new FileStream(filename, FileMode.Create, FileAccess.Write)) + { + DataOutput @out = new OutputStreamDataOutput(os); + CodecUtil.WriteHeader(@out, BinaryDictionary.TARGETMAP_HEADER, BinaryDictionary.VERSION); + + int numSourceIds = lastSourceId + 1; + @out.WriteVInt32(targetMapEndOffset); // <-- size of main array + @out.WriteVInt32(numSourceIds + 1); // <-- size of offset array (+ 1 more entry) + int prev = 0, sourceId = 0; + for (int ofs = 0; ofs < targetMapEndOffset; ofs++) + { + int val = targetMap[ofs], delta = val - prev; + Debug.Assert(delta >= 0); + if (ofs == targetMapOffsets[sourceId]) + { + @out.WriteVInt32((delta << 1) | 0x01); + sourceId++; + } + else + { + @out.WriteVInt32((delta << 1)); + } + prev += delta; + } + Debug.Assert(sourceId == numSourceIds, "sourceId:" + sourceId + " != numSourceIds:" + numSourceIds); + } + } + + protected virtual void WritePosDict(string filename) + { + //new File(filename).getParentFile().mkdirs(); + System.IO.Directory.CreateDirectory(System.IO.Path.GetDirectoryName(filename)); + using (Stream os = new FileStream(filename, FileMode.Create, FileAccess.Write)) + { + DataOutput @out = new OutputStreamDataOutput(os); + CodecUtil.WriteHeader(@out, BinaryDictionary.POSDICT_HEADER, BinaryDictionary.VERSION); + @out.WriteVInt32(posDict.Count); + foreach (string s in posDict) + { + if (s == null) + { + @out.WriteByte((byte)0); + @out.WriteByte((byte)0); + @out.WriteByte((byte)0); + } + else + { + string[] data = CSVUtil.Parse(s); + Debug.Assert(data.Length == 3, "malformed pos/inflection: " + s); + @out.WriteString(data[0]); + @out.WriteString(data[1]); + @out.WriteString(data[2]); + } + } + } + } + + protected virtual void WriteDictionary(string filename) + { + //new File(filename).getParentFile().mkdirs(); + 
System.IO.Directory.CreateDirectory(System.IO.Path.GetDirectoryName(filename)); + using (Stream os = new FileStream(filename, FileMode.Create, FileAccess.Write)) + { + DataOutput @out = new OutputStreamDataOutput(os); + CodecUtil.WriteHeader(@out, BinaryDictionary.DICT_HEADER, BinaryDictionary.VERSION); + @out.WriteVInt32(m_buffer.Position); + var writer = new BinaryWriter(os); + + //WritableByteChannel channel = Channels.newChannel(os); + // Write Buffer + m_buffer.Flip(); // set position to 0, set limit to current position + //channel.write(buffer); + + writer.Write(m_buffer.Array, m_buffer.Position, m_buffer.Limit); + + Debug.Assert(m_buffer.Remaining == 0L); + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Tools/CharacterDefinitionWriter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Kuromoji/Tools/CharacterDefinitionWriter.cs b/src/Lucene.Net.Analysis.Kuromoji/Tools/CharacterDefinitionWriter.cs new file mode 100644 index 0000000..6d5526d --- /dev/null +++ b/src/Lucene.Net.Analysis.Kuromoji/Tools/CharacterDefinitionWriter.cs @@ -0,0 +1,91 @@ +using Lucene.Net.Analysis.Ja.Dict; +using Lucene.Net.Codecs; +using Lucene.Net.Store; +using Lucene.Net.Support; +using System; +using System.IO; + +namespace Lucene.Net.Analysis.Ja.Util +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + public sealed class CharacterDefinitionWriter + { + private readonly byte[] characterCategoryMap = new byte[0x10000]; + + private readonly bool[] invokeMap = new bool[CharacterDefinition.CLASS_COUNT]; + private readonly bool[] groupMap = new bool[CharacterDefinition.CLASS_COUNT]; + + /// <summary> + /// Constructor for building. TODO: remove write access + /// </summary> + public CharacterDefinitionWriter() + { + Arrays.Fill(characterCategoryMap, CharacterDefinition.DEFAULT); + } + + /// <summary> + /// Put mapping from unicode code point to character class. + /// </summary> + /// <param name="codePoint">Code point.</param> + /// <param name="characterClassName">Character class name.</param> + public void PutCharacterCategory(int codePoint, string characterClassName) + { + characterClassName = characterClassName.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries)[0]; // use first + // category + // class + + // Override Nakaguro + if (codePoint == 0x30FB) + { + characterClassName = "SYMBOL"; + } + characterCategoryMap[codePoint] = CharacterDefinition.LookupCharacterClass(characterClassName); + } + + public void PutInvokeDefinition(string characterClassName, int invoke, int group, int length) + { + byte characterClass = CharacterDefinition.LookupCharacterClass(characterClassName); + invokeMap[characterClass] = invoke == 1; + groupMap[characterClass] = group == 1; + // TODO: length def ignored + } + + public void Write(string baseDir) + { + string filename = baseDir + System.IO.Path.DirectorySeparatorChar + + 
typeof(CharacterDefinition).FullName.Replace('.', System.IO.Path.DirectorySeparatorChar) + CharacterDefinition.FILENAME_SUFFIX; + //new File(filename).getParentFile().mkdirs(); + System.IO.Directory.CreateDirectory(System.IO.Path.GetDirectoryName(baseDir)); + using (Stream os = new FileStream(filename, FileMode.Create, FileAccess.Write)) + { + DataOutput @out = new OutputStreamDataOutput(os); + CodecUtil.WriteHeader(@out, CharacterDefinition.HEADER, CharacterDefinition.VERSION); + @out.WriteBytes(characterCategoryMap, 0, characterCategoryMap.Length); + for (int i = 0; i < CharacterDefinition.CLASS_COUNT; i++) + { + byte b = (byte)( + (invokeMap[i] ? 0x01 : 0x00) | + (groupMap[i] ? 0x02 : 0x00) + ); + @out.WriteByte(b); + } + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsBuilder.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsBuilder.cs b/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsBuilder.cs new file mode 100644 index 0000000..235c523 --- /dev/null +++ b/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsBuilder.cs @@ -0,0 +1,68 @@ +using System.Diagnostics; +using System.Globalization; +using System.IO; +using System.Text; +using System.Text.RegularExpressions; + +namespace Lucene.Net.Analysis.Ja.Util +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + public class ConnectionCostsBuilder + { + private static readonly Regex whiteSpaceRegex = new Regex("\\s+", RegexOptions.Compiled); + + private ConnectionCostsBuilder() + { + } + + public static ConnectionCostsWriter Build(string filename) + { + using (Stream inputStream = new FileStream(filename, FileMode.Open, FileAccess.Read)) + { + StreamReader streamReader = new StreamReader(inputStream, Encoding.ASCII); + + string line = streamReader.ReadLine(); + string[] dimensions = whiteSpaceRegex.Split(line); + + Debug.Assert(dimensions.Length == 2); + + int forwardSize = int.Parse(dimensions[0], CultureInfo.InvariantCulture); + int backwardSize = int.Parse(dimensions[1], CultureInfo.InvariantCulture); + + Debug.Assert(forwardSize > 0 && backwardSize > 0); + + ConnectionCostsWriter costs = new ConnectionCostsWriter(forwardSize, backwardSize); + + while ((line = streamReader.ReadLine()) != null) + { + string[] fields = whiteSpaceRegex.Split(line); + + Debug.Assert(fields.Length == 3); + + int forwardId = int.Parse(fields[0], CultureInfo.InvariantCulture); + int backwardId = int.Parse(fields[1], CultureInfo.InvariantCulture); + int cost = int.Parse(fields[2], CultureInfo.InvariantCulture); + + costs.Add(forwardId, backwardId, cost); + } + return costs; + } + } + } +} http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsWriter.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Kuromoji/Tools/ConnectionCostsWriter.cs 
/// <summary>
/// Accumulates connection costs in memory and serializes them to a file located
/// under a base directory.
/// </summary>
public sealed class ConnectionCostsWriter
{
    // Indexed as [backwardId][forwardId].
    private readonly short[][] costs;
    private readonly int forwardSize;
    private readonly int backwardSize;

    /// <summary>
    /// Creates a writer holding a <paramref name="backwardSize"/> x <paramref name="forwardSize"/> cost table.
    /// </summary>
    public ConnectionCostsWriter(int forwardSize, int backwardSize)
    {
        this.forwardSize = forwardSize;
        this.backwardSize = backwardSize;
        this.costs = Support.RectangularArrays.ReturnRectangularArray<short>(backwardSize, forwardSize);
    }

    /// <summary>
    /// Stores <paramref name="cost"/> (narrowed to <see cref="short"/>) for the given id pair.
    /// </summary>
    public void Add(int forwardId, int backwardId, int cost)
    {
        short[] row = this.costs[backwardId];
        row[forwardId] = (short)cost;
    }

    /// <summary>
    /// Writes the header, both sizes and all costs to the file derived from
    /// <paramref name="baseDir"/> and the full type name of <see cref="ConnectionCosts"/>.
    /// </summary>
    /// <param name="baseDir">Directory under which the output file is created.</param>
    public void Write(string baseDir)
    {
        // The namespace-qualified type name becomes the relative directory layout.
        string relativePath = typeof(ConnectionCosts).FullName.Replace('.', System.IO.Path.DirectorySeparatorChar)
            + ConnectionCosts.FILENAME_SUFFIX;
        string filename = baseDir + System.IO.Path.DirectorySeparatorChar + relativePath;

        System.IO.Directory.CreateDirectory(System.IO.Path.GetDirectoryName(filename));

        using (Stream os = new FileStream(filename, FileMode.Create, FileAccess.Write))
        {
            DataOutput @out = new OutputStreamDataOutput(os);
            CodecUtil.WriteHeader(@out, ConnectionCosts.HEADER, ConnectionCosts.VERSION);
            @out.WriteVInt32(forwardSize);
            @out.WriteVInt32(backwardSize);

            Debug.Assert(costs.Length == backwardSize);

            // Each value is written as the difference from the previously written value,
            // with the sign folded into the low bit so the result is non-negative.
            int previous = 0;
            foreach (short[] row in costs)
            {
                Debug.Assert(row.Length == forwardSize);
                foreach (short value in row)
                {
                    int delta = value - previous;
                    @out.WriteVInt32((delta >> 31) ^ (delta << 1));
                    previous = value;
                }
            }
        }
    }
}
/// <summary>
/// Command-line driver that builds the token info dictionary, the unknown word
/// dictionary and the connection costs from a source dictionary directory.
/// </summary>
public class DictionaryBuilder
{
    /// <summary>Layout of the source dictionary files.</summary>
    public enum DictionaryFormat { IPADIC, UNIDIC };

    // Only static members are exposed; instances are never created.
    private DictionaryBuilder()
    {
    }

    /// <summary>
    /// Runs the three build phases in order, writing each result below <paramref name="outputDirname"/>.
    /// </summary>
    /// <param name="format">Layout of the source CSV entries.</param>
    /// <param name="inputDirname">Directory holding the source files (CSV files, unk.def, char.def, matrix.def).</param>
    /// <param name="outputDirname">Base directory for the generated files.</param>
    /// <param name="encoding">Name of the text encoding of the source files.</param>
    /// <param name="normalizeEntry">Whether the token info builder should also add normalized entries.</param>
    public static void Build(DictionaryFormat format,
        string inputDirname,
        string outputDirname,
        string encoding,
        bool normalizeEntry)
    {
        Console.WriteLine("building tokeninfo dict...");
        TokenInfoDictionaryBuilder tokenInfoBuilder = new TokenInfoDictionaryBuilder(format, encoding, normalizeEntry);
        TokenInfoDictionaryWriter tokenInfoDictionary = tokenInfoBuilder.Build(inputDirname);
        tokenInfoDictionary.Write(outputDirname);
        // Drop the references so the finished phase can be collected before the next one starts.
        tokenInfoDictionary = null;
        tokenInfoBuilder = null;
        Console.WriteLine("done");

        Console.WriteLine("building unknown word dict...");
        UnknownDictionaryBuilder unkBuilder = new UnknownDictionaryBuilder(encoding);
        UnknownDictionaryWriter unkDictionary = unkBuilder.Build(inputDirname);
        unkDictionary.Write(outputDirname);
        unkDictionary = null;
        unkBuilder = null;
        Console.WriteLine("done");

        Console.WriteLine("building connection costs...");
        ConnectionCostsWriter connectionCosts
            = ConnectionCostsBuilder.Build(inputDirname + System.IO.Path.DirectorySeparatorChar + "matrix.def");
        connectionCosts.Write(outputDirname);
        Console.WriteLine("done");
    }

    /// <summary>
    /// Entry point. Expects five arguments: format (ipadic or unidic), input directory,
    /// output directory, input encoding name, and a boolean telling whether to normalize entries.
    /// </summary>
    /// <exception cref="ArgumentException">Fewer than five arguments were supplied.</exception>
    public static void Main(string[] args)
    {
        // Fail with a usage message instead of an index error when arguments are missing.
        if (args == null || args.Length < 5)
        {
            throw new ArgumentException(
                "Usage: DictionaryBuilder <ipadic|unidic> <inputDirname> <outputDirname> <inputEncoding> <normalizeEntries: true|false>",
                "args");
        }

        DictionaryFormat format;
        if (args[0].Equals("ipadic", StringComparison.OrdinalIgnoreCase))
        {
            format = DictionaryFormat.IPADIC;
        }
        else if (args[0].Equals("unidic", StringComparison.OrdinalIgnoreCase))
        {
            format = DictionaryFormat.UNIDIC;
        }
        else
        {
            // The fallback is IPADIC; the message now names the format that is actually used.
            Console.Error.WriteLine("Illegal format " + args[0] + " using ipadic instead");
            format = DictionaryFormat.IPADIC;
        }

        string inputDirname = args[1];
        string outputDirname = args[2];
        string inputEncoding = args[3];
        bool normalizeEntries = bool.Parse(args[4]);

        Console.WriteLine("dictionary builder");
        Console.WriteLine();
        Console.WriteLine("dictionary format: " + format);
        Console.WriteLine("input directory: " + inputDirname);
        Console.WriteLine("output directory: " + outputDirname);
        Console.WriteLine("input encoding: " + inputEncoding);
        Console.WriteLine("normalize entries: " + normalizeEntries);
        Console.WriteLine();
        DictionaryBuilder.Build(format, inputDirname, outputDirname, inputEncoding, normalizeEntries);
    }
}
/// <summary>
/// Reads dictionary CSV files, sorts their entries by surface form and encodes them
/// into a <see cref="TokenInfoDictionaryWriter"/> together with an FST over the surface forms.
/// </summary>
public class TokenInfoDictionaryBuilder
{
    /// <summary>Internal word id - incrementally assigned as entries are read and added. This will be byte offset of dictionary file</summary>
    private int offset = 0;

    private string encoding = "euc-jp";

    private bool normalizeEntries = false;

    private DictionaryBuilder.DictionaryFormat format = DictionaryBuilder.DictionaryFormat.IPADIC;

    /// <summary>
    /// Creates a builder.
    /// </summary>
    /// <param name="format">Layout of the CSV entries.</param>
    /// <param name="encoding">Name of the text encoding of the CSV files.</param>
    /// <param name="normalizeEntries">When true, an NFKC-normalized copy is added for entries whose surface form is not already normalized.</param>
    public TokenInfoDictionaryBuilder(DictionaryBuilder.DictionaryFormat format, string encoding, bool normalizeEntries)
    {
        this.format = format;
        this.encoding = encoding;
        this.normalizeEntries = normalizeEntries;
    }

    /// <summary>
    /// Builds the dictionary from every "*.csv" file found directly in <paramref name="dirname"/>.
    /// </summary>
    public virtual TokenInfoDictionaryWriter Build(string dirname)
    {
        List<string> csvFiles = new List<string>();
        foreach (FileInfo file in new DirectoryInfo(dirname).EnumerateFiles("*.csv"))
        {
            csvFiles.Add(file.FullName);
        }

        // Ordinal comparison keeps the file order (and therefore the generated
        // dictionary) independent of the current culture of the build machine.
        csvFiles.Sort(StringComparer.Ordinal);
        return BuildDictionary(csvFiles);
    }

    /// <summary>
    /// Parses the given CSV files in order, sorts all entries by surface form and
    /// encodes them into a new dictionary writer.
    /// </summary>
    /// <param name="csvFiles">Paths of the CSV files to read.</param>
    /// <returns>The populated dictionary writer, with its FST set.</returns>
    public virtual TokenInfoDictionaryWriter BuildDictionary(IList<string> csvFiles)
    {
        TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);

        // all lines in the file
        Console.WriteLine(" parse...");
        List<string[]> lines = new List<string[]>(400000);
        foreach (string file in csvFiles)
        {
            using (Stream inputStream = new FileStream(file, FileMode.Open, FileAccess.Read))
            using (TextReader reader = new StreamReader(inputStream, Encoding.GetEncoding(encoding)))
            {
                string line = null;
                while ((line = reader.ReadLine()) != null)
                {
                    string[] entry = CSVUtil.Parse(line);

                    if (entry.Length < 13)
                    {
                        Console.WriteLine("Entry in CSV is not valid: " + line);
                        continue;
                    }

                    lines.Add(FormatEntry(entry));

                    // NFKC normalize dictionary entry
                    if (normalizeEntries)
                    {
                        if (entry[0].IsNormalized(NormalizationForm.FormKC))
                        {
                            continue;
                        }

                        string[] normalizedEntry = new string[entry.Length];
                        for (int i = 0; i < entry.Length; i++)
                        {
                            normalizedEntry[i] = entry[i].Normalize(NormalizationForm.FormKC);
                        }

                        lines.Add(FormatEntry(normalizedEntry));
                    }
                }
            }
        }

        Console.WriteLine(" sort...");

        // sort by term: we sorted the files already and need a stable sort so that
        // entries sharing a surface form keep their file order. List<T>.Sort is not
        // stable, Enumerable.OrderBy is.
        List<string[]> sortedLines = System.Linq.Enumerable.ToList(
            System.Linq.Enumerable.OrderBy<string[], string[]>(lines, entry => entry, new ComparerAnonymousHelper()));

        Console.WriteLine(" encode...");

        PositiveInt32Outputs fstOutput = PositiveInt32Outputs.Singleton;
        Builder<long?> fstBuilder = new Builder<long?>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2, 0, 0, true, true, int.MaxValue, fstOutput, null, true, PackedInt32s.DEFAULT, true, 15);
        Int32sRef scratch = new Int32sRef();
        long ord = -1; // first ord will be 0
        string lastValue = null;

        // build tokeninfo dictionary
        foreach (string[] entry in sortedLines)
        {
            int next = dictionary.Put(entry);

            // Put returning the unchanged offset means nothing was written for this entry.
            if (next == offset)
            {
                Console.WriteLine("Failed to process line: " + Collections.ToString(entry));
                continue;
            }

            string token = entry[0];
            if (!token.Equals(lastValue, StringComparison.Ordinal))
            {
                // new word to add to fst
                ord++;
                lastValue = token;
                scratch.Grow(token.Length);
                scratch.Length = token.Length;
                for (int i = 0; i < token.Length; i++)
                {
                    scratch.Int32s[i] = (int)token[i];
                }
                fstBuilder.Add(scratch, ord);
            }

            // Every entry of the same surface form maps from the shared ord to its own offset.
            dictionary.AddMapping((int)ord, offset);
            offset = next;
        }

        FST<long?> fst = fstBuilder.Finish();

        Console.WriteLine(" " + fst.NodeCount + " nodes, " + fst.ArcCount + " arcs, " + fst.GetSizeInBytes() + " bytes... ");
        dictionary.SetFST(fst);
        Console.WriteLine(" done");

        return dictionary;
    }

    /// <summary>Orders entries by their surface form (field 0) using ordinal comparison.</summary>
    private class ComparerAnonymousHelper : IComparer<string[]>
    {
        public int Compare(string[] left, string[] right)
        {
            return left[0].CompareToOrdinal(right[0]);
        }
    }

    /// <summary>
    /// Converts an entry to the 13-field IPADIC layout. IPADIC entries are returned as is.
    /// <para/>
    /// IPADIC features
    /// <para/>
    /// 0 - surface,
    /// 1 - left cost,
    /// 2 - right cost,
    /// 3 - word cost,
    /// 4-9 - pos,
    /// 10 - base form,
    /// 11 - reading,
    /// 12 - pronunciation
    /// <para/>
    /// UniDic features
    /// <para/>
    /// 0 - surface,
    /// 1 - left cost,
    /// 2 - right cost,
    /// 3 - word cost,
    /// 4-9 - pos,
    /// 10 - base form reading,
    /// 11 - base form,
    /// 12 - surface form,
    /// 13 - surface reading
    /// </summary>
    /// <param name="features">Fields of one parsed CSV entry.</param>
    /// <returns>The same array for IPADIC; a new 13-element array for UniDic.</returns>
    public virtual string[] FormatEntry(string[] features)
    {
        if (this.format == DictionaryBuilder.DictionaryFormat.IPADIC)
        {
            return features;
        }

        string[] features2 = new string[13];

        // Surface, costs and part-of-speech fields occupy the same positions in both layouts.
        for (int i = 0; i < 10; i++)
        {
            features2[i] = features[i];
        }

        features2[10] = features[11];

        // If the surface reading is non-existent, use surface form for reading and pronunciation.
        // This happens with punctuation in UniDic and there are possibly other cases as well
        string reading = features[13].Length == 0 ? features[0] : features[13];
        features2[11] = reading;
        features2[12] = reading;

        return features2;
    }
}
/// <summary>
/// Dictionary writer for <see cref="TokenInfoDictionary"/> that, in addition to the
/// base output, saves an FST to its own file.
/// </summary>
public class TokenInfoDictionaryWriter : BinaryDictionaryWriter
{
    private FST<long?> fst;

    /// <summary>
    /// Creates a writer; <paramref name="size"/> is forwarded to the base class.
    /// </summary>
    public TokenInfoDictionaryWriter(int size)
        : base(typeof(TokenInfoDictionary), size)
    {
    }

    /// <summary>
    /// Sets the FST that <see cref="Write(string)"/> will save.
    /// </summary>
    public virtual void SetFST(FST<long?> fst)
    {
        this.fst = fst;
    }

    /// <summary>
    /// Performs the base write, then saves the FST next to it using the FST file suffix.
    /// </summary>
    public override void Write(string baseDir)
    {
        base.Write(baseDir);

        string fstFileName = GetBaseFileName(baseDir) + TokenInfoDictionary.FST_FILENAME_SUFFIX;
        WriteFST(fstFileName);
    }

    /// <summary>
    /// Saves the FST to <paramref name="filename"/>, creating the parent directory when it is missing.
    /// </summary>
    protected virtual void WriteFST(string filename)
    {
        FileInfo target = new FileInfo(filename);
        DirectoryInfo parent = target.Directory;
        if (!parent.Exists)
        {
            parent.Create();
        }

        fst.Save(target);
    }
}
/// <summary>
/// Builds an <see cref="UnknownDictionaryWriter"/> from the "unk.def" and "char.def"
/// files of a source dictionary directory.
/// </summary>
public class UnknownDictionaryBuilder
{
    // Synthetic entry that is always put first. The part-of-speech fields are Japanese text;
    // they were restored here from a mis-decoded (UTF-8 read as Latin-1) form.
    private static readonly string NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*";

    private string encoding = "euc-jp";

    /// <summary>
    /// Creates a builder that reads its input files with the named <paramref name="encoding"/>.
    /// </summary>
    public UnknownDictionaryBuilder(string encoding)
    {
        this.encoding = encoding;
    }

    /// <summary>
    /// Reads "unk.def" and "char.def" from <paramref name="dirname"/> into a new dictionary writer.
    /// </summary>
    public virtual UnknownDictionaryWriter Build(string dirname)
    {
        UnknownDictionaryWriter unkDictionary = ReadDictionaryFile(dirname + System.IO.Path.DirectorySeparatorChar + "unk.def"); //Should be only one file
        ReadCharacterDefinition(dirname + System.IO.Path.DirectorySeparatorChar + "char.def", unkDictionary);
        return unkDictionary;
    }

    /// <summary>
    /// Reads the unknown word definition file using this builder's encoding.
    /// </summary>
    public virtual UnknownDictionaryWriter ReadDictionaryFile(string filename)
    {
        return ReadDictionaryFile(filename, encoding);
    }

    /// <summary>
    /// Reads the unknown word definition file, sorts its entries by character class id
    /// and puts them into a new dictionary writer after the NGRAM entry.
    /// </summary>
    /// <param name="filename">Path of the definition file.</param>
    /// <param name="encoding">Name of the text encoding of the file.</param>
    public virtual UnknownDictionaryWriter ReadDictionaryFile(string filename, string encoding)
    {
        UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);

        List<string[]> lines = new List<string[]>();
        Encoding decoder = Encoding.GetEncoding(encoding);
        using (Stream inputStream = new FileStream(filename, FileMode.Open, FileAccess.Read))
        using (TextReader reader = new StreamReader(inputStream, decoder))
        {
            dictionary.Put(CSVUtil.Parse(NGRAM_DICTIONARY_ENTRY));

            string line = null;
            while ((line = reader.ReadLine()) != null)
            {
                // note: unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation,
                // even though the unknown dictionary returns hardcoded null here.
                string[] parsed = CSVUtil.Parse(line + ",*,*"); // Probably we don't need to validate entry
                lines.Add(parsed);
            }
        }

        // A stable sort keeps entries of the same character class in file order;
        // List<T>.Sort gives no such guarantee, Enumerable.OrderBy does.
        IEnumerable<string[]> sortedLines =
            System.Linq.Enumerable.OrderBy<string[], string[]>(lines, entry => entry, new ComparerAnonymousHelper());

        foreach (string[] entry in sortedLines)
        {
            dictionary.Put(entry);
        }

        return dictionary;
    }

    /// <summary>Orders entries by the character class id looked up from field 0.</summary>
    private class ComparerAnonymousHelper : IComparer<string[]>
    {
        public int Compare(string[] left, string[] right)
        {
            int leftId = CharacterDefinition.LookupCharacterClass(left[0]);
            int rightId = CharacterDefinition.LookupCharacterClass(right[0]);
            return leftId - rightId;
        }
    }

    /// <summary>
    /// Reads the character definition file. Lines starting with "0x" map a code point
    /// (or an inclusive "from..to" range) to a character class name; all other non-empty
    /// lines are invoke definitions of the form "name invoke group length".
    /// </summary>
    /// <param name="filename">Path of the character definition file.</param>
    /// <param name="dictionary">Writer that receives the parsed definitions.</param>
    public virtual void ReadCharacterDefinition(string filename, UnknownDictionaryWriter dictionary)
    {
        using (Stream inputStream = new FileStream(filename, FileMode.Open, FileAccess.Read))
        using (TextReader reader = new StreamReader(inputStream, Encoding.GetEncoding(encoding)))
        {
            string line = null;

            while ((line = reader.ReadLine()) != null)
            {
                line = Regex.Replace(line, "^\\s", "");     // drop one leading whitespace character
                line = Regex.Replace(line, "\\s*#.*", "");  // drop a trailing comment
                line = Regex.Replace(line, "\\s+", " ");    // collapse whitespace runs

                // Skip empty line or comment line
                if (line.Length == 0)
                {
                    continue;
                }

                if (line.StartsWith("0x", StringComparison.Ordinal))
                { // Category mapping
                    string[] values = line.Split(new char[] { ' ' }, 2); // Split only first space

                    if (!values[0].Contains(".."))
                    {
                        int cp = ParseCodePoint(values[0]);
                        dictionary.PutCharacterCategory(cp, values[1]);
                    }
                    else
                    {
                        string[] codePoints = Regex.Split(values[0], "\\.\\.");
                        int cpFrom = ParseCodePoint(codePoints[0]);
                        int cpTo = ParseCodePoint(codePoints[1]);

                        for (int i = cpFrom; i <= cpTo; i++)
                        {
                            dictionary.PutCharacterCategory(i, values[1]);
                        }
                    }
                }
                else
                { // Invoke definition
                    string[] values = line.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries); // Consecutive space is merged above
                    string characterClassName = values[0];
                    int invoke = int.Parse(values[1], CultureInfo.InvariantCulture);
                    int group = int.Parse(values[2], CultureInfo.InvariantCulture);
                    int length = int.Parse(values[3], CultureInfo.InvariantCulture);
                    dictionary.PutInvokeDefinition(characterClassName, invoke, group, length);
                }
            }
        }
    }

    /// <summary>
    /// Parses a hexadecimal code point such as "0x3042". Parsing with base 16 accepts the
    /// "0x" prefix, whereas the single-argument overload would treat the text as decimal and throw.
    /// NOTE(review): assumes the end of a "from..to" range is also written in hexadecimal - confirm against char.def.
    /// </summary>
    private static int ParseCodePoint(string text)
    {
        return Convert.ToInt32(text, 16);
    }
}
/// <summary>
/// Dictionary writer for <see cref="UnknownDictionary"/> that also owns and writes
/// the character definition data.
/// </summary>
public class UnknownDictionaryWriter : BinaryDictionaryWriter
{
    private readonly CharacterDefinitionWriter characterDefinition = new CharacterDefinitionWriter();

    /// <summary>
    /// Creates a writer; <paramref name="size"/> is forwarded to the base class.
    /// </summary>
    public UnknownDictionaryWriter(int size)
        : base(typeof(UnknownDictionary), size)
    {
    }

    /// <summary>
    /// Puts the entry through the base class and maps the character class of its
    /// first field to the buffer position the entry was written at.
    /// </summary>
    /// <returns>The value returned by the base implementation.</returns>
    public override int Put(string[] entry)
    {
        // The word id is the buffer position before the entry is written.
        int startPosition = m_buffer.Position;

        int written = base.Put(entry);

        // Register the entry in the target map under its character class.
        AddMapping(CharacterDefinition.LookupCharacterClass(entry[0]), startPosition);

        return written;
    }

    /// <summary>
    /// Put mapping from unicode code point to character class.
    /// </summary>
    /// <param name="codePoint">Code point.</param>
    /// <param name="characterClassName">Character class name.</param>
    public virtual void PutCharacterCategory(int codePoint, string characterClassName)
    {
        characterDefinition.PutCharacterCategory(codePoint, characterClassName);
    }

    /// <summary>
    /// Forwards an invoke definition to the character definition writer.
    /// </summary>
    public virtual void PutInvokeDefinition(string characterClassName, int invoke, int group, int length)
    {
        characterDefinition.PutInvokeDefinition(characterClassName, invoke, group, length);
    }

    /// <summary>
    /// Writes the dictionary and then the character definition below <paramref name="baseDir"/>.
    /// </summary>
    public override void Write(string baseDir)
    {
        base.Write(baseDir);
        characterDefinition.Write(baseDir);
    }
}
/// <summary>
/// Utility class for parsing CSV text
/// </summary>
public sealed class CSVUtil
{
    private const char QUOTE = '"';

    private const char COMMA = ',';

    // Matches a value that is wrapped in one pair of quotes and has no quote inside.
    private static readonly Regex QUOTE_REPLACE_PATTERN = new Regex("^\"([^\"]+)\"$", RegexOptions.Compiled);

    private const string ESCAPED_QUOTE = "\"\"";

    private CSVUtil() { } // no instance!!!

    /// <summary>
    /// Parse CSV line
    /// </summary>
    /// <param name="line">line containing csv-encoded data</param>
    /// <returns>Array of values, or an empty array when the line contains an odd number of quotes</returns>
    public static string[] Parse(string line)
    {
        bool insideQuote = false;
        List<string> result = new List<string>();
        int quoteCount = 0;
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < line.Length; i++)
        {
            char c = line[i];

            if (c == QUOTE)
            {
                insideQuote = !insideQuote;
                quoteCount++;
            }

            // A comma only terminates a field when it is outside quotes.
            if (c == COMMA && !insideQuote)
            {
                result.Add(UnQuoteUnEscape(sb.ToString()));
                sb.Length = 0;
                continue;
            }

            sb.Append(c);
        }

        // The final field is added as is (it is not unquoted).
        result.Add(sb.ToString());

        // Validate
        if (quoteCount % 2 != 0)
        {
            return new string[0];
        }

        return result.ToArray();
    }

    /// <summary>
    /// Removes one pair of surrounding quotes, if present, and turns doubled quotes into single ones.
    /// </summary>
    private static string UnQuoteUnEscape(string original)
    {
        string result = original;

        // Unquote
        if (result.IndexOf('\"') >= 0)
        {
            Match m = QUOTE_REPLACE_PATTERN.Match(original);
            if (m.Success)
            {
                result = m.Groups[1].Value;
            }

            // Unescape (ordinal search: this is data, not linguistic text)
            if (result.IndexOf(ESCAPED_QUOTE, System.StringComparison.Ordinal) >= 0)
            {
                result = result.Replace(ESCAPED_QUOTE, "\"");
            }
        }

        return result;
    }

    /// <summary>
    /// Quote and escape input value for CSV
    /// </summary>
    /// <param name="original">Raw value.</param>
    /// <returns>
    /// The value with every quote doubled, additionally wrapped in quotes when it contains a comma.
    /// </returns>
    public static string QuoteEscape(string original)
    {
        string result = original;

        if (result.IndexOf('\"') >= 0)
        {
            // Strings are immutable: the escaped text must be assigned back.
            result = result.Replace("\"", ESCAPED_QUOTE);
        }
        if (result.IndexOf(COMMA) >= 0)
        {
            result = "\"" + result + "\"";
        }
        return result;
    }
}
