http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizer.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizer.cs b/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizer.cs new file mode 100644 index 0000000..4690549 --- /dev/null +++ b/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizer.cs @@ -0,0 +1,1489 @@ +using Lucene.Net.Analysis.Ja.Dict; +using Lucene.Net.Analysis.Ja.TokenAttributes; +using Lucene.Net.Analysis.TokenAttributes; +using Lucene.Net.Analysis.Util; +using Lucene.Net.Support; +using Lucene.Net.Util; +using Lucene.Net.Util.Fst; +using System; +using System.Collections.Generic; +using System.Diagnostics; +using System.Globalization; +using System.IO; +using System.Linq; +using System.Threading; + +namespace Lucene.Net.Analysis.Ja +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Tokenizer for Japanese that uses morphological analysis. 
+ /// </summary> + /// <remarks> + /// This tokenizer sets a number of additional attributes: + /// <list type="bullet"> + /// <item><description><see cref="IBaseFormAttribute"/> containing base form for inflected adjectives and verbs.</description></item> + /// <item><description><see cref="IPartOfSpeechAttribute"/> containing part-of-speech.</description></item> + /// <item><description><see cref="IReadingAttribute"/> containing reading and pronunciation.</description></item> + /// <item><description><see cref="IInflectionAttribute"/> containing additional part-of-speech information for inflected forms.</description></item> + /// </list> + /// <para/> + /// This tokenizer uses a rolling Viterbi search to find the + /// least cost segmentation (path) of the incoming characters. + /// For tokens that appear to be compound (> length 2 for all + /// Kanji, or > length 7 for non-Kanji), we see if there is a + /// 2nd best segmentation of that token after applying + /// penalties to the long tokens. If so, and the Mode is + /// <see cref="JapaneseTokenizerMode.SEARCH"/>, we output the alternate segmentation + /// as well. + /// </remarks> + public sealed class JapaneseTokenizer : Tokenizer + { + // LUCENENET specific: de-nested Mode and renamed JapaneseTokenizerMode + + /// <summary> + /// Default tokenization mode. Currently this is <see cref="JapaneseTokenizerMode.SEARCH"/>. 
+ /// </summary> + public static readonly JapaneseTokenizerMode DEFAULT_MODE = JapaneseTokenizerMode.SEARCH; + + // LUCENENET specific: de-nested Type and renamed JapaneseTokenizerType + + + private static readonly bool VERBOSE = false; + + private static readonly int SEARCH_MODE_KANJI_LENGTH = 2; + + private static readonly int SEARCH_MODE_OTHER_LENGTH = 7; // Must be >= SEARCH_MODE_KANJI_LENGTH + + private static readonly int SEARCH_MODE_KANJI_PENALTY = 3000; + + private static readonly int SEARCH_MODE_OTHER_PENALTY = 1700; + + // For safety: + private static readonly int MAX_UNKNOWN_WORD_LENGTH = 1024; + private static readonly int MAX_BACKTRACE_GAP = 1024; + + private readonly IDictionary<JapaneseTokenizerType, IDictionary> dictionaryMap = new Dictionary<JapaneseTokenizerType, IDictionary>(); + + private readonly TokenInfoFST fst; + private readonly TokenInfoDictionary dictionary; + private readonly UnknownDictionary unkDictionary; + private readonly ConnectionCosts costs; + private readonly UserDictionary userDictionary; + private readonly CharacterDefinition characterDefinition; + + private readonly FST.Arc<long?> arc = new FST.Arc<long?>(); + private readonly FST.BytesReader fstReader; + private readonly Int32sRef wordIdRef = new Int32sRef(); + + private readonly FST.BytesReader userFSTReader; + private readonly TokenInfoFST userFST; + + private readonly RollingCharBuffer buffer = new RollingCharBuffer(); + + private readonly WrappedPositionArray positions = new WrappedPositionArray(); + + private readonly bool discardPunctuation; + private readonly bool searchMode; + private readonly bool extendedMode; + private readonly bool outputCompounds; + + // Index of the last character of unknown word: + private int unknownWordEndIndex = -1; + + // True once we've hit the EOF from the input reader: + private bool end; + + // Last absolute position we backtraced from: + private int lastBackTracePos; + + // Position of last token we returned; we use this to + // 
/// <summary>
/// Initializes a new <see cref="JapaneseTokenizer"/> using
/// <see cref="AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY"/>.
/// <para/>
/// All work is delegated to the overload that takes an explicit factory.
/// </summary>
/// <param name="input"><see cref="TextReader"/> supplying the text to tokenize.</param>
/// <param name="userDictionary">Optional user dictionary; may be <c>null</c>.</param>
/// <param name="discardPunctuation"><c>true</c> to drop punctuation tokens from the output.</param>
/// <param name="mode">Tokenization mode.</param>
public JapaneseTokenizer(
    TextReader input,
    UserDictionary userDictionary,
    bool discardPunctuation,
    JapaneseTokenizerMode mode)
    : this(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, input, userDictionary, discardPunctuation, mode)
{
}
/// <summary>
/// Initializes a new <see cref="JapaneseTokenizer"/> with an explicit attribute factory.
/// </summary>
/// <param name="factory">The <see cref="AttributeFactory"/> passed to the base tokenizer.</param>
/// <param name="input"><see cref="TextReader"/> supplying the text to tokenize.</param>
/// <param name="userDictionary">Optional user dictionary; may be <c>null</c>.</param>
/// <param name="discardPunctuation"><c>true</c> to drop punctuation tokens from the output.</param>
/// <param name="mode">Tokenization mode.</param>
public JapaneseTokenizer(
    AttributeFactory factory,
    TextReader input,
    UserDictionary userDictionary,
    bool discardPunctuation,
    JapaneseTokenizerMode mode)
    : base(factory, input)
{
    // Attributes populated by IncrementToken().
    termAtt = AddAttribute<ICharTermAttribute>();
    offsetAtt = AddAttribute<IOffsetAttribute>();
    posIncAtt = AddAttribute<IPositionIncrementAttribute>();
    posLengthAtt = AddAttribute<IPositionLengthAttribute>();
    basicFormAtt = AddAttribute<IBaseFormAttribute>();
    posAtt = AddAttribute<IPartOfSpeechAttribute>();
    readingAtt = AddAttribute<IReadingAttribute>();
    inflectionAtt = AddAttribute<IInflectionAttribute>();

    // System dictionaries, connection costs and the FST reader used for lookups.
    dictionary = TokenInfoDictionary.GetInstance();
    fst = dictionary.FST;
    unkDictionary = UnknownDictionary.GetInstance();
    characterDefinition = unkDictionary.CharacterDefinition;
    this.userDictionary = userDictionary;
    costs = ConnectionCosts.GetInstance();
    fstReader = fst.GetBytesReader();

    // The user dictionary is optional; without it there is no user FST to walk.
    if (userDictionary == null)
    {
        userFST = null;
        userFSTReader = null;
    }
    else
    {
        userFST = userDictionary.FST;
        userFSTReader = userFST.GetBytesReader();
    }

    this.discardPunctuation = discardPunctuation;

    // SEARCH and EXTENDED both enable search mode; only SEARCH emits compounds,
    // only EXTENDED enables extended mode. Any other mode leaves all three off.
    bool isSearch = mode == JapaneseTokenizerMode.SEARCH;
    bool isExtended = mode == JapaneseTokenizerMode.EXTENDED;
    searchMode = isSearch || isExtended;
    extendedMode = isExtended;
    outputCompounds = isSearch;

    buffer.Reset(m_input);
    ResetState();

    // Lookup table used by GetDict(); the USER entry is null when no user dictionary was given.
    dictionaryMap[JapaneseTokenizerType.KNOWN] = dictionary;
    dictionaryMap[JapaneseTokenizerType.UNKNOWN] = unkDictionary;
    dictionaryMap[JapaneseTokenizerType.USER] = userDictionary;
}
// Receives a callback on every backtrace when non-null.
private GraphvizFormatter dotOut;

// LUCENENET specific - added getter and made into property
// so we can set this during object initialization.

/// <summary>
/// Expert: assign a formatter to produce graphviz (dot) output of
/// the Viterbi lattice.
/// </summary>
public GraphvizFormatter GraphvizFormatter
{
    get { return dotOut; }
    set { dotOut = value; }
}

/// <summary>
/// Disposes the base tokenizer and, when <paramref name="disposing"/> is
/// <c>true</c>, re-points the rolling buffer at the current input reader.
/// </summary>
protected override void Dispose(bool disposing)
{
    base.Dispose(disposing);
    if (!disposing)
    {
        return;
    }

    buffer.Reset(m_input);
}

/// <summary>
/// Resets the base tokenizer, rebinds the rolling buffer to the input
/// reader and clears all parsing state.
/// </summary>
public override void Reset()
{
    base.Reset();
    buffer.Reset(m_input);
    ResetState();
}

// Returns every piece of per-stream state to its initial value and
// seeds the lattice with the beginning-of-sentence node.
private void ResetState()
{
    pending.Clear();
    end = false;
    pos = 0;
    lastBackTracePos = 0;
    lastTokenPos = -1;
    unknownWordEndIndex = -1;

    positions.Reset();
    // Add BOS:
    positions.Get(0).Add(0, 0, -1, -1, -1, JapaneseTokenizerType.KNOWN);
}

/// <summary>
/// Finishes the stream: after the base class has run, both start and end
/// offset are set to the corrected offset of the current position.
/// </summary>
public override void End()
{
    base.End();
    // Set final offset
    int corrected = CorrectOffset(pos);
    offsetAtt.SetOffset(corrected, corrected);
}
// Returns the extra cost a 2nd best segmentation may carry. If a path
// with cost X ends in a compound word and this returns a threshold > 0,
// the 2nd best segmentation is also searched for, and it is included in
// the output when its path score is within this threshold of X.
private int ComputeSecondBestThreshold(int pos, int length)
{
    // TODO: maybe we do something else here, instead of just
    // using the penalty...? EG we can be more aggressive on
    // when to also test for 2nd best path
    int threshold = ComputePenalty(pos, length);
    return threshold;
}

// Computes the search-mode penalty for the span of `length` characters
// starting at `pos`: spans longer than SEARCH_MODE_KANJI_LENGTH that are
// entirely Kanji are penalized per extra character; other spans are
// penalized only beyond SEARCH_MODE_OTHER_LENGTH. Returns 0 otherwise.
private int ComputePenalty(int pos, int length)
{
    if (length <= SEARCH_MODE_KANJI_LENGTH)
    {
        return 0;
    }

    // Stop scanning at the first non-Kanji character.
    bool onlyKanji = true;
    for (int i = 0; i < length && onlyKanji; i++)
    {
        onlyKanji = characterDefinition.IsKanji((char)buffer.Get(pos + i));
    }

    if (onlyKanji)
    {
        // Process only Kanji keywords
        return (length - SEARCH_MODE_KANJI_LENGTH) * SEARCH_MODE_KANJI_PENALTY;
    }

    if (length > SEARCH_MODE_OTHER_LENGTH)
    {
        return (length - SEARCH_MODE_OTHER_LENGTH) * SEARCH_MODE_OTHER_PENALTY;
    }

    return 0;
}

// LUCENENET specific - de-nested Position class

// Records, at position `endPos`, the cheapest way of reaching it through
// word `wordID` of `dict` when starting from any path that arrives at
// `fromPosData`. The stored cost is: best (path cost + connection cost)
// + word cost + optional search-mode penalty.
private void Add(IDictionary dict, Position fromPosData, int endPos, int wordID, JapaneseTokenizerType type, bool addPenalty)
{
    int wordCost = dict.GetWordCost(wordID);
    int leftID = dict.GetLeftId(wordID);
    Debug.Assert(fromPosData.count > 0);

    int bestCost = int.MaxValue;
    int bestIndex = -1;
    for (int i = 0; i < fromPosData.count; i++)
    {
        // Cost is path cost so far, plus word cost (added after
        // the loop), plus bigram cost:
        int candidate = fromPosData.costs[i] + costs.Get(fromPosData.lastRightID[i], leftID);
        if (VERBOSE)
        {
            Console.WriteLine(" fromIDX=" + i + ": cost=" + candidate + " (prevCost=" + fromPosData.costs[i] + " wordCost=" + wordCost + " bgCost=" + costs.Get(fromPosData.lastRightID[i], leftID) + " leftID=" + leftID);
        }

        if (candidate >= bestCost)
        {
            continue;
        }

        bestCost = candidate;
        bestIndex = i;
        if (VERBOSE)
        {
            Console.WriteLine(" **");
        }
    }

    bestCost += wordCost;

    if (VERBOSE)
    {
        Console.WriteLine(" + cost=" + bestCost + " wordID=" + wordID + " leftID=" + leftID + " leastIDX=" + bestIndex + " toPos=" + endPos + " toPos.idx=" + positions.Get(endPos).count);
    }

    // User-dictionary entries are never penalized; otherwise penalize when
    // explicitly requested, or in search mode when compounds are not output.
    bool penalize = type != JapaneseTokenizerType.USER
        && (addPenalty || (searchMode && !outputCompounds));
    if (penalize)
    {
        int penalty = ComputePenalty(fromPosData.pos, endPos - fromPosData.pos);
        if (VERBOSE && penalty > 0)
        {
            Console.WriteLine(" + penalty=" + penalty + " cost=" + (bestCost + penalty));
        }
        bestCost += penalty;
    }

    // The left id is stored as the arriving right id; the assert checks they agree.
    Debug.Assert(leftID == dict.GetRightId(wordID));
    positions.Get(endPos).Add(bestCost, leftID, fromPosData.pos, bestIndex, wordID, type);
}
/// <summary>
/// Advances to the next token, copying its term text, offsets, position
/// increment/length and the Japanese-specific attributes into this stream.
/// </summary>
/// <returns>
/// <c>true</c> if a token was produced; <c>false</c> once the input is
/// exhausted and no pending tokens remain.
/// </returns>
public override bool IncrementToken()
{
    // Parse() is able to return w/o producing any new
    // tokens, when the tokens it had produced were entirely
    // punctuation. So we loop here until we get a real
    // token or we end:
    while (pending.Count == 0)
    {
        if (end)
        {
            return false;
        }

        // Push Viterbi forward some more:
        Parse();
    }

    // The pending list is filled in reverse, so the next token to emit is
    // its last element. Take and remove it by index: Remove(item) would do
    // a linear equality search from the front of the list and could drop
    // an equal element that sits earlier instead of the last slot.
    int lastIndex = pending.Count - 1;
    Token token = pending[lastIndex];
    pending.RemoveAt(lastIndex);

    int position = token.Position;
    int length = token.Length;
    ClearAttributes();
    Debug.Assert(length > 0);
    //System.out.println("off=" + token.getOffset() + " len=" + length + " vs " + token.getSurfaceForm().length);
    termAtt.CopyBuffer(token.SurfaceForm, token.Offset, length);
    offsetAtt.SetOffset(CorrectOffset(position), CorrectOffset(position + length));
    basicFormAtt.SetToken(token);
    posAtt.SetToken(token);
    readingAtt.SetToken(token);
    inflectionAtt.SetToken(token);
    if (token.Position == lastTokenPos)
    {
        // Same start position as the previously returned token: stack it
        // (increment 0) and report the token's own position length.
        posIncAtt.PositionIncrement = 0;
        posLengthAtt.PositionLength = token.PositionLength;
    }
    else
    {
        Debug.Assert(token.Position > lastTokenPos);
        posIncAtt.PositionIncrement = 1;
        posLengthAtt.PositionLength = 1;
    }
    if (VERBOSE)
    {
        Console.WriteLine(Thread.CurrentThread.Name + ": incToken: return token=" + token);
    }
    lastTokenPos = token.Position;
    return true;
}
/// <summary>
/// Incrementally parse some more characters. This runs
/// the viterbi search forwards "enough" so that we
/// generate some more tokens. How much forward depends on
/// the chars coming in, since some chars could cause
/// longer-lasting ambiguity in the parsing. Once the
/// ambiguity is resolved, then we back trace, produce
/// the pending tokens, and return.
/// </summary>
/// <remarks>
/// Returns early as soon as a backtrace has left at least one token in
/// <c>pending</c>. If the loop instead runs until the buffer reports end
/// of input (-1), <c>end</c> is set and a final backtrace is performed
/// from the least-cost node at the last position (when <c>pos &gt; 0</c>).
/// </remarks>
private void Parse()
{
    if (VERBOSE)
    {
        Console.WriteLine("\nPARSE");
    }

    // Advances over each position (character):
    while (true)
    {

        if (buffer.Get(pos) == -1)
        {
            // End
            break;
        }

        Position posData = positions.Get(pos);
        // True when no position beyond pos has been created yet, i.e. no
        // arc currently reaches past this position.
        bool isFrontier = positions.GetNextPos() == pos + 1;

        if (posData.count == 0)
        {
            // No arcs arrive here; move to next position:
            if (VERBOSE)
            {
                Console.WriteLine(" no arcs in; skip pos=" + pos);
            }
            pos++;
            continue;
        }

        if (pos > lastBackTracePos && posData.count == 1 && isFrontier)
        {
            // if (pos > lastBackTracePos && posData.count == 1 && isFrontier) {
            // We are at a "frontier", and only one node is
            // alive, so whatever the eventual best path is must
            // come through this node. So we can safely commit
            // to the prefix of the best path at this point:
            Backtrace(posData, 0);

            // Re-base cost so we don't risk int overflow:
            posData.costs[0] = 0;

            if (pending.Count != 0)
            {
                return;
            }
            else
            {
                // This means the backtrace only produced
                // punctuation tokens, so we must keep parsing.
            }
        }

        if (pos - lastBackTracePos >= MAX_BACKTRACE_GAP)
        {
            // Safety: if we've buffered too much, force a
            // backtrace now. We find the least-cost partial
            // path, across all paths, backtrace from it, and
            // then prune all others. Note that this, in
            // general, can produce the wrong result, if the
            // total best path did not in fact back trace
            // through this partial best path. But it's the
            // best we can do... (short of not having a
            // safety!).

            // First pass: find least cost partial path so far,
            // including ending at future positions:
            int leastIDX = -1;
            int leastCost = int.MaxValue;
            Position leastPosData = null;
            for (int pos2 = pos; pos2 < positions.GetNextPos(); pos2++)
            {
                Position posData2 = positions.Get(pos2);
                for (int idx = 0; idx < posData2.count; idx++)
                {
                    //System.out.println(" idx=" + idx + " cost=" + cost);
                    int cost = posData2.costs[idx];
                    if (cost < leastCost)
                    {
                        leastCost = cost;
                        leastIDX = idx;
                        leastPosData = posData2;
                    }
                }
            }

            // We will always have at least one live path:
            Debug.Assert(leastIDX != -1);

            // Second pass: prune all but the best path:
            for (int pos2 = pos; pos2 < positions.GetNextPos(); pos2++)
            {
                Position posData2 = positions.Get(pos2);
                if (posData2 != leastPosData)
                {
                    posData2.Reset();
                }
                else
                {
                    // Move the winning entry into slot 0 so it becomes
                    // the only surviving arc at this position.
                    if (leastIDX != 0)
                    {
                        posData2.costs[0] = posData2.costs[leastIDX];
                        posData2.lastRightID[0] = posData2.lastRightID[leastIDX];
                        posData2.backPos[0] = posData2.backPos[leastIDX];
                        posData2.backIndex[0] = posData2.backIndex[leastIDX];
                        posData2.backID[0] = posData2.backID[leastIDX];
                        posData2.backType[0] = posData2.backType[leastIDX];
                    }
                    posData2.count = 1;
                }
            }

            Backtrace(leastPosData, 0);

            // Re-base cost so we don't risk int overflow:
            Arrays.Fill(leastPosData.costs, 0, leastPosData.count, 0);

            if (pos != leastPosData.pos)
            {
                // We jumped into a future position:
                Debug.Assert(pos < leastPosData.pos);
                pos = leastPosData.pos;
            }

            if (pending.Count != 0)
            {
                return;
            }
            else
            {
                // This means the backtrace only produced
                // punctuation tokens, so we must keep parsing.
                continue;
            }
        }

        if (VERBOSE)
        {
            Console.WriteLine("\n extend @ pos=" + pos + " char=" + (char)buffer.Get(pos));
        }

        if (VERBOSE)
        {
            Console.WriteLine(" " + posData.count + " arcs in");
        }

        bool anyMatches = false;

        // First try user dict:
        if (userFST != null)
        {
            userFST.GetFirstArc(arc);
            // Running sum of arc outputs along the path walked so far.
            int output = 0;
            for (int posAhead = posData.pos; ; posAhead++)
            {
                int ch = buffer.Get(posAhead);
                if (ch == -1)
                {
                    break;
                }
                if (userFST.FindTargetArc(ch, arc, arc, posAhead == posData.pos, userFSTReader) == null)
                {
                    break;
                }
                output += (int)arc.Output;
                if (arc.IsFinal)
                {
                    if (VERBOSE)
                    {
                        Console.WriteLine(" USER word " + new string(buffer.Get(pos, posAhead - pos + 1)) + " toPos=" + (posAhead + 1));
                    }
                    Add(userDictionary, posData, posAhead + 1, output + (int)arc.NextFinalOutput, JapaneseTokenizerType.USER, false);
                    anyMatches = true;
                }
            }
        }

        // TODO: we can be more aggressive about user
        // matches? if we are "under" a user match then don't
        // extend KNOWN/UNKNOWN paths?

        // The system dictionary is consulted only when the user
        // dictionary produced no match at this position.
        if (!anyMatches)
        {
            // Next, try known dictionary matches
            fst.GetFirstArc(arc);
            int output = 0;

            for (int posAhead = posData.pos; ; posAhead++)
            {
                int ch = buffer.Get(posAhead);
                if (ch == -1)
                {
                    break;
                }
                //System.out.println(" match " + (char) ch + " posAhead=" + posAhead);

                if (fst.FindTargetArc(ch, arc, arc, posAhead == posData.pos, fstReader) == null)
                {
                    break;
                }

                output += (int)arc.Output;

                // Optimization: for known words that are too-long
                // (compound), we should pre-compute the 2nd
                // best segmentation and store it in the
                // dictionary instead of recomputing it each time a
                // match is found.

                if (arc.IsFinal)
                {
                    dictionary.LookupWordIds(output + (int)arc.NextFinalOutput, wordIdRef);
                    if (VERBOSE)
                    {
                        Console.WriteLine(" KNOWN word " + new string(buffer.Get(pos, posAhead - pos + 1)) + " toPos=" + (posAhead + 1) + " " + wordIdRef.Length + " wordIDs");
                    }
                    // One lattice arc is added per word id returned for this surface form.
                    for (int ofs = 0; ofs < wordIdRef.Length; ofs++)
                    {
                        Add(dictionary, posData, posAhead + 1, wordIdRef.Int32s[wordIdRef.Offset + ofs], JapaneseTokenizerType.KNOWN, false);
                        anyMatches = true;
                    }
                }
            }
        }

        // In the case of normal mode, it doesn't process unknown word greedily.

        if (!searchMode && unknownWordEndIndex > posData.pos)
        {
            pos++;
            continue;
        }

        char firstCharacter = (char)buffer.Get(pos);
        // Unknown-word handling runs when nothing matched, or when the
        // character definition reports IsInvoke for the first character.
        if (!anyMatches || characterDefinition.IsInvoke(firstCharacter))
        {

            // Find unknown match:
            int characterId = characterDefinition.GetCharacterClass(firstCharacter);
            bool isPunct = IsPunctuation(firstCharacter);

            // NOTE: copied from UnknownDictionary.lookup:
            int unknownWordLength;
            if (!characterDefinition.IsGroup(firstCharacter))
            {
                unknownWordLength = 1;
            }
            else
            {
                // Extract unknown word. Characters with the same character class are considered to be part of unknown word
                // (and must also agree on being punctuation or not); the run is capped at MAX_UNKNOWN_WORD_LENGTH.
                unknownWordLength = 1;
                for (int posAhead = pos + 1; unknownWordLength < MAX_UNKNOWN_WORD_LENGTH; posAhead++)
                {
                    int ch = buffer.Get(posAhead);
                    if (ch == -1)
                    {
                        break;
                    }
                    if (characterId == characterDefinition.GetCharacterClass((char)ch) &&
                        IsPunctuation((char)ch) == isPunct)
                    {
                        unknownWordLength++;
                    }
                    else
                    {
                        break;
                    }
                }
            }

            unkDictionary.LookupWordIds(characterId, wordIdRef); // characters in input text are supposed to be the same
            if (VERBOSE)
            {
                Console.WriteLine(" UNKNOWN word len=" + unknownWordLength + " " + wordIdRef.Length + " wordIDs");
            }
            for (int ofs = 0; ofs < wordIdRef.Length; ofs++)
            {
                Add(unkDictionary, posData, posData.pos + unknownWordLength, wordIdRef.Int32s[wordIdRef.Offset + ofs], JapaneseTokenizerType.UNKNOWN, false);
            }

            // Remembered so that non-search mode can skip positions that
            // lie before the end of this unknown word (see check above).
            unknownWordEndIndex = posData.pos + unknownWordLength;
        }

        pos++;
    }

    end = true;

    if (pos > 0)
    {

        Position endPosData = positions.Get(pos);
        int leastCost = int.MaxValue;
        int leastIDX = -1;
        if (VERBOSE)
        {
            Console.WriteLine(" end: " + endPosData.count + " nodes");
        }
        for (int idx = 0; idx < endPosData.count; idx++)
        {
            // Add EOS cost:
            int cost = endPosData.costs[idx] + costs.Get(endPosData.lastRightID[idx], 0);
            //System.out.println(" idx=" + idx + " cost=" + cost + " (pathCost=" + endPosData.costs[idx] + " bgCost=" + costs.get(endPosData.lastRightID[idx], 0) + ") backPos=" + endPosData.backPos[idx]);
            if (cost < leastCost)
            {
                leastCost = cost;
                leastIDX = idx;
            }
        }

        Backtrace(endPosData, leastIDX);
    }
    else
    {
        // No characters in the input string; return no tokens!
    }
}
// Eliminates arcs from the lattice that are compound
// tokens (have a penalty) or are not congruent with the
// compound token we've matched (ie, span across the
// startPos). This should be fairly efficient, because we
// just keep the already intersected structure of the
// graph, eg we don't have to consult the FSTs again.
//
// startPos/endPos delimit the span being re-scored; bestStartIDX is the
// index, within the position data at startPos, of the path that the
// re-scored arcs must continue from.
//
// Note: the loop variables named "pos" below are locals that hide the
// "pos" field for the duration of each loop.

private void PruneAndRescore(int startPos, int endPos, int bestStartIDX)
{
    if (VERBOSE)
    {
        Console.WriteLine(" pruneAndRescore startPos=" + startPos + " endPos=" + endPos + " bestStartIDX=" + bestStartIDX);
    }

    // First pass: walk backwards, building up the forward
    // arcs and pruning inadmissible arcs:
    for (int pos = endPos; pos > startPos; pos--)
    {
        Position posData = positions.Get(pos);
        if (VERBOSE)
        {
            Console.WriteLine(" back pos=" + pos);
        }
        for (int arcIDX = 0; arcIDX < posData.count; arcIDX++)
        {
            int backPos = posData.backPos[arcIDX];
            if (backPos >= startPos)
            {
                // Keep this arc:
                //System.out.println(" keep backPos=" + backPos);
                positions.Get(backPos).AddForward(pos,
                    arcIDX,
                    posData.backID[arcIDX],
                    posData.backType[arcIDX]);
            }
            else
            {
                // The arc starts before startPos, so it is simply not
                // re-registered as a forward arc.
                if (VERBOSE)
                {
                    Console.WriteLine(" prune");
                }
            }
        }
        // Clear the arriving arcs; they are re-added by the second pass.
        // (Inside this loop pos is always greater than startPos.)
        if (pos != startPos)
        {
            posData.count = 0;
        }
    }

    // Second pass: walk forward, re-scoring:
    for (int pos = startPos; pos < endPos; pos++)
    {
        Position posData = positions.Get(pos);
        if (VERBOSE)
        {
            Console.WriteLine(" forward pos=" + pos + " count=" + posData.forwardCount);
        }
        if (posData.count == 0)
        {
            // No arcs arrive here...
            if (VERBOSE)
            {
                Console.WriteLine(" skip");
            }
            posData.forwardCount = 0;
            continue;
        }

        if (pos == startPos)
        {
            // On the initial position, only consider the best
            // path so we "force congruence": the
            // sub-segmentation is "in context" of what the best
            // path (compound token) had matched:
            int rightID;
            if (startPos == 0)
            {
                rightID = 0;
            }
            else
            {
                rightID = GetDict(posData.backType[bestStartIDX]).GetRightId(posData.backID[bestStartIDX]);
            }
            int pathCost = posData.costs[bestStartIDX];
            for (int forwardArcIDX = 0; forwardArcIDX < posData.forwardCount; forwardArcIDX++)
            {
                JapaneseTokenizerType forwardType = posData.forwardType[forwardArcIDX];
                IDictionary dict2 = GetDict(forwardType);
                int wordID = posData.forwardID[forwardArcIDX];
                int toPos = posData.forwardPos[forwardArcIDX];
                // New cost = path cost + word cost + connection cost + penalty for the span.
                int newCost = pathCost + dict2.GetWordCost(wordID) +
                    costs.Get(rightID, dict2.GetLeftId(wordID)) +
                    ComputePenalty(pos, toPos - pos);
                if (VERBOSE)
                {
                    Console.WriteLine(" + " + forwardType + " word " + new string(buffer.Get(pos, toPos - pos)) + " toPos=" + toPos + " cost=" + newCost + " penalty=" + ComputePenalty(pos, toPos - pos) + " toPos.idx=" + positions.Get(toPos).count);
                }
                positions.Get(toPos).Add(newCost,
                    dict2.GetRightId(wordID),
                    pos,
                    bestStartIDX,
                    wordID,
                    forwardType);
            }
        }
        else
        {
            // On non-initial positions, we maximize score
            // across all arriving lastRightIDs:
            for (int forwardArcIDX = 0; forwardArcIDX < posData.forwardCount; forwardArcIDX++)
            {
                JapaneseTokenizerType forwardType = posData.forwardType[forwardArcIDX];
                int toPos = posData.forwardPos[forwardArcIDX];
                if (VERBOSE)
                {
                    Console.WriteLine(" + " + forwardType + " word " + new string(buffer.Get(pos, toPos - pos)) + " toPos=" + toPos);
                }
                // addPenalty is true here, so Add() applies the compound penalty.
                Add(GetDict(forwardType),
                    posData,
                    toPos,
                    posData.forwardID[forwardArcIDX],
                    forwardType,
                    true);
            }
        }
        // Forward arcs are scratch data for this call only; clear them.
        posData.forwardCount = 0;
    }
}
// Backtrace from the provided position, back to the last
// time we back-traced, accumulating the resulting tokens to
// the pending list. The pending list is then in-reverse
// (last token should be returned first).
//
// endPosData is the position to start from and fromIDX the index of the
// arriving arc to follow. On return lastBackTracePos equals endPosData.pos
// and the circular buffers have been told that earlier positions are free.
private void Backtrace(Position endPosData, int fromIDX)
{
    int endPos = endPosData.pos;

    if (VERBOSE)
    {
        Console.WriteLine("\n backtrace: endPos=" + endPos + " pos=" + this.pos + "; " + (this.pos - lastBackTracePos) + " characters; last=" + lastBackTracePos + " cost=" + endPosData.costs[fromIDX]);
    }

    // Characters covered by this backtrace; token offsets below are
    // relative to the start of this array.
    char[] fragment = buffer.Get(lastBackTracePos, endPos - lastBackTracePos);

    if (dotOut != null)
    {
        dotOut.OnBacktrace(this, positions, lastBackTracePos, endPosData, fromIDX, fragment, end);
    }

    // Local cursor; hides the "pos" field for the rest of this method.
    int pos = endPos;
    int bestIDX = fromIDX;
    Token altToken = null;

    // We trace backwards, so this will be the leftWordID of
    // the token after the one we are now on:
    int lastLeftWordID = -1;

    // Number of tokens added since the current altToken was created
    // (reset to 0 when an alternate path is taken).
    int backCount = 0;

    // TODO: sort of silly to make Token instances here; the
    // back trace has all info needed to generate the
    // token. So, we could just directly set the attrs,
    // from the backtrace, in incrementToken w/o ever
    // creating Token; we'd have to defer calling freeBefore
    // until after the backtrace was fully "consumed" by
    // incrementToken.

    while (pos > lastBackTracePos)
    {
        //System.out.println("BT: back pos=" + pos + " bestIDX=" + bestIDX);
        Position posData = positions.Get(pos);
        Debug.Assert(bestIDX < posData.count);

        int backPos = posData.backPos[bestIDX];
        Debug.Assert(backPos >= lastBackTracePos, "backPos=" + backPos + " vs lastBackTracePos=" + lastBackTracePos);
        int length = pos - backPos;
        JapaneseTokenizerType backType = posData.backType[bestIDX];
        int backID = posData.backID[bestIDX];
        int nextBestIDX = posData.backIndex[bestIDX];

        if (outputCompounds && searchMode && altToken == null && backType != JapaneseTokenizerType.USER)
        {

            // In searchMode, if best path had picked a too-long
            // token, we use the "penalty" to compute the allowed
            // max cost of an alternate back-trace. If we find an
            // alternate back trace with cost below that
            // threshold, we pursue it instead (but also output
            // the long token).
            //System.out.println(" 2nd best backPos=" + backPos + " pos=" + pos);

            int penalty = ComputeSecondBestThreshold(backPos, pos - backPos);

            if (penalty > 0)
            {
                if (VERBOSE)
                {
                    Console.WriteLine(" compound=" + new string(buffer.Get(backPos, pos - backPos)) + " backPos=" + backPos + " pos=" + pos + " penalty=" + penalty + " cost=" + posData.costs[bestIDX] + " bestIDX=" + bestIDX + " lastLeftID=" + lastLeftWordID);
                }

                // Use the penalty to set maxCost on the 2nd best
                // segmentation:
                int maxCost = posData.costs[bestIDX] + penalty;
                if (lastLeftWordID != -1)
                {
                    maxCost += costs.Get(GetDict(backType).GetRightId(backID), lastLeftWordID);
                }

                // Now, prune all too-long tokens from the graph:
                PruneAndRescore(backPos, pos,
                    posData.backIndex[bestIDX]);

                // Finally, find 2nd best back-trace and resume
                // backtrace there:
                int leastCost = int.MaxValue;
                int leastIDX = -1;
                for (int idx = 0; idx < posData.count; idx++)
                {
                    int cost = posData.costs[idx];
                    //System.out.println(" idx=" + idx + " prevCost=" + cost);

                    if (lastLeftWordID != -1)
                    {
                        cost += costs.Get(GetDict(posData.backType[idx]).GetRightId(posData.backID[idx]),
                            lastLeftWordID);
                        //System.out.println(" += bgCost=" + costs.get(getDict(posData.backType[idx]).getRightId(posData.backID[idx]),
                        //lastLeftWordID) + " -> " + cost);
                    }
                    //System.out.println("penalty " + posData.backPos[idx] + " to " + pos);
                    //cost += computePenalty(posData.backPos[idx], pos - posData.backPos[idx]);
                    if (cost < leastCost)
                    {
                        //System.out.println(" ** ");
                        leastCost = cost;
                        leastIDX = idx;
                    }
                }
                //System.out.println(" leastIDX=" + leastIDX);

                if (VERBOSE)
                {
                    Console.WriteLine(" afterPrune: " + posData.count + " arcs arriving; leastCost=" + leastCost + " vs threshold=" + maxCost + " lastLeftWordID=" + lastLeftWordID);
                }

                if (leastIDX != -1 && leastCost <= maxCost && posData.backPos[leastIDX] != backPos)
                {
                    // We should have pruned the altToken from the graph:
                    Debug.Assert(posData.backPos[leastIDX] != backPos);

                    // Save the current compound token, to output when
                    // this alternate path joins back:
                    altToken = new Token(backID,
                        fragment,
                        backPos - lastBackTracePos,
                        length,
                        backType,
                        backPos,
                        GetDict(backType));

                    // Redirect our backtrace to 2nd best:
                    bestIDX = leastIDX;
                    nextBestIDX = posData.backIndex[bestIDX];

                    backPos = posData.backPos[bestIDX];
                    length = pos - backPos;
                    backType = posData.backType[bestIDX];
                    backID = posData.backID[bestIDX];
                    backCount = 0;
                    //System.out.println(" do alt token!");

                }
                else
                {
                    // I think in theory it's possible there is no
                    // 2nd best path, which is fine; in this case we
                    // only output the compound token:
                    //System.out.println(" no alt token! bestIDX=" + bestIDX);
                }
            }
        }

        // Start of the current token within "fragment".
        int offset = backPos - lastBackTracePos;
        Debug.Assert(offset >= 0);

        if (altToken != null && altToken.Position >= backPos)
        {

            // We've backtraced to the position where the
            // compound token starts; add it now:

            // The pruning we did when we created the altToken
            // ensures that the back trace will align back with
            // the start of the altToken:
            Debug.Assert(altToken.Position == backPos, altToken.Position + " vs " + backPos);

            // NOTE: not quite right: the compound token may
            // have had all punctuation back traced so far, but
            // then the decompounded token at this position is
            // not punctuation. In this case backCount is 0,
            // but we should maybe add the altToken anyway...?

            if (backCount > 0)
            {
                // Count the token about to be added at this position as well.
                backCount++;
                altToken.PositionLength = backCount;
                if (VERBOSE)
                {
                    Console.WriteLine(" add altToken=" + altToken);
                }
                pending.Add(altToken);
            }
            else
            {
                // This means alt token was all punct tokens:
                if (VERBOSE)
                {
                    Console.WriteLine(" discard all-punctuation altToken=" + altToken);
                }
                Debug.Assert(discardPunctuation);
            }
            altToken = null;
        }

        IDictionary dict = GetDict(backType);

        if (backType == JapaneseTokenizerType.USER)
        {

            // Expand the phraseID we recorded into the actual
            // segmentation:
            // Element 0 is the first word id; the remaining elements are
            // the lengths of the consecutive segments.
            int[] wordIDAndLength = userDictionary.LookupSegmentation(backID);
            int wordID = wordIDAndLength[0];
            int current = 0;
            for (int j = 1; j < wordIDAndLength.Length; j++)
            {
                int len = wordIDAndLength[j];
                //System.out.println(" add user: len=" + len);
                pending.Add(new Token(wordID + j - 1,
                    fragment,
                    current + offset,
                    len,
                    JapaneseTokenizerType.USER,
                    current + backPos,
                    dict));
                if (VERBOSE)
                {
                    Console.WriteLine(" add USER token=" + pending[pending.Count - 1]);
                }
                current += len;
            }

            // Reverse the tokens we just added, because when we
            // serve them up from incrementToken we serve in
            // reverse:
            // NOTE(review): this relies on SubList returning a live view of
            // "pending" rather than a copy - confirm against the helper.
            Collections.Reverse(pending.SubList(pending.Count - (wordIDAndLength.Length - 1),
                pending.Count));

            backCount += wordIDAndLength.Length - 1;
        }
        else
        {

            if (extendedMode && backType == JapaneseTokenizerType.UNKNOWN)
            {
                // In EXTENDED mode we convert unknown word into
                // unigrams:
                int unigramTokenCount = 0;
                for (int i = length - 1; i >= 0; i--)
                {
                    int charLen = 1;
                    // Keep a surrogate pair together: when standing on a low
                    // surrogate, step back one char and emit both as one token.
                    if (i > 0 && char.IsLowSurrogate(fragment[offset + i]))
                    {
                        i--;
                        charLen = 2;
                    }
                    //System.out.println(" extended tok offset="
                    //+ (offset + i));
                    if (!discardPunctuation || !IsPunctuation(fragment[offset + i]))
                    {
                        pending.Add(new Token(CharacterDefinition.NGRAM,
                            fragment,
                            offset + i,
                            charLen,
                            JapaneseTokenizerType.UNKNOWN,
                            backPos + i,
                            unkDictionary));
                        unigramTokenCount++;
                    }
                }
                backCount += unigramTokenCount;

            }
            else if (!discardPunctuation || length == 0 || !IsPunctuation(fragment[offset]))
            {
                pending.Add(new Token(backID,
                    fragment,
                    offset,
                    length,
                    backType,
                    backPos,
                    dict));
                if (VERBOSE)
                {
                    Console.WriteLine(" add token=" + pending[pending.Count - 1]);
                }
                backCount++;
            }
            else
            {
                // Punctuation is being discarded and the token's first
                // character is punctuation: emit nothing.
                if (VERBOSE)
                {
                    Console.WriteLine(" skip punctuation token=" + new string(fragment, offset, length));
                }
            }
        }

        lastLeftWordID = dict.GetLeftId(backID);
        pos = backPos;
        bestIDX = nextBestIDX;
    }

    lastBackTracePos = endPos;

    if (VERBOSE)
    {
        Console.WriteLine(" freeBefore pos=" + endPos);
    }
    // Notify the circular buffers that we are done with
    // these positions:
    buffer.FreeBefore(endPos);
    positions.FreeBefore(endPos);
}
Collections.Reverse(pending.SubList(pending.Count - (wordIDAndLength.Length - 1), + pending.Count)); + + backCount += wordIDAndLength.Length - 1; + } + else + { + + if (extendedMode && backType == JapaneseTokenizerType.UNKNOWN) + { + // In EXTENDED mode we convert unknown word into + // unigrams: + int unigramTokenCount = 0; + for (int i = length - 1; i >= 0; i--) + { + int charLen = 1; + if (i > 0 && char.IsLowSurrogate(fragment[offset + i])) + { + i--; + charLen = 2; + } + //System.out.println(" extended tok offset=" + //+ (offset + i)); + if (!discardPunctuation || !IsPunctuation(fragment[offset + i])) + { + pending.Add(new Token(CharacterDefinition.NGRAM, + fragment, + offset + i, + charLen, + JapaneseTokenizerType.UNKNOWN, + backPos + i, + unkDictionary)); + unigramTokenCount++; + } + } + backCount += unigramTokenCount; + + } + else if (!discardPunctuation || length == 0 || !IsPunctuation(fragment[offset])) + { + pending.Add(new Token(backID, + fragment, + offset, + length, + backType, + backPos, + dict)); + if (VERBOSE) + { + Console.WriteLine(" add token=" + pending[pending.Count - 1]); + } + backCount++; + } + else + { + if (VERBOSE) + { + Console.WriteLine(" skip punctuation token=" + new string(fragment, offset, length)); + } + } + } + + lastLeftWordID = dict.GetLeftId(backID); + pos = backPos; + bestIDX = nextBestIDX; + } + + lastBackTracePos = endPos; + + if (VERBOSE) + { + Console.WriteLine(" freeBefore pos=" + endPos); + } + // Notify the circular buffers that we are done with + // these positions: + buffer.FreeBefore(endPos); + positions.FreeBefore(endPos); + } + + internal IDictionary GetDict(JapaneseTokenizerType type) + { + IDictionary result; + dictionaryMap.TryGetValue(type, out result); + return result; + } + + private static bool IsPunctuation(char ch) + { + switch (Character.GetType(ch)) + { + case UnicodeCategory.SpaceSeparator: + case UnicodeCategory.LineSeparator: + case UnicodeCategory.ParagraphSeparator: + case UnicodeCategory.Control: + 
case UnicodeCategory.Format: + case UnicodeCategory.DashPunctuation: + case UnicodeCategory.OpenPunctuation: + case UnicodeCategory.ClosePunctuation: + case UnicodeCategory.ConnectorPunctuation: + case UnicodeCategory.OtherPunctuation: + case UnicodeCategory.MathSymbol: + case UnicodeCategory.CurrencySymbol: + case UnicodeCategory.ModifierSymbol: + case UnicodeCategory.OtherSymbol: + case UnicodeCategory.InitialQuotePunctuation: + case UnicodeCategory.FinalQuotePunctuation: + return true; + default: + return false; + } + } + } + + // LUCENENET specific - de-nested Mode and renamed JapaneseTokenizerMode + + /// <summary> + /// Tokenization mode: this determines how the tokenizer handles + /// compound and unknown words. + /// </summary> + public enum JapaneseTokenizerMode + { + /// <summary> + /// Ordinary segmentation: no decomposition for compounds, + /// </summary> + NORMAL, + + /// <summary> + /// Segmentation geared towards search: this includes a + /// decompounding process for long nouns, also including + /// the full compound token as a synonym. + /// </summary> + SEARCH, + + /// <summary> + /// Extended mode outputs unigrams for unknown words. + /// <para/> + /// @lucene.experimental + /// </summary> + EXTENDED + } + + // LUCENENET specific: de-nested Type and renamed JapaneseTokenizerType + + /// <summary> + /// Token type reflecting the original source of this token + /// </summary> + public enum JapaneseTokenizerType + { + /// <summary> + /// Known words from the system dictionary. + /// </summary> + KNOWN, + /// <summary> + /// Unknown words (heuristically segmented). + /// </summary> + UNKNOWN, + /// <summary> + /// Known words from the user dictionary. + /// </summary> + USER + } + + + // LUCENENET specific - De-nested Position + + // Holds all back pointers arriving to this position: + internal sealed class Position + { + + internal int pos; + + internal int count; + + // maybe single int array * 5? 
+ internal int[] costs = new int[8]; + internal int[] lastRightID = new int[8]; + internal int[] backPos = new int[8]; + internal int[] backIndex = new int[8]; + internal int[] backID = new int[8]; + internal JapaneseTokenizerType[] backType = new JapaneseTokenizerType[8]; + + // Only used when finding 2nd best segmentation under a + // too-long token: + internal int forwardCount; + internal int[] forwardPos = new int[8]; + internal int[] forwardID = new int[8]; + internal int[] forwardIndex = new int[8]; + internal JapaneseTokenizerType[] forwardType = new JapaneseTokenizerType[8]; + + public void Grow() + { + costs = ArrayUtil.Grow(costs, 1 + count); + lastRightID = ArrayUtil.Grow(lastRightID, 1 + count); + backPos = ArrayUtil.Grow(backPos, 1 + count); + backIndex = ArrayUtil.Grow(backIndex, 1 + count); + backID = ArrayUtil.Grow(backID, 1 + count); + + // NOTE: sneaky: grow separately because + // ArrayUtil.grow will otherwise pick a different + // length than the int[]s we just grew: + JapaneseTokenizerType[] newBackType = new JapaneseTokenizerType[backID.Length]; + System.Array.Copy(backType, 0, newBackType, 0, backType.Length); + backType = newBackType; + } + + public void GrowForward() + { + forwardPos = ArrayUtil.Grow(forwardPos, 1 + forwardCount); + forwardID = ArrayUtil.Grow(forwardID, 1 + forwardCount); + forwardIndex = ArrayUtil.Grow(forwardIndex, 1 + forwardCount); + + // NOTE: sneaky: grow separately because + // ArrayUtil.grow will otherwise pick a different + // length than the int[]s we just grew: + JapaneseTokenizerType[] newForwardType = new JapaneseTokenizerType[forwardPos.Length]; + System.Array.Copy(forwardType, 0, newForwardType, 0, forwardType.Length); + forwardType = newForwardType; + } + + public void Add(int cost, int lastRightID, int backPos, int backIndex, int backID, JapaneseTokenizerType backType) + { + // NOTE: this isn't quite a true Viterbi search, + // because we should check if lastRightID is + // already present here, and only 
update if the new + // cost is less than the current cost, instead of + // simply appending. However, that will likely hurt + // performance (usually we add a lastRightID only once), + // and it means we actually create the full graph + // intersection instead of a "normal" Viterbi lattice: + if (count == costs.Length) + { + Grow(); + } + this.costs[count] = cost; + this.lastRightID[count] = lastRightID; + this.backPos[count] = backPos; + this.backIndex[count] = backIndex; + this.backID[count] = backID; + this.backType[count] = backType; + count++; + } + + public void AddForward(int forwardPos, int forwardIndex, int forwardID, JapaneseTokenizerType forwardType) + { + if (forwardCount == this.forwardID.Length) + { + GrowForward(); + } + this.forwardPos[forwardCount] = forwardPos; + this.forwardIndex[forwardCount] = forwardIndex; + this.forwardID[forwardCount] = forwardID; + this.forwardType[forwardCount] = forwardType; + forwardCount++; + } + + public void Reset() + { + count = 0; + // forwardCount naturally resets after it runs: + Debug.Assert(forwardCount == 0, "pos=" + pos + " forwardCount=" + forwardCount); + } + } + + + // LUCENENET specific - de-nested WrappedPositionArray + + // TODO: make generic'd version of this "circular array"? + // It's a bit tricky because we do things to the Position + // (eg, set .pos = N on reuse)... 
/// <summary>
/// Ring buffer of reusable <see cref="Position"/> instances addressed by
/// absolute position. New positions are appended by <see cref="Get(int)"/>
/// and old ones are recycled by <see cref="FreeBefore(int)"/>.
/// </summary>
internal sealed class WrappedPositionArray
{
    // Backing ring; every slot always holds a Position instance.
    private Position[] positions = new Position[8];

    // Slot in the ring that the next newly created position will use.
    private int nextWrite;

    // Absolute position that will be handed out next.
    private int nextPos;

    // Number of live Position instances currently held in the ring.
    private int count;

    public WrappedPositionArray()
    {
        int slot = 0;
        while (slot < positions.Length)
        {
            positions[slot] = new Position();
            slot++;
        }
    }

    /// <summary>
    /// Resets every live <see cref="Position"/> and rewinds the buffer to
    /// absolute position 0.
    /// </summary>
    public void Reset()
    {
        // Walk backwards from the most recently written slot, wrapping
        // around to the end of the ring when we fall off the front.
        nextWrite--;
        for (; count > 0; count--)
        {
            if (nextWrite < 0)
            {
                nextWrite = positions.Length - 1;
            }
            positions[nextWrite--].Reset();
        }

        nextWrite = 0;
        nextPos = 0;
        count = 0;
    }

    /// <summary>
    /// Get <see cref="Position"/> instance for this absolute position;
    /// this is allowed to be arbitrarily far "in the future" but cannot
    /// be before the last <see cref="FreeBefore(int)"/>.
    /// </summary>
    public Position Get(int pos)
    {
        // Materialize every position up to and including the requested one.
        while (nextPos <= pos)
        {
            if (count == positions.Length)
            {
                Expand();
            }
            if (nextWrite == positions.Length)
            {
                nextWrite = 0;
            }

            // The slot must have been reset before it is reused:
            Debug.Assert(positions[nextWrite].count == 0);
            positions[nextWrite++].pos = nextPos++;
            count++;
        }

        Debug.Assert(InBounds(pos));
        int slot = GetIndex(pos);
        Position found = positions[slot];
        Debug.Assert(found.pos == pos);
        return found;
    }

    /// <summary>
    /// Returns the absolute position that the next new slot will receive.
    /// </summary>
    public int GetNextPos()
    {
        return nextPos;
    }

    /// <summary>
    /// Resets every live position that lies before <paramref name="pos"/>
    /// and drops it from the live count.
    /// </summary>
    public void FreeBefore(int pos)
    {
        int toFree = count - (nextPos - pos);
        Debug.Assert(toFree >= 0);
        Debug.Assert(toFree <= count);

        // Oldest live slot sits "count" entries behind the write cursor.
        int slot = nextWrite - count;
        if (slot < 0)
        {
            slot += positions.Length;
        }

        int remaining = toFree;
        while (remaining > 0)
        {
            if (slot == positions.Length)
            {
                slot = 0;
            }
            positions[slot].Reset();
            slot++;
            remaining--;
        }

        count -= toFree;
    }

    // Replaces the full ring with a larger one. The live entries are
    // unrolled so that the oldest lands in slot 0, and the extra slots are
    // filled with fresh Position instances.
    private void Expand()
    {
        int oldLength = positions.Length;
        Position[] grown = new Position[ArrayUtil.Oversize(1 + count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];

        int tailLength = oldLength - nextWrite;
        System.Array.Copy(positions, nextWrite, grown, 0, tailLength);
        System.Array.Copy(positions, 0, grown, tailLength, nextWrite);

        for (int i = oldLength; i < grown.Length; i++)
        {
            grown[i] = new Position();
        }

        nextWrite = oldLength;
        positions = grown;
    }

    // Used only by assertions: true when pos is one of the live positions.
    private bool InBounds(int pos)
    {
        int oldestLive = nextPos - count;
        return oldestLive <= pos && pos < nextPos;
    }

    // Maps an absolute position to its slot in the ring.
    private int GetIndex(int pos)
    {
        int distanceBack = nextPos - pos;
        int slot = nextWrite - distanceBack;
        return slot < 0 ? slot + positions.Length : slot;
    }
}
} // closes namespace Lucene.Net.Analysis.Ja
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizerFactory.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizerFactory.cs b/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizerFactory.cs new file mode 100644 index 0000000..5fe99d5 --- /dev/null +++ b/src/Lucene.Net.Analysis.Kuromoji/JapaneseTokenizerFactory.cs @@ -0,0 +1,100 @@
using Lucene.Net.Analysis.Ja.Dict;
using Lucene.Net.Analysis.Util;
using Lucene.Net.Util;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;

namespace Lucene.Net.Analysis.Ja
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// Factory for <see cref="JapaneseTokenizer"/>.
    /// <code>
    /// &lt;fieldType name="text_ja" class="solr.TextField"&gt;
    ///   &lt;analyzer&gt;
    ///     &lt;tokenizer class="solr.JapaneseTokenizerFactory"
    ///       mode="NORMAL"
    ///       userDictionary="user.txt"
    ///       userDictionaryEncoding="UTF-8"
    ///       discardPunctuation="true"
    ///     /&gt;
    ///     &lt;filter class="solr.JapaneseBaseFormFilterFactory"/&gt;
    ///   &lt;/analyzer&gt;
    /// &lt;/fieldType&gt;
    /// </code>
    /// </summary>
    public class JapaneseTokenizerFactory : TokenizerFactory, IResourceLoaderAware
    {
        private const string MODE = "mode";

        private const string USER_DICT_PATH = "userDictionary";

        private const string USER_DICT_ENCODING = "userDictionaryEncoding";

        private const string DISCARD_PUNCTUATION = "discardPunctuation"; // Expert option

        private UserDictionary userDictionary;

        private readonly JapaneseTokenizerMode mode;
        private readonly bool discardPunctuation;
        private readonly string userDictionaryPath;
        private readonly string userDictionaryEncoding;

        /// <summary>Creates a new <see cref="JapaneseTokenizerFactory"/>.</summary>
        /// <param name="args">
        /// Factory arguments. Recognized keys are <c>mode</c>, <c>userDictionary</c>,
        /// <c>userDictionaryEncoding</c> and <c>discardPunctuation</c>.
        /// </param>
        /// <exception cref="ArgumentException">
        /// <c>mode</c> does not name a <see cref="JapaneseTokenizerMode"/> member
        /// (compared case-insensitively), or <paramref name="args"/> still contains
        /// entries after the recognized keys have been read.
        /// </exception>
        public JapaneseTokenizerFactory(IDictionary<string, string> args)
            : base(args)
        {
            string modeName = Get(args, MODE, JapaneseTokenizer.DEFAULT_MODE.ToString());

            // Enum.TryParse reports failure through its return value and also
            // accepts bare numbers, so check both; otherwise a typo in the
            // configuration would silently select NORMAL.
            if (!Enum.TryParse(modeName, true, out mode)
                || !Enum.IsDefined(typeof(JapaneseTokenizerMode), mode))
            {
                throw new ArgumentException("Unknown " + MODE + " value: " + modeName);
            }

            userDictionaryPath = Get(args, USER_DICT_PATH);
            userDictionaryEncoding = Get(args, USER_DICT_ENCODING);
            discardPunctuation = GetBoolean(args, DISCARD_PUNCTUATION, true);
            if (args.Count > 0)
            {
                throw new ArgumentException("Unknown parameters: " + DescribeArgs(args));
            }
        }

        /// <summary>
        /// Loads the user dictionary named by the <c>userDictionary</c> argument,
        /// or clears it when no path was configured.
        /// </summary>
        /// <param name="loader">Resource loader used to open the dictionary file.</param>
        public virtual void Inform(IResourceLoader loader)
        {
            if (userDictionaryPath == null)
            {
                userDictionary = null;
                return;
            }

            // Resolve the encoding before opening the resource so an invalid
            // encoding name cannot leave an open stream behind.
            string encoding = userDictionaryEncoding ?? Encoding.UTF8.WebName;
            Encoding decoder = Encoding.GetEncoding(encoding);

            // NOTE(review): assumes UserDictionary reads the whole reader inside
            // its constructor and does not keep it - confirm against
            // Dict/UserDictionary.cs.
            using (Stream stream = loader.OpenResource(userDictionaryPath))
            using (TextReader reader = new StreamReader(stream, decoder))
            {
                userDictionary = new UserDictionary(reader);
            }
        }

        /// <summary>
        /// Creates a <see cref="JapaneseTokenizer"/> configured with this factory's
        /// user dictionary, punctuation setting and mode.
        /// </summary>
        public override Tokenizer Create(AttributeSource.AttributeFactory factory, TextReader input)
        {
            return new JapaneseTokenizer(factory, input, userDictionary, discardPunctuation, mode);
        }

        // Renders the leftover arguments as "{key=value, ...}" so the exception
        // message names the offending keys instead of the dictionary's type name.
        private static string DescribeArgs(IDictionary<string, string> args)
        {
            StringBuilder text = new StringBuilder("{");
            foreach (KeyValuePair<string, string> entry in args)
            {
                if (text.Length > 1)
                {
                    text.Append(", ");
                }
                text.Append(entry.Key).Append('=').Append(entry.Value);
            }
            return text.Append('}').ToString();
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Lucene.Net.Analysis.Kuromoji.csproj ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Kuromoji/Lucene.Net.Analysis.Kuromoji.csproj b/src/Lucene.Net.Analysis.Kuromoji/Lucene.Net.Analysis.Kuromoji.csproj new file mode 100644 index 0000000..782aad3 --- /dev/null +++ b/src/Lucene.Net.Analysis.Kuromoji/Lucene.Net.Analysis.Kuromoji.csproj @@ -0,0 +1,118 @@ +<?xml version="1.0" encoding="utf-8"?> +<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" /> + <PropertyGroup> + <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration> + <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform> + <ProjectGuid>{8408625A-2508-46D5-8519-045183C43724}</ProjectGuid> + <OutputType>Library</OutputType> + <AppDesignerFolder>Properties</AppDesignerFolder> + <RootNamespace>Lucene.Net.Analysis.Ja</RootNamespace> + <AssemblyName>Lucene.Net.Analysis.Kuromoji</AssemblyName> + <TargetFrameworkVersion>v4.5.1</TargetFrameworkVersion> + <FileAlignment>512</FileAlignment> + </PropertyGroup> + <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "> + <DebugSymbols>true</DebugSymbols> + <DebugType>full</DebugType> + <Optimize>false</Optimize> + <OutputPath>bin\Debug\</OutputPath> + <DefineConstants>DEBUG;TRACE</DefineConstants> + <ErrorReport>prompt</ErrorReport> + <WarningLevel>4</WarningLevel> + </PropertyGroup> + <PropertyGroup
Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' "> + <DebugType>pdbonly</DebugType> + <Optimize>true</Optimize> + <OutputPath>bin\Release\</OutputPath> + <DefineConstants>TRACE</DefineConstants> + <ErrorReport>prompt</ErrorReport> + <WarningLevel>4</WarningLevel> + </PropertyGroup> + <ItemGroup> + <Reference Include="System" /> + <Reference Include="System.Core" /> + <Reference Include="Microsoft.CSharp" /> + </ItemGroup> + <ItemGroup> + <Compile Include="Dict\BinaryDictionary.cs" /> + <Compile Include="Dict\CharacterDefinition.cs" /> + <Compile Include="Dict\ConnectionCosts.cs" /> + <Compile Include="Dict\Dictionary.cs" /> + <Compile Include="Dict\TokenInfoDictionary.cs" /> + <Compile Include="Dict\TokenInfoFST.cs" /> + <Compile Include="Dict\UnknownDictionary.cs" /> + <Compile Include="Dict\UserDictionary.cs" /> + <Compile Include="GraphvizFormatter.cs" /> + <Compile Include="JapaneseAnalyzer.cs" /> + <Compile Include="JapaneseBaseFormFilter.cs" /> + <Compile Include="JapaneseBaseFormFilterFactory.cs" /> + <Compile Include="JapaneseIterationMarkCharFilter.cs" /> + <Compile Include="JapaneseIterationMarkCharFilterFactory.cs" /> + <Compile Include="JapaneseKatakanaStemFilter.cs" /> + <Compile Include="JapaneseKatakanaStemFilterFactory.cs" /> + <Compile Include="JapanesePartOfSpeechStopFilter.cs" /> + <Compile Include="JapanesePartOfSpeechStopFilterFactory.cs" /> + <Compile Include="JapaneseReadingFormFilter.cs" /> + <Compile Include="JapaneseReadingFormFilterFactory.cs" /> + <Compile Include="JapaneseTokenizer.cs" /> + <Compile Include="JapaneseTokenizerFactory.cs" /> + <Compile Include="Properties\AssemblyInfo.cs" /> + <Compile Include="Token.cs" /> + <Compile Include="TokenAttributes\BaseFormAttribute.cs" /> + <Compile Include="TokenAttributes\BaseFormAttributeImpl.cs" /> + <Compile Include="TokenAttributes\InflectionAttribute.cs" /> + <Compile Include="TokenAttributes\InflectionAttributeImpl.cs" /> + <Compile 
Include="TokenAttributes\PartOfSpeechAttribute.cs" /> + <Compile Include="TokenAttributes\PartOfSpeechAttributeImpl.cs" /> + <Compile Include="TokenAttributes\ReadingAttribute.cs" /> + <Compile Include="TokenAttributes\ReadingAttributeImpl.cs" /> + <Compile Include="Tools\BinaryDictionaryWriter.cs" /> + <Compile Include="Tools\CharacterDefinitionWriter.cs" /> + <Compile Include="Tools\ConnectionCostsBuilder.cs" /> + <Compile Include="Tools\ConnectionCostsWriter.cs" /> + <Compile Include="Tools\DictionaryBuilder.cs" /> + <Compile Include="Tools\TokenInfoDictionaryBuilder.cs" /> + <Compile Include="Tools\TokenInfoDictionaryWriter.cs" /> + <Compile Include="Tools\UnknownDictionaryBuilder.cs" /> + <Compile Include="Tools\UnknownDictionaryWriter.cs" /> + <Compile Include="Util\CSVUtil.cs" /> + <Compile Include="Util\ToStringUtil.cs" /> + <Compile Include="..\CommonAssemblyInfo.cs"> + <Link>Properties\CommonAssemblyInfo.cs</Link> + </Compile> + </ItemGroup> + <ItemGroup> + <ProjectReference Include="..\Lucene.Net.Analysis.Common\Lucene.Net.Analysis.Common.csproj"> + <Project>{4ADD0BBC-B900-4715-9526-D871DE8EEA64}</Project> + <Name>Lucene.Net.Analysis.Common</Name> + </ProjectReference> + <ProjectReference Include="..\Lucene.Net\Lucene.Net.csproj"> + <Project>{5D4AD9BE-1FFB-41AB-9943-25737971BF57}</Project> + <Name>Lucene.Net</Name> + </ProjectReference> + </ItemGroup> + <ItemGroup> + <EmbeddedResource Include="Dict\CharacterDefinition.dat" /> + <EmbeddedResource Include="Dict\ConnectionCosts.dat" /> + <EmbeddedResource Include="Dict\TokenInfoDictionary%24buffer.dat" /> + <EmbeddedResource Include="Dict\TokenInfoDictionary%24fst.dat" /> + <EmbeddedResource Include="Dict\TokenInfoDictionary%24posDict.dat" /> + <EmbeddedResource Include="Dict\TokenInfoDictionary%24targetMap.dat" /> + <EmbeddedResource Include="Dict\UnknownDictionary%24buffer.dat" /> + <EmbeddedResource Include="Dict\UnknownDictionary%24posDict.dat" /> + <EmbeddedResource 
Include="Dict\UnknownDictionary%24targetMap.dat" /> + </ItemGroup> + <ItemGroup> + <EmbeddedResource Include="stoptags.txt" /> + <EmbeddedResource Include="stopwords.txt" /> + </ItemGroup> + <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" /> + <!-- To modify your build process, add your task inside one of the targets below and uncomment it. + Other similar extension points exist, see Microsoft.Common.targets. + <Target Name="BeforeBuild"> + </Target> + <Target Name="AfterBuild"> + </Target> + --> +</Project> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Lucene.Net.Analysis.Kuromoji.project.json ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Kuromoji/Lucene.Net.Analysis.Kuromoji.project.json b/src/Lucene.Net.Analysis.Kuromoji/Lucene.Net.Analysis.Kuromoji.project.json new file mode 100644 index 0000000..86d1c12 --- /dev/null +++ b/src/Lucene.Net.Analysis.Kuromoji/Lucene.Net.Analysis.Kuromoji.project.json @@ -0,0 +1,8 @@ +{ + "runtimes": { + "win": {} + }, + "frameworks": { + "net451": {} + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Lucene.Net.Analysis.Kuromoji.xproj ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Kuromoji/Lucene.Net.Analysis.Kuromoji.xproj b/src/Lucene.Net.Analysis.Kuromoji/Lucene.Net.Analysis.Kuromoji.xproj new file mode 100644 index 0000000..59a3016 --- /dev/null +++ b/src/Lucene.Net.Analysis.Kuromoji/Lucene.Net.Analysis.Kuromoji.xproj @@ -0,0 +1,38 @@ +<?xml version="1.0" encoding="utf-8"?> +<!-- + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. 
The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +--> +<Project ToolsVersion="14.0.25420" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> + <PropertyGroup> + <VisualStudioVersion Condition="'$(VisualStudioVersion)' == ''">14.0.25420</VisualStudioVersion> + <VSToolsPath Condition="'$(VSToolsPath)' == ''">$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)</VSToolsPath> + </PropertyGroup> + <Import Project="$(VSToolsPath)\DotNet\Microsoft.DotNet.Props" Condition="'$(VSToolsPath)' != ''" /> + <PropertyGroup Label="Globals"> + <ProjectGuid>87e54ca7-7394-4705-a99a-0dd638265c56</ProjectGuid> + <RootNamespace>Lucene.Net.Analysis.Ja</RootNamespace> + <BaseIntermediateOutputPath Condition="'$(BaseIntermediateOutputPath)'=='' ">.\obj</BaseIntermediateOutputPath> + <OutputPath Condition="'$(OutputPath)'=='' ">.\bin\</OutputPath> + </PropertyGroup> + <PropertyGroup> + <SchemaVersion>2.0</SchemaVersion> + </PropertyGroup> + <Import Project="$(VSToolsPath)\DotNet\Microsoft.DotNet.targets" Condition="'$(VSToolsPath)' != ''" /> +</Project> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Properties/AssemblyInfo.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Kuromoji/Properties/AssemblyInfo.cs b/src/Lucene.Net.Analysis.Kuromoji/Properties/AssemblyInfo.cs new file mode 100644 
index 0000000..2ea0e44 --- /dev/null +++ b/src/Lucene.Net.Analysis.Kuromoji/Properties/AssemblyInfo.cs @@ -0,0 +1,46 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +using System; +using System.Reflection; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +// General Information about an assembly is controlled through the following +// set of attributes. Change these attribute values to modify the information +// associated with an assembly. +[assembly: AssemblyTitle("Lucene.Net.Analysis.Kuromoji")] +[assembly: AssemblyDescription("Japanese Morphological Analyzer " + + "for the Lucene.Net full-text search engine library from The Apache Software Foundation.")] +[assembly: AssemblyConfiguration("")] +[assembly: AssemblyDefaultAlias("Lucene.Net.Analysis.Kuromoji")] +[assembly: AssemblyCulture("")] + +[assembly: CLSCompliant(true)] + +// Setting ComVisible to false makes the types in this assembly not visible +// to COM components. If you need to access a type in this assembly from +// COM, set the ComVisible attribute to true on that type. 
+[assembly: ComVisible(false)] + +// The following GUID is for the ID of the typelib if this project is exposed to COM +[assembly: Guid("8408625a-2508-46d5-8519-045183c43724")] + +// for testing +[assembly: InternalsVisibleTo("Lucene.Net.Tests.Analysis.Kuromoji")] + +// NOTE: Version information is in CommonAssemblyInfo.cs http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/Token.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Kuromoji/Token.cs b/src/Lucene.Net.Analysis.Kuromoji/Token.cs new file mode 100644 index 0000000..fd1afd2 --- /dev/null +++ b/src/Lucene.Net.Analysis.Kuromoji/Token.cs @@ -0,0 +1,194 @@
using Lucene.Net.Analysis.Ja.Dict;
using Lucene.Net.Support;
using System.Diagnostics.CodeAnalysis;

namespace Lucene.Net.Analysis.Ja
{
    /*
     * Licensed to the Apache Software Foundation (ASF) under one or more
     * contributor license agreements.  See the NOTICE file distributed with
     * this work for additional information regarding copyright ownership.
     * The ASF licenses this file to You under the Apache License, Version 2.0
     * (the "License"); you may not use this file except in compliance with
     * the License.  You may obtain a copy of the License at
     *
     *     http://www.apache.org/licenses/LICENSE-2.0
     *
     * Unless required by applicable law or agreed to in writing, software
     * distributed under the License is distributed on an "AS IS" BASIS,
     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     * See the License for the specific language governing permissions and
     * limitations under the License.
     */

    /// <summary>
    /// One analyzed token: a slice of a character buffer together with the
    /// dictionary entry that supplies its morphological data.
    /// </summary>
    public class Token
    {
        // Dictionary that every morphological lookup below is delegated to.
        private readonly IDictionary _dictionary;

        // Identifier passed to the dictionary on each lookup.
        private readonly int _wordId;

        // The token text is _surfaceForm[_offset .. _offset + _length).
        private readonly char[] _surfaceForm;
        private readonly int _offset;
        private readonly int _length;

        private readonly int _position;
        private int _positionLength;

        private readonly JapaneseTokenizerType _type;

        /// <summary>
        /// Creates a token over <paramref name="length"/> characters of
        /// <paramref name="surfaceForm"/> starting at <paramref name="offset"/>.
        /// </summary>
        public Token(int wordId, char[] surfaceForm, int offset, int length, JapaneseTokenizerType type, int position, IDictionary dictionary)
        {
            _dictionary = dictionary;
            _wordId = wordId;
            _surfaceForm = surfaceForm;
            _offset = offset;
            _length = length;
            _position = position;
            _type = type;
        }

        /// <summary>
        /// Returns a debug description listing the surface text, position,
        /// lengths, type, word id and the dictionary's left id for the word.
        /// </summary>
        public override string ToString()
        {
            return string.Concat(new object[]
            {
                "Token(\"", CopySurface(),
                "\" pos=", _position,
                " length=", _length,
                " posLen=", _positionLength,
                " type=", _type,
                " wordId=", _wordId,
                " leftID=", _dictionary.GetLeftId(_wordId),
                ")"
            });
        }

        /// <summary>
        /// The character buffer that contains this token's text.
        /// </summary>
        [WritableArray]
        [SuppressMessage("Microsoft.Performance", "CA1819", Justification = "Lucene's design requires some writable array properties")]
        public virtual char[] SurfaceForm
        {
            get { return _surfaceForm; }
        }

        /// <summary>
        /// Index in <see cref="SurfaceForm"/> where this token's text starts.
        /// </summary>
        public virtual int Offset
        {
            get { return _offset; }
        }

        /// <summary>
        /// Number of characters of <see cref="SurfaceForm"/> that make up this token.
        /// </summary>
        public virtual int Length
        {
            get { return _length; }
        }

        /// <summary>
        /// Copies this token's text out of <see cref="SurfaceForm"/>.
        /// </summary>
        /// <returns>The surface form as a new string.</returns>
        public virtual string GetSurfaceFormString()
        {
            return CopySurface();
        }

        /// <summary>
        /// Looks up the reading in the dictionary.
        /// </summary>
        /// <returns>The reading, or <c>null</c> if the token doesn't have one.</returns>
        public virtual string GetReading()
        {
            return _dictionary.GetReading(_wordId, _surfaceForm, _offset, _length);
        }

        /// <summary>
        /// Looks up the pronunciation in the dictionary.
        /// </summary>
        /// <returns>The pronunciation, or <c>null</c> if the token doesn't have one.</returns>
        public virtual string GetPronunciation()
        {
            return _dictionary.GetPronunciation(_wordId, _surfaceForm, _offset, _length);
        }

        /// <summary>
        /// Looks up the part of speech in the dictionary.
        /// </summary>
        /// <returns>The part of speech.</returns>
        public virtual string GetPartOfSpeech()
        {
            return _dictionary.GetPartOfSpeech(_wordId);
        }

        /// <summary>
        /// Looks up the inflection type in the dictionary.
        /// </summary>
        /// <returns>The inflection type, or <c>null</c>.</returns>
        public virtual string GetInflectionType()
        {
            return _dictionary.GetInflectionType(_wordId);
        }

        /// <summary>
        /// Looks up the inflection form in the dictionary.
        /// </summary>
        /// <returns>The inflection form, or <c>null</c>.</returns>
        public virtual string GetInflectionForm()
        {
            return _dictionary.GetInflectionForm(_wordId);
        }

        /// <summary>
        /// Looks up the base form in the dictionary.
        /// </summary>
        /// <returns>The base form, or <c>null</c> if the token is not inflected.</returns>
        public virtual string GetBaseForm()
        {
            return _dictionary.GetBaseForm(_wordId, _surfaceForm, _offset, _length);
        }

        /// <summary>
        /// Tells whether this token's type is <see cref="JapaneseTokenizerType.KNOWN"/>.
        /// </summary>
        /// <returns><c>true</c> if this token is in the standard dictionary; <c>false</c> if not.</returns>
        public virtual bool IsKnown()
        {
            return JapaneseTokenizerType.KNOWN == _type;
        }

        /// <summary>
        /// Tells whether this token's type is <see cref="JapaneseTokenizerType.UNKNOWN"/>.
        /// </summary>
        /// <returns><c>true</c> if this token is an unknown word; <c>false</c> if not.</returns>
        public virtual bool IsUnknown()
        {
            return JapaneseTokenizerType.UNKNOWN == _type;
        }

        /// <summary>
        /// Tells whether this token's type is <see cref="JapaneseTokenizerType.USER"/>.
        /// </summary>
        /// <returns><c>true</c> if this token is in the user dictionary; <c>false</c> if not.</returns>
        public virtual bool IsUser()
        {
            return JapaneseTokenizerType.USER == _type;
        }

        /// <summary>
        /// Index of this token in the input text.
        /// </summary>
        public virtual int Position
        {
            get { return _position; }
        }

        /// <summary>
        /// Gets or sets the length of this token measured in tokens. For normal
        /// tokens this is 1; for compound tokens it's > 1.
        /// </summary>
        public virtual int PositionLength
        {
            get { return _positionLength; }
            set { _positionLength = value; }
        }

        // Non-virtual copy of the token text. ToString uses this rather than
        // the virtual GetSurfaceFormString so overriding that method does not
        // change ToString's output.
        private string CopySurface()
        {
            return new string(_surfaceForm, _offset, _length);
        }
    }
}
http://git-wip-us.apache.org/repos/asf/lucenenet/blob/0f092010/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/BaseFormAttribute.cs ---------------------------------------------------------------------- diff --git a/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/BaseFormAttribute.cs b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/BaseFormAttribute.cs new file mode 100644 index 0000000..e3a06b3 --- /dev/null +++ b/src/Lucene.Net.Analysis.Kuromoji/TokenAttributes/BaseFormAttribute.cs @@ -0,0 +1,33 @@ +using Lucene.Net.Util; + +namespace Lucene.Net.Analysis.Ja.TokenAttributes +{ + /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + /// <summary> + /// Attribute for <see cref="Token.GetBaseForm()"/> + /// <para/> + /// Note: depending on part of speech, this value may not be applicable, + /// and will be null. + /// </summary> + public interface IBaseFormAttribute : IAttribute + { + /// <summary>Returns the base form; <c>null</c> when it is not applicable.</summary> + string GetBaseForm(); + /// <summary>Sets the <see cref="Token"/> for this attribute. NOTE(review): presumably the token whose base form <see cref="GetBaseForm()"/> reports - the implementation is not visible here, confirm.</summary> + void SetToken(Token token); + } +}
