This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-456
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 300042735860e626d90002b30b4d5b072892b73a
Author: Sergey Kamov <[email protected]>
AuthorDate: Tue Sep 28 12:55:21 2021 +0300

    Code cleanup.
---
 .../nlpcraft/probe/mgrs/NCProbeIdlToken.scala      |  41 +++++-
 .../apache/nlpcraft/probe/mgrs/NCProbeModel.scala  |   3 +
 .../probe/mgrs/nlp/NCProbeEnrichmentManager.scala  |   2 +-
 .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 104 +++++++-------
 .../probe/mgrs/sentence/NCSentenceManager.scala    |  25 ++--
 .../probe/mgrs/synonyms/NCSynonymsManager.scala    | 150 ++++++++++++---------
 6 files changed, 196 insertions(+), 129 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeIdlToken.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeIdlToken.scala
index 5da9808..d4fc27c 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeIdlToken.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeIdlToken.scala
@@ -26,11 +26,46 @@ import org.apache.nlpcraft.model.{NCToken, _}
   * @param word
   */
 case class NCProbeIdlToken(token: NCToken, word: NCNlpSentenceToken) {
-    val (origText: String, wordIndexes: Set[Int], minIndex: Int, maxIndex: Int, isToken: Boolean, isWord: Boolean) =
+    require(token != null ^ word != null)
+
+    val (
+        origText: String,
+        normText: String,
+        stem: String,
+        wordIndexes: Set[Int],
+        minIndex: Int,
+        maxIndex: Int,
+        startCharIndex: Int,
+        endCharIndex: Int,
+        isToken: Boolean,
+        isWord: Boolean
+    ) =
         if (token != null)
-            (token.origText, token.wordIndexes.toSet, token.wordIndexes.head, token.wordIndexes.last, true, false)
+            (
+                token.origText,
+                token.normText,
+                token.stem,
+                token.wordIndexes.toSet,
+                token.wordIndexes.head,
+                token.wordIndexes.last,
+                token.getStartCharIndex,
+                token.getEndCharIndex,
+                true,
+                false
+            )
         else
-            (word.origText, word.wordIndexes.toSet, word.wordIndexes.head, word.wordIndexes.last, false, true)
+            (
+                word.origText,
+                word.normText,
+                word.stem,
+                word.wordIndexes.toSet,
+                word.wordIndexes.head,
+                word.wordIndexes.last,
+                word.startCharIndex,
+                word.endCharIndex,
+                false,
+                true
+            )

     private lazy val hash = if (isToken) Seq(wordIndexes, token.getId).hashCode() else wordIndexes.hashCode()

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
index ea41793..6b6a8e8 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
@@ -62,6 +62,9 @@ case class NCProbeModel(
     lazy val hasNoIdlSynonyms: Boolean = continuousSynonyms.nonEmpty || sparseSynonyms.nonEmpty
     lazy val hasSparseSynonyms: Boolean = sparseSynonyms.nonEmpty || idlSynonyms.exists(_._2.exists(_.sparse))
     lazy val hasContinuousSynonyms: Boolean = continuousSynonyms.nonEmpty || idlSynonyms.exists(_._2.exists(!_.sparse))
+    lazy val isComplex: Boolean = hasIdlSynonyms || !model.getParsers.isEmpty

     def hasIdlSynonyms(elemId: String): Boolean = idlSynonyms.contains(elemId)
+
+
 }
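[Editor's note] The NCProbeIdlToken change above widens a tuple extracted eagerly from an either-or wrapper. A minimal standalone sketch of that pattern, with hypothetical simplified types (Tok and Word are stand-ins, not the project's API):

    // Exactly one of the two constituents must be present (same xor guard as above).
    final case class Tok(origText: String, wordIndexes: Seq[Int])
    final case class Word(origText: String, wordIndexes: Seq[Int])

    case class IdlTokenSketch(token: Tok, word: Word) {
        require(token != null ^ word != null)

        // Common fields are computed once, eagerly, so call sites never branch on the kind again.
        val (origText: String, minIndex: Int, maxIndex: Int, isToken: Boolean) =
            if (token != null)
                (token.origText, token.wordIndexes.head, token.wordIndexes.last, true)
            else
                (word.origText, word.wordIndexes.head, word.wordIndexes.last, false)
    }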
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
index fde865f..560ddff 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
@@ -492,7 +492,7 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
             }).toMap

             // Loop has sense if model is complex (has user defined parsers or IDL based synonyms)
-            continue = NCModelEnricher.isComplex(mdl) && res.exists { case (_, same) => !same }
+            continue = mdl.isComplex && res.exists { case (_, same) => !same }

             if (DEEP_DEBUG)
                 if (continue) {
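[Editor's note] A sketch of the loop the changed `continue` flag drives, with hypothetical signatures assuming only what the hunk above shows (re-enrichment is worth repeating only for complex models, and only while a pass still changes the result):

    def enrichToFixedPoint[S](sen: S, enrichOnce: S => Boolean, isComplex: Boolean, maxPasses: Int = 10): Unit = {
        var continue = true
        var pass = 0

        while (continue && pass < maxPasses) {
            val changed = enrichOnce(sen) // True if the pass added or removed any notes.

            // Loop makes sense only if the model is complex (user parsers or IDL synonyms).
            continue = isComplex && changed
            pass += 1
        }
    }

The `maxPasses` guard is this sketch's own addition, not something the commit shows.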
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 7196985..a39edfd 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -83,8 +83,6 @@ object NCModelEnricher extends NCProbeEnricher {
         ackStopped()
     }

-    def isComplex(mdl: NCProbeModel): Boolean = mdl.hasIdlSynonyms || !mdl.model.getParsers.isEmpty
-
     /**
      *
      * @param ns
      */
@@ -180,7 +178,8 @@ object NCModelEnricher extends NCProbeEnricher {
                     new NCCustomElement() {
                         override def getElementId: String = noteId
                         override def getWords: JList[NCCustomWord] = words
-                        override def getMetadata: JavaMeta = md.map(p => p._1 -> p._2.asInstanceOf[AnyRef]).asJava
+                        override def getMetadata: JavaMeta =
+                            md.map { case (k, v) => k -> v.asInstanceOf[AnyRef] }.asJava
                     }
                 }).asJava
             )
@@ -228,7 +227,7 @@ object NCModelEnricher extends NCProbeEnricher {
      * Example: Piece: 'x1, x2(stopword), x3(stopword), x4' will be expanded into
      * {'x1, x2, x3, x4', 'x1, x2, x4', 'x1, x3, x4', 'x1, x4'}
      *
-     * 3. All variants collected, duplicated deleted, etc.
+     * 3. All variants collected, duplicated sets deleted, etc.
      *
      * @param toks
      */
@@ -244,7 +243,7 @@ object NCModelEnricher extends NCProbeEnricher {
                 else
                     slides += mutable.ArrayBuffer.empty :+ stop

-            // Too many stopords inside skipped.
+            // Too many stopwords inside skipped.
             val bigSlides = slides.filter(_.size > 2)

             var stops4Delete: Seq[Seq[NlpToken]] =
@@ -255,7 +254,7 @@ object NCModelEnricher extends NCProbeEnricher {
                     if (stops4AllCombs.nonEmpty)
                         for (
                             seq1 <- Range.inclusive(0, stops4AllCombs.size).flatMap(stops4AllCombs.combinations);
-                             seq2 <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations)
+                            seq2 <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations)
                         )
                         yield seq1 ++ seq2.flatten
                     else
@@ -268,11 +267,10 @@ object NCModelEnricher extends NCProbeEnricher {
             stops4Delete = stops4Delete.filter(seq => !seq.contains(combo.head) && !seq.contains(combo.last))

             (Seq(combo) ++ stops4Delete.map(del => combo.filter(t => !del.contains(t)))).map(_ -> combo).distinct
-        }).
             filter(_._1.nonEmpty).
             groupBy(_._1).
             map(p => p._1 -> p._2.map(_._2).minBy(p => (-p.size, p.head.index))).
+            filter { case (seq, _) => seq.nonEmpty }.
+            groupBy { case (seq, _) => seq }.
+            map { case (toksKey, seq) => toksKey -> seq.map(_._2).minBy(p => (-p.size, p.head.index)) }.
             sortBy { case(data, combo) => (-combo.size, -data.size, combo.head.index, data.head.index) }

     /**
@@ -297,15 +295,17 @@ object NCModelEnricher extends NCProbeEnricher {
     /**
      *
-     * @param tows
+     * @param idlToks
      * @param ns
      */
-    private def toTokens(tows: Seq[IdlToken], ns: Sentence): Seq[NlpToken] =
-        (
-            tows.filter(_.isWord).map(_.word) ++
-            tows.filter(_.isToken).map(_.token).
-                flatMap(w => ns.filter(t => t.wordIndexes.intersect(w.wordIndexes).nonEmpty))
-        ).sortBy(_.startCharIndex)
+    private def toNlpTokens(idlToks: Seq[IdlToken], ns: Sentence): Seq[NlpToken] = {
+        val words = idlToks.filter(_.isWord).map(_.word)
+        val suitableToks =
+            idlToks.filter(_.isToken).map(_.token).
+                flatMap(w => ns.filter(t => t.wordIndexes.intersect(w.wordIndexes).nonEmpty))
+
+        (words ++ suitableToks).sortBy(_.startCharIndex)
+    }

     /**
@@ -378,6 +378,7 @@ object NCModelEnricher extends NCProbeEnricher {
     }

     /**
+     * Prepares IDL tokens based on NLP tokens.
      *
      * @param h
      * @param toks
@@ -391,9 +392,7 @@ object NCModelEnricher extends NCProbeEnricher {
                 // Drops without tokens (IDL part works with tokens).
                 if (rec.nonEmpty)
-                    Some(rec ++
-                        (seq.wordsIndexes.intersect(idxs) -- rec.flatMap(_.wordIndexes)).map(h.tokens)
-                    )
+                    Some(rec ++ (seq.wordsIndexes.intersect(idxs) -- rec.flatMap(_.wordIndexes)).map(h.tokens))
                 else
                     None
             }).seq
@@ -440,11 +439,11 @@ object NCModelEnricher extends NCProbeEnricher {
         for (
             // 'toksExt' is piece of sentence, 'toks' is the same as 'toksExt' or without some stopwords set.
             (toks, toksExt) <- combosTokens(ns.toSeq);
-             idxs = toks.map(_.index);
-             e <- mdl.elements.values;
-             elemId = e.getId;
-             greedy = e.isGreedy.orElse(mdl.model.isGreedy)
-             if !greedy || !alreadyMarked(ns, elemId, toks, idxs)
+            idxs = toks.map(_.index);
+            e <- mdl.elements.values;
+            elemId = e.getId;
+            greedy = e.isGreedy.orElse(mdl.model.isGreedy)
+            if !greedy || !alreadyMarked(ns, elemId, toks, idxs)
         ) {
             def add(
                 dbgType: String,
@@ -456,7 +455,7 @@ object NCModelEnricher extends NCProbeEnricher {
                 val ok =
                     (!greedy || !alreadyMarked(ns, elemId, elemToks, idxs)) &&
-                    ( parts.isEmpty || !parts.exists { case (t, _) => t.getId == elemId })
+                    ( parts.isEmpty || !parts.exists { case (tok, _) => tok.getId == elemId })

                 if (ok)
                     mark(
@@ -563,7 +562,7 @@ object NCModelEnricher extends NCProbeEnricher {
                     )
                 }
                 else
-                // 2.2 Sparse.
+                    // 2.2 Sparse.
                     for (syn <- allSyns; comb <- allCombs)
                         NCSynonymsManager.onSparseMatch(
                             ns.srvReqId,
@@ -573,7 +572,7 @@ object NCModelEnricher extends NCProbeEnricher {
                             req,
                             variantsToks,
                             res => {
-                                val toks = getSparsedTokens(toTokens(res, ns), toTokens(comb, ns))
+                                val toks = getSparsedTokens(toNlpTokens(res, ns), toNlpTokens(comb, ns))
                                 val parts = toParts(mdl, ns.srvReqId, res, syn)
                                 val typ = if (syn.sparse) "IDL sparse" else "IDL continuous"
@@ -607,6 +606,9 @@ object NCModelEnricher extends NCProbeEnricher {
      * @param ns
      */
     private def normalize(ns: Sentence): Unit = {
+        // Finds and removes user notes if the sentence contains notes with a similar structure but fewer swallowed stop-words.
+        // These stop-words can be used for detecting other user tokens and are harmless if they are free words.
+        // Notes with links, and notes referenced by them, aren't touched.
         val usrNotes = ns.flatten.filter(_.isUser).distinct
         val links = NCSentenceManager.getLinks(usrNotes)
         val parts = NCSentenceManager.getPartKeys(usrNotes)
@@ -638,28 +640,34 @@ object NCModelEnricher extends NCProbeEnricher {
     // TODO: simplify, add tests, check model properties (sparse etc) for optimization.
     /**
      *
-     * @param elmId
-     * @param toks
-     * @param sliceToksIdxsSorted
+     * @param elmId Element ID.
+     * @param toks Tokens.
+     * @param idxs Indexes; note that they may not exactly match the tokens' indexes (sparse case).
      */
-    private def alreadyMarked(ns: Sentence, elmId: String, toks: Seq[NlpToken], sliceToksIdxsSorted: Seq[Int]): Boolean = {
+    private def alreadyMarked(ns: Sentence, elmId: String, toks: Seq[NlpToken], idxs: Seq[Int]): Boolean = {
         lazy val toksIdxsSorted = toks.map(_.index).sorted

-        sliceToksIdxsSorted.map(ns).forall(_.exists(n => n.noteType == elmId && n.sparsity == 0)) ||
-        toks.exists(_.exists(n =>
-            n.noteType == elmId &&
-            (
-                (n.sparsity == 0 &&
-                    (sliceToksIdxsSorted.containsSlice(n.tokenIndexes) || n.tokenIndexes.containsSlice(toksIdxsSorted))
-                )
-                ||
-                (
-                    n.tokenIndexes == toksIdxsSorted ||
-                    n.tokenIndexes.containsSlice(toksIdxsSorted) &&
-                    U.isContinuous(toksIdxsSorted) &&
-                    U.isContinuous(n.tokenIndexes)
-                )
-            )
-        ))
+        // All tokens with the given indexes are found with zero sparsity.
+        val ok1 = idxs.map(ns).forall(_.exists(n => n.noteType == elmId && n.sparsity == 0))
+
+        lazy val ok2 =
+            toks.exists(_.exists(n =>
+                if (n.noteType == elmId) {
+                    val noteOk1 = n.sparsity == 0 &&
+                        (idxs.containsSlice(n.tokenIndexes) || n.tokenIndexes.containsSlice(toksIdxsSorted))
+
+                    lazy val noteOk2 =
+                        n.tokenIndexes == toksIdxsSorted ||
+                        n.tokenIndexes.containsSlice(toksIdxsSorted) &&
+                        U.isContinuous(toksIdxsSorted) &&
+                        U.isContinuous(n.tokenIndexes)
+
+                    noteOk1 || noteOk2
+                }
+                else
+                    false
+            ))
+
+        ok1 || ok2
     }
 }
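[Editor's note] The reworked `alreadyMarked` above splits one boolean expression into named `ok1`/`ok2` parts. A self-contained sketch of the same check over a simplified note model (Note, notesAt and isContinuous are hypothetical stand-ins for the project's types):

    final case class Note(noteType: String, sparsity: Int, tokenIndexes: Seq[Int])

    // Stand-in for U.isContinuous: indexes increase strictly by one.
    def isContinuous(idxs: Seq[Int]): Boolean =
        idxs.size < 2 || idxs.zip(idxs.tail).forall { case (a, b) => b == a + 1 }

    def alreadyMarked(notesAt: Int => Seq[Note], elemId: String, toksIdxsSorted: Seq[Int], idxs: Seq[Int]): Boolean = {
        // ok1: every requested index already carries a zero-sparsity note of this element type.
        val ok1 = idxs.forall(i => notesAt(i).exists(n => n.noteType == elemId && n.sparsity == 0))

        // ok2: some existing note of this element type subsumes (or equals) the candidate span.
        lazy val ok2 = toksIdxsSorted.flatMap(notesAt).exists(n =>
            n.noteType == elemId && (
                (n.sparsity == 0 &&
                    (idxs.containsSlice(n.tokenIndexes) || n.tokenIndexes.containsSlice(toksIdxsSorted))) ||
                (n.tokenIndexes == toksIdxsSorted ||
                    n.tokenIndexes.containsSlice(toksIdxsSorted) &&
                    isContinuous(toksIdxsSorted) && isContinuous(n.tokenIndexes))
            )
        )

        ok1 || ok2
    }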
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index 00d6bdf..f9f7a01 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -369,7 +369,7 @@ object NCSentenceManager extends NCService {
             val t = NCNlpSentenceToken(idx)

             // Note, it adds stop-words too.
-            val content = nsCopyToks.zipWithIndex.filter(p => indexes.contains(p._2)).map(_._1)
+            val content = nsCopyToks.zipWithIndex.filter { case (_, idx) => indexes.contains(idx) }.map { case (tok, _) => tok}

             content.foreach(t => history += t.index -> idx)
@@ -378,15 +378,12 @@ object NCSentenceManager extends NCService {
                 val n = content.size - 1

-                content.zipWithIndex.foreach(p => {
-                    val t = p._1
-                    val idx = p._2
-
+                content.zipWithIndex.foreach { case (t, idx) =>
                     buf += get(t)

                     if (idx < n && t.endCharIndex != content(idx + 1).startCharIndex)
                         buf += " "
-                })
+                }

                 buf.mkString
             }
@@ -459,8 +456,7 @@ object NCSentenceManager extends NCService {
             for (tok <- ns.filter(_.isTypeOf(noteType)) if ok)
                 tok.getNoteOpt(noteType, idxsField) match {
                     case Some(n) =>
-                        val idxs: Seq[Seq[Int]] =
-                            n.data[JList[JList[Int]]](idxsField).asScala.map(_.asScala.toSeq).toSeq
+                        val idxs: Seq[Seq[Int]] = n.data[JList[JList[Int]]](idxsField).asScala.map(_.asScala.toSeq).toSeq
                         var fixed = idxs

                         history.foreach {
@@ -539,8 +535,7 @@ object NCSentenceManager extends NCService {
         // Validation (all indexes calculated well)
         require(
             !res ||
-                !ns.flatten.
-                    exists(n => ns.filter(_.wordIndexes.exists(n.wordIndexes.contains)).exists(t => !t.contains(n))),
+                !ns.flatten.exists(n => ns.filter(_.wordIndexes.exists(n.wordIndexes.contains)).exists(t => !t.contains(n))),
             s"Invalid sentence:\n" +
                 ns.map(t =>
                     // Human readable invalid sentence for debugging.
@@ -745,9 +740,11 @@ object NCSentenceManager extends NCService {
                 )
             )

+        // Below, similar variants are dropped by several criteria.
+
         def notNlpNotes(s: NCNlpSentence): Seq[NCNlpSentenceNote] = s.flatten.filter(!_.isNlp)

-        // Drops similar sentences (with same notes structure). Keeps with more found.
+        // Drops similar sentences with the same notes structure, based on greedy elements. Keeps the one with more notes found.
         val notGreedyElems = mdl.getElements.asScala.flatMap(e => if (!e.isGreedy.orElse(mdl.isGreedy)) Some(e.getId) else None).toSet
@@ -768,6 +765,7 @@ object NCSentenceManager extends NCService {
         var sensWithNotesIdxs = sensWithNotes.zipWithIndex

+        // Drops similar sentences if there are other sentences with a superset of notes.
         sens = sensWithNotesIdxs.filter { case ((_, notNlpNotes1), idx1) =>
             !sensWithNotesIdxs.
                 exists { case((_, notNlpNotes2), _) => notNlpNotes1.subsetOf(notNlpNotes2) }
         }.map { case ((sen, _), _) => sen }

-        // Drops similar sentences (with same tokens structure).
-        // Among similar sentences we prefer one with minimal free words count.
+        // Drops similar sentences. Among similar sentences we prefer the one with the minimal free-word count.
         sens = sens.groupBy(notNlpNotes(_).map(_.getKey(withIndexes = false))).
             map { case (_, seq) => seq.minBy(_.filter(p => p.isNlp && !p.isStopWord).map(_.wordIndexes.length).sum) }.
             toSeq

-        // Drops sentences if they are just subset of another.
+        // Drops sentences if they are just a subset of another (indexes are ignored here).
         sensWithNotes = sensWithNotes.filter { case (sen, _) => sens.contains(sen) }

         sensWithNotesIdxs = sensWithNotes.zipWithIndex
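[Editor's note] The variant pruning above combines a superset filter with a min-by-free-words pick. A runnable sketch over a deliberately simplified variant model (Variant and its fields are hypothetical stand-ins):

    final case class Variant(notes: Set[String], freeWords: Int)

    def prune(variants: Seq[Variant]): Seq[Variant] = {
        // Drop a variant when some other variant's note set strictly subsumes it.
        val maximal = variants.filter(v =>
            !variants.exists(o => (o ne v) && v.notes != o.notes && v.notes.subsetOf(o.notes))
        )

        // Among variants with the same note structure keep the one with the fewest free words.
        maximal.groupBy(_.notes).map { case (_, same) => same.minBy(_.freeWords) }.toSeq
    }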
                getOrElseUpdate(
                    elemId,
@@ -51,7 +51,7 @@ object NCSynonymsManager extends NCService {
                 getOrElseUpdate(
                     tokens,
                     mutable.HashSet.empty[Synonym]
-                ).add(s)
+                ).add(syn)
         }

     private case class SavedIdlKey(id: String, startCharIndex: Int, endCharIndex: Int, other: Map[String, AnyRef] = Map.empty)
@@ -72,13 +72,11 @@ object NCSynonymsManager extends NCService {
             )
     }

-    private case class Value(request: NCRequest, variants: Seq[Seq[NCToken]], predicate: NCIdlFunction) {
-        override def toString: String = variants.toString()
-    }
+    private case class SavedIdlValue(request: NCRequest, variants: Seq[Seq[NCToken]], predicate: NCIdlFunction)

     private case class IdlChunkKey(token: IdlToken, chunk: NCProbeSynonymChunk)

-    private val savedIdl = mutable.HashMap.empty[String, mutable.HashMap[SavedIdlKey, mutable.ArrayBuffer[Value]]]
+    private val savedIdl = mutable.HashMap.empty[String, mutable.HashMap[SavedIdlKey, mutable.ArrayBuffer[SavedIdlValue]]]
     private val idlChunksCache = mutable.HashMap.empty[String, mutable.HashMap[IdlChunkKey, Boolean]]
     private val idlCaches = mutable.HashMap.empty[String, CacheHolder[IdlToken]]
     private val tokCaches = mutable.HashMap.empty[String, CacheHolder[Int]]
@@ -120,7 +118,7 @@ object NCSynonymsManager extends NCService {
     /**
      *
-     * @param s
+     * @param syn
      * @param toks
      * @param isMatch
      * @param getIndex
      * @param shouldBeNeighbors
      * @tparam T
      */
     private def sparseMatch0[T](
-        s: Synonym,
+        syn: Synonym,
         toks: Seq[T],
         isMatch: (T, NCProbeSynonymChunk) => Boolean,
         getIndex: T => Int,
         shouldBeNeighbors: Boolean
     ): Option[Seq[T]] =
-        if (toks.size >= s.size) {
+        if (toks.size >= syn.size) {
             lazy val res = mutable.ArrayBuffer.empty[T]
             lazy val all = mutable.HashSet.empty[T]

+            // There are 3 states:
+            // 0 - initial working state, first step.
+            // 1 - working state, not first step.
+            // -1 - stop state.
             var state = 0

-            for (chunk <- s if state != -1) {
+            for (chunk <- syn if state != -1) {
                 val seq =
                     if (state == 0) {
                         state = 1
@@ -153,12 +155,12 @@ object NCSynonymsManager extends NCService {
                 if (seq.nonEmpty) {
                     val head = seq.head

-                    if (!s.permute && res.nonEmpty && getIndex(head) <= getIndex(res.last))
+                    if (!syn.permute && res.nonEmpty && getIndex(head) <= getIndex(res.last))
                         state = -1
                     else {
                         all ++= seq

-                        if (all.size > s.size)
+                        if (all.size > syn.size)
                             state = -1
                         else
                             res += head
@@ -168,7 +170,12 @@ object NCSynonymsManager extends NCService {
                     state = -1
             }

-            if (state != -1 && all.size == res.size && (!shouldBeNeighbors || U.isIncreased(res.map(getIndex).toSeq.sorted)))
+            if (
+                state != -1 && // State is ok.
+                all.size == res.size && // No excess tokens were processed.
+                // `neighbors` condition, important for simple (non-sparse) synonyms.
+                (!shouldBeNeighbors || U.isIncreased(res.map(getIndex).toSeq.sorted))
+            )
                 Some(res.toSeq)
             else
                 None

     private def save(req: NCRequest, tok: NCToken, pred: NCIdlFunction, variantsToks: Seq[Seq[NCToken]]): Unit = {
         savedIdl.
             getOrElseUpdate(req.getServerRequestId, mutable.HashMap.empty).
-            getOrElseUpdate(SavedIdlKey(tok), mutable.ArrayBuffer.empty) +=
-                Value(req, variantsToks, pred)
+            getOrElseUpdate(SavedIdlKey(tok), mutable.ArrayBuffer.empty) +=
+                SavedIdlValue(req, variantsToks, pred)
     }

     /**
+     * Checks that the given synonym hasn't been checked yet against the given NLP token indexes.
      *
      * @param srvReqId
      * @param elemId
-     * @param s
+     * @param syn
      * @param tokens
      */
-    private def isUnprocessedTokens(srvReqId: String, elemId: String, s: Synonym, tokens: Seq[Int]): Boolean =
-        tokCaches.getOrElseUpdate(srvReqId, new CacheHolder[Int]).isUnprocessed(elemId, s, tokens)
+    private def isUnprocessedTokens(srvReqId: String, elemId: String, syn: Synonym, tokens: Seq[Int]): Boolean =
+        tokCaches.getOrElseUpdate(srvReqId, new CacheHolder[Int]).isUnprocessed(elemId, syn, tokens)

     /**
+     * Checks that the given synonym hasn't been checked yet against the given IDL tokens.
      *
      * @param srvReqId
      * @param elemId
-     * @param s
+     * @param syn
      * @param tokens
      */
-    private def isUnprocessedIdl(srvReqId: String, elemId: String, s: Synonym, tokens: Seq[IdlToken]): Boolean =
-        idlCaches.getOrElseUpdate(srvReqId, new CacheHolder[IdlToken]).isUnprocessed(elemId, s, tokens)
+    private def isUnprocessedIdl(srvReqId: String, elemId: String, syn: Synonym, tokens: Seq[IdlToken]): Boolean =
+        idlCaches.getOrElseUpdate(srvReqId, new CacheHolder[IdlToken]).isUnprocessed(elemId, syn, tokens)

     /**
+     * Checks whether an IDL token matches a synonym chunk.
      *
-     * @param tow
-     * @param chunk
-     * @param req
-     * @param variantsToks
+     * @param t IDL token.
+     * @param chunk Synonym's chunk.
+     * @param req Request.
+     * @param variantsToks All possible request's variants.
      */
     private def isMatch(
-        tow: IdlToken, chunk: NCProbeSynonymChunk, req: NCRequest, variantsToks: Seq[Seq[NCToken]]
+        t: IdlToken, chunk: NCProbeSynonymChunk, req: NCRequest, variantsToks: Seq[Seq[NCToken]]
     ): Boolean =
         idlChunksCache.
-            getOrElseUpdate(req.getServerRequestId,
+            getOrElseUpdate(
+                req.getServerRequestId,
                 mutable.HashMap.empty[IdlChunkKey, Boolean]
             ).
             getOrElseUpdate(
-                IdlChunkKey(tow, chunk),
+                IdlChunkKey(t, chunk),
                 {
-                    def get0[T](fromToken: NCToken => T, fromWord: NlpToken => T): T =
-                        if (tow.isToken) fromToken(tow.token) else fromWord(tow.word)
-
                     chunk.kind match {
-                        case TEXT => chunk.wordStem == get0(_.stem, _.stem)
+                        case TEXT => chunk.wordStem == t.stem

                         case REGEX =>
-                            chunk.regex.matcher(get0(_.origText, _.origText)).matches() ||
-                            chunk.regex.matcher(get0(_.normText, _.normText)).matches()
+                            chunk.regex.matcher(t.origText).matches() || chunk.regex.matcher(t.normText).matches()

                         case IDL =>
-                            val ok =
+                            val ok = {
+                                // IDL conditions apply to tokens only.
+                                t.isToken &&
+                                // At least one suitable variant (a valid NCIdlContext) should be found for the given token.
+                                // That variant will be checked again in the last processing phase.
                                 variantsToks.par.exists(vrntToks =>
-                                    get0(t =>
-                                        chunk.idlPred.apply(t, NCIdlContext(toks = vrntToks, req = req)). value.asInstanceOf[Boolean],
-                                        _ => false
+                                    chunk.idlPred.apply(
+                                        t.token,
+                                        NCIdlContext(toks = vrntToks, req = req)).value.asInstanceOf[Boolean]
                                 )
-                                )
+                            }

+                            // Saves all variants for later validation.
+                            // All suitable variants may be deleted, so this positive result can still be revoked
+                            // in the last processing phase.
                             if (ok)
-                                save(req, tow.token, chunk.idlPred, variantsToks)
+                                save(req, t.token, chunk.idlPred, variantsToks)

                             ok
@@ -270,22 +283,29 @@ object NCSynonymsManager extends NCService {
             require(toks != null)
             require(!syn.sparse && !syn.hasIdl)

-            if (
-                toks.length == syn.length && {
+            if (toks.length == syn.length) { // Same length.
+                val ok =
                     if (syn.isTextOnly)
-                        toks.zip(syn).forall(p => p._1.stem == p._2.wordStem)
+                        toks.zip(syn).
+                            // Checks all synonym chunks with all tokens.
+                            forall { case (tok, chunk) => tok.stem == chunk.wordStem }
                     else
-                        toks.zip(syn).sortBy(p => getSort(p._2.kind)).forall { case (tok, chunk) => isMatch(tok, chunk) }
-            }
-            )
-                callback(())
+                        toks.zip(syn).
+                            // Pre-sort by chunk kind for performance reasons; chunks that are easier to compare go first.
+                            sortBy { case (_, chunk) => getSort(chunk.kind) }.
+                            // Checks all synonym chunks with all tokens.
+                            forall { case (tok, chunk) => isMatch(tok, chunk) }

+                if (ok)
+                    callback(())
+            }
     }

     /**
      *
      * @param srvReqId
      * @param elemId
-     * @param s
+     * @param syn
      * @param toks
      * @param req
      * @param variantsToks
      */
     def onMatch(
         srvReqId: String,
         elemId: String,
-        s: Synonym,
+        syn: Synonym,
         toks: Seq[IdlToken],
         req: NCRequest,
         variantsToks: Seq[Seq[NCToken]],
         callback: Unit => Unit
     ): Unit =
-        if (isUnprocessedIdl(srvReqId, elemId, s, toks)) {
+        if (isUnprocessedIdl(srvReqId, elemId, syn, toks)) {
             require(toks != null)

             if (
-                toks.length == s.length && // Same length.
-                toks.count(_.isToken) >= s.idlChunks && // Enough tokens.
-                toks.zip(s).sortBy { // Pre-sort by chunk kind.
+                toks.length == syn.length && // Same length.
+                toks.count(_.isToken) >= syn.idlChunks && // Enough tokens.
+                toks.zip(syn).sortBy { // Pre-sort by chunk kind for performance reasons; chunks that are easier to compare go first.
                     case (_, chunk) => getSort(chunk.kind)
-                }
-                .forall { // TODO?
+                }.
+                forall { // Checks all synonym chunks with all tokens.
                     case (idlTok, chunk) => isMatch(idlTok, chunk, req, variantsToks)
                 }
             )
@@ -365,7 +385,7 @@ object NCSynonymsManager extends NCService {
             syn,
             toks,
             (t: IdlToken, chunk: NCProbeSynonymChunk) => isMatch(t, chunk, req, variantsToks),
-            (t: IdlToken) => if (t.isToken) t.token.getStartCharIndex else t.word.startCharIndex,
+            (t: IdlToken) => t.startCharIndex,
             shouldBeNeighbors = !syn.sparse
         ) match {
             case Some(res) => callback(res)
@@ -374,13 +394,15 @@ object NCSynonymsManager extends NCService {
     }

     /**
+     * Checks that a suitable variant wasn't deleted and that the IDL condition for the token is still valid.
+     * We have to re-check because the NCIdlContext used in the predicate is based on the variant.
      *
      * @param srvReqId
-     * @param senToks
+     * @param toks
      */
-    def isStillValidIdl(srvReqId: String, senToks: Seq[NCToken]): Boolean =
+    def isStillValidIdl(srvReqId: String, toks: Seq[NCToken]): Boolean =
         savedIdl.get(srvReqId) match {
-            case Some(m) =>
+            case Some(map) =>
                 lazy val allCheckedSenToks = {
                     val set = mutable.HashSet.empty[SavedIdlKey]

                     def add(t: NCToken): Unit = {
                         set += SavedIdlKey(t)

                         t.getPartTokens.asScala.foreach(add)
                     }

-                    senToks.foreach(add)
+                    toks.foreach(add)

                     set
                 }

-                senToks.forall(tok =>
-                    m.get(SavedIdlKey(tok)) match {
+                toks.forall(tok =>
+                    map.get(SavedIdlKey(tok)) match {
                         case Some(vals) =>
                             vals.exists(
                                 v =>
@@ -417,6 +439,7 @@ object NCSynonymsManager extends NCService {
     }

     /**
+     * Called when request processing is finished.
      *
      * @param srvReqId
      */
@@ -427,6 +450,7 @@ object NCSynonymsManager extends NCService {
     }

     /**
+     * Called on each request enrichment iteration.
      *
      * @param srvReqId
      */
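[Editor's note] Several hunks above repeat one idea: zip tokens with synonym chunks and pre-sort the pairs by chunk kind, so cheap comparisons run first and `forall` can fail fast before any expensive IDL predicate is evaluated. A minimal sketch of that ordering trick (the kinds and their costs mirror the TEXT/REGEX/IDL split shown above; everything else is hypothetical):

    sealed trait Kind
    case object TEXT extends Kind // Stem equality - cheapest.
    case object REGEX extends Kind // Regex matching - moderate.
    case object IDL extends Kind // Predicate over variants - most expensive.

    def cost(k: Kind): Int = k match {
        case TEXT => 0
        case REGEX => 1
        case IDL => 2
    }

    def matches[T, C](toks: Seq[T], chunks: Seq[(C, Kind)], isMatch: (T, C) => Boolean): Boolean =
        toks.length == chunks.length && // Same length, as in onMatch above.
        toks.zip(chunks).
            sortBy { case (_, (_, kind)) => cost(kind) }. // Cheap kinds first.
            forall { case (tok, (chunk, _)) => isMatch(tok, chunk) } // Fails fast on the cheap checks.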

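[Editor's note] Finally, the per-request CacheHolder used throughout NCSynonymsManager relies on a compact idiom: nested getOrElseUpdate maps ending in a HashSet whose add() reports first-time insertion. A standalone sketch (the middle Int key is assumed here to be the token-list size, which the hunks above don't confirm):

    import scala.collection.mutable

    class CacheHolderSketch[S, T] {
        private val cache =
            mutable.HashMap.empty[String, mutable.HashMap[Int, mutable.HashMap[Seq[T], mutable.HashSet[S]]]]

        // True exactly once per (elemId, syn, tokens) triple:
        // HashSet#add returns false when the synonym was already recorded for these tokens.
        def isUnprocessed(elemId: String, syn: S, tokens: Seq[T]): Boolean =
            cache.
                getOrElseUpdate(elemId, mutable.HashMap.empty).
                getOrElseUpdate(tokens.size, mutable.HashMap.empty).
                getOrElseUpdate(tokens, mutable.HashSet.empty).
                add(syn)
    }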