This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-287
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 70b9c9b60dd148a40828e6becef7003fbd5441e0
Author: Sergey Kamov <[email protected]>
AuthorDate: Thu Apr 8 16:10:43 2021 +0300

    WIP.
---
 .../apache/nlpcraft/common/nlp/NCNlpSentence.scala |   6 +-
 .../apache/nlpcraft/probe/mgrs/NCProbeModel.scala  |  12 +-
 .../probe/mgrs/deploy/NCDeployManager.scala        |   7 +-
 .../nlpcraft/probe/mgrs/model/NCModelManager.scala |  26 +-
 .../probe/mgrs/nlp/NCProbeEnrichmentManager.scala  |   2 +
 .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 629 ++++++++++++---------
 .../model/NCEnricherNestedModelSpec.scala          |  50 +-
 .../model/NCEnricherNestedModelSpec2.scala         |   5 +-
 8 files changed, 414 insertions(+), 323 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
index 91ca5a9..f2965cb 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
@@ -48,7 +48,8 @@ class NCNlpSentence(
     override val tokens: mutable.ArrayBuffer[NCNlpSentenceToken] = new mutable.ArrayBuffer[NCNlpSentenceToken](32),
     private val deletedNotes: mutable.HashMap[NCNlpSentenceNote, Seq[NCNlpSentenceToken]] = mutable.HashMap.empty,
     private var initNlpNotes: Map[NoteKey, NCNlpSentenceNote] = null,
-    private val nlpTokens: mutable.HashMap[TokenKey, NCNlpSentenceToken] = mutable.HashMap.empty
+    private val nlpTokens: mutable.HashMap[TokenKey, NCNlpSentenceToken] = mutable.HashMap.empty,
+    var firstProbePhase: Boolean = true
 ) extends NCNlpSentenceTokenBuffer(tokens) with JSerializable {
     @transient
     private var hash: java.lang.Integer = _
@@ -65,7 +66,8 @@ class NCNlpSentence(
             tokens = tokens.map(_.clone()),
             deletedNotes = deletedNotes.map(p ⇒ p._1.clone() → p._2.map(_.clone())),
             initNlpNotes = initNlpNotes,
-            nlpTokens = nlpTokens
+            nlpTokens = nlpTokens,
+            firstProbePhase = firstProbePhase
         )

     /**
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
index b99ddd0..0e418b3 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
@@ -28,9 +28,9 @@ import scala.collection.{Map, Seq}
  * @param model
  * @param solver
  * @param intents
- * @param nonSparseSynonyms
+ * @param directSynonyms
  * @param sparseSynonyms
- * @param nonSparseSynonymsDsl
+ * @param directSynonymsDsl
  * @param addStopWordsStems
  * @param exclStopWordsStems
  * @param suspWordsStems
@@ -40,13 +40,15 @@ case class NCProbeModel(
     model: NCModel,
     solver: NCIntentSolver,
     intents: Seq[NCIdlIntent],
-    nonSparseSynonyms: Map[String /*Element ID*/ , Map[Int /*Synonym length*/ , NCProbeSynonymsWrapper]], // Fast access map.
+    directSynonyms: Map[String /*Element ID*/ , Map[Int /*Synonym length*/ , NCProbeSynonymsWrapper]], // Fast access map.
     sparseSynonyms: Map[String /*Element ID*/, Seq[NCProbeSynonym]],
-    nonSparseSynonymsDsl: Map[String /*Element ID*/ , Map[Int /*Synonym length*/ , Seq[NCProbeSynonym]]], // Fast access map.
+    directSynonymsDsl: Map[String /*Element ID*/ , Seq[NCProbeSynonym]], // Fast access map.
     sparseSynonymsDsl: Map[String /*Element ID*/ , Seq[NCProbeSynonym]],
     addStopWordsStems: Set[String],
     exclStopWordsStems: Set[String],
     suspWordsStems: Set[String],
     elements: Map[String /*Element ID*/ , NCElement],
     samples: Set[(String, Seq[Seq[String]])]
-)
+) {
+    def hasDslSynonyms(elemId: String): Boolean = directSynonymsDsl.contains(elemId) || sparseSynonymsDsl.contains(elemId)
+}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala
index 06fe040..aa3b99e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala
@@ -198,7 +198,7 @@ object NCDeployManager extends NCService with DecorateAsScala {
         // TODO: Sparse for nonDSL
         def ok(b: Boolean, exp: Boolean): Boolean = if (exp) b else !b
         def filter(dsl: Boolean, sparse: Boolean): Set[SynonymHolder] =
-            syns.toSet.filter(s ⇒ ok(s.syn.exists(_.kind == IDL), dsl) && ok(s.sparse, sparse))
+            syns.toSet.filter(s ⇒ ok(s.syn.exists(_.kind == IDL), dsl) && ok(s.sparse && s.syn.size > 1, sparse))

         var cnt = 0
         val maxCnt = mdl.getMaxTotalSynonyms
@@ -502,6 +502,7 @@ object NCDeployManager extends NCService with DecorateAsScala {
         else
             logger.warn(s"Model has no intent: $mdlId")

+        // TODO: sort!!!
         def toMap(set: Set[SynonymHolder]): Map[String, Seq[NCProbeSynonym]] =
             set.groupBy(_.elmId).map(p ⇒ p._1 → p._2.map(_.syn).toSeq.sortBy(-_.size))

@@ -509,9 +510,9 @@ object NCDeployManager extends NCService with DecorateAsScala {
             model = mdl,
             solver = solver,
             intents = intents.map(_._1).toSeq,
-            nonSparseSynonyms = mkFastAccessMap(filter(dsl = false, sparse = false), NCProbeSynonymsWrapper(_)),
+            directSynonyms = mkFastAccessMap(filter(dsl = false, sparse = false), NCProbeSynonymsWrapper(_)),
             sparseSynonyms = toMap(filter(dsl = false, sparse = true)),
-            nonSparseSynonymsDsl = mkFastAccessMap(filter(dsl = true, sparse = false), _.sorted.reverse),
+            directSynonymsDsl = toMap(filter(dsl = true, sparse = false)),
             sparseSynonymsDsl = toMap(filter(dsl = true, sparse = true)),
             addStopWordsStems = addStopWords,
             exclStopWordsStems = exclStopWords,
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
index 03c59ff..457bf35 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
@@ -58,26 +58,26 @@ object NCModelManager extends NCService with DecorateAsScala {
                 data.values.foreach(w ⇒ {
                     val mdl = w.model

-                    val synCnt = w.nonSparseSynonyms.flatMap(_._2.map(_._2.count)).sum
-                    val synDslCnt = w.nonSparseSynonymsDsl.map(_._2.size).sum
+                    val synCnt = w.directSynonyms.flatMap(_._2.map(_._2.count)).sum
+                    val synDslCnt = w.directSynonymsDsl.map(_._2.size).sum
                     val synSparseCnt = w.sparseSynonyms.map(_._2.size).sum
                     val synSparseDslCnt = w.sparseSynonymsDsl.map(_._2.size).sum
                     val elmCnt = w.elements.keySet.size
                     val intentCnt = w.intents.size

-                    def getWithWarning(i: Int): String = if (i == 0) s"0 ${r("(!)")}" else i.toString
+                    def withWarn(i: Int): String = if (i == 0) s"0 ${r("(!)")}" else i.toString

                     tbl += Seq(
-                        s"Name: ${bo(c(mdl.getName))}",
-                        s"ID: ${bo(mdl.getId)}",
-                        s"Version: ${mdl.getVersion}",
-                        s"Origin: ${mdl.getOrigin}",
-                        s"Elements: ${getWithWarning(elmCnt)}",
-                        s"Synonyms(Continuous) $synCnt",
-                        s"Synonyms(Continuous, DSL): $synDslCnt",
-                        s"Synonyms(Sparse): $synSparseCnt",
-                        s"Synonyms(Sparse, DSL): $synSparseDslCnt",
-                        s"Intents: ${getWithWarning(intentCnt)}"
+                        s"Name: ${bo(c(mdl.getName))}",
+                        s"ID: ${bo(mdl.getId)}",
+                        s"Version: ${mdl.getVersion}",
+                        s"Origin: ${mdl.getOrigin}",
+                        s"Elements: ${withWarn(elmCnt)}",
+                        s"Synonyms(Direct) $synCnt",
+                        s"Synonyms(Direct, DSL): $synDslCnt",
+                        s"Synonyms(Sparse): $synSparseCnt",
+                        s"Synonyms(Sparse, DSL): $synSparseDslCnt",
+                        s"Intents: ${withWarn(intentCnt)}"
                     )
                 })
             }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
index 4a1466e..a1dbdac 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
@@ -486,6 +486,8 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
             // Loop has sense if model is complex (has user defined parsers or IDL based synonyms)
             continue = NCModelEnricher.isComplex(mdl) && res.exists { case (_, same) ⇒ !same }

+            nlpSen.firstProbePhase = false
+
             if (DEEP_DEBUG)
                 if (continue) {
                     val changed = res.filter(!_._2).keys.map(_.getClass.getSimpleName).mkString(", ")
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index f74346b..0542174 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -19,14 +19,14 @@ package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model

 import io.opencensus.trace.Span
 import org.apache.nlpcraft.common._
-import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken => NlpToken, _}
+import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken ⇒ NlpToken, _}
 import org.apache.nlpcraft.model._
 import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.NCDslContent
 import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.{NCSynonymChunkKind, _}
 import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
 import org.apache.nlpcraft.probe.mgrs.nlp.impl.NCRequestImpl
 import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager
-import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeSynonym ⇒ Synonym, NCProbeVariants}
+import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeVariants, NCProbeSynonym ⇒ Synonym}

 import java.io.Serializable
 import java.util
@@ -91,6 +91,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
             else
                 wordIndexes.subsetOf(indexes)

+
         override def equals(obj: Any): Boolean = obj match {
             case x: Complex ⇒
                 hash == x.hash && (isToken && x.isToken && token == x.token || isWord && x.isWord && word == x.word)
@@ -101,7 +102,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
         override def toString: String = {
             val idxs = wordIndexes.mkString(",")

-            if (isToken) s"'$origText' (${token.getId}) [$idxs]]" else s"'$origText' [$idxs]"
+            if (isToken && token.getId != "nlpcraft:nlp") s"'$origText' (${token.getId}) [$idxs]]" else s"'$origText' [$idxs]"
         }
     }

@@ -125,6 +126,8 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
         override def toString: String =
             tokensComplexes.mkString(" | ")
     }

+    case class ComplexHolder(complexesWords: Seq[Complex], complexes: Seq[ComplexSeq])
+
     // Found-by-synonym model element.
     case class ElementMatch(
         element: NCElement,
@@ -164,6 +167,14 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
         override def toString: String =
             s"Element=${element.getId}, indexes=${tokens.map(_.index).mkString(",")}, synonym=$synonym"
     }

+    object State extends Enumeration {
+        type State = Value
+
+        val SIMPLE, DSL_FIRST, DSL_NEXT = Value
+    }
+
+    import State._
+
     /**
      *
      * @param parent Optional parent span.
@@ -246,6 +257,92 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
     }

     /**
+     *
+     * @param mdl
+     * @param ns
+     * @param span
+     * @param req
+     */
+    private def processParsers(mdl: NCProbeModel, ns: NCNlpSentence, span: Span, req: NCRequestImpl): Unit = {
+        val parsers = mdl.model.getParsers
+
+        for (parser ← parsers.asScala) {
+            parser.onInit()
+
+            startScopedSpan("customParser", span,
+                "srvReqId" → ns.srvReqId,
+                "mdlId" → mdl.model.getId,
+                "txt" → ns.text) { _ ⇒
+                def to(t: NlpToken): NCCustomWord =
+                    new NCCustomWord {
+                        override def getNormalizedText: String = t.normText
+                        override def getOriginalText: String = t.origText
+                        override def getStartCharIndex: Int = t.startCharIndex
+                        override def getEndCharIndex: Int = t.endCharIndex
+                        override def getPos: String = t.pos
+                        override def getPosDescription: String = t.posDesc
+                        override def getLemma: String = t.lemma
+                        override def getStem: String = t.stem
+                        override def isStopWord: Boolean = t.isStopWord
+                        override def isBracketed: Boolean = t.isBracketed
+                        override def isQuoted: Boolean = t.isQuoted
+                        override def isKnownWord: Boolean = t.isKnownWord
+                        override def isSwearWord: Boolean = t.isSwearWord
+                        override def isEnglish: Boolean = t.isEnglish
+                    }
+
+                val res = parser.parse(
+                    req,
+                    mdl.model,
+                    ns.map(to).asJava,
+                    ns.flatten.distinct.filter(!_.isNlp).map(n ⇒ {
+                        val noteId = n.noteType
+                        val words = ns.filter(t ⇒ n.tokenIndexes.contains(t.index)).map(to).asJava
+                        val md = n.asMetadata()
+
+                        new NCCustomElement() {
+                            override def getElementId: String = noteId
+                            override def getWords: util.List[NCCustomWord] = words
+                            override def getMetadata: JavaMeta = md.map(p ⇒ p._1 → p._2.asInstanceOf[AnyRef]).asJava
+                        }
+                    }).asJava
+                )
+
+                if (res != null)
+                    res.asScala.foreach(e ⇒ {
+                        val elemId = e.getElementId
+                        val words = e.getWords
+
+                        if (elemId == null)
+                            throw new NCE(s"Custom model parser cannot return 'null' element ID.")
+
+                        if (words == null || words.isEmpty)
+                            throw new NCE(s"Custom model parser cannot return empty custom tokens [elementId=$elemId]")
+
+                        val matchedToks = words.asScala.map(w ⇒
+                            ns.find(t ⇒
+                                t.startCharIndex == w.getStartCharIndex && t.endCharIndex == w.getEndCharIndex
+                            ).getOrElse(throw new AssertionError(s"Custom model parser returned an invalid custom token: $w"))
+                        )
+
+                        if (!alreadyMarked(matchedToks, elemId))
+                            mark(
+                                ns,
+                                elem = mdl.elements.getOrElse(elemId, throw new NCE(s"Custom model parser returned unknown element ID: $elemId")),
+                                toks = matchedToks,
+                                direct = true,
+                                syn = None,
+                                metaOpt = Some(e.getMetadata.asScala),
+                                parts = Seq.empty
+                            )
+                    })
+            }

+            parser.onDiscard()
+        }
+    }
+
+    /**
      * Gets all sequential permutations of given tokens.
      *
      * For example, if buffer contains "a b c d" tokens, then this function will return the
@@ -306,7 +403,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
      * @param tows
      * @param ns
      */
-    private def toNlpTokens(tows: Seq[NCDslContent], ns: NCNlpSentence): Seq[NlpToken] =
+    private def toTokens(tows: Seq[NCDslContent], ns: NCNlpSentence): Seq[NlpToken] =
         (
             tows.filter(_.isRight).map(_.right.get) ++
                 tows.filter(_.isLeft).map(_.left.get).
@@ -320,6 +417,14 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
     private def tokString(toks: Seq[NlpToken]): String = toks.map(t ⇒ (t.origText, t.index)).mkString(" ")

     /**
+     *
+     * @param m
+     * @param id
+     * @return
+     */
+    private def get(m: Map[String , Seq[Synonym]], id: String): Seq[Synonym] = m.getOrElse(id, Seq.empty)
+
+    /**
      * Gets synonyms sorted in descending order by their weight (already prepared),
      * i.e. first synonym in the sequence is the most important one.
      *
      * @param fastMap
      * @param elmId
      * @param len
      */
     private def fastAccess[T](fastMap: Map[String, Map[Int, T]], elmId: String, len: Int): Option[T] =
         fastMap.getOrElse(elmId, Map.empty[Int, T]).get(len)

+    /**
+     *
+     * @param mdl
+     * @param ns
+     */
+    private def mkComplexes(mdl: NCProbeModel, ns: NCNlpSentence): ComplexHolder = {
+        val complexesWords = ns.map(Complex(_))
+
+        val complexes: Seq[ComplexSeq] =
+            NCProbeVariants.convert(ns.srvReqId, mdl, NCSentenceManager.collapse(mdl.model, ns.clone())).
+                map(_.asScala).
+                par.
+                flatMap(sen ⇒
+                    // Tokens splitting.
+                    // For example sentence "A B C D E" (5 words) processed as 3 tokens on first phase after collapsing
+                    // 'A B' (2 words), 'C D' (2 words) and 'E' (1 word)
+                    // So, result combinations will be:
+                    // Token(AB) + Token(CD) + Token(E)
+                    // Token(AB) + Word(C) + Word(D) + Token(E)
+                    // Word(A) + Word(B) + Token(CD) + Token(E)
+                    // Word(A) + Word(B) + Word(C) + Word(D) + Token(E)
+                    combos(sen).
+                        map(senPartComb ⇒ {
+                            sen.flatMap(t ⇒
+                                // Single word token is not split as words - token.
+                                // Partly (not strict in) token - word.
+                                if (t.wordIndexes.length == 1 || senPartComb.contains(t))
+                                    Seq(Complex(t))
+                                else
+                                    t.wordIndexes.map(complexesWords)
+                            )
+                        // Drops without tokens (IDL part works with tokens).
+                        }).filter(_.exists(_.isToken)).map(ComplexSeq(_)).distinct
+                ).seq
+
+        ComplexHolder(complexesWords, complexes)
+    }
+
+    /**
+     *
+     * @param h
+     * @param toks
+     */
+    private def mkComplexCombinations(h: ComplexHolder, toks: Seq[NlpToken]): Seq[Seq[Complex]] = {
+        val idxsSeq = toks.flatMap(_.wordIndexes)
+//        val idxsSorted = idxsSeq.sorted
+        val idxs = idxsSeq.toSet
+//        val idxMin = idxsSorted.head
+//        val idxMax = idxsSorted.last
+
+        h.complexes.par.
+            flatMap(complexSeq ⇒ {
+                //val rec = complexSeq.tokensComplexes.filter(_.isSubsetOf(idxMin, idxMax, idxs))
+                val rec = complexSeq.tokensComplexes.filter(_.wordIndexes.exists(idxsSeq.contains))
+
+                // Drops without tokens (IDL part works with tokens).
+                if (rec.nonEmpty)
+                    Some(
+                        rec ++
+                            (complexSeq.wordsIndexes.intersect(idxs) -- rec.flatMap(_.wordIndexes)).map(h.complexesWords)
+                    )
+                else
+                    None
+            }).seq
+    }
+
+    /**
+     *
+     * @param ns
+     * @param mdlId
+     * @param matches
+     */
+    private def processMatches(ns: NCNlpSentence, mdlId: String, matches: Seq[ElementMatch]): Unit = {
+        // Scans by elements that are found with same tokens length.
+        // Inside, for each token we drop all non-optimized combinations.
+        // Example:
+        // 1. element's synonym - 'a b', isSparse 'true', isPermuteSynonyms 'true'
+        // 2. Request 'a b a b',
+        // Initially found 0-1, 1-2, 2-3, 0-3.
+        // 0-3 will be deleted because for 0 and 3 tokens best variants found for same element with same tokens length.
+        val matchesNorm =
+            matches.
+                flatMap(m ⇒ m.tokens.map(_ → m)).
+                groupBy { case (t, m) ⇒ (m.element.getId, m.length, t) }.
+                flatMap { case (_, seq) ⇒
+                    // Optimization by sparsity sum for each tokens set for one element found with same tokens count.
+                    U.permute(
+                        seq.groupBy { case (tok, _) ⇒ tok }.
+                            map { case (_, seq) ⇒ seq.map { case (_, m) ⇒ m }.toList }.toList
+                    ).minBy(_.map(_.sparsity).sum)
+                }.
+                toSeq.
+                distinct
+
+        val matchCnt = matchesNorm.size
+
+        // TODO:matchesNorm
+        // Add notes for all remaining (non-intersecting) matches.
+        for ((m, idx) ← matches.zipWithIndex) {
+            if (DEEP_DEBUG)
+                logger.trace(
+                    s"Model '$mdlId' element found (${idx + 1} of $matchCnt) [" +
+                    s"elementId=${m.element.getId}, " +
+                    s"synonym=${m.synonym}, " +
+                    s"tokens=${tokString(m.tokens)}" +
+                    s"]"
+                )
+
+            val elm = m.element
+            val syn = m.synonym
+
+            val tokIdxs = m.tokens.map(_.index)
+            val direct = syn.isDirect && (tokIdxs == tokIdxs.sorted)
+
+            // TODO:
+            if (!alreadyMarked(m.tokens, elm.getId)) {
+                mark(ns, elem = elm, toks = m.tokens, direct = direct, syn = Some(syn), metaOpt = None, parts = m.parts)
+
+                println(s"SET: ${elm.getId}, m.tokens=${m.tokens.map(_.origText).mkString("|")}")
+            }
+            else
+                println(s"NOT SET: ${elm.getId}, m.tokens=${m.tokens.map(_.origText).mkString("|")}")
+        }
+    }
+
     @throws[NCE]
     override def enrich(mdl: NCProbeModel, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit = {
         require(isStarted)

-        startScopedSpan("enrich", parent,
-            "srvReqId" → ns.srvReqId,
-            "mdlId" → mdl.model.getId,
-            "txt" → ns.text
-        ) { span ⇒
-            val req = NCRequestImpl(senMeta, ns.srvReqId)
-            val tokIdxs = ns.map(t ⇒ t → t.wordIndexes).toMap
-            val firstPhase = !ns.exists(_.isUser)
+        val mdlId = mdl.model.getId
+        val srvReqId = ns.srvReqId
+
+        startScopedSpan("enrich", parent, "srvReqId" → srvReqId, "mdlId" → mdlId, "txt" → ns.text) { span ⇒
+            val req = NCRequestImpl(senMeta, srvReqId)
             val matches = mutable.ArrayBuffer.empty[ElementMatch]
             val cacheSparse = mkCache()
-            val cacheNotSparse = mkCache()
+            val cacheDirect = mkCache()
+            val h = mkComplexes(mdl, ns)
+
+            var found = false
+
+            def add(typ: String, elm: NCElement, cache: Cache, res: Seq[NlpToken], tokIdxs: Seq[Int], s: Synonym, parts: Seq[TokType]): Unit = {
+                val toksSet = res.toSet

-            def addMatch(elm: NCElement, toks: Seq[NlpToken], syn: Synonym, parts: Seq[TokType]): Unit = {
-                val toksSet = toks.toSet
+                var added = false

                 // TODO:
-                //if (!matches.exists(m ⇒ m.element.getId == elm.getId && toksSet.subsetOf(m.tokensSet)))
-                matches += ElementMatch(elm, toks, syn, parts)
-            }
+                if (!matches.exists(m ⇒ m.element.getId == elm.getId && toksSet.subsetOf(m.tokensSet))) {
+                    matches += ElementMatch(elm, res, s, parts)

-            lazy val complexesWords = ns.map(Complex(_))
-            lazy val complexes: Seq[ComplexSeq] =
-                NCProbeVariants.convert(ns.srvReqId, mdl, NCSentenceManager.collapse(mdl.model, ns.clone())).
-                    map(_.asScala).
-                    par.
-                    flatMap(sen ⇒
-                        // Tokens splitting.
-                        // For example sentence "A B C D E" (5 words) processed as 3 tokens on first phase after collapsing
-                        // 'A B' (2 words), 'C D' (2 words) and 'E' (1 word)
-                        // So, result combinations will be:
-                        // Token(AB) + Token(CD) + Token(E)
-                        // Token(AB) + Word(C) + Word(D) + Token(E)
-                        // Word(A) + Word(B) + Token(CD) + Token(E)
-                        // Word(A) + Word(B) + Word(C) + Word(D) + Token(E)
-                        combos(sen).
-                            map(senPartComb ⇒ {
-                                sen.flatMap(t ⇒
-                                    // Single word token is not split as words - token.
-                                    // Partly (not strict in) token - word.
-                                    if (t.wordIndexes.length == 1 || senPartComb.contains(t))
-                                        Seq(Complex(t))
-                                    else
-                                        t.wordIndexes.map(complexesWords)
-                                )
-                            // Drops without tokens (IDL part works with tokens).
-                            }).filter(_.exists(_.isToken)).map(ComplexSeq(_)).distinct
-                    ).seq
-
-            startScopedSpan("synsProc", span,
-                "srvReqId" → ns.srvReqId,
-                "mdlId" → mdl.model.getId,
-                "txt" → ns.text
-            ) {
-                _ ⇒
-                    for (toks ← combos(ns)) {
-                        val indexes = toks.map(_.index)
-
-                        lazy val dslCombs: Map[Int, Seq[Seq[Complex]]] = {
-                            val idxsSeq = toks.flatMap(tokIdxs)
-                            val idxsSorted = idxsSeq.sorted
-                            val idxs = idxsSeq.toSet
-                            val idxMin = idxsSorted.head
-                            val idxMax = idxsSorted.last
-
-                            lazy val sorted = idxsSorted.zipWithIndex.toMap
-
-                            complexes.par.
-                                flatMap(complexSeq ⇒ {
-                                    val rec = complexSeq.tokensComplexes.filter(_.isSubsetOf(idxMin, idxMax, idxs))
-
-                                    // Drops without tokens (IDL part works with tokens).
-                                    if (rec.nonEmpty)
-                                        Some(
-                                            rec ++
-                                                (complexSeq.wordsIndexes.intersect(idxs) -- rec.flatMap(_.wordIndexes)).
-                                                    map(complexesWords)
-                                        )
-                                    else
-                                        None
-                                }).
-                                map(_.sortBy(p ⇒ sorted(p.wordIndexes.head))).seq.groupBy(_.length)
-                        }
-
-                        lazy val tokStems = toks.map(_.stem).mkString(" ")
-
-                        // Attempt to match each element.
-                        for (
-                            elm ← mdl.elements.values;
-                            elemId = elm.getId
-                            if !alreadyMarked(toks, elm.getId);
-                            sparseEnabled = !cacheSparse(elemId).exists(_.containsSlice(indexes));
-                            notSparseEnabled = !cacheNotSparse(elemId).exists(_.containsSlice(indexes))
-                            if sparseEnabled || notSparseEnabled
-                        ) {
-                            var found = false
-
-                            def add(cache: Cache, res: Seq[NlpToken], s: Synonym, parts: Seq[TokType]): Unit = {
-                                addMatch(elm, res, s, parts)
-                                cache(elemId) += indexes
-                                found = true
-                            }
-
-                            def addSparse(res: Seq[NlpToken], s: Synonym, parts: Seq[TokType]): Unit = add(cacheSparse, res, s, parts)
-                            def addNotSparse(s: Synonym, parts: Seq[TokType]): Unit = add(cacheNotSparse, toks, s, parts)
-
-                            // 1. Simple, not sparse.
-                            if (firstPhase && notSparseEnabled && !found)
-                                fastAccess(mdl.nonSparseSynonyms, elemId, toks.length) match {
-                                    case Some(h) ⇒
-                                        def tryMap(synsMap: Map[String, Synonym], notFound: () ⇒ Unit): Unit =
-                                            synsMap.get(tokStems) match {
-                                                case Some(syn) ⇒ addNotSparse(syn, Seq.empty)
-                                                case None ⇒ notFound()
-                                            }
-
-                                        def tryScan(synsSeq: Seq[Synonym]): Unit =
-                                            for (syn ← synsSeq if !found)
-                                                if (syn.isMatch(toks))
-                                                    addNotSparse(syn, Seq.empty)
-
-                                        tryMap(
-                                            h.txtDirectSynonyms,
-                                            () ⇒ {
-                                                tryScan(h.notTxtDirectSynonyms)
-
-                                                if (!found)
-                                                    tryMap(
-                                                        h.txtNotDirectSynonyms,
-                                                        () ⇒ tryScan(h.notTxtNotDirectSynonyms)
-                                                    )
-                                            }
-                                        )
-                                    case None ⇒ // No-op.
-                                }
-
-                            // 2. DSL, non sparse.
-                            if (notSparseEnabled && mdl.nonSparseSynonymsDsl.nonEmpty && !found) {
-                                for (
-                                    (len, seq) ← dslCombs;
-                                    syn ← fastAccess(mdl.nonSparseSynonymsDsl, elemId, len).getOrElse(Seq.empty);
-                                    comb ← seq if !found
-                                ) {
-                                    if (syn.isMatch(comb.map(_.data), req))
-                                        addNotSparse(syn, getPartsComplex(comb, syn))
-                                }
-                            }
+                    added = true
+                }
+
+                cache(elm.getId) += tokIdxs
+                found = true
+
+                println(s"ADDED: ${elm.getId}, type=$typ, res=${res.map(_.origText).mkString("|")}, tokIdxs=${tokIdxs.mkString("|")}, added=$added")
+            }

-                            // 3. Simple, sparse.
-                            if (firstPhase && sparseEnabled && !found)
-                                for (syn ← mdl.sparseSynonyms.getOrElse(elemId, Seq.empty) if !found)
-                                    syn.trySparseMatch(toks) match {
-                                        case Some(res) ⇒ addSparse(res, syn, Seq.empty)
-                                        case None ⇒ // No-op.
-                                    }
+            startScopedSpan("synsProc", span, "srvReqId" → srvReqId, "mdlId" → mdlId, "txt" → ns.text) { _ ⇒
+                var state = if (ns.firstProbePhase) SIMPLE else DSL_NEXT
+                ns.firstProbePhase = false
+                val combosToks = combos(ns)
+
+                def go(): Unit = {
+                    println
+                    println(s"GO $state")
+
+                    for (toks ← combosToks) {
+                        val tokIdxs = toks.map(_.index)
+                        lazy val dslCombs: Seq[Seq[Complex]] = mkComplexCombinations(h, toks)
+                        lazy val tokStems = toks.map(_.stem).mkString(" ")
+
+                        // Attempt to match each element.
+                        // TODO: alreadyMarked - the same match can be found again but marked with fewer tokens (how to avoid considering it right away?)
+                        for (
+                            elm ← mdl.elements.values;
+                            elemId = elm.getId;
+                            if
+                                !alreadyMarked(toks, elm.getId)
+                        ) {
+                            val directProc = cacheDirect(elemId).exists(_.containsSlice(tokIdxs))
+                            val sparseProc = cacheSparse(elemId).exists(_.containsSlice(tokIdxs))
+
+                            // 1. SIMPLE.
+                            found = false
+
+                            val simpleEnabled: Boolean =
+                                state match {
+                                    case SIMPLE ⇒ !mdl.hasDslSynonyms(elemId)
+                                    case DSL_FIRST ⇒ mdl.hasDslSynonyms(elemId)
+                                    case _ ⇒ false
+                                }

-                            // 4. DSL, sparse.
-                            if (sparseEnabled && mdl.sparseSynonymsDsl.nonEmpty && !found)
-                                for (
-                                    syn ← mdl.sparseSynonymsDsl.getOrElse(elemId, Seq.empty);
-                                    (_, seq) ← dslCombs;
-                                    comb ← seq if !found
-                                ) {
-                                    syn.trySparseMatch(comb.map(_.data), req) match {
-                                        case Some(towsRes) ⇒ addSparse(toNlpTokens(towsRes, ns), syn, toParts(towsRes, syn))
-                                        case None ⇒ // No-op.
-                                    }
-                                }
-                        }
-                    }
-            }
+                            // 1.1 Direct.
+                            if (simpleEnabled && !directProc && !found)
+                                fastAccess(mdl.directSynonyms, elemId, toks.length) match {
+                                    case Some(h) ⇒
+                                        def tryMap(syns: Map[String, Synonym], notFound: () ⇒ Unit): Unit =
+                                            syns.get(tokStems) match {
+                                                case Some(s) ⇒ add("direct simple", elm, cacheDirect, toks, tokIdxs, s, Seq.empty)
+                                                case None ⇒ notFound()
+                                            }
+
+                                        def tryScan(syns: Seq[Synonym]): Unit =
+                                            for (s ← syns if !found)
+                                                if (s.isMatch(toks))
+                                                    add("direct simple2", elm, cacheDirect, toks, tokIdxs, s, Seq.empty)
+
+                                        tryMap(
+                                            h.txtDirectSynonyms,
+                                            () ⇒ {
+                                                tryScan(h.notTxtDirectSynonyms)
+
+                                                if (!found)
+                                                    tryMap(h.txtNotDirectSynonyms, () ⇒ tryScan(h.notTxtNotDirectSynonyms))
+                                            }
+                                        )
+                                    case None ⇒ // No-op.
+                                }

-            // Scans by elements that are found with same tokens length.
-            // Inside, for each token we drop all non-optimized combinations.
-            // Example:
-            // 1. element's synonym - 'a b', isSparse 'true', isPermuteSynonyms 'true'
-            // 2. Request 'a b a b',
-            // Initially found 0-1, 1-2, 2-3, 0-3.
-            // 0-3 will be deleted because for 0 and 3 tokens best variants found for same element with same tokens length.
-            val matchesNorm =
-                matches.
-                    flatMap(m ⇒ m.tokens.map(_ → m)).
-                    groupBy { case (t, m) ⇒ (m.element.getId, m.length, t) }.
-                    flatMap { case (_, seq) ⇒
-                        // Optimization by sparsity sum for each tokens set for one element found with same tokens count.
-                        U.permute(
-                            seq.groupBy { case (tok, _) ⇒ tok }.
-                                map { case (_, seq) ⇒ seq.map { case (_, m) ⇒ m }.toList }.toList
-                        ).minBy(_.map(_.sparsity).sum)
-                    }.
-                    toSeq.
-                    distinct
-
-            val matchCnt = matchesNorm.size
-
-            // TODO:matchesNorm
-            // Add notes for all remaining (non-intersecting) matches.
-            for ((m, idx) ← matches.zipWithIndex) {
-                if (DEEP_DEBUG)
-                    logger.trace(
-                        s"Model '${mdl.model.getId}' element found (${idx + 1} of $matchCnt) [" +
-                        s"elementId=${m.element.getId}, " +
-                        s"synonym=${m.synonym}, " +
-                        s"tokens=${tokString(m.tokens)}" +
-                        s"]"
-                    )
-
-                val elm = m.element
-                val syn = m.synonym
+                            // 1.2 Sparse.
+                            if (simpleEnabled && !sparseProc && !found)
+                                for (s ← get(mdl.sparseSynonyms, elemId) if !found)
+                                    s.trySparseMatch(toks) match {
+                                        case Some(res) ⇒ add("sparse simple", elm, cacheSparse, res, tokIdxs, s, Seq.empty)
+                                        case None ⇒ // No-op.
+                                    }
+
+                            // 2. DSL.
+                            found = false
+                            val dslEnabled = state != SIMPLE
+
+                            // 2.1 Direct.
+                            if (dslEnabled && mdl.directSynonymsDsl.nonEmpty && !directProc && !found)
+                                for (s ← get(mdl.directSynonymsDsl, elemId); comb ← dslCombs if !found) {
+                                    if (s.isMatch(comb.map(_.data), req)) {
+                                        println(s"OK $elemId for s=$s for toks:${toks.map(_.origText)}")
+
+                                        add("direct DSL", elm, cacheDirect, toks, tokIdxs, s, getPartsComplex(comb, s))
+                                    }
+                                    println {
+                                        println(s"NOT OK $elemId for s=$s for toks:${toks.map(_.origText)}")
+                                    }
+                                }

-                val tokIdxs = m.tokens.map(_.index)
-                val direct = syn.isDirect && (tokIdxs == tokIdxs.sorted)
+                            // 2.2 Sparse.
+                            if (dslEnabled && mdl.sparseSynonymsDsl.nonEmpty && !sparseProc && !found)
+                                for (s ← get(mdl.sparseSynonymsDsl, elemId); comb ← dslCombs if !found)
+                                    s.trySparseMatch(comb.map(_.data), req) match {
+                                        case Some(res) ⇒ add("sparse DSL", elm, cacheSparse, toTokens(res, ns), tokIdxs, s, toParts(res, s))
+                                        case None ⇒ // No-op.
+                                    }
+                        }
+                    }
+
+                    processMatches(ns, mdlId, matches)
+                }

-                // TODO:
-                if (!alreadyMarked(m.tokens, elm.getId))
-                    mark(ns, elem = elm, toks = m.tokens, direct = direct, syn = Some(syn), metaOpt = None, parts = m.parts)
-            }
+                go()

-            val parsers = mdl.model.getParsers
-
-            for (parser ← parsers.asScala) {
-                parser.onInit()
-
-                startScopedSpan("customParser", span,
-                    "srvReqId" → ns.srvReqId,
-                    "mdlId" → mdl.model.getId,
-                    "txt" → ns.text) { _ ⇒
-                    def to(t: NlpToken): NCCustomWord =
-                        new NCCustomWord {
-                            override def getNormalizedText: String = t.normText
-                            override def getOriginalText: String = t.origText
-                            override def getStartCharIndex: Int = t.startCharIndex
-                            override def getEndCharIndex: Int = t.endCharIndex
-                            override def getPos: String = t.pos
-                            override def getPosDescription: String = t.posDesc
-                            override def getLemma: String = t.lemma
-                            override def getStem: String = t.stem
-                            override def isStopWord: Boolean = t.isStopWord
-                            override def isBracketed: Boolean = t.isBracketed
-                            override def isQuoted: Boolean = t.isQuoted
-                            override def isKnownWord: Boolean = t.isKnownWord
-                            override def isSwearWord: Boolean = t.isSwearWord
-                            override def isEnglish: Boolean = t.isEnglish
-                        }
+                if (state == SIMPLE) {
+                    state = DSL_FIRST

-                    val res = parser.parse(
-                        req,
-                        mdl.model,
-                        ns.map(to).asJava,
-                        ns.flatten.distinct.filter(!_.isNlp).map(n ⇒ {
-                            val noteId = n.noteType
-                            val words = ns.filter(t ⇒ n.tokenIndexes.contains(t.index)).map(to).asJava
-                            val md = n.asMetadata()
-
-                            new NCCustomElement() {
-                                override def getElementId: String = noteId
-                                override def getWords: util.List[NCCustomWord] = words
-                                override def getMetadata: JavaMeta = md.map(p ⇒ p._1 → p._2.asInstanceOf[AnyRef]).asJava
-                            }
-                        }).asJava
-                    )
+                    go()
+                }

-                    if (res != null)
-                        res.asScala.foreach(e ⇒ {
-                            val elemId = e.getElementId
-                            val words = e.getWords
+            }

-                            if (elemId == null)
-                                throw new NCE(s"Custom model parser cannot return 'null' element ID.")
-                            if (words == null || words.isEmpty)
-                                throw new NCE(s"Custom model parser cannot return empty custom tokens [elementId=$elemId]")
-
-                            val matchedToks = words.asScala.map(w ⇒
-                                ns.find(t ⇒
-                                    t.startCharIndex == w.getStartCharIndex && t.endCharIndex == w.getEndCharIndex
-                                ).getOrElse(throw new AssertionError(s"Custom model parser returned an invalid custom token: $w"))
-                            )
-
-                            if (!alreadyMarked(matchedToks, elemId))
-                                mark(
-                                    ns,
-                                    elem = mdl.elements.getOrElse(elemId, throw new NCE(s"Custom model parser returned unknown element ID: $elemId")),
-                                    toks = matchedToks,
-                                    direct = true,
-                                    syn = None,
-                                    metaOpt = Some(e.getMetadata.asScala),
-                                    parts = Seq.empty
-                                )
-                        })
-                }
-
-                parser.onDiscard()
-            }
+
+            processParsers(mdl, ns, span, req)
         }
     }

     def isComplex(mdl: NCProbeModel): Boolean =
-        mdl.nonSparseSynonymsDsl.nonEmpty ||
+        mdl.directSynonymsDsl.nonEmpty ||
             mdl.sparseSynonymsDsl.nonEmpty ||
             !mdl.model.getParsers.isEmpty
 }
\ No newline at end of file
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala
index fa9b3c7..00ee50f 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala
@@ -79,34 +79,34 @@ class NCEnricherNestedModelSpec2 extends NCEnricherNestedModelSpec1 {
     @Test
     def test2(): Unit =
         runBatch(
-            _ ⇒ checkExists(
-                "test tomorrow",
-                usr(text = "test tomorrow", id = "x1")
-            ),
-            _ ⇒ checkExists(
-                "tomorrow test",
-                usr(text = "tomorrow test", id = "x1")
-            ),
+//            _ ⇒ checkExists(
+//                "test tomorrow",
+//                usr(text = "test tomorrow", id = "x3")
+//            ),
+//            _ ⇒ checkExists(
+//                "tomorrow test",
+//                usr(text = "tomorrow test", id = "x3")
+//            ),
             _ ⇒ checkExists(
                 "test xxx tomorrow",
-                usr(text = "test tomorrow", id = "x1"),
+                usr(text = "test tomorrow", id = "x3"),
                 nlp(text = "xxx"),
             ),
-            _ ⇒ checkExists(
-                "y the y",
-                usr(text = "y y", id = "y3"),
-                nlp(text = "the", isStop = true)
-            ),
-            _ ⇒ checkExists(
-                "y xxx y",
-                usr(text = "y y", id = "y3"),
-                nlp(text = "xxx")
-            ),
-            _ ⇒ checkExists(
-                "aaa y xxx y",
-                nlp(text = "aaa"),
-                usr(text = "y y", id = "y3"),
-                nlp(text = "xxx")
-            )
+//            _ ⇒ checkExists(
+//                "y the y",
+//                usr(text = "y y", id = "y3"),
+//                nlp(text = "the", isStop = true)
+//            ),
+//            _ ⇒ checkExists(
+//                "y xxx y",
+//                usr(text = "y y", id = "y3"),
+//                nlp(text = "xxx")
+//            ),
+//            _ ⇒ checkExists(
+//                "aaa y xxx y",
+//                nlp(text = "aaa"),
+//                usr(text = "y y", id = "y3"),
+//                nlp(text = "xxx")
+//            )
         )
 }
\ No newline at end of file
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec2.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec2.scala
index ede9153..7b81473 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec2.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec2.scala
@@ -27,8 +27,11 @@ import java.util
  * Nested Elements test model.
  */
 class NCNestedTestModel21 extends NCModelAdapter("nlpcraft.nested2.test.mdl", "Nested Test Model", "1.0") {
-    override def getElements: util.Set[NCElement] =
+    override def getElements: util.Set[NCElement] = {
+        // Note: this defines one simple synonym and one DSL synonym,
+        // but it should be caught by the longer (DSL) variant (for `10 word`).
         Set(NCTestElement("e1", "{^^{tok_id() == 'nlpcraft:num'}^^|_} word"))
+    }

     @NCIntent("intent=onE1 term(t1)={tok_id() == 'e1'}")
     def onAB(ctx: NCIntentMatch): NCResult = NCResult.text("OK")

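[Editor's note - not part of the commit] The core of this WIP change is the new two-pass matching driven by NCNlpSentence.firstProbePhase, the State enumeration and NCProbeModel.hasDslSynonyms: plain synonyms for non-IDL elements are tried on the first (SIMPLE) pass, elements with IDL synonyms are revisited on a DSL_FIRST re-run, and subsequent probe phases start directly at DSL_NEXT. The following self-contained sketch models that gating as it reads from the diff; the object, the stand-in hasDslSynonyms predicate and the element IDs are illustrative assumptions, not NLPCraft API:

object PhaseGatingSketch {
    object State extends Enumeration {
        type State = Value
        val SIMPLE, DSL_FIRST, DSL_NEXT = Value
    }
    import State._

    // Stand-in for NCProbeModel.hasDslSynonyms(elemId): here, a naming convention.
    def hasDslSynonyms(elemId: String): Boolean = elemId.endsWith("_dsl")

    // Mirrors the 'simpleEnabled' match in the new NCModelEnricher code: plain
    // synonyms run for non-IDL elements on SIMPLE, for IDL elements on DSL_FIRST,
    // and are skipped on later probe phases (DSL_NEXT).
    def simpleEnabled(state: State, elemId: String): Boolean =
        state match {
            case SIMPLE ⇒ !hasDslSynonyms(elemId)
            case DSL_FIRST ⇒ hasDslSynonyms(elemId)
            case _ ⇒ false
        }

    // IDL (DSL) synonyms are tried on every pass except the initial SIMPLE one.
    def dslEnabled(state: State): Boolean = state != SIMPLE

    def main(args: Array[String]): Unit = {
        val firstProbePhase = true
        var state = if (firstProbePhase) SIMPLE else DSL_NEXT

        println(simpleEnabled(state, "e1"))     // true: plain element, first pass.
        println(simpleEnabled(state, "e2_dsl")) // false: deferred to DSL_FIRST.

        state = DSL_FIRST
        println(simpleEnabled(state, "e2_dsl")) // true: IDL element, second pass.
        println(dslEnabled(state))              // true.
    }
}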