This is an automated email from the ASF dual-hosted git repository. sergeykamov pushed a commit to branch NLPCRAFT-287 in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 0387d2e157b0b6d189d9e439bdbc8c30c484c9c1 Author: Sergey Kamov <[email protected]> AuthorDate: Thu Apr 15 12:33:07 2021 +0300 WIP. --- .../apache/nlpcraft/probe/mgrs/NCProbeModel.scala | 7 +- .../nlpcraft/probe/mgrs/NCProbeSynonym.scala | 19 +++- .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 126 +++++++++++++-------- 3 files changed, 100 insertions(+), 52 deletions(-) diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala index 2670fb7..03c5cb3 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala @@ -48,7 +48,10 @@ case class NCProbeModel( elements: Map[String /*Element ID*/ , NCElement], samples: Set[(String, Seq[Seq[String]])] ) { + lazy val hasIdlSynonyms: Boolean = idlSynonyms.nonEmpty + lazy val hasNoIdlSynonyms: Boolean = continuousSynonyms.nonEmpty || sparseSynonyms.nonEmpty + lazy val hasSparseSynonyms: Boolean = sparseSynonyms.nonEmpty || idlSynonyms.exists(_._2.exists(_.sparse)) + lazy val hasContinuousSynonyms: Boolean = continuousSynonyms.nonEmpty || idlSynonyms.exists(_._2.exists(!_.sparse)) + def hasIdlSynonyms(elemId: String): Boolean = idlSynonyms.contains(elemId) - def hasIdlSynonyms: Boolean = idlSynonyms.nonEmpty - def hasNoIdlSynonyms: Boolean = continuousSynonyms.nonEmpty || sparseSynonyms.nonEmpty } diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala index b944ccc..d5361f8 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala @@ -186,6 +186,23 @@ class NCProbeSynonym( /** * + * @param tows + * @param req + * @return + */ + def isMatch(tows: Seq[NCIdlContent], req: NCRequest): Boolean = { + require(tows != null) + + if (tows.length == length && tows.count(_.isLeft) >= idlChunks) + tows.zip(this).sortBy(p ⇒ getSort(p._2.kind)).forall { case (tow, chunk) ⇒ isMatch(tow, chunk, req) } + else + false + } + + + + /** + * * @param toks */ def sparseMatch(toks: NCNlpSentenceTokenBuffer): Option[Seq[NCNlpSentenceToken]] = { @@ -200,7 +217,7 @@ class NCProbeSynonym( * @param tows * @param req */ - def idlMatch(tows: Seq[NCIdlContent], req: NCRequest): Option[Seq[NCIdlContent]] = { + def sparseMatch(tows: Seq[NCIdlContent], req: NCRequest): Option[Seq[NCIdlContent]] = { require(tows != null) require(req != null) require(hasIdl) diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala index 353687b..120d8d0 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala @@ -402,7 +402,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { * @param h * @param toks */ - private def mkComplexCombinations(h: ComplexHolder, toks: Seq[NlpToken], cache: Set[Seq[Complex]]): Seq[Seq[Complex]] = { + private def mkCombinations(h: ComplexHolder, toks: Seq[NlpToken], cache: Set[Seq[Complex]]): Seq[Seq[Complex]] = { val idxs = toks.flatMap(_.wordIndexes).toSet h.complexes.par. @@ -423,9 +423,9 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { } private def add( + dbgType: String, ns: NCNlpSentence, - contCache: mutable.Map[String, ArrayBuffer[Seq[Int]]], - typ: String, + contCache: Cache, elm: NCElement, res: Seq[NlpToken], allToksIdxs: Seq[Int], @@ -449,7 +449,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { logger.trace( s"${if (ok) "Added" else "Skipped"} element [" + s"id=${elm.getId}, " + - s"type=$typ, " + + s"type=$dbgType, " + s"text='${res.map(_.origText).mkString(" ")}', " + s"indexes=${resIdxs.mkString("[", ",", "]")}, " + s"allTokensIndexes=${allToksIdxs.mkString("[", ",", "]")}, " + @@ -462,85 +462,110 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { override def enrich(mdl: NCProbeModel, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit = { require(isStarted) - startScopedSpan("enrich", parent, "srvReqId" → ns.srvReqId, "mdlId" → mdl.model.getId, "txt" → ns.text) { span ⇒ + startScopedSpan( + "enrich", parent, "srvReqId" → ns.srvReqId, "mdlId" → mdl.model.getId, "txt" → ns.text + ) { span ⇒ val req = NCRequestImpl(senMeta, ns.srvReqId) val combToks = combos(ns) lazy val ch = mkComplexes(mdl, ns) def execute(simpleEnabled: Boolean, idlEnabled: Boolean): Unit = - startScopedSpan("execute", parent, "srvReqId" → ns.srvReqId, "mdlId" → mdl.model.getId, "txt" → ns.text) { _ ⇒ + startScopedSpan( + "execute", span, "srvReqId" → ns.srvReqId, "mdlId" → mdl.model.getId, "txt" → ns.text + ) { _ ⇒ if (DEEP_DEBUG) logger.trace(s"Execution started [simpleEnabled=$simpleEnabled, idlEnabled=$idlEnabled]") - val contCache = mutable.HashMap.empty ++ mdl.elements.keys.map(k ⇒ k → mutable.ArrayBuffer.empty[Seq[Int]]) + val contCache = mutable.HashMap.empty ++ + mdl.elements.keys.map(k ⇒ k → mutable.ArrayBuffer.empty[Seq[Int]]) lazy val idlCache = mutable.HashSet.empty[Seq[Complex]] for ( toks ← combToks; - tokIdxs = toks.map(_.index); - elm ← mdl.elements.values; - elemId = elm.getId + idxs = toks.map(_.index); + e ← mdl.elements.values; + eId = e.getId if - !contCache(elemId).exists(_.containsSlice(tokIdxs)) && - !alreadyMarked(ns, elemId, toks, tokIdxs) + !contCache(eId).exists(_.containsSlice(idxs)) && + !alreadyMarked(ns, eId, toks, idxs) ) { // 1. SIMPLE. - if (simpleEnabled && (if (idlEnabled) mdl.hasIdlSynonyms(elemId) else !mdl.hasIdlSynonyms(elemId))) { + if (simpleEnabled && (if (idlEnabled) mdl.hasIdlSynonyms(eId) else !mdl.hasIdlSynonyms(eId))) { lazy val tokStems = toks.map(_.stem).mkString(" ") // 1.1 Continuous. var found = false - fastAccess(mdl.continuousSynonyms, elemId, toks.length) match { - case Some(h) ⇒ - def tryMap(syns: Map[String, Synonym], notFound: () ⇒ Unit): Unit = - syns.get(tokStems) match { - case Some(s) ⇒ - found = true - add(ns, contCache,"simple continuous", elm, toks, tokIdxs, s) - case None ⇒ notFound() - } - - def tryScan(syns: Seq[Synonym]): Unit = - for (s ← syns if !found) - if (s.isMatch(toks)) { - found = true - add(ns, contCache, "simple continuous scan", elm, toks, tokIdxs, s) + if (mdl.hasContinuousSynonyms) + fastAccess(mdl.continuousSynonyms, eId, toks.length) match { + case Some(h) ⇒ + def tryMap(syns: Map[String, Synonym], notFound: () ⇒ Unit): Unit = + syns.get(tokStems) match { + case Some(s) ⇒ + found = true + add("simple continuous", ns, contCache, e, toks, idxs, s) + case None ⇒ notFound() } - tryMap( - h.txtDirectSynonyms, - () ⇒ { - tryScan(h.notTxtDirectSynonyms) + def tryScan(syns: Seq[Synonym]): Unit = + for (s ← syns if !found) + if (s.isMatch(toks)) { + found = true + add("simple continuous scan", ns, contCache, e, toks, idxs, s) + } - if (!found) - tryMap(h.txtNotDirectSynonyms, () ⇒ tryScan(h.notTxtNotDirectSynonyms)) - } - ) - case None ⇒ // No-op. - } + tryMap( + h.txtDirectSynonyms, + () ⇒ { + tryScan(h.notTxtDirectSynonyms) + + if (!found) + tryMap(h.txtNotDirectSynonyms, () ⇒ tryScan(h.notTxtNotDirectSynonyms)) + } + ) + case None ⇒ // No-op. + } // 1.2 Sparse. - if (!found) - for (s ← get(mdl.sparseSynonyms, elemId)) + if (!found && mdl.hasSparseSynonyms) + for (s ← get(mdl.sparseSynonyms, eId)) s.sparseMatch(toks) match { - case Some(res) ⇒ add(ns, contCache, "simple sparse", elm, res, tokIdxs, s) + case Some(res) ⇒ add("simple sparse", ns, contCache, e, res, idxs, s) case None ⇒ // No-op. } } // 2. IDL. - if (idlEnabled) - for (s ← get(mdl.idlSynonyms, elemId); comb ← mkComplexCombinations(ch, toks, idlCache.toSet)) - s.idlMatch(comb.map(_.data), req) match { - case Some(res) ⇒ - val typ = if (s.sparse) "IDL sparse" else "IDL continuous" + if (idlEnabled) { + if (mdl.hasSparseSynonyms) + for (s ← get(mdl.idlSynonyms, eId); comb ← mkCombinations(ch, toks, idlCache.toSet)) + s.sparseMatch(comb.map(_.data), req) match { + case Some(res) ⇒ + val typ = if (s.sparse) "IDL sparse" else "IDL continuous" + + add(typ, ns, contCache, e, toTokens(res, ns), idxs, s, toParts(res, s)) - add(ns, contCache, typ, elm, toTokens(res, ns), tokIdxs, s, toParts(res, s)) + idlCache += comb + case None ⇒ // No-op. + } + else { + var found = false + + for ( + s ← get(mdl.idlSynonyms, eId); + comb ← mkCombinations(ch, toks, idlCache.toSet); + data = comb.map(_.data) + if !found + ) + if (s.isMatch(data, req)) { + add("IDL continuous", ns, contCache, e, toks, idxs, s, toParts(data, s)) idlCache += comb - case None ⇒ // No-op. + + found = true } + } + } } } @@ -559,6 +584,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { } /** + * TODO: simplify, add tests, check model properties (sparse etc) for optimization. * * @param elemId * @param toks @@ -577,7 +603,9 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { || ( n.tokenIndexes == toksIdxsSorted || - n.tokenIndexes.containsSlice(toksIdxsSorted) && U.isContinuous(toksIdxsSorted) && U.isContinuous(n.tokenIndexes) + n.tokenIndexes.containsSlice(toksIdxsSorted) && + U.isContinuous(toksIdxsSorted) && + U.isContinuous(n.tokenIndexes) ) ) ))
