This is an automated email from the ASF dual-hosted git repository. sergeykamov pushed a commit to branch NLPCRAFT-287 in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 66593155e7f4fbf96aa694afa6a0675096ce3166 Author: Sergey Kamov <[email protected]> AuthorDate: Tue Apr 6 12:14:45 2021 +0300 WIP. --- .../nlpcraft/probe/mgrs/NCProbeSynonym.scala | 3 +- .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 61 +++++++++++----------- .../probe/mgrs/sentence/NCSentenceManager.scala | 13 +---- 3 files changed, 34 insertions(+), 43 deletions(-) diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala index 95c526f..c54b347 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala @@ -114,11 +114,12 @@ class NCProbeSynonym( if (!perm && res.nonEmpty && getIndex(head) <= getIndex(res.last)) state = -1 else { - res += head all ++= seq if (all.size > res.size) state = -1 + else + res += head } } else diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala index f9acd95..30f5084 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala @@ -19,14 +19,14 @@ package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model import io.opencensus.trace.Span import org.apache.nlpcraft.common._ -import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken ⇒ NlpToken, _} +import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken => NlpToken, _} import org.apache.nlpcraft.model._ import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.NCDslContent import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.{NCSynonymChunkKind, TEXT} import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher import org.apache.nlpcraft.probe.mgrs.nlp.impl.NCRequestImpl import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager -import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeSynonym, NCProbeVariants} +import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeSynonym ⇒ Synonym, NCProbeVariants} import java.io.Serializable import java.util @@ -39,8 +39,9 @@ import scala.collection.{Map, Seq, mutable} * Model elements enricher. */ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { - type TokenData = (NCToken, NCSynonymChunkKind) - + type TokType = (NCToken, NCSynonymChunkKind) + type Cache = mutable.Map[String, ArrayBuffer[Seq[Int]]] + object Complex { def apply(t: NCToken): Complex = Complex( @@ -128,8 +129,8 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { case class ElementMatch( element: NCElement, tokens: Seq[NlpToken], - synonym: NCProbeSynonym, - parts: Seq[TokenData] + synonym: Synonym, + parts: Seq[TokType] ) extends Ordered[ElementMatch] { // Tokens sparsity. lazy val sparsity = U.calcSparsity(tokens.map(_.index)) @@ -197,9 +198,9 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { elem: NCElement, toks: Seq[NlpToken], direct: Boolean, - syn: Option[NCProbeSynonym], + syn: Option[Synonym], metaOpt: Option[Map[String, Object]], - parts: Seq[TokenData] + parts: Seq[TokType] ): Unit = { val params = mutable.ArrayBuffer.empty[(String, AnyRef)] @@ -279,7 +280,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { * @param comb * @param syn */ - private def getPartsComplex(comb: Seq[Complex], syn: NCProbeSynonym): Seq[TokenData] = + private def getPartsComplex(comb: Seq[Complex], syn: Synonym): Seq[TokType] = comb.zip(syn.map(_.kind)).flatMap { case (complex, kind) ⇒ if (complex.isToken) Some(complex.token → kind) else None @@ -290,19 +291,18 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { * @param comb * @param syn */ - private def getPartsContent(comb: Seq[NCDslContent], syn: NCProbeSynonym): Seq[TokenData] = + private def toParts(comb: Seq[NCDslContent], syn: Synonym): Seq[TokType] = comb.zip(syn.map(_.kind)).flatMap { - case (complex, kind) ⇒ if (complex.isLeft) Some(complex.left.get → kind) - else None + case (complex, kind) ⇒ if (complex.isLeft) Some(complex.left.get → kind) else None } - private def mkCache(): mutable.Map[String, ArrayBuffer[Seq[Int]]] = + private def mkCache(): Cache = mutable.HashMap.empty[ String, mutable.ArrayBuffer[Seq[Int]] ].withDefault(_ ⇒ mutable.ArrayBuffer.empty[Seq[Int]]) - private def convert(tows: Seq[NCDslContent], ns: NCNlpSentence): Seq[NlpToken] = + private def toNlpTokens(tows: Seq[NCDslContent], ns: NCNlpSentence): Seq[NlpToken] = ( tows.filter(_.isRight).map(_.right.get) ++ tows.filter(_.isLeft).map(_.left.get). @@ -325,7 +325,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { val cacheSparse = mkCache() val cacheNotSparse = mkCache() - def addMatch(elm: NCElement, toks: Seq[NlpToken], syn: NCProbeSynonym, parts: Seq[TokenData]): Unit = { + def addMatch(elm: NCElement, toks: Seq[NlpToken], syn: Synonym, parts: Seq[TokType]): Unit = { val toksSet = toks.toSet // TODO: @@ -419,35 +419,36 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { lazy val tokStems = toks.map(_.stem).mkString(" ") // Attempt to match each element. - for (elm ← mdl.elements.values) { - val elemId = elm.getId - val sparseEnabled = !cacheSparse(elemId).exists(_.containsSlice(indexes)) - val notSparseEnabled = !cacheNotSparse(elemId).exists(_.containsSlice(indexes)) + for ( + elm ← mdl.elements.values; + elemId = elm.getId; + sparseEnabled = !cacheSparse(elemId).exists(_.containsSlice(indexes)); + notSparseEnabled = !cacheNotSparse(elemId).exists(_.containsSlice(indexes)) + + if !alreadyMarked(toks, elm.getId) && (sparseEnabled || notSparseEnabled) + ) { var found = false - def addSparse(res: Seq[NlpToken], syn: NCProbeSynonym, parts: Seq[TokenData]): Unit = { - addMatch(elm, res, syn, parts) - cacheSparse(elemId) += indexes + def add(cache: Cache, res: Seq[NlpToken], s: Synonym, parts: Seq[TokType]): Unit = { + addMatch(elm, res, s, parts) + cache(elemId) += indexes found = true } - def addNotSparse(syn: NCProbeSynonym, parts: Seq[TokenData]): Unit = { - addMatch(elm, toks, syn, parts) - cacheNotSparse(elemId) += indexes - found = true - } + def addSparse(res: Seq[NlpToken], s: Synonym, parts: Seq[TokType]): Unit = add(cacheSparse, res, s, parts) + def addNotSparse(s: Synonym, parts: Seq[TokType]): Unit = add(cacheNotSparse, toks, s, parts) // 1. Simple, not sparse. if (firstPhase && notSparseEnabled && !found) fastAccess(mdl.nonSparseSynonyms, elemId, toks.length) match { case Some(h) ⇒ - def tryMap(synsMap: Map[String, NCProbeSynonym], notFound: () ⇒ Unit): Unit = + def tryMap(synsMap: Map[String, Synonym], notFound: () ⇒ Unit): Unit = synsMap.get(tokStems) match { case Some(syn) ⇒ addNotSparse(syn, Seq.empty) case None ⇒ notFound() } - def tryScan(synsSeq: Seq[NCProbeSynonym]): Unit = + def tryScan(synsSeq: Seq[Synonym]): Unit = for (syn ← synsSeq if !found) if (syn.isMatch(toks)) addNotSparse(syn, Seq.empty) @@ -495,7 +496,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { comb ← seq if !found ) { syn.trySparseMatch(comb.map(_.data), req) match { - case Some(towsRes) ⇒ addSparse(convert(towsRes, ns), syn, getPartsContent(towsRes, syn)) + case Some(towsRes) ⇒ addSparse(toNlpTokens(towsRes, ns), syn, toParts(towsRes, syn)) case None ⇒ // No-op. } } diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala index a938f59..fb676d0 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala @@ -37,8 +37,6 @@ import scala.language.implicitConversions object NCSentenceManager extends NCService { @volatile private var pool: java.util.concurrent.ForkJoinPool = _ - private val cache = U.mkLRUMap[Seq[Set[NCNlpSentenceNote]], util.List[util.List[NCNlpSentenceNote]]]("sentence-combinations-cache", 500) - case class PartKey(id: String, start: Int, end: Int) { require(start <= end) @@ -686,17 +684,8 @@ object NCSentenceManager extends NCService { map { case (_, seq) ⇒ seq.map { case (_, note) ⇒ note }.toSet }. toSeq.sortBy(-_.size) - - var combs: JList[JList[NCNlpSentenceNote]] = cache.get(toksByIdx) - - if (combs == null) { - combs = NCSentenceHelper.findCombinations(toksByIdx.map(_.asJava).asJava, pool) - - cache.put(toksByIdx, combs) - } - val seqSens = - combs.asScala.map(_.asScala). + NCSentenceHelper.findCombinations(toksByIdx.map(_.asJava).asJava, pool).asScala.map(_.asScala). par. flatMap(delComb ⇒ { val nsClone = sen.clone()
