This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-287
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-287 by this push:
new e35bbb4 WIP.
e35bbb4 is described below
commit e35bbb4043559792cac2619f29419a1403295ec7
Author: Sergey Kamov <[email protected]>
AuthorDate: Mon Apr 5 14:41:32 2021 +0300
WIP.
---
.../nlpcraft/probe/mgrs/NCProbeSynonym.scala | 75 +++---
.../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 279 +++++++++++----------
2 files changed, 185 insertions(+), 169 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
index 4dd1c61..4b2639e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
@@ -17,7 +17,6 @@
package org.apache.nlpcraft.probe.mgrs
-import org.apache.nlpcraft.common.U
import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken, NCNlpSentenceTokenBuffer}
import org.apache.nlpcraft.model._
import org.apache.nlpcraft.model.intent.NCIdlContext
@@ -86,44 +85,44 @@ class NCProbeSynonym(
/**
*
- * @param sen
+ * @param toks
* @return
*/
- private def trySparseMatch0[T](sen: Seq[T], isMatch: (T, NCProbeSynonymChunk) ⇒ Boolean, getIndex: T ⇒ Int): List[List[T]] = {
- require(sen != null)
- require(sen.nonEmpty)
+ private def trySparseMatch0[T](toks: Seq[T], isMatch: (T, NCProbeSynonymChunk) ⇒ Boolean, getIndex: T ⇒ Int): Option[Seq[T]] = {
+ require(toks != null)
+ require(toks.nonEmpty)
require(this.size > 1)
- lazy val buf = mutable.ArrayBuffer.empty[List[T]]
- lazy val flattenBuf = mutable.ArrayBuffer.empty[T]
-
- var ok = true
-
- for (chunk ← this if ok) {
- val res = sen.filter(tok ⇒ !flattenBuf.contains(tok) && isMatch(tok, chunk))
-
- if (res.nonEmpty) {
- buf += res.toList
- flattenBuf ++= res
+ lazy val buf = mutable.ArrayBuffer.empty[T]
+ var state = 0
+
+ for (chunk ← this if state != -1)
+ toks.find(t ⇒ {
+ if (state == 0) {
+ state = 1
+
+ isMatch(t, chunk) && !buf.contains(t)
+ }
+ else
+ !buf.contains(t) && isMatch(t, chunk)
+ }) match {
+ case Some(t) ⇒ buf += t
+ case None ⇒ state = -1
}
- else
- ok = false
- }
-
- if (ok) {
- var variants = U.permute(buf.toList)
- def isOrdered(list: List[T]): Boolean =
- list.tail.zipWithIndex.forall { case (t, idx) ⇒ getIndex(t) > getIndex(list(idx)) }
+ if (state != -1 &&
+ buf.contains(toks.head) &&
+ buf.contains(toks.last) &&
+ (perm || buf.tail.zipWithIndex.forall { case (t, idx) ⇒ getIndex(t) > getIndex(buf(idx)) }) &&
+ {
+ val remained = toks.filter(t ⇒ !buf.contains(t))
- if (!perm)
- variants = variants.filter(isOrdered)
-
- variants
-
- }
+ !this.exists(chunk ⇒ remained.exists(t ⇒ isMatch(t, chunk)))
+ }
+ )
+ Some(buf)
else
- List.empty
+ None
}
/**
@@ -170,12 +169,12 @@ class NCProbeSynonym(
/**
*
- * @param sen
+ * @param toks
* @return
*/
- def trySparseMatch(sen: NCNlpSentenceTokenBuffer): List[List[NCNlpSentenceToken]] =
+ def trySparseMatch(toks: NCNlpSentenceTokenBuffer): Option[Seq[NCNlpSentenceToken]] =
trySparseMatch0(
- sen,
+ toks,
isMatch,
(t: NCNlpSentenceToken) ⇒ t.startCharIndex
)
@@ -197,12 +196,12 @@ class NCProbeSynonym(
/**
*
- * @param sen
+ * @param tows
* @param req
*/
- def trySparseMatch(sen: Seq[NCDslContent], req: NCRequest): List[List[NCDslContent]] =
+ def trySparseMatch(tows: Seq[NCDslContent], req: NCRequest): Option[Seq[NCDslContent]] =
trySparseMatch0(
- sen,
+ tows,
(t: NCDslContent, chunk: NCProbeSynonymChunk) ⇒ isMatch(t, chunk, req),
(t: NCDslContent) ⇒ if (t.isLeft) t.left.get.getStartCharIndex else t.right.get.startCharIndex
)
@@ -318,4 +317,4 @@ object NCProbeSynonym {
syn
}
-}
\ No newline at end of file
+}
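For readers of the NCProbeSynonym change above: trySparseMatch0 no longer enumerates candidate permutations via U.permute (hence the dropped import) but greedily picks one token per synonym chunk and validates that single candidate, returning Option[Seq[T]] instead of List[List[T]]. Below is a minimal standalone sketch of the new contract; trySparse, chunks and perm are illustrative names (in the committed code the synonym object itself is the chunk sequence and perm is one of its fields), not the actual API.

    import scala.collection.mutable

    // Greedily pick the first unused token matching each chunk, then validate
    // the single resulting candidate instead of permuting all of them.
    def trySparse[T, C](toks: Seq[T], chunks: Seq[C], isMatch: (T, C) ⇒ Boolean, getIndex: T ⇒ Int, perm: Boolean): Option[Seq[T]] = {
        require(toks.nonEmpty && chunks.nonEmpty)

        val buf = mutable.ArrayBuffer.empty[T]
        var failed = false

        for (chunk ← chunks if !failed)
            toks.find(t ⇒ !buf.contains(t) && isMatch(t, chunk)) match {
                case Some(t) ⇒ buf += t
                case None ⇒ failed = true
            }

        // Matched tokens must follow chunk order unless permutations are allowed.
        lazy val ordered = buf.zip(buf.tail).forall { case (a, b) ⇒ getIndex(a) < getIndex(b) }
        // Leftover tokens must not match any chunk, otherwise the greedy pick is ambiguous.
        lazy val leftovers = toks.filterNot(buf.contains)

        if (!failed &&
            buf.contains(toks.head) && buf.contains(toks.last) &&
            (perm || ordered) &&
            !chunks.exists(c ⇒ leftovers.exists(t ⇒ isMatch(t, c)))
        )
            Some(buf.toList)
        else
            None
    }

Note the head/last containment checks: a candidate is accepted only if it covers both ends of the token window, which keeps sparse matches anchored to the window boundaries.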
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 9b98dc2..2dd6391 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -32,6 +32,7 @@ import java.io.Serializable
import java.util
import scala.collection.JavaConverters._
import scala.collection.convert.DecorateAsScala
+import scala.collection.mutable.ArrayBuffer
import scala.collection.{Map, Seq, mutable}
/**
@@ -131,9 +132,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
parts: Seq[TokenData]
) extends Ordered[ElementMatch] {
// Tokens sparsity.
- lazy val sparsity: Int = tokens.zipWithIndex.tail.map {
- case (tok, idx) ⇒ Math.abs(tok.index - tokens(idx - 1).index)
- }.sum - tokens.length + 1
+ lazy val sparsity = U.calcSparsity(tokens.map(_.index))
// Number of tokens.
lazy val length: Int = tokens.size
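The inline sparsity computation deleted in this hunk now lives in U.calcSparsity. A hypothetical standalone equivalent, reconstructed from the removed lines (the actual helper in org.apache.nlpcraft.common.U may differ in signature):

    // Sparsity = number of index positions skipped between consecutive
    // tokens; 0 means the token sequence is fully contiguous.
    def calcSparsity(idxs: Seq[Int]): Int = {
        require(idxs.nonEmpty)

        idxs.zipWithIndex.tail.map { case (idx, i) ⇒ math.abs(idx - idxs(i - 1)) }.sum - idxs.length + 1
    }

    // calcSparsity(Seq(3, 4, 5)) == 0   (contiguous)
    // calcSparsity(Seq(1, 3, 6)) == 3   (skips index 2, then indexes 4 and 5)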
@@ -160,6 +159,8 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
else
0
}
+
+ override def toString: String = s"Element=${element.getId}, indexes=${tokens.map(_.index).mkString(",")}, synonym=$synonym"
}
/**
@@ -265,12 +266,43 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
private def combos[T](toks: Seq[T]): Seq[Seq[T]] =
(for (n ← toks.size until 0 by -1) yield toks.sliding(n)).flatten.map(p ⇒ p)
+ // TODO:
+// /**
+// *
+// * @param toks
+// * @param elemId
+// */
+// private def alreadyMarked(toks: Seq[NlpToken], elemId: String): Boolean = toks.forall(_.isTypeOf(elemId))
+
/**
*
- * @param toks
- * @param elemId
+ * @param comb
+ * @param syn
*/
- private def alreadyMarked(toks: Seq[NlpToken], elemId: String): Boolean = toks.forall(_.isTypeOf(elemId))
+ private def getParts(comb: Seq[Complex], syn: NCProbeSynonym): Seq[TokenData] =
+ comb.zip(syn.map(_.kind)).flatMap {
+ case (complex, kind) ⇒ if (complex.isToken) Some(complex.token → kind)
+ else None
+ }
+
+ private def mkCache(): mutable.Map[String, ArrayBuffer[Seq[NlpToken]]] =
+ mutable.HashMap.empty[
+ String,
+ mutable.ArrayBuffer[Seq[NlpToken]]
+ ].withDefault(_ ⇒ mutable.ArrayBuffer.empty[Seq[NlpToken]])
+
+ private def convert(tows: Seq[NCDslContent], ns: NCNlpSentence): Seq[NlpToken] =
+ (
+ tows.filter(_.isRight).map(_.right.get) ++
+ tows.filter(_.isLeft).map(_.left.get).
+ flatMap(w ⇒
+ ns.filter(
+ t ⇒
+ t.startCharIndex >= w.getStartCharIndex &&
+ t.endCharIndex <= w.getEndCharIndex
+ )
+ )
+ ).sortBy(_.startCharIndex)
@throws[NCE]
override def enrich(mdl: NCProbeModel, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit = {
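The rewritten enrich loop below drives all four matching phases from combos(ns), i.e. every contiguous token window of the sentence, longest first. A small standalone illustration of the combos helper defined above (the String instantiation is just for the example):

    def combos[T](toks: Seq[T]): Seq[Seq[T]] =
        (for (n ← toks.size until 0 by -1) yield toks.sliding(n)).flatten.map(p ⇒ p)

    // combos(Seq("a", "b", "c")) ==
    //     Seq(Seq("a", "b", "c"), Seq("a", "b"), Seq("b", "c"), Seq("a"), Seq("b"), Seq("c"))

The convert helper, in turn, maps the Either-based NCDslContent results of a sparse DSL match back to plain sentence tokens by character span and sorts them by start index.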
@@ -281,29 +313,23 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
"mdlId" → mdl.model.getId,
"txt" → ns.text
) { span ⇒
- val cache = mutable.HashSet.empty[Seq[Int]]
val req = NCRequestImpl(senMeta, ns.srvReqId)
val tokIdxs = ns.map(t ⇒ t → t.wordIndexes).toMap
- val senHasUserTokens = ns.exists(_.isUser)
+ val firstPhase = !ns.exists(_.isUser)
val matches = mutable.ArrayBuffer.empty[ElementMatch]
+ val cacheSparse = mkCache()
+ val cacheNotSparse = mkCache()
+
+ def addMatch(elm: NCElement, toks: Seq[NlpToken], syn: NCProbeSynonym, parts: Seq[TokenData]): Unit = {
+ val toksSet = toks.toSet
- def addMatch(elm: NCElement, toks: Seq[NlpToken], syn: NCProbeSynonym, parts: Seq[TokenData]): Boolean = {
- val tokensSet = toks.toSet
+ // TODO:
+ //require(!matches.exists(m ⇒ m.element.getId == elm.getId && toksSet.subsetOf(m.tokensSet)))
- if (!matches.exists(m ⇒ m.element.getId == elm.getId && tokensSet.subsetOf(m.tokensSet))) {
+ if (!matches.exists(m ⇒ m.element.getId == elm.getId && toksSet.subsetOf(m.tokensSet)))
matches += ElementMatch(elm, toks, syn, parts)
- true
- }
- else
- false
}
- def getParts(comb: Seq[Complex], syn: NCProbeSynonym): Seq[TokenData] =
- comb.zip(syn.map(_.kind)).flatMap {
- case (complex, kind) ⇒ if (complex.isToken) Some(complex.token → kind)
- else None
- }
-
/**
* Gets synonyms sorted in descending order by their weight (already prepared),
* i.e. first synonym in the sequence is the most important one.
@@ -350,132 +376,122 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
}).filter(_.exists(_.isToken)).map(ComplexSeq(_)).distinct
).seq
-
startScopedSpan("synsProc", span,
"srvReqId" → ns.srvReqId,
"mdlId" → mdl.model.getId,
"txt" → ns.text
) {
_ ⇒
- // 1. Simple, sparse.
- if (!senHasUserTokens)
- for ((elemId, syns) ← mdl.sparseSynonyms; syn ← syns)
- syn.trySparseMatch(ns).foreach(toks ⇒ addMatch(mdl.elements(elemId), toks, syn, Seq.empty))
-
- // 2. DSL, sparse.
- for ((elemId, syns) ← mdl.sparseSynonymsDsl; syn ← syns) {
- for (complex ← complexes) {
- val comb = complex.tokensComplexes
-
- syn.trySparseMatch(comb.map(_.data), req).foreach(tows ⇒ {
- val toks =
- tows.filter(_.isRight).map(_.right.get) ++
- tows.filter(_.isLeft).map(_.left.get).flatMap(w ⇒
- ns.filter(
- t ⇒
- t.startCharIndex >= w.getStartCharIndex &&
- t.endCharIndex <= w.getEndCharIndex
- )
- )
+ for (toks ← combos(ns)) {
+ val idxsSeq = toks.flatMap(tokIdxs)
+ val idxsSorted = idxsSeq.sorted
+ val idxs = idxsSeq.toSet
+ val idxMin = idxsSorted.head
+ val idxMax = idxsSorted.last
- addMatch(mdl.elements(elemId), toks.sortBy(_.startCharIndex), syn, getParts(comb, syn))
- })
- }
- }
+ lazy val sorted = idxsSorted.zipWithIndex.toMap
- for (toks ← combos(ns)) {
- val key = toks.map(_.index).sorted
-
- if (!cache.contains(key)) {
- cache += key
-
- val idxsSeq = toks.flatMap(tokIdxs)
- val idxsSorted = idxsSeq.sorted
- val idxs = idxsSeq.toSet
- val idxMin = idxsSorted.head
- val idxMax = idxsSorted.last
-
- lazy val sorted = idxsSorted.zipWithIndex.toMap
-
- lazy val dslCombs: Map[Int, Seq[Seq[Complex]]] =
- complexes.par.
- flatMap(complexSeq ⇒ {
- val rec = complexSeq.tokensComplexes.filter(_.isSubsetOf(idxMin, idxMax, idxs))
-
- // Drops without tokens (IDL part works with tokens).
- if (rec.nonEmpty)
- Some(
- rec ++
- (complexSeq.wordsIndexes.intersect(idxs) -- rec.flatMap(_.wordIndexes)).
- map(complexesWords)
- )
- else
- None
- }).
- map(_.sortBy(p ⇒ sorted(p.wordIndexes.head))).seq.groupBy(_.length)
+ lazy val dslCombs: Map[Int, Seq[Seq[Complex]]] =
+ complexes.par.
+ flatMap(complexSeq ⇒ {
+ val rec = complexSeq.tokensComplexes.filter(_.isSubsetOf(idxMin, idxMax, idxs))
- lazy val tokStems = toks.map(_.stem).mkString(" ")
+ // Drops without tokens (IDL part works with tokens).
+ if (rec.nonEmpty)
+ Some(
+ rec ++
+ (complexSeq.wordsIndexes.intersect(idxs) -- rec.flatMap(_.wordIndexes)).
+ map(complexesWords)
+ )
+ else
+ None
+ }).
+ map(_.sortBy(p ⇒ sorted(p.wordIndexes.head))).seq.groupBy(_.length)
+
+ lazy val tokStems = toks.map(_.stem).mkString(" ")
+
+ // Attempt to match each element.
+ for (elm ← mdl.elements.values) {
+ val elemId = elm.getId
+ val sparseEnabled = !cacheSparse(elemId).exists(_.contains(toks))
+ val notSparseEnabled = !cacheNotSparse(elemId).exists(_.contains(toks))
+ var foundSparse = false
+ var foundNotSparse = false
+
+ def addSparse(res: Seq[NlpToken], syn: NCProbeSynonym, parts: Seq[TokenData]): Unit = {
+ addMatch(elm, res, syn, parts)
+ cacheSparse(elemId) += toks
+ foundSparse = true
+ }
- // Attempt to match each element.
- for (elm ← mdl.elements.values if !alreadyMarked(toks, elm.getId)) {
- var found = false
+ def addNotSparse(syn: NCProbeSynonym, parts: Seq[TokenData]): Unit = {
+ addMatch(elm, toks, syn, parts)
+ cacheNotSparse(elemId) += toks
+ foundNotSparse = true
+ }
- def setFound(
- elm: NCElement,
- toks: Seq[NlpToken],
- syn: NCProbeSynonym,
- parts: Seq[TokenData]
- ): Unit = {
- addMatch(elm, toks, syn, parts)
+ // 1. Simple, sparse.
+ if (firstPhase && sparseEnabled)
+ for (syn ← mdl.sparseSynonyms.getOrElse(elemId, Seq.empty) if !foundSparse)
+ syn.trySparseMatch(toks) match {
+ case Some(res) ⇒ addSparse(res, syn, Seq.empty)
+ case None ⇒ // No-op.
+ }
- found = true
+ // 2. Simple, not sparse.
+ // Optimization - plain synonyms can be used only on first iteration
+ if (firstPhase && notSparseEnabled)
+ fastAccess(mdl.nonSparseSynonyms, elemId, toks.length) match {
+ case Some(h) ⇒
+ def tryMap(synsMap: Map[String, NCProbeSynonym], notFound: () ⇒ Unit): Unit =
+ synsMap.get(tokStems) match {
+ case Some(syn) ⇒ addNotSparse(syn, Seq.empty)
+ // TODO:
+ //if (!found)
+ // notFound()
+ case None ⇒ notFound()
+ }
+
+ def tryScan(synsSeq: Seq[NCProbeSynonym]): Unit =
+ for (syn ← synsSeq if !foundNotSparse)
+ if (syn.isMatch(toks))
+ addNotSparse(syn, Seq.empty)
+
+ tryMap(
+ h.txtDirectSynonyms,
+ () ⇒ {
+ tryScan(h.notTxtDirectSynonyms)
+
+ if (!foundNotSparse)
+ tryMap(
+ h.txtNotDirectSynonyms,
+ () ⇒ tryScan(h.notTxtNotDirectSynonyms)
+ )
+ }
+ )
+ case None ⇒ // No-op.
}
- // 3. Simple, not sparse.
- // Optimization - plain synonyms can be used only on first iteration
- if (mdl.nonSparseSynonyms.nonEmpty && !senHasUserTokens)
- fastAccess(mdl.nonSparseSynonyms, elm.getId, toks.length) match {
- case Some(h) ⇒
- def tryMap(synsMap: Map[String, NCProbeSynonym], notFound: () ⇒ Unit): Unit =
- synsMap.get(tokStems) match {
- case Some(syn) ⇒
- setFound(elm, toks, syn, Seq.empty)
-
- if (!found)
- notFound()
- case None ⇒ notFound()
- }
-
- def tryScan(synsSeq: Seq[NCProbeSynonym]): Unit =
- for (syn ← synsSeq if !found)
- if (syn.isMatch(toks))
- setFound(elm, toks, syn, Seq.empty)
-
- tryMap(
- h.txtDirectSynonyms,
- () ⇒ {
- tryScan(h.notTxtDirectSynonyms)
-
- if (!found)
- tryMap(
- h.txtNotDirectSynonyms,
- () ⇒ tryScan(h.notTxtNotDirectSynonyms)
- )
- }
- )
+ // 3. DSL, sparse.
+ if (sparseEnabled)
+ for (syn ← mdl.sparseSynonymsDsl.getOrElse(elemId, Seq.empty); complex ← complexes if !foundSparse) {
+ val comb = complex.tokensComplexes
+
+ syn.trySparseMatch(comb.map(_.data), req) match {
+ case Some(towsRes) ⇒ addSparse(convert(towsRes, ns), syn, getParts(comb, syn))
case None ⇒ // No-op.
}
+ }
- if (mdl.nonSparseSynonymsDsl.nonEmpty)
- // 4. DSL, non sparse.
- for (
- (len, seq) ← dslCombs;
- syn ← fastAccess(mdl.nonSparseSynonymsDsl, elm.getId, len).getOrElse(Seq.empty);
- comb ← seq if !found;
- data = comb.map(_.data)
- )
- if (syn.isMatch(data, req))
- setFound(elm, toks, syn, getParts(comb, syn))
+ if (notSparseEnabled) {
+ // 4. DSL, non sparse.
+ for (
+ (len, seq) ← dslCombs;
+ syn ← fastAccess(mdl.nonSparseSynonymsDsl, elemId, len).getOrElse(Seq.empty);
+ comb ← seq if !foundNotSparse
+ )
+ if (syn.isMatch(comb.map(_.data), req))
+ addNotSparse(syn, getParts(comb, syn))
}
}
}
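One Scala detail worth flagging when reading the cacheSparse/cacheNotSparse gating above: mutable.Map.withDefault, used by mkCache, produces its default on lookup but never stores it, so an append through a missing key mutates a buffer the map does not retain. A standalone sketch of the difference, with getOrElseUpdate as the storing alternative (not the committed code):

    import scala.collection.mutable

    val viaDefault = mutable.HashMap.empty[String, mutable.ArrayBuffer[Int]].
        withDefault(_ ⇒ mutable.ArrayBuffer.empty[Int])

    viaDefault("k") += 1
    assert(viaDefault.get("k").isEmpty)               // The default buffer was never stored.

    val viaUpdate = mutable.HashMap.empty[String, mutable.ArrayBuffer[Int]]

    viaUpdate.getOrElseUpdate("k", mutable.ArrayBuffer.empty[Int]) += 1
    assert(viaUpdate("k") == mutable.ArrayBuffer(1))  // The buffer was inserted, then mutated.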
@@ -585,7 +601,8 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
).getOrElse(throw new AssertionError(s"Custom model parser returned an invalid custom token: $w"))
)
- if (!alreadyMarked(matchedToks, elemId))
+ // TODO:
+ //if (!alreadyMarked(matchedToks, elemId))
mark(
ns,
elem = mdl.elements.getOrElse(elemId, throw new NCE(s"Custom model parser returned unknown element ID: $elemId")),