This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-287
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-287 by this push:
new 7ee5235 WIP.
7ee5235 is described below
commit 7ee5235ec8c15d38980aad2b0c7dac88cdd6bc7e
Author: Sergey Kamov <[email protected]>
AuthorDate: Sat Apr 3 22:13:02 2021 +0300
WIP.
---
.../org/apache/nlpcraft/common/util/NCUtils.scala | 12 ++
.../nlpcraft/probe/mgrs/NCProbeSynonym.scala | 166 ++++++++-------------
.../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 21 +--
3 files changed, 83 insertions(+), 116 deletions(-)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/util/NCUtils.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/util/NCUtils.scala
index fc8bcf8..141e813 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/util/NCUtils.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/util/NCUtils.scala
@@ -2110,4 +2110,16 @@ object NCUtils extends LazyLogging {
* @return
*/
def getYamlMapper: ObjectMapper = YAML
+
+ /**
+ * Cartesian product of the given lists: every result picks exactly one element from each inner list, in list order.
+ * @param list Lists of alternatives, one inner list per position of a result.
+ * @tparam T Element type.
+ * @return All combinations; `List(Nil)` for an empty `list`, an empty list if any inner list is empty.
+ */
+ def permute[T](list: List[List[T]]): List[List[T]] =
+ list match {
+ case Nil ⇒ List(Nil)
+ case head :: tail ⇒ for (h ← head; t ← permute(tail)) yield h :: t
+ }
}
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
index d09418a..4dd1c61 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
@@ -17,10 +17,11 @@
package org.apache.nlpcraft.probe.mgrs
+import org.apache.nlpcraft.common.U
import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken,
NCNlpSentenceTokenBuffer}
import org.apache.nlpcraft.model._
-import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.NCDslContent
import org.apache.nlpcraft.model.intent.NCIdlContext
+import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.NCDslContent
import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind._
import scala.collection.mutable
@@ -85,90 +86,44 @@ class NCProbeSynonym(
/**
*
- * @param toks
+ * @param sen Non-empty items to scan; each chunk of this synonym collects all still-unclaimed items it matches.
* @return
*/
- def isMatch(toks: NCNlpSentenceTokenBuffer): Boolean = {
- require(toks != null)
+ private def trySparseMatch0[T](sen: Seq[T], isMatch: (T,
NCProbeSynonymChunk) ⇒ Boolean, getIndex: T ⇒ Int): List[List[T]] = {
+ require(sen != null)
+ require(sen.nonEmpty)
+ require(this.size > 1)
- if (toks.length == length) {
- if (isTextOnly)
- toks.stemsHash == stemsHash && toks.stems == stems
- else
- toks.zip(this).sortBy(p ⇒ getSort(p._2.kind)).forall { case
(tok, chunk) ⇒ isMatch(tok, chunk) }
- }
- else
- false
- }
+ lazy val buf = mutable.ArrayBuffer.empty[List[T]] // Candidates per chunk, in chunk order.
+ lazy val flattenBuf = mutable.ArrayBuffer.empty[T] // Every item already taken by an earlier chunk.
- /**
- *
- * @param ok
- * @param buf
- * @tparam T
- */
- private def convertResult[T](ok: Boolean, buf: mutable.ArrayBuffer[T]):
Option[Seq[T]] =
- if (ok) {
- require(buf.nonEmpty)
-
- Some(buf)
- }
- else
- None
+ var ok = true // Turns false as soon as some chunk has no candidate.
- private def collectMatches[T](seq: Seq[T], tryMatch: Seq[T] ⇒
Option[Seq[T]]): Seq[Seq[T]] = {
- val buf = mutable.ArrayBuffer.empty[Seq[T]]
+ for (chunk ← this if ok) {
+ val res = sen.filter(tok ⇒ !flattenBuf.contains(tok) &&
isMatch(tok, chunk)) // All unclaimed items matching this chunk.
- var ok = true
- var arg = seq
-
- while (ok) {
- tryMatch(arg) match {
- case Some(ts) ⇒
- buf += ts
- arg = arg.filter(t ⇒ !ts.contains(t))
- case None ⇒ ok = false
+ if (res.nonEmpty) {
+ buf += res.toList
+ flattenBuf ++= res
}
+ else
+ ok = false
}
- buf
- }
-
- /**
- *
- * @param sen
- * @return
- */
- def trySparseMatch(sen: NCNlpSentenceTokenBuffer):
Seq[Seq[NCNlpSentenceToken]] = {
- require(sen != null)
- require(sen.nonEmpty)
- require(this.size > 1)
+ if (ok) {
+ var variants = U.permute(buf.toList) // Every combination taking one candidate per chunk.
- def trySparseMatch0(sen: Seq[NCNlpSentenceToken]):
Option[Seq[NCNlpSentenceToken]] = {
- var ok = true
- val buf = mutable.ArrayBuffer.empty[NCNlpSentenceToken]
+ def isOrdered(list: List[T]): Boolean =
+ list.tail.zipWithIndex.forall { case (t, idx) ⇒ getIndex(t) >
getIndex(list(idx)) } // Each element's getIndex must exceed its predecessor's.
- if (!perm) {
- var lastIdx = 0
- val tokIdxs = sen.zipWithIndex.toMap
+ if (!perm) // Unless perm is set, keep only variants in strictly increasing getIndex order.
+ variants = variants.filter(isOrdered)
- for (chunk ← this if ok)
- sen.drop(lastIdx).find(tok ⇒ isMatch(tok, chunk)) match {
- case Some(tok) ⇒ buf += tok; lastIdx = tokIdxs(tok) + 1
- case None ⇒ ok = false
- }
- }
- else
- for (chunk ← this if ok)
- sen.find(tok ⇒ !buf.contains(tok) && isMatch(tok, chunk))
match {
- case Some(tok) ⇒ buf += tok
- case None ⇒ ok = false
- }
+ variants
- convertResult(ok, buf)
}
-
- collectMatches(sen, trySparseMatch0)
+ else
+ List.empty
}
/**
@@ -197,6 +152,36 @@ class NCProbeSynonym(
/**
*
+ * @param toks Token buffer to compare with this synonym; must not be null.
+ * @return `true` only if `toks` has this synonym's length and its stems (text-only case) or all token/chunk pairs match.
+ */
+ def isMatch(toks: NCNlpSentenceTokenBuffer): Boolean = {
+ require(toks != null)
+
+ if (toks.length == length) {
+ if (isTextOnly)
+ toks.stemsHash == stemsHash && toks.stems == stems // Hash is compared first, then the stems themselves.
+ else
+ toks.zip(this).sortBy(p ⇒ getSort(p._2.kind)).forall { case
(tok, chunk) ⇒ isMatch(tok, chunk) } // Pairs are checked in getSort(kind) order.
+ }
+ else
+ false
+ }
+
+ /**
+ * Sparse match of this synonym against sentence tokens; delegates to `trySparseMatch0`.
+ * @param sen Sentence tokens to search.
+ * @return Matched token combinations exactly as produced by `trySparseMatch0`; empty list if there is no match.
+ */
+ def trySparseMatch(sen: NCNlpSentenceTokenBuffer):
List[List[NCNlpSentenceToken]] =
+ trySparseMatch0(
+ sen,
+ isMatch,
+ (t: NCNlpSentenceToken) ⇒ t.startCharIndex // Position key handed to the ordering check.
+ )
+
+ /**
+ *
* @param tows
* @param req
* @return
@@ -215,37 +200,12 @@ class NCProbeSynonym(
* @param sen
* @param req
*/
- def trySparseMatch(sen: Seq[NCDslContent], req: NCRequest):
Seq[Seq[NCDslContent]] = {
- require(sen != null)
- require(sen.nonEmpty)
- require(this.size > 1)
-
- def trySparseMatch0(sen: Seq[NCDslContent]): Option[Seq[NCDslContent]]
= {
- var ok = true
- val buf = mutable.ArrayBuffer.empty[NCDslContent]
-
- if (!perm) {
- var lastIdx = 0
- val tokIdxs = sen.zipWithIndex.toMap
-
- for (chunk ← this if ok)
- sen.drop(lastIdx).find(tow ⇒ isMatch(tow, chunk, req))
match {
- case Some(t) ⇒ buf += t; lastIdx = tokIdxs(t) + 1
- case None ⇒ ok = false
- }
- }
- else
- for (chunk ← this if ok)
- sen.find(tow ⇒ !buf.contains(tow) && isMatch(tow, chunk,
req)) match {
- case Some(tow) ⇒ buf += tow
- case None ⇒ ok = false
- }
-
- convertResult(ok, buf)
- }
-
- collectMatches(sen, trySparseMatch0)
- }
+ def trySparseMatch(sen: Seq[NCDslContent], req: NCRequest):
List[List[NCDslContent]] =
+ trySparseMatch0(
+ sen,
+ (t: NCDslContent, chunk: NCProbeSynonymChunk) ⇒ isMatch(t, chunk,
req),
+ (t: NCDslContent) ⇒ if (t.isLeft) t.left.get.getStartCharIndex
else t.right.get.startCharIndex
+ )
override def toString(): String = mkString(" ")
@@ -358,4 +318,4 @@ object NCProbeSynonym {
syn
}
-}
+}
\ No newline at end of file
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 22a6a5b..9b98dc2 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -279,9 +279,12 @@ object NCModelEnricher extends NCProbeEnricher with
DecorateAsScala {
startScopedSpan("enrich", parent,
"srvReqId" → ns.srvReqId,
"mdlId" → mdl.model.getId,
- "txt" → ns.text) { span ⇒
+ "txt" → ns.text
+ ) { span ⇒
val cache = mutable.HashSet.empty[Seq[Int]]
val req = NCRequestImpl(senMeta, ns.srvReqId)
+ val tokIdxs = ns.map(t ⇒ t → t.wordIndexes).toMap
+ val senHasUserTokens = ns.exists(_.isUser)
val matches = mutable.ArrayBuffer.empty[ElementMatch]
def addMatch(elm: NCElement, toks: Seq[NlpToken], syn:
NCProbeSynonym, parts: Seq[TokenData]): Boolean = {
@@ -347,7 +350,6 @@ object NCModelEnricher extends NCProbeEnricher with
DecorateAsScala {
}).filter(_.exists(_.isToken)).map(ComplexSeq(_)).distinct
).seq
- val tokIdxs = ns.map(t ⇒ t → t.wordIndexes).toMap
startScopedSpan("synsProc", span,
"srvReqId" → ns.srvReqId,
@@ -356,7 +358,7 @@ object NCModelEnricher extends NCProbeEnricher with
DecorateAsScala {
) {
_ ⇒
// 1. Simple, sparse.
- if (!ns.exists(_.isUser))
+ if (!senHasUserTokens)
for ((elemId, syns) ← mdl.sparseSynonyms; syn ← syns)
syn.trySparseMatch(ns).foreach(toks ⇒
addMatch(mdl.elements(elemId), toks, syn, Seq.empty))
@@ -431,7 +433,7 @@ object NCModelEnricher extends NCProbeEnricher with
DecorateAsScala {
// 3. Simple, not sparse.
// Optimization - plain synonyms can be used only
on first iteration
- if (mdl.nonSparseSynonyms.nonEmpty &&
!ns.exists(_.isUser))
+ if (mdl.nonSparseSynonyms.nonEmpty &&
!senHasUserTokens)
fastAccess(mdl.nonSparseSynonyms, elm.getId,
toks.length) match {
case Some(h) ⇒
def tryMap(synsMap: Map[String,
NCProbeSynonym], notFound: () ⇒ Unit): Unit =
@@ -464,7 +466,7 @@ object NCModelEnricher extends NCProbeEnricher with
DecorateAsScala {
case None ⇒ // No-op.
}
- if (mdl.nonSparseSynonymsDsl.nonEmpty) {
+ if (mdl.nonSparseSynonymsDsl.nonEmpty)
// 4. DSL, non sparse.
for (
(len, seq) ← dslCombs;
@@ -474,7 +476,6 @@ object NCModelEnricher extends NCProbeEnricher with
DecorateAsScala {
)
if (syn.isMatch(data, req))
setFound(elm, toks, syn,
getParts(comb, syn))
- }
}
}
}
@@ -492,14 +493,8 @@ object NCModelEnricher extends NCProbeEnricher with
DecorateAsScala {
flatMap(m ⇒ m.tokens.map(_ → m)).
groupBy { case (t, m) ⇒ (m.element.getId, m.length, t) }.
flatMap { case (_, seq) ⇒
- def perm[T](list: List[List[T]]): List[List[T]] =
- list match {
- case Nil ⇒ List(Nil)
- case head :: tail ⇒ for (h ← head; t ←
perm(tail)) yield h :: t
- }
-
// Optimization by sparsity sum for each tokens set
for one element found with same tokens count.
- perm(
+ U.permute(
seq.groupBy { case (tok, _) ⇒ tok }.
map { case (_, seq) ⇒ seq.map { case (_, m) ⇒
m }.toList }.toList
).minBy(_.map(_.sparsity).sum)