This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-443 by this push:
new 1cc3c46 WIP.
1cc3c46 is described below
commit 1cc3c467675bf6e01404ae441a50ad3a88995126
Author: Sergey Kamov <[email protected]>
AuthorDate: Sat Sep 25 18:54:10 2021 +0300
WIP.
---
.../nlpcraft/probe/mgrs/NCProbeSynonym.scala | 2 +-
.../nlpcraft/probe/mgrs/NCProbeVariants.scala | 2 +-
.../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 49 +++----
.../probe/mgrs/synonyms/NCSynonymsManager.scala | 157 ++++++++++++++-------
4 files changed, 128 insertions(+), 82 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
index d865c6a..aa13574 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
@@ -138,7 +138,7 @@ class NCProbeSynonym(
}
object NCProbeSynonym {
- type NCIdlContent = Either[NCToken, NCNlpSentenceToken]
+ type NCIdlToken = Either[NCToken, NCNlpSentenceToken]
/**
*
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
index 8aced5f..2b91128 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
@@ -268,7 +268,7 @@ object NCProbeVariants {
for ((tok, tokNlp) <- toks.zip(nlpSen) if tokNlp.isUser)
process(tok, tokNlp)
- ok = ok && NCSynonymsManager.isStillValid(srvReqId, toks.toSeq)
+ ok = ok && NCSynonymsManager.isStillValidIdl(srvReqId,
toks.toSeq)
if (ok) Some(new NCVariantImpl(toks.asJava)) else None
})
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index a6aba57..bd7804b 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -23,7 +23,7 @@ import org.apache.nlpcraft.common.nlp.NCNlpSentence.NoteLink
import org.apache.nlpcraft.common.nlp.{NCNlpSentence => Sentence,
NCNlpSentenceNote => NlpNote, NCNlpSentenceToken => NlpToken}
import org.apache.nlpcraft.model._
import org.apache.nlpcraft.model.impl.NCTokenImpl
-import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.NCIdlContent
+import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.NCIdlToken
import
org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.NCSynonymChunkKind
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.impl.NCRequestImpl
@@ -72,7 +72,7 @@ object NCModelEnricher extends NCProbeEnricher {
}
case class Complex(
- data: NCIdlContent,
+ data: NCIdlToken,
isToken: Boolean,
isWord: Boolean,
token: NCToken,
@@ -128,26 +128,6 @@ object NCModelEnricher extends NCProbeEnricher {
case class ComplexHolder(complexesWords: Seq[Complex], complexes:
Seq[ComplexSeq])
- class CacheHolder[T] {
- private lazy val cache =
- mutable.HashMap.empty[String, mutable.HashMap[Int,
mutable.HashMap[Seq[T], mutable.HashSet[Synonym]]]]
-
- def isUnprocessed(elemId: String, s: Synonym, tokens: Seq[T]): Boolean
= {
- cache.
- getOrElseUpdate(
- elemId,
- mutable.HashMap.empty[Int, mutable.HashMap[Seq[T],
mutable.HashSet[Synonym]]]
- ).
- getOrElseUpdate(
- tokens.length,
- mutable.HashMap.empty[Seq[T], mutable.HashSet[Synonym]]
- ).
- getOrElseUpdate(
- tokens,
- mutable.HashSet.empty[Synonym]
- ).add(s)
- }
- }
/**
*
@@ -378,7 +358,7 @@ object NCModelEnricher extends NCProbeEnricher {
* @param seq
* @param s
*/
- private def toParts(mdl: NCProbeModel, stvReqId: String, seq:
Seq[NCIdlContent], s: Synonym): Seq[TokType] =
+ private def toParts(mdl: NCProbeModel, stvReqId: String, seq:
Seq[NCIdlToken], s: Synonym): Seq[TokType] =
seq.zip(s.map(_.kind)).flatMap {
case (complex, kind) =>
val t = if (complex.isLeft) complex.swap.toOption.get else
mkNlpToken(mdl, stvReqId, complex.toOption.get)
@@ -391,7 +371,7 @@ object NCModelEnricher extends NCProbeEnricher {
* @param tows
* @param ns
*/
- private def toTokens(tows: Seq[NCIdlContent], ns: Sentence): Seq[NlpToken]
=
+ private def toTokens(tows: Seq[NCIdlToken], ns: Sentence): Seq[NlpToken] =
(
tows.filter(_.isRight).map(_.toOption.get) ++
tows.filter(_.isLeft).map(_.swap.toOption.get).
@@ -516,9 +496,6 @@ object NCModelEnricher extends NCProbeEnricher {
p => p.tokensComplexes.map(p => if (p.isToken) p.token
else mkNlpToken(mdl, ns.srvReqId, p.word))
)
- lazy val idlCache = new CacheHolder[NCIdlContent]()
- lazy val tokCache = new CacheHolder[Int]()
-
def execute(simpleEnabled: Boolean, idlEnabled: Boolean): Unit =
startScopedSpan(
"execute", span, "srvReqId" -> ns.srvReqId, "mdlId" ->
mdl.model.getId, "txt" -> ns.text
@@ -582,7 +559,12 @@ object NCModelEnricher extends NCProbeEnricher {
}
def tryScan(syns: Seq[Synonym]): Unit =
- for (s <- syns if !found &&
tokCache.isUnprocessed(eId, s, idxs))
+ for (
+ s <- syns
+ if
+ !found &&
+
NCSynonymsManager.isUnprocessedTokens(ns.srvReqId, eId, s, idxs)
+ )
if
(NCSynonymsManager.isMatch(s, toks)) {
found = true
add("simple continuous
scan", toksExt, s)
@@ -602,7 +584,10 @@ object NCModelEnricher extends NCProbeEnricher {
// 1.2 Sparse.
if (!found && mdl.hasSparseSynonyms)
- for (s <- get(mdl.sparseSynonyms, eId) if
tokCache.isUnprocessed(eId, s, idxs))
+ for (
+ s <- get(mdl.sparseSynonyms, eId)
+ if
NCSynonymsManager.isUnprocessedTokens(ns.srvReqId, eId, s, idxs)
+ )
NCSynonymsManager.sparseMatch(s, toks)
match {
case Some(res) =>
add("simple sparse",
getSparsedTokens(res, toks), s)
@@ -623,7 +608,7 @@ object NCModelEnricher extends NCProbeEnricher {
s <- allSyns;
comb <- allCombs;
data = comb.map(_.data)
- if !found && idlCache.isUnprocessed(eId,
s, data)
+ if !found &&
NCSynonymsManager.isUnprocessedIdl(ns.srvReqId, eId, s, data)
)
if (NCSynonymsManager.isMatch(s, data,
req, variantsToks)) {
val parts = toParts(mdl, ns.srvReqId,
data, s)
@@ -639,7 +624,7 @@ object NCModelEnricher extends NCProbeEnricher {
s <- allSyns;
comb <- allCombs;
data = comb.map(_.data)
- if idlCache.isUnprocessed(eId, s, data)
+ if
NCSynonymsManager.isUnprocessedIdl(ns.srvReqId, eId, s, data)
)
NCSynonymsManager.sparseMatch(s, data,
req, variantsToks) match {
case Some(res) =>
@@ -667,6 +652,8 @@ object NCModelEnricher extends NCProbeEnricher {
processParsers(mdl, ns, span, req)
}
+ NCSynonymsManager.clearRequestIterationData(ns.srvReqId)
+
normalize(ns)
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/synonyms/NCSynonymsManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/synonyms/NCSynonymsManager.scala
index 465af93..7996510 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/synonyms/NCSynonymsManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/synonyms/NCSynonymsManager.scala
@@ -22,7 +22,7 @@ import org.apache.nlpcraft.common.nlp.{NCNlpSentenceNote,
NCNlpSentenceToken}
import org.apache.nlpcraft.common.{NCService, U}
import org.apache.nlpcraft.model._
import org.apache.nlpcraft.model.intent.{NCIdlContext, NCIdlFunction}
-import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.NCIdlContent
+import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.NCIdlToken
import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.{IDL,
NCSynonymChunkKind, REGEX, TEXT}
import org.apache.nlpcraft.probe.mgrs.{NCProbeSynonymChunk, NCProbeSynonym =>
Synonym}
@@ -35,14 +35,35 @@ import scala.jdk.CollectionConverters.ListHasAsScala
*
*/
object NCSynonymsManager extends NCService {
- case class Key(id: String, startCharIndex: Int, endCharIndex: Int, other:
Map[String, AnyRef] = Map.empty)
+ class CacheHolder[T] {
+ private lazy val cache =
+ mutable.HashMap.empty[String, mutable.HashMap[Int,
mutable.HashMap[Seq[T], mutable.HashSet[Synonym]]]]
+
+ def isUnprocessed(elemId: String, s: Synonym, tokens: Seq[T]): Boolean
= {
+ cache.
+ getOrElseUpdate(
+ elemId,
+ mutable.HashMap.empty[Int, mutable.HashMap[Seq[T],
mutable.HashSet[Synonym]]]
+ ).
+ getOrElseUpdate(
+ tokens.length,
+ mutable.HashMap.empty[Seq[T], mutable.HashSet[Synonym]]
+ ).
+ getOrElseUpdate(
+ tokens,
+ mutable.HashSet.empty[Synonym]
+ ).add(s)
+ }
+ }
+
+ case class SavedIdlKey(id: String, startCharIndex: Int, endCharIndex: Int,
other: Map[String, AnyRef] = Map.empty)
- object Key {
- def apply(t: NCToken): Key =
+ object SavedIdlKey {
+ def apply(t: NCToken): SavedIdlKey =
if (t.isUserDefined)
- Key(t.getId, t.getStartCharIndex, t.getEndCharIndex)
+ SavedIdlKey(t.getId, t.getStartCharIndex, t.getEndCharIndex)
else
- Key(
+ SavedIdlKey(
t.getId,
t.getStartCharIndex,
t.getEndCharIndex,
@@ -57,7 +78,10 @@ object NCSynonymsManager extends NCService {
override def toString: String = variants.toString()
}
- private val idlCache = mutable.HashMap.empty[String, mutable.HashMap[Key,
mutable.ArrayBuffer[Value]]]
+ private val savedIdl = mutable.HashMap.empty[String,
mutable.HashMap[SavedIdlKey, mutable.ArrayBuffer[Value]]]
+ private val idlChunksCache = mutable.HashMap.empty[String,
mutable.HashMap[(NCIdlToken, NCProbeSynonymChunk), Boolean]]
+ private val idlCaches = mutable.HashMap.empty[String,
CacheHolder[NCIdlToken]]
+ private val tokCaches = mutable.HashMap.empty[String, CacheHolder[Int]]
override def start(parent: Span): NCService = {
ackStarting()
@@ -165,9 +189,9 @@ object NCSynonymsManager extends NCService {
* @param variantsToks
*/
private def save(req: NCRequest, tok: NCToken, pred: NCIdlFunction,
variantsToks: Seq[Seq[NCToken]]): Unit = {
- idlCache.
+ savedIdl.
getOrElseUpdate(req.getServerRequestId, mutable.HashMap.empty).
- getOrElseUpdate(Key(tok), mutable.ArrayBuffer.empty) +=
+ getOrElseUpdate(SavedIdlKey(tok), mutable.ArrayBuffer.empty) +=
Value(req, variantsToks, pred)
}
@@ -178,38 +202,43 @@ object NCSynonymsManager extends NCService {
* @param req
* @param variantsToks
*/
- private def isMatch(
- tow: NCIdlContent, chunk: NCProbeSynonymChunk, req: NCRequest,
variantsToks: Seq[Seq[NCToken]]
- ): Boolean = {
- def get0[T](fromToken: NCToken => T, fromWord: NCNlpSentenceToken =>
T): T =
- if (tow.isLeft) fromToken(tow.swap.toOption.get)
- else fromWord(tow.toOption.get)
-
- chunk.kind match {
- case TEXT => chunk.wordStem == get0(_.stem, _.stem)
-
- case REGEX =>
- val r = chunk.regex
-
- r.matcher(get0(_.origText, _.origText)).matches() ||
r.matcher(get0(_.normText, _.normText)).matches()
-
- case IDL =>
- val ok =
- variantsToks.par.exists(vrntToks =>
- get0(t =>
- chunk.idlPred.apply(t, NCIdlContext(toks =
vrntToks, req = req)).value.asInstanceOf[Boolean],
- _ => false
- )
- )
+ private def isMatch(tow: NCIdlToken, chunk: NCProbeSynonymChunk, req:
NCRequest, variantsToks: Seq[Seq[NCToken]]): Boolean =
+ idlChunksCache.
+ getOrElseUpdate(req.getServerRequestId,
+ mutable.HashMap.empty[(NCIdlToken, NCProbeSynonymChunk),
Boolean]
+ ).
+ getOrElseUpdate(
+ (tow, chunk),
+ {
+ def get0[T](fromToken: NCToken => T, fromWord:
NCNlpSentenceToken => T): T =
+ if (tow.isLeft) fromToken(tow.swap.toOption.get)
+ else fromWord(tow.toOption.get)
+
+ chunk.kind match {
+ case TEXT => chunk.wordStem == get0(_.stem, _.stem)
+
+ case REGEX =>
+ chunk.regex.matcher(get0(_.origText,
_.origText)).matches() ||
+ chunk.regex.matcher(get0(_.normText,
_.normText)).matches()
+
+ case IDL =>
+ val ok =
+ variantsToks.par.exists(vrntToks =>
+ get0(t =>
+ chunk.idlPred.apply(t,
NCIdlContext(toks = vrntToks, req = req)).value.asInstanceOf[Boolean],
+ _ => false
+ )
+ )
- if (ok)
- save(req, tow.swap.toOption.get, chunk.idlPred,
variantsToks)
+ if (ok)
+ save(req, tow.swap.toOption.get,
chunk.idlPred, variantsToks)
- ok
+ ok
- case _ => throw new AssertionError()
- }
- }
+ case _ => throw new AssertionError()
+ }
+ }
+ )
/**
*
@@ -237,7 +266,7 @@ object NCSynonymsManager extends NCService {
* @param req
* @param variantsToks
*/
- def isMatch(s: Synonym, tows: Seq[NCIdlContent], req: NCRequest,
variantsToks: Seq[Seq[NCToken]]): Boolean = {
+ def isMatch(s: Synonym, tows: Seq[NCIdlToken], req: NCRequest,
variantsToks: Seq[Seq[NCToken]]): Boolean = {
require(tows != null)
if (tows.length == s.length && tows.count(_.isLeft) >= s.idlChunks)
@@ -267,7 +296,7 @@ object NCSynonymsManager extends NCService {
* @param req
* @param variantsToks
*/
- def sparseMatch(s: Synonym, tows: Seq[NCIdlContent], req: NCRequest,
variantsToks: Seq[Seq[NCToken]]): Option[Seq[NCIdlContent]] = {
+ def sparseMatch(s: Synonym, tows: Seq[NCIdlToken], req: NCRequest,
variantsToks: Seq[Seq[NCToken]]): Option[Seq[NCIdlToken]] = {
require(tows != null)
require(req != null)
require(s.hasIdl)
@@ -275,8 +304,8 @@ object NCSynonymsManager extends NCService {
sparseMatch0(
s,
tows,
- (t: NCIdlContent, chunk: NCProbeSynonymChunk) => isMatch(t, chunk,
req, variantsToks),
- (t: NCIdlContent) => if (t.isLeft)
t.swap.toOption.get.getStartCharIndex
+ (t: NCIdlToken, chunk: NCProbeSynonymChunk) => isMatch(t, chunk,
req, variantsToks),
+ (t: NCIdlToken) => if (t.isLeft)
t.swap.toOption.get.getStartCharIndex
else t.toOption.get.startCharIndex,
shouldBeNeighbors = !s.sparse
)
@@ -287,14 +316,14 @@ object NCSynonymsManager extends NCService {
* @param srvReqId
* @param senToks
*/
- def isStillValid(srvReqId: String, senToks: Seq[NCToken]): Boolean =
- idlCache.get(srvReqId) match {
+ def isStillValidIdl(srvReqId: String, senToks: Seq[NCToken]): Boolean =
+ savedIdl.get(srvReqId) match {
case Some(m) =>
lazy val allCheckedSenToks = {
- val set = mutable.HashSet.empty[Key]
+ val set = mutable.HashSet.empty[SavedIdlKey]
def add(t: NCToken): Unit = {
- set += Key(t)
+ set += SavedIdlKey(t)
t.getPartTokens.asScala.foreach(add)
}
@@ -305,7 +334,7 @@ object NCSynonymsManager extends NCService {
}
senToks.forall(tok =>
- m.get(Key(tok)) match {
+ m.get(SavedIdlKey(tok)) match {
case Some(vals) =>
vals.exists(
v =>
@@ -313,7 +342,7 @@ object NCSynonymsManager extends NCService {
v.predicate.apply(
tok, NCIdlContext(toks =
winHistVariant, req = v.request)
).value.asInstanceOf[Boolean] &&
-
winHistVariant.map(Key(_)).forall(t =>
+
winHistVariant.map(SavedIdlKey(_)).forall(t =>
t.id == "nlpcraft:nlp" ||
allCheckedSenToks.contains(t)
)
)
@@ -328,6 +357,36 @@ object NCSynonymsManager extends NCService {
/**
*
* @param srvReqId
+ * @param elemId
+ * @param s
+ * @param tokens
+ */
+ def isUnprocessedTokens(srvReqId: String, elemId: String, s: Synonym,
tokens: Seq[Int]): Boolean =
+ tokCaches.getOrElseUpdate(srvReqId, new
CacheHolder[Int]).isUnprocessed(elemId, s, tokens)
+
+ /**
+ *
+ * @param srvReqId
+ * @param elemId
+ * @param s
+ * @param tokens
*/
- def clearRequestData(srvReqId: String): Unit = idlCache -= srvReqId
+ def isUnprocessedIdl(srvReqId: String, elemId: String, s: Synonym, tokens:
Seq[NCIdlToken]): Boolean =
+ idlCaches.getOrElseUpdate(srvReqId, new
CacheHolder[NCIdlToken]).isUnprocessed(elemId, s, tokens)
+
+ /**
+ *
+ * @param srvReqId
+ */
+ def clearRequestData(srvReqId: String): Unit = savedIdl -= srvReqId
+
+ /**
+ *
+ * @param srvReqId
+ */
+ def clearRequestIterationData(srvReqId: String): Unit = {
+ idlChunksCache -= srvReqId
+ idlCaches -= srvReqId
+ tokCaches -= srvReqId
+ }
}