This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-443 by this push:
new 0d1907b WIP.
0d1907b is described below
commit 0d1907b99591d26c4c4a36a0722f5bc99fe1e258
Author: Sergey Kamov <[email protected]>
AuthorDate: Fri Sep 24 18:37:43 2021 +0300
WIP.
---
.../nlpcraft/probe/mgrs/NCProbeSynonym.scala | 46 +++----
.../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 148 ++++++++++-----------
.../mgrs/nlp/enrichers/model/NCSentenceCache.scala | 110 ---------------
.../probe/mgrs/synonyms/NCSynonymsManager.scala | 3 +-
4 files changed, 92 insertions(+), 215 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
index 7eefd97..d865c6a 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
@@ -52,6 +52,18 @@ class NCProbeSynonym(
lazy val isValueSynonym: Boolean = value != null
lazy val stems: String = map(_.wordStem).mkString(" ")
+ private lazy val hash =
+ Seq(
+ super.hashCode(),
+ isTextOnly,
+ regexChunks,
+ idlChunks,
+ isValueSynonym,
+ isElementId,
+ isValueName,
+ value
+ ).map(p => if (p == null) 0 else p.hashCode()).foldLeft(0)((a, b) => 31 * a + b)
+
override def toString(): String = mkString(" ")
// Orders synonyms from least to most significant.
@@ -109,36 +121,20 @@ class NCProbeSynonym(
}
}
- override def canEqual(other: Any): Boolean = other.isInstanceOf[NCProbeSynonym]
-
override def equals(other: Any): Boolean = other match {
case that: NCProbeSynonym =>
- super.equals(that) &&
- (that canEqual this) &&
- isTextOnly == that.isTextOnly &&
- regexChunks == that.regexChunks &&
- idlChunks == that.idlChunks &&
- isValueSynonym == that.isValueSynonym &&
- isElementId == that.isElementId &&
- isValueName == that.isValueName &&
- value == that.value
+ isElementId == that.isElementId &&
+ isTextOnly == that.isTextOnly &&
+ regexChunks == that.regexChunks &&
+ idlChunks == that.idlChunks &&
+ isValueSynonym == that.isValueSynonym &&
+ isValueName == that.isValueName &&
+ value == that.value &&
+ super.equals(that)
case _ => false
}
- override def hashCode(): Int = {
- val state = Seq(
- super.hashCode(),
- isTextOnly,
- regexChunks,
- idlChunks,
- isValueSynonym,
- isElementId,
- isValueName,
- value
- )
-
- state.map(p => if (p == null) 0 else p.hashCode()).foldLeft(0)((a, b) => 31 * a + b)
- }
+ override def hashCode(): Int = hash
}
object NCProbeSynonym {
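Note: two micro-optimizations land in NCProbeSynonym here. The hash is now
computed once in a lazy val (safe because all participating fields are
immutable), and equals() is reordered so the cheap flag comparisons run before
the more expensive chunk collections and the super.equals call. A minimal
runnable sketch of the same pattern, using a hypothetical Syn class rather
than the actual NCProbeSynonym:

    final class Syn(val id: String, val isTextOnly: Boolean, val chunks: Seq[String]) {
        // Computed at most once, on first use; safe because every field is immutable.
        private lazy val cachedHash =
            Seq(id, isTextOnly, chunks)
                .map(p => if (p == null) 0 else p.hashCode())
                .foldLeft(0)((a, b) => 31 * a + b)

        override def hashCode(): Int = cachedHash

        override def equals(other: Any): Boolean = other match {
            // Cheapest comparisons first: a Boolean, then a String, then the Seq.
            case that: Syn => isTextOnly == that.isTextOnly && id == that.id && chunks == that.chunks
            case _ => false
        }
    }

    object SynDemo extends App {
        val a = new Syn("x:id", isTextOnly = true, Seq("a", "b"))
        val b = new Syn("x:id", isTextOnly = true, Seq("a", "b"))
        println(a == b)                   // true
        println(a.hashCode == b.hashCode) // true - consistent with equals
    }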
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 0cd8a92..e079aff 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -34,7 +34,6 @@ import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeVariants, NCTokenPar
import java.io.Serializable
import java.util.{List => JList}
import scala.collection.mutable
-import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.CollectionConverters._
import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsJava, MapHasAsScala, SeqHasAsJava}
@@ -43,7 +42,6 @@ import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsJava, MapHasAsSca
*/
object NCModelEnricher extends NCProbeEnricher {
type TokType = (NCToken, NCSynonymChunkKind)
- type Cache = mutable.Map[String, ArrayBuffer[Seq[Int]]]
object Complex {
def apply(t: NCToken): Complex =
@@ -130,6 +128,22 @@ object NCModelEnricher extends NCProbeEnricher {
case class ComplexHolder(complexesWords: Seq[Complex], complexes: Seq[ComplexSeq])
+ class CacheHolder[T] {
+ private lazy val cache = mutable.HashMap.empty[Int, mutable.HashMap[Seq[T], mutable.HashSet[Synonym]]]
+
+ def add(s: Synonym, tokens: Seq[T]): Boolean = {
+ cache.
+ getOrElseUpdate(
+ tokens.length,
+ mutable.HashMap.empty[Seq[T], mutable.HashSet[Synonym]]
+ ).
+ getOrElseUpdate(
+ tokens,
+ mutable.HashSet.empty[Synonym]
+ ).add(s)
+ }
+ }
+
/**
*
* @param parent Optional parent span.
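Note: CacheHolder replaces both the removed Cache type alias and the old
per-sentence bookkeeping. Since mutable.HashSet.add returns true only when the
element was actually inserted, one add() call both records that a
synonym/token combination has been tried and reports whether it is new. A
standalone approximation of the intended use, with Synonym simplified to
String:

    import scala.collection.mutable

    class DedupCache[T] {
        // Buckets keyed by token count first, then by the exact token sequence.
        private lazy val cache = mutable.HashMap.empty[Int, mutable.HashMap[Seq[T], mutable.HashSet[String]]]

        // True only the first time this (synonym, tokens) pair is seen.
        def add(syn: String, tokens: Seq[T]): Boolean =
            cache.getOrElseUpdate(tokens.length, mutable.HashMap.empty)
                .getOrElseUpdate(tokens, mutable.HashSet.empty)
                .add(syn)
    }

    object DedupCacheDemo extends App {
        val c = new DedupCache[Int]
        println(c.add("syn1", Seq(0, 1))) // true - first attempt, run the matcher.
        println(c.add("syn1", Seq(0, 1))) // false - already tried, skip it.
        println(c.add("syn1", Seq(0, 2))) // true - different token indexes.
    }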
@@ -379,12 +393,6 @@ object NCModelEnricher extends NCProbeEnricher {
flatMap(w => ns.filter(t => t.wordIndexes.intersect(w.wordIndexes).nonEmpty))
).sortBy(_.startCharIndex)
- /**
- *
- * @param m
- * @param id
- * @return
- */
private def get(m: Map[String, Seq[Synonym]], id: String): Seq[Synonym] =
m.getOrElse(id, Seq.empty)
/**
@@ -455,7 +463,7 @@ object NCModelEnricher extends NCProbeEnricher {
* @param h
* @param toks
*/
- private def mkCombinations(h: ComplexHolder, toks: Seq[NlpToken], cache: mutable.HashSet[Seq[Complex]]): Seq[Seq[Complex]] = {
+ private def mkCombinations(h: ComplexHolder, toks: Seq[NlpToken]): Seq[Seq[Complex]] = {
val idxs = toks.flatMap(_.wordIndexes).toSet
h.complexes.par.
@@ -463,54 +471,15 @@ object NCModelEnricher extends NCProbeEnricher {
val rec = complexSeq.tokensComplexes.filter(_.wordIndexes.exists(idxs.contains))
// Drops without tokens (IDL part works with tokens).
- if (rec.nonEmpty) {
- val data = rec ++
+ if (rec.nonEmpty)
+ Some(rec ++
(complexSeq.wordsIndexes.intersect(idxs) -- rec.flatMap(_.wordIndexes)).map(h.complexesWords)
-
- if (!cache.contains(data)) Some(data) else None
- }
+ )
else
None
}).seq
}
- private def add(
- dbgType: String,
- ns: Sentence,
- contCache: Cache,
- elemId: String,
- greedy: Boolean,
- elemToks: Seq[NlpToken],
- sliceToksIdxs: Seq[Int],
- syn: Synonym,
- parts: Seq[TokType] = Seq.empty
- ): Unit = {
- val resIdxs = elemToks.map(_.index)
- val resIdxsSorted = resIdxs.sorted
-
- if (resIdxsSorted == sliceToksIdxs && U.isContinuous(resIdxsSorted))
- contCache(elemId) += sliceToksIdxs
-
- val ok =
- (!greedy || !alreadyMarked(ns, elemId, elemToks, sliceToksIdxs)) &&
- (parts.isEmpty || !parts.exists { case (t, _) => t.getId == elemId })
-
- if (ok)
- mark(ns, elemId, elemToks, direct = syn.isDirect && U.isIncreased(resIdxs), syn = Some(syn), parts = parts)
-
- if (DEEP_DEBUG)
- logger.trace(
- s"${if (ok) "Added" else "Skipped"} element [" +
- s"id=$elemId, " +
- s"type=$dbgType, " +
- s"text='${elemToks.map(_.origText).mkString(" ")}', " +
- s"indexes=${resIdxs.mkString("[", ",", "]")}, " +
- s"allTokensIndexes=${sliceToksIdxs.mkString("[", ",", "]")}, " +
- s"synonym=$syn" +
- s"]"
- )
- }
-
/**
*
* @param matched
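Note: with the mutable cache argument gone and the standalone add() helper
removed, mkCombinations now simply emits every candidate combination and
leaves deduplication to its call sites. A standalone sketch of the resulting
flatMap-over-Option shape, with plain Seq[Int] data standing in for Complex:

    import scala.collection.parallel.CollectionConverters._

    object CombinationsSketch extends App {
        val sequences = Seq(Seq(1, 2, 3), Seq(4), Seq.empty[Int])

        // Candidates are built in parallel; None rows vanish in flatMap,
        // and no mutable state is threaded through the computation.
        val combos = sequences.par.flatMap(seq =>
            if (seq.nonEmpty) Some(seq.filter(_ % 2 == 1)) else None
        ).seq

        println(combos) // two candidates survive: List(1, 3) and List()
    }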
@@ -536,13 +505,15 @@ object NCModelEnricher extends NCProbeEnricher {
) { span =>
val req = NCRequestImpl(senMeta, ns.srvReqId)
- val combToks = combosTokens(ns.toSeq)
lazy val ch = mkComplexes(mdl, ns)
lazy val variantsToks =
ch.complexes.map(
p => p.tokensComplexes.map(p => if (p.isToken) p.token else mkNlpToken(mdl, ns.srvReqId, p.word))
)
+ lazy val idlCache = new CacheHolder[NCIdlContent]()
+ lazy val tokCache = new CacheHolder[Int]()
+
def execute(simpleEnabled: Boolean, idlEnabled: Boolean): Unit =
    startScopedSpan(
        "execute", span, "srvReqId" -> ns.srvReqId, "mdlId" -> mdl.model.getId, "txt" -> ns.text
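Note: ch, variantsToks, and both caches are declared lazy, so a request that
never reaches the corresponding branch never pays for building them. A tiny
self-contained demonstration of the semantics relied on here:

    object LazyDemo extends App {
        var built = 0
        lazy val cache = { built += 1; scala.collection.mutable.HashSet.empty[String] }

        println(built)   // 0 - nothing allocated yet.
        cache.add("syn") // First access triggers construction...
        cache.add("syn")
        println(built)   // 1 - ...and it happens exactly once.
    }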
@@ -550,21 +521,43 @@ object NCModelEnricher extends NCProbeEnricher {
if (DEEP_DEBUG)
logger.trace(s"Execution started [simpleEnabled=$simpleEnabled, idlEnabled=$idlEnabled]")
- val contCache = mutable.HashMap.empty ++
- mdl.elements.keys.map(k => k -> mutable.ArrayBuffer.empty[Seq[Int]])
- lazy val idlCache = mutable.HashSet.empty[Seq[Complex]]
-
for (
// 'toksExt' is a piece of the sentence; 'toks' is the same as 'toksExt' or has some stop-words removed.
- (toks, toksExt) <- combToks;
+ (toks, toksExt) <- combosTokens(ns.toSeq);
idxs = toks.map(_.index);
e <- mdl.elements.values;
eId = e.getId;
greedy = e.isGreedy.orElse(mdl.model.isGreedy)
- if
- !greedy ||
- !contCache(eId).exists(_.containsSlice(idxs)) && !alreadyMarked(ns, eId, toks, idxs)
+ if !greedy || !alreadyMarked(ns, eId, toks, idxs)
) {
+ def add(
+ dbgType: String,
+ elemToks: Seq[NlpToken],
+ syn: Synonym,
+ parts: Seq[TokType] = Seq.empty
+ ): Unit = {
+ val resIdxs = elemToks.map(_.index)
+
+ val ok =
+ (!greedy || !alreadyMarked(ns, eId, elemToks, idxs)) &&
+ (parts.isEmpty || !parts.exists { case (t, _) => t.getId == eId })
+
+ if (ok)
+ mark(ns, eId, elemToks, direct = syn.isDirect && U.isIncreased(resIdxs), syn = Some(syn), parts = parts)
+
+ if (DEEP_DEBUG)
+ logger.trace(
+ s"${if (ok) "Added" else "Skipped"} element [" +
+ s"id=$eId, " +
+ s"type=$dbgType, " +
+ s"text='${elemToks.map(_.origText).mkString(" ")}', " +
+ s"indexes=${resIdxs.mkString("[", ",", "]")}, " +
+ s"allTokensIndexes=${idxs.mkString("[", ",", "]")}, " +
+ s"synonym=$syn" +
+ s"]"
+ )
+ }
+
// 1. SIMPLE.
if (simpleEnabled && (if (idlEnabled) mdl.hasIdlSynonyms(eId) else !mdl.hasIdlSynonyms(eId))) {
lazy val tokStems = toks.map(_.stem).mkString(" ")
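Note: the tokCache.add(s, idxs) guards appearing below are what replaces the
removed contCache bookkeeping: a side-effecting condition in the
for-comprehension records the attempt and filters out repeats in one step, so
each expensive match runs at most once per synonym/index combination. A
simplified runnable sketch of that guard pattern:

    import scala.collection.mutable

    object GuardDemo extends App {
        val tried = mutable.HashSet.empty[(String, Seq[Int])]

        def expensiveMatch(syn: String, idxs: Seq[Int]): Boolean = {
            println(s"matching $syn against $idxs") // runs once per unique pair
            idxs.nonEmpty
        }

        for {
            idxs <- Seq(Seq(0, 1), Seq(0, 1), Seq(2)) // note the duplicated slice
            syn <- Seq("syn1")
            if tried.add(syn -> idxs) // false on the repeat - that match is skipped
        } if (expensiveMatch(syn, idxs)) println(s"matched $syn at $idxs")
    }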
@@ -579,15 +572,15 @@ object NCModelEnricher extends NCProbeEnricher {
syns.get(tokStems) match {
case Some(s) =>
found = true
- add("simple continuous", ns, contCache, eId, greedy, toksExt, idxs, s)
+ add("simple continuous", toksExt, s)
case None => notFound()
}
def tryScan(syns: Seq[Synonym]): Unit =
- for (s <- syns if !found)
+ for (s <- syns if !found && tokCache.add(s, idxs))
if (NCSynonymsManager.isMatch(s, toks)) {
found = true
- add("simple continuous scan", ns, contCache, eId, greedy, toksExt, idxs, s)
+ add("simple continuous scan", toksExt, s)
}
tryMap(
@@ -604,10 +597,10 @@ object NCModelEnricher extends NCProbeEnricher {
// 1.2 Sparse.
if (!found && mdl.hasSparseSynonyms)
- for (s <- get(mdl.sparseSynonyms, eId))
+ for (s <- get(mdl.sparseSynonyms, eId) if tokCache.add(s, idxs))
NCSynonymsManager.sparseMatch(s, toks) match {
case Some(res) =>
- add("simple sparse", ns, contCache, eId, greedy, getSparsedTokens(res, toks), idxs, s)
+ add("simple sparse", getSparsedTokens(res, toks), s)
case None => // No-op.
}
}
@@ -615,8 +608,7 @@ object NCModelEnricher extends NCProbeEnricher {
// 2. IDL.
if (idlEnabled) {
val allSyns = get(mdl.idlSynonyms, eId)
- lazy val allCombs = mkCombinations(ch, toks, idlCache)
-
+ lazy val allCombs = mkCombinations(ch, toks)
// 2.1 Continuous.
if (!mdl.hasSparseSynonyms) {
@@ -624,16 +616,14 @@ object NCModelEnricher extends NCProbeEnricher {
for (
s <- allSyns;
- comb <- allCombs
- if !found;
+ comb <- allCombs;
data = comb.map(_.data)
+ if !found && idlCache.add(s, data)
)
if (NCSynonymsManager.isMatch(s, data, req, variantsToks)) {
val parts = toParts(mdl, ns.srvReqId, data, s)
- add("IDL continuous", ns, contCache, eId, greedy, toksExt, idxs, s, parts)
-
- idlCache += comb
+ add("IDL continuous", toksExt, s, parts)
found = true
}
@@ -642,17 +632,17 @@ object NCModelEnricher extends NCProbeEnricher {
// 2.2 Sparse.
for (
s <- allSyns;
- comb <- allCombs
+ comb <- allCombs;
+ data = comb.map(_.data)
+ if idlCache.add(s, data)
)
- NCSynonymsManager.sparseMatch(s, comb.map(_.data), req, variantsToks) match {
+ NCSynonymsManager.sparseMatch(s, data, req, variantsToks) match {
case Some(res) =>
- val typ = if (s.sparse) "IDL sparse" else "IDL continuous"
-
+ val toks = getSparsedTokens(toTokens(res, ns), toTokens(comb.map(_.data), ns))
val parts = toParts(mdl, ns.srvReqId, res, s)
+ val typ = if (s.sparse) "IDL sparse" else "IDL continuous"
- add(typ, ns, contCache, eId, greedy, getSparsedTokens(toTokens(res, ns), toTokens(comb.map(_.data), ns)), idxs, s, parts)
-
- idlCache += comb
+ add(typ, toks, s, parts)
case None => // No-op.
}
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCSentenceCache.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCSentenceCache.scala
deleted file mode 100644
index e5b6e3e..0000000
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCSentenceCache.scala
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model
-
-import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken => NlpToken}
-import org.apache.nlpcraft.probe.mgrs.{NCProbeSynonym => Synonym}
-import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.{NCIdlContent => IdlToken}
-import org.apache.nlpcraft.model.NCRequest
-import scala.collection.mutable
-
-class NCSentenceCache {
-// case class Key(elemId: String, indexes: Seq[Int])
-// case class Value[T](synonym: Synonym, result: Seq[T])
-//
-// val cacheToks = mutable.HashMap.empty[Key, mutable.HashMap[Seq[Int], Value[NlpToken]]]
-// val cacheIdl = mutable.HashMap.empty[Key, mutable.HashMap[Seq[Int], Value[IdlToken]]]
-//
-// var cacheHits = 0
-// var cacheCnt = 0
-// var time = 0L
-//
-// private def process[T](
-// elemId: String,
-// elemSyns: Seq[Synonym],
-// toks: Seq[T],
-// extract: (Synonym, Seq[T]) => Option[Seq[T]],
-// cache: mutable.Map[Key, mutable.HashMap[Seq[Int], Value[T]]],
-// getIndex: T => Int,
-// callback: (Synonym, Seq[T]) => Unit
-// ): Unit = {
-// val t = System.currentTimeMillis()
-//
-// val hash = toks.map(getIndex)
-// val key = Key(elemId, hash)
-//
-// cacheCnt += 1
-//
-// cache.get(key) match {
-// case Some(data) =>
-// cacheHits += 1
-// data.get(hash) match {
-// case Some(v) => callback(v.synonym, v.result)
-// case None => // No-op.
-// }
-// case None =>
-// // mutable.HashMap.empty[Key[IdlToken], Map[Seq[IdlToken], Value[IdlToken]]]
-// val hit = mutable.HashMap.empty[Seq[Int], Value[T]]
-//
-// for (s <- elemSyns)
-// extract(s, toks) match {
-// case Some(res) =>
-// callback(s, res)
-// hit += hash -> Value(s, res)
-// case None => // No-op.
-// }
-//
-// cache += key -> hit
-// }
-//
-// time += (System.currentTimeMillis() - t)
-// }
-//
-// def processSparseTokens(
-// elemId: String,
-// elemSyns: Seq[Synonym],
-// toks: Seq[NlpToken],
-// callback: (Synonym, Seq[NlpToken]) => Unit
-// ): Unit =
-// process(
-// elemId,
-// elemSyns,
-// toks,
-// (s: Synonym, toks: Seq[NlpToken]) => s.sparseMatch(toks),
-// cacheToks,
-// (t: NlpToken) => t.index,
-// callback
-// )
-//
-// def processSparseIdl(
-// elemId: String,
-// req: NCRequest,
-// elemSyns: Seq[Synonym],
-// toks: Seq[IdlToken],
-// callback: (Synonym, Seq[IdlToken]) => Unit
-// ): Unit =
-// process(
-// elemId,
-// elemSyns,
-// toks,
-// (s: Synonym, toks: Seq[IdlToken]) => s.sparseMatch(toks, req),
-// cacheIdl,
-// (t: IdlToken) => if (t.isRight) t.toOption.get.index else t.swap.toOption.get.getIndex,
-// callback
-// )
-}
\ No newline at end of file
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/synonyms/NCSynonymsManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/synonyms/NCSynonymsManager.scala
index d66f5af..465af93 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/synonyms/NCSynonymsManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/synonyms/NCSynonymsManager.scala
@@ -27,6 +27,7 @@ import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.{IDL, NCSynonymChu
import org.apache.nlpcraft.probe.mgrs.{NCProbeSynonymChunk, NCProbeSynonym => Synonym}
import scala.collection.mutable
+import scala.collection.parallel.CollectionConverters.ImmutableIterableIsParallelizable
import scala.compat.java8.OptionConverters._
import scala.jdk.CollectionConverters.ListHasAsScala
@@ -194,7 +195,7 @@ object NCSynonymsManager extends NCService {
case IDL =>
val ok =
- variantsToks.exists(vrntToks =>
+ variantsToks.par.exists(vrntToks =>
get0(t =>
chunk.idlPred.apply(t, NCIdlContext(toks = vrntToks, req = req)).value.asInstanceOf[Boolean],
_ => false
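Note: the one functional change in this file is that the IDL predicate is now
evaluated against the candidate variants in parallel; the new
ImmutableIterableIsParallelizable import is what gives the immutable
collection its .par. A minimal sketch of the same shape, with a trivial
predicate standing in for chunk.idlPred - and since exists() may evaluate the
predicate concurrently, anything it touches must be thread-safe:

    import scala.collection.parallel.CollectionConverters._

    object ParExistsDemo extends App {
        val variantsToks: Seq[Seq[String]] = Seq(Seq("a"), Seq("a", "b"), Seq("b"))

        // Variants are checked concurrently; the result is true as soon
        // as any worker finds a variant satisfying the predicate.
        val ok = variantsToks.par.exists(vrntToks => vrntToks.contains("b"))

        println(ok) // true
    }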