This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-287
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-287 by this push:
new bee2c6f WIP.
bee2c6f is described below
commit bee2c6f163e09d93fcb5e97836291181174b222d
Author: Sergey Kamov <[email protected]>
AuthorDate: Fri Apr 2 18:58:19 2021 +0300
WIP.
---
.../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 60 ++++++++++------------
1 file changed, 26 insertions(+), 34 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 7df8f02..ed80630 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -19,7 +19,7 @@ package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model
import io.opencensus.trace.Span
import org.apache.nlpcraft.common._
-import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken, _}
+import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken ⇒ NlpToken, _}
import org.apache.nlpcraft.model._
import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.NCDslContent
import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.{NCSynonymChunkKind, TEXT}
@@ -32,13 +32,14 @@ import java.io.Serializable
import java.util
import scala.collection.JavaConverters._
import scala.collection.convert.DecorateAsScala
-import scala.collection.mutable.ArrayBuffer
import scala.collection.{Map, Seq, mutable}
/**
* Model elements enricher.
*/
object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
+ type TokenData = (NCToken, NCSynonymChunkKind)
+
object Complex {
def apply(t: NCToken): Complex =
Complex(
@@ -53,7 +54,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
maxIndex = t.wordIndexes.last
)
- def apply(t: NCNlpSentenceToken): Complex =
+ def apply(t: NlpToken): Complex =
Complex(
data = Right(t),
isToken = false,
@@ -72,7 +73,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
isToken: Boolean,
isWord: Boolean,
token: NCToken,
- word: NCNlpSentenceToken,
+ word: NlpToken,
origText: String,
wordIndexes: Set[Int],
minIndex: Int,
@@ -125,9 +126,9 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
// Found-by-synonym model element.
case class ElementMatch(
element: NCElement,
- tokens: Seq[NCNlpSentenceToken],
+ tokens: Seq[NlpToken],
synonym: NCProbeSynonym,
- parts: Seq[(NCToken, NCSynonymChunkKind)]
+ parts: Seq[TokenData]
) extends Ordered[ElementMatch] {
// Tokens sparsity.
lazy val sparsity: Int = tokens.zipWithIndex.tail.map {
@@ -136,7 +137,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
// Number of tokens.
lazy val length: Int = tokens.size
- lazy val tokensSet: Set[NCNlpSentenceToken] = tokens.toSet
+ lazy val tokensSet: Set[NlpToken] = tokens.toSet
override def compare(that: ElementMatch): Int = {
// Check synonym first, then length and then sparsity.
@@ -193,11 +194,11 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
private def mark(
ns: NCNlpSentence,
elem: NCElement,
- toks: Seq[NCNlpSentenceToken],
+ toks: Seq[NlpToken],
direct: Boolean,
syn: Option[NCProbeSynonym],
metaOpt: Option[Map[String, Object]],
- parts: Seq[(NCToken, NCSynonymChunkKind)]
+ parts: Seq[TokenData]
): Unit = {
val params = mutable.ArrayBuffer.empty[(String, AnyRef)]
@@ -269,7 +270,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
* @param toks
* @param elemId
*/
- private def alreadyMarked(toks: Seq[NCNlpSentenceToken], elemId: String): Boolean = toks.forall(_.isTypeOf(elemId))
+ private def alreadyMarked(toks: Seq[NlpToken], elemId: String): Boolean = toks.forall(_.isTypeOf(elemId))
@throws[NCE]
override def enrich(mdl: NCProbeModel, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit = {
@@ -281,17 +282,12 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
"txt" → ns.text) { span ⇒
val cache = mutable.HashSet.empty[Seq[Int]]
val req = NCRequestImpl(senMeta, ns.srvReqId)
- val matches = ArrayBuffer.empty[ElementMatch]
+ val matches = mutable.ArrayBuffer.empty[ElementMatch]
- def addMatch(
- elm: NCElement,
- toks: Seq[NCNlpSentenceToken],
- syn: NCProbeSynonym,
- parts: Seq[(NCToken, NCSynonymChunkKind)]
- ): Boolean = {
- val toksSet = toks.toSet
+ def addMatch(elm: NCElement, toks: Seq[NlpToken], syn: NCProbeSynonym, parts: Seq[TokenData]): Boolean = {
+ val tokensSet = toks.toSet
- if (!matches.exists(m ⇒ m.element.getId == elm.getId && toksSet.subsetOf(m.tokensSet))) {
+ if (!matches.exists(m ⇒ m.element.getId == elm.getId && tokensSet.subsetOf(m.tokensSet))) {
matches += ElementMatch(elm, toks, syn, parts)
true
@@ -300,7 +296,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
false
}
- def getParts(comb: Seq[Complex], syn: NCProbeSynonym): Seq[(NCToken, NCSynonymChunkKind)] =
+ def getParts(comb: Seq[Complex], syn: NCProbeSynonym): Seq[TokenData] =
comb.zip(syn.map(_.kind)).flatMap {
case (complex, kind) ⇒ if (complex.isToken) Some(complex.token → kind) else None
@@ -322,16 +318,11 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
* @param toks
* @return
*/
- def tokString(toks: Seq[NCNlpSentenceToken]): String = toks.map(t ⇒ (t.origText, t.index)).mkString(" ")
+ def tokString(toks: Seq[NlpToken]): String = toks.map(t ⇒ (t.origText, t.index)).mkString(" ")
lazy val complexesWords = ns.map(Complex(_))
lazy val complexes: Seq[ComplexSeq] =
- NCProbeVariants.
- convert(
- ns.srvReqId,
- mdl,
- NCSentenceManager.collapse(mdl.model, ns.clone())
- ).
+ NCProbeVariants.convert(ns.srvReqId, mdl, NCSentenceManager.collapse(mdl.model, ns.clone())).
map(_.asScala).
par.
flatMap(sen ⇒
@@ -362,13 +353,14 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
startScopedSpan("synsProc", span,
"srvReqId" → ns.srvReqId,
"mdlId" → mdl.model.getId,
- "txt" → ns.text) {
+ "txt" → ns.text
+ ) {
_ ⇒
// 1. Simple, sparse.
for ((elemId, syns) ← mdl.sparseSynonyms; syn ← syns)
syn.trySparseMatch(ns).foreach(toks ⇒ addMatch(mdl.elements(elemId), toks, syn, Seq.empty))
- // 2. DSL, sparse.
+ // 2. DSL, sparse.
for ((elemId, syns) ← mdl.sparseSynonymsDsl; syn ← syns) {
for (complex ← complexes) {
val comb = complex.tokensComplexes
@@ -378,13 +370,13 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
tows.filter(_.isRight).map(_.right.get) ++
tows.filter(_.isLeft).map(_.left.get).flatMap(w ⇒
ns.filter(
- t ⇒ t.startCharIndex >= w.getStartCharIndex &&
+ t ⇒
+ t.startCharIndex >= w.getStartCharIndex &&
t.endCharIndex <= w.getEndCharIndex
)
)
addMatch(mdl.elements(elemId), toks.sortBy(_.startCharIndex), syn, getParts(comb, syn))
-
})
}
}
@@ -428,9 +420,9 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
def setFound(
elm: NCElement,
- toks: Seq[NCNlpSentenceToken],
+ toks: Seq[NlpToken],
syn: NCProbeSynonym,
- parts: Seq[(NCToken, NCSynonymChunkKind)]
+ parts: Seq[TokenData]
): Unit =
if (addMatch(elm, toks, syn, parts))
found = true
@@ -544,7 +536,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
"srvReqId" → ns.srvReqId,
"mdlId" → mdl.model.getId,
"txt" → ns.text) { _ ⇒
- def to(t: NCNlpSentenceToken): NCCustomWord =
+ def to(t: NlpToken): NCCustomWord =
new NCCustomWord {
override def getNormalizedText: String = t.normText
override def getOriginalText: String = t.origText
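
For context on the WIP above: the change is mostly mechanical cleanup. It aliases NCNlpSentenceToken to NlpToken at the import site and collapses the repeated (NCToken, NCSynonymChunkKind) tuple into the new TokenData alias used by mark, addMatch, getParts and setFound. Below is a minimal standalone sketch of the same pattern; Token and ChunkKind are hypothetical stand-ins, not the NLPCraft API.

    // Minimal sketch of the type-alias refactoring used in this commit.
    // Token and ChunkKind stand in for NCToken / NCSynonymChunkKind.
    object TokenDataSketch {
        case class Token(id: String)
        sealed trait ChunkKind
        case object TEXT extends ChunkKind

        // One named alias replaces the (Token, ChunkKind) tuple that was
        // previously spelled out in every signature.
        type TokenData = (Token, ChunkKind)

        // A signature that used to read Seq[(Token, ChunkKind)] now reads
        // Seq[TokenData]; changing the pair's shape later touches only the
        // alias definition.
        def getParts(toks: Seq[Token]): Seq[TokenData] = toks.map(_ → TEXT)

        def main(args: Array[String]): Unit =
            println(getParts(Seq(Token("a"), Token("b"))))
    }

The same motive drives the import rename (import ...nlp.{NCNlpSentenceToken ⇒ NlpToken, _}): the long type name appears in many signatures, so a local alias shortens them without changing behavior.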