This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-287
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-287 by this push:
new e35bbb4 WIP.
e35bbb4 is described below
commit e35bbb4043559792cac2619f29419a1403295ec7
Author: Sergey Kamov <[email protected]>
AuthorDate: Mon Apr 5 14:41:32 2021 +0300
WIP.
---
.../nlpcraft/probe/mgrs/NCProbeSynonym.scala | 75 +++---
.../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 279 +++++++++++----------
2 files changed, 185 insertions(+), 169 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
index 4dd1c61..4b2639e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
@@ -17,7 +17,6 @@
package org.apache.nlpcraft.probe.mgrs
-import org.apache.nlpcraft.common.U
import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken, NCNlpSentenceTokenBuffer}
import org.apache.nlpcraft.model._
import org.apache.nlpcraft.model.intent.NCIdlContext
@@ -86,44 +85,44 @@ class NCProbeSynonym(
/**
*
- * @param sen
+ * @param toks
* @return
*/
- private def trySparseMatch0[T](sen: Seq[T], isMatch: (T, NCProbeSynonymChunk) ⇒ Boolean, getIndex: T ⇒ Int): List[List[T]] = {
- require(sen != null)
- require(sen.nonEmpty)
+ private def trySparseMatch0[T](toks: Seq[T], isMatch: (T, NCProbeSynonymChunk) ⇒ Boolean, getIndex: T ⇒ Int): Option[Seq[T]] = {
+ require(toks != null)
+ require(toks.nonEmpty)
require(this.size > 1)
- lazy val buf = mutable.ArrayBuffer.empty[List[T]]
- lazy val flattenBuf = mutable.ArrayBuffer.empty[T]
-
- var ok = true
-
- for (chunk ← this if ok) {
- val res = sen.filter(tok ⇒ !flattenBuf.contains(tok) && isMatch(tok, chunk))
-
- if (res.nonEmpty) {
- buf += res.toList
- flattenBuf ++= res
+ lazy val buf = mutable.ArrayBuffer.empty[T]
+ var state = 0
+
+ for (chunk ← this if state != -1)
+ toks.find(t ⇒ {
+ if (state == 0) {
+ state = 1
+
+ isMatch(t, chunk) && !buf.contains(t)
+ }
+ else
+ !buf.contains(t) && isMatch(t, chunk)
+ }) match {
+ case Some(t) ⇒ buf += t
+ case None ⇒ state = -1
}
- else
- ok = false
- }
-
- if (ok) {
- var variants = U.permute(buf.toList)
- def isOrdered(list: List[T]): Boolean =
- list.tail.zipWithIndex.forall { case (t, idx) ⇒ getIndex(t) > getIndex(list(idx)) }
+ if (state != -1 &&
+ buf.contains(toks.head) &&
+ buf.contains(toks.last) &&
+ (perm || buf.tail.zipWithIndex.forall { case (t, idx) ⇒ getIndex(t) > getIndex(buf(idx)) }) &&
+ {
+ val remained = toks.filter(t ⇒ !buf.contains(t))
- if (!perm)
- variants = variants.filter(isOrdered)
-
- variants
-
- }
+ !this.exists(chunk ⇒ remained.exists(t ⇒ isMatch(t, chunk)))
+ }
+ )
+ Some(buf)
else
- List.empty
+ None
}
/**
@@ -170,12 +169,12 @@ class NCProbeSynonym(
/**
*
- * @param sen
+ * @param toks
* @return
*/
- def trySparseMatch(sen: NCNlpSentenceTokenBuffer): List[List[NCNlpSentenceToken]] =
+ def trySparseMatch(toks: NCNlpSentenceTokenBuffer): Option[Seq[NCNlpSentenceToken]] =
trySparseMatch0(
- sen,
+ toks,
isMatch,
(t: NCNlpSentenceToken) ⇒ t.startCharIndex
)
@@ -197,12 +196,12 @@ class NCProbeSynonym(
/**
*
- * @param sen
+ * @param tows
* @param req
*/
- def trySparseMatch(sen: Seq[NCDslContent], req: NCRequest): List[List[NCDslContent]] =
+ def trySparseMatch(tows: Seq[NCDslContent], req: NCRequest): Option[Seq[NCDslContent]] =
trySparseMatch0(
- sen,
+ tows,
(t: NCDslContent, chunk: NCProbeSynonymChunk) ⇒ isMatch(t, chunk, req),
(t: NCDslContent) ⇒ if (t.isLeft) t.left.get.getStartCharIndex else t.right.get.startCharIndex
)
@@ -318,4 +317,4 @@ object NCProbeSynonym {
syn
}
-}
\ No newline at end of file
+}
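For readers of the NCProbeSynonym change above: trySparseMatch0 no longer enumerates candidate permutations via U.permute (hence the dropped import) but greedily picks one token per synonym chunk and validates that single candidate, returning Option[Seq[T]] instead of List[List[T]]. Below is a minimal standalone sketch of the new contract; trySparse, chunks and perm are illustrative names (in the committed code the synonym object itself is the chunk sequence and perm is one of its fields), not the actual API.

    import scala.collection.mutable

    // Greedily pick the first unused token matching each chunk, then validate
    // the single resulting candidate instead of permuting all of them.
    def trySparse[T, C](toks: Seq[T], chunks: Seq[C], isMatch: (T, C) ⇒ Boolean, getIndex: T ⇒ Int, perm: Boolean): Option[Seq[T]] = {
        require(toks.nonEmpty && chunks.nonEmpty)

        val buf = mutable.ArrayBuffer.empty[T]
        var failed = false

        for (chunk ← chunks if !failed)
            toks.find(t ⇒ !buf.contains(t) && isMatch(t, chunk)) match {
                case Some(t) ⇒ buf += t
                case None ⇒ failed = true
            }

        // Matched tokens must follow chunk order unless permutations are allowed.
        lazy val ordered = buf.zip(buf.tail).forall { case (a, b) ⇒ getIndex(a) < getIndex(b) }
        // Leftover tokens must not match any chunk, otherwise the greedy pick is ambiguous.
        lazy val leftovers = toks.filterNot(buf.contains)

        if (!failed &&
            buf.contains(toks.head) && buf.contains(toks.last) &&
            (perm || ordered) &&
            !chunks.exists(c ⇒ leftovers.exists(t ⇒ isMatch(t, c)))
        )
            Some(buf.toList)
        else
            None
    }

Note the head/last containment checks: a candidate is accepted only if it covers both ends of the token window, which keeps sparse matches anchored to the window boundaries.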
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 9b98dc2..2dd6391 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -32,6 +32,7 @@ import java.io.Serializable
import java.util
import scala.collection.JavaConverters._
import scala.collection.convert.DecorateAsScala
+import scala.collection.mutable.ArrayBuffer
import scala.collection.{Map, Seq, mutable}
/**
@@ -131,9 +132,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
parts: Seq[TokenData]
) extends Ordered[ElementMatch] {
// Tokens sparsity.
- lazy val sparsity: Int = tokens.zipWithIndex.tail.map {
- case (tok, idx) ⇒ Math.abs(tok.index - tokens(idx - 1).index)
- }.sum - tokens.length + 1
+ lazy val sparsity = U.calcSparsity(tokens.map(_.index))
// Number of tokens.
lazy val length: Int = tokens.size
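The inline sparsity computation deleted in this hunk now lives in U.calcSparsity. A hypothetical standalone equivalent, reconstructed from the removed lines (the actual helper in org.apache.nlpcraft.common.U may differ in signature):

    // Sparsity = number of index positions skipped between consecutive
    // tokens; 0 means the token sequence is fully contiguous.
    def calcSparsity(idxs: Seq[Int]): Int = {
        require(idxs.nonEmpty)

        idxs.zipWithIndex.tail.map { case (idx, i) ⇒ math.abs(idx - idxs(i - 1)) }.sum - idxs.length + 1
    }

    // calcSparsity(Seq(3, 4, 5)) == 0   (contiguous)
    // calcSparsity(Seq(1, 3, 6)) == 3   (skips index 2, then indexes 4 and 5)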
@@ -160,6 +159,8 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
else
0
}
+
+ override def toString: String = s"Element=${element.getId}, indexes=${tokens.map(_.index).mkString(",")}, synonym=$synonym"
}
/**
@@ -265,12 +266,43 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
private def combos[T](toks: Seq[T]): Seq[Seq[T]] =
(for (n ← toks.size until 0 by -1) yield toks.sliding(n)).flatten.map(p ⇒ p)
+ // TODO:
+// /**
+// *
+// * @param toks
+// * @param elemId
+// */
+// private def alreadyMarked(toks: Seq[NlpToken], elemId: String): Boolean = toks.forall(_.isTypeOf(elemId))
+
/**
*
- * @param toks
- * @param elemId
+ * @param comb
+ * @param syn
*/
- private def alreadyMarked(toks: Seq[NlpToken], elemId: String): Boolean = toks.forall(_.isTypeOf(elemId))
+ private def getParts(comb: Seq[Complex], syn: NCProbeSynonym): Seq[TokenData] =
+ comb.zip(syn.map(_.kind)).flatMap {
+ case (complex, kind) ⇒ if (complex.isToken) Some(complex.token → kind)
+ else None
+ }
+
+ private def mkCache(): mutable.Map[String, ArrayBuffer[Seq[NlpToken]]] =
+ mutable.HashMap.empty[
+ String,
+ mutable.ArrayBuffer[Seq[NlpToken]]
+ ].withDefault(_ ⇒ mutable.ArrayBuffer.empty[Seq[NlpToken]])
+
+ private def convert(tows: Seq[NCDslContent], ns: NCNlpSentence): Seq[NlpToken] =
+ (
+ tows.filter(_.isRight).map(_.right.get) ++
+ tows.filter(_.isLeft).map(_.left.get).
+ flatMap(w ⇒
+ ns.filter(
+ t ⇒
+ t.startCharIndex >= w.getStartCharIndex &&
+ t.endCharIndex <= w.getEndCharIndex
+ )
+ )
+ ).sortBy(_.startCharIndex)
@throws[NCE]
override def enrich(mdl: NCProbeModel, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit = {
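The rewritten enrich loop below drives all four matching phases from combos(ns), i.e. every contiguous token window of the sentence, longest first. A small standalone illustration of the combos helper defined above (the String instantiation is just for the example):

    def combos[T](toks: Seq[T]): Seq[Seq[T]] =
        (for (n ← toks.size until 0 by -1) yield toks.sliding(n)).flatten.map(p ⇒ p)

    // combos(Seq("a", "b", "c")) ==
    //     Seq(Seq("a", "b", "c"), Seq("a", "b"), Seq("b", "c"), Seq("a"), Seq("b"), Seq("c"))

The convert helper, in turn, maps the Either-based NCDslContent results of a sparse DSL match back to plain sentence tokens by character span and sorts them by start index.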
@@ -281,29 +313,23 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
"mdlId" → mdl.model.getId,
"txt" → ns.text
) { span ⇒
- val cache = mutable.HashSet.empty[Seq[Int]]
val req = NCRequestImpl(senMeta, ns.srvReqId)
val tokIdxs = ns.map(t ⇒ t → t.wordIndexes).toMap
- val senHasUserTokens = ns.exists(_.isUser)
+ val firstPhase = !ns.exists(_.isUser)
val matches = mutable.ArrayBuffer.empty[ElementMatch]
+ val cacheSparse = mkCache()
+ val cacheNotSparse = mkCache()
+
+ def addMatch(elm: NCElement, toks: Seq[NlpToken], syn: NCProbeSynonym, parts: Seq[TokenData]): Unit = {
+ val toksSet = toks.toSet
- def addMatch(elm: NCElement, toks: Seq[NlpToken], syn: NCProbeSynonym, parts: Seq[TokenData]): Boolean = {
- val tokensSet = toks.toSet
+ // TODO:
+ //require(!matches.exists(m ⇒ m.element.getId == elm.getId && toksSet.subsetOf(m.tokensSet)))
- if (!matches.exists(m ⇒ m.element.getId == elm.getId && tokensSet.subsetOf(m.tokensSet))) {
+ if (!matches.exists(m ⇒ m.element.getId == elm.getId && toksSet.subsetOf(m.tokensSet)))
matches += ElementMatch(elm, toks, syn, parts)
- true
- }
- else
- false
}
- def getParts(comb: Seq[Complex], syn: NCProbeSynonym): Seq[TokenData] =
- comb.zip(syn.map(_.kind)).flatMap {
- case (complex, kind) ⇒ if (complex.isToken) Some(complex.token → kind)
- else None
- }
-
/**
* Gets synonyms sorted in descending order by their weight (already prepared),
* i.e. first synonym in the sequence is the most important one.
@@ -350,132 +376,122 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
}).filter(_.exists(_.isToken)).map(ComplexSeq(_)).distinct
).seq
-
startScopedSpan("synsProc", span,
"srvReqId" → ns.srvReqId,
"mdlId" → mdl.model.getId,
"txt" → ns.text
) {
_ ⇒
- // 1. Simple, sparse.
- if (!senHasUserTokens)
- for ((elemId, syns) ← mdl.sparseSynonyms; syn ← syns)
- syn.trySparseMatch(ns).foreach(toks ⇒ addMatch(mdl.elements(elemId), toks, syn, Seq.empty))
-
- // 2. DSL, sparse.
- for ((elemId, syns) ← mdl.sparseSynonymsDsl; syn ← syns) {
- for (complex ← complexes) {
- val comb = complex.tokensComplexes
-
- syn.trySparseMatch(comb.map(_.data), req).foreach(tows ⇒ {
- val toks =
- tows.filter(_.isRight).map(_.right.get) ++
- tows.filter(_.isLeft).map(_.left.get).flatMap(w ⇒
- ns.filter(
- t ⇒
- t.startCharIndex >= w.getStartCharIndex &&
- t.endCharIndex <= w.getEndCharIndex
- )
- )
+ for (toks ← combos(ns)) {
+ val idxsSeq = toks.flatMap(tokIdxs)
+ val idxsSorted = idxsSeq.sorted
+ val idxs = idxsSeq.toSet
+ val idxMin = idxsSorted.head
+ val idxMax = idxsSorted.last
- addMatch(mdl.elements(elemId), toks.sortBy(_.startCharIndex), syn, getParts(comb, syn))
- })
- }
- }
+ lazy val sorted = idxsSorted.zipWithIndex.toMap
- for (toks ← combos(ns)) {
- val key = toks.map(_.index).sorted
-
- if (!cache.contains(key)) {
- cache += key
-
- val idxsSeq = toks.flatMap(tokIdxs)
- val idxsSorted = idxsSeq.sorted
- val idxs = idxsSeq.toSet
- val idxMin = idxsSorted.head
- val idxMax = idxsSorted.last
-
- lazy val sorted = idxsSorted.zipWithIndex.toMap
-
- lazy val dslCombs: Map[Int, Seq[Seq[Complex]]] =
- complexes.par.
- flatMap(complexSeq ⇒ {
- val rec = complexSeq.tokensComplexes.filter(_.isSubsetOf(idxMin, idxMax, idxs))
-
- // Drops without tokens (IDL part works with tokens).
- if (rec.nonEmpty)
- Some(
- rec ++
- (complexSeq.wordsIndexes.intersect(idxs) -- rec.flatMap(_.wordIndexes)).
- map(complexesWords)
- )
- else
- None
- }).
- map(_.sortBy(p ⇒ sorted(p.wordIndexes.head))).seq.groupBy(_.length)
+ lazy val dslCombs: Map[Int, Seq[Seq[Complex]]] =
+ complexes.par.
+ flatMap(complexSeq ⇒ {
+ val rec = complexSeq.tokensComplexes.filter(_.isSubsetOf(idxMin, idxMax, idxs))
- lazy val tokStems = toks.map(_.stem).mkString(" ")
+ // Drops without tokens (IDL part works with tokens).
+ if (rec.nonEmpty)
+ Some(
+ rec ++
+ (complexSeq.wordsIndexes.intersect(idxs) -- rec.flatMap(_.wordIndexes)).
+ map(complexesWords)
+ )
+ else
+ None
+ }).
+ map(_.sortBy(p ⇒ sorted(p.wordIndexes.head))).seq.groupBy(_.length)
+
+ lazy val tokStems = toks.map(_.stem).mkString(" ")
+
+ // Attempt to match each element.
+ for (elm ← mdl.elements.values) {
+ val elemId = elm.getId
+ val sparseEnabled = !cacheSparse(elemId).exists(_.contains(toks))
+ val notSparseEnabled = !cacheNotSparse(elemId).exists(_.contains(toks))
+ var foundSparse = false
+ var foundNotSparse = false
+
+ def addSparse(res: Seq[NlpToken], syn: NCProbeSynonym, parts: Seq[TokenData]): Unit = {
+ addMatch(elm, res, syn, parts)
+ cacheSparse(elemId) += toks
+ foundSparse = true
+ }
- // Attempt to match each element.
- for (elm ← mdl.elements.values if !alreadyMarked(toks, elm.getId)) {
- var found = false
+ def addNotSparse(syn: NCProbeSynonym, parts: Seq[TokenData]): Unit = {
+ addMatch(elm, toks, syn, parts)
+ cacheNotSparse(elemId) += toks
+ foundNotSparse = true
+ }
- def setFound(
- elm: NCElement,
- toks: Seq[NlpToken],
- syn: NCProbeSynonym,
- parts: Seq[TokenData]
- ): Unit = {
- addMatch(elm, toks, syn, parts)
+ // 1. Simple, sparse.
+ if (firstPhase && sparseEnabled)
+ for (syn ← mdl.sparseSynonyms.getOrElse(elemId, Seq.empty) if !foundSparse)
+ syn.trySparseMatch(toks) match {
+ case Some(res) ⇒ addSparse(res, syn, Seq.empty)
+ case None ⇒ // No-op.
+ }
- found = true
+ // 2. Simple, not sparse.
+ // Optimization - plain synonyms can be used only on first iteration
+ if (firstPhase && notSparseEnabled)
+ fastAccess(mdl.nonSparseSynonyms, elemId, toks.length) match {
+ case Some(h) ⇒
+ def tryMap(synsMap: Map[String, NCProbeSynonym], notFound: () ⇒ Unit): Unit =
+ synsMap.get(tokStems) match {
+ case Some(syn) ⇒ addNotSparse(syn, Seq.empty)
+ // TODO:
+ //if (!found)
+ // notFound()
+ case None ⇒ notFound()
+ }
+
+ def tryScan(synsSeq: Seq[NCProbeSynonym]): Unit =
+ for (syn ← synsSeq if !foundNotSparse)
+ if (syn.isMatch(toks))
+ addNotSparse(syn, Seq.empty)
+
+ tryMap(
+ h.txtDirectSynonyms,
+ () ⇒ {
+ tryScan(h.notTxtDirectSynonyms)
+
+ if (!foundNotSparse)
+ tryMap(
+ h.txtNotDirectSynonyms,
+ () ⇒ tryScan(h.notTxtNotDirectSynonyms)
+ )
+ }
+ )
+ case None ⇒ // No-op.
}
- // 3. Simple, not sparse.
- // Optimization - plain synonyms can be used only on first iteration
- if (mdl.nonSparseSynonyms.nonEmpty && !senHasUserTokens)
- fastAccess(mdl.nonSparseSynonyms, elm.getId, toks.length) match {
- case Some(h) ⇒
- def tryMap(synsMap: Map[String, NCProbeSynonym], notFound: () ⇒ Unit): Unit =
- synsMap.get(tokStems) match {
- case Some(syn) ⇒
- setFound(elm, toks, syn, Seq.empty)
-
- if (!found)
- notFound()
- case None ⇒ notFound()
- }
-
- def tryScan(synsSeq: Seq[NCProbeSynonym]): Unit =
- for (syn ← synsSeq if !found)
- if (syn.isMatch(toks))
- setFound(elm, toks, syn, Seq.empty)
-
- tryMap(
- h.txtDirectSynonyms,
- () ⇒ {
- tryScan(h.notTxtDirectSynonyms)
-
- if (!found)
- tryMap(
- h.txtNotDirectSynonyms,
- () ⇒ tryScan(h.notTxtNotDirectSynonyms)
- )
- }
- )
+ // 3. DSL, sparse.
+ if (sparseEnabled)
+ for (syn ← mdl.sparseSynonymsDsl.getOrElse(elemId, Seq.empty); complex ← complexes if !foundSparse) {
+ val comb = complex.tokensComplexes
+
+ syn.trySparseMatch(comb.map(_.data), req) match {
+ case Some(towsRes) ⇒ addSparse(convert(towsRes, ns), syn, getParts(comb, syn))
case None ⇒ // No-op.
}
+ }
- if (mdl.nonSparseSynonymsDsl.nonEmpty)
- // 4. DSL, non sparse.
- for (
- (len, seq) ← dslCombs;
- syn ← fastAccess(mdl.nonSparseSynonymsDsl, elm.getId, len).getOrElse(Seq.empty);
- comb ← seq if !found;
- data = comb.map(_.data)
- )
- if (syn.isMatch(data, req))
- setFound(elm, toks, syn, getParts(comb, syn))
+ if (notSparseEnabled) {
+ // 4. DSL, non sparse.
+ for (
+ (len, seq) ← dslCombs;
+ syn ← fastAccess(mdl.nonSparseSynonymsDsl, elemId, len).getOrElse(Seq.empty);
+ comb ← seq if !foundNotSparse
+ )
+ if (syn.isMatch(comb.map(_.data), req))
+ addNotSparse(syn, getParts(comb, syn))
}
}
}
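One Scala detail worth flagging when reading the cacheSparse/cacheNotSparse gating above: mutable.Map.withDefault, used by mkCache, produces its default on lookup but never stores it, so an append through a missing key mutates a buffer the map does not retain. A standalone sketch of the difference, with getOrElseUpdate as the storing alternative (not the committed code):

    import scala.collection.mutable

    val viaDefault = mutable.HashMap.empty[String, mutable.ArrayBuffer[Int]].
        withDefault(_ ⇒ mutable.ArrayBuffer.empty[Int])

    viaDefault("k") += 1
    assert(viaDefault.get("k").isEmpty)               // The default buffer was never stored.

    val viaUpdate = mutable.HashMap.empty[String, mutable.ArrayBuffer[Int]]

    viaUpdate.getOrElseUpdate("k", mutable.ArrayBuffer.empty[Int]) += 1
    assert(viaUpdate("k") == mutable.ArrayBuffer(1))  // The buffer was inserted, then mutated.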
@@ -585,7 +601,8 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
).getOrElse(throw new AssertionError(s"Custom model parser returned an invalid custom token: $w"))
)
- if (!alreadyMarked(matchedToks, elemId))
+ // TODO:
+ //if (!alreadyMarked(matchedToks, elemId))
mark(
ns,
elem = mdl.elements.getOrElse(elemId, throw new NCE(s"Custom model parser returned unknown element ID: $elemId")),