This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-287
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit facba336dd6c9af04654e2cde381ffe085b107e7
Author: Sergey Kamov <[email protected]>
AuthorDate: Wed Apr 14 18:14:31 2021 +0300

    WIP.
---
 .../apache/nlpcraft/probe/mgrs/NCProbeModel.scala  |   1 +
 .../nlpcraft/probe/mgrs/model/NCModelManager.scala |   8 -
 .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 193 ++++++++++-----------
 3 files changed, 95 insertions(+), 107 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
index a4e55a6..2670fb7 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
@@ -50,4 +50,5 @@ case class NCProbeModel(
 ) {
     def hasIdlSynonyms(elemId: String): Boolean = idlSynonyms.contains(elemId)
     def hasIdlSynonyms: Boolean = idlSynonyms.nonEmpty
+    def hasNoIdlSynonyms: Boolean = continuousSynonyms.nonEmpty || 
sparseSynonyms.nonEmpty
 }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
index 2b8313c..3bd052c 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
@@ -58,14 +58,6 @@ object NCModelManager extends NCService with DecorateAsScala {
             data.values.foreach(w ⇒ {
                 val mdl = w.model
 
-                // TODO:
-                val elemId = "col:orders_order_date"
-
-                
println("w.directSynonyms="+w.continuousSynonyms.getOrElse(elemId, 
Map.empty).mkString("\n"))
-                println("w.sparseSynonyms="+w.sparseSynonyms.getOrElse(elemId, 
Seq.empty).mkString("\n"))
-                println("w.idlSynonyms="+w.idlSynonyms.getOrElse(elemId, 
Seq.empty).mkString("\n"))
-                println
-
                 val contCnt = 
w.continuousSynonyms.flatMap(_._2.map(_._2.count)).sum
                 val sparseCnt = w.sparseSynonyms.map(_._2.size).sum
                 val allIdlSyns = w.idlSynonyms.values.flatten
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index f01619c..79bcf3f 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -147,6 +147,8 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
         ackStopped()
     }
 
+    def isComplex(mdl: NCProbeModel): Boolean = mdl.hasIdlSynonyms || 
!mdl.model.getParsers.isEmpty
+
     /**
       *
       * @param ns
@@ -156,7 +158,6 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
       * @param syn
       * @param metaOpt
       * @param parts
-      * @param allToksIdxs
       * @param continuous
       */
     private def mark(
@@ -164,11 +165,10 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
         elem: NCElement,
         toks: Seq[NlpToken],
         direct: Boolean,
-        syn: Option[Synonym],
-        metaOpt: Option[Map[String, Object]],
-        parts: Seq[TokType],
-        allToksIdxs: Seq[Int],
-        continuous: java.lang.Boolean
+        continuous: java.lang.Boolean,
+        syn: Option[Synonym] = None,
+        parts: Seq[TokType] = Seq.empty,
+        metaOpt: Option[Map[String, Object]] = None
     ): Unit = {
         val params = mutable.ArrayBuffer.empty[(String, AnyRef)]
 
@@ -176,7 +176,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
         params += "direct" → direct.asInstanceOf[AnyRef]
 
         // Internal usage.
-        params += "allToksIndexes" → allToksIdxs.asJava
+        params += "sortedTokensIndexes" → toks.map(_.index).sorted.asJava
         params += "continuous" → continuous
 
         syn match {
@@ -287,20 +287,16 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                         )
 
                         // Checks element's tokens.
-                        val idxs = matchedToks.map(_.index)
-                        val continuous = U.isContinuous(idxs.sorted)
+                        val idxs = matchedToks.map(_.index).sorted
 
-                        if (!alreadyMarked(matchedToks, idxs, continuous, 
elemId))
+                        if (!alreadyMarked(ns, elemId, matchedToks, idxs))
                             mark(
                                 ns,
                                 elem = mdl.elements.getOrElse(elemId, throw 
new NCE(s"Custom model parser returned unknown element ID: $elemId")),
                                 toks = matchedToks,
                                 direct = true,
-                                syn = None,
-                                metaOpt = Some(e.getMetadata.asScala),
-                                parts = Seq.empty,
-                                idxs,
-                                continuous
+                                U.isContinuous(idxs),
+                                metaOpt = Some(e.getMetadata.asScala)
                             )
                     })
             }
@@ -333,39 +329,6 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
 
     /**
       *
-      * @param toks
-      * @param elemId
-      */
-    private def alreadyMarked(toks: Seq[NlpToken], allToksIndexes: Seq[Int], 
continuous: Boolean, elemId: String): Boolean = {
-        toks.forall(t ⇒ t.isTypeOf(elemId)) &&
-        toks.head.filter(_.noteType == elemId).exists(n ⇒ n.tokenIndexes.toSet 
== toks.map(_.index).toSet)
-        toks.flatten.exists(n ⇒
-            n.noteType == elemId && {
-                if (n.data("continuous").asInstanceOf[Boolean])
-                    false
-                else {
-                    if (continuous)
-                        false
-                    else
-                        
n.data("allToksIndexes").asInstanceOf[JList[Int]].asScala.containsSlice(allToksIndexes)
-                }
-            }
-        )
-    }
-
-    /**
-      *
-      * @param seq
-      * @param s
-      */
-    private def toPartsComplex(seq: Seq[Complex], s: Synonym): Seq[TokType] =
-        seq.zip(s.map(_.kind)).flatMap {
-            case (complex, kind) ⇒ if (complex.isToken) Some(complex.token → 
kind)
-            else None
-        }
-
-    /**
-      *
       * @param seq
       * @param s
       */
@@ -468,6 +431,46 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
             }).seq
     }
 
+    private def add(
+        ns: NCNlpSentence,
+        contCache: mutable.Map[String, ArrayBuffer[Seq[Int]]],
+        typ: String,
+        elm: NCElement,
+        res: Seq[NlpToken],
+        allToksIdxs: Seq[Int],
+        syn: Synonym,
+        parts: Seq[TokType] = Seq.empty)
+    : Unit = {
+        val resIdxs = res.map(_.index)
+
+        val continuous = U.isContinuous(resIdxs.sorted)
+
+        if (continuous && resIdxs == allToksIdxs)
+            contCache(elm.getId) += allToksIdxs
+
+        val ok = !alreadyMarked(ns, elm.getId, res, allToksIdxs)
+
+        if (ok) {
+            val direct = syn.isDirect && U.isIncreased(resIdxs)
+
+            mark(ns, elm, res, direct, continuous, syn = Some(syn), parts)
+        }
+
+        if (DEEP_DEBUG && ok)
+            println(
+                s"${if (ok) "Added" else "Skipped"} element [" +
+                    s"id=${elm.getId}, " +
+                    s"type=$typ, " +
+                    s"text='${res.map(_.origText).mkString(" ")}', " +
+                    s"indexes=${resIdxs.mkString("[", ",", "]")}, " +
+                    s"allTokensIndexes=${allToksIdxs.mkString("[", ",", "]")}, 
" +
+                    s"continuous=$continuous, " +
+                    s"synonym=$syn" +
+                    s"]"
+            )
+    }
+
+
     /**
       *
       * @param mdl
@@ -493,41 +496,9 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
             if (DEEP_DEBUG)
                 println(s"Execution started [simpleEnabled=$simpleEnabled, 
idlEnabled=$idlEnabled]")
 
-            val contCache =
-                mutable.HashMap.empty[String, mutable.ArrayBuffer[Seq[Int]]] ++
-                    mdl.elements.keys.map(k ⇒ k → 
mutable.ArrayBuffer.empty[Seq[Int]])
+            val contCache = mutable.HashMap.empty ++ mdl.elements.keys.map(k ⇒ 
k → mutable.ArrayBuffer.empty[Seq[Int]])
             lazy val idlCache = mutable.HashSet.empty[Seq[Complex]]
 
-            def add(typ: String, elm: NCElement, res: Seq[NlpToken], 
allToksIdxs: Seq[Int], s: Synonym, parts: Seq[TokType] = Seq.empty): Unit = {
-                val resIdxs = res.map(_.index)
-
-                val continuous = U.isContinuous(resIdxs.sorted)
-
-                if (continuous && resIdxs == allToksIdxs)
-                    contCache(elm.getId) += allToksIdxs
-
-                val added = !alreadyMarked(res, allToksIdxs, continuous, 
elm.getId)
-
-                if (added) {
-                    val direct = s.isDirect && U.isIncreased(resIdxs)
-
-                    mark(ns, elm, res, direct, syn = Some(s), metaOpt = None, 
parts, allToksIdxs, continuous)
-                }
-
-                if (DEEP_DEBUG)
-                    println(
-                        s"${if (added) "Added" else "Skipped"} element [" +
-                        s"id=${elm.getId}, " +
-                        s"type=$typ, " +
-                        s"text='${res.map(_.origText).mkString(" ")}', " +
-                        s"indexes=${resIdxs.mkString("[", ",", "]")}, " +
-                        s"allTokensIndexes=${allToksIdxs.mkString("[", ",", 
"]")}, " +
-                        s"continuous=$continuous, " +
-                        s"synonym=$s" +
-                        s"]"
-                    )
-            }
-
             for (toks ← combosToks) {
                 val tokIdxs = toks.map(_.index)
                 lazy val tokStems = toks.map(_.stem).mkString(" ")
@@ -537,8 +508,8 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                     elm ← mdl.elements.values;
                     elemId = elm.getId
                     if
-                    !contCache(elemId).exists(_.containsSlice(tokIdxs)) &&
-                    !alreadyMarked(toks, tokIdxs, continuous = true, elemId)   
// Checks whole tokens slice.
+                        !contCache(elemId).exists(_.containsSlice(tokIdxs)) &&
+                        !alreadyMarked(ns, elemId, toks, tokIdxs)
                 ) {
                     // 1. SIMPLE.
                     if (simpleEnabled && (if (idlEnabled) 
mdl.hasIdlSynonyms(elemId) else !mdl.hasIdlSynonyms(elemId))) {
@@ -551,7 +522,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                                     syns.get(tokStems) match {
                                         case Some(s) ⇒
                                             found = true
-                                            add("simple continuous", elm, 
toks, tokIdxs, s)
+                                            add(ns, contCache,"simple 
continuous", elm, toks, tokIdxs, s)
                                         case None ⇒ notFound()
                                     }
 
@@ -559,7 +530,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                                     for (s ← syns if !found)
                                         if (s.isMatch(toks)) {
                                             found = true
-                                            add("simple continuous scan", elm, 
toks, tokIdxs, s)
+                                            add(ns, contCache, "simple 
continuous scan", elm, toks, tokIdxs, s)
                                         }
 
                                 tryMap(
@@ -575,23 +546,24 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                         }
 
                         // 1.2 Sparse.
-                        for (s ← get(mdl.sparseSynonyms, elemId))
-                            s.sparseMatch(toks) match {
-                                case Some(res) ⇒ add("simple sparse", elm, 
res, tokIdxs, s)
-                                case None ⇒ // No-op.
-                            }
+                        if (!found)
+                            for (s ← get(mdl.sparseSynonyms, elemId))
+                                s.sparseMatch(toks) match {
+                                    case Some(res) ⇒ add(ns, contCache, 
"simple sparse", elm, res, tokIdxs, s)
+                                    case None ⇒ // No-op.
+                                }
                     }
 
                     // 2. IDL.
                     if (idlEnabled) {
-                        val idlCombs = mkComplexCombinations(ch, toks, 
idlCache.toSet)
+                        lazy val idlCombs = mkComplexCombinations(ch, toks, 
idlCache.toSet)
 
                         for (s ← get(mdl.idlSynonyms, elemId); comb ← idlCombs)
                             s.idlMatch(comb.map(_.data), req) match {
                                 case Some(res) ⇒
                                     val typ = if (s.sparse) "IDL sparse" else 
"IDL continuous"
 
-                                    add(typ, elm, toTokens(res, ns), tokIdxs, 
s, toParts(res, s))
+                                    add(ns, contCache, typ, elm, toTokens(res, 
ns), tokIdxs, s, toParts(res, s))
 
                                     idlCache += comb
                                 case None ⇒ // No-op.
@@ -611,22 +583,45 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
 
             lazy val h = mkComplexes(mdl, ns)
 
-            val idlEnabled = mdl.hasIdlSynonyms
-
             if (ns.firstProbePhase) {
                 ns.firstProbePhase = false
 
-                execute(mdl, ns, combToks, simpleEnabled = true, idlEnabled = 
false, req, h, parent)
-                execute(mdl, ns, combToks, simpleEnabled = true, idlEnabled, 
req, h, parent)
+                if (mdl.hasNoIdlSynonyms)
+                    execute(mdl, ns, combToks, simpleEnabled = true, 
idlEnabled = false, req, h, parent)
+                execute(mdl, ns, combToks, simpleEnabled = 
mdl.hasNoIdlSynonyms, mdl.hasIdlSynonyms, req, h, parent)
             }
             else {
-                if (idlEnabled)
-                    execute(mdl, ns, combToks, simpleEnabled = false, 
idlEnabled, req, h, parent)
+                if (mdl.hasIdlSynonyms)
+                    execute(mdl, ns, combToks, simpleEnabled = false, 
idlEnabled = true, req, h, parent)
             }
 
             processParsers(mdl, ns, span, req)
         }
     }
 
-    def isComplex(mdl: NCProbeModel): Boolean = mdl.hasIdlSynonyms || 
!mdl.model.getParsers.isEmpty
+    /**
+      *
+      * @param ns
+      * @param elemId
+      * @param toks
+      * @param allSortedSliceIdxs
+      */
+    private def alreadyMarked(ns: NCNlpSentence, elemId: String, toks: 
Seq[NlpToken], allSortedSliceIdxs: Seq[Int]): Boolean = {
+        lazy val toksIdxsSorted = toks.map(_.index).sorted
+        lazy val isCont = U.isContinuous(toksIdxsSorted)
+
+        ns.flatten.exists(
+            n ⇒
+                n.noteType == elemId &&
+                {
+                    lazy val nToksIdxsSorted = 
n.data[JList[Int]]("sortedTokensIndexes").asScala
+
+                    n.data[Boolean]("continuous") && 
allSortedSliceIdxs.containsSlice(nToksIdxsSorted) ||
+                        {
+                            nToksIdxsSorted == toksIdxsSorted ||
+                            isCont && U.isContinuous(nToksIdxsSorted) && 
nToksIdxsSorted.containsSlice(toksIdxsSorted)
+                        }
+                }
+        )
+    }
 }
\ No newline at end of file

Reply via email to