This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git


The following commit(s) were added to refs/heads/NLPCRAFT-443 by this push:
     new 0d2c798  WIP.
0d2c798 is described below

commit 0d2c7985d508e674ecbe6db6ebc312b9263aa2ce
Author: Sergey Kamov <[email protected]>
AuthorDate: Wed Sep 15 13:49:23 2021 +0300

    WIP.
---
 .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 55 ++++++++++++----------
 .../model/stop/NCStopWordsInsideSpec.scala         | 23 ++++++---
 2 files changed, 47 insertions(+), 31 deletions(-)

diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 6e6f7d1..4823a68 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -194,7 +194,10 @@ object NCModelEnricher extends NCProbeEnricher {
         toks.foreach(_.add(note))
 
         // For NLP elements.
-        toks.foreach(t => ns.fixNote(t.getNlpNote, "direct" -> direct))
+        toks.foreach(t => {
+            ns.fixNote(t.getNlpNote, "direct" -> direct)
+            ns.fixNote(t.getNlpNote, "stopWord" -> false)
+        })
     }
 
     /**
@@ -282,23 +285,26 @@ object NCModelEnricher extends NCProbeEnricher {
     }
 
     /**
-      * Gets all sequential permutations of given tokens.
       *
-      * For example, if buffer contains "a b c d" tokens, then this function 
will return the
-      * sequence of following token sequences in this order:
-      * "a b c d"
-      * "a b c"
-      * "b c d"
-      * "a b"
-      * "b c"
-      * "c d"
-      * "a"
-      * "b"
-      * "c"
-      * "d"
+      * @param toks
+      */
+    private def combosNlpTokens(toks: Seq[NlpToken]): Seq[(Seq[NlpToken], 
Seq[NlpToken])] =
+        combos(toks).flatMap(combo => {
+            val stops = combo.filter(_.isStopWord)
+
+            val stops4Delete = Range.inclusive(1, 
stops.size).flatMap(stops.combinations)
+
+            (Seq(combo) ++ stops4Delete.map(del => combo.filter(t => 
!del.contains(t)))).map(_ -> combo)
+        }).
+            toMap.
+            filter(_._1.nonEmpty).
+            groupBy(_._1).
+            map(p => p._1 -> p._2.values.minBy(p => (-p.size, p.head.index))).
+            sortBy(p => (-p._2.size, -p._1.size, -p._2.head.index, 
-p._1.head.index))
+
+    /**
       *
       * @param toks
-      * @return
       */
     private def combos[T](toks: Seq[T]): Seq[Seq[T]] =
         (for (n <- toks.size until 0 by -1) yield 
toks.sliding(n)).flatten.map(p => p)
@@ -451,7 +457,7 @@ object NCModelEnricher extends NCProbeEnricher {
             "enrich", parent, "srvReqId" -> ns.srvReqId, "mdlId" -> 
mdl.model.getId, "txt" -> ns.text
         ) { span =>
             val req = NCRequestImpl(senMeta, ns.srvReqId)
-            val combToks = combos(ns.toSeq)
+            val combToks = combosNlpTokens(ns.toSeq)
             lazy val ch = mkComplexes(mdl, ns)
 
             def execute(simpleEnabled: Boolean, idlEnabled: Boolean): Unit =
@@ -466,7 +472,9 @@ object NCModelEnricher extends NCProbeEnricher {
                     lazy val idlCache = mutable.HashSet.empty[Seq[Complex]]
 
                     for (
-                        toks <- combToks;
+                        // toksExt is part of sentence.
+                        // toks is toksExt, or toksExt with some subset of its stopwords 
removed. All stopword combinations are taken into account.
+                        (toks, toksExt) <- combToks;
                         idxs = toks.map(_.index);
                         e <- mdl.elements.values;
                         eId = e.getId;
@@ -489,7 +497,7 @@ object NCModelEnricher extends NCProbeEnricher {
                                             syns.get(tokStems) match {
                                                 case Some(s) =>
                                                     found = true
-                                                    add("simple continuous", 
ns, contCache, eId, greedy, toks, idxs, s)
+                                                    add("simple continuous", 
ns, contCache, eId, greedy, toksExt, idxs, s)
                                                 case None => notFound()
                                             }
 
@@ -497,7 +505,7 @@ object NCModelEnricher extends NCProbeEnricher {
                                             for (s <- syns if !found)
                                                 if (s.isMatch(toks)) {
                                                     found = true
-                                                    add("simple continuous 
scan", ns, contCache, eId, greedy, toks, idxs, s)
+                                                    add("simple continuous 
scan", ns, contCache, eId, greedy, toksExt, idxs, s)
                                                 }
 
                                         tryMap(
@@ -527,7 +535,6 @@ object NCModelEnricher extends NCProbeEnricher {
                             lazy val allCombs = mkCombinations(ch, toks, 
idlCache.toSet)
 
                             // 2.1 Continuous.
-
                             if (!mdl.hasSparseSynonyms) {
                                 var found = false
 
@@ -538,7 +545,7 @@ object NCModelEnricher extends NCProbeEnricher {
                                     data = comb.map(_.data)
                                 )
                                     if (s.isMatch(data, req)) {
-                                        add("IDL continuous", ns, contCache, 
eId, greedy, toks, idxs, s, toParts(data, s))
+                                        add("IDL continuous", ns, contCache, 
eId, greedy, toksExt, idxs, s, toParts(data, s))
 
                                         idlCache += comb
 
@@ -598,9 +605,9 @@ object NCModelEnricher extends NCProbeEnricher {
                     ||
                 (
                     n.tokenIndexes == toksIdxsSorted ||
-                        n.tokenIndexes.containsSlice(toksIdxsSorted) &&
-                        U.isContinuous(toksIdxsSorted) &&
-                        U.isContinuous(n.tokenIndexes)
+                    n.tokenIndexes.containsSlice(toksIdxsSorted) &&
+                    U.isContinuous(toksIdxsSorted) &&
+                    U.isContinuous(n.tokenIndexes)
                 )
             )
         ))
diff --git 
a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala
 
b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala
index 64ddf02..cac6983 100644
--- 
a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala
+++ 
b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/stop/NCStopWordsInsideSpec.scala
@@ -31,7 +31,7 @@ class NCStopWordsInsideModel extends 
NCModelAdapter("nlpcraft.test", "Test Model
     override def getElements: util.Set[NCElement] = 
Set(NCTestElement("complex", "a b"))
 
     @NCIntent("intent=i term={# == 'complex'}")
-    private def onI(): NCResult = NCResult.text("OK")
+    def onI(): NCResult = NCResult.text("OK")
 }
 
 /**
@@ -42,10 +42,19 @@ class NCStopWordsInsideSpec extends NCTestContext {
     @Test
     def test(): Unit = {
         checkIntent("a b", "i")
-        checkResult("a the b", "i")
-        checkResult("a , b", "i")
-        checkResult("a,,b", "i")
-        checkResult("a, ,b", "i")
-        checkResult("a, the,b", "i")
+        checkIntent("a the b", "i")
+        checkIntent("a , b", "i")
+        checkIntent("a, b", "i")
+        checkIntent("a, the b", "i")
+        checkIntent("a, the, b", "i")
     }
-}
\ No newline at end of file
+}
+
+class NCStopWordsInsideSparseModel extends NCStopWordsInsideModel {
+    override def isPermutateSynonyms: Boolean = true
+    override def isSparse: Boolean = true
+}
+
+@NCTestEnvironment(model = classOf[NCStopWordsInsideSparseModel], startClient 
= true)
+class NCStopWordsInsideSparseSpec extends NCStopWordsInsideSpec
+

Reply via email to