[incubator-nlpcraft] branch NLPCRAFT-443 updated: WIP.

sergeykamov Tue, 14 Sep 2021 11:56:38 -0700

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git



The following commit(s) were added to refs/heads/NLPCRAFT-443 by this push:
     new 60a5225  WIP.
60a5225 is described below

commit 60a5225e551c743015fe005c6fd8352776ec90b2
Author: Sergey Kamov <[email protected]>
AuthorDate: Tue Sep 14 21:55:39 2021 +0300

    WIP.
---
 .../nlpcraft/common/nlp/NCNlpSentenceToken.scala   |  4 +-
 .../mgrs/nlp/enrichers/limit/NCLimitEnricher.scala |  2 +-
 .../enrichers/relation/NCRelationEnricher.scala    |  2 +-
 .../mgrs/nlp/enrichers/sort/NCSortEnricher.scala   |  2 +-
 .../enrichers/stopword/NCStopWordEnricher.scala    | 66 ----------------------
 .../enrichers/stopword/NCStopWordEnricher.scala    |  1 +
 .../abstract/NCAbstractTokensIntentsSpec.scala     |  6 --
 .../model/abstract/NCAbstractTokensModel.scala     |  2 -
 .../model/properties/NCTokensPropertiesSpec.scala  |  2 -
 .../model/NCEnricherNestedModelSpec.scala          | 15 +----
 .../model/NCEnricherNestedModelSpec3.scala         |  1 -
 .../model/NCEnricherNestedModelSpec4.scala         |  2 -
 12 files changed, 7 insertions(+), 98 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceToken.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceToken.scala
index 00f1dd0..4b94b98 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceToken.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceToken.scala
@@ -194,9 +194,7 @@ case class NCNlpSentenceToken(
       * @param reason
       */
     def addStopReason(reason: NCNlpSentenceNote): Unit = stopsReasons += reason
-    /**
-      *
-      */
+
     override def toString: String =
         notes.toSeq.sortBy(t => (if (t.isNlp) 0 else 1, t.noteType)).mkString("NLP token [", "|", "]")
 }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
index 0286db3..7bad3c5 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
@@ -262,7 +262,7 @@ object NCLimitEnricher extends NCProbeEnricher {
 
             // Tries to grab tokens reverse way.
             // Example: A, B, C => ABC, BC, AB .. (BC will be processed first)
-            for (toks <- ns.tokenMix().sortBy(p => (-p.size, -p.head.index)) if validImportant(ns, toks)) {
+            for (toks <- ns.tokenMixWithStopWords().sortBy(p => (-p.size, -p.head.index)) if validImportant(ns, toks)) {
                 if (numsMap == null) {
                     numsMap = NCNumericManager.find(ns).map(p => p.tokens -> p).toMap
                     groupsMap = groupNums(ns, numsMap.values)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
index d44b4cb..fa564b9 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
@@ -163,7 +163,7 @@ object NCRelationEnricher extends NCProbeEnricher {
             // Example: A, B, C => ABC, AB, BC .. (AB will be processed first)
             val notes = mutable.HashSet.empty[NCNlpSentenceNote]
 
-            for (toks <- ns.tokenMix() if validImportant(ns, toks))
+            for (toks <- ns.tokenMixWithStopWords() if validImportant(ns, toks))
                 tryToMatch(toks) match {
                     case Some(m) =>
                         for (refNote <- m.refNotes if !restricted.contains(refNote)) {
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
index fdb6d9a..286c8b4 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
@@ -454,7 +454,7 @@ object NCSortEnricher extends NCProbeEnricher {
             val notes = mutable.HashSet.empty[NCNlpSentenceNote]
             val matches = mutable.ArrayBuffer.empty[Match]
 
-            for (toks <- ns.tokenMix() if validImportant(ns, toks)) {
+            for (toks <- ns.tokenMixWithStopWords() if validImportant(ns, toks)) {
                 tryToMatch(toks) match {
                     case Some(m)  =>
                         if (!matches.exists(_.isSubCase(m)) && !m.intersect(restricted)) {
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/stopword/NCStopWordEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/stopword/NCStopWordEnricher.scala
index 1af22cb..6d9b434 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/stopword/NCStopWordEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/stopword/NCStopWordEnricher.scala
@@ -216,67 +216,6 @@ object NCStopWordEnricher extends NCProbeEnricher {
         processCommonStops0(mdl, ns)
     }
 
-    /**
-      *
-      * @param mdl
-      * @param ns
-      */
-    private def eraseNlpStops(mdl: NCProbeModel, ns: NCNlpSentence): Unit = {
-        val impStops = mutable.HashSet.empty[NCNlpSentenceToken]
-
-        val allContSyns: Map[Int, Iterable[NCProbeSynonymsWrapper]] =
-            mdl.continuousSynonyms.values.flatMap(_.toSeq).groupBy(_._1).map(p => p._1 -> p._2.map(_._2))
-
-        for (toks <- ns.tokenMix(stopWords = true) if toks.exists(t => t.isStopWord && !impStops.contains(t))) {
-            allContSyns.get(toks.size) match {
-                case Some(ws) =>
-                    val stems = toks.map(_.stem).mkString(" ")
-
-                    if (ws.exists(w => w.txtDirectSynonyms.contains(stems) || w.txtNotDirectSynonyms.contains(stems)))
-                        impStops ++= toks.filter(_.isStopWord)
-
-                case None => // No-op.
-            }
-        }
-
-        val del = ns.tokens.filter(t => t.isStopWord && !impStops.contains(t))
-
-        impStops.foreach(t => ns.fixNote(t.getNlpNote, "stopWord" -> false))
-
-        if (del.nonEmpty) {
-            del.foreach(t => require(t.isNlp))
-
-            val delIdxs = del.flatMap(_.wordIndexes).sorted
-
-            val old = ns.tokens.clone()
-
-            ns.tokens.clear()
-            ns.tokens ++= old.filter(t => !del.contains(t)).zipWithIndex.map { case (t, idx) => t.clone(idx) }
-
-            ns.tokens.foreach(t => {
-                val tokNotes = notes(t)
-
-                tokNotes.foreach(n => {
-                    val tokIdxs = n.tokenIndexes.map(i => i - delIdxs.count(_ < i))
-                    val wordIdxs = n.wordIndexes.map(i => i - delIdxs.count(_ < i))
-
-                    t.remove(n)
-                    t.add(n.clone(tokIdxs, wordIdxs))
-                })
-            })
-
-            // TODO:
-            logger.info(
-                s"Stopwords deleted from sentence [" +
-                s"srvReqId=${ns.srvReqId}, " +
-                s"originText=${ns.text}, " +
-                s"fixedText=${ns.tokens.map(_.origText).mkString(" ")}, " +
-                s"stopWords=${del.map(p => s"${p.origText}(index=${p.wordIndexes.head})").mkString("|")}" +
-                s"]"
-            )
-        }
-    }
-
     @throws[NCE]
     override def enrich(mdl: NCProbeModel, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit = {
         require(isStarted)
@@ -297,12 +236,7 @@ object NCStopWordEnricher extends NCProbeEnricher {
                 processGeo(ns)
                 processDate(ns)
                 processNums(ns)
-
-                eraseNlpStops(mdl, ns)
-
                 processCommonStops(mdl, ns)
-
-                eraseNlpStops(mdl, ns)
             }
             else
                 ns.filter(_.isStopWord).foreach(t => ns.fixNote(t.getNlpNote, "stopWord" -> false))
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/stopword/NCStopWordEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/stopword/NCStopWordEnricher.scala
index 5a9169d..a4e396f 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/stopword/NCStopWordEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/stopword/NCStopWordEnricher.scala
@@ -683,6 +683,7 @@ object NCStopWordEnricher extends NCServerEnricher {
             "percent"
         ).map(NCNlpCoreManager.stem)
 
+
         // Stemmatization is done already by generator.
         possessiveWords = U.readTextGzipResource("stopwords/possessive_words.txt.gz", "UTF-8", logger).toSet
         firstWords = U.readTextGzipResource("stopwords/first_words.txt.gz", "UTF-8", logger).toSet
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensIntentsSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensIntentsSpec.scala
index c47661f..33ab3c3 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensIntentsSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensIntentsSpec.scala
@@ -21,9 +21,6 @@ import org.apache.nlpcraft.model.{NCIntent, NCIntentMatch, NCResult}
 import org.apache.nlpcraft.{NCTestContext, NCTestEnvironment}
 import org.junit.jupiter.api.Test
 
-import java.util
-import scala.jdk.CollectionConverters.{SetHasAsJava, SetHasAsScala}
-
 class NCAbstractTokensModelIntents extends NCAbstractTokensModel {
     @NCIntent("intent=wrapAnyWordIntent term(t)={# == 'wrapAnyWord'}")
     private def onWrapInternal(ctx: NCIntentMatch): NCResult = NCResult.text("OK")
@@ -36,9 +33,6 @@ class NCAbstractTokensModelIntents extends NCAbstractTokensModel {
 
     @NCIntent("intent=wrapWrapLimit term(t1)={# == 'wrapWrapLimit'} term(t2)={# == 'wrapAnyWord'}")
     private def wrapWrapLimit(ctx: NCIntentMatch): NCResult = NCResult.text("OK")
-
-    // TODO: w1 and w2 are stopwords according to src/main/resources/stopwords/stop_words.txt
-    override def getExcludedStopWords: util.Set[String] = (Set("w1", "w2") ++ super.getExcludedStopWords.asScala).asJava
 }
 
 @NCTestEnvironment(model = classOf[NCAbstractTokensModelIntents], startClient = true)
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensModel.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensModel.scala
index 15700fe..3fb8319 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensModel.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensModel.scala
@@ -38,6 +38,4 @@ class NCAbstractTokensModel extends NCModelAdapter(
     override def getAbstractTokens: util.Set[String] = Set("nlpcraft:num", "anyWord").asJava
     override def isPermutateSynonyms: Boolean = false
     override def isSparse: Boolean = false
-
-    override def getExcludedStopWords: util.Set[String] = Set("the").asJava
 }
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/properties/NCTokensPropertiesSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/properties/NCTokensPropertiesSpec.scala
index a60d762..0dd39bf 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/properties/NCTokensPropertiesSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/properties/NCTokensPropertiesSpec.scala
@@ -38,8 +38,6 @@ abstract class NCTokenPropertiesModelAbstract extends NCModelAdapter(
 
     override def isPermutateSynonyms: Boolean = true
     override def isSparse: Boolean = true
-
-    override def isStopWordsAllowed: Boolean = false
 }
 
 case class NCPropTestElement(
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala
index bf4d6f1..4d5d991 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala
@@ -92,22 +92,11 @@ class NCEnricherNestedModelSpec2 extends NCEnricherNestedModelSpec1 {
                 usr(text = "test tomorrow", id = "x3"),
                 nlp(text = "xxx"),
             ),
-            
             _ => checkExists(
                 "y the y",
-                usr(text = "y y", id = "y3")
-            ),
-            _ => checkExists(
-                "y the y",
-                usr(text = "y", id = "y1"),
-                usr(text = "y", id = "y1")
-            ),
-            _ => checkExists(
-                "y the y",
-                usr(text = "y", id = "y2"),
-                usr(text = "y", id = "y2")
+                usr(text = "y y", id = "y3"),
+                nlp(text = "the", isStop = true)
             ),
-
             _ => checkExists(
                 "y xxx y",
                 usr(text = "y y", id = "y3"),
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec3.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec3.scala
index 0b10a61..2303e30 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec3.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec3.scala
@@ -36,7 +36,6 @@ class NCNestedTestModel3 extends NCModelAdapter("nlpcraft.nested3.test.mdl", "Ne
 
     override def getAbstractTokens: util.Set[String] = Set("e1").asJava
     override def getEnabledBuiltInTokens: util.Set[String] = Set.empty[String].asJava
-    override def getExcludedStopWords: util.Set[String] = Set("a").asJava
 
     @NCIntent("intent=onE2 term(t1)={# == 'e2'}[12, 100]")
     def onAB(): NCResult = NCResult.text("OK")
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
index be643d5..27082f1 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
@@ -42,8 +42,6 @@ class NCNestedTestModel41 extends NCModelAdapter("nlpcraft.nested4.test.mdl", "N
 
     override def isPermutateSynonyms: Boolean = false
     override def isSparse: Boolean = false
-
-    override def getExcludedStopWords: util.Set[String] = Set("the", "a").asJava
 }
 
 /**

[incubator-nlpcraft] branch NLPCRAFT-443 updated: WIP.

Reply via email to