This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-443 by this push:
new 60a5225 WIP.
60a5225 is described below
commit 60a5225e551c743015fe005c6fd8352776ec90b2
Author: Sergey Kamov <[email protected]>
AuthorDate: Tue Sep 14 21:55:39 2021 +0300
WIP.
---
.../nlpcraft/common/nlp/NCNlpSentenceToken.scala | 4 +-
.../mgrs/nlp/enrichers/limit/NCLimitEnricher.scala | 2 +-
.../enrichers/relation/NCRelationEnricher.scala | 2 +-
.../mgrs/nlp/enrichers/sort/NCSortEnricher.scala | 2 +-
.../enrichers/stopword/NCStopWordEnricher.scala | 66 ----------------------
.../enrichers/stopword/NCStopWordEnricher.scala | 1 +
.../abstract/NCAbstractTokensIntentsSpec.scala | 6 --
.../model/abstract/NCAbstractTokensModel.scala | 2 -
.../model/properties/NCTokensPropertiesSpec.scala | 2 -
.../model/NCEnricherNestedModelSpec.scala | 15 +----
.../model/NCEnricherNestedModelSpec3.scala | 1 -
.../model/NCEnricherNestedModelSpec4.scala | 2 -
12 files changed, 7 insertions(+), 98 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceToken.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceToken.scala
index 00f1dd0..4b94b98 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceToken.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceToken.scala
@@ -194,9 +194,7 @@ case class NCNlpSentenceToken(
* @param reason
*/
def addStopReason(reason: NCNlpSentenceNote): Unit = stopsReasons += reason
- /**
- *
- */
+
override def toString: String =
notes.toSeq.sortBy(t => (if (t.isNlp) 0 else 1, t.noteType)).mkString("NLP token [", "|", "]")
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
index 0286db3..7bad3c5 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
@@ -262,7 +262,7 @@ object NCLimitEnricher extends NCProbeEnricher {
// Tries to grab tokens reverse way.
// Example: A, B, C => ABC, BC, AB .. (BC will be processed first)
- for (toks <- ns.tokenMix().sortBy(p => (-p.size, -p.head.index)) if validImportant(ns, toks)) {
+ for (toks <- ns.tokenMixWithStopWords().sortBy(p => (-p.size, -p.head.index)) if validImportant(ns, toks)) {
if (numsMap == null) {
numsMap = NCNumericManager.find(ns).map(p => p.tokens -> p).toMap
groupsMap = groupNums(ns, numsMap.values)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
index d44b4cb..fa564b9 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
@@ -163,7 +163,7 @@ object NCRelationEnricher extends NCProbeEnricher {
// Example: A, B, C => ABC, AB, BC .. (AB will be processed first)
val notes = mutable.HashSet.empty[NCNlpSentenceNote]
- for (toks <- ns.tokenMix() if validImportant(ns, toks))
+ for (toks <- ns.tokenMixWithStopWords() if validImportant(ns, toks))
tryToMatch(toks) match {
case Some(m) =>
for (refNote <- m.refNotes if !restricted.contains(refNote)) {
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
index fdb6d9a..286c8b4 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
@@ -454,7 +454,7 @@ object NCSortEnricher extends NCProbeEnricher {
val notes = mutable.HashSet.empty[NCNlpSentenceNote]
val matches = mutable.ArrayBuffer.empty[Match]
- for (toks <- ns.tokenMix() if validImportant(ns, toks)) {
+ for (toks <- ns.tokenMixWithStopWords() if validImportant(ns, toks)) {
tryToMatch(toks) match {
case Some(m) =>
if (!matches.exists(_.isSubCase(m)) && !m.intersect(restricted)) {
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/stopword/NCStopWordEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/stopword/NCStopWordEnricher.scala
index 1af22cb..6d9b434 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/stopword/NCStopWordEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/stopword/NCStopWordEnricher.scala
@@ -216,67 +216,6 @@ object NCStopWordEnricher extends NCProbeEnricher {
processCommonStops0(mdl, ns)
}
- /**
- *
- * @param mdl
- * @param ns
- */
- private def eraseNlpStops(mdl: NCProbeModel, ns: NCNlpSentence): Unit = {
- val impStops = mutable.HashSet.empty[NCNlpSentenceToken]
-
- val allContSyns: Map[Int, Iterable[NCProbeSynonymsWrapper]] =
- mdl.continuousSynonyms.values.flatMap(_.toSeq).groupBy(_._1).map(p => p._1 -> p._2.map(_._2))
-
- for (toks <- ns.tokenMix(stopWords = true) if toks.exists(t => t.isStopWord && !impStops.contains(t))) {
- allContSyns.get(toks.size) match {
- case Some(ws) =>
- val stems = toks.map(_.stem).mkString(" ")
-
- if (ws.exists(w => w.txtDirectSynonyms.contains(stems) || w.txtNotDirectSynonyms.contains(stems)))
- impStops ++= toks.filter(_.isStopWord)
-
- case None => // No-op.
- }
- }
-
- val del = ns.tokens.filter(t => t.isStopWord && !impStops.contains(t))
-
- impStops.foreach(t => ns.fixNote(t.getNlpNote, "stopWord" -> false))
-
- if (del.nonEmpty) {
- del.foreach(t => require(t.isNlp))
-
- val delIdxs = del.flatMap(_.wordIndexes).sorted
-
- val old = ns.tokens.clone()
-
- ns.tokens.clear()
- ns.tokens ++= old.filter(t => !del.contains(t)).zipWithIndex.map { case (t, idx) => t.clone(idx) }
-
- ns.tokens.foreach(t => {
- val tokNotes = notes(t)
-
- tokNotes.foreach(n => {
- val tokIdxs = n.tokenIndexes.map(i => i - delIdxs.count(_ < i))
- val wordIdxs = n.wordIndexes.map(i => i - delIdxs.count(_ < i))
-
- t.remove(n)
- t.add(n.clone(tokIdxs, wordIdxs))
- })
- })
-
- // TODO:
- logger.info(
- s"Stopwords deleted from sentence [" +
- s"srvReqId=${ns.srvReqId}, " +
- s"originText=${ns.text}, " +
- s"fixedText=${ns.tokens.map(_.origText).mkString(" ")}, " +
- s"stopWords=${del.map(p => s"${p.origText}(index=${p.wordIndexes.head})").mkString("|")}" +
- s"]"
- )
- }
- }
-
@throws[NCE]
override def enrich(mdl: NCProbeModel, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit = {
require(isStarted)
@@ -297,12 +236,7 @@ object NCStopWordEnricher extends NCProbeEnricher {
processGeo(ns)
processDate(ns)
processNums(ns)
-
- eraseNlpStops(mdl, ns)
-
processCommonStops(mdl, ns)
-
- eraseNlpStops(mdl, ns)
}
else
ns.filter(_.isStopWord).foreach(t => ns.fixNote(t.getNlpNote, "stopWord" -> false))
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/stopword/NCStopWordEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/stopword/NCStopWordEnricher.scala
index 5a9169d..a4e396f 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/stopword/NCStopWordEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/stopword/NCStopWordEnricher.scala
@@ -683,6 +683,7 @@ object NCStopWordEnricher extends NCServerEnricher {
"percent"
).map(NCNlpCoreManager.stem)
+
// Stemmatization is done already by generator.
possessiveWords = U.readTextGzipResource("stopwords/possessive_words.txt.gz", "UTF-8", logger).toSet
firstWords = U.readTextGzipResource("stopwords/first_words.txt.gz", "UTF-8", logger).toSet
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensIntentsSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensIntentsSpec.scala
index c47661f..33ab3c3 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensIntentsSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensIntentsSpec.scala
@@ -21,9 +21,6 @@ import org.apache.nlpcraft.model.{NCIntent, NCIntentMatch, NCResult}
import org.apache.nlpcraft.{NCTestContext, NCTestEnvironment}
import org.junit.jupiter.api.Test
-import java.util
-import scala.jdk.CollectionConverters.{SetHasAsJava, SetHasAsScala}
-
class NCAbstractTokensModelIntents extends NCAbstractTokensModel {
@NCIntent("intent=wrapAnyWordIntent term(t)={# == 'wrapAnyWord'}")
private def onWrapInternal(ctx: NCIntentMatch): NCResult = NCResult.text("OK")
@@ -36,9 +33,6 @@ class NCAbstractTokensModelIntents extends NCAbstractTokensModel {
@NCIntent("intent=wrapWrapLimit term(t1)={# == 'wrapWrapLimit'} term(t2)={# == 'wrapAnyWord'}")
private def wrapWrapLimit(ctx: NCIntentMatch): NCResult = NCResult.text("OK")
-
- // TODO: w1 and w2 are stopwords according to src/main/resources/stopwords/stop_words.txt
- override def getExcludedStopWords: util.Set[String] = (Set("w1", "w2") ++ super.getExcludedStopWords.asScala).asJava
}
@NCTestEnvironment(model = classOf[NCAbstractTokensModelIntents], startClient = true)
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensModel.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensModel.scala
index 15700fe..3fb8319 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensModel.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensModel.scala
@@ -38,6 +38,4 @@ class NCAbstractTokensModel extends NCModelAdapter(
override def getAbstractTokens: util.Set[String] = Set("nlpcraft:num", "anyWord").asJava
override def isPermutateSynonyms: Boolean = false
override def isSparse: Boolean = false
-
- override def getExcludedStopWords: util.Set[String] = Set("the").asJava
}
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/properties/NCTokensPropertiesSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/properties/NCTokensPropertiesSpec.scala
index a60d762..0dd39bf 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/properties/NCTokensPropertiesSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/properties/NCTokensPropertiesSpec.scala
@@ -38,8 +38,6 @@ abstract class NCTokenPropertiesModelAbstract extends NCModelAdapter(
override def isPermutateSynonyms: Boolean = true
override def isSparse: Boolean = true
-
- override def isStopWordsAllowed: Boolean = false
}
case class NCPropTestElement(
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala
index bf4d6f1..4d5d991 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec.scala
@@ -92,22 +92,11 @@ class NCEnricherNestedModelSpec2 extends NCEnricherNestedModelSpec1 {
usr(text = "test tomorrow", id = "x3"),
nlp(text = "xxx"),
),
-
_ => checkExists(
"y the y",
- usr(text = "y y", id = "y3")
- ),
- _ => checkExists(
- "y the y",
- usr(text = "y", id = "y1"),
- usr(text = "y", id = "y1")
- ),
- _ => checkExists(
- "y the y",
- usr(text = "y", id = "y2"),
- usr(text = "y", id = "y2")
+ usr(text = "y y", id = "y3"),
+ nlp(text = "the", isStop = true)
),
-
_ => checkExists(
"y xxx y",
usr(text = "y y", id = "y3"),
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec3.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec3.scala
index 0b10a61..2303e30 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec3.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec3.scala
@@ -36,7 +36,6 @@ class NCNestedTestModel3 extends NCModelAdapter("nlpcraft.nested3.test.mdl", "Ne
override def getAbstractTokens: util.Set[String] = Set("e1").asJava
override def getEnabledBuiltInTokens: util.Set[String] = Set.empty[String].asJava
- override def getExcludedStopWords: util.Set[String] = Set("a").asJava
@NCIntent("intent=onE2 term(t1)={# == 'e2'}[12, 100]")
def onAB(): NCResult = NCResult.text("OK")
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
index be643d5..27082f1 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala
@@ -42,8 +42,6 @@ class NCNestedTestModel41 extends NCModelAdapter("nlpcraft.nested4.test.mdl", "N
override def isPermutateSynonyms: Boolean = false
override def isSparse: Boolean = false
-
- override def getExcludedStopWords: util.Set[String] = Set("the", "a").asJava
}
/**