This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-30
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-30 by this push:
new fcd204e WIP.
fcd204e is described below
commit fcd204e5d94997f11feee53c9ebacb4934ea59fc
Author: Sergey Kamov <[email protected]>
AuthorDate: Wed Apr 29 22:21:54 2020 +0300
WIP.
---
.../org/apache/nlpcraft/probe/NCProbeBoot.scala | 6 +-
.../nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala | 62 +-----
.../probe/mgrs/nlp/NCProbeEnrichmentManager.scala | 81 +++-----
.../mgrs/nlp/enrichers/limit/NCLimitEnricher.scala | 44 ++++-
.../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 6 +-
.../enrichers/relation/NCRelationEnricher.scala | 12 +-
.../mgrs/nlp/enrichers/sort/NCSortEnricher.scala | 13 +-
.../NCEnricherProcessor.scala} | 220 ++++++++++-----------
8 files changed, 202 insertions(+), 242 deletions(-)
diff --git a/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala
b/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala
index caba0bb..77ee49a 100644
--- a/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala
+++ b/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala
@@ -41,7 +41,7 @@ import
org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnrichmentManager
import
org.apache.nlpcraft.probe.mgrs.nlp.enrichers.dictionary.NCDictionaryEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.limit.NCLimitEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model.NCModelEnricher
-import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.post.NCPostEnrichProcessor
+import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.utils.NCEnricherProcessor
import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.relation.NCRelationEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.sort.NCSortEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.stopword.NCStopWordEnricher
@@ -425,6 +425,7 @@ private [probe] object NCProbeBoot extends LazyLogging with
NCOpenCensusTrace {
NCModelManager.start(span)
NCCommandManager.start(span)
NCDictionaryManager.start(span)
+ NCEnricherProcessor.start(span)
NCStopWordEnricher.start(span)
NCModelEnricher.start(span)
NCLimitEnricher.start(span)
@@ -432,7 +433,6 @@ private [probe] object NCProbeBoot extends LazyLogging with
NCOpenCensusTrace {
NCRelationEnricher.start(span)
NCSuspiciousNounsEnricher.start(span)
NCValidateManager.start(span)
- NCPostEnrichProcessor.start(span)
NCDictionaryEnricher.start(span)
NCConversationManager.start(span)
NCProbeEnrichmentManager.start(span)
@@ -451,8 +451,8 @@ private [probe] object NCProbeBoot extends LazyLogging with
NCOpenCensusTrace {
NCConnectionManager.stop(span)
NCProbeEnrichmentManager.stop(span)
NCConversationManager.stop(span)
+ NCEnricherProcessor.stop(span)
NCDictionaryEnricher.stop(span)
- NCPostEnrichProcessor.stop(span)
NCValidateManager.stop(span)
NCSuspiciousNounsEnricher.stop(span)
NCRelationEnricher.stop(span)
diff --git
a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala
b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala
index dda8875..15acb12 100644
--- a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala
+++ b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala
@@ -25,7 +25,7 @@ import org.apache.nlpcraft.common.nlp._
import org.apache.nlpcraft.common.{NCService, _}
import org.apache.nlpcraft.probe.mgrs.NCModelDecorator
-import scala.collection.{Map, Seq}
+import scala.collection.Map
import scala.language.implicitConversions
/**
@@ -33,66 +33,6 @@ import scala.language.implicitConversions
*/
abstract class NCProbeEnricher extends NCService with LazyLogging {
/**
- * Checks whether important tokens deleted as stopwords or not.
- *
- * @param ns Sentence.
- * @param toks Tokens in which some stopwords can be deleted.
- * @param isImportant Token important criteria.
- */
- protected def validImportant(
- ns: NCNlpSentence,
- toks: Seq[NCNlpSentenceToken],
- isImportant: NCNlpSentenceToken ⇒ Boolean
- ): Boolean = {
- val idxs = toks.map(_.index)
-
- require(idxs == idxs.sorted)
-
- val toks2 = ns.slice(idxs.head, idxs.last + 1)
-
- toks.length == toks2.length || toks.count(isImportant) ==
toks2.count(isImportant)
- }
-
- /**
- *
- * @param toks
- * @param pred
- */
- protected def getCommonNotes(
- toks: Seq[NCNlpSentenceToken], pred: Option[NCNlpSentenceNote ⇒
Boolean] = None
- ): Set[String] =
- if (toks.isEmpty)
- Set.empty
- else {
- def getCommon(sortedToks: Seq[NCNlpSentenceToken]): Set[String] = {
- require(sortedToks.nonEmpty)
-
- val h = sortedToks.head
- val l = sortedToks.last
-
- val notes = pred match {
- case Some(p) ⇒ h.filter(p)
- case None ⇒ h.map(p ⇒ p)
- }
-
- notes.filter(!_.isNlp).filter(n ⇒ h.index == n.tokenFrom &&
l.index == n.tokenTo).map(_.noteType).toSet
- }
-
- var sortedToks = toks.sortBy(_.index)
-
- var res = getCommon(sortedToks)
-
- if (res.isEmpty) {
- sortedToks = sortedToks.filter(!_.isStopWord)
-
- if (sortedToks.nonEmpty)
- res = getCommon(sortedToks)
- }
-
- if (res.isEmpty) Set.empty else res
- }
-
- /**
*
* Processes this NLP sentence.
*
diff --git
a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
index 520d079..bdfebb5 100644
---
a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
+++
b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
@@ -42,11 +42,11 @@ import org.apache.nlpcraft.probe.mgrs.model.NCModelManager
import
org.apache.nlpcraft.probe.mgrs.nlp.enrichers.dictionary.NCDictionaryEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.limit.NCLimitEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model.NCModelEnricher
-import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.post._
import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.relation.NCRelationEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.sort.NCSortEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.stopword.NCStopWordEnricher
import
org.apache.nlpcraft.probe.mgrs.nlp.enrichers.suspicious.NCSuspiciousNounsEnricher
+import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.utils._
import org.apache.nlpcraft.probe.mgrs.nlp.impl._
import org.apache.nlpcraft.probe.mgrs.nlp.validate._
@@ -182,52 +182,6 @@ object NCProbeEnrichmentManager extends NCService with
NCOpenCensusModelStats {
}
/**
- *
- * @param nlpSen
- * @param notes1
- * @param notes2
- * @param stopIdxs
- * @param typ
- */
- private def squeeze(
- nlpSen: NCNlpSentence,
- notes1: Seq[NCNlpSentenceNote],
- notes2: Seq[NCNlpSentenceNote],
- stopIdxs: Seq[Int],
- typ: String
- ): Boolean = {
- // Filters notes and adds unique key.
- def toMap(notes: Seq[NCNlpSentenceNote], filter: NCNlpSentenceNote ⇒
Boolean):
- Map[NCNlpSentenceNote, Any] =
- notes.
- filter(filter).
- map(p ⇒ p → NCPostEnrichProcessor.getUniqueKey(p, withIndexes
= false)).
- toMap
-
- // One possible difference - stopwords indexes.
- def equalOrNearly(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote):
Boolean = {
- val set1 = n1.wordIndexes.toSet
- val set2 = n2.wordIndexes.toSet
-
- set1 == set2 || set1.subsetOf(set2) &&
set2.diff(set1).forall(stopIdxs.contains)
- }
-
- val notesTyp1 = toMap(notes1, (n: NCNlpSentenceNote) ⇒ n.noteType ==
typ)
- val diff = toMap(notes2, (n: NCNlpSentenceNote) ⇒ n.noteType == typ &&
!notesTyp1.contains(n))
-
- // New notes are same as already prepared with one difference -
- // their tokens contain or not stopwords.
- // Such "new" tokens can be deleted.
- val diffRedundant = diff.filter { case (n1, key1) ⇒
- notesTyp1.exists { case (n2, key2) ⇒ key1 == key2 &&
(equalOrNearly(n2, n1) || equalOrNearly(n1, n2)) }
- }.map { case (n, _) ⇒ n }
-
- diffRedundant.foreach(nlpSen.removeNote)
-
- diffRedundant.size == diff.size
- }
-
- /**
* Processes 'ask' request from probe server.
*
* @param srvReqId Server request ID.
@@ -438,14 +392,35 @@ object NCProbeEnrichmentManager extends NCService with
NCOpenCensusModelStats {
var same = notes1 == notes2
- if (!same)
+ if (!same) {
+ def squeeze(typ: String): Boolean = {
+ val diff = notes2.filter(n ⇒ !notes1.contains(n))
+
+ val diffRedundant = diff.flatMap(n2 ⇒
+ notes1.find(n1 ⇒
NCEnricherProcessor.sameForSentence(n1, n2, nlpSen)) match {
+ case Some(similar) ⇒ Some(n2 → similar)
+ case None ⇒ None
+ }
+ )
+
+ diffRedundant.foreach { case (del, similar) ⇒
+ // TODO: log level
+ logger.info(s"Redundant note removed: $del,
because similar exists: $similar")
+
+ nlpSen.removeNote(del)
+ }
+
+ diffRedundant.size == diff.size
+ }
+
h.enricher match {
- case NCSortEnricher ⇒ same = squeeze(nlpSen,
notes1, notes2, stopIdxs,"nlpcraft:sort")
- case NCLimitEnricher ⇒ same = squeeze(nlpSen,
notes1, notes2, stopIdxs,"nlpcraft:limit")
- case NCRelationEnricher ⇒ same = squeeze(nlpSen,
notes1, notes2, stopIdxs,"nlpcraft:relation")
+ case NCSortEnricher ⇒ same =
squeeze("nlpcraft:sort")
+ case NCLimitEnricher ⇒ same =
squeeze("nlpcraft:limit")
+ case NCRelationEnricher ⇒ same =
squeeze("nlpcraft:relation")
case _ ⇒ // No-op.
}
+ }
h.enricher → same
}).toMap
@@ -462,7 +437,7 @@ object NCProbeEnrichmentManager extends NCService with
NCOpenCensusModelStats {
logger.info(s"Enrichment finished [step=$step]")
}
- NCPostEnrichProcessor.collapse(mdlDec, nlpSen.clone(), span).
+ NCEnricherProcessor.collapse(mdlDec, nlpSen.clone(), span).
// Sorted to support deterministic logs.
sortBy(p ⇒
p.map(p ⇒ {
@@ -503,7 +478,7 @@ object NCProbeEnrichmentManager extends NCService with
NCOpenCensusModelStats {
val varsNlp = sensSeq.map(_.toSeq)
val req = NCRequestImpl(meta, srvReqId)
- var senVars = NCPostEnrichProcessor.convert(mdlDec, srvReqId, varsNlp)
+ var senVars = NCEnricherProcessor.convert(mdlDec, srvReqId, varsNlp)
// Sentence variants can be filtered by model.
val fltSenVars: Seq[(Seq[NCToken], Int)] =
diff --git
a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
index 87f02c9..861de12 100644
---
a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
+++
b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
@@ -27,6 +27,7 @@ import org.apache.nlpcraft.common.nlp.{NCNlpSentence,
NCNlpSentenceNote, NCNlpSe
import org.apache.nlpcraft.common.{NCE, NCService}
import org.apache.nlpcraft.probe.mgrs.NCModelDecorator
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
+import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.utils.NCEnricherProcessor
import scala.collection.JavaConverters._
import scala.collection.{Map, Seq, mutable}
@@ -150,7 +151,6 @@ object NCLimitEnricher extends NCProbeEnricher {
s"$CD of",
s"$CD <POST_WORDS>",
s"<POST_WORDS> $CD"
-
)
private final val LIMITS: Seq[String] = {
@@ -191,14 +191,16 @@ object NCLimitEnricher extends NCProbeEnricher {
"srvReqId" → ns.srvReqId,
"modelId" → mdl.model.getId,
"txt" → ns.text) { _ ⇒
+ val notes = mutable.HashSet.empty[NCNlpSentenceNote]
val numsMap =
NCNumericManager.find(ns).filter(_.unit.isEmpty).map(p ⇒ p.tokens → p).toMap
val groupsMap = groupNums(ns, numsMap.values)
+
def isImportant(t: NCNlpSentenceToken): Boolean = t.isUser ||
TECH_WORDS.contains(t.stem)
// Tries to grab tokens reverse way.
// Example: A, B, C ⇒ ABC, BC, AB .. (BC will be processed first)
for (toks ← ns.tokenMixWithStopWords().sortBy(p ⇒ (-p.size,
-p.head.index))
- if validImportant(ns, toks, isImportant)
+ if NCEnricherProcessor.validImportant(ns, toks, isImportant)
)
tryToMatch(numsMap, groupsMap, toks) match {
case Some(m) ⇒
@@ -214,11 +216,47 @@ object NCLimitEnricher extends NCProbeEnricher {
val note =
NCNlpSentenceNote(m.matched.map(_.index), TOK_ID, params: _*)
- m.matched.foreach(_.add(note))
+ if (!notes.exists(n ⇒
NCEnricherProcessor.sameForSentence(note, n, ns))) {
+ notes += note
+
+ m.matched.foreach(_.add(note))
+ }
}
case None ⇒ // No-op.
}
}
+
+ /**
+ *
+ * @param toks
+ */
+ private def getCommonNotes(toks: Seq[NCNlpSentenceToken]): Set[String] =
+ if (toks.isEmpty)
+ Set.empty
+ else {
+ def getCommon(sortedToks: Seq[NCNlpSentenceToken]): Set[String] = {
+ require(sortedToks.nonEmpty)
+
+ val h = sortedToks.head
+ val l = sortedToks.last
+
+ h.filter(!_.isNlp).filter(n ⇒ h.index == n.tokenFrom &&
l.index == n.tokenTo).map(_.noteType).toSet
+ }
+
+ var sortedToks = toks.sortBy(_.index)
+
+ var res = getCommon(sortedToks)
+
+ if (res.isEmpty) {
+ sortedToks = sortedToks.filter(!_.isStopWord)
+
+ if (sortedToks.nonEmpty)
+ res = getCommon(sortedToks)
+ }
+
+ if (res.isEmpty) Set.empty else res
+ }
+
/**
*
* @param numsMap
diff --git
a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 42f743b..e8b0f0d 100644
---
a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++
b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -25,7 +25,7 @@ import org.apache.nlpcraft.common._
import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken, _}
import org.apache.nlpcraft.model._
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
-import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.post.NCPostEnrichProcessor
+import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.utils.NCEnricherProcessor
import org.apache.nlpcraft.probe.mgrs.nlp.impl.NCRequestImpl
import org.apache.nlpcraft.probe.mgrs.{NCModelDecorator, NCSynonym}
@@ -377,10 +377,10 @@ object NCModelEnricher extends NCProbeEnricher with
DecorateAsScala {
if (collapsedSens == null)
collapsedSens =
- NCPostEnrichProcessor.convert(
+ NCEnricherProcessor.convert(
mdl,
ns.srvReqId,
-
NCPostEnrichProcessor.collapse(mdl, ns.clone(), span).map(_.tokens)
+ NCEnricherProcessor.collapse(mdl,
ns.clone(), span).map(_.tokens)
)
if (seq == null)
diff --git
a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
index dab01e9..fbd9be9 100644
---
a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
+++
b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
@@ -26,6 +26,7 @@ import org.apache.nlpcraft.common.nlp.{NCNlpSentence,
NCNlpSentenceNote, NCNlpSe
import org.apache.nlpcraft.common.{NCE, NCService}
import org.apache.nlpcraft.probe.mgrs.NCModelDecorator
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
+import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.utils.NCEnricherProcessor
import scala.collection.JavaConverters._
import scala.collection.{Map, Seq, mutable}
@@ -126,11 +127,12 @@ object NCRelationEnricher extends NCProbeEnricher {
"txt" → ns.text) { _ ⇒
// Tries to grab tokens direct way.
// Example: A, B, C ⇒ ABC, AB, BC .. (AB will be processed first)
+ val notes = mutable.HashSet.empty[NCNlpSentenceNote]
def isImportant(t: NCNlpSentenceToken): Boolean =
t.exists(n ⇒ n.isUser || REL_TYPES.contains(n.noteType)) ||
ALL_FUNC_STEMS.contains(t.stem)
- for (toks ← ns.tokenMixWithStopWords() if validImportant(ns, toks,
isImportant))
+ for (toks ← ns.tokenMixWithStopWords() if
NCEnricherProcessor.validImportant(ns, toks, isImportant))
tryToMatch(toks) match {
case Some(m) ⇒
for (refNote ← m.refNotes) {
@@ -142,9 +144,13 @@ object NCRelationEnricher extends NCProbeEnricher {
"note" → refNote
)
- m.matched.filter(_ !=
m.matchedHead).foreach(_.addStopReason(note))
+ if (!notes.exists(n ⇒
NCEnricherProcessor.sameForSentence(note, n, ns))) {
+ notes += note
- m.matchedHead.add(note)
+ m.matched.filter(_ !=
m.matchedHead).foreach(_.addStopReason(note))
+
+ m.matchedHead.add(note)
+ }
}
case None ⇒ // No-op.
}
diff --git
a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
index b0509bb..ed445bc 100644
---
a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
+++
b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
@@ -26,6 +26,7 @@ import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager
import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote,
NCNlpSentenceToken}
import org.apache.nlpcraft.probe.mgrs.NCModelDecorator
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
+import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.utils.NCEnricherProcessor
import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer
@@ -411,10 +412,10 @@ object NCSortEnricher extends NCProbeEnricher {
"srvReqId" → ns.srvReqId,
"modelId" → mdl.model.getId,
"txt" → ns.text) { _ ⇒
- val buf = mutable.Buffer.empty[Set[NCNlpSentenceToken]]
+ val notes = mutable.HashSet.empty[NCNlpSentenceNote]
def isImportant(t: NCNlpSentenceToken): Boolean = t.isUser ||
MASK_WORDS.contains(t.stem)
- for (toks ← ns.tokenMixWithStopWords() if validImportant(ns, toks,
isImportant)) {
+ for (toks ← ns.tokenMixWithStopWords() if
NCEnricherProcessor.validImportant(ns, toks, isImportant)) {
tryToMatch(toks) match {
case Some(m) ⇒
def addNotes(
@@ -432,8 +433,12 @@ object NCSortEnricher extends NCProbeEnricher {
def mkNote(params: ArrayBuffer[(String, Any)]): Unit =
{
val note = NCNlpSentenceNote(m.main.map(_.index),
TOK_ID, params: _*)
- m.main.foreach(_.add(note))
- m.stop.foreach(_.addStopReason(note))
+ if (!notes.exists(n ⇒
NCEnricherProcessor.sameForSentence(note, n, ns))) {
+ notes += note
+
+ m.main.foreach(_.add(note))
+ m.stop.foreach(_.addStopReason(note))
+ }
}
def mkParams(): mutable.ArrayBuffer[(String, Any)] = {
diff --git
a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/post/NCPostEnrichProcessor.scala
b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/utils/NCEnricherProcessor.scala
similarity index 83%
rename from
src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/post/NCPostEnrichProcessor.scala
rename to
src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/utils/NCEnricherProcessor.scala
index 47447a3..08b73ba 100644
---
a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/post/NCPostEnrichProcessor.scala
+++
b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/utils/NCEnricherProcessor.scala
@@ -15,15 +15,16 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.post
+package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.utils
import java.io.Serializable
import java.util
+import java.util.Collections
import com.typesafe.scalalogging.LazyLogging
import io.opencensus.trace.Span
import org.apache.nlpcraft.common.nlp.pos._
-import org.apache.nlpcraft.common.nlp.{NCNlpSentence, _}
+import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, _}
import org.apache.nlpcraft.common.{NCService, _}
import org.apache.nlpcraft.model.NCToken
import org.apache.nlpcraft.model.impl.NCTokenImpl
@@ -33,14 +34,9 @@ import scala.collection.JavaConverters._
import scala.collection._
/**
- * This collapser handles several tasks:
- * - "overall" collapsing after all other individual collapsers had their
turn.
- * - Special further enrichment of tokens like linking, etc.
*
- * In all cases of overlap (full or partial) - the "longest" note wins. In
case of overlap and equal
- * lengths - the winning note is chosen based on this priority.
*/
-object NCPostEnrichProcessor extends NCService with LazyLogging {
+object NCEnricherProcessor extends NCService with LazyLogging {
override def start(parent: Span = null): NCService =
startScopedSpan("start", parent) { _ ⇒
super.start()
}
@@ -53,117 +49,39 @@ object NCPostEnrichProcessor extends NCService with
LazyLogging {
*
* @param note
* @param withIndexes
+ * @param withReferences
* @return
*/
- def getUniqueKey(note: NCNlpSentenceNote, withIndexes: Boolean = true):
Any = {
- val seq1 = if (withIndexes) Seq(note.wordIndexes, note.noteType) else
Seq(note.noteType)
+ def getKey(note: NCNlpSentenceNote, withIndexes: Boolean = true,
withReferences: Boolean = true): Seq[Any] = {
+ def addRefs(names: String*): Seq[String] = if (withReferences) names
else Seq.empty
- val seq2: Seq[Any] =
+ val names: Seq[String] =
if (note.isUser)
Seq.empty
- else {
+ else
note.noteType match {
- case "nlpcraft:continent" ⇒
- Seq(
- note.get("continent")
- )
- case "nlpcraft:subcontinent" ⇒
- Seq(
- note.get("continent"),
- note.get("subcontinent")
- )
- case "nlpcraft:country" ⇒
- Seq(
- note.get("continent"),
- note.get("subcontinent"),
- note.get("country")
- )
- case "nlpcraft:region" ⇒
- Seq(
- note.wordIndexes,
- note.noteType,
- note.get("continent"),
- note.get("subcontinent"),
- note.get("country"),
- note.get("region")
- )
- case "nlpcraft:city" ⇒
- Seq(
- note.get("continent"),
- note.get("subcontinent"),
- note.get("country"),
- note.get("region"),
- note.get("city")
- )
- case "nlpcraft:metro" ⇒
- Seq(
- note.get("metro")
- )
- case "nlpcraft:date" ⇒
- Seq(
- note.get("from"),
- note.get("to")
- )
- case "nlpcraft:relation" ⇒
- Seq(
- note.get("type"),
- note.get("indexes"),
- note.get("note")
- )
- case "nlpcraft:sort" ⇒
- Seq(
- note.wordIndexes,
- note.noteType,
- note.get("subjnotes"),
- note.get("subjindexes"),
- note.getOrElse("bynotes", null),
- note.getOrElse("byindexes", null),
- note.getOrElse("asc", null)
- )
- case "nlpcraft:limit" ⇒
- Seq(
- note.get("limit"),
- note.getOrElse("asc", null),
- note.get("indexes"),
- note.get("note")
- )
- case "nlpcraft:coordinate" ⇒
- Seq(
- note.get("latitude"),
- note.get("longitude")
- )
- case "nlpcraft:num" ⇒
- Seq(
- note.get("from"),
- note.get("to"),
- note.getOrElse("indexes", null),
- note.getOrElse("note", null)
-
- )
- case x if x.startsWith("google:") ⇒
- Seq(
- note.get("meta"),
- note.get("mentionsBeginOffsets"),
- note.get("mentionsContents"),
- note.get("mentionsTypes")
- )
- case x if x.startsWith("stanford:") ⇒
- Seq(
- note.get("nne")
- )
- case x if x.startsWith("opennlp:") ⇒
- Seq(
- note.wordIndexes,
- note.noteType
- )
- case x if x.startsWith("spacy:") ⇒
- Seq(
- note.get("vector")
- )
+ case "nlpcraft:continent" ⇒ Seq("continent")
+ case "nlpcraft:subcontinent" ⇒ Seq("continent",
"subcontinent")
+ case "nlpcraft:country" ⇒ Seq("continent", "subcontinent",
"country")
+ case "nlpcraft:region" ⇒ Seq("continent", "subcontinent",
"country", "region")
+ case "nlpcraft:city" ⇒ Seq("continent", "subcontinent",
"country", "region", "city")
+ case "nlpcraft:metro" ⇒ Seq("metro")
+ case "nlpcraft:date" ⇒ Seq("from", "to")
+ case "nlpcraft:relation" ⇒ Seq("type", "note") ++
addRefs("indexes")
+ case "nlpcraft:sort" ⇒ Seq("asc", "subjnotes", "bynotes")
++ addRefs("subjindexes", "byindexes")
+ case "nlpcraft:limit" ⇒ Seq("limit", "asc", "note") ++
addRefs("indexes")
+ case "nlpcraft:coordinate" ⇒ Seq("latitude", "longitude")
+ case "nlpcraft:num" ⇒ Seq("from", "to", "unit", "unitType")
+ case x if x.startsWith("google:") ⇒ Seq("meta",
"mentionsBeginOffsets", "mentionsContents", "mentionsTypes")
+ case x if x.startsWith("stanford:") ⇒ Seq("nne")
+ case x if x.startsWith("opennlp:") ⇒ Seq.empty
+ case x if x.startsWith("spacy:") ⇒ Seq("vector")
case _ ⇒ throw new AssertionError(s"Unexpected note type:
${note.noteType}")
}
- }
+
+ val seq1 = if (withIndexes) Seq(note.wordIndexes, note.noteType) else
Seq(note.noteType)
+ val seq2 = names.map(name ⇒ note.getOrElse(name, null))
seq1 ++ seq2
}
@@ -587,6 +505,12 @@ object NCPostEnrichProcessor extends NCService with
LazyLogging {
}
/**
+ * This collapser handles several tasks:
+ * - "overall" collapsing after all other individual collapsers had their
turn.
+ * - Special further enrichment of tokens like linking, etc.
+ *
+ * In all cases of overlap (full or partial) - the "longest" note wins.
In case of overlap and equal
+ * lengths - the winning note is chosen based on this priority.
*
* @param mdl
* @param ns
@@ -605,7 +529,7 @@ object NCPostEnrichProcessor extends NCService with
LazyLogging {
// other variants for these words are redundant.
val redundant: Seq[NCNlpSentenceNote] =
ns.flatten.filter(!_.isNlp).distinct.
- groupBy(p ⇒ getUniqueKey(p)).
+ groupBy(p ⇒ getKey(p)).
map(p ⇒ p._2.sortBy(p ⇒
(
// System notes don't have such flags.
@@ -722,7 +646,7 @@ object NCPostEnrichProcessor extends NCService with
LazyLogging {
// Drops similar sentences (with same tokens structure).
// Among similar sentences we prefer one with minimal free words
count.
- sens.groupBy(_.flatten.filter(!_.isNlp).map(note ⇒
getUniqueKey(note, withIndexes = false))).
+ sens.groupBy(_.flatten.filter(!_.isNlp).map(note ⇒ getKey(note,
withIndexes = false))).
map { case (_, seq) ⇒ seq.minBy(_.filter(p ⇒ p.isNlp &&
!p.isStopWord).map(_.wordIndexes.length).sum) }.
toSeq
}
@@ -786,4 +710,76 @@ object NCPostEnrichProcessor extends NCService with
LazyLogging {
)
)
}
+
+ /**
+ * Checks whether important tokens were deleted as stopwords or not.
+ *
+ * @param ns Sentence.
+ * @param toks Tokens in which some stopwords can be deleted.
+ * @param isImportant Token important criteria.
+ */
+ def validImportant(
+ ns: NCNlpSentence,
+ toks: Seq[NCNlpSentenceToken],
+ isImportant: NCNlpSentenceToken ⇒ Boolean
+ ): Boolean = {
+ val idxs = toks.map(_.index)
+
+ require(idxs == idxs.sorted)
+
+ val toks2 = ns.slice(idxs.head, idxs.last + 1)
+
+ toks.length == toks2.length || toks.count(isImportant) ==
toks2.count(isImportant)
+ }
+
+ /**
+ *
+ * @param n1
+ * @param n2
+ * @param sen
+ * @return
+ */
+ def sameForSentence(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote, sen:
NCNlpSentence): Boolean = {
+ require(n1.noteType == n2.noteType)
+
+ val stopIdxs = sen.filter(_.isStopWord).map(_.index)
+
+ // One possible difference - stopword indexes.
+ def wordsEqualOrSimilar(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote):
Boolean = {
+ val set1 = n1.wordIndexes.toSet
+ val set2 = n2.wordIndexes.toSet
+
+ set1 == set2 || set1.subsetOf(set2) &&
set2.diff(set1).forall(stopIdxs.contains)
+ }
+
+ def tokensEqualOrSimilar(set1: Set[NCNlpSentenceToken], set2:
Set[NCNlpSentenceToken]): Boolean =
+ set1 == set2 || set1.subsetOf(set2) &&
set2.diff(set1).forall(_.isStopWord)
+
+ val refIdxNames =
+ n1.noteType match {
+ case "nlpcraft:sort" ⇒ Seq("subjindexes", "byindexes")
+ case "nlpcraft:limit" ⇒ Seq("indexes")
+ case "nlpcraft:reference" ⇒ Seq("indexes")
+
+ case _ ⇒ Seq.empty
+ }
+
+ def extract(n: NCNlpSentenceNote, refIdxName: String):
Set[NCNlpSentenceToken] =
+ n.getOrElse(refIdxName,
Collections.emptyList).asInstanceOf[java.util.List[Int]].asScala.map(sen(_)).toSet
+
+ def referencesEqualOrNearly(n1: NCNlpSentenceNote, n2:
NCNlpSentenceNote): Boolean =
+ refIdxNames.isEmpty || refIdxNames.forall(refIdxName ⇒ {
+ val refs1 = extract(n1, refIdxName)
+ val refs2 = extract(n2, refIdxName)
+
+ tokensEqualOrSimilar(refs1, refs2) ||
tokensEqualOrSimilar(refs2, refs1)
+ })
+
+ def getUniqueKey0(n: NCNlpSentenceNote): Seq[Any] = getKey(n,
withIndexes = false, withReferences = false)
+
+ getUniqueKey0(n1) == getUniqueKey0(n2) &&
+ (wordsEqualOrSimilar(n2, n1) || wordsEqualOrSimilar(n1, n2)) &&
+ (referencesEqualOrNearly(n2, n1) || referencesEqualOrNearly(n1,
n2))
+ }
+
}