This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-30
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-30 by this push:
new f1cbe77 WIP.
f1cbe77 is described below
commit f1cbe777368ab845034b9a88285f90549262edc5
Author: Sergey Kamov <[email protected]>
AuthorDate: Wed Apr 29 11:25:33 2020 +0300
WIP.
---
.../probe/mgrs/nlp/NCProbeEnrichmentManager.scala | 72 ++++++++++++++++++++--
.../nlp/enrichers/post/NCPostEnrichProcessor.scala | 6 +-
.../mgrs/nlp/enrichers/sort/NCSortEnricher.scala | 1 +
3 files changed, 70 insertions(+), 9 deletions(-)
diff --git a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
index 4f470b9..520d079 100644
--- a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
+++ b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
@@ -51,7 +51,7 @@ import org.apache.nlpcraft.probe.mgrs.nlp.impl._
import org.apache.nlpcraft.probe.mgrs.nlp.validate._
import scala.collection.JavaConverters._
-import scala.collection._
+import scala.collection.{Seq, _}
import scala.concurrent.ExecutionContext
/**
@@ -182,6 +182,52 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
}
/**
+ * Removes new notes of the given type that duplicate existing notes except for stopword indexes.
+ * @param nlpSen Sentence to process.
+ * @param notes1 Notes snapshot taken before the enricher ran.
+ * @param notes2 Notes snapshot taken after the enricher ran.
+ * @param stopIdxs Indexes of the sentence's stopword tokens.
+ * @param typ Note type to check.
+ */
+ private def squeeze(
+ nlpSen: NCNlpSentence,
+ notes1: Seq[NCNlpSentenceNote],
+ notes2: Seq[NCNlpSentenceNote],
+ stopIdxs: Seq[Int],
+ typ: String
+ ): Boolean = {
+ // Filters notes and maps each one to its unique key.
+ def toMap(notes: Seq[NCNlpSentenceNote], filter: NCNlpSentenceNote ⇒ Boolean):
+ Map[NCNlpSentenceNote, Any] =
+ notes.
+ filter(filter).
+ map(p ⇒ p → NCPostEnrichProcessor.getUniqueKey(p, withIndexes = false)).
+ toMap
+
+ // The only allowed difference is stopword indexes.
+ def equalOrNearly(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote): Boolean = {
+ val set1 = n1.wordIndexes.toSet
+ val set2 = n2.wordIndexes.toSet
+
+ set1 == set2 || set1.subsetOf(set2) && set2.diff(set1).forall(stopIdxs.contains)
+ }
+
+ val notesTyp1 = toMap(notes1, (n: NCNlpSentenceNote) ⇒ n.noteType == typ)
+ val diff = toMap(notes2, (n: NCNlpSentenceNote) ⇒ n.noteType == typ && !notesTyp1.contains(n))
+
+ // New notes are the same as already prepared ones, differing only in
+ // whether their tokens include stopwords.
+ // Such "new" notes can be deleted.
+ val diffRedundant = diff.filter { case (n1, key1) ⇒
+ notesTyp1.exists { case (n2, key2) ⇒ key1 == key2 && (equalOrNearly(n2, n1) || equalOrNearly(n1, n2)) }
+ }.map { case (n, _) ⇒ n }
+
+ diffRedundant.foreach(nlpSen.removeNote)
+
+ diffRedundant.size == diff.size
+ }
+
+ /**
* Processes 'ask' request from probe server.
*
* @param srvReqId Server request ID.
@@ -374,6 +420,8 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
var step = 0
var continue = true
+ val stopIdxs = nlpSen.filter(_.isStopWord).map(_.index)
+
while (continue) {
step = step + 1
@@ -382,22 +430,34 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
val res = loopEnrichers.map(h ⇒ {
def get(): Seq[NCNlpSentenceNote] = h.getNotes().sortBy(p ⇒ (p.tokenIndexes.head, p.noteType))
-
val notes1 = get()
h → h.enricher.enrich(mdlDec, nlpSen, senMeta, span)
val notes2 = get()
- h.enricher → (notes1 == notes2)
+ var same = notes1 == notes2
+
+ if (!same)
+ h.enricher match {
+ case NCSortEnricher ⇒ same = squeeze(nlpSen, notes1, notes2, stopIdxs, "nlpcraft:sort")
+ case NCLimitEnricher ⇒ same = squeeze(nlpSen, notes1, notes2, stopIdxs, "nlpcraft:limit")
+ case NCRelationEnricher ⇒ same = squeeze(nlpSen, notes1, notes2, stopIdxs, "nlpcraft:relation")
+
+ case _ ⇒ // No-op.
+ }
+
+ h.enricher → same
}).toMap
// Looping makes sense only if the model is complex (has user-defined parsers or DSL-based synonyms).
continue = NCModelEnricher.isComplex(mdlDec) && res.exists { case (_, same) ⇒ !same }
- if (DEEP_DEBUG)
- if (continue)
- logger.info(s"Enrichment iteration finished - more needed [step=$step, changed=${res.keys.mkString(", ")}]")
+ if (continue) {
+ val changed = res.filter(!_._2).keys.map(_.getClass.getSimpleName).mkString(", ")
+
+ logger.info(s"Enrichment iteration finished - more needed [step=$step, changed=$changed]")
+ }
else
logger.info(s"Enrichment finished [step=$step]")
}
diff --git a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/post/NCPostEnrichProcessor.scala b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/post/NCPostEnrichProcessor.scala
index 86b36ad..47447a3 100644
--- a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/post/NCPostEnrichProcessor.scala
+++ b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/post/NCPostEnrichProcessor.scala
@@ -55,7 +55,7 @@ object NCPostEnrichProcessor extends NCService with LazyLogging {
* @param withIndexes Whether to include word indexes into the key.
* @return Unique key for the given note.
*/
- private def getParameters(note: NCNlpSentenceNote, withIndexes: Boolean = true): Any = {
+ def getUniqueKey(note: NCNlpSentenceNote, withIndexes: Boolean = true): Any = {
val seq1 = if (withIndexes) Seq(note.wordIndexes, note.noteType) else Seq(note.noteType)
val seq2: Seq[Any] =
@@ -605,7 +605,7 @@ object NCPostEnrichProcessor extends NCService with LazyLogging {
// other variants for these words are redundant.
val redundant: Seq[NCNlpSentenceNote] =
ns.flatten.filter(!_.isNlp).distinct.
- groupBy(p ⇒ getParameters(p)).
+ groupBy(p ⇒ getUniqueKey(p)).
map(p ⇒ p._2.sortBy(p ⇒
(
// System notes don't have such flags.
@@ -722,7 +722,7 @@ object NCPostEnrichProcessor extends NCService with LazyLogging {
// Drops similar sentences (with same tokens structure).
// Among similar sentences we prefer the one with the minimal free words count.
- sens.groupBy(_.flatten.filter(!_.isNlp).map(note ⇒ getParameters(note, withIndexes = false))).
+ sens.groupBy(_.flatten.filter(!_.isNlp).map(note ⇒ getUniqueKey(note, withIndexes = false))).
map { case (_, seq) ⇒ seq.minBy(_.filter(p ⇒ p.isNlp && !p.isStopWord).map(_.wordIndexes.length).sum) }.
toSeq
}
diff --git a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
index 3cf17dc..b0509bb 100644
--- a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
+++ b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
@@ -411,6 +411,7 @@ object NCSortEnricher extends NCProbeEnricher {
"srvReqId" → ns.srvReqId,
"modelId" → mdl.model.getId,
"txt" → ns.text) { _ ⇒
+ val buf = mutable.Buffer.empty[Set[NCNlpSentenceToken]]
def isImportant(t: NCNlpSentenceToken): Boolean = t.isUser || MASK_WORDS.contains(t.stem)
for (toks ← ns.tokenMixWithStopWords() if validImportant(ns, toks, isImportant)) {