This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-30
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-30 by this push:
new f1cbe77 WIP.
f1cbe77 is described below
commit f1cbe777368ab845034b9a88285f90549262edc5
Author: Sergey Kamov <[email protected]>
AuthorDate: Wed Apr 29 11:25:33 2020 +0300
WIP.
---
.../probe/mgrs/nlp/NCProbeEnrichmentManager.scala | 72 ++++++++++++++++++++--
.../nlp/enrichers/post/NCPostEnrichProcessor.scala | 6 +-
.../mgrs/nlp/enrichers/sort/NCSortEnricher.scala | 1 +
3 files changed, 70 insertions(+), 9 deletions(-)
diff --git a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
index 4f470b9..520d079 100644
--- a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
+++ b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
@@ -51,7 +51,7 @@ import org.apache.nlpcraft.probe.mgrs.nlp.impl._
import org.apache.nlpcraft.probe.mgrs.nlp.validate._
import scala.collection.JavaConverters._
-import scala.collection._
+import scala.collection.{Seq, _}
import scala.concurrent.ExecutionContext
/**
@@ -182,6 +182,52 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
}
/**
+ * Removes new notes of the given type that duplicate existing notes except for stopword indexes.
+ * @param nlpSen Sentence to process.
+ * @param notes1 Notes snapshot taken before the enricher ran.
+ * @param notes2 Notes snapshot taken after the enricher ran.
+ * @param stopIdxs Indexes of the sentence's stopword tokens.
+ * @param typ Note type to check.
+ */
+ private def squeeze(
+ nlpSen: NCNlpSentence,
+ notes1: Seq[NCNlpSentenceNote],
+ notes2: Seq[NCNlpSentenceNote],
+ stopIdxs: Seq[Int],
+ typ: String
+ ): Boolean = {
+ // Filters notes and maps each one to its unique key.
+ def toMap(notes: Seq[NCNlpSentenceNote], filter: NCNlpSentenceNote ⇒ Boolean):
+ Map[NCNlpSentenceNote, Any] =
+ notes.
+ filter(filter).
+ map(p ⇒ p → NCPostEnrichProcessor.getUniqueKey(p, withIndexes = false)).
+ toMap
+
+ // The only allowed difference is stopword indexes.
+ def equalOrNearly(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote): Boolean = {
+ val set1 = n1.wordIndexes.toSet
+ val set2 = n2.wordIndexes.toSet
+
+ set1 == set2 || set1.subsetOf(set2) && set2.diff(set1).forall(stopIdxs.contains)
+ }
+
+ val notesTyp1 = toMap(notes1, (n: NCNlpSentenceNote) ⇒ n.noteType == typ)
+ val diff = toMap(notes2, (n: NCNlpSentenceNote) ⇒ n.noteType == typ && !notesTyp1.contains(n))
+
+ // New notes are the same as already prepared ones, differing only in
+ // whether their tokens include stopwords.
+ // Such "new" notes can be deleted.
+ val diffRedundant = diff.filter { case (n1, key1) ⇒
+ notesTyp1.exists { case (n2, key2) ⇒ key1 == key2 && (equalOrNearly(n2, n1) || equalOrNearly(n1, n2)) }
+ }.map { case (n, _) ⇒ n }
+
+ diffRedundant.foreach(nlpSen.removeNote)
+
+ diffRedundant.size == diff.size
+ }
+
+ /**
* Processes 'ask' request from probe server.
*
* @param srvReqId Server request ID.
@@ -374,6 +420,8 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
var step = 0
var continue = true
+ val stopIdxs = nlpSen.filter(_.isStopWord).map(_.index)
+
while (continue) {
step = step + 1
@@ -382,22 +430,34 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
val res = loopEnrichers.map(h ⇒ {
def get(): Seq[NCNlpSentenceNote] = h.getNotes().sortBy(p ⇒ (p.tokenIndexes.head, p.noteType))
-
val notes1 = get()
h → h.enricher.enrich(mdlDec, nlpSen, senMeta, span)
val notes2 = get()
- h.enricher → (notes1 == notes2)
+ var same = notes1 == notes2
+
+ if (!same)
+ h.enricher match {
+ case NCSortEnricher ⇒ same = squeeze(nlpSen, notes1, notes2, stopIdxs, "nlpcraft:sort")
+ case NCLimitEnricher ⇒ same = squeeze(nlpSen, notes1, notes2, stopIdxs, "nlpcraft:limit")
+ case NCRelationEnricher ⇒ same = squeeze(nlpSen, notes1, notes2, stopIdxs, "nlpcraft:relation")
+
+ case _ ⇒ // No-op.
+ }
+
+ h.enricher → same
}).toMap
// Looping makes sense only if the model is complex (has user-defined parsers or DSL-based synonyms).
continue = NCModelEnricher.isComplex(mdlDec) && res.exists { case (_, same) ⇒ !same }
- if (DEEP_DEBUG)
- if (continue)
- logger.info(s"Enrichment iteration finished - more needed [step=$step, changed=${res.keys.mkString(", ")}]")
+ if (continue) {
+ val changed = res.filter(!_._2).keys.map(_.getClass.getSimpleName).mkString(", ")
+
+ logger.info(s"Enrichment iteration finished - more needed [step=$step, changed=$changed]")
+ }
else
logger.info(s"Enrichment finished [step=$step]")
}
diff --git a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/post/NCPostEnrichProcessor.scala b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/post/NCPostEnrichProcessor.scala
index 86b36ad..47447a3 100644
--- a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/post/NCPostEnrichProcessor.scala
+++ b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/post/NCPostEnrichProcessor.scala
@@ -55,7 +55,7 @@ object NCPostEnrichProcessor extends NCService with LazyLogging {
* @param withIndexes Whether to include word indexes into the key.
* @return Unique key for the given note.
*/
- private def getParameters(note: NCNlpSentenceNote, withIndexes: Boolean = true): Any = {
+ def getUniqueKey(note: NCNlpSentenceNote, withIndexes: Boolean = true): Any = {
val seq1 = if (withIndexes) Seq(note.wordIndexes, note.noteType) else Seq(note.noteType)
val seq2: Seq[Any] =
@@ -605,7 +605,7 @@ object NCPostEnrichProcessor extends NCService with LazyLogging {
// other variants for these words are redundant.
val redundant: Seq[NCNlpSentenceNote] =
ns.flatten.filter(!_.isNlp).distinct.
- groupBy(p ⇒ getParameters(p)).
+ groupBy(p ⇒ getUniqueKey(p)).
map(p ⇒ p._2.sortBy(p ⇒
(
// System notes don't have such flags.
@@ -722,7 +722,7 @@ object NCPostEnrichProcessor extends NCService with LazyLogging {
// Drops similar sentences (with same tokens structure).
// Among similar sentences we prefer the one with the minimal free words count.
- sens.groupBy(_.flatten.filter(!_.isNlp).map(note ⇒ getParameters(note, withIndexes = false))).
+ sens.groupBy(_.flatten.filter(!_.isNlp).map(note ⇒ getUniqueKey(note, withIndexes = false))).
map { case (_, seq) ⇒ seq.minBy(_.filter(p ⇒ p.isNlp && !p.isStopWord).map(_.wordIndexes.length).sum) }.
toSeq
}
diff --git a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
index 3cf17dc..b0509bb 100644
--- a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
+++ b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
@@ -411,6 +411,7 @@ object NCSortEnricher extends NCProbeEnricher {
"srvReqId" → ns.srvReqId,
"modelId" → mdl.model.getId,
"txt" → ns.text) { _ ⇒
+ val buf = mutable.Buffer.empty[Set[NCNlpSentenceToken]]
def isImportant(t: NCNlpSentenceToken): Boolean = t.isUser || MASK_WORDS.contains(t.stem)
for (toks ← ns.tokenMixWithStopWords() if validImportant(ns, toks, isImportant)) {