This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-30
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-30 by this push:
new fcd204e WIP.
fcd204e is described below
commit fcd204e5d94997f11feee53c9ebacb4934ea59fc
Author: Sergey Kamov <[email protected]>
AuthorDate: Wed Apr 29 22:21:54 2020 +0300
WIP.
---
.../org/apache/nlpcraft/probe/NCProbeBoot.scala | 6 +-
.../nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala | 62 +-----
.../probe/mgrs/nlp/NCProbeEnrichmentManager.scala | 81 +++-----
.../mgrs/nlp/enrichers/limit/NCLimitEnricher.scala | 44 ++++-
.../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 6 +-
.../enrichers/relation/NCRelationEnricher.scala | 12 +-
.../mgrs/nlp/enrichers/sort/NCSortEnricher.scala | 13 +-
.../NCEnricherProcessor.scala} | 220 ++++++++++-----------
8 files changed, 202 insertions(+), 242 deletions(-)
diff --git a/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala
b/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala
index caba0bb..77ee49a 100644
--- a/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala
+++ b/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala
@@ -41,7 +41,7 @@ import
org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnrichmentManager
import
org.apache.nlpcraft.probe.mgrs.nlp.enrichers.dictionary.NCDictionaryEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.limit.NCLimitEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model.NCModelEnricher
-import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.post.NCPostEnrichProcessor
+import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.utils.NCEnricherProcessor
import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.relation.NCRelationEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.sort.NCSortEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.stopword.NCStopWordEnricher
@@ -425,6 +425,7 @@ private [probe] object NCProbeBoot extends LazyLogging with
NCOpenCensusTrace {
NCModelManager.start(span)
NCCommandManager.start(span)
NCDictionaryManager.start(span)
+ NCEnricherProcessor.start(span)
NCStopWordEnricher.start(span)
NCModelEnricher.start(span)
NCLimitEnricher.start(span)
@@ -432,7 +433,6 @@ private [probe] object NCProbeBoot extends LazyLogging with
NCOpenCensusTrace {
NCRelationEnricher.start(span)
NCSuspiciousNounsEnricher.start(span)
NCValidateManager.start(span)
- NCPostEnrichProcessor.start(span)
NCDictionaryEnricher.start(span)
NCConversationManager.start(span)
NCProbeEnrichmentManager.start(span)
@@ -451,8 +451,8 @@ private [probe] object NCProbeBoot extends LazyLogging with
NCOpenCensusTrace {
NCConnectionManager.stop(span)
NCProbeEnrichmentManager.stop(span)
NCConversationManager.stop(span)
+ NCEnricherProcessor.stop(span)
NCDictionaryEnricher.stop(span)
- NCPostEnrichProcessor.stop(span)
NCValidateManager.stop(span)
NCSuspiciousNounsEnricher.stop(span)
NCRelationEnricher.stop(span)
diff --git
a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala
b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala
index dda8875..15acb12 100644
--- a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala
+++ b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnricher.scala
@@ -25,7 +25,7 @@ import org.apache.nlpcraft.common.nlp._
import org.apache.nlpcraft.common.{NCService, _}
import org.apache.nlpcraft.probe.mgrs.NCModelDecorator
-import scala.collection.{Map, Seq}
+import scala.collection.Map
import scala.language.implicitConversions
/**
@@ -33,66 +33,6 @@ import scala.language.implicitConversions
*/
abstract class NCProbeEnricher extends NCService with LazyLogging {
/**
- * Checks whether important tokens deleted as stopwords or not.
- *
- * @param ns Sentence.
- * @param toks Tokens in which some stopwords can be deleted.
- * @param isImportant Token important criteria.
- */
- protected def validImportant(
- ns: NCNlpSentence,
- toks: Seq[NCNlpSentenceToken],
- isImportant: NCNlpSentenceToken ⇒ Boolean
- ): Boolean = {
- val idxs = toks.map(_.index)
-
- require(idxs == idxs.sorted)
-
- val toks2 = ns.slice(idxs.head, idxs.last + 1)
-
- toks.length == toks2.length || toks.count(isImportant) ==
toks2.count(isImportant)
- }
-
- /**
- *
- * @param toks
- * @param pred
- */
- protected def getCommonNotes(
- toks: Seq[NCNlpSentenceToken], pred: Option[NCNlpSentenceNote ⇒
Boolean] = None
- ): Set[String] =
- if (toks.isEmpty)
- Set.empty
- else {
- def getCommon(sortedToks: Seq[NCNlpSentenceToken]): Set[String] = {
- require(sortedToks.nonEmpty)
-
- val h = sortedToks.head
- val l = sortedToks.last
-
- val notes = pred match {
- case Some(p) ⇒ h.filter(p)
- case None ⇒ h.map(p ⇒ p)
- }
-
- notes.filter(!_.isNlp).filter(n ⇒ h.index == n.tokenFrom &&
l.index == n.tokenTo).map(_.noteType).toSet
- }
-
- var sortedToks = toks.sortBy(_.index)
-
- var res = getCommon(sortedToks)
-
- if (res.isEmpty) {
- sortedToks = sortedToks.filter(!_.isStopWord)
-
- if (sortedToks.nonEmpty)
- res = getCommon(sortedToks)
- }
-
- if (res.isEmpty) Set.empty else res
- }
-
- /**
*
* Processes this NLP sentence.
*
diff --git
a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
index 520d079..bdfebb5 100644
---
a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
+++
b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
@@ -42,11 +42,11 @@ import org.apache.nlpcraft.probe.mgrs.model.NCModelManager
import
org.apache.nlpcraft.probe.mgrs.nlp.enrichers.dictionary.NCDictionaryEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.limit.NCLimitEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model.NCModelEnricher
-import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.post._
import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.relation.NCRelationEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.sort.NCSortEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.stopword.NCStopWordEnricher
import
org.apache.nlpcraft.probe.mgrs.nlp.enrichers.suspicious.NCSuspiciousNounsEnricher
+import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.utils._
import org.apache.nlpcraft.probe.mgrs.nlp.impl._
import org.apache.nlpcraft.probe.mgrs.nlp.validate._
@@ -182,52 +182,6 @@ object NCProbeEnrichmentManager extends NCService with
NCOpenCensusModelStats {
}
/**
- *
- * @param nlpSen
- * @param notes1
- * @param notes2
- * @param stopIdxs
- * @param typ
- */
- private def squeeze(
- nlpSen: NCNlpSentence,
- notes1: Seq[NCNlpSentenceNote],
- notes2: Seq[NCNlpSentenceNote],
- stopIdxs: Seq[Int],
- typ: String
- ): Boolean = {
- // Filters notes and adds unique key.
- def toMap(notes: Seq[NCNlpSentenceNote], filter: NCNlpSentenceNote ⇒
Boolean):
- Map[NCNlpSentenceNote, Any] =
- notes.
- filter(filter).
- map(p ⇒ p → NCPostEnrichProcessor.getUniqueKey(p, withIndexes
= false)).
- toMap
-
- // One possible difference - stopwords indexes.
- def equalOrNearly(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote):
Boolean = {
- val set1 = n1.wordIndexes.toSet
- val set2 = n2.wordIndexes.toSet
-
- set1 == set2 || set1.subsetOf(set2) &&
set2.diff(set1).forall(stopIdxs.contains)
- }
-
- val notesTyp1 = toMap(notes1, (n: NCNlpSentenceNote) ⇒ n.noteType ==
typ)
- val diff = toMap(notes2, (n: NCNlpSentenceNote) ⇒ n.noteType == typ &&
!notesTyp1.contains(n))
-
- // New notes are same as already prepared with one difference -
- // their tokens contain or not stopwords.
- // Such "new" tokens can be deleted.
- val diffRedundant = diff.filter { case (n1, key1) ⇒
- notesTyp1.exists { case (n2, key2) ⇒ key1 == key2 &&
(equalOrNearly(n2, n1) || equalOrNearly(n1, n2)) }
- }.map { case (n, _) ⇒ n }
-
- diffRedundant.foreach(nlpSen.removeNote)
-
- diffRedundant.size == diff.size
- }
-
- /**
* Processes 'ask' request from probe server.
*
* @param srvReqId Server request ID.
@@ -438,14 +392,35 @@ object NCProbeEnrichmentManager extends NCService with
NCOpenCensusModelStats {
var same = notes1 == notes2
- if (!same)
+ if (!same) {
+ def squeeze(typ: String): Boolean = {
+ val diff = notes2.filter(n ⇒ !notes1.contains(n))
+
+ val diffRedundant = diff.flatMap(n2 ⇒
+ notes1.find(n1 ⇒
NCEnricherProcessor.sameForSentence(n1, n2, nlpSen)) match {
+ case Some(similar) ⇒ Some(n2 → similar)
+ case None ⇒ None
+ }
+ )
+
+ diffRedundant.foreach { case (del, similar) ⇒
+ // TODO: log level
+ logger.info(s"Redundant note removed: $del,
because similar exists: $similar")
+
+ nlpSen.removeNote(del)
+ }
+
+ diffRedundant.size == diff.size
+ }
+
h.enricher match {
- case NCSortEnricher ⇒ same = squeeze(nlpSen,
notes1, notes2, stopIdxs,"nlpcraft:sort")
- case NCLimitEnricher ⇒ same = squeeze(nlpSen,
notes1, notes2, stopIdxs,"nlpcraft:limit")
- case NCRelationEnricher ⇒ same = squeeze(nlpSen,
notes1, notes2, stopIdxs,"nlpcraft:relation")
+ case NCSortEnricher ⇒ same =
squeeze("nlpcraft:sort")
+ case NCLimitEnricher ⇒ same =
squeeze("nlpcraft:limit")
+ case NCRelationEnricher ⇒ same =
squeeze("nlpcraft:relation")
case _ ⇒ // No-op.
}
+ }
h.enricher → same
}).toMap
@@ -462,7 +437,7 @@ object NCProbeEnrichmentManager extends NCService with
NCOpenCensusModelStats {
logger.info(s"Enrichment finished [step=$step]")
}
- NCPostEnrichProcessor.collapse(mdlDec, nlpSen.clone(), span).
+ NCEnricherProcessor.collapse(mdlDec, nlpSen.clone(), span).
// Sorted to support deterministic logs.
sortBy(p ⇒
p.map(p ⇒ {
@@ -503,7 +478,7 @@ object NCProbeEnrichmentManager extends NCService with
NCOpenCensusModelStats {
val varsNlp = sensSeq.map(_.toSeq)
val req = NCRequestImpl(meta, srvReqId)
- var senVars = NCPostEnrichProcessor.convert(mdlDec, srvReqId, varsNlp)
+ var senVars = NCEnricherProcessor.convert(mdlDec, srvReqId, varsNlp)
// Sentence variants can be filtered by model.
val fltSenVars: Seq[(Seq[NCToken], Int)] =
diff --git
a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
index 87f02c9..861de12 100644
---
a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
+++
b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/limit/NCLimitEnricher.scala
@@ -27,6 +27,7 @@ import org.apache.nlpcraft.common.nlp.{NCNlpSentence,
NCNlpSentenceNote, NCNlpSe
import org.apache.nlpcraft.common.{NCE, NCService}
import org.apache.nlpcraft.probe.mgrs.NCModelDecorator
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
+import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.utils.NCEnricherProcessor
import scala.collection.JavaConverters._
import scala.collection.{Map, Seq, mutable}
@@ -150,7 +151,6 @@ object NCLimitEnricher extends NCProbeEnricher {
s"$CD of",
s"$CD <POST_WORDS>",
s"<POST_WORDS> $CD"
-
)
private final val LIMITS: Seq[String] = {
@@ -191,14 +191,16 @@ object NCLimitEnricher extends NCProbeEnricher {
"srvReqId" → ns.srvReqId,
"modelId" → mdl.model.getId,
"txt" → ns.text) { _ ⇒
+ val notes = mutable.HashSet.empty[NCNlpSentenceNote]
val numsMap =
NCNumericManager.find(ns).filter(_.unit.isEmpty).map(p ⇒ p.tokens → p).toMap
val groupsMap = groupNums(ns, numsMap.values)
+
def isImportant(t: NCNlpSentenceToken): Boolean = t.isUser ||
TECH_WORDS.contains(t.stem)
// Tries to grab tokens reverse way.
// Example: A, B, C ⇒ ABC, BC, AB .. (BC will be processed first)
for (toks ← ns.tokenMixWithStopWords().sortBy(p ⇒ (-p.size,
-p.head.index))
- if validImportant(ns, toks, isImportant)
+ if NCEnricherProcessor.validImportant(ns, toks, isImportant)
)
tryToMatch(numsMap, groupsMap, toks) match {
case Some(m) ⇒
@@ -214,11 +216,47 @@ object NCLimitEnricher extends NCProbeEnricher {
val note =
NCNlpSentenceNote(m.matched.map(_.index), TOK_ID, params: _*)
- m.matched.foreach(_.add(note))
+ if (!notes.exists(n ⇒
NCEnricherProcessor.sameForSentence(note, n, ns))) {
+ notes += note
+
+ m.matched.foreach(_.add(note))
+ }
}
case None ⇒ // No-op.
}
}
+
+ /**
+ *
+ * @param toks
+ */
+ private def getCommonNotes(toks: Seq[NCNlpSentenceToken]): Set[String] =
+ if (toks.isEmpty)
+ Set.empty
+ else {
+ def getCommon(sortedToks: Seq[NCNlpSentenceToken]): Set[String] = {
+ require(sortedToks.nonEmpty)
+
+ val h = sortedToks.head
+ val l = sortedToks.last
+
+ h.filter(!_.isNlp).filter(n ⇒ h.index == n.tokenFrom &&
l.index == n.tokenTo).map(_.noteType).toSet
+ }
+
+ var sortedToks = toks.sortBy(_.index)
+
+ var res = getCommon(sortedToks)
+
+ if (res.isEmpty) {
+ sortedToks = sortedToks.filter(!_.isStopWord)
+
+ if (sortedToks.nonEmpty)
+ res = getCommon(sortedToks)
+ }
+
+ if (res.isEmpty) Set.empty else res
+ }
+
/**
*
* @param numsMap
diff --git
a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 42f743b..e8b0f0d 100644
---
a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++
b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -25,7 +25,7 @@ import org.apache.nlpcraft.common._
import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken, _}
import org.apache.nlpcraft.model._
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
-import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.post.NCPostEnrichProcessor
+import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.utils.NCEnricherProcessor
import org.apache.nlpcraft.probe.mgrs.nlp.impl.NCRequestImpl
import org.apache.nlpcraft.probe.mgrs.{NCModelDecorator, NCSynonym}
@@ -377,10 +377,10 @@ object NCModelEnricher extends NCProbeEnricher with
DecorateAsScala {
if (collapsedSens == null)
collapsedSens =
- NCPostEnrichProcessor.convert(
+ NCEnricherProcessor.convert(
mdl,
ns.srvReqId,
-
NCPostEnrichProcessor.collapse(mdl, ns.clone(), span).map(_.tokens)
+ NCEnricherProcessor.collapse(mdl,
ns.clone(), span).map(_.tokens)
)
if (seq == null)
diff --git
a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
index dab01e9..fbd9be9 100644
---
a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
+++
b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/relation/NCRelationEnricher.scala
@@ -26,6 +26,7 @@ import org.apache.nlpcraft.common.nlp.{NCNlpSentence,
NCNlpSentenceNote, NCNlpSe
import org.apache.nlpcraft.common.{NCE, NCService}
import org.apache.nlpcraft.probe.mgrs.NCModelDecorator
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
+import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.utils.NCEnricherProcessor
import scala.collection.JavaConverters._
import scala.collection.{Map, Seq, mutable}
@@ -126,11 +127,12 @@ object NCRelationEnricher extends NCProbeEnricher {
"txt" → ns.text) { _ ⇒
// Tries to grab tokens direct way.
// Example: A, B, C ⇒ ABC, AB, BC .. (AB will be processed first)
+ val notes = mutable.HashSet.empty[NCNlpSentenceNote]
def isImportant(t: NCNlpSentenceToken): Boolean =
t.exists(n ⇒ n.isUser || REL_TYPES.contains(n.noteType)) ||
ALL_FUNC_STEMS.contains(t.stem)
- for (toks ← ns.tokenMixWithStopWords() if validImportant(ns, toks,
isImportant))
+ for (toks ← ns.tokenMixWithStopWords() if
NCEnricherProcessor.validImportant(ns, toks, isImportant))
tryToMatch(toks) match {
case Some(m) ⇒
for (refNote ← m.refNotes) {
@@ -142,9 +144,13 @@ object NCRelationEnricher extends NCProbeEnricher {
"note" → refNote
)
- m.matched.filter(_ !=
m.matchedHead).foreach(_.addStopReason(note))
+ if (!notes.exists(n ⇒
NCEnricherProcessor.sameForSentence(note, n, ns))) {
+ notes += note
- m.matchedHead.add(note)
+ m.matched.filter(_ !=
m.matchedHead).foreach(_.addStopReason(note))
+
+ m.matchedHead.add(note)
+ }
}
case None ⇒ // No-op.
}
diff --git
a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
index b0509bb..ed445bc 100644
---
a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
+++
b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
@@ -26,6 +26,7 @@ import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager
import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote,
NCNlpSentenceToken}
import org.apache.nlpcraft.probe.mgrs.NCModelDecorator
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
+import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.utils.NCEnricherProcessor
import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer
@@ -411,10 +412,10 @@ object NCSortEnricher extends NCProbeEnricher {
"srvReqId" → ns.srvReqId,
"modelId" → mdl.model.getId,
"txt" → ns.text) { _ ⇒
- val buf = mutable.Buffer.empty[Set[NCNlpSentenceToken]]
+ val notes = mutable.HashSet.empty[NCNlpSentenceNote]
def isImportant(t: NCNlpSentenceToken): Boolean = t.isUser ||
MASK_WORDS.contains(t.stem)
- for (toks ← ns.tokenMixWithStopWords() if validImportant(ns, toks,
isImportant)) {
+ for (toks ← ns.tokenMixWithStopWords() if
NCEnricherProcessor.validImportant(ns, toks, isImportant)) {
tryToMatch(toks) match {
case Some(m) ⇒
def addNotes(
@@ -432,8 +433,12 @@ object NCSortEnricher extends NCProbeEnricher {
def mkNote(params: ArrayBuffer[(String, Any)]): Unit =
{
val note = NCNlpSentenceNote(m.main.map(_.index),
TOK_ID, params: _*)
- m.main.foreach(_.add(note))
- m.stop.foreach(_.addStopReason(note))
+ if (!notes.exists(n ⇒
NCEnricherProcessor.sameForSentence(note, n, ns))) {
+ notes += note
+
+ m.main.foreach(_.add(note))
+ m.stop.foreach(_.addStopReason(note))
+ }
}
def mkParams(): mutable.ArrayBuffer[(String, Any)] = {
diff --git
a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/post/NCPostEnrichProcessor.scala
b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/utils/NCEnricherProcessor.scala
similarity index 83%
rename from
src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/post/NCPostEnrichProcessor.scala
rename to
src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/utils/NCEnricherProcessor.scala
index 47447a3..08b73ba 100644
---
a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/post/NCPostEnrichProcessor.scala
+++
b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/utils/NCEnricherProcessor.scala
@@ -15,15 +15,16 @@
* limitations under the License.
*/
-package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.post
+package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.utils
import java.io.Serializable
import java.util
+import java.util.Collections
import com.typesafe.scalalogging.LazyLogging
import io.opencensus.trace.Span
import org.apache.nlpcraft.common.nlp.pos._
-import org.apache.nlpcraft.common.nlp.{NCNlpSentence, _}
+import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, _}
import org.apache.nlpcraft.common.{NCService, _}
import org.apache.nlpcraft.model.NCToken
import org.apache.nlpcraft.model.impl.NCTokenImpl
@@ -33,14 +34,9 @@ import scala.collection.JavaConverters._
import scala.collection._
/**
- * This collapser handles several tasks:
- * - "overall" collapsing after all other individual collapsers had their
turn.
- * - Special further enrichment of tokens like linking, etc.
*
- * In all cases of overlap (full or partial) - the "longest" note wins. In
case of overlap and equal
- * lengths - the winning note is chosen based on this priority.
*/
-object NCPostEnrichProcessor extends NCService with LazyLogging {
+object NCEnricherProcessor extends NCService with LazyLogging {
override def start(parent: Span = null): NCService =
startScopedSpan("start", parent) { _ ⇒
super.start()
}
@@ -53,117 +49,39 @@ object NCPostEnrichProcessor extends NCService with
LazyLogging {
*
* @param note
* @param withIndexes
+ * @param withReferences
* @return
*/
- def getUniqueKey(note: NCNlpSentenceNote, withIndexes: Boolean = true):
Any = {
- val seq1 = if (withIndexes) Seq(note.wordIndexes, note.noteType) else
Seq(note.noteType)
+ def getKey(note: NCNlpSentenceNote, withIndexes: Boolean = true,
withReferences: Boolean = true): Seq[Any] = {
+ def addRefs(names: String*): Seq[String] = if (withReferences) names
else Seq.empty
- val seq2: Seq[Any] =
+ val names: Seq[String] =
if (note.isUser)
Seq.empty
- else {
+ else
note.noteType match {
- case "nlpcraft:continent" ⇒
- Seq(
- note.get("continent")
- )
- case "nlpcraft:subcontinent" ⇒
- Seq(
- note.get("continent"),
- note.get("subcontinent")
- )
- case "nlpcraft:country" ⇒
- Seq(
- note.get("continent"),
- note.get("subcontinent"),
- note.get("country")
- )
- case "nlpcraft:region" ⇒
- Seq(
- note.wordIndexes,
- note.noteType,
- note.get("continent"),
- note.get("subcontinent"),
- note.get("country"),
- note.get("region")
- )
- case "nlpcraft:city" ⇒
- Seq(
- note.get("continent"),
- note.get("subcontinent"),
- note.get("country"),
- note.get("region"),
- note.get("city")
- )
- case "nlpcraft:metro" ⇒
- Seq(
- note.get("metro")
- )
- case "nlpcraft:date" ⇒
- Seq(
- note.get("from"),
- note.get("to")
- )
- case "nlpcraft:relation" ⇒
- Seq(
- note.get("type"),
- note.get("indexes"),
- note.get("note")
- )
- case "nlpcraft:sort" ⇒
- Seq(
- note.wordIndexes,
- note.noteType,
- note.get("subjnotes"),
- note.get("subjindexes"),
- note.getOrElse("bynotes", null),
- note.getOrElse("byindexes", null),
- note.getOrElse("asc", null)
- )
- case "nlpcraft:limit" ⇒
- Seq(
- note.get("limit"),
- note.getOrElse("asc", null),
- note.get("indexes"),
- note.get("note")
- )
- case "nlpcraft:coordinate" ⇒
- Seq(
- note.get("latitude"),
- note.get("longitude")
- )
- case "nlpcraft:num" ⇒
- Seq(
- note.get("from"),
- note.get("to"),
- note.getOrElse("indexes", null),
- note.getOrElse("note", null)
-
- )
- case x if x.startsWith("google:") ⇒
- Seq(
- note.get("meta"),
- note.get("mentionsBeginOffsets"),
- note.get("mentionsContents"),
- note.get("mentionsTypes")
- )
- case x if x.startsWith("stanford:") ⇒
- Seq(
- note.get("nne")
- )
- case x if x.startsWith("opennlp:") ⇒
- Seq(
- note.wordIndexes,
- note.noteType
- )
- case x if x.startsWith("spacy:") ⇒
- Seq(
- note.get("vector")
- )
+ case "nlpcraft:continent" ⇒ Seq("continent")
+ case "nlpcraft:subcontinent" ⇒ Seq("continent",
"subcontinent")
+ case "nlpcraft:country" ⇒ Seq("continent", "subcontinent",
"country")
+ case "nlpcraft:region" ⇒ Seq("continent", "subcontinent",
"country", "region")
+ case "nlpcraft:city" ⇒ Seq("continent", "subcontinent",
"country", "region", "city")
+ case "nlpcraft:metro" ⇒ Seq("metro")
+ case "nlpcraft:date" ⇒ Seq("from", "to")
+ case "nlpcraft:relation" ⇒ Seq("type", "note") ++
addRefs("indexes")
+ case "nlpcraft:sort" ⇒ Seq("asc", "subjnotes", "bynotes")
++ addRefs("subjindexes", "byindexes")
+ case "nlpcraft:limit" ⇒ Seq("limit", "asc", "note") ++
addRefs("indexes")
+ case "nlpcraft:coordinate" ⇒ Seq("latitude", "longitude")
+ case "nlpcraft:num" ⇒ Seq("from", "to", "unit", "unitType")
+ case x if x.startsWith("google:") ⇒ Seq("meta",
"mentionsBeginOffsets", "mentionsContents", "mentionsTypes")
+ case x if x.startsWith("stanford:") ⇒ Seq("nne")
+ case x if x.startsWith("opennlp:") ⇒ Seq.empty
+ case x if x.startsWith("spacy:") ⇒ Seq("vector")
case _ ⇒ throw new AssertionError(s"Unexpected note type:
${note.noteType}")
}
- }
+
+ val seq1 = if (withIndexes) Seq(note.wordIndexes, note.noteType) else
Seq(note.noteType)
+ val seq2 = names.map(name ⇒ note.getOrElse(name, null))
seq1 ++ seq2
}
@@ -587,6 +505,12 @@ object NCPostEnrichProcessor extends NCService with
LazyLogging {
}
/**
+ * This collapser handles several tasks:
+ * - "overall" collapsing after all other individual collapsers had their
turn.
+ * - Special further enrichment of tokens like linking, etc.
+ *
+ * In all cases of overlap (full or partial) - the "longest" note wins.
In case of overlap and equal
+ * lengths - the winning note is chosen based on this priority.
*
* @param mdl
* @param ns
@@ -605,7 +529,7 @@ object NCPostEnrichProcessor extends NCService with
LazyLogging {
// other variants for these words are redundant.
val redundant: Seq[NCNlpSentenceNote] =
ns.flatten.filter(!_.isNlp).distinct.
- groupBy(p ⇒ getUniqueKey(p)).
+ groupBy(p ⇒ getKey(p)).
map(p ⇒ p._2.sortBy(p ⇒
(
// System notes don't have such flags.
@@ -722,7 +646,7 @@ object NCPostEnrichProcessor extends NCService with
LazyLogging {
// Drops similar sentences (with same tokens structure).
// Among similar sentences we prefer one with minimal free words
count.
- sens.groupBy(_.flatten.filter(!_.isNlp).map(note ⇒
getUniqueKey(note, withIndexes = false))).
+ sens.groupBy(_.flatten.filter(!_.isNlp).map(note ⇒ getKey(note,
withIndexes = false))).
map { case (_, seq) ⇒ seq.minBy(_.filter(p ⇒ p.isNlp &&
!p.isStopWord).map(_.wordIndexes.length).sum) }.
toSeq
}
@@ -786,4 +710,76 @@ object NCPostEnrichProcessor extends NCService with
LazyLogging {
)
)
}
+
+ /**
+ * Checks whether important tokens were deleted as stopwords or not.
+ *
+ * @param ns Sentence.
+ * @param toks Tokens in which some stopwords can be deleted.
+ * @param isImportant Token important criteria.
+ */
+ def validImportant(
+ ns: NCNlpSentence,
+ toks: Seq[NCNlpSentenceToken],
+ isImportant: NCNlpSentenceToken ⇒ Boolean
+ ): Boolean = {
+ val idxs = toks.map(_.index)
+
+ require(idxs == idxs.sorted)
+
+ val toks2 = ns.slice(idxs.head, idxs.last + 1)
+
+ toks.length == toks2.length || toks.count(isImportant) ==
toks2.count(isImportant)
+ }
+
+ /**
+ *
+ * @param n1
+ * @param n2
+ * @param sen
+ * @return
+ */
+ def sameForSentence(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote, sen:
NCNlpSentence): Boolean = {
+ require(n1.noteType == n2.noteType)
+
+ val stopIdxs = sen.filter(_.isStopWord).map(_.index)
+
+ // One possible difference - stopword indexes.
+ def wordsEqualOrSimilar(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote):
Boolean = {
+ val set1 = n1.wordIndexes.toSet
+ val set2 = n2.wordIndexes.toSet
+
+ set1 == set2 || set1.subsetOf(set2) &&
set2.diff(set1).forall(stopIdxs.contains)
+ }
+
+ def tokensEqualOrSimilar(set1: Set[NCNlpSentenceToken], set2:
Set[NCNlpSentenceToken]): Boolean =
+ set1 == set2 || set1.subsetOf(set2) &&
set2.diff(set1).forall(_.isStopWord)
+
+ val refIdxNames =
+ n1.noteType match {
+ case "nlpcraft:sort" ⇒ Seq("subjindexes", "byindexes")
+ case "nlpcraft:limit" ⇒ Seq("indexes")
+ case "nlpcraft:reference" ⇒ Seq("indexes")
+
+ case _ ⇒ Seq.empty
+ }
+
+ def extract(n: NCNlpSentenceNote, refIdxName: String):
Set[NCNlpSentenceToken] =
+ n.getOrElse(refIdxName,
Collections.emptyList).asInstanceOf[java.util.List[Int]].asScala.map(sen(_)).toSet
+
+ def referencesEqualOrNearly(n1: NCNlpSentenceNote, n2:
NCNlpSentenceNote): Boolean =
+ refIdxNames.isEmpty || refIdxNames.forall(refIdxName ⇒ {
+ val refs1 = extract(n1, refIdxName)
+ val refs2 = extract(n2, refIdxName)
+
+ tokensEqualOrSimilar(refs1, refs2) ||
tokensEqualOrSimilar(refs2, refs1)
+ })
+
+ def getUniqueKey0(n: NCNlpSentenceNote): Seq[Any] = getKey(n,
withIndexes = false, withReferences = false)
+
+ getUniqueKey0(n1) == getUniqueKey0(n2) &&
+ (wordsEqualOrSimilar(n2, n1) || wordsEqualOrSimilar(n1, n2)) &&
+ (referencesEqualOrNearly(n2, n1) || referencesEqualOrNearly(n1,
n2))
+ }
+
}