This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-443 by this push:
new 308b097 WIP.
308b097 is described below
commit 308b09761856852938c28f3de3f332b5df4a7e2c
Author: Sergey Kamov <[email protected]>
AuthorDate: Sun Sep 19 12:26:18 2021 +0300
WIP.
---
.../apache/nlpcraft/common/nlp/NCNlpSentence.scala | 7 ++--
.../probe/mgrs/sentence/NCSentenceManager.scala | 38 ++++++++++++----------
2 files changed, 25 insertions(+), 20 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
index f508745..478d930 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala
@@ -101,10 +101,11 @@ class NCNlpSentence(
override def equals(obj: Any): Boolean = obj match {
case x: NCNlpSentence =>
+ tokens.size == x.tokens.size &&
tokens == x.tokens &&
- srvReqId == x.srvReqId &&
- text == x.text &&
- enabledBuiltInToks == x.enabledBuiltInToks
+ srvReqId == x.srvReqId &&
+ text == x.text &&
+ enabledBuiltInToks == x.enabledBuiltInToks
case _ => false
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index c842825..0c0288d 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -487,8 +487,9 @@ object NCSentenceManager extends NCService {
*
* @param ns Sentence.
* @param notNlpTypes Token types.
+ * @param lastPhase Phase.
*/
- private def collapseSentence(ns: NCNlpSentence, notNlpTypes: Seq[String]): Boolean = {
+ private def collapseSentence(ns: NCNlpSentence, notNlpTypes: Seq[String], lastPhase: Boolean): Boolean = {
ns.
filter(!_.isNlp).
filter(_.isStopWord).
@@ -523,7 +524,8 @@ object NCSentenceManager extends NCService {
fixIndexesReferencesList("nlpcraft:sort", "subjindexes", "subjnotes", ns, histSeq) &&
fixIndexesReferencesList("nlpcraft:sort", "byindexes", "bynotes", ns, histSeq)
- if (res) {
+ // On last phase - just for performance reasons.
+ if (res && lastPhase) {
// Validation (all indexes calculated well)
require(
!res ||
@@ -603,7 +605,7 @@ object NCSentenceManager extends NCService {
if (lastPhase)
dropAbstract(mdl, ns)
- if (collapseSentence(ns, getNotNlpNotes(ns.toSeq).map(_.noteType).distinct)) Some(ns) else None
+ if (collapseSentence(ns, getNotNlpNotes(ns.toSeq).map(_.noteType).distinct, lastPhase)) Some(ns) else None
}
// Always deletes `similar` notes.
@@ -752,14 +754,16 @@ object NCSentenceManager extends NCService {
}
}.toSeq
- sens =
- sens.filter(s => {
- def mkNotNlp(s: NCNlpSentence): Set[NCNlpSentenceNote] = s.flatten.filter(!_.isNlp).toSet
+ var sensWithNotes = sens.map(s => s -> s.flatten.filter(!_.isNlp).toSet)
- val notNlpNotes = mkNotNlp(s)
+ var sensWithNotesIdxs = sensWithNotes.zipWithIndex
- !sens.filter(_ != s).map(mkNotNlp).exists(notNlpNotes.subsetOf)
- })
+ sens =
+ sensWithNotesIdxs.filter { case ((_, notNlpNotes1), idx1) =>
+ !sensWithNotesIdxs.
+ filter { case (_, idx2) => idx2 != idx1 }.
+ exists { case((_, notNlpNotes2), _) => notNlpNotes1.subsetOf(notNlpNotes2) }
+ }.map { case ((sen, _), _) => sen }
// Drops similar sentences (with same tokens structure).
// Among similar sentences we prefer one with minimal free words count.
@@ -769,17 +773,17 @@ object NCSentenceManager extends NCService {
// Drops sentences if they are just subset of another.
// (Maybe better for lastPhase?)
- sens = sens.filter(s1 => {
- val notes1 = s1.tokens.flatten.distinct.filter(!_.isNlp)
+ sensWithNotes = sensWithNotes.filter { case (sen, _) => sens.contains(sen) }
- !sens.exists(s2 =>
- s1 != s2 && {
- val notes2 = s2.tokens.flatten.distinct.filter(!_.isNlp)
+ sensWithNotesIdxs = sensWithNotes.zipWithIndex
- notes2.size > notes1.size && notes1.forall(t1 => notes2.exists(_.equalsWithoutIndexes(t1)))
+ sens = sensWithNotesIdxs.filter { case ((s1, notNlpNotes1), idx1) =>
+ !sensWithNotesIdxs.exists { case ((s2, notNlpNotes2), idx2) =>
+ idx1 != idx2 && {
+ notNlpNotes2.size > notNlpNotes1.size && notNlpNotes1.forall(t1 => notNlpNotes2.exists(_.equalsWithoutIndexes(t1)))
}
- )
- })
+ }
+ }.map { case ((sen, _), _) => sen }
sens
}