This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-287
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit a349b6fe1baa5d6069f725f232b32146efb21873
Author: Sergey Kamov <[email protected]>
AuthorDate: Fri Apr 9 14:09:52 2021 +0300

    WIP.
---
 .../nlpcraft/common/nlp/NCNlpSentenceNote.scala    |  25 ++--
 .../apache/nlpcraft/model/impl/NCTokenImpl.scala   |   8 +-
 .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 144 +++++++++++----------
 .../probe/mgrs/sentence/NCSentenceManager.scala    |  15 ++-
 .../nlpcraft/model/sparse/NCSparseSpec.scala       |  15 ++-
 5 files changed, 118 insertions(+), 89 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala
index c0923ae..9adbe01 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentenceNote.scala
@@ -23,12 +23,13 @@ import org.apache.nlpcraft.common.ascii._
 import scala.collection.JavaConverters._
 import scala.collection.{Seq, Set, mutable}
 import scala.language.implicitConversions
+import java.io.{Serializable ⇒ JSerializable}
 
 /**
  * Sentence token note is a typed map of KV pairs.
  *
  */
-class NCNlpSentenceNote(private val values: Map[String, java.io.Serializable]) extends java.io.Serializable with NCAsciiLike {
+class NCNlpSentenceNote(private val values: Map[String, JSerializable]) extends JSerializable with NCAsciiLike {
     import NCNlpSentenceNote._
 
     @transient
@@ -75,7 +76,7 @@ class NCNlpSentenceNote(private val values: Map[String, java.io.Serializable]) e
     )
 
     override def clone(): NCNlpSentenceNote = {
-        val m = mutable.Map.empty[String, java.io.Serializable] ++ values
+        val m = mutable.Map.empty[String, JSerializable] ++ values
 
         new NCNlpSentenceNote(m.toMap)
     }
@@ -91,20 +92,20 @@ class NCNlpSentenceNote(private val values: Map[String, java.io.Serializable]) e
      *
      * @return
      */
-    def skipNlp(): Map[String, java.io.Serializable] =
+    def skipNlp(): Map[String, JSerializable] =
         values.filter { case (key, _) ⇒ !SKIP_CLONE.contains(key) && key != "noteType" }
 
     /**
      *
     */
-    def asMetadata(): Map[String, java.io.Serializable] =
+    def asMetadata(): Map[String, JSerializable] =
         if (isUser)
             values.get("meta") match {
-                case Some(meta) ⇒ meta.asInstanceOf[Map[String, java.io.Serializable]]
-                case None ⇒ Map.empty[String, java.io.Serializable]
+                case Some(meta) ⇒ meta.asInstanceOf[Map[String, JSerializable]]
+                case None ⇒ Map.empty[String, JSerializable]
             }
         else {
-            val md = mutable.Map.empty[String, java.io.Serializable]
+            val md = mutable.Map.empty[String, JSerializable]
 
             val m = if (noteType != "nlpcraft:nlp") skipNlp() else values
 
@@ -117,8 +118,8 @@ class NCNlpSentenceNote(private val values: Map[String, java.io.Serializable]) e
      *
      * @param kvs
     */
-    def clone(kvs : (String, java.io.Serializable)*): NCNlpSentenceNote = {
-        val m = mutable.HashMap.empty[String, java.io.Serializable] ++ values
+    def clone(kvs : (String, JSerializable)*): NCNlpSentenceNote = {
+        val m = mutable.HashMap.empty[String, JSerializable] ++ values
 
         kvs.foreach(kv ⇒ m += kv._1 → kv._2)
 
@@ -206,7 +207,7 @@ object NCNlpSentenceNote {
     /**
      * To immutable map.
     */
-    implicit def values(note: NCNlpSentenceNote): Map[String, java.io.Serializable] = note.values
+    implicit def values(note: NCNlpSentenceNote): Map[String, JSerializable] = note.values
 
     /**
      * Creates new note with given parameters.
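For reference, the clone(kvs*) change above follows a copy-with-overrides pattern: copy the immutable map into a mutable one, apply the overriding pairs, then freeze the result back into an immutable map. A minimal standalone sketch of that pattern; the object name and the main() harness are illustrative and not part of this commit:

import java.io.{Serializable ⇒ JSerializable}

import scala.collection.mutable

object CloneWithOverridesSketch {
    // Copies 'values', applies the overriding KV pairs, returns a fresh immutable map.
    def cloneWith(values: Map[String, JSerializable], kvs: (String, JSerializable)*): Map[String, JSerializable] = {
        val m = mutable.HashMap.empty[String, JSerializable] ++ values

        kvs.foreach(kv ⇒ m += kv._1 → kv._2)

        m.toMap
    }

    def main(args: Array[String]): Unit =
        // The override wins over the original value.
        println(cloneWith(Map("noteType" → "nlpcraft:nlp"), "noteType" → "nlpcraft:limit"))
}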
@@ -228,7 +229,7 @@ object NCNlpSentenceNote {
         val (sparsity, tokMinIndex, tokMaxIndex, tokWordIndexes, len) = calc(wordIndexesOpt.getOrElse(indexes))
 
         new NCNlpSentenceNote(
-            mutable.HashMap[String, java.io.Serializable]((
+            mutable.HashMap[String, JSerializable]((
                 params.filter(_._2 != null) :+
                     ("noteType" → typ) :+
                     ("tokMinIndex" → indexes.min) :+
@@ -240,7 +241,7 @@ object NCNlpSentenceNote {
                     ("wordLength" → len) :+
                     ("sparsity" → sparsity) :+
                     ("contiguous" → (sparsity == 0))
-            ).map(p ⇒ p._1 → p._2.asInstanceOf[java.io.Serializable]): _*).toMap
+            ).map(p ⇒ p._1 → p._2.asInstanceOf[JSerializable]): _*).toMap
         )
     }
 
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCTokenImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCTokenImpl.scala
index 017ead1..8c5005a 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCTokenImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/model/impl/NCTokenImpl.scala
@@ -17,7 +17,7 @@
 
 package org.apache.nlpcraft.model.impl
 
-import java.io.Serializable
+import java.io.{Serializable ⇒ JSerializable}
 import java.util.Collections
 
 import org.apache.nlpcraft.common._
@@ -50,7 +50,7 @@ private[nlpcraft] class NCTokenImpl(
     endCharIndex: Int,
     meta: Map[String, Object],
     isAbstractProp: Boolean
-) extends NCToken with Serializable {
+) extends NCToken with JSerializable {
     require(mdl != null)
     require(srvReqId != null)
     require(id != null)
@@ -106,7 +106,7 @@ private[nlpcraft] object NCTokenImpl {
         // nlpcraft:nlp and some optional (after collapsing).
         require(tok.size <= 2, s"Unexpected token [size=${tok.size}, token=$tok]")
 
-        val md = mutable.HashMap.empty[String, java.io.Serializable]
+        val md = mutable.HashMap.empty[String, JSerializable]
 
         tok.foreach(n ⇒ {
             val id = n.noteType.toLowerCase
@@ -142,7 +142,7 @@ private[nlpcraft] object NCTokenImpl {
         // Special synthetic meta data element.
         md.put("nlpcraft:nlp:freeword", false)
 
-        elm.getMetadata.asScala.foreach { case (k, v) ⇒ md.put(k, v.asInstanceOf[java.io.Serializable]) }
+        elm.getMetadata.asScala.foreach { case (k, v) ⇒ md.put(k, v.asInstanceOf[JSerializable]) }
 
         new NCTokenImpl(
             mdl.model,
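A side note on the recurring import change: aliasing java.io.Serializable as JSerializable avoids any clash with scala.Serializable and shortens the many repeated type annotations. A tiny sketch of the rename-import idiom; the object name is illustrative:

import java.io.{Serializable ⇒ JSerializable}

import scala.collection.mutable

object RenameImportSketch {
    def main(args: Array[String]): Unit = {
        // JSerializable now unambiguously means java.io.Serializable.
        val md = mutable.HashMap.empty[String, JSerializable]

        md.put("sparsity", Int.box(0)) // Boxed primitives are java.io.Serializable.

        println(md)
    }
}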
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 0ec40cd..d668c02 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -130,13 +130,21 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
 
     case class ComplexHolder(complexesWords: Seq[Complex], complexes: Seq[ComplexSeq])
 
-    // Found-by-synonym model element.
+    /**
+     * Found-by-synonym model element.
+     *
+     * @param element Element.
+     * @param tokens Element tokens.
+     * @param synonym Synonym.
+     * @param parts Parts for DSL synonyms.
+     * @param allToksIdxs All token indexes (the whole token slice; meaningful for sparse tokens).
+     */
     case class ElementMatch(
         element: NCElement,
         tokens: Seq[NlpToken],
         synonym: Synonym,
         parts: Seq[TokType],
-        tokIdxs: Seq[Int]
+        allToksIdxs: Seq[Int]
     ) extends Ordered[ElementMatch] {
         // Tokens sparsity.
         lazy val sparsity: Int = U.calcSparsity(tokens.map(_.index))
@@ -206,7 +214,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
      * @param syn
      * @param metaOpt
      * @param parts
-     * @param toksIdxs
+     * @param allToksIdxs
      */
     private def mark(
         ns: NCNlpSentence,
@@ -216,16 +224,15 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
         syn: Option[Synonym],
         metaOpt: Option[Map[String, Object]],
         parts: Seq[TokType],
-        toksIdxs: Seq[Int]
+        allToksIdxs: Seq[Int]
     ): Unit = {
         val params = mutable.ArrayBuffer.empty[(String, AnyRef)]
 
         // For system elements.
         params += "direct" → direct.asInstanceOf[AnyRef]
 
-        val toksIdxsJava: JList[Int] = toksIdxs.asJava
-
-        params += "allToksIndexes" → toksIdxsJava
+        // Internal usage.
+        params += "allToksIndexes" → allToksIdxs.asJava
 
         syn match {
             case Some(s) ⇒
@@ -334,6 +341,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                     ).getOrElse(throw new AssertionError(s"Custom model parser returned an invalid custom token: $w"))
                 )
 
+                // Checks element's tokens.
                 if (!alreadyMarked(matchedToks, elemId))
                     mark(
                         ns,
@@ -379,17 +387,15 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
      * @param toks
      * @param elemId
      */
-    private def alreadyMarked(toks: Seq[NlpToken], elemId: String): Boolean =
-        toks.forall(_.isTypeOf(elemId)) ||
-        toks.flatten.exists(n ⇒
-            n.noteType == elemId &&
-            (
-                n.dataOpt("allToksIndexes").asInstanceOf[Option[JList[Int]]] match {
-                    case Some(idxs) ⇒ idxs.asScala.containsSlice(toks.map(_.index))
-                    case None ⇒ false
-                }
-            )
-        )
+    private def alreadyMarked(toks: Seq[NlpToken], elemId: String): Boolean = {
+        def hasIndex(n: NCNlpSentenceNote): Boolean =
+            n.dataOpt("allToksIndexes").asInstanceOf[Option[JList[Int]]] match {
+                case Some(idxs) ⇒ idxs.asScala.containsSlice(toks.map(_.index))
+                case None ⇒ false
+            }
+
+        toks.flatten.exists(n ⇒ n.noteType == elemId && hasIndex(n))
+    }
 
     /**
      *
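The rewritten alreadyMarked() above now treats an element as already marked only when an existing note of the same type covers the candidate token indexes as a contiguous sub-slice of its "allToksIndexes" payload (the old toks.forall(_.isTypeOf(elemId)) shortcut is gone). A minimal sketch of the containsSlice idea, using a hypothetical Note stand-in instead of the real NCNlpSentenceNote:

object AlreadyMarkedSketch {
    // Hypothetical stand-in: a note's element type plus its "allToksIndexes" payload.
    final case class Note(noteType: String, allToksIndexes: Seq[Int])

    def alreadyMarked(notes: Seq[Note], elemId: String, tokIdxs: Seq[Int]): Boolean =
        notes.exists(n ⇒ n.noteType == elemId && n.allToksIndexes.containsSlice(tokIdxs))

    def main(args: Array[String]): Unit = {
        val notes = Seq(Note("x:elem", Seq(1, 2, 3, 4)))

        println(alreadyMarked(notes, "x:elem", Seq(2, 3))) // true: contiguous sub-slice of 1..4.
        println(alreadyMarked(notes, "x:elem", Seq(2, 4))) // false: not a contiguous slice.
    }
}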
@@ -519,39 +525,6 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
         }).seq
     }
 
-    /**
-     *
-     * @param ns
-     * @param mdlId
-     * @param matches
-     */
-    private def processMatches(ns: NCNlpSentence, mdlId: String, matches: Seq[ElementMatch]): Unit = {
-        // TODO: matchesNorm
-        // Add notes for all remaining (non-intersecting) matches.
-        for ((m, idx) ← matches.zipWithIndex) {
-            if (DEEP_DEBUG)
-                logger.trace(
-                    s"Model '$mdlId' element found (${idx + 1} of ${matches.size}) [" +
-                    s"elementId=${m.element.getId}, " +
-                    s"synonym=${m.synonym}, " +
-                    s"tokens=${tokString(m.tokens)}" +
-                    s"]"
-                )
-
-            val tokIdxs = m.tokens.map(_.index)
-            val direct = m.synonym.isDirect && (tokIdxs == tokIdxs.sorted)
-
-            // TODO:
-            if (!alreadyMarked(m.tokens, m.element.getId)) {
-                mark(ns, m.element, m.tokens, direct, syn = Some(m.synonym), metaOpt = None, m.parts, m.tokIdxs)
-
-                println(s"SET: ${m.element.getId}, m.tokens=${m.tokens.map(_.origText).mkString("|")}")
-            }
-            else
-                println(s"NOT SET: ${m.element.getId}, m.tokens=${m.tokens.map(_.origText).mkString("|")}")
-        }
-    }
-
     @throws[NCE]
     override def enrich(mdl: NCProbeModel, ns: NCNlpSentence, senMeta: Map[String, Serializable], parent: Span = null): Unit = {
         require(isStarted)
@@ -561,7 +534,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
         startScopedSpan("enrich", parent, "srvReqId" → srvReqId, "mdlId" → mdlId, "txt" → ns.text) { span ⇒
             val req = NCRequestImpl(senMeta, srvReqId)
 
-            val h = mkComplexes(mdl, ns)
+            lazy val h = mkComplexes(mdl, ns)
 
             startScopedSpan("synsProc", span, "srvReqId" → srvReqId, "mdlId" → mdlId, "txt" → ns.text) { _ ⇒
                 var state = if (ns.firstProbePhase) SIMPLE else DSL_NEXT
@@ -571,9 +544,6 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                 val combosToks = combos(ns)
 
                 def go(): Unit = {
-                    println
-                    println(s"GO $state")
-
                     val matches = mutable.ArrayBuffer.empty[ElementMatch]
 
                     val cacheSparse = mkCache(mdl)
@@ -582,22 +552,30 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
 
                     var found = false
 
-                    def add(typ: String, elm: NCElement, cache: Cache, res: Seq[NlpToken], tokIdxs: Seq[Int], s: Synonym, parts: Seq[TokType] = Seq.empty): Unit = {
+                    def add(typ: String, elm: NCElement, cache: Cache, res: Seq[NlpToken], allToksIdxs: Seq[Int], s: Synonym, parts: Seq[TokType] = Seq.empty): Unit = {
                         var added = false
 
                         if (!matchExist(elm.getId, res)) {
-                            matches += ElementMatch(elm, res, s, parts, tokIdxs)
+                            matches += ElementMatch(elm, res, s, parts, allToksIdxs)
                             added = true
                         }
 
-                        cache(elm.getId) += tokIdxs
+                        cache(elm.getId) += allToksIdxs
                         found = true
 
-                        println(s"ADDED: ${elm.getId}, type=$typ, res=${res.map(_.origText).mkString("|")}, toks=${tokIdxs.mkString("|")}, added=$added")
+                        if (DEEP_DEBUG)
+                            logger.trace(
+                                s"Found element [" +
+                                s"id=${elm.getId}, " +
+                                s"type=$typ, " +
+                                s"indexes=${res.map(_.index).mkString("|")}, " +
+                                s"allTokensIndexes=${allToksIdxs.mkString("|")}, " +
+                                s"added=$added" +
+                                s"]"
+                            )
                     }
 
-                    // TODO:
                     def matchExist(elemId: String, toks: Seq[NlpToken]): Boolean =
                         matches.exists(m ⇒ m.element.getId == elemId && toks.toSet.subsetOf(m.tokensSet))
 
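The matchExist() guard above deduplicates candidates by set inclusion: a new match for an element is dropped when an existing match for the same element already covers all of its tokens. A small sketch of that check, with hypothetical types standing in for NlpToken and ElementMatch:

object MatchExistSketch {
    // Hypothetical stand-in for ElementMatch: element id plus covered token indexes.
    final case class Match(elemId: String, tokens: Set[Int])

    def matchExist(matches: Seq[Match], elemId: String, toks: Seq[Int]): Boolean =
        matches.exists(m ⇒ m.elemId == elemId && toks.toSet.subsetOf(m.tokens))

    def main(args: Array[String]): Unit = {
        val matches = Seq(Match("x:elem", Set(0, 1, 2)))

        println(matchExist(matches, "x:elem", Seq(1, 2))) // true: already covered.
        println(matchExist(matches, "x:elem", Seq(2, 3))) // false: token 3 is not covered.
    }
}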
@@ -607,15 +585,16 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                         lazy val tokStems = toks.map(_.stem).mkString(" ")
 
                         // Attempt to match each element.
-                        // TODO: alreadyMarked - the same match may be found again but marked on fewer tokens (how can we skip such candidates right away?)
                         for (
                             elm ← mdl.elements.values;
                             elemId = elm.getId;
                             dirProc = cacheDirect(elemId).exists(_.containsSlice(tokIdxs));
                             sparseProc = cacheSparse(elemId).exists(_.containsSlice(tokIdxs))
-                            if (!dirProc || !sparseProc) && !alreadyMarked(toks, elemId) && !matchExist(elemId, toks)
+                            if
+                                (!dirProc || !sparseProc) &&
+                                // Checks whole tokens slice.
+                                !alreadyMarked(toks, elemId) && !matchExist(elemId, toks)
                         ) {
-                            //println(s"State=$elemId, dirProc=$dirProc, sparseProc=$sparseProc, cacheSparse(elemId)=" + cacheSparse(elemId).mkString("|"))
                             // 1. SIMPLE.
                             found = false
@@ -662,9 +641,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                             }
 
                             // 2. DSL.
-                            val dslEnabled = state != SIMPLE
-
-                            if (dslEnabled && mdl.synonymsDsl.nonEmpty) {
+                            if (state != SIMPLE && mdl.synonymsDsl.nonEmpty) {
                                 found = false
 
                                 // 2.1 Sparse.
@@ -691,9 +668,42 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                         }
                     }
 
-                    processMatches(ns, mdlId, matches)
+                    for ((m, idx) ← matches.zipWithIndex) {
+                        if (DEEP_DEBUG)
+                            logger.trace(
+                                s"Model '$mdlId' element found (${idx + 1} of ${matches.size}) [" +
+                                s"elementId=${m.element.getId}, " +
+                                s"synonym=${m.synonym}, " +
+                                s"tokens=${tokString(m.tokens)}" +
+                                s"]"
+                            )
+
+                        val tokIdxs = m.tokens.map(_.index)
+                        val direct = m.synonym.isDirect && !tokIdxs.zip(tokIdxs.tail).exists { case (x, y) ⇒ x > y }
+
+                        var added = false
+
+                        // Checks element's tokens.
+                        if (!alreadyMarked(m.tokens, m.element.getId)) {
+                            mark(ns, m.element, m.tokens, direct, syn = Some(m.synonym), metaOpt = None, m.parts, m.allToksIdxs)
+
+                            added = true
+                        }
+
+                        if (DEEP_DEBUG)
+                            logger.trace(
+                                s"Element ${if (added) "added" else "skipped"} [" +
+                                s"id=${m.element.getId}, " +
+                                s"indexes=${m.tokens.map(_.index).mkString("|")}, " +
+                                s"allTokensIndexes=${m.allToksIdxs.mkString("|")}" +
+                                s"]"
+                            )
+                    }
                 }
 
+                if (DEEP_DEBUG)
+                    logger.trace(s"Execution started with state: $state")
+
                 go()
 
                 if (state == SIMPLE) {
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index 2776677..541966a 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -733,9 +733,22 @@ object NCSentenceManager extends NCService {
             )
         )
 
+        def notNlpNotes(s: NCNlpSentence): Seq[NCNlpSentenceNote] = s.flatten.filter(!_.isNlp)
+
+        // Drops similar sentences (with the same notes structure). Keeps the ones with more notes found.
+        sens = sens.groupBy(notNlpNotes(_).groupBy(_.noteType).keys.toSeq.sorted.distinct).
+            flatMap(p ⇒ {
+                val m: Map[NCNlpSentence, Int] = p._2.map(p ⇒ p → notNlpNotes(p).size).toMap
+
+                val max = m.values.max
+
+                m.filter(_._2 == max).keys
+            }).
+            toSeq
+
         // Drops similar sentences (with same tokens structure).
         // Among similar sentences we prefer one with minimal free words count.
-        sens.groupBy(_.flatten.filter(!_.isNlp).map(_.getKey(withIndexes = false))).
+        sens.groupBy(notNlpNotes(_).map(_.getKey(withIndexes = false))).
             map { case (_, seq) ⇒ seq.minBy(_.filter(p ⇒ p.isNlp && !p.isStopWord).map(_.wordIndexes.length).sum) }.
             toSeq
     }
 
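The new de-duplication step above groups sentences by their non-NLP note-type signature and, within each group, keeps only the sentences carrying the maximal number of non-NLP notes. A compact sketch of that keep-the-most-enriched selection, with plain strings standing in for notes; all names here are illustrative:

object KeepMostEnrichedSketch {
    type Sentence = Seq[String] // Hypothetical: a sentence reduced to its note types.

    def dropSimilar(sens: Seq[Sentence]): Seq[Sentence] =
        sens.groupBy(_.distinct.sorted).            // Same note-type structure.
            flatMap { case (_, group) ⇒
                val max = group.map(_.size).max     // The most notes found wins.

                group.filter(_.size == max)
            }.toSeq

    def main(args: Array[String]): Unit = {
        val sens = Seq(Seq("a:elem"), Seq("a:elem", "a:elem"), Seq("b:elem"))

        // Keeps the richer two-note "a:elem" sentence and the lone "b:elem" one.
        println(dropSimilar(sens))
    }
}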
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/sparse/NCSparseSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/sparse/NCSparseSpec.scala
index 37df085..8441532 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/sparse/NCSparseSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/sparse/NCSparseSpec.scala
@@ -36,17 +36,20 @@ class NCSparseModel extends NCAbstractTokensModel {
         val variants = ctx.getVariants.asScala
 
         def checkOneVariant(sparsity: Int): Unit = {
-            require(variants.size == 1)
+            require(variants.size == 1, "There should be a single variant.")
 
             val toks = variants.head.asScala.filter(_.getId == "xyz")
 
-            require(toks.size == 3)
+            require(toks.size == 3, "There should be 3 `xyz` tokens.")
 
             checkSparsity(sparsity, toks)
         }
 
         def checkSparsity(sparsity: Int, toks: mutable.Buffer[NCToken]): Unit =
-            require(toks.forall(_.getMetadata.get("nlpcraft:nlp:sparsity").asInstanceOf[Int] == sparsity))
+            require(
+                toks.forall(_.getMetadata.get("nlpcraft:nlp:sparsity").asInstanceOf[Int] == sparsity),
+                s"Sparsity of each token should be: $sparsity."
+            )
 
         def checkExists(sparsity: Int): Unit =
             require(
@@ -58,9 +61,11 @@ class NCSparseModel extends NCAbstractTokensModel {
                         checkSparsity(sparsity, toks)
 
                         true
-                    case _ ⇒ false
+                    case _ ⇒
+                        false
                 }
-            })
+                }),
+                s"A variant with 3 `xyz` tokens should exist."
             )
 
         ctx.getRequest.getNormalizedText match {
