This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-287
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

commit facba336dd6c9af04654e2cde381ffe085b107e7
Author: Sergey Kamov <[email protected]>
AuthorDate: Wed Apr 14 18:14:31 2021 +0300

    WIP.
---
 .../apache/nlpcraft/probe/mgrs/NCProbeModel.scala  |   1 +
 .../nlpcraft/probe/mgrs/model/NCModelManager.scala |   8 -
 .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 193 ++++++++++-----------
 3 files changed, 95 insertions(+), 107 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
index a4e55a6..2670fb7 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
@@ -50,4 +50,5 @@ case class NCProbeModel(
 ) {
     def hasIdlSynonyms(elemId: String): Boolean = idlSynonyms.contains(elemId)
     def hasIdlSynonyms: Boolean = idlSynonyms.nonEmpty
+    def hasNoIdlSynonyms: Boolean = continuousSynonyms.nonEmpty || sparseSynonyms.nonEmpty
 }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
index 2b8313c..3bd052c 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/model/NCModelManager.scala
@@ -58,14 +58,6 @@ object NCModelManager extends NCService with DecorateAsScala {
         data.values.foreach(w ⇒ {
             val mdl = w.model
 
-            // TODO:
-            val elemId = "col:orders_order_date"
-
-            println("w.directSynonyms="+w.continuousSynonyms.getOrElse(elemId, Map.empty).mkString("\n"))
-            println("w.sparseSynonyms="+w.sparseSynonyms.getOrElse(elemId, Seq.empty).mkString("\n"))
-            println("w.idlSynonyms="+w.idlSynonyms.getOrElse(elemId, Seq.empty).mkString("\n"))
-            println
-
             val contCnt = w.continuousSynonyms.flatMap(_._2.map(_._2.count)).sum
             val sparseCnt = w.sparseSynonyms.map(_._2.size).sum
             val allIdlSyns = w.idlSynonyms.values.flatten
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index f01619c..79bcf3f 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -147,6 +147,8 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
         ackStopped()
     }
 
+    def isComplex(mdl: NCProbeModel): Boolean = mdl.hasIdlSynonyms || !mdl.model.getParsers.isEmpty
+
     /**
      *
      * @param ns
@@ -156,7 +158,6 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
      * @param syn
      * @param metaOpt
      * @param parts
-     * @param allToksIdxs
      * @param continuous
      */
     private def mark(
@@ -164,11 +165,10 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
         elem: NCElement,
         toks: Seq[NlpToken],
         direct: Boolean,
-        syn: Option[Synonym],
-        metaOpt: Option[Map[String, Object]],
-        parts: Seq[TokType],
-        allToksIdxs: Seq[Int],
-        continuous: java.lang.Boolean
+        continuous: java.lang.Boolean,
+        syn: Option[Synonym] = None,
+        parts: Seq[TokType] = Seq.empty,
+        metaOpt: Option[Map[String, Object]] = None
     ): Unit = {
         val params = mutable.ArrayBuffer.empty[(String, AnyRef)]
 
@@ -176,7 +176,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
         params += "direct" → direct.asInstanceOf[AnyRef]
 
         // Internal usage.
-        params += "allToksIndexes" → allToksIdxs.asJava
+        params += "sortedTokensIndexes" → toks.map(_.index).sorted.asJava
         params += "continuous" → continuous
 
         syn match {
@@ -287,20 +287,16 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                 )
 
                 // Checks element's tokens.
-                val idxs = matchedToks.map(_.index)
-                val continuous = U.isContinuous(idxs.sorted)
+                val idxs = matchedToks.map(_.index).sorted
 
-                if (!alreadyMarked(matchedToks, idxs, continuous, elemId))
+                if (!alreadyMarked(ns, elemId, matchedToks, idxs))
                     mark(
                         ns,
                         elem = mdl.elements.getOrElse(elemId, throw new NCE(s"Custom model parser returned unknown element ID: $elemId")),
                         toks = matchedToks,
                         direct = true,
-                        syn = None,
-                        metaOpt = Some(e.getMetadata.asScala),
-                        parts = Seq.empty,
-                        idxs,
-                        continuous
+                        U.isContinuous(idxs),
+                        metaOpt = Some(e.getMetadata.asScala)
                     )
             })
         }
@@ -333,39 +329,6 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
 
     /**
      *
-     * @param toks
-     * @param elemId
-     */
-    private def alreadyMarked(toks: Seq[NlpToken], allToksIndexes: Seq[Int], continuous: Boolean, elemId: String): Boolean = {
-        toks.forall(t ⇒ t.isTypeOf(elemId)) &&
-        toks.head.filter(_.noteType == elemId).exists(n ⇒ n.tokenIndexes.toSet == toks.map(_.index).toSet)
-        toks.flatten.exists(n ⇒
-            n.noteType == elemId && {
-                if (n.data("continuous").asInstanceOf[Boolean])
-                    false
-                else {
-                    if (continuous)
-                        false
-                    else
-                        n.data("allToksIndexes").asInstanceOf[JList[Int]].asScala.containsSlice(allToksIndexes)
-                }
-            }
-        )
-    }
-
-    /**
-     *
-     * @param seq
-     * @param s
-     */
-    private def toPartsComplex(seq: Seq[Complex], s: Synonym): Seq[TokType] =
-        seq.zip(s.map(_.kind)).flatMap {
-            case (complex, kind) ⇒ if (complex.isToken) Some(complex.token → kind)
-            else None
-        }
-
-    /**
-     *
      * @param seq
      * @param s
      */
@@ -468,6 +431,46 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
         }).seq
     }
 
+    private def add(
+        ns: NCNlpSentence,
+        contCache: mutable.Map[String, ArrayBuffer[Seq[Int]]],
+        typ: String,
+        elm: NCElement,
+        res: Seq[NlpToken],
+        allToksIdxs: Seq[Int],
+        syn: Synonym,
+        parts: Seq[TokType] = Seq.empty)
+    : Unit = {
+        val resIdxs = res.map(_.index)
+
+        val continuous = U.isContinuous(resIdxs.sorted)
+
+        if (continuous && resIdxs == allToksIdxs)
+            contCache(elm.getId) += allToksIdxs
+
+        val ok = !alreadyMarked(ns, elm.getId, res, allToksIdxs)
+
+        if (ok) {
+            val direct = syn.isDirect && U.isIncreased(resIdxs)
+
+            mark(ns, elm, res, direct, continuous, syn = Some(syn), parts)
+        }
+
+        if (DEEP_DEBUG && ok)
+            println(
+                s"${if (ok) "Added" else "Skipped"} element [" +
+                    s"id=${elm.getId}, " +
+                    s"type=$typ, " +
+                    s"text='${res.map(_.origText).mkString(" ")}', " +
+                    s"indexes=${resIdxs.mkString("[", ",", "]")}, " +
+                    s"allTokensIndexes=${allToksIdxs.mkString("[", ",", "]")}, " +
+                    s"continuous=$continuous, " +
+                    s"synonym=$syn" +
+                    s"]"
+            )
+    }
+
+
     /**
      *
      * @param mdl
@@ -493,41 +496,9 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
         if (DEEP_DEBUG)
            println(s"Execution started [simpleEnabled=$simpleEnabled, idlEnabled=$idlEnabled]")
-        val contCache =
-            mutable.HashMap.empty[String, mutable.ArrayBuffer[Seq[Int]]] ++
-                mdl.elements.keys.map(k ⇒ k → mutable.ArrayBuffer.empty[Seq[Int]])
+        val contCache = mutable.HashMap.empty ++ mdl.elements.keys.map(k ⇒ k → mutable.ArrayBuffer.empty[Seq[Int]])
 
         lazy val idlCache = mutable.HashSet.empty[Seq[Complex]]
 
-        def add(typ: String, elm: NCElement, res: Seq[NlpToken], allToksIdxs: Seq[Int], s: Synonym, parts: Seq[TokType] = Seq.empty): Unit = {
-            val resIdxs = res.map(_.index)
-
-            val continuous = U.isContinuous(resIdxs.sorted)
-
-            if (continuous && resIdxs == allToksIdxs)
-                contCache(elm.getId) += allToksIdxs
-
-            val added = !alreadyMarked(res, allToksIdxs, continuous, elm.getId)
-
-            if (added) {
-                val direct = s.isDirect && U.isIncreased(resIdxs)
-
-                mark(ns, elm, res, direct, syn = Some(s), metaOpt = None, parts, allToksIdxs, continuous)
-            }
-
-            if (DEEP_DEBUG)
-                println(
-                    s"${if (added) "Added" else "Skipped"} element [" +
-                        s"id=${elm.getId}, " +
-                        s"type=$typ, " +
-                        s"text='${res.map(_.origText).mkString(" ")}', " +
-                        s"indexes=${resIdxs.mkString("[", ",", "]")}, " +
-                        s"allTokensIndexes=${allToksIdxs.mkString("[", ",", "]")}, " +
-                        s"continuous=$continuous, " +
-                        s"synonym=$s" +
-                        s"]"
-                )
-        }
-
         for (toks ← combosToks) {
             val tokIdxs = toks.map(_.index)
             lazy val tokStems = toks.map(_.stem).mkString(" ")
@@ -537,8 +508,8 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                 elm ← mdl.elements.values;
                 elemId = elm.getId
                 if
-                    !contCache(elemId).exists(_.containsSlice(tokIdxs)) &&
-                    !alreadyMarked(toks, tokIdxs, continuous = true, elemId) // Checks whole tokens slice.
+                    !contCache(elemId).exists(_.containsSlice(tokIdxs)) &&
+                    !alreadyMarked(ns, elemId, toks, tokIdxs)
             ) {
                 // 1. SIMPLE.
                 if (simpleEnabled && (if (idlEnabled) mdl.hasIdlSynonyms(elemId) else !mdl.hasIdlSynonyms(elemId))) {
@@ -551,7 +522,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                             syns.get(tokStems) match {
                                 case Some(s) ⇒
                                     found = true
-                                    add("simple continuous", elm, toks, tokIdxs, s)
+                                    add(ns, contCache,"simple continuous", elm, toks, tokIdxs, s)
                                 case None ⇒ notFound()
                             }
 
@@ -559,7 +530,7 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                         for (s ← syns if !found)
                             if (s.isMatch(toks)) {
                                 found = true
-                                add("simple continuous scan", elm, toks, tokIdxs, s)
+                                add(ns, contCache, "simple continuous scan", elm, toks, tokIdxs, s)
                             }
 
                     tryMap(
@@ -575,23 +546,24 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
                         }
 
                     // 1.2 Sparse.
-                    for (s ← get(mdl.sparseSynonyms, elemId))
-                        s.sparseMatch(toks) match {
-                            case Some(res) ⇒ add("simple sparse", elm, res, tokIdxs, s)
-                            case None ⇒ // No-op.
-                        }
+                    if (!found)
+                        for (s ← get(mdl.sparseSynonyms, elemId))
+                            s.sparseMatch(toks) match {
+                                case Some(res) ⇒ add(ns, contCache, "simple sparse", elm, res, tokIdxs, s)
+                                case None ⇒ // No-op.
+                            }
                 }
 
                 // 2. IDL.
                 if (idlEnabled) {
-                    val idlCombs = mkComplexCombinations(ch, toks, idlCache.toSet)
+                    lazy val idlCombs = mkComplexCombinations(ch, toks, idlCache.toSet)
 
                     for (s ← get(mdl.idlSynonyms, elemId); comb ← idlCombs)
                         s.idlMatch(comb.map(_.data), req) match {
                             case Some(res) ⇒
                                 val typ = if (s.sparse) "IDL sparse" else "IDL continuous"
 
-                                add(typ, elm, toTokens(res, ns), tokIdxs, s, toParts(res, s))
+                                add(ns, contCache, typ, elm, toTokens(res, ns), tokIdxs, s, toParts(res, s))
 
                                 idlCache += comb
                             case None ⇒ // No-op.
@@ -611,22 +583,45 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala {
 
             lazy val h = mkComplexes(mdl, ns)
 
-            val idlEnabled = mdl.hasIdlSynonyms
-
             if (ns.firstProbePhase) {
                 ns.firstProbePhase = false
 
-                execute(mdl, ns, combToks, simpleEnabled = true, idlEnabled = false, req, h, parent)
-                execute(mdl, ns, combToks, simpleEnabled = true, idlEnabled, req, h, parent)
+                if (mdl.hasNoIdlSynonyms)
+                    execute(mdl, ns, combToks, simpleEnabled = true, idlEnabled = false, req, h, parent)
+                execute(mdl, ns, combToks, simpleEnabled = mdl.hasNoIdlSynonyms, mdl.hasIdlSynonyms, req, h, parent)
             }
             else {
-                if (idlEnabled)
-                    execute(mdl, ns, combToks, simpleEnabled = false, idlEnabled, req, h, parent)
+                if (mdl.hasIdlSynonyms)
+                    execute(mdl, ns, combToks, simpleEnabled = false, idlEnabled = true, req, h, parent)
             }
 
             processParsers(mdl, ns, span, req)
         }
     }
 
-    def isComplex(mdl: NCProbeModel): Boolean = mdl.hasIdlSynonyms || !mdl.model.getParsers.isEmpty
+    /**
+     *
+     * @param ns
+     * @param elemId
+     * @param toks
+     * @param allSortedSliceIdxs
+     */
+    private def alreadyMarked(ns: NCNlpSentence, elemId: String, toks: Seq[NlpToken], allSortedSliceIdxs: Seq[Int]): Boolean = {
+        lazy val toksIdxsSorted = toks.map(_.index).sorted
+        lazy val isCont = U.isContinuous(toksIdxsSorted)
+
+        ns.flatten.exists(
+            n ⇒
+                n.noteType == elemId &&
+                {
+                    lazy val nToksIdxsSorted = n.data[JList[Int]]("sortedTokensIndexes").asScala
+
+                    n.data[Boolean]("continuous") && allSortedSliceIdxs.containsSlice(nToksIdxsSorted) ||
+                    {
+                        nToksIdxsSorted == toksIdxsSorted ||
+                        isCont && U.isContinuous(nToksIdxsSorted) && nToksIdxsSorted.containsSlice(toksIdxsSorted)
+                    }
+                }
+        )
+    }
 }
\ No newline at end of file
