This is an automated email from the ASF dual-hosted git repository. sergeykamov pushed a commit to branch NLPCRAFT-249 in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 71f7e1c0fa7ada8bcb6b2f80e33cfd36483eb180 Author: Sergey Kamov <[email protected]> AuthorDate: Mon Feb 22 16:29:22 2021 +0300 WIP. --- .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 31 ++++++- .../nlpcraft/model/jiggle/NCJiggleSpec.scala | 99 ++++++++++++++++++++++ 2 files changed, 128 insertions(+), 2 deletions(-) diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala index 28fcd21..d7ea084 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala @@ -431,10 +431,37 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { "totalJiggledPerms" → permCnt ) - val matchCnt = matches.size + // Scans the matches found for the same element with the same token length. + // Within them, for each token we drop all non-optimal combinations. + // Example: + // 1. Element's synonym - 'a b', jiggle factor 4 (default), isPermutateSynonyms 'true' (default). + // 2. Request 'a b a b'. + // Initially found 0-1, 1-2, 2-3, 0-3. + // 0-3 will be deleted because, for tokens 0 and 3, better variants were found for the same element with the same token length. + val matchesNorm = + matches. + flatMap(m ⇒ m.tokens.map(_ → m)). + groupBy { case (t, m) ⇒ (m.element.getId, m.length, t) }. + flatMap { case (_, seq) ⇒ + def perm[T](list: List[List[T]]): List[List[T]] = + list match { + case Nil ⇒ List(Nil) + case head :: tail ⇒ for (n ← head; t ← perm(tail)) yield n :: t + } + + // Picks the combination with the minimal sparsity sum for each token set of one element found with the same token count. + perm( + seq.groupBy { case (tok, _) ⇒ tok }. + map { case (_, seq) ⇒ seq.map { case (_, m) ⇒ m} .toList }.toList + ).minBy(_.map(_.sparsity).sum) + }. + toSeq. 
+ distinct + + val matchCnt = matchesNorm.size // Add notes for all remaining (non-intersecting) matches. - for ((m, idx) ← matches.zipWithIndex) { + for ((m, idx) ← matchesNorm.zipWithIndex) { if (DEEP_DEBUG) logger.trace( s"Model '${mdl.model.getId}' element found (${idx + 1} of $matchCnt) [" + diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/jiggle/NCJiggleSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/jiggle/NCJiggleSpec.scala new file mode 100644 index 0000000..24ca12c --- /dev/null +++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/jiggle/NCJiggleSpec.scala @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nlpcraft.model.jiggle + +import org.apache.nlpcraft.model.`abstract`.NCAbstractTokensModel +import org.apache.nlpcraft.model.{NCContext, NCElement, NCResult, NCToken} +import org.apache.nlpcraft.{NCTestContext, NCTestElement, NCTestEnvironment} +import org.junit.jupiter.api.Test + +import java.util +import scala.collection.JavaConverters._ +import scala.collection.mutable + +class NJiggleModel1 extends NCAbstractTokensModel { + override def getElements: util.Set[NCElement] = Set(NCTestElement("xyz", "x y z")) + + // Default values. 
+ override def isPermutateSynonyms: Boolean = true + override def getJiggleFactor: Int = 4 + + override def onContext(ctx: NCContext): NCResult = { + val variants = ctx.getVariants.asScala + + def checkOneVariant(sparsity: Int): Unit = { + require(variants.size == 1) + + val toks = variants.head.asScala.filter(_.getId == "xyz") + + require(toks.size == 3) + + checkSparsity(sparsity, toks) + } + + def checkSparsity(sparsity: Int, toks: mutable.Buffer[NCToken]): Unit = + require(toks.forall(_.getMetadata.get("nlpcraft:nlp:sparsity").asInstanceOf[Int] == sparsity)) + + def checkExists(sparsity: Int): Unit = { + require( + variants.exists(v ⇒ { + val toks = v.asScala.filter(_.getId == "xyz") + + toks.size match { + case 3 ⇒ + checkSparsity(sparsity, toks) + + true + case _ ⇒ false + } + }) + ) + } + + ctx.getRequest.getNormalizedText match { + case "x y z x y z x y z" ⇒ checkOneVariant(0) + case "x y z test x y z test x y z test" ⇒ checkOneVariant(0) + case "x test y z x test y z x y test z" ⇒ checkOneVariant(1) + case "x z y x z y x z y" ⇒ checkExists(0) + case "x z y test x z y test x z y test" ⇒ checkExists(0) + case "x test z y x test z y x test z y" ⇒ checkExists(1) + + case _ ⇒ throw new AssertionError(s"Unexpected request: ${ctx.getRequest.getNormalizedText}") + } + + NCResult.text("OK") + } + +} + +@NCTestEnvironment(model = classOf[NJiggleModel1], startClient = true) +class NCJiggleSpec1 extends NCTestContext { + @Test + def test(): Unit = { + checkResult("x y z x y z x y z", "OK") + checkResult("x y z test x y z test x y z test", "OK") + checkResult("x test y z x test y z x y test z", "OK") + + // We don't check for sparsity > 1 because of the synonym permutation logic (neighbors only). + // Such tests would not be clear. + + checkResult("x z y x z y x z y", "OK") + checkResult("x z y test x z y test x z y test", "OK") + checkResult("x test z y x test z y x test z y", "OK") + } +} \ No newline at end of file
