This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/master by this push:
new e67355f Direct synonyms priorities issue.
e67355f is described below
commit e67355f6329f6e94b27034fc79e781be662a9624
Author: Sergey Kamov <[email protected]>
AuthorDate: Tue Feb 23 17:08:08 2021 +0300
Direct synonyms priorities issue.
---
.../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 31 ++++++-
.../nlpcraft/model/jiggle/NCJiggleSpec.scala | 99 ++++++++++++++++++++++
2 files changed, 128 insertions(+), 2 deletions(-)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 28fcd21..bea4eaa 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -431,10 +431,37 @@ object NCModelEnricher extends NCProbeEnricher with
DecorateAsScala {
"totalJiggledPerms" → permCnt
)
- val matchCnt = matches.size
+ // Scans the matches grouped by element and by the number of matched tokens.
+ // Within each group, all non-optimal combinations are dropped for each token.
+ // Example:
+ // 1. Element's synonym is 'a b', jiggle factor is 4 (default), isPermutateSynonyms is 'true' (default).
+ // 2. Request is 'a b a b'.
+ // Initially found matches: 0-1, 1-2, 2-3, 0-3.
+ // Match 0-3 is dropped because better variants of the same element with the same token count were found for tokens 0 and 3.
+ val matchesNorm =
+ matches.
+ flatMap(m ⇒ m.tokens.map(_ → m)).
+ groupBy { case (t, m) ⇒ (m.element.getId, m.length, t) }.
+ flatMap { case (_, seq) ⇒
+ def perm[T](list: List[List[T]]): List[List[T]] =
+ list match {
+ case Nil ⇒ List(Nil)
+ case head :: tail ⇒ for (h ← head; t ← perm(tail))
yield h :: t
+ }
+
+ // Optimization: for each token set of one element found with the same token count, keep the combination with the minimal sparsity sum.
+ perm(
+ seq.groupBy { case (tok, _) ⇒ tok }.
+ map { case (_, seq) ⇒ seq.map { case (_, m) ⇒ m}
.toList }.toList
+ ).minBy(_.map(_.sparsity).sum)
+ }.
+ toSeq.
+ distinct
+
+ val matchCnt = matchesNorm.size
// Add notes for all remaining (non-intersecting) matches.
- for ((m, idx) ← matches.zipWithIndex) {
+ for ((m, idx) ← matchesNorm.zipWithIndex) {
if (DEEP_DEBUG)
logger.trace(
s"Model '${mdl.model.getId}' element found (${idx + 1}
of $matchCnt) [" +
diff --git
a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/jiggle/NCJiggleSpec.scala
b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/jiggle/NCJiggleSpec.scala
new file mode 100644
index 0000000..24ca12c
--- /dev/null
+++
b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/jiggle/NCJiggleSpec.scala
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.model.jiggle
+
+import org.apache.nlpcraft.model.`abstract`.NCAbstractTokensModel
+import org.apache.nlpcraft.model.{NCContext, NCElement, NCResult, NCToken}
+import org.apache.nlpcraft.{NCTestContext, NCTestElement, NCTestEnvironment}
+import org.junit.jupiter.api.Test
+
+import java.util
+import scala.collection.JavaConverters._
+import scala.collection.mutable
+
+/**
+  * Test model declaring a single element 'xyz' with the synonym 'x y z'.
+  *
+  * For every request, `onContext` validates the parsing variants: how many 'xyz' tokens
+  * were detected and which sparsity they carry. An unexpected request text fails with
+  * an `AssertionError`.
+  */
+class NJiggleModel1 extends NCAbstractTokensModel {
+    override def getElements: util.Set[NCElement] = Set(NCTestElement("xyz", "x y z"))
+
+    // Default values.
+    override def isPermutateSynonyms: Boolean = true
+    override def getJiggleFactor: Int = 4
+
+    /**
+      * Checks the variants of the request against the expectation hard-coded for its normalized text.
+      *
+      * @param ctx Query context.
+      * @return Text result "OK" if all checks passed.
+      */
+    override def onContext(ctx: NCContext): NCResult = {
+        val variants = ctx.getVariants.asScala
+
+        // Pure predicate: `true` if every given token has the expected sparsity.
+        def hasSparsity(sparsity: Int, toks: mutable.Buffer[NCToken]): Boolean =
+            toks.forall(_.getMetadata.get("nlpcraft:nlp:sparsity").asInstanceOf[Int] == sparsity)
+
+        // Fails if any of the given tokens has a sparsity other than the expected one.
+        def checkSparsity(sparsity: Int, toks: mutable.Buffer[NCToken]): Unit =
+            require(hasSparsity(sparsity, toks))
+
+        // Requires exactly one variant holding three 'xyz' tokens with the expected sparsity.
+        def checkOneVariant(sparsity: Int): Unit = {
+            require(variants.size == 1)
+
+            val toks = variants.head.asScala.filter(_.getId == "xyz")
+
+            require(toks.size == 3)
+
+            checkSparsity(sparsity, toks)
+        }
+
+        // Requires at least one variant holding three 'xyz' tokens with the expected sparsity.
+        // The sparsity test is part of the predicate (it must not throw), otherwise the first
+        // three-token variant with another sparsity would abort the search over the remaining variants.
+        def checkExists(sparsity: Int): Unit =
+            require(
+                variants.exists(v ⇒ {
+                    val toks = v.asScala.filter(_.getId == "xyz")
+
+                    toks.size == 3 && hasSparsity(sparsity, toks)
+                })
+            )
+
+        val txt = ctx.getRequest.getNormalizedText
+
+        txt match {
+            case "x y z x y z x y z" ⇒ checkOneVariant(0)
+            case "x y z test x y z test x y z test" ⇒ checkOneVariant(0)
+            case "x test y z x test y z x y test z" ⇒ checkOneVariant(1)
+            case "x z y x z y x z y" ⇒ checkExists(0)
+            case "x z y test x z y test x z y test" ⇒ checkExists(0)
+            case "x test z y x test z y x test z y" ⇒ checkExists(1)
+
+            case _ ⇒ throw new AssertionError(s"Unexpected request: $txt")
+        }
+
+        NCResult.text("OK")
+    }
+}
+
+/**
+  * Sends a fixed set of requests to [[NJiggleModel1]] and expects the "OK" result for each of them.
+  */
+@NCTestEnvironment(model = classOf[NJiggleModel1], startClient = true)
+class NCJiggleSpec1 extends NCTestContext {
+    @Test
+    def test(): Unit = {
+        // Requests with the synonym words in their declared order.
+        val ordered = Seq(
+            "x y z x y z x y z",
+            "x y z test x y z test x y z test",
+            "x test y z x test y z x y test z"
+        )
+
+        // Sparsity greater than 1 is not checked: synonym permutation works with neighbors only,
+        // so such tests would not be clear.
+
+        // Requests with the synonym words permuted.
+        val permuted = Seq(
+            "x z y x z y x z y",
+            "x z y test x z y test x z y test",
+            "x test z y x test z y x test z y"
+        )
+
+        // Same request order as before: ordered requests first, then permuted ones.
+        (ordered ++ permuted).foreach(txt ⇒ checkResult(txt, "OK"))
+    }
+}
\ No newline at end of file