This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/master by this push:
new 70fad8d Part tokens detection logic fixes.
70fad8d is described below
commit 70fad8d21ea2091defe42242f3484be83132f12f
Author: Sergey Kamov <[email protected]>
AuthorDate: Fri Jun 25 09:50:38 2021 +0300
Part tokens detection logic fixes.
---
.../nlpcraft/probe/mgrs/NCProbeVariants.scala | 46 +++----
.../nlpcraft/probe/mgrs/NCTokenPartKey.scala | 134 +++++++++++++++++++++
.../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 26 +---
.../probe/mgrs/sentence/NCSentenceManager.scala | 30 ++---
.../abstract/NCAbstractTokensVariantsSpec.scala | 3 +-
5 files changed, 167 insertions(+), 72 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
index f3122b3..bcf2c9c 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
@@ -20,8 +20,8 @@ package org.apache.nlpcraft.probe.mgrs
import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
import org.apache.nlpcraft.common.nlp.{NCNlpSentence => NlpSentence, NCNlpSentenceNote => NlpNote, NCNlpSentenceToken => NlpToken}
import org.apache.nlpcraft.common.{NCE, TOK_META_ALIASES_KEY}
+import org.apache.nlpcraft.model.NCVariant
import org.apache.nlpcraft.model.impl.{NCTokenImpl, NCTokenLogger, NCVariantImpl}
-import org.apache.nlpcraft.model.{NCToken, NCVariant}
import java.io.{Serializable => JSerializable}
import java.util
@@ -37,18 +37,6 @@ object NCProbeVariants {
private final val IDXS: JSerializable = singletonList(IDX).asInstanceOf[JSerializable]
private final val IDXS2: JSerializable = singletonList(singletonList(IDX)).asInstanceOf[JSerializable]
- case class Key(id: String, from: Int, to: Int)
-
- object Key {
- def apply(m: util.HashMap[String, JSerializable]): Key = {
- def get[T](name: String): T = m.get(name).asInstanceOf[T]
-
- Key(get("id"), get("startcharindex"), get("endcharindex"))
- }
-
- def apply(t: NCToken): Key = Key(t.getId, t.getStartCharIndex, t.getEndCharIndex)
- }
-
/**
*
* @param t
@@ -77,17 +65,17 @@ object NCProbeVariants {
*
* @param key
* @param delNotes
- * @param noteTypePred
+ * @param delNoteTypePred
* @return
*/
private def findDeletedToken(
- key: Key,
+ key: NCTokenPartKey,
delNotes: Map[NlpNote, Seq[NlpToken]],
- noteTypePred: String => Boolean
+ delNoteTypePred: NlpNote => Boolean
): Option[NlpToken] =
delNotes.to(LazyList).
flatMap { case (delNote, delNoteToks) =>
- if (noteTypePred(delNote.noteType)) {
+ if (delNoteTypePred(delNote)) {
val toks =
delNoteToks.
dropWhile(_.startCharIndex != key.from).
@@ -111,7 +99,7 @@ object NCProbeVariants {
case _ => // No-op.
}
- artTok.add(delNote.clone(ps.toSeq :_*))
+ artTok.add(delNote.clone(ps.toSeq: _*))
}
Some(artTok)
@@ -200,18 +188,18 @@ object NCProbeVariants {
}
val toks = nlpSen.map(mkToken)
- val keys2Toks = toks.map(t => Key(t) -> t).toMap
+ val keys2Toks = toks.map(t => NCTokenPartKey(t) -> t).toMap
def process(tok: NCTokenImpl, tokNlp: NlpToken): Unit = {
- val optList: Option[util.List[util.HashMap[String, JSerializable]]] =
+ val optList: Option[util.List[NCTokenPartKey]] =
tokNlp.find(_.isUser) match {
case Some(u) => u.dataOpt("parts")
case None => None
}
optList match {
- case Some(list) =>
- val keys = list.asScala.map(Key(_))
+ case Some(keysJava) =>
+ val keys = keysJava.asScala
val parts = keys.map(key =>
keys2Toks.get(key) match {
@@ -221,7 +209,11 @@ object NCProbeVariants {
val delNotes = nlpSen.getDeletedNotes
// Tries to find with same key.
- var nlpTokOpt = findDeletedToken(key, delNotes, _ == key.id)
+ var nlpTokOpt = findDeletedToken(
+ key,
+ delNotes,
+ (delNote: NlpNote) => key.similar(delNote)
+ )
// If couldn't find nlp note, we can try to find any note on the same position.
if (nlpTokOpt.isEmpty && key.id == "nlpcraft:nlp")
@@ -249,10 +241,10 @@ object NCProbeVariants {
}
)
- parts.zip(list.asScala).foreach { case (part, map) =>
- map.get(TOK_META_ALIASES_KEY) match {
+ parts.zip(keys).foreach { case (part, key) =>
+ key.aliases match {
case null => // No-op.
- case aliases => part.getMetadata.put(TOK_META_ALIASES_KEY, aliases.asInstanceOf[Object])
+ case aliases => part.getMetadata.put(TOK_META_ALIASES_KEY, aliases)
}
}
@@ -267,7 +259,7 @@ object NCProbeVariants {
getOrElse(throw new NCE(s"Token not
found for $tok"))
)
- ok = ok && !toks.exists(t => t.getId !=
"nlpcraft:nlp" && keys.contains(Key(t)))
+ ok = ok && !toks.exists(t => t.getId !=
"nlpcraft:nlp" && keys.contains(NCTokenPartKey(t)))
case None => // No-op.
}
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCTokenPartKey.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCTokenPartKey.scala
new file mode 100644
index 0000000..c89cae1
--- /dev/null
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCTokenPartKey.scala
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.probe.mgrs
+
+import org.apache.nlpcraft.common.TOK_META_ALIASES_KEY
+import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSentenceToken}
+import org.apache.nlpcraft.model.NCToken
+import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.{NCSynonymChunkKind, _}
+
+import java.io.{Serializable => JSerializable}
+import java.util
+import java.util.{List => JList}
+import scala.compat.java8.OptionConverters.RichOptionalGeneric
+import scala.jdk.CollectionConverters.{MapHasAsJava, MapHasAsScala}
+import scala.language.implicitConversions
+import scala.collection.mutable
+
+/**
+ *
+ */
+object NCTokenPartKey {
+ def apply(m: util.HashMap[String, JSerializable]): NCTokenPartKey = {
+ def get[T](name: String): T = m.get(name).asInstanceOf[T]
+
+ NCTokenPartKey(get("id"), get("startcharindex"), get("endcharindex"),
get("data"))
+ }
+
+ def apply(part: NCToken, kind: NCSynonymChunkKind): NCTokenPartKey = {
+ val id = part.getId
+
+ val m: Map[String, Any] =
+ if (kind != TEXT)
+ id match {
+ case "nlpcraft:relation" =>
+ Map(
+ "type" -> part.meta[String](s"$id:type"),
+ "note" -> part.meta[String](s"$id:note")
+ )
+ case "nlpcraft:limit" =>
+ Map(
+ "limit" -> part.meta[Double](s"$id:limit"),
+ "note" -> part.meta[String](s"$id:note")
+ )
+ case "nlpcraft:sort" =>
+ val m = mutable.HashMap.empty[String, Any]
+
+ def add(name: String): Unit =
+ part.metaOpt[JList[String]](s"$id:$name").asScala match {
+ case Some(list) => m += name -> list
+ case None => // No-op.
+ }
+
+ add("subjnotes")
+ add("bynotes")
+
+ m.toMap
+ case _ => Map.empty
+ }
+ else
+ Map.empty
+
+ val key = new NCTokenPartKey(
+ if (kind == TEXT) "nlpcraft:nlp" else id,
+ part.getStartCharIndex,
+ part.getEndCharIndex,
+ m.asJava
+ )
+
+ key.aliases = part.getMetadata.get(TOK_META_ALIASES_KEY)
+
+ key
+ }
+
+ def apply(t: NCToken): NCTokenPartKey =
+ new NCTokenPartKey(t.getId, t.getStartCharIndex, t.getEndCharIndex, Map.empty[String, Any].asJava)
+
+ def apply(note: NCNlpSentenceNote, sen: NCNlpSentence): NCTokenPartKey =
+ NCTokenPartKey(
+ note.noteType,
+ sen(note.tokenFrom).startCharIndex,
+ sen(note.tokenTo).endCharIndex,
+ Map.empty[String, Any].asJava
+ )
+
+ def apply(note: NCNlpSentenceNote, toks: Seq[NCNlpSentenceToken]): NCTokenPartKey = {
+ val sorted = toks.sortBy(_.index)
+
+ NCTokenPartKey(
+ note.noteType,
+ sorted.head.startCharIndex,
+ sorted.last.endCharIndex,
+ Map.empty[String, Any].asJava
+ )
+ }
+}
+
+/**
+ *
+ * @param id
+ * @param from
+ * @param to
+ * @param data
+ */
+case class NCTokenPartKey(id: String, from: Int, to: Int, data: util.Map[String, Any]) {
+ require(from <= to)
+
+ var aliases: AnyRef = _
+
+ private def in(i: Int): Boolean = i >= from && i <= to
+
+ def intersect(id: String, from: Int, to: Int): Boolean = id == this.id && (in(from) || in(to))
+
+ def similar(note: NCNlpSentenceNote): Boolean =
+ id == note.noteType &&
+ (
+ data.isEmpty ||
+ data.asScala.forall { case (k, v) => note.contains(k) && note.data(k) == v }
+ )
+}
\ No newline at end of file
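For illustration only (not part of the commit), here is a minimal usage sketch of the new NCTokenPartKey; the element ID and character positions below are made up, and the empty data map mirrors the Map.empty[String, Any].asJava default used above:

    import scala.jdk.CollectionConverters.MapHasAsJava

    // Hypothetical key covering character positions 5..10 of a "nlpcraft:limit" token.
    val key = NCTokenPartKey("nlpcraft:limit", 5, 10, Map.empty[String, Any].asJava)

    // intersect() is true only when the IDs match and the other span's
    // start or end falls inside this key's [from, to] range.
    key.intersect("nlpcraft:limit", 8, 20)  // true  - 8 lies within 5..10
    key.intersect("nlpcraft:limit", 11, 20) // false - neither 11 nor 20 lies within 5..10
    key.intersect("nlpcraft:sort", 5, 10)   // false - different element ID

    // similar() matches a deleted NLP note by type; with an empty data map,
    // any note whose noteType is "nlpcraft:limit" is considered similar.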
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 1061ff8..a2deee8 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -22,19 +22,18 @@ import org.apache.nlpcraft.common._
import org.apache.nlpcraft.common.nlp.{NCNlpSentence => Sentence, NCNlpSentenceNote => NlpNote, NCNlpSentenceToken => NlpToken}
import org.apache.nlpcraft.model._
import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.NCIdlContent
-import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.{NCSynonymChunkKind, _}
+import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.NCSynonymChunkKind
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.impl.NCRequestImpl
import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager
-import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeVariants, NCProbeSynonym => Synonym}
+import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeVariants, NCTokenPartKey, NCProbeSynonym => Synonym}
import java.io.Serializable
-import java.util
import java.util.{List => JList}
-import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable
-import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsJava, MapHasAsScala, SeqHasAsJava}
+import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.CollectionConverters._
+import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsJava, MapHasAsScala, SeqHasAsJava}
/**
* Model elements enricher.
@@ -185,21 +184,8 @@ object NCModelEnricher extends NCProbeEnricher {
case None => // No-op.
}
- if (parts.nonEmpty) {
- val partsData: Seq[util.HashMap[String, Any]] =
- parts.map { case (part, kind) =>
- val m = new util.HashMap[String, Any]()
-
- m.put("id", if (kind == TEXT) "nlpcraft:nlp" else
part.getId)
- m.put("startcharindex", part.getStartCharIndex)
- m.put("endcharindex", part.getEndCharIndex)
- m.put(TOK_META_ALIASES_KEY, part.getMetadata.get(TOK_META_ALIASES_KEY))
-
- m
- }
-
- params += "parts" -> partsData.asJava
- }
+ if (parts.nonEmpty)
+ params += "parts" -> parts.map { case (p, kind) =>
NCTokenPartKey(p, kind) }.asJava
val idxs = toks.map(_.index).sorted
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index 339bb4c..74ead87 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -21,14 +21,15 @@ import io.opencensus.trace.Span
import org.apache.nlpcraft.common.nlp.NCNlpSentence.NoteLink
import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSentenceToken}
-import org.apache.nlpcraft.common.{NCE, NCService, U}
+import org.apache.nlpcraft.common.{NCE, NCService, U, _}
import org.apache.nlpcraft.model.NCModel
+import org.apache.nlpcraft.probe.mgrs.NCTokenPartKey
import java.io.{Serializable => JSerializable}
import java.util
import java.util.{List => JList}
import scala.collection.mutable
-import scala.collection.parallel.CollectionConverters.ImmutableIterableIsParallelizable
+import scala.collection.parallel.CollectionConverters._
import scala.jdk.CollectionConverters.{ListHasAsScala, SeqHasAsJava, SetHasAsJava}
import scala.language.implicitConversions
@@ -42,23 +43,6 @@ object NCSentenceManager extends NCService {
type CacheValue = Seq[Seq[NCNlpSentenceNote]]
private val combCache = mutable.HashMap.empty[String, mutable.HashMap[CacheKey, CacheValue]]
- case class PartKey(id: String, start: Int, end: Int) {
- require(start <= end)
-
- private def in(i: Int): Boolean = i >= start && i <= end
- def intersect(id: String, start: Int, end: Int): Boolean = id == this.id && (in(start) || in(end))
- }
-
- object PartKey {
- def apply(m: util.HashMap[String, JSerializable]): PartKey = {
- def get[T](name: String): T = m.get(name).asInstanceOf[T]
-
- PartKey(get("id"), get("startcharindex"), get("endcharindex"))
- }
-
- def apply(t: NCNlpSentenceNote, sen: NCNlpSentence): PartKey =
- PartKey(t.noteType, sen(t.tokenFrom).startCharIndex, sen(t.tokenTo).endCharIndex)
- }
/**
*
@@ -95,14 +79,14 @@ object NCSentenceManager extends NCService {
*
* @param notes
*/
- private def getPartKeys(notes: NCNlpSentenceNote*): Seq[PartKey] =
+ private def getPartKeys(notes: NCNlpSentenceNote*): Seq[NCTokenPartKey] =
notes.
filter(_.isUser).
flatMap(n => {
- val optList: Option[JList[util.HashMap[String, JSerializable]]] = n.dataOpt("parts")
+ val optList: Option[JList[NCTokenPartKey]] = n.dataOpt("parts")
optList
- }).flatMap(_.asScala).map(m => PartKey(m)).distinct
+ }).flatMap(_.asScala).distinct
/**
*
@@ -666,7 +650,7 @@ object NCSentenceManager extends NCService {
filter(getPartKeys(_).isEmpty).
flatMap(note => {
val noteWordsIdxs = note.wordIndexes.toSet
- val key = PartKey(note, sen)
+ val key = NCTokenPartKey(note, sen)
val delCombOthers =
delCombs.filter(_ != note).flatMap(n => if (getPartKeys(n).contains(key)) Some(n) else None)
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
index 10a28e8..a83f697 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
@@ -44,8 +44,7 @@ class NCAbstractTokensModelVariants extends NCAbstractTokensModel {
val limNote = limitPart.getMetadata.get("nlpcraft:limit:note").asInstanceOf[String]
- // TODO: wrapAnyWord? - check it (ticket NLPCRAFT-337)
- require(limNote == "anyWord", s"Unexpected limit token note:
'$limNote', token: $limitPart, meta: ${limitPart.getMetadata}")
+ require(limNote == "wrapAnyWord", s"Unexpected limit token note:
'$limNote', token: $limitPart, meta: ${limitPart.getMetadata}")
val limIdxs = limitPart.getMetadata.get("nlpcraft:limit:indexes").asInstanceOf[util.List[Integer]].asScala