This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/master by this push:
new 70fad8d Part tokens detection logic fixes.
70fad8d is described below
commit 70fad8d21ea2091defe42242f3484be83132f12f
Author: Sergey Kamov <[email protected]>
AuthorDate: Fri Jun 25 09:50:38 2021 +0300
Part tokens detection logic fixes.
---
.../nlpcraft/probe/mgrs/NCProbeVariants.scala | 46 +++----
.../nlpcraft/probe/mgrs/NCTokenPartKey.scala | 134 +++++++++++++++++++++
.../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 26 +---
.../probe/mgrs/sentence/NCSentenceManager.scala | 30 ++---
.../abstract/NCAbstractTokensVariantsSpec.scala | 3 +-
5 files changed, 167 insertions(+), 72 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
index f3122b3..bcf2c9c 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeVariants.scala
@@ -20,8 +20,8 @@ package org.apache.nlpcraft.probe.mgrs
import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
import org.apache.nlpcraft.common.nlp.{NCNlpSentence => NlpSentence, NCNlpSentenceNote => NlpNote, NCNlpSentenceToken => NlpToken}
import org.apache.nlpcraft.common.{NCE, TOK_META_ALIASES_KEY}
+import org.apache.nlpcraft.model.NCVariant
import org.apache.nlpcraft.model.impl.{NCTokenImpl, NCTokenLogger, NCVariantImpl}
-import org.apache.nlpcraft.model.{NCToken, NCVariant}
import java.io.{Serializable => JSerializable}
import java.util
@@ -37,18 +37,6 @@ object NCProbeVariants {
private final val IDXS: JSerializable = singletonList(IDX).asInstanceOf[JSerializable]
private final val IDXS2: JSerializable = singletonList(singletonList(IDX)).asInstanceOf[JSerializable]
- case class Key(id: String, from: Int, to: Int)
-
- object Key {
- def apply(m: util.HashMap[String, JSerializable]): Key = {
- def get[T](name: String): T = m.get(name).asInstanceOf[T]
-
- Key(get("id"), get("startcharindex"), get("endcharindex"))
- }
-
- def apply(t: NCToken): Key = Key(t.getId, t.getStartCharIndex, t.getEndCharIndex)
- }
-
/**
*
* @param t
@@ -77,17 +65,17 @@ object NCProbeVariants {
*
* @param key
* @param delNotes
- * @param noteTypePred
+ * @param delNoteTypePred
* @return
*/
private def findDeletedToken(
- key: Key,
+ key: NCTokenPartKey,
delNotes: Map[NlpNote, Seq[NlpToken]],
- noteTypePred: String => Boolean
+ delNoteTypePred: NlpNote => Boolean
): Option[NlpToken] =
delNotes.to(LazyList).
flatMap { case (delNote, delNoteToks) =>
- if (noteTypePred(delNote.noteType)) {
+ if (delNoteTypePred(delNote)) {
val toks =
delNoteToks.
dropWhile(_.startCharIndex != key.from).
@@ -111,7 +99,7 @@ object NCProbeVariants {
case _ => // No-op.
}
- artTok.add(delNote.clone(ps.toSeq :_*))
+ artTok.add(delNote.clone(ps.toSeq: _*))
}
Some(artTok)
@@ -200,18 +188,18 @@ object NCProbeVariants {
}
val toks = nlpSen.map(mkToken)
- val keys2Toks = toks.map(t => Key(t) -> t).toMap
+ val keys2Toks = toks.map(t => NCTokenPartKey(t) -> t).toMap
def process(tok: NCTokenImpl, tokNlp: NlpToken): Unit = {
- val optList: Option[util.List[util.HashMap[String, JSerializable]]] =
+ val optList: Option[util.List[NCTokenPartKey]] =
tokNlp.find(_.isUser) match {
case Some(u) => u.dataOpt("parts")
case None => None
}
optList match {
- case Some(list) =>
- val keys = list.asScala.map(Key(_))
+ case Some(keysJava) =>
+ val keys = keysJava.asScala
val parts = keys.map(key =>
keys2Toks.get(key) match {
@@ -221,7 +209,11 @@ object NCProbeVariants {
val delNotes = nlpSen.getDeletedNotes
// Tries to find with same key.
- var nlpTokOpt = findDeletedToken(key, delNotes, _ == key.id)
+ var nlpTokOpt = findDeletedToken(
+ key,
+ delNotes,
+ (delNote: NlpNote) => key.similar(delNote)
+ )
// If couldn't find nlp note, we can try to find any note on the same position.
if (nlpTokOpt.isEmpty && key.id == "nlpcraft:nlp")
@@ -249,10 +241,10 @@ object NCProbeVariants {
}
)
- parts.zip(list.asScala).foreach { case (part, map) =>
- map.get(TOK_META_ALIASES_KEY) match {
+ parts.zip(keys).foreach { case (part, key) =>
+ key.aliases match {
case null => // No-op.
- case aliases => part.getMetadata.put(TOK_META_ALIASES_KEY, aliases.asInstanceOf[Object])
+ case aliases => part.getMetadata.put(TOK_META_ALIASES_KEY, aliases)
}
}
@@ -267,7 +259,7 @@ object NCProbeVariants {
getOrElse(throw new NCE(s"Token not
found for $tok"))
)
- ok = ok && !toks.exists(t => t.getId !=
"nlpcraft:nlp" && keys.contains(Key(t)))
+ ok = ok && !toks.exists(t => t.getId !=
"nlpcraft:nlp" && keys.contains(NCTokenPartKey(t)))
case None => // No-op.
}
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCTokenPartKey.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCTokenPartKey.scala
new file mode 100644
index 0000000..c89cae1
--- /dev/null
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCTokenPartKey.scala
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * https://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nlpcraft.probe.mgrs
+
+import org.apache.nlpcraft.common.TOK_META_ALIASES_KEY
+import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSentenceToken}
+import org.apache.nlpcraft.model.NCToken
+import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.{NCSynonymChunkKind, _}
+
+import java.io.{Serializable => JSerializable}
+import java.util
+import java.util.{List => JList}
+import scala.compat.java8.OptionConverters.RichOptionalGeneric
+import scala.jdk.CollectionConverters.{MapHasAsJava, MapHasAsScala}
+import scala.language.implicitConversions
+import scala.collection.mutable
+
+/**
+ *
+ */
+object NCTokenPartKey {
+ def apply(m: util.HashMap[String, JSerializable]): NCTokenPartKey = {
+ def get[T](name: String): T = m.get(name).asInstanceOf[T]
+
+ NCTokenPartKey(get("id"), get("startcharindex"), get("endcharindex"),
get("data"))
+ }
+
+ def apply(part: NCToken, kind: NCSynonymChunkKind): NCTokenPartKey = {
+ val id = part.getId
+
+ val m: Map[String, Any] =
+ if (kind != TEXT)
+ id match {
+ case "nlpcraft:relation" =>
+ Map(
+ "type" -> part.meta[String](s"$id:type"),
+ "note" -> part.meta[String](s"$id:note")
+ )
+ case "nlpcraft:limit" =>
+ Map(
+ "limit" -> part.meta[Double](s"$id:limit"),
+ "note" -> part.meta[String](s"$id:note")
+ )
+ case "nlpcraft:sort" =>
+ val m = mutable.HashMap.empty[String, Any]
+
+ def add(name: String): Unit =
+ part.metaOpt[JList[String]](s"$id:$name").asScala match {
+ case Some(list) => m += name -> list
+ case None => // No-op.
+ }
+
+ add("subjnotes")
+ add("bynotes")
+
+ m.toMap
+ case _ => Map.empty
+ }
+ else
+ Map.empty
+
+ val key = new NCTokenPartKey(
+ if (kind == TEXT) "nlpcraft:nlp" else id,
+ part.getStartCharIndex,
+ part.getEndCharIndex,
+ m.asJava
+ )
+
+ key.aliases = part.getMetadata.get(TOK_META_ALIASES_KEY)
+
+ key
+ }
+
+ def apply(t: NCToken): NCTokenPartKey =
+ new NCTokenPartKey(t.getId, t.getStartCharIndex, t.getEndCharIndex, Map.empty[String, Any].asJava)
+
+ def apply(note: NCNlpSentenceNote, sen: NCNlpSentence): NCTokenPartKey =
+ NCTokenPartKey(
+ note.noteType,
+ sen(note.tokenFrom).startCharIndex,
+ sen(note.tokenTo).endCharIndex,
+ Map.empty[String, Any].asJava
+ )
+
+ def apply(note: NCNlpSentenceNote, toks: Seq[NCNlpSentenceToken]): NCTokenPartKey = {
+ val sorted = toks.sortBy(_.index)
+
+ NCTokenPartKey(
+ note.noteType,
+ sorted.head.startCharIndex,
+ sorted.last.endCharIndex,
+ Map.empty[String, Any].asJava
+ )
+ }
+}
+
+/**
+ *
+ * @param id
+ * @param from
+ * @param to
+ * @param data
+ */
+case class NCTokenPartKey(id: String, from: Int, to: Int, data: util.Map[String, Any]) {
+ require(from <= to)
+
+ var aliases: AnyRef = _
+
+ private def in(i: Int): Boolean = i >= from && i <= to
+
+ def intersect(id: String, from: Int, to: Int): Boolean = id == this.id && (in(from) || in(to))
+
+ def similar(note: NCNlpSentenceNote): Boolean =
+ id == note.noteType &&
+ (
+ data.isEmpty ||
+ data.asScala.forall { case (k, v) => note.contains(k) && note.data(k) == v }
+ )
+}
\ No newline at end of file
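For illustration only (not part of the commit), here is a minimal usage sketch of the new NCTokenPartKey; the element ID and character positions below are made up, and the empty data map mirrors the Map.empty[String, Any].asJava default used above:

    import scala.jdk.CollectionConverters.MapHasAsJava

    // Hypothetical key covering character positions 5..10 of a "nlpcraft:limit" token.
    val key = NCTokenPartKey("nlpcraft:limit", 5, 10, Map.empty[String, Any].asJava)

    // intersect() is true only when the IDs match and the other span's
    // start or end falls inside this key's [from, to] range.
    key.intersect("nlpcraft:limit", 8, 20)  // true  - 8 lies within 5..10
    key.intersect("nlpcraft:limit", 11, 20) // false - neither 11 nor 20 lies within 5..10
    key.intersect("nlpcraft:sort", 5, 10)   // false - different element ID

    // similar() matches a deleted NLP note by type; with an empty data map,
    // any note whose noteType is "nlpcraft:limit" is considered similar.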
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 1061ff8..a2deee8 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -22,19 +22,18 @@ import org.apache.nlpcraft.common._
import org.apache.nlpcraft.common.nlp.{NCNlpSentence => Sentence, NCNlpSentenceNote => NlpNote, NCNlpSentenceToken => NlpToken}
import org.apache.nlpcraft.model._
import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.NCIdlContent
-import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.{NCSynonymChunkKind, _}
+import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.NCSynonymChunkKind
import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher
import org.apache.nlpcraft.probe.mgrs.nlp.impl.NCRequestImpl
import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager
-import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeVariants, NCProbeSynonym => Synonym}
+import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeVariants, NCTokenPartKey, NCProbeSynonym => Synonym}
import java.io.Serializable
-import java.util
import java.util.{List => JList}
-import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable
-import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsJava, MapHasAsScala, SeqHasAsJava}
+import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.CollectionConverters._
+import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsJava, MapHasAsScala, SeqHasAsJava}
/**
* Model elements enricher.
@@ -185,21 +184,8 @@ object NCModelEnricher extends NCProbeEnricher {
case None => // No-op.
}
- if (parts.nonEmpty) {
- val partsData: Seq[util.HashMap[String, Any]] =
- parts.map { case (part, kind) =>
- val m = new util.HashMap[String, Any]()
-
- m.put("id", if (kind == TEXT) "nlpcraft:nlp" else
part.getId)
- m.put("startcharindex", part.getStartCharIndex)
- m.put("endcharindex", part.getEndCharIndex)
- m.put(TOK_META_ALIASES_KEY, part.getMetadata.get(TOK_META_ALIASES_KEY))
-
- m
- }
-
- params += "parts" -> partsData.asJava
- }
+ if (parts.nonEmpty)
+ params += "parts" -> parts.map { case (p, kind) =>
NCTokenPartKey(p, kind) }.asJava
val idxs = toks.map(_.index).sorted
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index 339bb4c..74ead87 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -21,14 +21,15 @@ import io.opencensus.trace.Span
import org.apache.nlpcraft.common.nlp.NCNlpSentence.NoteLink
import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSentenceToken}
-import org.apache.nlpcraft.common.{NCE, NCService, U}
+import org.apache.nlpcraft.common.{NCE, NCService, U, _}
import org.apache.nlpcraft.model.NCModel
+import org.apache.nlpcraft.probe.mgrs.NCTokenPartKey
import java.io.{Serializable => JSerializable}
import java.util
import java.util.{List => JList}
import scala.collection.mutable
-import scala.collection.parallel.CollectionConverters.ImmutableIterableIsParallelizable
+import scala.collection.parallel.CollectionConverters._
import scala.jdk.CollectionConverters.{ListHasAsScala, SeqHasAsJava, SetHasAsJava}
import scala.language.implicitConversions
@@ -42,23 +43,6 @@ object NCSentenceManager extends NCService {
type CacheValue = Seq[Seq[NCNlpSentenceNote]]
private val combCache = mutable.HashMap.empty[String, mutable.HashMap[CacheKey, CacheValue]]
- case class PartKey(id: String, start: Int, end: Int) {
- require(start <= end)
-
- private def in(i: Int): Boolean = i >= start && i <= end
- def intersect(id: String, start: Int, end: Int): Boolean = id == this.id && (in(start) || in(end))
- }
-
- object PartKey {
- def apply(m: util.HashMap[String, JSerializable]): PartKey = {
- def get[T](name: String): T = m.get(name).asInstanceOf[T]
-
- PartKey(get("id"), get("startcharindex"), get("endcharindex"))
- }
-
- def apply(t: NCNlpSentenceNote, sen: NCNlpSentence): PartKey =
- PartKey(t.noteType, sen(t.tokenFrom).startCharIndex, sen(t.tokenTo).endCharIndex)
- }
/**
*
@@ -95,14 +79,14 @@ object NCSentenceManager extends NCService {
*
* @param notes
*/
- private def getPartKeys(notes: NCNlpSentenceNote*): Seq[PartKey] =
+ private def getPartKeys(notes: NCNlpSentenceNote*): Seq[NCTokenPartKey] =
notes.
filter(_.isUser).
flatMap(n => {
- val optList: Option[JList[util.HashMap[String, JSerializable]]] = n.dataOpt("parts")
+ val optList: Option[JList[NCTokenPartKey]] = n.dataOpt("parts")
optList
- }).flatMap(_.asScala).map(m => PartKey(m)).distinct
+ }).flatMap(_.asScala).distinct
/**
*
@@ -666,7 +650,7 @@ object NCSentenceManager extends NCService {
filter(getPartKeys(_).isEmpty).
flatMap(note => {
val noteWordsIdxs = note.wordIndexes.toSet
- val key = PartKey(note, sen)
+ val key = NCTokenPartKey(note, sen)
val delCombOthers =
delCombs.filter(_ != note).flatMap(n => if (getPartKeys(n).contains(key)) Some(n) else None)
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
index 10a28e8..a83f697 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/abstract/NCAbstractTokensVariantsSpec.scala
@@ -44,8 +44,7 @@ class NCAbstractTokensModelVariants extends NCAbstractTokensModel {
val limNote = limitPart.getMetadata.get("nlpcraft:limit:note").asInstanceOf[String]
- // TODO: wrapAnyWord? - check it (ticket NLPCRAFT-337)
- require(limNote == "anyWord", s"Unexpected limit token note:
'$limNote', token: $limitPart, meta: ${limitPart.getMetadata}")
+ require(limNote == "wrapAnyWord", s"Unexpected limit token note:
'$limNote', token: $limitPart, meta: ${limitPart.getMetadata}")
val limIdxs = limitPart.getMetadata.get("nlpcraft:limit:indexes").asInstanceOf[util.List[Integer]].asScala