This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-30
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-30 by this push:
new 0ab1873 SortEnricher bugfix.
0ab1873 is described below
commit 0ab1873c94d46be102707237105f0430ea58317c
Author: Sergey Kamov <[email protected]>
AuthorDate: Sun Apr 5 14:28:12 2020 +0300
SortEnricher bugfix.
---
.../mgrs/nlp/enrichers/sort/NCSortEnricher.scala | 103 ++++++++++++---------
1 file changed, 57 insertions(+), 46 deletions(-)
diff --git a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
index 3a9661b..95cc6ef 100644
--- a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
+++ b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
@@ -143,63 +143,75 @@ object NCSortEnricher extends NCProbeEnricher {
         })
     }
+    private def toNoteData(toks: Seq[NCNlpSentenceToken]): Seq[NoteData] = {
+        require(toks.nonEmpty)
+
+        val min = toks.head.index
+        val max = toks.last.index
+
+        toks.flatten.
+            filter(!_.isNlp).
+            filter(n ⇒ n.tokenIndexes.head >= min && n.tokenIndexes.last <= max).
+            map(n ⇒ NoteData(n.noteType, n.tokenFrom to n.tokenTo)).
+            sortBy(_.indexes.head).distinct
+    }
+
     /**
       * [Token] -> [NoteData]
       * [Token(A, B), Token(A), Token(C, D), Token(C, D, X), Token(Z)] ⇒
       * [ [A (0, 1), C (2, 3), Z (4)], [A (0, 1), D (2, 3), Z (4) ] ]
       *
-      * @param toks
+      * @param toks Tokens.
+      * @param toksNoteData Note data elements extracted from these tokens.
+      * @param nullable Whether an empty result is allowed.
       */
-    private def split(toks: Seq[NCNlpSentenceToken]): Seq[Seq[NoteData]] = {
-        require(toks.nonEmpty)
+    private def split(toks: Seq[NCNlpSentenceToken], toksNoteData: Seq[NoteData], nullable: Boolean): Seq[Seq[NoteData]] = {
+        val res =
+            if (toksNoteData.nonEmpty) {
+                val res = mutable.ArrayBuffer.empty[Seq[NoteData]]
+
+                /**
+                  * Returns flag indicating whether the two given tokens are contiguous,
+                  * i.e. separated only by stop words or 'and'.
+                  *
+                  * @param tok1Idx First token index.
+                  * @param tok2Idx Second token index.
+                  */
+                def contiguous(tok1Idx: Int, tok2Idx: Int): Boolean = {
+                    val between = toks.filter(t ⇒ t.index > tok1Idx && t.index < tok2Idx)
+
+                    between.isEmpty || between.forall(p ⇒ p.isStopWord || p.stem == STEM_AND)
+                }
-        val min = toks.head.index
-        val max = toks.last.index
+                val minIdx = toks.dropWhile(_.isNlp).head.index
+                val maxIdx = toks.reverse.dropWhile(_.isNlp).head.index
-        val all =
-            toks.flatten.
-                filter(!_.isNlp).
-                filter(n ⇒ n.tokenIndexes.head >= min && n.tokenIndexes.last <= max).
-                map(n ⇒ NoteData(n.noteType, n.tokenFrom to n.tokenTo)).
-                sortBy(_.indexes.head).distinct
-
-        if (all.nonEmpty) {
-            val res = mutable.ArrayBuffer.empty[Seq[NoteData]]
-
-            /**
-              * Returns flag which indicates are token contiguous or not.
-              *
-              * @param tok1Idx First token index.
-              * @param tok2Idx Second token index.
-              */
-            def contiguous(tok1Idx: Int, tok2Idx: Int): Boolean = {
-                val between = toks.filter(t ⇒ t.index > tok1Idx && t.index < tok2Idx)
-
-                between.isEmpty || between.forall(p ⇒ p.isStopWord || p.stem == STEM_AND)
-            }
+                require(minIdx <= maxIdx)
-            val minIdx = toks.dropWhile(_.isNlp).head.index
-            val maxIdx = toks.reverse.dropWhile(_.isNlp).head.index
+                def fill(nd: NoteData, seq: mutable.ArrayBuffer[NoteData] = mutable.ArrayBuffer.empty[NoteData]): Unit = {
+                    seq += nd
-            require(minIdx <= maxIdx)
+                    toksNoteData.
+                        filter(p ⇒ nd.indexes.last < p.indexes.head && contiguous(nd.indexes.last, p.indexes.head)).
+                        foreach(fill(_, mutable.ArrayBuffer.empty[NoteData] ++ seq.clone()))
-            def fill(nd: NoteData, seq: mutable.ArrayBuffer[NoteData] = mutable.ArrayBuffer.empty[NoteData]): Unit = {
-                seq += nd
+                    if (seq.nonEmpty && seq.head.indexes.head == minIdx && seq.last.indexes.last == maxIdx)
+                        res += seq
+                }
-            all.
-                filter(p ⇒ nd.indexes.last < p.indexes.head && contiguous(nd.indexes.last, p.indexes.head)).
-                foreach(fill(_, mutable.ArrayBuffer.empty[NoteData] ++ seq.clone()))
+                toksNoteData.filter(_.indexes.head == minIdx).foreach(p ⇒ fill(p))
-            if (seq.nonEmpty && seq.head.indexes.head == minIdx && seq.last.indexes.last == maxIdx)
-                res += seq
+                res
             }
+            else
+                Seq.empty
-            all.filter(_.indexes.head == minIdx).foreach(p ⇒ fill(p))
+        if (res.isEmpty && !nullable)
+            throw new AssertionError(s"Invalid null result " +
+                s"[tokensTexts=[${toks.map(_.origText).mkString(", ")}]" +
+                s", tokensIndexes=[${toks.map(_.index).mkString(", ")}]" +
+                s", allData=[${toksNoteData.mkString(", ")}]" +
+                s"]"
+            )
-            res
-        }
-        else
-            Seq.empty
+        res
     }
     /**
@@ -302,14 +314,13 @@ object NCSortEnricher extends NCProbeEnricher {
                             else
                                 (others.filter(_.index < sepIdxs.head), others.filter(_.index > sepIdxs.last))
 
-                            val notes = subj.flatten
-
                             require(subj.nonEmpty)
 
-                            val subjSeq = split(subj)
+                            val subjNoteData = toNoteData(subj)
 
-                            if (subjSeq.nonEmpty) {
-                                val bySeq = if (by.isEmpty) Seq.empty else split(by)
+                            if (subjNoteData.nonEmpty) {
+                                val subjSeq = split(subj, subjNoteData, nullable = false)
+                                val bySeq = if (by.isEmpty) Seq.empty else split(by, toNoteData(by), nullable = true)
 
                                 val asc = h.order.flatMap(order ⇒ Some(ORDER(order.synonymIndex)._2))
                                 Some(Match(asc, main = h.sort.tokens, stop = h.byTokens ++ h.orderTokens, subjSeq, bySeq))
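
The gist of this bugfix: note data extraction is factored out of split() into the new toNoteData(), and split() now receives the precomputed data plus a nullable flag, so an unexpectedly empty result for the mandatory subject tokens fails fast with an AssertionError (nullable = false), while the optional 'by' part may legitimately come back empty (nullable = true). Below is a minimal standalone Scala sketch of this calling convention; SplitSketch, its simplified NoteData and the stub split() are hypothetical stand-ins for illustration, not the real NLPCraft types.

// Minimal sketch of the new calling convention (hypothetical simplified
// types, not the real NLPCraft classes).
object SplitSketch extends App {
    // Simplified note data: note type name plus the token indexes it covers.
    case class NoteData(note: String, indexes: Seq[Int])

    // Stub of the refactored split(): takes precomputed note data and a
    // `nullable` flag deciding whether an empty result is acceptable.
    def split(noteData: Seq[NoteData], nullable: Boolean): Seq[Seq[NoteData]] = {
        val res: Seq[Seq[NoteData]] = if (noteData.nonEmpty) Seq(noteData) else Seq.empty

        // Key behavior added by the fix: fail fast instead of silently
        // propagating an empty result when the caller forbids it.
        if (res.isEmpty && !nullable)
            throw new AssertionError(s"Invalid null result [allData=${noteData.mkString(", ")}]")

        res
    }

    val subjData = Seq(NoteData("A", 0 to 1), NoteData("C", 2 to 3))

    val subjSeq = split(subjData, nullable = false) // subject part must yield data
    val bySeq = split(Seq.empty, nullable = true)   // optional 'by' part may be empty

    println(s"subjSeq=$subjSeq, bySeq=$bySeq")
}

Running the sketch prints one subject combination and an empty bySeq; switching the second call to nullable = false triggers the same fail-fast AssertionError path the patch adds.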