This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-30
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-30 by this push:
new 0ab1873 SortEnricher bugfix.
0ab1873 is described below
commit 0ab1873c94d46be102707237105f0430ea58317c
Author: Sergey Kamov <[email protected]>
AuthorDate: Sun Apr 5 14:28:12 2020 +0300
SortEnricher bugfix.
---
.../mgrs/nlp/enrichers/sort/NCSortEnricher.scala | 103 ++++++++++++---------
1 file changed, 57 insertions(+), 46 deletions(-)
diff --git a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
index 3a9661b..95cc6ef 100644
--- a/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
+++ b/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/sort/NCSortEnricher.scala
@@ -143,63 +143,75 @@ object NCSortEnricher extends NCProbeEnricher {
         })
     }
+    private def toNoteData(toks: Seq[NCNlpSentenceToken]): Seq[NoteData] = {
+        require(toks.nonEmpty)
+
+        val min = toks.head.index
+        val max = toks.last.index
+
+        toks.flatten.
+            filter(!_.isNlp).
+            filter(n ⇒ n.tokenIndexes.head >= min && n.tokenIndexes.last <= max).
+            map(n ⇒ NoteData(n.noteType, n.tokenFrom to n.tokenTo)).
+            sortBy(_.indexes.head).distinct
+    }
+
     /**
       * [Token] -> [NoteData]
       * [Token(A, B), Token(A), Token(C, D), Token(C, D, X), Token(Z)] ⇒
       * [ [A (0, 1), C (2, 3), Z (4)], [A (0, 1), D (2, 3), Z (4) ] ]
       *
-      * @param toks
+      * @param toks Tokens.
+      * @param toksNoteData Note data elements extracted from these tokens.
+      * @param nullable Whether an empty result is allowed.
       */
-    private def split(toks: Seq[NCNlpSentenceToken]): Seq[Seq[NoteData]] = {
-        require(toks.nonEmpty)
+    private def split(toks: Seq[NCNlpSentenceToken], toksNoteData: Seq[NoteData], nullable: Boolean): Seq[Seq[NoteData]] = {
+        val res =
+            if (toksNoteData.nonEmpty) {
+                val res = mutable.ArrayBuffer.empty[Seq[NoteData]]
+
+                /**
+                  * Returns flag indicating whether the two given tokens are contiguous,
+                  * i.e. separated only by stop words or 'and'.
+                  *
+                  * @param tok1Idx First token index.
+                  * @param tok2Idx Second token index.
+                  */
+                def contiguous(tok1Idx: Int, tok2Idx: Int): Boolean = {
+                    val between = toks.filter(t ⇒ t.index > tok1Idx && t.index < tok2Idx)
+
+                    between.isEmpty || between.forall(p ⇒ p.isStopWord || p.stem == STEM_AND)
+                }
-        val min = toks.head.index
-        val max = toks.last.index
+                val minIdx = toks.dropWhile(_.isNlp).head.index
+                val maxIdx = toks.reverse.dropWhile(_.isNlp).head.index
-        val all =
-            toks.flatten.
-                filter(!_.isNlp).
-                filter(n ⇒ n.tokenIndexes.head >= min && n.tokenIndexes.last <= max).
-                map(n ⇒ NoteData(n.noteType, n.tokenFrom to n.tokenTo)).
-                sortBy(_.indexes.head).distinct
-
-        if (all.nonEmpty) {
-            val res = mutable.ArrayBuffer.empty[Seq[NoteData]]
-
-            /**
-              * Returns flag which indicates are token contiguous or not.
-              *
-              * @param tok1Idx First token index.
-              * @param tok2Idx Second token index.
-              */
-            def contiguous(tok1Idx: Int, tok2Idx: Int): Boolean = {
-                val between = toks.filter(t ⇒ t.index > tok1Idx && t.index < tok2Idx)
-
-                between.isEmpty || between.forall(p ⇒ p.isStopWord || p.stem == STEM_AND)
-            }
+                require(minIdx <= maxIdx)
-            val minIdx = toks.dropWhile(_.isNlp).head.index
-            val maxIdx = toks.reverse.dropWhile(_.isNlp).head.index
+                def fill(nd: NoteData, seq: mutable.ArrayBuffer[NoteData] = mutable.ArrayBuffer.empty[NoteData]): Unit = {
+                    seq += nd
-            require(minIdx <= maxIdx)
+                    toksNoteData.
+                        filter(p ⇒ nd.indexes.last < p.indexes.head && contiguous(nd.indexes.last, p.indexes.head)).
+                        foreach(fill(_, mutable.ArrayBuffer.empty[NoteData] ++ seq.clone()))
-            def fill(nd: NoteData, seq: mutable.ArrayBuffer[NoteData] = mutable.ArrayBuffer.empty[NoteData]): Unit = {
-                seq += nd
+                    if (seq.nonEmpty && seq.head.indexes.head == minIdx && seq.last.indexes.last == maxIdx)
+                        res += seq
+                }
-            all.
-                filter(p ⇒ nd.indexes.last < p.indexes.head && contiguous(nd.indexes.last, p.indexes.head)).
-                foreach(fill(_, mutable.ArrayBuffer.empty[NoteData] ++ seq.clone()))
+                toksNoteData.filter(_.indexes.head == minIdx).foreach(p ⇒ fill(p))
-            if (seq.nonEmpty && seq.head.indexes.head == minIdx && seq.last.indexes.last == maxIdx)
-                res += seq
+                res
             }
+            else
+                Seq.empty
-            all.filter(_.indexes.head == minIdx).foreach(p ⇒ fill(p))
+        if (res.isEmpty && !nullable)
+            throw new AssertionError(s"Invalid null result " +
+                s"[tokensTexts=[${toks.map(_.origText).mkString(", ")}]" +
+                s", tokensIndexes=[${toks.map(_.index).mkString(", ")}]" +
+                s", allData=[${toksNoteData.mkString(", ")}]" +
+                s"]"
+            )
-            res
-        }
-        else
-            Seq.empty
+        res
     }
     /**
@@ -302,14 +314,13 @@ object NCSortEnricher extends NCProbeEnricher {
                             else
                                 (others.filter(_.index < sepIdxs.head), others.filter(_.index > sepIdxs.last))
 
-                            val notes = subj.flatten
-
                             require(subj.nonEmpty)
 
-                            val subjSeq = split(subj)
+                            val subjNoteData = toNoteData(subj)
 
-                            if (subjSeq.nonEmpty) {
-                                val bySeq = if (by.isEmpty) Seq.empty else split(by)
+                            if (subjNoteData.nonEmpty) {
+                                val subjSeq = split(subj, subjNoteData, nullable = false)
+                                val bySeq = if (by.isEmpty) Seq.empty else split(by, toNoteData(by), nullable = true)
 
                                 val asc = h.order.flatMap(order ⇒ Some(ORDER(order.synonymIndex)._2))
                                 Some(Match(asc, main = h.sort.tokens, stop = h.byTokens ++ h.orderTokens, subjSeq, bySeq))
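
The gist of this bugfix: note data extraction is factored out of split() into the new toNoteData(), and split() now receives the precomputed data plus a nullable flag, so an unexpectedly empty result for the mandatory subject tokens fails fast with an AssertionError (nullable = false), while the optional 'by' part may legitimately come back empty (nullable = true). Below is a minimal standalone Scala sketch of this calling convention; SplitSketch, its simplified NoteData and the stub split() are hypothetical stand-ins for illustration, not the real NLPCraft types.

// Minimal sketch of the new calling convention (hypothetical simplified
// types, not the real NLPCraft classes).
object SplitSketch extends App {
    // Simplified note data: note type name plus the token indexes it covers.
    case class NoteData(note: String, indexes: Seq[Int])

    // Stub of the refactored split(): takes precomputed note data and a
    // `nullable` flag deciding whether an empty result is acceptable.
    def split(noteData: Seq[NoteData], nullable: Boolean): Seq[Seq[NoteData]] = {
        val res: Seq[Seq[NoteData]] = if (noteData.nonEmpty) Seq(noteData) else Seq.empty

        // Key behavior added by the fix: fail fast instead of silently
        // propagating an empty result when the caller forbids it.
        if (res.isEmpty && !nullable)
            throw new AssertionError(s"Invalid null result [allData=${noteData.mkString(", ")}]")

        res
    }

    val subjData = Seq(NoteData("A", 0 to 1), NoteData("C", 2 to 3))

    val subjSeq = split(subjData, nullable = false) // subject part must yield data
    val bySeq = split(Seq.empty, nullable = true)   // optional 'by' part may be empty

    println(s"subjSeq=$subjSeq, bySeq=$bySeq")
}

Running the sketch prints one subject combination and an empty bySeq; switching the second call to nullable = false triggers the same fail-fast AssertionError path the patch adds.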