This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-456
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 300042735860e626d90002b30b4d5b072892b73a
Author: Sergey Kamov <[email protected]>
AuthorDate: Tue Sep 28 12:55:21 2021 +0300

    Code cleanup.
---
 .../nlpcraft/probe/mgrs/NCProbeIdlToken.scala      |  41 +++++-
 .../apache/nlpcraft/probe/mgrs/NCProbeModel.scala  |   3 +
 .../probe/mgrs/nlp/NCProbeEnrichmentManager.scala  |   2 +-
 .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 104 +++++++-------
 .../probe/mgrs/sentence/NCSentenceManager.scala    |  25 ++--
 .../probe/mgrs/synonyms/NCSynonymsManager.scala    | 150 ++++++++++++---------
 6 files changed, 196 insertions(+), 129 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeIdlToken.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeIdlToken.scala
index 5da9808..d4fc27c 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeIdlToken.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeIdlToken.scala
@@ -26,11 +26,46 @@ import org.apache.nlpcraft.model.{NCToken, _}
   * @param word
   */
 case class NCProbeIdlToken(token: NCToken, word: NCNlpSentenceToken) {
-    val (origText: String, wordIndexes: Set[Int], minIndex: Int, maxIndex: Int, isToken: Boolean, isWord: Boolean) =
+    require(token != null ^ word != null)
+
+    val (
+        origText: String,
+        normText: String,
+        stem: String,
+        wordIndexes: Set[Int],
+        minIndex: Int,
+        maxIndex: Int,
+        startCharIndex: Int,
+        endCharIndex: Int,
+        isToken: Boolean,
+        isWord: Boolean
+    ) =
         if (token != null)
-            (token.origText, token.wordIndexes.toSet, token.wordIndexes.head, token.wordIndexes.last, true, false)
+            (
+                token.origText,
+                token.normText,
+                token.stem,
+                token.wordIndexes.toSet,
+                token.wordIndexes.head,
+                token.wordIndexes.last,
+                token.getStartCharIndex,
+                token.getEndCharIndex,
+                true,
+                false
+            )
         else
-            (word.origText, word.wordIndexes.toSet, word.wordIndexes.head, word.wordIndexes.last, false, true)
+            (
+                word.origText,
+                word.normText,
+                word.stem,
+                word.wordIndexes.toSet,
+                word.wordIndexes.head,
+                word.wordIndexes.last,
+                word.startCharIndex,
+                word.endCharIndex,
+                false,
+                true
+            )

     private lazy val hash = if (isToken) Seq(wordIndexes, token.getId).hashCode() else wordIndexes.hashCode()

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
index ea41793..6b6a8e8 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeModel.scala
@@ -62,6 +62,9 @@ case class NCProbeModel(
     lazy val hasNoIdlSynonyms: Boolean = continuousSynonyms.nonEmpty || sparseSynonyms.nonEmpty
     lazy val hasSparseSynonyms: Boolean = sparseSynonyms.nonEmpty || idlSynonyms.exists(_._2.exists(_.sparse))
     lazy val hasContinuousSynonyms: Boolean = continuousSynonyms.nonEmpty || idlSynonyms.exists(_._2.exists(!_.sparse))
+    lazy val isComplex: Boolean = hasIdlSynonyms || !model.getParsers.isEmpty

     def hasIdlSynonyms(elemId: String): Boolean = idlSynonyms.contains(elemId)
+
+
 }
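[Editor's note] The NCProbeIdlToken change above widens a tuple extracted eagerly from an either-or wrapper. A minimal standalone sketch of that pattern, with hypothetical simplified types (Tok and Word are stand-ins, not the project's API):

    // Exactly one of the two constituents must be present (same xor guard as above).
    final case class Tok(origText: String, wordIndexes: Seq[Int])
    final case class Word(origText: String, wordIndexes: Seq[Int])

    case class IdlTokenSketch(token: Tok, word: Word) {
        require(token != null ^ word != null)

        // Common fields are computed once, eagerly, so call sites never branch on the kind again.
        val (origText: String, minIndex: Int, maxIndex: Int, isToken: Boolean) =
            if (token != null)
                (token.origText, token.wordIndexes.head, token.wordIndexes.last, true)
            else
                (word.origText, word.wordIndexes.head, word.wordIndexes.last, false)
    }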
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
index fde865f..560ddff 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala
@@ -492,7 +492,7 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats {
             }).toMap

             // Loop has sense if model is complex (has user defined parsers or IDL based synonyms)
-            continue = NCModelEnricher.isComplex(mdl) && res.exists { case (_, same) => !same }
+            continue = mdl.isComplex && res.exists { case (_, same) => !same }

             if (DEEP_DEBUG)
                 if (continue) {
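[Editor's note] A sketch of the loop the changed `continue` flag drives, with hypothetical signatures assuming only what the hunk above shows (re-enrichment is worth repeating only for complex models, and only while a pass still changes the result):

    def enrichToFixedPoint[S](sen: S, enrichOnce: S => Boolean, isComplex: Boolean, maxPasses: Int = 10): Unit = {
        var continue = true
        var pass = 0

        while (continue && pass < maxPasses) {
            val changed = enrichOnce(sen) // True if the pass added or removed any notes.

            // Loop makes sense only if the model is complex (user parsers or IDL synonyms).
            continue = isComplex && changed
            pass += 1
        }
    }

The `maxPasses` guard is this sketch's own addition, not something the commit shows.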
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 7196985..a39edfd 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -83,8 +83,6 @@ object NCModelEnricher extends NCProbeEnricher {
         ackStopped()
     }

-    def isComplex(mdl: NCProbeModel): Boolean = mdl.hasIdlSynonyms || !mdl.model.getParsers.isEmpty
-
     /**
      *
      * @param ns
      */
@@ -180,7 +178,8 @@ object NCModelEnricher extends NCProbeEnricher {
                     new NCCustomElement() {
                         override def getElementId: String = noteId
                         override def getWords: JList[NCCustomWord] = words
-                        override def getMetadata: JavaMeta = md.map(p => p._1 -> p._2.asInstanceOf[AnyRef]).asJava
+                        override def getMetadata: JavaMeta =
+                            md.map { case (k, v) => k -> v.asInstanceOf[AnyRef] }.asJava
                     }
                 }).asJava
             )
@@ -228,7 +227,7 @@ object NCModelEnricher extends NCProbeEnricher {
      * Example: Piece: 'x1, x2(stopword), x3(stopword), x4' will be expanded into
      * {'x1, x2, x3, x4', 'x1, x2, x4', 'x1, x3, x4', 'x1, x4'}
      *
-     * 3. All variants collected, duplicated deleted, etc.
+     * 3. All variants collected, duplicated sets deleted, etc.
      *
      * @param toks
      */
@@ -244,7 +243,7 @@ object NCModelEnricher extends NCProbeEnricher {
                 else
                     slides += mutable.ArrayBuffer.empty :+ stop

-            // Too many stopords inside skipped.
+            // Too many stopwords inside skipped.
             val bigSlides = slides.filter(_.size > 2)

             var stops4Delete: Seq[Seq[NlpToken]] =
@@ -255,7 +254,7 @@ object NCModelEnricher extends NCProbeEnricher {
                     if (stops4AllCombs.nonEmpty)
                         for (
                             seq1 <- Range.inclusive(0, stops4AllCombs.size).flatMap(stops4AllCombs.combinations);
-                             seq2 <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations)
+                            seq2 <- Range.inclusive(0, bigSlides.size).flatMap(bigSlides.combinations)
                         )
                         yield seq1 ++ seq2.flatten
                     else
@@ -268,11 +267,10 @@ object NCModelEnricher extends NCProbeEnricher {
             stops4Delete = stops4Delete.filter(seq => !seq.contains(combo.head) && !seq.contains(combo.last))

             (Seq(combo) ++ stops4Delete.map(del => combo.filter(t => !del.contains(t)))).map(_ -> combo).distinct
-        }).
             filter(_._1.nonEmpty).
             groupBy(_._1).
             map(p => p._1 -> p._2.map(_._2).minBy(p => (-p.size, p.head.index))).
+            filter { case (seq, _) => seq.nonEmpty }.
+            groupBy { case (seq, _) => seq }.
+            map { case (toksKey, seq) => toksKey -> seq.map(_._2).minBy(p => (-p.size, p.head.index)) }.
             sortBy { case(data, combo) => (-combo.size, -data.size, combo.head.index, data.head.index) }

     /**
@@ -297,15 +295,17 @@ object NCModelEnricher extends NCProbeEnricher {
     /**
      *
-     * @param tows
+     * @param idlToks
      * @param ns
      */
-    private def toTokens(tows: Seq[IdlToken], ns: Sentence): Seq[NlpToken] =
-        (
-            tows.filter(_.isWord).map(_.word) ++
-            tows.filter(_.isToken).map(_.token).
-                flatMap(w => ns.filter(t => t.wordIndexes.intersect(w.wordIndexes).nonEmpty))
-        ).sortBy(_.startCharIndex)
+    private def toNlpTokens(idlToks: Seq[IdlToken], ns: Sentence): Seq[NlpToken] = {
+        val words = idlToks.filter(_.isWord).map(_.word)
+        val suitableToks =
+            idlToks.filter(_.isToken).map(_.token).
+                flatMap(w => ns.filter(t => t.wordIndexes.intersect(w.wordIndexes).nonEmpty))
+
+        (words ++ suitableToks).sortBy(_.startCharIndex)
+    }

     /**
@@ -378,6 +378,7 @@ object NCModelEnricher extends NCProbeEnricher {
     }

     /**
+     * Prepares IDL tokens based on NLP tokens.
      *
      * @param h
      * @param toks
@@ -391,9 +392,7 @@ object NCModelEnricher extends NCProbeEnricher {
                 // Drops without tokens (IDL part works with tokens).
                 if (rec.nonEmpty)
-                    Some(rec ++
-                        (seq.wordsIndexes.intersect(idxs) -- rec.flatMap(_.wordIndexes)).map(h.tokens)
-                    )
+                    Some(rec ++ (seq.wordsIndexes.intersect(idxs) -- rec.flatMap(_.wordIndexes)).map(h.tokens))
                 else
                     None
             }).seq
@@ -440,11 +439,11 @@ object NCModelEnricher extends NCProbeEnricher {
         for (
             // 'toksExt' is piece of sentence, 'toks' is the same as 'toksExt' or without some stopwords set.
             (toks, toksExt) <- combosTokens(ns.toSeq);
-             idxs = toks.map(_.index);
-             e <- mdl.elements.values;
-             elemId = e.getId;
-             greedy = e.isGreedy.orElse(mdl.model.isGreedy)
-             if !greedy || !alreadyMarked(ns, elemId, toks, idxs)
+            idxs = toks.map(_.index);
+            e <- mdl.elements.values;
+            elemId = e.getId;
+            greedy = e.isGreedy.orElse(mdl.model.isGreedy)
+            if !greedy || !alreadyMarked(ns, elemId, toks, idxs)
         ) {
             def add(
                 dbgType: String,
@@ -456,7 +455,7 @@ object NCModelEnricher extends NCProbeEnricher {
                 val ok =
                     (!greedy || !alreadyMarked(ns, elemId, elemToks, idxs)) &&
-                    ( parts.isEmpty || !parts.exists { case (t, _) => t.getId == elemId })
+                    ( parts.isEmpty || !parts.exists { case (tok, _) => tok.getId == elemId })

                 if (ok)
                     mark(
@@ -563,7 +562,7 @@ object NCModelEnricher extends NCProbeEnricher {
                     )
                 }
                 else
-                // 2.2 Sparse.
+                    // 2.2 Sparse.
                     for (syn <- allSyns; comb <- allCombs)
                         NCSynonymsManager.onSparseMatch(
                             ns.srvReqId,
@@ -573,7 +572,7 @@ object NCModelEnricher extends NCProbeEnricher {
                             req,
                             variantsToks,
                             res => {
-                                val toks = getSparsedTokens(toTokens(res, ns), toTokens(comb, ns))
+                                val toks = getSparsedTokens(toNlpTokens(res, ns), toNlpTokens(comb, ns))
                                 val parts = toParts(mdl, ns.srvReqId, res, syn)
                                 val typ = if (syn.sparse) "IDL sparse" else "IDL continuous"
@@ -607,6 +606,9 @@ object NCModelEnricher extends NCProbeEnricher {
      * @param ns
      */
     private def normalize(ns: Sentence): Unit = {
+        // Finds and removes user notes if the sentence contains notes with a similar structure but fewer swallowed stop-words.
+        // These stop-words can be used for detecting other user tokens and are harmless if they are free words.
+        // Notes with links, and notes referenced by them, aren't touched.
         val usrNotes = ns.flatten.filter(_.isUser).distinct
         val links = NCSentenceManager.getLinks(usrNotes)
         val parts = NCSentenceManager.getPartKeys(usrNotes)
@@ -638,28 +640,34 @@ object NCModelEnricher extends NCProbeEnricher {
     // TODO: simplify, add tests, check model properties (sparse etc) for optimization.
     /**
      *
-     * @param elmId
-     * @param toks
-     * @param sliceToksIdxsSorted
+     * @param elmId Element ID.
+     * @param toks Tokens.
+     * @param idxs Indexes; note that they may not exactly match the tokens' indexes (sparse case).
      */
-    private def alreadyMarked(ns: Sentence, elmId: String, toks: Seq[NlpToken], sliceToksIdxsSorted: Seq[Int]): Boolean = {
+    private def alreadyMarked(ns: Sentence, elmId: String, toks: Seq[NlpToken], idxs: Seq[Int]): Boolean = {
         lazy val toksIdxsSorted = toks.map(_.index).sorted

-        sliceToksIdxsSorted.map(ns).forall(_.exists(n => n.noteType == elmId && n.sparsity == 0)) ||
-        toks.exists(_.exists(n =>
-            n.noteType == elmId &&
-            (
-                (n.sparsity == 0 &&
-                    (sliceToksIdxsSorted.containsSlice(n.tokenIndexes) || n.tokenIndexes.containsSlice(toksIdxsSorted))
-                )
-                ||
-                (
-                    n.tokenIndexes == toksIdxsSorted ||
-                    n.tokenIndexes.containsSlice(toksIdxsSorted) &&
-                    U.isContinuous(toksIdxsSorted) &&
-                    U.isContinuous(n.tokenIndexes)
-                )
-            )
-        ))
+        // All tokens with the given indexes are found with zero sparsity.
+        val ok1 = idxs.map(ns).forall(_.exists(n => n.noteType == elmId && n.sparsity == 0))
+
+        lazy val ok2 =
+            toks.exists(_.exists(n =>
+                if (n.noteType == elmId) {
+                    val noteOk1 = n.sparsity == 0 &&
+                        (idxs.containsSlice(n.tokenIndexes) || n.tokenIndexes.containsSlice(toksIdxsSorted))
+
+                    lazy val noteOk2 =
+                        n.tokenIndexes == toksIdxsSorted ||
+                        n.tokenIndexes.containsSlice(toksIdxsSorted) &&
+                        U.isContinuous(toksIdxsSorted) &&
+                        U.isContinuous(n.tokenIndexes)
+
+                    noteOk1 || noteOk2
+                }
+                else
+                    false
+            ))
+
+        ok1 || ok2
     }
 }
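[Editor's note] The reworked `alreadyMarked` above splits one boolean expression into named `ok1`/`ok2` parts. A self-contained sketch of the same check over a simplified note model (Note, notesAt and isContinuous are hypothetical stand-ins for the project's types):

    final case class Note(noteType: String, sparsity: Int, tokenIndexes: Seq[Int])

    // Stand-in for U.isContinuous: indexes increase strictly by one.
    def isContinuous(idxs: Seq[Int]): Boolean =
        idxs.size < 2 || idxs.zip(idxs.tail).forall { case (a, b) => b == a + 1 }

    def alreadyMarked(notesAt: Int => Seq[Note], elemId: String, toksIdxsSorted: Seq[Int], idxs: Seq[Int]): Boolean = {
        // ok1: every requested index already carries a zero-sparsity note of this element type.
        val ok1 = idxs.forall(i => notesAt(i).exists(n => n.noteType == elemId && n.sparsity == 0))

        // ok2: some existing note of this element type subsumes (or equals) the candidate span.
        lazy val ok2 = toksIdxsSorted.flatMap(notesAt).exists(n =>
            n.noteType == elemId && (
                (n.sparsity == 0 &&
                    (idxs.containsSlice(n.tokenIndexes) || n.tokenIndexes.containsSlice(toksIdxsSorted))) ||
                (n.tokenIndexes == toksIdxsSorted ||
                    n.tokenIndexes.containsSlice(toksIdxsSorted) &&
                    isContinuous(toksIdxsSorted) && isContinuous(n.tokenIndexes))
            )
        )

        ok1 || ok2
    }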
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
index 00d6bdf..f9f7a01 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala
@@ -369,7 +369,7 @@ object NCSentenceManager extends NCService {
             val t = NCNlpSentenceToken(idx)

             // Note, it adds stop-words too.
-            val content = nsCopyToks.zipWithIndex.filter(p => indexes.contains(p._2)).map(_._1)
+            val content = nsCopyToks.zipWithIndex.filter { case (_, idx) => indexes.contains(idx) }.map { case (tok, _) => tok}

             content.foreach(t => history += t.index -> idx)
@@ -378,15 +378,12 @@ object NCSentenceManager extends NCService {
                 val n = content.size - 1

-                content.zipWithIndex.foreach(p => {
-                    val t = p._1
-                    val idx = p._2
-
+                content.zipWithIndex.foreach { case (t, idx) =>
                     buf += get(t)

                     if (idx < n && t.endCharIndex != content(idx + 1).startCharIndex)
                         buf += " "
-                })
+                }

                 buf.mkString
             }
@@ -459,8 +456,7 @@ object NCSentenceManager extends NCService {
             for (tok <- ns.filter(_.isTypeOf(noteType)) if ok)
                 tok.getNoteOpt(noteType, idxsField) match {
                     case Some(n) =>
-                        val idxs: Seq[Seq[Int]] =
-                            n.data[JList[JList[Int]]](idxsField).asScala.map(_.asScala.toSeq).toSeq
+                        val idxs: Seq[Seq[Int]] = n.data[JList[JList[Int]]](idxsField).asScala.map(_.asScala.toSeq).toSeq
                         var fixed = idxs

                         history.foreach {
@@ -539,8 +535,7 @@ object NCSentenceManager extends NCService {
         // Validation (all indexes calculated well)
         require(
             !res ||
-                !ns.flatten.
-                    exists(n => ns.filter(_.wordIndexes.exists(n.wordIndexes.contains)).exists(t => !t.contains(n))),
+                !ns.flatten.exists(n => ns.filter(_.wordIndexes.exists(n.wordIndexes.contains)).exists(t => !t.contains(n))),
             s"Invalid sentence:\n" +
                 ns.map(t =>
                     // Human readable invalid sentence for debugging.
@@ -745,9 +740,11 @@ object NCSentenceManager extends NCService {
                 )
             )

+        // Below, similar variants are dropped by several criteria.
+
         def notNlpNotes(s: NCNlpSentence): Seq[NCNlpSentenceNote] = s.flatten.filter(!_.isNlp)

-        // Drops similar sentences (with same notes structure). Keeps with more found.
+        // Drops similar sentences with the same notes structure, based on greedy elements. Keeps the one with more notes found.
         val notGreedyElems = mdl.getElements.asScala.flatMap(e => if (!e.isGreedy.orElse(mdl.isGreedy)) Some(e.getId) else None).toSet
@@ -768,6 +765,7 @@ object NCSentenceManager extends NCService {
         var sensWithNotesIdxs = sensWithNotes.zipWithIndex

+        // Drops similar sentences if there are other sentences with a superset of notes.
         sens = sensWithNotesIdxs.filter { case ((_, notNlpNotes1), idx1) =>
             !sensWithNotesIdxs.
                 exists { case((_, notNlpNotes2), _) => notNlpNotes1.subsetOf(notNlpNotes2) }
         }.map { case ((sen, _), _) => sen }

-        // Drops similar sentences (with same tokens structure).
-        // Among similar sentences we prefer one with minimal free words count.
+        // Drops similar sentences. Among similar sentences we prefer the one with the minimal free-word count.
         sens = sens.groupBy(notNlpNotes(_).map(_.getKey(withIndexes = false))).
             map { case (_, seq) => seq.minBy(_.filter(p => p.isNlp && !p.isStopWord).map(_.wordIndexes.length).sum) }.
             toSeq

-        // Drops sentences if they are just subset of another.
+        // Drops sentences if they are just a subset of another (indexes are ignored here).
         sensWithNotes = sensWithNotes.filter { case (sen, _) => sens.contains(sen) }

         sensWithNotesIdxs = sensWithNotes.zipWithIndex
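[Editor's note] The variant pruning above combines a superset filter with a min-by-free-words pick. A runnable sketch over a deliberately simplified variant model (Variant and its fields are hypothetical stand-ins):

    final case class Variant(notes: Set[String], freeWords: Int)

    def prune(variants: Seq[Variant]): Seq[Variant] = {
        // Drop a variant when some other variant's note set strictly subsumes it.
        val maximal = variants.filter(v =>
            !variants.exists(o => (o ne v) && v.notes != o.notes && v.notes.subsetOf(o.notes))
        )

        // Among variants with the same note structure keep the one with the fewest free words.
        maximal.groupBy(_.notes).map { case (_, same) => same.minBy(_.freeWords) }.toSeq
    }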
                getOrElseUpdate(
                    elemId,
@@ -51,7 +51,7 @@ object NCSynonymsManager extends NCService {
                 getOrElseUpdate(
                     tokens,
                     mutable.HashSet.empty[Synonym]
-                ).add(s)
+                ).add(syn)
         }

     private case class SavedIdlKey(id: String, startCharIndex: Int, endCharIndex: Int, other: Map[String, AnyRef] = Map.empty)
@@ -72,13 +72,11 @@ object NCSynonymsManager extends NCService {
             )
     }

-    private case class Value(request: NCRequest, variants: Seq[Seq[NCToken]], predicate: NCIdlFunction) {
-        override def toString: String = variants.toString()
-    }
+    private case class SavedIdlValue(request: NCRequest, variants: Seq[Seq[NCToken]], predicate: NCIdlFunction)

     private case class IdlChunkKey(token: IdlToken, chunk: NCProbeSynonymChunk)

-    private val savedIdl = mutable.HashMap.empty[String, mutable.HashMap[SavedIdlKey, mutable.ArrayBuffer[Value]]]
+    private val savedIdl = mutable.HashMap.empty[String, mutable.HashMap[SavedIdlKey, mutable.ArrayBuffer[SavedIdlValue]]]
     private val idlChunksCache = mutable.HashMap.empty[String, mutable.HashMap[IdlChunkKey, Boolean]]
     private val idlCaches = mutable.HashMap.empty[String, CacheHolder[IdlToken]]
     private val tokCaches = mutable.HashMap.empty[String, CacheHolder[Int]]
@@ -120,7 +118,7 @@ object NCSynonymsManager extends NCService {
     /**
      *
-     * @param s
+     * @param syn
      * @param toks
      * @param isMatch
      * @param getIndex
      * @param shouldBeNeighbors
      * @tparam T
      */
     private def sparseMatch0[T](
-        s: Synonym,
+        syn: Synonym,
         toks: Seq[T],
         isMatch: (T, NCProbeSynonymChunk) => Boolean,
         getIndex: T => Int,
         shouldBeNeighbors: Boolean
     ): Option[Seq[T]] =
-        if (toks.size >= s.size) {
+        if (toks.size >= syn.size) {
             lazy val res = mutable.ArrayBuffer.empty[T]
             lazy val all = mutable.HashSet.empty[T]

+            // There are 3 states:
+            // 0 - initial working state, first step.
+            // 1 - working state, not first step.
+            // -1 - stop state.
             var state = 0

-            for (chunk <- s if state != -1) {
+            for (chunk <- syn if state != -1) {
                 val seq =
                     if (state == 0) {
                         state = 1
@@ -153,12 +155,12 @@ object NCSynonymsManager extends NCService {
                 if (seq.nonEmpty) {
                     val head = seq.head

-                    if (!s.permute && res.nonEmpty && getIndex(head) <= getIndex(res.last))
+                    if (!syn.permute && res.nonEmpty && getIndex(head) <= getIndex(res.last))
                         state = -1
                     else {
                         all ++= seq

-                        if (all.size > s.size)
+                        if (all.size > syn.size)
                             state = -1
                         else
                             res += head
@@ -168,7 +170,12 @@ object NCSynonymsManager extends NCService {
                     state = -1
             }

-            if (state != -1 && all.size == res.size && (!shouldBeNeighbors || U.isIncreased(res.map(getIndex).toSeq.sorted)))
+            if (
+                state != -1 && // State is ok.
+                all.size == res.size && // No excess tokens were processed.
+                // `neighbors` condition, important for simple (non-sparse) synonyms.
+                (!shouldBeNeighbors || U.isIncreased(res.map(getIndex).toSeq.sorted))
+            )
                 Some(res.toSeq)
             else
                 None

     private def save(req: NCRequest, tok: NCToken, pred: NCIdlFunction, variantsToks: Seq[Seq[NCToken]]): Unit = {
         savedIdl.
             getOrElseUpdate(req.getServerRequestId, mutable.HashMap.empty).
-            getOrElseUpdate(SavedIdlKey(tok), mutable.ArrayBuffer.empty) +=
-                Value(req, variantsToks, pred)
+            getOrElseUpdate(SavedIdlKey(tok), mutable.ArrayBuffer.empty) +=
+                SavedIdlValue(req, variantsToks, pred)
     }

     /**
+     * Checks that the given synonym hasn't been checked yet against the given NLP token indexes.
      *
      * @param srvReqId
      * @param elemId
-     * @param s
+     * @param syn
      * @param tokens
      */
-    private def isUnprocessedTokens(srvReqId: String, elemId: String, s: Synonym, tokens: Seq[Int]): Boolean =
-        tokCaches.getOrElseUpdate(srvReqId, new CacheHolder[Int]).isUnprocessed(elemId, s, tokens)
+    private def isUnprocessedTokens(srvReqId: String, elemId: String, syn: Synonym, tokens: Seq[Int]): Boolean =
+        tokCaches.getOrElseUpdate(srvReqId, new CacheHolder[Int]).isUnprocessed(elemId, syn, tokens)

     /**
+     * Checks that the given synonym hasn't been checked yet against the given IDL tokens.
      *
      * @param srvReqId
      * @param elemId
-     * @param s
+     * @param syn
      * @param tokens
      */
-    private def isUnprocessedIdl(srvReqId: String, elemId: String, s: Synonym, tokens: Seq[IdlToken]): Boolean =
-        idlCaches.getOrElseUpdate(srvReqId, new CacheHolder[IdlToken]).isUnprocessed(elemId, s, tokens)
+    private def isUnprocessedIdl(srvReqId: String, elemId: String, syn: Synonym, tokens: Seq[IdlToken]): Boolean =
+        idlCaches.getOrElseUpdate(srvReqId, new CacheHolder[IdlToken]).isUnprocessed(elemId, syn, tokens)

     /**
+     * Checks whether an IDL token matches a synonym chunk.
      *
-     * @param tow
-     * @param chunk
-     * @param req
-     * @param variantsToks
+     * @param t IDL token.
+     * @param chunk Synonym's chunk.
+     * @param req Request.
+     * @param variantsToks All possible request's variants.
      */
     private def isMatch(
-        tow: IdlToken, chunk: NCProbeSynonymChunk, req: NCRequest, variantsToks: Seq[Seq[NCToken]]
+        t: IdlToken, chunk: NCProbeSynonymChunk, req: NCRequest, variantsToks: Seq[Seq[NCToken]]
     ): Boolean =
         idlChunksCache.
-            getOrElseUpdate(req.getServerRequestId,
+            getOrElseUpdate(
+                req.getServerRequestId,
                 mutable.HashMap.empty[IdlChunkKey, Boolean]
             ).
             getOrElseUpdate(
-                IdlChunkKey(tow, chunk),
+                IdlChunkKey(t, chunk),
                 {
-                    def get0[T](fromToken: NCToken => T, fromWord: NlpToken => T): T =
-                        if (tow.isToken) fromToken(tow.token) else fromWord(tow.word)
-
                     chunk.kind match {
-                        case TEXT => chunk.wordStem == get0(_.stem, _.stem)
+                        case TEXT => chunk.wordStem == t.stem

                         case REGEX =>
-                            chunk.regex.matcher(get0(_.origText, _.origText)).matches() ||
-                            chunk.regex.matcher(get0(_.normText, _.normText)).matches()
+                            chunk.regex.matcher(t.origText).matches() || chunk.regex.matcher(t.normText).matches()

                         case IDL =>
-                            val ok =
+                            val ok = {
+                                // IDL conditions apply to tokens only.
+                                t.isToken &&
+                                // At least one suitable variant (a valid NCIdlContext) should be found for the given token.
+                                // That variant will be checked again in the last processing phase.
                                 variantsToks.par.exists(vrntToks =>
-                                    get0(t =>
-                                        chunk.idlPred.apply(t, NCIdlContext(toks = vrntToks, req = req)). value.asInstanceOf[Boolean],
-                                        _ => false
+                                    chunk.idlPred.apply(
+                                        t.token,
+                                        NCIdlContext(toks = vrntToks, req = req)).value.asInstanceOf[Boolean]
                                 )
-                                )
+                            }

+                            // Saves all variants for later validation.
+                            // All suitable variants may be deleted, so this positive result can still be revoked
+                            // in the last processing phase.
                             if (ok)
-                                save(req, tow.token, chunk.idlPred, variantsToks)
+                                save(req, t.token, chunk.idlPred, variantsToks)

                             ok
@@ -270,22 +283,29 @@ object NCSynonymsManager extends NCService {
             require(toks != null)
             require(!syn.sparse && !syn.hasIdl)

-            if (
-                toks.length == syn.length && {
+            if (toks.length == syn.length) { // Same length.
+                val ok =
                     if (syn.isTextOnly)
-                        toks.zip(syn).forall(p => p._1.stem == p._2.wordStem)
+                        toks.zip(syn).
+                            // Checks all synonym chunks with all tokens.
+                            forall { case (tok, chunk) => tok.stem == chunk.wordStem }
                     else
-                        toks.zip(syn).sortBy(p => getSort(p._2.kind)).forall { case (tok, chunk) => isMatch(tok, chunk) }
-            }
-            )
-                callback(())
+                        toks.zip(syn).
+                            // Pre-sort by chunk kind for performance reasons; chunks that are easier to compare go first.
+                            sortBy { case (_, chunk) => getSort(chunk.kind) }.
+                            // Checks all synonym chunks with all tokens.
+                            forall { case (tok, chunk) => isMatch(tok, chunk) }

+                if (ok)
+                    callback(())
+            }
     }

     /**
      *
      * @param srvReqId
      * @param elemId
-     * @param s
+     * @param syn
      * @param toks
      * @param req
      * @param variantsToks
      */
     def onMatch(
         srvReqId: String,
         elemId: String,
-        s: Synonym,
+        syn: Synonym,
         toks: Seq[IdlToken],
         req: NCRequest,
         variantsToks: Seq[Seq[NCToken]],
         callback: Unit => Unit
     ): Unit =
-        if (isUnprocessedIdl(srvReqId, elemId, s, toks)) {
+        if (isUnprocessedIdl(srvReqId, elemId, syn, toks)) {
             require(toks != null)

             if (
-                toks.length == s.length && // Same length.
-                toks.count(_.isToken) >= s.idlChunks && // Enough tokens.
-                toks.zip(s).sortBy { // Pre-sort by chunk kind.
+                toks.length == syn.length && // Same length.
+                toks.count(_.isToken) >= syn.idlChunks && // Enough tokens.
+                toks.zip(syn).sortBy { // Pre-sort by chunk kind for performance reasons; chunks that are easier to compare go first.
                     case (_, chunk) => getSort(chunk.kind)
-                }
-                .forall { // TODO?
+                }.
+                forall { // Checks all synonym chunks with all tokens.
                     case (idlTok, chunk) => isMatch(idlTok, chunk, req, variantsToks)
                 }
             )
@@ -365,7 +385,7 @@ object NCSynonymsManager extends NCService {
             syn,
             toks,
             (t: IdlToken, chunk: NCProbeSynonymChunk) => isMatch(t, chunk, req, variantsToks),
-            (t: IdlToken) => if (t.isToken) t.token.getStartCharIndex else t.word.startCharIndex,
+            (t: IdlToken) => t.startCharIndex,
             shouldBeNeighbors = !syn.sparse
         ) match {
             case Some(res) => callback(res)
@@ -374,13 +394,15 @@ object NCSynonymsManager extends NCService {
     }

     /**
+     * Checks that a suitable variant wasn't deleted and that the IDL condition for the token is still valid.
+     * We have to re-check because the NCIdlContext used in the predicate is based on the variant.
      *
      * @param srvReqId
-     * @param senToks
+     * @param toks
      */
-    def isStillValidIdl(srvReqId: String, senToks: Seq[NCToken]): Boolean =
+    def isStillValidIdl(srvReqId: String, toks: Seq[NCToken]): Boolean =
         savedIdl.get(srvReqId) match {
-            case Some(m) =>
+            case Some(map) =>
                 lazy val allCheckedSenToks = {
                     val set = mutable.HashSet.empty[SavedIdlKey]

                     def add(t: NCToken): Unit = {
                         set += SavedIdlKey(t)

                         t.getPartTokens.asScala.foreach(add)
                     }

-                    senToks.foreach(add)
+                    toks.foreach(add)

                     set
                 }

-                senToks.forall(tok =>
-                    m.get(SavedIdlKey(tok)) match {
+                toks.forall(tok =>
+                    map.get(SavedIdlKey(tok)) match {
                         case Some(vals) =>
                             vals.exists(
                                 v =>
@@ -417,6 +439,7 @@ object NCSynonymsManager extends NCService {
     }

     /**
+     * Called when request processing is finished.
      *
      * @param srvReqId
      */
@@ -427,6 +450,7 @@ object NCSynonymsManager extends NCService {
     }

     /**
+     * Called on each request enrichment iteration.
      *
      * @param srvReqId
      */
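[Editor's note] Several hunks above repeat one idea: zip tokens with synonym chunks and pre-sort the pairs by chunk kind, so cheap comparisons run first and `forall` can fail fast before any expensive IDL predicate is evaluated. A minimal sketch of that ordering trick (the kinds and their costs mirror the TEXT/REGEX/IDL split shown above; everything else is hypothetical):

    sealed trait Kind
    case object TEXT extends Kind // Stem equality - cheapest.
    case object REGEX extends Kind // Regex matching - moderate.
    case object IDL extends Kind // Predicate over variants - most expensive.

    def cost(k: Kind): Int = k match {
        case TEXT => 0
        case REGEX => 1
        case IDL => 2
    }

    def matches[T, C](toks: Seq[T], chunks: Seq[(C, Kind)], isMatch: (T, C) => Boolean): Boolean =
        toks.length == chunks.length && // Same length, as in onMatch above.
        toks.zip(chunks).
            sortBy { case (_, (_, kind)) => cost(kind) }. // Cheap kinds first.
            forall { case (tok, (chunk, _)) => isMatch(tok, chunk) } // Fails fast on the cheap checks.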

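[Editor's note] Finally, the per-request CacheHolder used throughout NCSynonymsManager relies on a compact idiom: nested getOrElseUpdate maps ending in a HashSet whose add() reports first-time insertion. A standalone sketch (the middle Int key is assumed here to be the token-list size, which the hunks above don't confirm):

    import scala.collection.mutable

    class CacheHolderSketch[S, T] {
        private val cache =
            mutable.HashMap.empty[String, mutable.HashMap[Int, mutable.HashMap[Seq[T], mutable.HashSet[S]]]]

        // True exactly once per (elemId, syn, tokens) triple:
        // HashSet#add returns false when the synonym was already recorded for these tokens.
        def isUnprocessed(elemId: String, syn: S, tokens: Seq[T]): Boolean =
            cache.
                getOrElseUpdate(elemId, mutable.HashMap.empty).
                getOrElseUpdate(tokens.size, mutable.HashMap.empty).
                getOrElseUpdate(tokens, mutable.HashSet.empty).
                add(syn)
    }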