This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-443
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-443 by this push:
new 0d1907b WIP.
0d1907b is described below
commit 0d1907b99591d26c4c4a36a0722f5bc99fe1e258
Author: Sergey Kamov <[email protected]>
AuthorDate: Fri Sep 24 18:37:43 2021 +0300
WIP.
---
.../nlpcraft/probe/mgrs/NCProbeSynonym.scala | 46 +++----
.../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 148 ++++++++++-----------
.../mgrs/nlp/enrichers/model/NCSentenceCache.scala | 110 ---------------
.../probe/mgrs/synonyms/NCSynonymsManager.scala | 3 +-
4 files changed, 92 insertions(+), 215 deletions(-)
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
index 7eefd97..d865c6a 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/NCProbeSynonym.scala
@@ -52,6 +52,18 @@ class NCProbeSynonym(
lazy val isValueSynonym: Boolean = value != null
lazy val stems: String = map(_.wordStem).mkString(" ")
+ private lazy val hash =
+ Seq(
+ super.hashCode(),
+ isTextOnly,
+ regexChunks,
+ idlChunks,
+ isValueSynonym,
+ isElementId,
+ isValueName,
+ value
+ ).map(p => if (p == null) 0 else p.hashCode()).foldLeft(0)((a, b) => 31 * a + b)
+
override def toString(): String = mkString(" ")
// Orders synonyms from least to most significant.
@@ -109,36 +121,20 @@ class NCProbeSynonym(
}
}
- override def canEqual(other: Any): Boolean = other.isInstanceOf[NCProbeSynonym]
-
override def equals(other: Any): Boolean = other match {
case that: NCProbeSynonym =>
- super.equals(that) &&
- (that canEqual this) &&
- isTextOnly == that.isTextOnly &&
- regexChunks == that.regexChunks &&
- idlChunks == that.idlChunks &&
- isValueSynonym == that.isValueSynonym &&
- isElementId == that.isElementId &&
- isValueName == that.isValueName &&
- value == that.value
+ isElementId == that.isElementId &&
+ isTextOnly == that.isTextOnly &&
+ regexChunks == that.regexChunks &&
+ idlChunks == that.idlChunks &&
+ isValueSynonym == that.isValueSynonym &&
+ isValueName == that.isValueName &&
+ value == that.value &&
+ super.equals(that)
case _ => false
}
- override def hashCode(): Int = {
- val state = Seq(
- super.hashCode(),
- isTextOnly,
- regexChunks,
- idlChunks,
- isValueSynonym,
- isElementId,
- isValueName,
- value
- )
-
- state.map(p => if (p == null) 0 else p.hashCode()).foldLeft(0)((a, b) => 31 * a + b)
- }
+ override def hashCode(): Int = hash
}
object NCProbeSynonym {
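Note: two micro-optimizations land in NCProbeSynonym here. The hash is now
computed once in a lazy val (safe because all participating fields are
immutable), and equals() is reordered so the cheap flag comparisons run before
the more expensive chunk collections and the super.equals call. A minimal
runnable sketch of the same pattern, using a hypothetical Syn class rather
than the actual NCProbeSynonym:

    final class Syn(val id: String, val isTextOnly: Boolean, val chunks: Seq[String]) {
        // Computed at most once, on first use; safe because every field is immutable.
        private lazy val cachedHash =
            Seq(id, isTextOnly, chunks)
                .map(p => if (p == null) 0 else p.hashCode())
                .foldLeft(0)((a, b) => 31 * a + b)

        override def hashCode(): Int = cachedHash

        override def equals(other: Any): Boolean = other match {
            // Cheapest comparisons first: a Boolean, then a String, then the Seq.
            case that: Syn => isTextOnly == that.isTextOnly && id == that.id && chunks == that.chunks
            case _ => false
        }
    }

    object SynDemo extends App {
        val a = new Syn("x:id", isTextOnly = true, Seq("a", "b"))
        val b = new Syn("x:id", isTextOnly = true, Seq("a", "b"))
        println(a == b)                   // true
        println(a.hashCode == b.hashCode) // true - consistent with equals
    }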
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
index 0cd8a92..e079aff 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala
@@ -34,7 +34,6 @@ import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeVariants, NCTokenPar
import java.io.Serializable
import java.util.{List => JList}
import scala.collection.mutable
-import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.CollectionConverters._
import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsJava, MapHasAsScala, SeqHasAsJava}
@@ -43,7 +42,6 @@ import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsJava, MapHasAsSca
*/
object NCModelEnricher extends NCProbeEnricher {
type TokType = (NCToken, NCSynonymChunkKind)
- type Cache = mutable.Map[String, ArrayBuffer[Seq[Int]]]
object Complex {
def apply(t: NCToken): Complex =
@@ -130,6 +128,22 @@ object NCModelEnricher extends NCProbeEnricher {
case class ComplexHolder(complexesWords: Seq[Complex], complexes: Seq[ComplexSeq])
+ class CacheHolder[T] {
+ private lazy val cache = mutable.HashMap.empty[Int, mutable.HashMap[Seq[T], mutable.HashSet[Synonym]]]
+
+ def add(s: Synonym, tokens: Seq[T]): Boolean = {
+ cache.
+ getOrElseUpdate(
+ tokens.length,
+ mutable.HashMap.empty[Seq[T], mutable.HashSet[Synonym]]
+ ).
+ getOrElseUpdate(
+ tokens,
+ mutable.HashSet.empty[Synonym]
+ ).add(s)
+ }
+ }
+
/**
*
* @param parent Optional parent span.
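Note: CacheHolder replaces both the removed Cache type alias and the old
per-sentence bookkeeping. Since mutable.HashSet.add returns true only when the
element was actually inserted, one add() call both records that a
synonym/token combination has been tried and reports whether it is new. A
standalone approximation of the intended use, with Synonym simplified to
String:

    import scala.collection.mutable

    class DedupCache[T] {
        // Buckets keyed by token count first, then by the exact token sequence.
        private lazy val cache = mutable.HashMap.empty[Int, mutable.HashMap[Seq[T], mutable.HashSet[String]]]

        // True only the first time this (synonym, tokens) pair is seen.
        def add(syn: String, tokens: Seq[T]): Boolean =
            cache.getOrElseUpdate(tokens.length, mutable.HashMap.empty)
                .getOrElseUpdate(tokens, mutable.HashSet.empty)
                .add(syn)
    }

    object DedupCacheDemo extends App {
        val c = new DedupCache[Int]
        println(c.add("syn1", Seq(0, 1))) // true - first attempt, run the matcher.
        println(c.add("syn1", Seq(0, 1))) // false - already tried, skip it.
        println(c.add("syn1", Seq(0, 2))) // true - different token indexes.
    }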
@@ -379,12 +393,6 @@ object NCModelEnricher extends NCProbeEnricher {
flatMap(w => ns.filter(t => t.wordIndexes.intersect(w.wordIndexes).nonEmpty))
).sortBy(_.startCharIndex)
- /**
- *
- * @param m
- * @param id
- * @return
- */
private def get(m: Map[String, Seq[Synonym]], id: String): Seq[Synonym] =
m.getOrElse(id, Seq.empty)
/**
@@ -455,7 +463,7 @@ object NCModelEnricher extends NCProbeEnricher {
* @param h
* @param toks
*/
- private def mkCombinations(h: ComplexHolder, toks: Seq[NlpToken], cache: mutable.HashSet[Seq[Complex]]): Seq[Seq[Complex]] = {
+ private def mkCombinations(h: ComplexHolder, toks: Seq[NlpToken]): Seq[Seq[Complex]] = {
val idxs = toks.flatMap(_.wordIndexes).toSet
h.complexes.par.
@@ -463,54 +471,15 @@ object NCModelEnricher extends NCProbeEnricher {
val rec = complexSeq.tokensComplexes.filter(_.wordIndexes.exists(idxs.contains))
// Drops without tokens (IDL part works with tokens).
- if (rec.nonEmpty) {
- val data = rec ++
+ if (rec.nonEmpty)
+ Some(rec ++
(complexSeq.wordsIndexes.intersect(idxs) -- rec.flatMap(_.wordIndexes)).map(h.complexesWords)
-
- if (!cache.contains(data)) Some(data) else None
- }
+ )
else
None
}).seq
}
- private def add(
- dbgType: String,
- ns: Sentence,
- contCache: Cache,
- elemId: String,
- greedy: Boolean,
- elemToks: Seq[NlpToken],
- sliceToksIdxs: Seq[Int],
- syn: Synonym,
- parts: Seq[TokType] = Seq.empty
- ): Unit = {
- val resIdxs = elemToks.map(_.index)
- val resIdxsSorted = resIdxs.sorted
-
- if (resIdxsSorted == sliceToksIdxs && U.isContinuous(resIdxsSorted))
- contCache(elemId) += sliceToksIdxs
-
- val ok =
- (!greedy || !alreadyMarked(ns, elemId, elemToks, sliceToksIdxs)) &&
- (parts.isEmpty || !parts.exists { case (t, _) => t.getId == elemId })
-
- if (ok)
- mark(ns, elemId, elemToks, direct = syn.isDirect && U.isIncreased(resIdxs), syn = Some(syn), parts = parts)
-
- if (DEEP_DEBUG)
- logger.trace(
- s"${if (ok) "Added" else "Skipped"} element [" +
- s"id=$elemId, " +
- s"type=$dbgType, " +
- s"text='${elemToks.map(_.origText).mkString(" ")}', " +
- s"indexes=${resIdxs.mkString("[", ",", "]")}, " +
- s"allTokensIndexes=${sliceToksIdxs.mkString("[", ",", "]")}, " +
- s"synonym=$syn" +
- s"]"
- )
- }
-
/**
*
* @param matched
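Note: with the mutable cache argument gone and the standalone add() helper
removed, mkCombinations now simply emits every candidate combination and
leaves deduplication to its call sites. A standalone sketch of the resulting
flatMap-over-Option shape, with plain Seq[Int] data standing in for Complex:

    import scala.collection.parallel.CollectionConverters._

    object CombinationsSketch extends App {
        val sequences = Seq(Seq(1, 2, 3), Seq(4), Seq.empty[Int])

        // Candidates are built in parallel; None rows vanish in flatMap,
        // and no mutable state is threaded through the computation.
        val combos = sequences.par.flatMap(seq =>
            if (seq.nonEmpty) Some(seq.filter(_ % 2 == 1)) else None
        ).seq

        println(combos) // two candidates survive: List(1, 3) and List()
    }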
@@ -536,13 +505,15 @@ object NCModelEnricher extends NCProbeEnricher {
) { span =>
val req = NCRequestImpl(senMeta, ns.srvReqId)
- val combToks = combosTokens(ns.toSeq)
lazy val ch = mkComplexes(mdl, ns)
lazy val variantsToks =
ch.complexes.map(
p => p.tokensComplexes.map(p => if (p.isToken) p.token else mkNlpToken(mdl, ns.srvReqId, p.word))
)
+ lazy val idlCache = new CacheHolder[NCIdlContent]()
+ lazy val tokCache = new CacheHolder[Int]()
+
def execute(simpleEnabled: Boolean, idlEnabled: Boolean): Unit =
    startScopedSpan(
        "execute", span, "srvReqId" -> ns.srvReqId, "mdlId" -> mdl.model.getId, "txt" -> ns.text
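Note: ch, variantsToks, and both caches are declared lazy, so a request that
never reaches the corresponding branch never pays for building them. A tiny
self-contained demonstration of the semantics relied on here:

    object LazyDemo extends App {
        var built = 0
        lazy val cache = { built += 1; scala.collection.mutable.HashSet.empty[String] }

        println(built)   // 0 - nothing allocated yet.
        cache.add("syn") // First access triggers construction...
        cache.add("syn")
        println(built)   // 1 - ...and it happens exactly once.
    }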
@@ -550,21 +521,43 @@ object NCModelEnricher extends NCProbeEnricher {
if (DEEP_DEBUG)
logger.trace(s"Execution started [simpleEnabled=$simpleEnabled, idlEnabled=$idlEnabled]")
- val contCache = mutable.HashMap.empty ++
- mdl.elements.keys.map(k => k -> mutable.ArrayBuffer.empty[Seq[Int]])
- lazy val idlCache = mutable.HashSet.empty[Seq[Complex]]
-
for (
// 'toksExt' is a piece of the sentence; 'toks' is the same as 'toksExt' or has some stop-words removed.
- (toks, toksExt) <- combToks;
+ (toks, toksExt) <- combosTokens(ns.toSeq);
idxs = toks.map(_.index);
e <- mdl.elements.values;
eId = e.getId;
greedy = e.isGreedy.orElse(mdl.model.isGreedy)
- if
- !greedy ||
- !contCache(eId).exists(_.containsSlice(idxs)) && !alreadyMarked(ns, eId, toks, idxs)
+ if !greedy || !alreadyMarked(ns, eId, toks, idxs)
) {
+ def add(
+ dbgType: String,
+ elemToks: Seq[NlpToken],
+ syn: Synonym,
+ parts: Seq[TokType] = Seq.empty
+ ): Unit = {
+ val resIdxs = elemToks.map(_.index)
+
+ val ok =
+ (!greedy || !alreadyMarked(ns, eId, elemToks, idxs)) &&
+ (parts.isEmpty || !parts.exists { case (t, _) => t.getId == eId })
+
+ if (ok)
+ mark(ns, eId, elemToks, direct = syn.isDirect && U.isIncreased(resIdxs), syn = Some(syn), parts = parts)
+
+ if (DEEP_DEBUG)
+ logger.trace(
+ s"${if (ok) "Added" else "Skipped"} element [" +
+ s"id=$eId, " +
+ s"type=$dbgType, " +
+ s"text='${elemToks.map(_.origText).mkString(" ")}', " +
+ s"indexes=${resIdxs.mkString("[", ",", "]")}, " +
+ s"allTokensIndexes=${idxs.mkString("[", ",", "]")}, " +
+ s"synonym=$syn" +
+ s"]"
+ )
+ }
+
// 1. SIMPLE.
if (simpleEnabled && (if (idlEnabled) mdl.hasIdlSynonyms(eId) else !mdl.hasIdlSynonyms(eId))) {
lazy val tokStems = toks.map(_.stem).mkString(" ")
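Note: the tokCache.add(s, idxs) guards appearing below are what replaces the
removed contCache bookkeeping: a side-effecting condition in the
for-comprehension records the attempt and filters out repeats in one step, so
each expensive match runs at most once per synonym/index combination. A
simplified runnable sketch of that guard pattern:

    import scala.collection.mutable

    object GuardDemo extends App {
        val tried = mutable.HashSet.empty[(String, Seq[Int])]

        def expensiveMatch(syn: String, idxs: Seq[Int]): Boolean = {
            println(s"matching $syn against $idxs") // runs once per unique pair
            idxs.nonEmpty
        }

        for {
            idxs <- Seq(Seq(0, 1), Seq(0, 1), Seq(2)) // note the duplicated slice
            syn <- Seq("syn1")
            if tried.add(syn -> idxs) // false on the repeat - that match is skipped
        } if (expensiveMatch(syn, idxs)) println(s"matched $syn at $idxs")
    }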
@@ -579,15 +572,15 @@ object NCModelEnricher extends NCProbeEnricher {
syns.get(tokStems) match {
case Some(s) =>
found = true
- add("simple continuous", ns, contCache, eId, greedy, toksExt, idxs, s)
+ add("simple continuous", toksExt, s)
case None => notFound()
}
def tryScan(syns: Seq[Synonym]): Unit =
- for (s <- syns if !found)
+ for (s <- syns if !found && tokCache.add(s, idxs))
if (NCSynonymsManager.isMatch(s, toks)) {
found = true
- add("simple continuous scan", ns, contCache, eId, greedy, toksExt, idxs, s)
+ add("simple continuous scan", toksExt, s)
}
tryMap(
@@ -604,10 +597,10 @@ object NCModelEnricher extends NCProbeEnricher {
// 1.2 Sparse.
if (!found && mdl.hasSparseSynonyms)
- for (s <- get(mdl.sparseSynonyms, eId))
+ for (s <- get(mdl.sparseSynonyms, eId) if tokCache.add(s, idxs))
NCSynonymsManager.sparseMatch(s, toks) match {
case Some(res) =>
- add("simple sparse", ns, contCache, eId, greedy, getSparsedTokens(res, toks), idxs, s)
+ add("simple sparse", getSparsedTokens(res, toks), s)
case None => // No-op.
}
}
@@ -615,8 +608,7 @@ object NCModelEnricher extends NCProbeEnricher {
// 2. IDL.
if (idlEnabled) {
val allSyns = get(mdl.idlSynonyms, eId)
- lazy val allCombs = mkCombinations(ch, toks, idlCache)
-
+ lazy val allCombs = mkCombinations(ch, toks)
// 2.1 Continuous.
if (!mdl.hasSparseSynonyms) {
@@ -624,16 +616,14 @@ object NCModelEnricher extends NCProbeEnricher {
for (
s <- allSyns;
- comb <- allCombs
- if !found;
+ comb <- allCombs;
data = comb.map(_.data)
+ if !found && idlCache.add(s, data)
)
if (NCSynonymsManager.isMatch(s, data, req, variantsToks)) {
val parts = toParts(mdl, ns.srvReqId, data, s)
- add("IDL continuous", ns, contCache, eId, greedy, toksExt, idxs, s, parts)
-
- idlCache += comb
+ add("IDL continuous", toksExt, s, parts)
found = true
}
@@ -642,17 +632,17 @@ object NCModelEnricher extends NCProbeEnricher {
// 2.2 Sparse.
for (
s <- allSyns;
- comb <- allCombs
+ comb <- allCombs;
+ data = comb.map(_.data)
+ if idlCache.add(s, data)
)
- NCSynonymsManager.sparseMatch(s, comb.map(_.data), req, variantsToks) match {
+ NCSynonymsManager.sparseMatch(s, data, req, variantsToks) match {
case Some(res) =>
- val typ = if (s.sparse) "IDL sparse" else "IDL continuous"
-
+ val toks = getSparsedTokens(toTokens(res, ns), toTokens(comb.map(_.data), ns))
val parts = toParts(mdl, ns.srvReqId, res, s)
+ val typ = if (s.sparse) "IDL sparse" else "IDL continuous"
- add(typ, ns, contCache, eId, greedy, getSparsedTokens(toTokens(res, ns), toTokens(comb.map(_.data), ns)), idxs, s, parts)
-
- idlCache += comb
+ add(typ, toks, s, parts)
case None => // No-op.
}
}
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCSentenceCache.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCSentenceCache.scala
deleted file mode 100644
index e5b6e3e..0000000
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCSentenceCache.scala
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * https://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model
-
-import org.apache.nlpcraft.common.nlp.{NCNlpSentenceToken => NlpToken}
-import org.apache.nlpcraft.probe.mgrs.{NCProbeSynonym => Synonym}
-import org.apache.nlpcraft.probe.mgrs.NCProbeSynonym.{NCIdlContent => IdlToken}
-import org.apache.nlpcraft.model.NCRequest
-import scala.collection.mutable
-
-class NCSentenceCache {
-// case class Key(elemId: String, indexes: Seq[Int])
-// case class Value[T](synonym: Synonym, result: Seq[T])
-//
-// val cacheToks = mutable.HashMap.empty[Key, mutable.HashMap[Seq[Int], Value[NlpToken]]]
-// val cacheIdl = mutable.HashMap.empty[Key, mutable.HashMap[Seq[Int], Value[IdlToken]]]
-//
-// var cacheHits = 0
-// var cacheCnt = 0
-// var time = 0L
-//
-// private def process[T](
-// elemId: String,
-// elemSyns: Seq[Synonym],
-// toks: Seq[T],
-// extract: (Synonym, Seq[T]) => Option[Seq[T]],
-// cache: mutable.Map[Key, mutable.HashMap[Seq[Int], Value[T]]],
-// getIndex: T => Int,
-// callback: (Synonym, Seq[T]) => Unit
-// ): Unit = {
-// val t = System.currentTimeMillis()
-//
-// val hash = toks.map(getIndex)
-// val key = Key(elemId, hash)
-//
-// cacheCnt += 1
-//
-// cache.get(key) match {
-// case Some(data) =>
-// cacheHits += 1
-// data.get(hash) match {
-// case Some(v) => callback(v.synonym, v.result)
-// case None => // No-op.
-// }
-// case None =>
-// // mutable.HashMap.empty[Key[IdlToken], Map[Seq[IdlToken], Value[IdlToken]]]
-// val hit = mutable.HashMap.empty[Seq[Int], Value[T]]
-//
-// for (s <- elemSyns)
-// extract(s, toks) match {
-// case Some(res) =>
-// callback(s, res)
-// hit += hash -> Value(s, res)
-// case None => // No-op.
-// }
-//
-// cache += key -> hit
-// }
-//
-// time += (System.currentTimeMillis() - t)
-// }
-//
-// def processSparseTokens(
-// elemId: String,
-// elemSyns: Seq[Synonym],
-// toks: Seq[NlpToken],
-// callback: (Synonym, Seq[NlpToken]) => Unit
-// ): Unit =
-// process(
-// elemId,
-// elemSyns,
-// toks,
-// (s: Synonym, toks: Seq[NlpToken]) => s.sparseMatch(toks),
-// cacheToks,
-// (t: NlpToken) => t.index,
-// callback
-// )
-//
-// def processSparseIdl(
-// elemId: String,
-// req: NCRequest,
-// elemSyns: Seq[Synonym],
-// toks: Seq[IdlToken],
-// callback: (Synonym, Seq[IdlToken]) => Unit
-// ): Unit =
-// process(
-// elemId,
-// elemSyns,
-// toks,
-// (s: Synonym, toks: Seq[IdlToken]) => s.sparseMatch(toks, req),
-// cacheIdl,
-// (t: IdlToken) => if (t.isRight) t.toOption.get.index else t.swap.toOption.get.getIndex,
-// callback
-// )
-}
\ No newline at end of file
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/synonyms/NCSynonymsManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/synonyms/NCSynonymsManager.scala
index d66f5af..465af93 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/synonyms/NCSynonymsManager.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/synonyms/NCSynonymsManager.scala
@@ -27,6 +27,7 @@ import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.{IDL, NCSynonymChu
import org.apache.nlpcraft.probe.mgrs.{NCProbeSynonymChunk, NCProbeSynonym => Synonym}
import scala.collection.mutable
+import scala.collection.parallel.CollectionConverters.ImmutableIterableIsParallelizable
import scala.compat.java8.OptionConverters._
import scala.jdk.CollectionConverters.ListHasAsScala
@@ -194,7 +195,7 @@ object NCSynonymsManager extends NCService {
case IDL =>
val ok =
- variantsToks.exists(vrntToks =>
+ variantsToks.par.exists(vrntToks =>
get0(t =>
chunk.idlPred.apply(t, NCIdlContext(toks = vrntToks, req = req)).value.asInstanceOf[Boolean],
_ => false
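Note: the one functional change in this file is that the IDL predicate is now
evaluated against the candidate variants in parallel; the new
ImmutableIterableIsParallelizable import is what gives the immutable
collection its .par. A minimal sketch of the same shape, with a trivial
predicate standing in for chunk.idlPred - and since exists() may evaluate the
predicate concurrently, anything it touches must be thread-safe:

    import scala.collection.parallel.CollectionConverters._

    object ParExistsDemo extends App {
        val variantsToks: Seq[Seq[String]] = Seq(Seq("a"), Seq("a", "b"), Seq("b"))

        // Variants are checked concurrently; the result is true as soon
        // as any worker finds a variant satisfying the predicate.
        val ok = variantsToks.par.exists(vrntToks => vrntToks.contains("b"))

        println(ok) // true
    }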