[incubator-nlpcraft] 02/02: WIP.

sergeykamov Mon, 28 Jun 2021 07:09:47 -0700

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-70_NEW
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git


commit 3c5fafb57f142c597b40332aaa561721ef8e469f
Author: Sergey Kamov <[email protected]>
AuthorDate: Mon Jun 28 17:07:54 2021 +0300

    WIP.
---
 .../nlpcraft/common/nlp/pos/NCPennTreebank.scala   |   4 +-
 .../enrichers/ctxword/NCContextWordEnricher.scala  | 114 +++++++++++++--------
 .../nlpcraft/model/ctxword/NCContextWordSpec.scala |  25 +++--
 3 files changed, 91 insertions(+), 52 deletions(-)

diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/pos/NCPennTreebank.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/pos/NCPennTreebank.scala
index a61c63a..0c6e0de 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/pos/NCPennTreebank.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/pos/NCPennTreebank.scala
@@ -68,7 +68,9 @@ object NCPennTreebank {
     final val SYNTH_POS_DESC = "Synthetic tag"
 
     // Useful POS tags sets.
-    final val NOUNS_POS = Seq("NN", "NNS", "NNP", "NNPS")
+    final val NOUNS_POS_PLURALS = Seq("NNS", "NNPS")
+    final val NOUNS_POS_SINGULAR = Seq("NN", "NNP")
+    final val NOUNS_POS = NOUNS_POS_PLURALS ++ NOUNS_POS_SINGULAR
     final val VERBS_POS = Seq("VB", "VBD", "VBG", "VBN", "VBP", "VBZ")
     final val WHS_POS = Seq("WDT", "WP", "WP$", "WRB")
     final val JJS_POS = Seq("JJ", "JJR", "JJS")
diff --git 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
index 79c970e..4d83ab7 100644
--- 
a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
+++ 
b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordEnricher.scala
@@ -18,14 +18,13 @@
 package org.apache.nlpcraft.server.nlp.enrichers.ctxword
 
 import io.opencensus.trace.Span
-import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager
 import org.apache.nlpcraft.common.nlp.core.NCNlpCoreManager.stem
-import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank
+import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank._
 import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceToken}
 import org.apache.nlpcraft.common.{NCE, NCService, U}
 import org.apache.nlpcraft.server.mdo.NCModelMLConfigMdo
+import org.apache.nlpcraft.server.nlp.core.{NCNlpParser, NCNlpServerManager, 
NCNlpWord}
 import org.apache.nlpcraft.server.nlp.enrichers.NCServerEnricher
-import org.apache.nlpcraft.server.nlp.enrichers.basenlp.NCBaseNlpEnricher
 import org.apache.nlpcraft.server.sugsyn.{NCSuggestSynonymManager, 
NCSuggestionRequest, NCWordSuggestion}
 import org.jibx.schema.codegen.extend.DefaultNameConverter
 
@@ -37,9 +36,6 @@ import scala.concurrent.duration.Duration
   * ContextWord enricher.
   */
 object NCContextWordEnricher extends NCServerEnricher {
-    private final val POS_PLURALS = Set("NNS", "NNPS")
-    private final val POS_SINGULAR = Set("NN", "NNP")
-
     private final val MAX_CTXWORD_SCORE = 2
     private final val EXCLUSIVE_MIN_SCORE = -1.0
 
@@ -47,17 +43,24 @@ object NCContextWordEnricher extends NCServerEnricher {
 
     private case class ModelProbeKey(probeId: String, modelId: String)
     private case class ElementScore(elementId: String, averageScore: Double, 
senScore: Double, sampleScore: Double)
+    private case class ValuesHolder(
+        values: Map[/** Key queried by token lemma or normalized text */String, /** Element IDs */Set[String]],
+        valuesStems: Map[/** Key queried by token stem */String, /** Element IDs */Set[String]],
+    )
 
     private type ElementStemScore = Map[/** Element ID */String, Map[/** Stem 
*/String,/** Score */Double]]
 
-    @volatile private var values: mutable.HashMap[ModelProbeKey, Map[/** Stem 
*/String, /** Element ID */Set[String]]] = _
+    @volatile private var valuesStems: mutable.HashMap[ModelProbeKey, 
ValuesHolder] = _
     @volatile private var samples: mutable.HashMap[ModelProbeKey, 
ElementStemScore] = _
 
+    @volatile private var parser: NCNlpParser = _
+
     override def start(parent: Span = null): NCService = 
startScopedSpan("start", parent) { _ =>
         ackStarting()
 
-        values = mutable.HashMap.empty
+        valuesStems = mutable.HashMap.empty
         samples = mutable.HashMap.empty
+        parser = NCNlpServerManager.getParser
 
         ackStarted()
     }
@@ -65,8 +68,9 @@ object NCContextWordEnricher extends NCServerEnricher {
     override def stop(parent: Span = null): Unit = startScopedSpan("stop", 
parent) { _ =>
         ackStopping()
 
+        parser = null
         samples = null
-        values = null
+        valuesStems = null
 
         ackStopped()
     }
@@ -96,6 +100,7 @@ object NCContextWordEnricher extends NCServerEnricher {
 
     /**
       *
+      * @param nlpWords
       * @param sampleWords
       * @param sampleWordsStems
       * @param elemValuesSyns
@@ -103,27 +108,52 @@ object NCContextWordEnricher extends NCServerEnricher {
       * @return
       */
     private def parseSample(
+        nlpWords: Seq[Seq[NCNlpWord]],
         sampleWords: Seq[Seq[String]],
         sampleWordsStems: Seq[Seq[String]],
         elemValuesSyns: Set[String],
         elemValuesSynsStems: Set[String]
     ): Iterable[NCSuggestionRequest] = {
+        require(nlpWords.size == sampleWords.size)
         require(sampleWords.size == sampleWordsStems.size)
         require(elemValuesSyns.size == elemValuesSynsStems.size)
 
-        sampleWordsStems.zip(sampleWords).flatMap { case (sampleWordsStem, 
sampleWord) =>
+        sampleWordsStems.zip(sampleWords).zip(nlpWords).flatMap { case 
((sampleWordsStem, sampleWords), nlpWords) =>
             val idxs = elemValuesSynsStems.flatMap(valSynsStem => {
                 val i = sampleWordsStem.indexOf(valSynsStem)
 
                 if (i >= 0) Some(i) else None
             })
 
+            // Builds the suggestion request for the current sample with the word at `idx` replaced by `syn`.
+            // If the replacement changes the noun number (singular <-> plural), `syn` is converted to match.
+            def mkRequest(idx: Int, syn: String): NCSuggestionRequest = {
+                def mkSentence(word: String): String =
+                    sampleWords.zipWithIndex.map { case (w, i) => if (i != idx) w else word }.mkString(" ")
+
+                val sen = mkSentence(syn)
+                val nlpWordsNew = parser.parse(sen)
+
+                // The substitution must not change tokenization, otherwise `idx` would address another word.
+                require(nlpWords.size == nlpWordsNew.size)
+
+                val pos = nlpWords(idx).pos
+                val posNew = nlpWordsNew(idx).pos
+
+                // Single expression instead of a reassigned `var`; debug output to stdout is removed.
+                val fixedSen =
+                    if (NOUNS_POS_SINGULAR.contains(pos) && NOUNS_POS_PLURALS.contains(posNew))
+                        mkSentence(CONVERTER.depluralize(syn))
+                    else if (NOUNS_POS_PLURALS.contains(pos) && NOUNS_POS_SINGULAR.contains(posNew))
+                        mkSentence(CONVERTER.pluralize(syn))
+                    else
+                        sen
+
+                NCSuggestionRequest(fixedSen, idx)
+            }
+
             for (idx <- idxs; syn <- elemValuesSyns)
-                yield
-                    NCSuggestionRequest(
-                        sampleWord.zipWithIndex.map { case (w, i) => if (i != 
idx) w else syn }.mkString(" "),
-                        idx
-                    )
+                yield mkRequest(idx, syn)
         }
     }
 
@@ -157,17 +191,22 @@ object NCContextWordEnricher extends NCServerEnricher {
       * @param key
       * @return
       */
-    private def getValuesData(cfg: NCModelMLConfigMdo, key: ModelProbeKey): 
Map[String, Set[String]] =
-        values.synchronized { values.get(key) } match {
+    private def getValuesData(cfg: NCModelMLConfigMdo, key: ModelProbeKey): 
ValuesHolder =
+        valuesStems.synchronized { valuesStems.get(key) } match {
             case Some(cache) => cache
             case None =>
-                val res = cfg.values.
-                    flatMap { case (elemId, vals) => vals.map { case (_, vals) 
=> vals.map(stem(_) -> elemId) } }.
-                    flatten.
-                    groupBy { case (stem, _) => stem }.
-                    map { case (stem, map) => stem -> map.map {case (_, 
elemId) => elemId }.toSet }
+                def mkMap(convert: String => String): Map[String, Set[String]] 
=
+                    cfg.values.
+                        flatMap { case (elemId, vals) => vals.map { case (_, 
vals) => vals.map(convert(_) -> elemId) } }.
+                        flatten.
+                        groupBy { case (converted, _) => converted }.
+                        map { case (converted, map) => converted -> map.map 
{case (_, elemId) => elemId }.toSet }
+
+                // `values` is queried by lower-cased lemma / normalized text and `valuesStems` by token stem,
+                // so each map is keyed with the matching conversion; named arguments guard the order.
+                val res = ValuesHolder(values = mkMap(_.toLowerCase), valuesStems = mkMap(stem))
 
-                values.synchronized { values += key -> res }
+                valuesStems.synchronized { valuesStems += key -> res }
 
                 res
         }
@@ -179,29 +216,20 @@ object NCContextWordEnricher extends NCServerEnricher {
       */
     @throws[NCE]
     private def askSamples(cfg: NCModelMLConfigMdo): ElementStemScore = {
-        val sampleWords = cfg.samples.map(spaceTokenize).toSeq
-
-
-        sampleWords.map(s => {
-            val sampleSen = new NCNlpSentence("sampleReqId", 
sampleWords.mkString(" "), Set.empty)
-
-            NCBaseNlpEnricher.enrich(sampleSen)
-
-            sampleSen.
-        })
-
-
-
+        val samplesSeq = cfg.samples.toSeq
+        val sampleWords = samplesSeq.map(spaceTokenize)
+        val nlpWords = samplesSeq.map(s => parser.parse(s))
 
         val sampleWordsStems = sampleWords.map(_.map(stem))
 
-        val recs: Map[String, Seq[NCSuggestionRequest]] =
+        val recs =
             (
                 for (
                     (elemId, elemValues) <- cfg.values;
-                    elemValuesSyns = elemValues.flatMap(_._2).toSet;
+                    // Uses single words synonyms only.
+                    elemValuesSyns = 
elemValues.flatMap(_._2).toSet.filter(!_.contains(' '));
                     elemValuesSynsStems = elemValuesSyns.map(stem);
-                    suggReq <- parseSample(sampleWords, sampleWordsStems, 
elemValuesSyns, elemValuesSynsStems)
+                    suggReq <- parseSample(nlpWords, sampleWords, 
sampleWordsStems, elemValuesSyns, elemValuesSynsStems)
                 )
                     yield (elemId, suggReq)
             ).
@@ -249,7 +277,7 @@ object NCContextWordEnricher extends NCServerEnricher {
                     }
                 }
 
-                val nounToks = ns.tokens.filter(t => 
NCPennTreebank.NOUNS_POS.contains(t.pos))
+                val nounToks = ns.tokens.filter(t => NOUNS_POS.contains(t.pos))
 
                 if (nounToks.nonEmpty) {
                     val key = ModelProbeKey(cfg.probeId, cfg.modelId)
@@ -257,7 +285,11 @@ object NCContextWordEnricher extends NCServerEnricher {
                     // 1. Values. Direct.
                     val valuesData = getValuesData(cfg, key)
 
-                    for (nounTok <- nounToks; elemId <- 
valuesData.getOrElse(nounTok.stem, Set.empty))
+                    for (nounTok <- nounToks; elemId <- 
valuesData.values.getOrElse(nounTok.lemma.toLowerCase, Set.empty))
+                        add(nounTok, elemId, 1, 1, 1)
+                    for (nounTok <- nounToks; elemId <- 
valuesData.values.getOrElse(nounTok.normText, Set.empty))
+                        add(nounTok, elemId, 1, 1, 1)
+                    for (nounTok <- nounToks; elemId <- 
valuesData.valuesStems.getOrElse(nounTok.stem, Set.empty))
                         add(nounTok, elemId, 1, 1, 1)
 
                     // 2. Via examples.
diff --git 
a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec.scala
 
b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec.scala
index cbe7ce2..f8d3f12 100644
--- 
a/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec.scala
+++ 
b/nlpcraft/src/test/scala/org/apache/nlpcraft/model/ctxword/NCContextWordSpec.scala
@@ -106,8 +106,12 @@ class NCContextWordSpecModel extends NCModel {
   */
 @NCTestEnvironment(model = classOf[NCContextWordSpecModel], startClient = true)
 class NCContextWordSpec extends NCTestContext {
-    private def check(txt: String, elemId: String, words: String*): Unit =
-        require(s"$elemId ${words.mkString(" ")}" == 
getClient.ask(txt).getResult.get())
+    private def check(txt: String, elemId: String, words: String*): Unit = {
+        val res = getClient.ask(txt).getResult.get()
+        val exp = s"$elemId ${words.mkString(" ")}"
+
+        require(exp == res, s"Expected: $exp, result: $res")
+    }
 
     @BeforeEach
     private[ctxword] def before(): Unit = testsData.clear()
@@ -117,13 +121,14 @@ class NCContextWordSpec extends NCTestContext {
 
     @Test
     private[ctxword] def test(): Unit = {
-        check("I want to have a dog and fox", "class:animal", "dog", "fox")
-        check("I fed your fish", "class:animal", "fish")
-
-        check("I like to drive my Porsche and Volkswagen", "class:cars", 
"Porsche", "Volkswagen")
-        check("Peugeot added motorcycles to its range in 1901", "class:cars", 
"Peugeot", "motorcycles")
-
-        check("The frost is possible today", "class:weather", "frost")
-        check("There's a very strong wind from the east now", "class:weather", 
"wind")
+        check("I want to have a dogs and foxes", "class:animal", "dogs", 
"foxes")
+        //check("I want to have a dog and fox", "class:animal", "dog", "fox")
+//        check("I fed your fish", "class:animal", "fish")
+//
+//        check("I like to drive my Porsche and Volkswagen", "class:cars", 
"Porsche", "Volkswagen")
+//        check("Peugeot added motorcycles to its range in 1901", 
"class:cars", "Peugeot", "motorcycles")
+//
+//        check("The frost is possible today", "class:weather", "frost")
+//        check("There's a very strong wind from the east now", 
"class:weather", "wind")
     }
 }

[incubator-nlpcraft] 02/02: WIP.

Reply via email to