This is an automated email from the ASF dual-hosted git repository.
sergeykamov pushed a commit to branch NLPCRAFT-70_NEW
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-70_NEW by this push:
new bffad45 WIP.
bffad45 is described below
commit bffad45c5e021190469bfced0c7cc05a803c74ba
Author: Sergey Kamov <[email protected]>
AuthorDate: Wed Jul 7 13:50:34 2021 +0300
WIP.
---
.../probe/mgrs/conn/NCConnectionManager.scala | 15 +++----
.../probe/mgrs/deploy/NCDeployManager.scala | 49 +++++++++++++---------
.../nlpcraft/server/mdo/NCProbeModelMdo.scala | 2 +-
.../ctxword/NCContextWordCategoriesEnricher.scala | 38 ++++++++---------
.../nlpcraft/server/probe/NCProbeManager.scala | 8 ++--
5 files changed, 62 insertions(+), 50 deletions(-)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala
index 3d6a72f..3278078 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/conn/NCConnectionManager.scala
@@ -35,7 +35,7 @@ import java.{lang, util}
import java.util.concurrent.CountDownLatch
import java.util.{Collections, Properties, TimeZone}
import scala.collection.mutable
-import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsJava,
SetHasAsJava, SetHasAsScala}
+import scala.jdk.CollectionConverters.{ListHasAsScala, MapHasAsJava,
SeqHasAsJava, SetHasAsJava, SetHasAsScala}
/**
* Probe down/up link connection manager.
@@ -216,7 +216,7 @@ object NCConnectionManager extends NCService {
val mdl = wrapper.model
val (
- values,
+ singleValues,
corpus,
categoriesElements
): (
@@ -234,8 +234,6 @@ object NCConnectionManager extends NCService {
if (ctxCatElems.isEmpty)
(Collections.emptyMap(),
Collections.emptySet(), Collections.emptyMap())
else {
- var corpus =
wrapper.samples.flatMap(_._2.flatMap(p => p))
-
val values =
mdl.getElements.
asScala.
@@ -243,7 +241,10 @@ object NCConnectionManager extends NCService {
map(e =>
e.getId ->
e.getValues.asScala.map(p =>
p.getName -> {
- val set: util.Set[String] =
new util.HashSet(p.getSynonyms)
+ val set: util.Set[String] =
+ new util.HashSet(
+
p.getSynonyms.asScala.filter(p => !p.contains(" ")).asJava
+ )
set
}).toMap.asJava
@@ -251,7 +252,7 @@ object NCConnectionManager extends NCService {
(
values.asJava,
- corpus.asJava,
+ wrapper.samples.flatMap(_._2.flatMap(p
=> p)).asJava,
ctxCatElems.asJava
)
}
@@ -266,7 +267,7 @@ object NCConnectionManager extends NCService {
mdl.getName,
mdl.getVersion,
new
util.HashSet[String](mdl.getEnabledBuiltInTokens),
- values,
+ singleValues,
corpus,
categoriesElements
)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala
index 741ac11..5eae7a5 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/deploy/NCDeployManager.scala
@@ -422,6 +422,7 @@ object NCDeployManager extends NCService {
// Validates context words parameters.
val elems = mdl.getElements.asScala
+
val ctxCatElems = elems.flatMap(e =>
e.getCategoryConfidence.asScala match {
case Some(v) => Some(e.getId -> v)
@@ -430,22 +431,20 @@ object NCDeployManager extends NCService {
).toMap
if (ctxCatElems.nonEmpty) {
- val cnt =
mdl.getElements.asScala.map(_.getValues.asScala.map(_.getSynonyms.size()).sum).sum
-
- if (cnt > MAX_CTXWORD_VALS_CNT)
- // TODO: do we need print recommended value.?
- logger.warn(
- s"Too many values synonyms detected for context words
elements [" +
- s"mdlId=$mdlId, " +
- s"cnt=$cnt," +
- s"recommended=$MAX_CTXWORD_VALS_CNT" +
- s"]"
- )
+ val singleValsElems: Map[String, Int] =
+ elems.flatMap(e => {
+ val cnt =
+ if (e.getValues != null)
+ e.getValues.asScala.map(
+ p => if (p.getSynonyms != null)
p.getSynonyms.asScala.count(!_.contains(" ")) else 0
+ ).sum
+ else
+ 0
+ if (cnt != 0) Some(e.getId -> cnt) else None
+ }).toMap
- val valsElems = elems.filter(p => p.getValues != null &&
!p.getValues.isEmpty).
- map(p => p.getId -> p.getValues.size()).toMap
- var ids = ctxCatElems.filter { case (elemId, _) =>
!valsElems.keySet.contains(elemId) }.keys
+ var ids = ctxCatElems.filter { case (elemId, _) =>
!singleValsElems.keySet.contains(elemId) }.keys
if (ids.nonEmpty)
// TODO:
@@ -456,6 +455,18 @@ object NCDeployManager extends NCService {
if (ids.nonEmpty)
// TODO:
throw new NCE(s"Context word confidences are out of range
(0..1) for elements : ${ids.mkString(", ")}")
+
+ val cnt = singleValsElems.values.sum
+
+ if (cnt > MAX_CTXWORD_VALS_CNT)
+ // TODO: do we need print recommended value.?
+ logger.warn(
+ s"Too many values synonyms detected for context words
elements [" +
+ s"mdlId=$mdlId, " +
+ s"cnt=$cnt," +
+ s"recommendedMax=$MAX_CTXWORD_VALS_CNT" +
+ s"]"
+ )
}
// Discard value loaders.
@@ -544,11 +555,6 @@ object NCDeployManager extends NCService {
else
logger.warn(s"Model has no intent: $mdlId")
- def toMap(set: Set[SynonymHolder]): Map[String, Seq[NCProbeSynonym]] =
- set.groupBy(_.elmId).map(p => p._1 ->
p._2.map(_.syn).toSeq.sorted.reverse)
-
- val simple = idl(syns.toSet, idl = false)
-
val samples = scanSamples(mdl)
if (ctxCatElems.nonEmpty && samples.size > MAX_CTXWORD_SAMPLES_CNT)
@@ -561,6 +567,11 @@ object NCDeployManager extends NCService {
s"]"
)
+ val simple = idl(syns.toSet, idl = false)
+
+ def toMap(set: Set[SynonymHolder]): Map[String, Seq[NCProbeSynonym]] =
+ set.groupBy(_.elmId).map(p => p._1 ->
p._2.map(_.syn).toSeq.sorted.reverse)
+
NCProbeModel(
model = mdl,
solver = solver,
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/mdo/NCProbeModelMdo.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/mdo/NCProbeModelMdo.scala
index cb85a83..8a538ab 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/mdo/NCProbeModelMdo.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/mdo/NCProbeModelMdo.scala
@@ -23,7 +23,7 @@ import org.apache.nlpcraft.server.mdo.impl._
case class NCCtxWordCategoriesConfigMdo(
@NCMdoField probeId: String,
@NCMdoField modelId: String,
- @NCMdoField values: Map[String /*Element ID*/, Map[/*Value*/String,
/*Synonym*/Set[String]]],
+ @NCMdoField singleValues: Map[String /*Element ID*/, Map[/*Value*/String,
/*Synonym*/Set[String]]],
@NCMdoField corpus: Set[String],
@NCMdoField elements: Map[String /*Element ID*/, /*Confidence*/ Double]
)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordCategoriesEnricher.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordCategoriesEnricher.scala
index e19c30f..d98f63b 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordCategoriesEnricher.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/ctxword/NCContextWordCategoriesEnricher.scala
@@ -153,7 +153,7 @@ object NCContextWordCategoriesEnricher extends
NCServerEnricher {
* @param s
* @return
*/
- private def normCase(s: String): String = s.toLowerCase
+ private def norm(s: String): String = s.toLowerCase
/**
*
@@ -170,8 +170,8 @@ object NCContextWordCategoriesEnricher extends
NCServerEnricher {
* @param corpusWordsStems
* @param corpusWordsNorm
* @param elemValsSyns
- * @param elemValuesSynsStems
- * @param elemValuesSynsNorm
+ * @param elemValsSynsStems
+ * @param elemValsSynsNorm
* @return
*/
private def mkRequests(
@@ -180,14 +180,14 @@ object NCContextWordCategoriesEnricher extends
NCServerEnricher {
corpusWordsStems: Seq[Seq[String]],
corpusWordsNorm: Seq[Seq[String]],
elemValsSyns: Set[String],
- elemValuesSynsStems: Set[String],
- elemValuesSynsNorm: Set[String]
+ elemValsSynsStems: Set[String],
+ elemValsSynsNorm: Set[String]
): Iterable[NCSuggestionRequest] = {
require(nlpWords.size == corpusWords.size)
require(corpusWords.size == corpusWordsStems.size)
require(corpusWords.size == corpusWordsNorm.size)
- require(elemValsSyns.size == elemValuesSynsStems.size)
- require(elemValsSyns.size == elemValuesSynsNorm.size)
+ require(elemValsSyns.size == elemValsSynsStems.size)
+ require(elemValsSyns.size == elemValsSynsNorm.size)
corpusWordsStems.
zip(corpusWords).
@@ -203,7 +203,7 @@ object NCContextWordCategoriesEnricher extends
NCServerEnricher {
})
val idxs =
- getIndexes(elemValuesSynsStems, corpusWordsStem) ++
getIndexes(elemValuesSynsNorm, corpusWordsNorm)
+ getIndexes(elemValsSynsStems, corpusWordsStem) ++
getIndexes(elemValsSynsNorm, corpusWordsNorm)
def mkRequest(idx: Int, syn: String): NCSuggestionRequest
= {
var newSen = substitute(corpusWords, syn, idx)
@@ -274,13 +274,13 @@ object NCContextWordCategoriesEnricher extends
NCServerEnricher {
case Some(cache) => cache
case None =>
def mkMap(convert: String => String): Map[String, Set[String]]
=
- cfg.values.
+ cfg.singleValues.
flatMap { case (elemId, vals) => vals.map { case (_,
vals) => vals.map(convert(_) -> elemId) } }.
flatten.
groupBy { case (converted, _) => converted }.
map { case (converted, map) => converted -> map.map {
case (_, elemId) => elemId }.toSet }
- val normsMap = mkMap(normCase)
+ val normsMap = mkMap(norm)
val stemsMap = mkMap(stem)
val h = ValuesHolder(normal = normsMap, stems =
stemsMap.filter(p => !normsMap.keySet.contains(p._1)))
@@ -327,22 +327,22 @@ object NCContextWordCategoriesEnricher extends
NCServerEnricher {
val corpusWords = nlpWords.map(_.map(_.word))
val corpusWordsStems = corpusWords.map(_.map(stem))
- val corpusWordsNorm = corpusWords.map(_.map(normCase))
+ val corpusWordsNorm = corpusWords.map(_.map(norm))
val recs: Map[String, Seq[NCSuggestionRequest]] =
(
for (
- (elemId, elemValues) <- cfg.values.toSeq;
+ (elemId, elemSingleVals) <- cfg.singleValues.toSeq;
// Uses single words synonyms only.
- elemValuesSyns =
elemValues.flatMap(_._2).toSet.filter(!_.contains(' '));
+ elemSingleValsSet = elemSingleVals.flatMap(_._2).toSet;
suggReq <- mkRequests(
nlpWords = nlpWords,
corpusWords = corpusWords,
corpusWordsStems = corpusWordsStems,
corpusWordsNorm = corpusWordsNorm,
- elemValsSyns = elemValuesSyns,
- elemValuesSynsStems = elemValuesSyns.map(stem),
- elemValuesSynsNorm = elemValuesSyns.map(normCase)
+ elemValsSyns = elemSingleValsSet,
+ elemValsSynsStems = elemSingleValsSet.map(stem),
+ elemValsSynsNorm = elemSingleValsSet.map(norm)
)
) yield (elemId, suggReq)
).
@@ -394,7 +394,7 @@ object NCContextWordCategoriesEnricher extends
NCServerEnricher {
}
}
- val normals = mkMap { (_, sugg) => normCase(sugg.word) }
+ val normals = mkMap { (_, sugg) => norm(sugg.word) }
val stems = mkMap { (_, sugg) => stem(sugg.word) }
val lemmas = mkMap { (req, sugg) => getLemma(req, sugg) }
@@ -510,7 +510,7 @@ object NCContextWordCategoriesEnricher extends
NCServerEnricher {
for (
n <- nouns;
- elemId <- get(vNorms, n.normText) ++ get(vNorms,
normCase(n.lemma)) ++ get(vStems, n.stem)
+ elemId <- get(vNorms, n.normText) ++ get(vNorms,
norm(n.lemma)) ++ get(vStems, n.stem)
)
add(n, elemId, Confidence(INCL_MAX_CONFIDENCE))
@@ -566,7 +566,7 @@ object NCContextWordCategoriesEnricher extends
NCServerEnricher {
suggConf = normalizeConf(sugg.score);
(elemId, elemData) <- corpusData;
elemConf = cfg.elements(elemId);
- corpConfOpt = elemData.get(normCase(sugg.word),
stem(sugg.word), getLemma(req, sugg))
+ corpConfOpt = elemData.get(norm(sugg.word),
stem(sugg.word), getLemma(req, sugg))
if corpConfOpt.isDefined;
corpConf = corpConfOpt.get;
normConf = ConfMath.calculate(suggConf, corpConf)
diff --git
a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/probe/NCProbeManager.scala
b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/probe/NCProbeManager.scala
index a6cbd57..5a0f203 100644
---
a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/probe/NCProbeManager.scala
+++
b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/probe/NCProbeManager.scala
@@ -628,7 +628,7 @@ object NCProbeManager extends NCService {
mdlName,
mdlVer,
enabledBuiltInToks,
- values,
+ singleValues,
corpus,
categoriesElements
) =>
@@ -636,7 +636,7 @@ object NCProbeManager extends NCService {
require(mdlName != null)
require(mdlVer != null)
require(enabledBuiltInToks != null)
- require(values.isEmpty && corpus.isEmpty ||
!values.isEmpty && !corpus.isEmpty)
+ require(singleValues.isEmpty && corpus.isEmpty
|| !singleValues.isEmpty && !corpus.isEmpty)
NCProbeModelMdo(
id = mdlId,
@@ -644,12 +644,12 @@ object NCProbeManager extends NCService {
version = mdlVer,
enabledBuiltInTokens =
enabledBuiltInToks.asScala.toSet,
ctxWordConfig =
- if (!values.isEmpty) {
+ if (!singleValues.isEmpty) {
Some(
NCCtxWordCategoriesConfigMdo(
probeId = probeId,
modelId = mdlId,
- values =
values.asScala.map {
+ singleValues =
singleValues.asScala.map {
case (elemId, map) =>
elemId ->
map.asScala.map {