This is an automated email from the ASF dual-hosted git repository. sergeykamov pushed a commit to branch NLPCRAFT-483-1-1 in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 6f65a01b55b7a8d5d9868dde41c7a13aadb06393 Author: Sergey Kamov <skhdlem...@gmail.com> AuthorDate: Fri Mar 11 10:40:20 2022 +0300 WIP. --- .../examples/lightswitch/LightSwitchRuModel.scala | 9 ++-- .../apache/nlpcraft/NCModelPipelineBuilder.java | 2 + .../parser/impl/NCSemanticEntityParserImpl.scala | 50 +++++++++++++++++++--- 3 files changed, 52 insertions(+), 9 deletions(-) diff --git a/nlpcraft-examples/lightswitch-ru/src/main/java/org/apache/nlpcraft/examples/lightswitch/LightSwitchRuModel.scala b/nlpcraft-examples/lightswitch-ru/src/main/java/org/apache/nlpcraft/examples/lightswitch/LightSwitchRuModel.scala index 656fd44..41c1d8d 100644 --- a/nlpcraft-examples/lightswitch-ru/src/main/java/org/apache/nlpcraft/examples/lightswitch/LightSwitchRuModel.scala +++ b/nlpcraft-examples/lightswitch-ru/src/main/java/org/apache/nlpcraft/examples/lightswitch/LightSwitchRuModel.scala @@ -64,13 +64,14 @@ class LightSwitchRuModel extends NCModelAdapter( "Включи свет в детской", "Включай повсюду освещение", "Включайте лампы в детской комнате", - "Свет на кухне пожалуйста приглуши", - "Нельзя ли повсюду выключить свет", + "Свет на кухне, пожалуйста, приглуши", + "Нельзя ли повсюду выключить свет?", "Пожалуйста без света", - "Отключи электричесвто в ванной", + "Отключи электричество в ванной", "Выключи, пожалуйста, тут всюду свет", "Выключай все!", - "Свет пожалуйсте везде включи" + "Свет пожалуйста везде включи", + "Зажги лампу на кухне" )) def onMatch( @NCIntentTerm("act") actEnt: NCEntity, diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelPipelineBuilder.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelPipelineBuilder.java index 05f0110..a80c5c6 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelPipelineBuilder.java +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/NCModelPipelineBuilder.java @@ -94,6 +94,8 @@ public class NCModelPipelineBuilder { tokEnrichers.add(new NCENBracketsTokenEnricher()); this.entParsers.addAll(entParsers); + + break; default: throw new IllegalArgumentException("Unsupported language: " + lang); } diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/mult/entity/parser/impl/NCSemanticEntityParserImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/mult/entity/parser/impl/NCSemanticEntityParserImpl.scala index 881a9eb..6f01131 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/mult/entity/parser/impl/NCSemanticEntityParserImpl.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/mult/entity/parser/impl/NCSemanticEntityParserImpl.scala @@ -23,12 +23,14 @@ import org.apache.nlpcraft.internal.makro.NCMacroParser import org.apache.nlpcraft.internal.util.NCUtils import org.apache.nlpcraft.nlp.mult.entity.parser.* import org.apache.nlpcraft.nlp.mult.entity.parser.impl.NCSemanticChunkKind.* +import org.apache.nlpcraft.nlp.mult.entity.parser.impl.NCSemanticEntityParserImpl.combine import org.apache.nlpcraft.nlp.mult.entity.parser.impl.NCSemanticSourceType.* import java.io.* import java.util import java.util.regex.* import java.util.{List as JList, Map as JMap, Set as JSet} +import scala.annotation.tailrec import scala.collection.mutable import scala.jdk.CollectionConverters.* @@ -139,6 +141,23 @@ object NCSemanticEntityParserImpl: ) }) + /** + * Multiply 2 data sets. + * Examples: if input is (A, B) and (1, 2) output will be ((A, B), (1, 2), (A, 2), (1, B)) + * + * @param data1 + * @param data2 + * @param i + * @param tmp + * @return + */ + @tailrec private def combine(data1: Seq[String], data2: Seq[String], i: Int = 0, tmp: Set[List[String]] = Set(List.empty)): Set[List[String]] = + require(data1.size == data2.size) + + if data1.isEmpty then Set.empty + else if i >= data1.size then tmp + else combine(data1, data2, i + 1, tmp.map(_ :+ data1(i)) ++ tmp.map(_ :+ data2(i))) + import org.apache.nlpcraft.nlp.mult.entity.parser.impl.NCSemanticEntityParserImpl.* /** @@ -187,8 +206,19 @@ class NCSemanticEntityParserImpl( val toks = toksList.asScala.toSeq val stems = toks.map(p => p -> stemmer.stem(p.getText)).toMap - if toks.exists(_.getOpt[Boolean]("stopword").isEmpty) then - logger.warn("'stopword' property not found. Is stopword token enricher configured?") + val stems4Lemms = + var ok = true + val seq = + for (t <- toks; lemma = t.get[String]("lemma") if ok) + yield + ok = lemma != null + t -> lemma + + if ok then + seq.toMap.map { (tok, lemma) => tok -> stemmer.stem(lemma) } + else + logger.warn("'lemma' property not found. Is proper token enricher configured?") + Map.empty val cache = mutable.HashSet.empty[Seq[Int]] // Variants (tokens without stopwords) can be repeated. case class Holder(elemId: String, tokens: Seq[NCToken], value: Option[String]): @@ -202,17 +232,27 @@ class NCSemanticEntityParserImpl( val idxs = variant.map(_.getIndex) if cache.add(idxs) then - synsHolder.textSynonyms.get(variant.map(t => stems(t)).mkString(" ")) match + // Tries to search by stems. + synsHolder.textSynonyms.get(variant.map(stems).mkString(" ")) match case Some(elems) => elems.foreach(elem => add(elem.elementId, elem.value)) case None => + // Combines stems(origin) and stems(lemma) + var found = false + if stems4Lemms.nonEmpty then + for (comb <- combine(variant.map(stems), variant.map(stems4Lemms)) if !found) + synsHolder.textSynonyms.get(comb.mkString(" ")) match + case Some(elems) => + found = true + elems.foreach(elem => add(elem.elementId, elem.value)) + case None => // No-op. + // With regex. for ((elemId, syns) <- synsHolder.mixedSynonyms.getOrElse(variant.size, Seq.empty)) - var found = false for (s <- syns if !found) found = s.chunks.zip(variant). sortBy { (chunk, _) => if chunk.isText then 0 else 1 }. forall { (chunk, tok) => if chunk.isText then - chunk.stem == stems(tok) + chunk.stem == stems(tok) || (stems4Lemms.nonEmpty && chunk.stem == stems4Lemms(tok)) else def match0(txt: String) = chunk.regex.matcher(txt).matches() match0(tok.getText) || match0(tok.getText.toLowerCase)