This is an automated email from the ASF dual-hosted git repository. sergeykamov pushed a commit to branch NLPCRAFT-483-1-1 in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-483-1-1 by this push: new 649904e WIP. 649904e is described below commit 649904ed3210525eb09e6f5219d2c56923357041 Author: Sergey Kamov <skhdlem...@gmail.com> AuthorDate: Fri Mar 11 14:18:08 2022 +0300 WIP. --- .../parser/impl/NCSemanticEntityParserImpl.scala | 31 ++++++++++++---------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/mult/entity/parser/impl/NCSemanticEntityParserImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/mult/entity/parser/impl/NCSemanticEntityParserImpl.scala index 9fdad68..7de4cde 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/mult/entity/parser/impl/NCSemanticEntityParserImpl.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/mult/entity/parser/impl/NCSemanticEntityParserImpl.scala @@ -228,9 +228,11 @@ class NCSemanticEntityParserImpl( Map.empty val cache = mutable.HashSet.empty[Seq[Int]] // Variants (tokens without stopwords) can be repeated. + case class Holder(elemId: String, tokens: Seq[NCToken], value: Option[String]): private val idxs = tokens.map(_.getIndex).toSet def isSuperSet(toks: Seq[NCToken]): Boolean = idxs.size > toks.size && toks.map(_.getIndex).toSet.subsetOf(idxs) + val hs = mutable.ArrayBuffer.empty[Holder] for (piece <- getPieces(toks) if !hs.exists(_.isSuperSet(piece.baseTokens)); @@ -253,20 +255,21 @@ class NCSemanticEntityParserImpl( elems.foreach(elem => add(elem.elementId, elem.value)) case None => // No-op. // With regex. - if !found then - for ((elemId, syns) <- synsHolder.mixedSynonyms.getOrElse(variant.size, Seq.empty)) - for (s <- syns if !found) - found = s.chunks.zip(variant). - sortBy { (chunk, _) => if chunk.isText then 0 else 1 }. - forall { (chunk, tok) => - if chunk.isText then - chunk.stem == stems(tok) || (stems4Lemms.nonEmpty && chunk.stem == stems4Lemms(tok)) - else - def match0(txt: String) = chunk.regex.matcher(txt).matches() - match0(tok.getText) || match0(tok.getText.toLowerCase) - } - - if found then add(elemId, Option.when(s.value != null)(s.value)) + for ((elemId, syns) <- synsHolder.mixedSynonyms.getOrElse(variant.size, Seq.empty)) + found = false + + for (s <- syns if !found) + found = s.chunks.zip(variant). + sortBy { (chunk, _) => if chunk.isText then 0 else 1 }. + forall { (chunk, tok) => + if chunk.isText then + chunk.stem == stems(tok) || (stems4Lemms.nonEmpty && chunk.stem == stems4Lemms(tok)) + else + def match0(txt: String) = chunk.regex.matcher(txt).matches() + match0(tok.getText) || match0(tok.getText.toLowerCase) + } + + if found then add(elemId, Option.when(s.value != null)(s.value)) hs.toSeq.map(h => { val e = elemsMap(h.elemId)