This is an automated email from the ASF dual-hosted git repository. sergeykamov pushed a commit to branch NLPCRAFT-472 in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-472 by this push: new 81d4b06 WIP. 81d4b06 is described below commit 81d4b06a0e32d61aa87300bfeded7588c30858d6 Author: Sergey Kamov <skhdlem...@gmail.com> AuthorDate: Thu Dec 30 22:57:22 2021 +0300 WIP. --- .../semantic/impl/NCSemanticEntityParserImpl.scala | 47 ++++++++++++++-------- .../impl/NCSemanticSynonymsProcessor.scala | 2 +- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala index 44adac1..1a3dfbe 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala @@ -44,7 +44,7 @@ object NCSemanticEntityParserImpl: require(stemmer != null) require(mdlSrc != null) - new NCSemanticEntityParserImpl(stemmer, res = mdlSrc, typ = NCSemanticSourceType(mdlSrc)) + new NCSemanticEntityParserImpl(stemmer, mdlSrc = mdlSrc, typ = NCSemanticSourceType(mdlSrc)) /** * @param baseTokens Tokens. @@ -119,18 +119,18 @@ class NCSemanticEntityParserImpl( stemmer: NCSemanticTextStemmer, macros: Map[String, String] = null, elements: Seq[NCSemanticElement] = null, - res: String = null, + mdlSrc: String = null, typ: NCSemanticSourceType = null ) extends NCEntityParser with LazyLogging: require(stemmer != null) - require(macros != null && elements != null || res != null && typ != null) + require(macros != null && elements != null || mdlSrc != null && typ != null) @volatile private var h: NCSemanticSynonymsHolder = _ override def start(cfg: NCModelConfig): Unit = val (macros, elements) = - if res != null then - val src = NCSemanticDataReader.read(new BufferedInputStream(NCUtils.getStream(res)), typ) + if mdlSrc != null then + val src = NCSemanticDataReader.read(new BufferedInputStream(NCUtils.getStream(mdlSrc)), typ) (src.macros, src.elements) else (this.macros, this.elements) @@ -142,21 +142,19 @@ class NCSemanticEntityParserImpl( override def parse(req: NCRequest, cfg: NCModelConfig, toksList: JList[NCToken]): JList[NCEntity] = val toks = toksList.asScala.toSeq val cache = mutable.HashSet.empty[Seq[Int]] // Variants (tokens without stopwords) can be repeated. - val ents = mutable.ArrayBuffer.empty[NCEntity] + + case class Holder(elemId: String, tokens: Seq[NCToken]) + + val hs = mutable.ArrayBuffer.empty[Holder] for (piece <- getPieces(toks); variant <- Seq(piece.baseTokens) ++ piece.variants) - def addEntity(elemId: String): Unit = - ents += - new NCPropertyMapAdapter with NCEntity: - override def getTokens: JList[NCToken] = piece.baseTokens.asJava - override def getRequestId: String = req.getRequestId - override def getId: String = elemId + def add(elemId: String): Unit = hs += Holder(elemId, piece.baseTokens) val idxs = variant.map(_.getIndex) if cache.add(idxs) then h.textSynonyms.get(variant.map(_.getStem).mkString(" ")) match - case Some(elemIds) => elemIds.foreach(addEntity) + case Some(elemIds) => elemIds.foreach(add) case None => for ((elemId, syns) <- h.mixedSynonyms.getOrElse(variant.size, Seq.empty)) var found = false @@ -171,6 +169,23 @@ class NCSemanticEntityParserImpl( if chunk.isText then chunk.stem == tok.getStem else match0(tok.getText) || match0(tok.getText.toLowerCase) } - if found then addEntity(elemId) - - ents.toSeq.asJava \ No newline at end of file + if found then add(elemId) + + val hsIdxs = hs.zipWithIndex + + // Drops redundant according to well-known theorem. + hs --= + hs.zipWithIndex.filter { (h1, idx1) => + hsIdxs.exists { (h2, idx2) => + idx2 != idx1 && + h2.tokens.size > h1.tokens.size && + h1.tokens.forall(h2.tokens.contains) + } + }.map { (h, _) => h } + + hs.toSeq.map(h => + new NCPropertyMapAdapter with NCEntity: + override def getTokens: JList[NCToken] = h.tokens.asJava + override def getRequestId: String = req.getRequestId + override def getId: String = h.elemId + ).asJava \ No newline at end of file diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala index 9577451..4a49dae 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala @@ -54,7 +54,7 @@ private[impl] object NCSemanticSynonymsProcessor extends LazyLogging: if macros != null then // TODO: check empty. if macros.contains(null) then throw new NCException("Some macro names are null") - if macros.values.contains(null) then throw new NCException("Some macro bodies are null") + // if macros.values.contains(null) then throw new NCException("Some macro bodies are null") val set = elements.filter(_.getSynonyms != null).flatMap(_.getSynonyms.asScala) ++ macros.values