This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-472
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git

The following commit(s) were added to refs/heads/NLPCRAFT-472 by this push:
     new b9229a7  WIP.
b9229a7 is described below

commit b9229a74384b436643e59c24e98543a9f5bed378
Author: Sergey Kamov <skhdlem...@gmail.com>
AuthorDate: Mon Jan 10 18:05:10 2022 +0300

    WIP.
---
 .../opennlp/impl/NCOpenNlpEntityParserImpl.scala   |   2 +-
 .../semantic/impl/NCSemanticEntityParserImpl.scala |  30 ++---
 .../parser/semantic/impl/NCSemanticSynonym.scala   |   3 +-
 .../impl/NCSemanticSynonymsProcessor.scala         | 122 ++++++++++++++-------
 .../semantic/NCSemanticEntityParserSpec.scala      |  35 +++++-
 .../org/apache/nlpcraft/nlp/util/NCTestUtils.scala |   6 +-
 6 files changed, 136 insertions(+), 62 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala
index 41abdcb..cd5228a 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/opennlp/impl/NCOpenNlpEntityParserImpl.scala
@@ -54,7 +54,7 @@ class NCOpenNlpEntityParserImpl(resources: JList[String]) extends NCEntityParser
                     logger.trace(s"Loaded resource: $res")
 
-                    finders += f
+                    finders.synchronized { finders += f }
             })*)(ExecutionContext.Implicits.global)
 
         this.finders = finders.toSeq
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
index 8e6a9a2..d95942c 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticEntityParserImpl.scala
@@ -156,7 +156,7 @@ class NCSemanticEntityParserImpl(
     require(stemmer != null && parser != null)
     require(macros != null && elements != null || mdlSrc != null && scrType != null)
 
-    private var h: NCSemanticSynonymsHolder = _
+    private var synsHolder: NCSemanticSynonymsHolder = _
     private var elemsMap: Map[String, NCSemanticElement] = _
 
     start()
@@ -174,7 +174,7 @@ class NCSemanticEntityParserImpl(
            else
                (this.macros, this.elements, toMap(this.elements))
 
-        this.h = NCSemanticSynonymsProcessor.prepare(stemmer, parser, macros, elements)
+        this.synsHolder = NCSemanticSynonymsProcessor.prepare(stemmer, parser, macros, elements)
         this.elemsMap = elemsMap
 
     override def parse(req: NCRequest, cfg: NCModelConfig, toksList: JList[NCToken]): JList[NCEntity] =
@@ -187,25 +187,24 @@ class NCSemanticEntityParserImpl(
         val cache = mutable.HashSet.empty[Seq[Int]] // Variants (tokens without stopwords) can be repeated.
 
         case class Holder(elemId: String, tokens: Seq[NCToken], value: Option[String]):
-            private val idxs = toks.map(_.getIndex).toSet
-            // Skips redundant according to well-known theorem.
-            def more(toks: Seq[NCToken]): Boolean = idxs.size > toks.size && toks.map(_.getIndex).toSet.subsetOf(idxs)
+            private val idxs = tokens.map(_.getIndex).toSet
+            def isSuperSet(toks: Seq[NCToken]): Boolean = idxs.size > toks.size && toks.map(_.getIndex).toSet.subsetOf(idxs)
 
         val hs = mutable.ArrayBuffer.empty[Holder]
 
         for (
-            piece <- getPieces(toks);
-            variant <- Seq(piece.baseTokens) ++ piece.variants if !hs.exists(_.more(piece.baseTokens))
+            piece <- getPieces(toks) if !hs.exists(_.isSuperSet(piece.baseTokens));
+            variant <- Seq(piece.baseTokens) ++ piece.variants
         )
-            def add(elemId: String, value: Option[String]): Unit = hs += Holder(elemId, piece.baseTokens, value)
+            def add(elemId: String, value: Option[String]): Unit = hs += Holder(elemId, variant, value)
 
            val idxs = variant.map(_.getIndex)
 
            if cache.add(idxs) then
-                h.textSynonyms.get(variant.map(p => stems(p)).mkString(" ")) match
-                    case Some(keys) => keys.foreach(key => add(key.elementId, key.value))
+                synsHolder.textSynonyms.get(variant.map(t => stems(t)).mkString(" ")) match
+                    case Some(elems) => elems.foreach(elem => add(elem.elementId, elem.value))
                     case None =>
-                        for ((elemId, syns) <- h.mixedSynonyms.getOrElse(variant.size, Seq.empty))
+                        for ((elemId, syns) <- synsHolder.mixedSynonyms.getOrElse(variant.size, Seq.empty))
                             var found = false
 
                             for (s <- syns if !found)
@@ -213,10 +212,11 @@ class NCSemanticEntityParserImpl(
                                     s.chunks.zip(variant).
                                         sortBy { (chunk, _) => if chunk.isText then 0 else 1 }.
                                         forall { (chunk, tok) =>
-                                            def match0(txt: String) = chunk.regex.matcher(txt).matches()
-
-                                            if chunk.isText then chunk.stem == stems(tok)
-                                            else match0(tok.getText) || match0(tok.getText.toLowerCase)
+                                            if chunk.isText then
+                                                chunk.stem == stems(tok)
+                                            else
+                                                def match0(txt: String) = chunk.regex.matcher(txt).matches()
+                                                match0(tok.getText) || match0(tok.getText.toLowerCase)
                                         }
 
                                 if found then add(elemId, Option.when(s.value != null)(s.value))
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonym.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonym.scala
index 1a804fd..923a89a 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonym.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonym.scala
@@ -38,8 +38,9 @@ private[impl] case class NCSemanticSynonymChunk(
     kind: NCSemanticChunkKind, text: String, stem: String = null, regex: Pattern = null
 ) {
     require(text != null && kind != null)
+    require(stem != null ^ regex != null)
 
-    val isText: Boolean = text != null
+    val isText: Boolean = stem != null
 
     override def toString = s"($text|$kind)"
 }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
index 3305fd7..cb8ecbb 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/impl/NCSemanticSynonymsProcessor.scala
@@ -22,6 +22,7 @@ import com.fasterxml.jackson.module.scala.DefaultScalaModule
 import com.typesafe.scalalogging.LazyLogging
 import org.apache.nlpcraft.*
 import org.apache.nlpcraft.internal.makro.NCMacroParser
+import org.apache.nlpcraft.internal.util.NCUtils
 import org.apache.nlpcraft.nlp.entity.parser.semantic.*
 import org.apache.nlpcraft.nlp.entity.parser.semantic.impl.NCSemanticChunkKind.*
 
@@ -88,14 +89,8 @@ private[impl] object NCSemanticSynonymsProcessor extends LazyLogging:
         for ((name, value) <- macros if isSuspicious(name) || (isSuspicious(value) && !value.contains("//")))
             logger.warn(s"Suspicious macro definition (use of ${SUSP_SYNS_CHARS.map(s => s"'$s'").mkString(", ")} chars) [" +
                 s"macro=$name" +
-                s"]")
-
-    /**
-      *
-      * @param s
-      * @return
-      */
-    private def hasWhitespace(s: String): Boolean = s.exists(_.isWhitespace)
+                s"]"
+            )
 
     /**
       *
@@ -140,7 +135,7 @@ private[impl] object NCSemanticSynonymsProcessor extends LazyLogging:
                 throw new NCException(s"Some element IDs are not provided or empty.") // TODO: error text.
             else if !elemId.matches(ID_REGEX) then
                 throw new NCException(s"Element ID does not match regex [element=$elemId, regex=$ID_REGEX]") // TODO: error text.
-            else if hasWhitespace(elemId) then
+            else if elemId.exists(_.isWhitespace) then
                 throw new NCException(s"Element ID cannot have whitespaces [element=$elemId]") // TODO: error text.
 
             checkSynonyms(e.getSynonyms, elemId)
@@ -154,51 +149,105 @@ private[impl] object NCSemanticSynonymsProcessor extends LazyLogging:
                 for ((name, syns) <- vals.asScala)
                     checkSynonyms(syns, elemId, Some(name))
 
-    private def startsAndEnds(fix: String, s: String): Boolean = s.startsWith(fix) && s.endsWith(fix)
-
-    private def mkChunk(stemmer: NCSemanticStemmer, chunk: String): NCSemanticSynonymChunk =
-        def stripSuffix(fix: String, s: String): String = s.slice(fix.length, s.length - fix.length)
-
-        // Regex synonym.
-        if startsAndEnds(REGEX_FIX, chunk) then
-            val ptrn = stripSuffix(REGEX_FIX, chunk)
-
-            if ptrn.nonEmpty then
-                try
-                    NCSemanticSynonymChunk(kind = REGEX, text = chunk, regex = Pattern.compile(ptrn))
-                catch
-                    case e: PatternSyntaxException =>
-                        throw new NCException(s"Invalid regex synonym syntax detected [chunk=$chunk]", e)
+    /**
+      *
+      * @param stemmer
+      * @param tokParser
+      * @param macroParser
+      * @param elemId
+      * @param syns
+      * @return
+      */
+    private def convertSynonyms(
+        stemmer: NCSemanticStemmer,
+        tokParser: NCTokenParser,
+        macroParser: NCMacroParser,
+        elemId: String,
+        syns: JSet[String]
+    ): Seq[Seq[NCSemanticSynonymChunk]] =
+        case class RegexHolder(text: String, var used: Boolean = false):
+            private def stripSuffix(fix: String, s: String): String = s.slice(fix.length, s.length - fix.length)
+
+            def mkChunk(): NCSemanticSynonymChunk =
+                val ptrn = stripSuffix(REGEX_FIX, text)
+
+                if ptrn.nonEmpty then
+                    try
+                        NCSemanticSynonymChunk(REGEX, text, regex = Pattern.compile(ptrn))
+                    catch
+                        case e: PatternSyntaxException =>
+                            // TODO: error text.
+                            throw new NCException(s"Invalid regex synonym syntax detected [element=$elemId, chunk=$text]", e)
+                else
+                    throw new NCException(s"Empty regex synonym detected [element=$elemId]") // TODO: error text.
+
+        val regexes = mutable.HashMap.empty[Int, RegexHolder]
+
+        def findRegex(t: NCToken): Option[RegexHolder] =
+            if regexes.nonEmpty then
+                (t.getStartCharIndex to t.getEndCharIndex).flatMap(regexes.get).to(LazyList).headOption
             else
-                throw new NCException(s"Empty regex synonym detected [chunk=$chunk]") // TODO: error text.
-        else
-            NCSemanticSynonymChunk(kind = TEXT, text = chunk, stem = stemmer.stem(chunk))
+                None
+
+        syns.asScala.flatMap(macroParser.expand).
+            map(syn => {
+                // Drops redundant spaces without any warnings.
+                val normSyn = syn.split(" ").map(_.strip).filter(_.nonEmpty)
+
+                var start = 0
+                var end = -1
+                regexes.clear()
+
+                // Saves regex chunks positions. Regex chunks can be found without tokenizer, just split by spaces.
+                for (ch <- normSyn)
+                    start = end + 1
+                    end = start + ch.length
+
+                    if ch.startsWith(REGEX_FIX) && ch.endsWith(REGEX_FIX) then
+                        val r = RegexHolder(ch)
+
+                        (start to end).foreach(regexes += _ -> r)
+
+                // Tokenizes synonym without regex chunks. Regex chunks are used as is, without tokenization.
+                tokParser.tokenize(normSyn.mkString(" ")).asScala.flatMap(tok =>
+                    findRegex(tok) match
+                        case Some(regex) =>
+                            if regex.used then
+                                None
+                            else
+                                regex.used = true
+                                Some(regex.mkChunk())
+                        case None => Some(NCSemanticSynonymChunk(TEXT, tok.getText, stemmer.stem(tok.getText)))
+                ).toSeq
+            }).toSeq
 
     /**
       *
       * @param stemmer
-      * @param parser
+      * @param tokParser
       * @param macros
       * @param elements
      * @return
      */
     def prepare(
         stemmer: NCSemanticStemmer,
-        parser: NCTokenParser,
+        tokParser: NCTokenParser,
         macros: Map[String, String],
         elements: Seq[NCSemanticElement]
     ): NCSemanticSynonymsHolder =
-        require(stemmer != null && parser != null)
+        require(stemmer != null && tokParser != null)
 
         // Order is important.
         checkElements(elements)
         checkMacros(macros, elements)
 
-        val p = new NCMacroParser
+        val macroParser = new NCMacroParser
 
         if macros != null then
-            for ((name, body) <- macros) p.addMacro(name, body)
+            for ((name, body) <- macros) macroParser.addMacro(name, body)
 
         case class Holder(synonym: NCSemanticSynonym, elementId: String) {
-            lazy val root: String = synonym.chunks.map(p => if p.isText then p.stem else p.regex.pattern()).mkString(" ")
+            lazy val root: String = synonym.chunks.map(p => if p.isText then p.stem else p.text).mkString(" ")
         }
 
         val buf = mutable.ArrayBuffer.empty[Holder]
@@ -210,21 +259,20 @@ private[impl] object NCSemanticSynonymsProcessor extends LazyLogging:
             def addSpec(txt: String, value: String = null): Unit =
                 buf += Holder(NCSemanticSynonym(Seq(NCSemanticSynonymChunk(TEXT, txt, stemmer.stem(txt))), value), elemId)
 
-            def convert(syns: JSet[String]): Seq[Seq[NCSemanticSynonymChunk]] =
-                syns.asScala.flatMap(p.expand).
-                    map(t => parser.tokenize(t).asScala.map(w => mkChunk(stemmer, w.getText)).toSeq).toSeq
-
             addSpec(elemId)
 
             if e.getSynonyms != null then
-                add(convert(e.getSynonyms).map(NCSemanticSynonym(_)))
+                add(convertSynonyms(stemmer, tokParser, macroParser, elemId, e.getSynonyms).map(NCSemanticSynonym(_)))
 
             if e.getValues != null then
                 for ((name, syns) <- e.getValues.asScala)
                     addSpec(name, value = name)
 
                     if syns != null then
-                        add(convert(syns).map(chunks => NCSemanticSynonym(chunks, value = name)))
+                        add(
+                            convertSynonyms(stemmer, tokParser, macroParser, elemId, syns).
+                                map(chunks => NCSemanticSynonym(chunks, value = name))
+                        )
 
         buf.groupBy(_.root).values.foreach(hs => {
             val elemIds = hs.map(_.elementId).toSet
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala
index 2340543..2c749ec 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/entity/parser/semantic/NCSemanticEntityParserSpec.scala
@@ -72,7 +72,10 @@ class NCSemanticEntityParserSpec:
             // Value. Multiple words.
             Element("t5", values = Map("value5" -> Set("value 5"))),
             // Elements data.
-            Element("t6", props = Map("testKey" -> "testValue"))
+            Element("t6", props = Map("testKey" -> "testValue")),
+            // Regex.
+            Element("t7", synonyms = Set("x //[a-d]+//"))
         ).asJava
     )
@@ -93,21 +96,38 @@ class NCSemanticEntityParserSpec:
 
         stopWordsEnricher.enrich(req, NCTestConfig.EN, toks)
 
+        NCTestUtils.printTokens(toks.asScala.toSeq)
+
         val ents = parser.parse(req, NCTestConfig.EN, toks).asScala.toSeq
 
         NCTestUtils.printEntities(txt, ents)
         require(ents.size == 1)
 
-        val tok = ents.head
-        require(tok.getId == id)
+        val e = ents.head
+        require(e.getId == id)
 
         value match
-            case Some(v) => require(tok.get[Any](s"$id:value") == v)
+            case Some(v) => require(e.get[Any](s"$id:value") == v)
             case None => // No-op.
 
         elemData match
-            case Some(m) => m.foreach { (k, v) => require(tok.get[Any](s"$id:$k") == v) }
+            case Some(m) => m.foreach { (k, v) => require(e.get[Any](s"$id:$k") == v) }
             case None => // No-op.
 
+    private def checkMultiple(txt: String, ids: String*): Unit =
+        val req = NCTestRequest(txt)
+        val toks = NCTestConfig.EN.getTokenParser.tokenize(txt)
+
+        stopWordsEnricher.enrich(req, NCTestConfig.EN, toks)
+
+        NCTestUtils.printTokens(toks.asScala.toSeq)
+
+        val ents = parser.parse(req, NCTestConfig.EN, toks).asScala.toSeq
+
+        NCTestUtils.printEntities(txt, ents)
+        require(ents.size == ids.size)
+
+        ents.map(_.getId).sorted.zip(ids.sorted).foreach { case (eId, id) => require(eId == id) }
+
     @Test
     def test(): Unit =
         check("t1", "t1")
@@ -118,4 +138,7 @@ class NCSemanticEntityParserSpec:
         check("t3 the t3", "t3") // With stopword inside.
         check("value4", "t4", value = Some("value4"))
         check("value the 5", "t5", value = Some("value5")) // With stopword inside.
-        check("t6", "t6", elemData = Some(Map("testKey" -> "testValue")))
\ No newline at end of file
+        check("t6", "t6", elemData = Some(Map("testKey" -> "testValue")))
+        check("the x abc x abe", "t7") // `x abc` should be matched, `x abe` shouldn't.
+
+        checkMultiple("t1 the x abc the x the abc", "t1", "t7", "t7")
\ No newline at end of file
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
index 4eaa5c1..0641415 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/util/NCTestUtils.scala
@@ -65,12 +65,14 @@ object NCTestUtils:
       * @param ents
      */
     def printEntities(req: String, ents: Seq[NCEntity]): Unit =
-        val tbl = NCAsciiTable("EntityId", "Tokens", "Properties")
+        val tbl = NCAsciiTable("EntityId", "Tokens", "Position", "Properties")
 
         for (e <- ents)
+            val toks = e.getTokens.asScala
             tbl += (
                 e.getId,
-                e.getTokens.asScala.map(_.getText).mkString("|"),
+                toks.map(_.getText).mkString("|"),
+                toks.map(p => s"${p.getStartCharIndex}-${p.getEndCharIndex}").mkString("|"),
                 mkProps(e)
             )