This is an automated email from the ASF dual-hosted git repository. sergeykamov pushed a commit to branch NLPCRAFT-520 in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/NLPCRAFT-520 by this push: new 2e1a72fd WIP. 2e1a72fd is described below commit 2e1a72fdde3e45d51726f50cdb74807a72cbe7bf Author: Sergey Kamov <skhem...@gmail.com> AuthorDate: Mon Dec 19 15:39:40 2022 +0400 WIP. --- .../nlp/enrichers/NCBracketsTokenEnricher.scala | 54 ++++++++++++---------- .../nlp/enrichers/NCQuotesTokenEnricher.scala | 4 +- .../enrichers/NCBracketsTokenEnricherSpec.scala | 18 ++++++-- 3 files changed, 44 insertions(+), 32 deletions(-) diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala index c0e692a3..b4d8f563 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricher.scala @@ -21,16 +21,26 @@ import com.typesafe.scalalogging.LazyLogging import org.apache.nlpcraft.* import java.io.* -import scala.collection.mutable +import scala.collection.{Map, mutable} + +/** + * Companion helper. + */ +object NCBracketsTokenEnricher: + private val BRACKETS = Map("(" -> ")", "{" -> "}", "[" -> "]", "<" -> ">") + private val BRACKETS_REVERSED = BRACKETS.map { case (key, value) => value -> key } + +import NCBracketsTokenEnricher.* /** * Brackets [[NCTokenEnricher token enricher]]. * * This enricher adds `brackets` boolean [[NCPropertyMap metadata]] property to the [[NCToken token]] - * instance if the word it represents is enclosed in brackets. Supported brackets are: `()`, `{}`, - * `[]` and `<>`. + * instance if the word it represents is enclosed in brackets. + * + * Supported brackets are: `()`, `{}`, `[]` and `<>`. * - * **NOTE:** invalid enclosed brackets are ignored. + * **NOTE:** invalid enclosed brackets are ignored and for all input tokens property `brackets` assigned as `false`. */ //noinspection DuplicatedCode,ScalaWeakerAccess class NCBracketsTokenEnricher extends NCTokenEnricher with LazyLogging: @@ -41,26 +51,20 @@ class NCBracketsTokenEnricher extends NCTokenEnricher with LazyLogging: var ok = true def check(expected: String): Unit = if stack.empty() || stack.pop() != expected then ok = false - def mark(t: NCToken): Unit = map += t -> !stack.isEmpty + def add(t: NCToken): Unit = map += t -> !stack.isEmpty - for (t <- toks if ok) - t.getText match - case "(" | "{" | "[" | "<" => - mark(t) - stack.push(t.getText) - case ")" => - check("(") - mark(t) - case "}" => - check("{") - mark(t) - case "]" => - check("[") - mark(t) - case ">" => - check("<") - mark(t) - case _ => mark(t) + for (t <- toks if ok; txt = t.getText) + if BRACKETS.contains(txt) then + add(t) + stack.push(txt) + else if BRACKETS_REVERSED.contains(txt) then + check(BRACKETS_REVERSED(txt)) + add(t) + else + add(t) - if ok && stack.isEmpty then map.foreach { (tok, b) => tok.put("brackets", b) } - else logger.warn(s"Detected invalid brackets in: ${req.getText}") \ No newline at end of file + if ok && stack.isEmpty then + map.foreach { (tok, b) => tok.put("brackets", b) } + else + toks.foreach(_.put("brackets",false)) + logger.warn(s"Detected invalid brackets in: ${req.getText}") \ No newline at end of file diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala index 8912e178..f2abb1c8 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCQuotesTokenEnricher.scala @@ -45,8 +45,8 @@ import NCQuotesTokenEnricher.* * `false` value indicates otherwise. * * Supported quotes are: **«**, **»**, **"**, **'**, **`**. - * For any invalid cases, like invalid quotes order otr count, - * property `quoted` assigned as `false` for all input tokens. + * + * **NOTE:** invalid enclosed quotes are ignored and for all input tokens property `quoted` assigned as `false`. */ //noinspection ScalaWeakerAccess class NCQuotesTokenEnricher extends NCTokenEnricher with LazyLogging: diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricherSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricherSpec.scala index 6739a703..82bcff24 100644 --- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricherSpec.scala +++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/enrichers/NCBracketsTokenEnricherSpec.scala @@ -33,16 +33,24 @@ class NCBracketsTokenEnricherSpec extends AnyFunSuite: * @param txt * @param brackets */ - private def check(txt: String, brackets: Set[Integer]): Unit = + private def check(txt: String, brackets: Integer*): Unit = val toks = EN_TOK_PARSER.tokenize(txt) bracketsEnricher.enrich(NCTestRequest(txt), CFG, toks) NCTestUtils.printTokens(toks) - toks.foreach (tok => require(!(tok[Boolean]("brackets") ^ brackets.contains(tok.getIndex)))) + if brackets.isEmpty then require(toks.forall(p => !p[Boolean]("brackets"))) + else toks.foreach (tok => require(!(tok[Boolean]("brackets") ^ brackets.contains(tok.getIndex)))) test("test") { - check("A [ B C ] D", Set(2, 3)) - check("A [ B { C } ] D", Set(2, 3, 4, 5)) - check("A [ B { C } ] [ [ D ] ] [ E ]", Set(2, 3, 4, 5, 8, 9, 10, 13)) + check("A [ B C ] D", 2, 3) + check("A [ B { C } ] D", 2, 3, 4, 5) + check("A [ B { C } ] [ [ D ] ] [ E ]", 2, 3, 4, 5, 8, 9, 10, 13) + + // Invalid. + check("[[a]") + check("[a[") + check("{[a[}") + check("[") + check("}") }