This is an automated email from the ASF dual-hosted git repository. sergeykamov pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
The following commit(s) were added to refs/heads/master by this push:
     new 00baf4f  POS and Lemmatizer optional configuration for OpenNlp token parser.
00baf4f is described below

commit 00baf4f64e81d05e59eeb5dc30be6f9ebe4360a4
Author: Sergey Kamov <skhdlem...@gmail.com>
AuthorDate: Fri Feb 25 15:18:02 2022 +0300

    POS and Lemmatizer optional configuration for OpenNlp token parser.
---
 .../impl/NCStanfordNLPTokenParserImpl.scala        | 12 ++++---
 .../token/parser/opennlp/NCOpenNLPTokenParser.java |  2 --
 .../opennlp/impl/NCOpenNLPTokenParserImpl.scala    | 18 +++++-----
 .../parser/opennlp/NCOpenNLPTokenParserSpec.scala  | 40 ++++++++++++++++++++++
 4 files changed, 57 insertions(+), 15 deletions(-)

diff --git a/nlpcraft-stanford/src/main/java/org/apache/nlpcraft/nlp/token/parser/stanford/impl/NCStanfordNLPTokenParserImpl.scala b/nlpcraft-stanford/src/main/java/org/apache/nlpcraft/nlp/token/parser/stanford/impl/NCStanfordNLPTokenParserImpl.scala
index ba24664..15152e8 100644
--- a/nlpcraft-stanford/src/main/java/org/apache/nlpcraft/nlp/token/parser/stanford/impl/NCStanfordNLPTokenParserImpl.scala
+++ b/nlpcraft-stanford/src/main/java/org/apache/nlpcraft/nlp/token/parser/stanford/impl/NCStanfordNLPTokenParserImpl.scala
@@ -37,6 +37,8 @@ import scala.jdk.CollectionConverters.*
 class NCStanfordNLPTokenParserImpl(stanford: StanfordCoreNLP) extends NCTokenParser:
     require(stanford != null)
 
+    private def nvl(v: String, dflt : => String): String = if v != null then v else dflt
+
     override def tokenize(text: String): JList[NCToken] =
         val doc = new CoreDocument(text)
         stanford.annotate(doc)
@@ -45,10 +47,12 @@ class NCStanfordNLPTokenParserImpl(stanford: StanfordCoreNLP) extends NCTokenPar
 
         val toks = ann.asScala.flatMap(_.asInstanceOf[ArrayCoreMap].get(classOf[TokensAnnotation]).asScala).
             zipWithIndex.map { (t, idx) =>
-                new NCPropertyMapAdapter with NCToken :
-                    override val getText: String = t.originalText()
-                    override val getLemma: String = t.lemma()
-                    override val getPos: String = t.tag()
+                val txt = t.originalText()
+
+                new NCPropertyMapAdapter with NCToken:
+                    override val getText: String = txt
+                    override val getLemma: String = nvl(t.lemma(), txt)
+                    override val getPos: String = nvl(t.tag(), "")
                     override val getIndex: Int = idx
                     override val getStartCharIndex: Int = t.beginPosition()
                     override val getEndCharIndex: Int = t.endPosition()
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/NCOpenNLPTokenParser.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/NCOpenNLPTokenParser.java
index a9cdbf2..629c8aa 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/NCOpenNLPTokenParser.java
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/NCOpenNLPTokenParser.java
@@ -45,8 +45,6 @@ public class NCOpenNLPTokenParser implements NCTokenParser {
      */
     public NCOpenNLPTokenParser(String tokMdlSrc, String posMdlSrc, String lemmaDicSrc) {
         Objects.requireNonNull(tokMdlSrc, "Tokenizer model path cannot be null.");
-        Objects.requireNonNull(posMdlSrc, "POS model path cannot be null.");
-        Objects.requireNonNull(lemmaDicSrc, "Lemmatizer model path cannot be null.");
 
         impl = new NCOpenNLPTokenParserImpl(tokMdlSrc, posMdlSrc, lemmaDicSrc);
     }
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/NCOpenNLPTokenParserImpl.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/NCOpenNLPTokenParserImpl.scala
index c1074fb..b52d32e 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/NCOpenNLPTokenParserImpl.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/impl/NCOpenNLPTokenParserImpl.scala
@@ -40,8 +40,6 @@ import scala.jdk.CollectionConverters.*
   */
 class NCOpenNLPTokenParserImpl(tokMdl: String, posMdlSrc: String, lemmaDicSrc: String) extends NCTokenParser with LazyLogging:
     require(tokMdl != null)
-    require(posMdlSrc != null)
-    require(lemmaDicSrc != null)
 
     private var tagger: POSTaggerME = _
     private var lemmatizer: DictionaryLemmatizer = _
@@ -52,12 +50,14 @@ class NCOpenNLPTokenParserImpl(tokMdl: String, posMdlSrc: String, lemmaDicSrc:
     private def init(): Unit =
         NCUtils.execPar(
             () => {
-                tagger = new POSTaggerME(new POSModel(NCUtils.getStream(posMdlSrc)))
-                logger.trace(s"Loaded resource: $posMdlSrc")
+                if posMdlSrc != null then
+                    tagger = new POSTaggerME(new POSModel(NCUtils.getStream(posMdlSrc)))
+                    logger.trace(s"Loaded resource: $posMdlSrc")
             },
             () => {
-                lemmatizer = new DictionaryLemmatizer(NCUtils.getStream(lemmaDicSrc))
-                logger.trace(s"Loaded resource: $lemmaDicSrc")
+                if lemmaDicSrc != null then
+                    lemmatizer = new DictionaryLemmatizer(NCUtils.getStream(lemmaDicSrc))
+                    logger.trace(s"Loaded resource: $lemmaDicSrc")
             },
             () => {
                 tokenizer = new TokenizerME(new TokenizerModel(NCUtils.getStream(tokMdl)))
@@ -71,8 +71,8 @@ class NCOpenNLPTokenParserImpl(tokMdl: String, posMdlSrc: String, lemmaDicSrc:
         this.synchronized {
             val hs = tokenizer.tokenizePos(text).map(p => Holder(p.getCoveredText(text).toString, p.getStart, p.getEnd))
             val toks = hs.map(_.text)
-            val poses = tagger.tag(toks)
-            var lemmas = lemmatizer.lemmatize(toks, poses)
+            val poses = if tagger != null then tagger.tag(toks) else toks.map(_ => "")
+            var lemmas = if lemmatizer != null then lemmatizer.lemmatize(toks, poses) else toks
 
             require(toks.length == poses.length && toks.length == lemmas.length)
 
@@ -83,7 +83,7 @@ class NCOpenNLPTokenParserImpl(tokMdl: String, posMdlSrc: String, lemmaDicSrc:
                 case ((lemma, pos), i) => Option.when(lemma == "O" && pos == "NN")(i)
             }
 
-            if suspIdxs.nonEmpty then
+            if suspIdxs.nonEmpty && lemmatizer != null then
                 val fixes: Map[Int, String] = lemmatizer.
                     lemmatize(suspIdxs.map(i => toks(i)), suspIdxs.map(_ => "NNN")).
                     zipWithIndex.
diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/NCOpenNLPTokenParserSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/NCOpenNLPTokenParserSpec.scala
index 12b52d3..009097c 100644
--- a/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/NCOpenNLPTokenParserSpec.scala
+++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/nlp/token/parser/opennlp/NCOpenNLPTokenParserSpec.scala
@@ -96,3 +96,43 @@ class NCOpenNLPTokenParserSpec:
             "< < [ a ] > >",
             toks => require(!isStopWord(toks.find(_.getText == "a").get))
         )
+
+    @Test
+    def testNullable(): Unit =
+        val txt = "parents had files"
+
+        // 1. Nullable.
+        var parser = new NCOpenNLPTokenParser(
+            "opennlp/en-token.bin",
+            null,
+            null
+        )
+
+        var tbl = NCAsciiTable("Text", "Lemma", "POS")
+
+        for (t <- parser.tokenize(txt).asScala)
+            tbl += (t.getText, t.getLemma, t.getPos)
+
+            require(t.getPos.isEmpty)
+            require(t.getText == t.getLemma)
+
+        println(tbl.toString)
+
+        // 2. Not nullable.
+        parser = new NCOpenNLPTokenParser(
+            "opennlp/en-token.bin",
+            "opennlp/en-pos-maxent.bin",
+            "opennlp/en-lemmatizer.dict"
+        )
+
+        tbl = NCAsciiTable("Text", "Lemma", "POS")
+
+        for (t <- parser.tokenize(txt).asScala)
+            tbl += (t.getText, t.getLemma, t.getPos)
+
+            require(t.getPos.nonEmpty)
+            require(t.getText != t.getLemma)
+
+        println(tbl.toString)
+
+