This is an automated email from the ASF dual-hosted git repository. sergeykamov pushed a commit to branch NLPCRAFT-520 in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 2427be979d3d87e9fe94e752ea3c73f9ee376b5a Author: Sergey Kamov <skhem...@gmail.com> AuthorDate: Mon Dec 19 11:08:24 2022 +0400 WIP. --- .../apache/nlpcraft/internal/util/NCUtils.scala | 18 -------- .../nlp/enrichers/NCEnStopWordsTokenEnricher.scala | 24 ++++++----- .../{tools => impl}/NCEnStopWordGenerator.scala | 50 ++++++++-------------- 3 files changed, 31 insertions(+), 61 deletions(-) diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala index 6791429d..1b81acd0 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala @@ -424,24 +424,6 @@ object NCUtils extends LazyLogging: out catch case e: IOException => E(s"Failed to read stream: $res", e) - /** - * @param res Gzip resource, file absolute or relative path. - * @param res - * @param enc Encoding. Default value is "UTF-8". - * @param strip Strip flag. If `true` it strips all read lines. Default value is `true`. - * @param convert Line conversion method. Applied after `strip`. By default it passes lines as is. - * @param filterText . Filtering text flag. If `true` it skips empty lines and lines with headers (# symbol). Default value is `false`. - * @param log Logger. - */ - def readGzipLines( - res: String, - enc: String = "UTF-8", - strip: Boolean = true, - convert: String => String = s => s, - filterText: Boolean = false, - log: Logger = logger - ): Iterator[String] = readLines(new GZIPInputStream(getStream(res)), enc, strip, convert, filterText, log) - /** * * @param bodies diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala index 6dfb1b2c..cb1baae2 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala @@ -20,6 +20,7 @@ package org.apache.nlpcraft.nlp.enrichers import com.typesafe.scalalogging.LazyLogging import org.apache.nlpcraft.* import org.apache.nlpcraft.internal.util.NCUtils as U +import org.apache.nlpcraft.nlp.enrichers.impl.NCEnStopWordGenerator import org.apache.nlpcraft.nlp.stemmer.* import java.io.* @@ -82,11 +83,6 @@ private object NCEnStopWordsTokenEnricher extends LazyLogging: "--" // Synthetic POS. ) - // Stemmatization is done already by generator. - // It is initialized in the companion for test performance reasons. - private val FIRST_WORDS: Set[String] = read("stopwords/first_words.txt.gz") - private val NOUN_WORDS: Set[String] = read("stopwords/noun_words.txt.gz") - private val STOP_BEFORE_STOP: Seq[Word] = Seq("DT", "PRP", "PRP$", "WDT", "WP", "WP$", "WRB") private val Q_POS = Set("``", "''") private val PERCENTS = Set( @@ -100,7 +96,6 @@ private object NCEnStopWordsTokenEnricher extends LazyLogging: "percent" ) - private def read(path: String): Set[String] = U.readGzipLines(path, convert = _.toLowerCase, filterText = true, log = logger).toSet private def getPos(t: NCToken): String = U.getProperty(t, "pos") private def getLemma(t: NCToken): String = U.getProperty(t, "lemma") private def isQuote(t: NCToken): Boolean = Q_POS.contains(getPos(t)) @@ -199,6 +194,8 @@ class NCEnStopWordsTokenEnricher( exclSet: Set[String] = Set.empty, stemmer: NCStemmer = new NCEnStemmer ) extends NCTokenEnricher with LazyLogging: + require(addSet != null, "Additional stopwords cannot be null.") + require(exclSet != null, "Exceptions stopwords cannot be null.") require(stemmer != null, "Stemmer cannot be null.") private var addStems: Set[String] = _ @@ -206,6 +203,8 @@ class NCEnStopWordsTokenEnricher( private var percents: Set[String] = _ private var stopWords: StopWordHolder = _ private var exceptions: StopWordHolder = _ + private var firstWords: Set[String] = _ + private var nounWords: Set[String] = _ private case class TokenExtra(lemma: String, stemTxt: String, stemLemma: String) private object TokenExtra: @@ -309,8 +308,8 @@ class NCEnStopWordsTokenEnricher( * */ private def init(): Unit = - addStems = if addSet == null then Set.empty else addSet.map(getStem) - exclStems = if exclSet == null then Set.empty else exclSet.map(getStem) + addStems = addSet.map(getStem) + exclStems = exclSet.map(getStem) def check(name: String, set: Set[String]): Unit = if set.exists(_.exists(_.isWhitespace)) then throw E(s"$name contain a string with whitespaces.") @@ -329,6 +328,11 @@ class NCEnStopWordsTokenEnricher( stopWords = m(false) exceptions = m(true) + val gen = new NCEnStopWordGenerator(stemmer) + + firstWords = gen.mkFirstWords() + nounWords = gen.mkNounWords() + /** * Parses configuration template. * @@ -601,7 +605,7 @@ class NCEnStopWordsTokenEnricher( // All sentence first stopword + first non stop word. val startToks = toks.takeWhile(isStopWord) ++ toks.find(p => !isStopWord(p)).map(p => p) - for (startTok <- startToks; tup <- origToks.filter(_._1.head == startTok); key = tup._2 if FIRST_WORDS.contains(key) && !isException(tup._1)) + for (startTok <- startToks; tup <- origToks.filter(_._1.head == startTok); key = tup._2 if firstWords.contains(key) && !isException(tup._1)) tup._1.foreach(tok => stops += tok) foundKeys += key @@ -611,7 +615,7 @@ class NCEnStopWordsTokenEnricher( // +-------------------------------------------------+ for (tup <- origToks; key = tup._2 if !foundKeys.contains(key) && !isException(tup._1)) foundKeys.find(key.startsWith) match - case Some(s) => if NOUN_WORDS.contains(key.substring(s.length).strip) then tup._1.foreach(tok => stops += tok) + case Some(s) => if nounWords.contains(key.substring(s.length).strip) then tup._1.foreach(tok => stops += tok) case None => () // +-------------------------------------------------+ diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/tools/NCEnStopWordGenerator.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/impl/NCEnStopWordGenerator.scala similarity index 86% rename from nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/tools/NCEnStopWordGenerator.scala rename to nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/impl/NCEnStopWordGenerator.scala index adb66e4a..b90e0567 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/tools/NCEnStopWordGenerator.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/impl/NCEnStopWordGenerator.scala @@ -15,23 +15,15 @@ * limitations under the License. */ -package org.apache.nlpcraft.nlp.enrichers.tools +package org.apache.nlpcraft.nlp.enrichers.impl import org.apache.nlpcraft.internal.util.NCUtils -import org.apache.nlpcraft.nlp.stemmer.NCEnStemmer +import org.apache.nlpcraft.nlp.stemmer.* import scala.collection.mutable -/** - * Generates first word sequences. - */ -object NCEnStopWordGenerator: - private final lazy val stemmer = new NCEnStemmer - - // Output files. - private val FIRST_WORDS_FILE = "first_words.txt" - private val NOUN_WORDS_FILE = "noun_words.txt" - +private[enrichers] object NCEnStopWordGenerator: + // All string data should be in lowercase. private final val QWORDS = Seq( "what", "when", @@ -157,13 +149,14 @@ object NCEnStopWordGenerator: "couple of" ) - private def mkGzip(path: String, lines: Iterable[Any]): Unit = - val p = NCUtils.mkPath(s"nlpcraft/src/main/resources/stopwords/$path") - NCUtils.mkTextFile(p, lines) - NCUtils.gzipPath(p) +import org.apache.nlpcraft.nlp.enrichers.impl.NCEnStopWordGenerator.* - private[tools] def mkNounWords(): Unit = - val buf = new mutable.ArrayBuffer[String]() +/** + * Generates first word sequences. + */ +private[enrichers] class NCEnStopWordGenerator(stemmer: NCStemmer): + def mkNounWords(): Set[String] = + val buf = new mutable.HashSet[String]() for (w1 <- NOUN_WORDS) buf += s"$w1" @@ -171,13 +164,10 @@ object NCEnStopWordGenerator: for (w1 <- NOUN_WORDS; w2 <- NOUN_WORDS2) buf += s"$w1 $w2" - mkGzip(NOUN_WORDS_FILE, stem(buf.toSeq)) - - private def stem(s: String): String = s.split(" ").map(p => stemmer.stem(p.toLowerCase)).mkString(" ") - private def stem(seq: Seq[String]): Seq[String] = seq.map(stem) + buf.map(stem).toSet - private[tools] def mkFirstWords(): Unit = - val buf = new mutable.ArrayBuffer[String]() + def mkFirstWords(): Set[String] = + val buf = new mutable.HashSet[String]() // is there for (w1 <- QWORDS2) @@ -307,14 +297,8 @@ object NCEnStopWordGenerator: for (w0 <- DWORDS_PRE; w1 <- DWORDS; w2 <- DWORDS_SUP; w3 <- QWORDS) buf += s"$w0 $w1 $w2 $w3" - mkGzip(FIRST_WORDS_FILE, stem(buf.toSeq)) + buf.map(stem).toSet - /** - * - * @param args - */ - def main(args: Array[String]): Unit = - mkFirstWords() - mkNounWords() + // All data already in lowercase. + private def stem(s: String): String = s.split(" ").map(stemmer.stem).mkString(" ") - sys.exit()