[incubator-nlpcraft] 01/03: WIP.

sergeykamov Mon, 19 Dec 2022 00:34:08 -0800

This is an automated email from the ASF dual-hosted git repository.

sergeykamov pushed a commit to branch NLPCRAFT-520
in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git


commit 2427be979d3d87e9fe94e752ea3c73f9ee376b5a
Author: Sergey Kamov <skhem...@gmail.com>
AuthorDate: Mon Dec 19 11:08:24 2022 +0400

    WIP.
---
 .../apache/nlpcraft/internal/util/NCUtils.scala    | 18 --------
 .../nlp/enrichers/NCEnStopWordsTokenEnricher.scala | 24 ++++++-----
 .../{tools => impl}/NCEnStopWordGenerator.scala    | 50 ++++++++--------------
 3 files changed, 31 insertions(+), 61 deletions(-)

diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
index 6791429d..1b81acd0 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/internal/util/NCUtils.scala
@@ -424,24 +424,6 @@ object NCUtils extends LazyLogging:
             out
         catch case e: IOException => E(s"Failed to read stream: $res", e)
 
-    /**
-      * @param res Gzip resource, file absolute or relative path.
-      * @param res
-      * @param enc        Encoding. Default value is "UTF-8".
-      * @param strip      Strip flag. If `true` it strips all read lines. Default value is `true`.
-      * @param convert    Line conversion method. Applied after `strip`. By default it passes lines as is.
-      * @param filterText . Filtering text flag. If `true` it skips empty lines and lines with headers (# symbol). Default value is `false`.
-      * @param log Logger.
-      */
-    def readGzipLines(
-        res: String,
-        enc: String = "UTF-8",
-        strip: Boolean = true,
-        convert: String => String = s => s,
-        filterText: Boolean = false,
-        log: Logger = logger
-    ): Iterator[String] = readLines(new GZIPInputStream(getStream(res)), enc, strip, convert, filterText, log)
-
     /**
       *
       * @param bodies
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
index 6dfb1b2c..cb1baae2 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/NCEnStopWordsTokenEnricher.scala
@@ -20,6 +20,7 @@ package org.apache.nlpcraft.nlp.enrichers
 import com.typesafe.scalalogging.LazyLogging
 import org.apache.nlpcraft.*
 import org.apache.nlpcraft.internal.util.NCUtils as U
+import org.apache.nlpcraft.nlp.enrichers.impl.NCEnStopWordGenerator
 import org.apache.nlpcraft.nlp.stemmer.*
 
 import java.io.*
@@ -82,11 +83,6 @@ private object NCEnStopWordsTokenEnricher extends LazyLogging:
         "--" // Synthetic POS.
     )
 
-    // Stemmatization is done already by generator.
-    // It is initialized in the companion for test performance reasons.
-    private val FIRST_WORDS: Set[String] = read("stopwords/first_words.txt.gz")
-    private val NOUN_WORDS: Set[String] = read("stopwords/noun_words.txt.gz")
-
     private val STOP_BEFORE_STOP: Seq[Word] = Seq("DT", "PRP", "PRP$", "WDT", "WP", "WP$", "WRB")
     private val Q_POS = Set("``", "''")
     private val PERCENTS = Set(
@@ -100,7 +96,6 @@ private object NCEnStopWordsTokenEnricher extends LazyLogging:
         "percent"
     )
 
-    private def read(path: String): Set[String] = U.readGzipLines(path, convert = _.toLowerCase, filterText = true, log = logger).toSet
     private def getPos(t: NCToken): String = U.getProperty(t, "pos")
     private def getLemma(t: NCToken): String = U.getProperty(t, "lemma")
     private def isQuote(t: NCToken): Boolean = Q_POS.contains(getPos(t))
@@ -199,6 +194,8 @@ class NCEnStopWordsTokenEnricher(
     exclSet: Set[String] = Set.empty,
     stemmer: NCStemmer = new NCEnStemmer
 ) extends NCTokenEnricher with LazyLogging:
+    require(addSet != null, "Additional stopwords cannot be null.")
+    require(exclSet != null, "Exceptions stopwords cannot be null.")
     require(stemmer != null, "Stemmer cannot be null.")
 
     private var addStems: Set[String] = _
@@ -206,6 +203,8 @@ class NCEnStopWordsTokenEnricher(
     private var percents: Set[String] = _
     private var stopWords: StopWordHolder = _
     private var exceptions: StopWordHolder = _
+    private var firstWords: Set[String] = _
+    private var nounWords: Set[String] = _
 
     private case class TokenExtra(lemma: String, stemTxt: String, stemLemma: String)
     private object TokenExtra:
@@ -309,8 +308,8 @@ class NCEnStopWordsTokenEnricher(
       *
       */
     private def init(): Unit =
-        addStems = if addSet == null then Set.empty else addSet.map(getStem)
-        exclStems = if exclSet == null then Set.empty else exclSet.map(getStem)
+        addStems = addSet.map(getStem)
+        exclStems = exclSet.map(getStem)
 
         def check(name: String, set: Set[String]): Unit =
             if set.exists(_.exists(_.isWhitespace)) then throw E(s"$name contain a string with whitespaces.")
@@ -329,6 +328,11 @@ class NCEnStopWordsTokenEnricher(
         stopWords = m(false)
         exceptions = m(true)
 
+        val gen = new NCEnStopWordGenerator(stemmer)
+
+        firstWords = gen.mkFirstWords()
+        nounWords = gen.mkNounWords()
+
     /**
       * Parses configuration template.
       *
@@ -601,7 +605,7 @@ class NCEnStopWordsTokenEnricher(
 
         // All sentence first stopword + first non stop word.
         val startToks = toks.takeWhile(isStopWord) ++ toks.find(p => !isStopWord(p)).map(p => p)
-        for (startTok <- startToks; tup <- origToks.filter(_._1.head == startTok); key = tup._2 if FIRST_WORDS.contains(key) && !isException(tup._1))
+        for (startTok <- startToks; tup <- origToks.filter(_._1.head == startTok); key = tup._2 if firstWords.contains(key) && !isException(tup._1))
             tup._1.foreach(tok => stops += tok)
             foundKeys += key
 
@@ -611,7 +615,7 @@ class NCEnStopWordsTokenEnricher(
         // +-------------------------------------------------+
         for (tup <- origToks; key = tup._2 if !foundKeys.contains(key) && !isException(tup._1))
             foundKeys.find(key.startsWith) match
-                case Some(s) => if NOUN_WORDS.contains(key.substring(s.length).strip) then tup._1.foreach(tok => stops += tok)
+                case Some(s) => if nounWords.contains(key.substring(s.length).strip) then tup._1.foreach(tok => stops += tok)
                 case None => ()
 
         // +-------------------------------------------------+
diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/tools/NCEnStopWordGenerator.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/impl/NCEnStopWordGenerator.scala
similarity index 86%
rename from nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/tools/NCEnStopWordGenerator.scala
rename to nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/impl/NCEnStopWordGenerator.scala
index adb66e4a..b90e0567 100644
--- a/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/tools/NCEnStopWordGenerator.scala
+++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/nlp/enrichers/impl/NCEnStopWordGenerator.scala
@@ -15,23 +15,15 @@
  * limitations under the License.
  */
 
-package org.apache.nlpcraft.nlp.enrichers.tools
+package org.apache.nlpcraft.nlp.enrichers.impl
 
 import org.apache.nlpcraft.internal.util.NCUtils
-import org.apache.nlpcraft.nlp.stemmer.NCEnStemmer
+import org.apache.nlpcraft.nlp.stemmer.*
 
 import scala.collection.mutable
 
-/**
-  * Generates first word sequences.
-  */
-object NCEnStopWordGenerator:
-    private final lazy val stemmer = new NCEnStemmer
-
-    // Output files.
-    private val FIRST_WORDS_FILE = "first_words.txt"
-    private val NOUN_WORDS_FILE = "noun_words.txt"
-
+private[enrichers] object NCEnStopWordGenerator:
+    // All string data should be in lowercase.
     private final val QWORDS = Seq(
         "what",
         "when",
@@ -157,13 +149,14 @@ object NCEnStopWordGenerator:
         "couple of"
     )
 
-    private def mkGzip(path: String, lines: Iterable[Any]): Unit =
-        val p = NCUtils.mkPath(s"nlpcraft/src/main/resources/stopwords/$path")
-        NCUtils.mkTextFile(p, lines)
-        NCUtils.gzipPath(p)
+import org.apache.nlpcraft.nlp.enrichers.impl.NCEnStopWordGenerator.*
 
-    private[tools] def mkNounWords(): Unit =
-        val buf = new mutable.ArrayBuffer[String]()
+/**
+  * Generates first word sequences.
+  */
+private[enrichers] class NCEnStopWordGenerator(stemmer: NCStemmer):
+    def mkNounWords(): Set[String] =
+        val buf = new mutable.HashSet[String]()
 
         for (w1 <- NOUN_WORDS)
             buf += s"$w1"
@@ -171,13 +164,10 @@ object NCEnStopWordGenerator:
         for (w1 <- NOUN_WORDS; w2 <- NOUN_WORDS2)
             buf += s"$w1 $w2"
 
-        mkGzip(NOUN_WORDS_FILE, stem(buf.toSeq))
-
-    private def stem(s: String): String = s.split(" ").map(p => stemmer.stem(p.toLowerCase)).mkString(" ")
-    private def stem(seq: Seq[String]): Seq[String] = seq.map(stem)
+        buf.map(stem).toSet
 
-    private[tools] def mkFirstWords(): Unit =
-        val buf = new mutable.ArrayBuffer[String]()
+    def mkFirstWords(): Set[String] =
+        val buf = new mutable.HashSet[String]()
 
         // is there
         for (w1 <- QWORDS2)
@@ -307,14 +297,8 @@ object NCEnStopWordGenerator:
         for (w0 <- DWORDS_PRE; w1 <- DWORDS; w2 <- DWORDS_SUP; w3 <- QWORDS)
             buf += s"$w0 $w1 $w2 $w3"
 
-        mkGzip(FIRST_WORDS_FILE, stem(buf.toSeq))
+        buf.map(stem).toSet
 
-    /**
-      *
-      * @param args
-      */
-    def main(args: Array[String]): Unit =
-        mkFirstWords()
-        mkNounWords()
+    // All data already in lowercase.
+    private def stem(s: String): String = s.split(" ").map(stemmer.stem).mkString(" ")
 
-        sys.exit()

[incubator-nlpcraft] 01/03: WIP.

Reply via email to