This is an automated email from the ASF dual-hosted git repository. sergeykamov pushed a commit to branch NLPCRAFT-435 in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 527493e9536d6c260ae4165b7f85354a781419a6 Author: Sergey Kamov <[email protected]> AuthorDate: Thu Sep 9 11:18:19 2021 +0300 WIP. --- nlpcraft/src/main/resources/date/full.txt.gz | Bin 6985934 -> 24138388 bytes nlpcraft/src/main/resources/date/parts.txt.gz | Bin 8146091 -> 7563491 bytes nlpcraft/src/main/resources/date/parts_dmy.txt.gz | Bin 4986446 -> 4765022 bytes nlpcraft/src/main/resources/date/parts_mdy.txt.gz | Bin 4955873 -> 4744459 bytes nlpcraft/src/main/resources/date/parts_ymd.txt.gz | Bin 5326821 -> 5096584 bytes .../org/apache/nlpcraft/common/util/NCUtils.scala | 2 +- .../server/nlp/enrichers/date/NCDateEnricher.scala | 64 +++++++++++---------- .../nlp/enrichers/date/tools/NCDateGenerator.scala | 22 +++---- .../nlp/enrichers/date/NCEnricherDateSpec.scala | 3 +- 9 files changed, 47 insertions(+), 44 deletions(-) diff --git a/nlpcraft/src/main/resources/date/full.txt.gz b/nlpcraft/src/main/resources/date/full.txt.gz index 37216a0..15d7718 100644 Binary files a/nlpcraft/src/main/resources/date/full.txt.gz and b/nlpcraft/src/main/resources/date/full.txt.gz differ diff --git a/nlpcraft/src/main/resources/date/parts.txt.gz b/nlpcraft/src/main/resources/date/parts.txt.gz index 4b7008a..9c90460 100644 Binary files a/nlpcraft/src/main/resources/date/parts.txt.gz and b/nlpcraft/src/main/resources/date/parts.txt.gz differ diff --git a/nlpcraft/src/main/resources/date/parts_dmy.txt.gz b/nlpcraft/src/main/resources/date/parts_dmy.txt.gz index 42d35d7..58b2792 100644 Binary files a/nlpcraft/src/main/resources/date/parts_dmy.txt.gz and b/nlpcraft/src/main/resources/date/parts_dmy.txt.gz differ diff --git a/nlpcraft/src/main/resources/date/parts_mdy.txt.gz b/nlpcraft/src/main/resources/date/parts_mdy.txt.gz index faeac93..834719e 100644 Binary files a/nlpcraft/src/main/resources/date/parts_mdy.txt.gz and b/nlpcraft/src/main/resources/date/parts_mdy.txt.gz differ diff --git a/nlpcraft/src/main/resources/date/parts_ymd.txt.gz b/nlpcraft/src/main/resources/date/parts_ymd.txt.gz index 22f38bf..da5d4ba 100644 Binary files a/nlpcraft/src/main/resources/date/parts_ymd.txt.gz and b/nlpcraft/src/main/resources/date/parts_ymd.txt.gz differ diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/util/NCUtils.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/util/NCUtils.scala index d8c1900..d2c2b03 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/util/NCUtils.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/util/NCUtils.scala @@ -552,7 +552,7 @@ object NCUtils extends LazyLogging { * @return */ private def readLcTrimFilter(in: BufferedSource): List[String] = - in.getLines().map(_.toLowerCase.strip).filter(s => s.nonEmpty && !s.startsWith("#")).toList + in.getLines().map(_.toLowerCase.strip).filter(s => s.nonEmpty && s.head!= '#').toList /** * Reads lines from given file converting to lower case, trimming, and filtering diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/date/NCDateEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/date/NCDateEnricher.scala index a4e8e11..9d7a549 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/date/NCDateEnricher.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/date/NCDateEnricher.scala @@ -27,12 +27,9 @@ import org.apache.nlpcraft.server.nlp.enrichers.date.NCDateConstants._ import org.apache.nlpcraft.server.nlp.enrichers.date.NCDateFormatType._ import java.util -import java.util.{Calendar => C} -import java.util.{List => JList} - +import java.util.{Calendar => C, List => JList} import scala.collection.immutable.Iterable import scala.collection.mutable -import scala.collection.mutable.{LinkedHashMap => LHM} import scala.concurrent.ExecutionContext import scala.jdk.CollectionConverters.ListHasAsScala @@ -40,8 +37,6 @@ import scala.jdk.CollectionConverters.ListHasAsScala * Date enricher. */ object NCDateEnricher extends NCServerEnricher { - private type LHM_SS = LHM[String, String] - private object Config extends NCConfigurable { def style: NCDateFormatType = getObject("nlpcraft.server.datesFormatStyle", NCDateFormatType.withName) } @@ -55,8 +50,8 @@ object NCDateEnricher extends NCServerEnricher { private[date] val prepsBtwIncl = mkBetweenPrepositions(BETWEEN_INCLUSIVE) private[date] val prepsBtwExcl = mkBetweenPrepositions(BETWEEN_EXCLUSIVE) - @volatile private var cacheFull: LHM_SS = _ - @volatile private var cacheParts: LHM_SS = _ + private val cacheFull = new util.HashMap[String, String]() + private val cacheParts = new util.HashMap[String, String]() // Preposition data holder. case class P(text: String) { @@ -121,6 +116,10 @@ object NCDateEnricher extends NCServerEnricher { */ override def stop(parent: Span = null): Unit = startScopedSpan("stop", parent) { _ => ackStopping() + + cacheFull.clear() + cacheParts.clear() + ackStopped() } @@ -132,18 +131,25 @@ object NCDateEnricher extends NCServerEnricher { override def start(parent: Span = null): NCService = startScopedSpan("start", parent) { span => ackStarting() - def read(res: String): LHM_SS = { + val sep = '|'.asInstanceOf[Int] + + def read(dest: util.Map[String, String], res: String): Unit = + U.readTextGzipResource(res, "UTF-8", logger).foreach(p => { + val idx = p.indexOf(sep) + + // Data already trimmed. + dest.put(p.take(idx), p.drop(idx + 1)) + }) + + def readCommon(dest: util.Map[String, String], res: String): Unit = { startScopedSpan("read", span, "res" -> res) { _ => - val m: LHM_SS = new LHM_SS() - - val map = U.readTextGzipResource(res, "UTF-8", logger).map(p => { - val idx = p.indexOf("|") - p.take(idx).strip -> p.drop(idx + 1).trim - }) - - m ++= map - - m + val m = new util.HashMap[String, String]() + + read(m, res) + + dest.synchronized { + dest.putAll(m) + } } } @@ -155,17 +161,12 @@ object NCDateEnricher extends NCServerEnricher { case _ => throw new AssertionError(s"Unexpected format type: ${Config.style}") } - var p1: LHM_SS = null - var p2: LHM_SS = null - U.executeParallel( - () => cacheFull = read("date/full.txt.gz"), - () => p1 = read("date/parts.txt.gz"), - () => p2 = read(s"date/$file") + () => read(cacheFull, "date/full.txt.gz"), + () => readCommon(cacheParts, "date/parts.txt.gz"), + () => readCommon(cacheParts, s"date/$file") ) - cacheParts = p1 ++ p2 - ackStarted() } @@ -342,12 +343,13 @@ object NCDateEnricher extends NCServerEnricher { } cacheFull.get(s) match { - case Some(body) => add(body, isFull = true) - case None => + case null => cacheParts.get(s) match { - case Some(body) => add(body, isFull = false) - case None => // No-op. + case null => // No-op. + case body => add(body, isFull = false) } + + case body => add(body, isFull = true) } } } diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/date/tools/NCDateGenerator.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/date/tools/NCDateGenerator.scala index 646267c..6fbbff7 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/date/tools/NCDateGenerator.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/server/nlp/enrichers/date/tools/NCDateGenerator.scala @@ -17,17 +17,16 @@ package org.apache.nlpcraft.server.nlp.enrichers.date.tools -import java.text.{DateFormat, SimpleDateFormat} -import java.util.{Date, Locale, Calendar => C} import org.apache.nlpcraft.common._ import org.apache.nlpcraft.common.nlp.numeric.NCNumericGenerator import org.apache.nlpcraft.server.nlp.enrichers.date.NCDateConstants._ import org.apache.nlpcraft.server.nlp.enrichers.date.NCDateFormatType._ +import org.apache.nlpcraft.server.nlp.enrichers.date.tools.NCDateGenerator._ -import scala.collection.mutable.{LinkedHashMap => LHM} -import NCDateGenerator._ - +import java.text.{DateFormat, SimpleDateFormat} +import java.util.{Date, Locale, Calendar => C} import scala.collection.mutable +import scala.collection.mutable.{LinkedHashMap => LHM} /** * Pre-built date ranges generator. @@ -106,6 +105,7 @@ object NCDateGenerator { private val NUM_MONTH_MAP = zipIndexes(CAL_MONTHS) private val MMMM_MONTH_SEQ = CAL_MONTHS.map(month) private val YEARS_SEQ = for (i <- 1900 to C.getInstance().get(C.YEAR) + 5) yield i + private val YEARS_SEQ_EXT = for (i <- 1500 to C.getInstance().get(C.YEAR) + 5) yield i private val MMMM_MONTH_MAP = zipIndexes(MMMM_MONTH_SEQ) // USA week. @@ -403,7 +403,7 @@ object NCDateGenerator { } private[date] def years(df: LHM_SS): Unit = - for (y <- YEARS_SEQ) + for (y <- YEARS_SEQ_EXT) mkYears(y).foreach(s => df += s"$s" -> s"${y}y") private[date] def months(df: LHM_SS, fmts: Seq[SimpleDateFormat]): Unit = { @@ -463,10 +463,10 @@ object NCDateGenerator { } // Between. - for ((from, to) <- BETWEEN_INCLUSIVE; y1 <- YEARS_SEQ; y2 <- YEARS_SEQ if y2 > y1) + for ((from, to) <- BETWEEN_INCLUSIVE; y1 <- YEARS_SEQ_EXT; y2 <- YEARS_SEQ_EXT if y2 > y1) addRange(from, to, y1, y2, s"${y1}y:${y2}y") - for ((from, to) <- BETWEEN_EXCLUSIVE; y1 <- YEARS_SEQ; y2 <- YEARS_SEQ if y2 > y1) + for ((from, to) <- BETWEEN_EXCLUSIVE; y1 <- YEARS_SEQ_EXT; y2 <- YEARS_SEQ_EXT if y2 > y1) addRange(from, to, y1, y2, s"${y1}y:${y2-1}y") def add(word: String, y: Int, templ: String): Unit = { @@ -478,10 +478,10 @@ object NCDateGenerator { } // From. - for (f <- FROM; y <- YEARS_SEQ) add(f, y, toNow(s"${y}y")) + for (f <- FROM; y <- YEARS_SEQ_EXT) add(f, y, toNow(s"${y}y")) // Till. - for (t <- TO; y <- YEARS_SEQ) add(t, y, to(s"${y}y")) + for (t <- TO; y <- YEARS_SEQ_EXT) add(t, y, to(s"${y}y")) } private[date] def simpleQuarters(df: LHM_SS): Unit = { @@ -856,7 +856,7 @@ object NCDateGenerator { object DLDateGeneratorRunner extends App { private def mkPath(path: String): String = U.mkPath(s"nlpcraft/src/main/resources/date/$path") - private def convert(entry: (String, String)): String = s"${entry._1} | ${entry._2}" + private def convert(entry: (String, String)): String = s"${entry._1.strip}|${entry._2.strip}" private def process(): Unit = { val fileFull = mkPath("full.txt") diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/date/NCEnricherDateSpec.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/date/NCEnricherDateSpec.scala index 429b24c..30f9a65 100644 --- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/date/NCEnricherDateSpec.scala +++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/date/NCEnricherDateSpec.scala @@ -38,7 +38,8 @@ class NCEnricherDateSpec extends NCEnricherBaseSpec { "1900 year", "from 1900 year", "between 1900 and 1905", - "between 1900 and 1905 years" + "between 1501 and 1905 years", + "after 1501 year" ).map(txt => { val f: Unit => Unit = _ => checkExists(txt, dte(text = txt))
