This is an automated email from the ASF dual-hosted git repository. sergeykamov pushed a commit to branch 261tmp in repository https://gitbox.apache.org/repos/asf/incubator-nlpcraft.git
commit 7815090de3aa042a4839793cdb65cd1677afbb86 Author: Sergey Kamov <[email protected]> AuthorDate: Mon Mar 8 15:23:17 2021 +0300 WIP. --- .../apache/nlpcraft/common/nlp/NCNlpSentence.scala | 776 +-------------------- .../org/apache/nlpcraft/common/nlp/SyTest.java | 241 ------- .../org/apache/nlpcraft/probe/NCProbeBoot.scala | 2 + .../probe/mgrs/nlp/NCProbeEnrichmentManager.scala | 3 +- .../mgrs/nlp/enrichers/model/NCModelEnricher.scala | 7 +- .../mgrs/sentence/NCSentenceManager.scala} | 380 +++------- .../nlpcraft/probe/mgrs/nlp/enrichers/SyTest.java | 278 -------- .../model/NCEnricherNestedModelSpec4.scala | 2 +- .../probe/mgrs/nlp/enrichers/model/Test1.java | 135 ---- .../probe/mgrs/nlp/enrichers/model/Test2.java | 114 --- 10 files changed, 108 insertions(+), 1830 deletions(-) diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala index 95a98a3..11d515a 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala @@ -18,778 +18,17 @@ package org.apache.nlpcraft.common.nlp import com.typesafe.scalalogging.LazyLogging -import org.apache.nlpcraft.common.NCE -import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank -import org.apache.nlpcraft.common.util.NCComboRecursiveTask -import org.apache.nlpcraft.model.NCModel - -import java.io.{Serializable => JSerializable} -import java.util -import java.util.concurrent.ForkJoinPool -import java.util.{Collections, Comparator, List => JList} + +import java.io.{Serializable ⇒ JSerializable} +import java.util.{Collections, List ⇒ JList} import scala.collection.JavaConverters._ -import scala.collection.mutable.ArrayBuffer import scala.collection.{Map, Seq, Set, mutable} import scala.language.implicitConversions object NCNlpSentence extends LazyLogging { - implicit def toTokens(x: NCNlpSentence): ArrayBuffer[NCNlpSentenceToken] = x.tokens - case class NoteKey(start: Int, end: Int) case class TokenKey(id: String, start: Int, end: Int) case class NoteLink(note: String, indexes: Seq[Int]) - - case class PartKey(id: String, start: Int, end: Int) { - require(start <= end) - - private def in(i: Int): Boolean = i >= start && i <= end - def intersect(id: String, start: Int, end: Int): Boolean = id == this.id && (in(start) || in(end)) - } - - object PartKey { - def apply(m: util.HashMap[String, JSerializable]): PartKey = { - def get[T](name: String): T = m.get(name).asInstanceOf[T] - - PartKey(get("id"), get("startcharindex"), get("endcharindex")) - } - - def apply(t: NCNlpSentenceNote, sen: NCNlpSentence): PartKey = - PartKey(t.noteType, sen(t.tokenFrom).startCharIndex, sen(t.tokenTo).endCharIndex) - } - - private def getLinks(notes: Seq[NCNlpSentenceNote]): Seq[NoteLink] = { - val noteLinks = mutable.ArrayBuffer.empty[NoteLink] - - for (n ← notes.filter(n ⇒ n.noteType == "nlpcraft:limit" || n.noteType == "nlpcraft:references")) - noteLinks += NoteLink(n("note").asInstanceOf[String], n("indexes").asInstanceOf[JList[Int]].asScala.sorted) - - for (n ← notes.filter(_.noteType == "nlpcraft:sort")) { - def add(noteName: String, idxsName: String): Unit = { - val names = n(noteName).asInstanceOf[JList[String]] - val idxsSeq = n(idxsName).asInstanceOf[JList[JList[Int]]] - - require(names.size() == idxsSeq.size()) - - noteLinks ++= - (for ((name, idxs) ← names.asScala.zip(idxsSeq.asScala.map(_.asScala))) - yield NoteLink(name, idxs.sorted) - ) - } - - if (n.contains("subjnotes")) add("subjnotes", "subjindexes") - if (n.contains("bynotes")) add("bynotes", "byindexes") - } - - noteLinks - } - - private def getPartKeys(notes: NCNlpSentenceNote*): Seq[PartKey] = - notes. - filter(_.isUser). - flatMap(n ⇒ { - val optList: Option[JList[util.HashMap[String, JSerializable]]] = n.dataOpt("parts") - - optList - }).flatMap(_.asScala).map(m ⇒ PartKey(m)).distinct - - /** - * - * @param ns - * @param idxs - * @param notesType - * @param note - * @return - */ - private def checkRelation(ns: NCNlpSentence, idxs: Seq[Int], notesType: String, note: NCNlpSentenceNote): Boolean = { - val types = idxs.flatMap(idx ⇒ ns(idx).map(p ⇒ p).filter(!_.isNlp).map(_.noteType)).distinct - - /** - * Example: - * 1. Sentence 'maximum x' (single element related function) - * - maximum is aggregate function linked to date element. - * - x defined as 2 elements: date and num. - * So, the variant 'maximum x (as num)' should be excluded. - * * - * 2. Sentence 'compare x and y' (multiple elements related function) - * - compare is relation function linked to date element. - * - x an y defined as 2 elements: date and num. - * So, variants 'x (as num) and x (as date)' and 'x (as date) and x (as num)' - * should not be excluded, but invalid relation should be deleted for these combinations. - */ - types.size match { - case 0 ⇒ false - case 1 ⇒ types.head == notesType - case _ ⇒ - // Equal elements should be processed together with function element. - if (types.size == 1) - false - else { - ns.removeNote(note) - - logger.trace(s"Removed note: $note") - - true - } - } - } - - /** - * Fixes notes with references to other notes indexes. - * Note that 'idxsField' is 'indexes' and 'noteField' is 'note' for all kind of references. - * - * @param noteType Note type. - * @param idxsField Indexes field. - * @param noteField Note field. - * @param ns Sentence. - * @param history Indexes transformation history. - * @return Valid flag. - */ - private def fixIndexesReferences( - noteType: String, - idxsField: String, - noteField: String, - ns: NCNlpSentence, - history: Seq[(Int, Int)] - ): Boolean = { - ns.filter(_.isTypeOf(noteType)).foreach(tok ⇒ - tok.getNoteOpt(noteType, idxsField) match { - case Some(n) ⇒ - val idxs: Seq[Int] = n.data[JList[Int]](idxsField).asScala - var fixed = idxs - - history.foreach { case (idxOld, idxNew) ⇒ fixed = fixed.map(i ⇒ if (i == idxOld) idxNew else i) } - - fixed = fixed.distinct - - if (idxs != fixed) - ns.fixNote(n, "indexes" → fixed.asJava.asInstanceOf[JSerializable]) - case None ⇒ // No-op. - } - ) - - ns.flatMap(_.getNotes(noteType)).forall( - n ⇒ checkRelation(ns, n.data[JList[Int]]("indexes").asScala, n.data[String](noteField), n) - ) - } - - /** - * - * @param note - * @param idxsField - * @param noteField - * @param ns - */ - private def fixNoteIndexes(note: String, idxsField: String, noteField: String, ns: NCNlpSentence): Unit = - ns.flatMap(_.getNotes(note)).foreach( - n ⇒ checkRelation(ns, n.data[JList[Int]](idxsField).asScala, n.data[String](noteField), n) - ) - - /** - * - * @param note - * @param idxsField - * @param noteField - * @param ns - */ - private def fixNoteIndexesList(note: String, idxsField: String, noteField: String, ns: NCNlpSentence): Unit = { - ns.flatMap(_.getNotes(note)).foreach(rel ⇒ - rel.dataOpt[JList[JList[Int]]](idxsField) match { - case Some(idxsList) ⇒ - val notesTypes = rel.data[JList[String]](noteField) - - require(idxsList.size() == notesTypes.size()) - - idxsList.asScala.zip(notesTypes.asScala).foreach { - case (idxs, notesType) ⇒ checkRelation(ns, idxs.asScala, notesType, rel) - } - case None ⇒ // No-op. - } - ) - } - - /** - * Copies token. - * - * @param ns Sentence. - * @param history Indexes transformation history. - * @param toksCopy Copied tokens. - * @param i Index. - */ - private def simpleCopy( - ns: NCNlpSentence, - history: mutable.ArrayBuffer[(Int, Int)], - toksCopy: NCNlpSentence, i: Int - ): Seq[NCNlpSentenceToken] = { - val tokCopy = toksCopy(i) - - history += tokCopy.index → ns.size - - ns += tokCopy.clone(ns.size) - } - - /** - * Glues stop words. - * - * @param ns Sentence. - * @param userNoteTypes Notes types. - * @param history Indexes transformation history. - */ - private def unionStops( - ns: NCNlpSentence, - userNoteTypes: Seq[String], - history: mutable.ArrayBuffer[(Int, Int)] - ): Unit = { - // Java collection used because using scala collections (mutable.Buffer.empty[mutable.Buffer[Token]]) is reason - // Of compilation errors which seems as scala compiler internal error. - val bufs = new util.ArrayList[mutable.Buffer[NCNlpSentenceToken]]() - - def last[T](l: JList[T]): T = l.get(l.size() - 1) - - ns.filter(t ⇒ t.isStopWord && !t.isBracketed).foreach(t ⇒ - if (!bufs.isEmpty && last(bufs).last.index + 1 == t.index) - last(bufs) += t - else - bufs.add(mutable.Buffer.empty[NCNlpSentenceToken] :+ t) - ) - - val idxsSeq = bufs.asScala.filter(_.lengthCompare(1) > 0).map(_.map(_.index)) - - if (idxsSeq.nonEmpty) { - val nsCopyToks = ns.clone() - ns.clear() - - val buf = mutable.Buffer.empty[Int] - - for (i ← nsCopyToks.indices) - idxsSeq.find(_.contains(i)) match { - case Some(idxs) ⇒ - if (!buf.contains(idxs.head)) { - buf += idxs.head - - ns += mkCompound(ns, nsCopyToks, idxs, stop = true, ns.size, None, history) - } - case None ⇒ simpleCopy(ns, history, nsCopyToks, i) - } - - fixIndexes(ns, userNoteTypes) - } - } - - /** - * Fixes indexes for all notes after recreating tokens. - * - * @param ns Sentence. - * @param userNoteTypes Notes types. - */ - private def fixIndexes(ns: NCNlpSentence, userNoteTypes: Seq[String]) { - // Replaces other notes indexes. - for (t ← userNoteTypes :+ "nlpcraft:nlp"; note ← ns.getNotes(t)) { - val toks = ns.filter(_.contains(note)).sortBy(_.index) - - val newNote = note.clone(toks.map(_.index), toks.flatMap(_.wordIndexes).sorted) - - toks.foreach(t ⇒ { - t.remove(note) - t.add(newNote) - }) - } - - // Special case - field index of core NLP note. - ns.zipWithIndex.foreach { case (tok, idx) ⇒ ns.fixNote(tok.getNlpNote, "index" → idx) } - } - - /** - * Zip notes with same type. - * - * @param ns Sentence. - * @param nType Notes type. - * @param userNotesTypes Notes types. - * @param history Indexes transformation history. - */ - private def zipNotes( - ns: NCNlpSentence, - nType: String, - userNotesTypes: Seq[String], - history: mutable.ArrayBuffer[(Int, Int)] - ): Unit = { - val nts = ns.getNotes(nType).filter(n ⇒ n.tokenFrom != n.tokenTo).sortBy(_.tokenFrom) - - val overlapped = - nts.flatMap(n ⇒ n.tokenFrom to n.tokenTo).map(ns(_)).exists( - t ⇒ userNotesTypes.map(pt ⇒ t.getNotes(pt).size).sum > 1 - ) - - if (nts.nonEmpty && !overlapped) { - val nsCopyToks = ns.clone() - ns.clear() - - val buf = mutable.ArrayBuffer.empty[Int] - - for (i ← nsCopyToks.indices) - nts.find(_.tokenIndexes.contains(i)) match { - case Some(n) ⇒ - if (!buf.contains(n.tokenFrom)) { - buf += n.tokenFrom - - ns += mkCompound(ns, nsCopyToks, n.tokenIndexes, stop = false, ns.size, Some(n), history) - } - case None ⇒ simpleCopy(ns, history, nsCopyToks, i) - } - - fixIndexes(ns, userNotesTypes) - } - } - - /** - * Makes compound note. - * - * @param ns Sentence. - * @param nsCopyToks Tokens. - * @param indexes Indexes. - * @param stop Flag. - * @param idx Index. - * @param commonNote Common note. - * @param history Indexes transformation history. - */ - private def mkCompound( - ns: NCNlpSentence, - nsCopyToks: Seq[NCNlpSentenceToken], - indexes: Seq[Int], - stop: Boolean, - idx: Int, - commonNote: Option[NCNlpSentenceNote], - history: mutable.ArrayBuffer[(Int, Int)] - ): NCNlpSentenceToken = { - val t = NCNlpSentenceToken(idx) - - // Note, it adds stop-words too. - val content = nsCopyToks.zipWithIndex.filter(p ⇒ indexes.contains(p._2)).map(_._1) - - content.foreach(t ⇒ history += t.index → idx) - - def mkValue(get: NCNlpSentenceToken ⇒ String): String = { - val buf = mutable.Buffer.empty[String] - - val n = content.size - 1 - - content.zipWithIndex.foreach(p ⇒ { - val t = p._1 - val idx = p._2 - - buf += get(t) - - if (idx < n && t.endCharIndex != content(idx + 1).startCharIndex) - buf += " " - }) - - buf.mkString - } - - val origText = mkValue((t: NCNlpSentenceToken) ⇒ t.origText) - - val idxs = Seq(idx) - val wordIdxs = content.flatMap(_.wordIndexes).sorted - - val direct = - commonNote match { - case Some(n) if n.isUser ⇒ n.isDirect - case _ ⇒ content.forall(_.isDirect) - } - - val params = Seq( - "index" → idx, - "pos" → NCPennTreebank.SYNTH_POS, - "posDesc" → NCPennTreebank.SYNTH_POS_DESC, - "lemma" → mkValue((t: NCNlpSentenceToken) ⇒ t.lemma), - "origText" → origText, - "normText" → mkValue((t: NCNlpSentenceToken) ⇒ t.normText), - "stem" → mkValue((t: NCNlpSentenceToken) ⇒ t.stem), - "start" → content.head.startCharIndex, - "end" → content.last.endCharIndex, - "charLength" → origText.length, - "quoted" → false, - "stopWord" → stop, - "bracketed" → false, - "direct" → direct, - "dict" → (if (nsCopyToks.size == 1) nsCopyToks.head.getNlpNote.data[Boolean]("dict") else false), - "english" → nsCopyToks.forall(_.getNlpNote.data[Boolean]("english")), - "swear" → nsCopyToks.exists(_.getNlpNote.data[Boolean]("swear")) - ) - - val nlpNote = NCNlpSentenceNote(idxs, wordIdxs, "nlpcraft:nlp", params: _*) - - t.add(nlpNote) - - // Adds processed note with fixed indexes. - commonNote match { - case Some(n) ⇒ - ns.removeNote(n) - t.add(n.clone(idxs, wordIdxs)) - case None ⇒ // No-op. - } - - t - } - - /** - * Fixes notes with references list to other notes indexes. - * - * @param noteType Note type. - * @param idxsField Indexes field. - * @param noteField Note field. - * @param ns Sentence. - * @param history Indexes transformation history. - * @return Valid flag. - */ - private def fixIndexesReferencesList( - noteType: String, - idxsField: String, - noteField: String, - ns: NCNlpSentence, - history: Seq[(Int, Int)] - ): Boolean = { - var ok = true - - for (tok ← ns.filter(_.isTypeOf(noteType)) if ok) - tok.getNoteOpt(noteType, idxsField) match { - case Some(n) ⇒ - val idxs: Seq[Seq[Int]] = - n.data[JList[JList[Int]]](idxsField).asScala.map(_.asScala) - var fixed = idxs - - history.foreach { - case (idxOld, idxNew) ⇒ fixed = fixed.map(_.map(i ⇒ if (i == idxOld) idxNew else i).distinct) - } - - if (fixed.forall(_.size == 1)) - // Fix double dimension array to one dimension, - // so it should be called always in spite of 'fixIndexesReferences' method. - ns.fixNote(n, idxsField → fixed.map(_.head).asJava.asInstanceOf[JSerializable]) - else - ok = false - case None ⇒ // No-op. - } - - ok && - ns.flatMap(_.getNotes(noteType)).forall(rel ⇒ - rel.dataOpt[JList[Int]](idxsField) match { - case Some(idxsList) ⇒ - val notesTypes = rel.data[JList[String]](noteField) - - require(idxsList.size() == notesTypes.size()) - - idxsList.asScala.zip(notesTypes.asScala).forall { - case (idxs, notesType) ⇒ checkRelation(ns, Seq(idxs), notesType, rel) - } - case None ⇒ true - } - ) - } - - /** - * Fixes tokens positions. - * - * @param ns Sentence. - * @param notNlpTypes Token types. - */ - private def collapseSentence(ns: NCNlpSentence, notNlpTypes: Seq[String]): Boolean = { - ns. - filter(!_.isNlp). - filter(_.isStopWord). - flatten. - filter(_.isNlp). - foreach(n ⇒ ns.fixNote(n, "stopWord" → false)) - - val all = ns.tokens.flatten - val nsNotes: Map[String, Seq[Int]] = all.map(p ⇒ p.noteType → p.tokenIndexes).toMap - - for ( - t ← ns.tokens; stopReason ← t.stopsReasons - if all.contains(stopReason) && nsNotes.getOrElse(stopReason.noteType, Seq.empty) == stopReason.tokenIndexes - ) - ns.fixNote(t.getNlpNote, "stopWord" → true) - - val history = mutable.ArrayBuffer.empty[(Int, Int)] - - fixNoteIndexes("nlpcraft:relation", "indexes", "note", ns) - fixNoteIndexes("nlpcraft:limit", "indexes", "note", ns) - fixNoteIndexesList("nlpcraft:sort", "subjindexes", "subjnotes", ns) - fixNoteIndexesList("nlpcraft:sort", "byindexes", "bynotes", ns) - - notNlpTypes.foreach(typ ⇒ zipNotes(ns, typ, notNlpTypes, history)) - unionStops(ns, notNlpTypes, history) - - val res = - fixIndexesReferences("nlpcraft:relation", "indexes", "note", ns, history) && - fixIndexesReferences("nlpcraft:limit", "indexes", "note", ns, history) && - fixIndexesReferencesList("nlpcraft:sort", "subjindexes", "subjnotes", ns, history) && - fixIndexesReferencesList("nlpcraft:sort", "byindexes", "bynotes", ns, history) - - if (res) { - // Validation (all indexes calculated well) - require( - !res || - !ns.flatten. - exists(n ⇒ ns.filter(_.wordIndexes.exists(n.wordIndexes.contains)).exists(t ⇒ !t.contains(n))), - s"Invalid sentence:\n" + - ns.map(t ⇒ - // Human readable invalid sentence for debugging. - s"${t.origText}{index:${t.index}}[${t.map(n ⇒ s"${n.noteType}, {range:${n.tokenFrom}-${n.tokenTo}}").mkString("|")}]" - ).mkString("\n") - ) - } - - res - } - - private def dropAbstract(mdl: NCModel, ns: NCNlpSentence): Unit = - if (!mdl.getAbstractTokens.isEmpty) { - val notes = ns.flatten - - val keys = getPartKeys(notes: _*) - val noteLinks = getLinks(notes) - - notes.filter(n ⇒ { - val noteToks = ns.tokens.filter(_.contains(n)) - - mdl.getAbstractTokens.contains(n.noteType) && - !keys.exists(_.intersect(n.noteType, noteToks.head.startCharIndex, noteToks.last.startCharIndex)) && - !noteLinks.contains(NoteLink(n.noteType, n.tokenIndexes.sorted)) - }).foreach(ns.removeNote) - } - - private def getNotNlpNotes(toks: Seq[NCNlpSentenceToken]): Seq[NCNlpSentenceNote] = - toks.flatten.filter(!_.isNlp).distinct - - private def addDeleted(thisSen: NCNlpSentence, sen: NCNlpSentence, dels: Iterable[NCNlpSentenceNote]): Unit = - sen.deletedNotes ++= dels.map(n ⇒ { - val savedDelNote = n.clone() - val savedDelToks = n.tokenIndexes.map(idx ⇒ thisSen(idx).clone()) - - val mainNotes = savedDelToks.flatten.filter(n ⇒ n.noteType != "nlpcraft:nlp" && n != savedDelNote) - - // Deleted note's tokens should contains only nlp data and deleted notes. - for (savedDelTok ← savedDelToks; mainNote ← mainNotes) - savedDelTok.remove(mainNote) - - savedDelNote → savedDelToks - }) - - /** - * This collapser handles several tasks: - * - "overall" collapsing after all other individual collapsers had their turn. - * - Special further enrichment of tokens like linking, etc. - * - * In all cases of overlap (full or partial) - the "longest" note wins. In case of overlap and equal - * lengths - the winning note is chosen based on this priority. - */ - @throws[NCE] - private def collapseSentence(thisSen: NCNlpSentence, mdl: NCModel, lastPhase: Boolean = false): Seq[NCNlpSentence] = { - def collapse0(ns: NCNlpSentence): Option[NCNlpSentence] = { - if (lastPhase) - dropAbstract(mdl, ns) - - if (collapseSentence(ns, getNotNlpNotes(ns).map(_.noteType).distinct)) Some(ns) else None - } - - // Always deletes `similar` notes. - // Some words with same note type can be detected various ways. - // We keep only one variant - with `best` direct and sparsity parameters, - // other variants for these words are redundant. - val redundant: Seq[NCNlpSentenceNote] = - thisSen.flatten.filter(!_.isNlp).distinct. - groupBy(_.getKey()). - map(p ⇒ p._2.sortBy(p ⇒ - ( - // System notes don't have such flags. - if (p.isUser) { - if (p.isDirect) - 0 - else - 1 - } - else - 0, - if (p.isUser) - p.sparsity - else - 0 - ) - )). - flatMap(_.drop(1)). - toSeq - - redundant.foreach(thisSen.removeNote) - - var delCombs: Seq[NCNlpSentenceNote] = - getNotNlpNotes(thisSen). - flatMap(note ⇒ getNotNlpNotes(note.tokenIndexes.sorted.map(i ⇒ thisSen(i))).filter(_ != note)). - distinct - - println("delCombs="+delCombs.mkString("\n")) - - // Optimization. Deletes all wholly swallowed notes. - val links = getLinks(thisSen.flatten) - - val swallowed = - delCombs. - // There aren't links on it. - filter(n ⇒ !links.contains(NoteLink(n.noteType, n.tokenIndexes.sorted))). - // It doesn't have links. - filter(getPartKeys(_).isEmpty). - flatMap(note ⇒ { - val noteWordsIdxs = note.wordIndexes.toSet - val key = PartKey(note, thisSen) - - val delCombOthers = - delCombs.filter(_ != note).flatMap(n ⇒ if (getPartKeys(n).contains(key)) Some(n) else None) - - if (delCombOthers.exists(o ⇒ noteWordsIdxs == o.wordIndexes.toSet)) Some(note) else None - }) - - delCombs = delCombs.filter(p ⇒ !swallowed.contains(p)) - addDeleted(thisSen, thisSen, swallowed) - swallowed.foreach(thisSen.removeNote) - - val toksByIdx: Seq[Seq[NCNlpSentenceNote]] = - delCombs.flatMap(note ⇒ note.wordIndexes.map(_ → note)). - groupBy { case (idx, _) ⇒ idx }. - map { case (_, seq) ⇒ seq.map { case (_, note) ⇒ note } }. - toSeq.sortBy(-_.size) - -// val toksByIdx1 = -// delCombs.flatMap(note ⇒ note.wordIndexes.map(_ → note)). -// groupBy { case (idx, _) ⇒ idx }. -// map { case (idx, seq) ⇒ idx → seq.map { case (_, note) ⇒ note } }. -// toSeq.sortBy(_._2.size) -// -// toksByIdx.foreach{ case (seq) ⇒ -// println(s"toksByIdx seq=${seq.map(i ⇒ s"${i.noteType} ${i.wordIndexes.mkString(",")}").mkString(" | ")}") -// } - -// toksByIdx1.sortBy(_._1).foreach{ case (i, seq) ⇒ -// println(s"toksByIdx1 ${i} seq=${seq.map(i ⇒ s"${i.noteType} ${i.wordIndexes.mkString(",")}").mkString(" | ")}") -// } - - val dict = mutable.HashMap.empty[String, NCNlpSentenceNote] - val dictBack = mutable.HashMap.empty[NCNlpSentenceNote, String] - - var i = 'A' - - val converted: Seq[Seq[String]] = - toksByIdx.map(seq ⇒ { - seq.map( - n ⇒ { - val s = - dictBack.get(n) match { - case Some(s) ⇒ s - case None ⇒ { - val s = s"$i" - - i = (i.toInt + 1).toChar - - dict += s → n - dictBack += n → s - - s - } - } - - s - } - ) - }) - - //val minDelSize = if (toksByIdx.isEmpty) 1 else toksByIdx.map(_.size).max - 1 - - var sens = - if (delCombs.nonEmpty) { - val p = new ForkJoinPool() - - val tmp = NCComboRecursiveTask.findCombinations( - toksByIdx.map(_.asJava).asJava, - p - ) - - p.shutdown() - - val seq1 = tmp.asScala.map(_.asScala) - - val sens = - seq1. - flatMap(p ⇒ { - val delComb: Seq[NCNlpSentenceNote] = p - - val nsClone = thisSen.clone() - - // Saves deleted notes for sentence and their tokens. - addDeleted(thisSen, nsClone, delComb) - delComb.foreach(nsClone.removeNote) - - // Has overlapped notes for some tokens. - require( - !nsClone.exists(_.count(!_.isNlp) > 1), - s"Invalid notes: ${nsClone.filter(_.count(!_.isNlp) > 1).mkString("|")}" - ) - - collapse0(nsClone) - }) - - // It removes sentences which have only one difference - 'direct' flag of their user tokens. - // `Direct` sentences have higher priority. - case class Key(sysNotes: Seq[Map[String, JSerializable]], userNotes: Seq[Map[String, JSerializable]]) - case class Value(sentence: NCNlpSentence, directCount: Int) - - val m = mutable.HashMap.empty[Key, Value] - - sens.map(sen ⇒ { - val notes = sen.flatten - - val sysNotes = notes.filter(_.isSystem) - val nlpNotes = notes.filter(_.isNlp) - val userNotes = notes.filter(_.isUser) - - def get(seq: Seq[NCNlpSentenceNote]): Seq[Map[String, JSerializable]] = - seq.map(p ⇒ - // We have to delete some keys to have possibility to compare sentences. - p.clone().filter(_._1 != "direct") - ) - - (Key(get(sysNotes), get(userNotes)), sen, nlpNotes.map(p ⇒ if (p.isDirect) 0 else 1).sum) - }). - foreach { case (key, sen, directCnt) ⇒ - m.get(key) match { - case Some(v) ⇒ - // Best sentence is sentence with `direct` synonyms. - if (v.directCount > directCnt) - m += key → Value(sen, directCnt) - case None ⇒ m += key → Value(sen, directCnt) - } - } - - m.values.map(_.sentence).toSeq - } - else - collapse0(thisSen).flatMap(p ⇒ Option(Seq(p))).getOrElse(Seq.empty) - - sens = sens.distinct - - sens.foreach(sen ⇒ - sen.foreach(tok ⇒ - tok.size match { - case 1 ⇒ require(tok.head.isNlp, s"Unexpected non-'nlpcraft:nlp' token: $tok") - case 2 ⇒ require(tok.head.isNlp ^ tok.last.isNlp, s"Unexpected token notes: $tok") - case _ ⇒ require(requirement = false, s"Unexpected token notes count: $tok") - } - ) - ) - - // Drops similar sentences (with same tokens structure). - // Among similar sentences we prefer one with minimal free words count. - sens.groupBy(_.flatten.filter(!_.isNlp).map(_.getKey(withIndexes = false))). - map { case (_, seq) ⇒ seq.minBy(_.filter(p ⇒ p.isNlp && !p.isStopWord).map(_.wordIndexes.length).sum) }. - toSeq - } } import org.apache.nlpcraft.common.nlp.NCNlpSentence._ @@ -975,7 +214,10 @@ class NCNlpSentence( */ def getDeletedNotes: Predef.Map[NCNlpSentenceNote, Seq[NCNlpSentenceToken]] = deletedNotes.toMap - def collapse(mdl: NCModel, lastPhase: Boolean = false): Seq[NCNlpSentence] = { - collapseSentence(this, mdl, lastPhase) - } + /*** + * + * @param deletedNotes + */ + def addDeletedNotes(deletedNotes: Map[NCNlpSentenceNote, Seq[NCNlpSentenceToken]]): Unit = + this.deletedNotes ++= deletedNotes } diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/SyTest.java b/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/SyTest.java deleted file mode 100644 index db5f657..0000000 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/SyTest.java +++ /dev/null @@ -1,241 +0,0 @@ -package org.apache.nlpcraft.common.nlp; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.Comparator; -import java.util.List; -import java.util.Set; -import java.util.TreeSet; -import java.util.concurrent.ForkJoinPool; -import java.util.concurrent.RecursiveTask; -import java.util.stream.IntStream; - -import static java.util.stream.Collectors.toList; -import static java.util.Arrays.asList; -import static java.util.stream.Collectors.toUnmodifiableList; - -public class SyTest { - public static class ComboSearch extends RecursiveTask<List<Long>> { - private static final long THRESHOLD = (long)Math.pow(2, 20); - - private final long lo; - - private final long hi; - - private final long[] wordBits; - - private final int[] wordCounts; - - public ComboSearch( - long lo, - long hi, - long[] words, - int[] wordCounts - ) { - this.lo = lo; - this.hi = hi; - this.wordBits = words; - this.wordCounts = wordCounts; - } - - public static <T> List<List<T>> findCombos(List<List<T>> inp, ForkJoinPool pool) { - List<List<T>> uniqueInp = inp.stream() - .filter(row -> inp.stream().noneMatch(it -> it != row && it.containsAll(row))) - .map(i -> i.stream().distinct().sorted().collect(toList())) - .collect(toList()); - - // Build dictionary of unique words. - List<T> dict = uniqueInp.stream() - .flatMap(Collection::stream) - .distinct() - .sorted() - .collect(toList()); - - System.out.println("uniqueInp="+uniqueInp.size()); - System.out.println("dict="+dict.size()); - - if (dict.size() > Long.SIZE) { - // Note: Power set of 64 words results in 9223372036854775807 combinations. - throw new IllegalArgumentException("Can handle more than " + Long.SIZE + " unique words in the dictionary."); - } - - // Convert words to bitmasks (each bit corresponds to an index in the dictionary). - long[] wordBits = uniqueInp.stream() - .sorted(Comparator.comparingInt(List::size)) - .mapToLong(row -> wordsToBits(row, dict)) - .toArray(); - - // Cache words count per row. - int[] wordCounts = uniqueInp.stream() - .sorted(Comparator.comparingInt(List::size)) - .mapToInt(List::size) - .toArray(); - - // Prepare Fork/Join task to iterate over the power set of all combinations. - int lo = 1; - long hi = (long)Math.pow(2, dict.size()); - - ComboSearch task = new ComboSearch( - lo, - hi, - wordBits, - wordCounts - ); - - return pool.invoke(task).stream() - .map(bits -> bitsToWords(bits, dict)) - .collect(toList()); - } - - @Override - protected List<Long> compute() { - if (hi - lo <= THRESHOLD) { - return computeLocal(); - } else { - return forkJoin(); - } - } - - private List<Long> computeLocal() { - List<Long> result = new ArrayList<>(); - - for (long comboBits = lo; comboBits < hi; comboBits++) { - boolean match = true; - - // For each input row we check if subtracting the current combination of words - // from the input row would give us the expected result. - for (int j = 0; j < wordBits.length; j++) { - // Get bitmask of how many words can be subtracted from the row. - long commonBits = wordBits[j] & comboBits; - - int wordsToRemove = Long.bitCount(commonBits); - - // Check if there is more than 1 word remaining after subtraction. - if (wordCounts[j] - wordsToRemove > 1) { - // Skip this combination. - match = false; - - break; - } - } - - if (match && !includes(comboBits, result)) { - result.add(comboBits); - } - } - - return result; - } - - private List<Long> forkJoin() { - long mid = lo + hi >>> 1L; - - ComboSearch t1 = new ComboSearch(lo, mid, wordBits, wordCounts); - ComboSearch t2 = new ComboSearch(mid, hi, wordBits, wordCounts); - - t2.fork(); - - return merge(t1.compute(), t2.join()); - } - - private List<Long> merge(List<Long> l1, List<Long> l2) { - if (l1.isEmpty()) { - return l2; - } else if (l2.isEmpty()) { - return l1; - } - - int size1 = l1.size(); - int size2 = l2.size(); - - if (size1 == 1 && size2 > 1 || size2 == 1 && size1 > 1) { - // Minor optimization in case if one of the lists has only one element. - List<Long> list = size1 == 1 ? l2 : l1; - Long val = size1 == 1 ? l1.get(0) : l2.get(0); - - if (!includes(val, list)) { - list.add(val); - } - - return list; - } else { - List<Long> result = new ArrayList<>(size1 + size2); - - for (int i = 0, max = Math.max(size1, size2); i < max; i++) { - Long v1 = i < size1 ? l1.get(i) : null; - Long v2 = i < size2 ? l2.get(i) : null; - - if (v1 != null && v2 != null) { - if (containsAllBits(v1, v2)) { - v1 = null; - } else if (containsAllBits(v2, v1)) { - v2 = null; - } - } - - if (v1 != null && !includes(v1, result)) { - result.add(v1); - } - - if (v2 != null && !includes(v2, result)) { - result.add(v2); - } - } - - return result; - } - } - - private static boolean includes(long bits, List<Long> allBits) { - for (long existing : allBits) { - if (containsAllBits(bits, existing)) { - return true; - } - } - - return false; - } - - private static boolean containsAllBits(long bitSet1, long bitSet2) { - return (bitSet1 & bitSet2) == bitSet2; - } - - private static <T> long wordsToBits(List<T> words, List<T> dict) { - long bits = 0; - - for (int i = 0; i < dict.size(); i++) { - if (words.contains(dict.get(i))) { - bits |= 1L << i; - } - } - - return bits; - } - - private static <T> List<T> bitsToWords(long bits, List<T> dict) { - List<T> words = new ArrayList<>(Long.bitCount(bits)); - - for (int i = 0; i < dict.size(); i++) { - if ((bits & 1L << i) != 0) { - words.add(dict.get(i)); - } - } - - return words; - } - } - - public static void main(String[] args) throws InterruptedException { - List<List<String>> words = IntStream.range(0, 35) - .mapToObj(i -> IntStream.range(0, i + 1).mapToObj(String::valueOf).collect(toUnmodifiableList())) - .collect(toUnmodifiableList()); - - long t = System.currentTimeMillis(); - - ForkJoinPool forkJoinPool = new ForkJoinPool(); - final List<List<String>> combos = ComboSearch.findCombos(words, forkJoinPool); - - System.out.println("size=" + combos.size()); - System.out.println("time=" + (System.currentTimeMillis() - t)); - } -} \ No newline at end of file diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala index 19308e3..da13b07 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/NCProbeBoot.scala @@ -49,6 +49,7 @@ import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.sort.NCSortEnricher import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.stopword.NCStopWordEnricher import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.suspicious.NCSuspiciousNounsEnricher import org.apache.nlpcraft.probe.mgrs.nlp.validate.NCValidateManager +import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager import resource.managed import java.io._ @@ -512,6 +513,7 @@ private [probe] object NCProbeBoot extends LazyLogging with NCOpenCensusTrace { startedMgrs += NCProbeEnrichmentManager.start(span) startedMgrs += NCConnectionManager.start(span) startedMgrs += NCDialogFlowManager.start(span) + startedMgrs += NCSentenceManager.start(span) } } diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala index 368f0c4..c328e57 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/NCProbeEnrichmentManager.scala @@ -43,6 +43,7 @@ import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.stopword.NCStopWordEnricher import org.apache.nlpcraft.probe.mgrs.nlp.enrichers.suspicious.NCSuspiciousNounsEnricher import org.apache.nlpcraft.probe.mgrs.nlp.impl._ import org.apache.nlpcraft.probe.mgrs.nlp.validate._ +import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager import org.apache.nlpcraft.probe.mgrs.{NCProbeMessage, NCProbeVariants} import java.io.Serializable @@ -500,7 +501,7 @@ object NCProbeEnrichmentManager extends NCService with NCOpenCensusModelStats { s"]") } - nlpSen.clone().collapse(mdl.model, lastPhase = true). + NCSentenceManager.collapse(mdl.model, nlpSen.clone(), lastPhase = true). // Sorted to support deterministic logs. sortBy(p ⇒ p.map(p ⇒ { diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala index 2a9dec0..82edf69 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCModelEnricher.scala @@ -24,6 +24,7 @@ import org.apache.nlpcraft.model._ import org.apache.nlpcraft.probe.mgrs.NCProbeSynonymChunkKind.{NCSynonymChunkKind, TEXT} import org.apache.nlpcraft.probe.mgrs.nlp.NCProbeEnricher import org.apache.nlpcraft.probe.mgrs.nlp.impl.NCRequestImpl +import org.apache.nlpcraft.probe.mgrs.sentence.NCSentenceManager import org.apache.nlpcraft.probe.mgrs.{NCProbeModel, NCProbeSynonym, NCProbeVariants} import java.io.Serializable @@ -355,7 +356,11 @@ object NCModelEnricher extends NCProbeEnricher with DecorateAsScala { toks.map(t ⇒ (t.origText, t.index)).mkString(" ") var permCnt = 0 - lazy val collapsedSens = NCProbeVariants.convert(ns.srvReqId, mdl, ns.clone().collapse(mdl.model)).map(_.asScala) + lazy val collapsedSens = NCProbeVariants.convert( + ns.srvReqId, + mdl, + NCSentenceManager.collapse(mdl.model, ns.clone()) + ).map(_.asScala) /** * diff --git a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala similarity index 68% copy from nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala copy to nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala index 95a98a3..e0dd59c 100644 --- a/nlpcraft/src/main/scala/org/apache/nlpcraft/common/nlp/NCNlpSentence.scala +++ b/nlpcraft/src/main/scala/org/apache/nlpcraft/probe/mgrs/sentence/NCSentenceManager.scala @@ -15,29 +15,30 @@ * limitations under the License. */ -package org.apache.nlpcraft.common.nlp +package org.apache.nlpcraft.probe.mgrs.sentence -import com.typesafe.scalalogging.LazyLogging -import org.apache.nlpcraft.common.NCE +import io.opencensus.trace.Span +import org.apache.nlpcraft.common.nlp.NCNlpSentence.NoteLink import org.apache.nlpcraft.common.nlp.pos.NCPennTreebank -import org.apache.nlpcraft.common.util.NCComboRecursiveTask +import org.apache.nlpcraft.common.nlp.{NCNlpSentence, NCNlpSentenceNote, NCNlpSentenceToken} +import org.apache.nlpcraft.common.{NCE, NCService, U} import org.apache.nlpcraft.model.NCModel -import java.io.{Serializable => JSerializable} +import java.io.{Serializable ⇒ JSerializable} import java.util -import java.util.concurrent.ForkJoinPool -import java.util.{Collections, Comparator, List => JList} -import scala.collection.JavaConverters._ +import java.util.{List ⇒ JList} +import scala.collection.JavaConverters.{asScalaBufferConverter, _} import scala.collection.mutable.ArrayBuffer import scala.collection.{Map, Seq, Set, mutable} import scala.language.implicitConversions -object NCNlpSentence extends LazyLogging { - implicit def toTokens(x: NCNlpSentence): ArrayBuffer[NCNlpSentenceToken] = x.tokens +/** + * Sentences processing manager. + */ +object NCSentenceManager extends NCService { + @volatile private var pool: java.util.concurrent.ForkJoinPool = _ - case class NoteKey(start: Int, end: Int) - case class TokenKey(id: String, start: Int, end: Int) - case class NoteLink(note: String, indexes: Seq[Int]) + implicit def toTokens(x: NCNlpSentence): ArrayBuffer[NCNlpSentenceToken] = x.tokens case class PartKey(id: String, start: Int, end: Int) { require(start <= end) @@ -73,7 +74,7 @@ object NCNlpSentence extends LazyLogging { noteLinks ++= (for ((name, idxs) ← names.asScala.zip(idxsSeq.asScala.map(_.asScala))) yield NoteLink(name, idxs.sorted) - ) + ) } if (n.contains("subjnotes")) add("subjnotes", "subjindexes") @@ -462,8 +463,8 @@ object NCNlpSentence extends LazyLogging { } if (fixed.forall(_.size == 1)) - // Fix double dimension array to one dimension, - // so it should be called always in spite of 'fixIndexesReferences' method. + // Fix double dimension array to one dimension, + // so it should be called always in spite of 'fixIndexesReferences' method. ns.fixNote(n, idxsField → fixed.map(_.head).asJava.asInstanceOf[JSerializable]) else ok = false @@ -521,9 +522,9 @@ object NCNlpSentence extends LazyLogging { val res = fixIndexesReferences("nlpcraft:relation", "indexes", "note", ns, history) && - fixIndexesReferences("nlpcraft:limit", "indexes", "note", ns, history) && - fixIndexesReferencesList("nlpcraft:sort", "subjindexes", "subjnotes", ns, history) && - fixIndexesReferencesList("nlpcraft:sort", "byindexes", "bynotes", ns, history) + fixIndexesReferences("nlpcraft:limit", "indexes", "note", ns, history) && + fixIndexesReferencesList("nlpcraft:sort", "subjindexes", "subjnotes", ns, history) && + fixIndexesReferencesList("nlpcraft:sort", "byindexes", "bynotes", ns, history) if (res) { // Validation (all indexes calculated well) @@ -562,7 +563,7 @@ object NCNlpSentence extends LazyLogging { toks.flatten.filter(!_.isNlp).distinct private def addDeleted(thisSen: NCNlpSentence, sen: NCNlpSentence, dels: Iterable[NCNlpSentenceNote]): Unit = - sen.deletedNotes ++= dels.map(n ⇒ { + sen.addDeletedNotes(dels.map(n ⇒ { val savedDelNote = n.clone() val savedDelToks = n.tokenIndexes.map(idx ⇒ thisSen(idx).clone()) @@ -573,7 +574,7 @@ object NCNlpSentence extends LazyLogging { savedDelTok.remove(mainNote) savedDelNote → savedDelToks - }) + }).toMap) /** * This collapser handles several tasks: @@ -597,27 +598,27 @@ object NCNlpSentence extends LazyLogging { // We keep only one variant - with `best` direct and sparsity parameters, // other variants for these words are redundant. val redundant: Seq[NCNlpSentenceNote] = - thisSen.flatten.filter(!_.isNlp).distinct. - groupBy(_.getKey()). - map(p ⇒ p._2.sortBy(p ⇒ - ( - // System notes don't have such flags. - if (p.isUser) { - if (p.isDirect) - 0 - else - 1 - } - else - 0, - if (p.isUser) - p.sparsity - else + thisSen.flatten.filter(!_.isNlp).distinct. + groupBy(_.getKey()). + map(p ⇒ p._2.sortBy(p ⇒ + ( + // System notes don't have such flags. + if (p.isUser) { + if (p.isDirect) 0 - ) - )). - flatMap(_.drop(1)). - toSeq + else + 1 + } + else + 0, + if (p.isUser) + p.sparsity + else + 0 + ) + )). + flatMap(_.drop(1)). + toSeq redundant.foreach(thisSen.removeNote) @@ -626,8 +627,6 @@ object NCNlpSentence extends LazyLogging { flatMap(note ⇒ getNotNlpNotes(note.tokenIndexes.sorted.map(i ⇒ thisSen(i))).filter(_ != note)). distinct - println("delCombs="+delCombs.mkString("\n")) - // Optimization. Deletes all wholly swallowed notes. val links = getLinks(thisSen.flatten) @@ -651,89 +650,55 @@ object NCNlpSentence extends LazyLogging { addDeleted(thisSen, thisSen, swallowed) swallowed.foreach(thisSen.removeNote) - val toksByIdx: Seq[Seq[NCNlpSentenceNote]] = + val toksByIdx: Seq[Set[NCNlpSentenceNote]] = delCombs.flatMap(note ⇒ note.wordIndexes.map(_ → note)). groupBy { case (idx, _) ⇒ idx }. - map { case (_, seq) ⇒ seq.map { case (_, note) ⇒ note } }. + map { case (_, seq) ⇒ seq.map { case (_, note) ⇒ note }.toSet }. toSeq.sortBy(-_.size) -// val toksByIdx1 = -// delCombs.flatMap(note ⇒ note.wordIndexes.map(_ → note)). -// groupBy { case (idx, _) ⇒ idx }. -// map { case (idx, seq) ⇒ idx → seq.map { case (_, note) ⇒ note } }. -// toSeq.sortBy(_._2.size) -// -// toksByIdx.foreach{ case (seq) ⇒ -// println(s"toksByIdx seq=${seq.map(i ⇒ s"${i.noteType} ${i.wordIndexes.mkString(",")}").mkString(" | ")}") -// } - -// toksByIdx1.sortBy(_._1).foreach{ case (i, seq) ⇒ -// println(s"toksByIdx1 ${i} seq=${seq.map(i ⇒ s"${i.noteType} ${i.wordIndexes.mkString(",")}").mkString(" | ")}") -// } - - val dict = mutable.HashMap.empty[String, NCNlpSentenceNote] - val dictBack = mutable.HashMap.empty[NCNlpSentenceNote, String] - - var i = 'A' - - val converted: Seq[Seq[String]] = - toksByIdx.map(seq ⇒ { - seq.map( - n ⇒ { - val s = - dictBack.get(n) match { - case Some(s) ⇒ s - case None ⇒ { - val s = s"$i" - - i = (i.toInt + 1).toChar - - dict += s → n - dictBack += n → s - - s - } - } - - s - } - ) - }) - - //val minDelSize = if (toksByIdx.isEmpty) 1 else toksByIdx.map(_.size).max - 1 +// val minDelSize = if (toksByIdx.isEmpty) 1 else toksByIdx.map(_.size).max - 1 var sens = if (delCombs.nonEmpty) { - val p = new ForkJoinPool() - - val tmp = NCComboRecursiveTask.findCombinations( - toksByIdx.map(_.asJava).asJava, - p - ) - - p.shutdown() - - val seq1 = tmp.asScala.map(_.asScala) + val deleted = mutable.ArrayBuffer.empty[Set[NCNlpSentenceNote]] + +// val combs = (minDelSize to delCombs.size). +// flatMap(i ⇒ +// delCombs.combinations(i). +// filter(delComb ⇒ +// !toksByIdx.exists( +// rec ⇒ +// rec.size - delCombs.size <= 1 && +// rec.count(note ⇒ !delComb.contains(note)) > 1 +// ) +// ) +// ). +// sortBy(_.size). +// map(_.toSet) +// + val combs = NCComboHelper.findCombinations(toksByIdx.map(_.asJava).asJava, pool).asScala.map(_.asScala) val sens = - seq1. - flatMap(p ⇒ { - val delComb: Seq[NCNlpSentenceNote] = p + combs. + flatMap(delComb ⇒ + // Already processed with less subset of same deleted tokens. + if (!deleted.exists(_.subsetOf(delComb.toSet))) { + val nsClone = thisSen.clone() - val nsClone = thisSen.clone() + // Saves deleted notes for sentence and their tokens. + addDeleted(thisSen, nsClone, delComb) + delComb.foreach(nsClone.removeNote) - // Saves deleted notes for sentence and their tokens. - addDeleted(thisSen, nsClone, delComb) - delComb.foreach(nsClone.removeNote) + // Has overlapped notes for some tokens. + require(!nsClone.exists(_.count(!_.isNlp) > 1)) - // Has overlapped notes for some tokens. - require( - !nsClone.exists(_.count(!_.isNlp) > 1), - s"Invalid notes: ${nsClone.filter(_.count(!_.isNlp) > 1).mkString("|")}" - ) + deleted += delComb.toSet - collapse0(nsClone) - }) + collapse0(nsClone) + } + else + None + ) // It removes sentences which have only one difference - 'direct' flag of their user tokens. // `Direct` sentences have higher priority. @@ -790,192 +755,23 @@ object NCNlpSentence extends LazyLogging { map { case (_, seq) ⇒ seq.minBy(_.filter(p ⇒ p.isNlp && !p.isStopWord).map(_.wordIndexes.length).sum) }. toSeq } -} - -import org.apache.nlpcraft.common.nlp.NCNlpSentence._ -/** - * Parsed NLP sentence is a collection of tokens. Each token is a collection of notes and - * each note is a collection of KV pairs. - * - * @param srvReqId Server request ID. - * @param text Normalized text. - * @param enabledBuiltInToks Enabled built-in tokens. - * @param tokens Initial buffer. - * @param deletedNotes Deleted overridden notes with their tokens. - */ -class NCNlpSentence( - val srvReqId: String, - val text: String, - val enabledBuiltInToks: Set[String], - override val tokens: mutable.ArrayBuffer[NCNlpSentenceToken] = new mutable.ArrayBuffer[NCNlpSentenceToken](32), - private val deletedNotes: mutable.HashMap[NCNlpSentenceNote, Seq[NCNlpSentenceToken]] = mutable.HashMap.empty, - private var initNlpNotes: Map[NoteKey, NCNlpSentenceNote] = null, - private val nlpTokens: mutable.HashMap[TokenKey, NCNlpSentenceToken] = mutable.HashMap.empty -) extends NCNlpSentenceTokenBuffer(tokens) with JSerializable { - @transient - private var hash: java.lang.Integer = _ - - private def calcHash(): Int = - Seq(srvReqId, text, enabledBuiltInToks, tokens).map(_.hashCode()).foldLeft(0)((a, b) ⇒ 31 * a + b) - - // Deep copy. - override def clone(): NCNlpSentence = - new NCNlpSentence( - srvReqId, - text, - enabledBuiltInToks, - tokens.map(_.clone()), - deletedNotes.map(p ⇒ p._1.clone() → p._2.map(_.clone())), - initNlpNotes = initNlpNotes - ) + override def start(parent: Span): NCService = { + ackStarting() - /** - * Utility method that gets set of notes for given note type collected from - * tokens in this sentence. Notes are sorted in the same order they appear - * in this sentence. - * - * @param noteType Note type. - */ - def getNotes(noteType: String): Seq[NCNlpSentenceNote] = this.flatMap(_.getNotes(noteType)).distinct + pool = new java.util.concurrent.ForkJoinPool() - /** - * Utility method that removes note with given ID from all tokens in this sentence. - * No-op if such note wasn't found. - * - * @param note Note. - */ - def removeNote(note: NCNlpSentenceNote): Unit = this.foreach(_.remove(note)) - - //noinspection HashCodeUsesVar - override def hashCode(): Int = { - if (hash == null) - hash = calcHash() - - hash + ackStarted() } - def fixNote(note: NCNlpSentenceNote, kvs: (String, JSerializable)*): Unit = { - val fixed = note.clone(kvs: _*) + override def stop(parent: Span): Unit = { + ackStopping() - this.filter(t ⇒ t.index >= fixed.tokenIndexes.head && t.index <= fixed.tokenIndexes.last).foreach(t ⇒ { - t.remove(note) - t.add(fixed) - }) + U.shutdownPool(pool) - hash = null + ackStopped() } - /** - * Returns flag are note notes equal (or similar) or not. Reason of ignored difference can be stopwords tokens. - * - * @param n1 First note. - * @param n2 Second note. - */ - def notesEqualOrSimilar(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote): Boolean = - if (n1.noteType != n2.noteType) - false - else { - val stopIdxs = this.filter(_.isStopWord).map(_.index) - - // One possible difference - stopwords indexes. - def wordsEqualOrSimilar0(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote): Boolean = { - val set1 = n1.wordIndexes.toSet - val set2 = n2.wordIndexes.toSet - - set1 == set2 || set1.subsetOf(set2) && set2.diff(set1).forall(stopIdxs.contains) - } - - def wordsEqualOrSimilar(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote): Boolean = - wordsEqualOrSimilar0(n1, n2) || wordsEqualOrSimilar0(n2, n1) - - def tokensEqualOrSimilar0(set1: Set[NCNlpSentenceToken], set2: Set[NCNlpSentenceToken]): Boolean = - set1 == set2 || set1.subsetOf(set2) && set2.diff(set1).forall(_.isStopWord) - - def tokensEqualOrSimilar(set1: Set[NCNlpSentenceToken], set2: Set[NCNlpSentenceToken]): Boolean = - tokensEqualOrSimilar0(set1, set2) || tokensEqualOrSimilar0(set2, set1) - - def getList(n: NCNlpSentenceNote, refIdxName: String): Set[NCNlpSentenceToken] = - n.getOrElse(refIdxName, Collections.emptyList).asInstanceOf[JList[Int]].asScala. - map(this (_)).toSet - - def getListList(n: NCNlpSentenceNote, refIdxName: String): Set[NCNlpSentenceToken] = - n.getOrElse(refIdxName, Collections.emptyList).asInstanceOf[JList[JList[Int]]].asScala. - flatMap(_.asScala.map(this (_))).toSet - - def referencesEqualOrSimilar0(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote): Boolean = { - require(n1.noteType == n2.noteType) - - n1.noteType match { - case "nlpcraft:sort" ⇒ - tokensEqualOrSimilar(getListList(n1, "subjindexes"), getListList(n2, "subjindexes")) && - tokensEqualOrSimilar(getListList(n1, "byindexes"), getListList(n2, "byindexes")) - case "nlpcraft:limit" ⇒ - tokensEqualOrSimilar(getList(n1, "indexes"), getList(n2, "indexes")) - case "nlpcraft:reference" ⇒ - tokensEqualOrSimilar(getList(n1, "indexes"), getList(n2, "indexes")) - - case _ ⇒ true - } - } - - def referencesEqualOrSimilar(n1: NCNlpSentenceNote, n2: NCNlpSentenceNote): Boolean = - referencesEqualOrSimilar0(n1, n2) || referencesEqualOrSimilar0(n2, n1) - - def getUniqueKey0(n: NCNlpSentenceNote): Seq[Any] = n.getKey(withIndexes = false, withReferences = false) - - getUniqueKey0(n1) == getUniqueKey0(n2) && wordsEqualOrSimilar(n1, n2) && referencesEqualOrSimilar(n1, n2) - } - - override def equals(obj: Any): Boolean = obj match { - case x: NCNlpSentence ⇒ - tokens == x.tokens && - srvReqId == x.srvReqId && - text == x.text && - enabledBuiltInToks == x.enabledBuiltInToks - - case _ ⇒ false - } - - /** - * - */ - def saveNlpNotes(): Unit = - initNlpNotes = this.map(t ⇒ NoteKey(t.startCharIndex, t.endCharIndex) → t.getNlpNote).toMap - - /** - * - * @return - */ - def findInitialNlpNote(startCharIndex: Int, endCharIndex: Int): Option[NCNlpSentenceNote] = - initNlpNotes.get(NoteKey(startCharIndex, endCharIndex)) - - /** - * - * @param nlp - */ - def addNlpToken(nlp: NCNlpSentenceToken): Unit = { - require(nlp.size <= 2) - - nlp.foreach(n ⇒ nlpTokens += TokenKey(n.noteType, nlp.startCharIndex, nlp.endCharIndex) → nlp) - } - - /** - * - * @param noteType - * @param startCharIndex - * @param endCharIndex - * @return - */ - def findNlpToken(noteType: String, startCharIndex: Int, endCharIndex: Int): Option[NCNlpSentenceToken] = - nlpTokens.get(TokenKey(noteType, startCharIndex, endCharIndex)) - - /** - * - */ - def getDeletedNotes: Predef.Map[NCNlpSentenceNote, Seq[NCNlpSentenceToken]] = deletedNotes.toMap - - def collapse(mdl: NCModel, lastPhase: Boolean = false): Seq[NCNlpSentence] = { - collapseSentence(this, mdl, lastPhase) - } + def collapse(mdl: NCModel, sen: NCNlpSentence, lastPhase: Boolean = false): Seq[NCNlpSentence] = + collapseSentence(sen, mdl, lastPhase) } diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/SyTest.java b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/SyTest.java deleted file mode 100644 index 69d9ab4..0000000 --- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/SyTest.java +++ /dev/null @@ -1,278 +0,0 @@ -package org.apache.nlpcraft.probe.mgrs.nlp.enrichers; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.Comparator; -import java.util.HashSet; -import java.util.List; -import java.util.NavigableSet; -import java.util.Optional; -import java.util.Set; -import java.util.TreeSet; -import java.util.concurrent.CopyOnWriteArrayList; -import java.util.concurrent.CountDownLatch; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.TimeUnit; -import java.util.stream.Collectors; - -import static java.util.Arrays.asList; -import static java.util.stream.Collectors.toList; -import static java.util.stream.Collectors.toSet; - -public class SyTest { - public static void main(String[] args) { -// List<List<String>> words = asList( -// asList("A", "B", "C"), -// asList("B", "C", "D"), -// asList("B", "D") -// ); - List<List<String>> words = asList( - asList("A", "B"), - asList("C", "B"), - asList("D", "E"), - asList("D", "F"), - asList("G", "H"), - asList("I", "H"), - asList("J", "K"), - asList("L", "K"), - asList("M", "N"), - asList("M", "O"), - asList("P", "Q"), - asList("P", "R"), - asList("S", "T"), - asList("S", "U"), - asList("V", "W"), - asList("X", "W") - , - asList("Y", "Z"), - asList("A1", "A2"), - asList("A3", "A3"), - asList("A4", "A5", "A6") - ); - - System.out.println( - "Dictionary size:" - + words.stream() - .flatMap(Collection::stream) - .distinct() - .count() - ); - - System.out.println("===== Performance ====="); - - for (int i = 0; i < 1; i++) { - long t = System.currentTimeMillis(); - - Set<Set<String>> combos = findCombos(words); - - - - System.out.println("Iteration " + i + " Time: " + (System.currentTimeMillis() - t) + ", resCnt=" + combos.size()); - } - - if (true) { - return; - } - - Set<Set<String>> combos = findCombos(words); - - System.out.println(); - System.out.println("===== Result ====="); - System.out.println("Total combos: " + combos.size()); - System.out.println(); -// combos.stream() -// .sorted(Comparator.comparing(Collection::size)) -// .forEach(combo -> -// print(words, combo) -// ); - } - - public static <T extends Comparable<T>> Set<Set<T>> findCombos(List<List<T>> inp) { - - - List<List<T>> uniqueInp = inp.stream() - .filter(row -> inp.stream().noneMatch(it -> it != row && it.containsAll(row))) - .map(i -> i.stream().distinct().sorted().collect(toList())) - .collect(toList()); - - // Build dictionary of unique words. - List<T> dict = uniqueInp.stream() - .flatMap(Collection::stream) - .distinct() - .sorted() - .collect(toList()); - - if (dict.size() > Integer.SIZE) { - // Note: Power set of 32 words results in 4294967296 combinations. - throw new IllegalArgumentException("Can handle more than " + Integer.SIZE + " unique words in the dictionary."); - } - - // Convert words to bitmasks (each bit corresponds to an index in the dictionary). - int[] wordBits = uniqueInp.stream() - .sorted(Comparator.comparingInt(List::size)) - .mapToInt(row -> wordsToBits(row, dict)) - .toArray(); - - // Cache words count per row. - int[] wordCounts = uniqueInp.stream() - .sorted(Comparator.comparingInt(List::size)) - .mapToInt(List::size) - .toArray(); - - int min = 1; - int max = (int)Math.pow(2, dict.size()) - 1; - - int batchFactor = 100; - int threads = 13; - - ExecutorService pool = Executors.newFixedThreadPool(threads); - CountDownLatch cdl = new CountDownLatch(batchFactor); - - int divRes = max / batchFactor; - int divRest = max % batchFactor; - - int to = 0; - - List<Integer> result = new CopyOnWriteArrayList<>(); - - for (int k = 0; k < batchFactor; k++) { - to += divRes; - - if (k == divRes - 1) { - to += divRest; - } - - int toFinal = to; - int fromFinal = min + k * divRes; - - pool.execute( - () -> { - List<Integer> locRes = new ArrayList<>(); - - for (int comboBits = fromFinal; comboBits < toFinal; comboBits++) { - boolean match = true; - - // For each input row we check if subtracting the current combination of words - // from the input row would give us the expected result. - for (int j = 0; j < wordBits.length; j++) { - // Get bitmask of how many words can be subtracted from the row. - int commonBits = wordBits[j] & comboBits; - - int wordsToRemove = Integer.bitCount(commonBits); - - // Check if there are more than 1 word remaining after subtraction. - if (wordCounts[j] - wordsToRemove > 1) { - // Skip this combination. - match = false; - - break; - } - } - - if (match && !includes(comboBits, locRes)) { - locRes.add(comboBits); - } - } - - result.addAll(locRes); - - cdl.countDown(); - } - ); - } - -// Iterate over the power set. - - //pool.shutdown(); - try { - cdl.await(Long.MAX_VALUE, TimeUnit.MILLISECONDS); - //pool.awaitTermination(Long.MAX_VALUE, TimeUnit.MILLISECONDS); - } catch (InterruptedException e) { - e.printStackTrace(); - } - - // Convert found results from bitmasks back to words. - TreeSet<Set<T>> treeSet = new TreeSet<>(Comparator.comparingInt(Set::size)); - - treeSet.addAll(result.stream().map(bits -> bitsToWords(bits, dict)).collect(toSet())); - - Set<Set<T>> normCombs = new HashSet<>(); - - for (Set<T> set : treeSet) { - boolean b = true; - - for (Set<T> added : normCombs) { - if (added.containsAll(set)) { - b = false; - - break; - } - } - - if (b) { - normCombs.add(set); - } - } - - return normCombs; - - } - - private static <T> Set<Set<T>> squeeze(Set<Set<T>> combs) { - Set<Set<T>> normCombs = new HashSet<>(); - - combs.stream().sorted(Comparator.comparingInt(Set::size)).forEach(comb -> { - // Skips already added shorter variants. - if (normCombs.stream().filter(comb::containsAll).findAny().isEmpty()) { - normCombs.add(comb); - } - }); - - return normCombs; - } - - - private static boolean includes(int bits, List<Integer> allBits) { - for (int existing : allBits) { - if ((bits & existing) == existing) { - return true; - } - } - - return false; - } - - private static <T> int wordsToBits(List<T> words, List<T> dict) { - int bits = 0; - - for (int i = 0; i < dict.size(); i++) { - if (words.contains(dict.get(i))) { - bits |= 1 << i; - } - } - - return bits; - } - - private static <T> Set<T> bitsToWords(int bits, List<T> dict) { - Set<T> words = new HashSet<>(Integer.bitCount(bits)); - - for (int i = 0; i < dict.size(); i++) { - if ((bits & 1 << i) != 0) { - words.add(dict.get(i)); - } - } - - return words; - } - - private static void print(List<List<String>> inp, List<String> combo) { - System.out.println("==== " + combo + "(" + combo.size() + ')'); - inp.stream().forEach(row -> { - Set<String> s = new TreeSet<>(row); - s.removeAll(combo); - System.out.println(s); - }); - } -} diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala index afdeaab..b354533 100644 --- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala +++ b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/NCEnricherNestedModelSpec4.scala @@ -48,6 +48,6 @@ class NCNestedTestModel4 extends NCModelAdapter( */ @NCTestEnvironment(model = classOf[NCNestedTestModel4], startClient = true) class NCEnricherNestedModelSpec4 extends NCTestContext { - @Test1 + @Test def test(): Unit = checkIntent("the a " * 8, "onE2") } \ No newline at end of file diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/Test1.java b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/Test1.java deleted file mode 100644 index 398e858..0000000 --- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/Test1.java +++ /dev/null @@ -1,135 +0,0 @@ -package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model; - -import com.google.common.collect.ImmutableSet; - -import java.util.Arrays; -import java.util.Collection; -import java.util.Comparator; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -public class Test1 { - private static List<Set<String>> ROWS = - Arrays.asList( - ImmutableSet.of("A", "B", "C"), - ImmutableSet.of("B", "C", "D"), - ImmutableSet.of("B", "D") - ); - -// // Uncomment it. Works too long time. Normalized result is 256. -// private static List<Set<String>> ROWS = Arrays.asList( -// ImmutableSet.of("A", "B"), -// ImmutableSet.of("C", "B"), -// ImmutableSet.of("D", "E"), -// ImmutableSet.of("D", "F"), -// ImmutableSet.of("G", "H"), -// ImmutableSet.of("I", "H"), -// ImmutableSet.of("J", "K"), -// ImmutableSet.of("L", "K"), -// ImmutableSet.of("M", "N"), -// ImmutableSet.of("M", "O"), -// ImmutableSet.of("P", "Q"), -// ImmutableSet.of("P", "R"), -// ImmutableSet.of("S", "T"), -// ImmutableSet.of("S", "U"), -// ImmutableSet.of("V", "W"), -// ImmutableSet.of("X", "W") -// ); - - private static Set<String> ALL = ROWS.stream().flatMap(Collection::stream).collect(Collectors.toSet()); - - // Goal: Find minimal set of combinations with following feature. - // After removing combination values from each row - list should contain rows with size <= 1. - - // Expected solution: [C, B], [A, C, D], [A, B, D] - // Example: - // list - [C, B] = {{A}, {D}, {D}} - // list - [A, C, D] = {{B}, {B}, {B}} - // list - [A, B, D] = {{C}, {C}, {null}} - - - // Additional. Redundant solutions: [A, B, C] ([C, B] enough), [A, B, C, D] ([A, C, D] enough) etc - - // Easiest. - public static void main(String[] args) { - long t = System.currentTimeMillis(); - - System.out.println("1. start [time=" + (System.currentTimeMillis() - t) + ']'); - - // 1. Extends. - List<Set<String>> extRows = extendNulls(); - - // 2. All valid rows (permutation) - // Or manually permute, like https://stackoverflow.com/questions/17192796/generate-all-combinations-from-multiple-lists - Set<List<String>> allSingleOrNullRows = com.google.common.collect.Sets.cartesianProduct(extRows); - - System.out.println("2. permuted [size=" + allSingleOrNullRows.size() + ", time=" + (System.currentTimeMillis() - t) + ']'); - - // 3. Collects all suitable combinations. - Set<Set<String>> combs = - allSingleOrNullRows. - stream(). - // Calculates how that single or empty lines can be constructed (it is required combination). - map(row -> { - Set<String> copy = new HashSet<>(ALL); - - copy.removeAll(row); - - return copy; - }). - distinct(). - filter(Test1::isSuitable). - collect(Collectors.toSet()); - - System.out.println("3. calculated [size=" + combs.size() + ", time=" + (System.currentTimeMillis() - t) + ']'); - - // 3. Normalize variants (keeps only minimal valid subsets, see task description) - Set<Set<String>> normCombs = squeeze(combs); - - System.out.println("4. normalized [size=" + normCombs.size() + ", time=" + (System.currentTimeMillis() - t) + ']'); - System.out.println("Norm results:" + normCombs); - } - - /** - * Removes `candidate` from each row of ROWS. - * Return true if result list doesn't contain any row with size > 1. - * <p> - * If ROWS is {{a, b}, {a, c}}. Candidate {a, b} - ok, candidate {a} - ok, candidate {b} - no. - */ - private static boolean isSuitable(Set<String> candidate) { - for (Set<String> row : ROWS) { - Set<String> copy = new HashSet<>(row); - - copy.removeAll(candidate); - - if (copy.size() > 1) { - return false; - } - } - - return true; - } - - private static Set<Set<String>> squeeze(Set<Set<String>> combs) { - Set<Set<String>> normCombs = new HashSet<>(); - - for (Set<String> comb : combs.stream().sorted(Comparator.comparingInt(Set::size)).collect(Collectors.toList())) { - // Skips already added shorter variants. - if (normCombs.stream().filter(comb::containsAll).findAny().isEmpty()) { - normCombs.add(comb); - } - } - return normCombs; - } - - // Adds "" which means empty row. For example for small row it returns {{A, B, C, ""}, {B, C, D, ""}, {B, D, ""} } - private static List<Set<String>> extendNulls() { - return ROWS.stream().map( - p -> Stream.concat(p.stream(), Stream.of("")).collect(Collectors.toSet()) - ).collect(Collectors.toList()); - } -} - diff --git a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/Test2.java b/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/Test2.java deleted file mode 100644 index b8e5e42..0000000 --- a/nlpcraft/src/test/scala/org/apache/nlpcraft/probe/mgrs/nlp/enrichers/model/Test2.java +++ /dev/null @@ -1,114 +0,0 @@ -package org.apache.nlpcraft.probe.mgrs.nlp.enrichers.model; - -import com.google.common.collect.ImmutableSet; -import com.google.common.collect.Sets; - -import java.util.Arrays; -import java.util.Collection; -import java.util.Comparator; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; - -public class Test2 { -// private static List<Set<String>> ROWS = -// Arrays.asList( -// ImmutableSet.of("A", "B", "C"), -// ImmutableSet.of("B", "C", "D"), -// ImmutableSet.of("B", "D") -// ); - - // Uncomment it. Works too long time. Normalized result is 256. - private static List<Set<String>> ROWS = Arrays.asList( - ImmutableSet.of("A", "B"), - ImmutableSet.of("C", "B"), - ImmutableSet.of("D", "E"), - ImmutableSet.of("D", "F"), - ImmutableSet.of("G", "H"), - ImmutableSet.of("I", "H"), - ImmutableSet.of("J", "K"), - ImmutableSet.of("L", "K"), - ImmutableSet.of("M", "N"), - ImmutableSet.of("M", "O"), - ImmutableSet.of("P", "Q"), - ImmutableSet.of("P", "R"), - ImmutableSet.of("S", "T"), - ImmutableSet.of("S", "U"), - ImmutableSet.of("V", "W"), - ImmutableSet.of("X", "W") - ); - - private static Set<String> ALL = ROWS.stream().flatMap(Collection::stream).collect(Collectors.toSet()); - - // Goal: Find minimal set of combinations with following feature. - // After removing combination values from each row - list should contain rows with size <= 1. - - // Expected solution: [C, B], [A, C, D], [A, B, D] - // Example: - // list - [C, B] = {{A}, {D}, {D}} - // list - [A, C, D] = {{B}, {B}, {B}} - // list - [A, B, D] = {{C}, {C}, {null}} - - - // Additional. Redundant solutions: [A, B, C] ([C, B] enough), [A, B, C, D] ([A, C, D] enough) etc - - // Easiest. - public static void main(String[] args) { - long t = System.currentTimeMillis(); - - System.out.println("1. start [time=" + (System.currentTimeMillis() - t) + ']'); - - Set<Set<String>> combs = new HashSet<>(); - - for (int i = 1; i < ALL.size(); i++) { - combs.addAll( - Sets.combinations(ALL, i). - stream(). - filter(Test2::isSuitable). - collect(Collectors.toSet()) - ); - } - - System.out.println("2. calculated [size=" + combs.size() + ", time=" + (System.currentTimeMillis() - t) + ']'); - - // Normalize variants (keeps only minimal valid subsets, see task description) - Set<Set<String>> normCombs = squeeze(combs); - - System.out.println("3. normalized [size=" + normCombs.size() + ", time=" + (System.currentTimeMillis() - t) + ']'); - System.out.println("Norm results:" + normCombs); - } - - private static Set<Set<String>> squeeze(Set<Set<String>> combs) { - Set<Set<String>> normCombs = new HashSet<>(); - - for (Set<String> comb : combs.stream().sorted(Comparator.comparingInt(Set::size)).collect(Collectors.toList())) { - // Skips already added shorter variants. - if (normCombs.stream().filter(comb::containsAll).findAny().isEmpty()) { - normCombs.add(comb); - } - } - return normCombs; - } - - /** - * Removes `candidate` from each row of ROWS. - * Return true if result list doesn't contain any row with size > 1. - * <p> - * If ROWS is {{a, b}, {a, c}}. Candidate {a, b} - ok, candidate {a} - ok, candidate {b} - no. - */ - private static boolean isSuitable(Set<String> candidate) { - for (Set<String> row : ROWS) { - Set<String> copy = new HashSet<>(row); - - copy.removeAll(candidate); - - if (copy.size() > 1) { - return false; - } - } - - return true; - } -} -
