This is an automated email from the ASF dual-hosted git repository.
aradzinski pushed a commit to branch NLPCRAFT-366
in repository
https://gitbox.apache.org/repos/asf/incubator-nlpcraft-java-client.git
The following commit(s) were added to refs/heads/NLPCRAFT-366 by this push:
new 29355b3 WIP on NLPCRAFT-366.
29355b3 is described below
commit 29355b330a4e8c3da2d6666c3d87774fd4d62691
Author: Nikita Ivanov <>
AuthorDate: Fri Aug 6 16:33:03 2021 -0700
WIP on NLPCRAFT-366.
---
.../java/org/apache/nlpcraft/client/NCModel.java | 13 +-
.../org/apache/nlpcraft/client/NCModelInfo.java | 327 ++++++++++++++++++++-
.../java/org/apache/nlpcraft/client/NCValue.java | 23 +-
3 files changed, 355 insertions(+), 8 deletions(-)
diff --git a/src/main/java/org/apache/nlpcraft/client/NCModel.java
b/src/main/java/org/apache/nlpcraft/client/NCModel.java
index 4712df8..4e9c4ae 100644
--- a/src/main/java/org/apache/nlpcraft/client/NCModel.java
+++ b/src/main/java/org/apache/nlpcraft/client/NCModel.java
@@ -21,30 +21,31 @@ package org.apache.nlpcraft.client;
* Data model descriptor.
*
* @see NCClient#getProbes()
- * @see NCProbe#getModels()
+ * @see NCProbe#getModels()
+ * @see NCModelInfo
*/
public interface NCModel {
/**
* Gets unique, <i>immutable</i> ID of this model.
* <p>
* Note that <b>model IDs are immutable</b> while name and version
- * can be changed freely. Changing model ID is equal to creating a
completely new model that will have
- * to be re-trained and re-learned again. Model IDs (unlike name and
version) are not exposed to
- * the end user and only serve a technical purpose.
+ * can be changed freely. Changing model ID is equal to creating a
completely new model.
+ * Model IDs (unlike name and version) are not exposed to the end user and
only serve a
+ * technical purpose. ID's max length is 32 characters.
*
* @return Unique, <i>immutable</i> ID of this model.
*/
String getId();
/**
- * Gets descriptive name of this model.
+ * Gets descriptive name of this model. Name's max length is 64 characters.
*
* @return Descriptive name for this model.
*/
String getName();
/**
- * Gets the version of this model using semantic versioning.
+ * Gets the version of this model using semantic versioning. Version's max
length is 16 characters.
*
* @return A version compatible with (<a
href="http://www.semver.org">www.semver.org</a>) specification.
*/
diff --git a/src/main/java/org/apache/nlpcraft/client/NCModelInfo.java
b/src/main/java/org/apache/nlpcraft/client/NCModelInfo.java
index 1c7b05c..4b86711 100644
--- a/src/main/java/org/apache/nlpcraft/client/NCModelInfo.java
+++ b/src/main/java/org/apache/nlpcraft/client/NCModelInfo.java
@@ -17,78 +17,403 @@
package org.apache.nlpcraft.client;
+import java.util.Collections;
+import java.util.List;
import java.util.Map;
import java.util.Set;
-// TODO: same as NCModelView without getParsers
+/**
+ * A model view descriptor.
+ *
+ * @see NCClient#getModelInfo(String)
+ * @see NCModel
+ */
public interface NCModelInfo {
+ /**
+ * Gets unique, <i>immutable</i> ID of this model.
+ * <p>
+ * Note that <b>model IDs are immutable</b> while name and version
+ * can be changed freely. Changing model ID is equal to creating a
completely new model.
+ * Model IDs (unlike name and version) are not exposed to the end user and
only serve a
+ * technical purpose. ID's max length is 32 characters.
+ *
+ * @return Unique, <i>immutable</i> ID of this model.
+ */
String getId();
+ /**
+ * Gets descriptive name of this model. Name's max length is 64 characters.
+ *
+ * @return Descriptive name for this model.
+ */
String getName();
+ /**
+ * Gets the version of this model using semantic versioning. Version's max
length is 16 characters.
+ *
+ * @return A version compatible with (<a
href="http://www.semver.org">www.semver.org</a>) specification.
+ */
String getVersion();
+ /**
+ * Gets optional short model description. This can be displayed by the
management tools.
+ *
+ * @return Optional short model description.
+ */
String getDescription();
+ /**
+ * Gets the origin of this model like name of the class, file path or URL.
+ *
+ * @return Origin of this model like name of the class, file path or URL.
+ */
String getOrigin();
+ /**
+ * Gets maximum number of unknown words until automatic rejection. An
unknown word is a word
+ * that is not part of Princeton WordNet database. If you expect a very
formalized and well-defined
+ * input without uncommon slang and abbreviations you can set this to a
small number
+ * like one or two. However, in most cases we recommend leaving it as default or
setting it to a larger
+ * number like five or more.
+ *
+ * @return Maximum number of unknown words until automatic rejection.
+ */
int getMaxUnknownWords();
+ /**
+ * Gets maximum number of free words until automatic rejection. A free
word is a known word that is
+ * not part of any recognized token. In other words, a word that is
present in the user input
+ * but won't be used to understand its meaning. Setting it to a non-zero
risks the misunderstanding
+ * of the user input, while setting it to zero often makes understanding
logic too rigid. In most
+ * cases we recommend setting to between one and three. If you expect the
user input to contain
+ * many <i>noisy</i> idioms, slang or colloquials - you can set it to a
larger number.
+ *
+ * @return Maximum number of free words until automatic rejection.
+ */
int getMaxFreeWords();
+ /**
+ * Gets maximum number of suspicious words until automatic rejection. A
suspicious word is a word
+ * that is defined by the model that should not appear in a valid user
input under any circumstances.
+ * A typical example of suspicious words would be words "sex" or "porn"
when processing
+ * queries about children books. In most cases this should be set to zero
(default) to automatically
+ * reject any such suspicious words in the user input.
+ *
+ * @return Maximum number of suspicious words until automatic rejection.
+ */
int getMaxSuspiciousWords();
+ /**
+ * Gets minimum word count (<i>including</i> stopwords) below which user
input will be automatically
+ * rejected as too short. In almost all cases this value should be greater
than or equal to one.
+ *
+ * @return Minimum word count (<i>including</i> stopwords) below which
user input will be automatically
+ * rejected as too short.
+ */
int getMinWords();
+ /**
+ * Gets maximum word count (<i>including</i> stopwords) above which user
input will be automatically
+ * rejected as too long. In almost all cases this value should be greater
than or equal to one.
+ *
+ * @return Maximum word count (<i>including</i> stopwords) above which
user input will be automatically
+ * rejected as too long.
+ */
int getMaxWords();
+ /**
+ * Gets minimum number of all tokens (system and user defined) below which
user input will be
+ * automatically rejected as too short. In almost all cases this value
should be greater than or equal to one.
+ *
+ * @return Minimum number of all tokens.
+ */
int getMinTokens();
+ /**
+ * Gets maximum number of all tokens (system and user defined) above which
user input will be
+ * automatically rejected as too long. Note that sentences with large
number of tokens can result
+ * in significant processing delay and substantial memory consumption.
+ *
+ * @return Maximum number of all tokens.
+ */
int getMaxTokens();
+ /**
+ * Gets minimum word count (<i>excluding</i> stopwords) below which user
input will be automatically rejected
+ * as ambiguous sentence.
+ *
+ * @return Minimum word count (<i>excluding</i> stopwords) below which
user input will be automatically
+ * rejected as too short.
+ */
int getMinNonStopwords();
+ /**
+ * Whether to allow non-English language in user input.
+ * Currently, only English language is supported. However, model can
choose whether
+ * to automatically reject user input that is detected to be
non-English. Note that the current
+ * algorithm only works reliably on longer user input (10+ words). On
short sentences it will
+ * often produce an incorrect result.
+ *
+ * @return Whether to allow non-English language in user input.
+ */
boolean isNonEnglishAllowed();
+ /**
+ * Whether to allow non-Latin charset in user input. Currently, only
+ * Latin charset is supported. However, model can choose whether to
automatically reject user
+ * input with characters outside of Latin charset. If {@code false} such
user input will be automatically
+ * rejected.
+ *
+ * @return Whether to allow non-Latin charset in user input.
+ */
boolean isNotLatinCharsetAllowed();
+ /**
+ * Whether to allow known English swear words in user input. If {@code
false} - user input with
+ * detected known English swear words will be automatically rejected.
+ *
+ * @return Whether to allow known swear words in user input.
+ */
boolean isSwearWordsAllowed();
+ /**
+ * Whether to allow user input without a single noun. If {@code false}
such user input
+ * will be automatically rejected. Typically, for strict command or
query-oriented models this should be set to
+ * {@code false} as any command or query should have at least one noun
subject. However, for conversational
+ * models this can be set to {@code true} to allow for smalltalk and
one-liners.
+ *
+ * @return Whether to allow user input without a single noun.
+ */
boolean isNoNounsAllowed();
+ /**
+ * Whether to permutate multi-word synonyms. Automatic multi-word synonyms
permutations greatly
+ * increase the total number of synonyms in the system and allow for
better multi-word synonym detection.
+ * For example, if permutation is allowed the synonym "a b c" will be
automatically converted into a
+ * sequence of synonyms of "a b c", "b a c", "a c b". This property is
closely related to {@link #isSparse()}
+ * which are typically changed together. Note that individual model
elements can override this property using
+ * {@link NCElement#isPermutateSynonyms()} method.
+ *
+ * @return Whether to permutate multi-word synonyms.
+ * @see NCElement#isPermutateSynonyms()
+ * @see NCElement#isSparse()
+ * @see #isSparse()
+ */
boolean isPermutateSynonyms();
+ /**
+ * Whether duplicate synonyms are allowed. If {@code true} - the model
will pick the random
+ * model element when multiple elements found due to duplicate synonyms.
If {@code false} - model
+ * will print error message and will not deploy.
+ *
+ * @return Whether to allow duplicate synonyms.
+ */
boolean isDupSynonymsAllowed();
+ /**
+ * Total number of synonyms allowed per model. Model won't deploy if total
number of synonyms exceeds this
+ * number.
+ *
+ * @return Total number of synonyms allowed per model.
+ * @see #getMaxElementSynonyms()
+ */
int getMaxTotalSynonyms();
+ /**
+ * Whether to allow the user input with no user token detected. If {@code
false} such user
+ * input will be automatically rejected. Note that this property only
applies to user-defined
+ * token (i.e. model element). Even if there are no user defined tokens,
the user input may still
+ * contain system token like <code>nlpcraft:city</code> or
<code>nlpcraft:date</code>. In many cases models
+ * should be built to allow user input without user tokens. However, set
it to {@code false} if presence
+ * of at least one user token is mandatory.
+ *
+ * @return Whether to allow the user input with no user token detected.
+ */
boolean isNoUserTokensAllowed();
+ /**
+ * Whether this model's elements allow non-stop words gaps in their
multi-word synonyms.
+ * This property is closely related to {@link #isPermutateSynonyms()}
which are typically changed together.
+ * Note that individual model elements can override this property using
{@link NCElement#isSparse()}
+ * method.
+ *
+ * @return Optional multi-word synonym sparsity model property.
+ * @see NCElement#isSparse()
+ * @see NCElement#isPermutateSynonyms()
+ * @see #isPermutateSynonyms()
+ */
boolean isSparse();
+ /**
+ * Gets optional user defined model metadata that can be set by the
developer and accessed later.
+ * By default, it returns an empty map. Note that this metadata is mutable
and can be
+ * changed at runtime by the model's code.
+ *
+ * @return Optional user defined model metadata. By default, returns an
empty map. Never returns {@code null}.
+ */
Map<String, Object> getMetadata();
+ /**
+ * Gets an optional list of stopwords to add to the built-in ones.
+ * <p>
+ * Stopword is an individual word (i.e. sequence of characters excluding
whitespaces) that contributes no
+ * semantic meaning to the sentence. For example, 'the', 'wow', or 'hm'
provide no semantic meaning to the
+ * sentence and can be safely excluded from semantic analysis.
+ * <p>
+ * NLPCraft comes with a carefully selected list of English stopwords
which should be sufficient
+ * for a majority of use cases. However, you can add additional stopwords
to this list. The typical
+ * use for user-defined stopwords are jargon parasite words that are
specific to the model's domain.
+ *
+ * @return Potentially empty list of additional stopwords.
+ */
Set<String> getAdditionalStopWords();
+ /**
+ * Gets an optional list of stopwords to exclude from the built-in list of
stopwords.
+ * <p>
+ * Just like you can add additional stopwords via {@link
#getAdditionalStopWords()} you can exclude
+ * certain words from the list of stopwords. This can be useful in rare
cases when built-in
+ * stopwords have a specific meaning for your model. In order to process them
you need to exclude them
+ * from the list of stopwords.
+ *
+ * @return Potentially empty list of excluded stopwords.
+ */
Set<String> getExcludedStopWords();
+ /**
+ * Gets an optional list of suspicious words. A suspicious word is a word
that generally should not appear in user
+ * sentence when used with this model. For example, if a particular model
is for children oriented book search,
+ * the words "sex" and "porn" should probably NOT appear in the user input
and can be automatically rejected
+ * when added here and model's metadata {@code MAX_SUSPICIOUS_WORDS}
property set to zero.
+ *
+ * @return Potentially empty list of suspicious words in their lemma form.
+ */
Set<String> getSuspiciousWords();
+ /**
+ * Gets an optional map of macros to be used in this model. Macros and
option groups are instrumental
+ * in defining model's elements.
+ *
+ * @return Potentially empty map of macros.
+ */
Map<String, String> getMacros();
+ /**
+ * Gets a set of model elements or named entities. Model can have zero or
more user defined elements.
+ *
+ * @return Set of model elements, potentially empty.
+ */
Set<NCElement> getElements();
+ /**
+ * Gets a set of IDs for built-in named entities (tokens) that should be
enabled and detected for this model.
+ * Unless model requests (i.e. enables) the built-in tokens in this method
the NLP subsystem will not attempt
+ * to detect them. Explicit enablement of the token significantly improves
the overall performance by avoiding
+ * unnecessary token detection. Note that you don't have to specify your
own user elements here as they are
+ * always enabled.
+ *
+ * @return Set of built-in tokens, potentially empty but never {@code
null}, that should be enabled
+ * and detected for this model.
+ */
Set<String> getEnabledBuiltInTokens();
+ /**
+ * Gets a set of named entities (token) IDs that will be considered as
abstract tokens.
+ * An abstract token is only detected when it is either a constituent part
of some other non-abstract token
+ * or referenced by built-in tokens. In other words, an abstract token
will not be detected in a standalone
+ * unreferenced position. By default (unless returned by this method), all named
entities are considered to be
+ * non-abstract.
+ * <p>
+ * Declaring tokens as abstract is important to minimize number of parsing
variants automatically
+ * generated as permutation of all possible parsing compositions. For
example, if it is known that a particular
+ * named entity will only be used as a constituent part of some other
token - declaring such named entity as
+ * abstract can significantly reduce the number of parsing variants
leading to a better performance,
+ * and often simpler corresponding intent definition and callback logic.
+ *
+ * @return Set of abstract token IDs. Can be empty but never {@code null}.
+ */
Set<String> getAbstractTokens();
+ /**
+ * Gets maximum number of unique synonyms per model element after which
either warning or error will be
+ * triggered. Note that there is no technical limit on how many synonyms a
model element can have apart
+ * from memory consumption and performance considerations. However, in
cases where synonyms are auto-generated
+ * (i.e. from database) this property can serve as a courtesy notification
that a model element has too many
+ * synonyms. Also, in general, too many synonyms can potentially lead to a
performance degradation.
+ *
+ * @return Maximum number of unique synonyms per model element after which
either warning or
+ * error will be triggered.
+ * @see #isMaxSynonymsThresholdError()
+ * @see #getMaxTotalSynonyms()
+ */
int getMaxElementSynonyms();
+ /**
+ * Whether exceeding {@link #getMaxElementSynonyms()} will trigger a
warning log or throwing an exception.
+ * Note that throwing exception will prevent data probe from starting.
+ *
+ * @return Whether exceeding {@link #getMaxElementSynonyms()} will trigger
a warning log or
+ * throwing an exception.
+ * @see #getMaxElementSynonyms()
+ */
boolean isMaxSynonymsThresholdError();
+ /**
+ * Gets timeout in ms after which the unused conversation element is
automatically "forgotten".
+ * <p>
+ * Just like in a normal human conversation if we talk about, say,
"Chicago", and then don't mention it
+ * for certain period of time during further dialog, the conversation
participants subconsciously "forget"
+ * about it and exclude it from conversation context. In other words, the
term "Chicago" is no longer in
+ * conversation's short-term-memory.
+ * <p>
+ * Note that both conversation timeout and {@link #getConversationDepth()
depth}
+ * combined define the expiration policy for the conversation management.
These two properties allow fine-tuning
+ * for different types of dialogs. For example, setting longer timeout and
smaller depth mimics
+ * slow-moving but topic-focused conversation. Alternatively, setting
shorter timeout and longer depth better
+ * supports fast-moving wide-ranging conversation that may cover multiple
topics.
+ *
+ * @return Timeout in ms after which the unused conversation element is
automatically "forgotten".
+ * @see #getConversationDepth()
+ */
long getConversationTimeout();
+ /**
+ * Gets maximum number of requests after which the unused conversation
element is automatically "forgotten".
+ * <p>
+ * Just like in a normal human conversation if we talk about, say,
"Chicago", and then don't mention it
+ * for a certain number of utterances during further dialog, the
conversation participants subconsciously "forget"
+ * about it and exclude it from conversation context. In other words, the
term "Chicago" is no longer in
+ * conversation's short-term-memory.
+ * <p>
+ * Note that both conversation {@link #getConversationTimeout() timeout}
and depth
+ * combined define the expiration policy for the conversation management.
These two properties allow fine-tuning
+ * for different types of dialogs. For example, setting longer timeout and
smaller depth mimics
+ * slow-moving but topic-focused conversation. Alternatively, setting
shorter timeout and longer depth better
+ * supports fast-moving wide-ranging conversation that may cover multiple
topics.
+ *
+ * @return Maximum number of requests after which the unused conversation
element is automatically "forgotten".
+ * @see #getConversationTimeout()
+ */
int getConversationDepth();
+ /**
+ * Gets an optional map of restricted named entity combinations (linkage).
Returned map is a map of entity ID to a set
+ * of other entity IDs, with each key-value pair defining the restricted
combination. Restricting certain entities
+ * from being linked (or referenced) by some other entities allows
reducing "wasteful" parsing variant
+ * generation. For example, if we know that entity with ID "adjective"
cannot be sorted, we can restrict it
+ * from being linked with <code>nlpcraft:limit</code> and
<code>nlpcraft:sort</code> entities to reduce the
+ * amount of parsing variants being generated.
+ * <p>
+ * Only the following built-in entities can be restricted (i.e., to be the
keys in the returned map):
+ * <ul>
+ * <li><code>nlpcraft:limit</code></li>
+ * <li><code>nlpcraft:sort</code></li>
+ * <li><code>nlpcraft:relation</code></li>
+ * </ul>
+ * Note that entity cannot be restricted to itself (entity ID cannot
appear as key as well as a
+ * part of the value's set).
+ *
+ * @return Optional map of restricted named entity combinations. Can be
empty but never {@code null}.
+ */
Map<String, Set<String>> getRestrictedCombinations();
}
diff --git a/src/main/java/org/apache/nlpcraft/client/NCValue.java
b/src/main/java/org/apache/nlpcraft/client/NCValue.java
index e4ceebc..0b96595 100644
--- a/src/main/java/org/apache/nlpcraft/client/NCValue.java
+++ b/src/main/java/org/apache/nlpcraft/client/NCValue.java
@@ -19,8 +19,29 @@ package org.apache.nlpcraft.client;
import java.util.List;
-// TODO: like model NCValue
+/**
+ * Model element's value descriptor.
+ * <p>
+ * Each model element can generally be recognized either by one of its
synonyms or values. Elements and their values
+ * are analogous to types and instances of that type in programming languages.
Each value
+ * has a name and optional set of its own synonyms by which that value, and
ultimately its element, can be
+ * recognized. Note that value name itself acts as an implicit synonym even
when no additional synonyms are added
+ * for that value.
+ *
+ * @see NCElement#getValues()
+ */
public interface NCValue {
+ /**
+ * Gets value name.
+ *
+ * @return Value name.
+ */
String getName();
+
+ /**
+ * Gets optional list of value's synonyms.
+ *
+ * @return Potentially empty list of value's synonyms.
+ */
List<String> getSynonyms();
}