This is an automated email from the ASF dual-hosted git repository.
epugh pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/branch_9x by this push:
new a5c4efa065f SOLR-12089: FileBasedSpellChecker docs have some missing
params (#2356)
a5c4efa065f is described below
commit a5c4efa065f4bb05b498193ab0ee003d8327f164
Author: Andrey Bozhko <[email protected]>
AuthorDate: Fri Mar 22 07:07:17 2024 -0500
SOLR-12089: FileBasedSpellChecker docs have some missing params (#2356)
* Now accepts the accuracy parameter as a float.
* deprecate misspelled `breakSugestionTieBreaker` parameter in favor of
`breakSuggestionTieBreaker`
in WordBreakSolrSpellChecker.
* Audit and update the Ref Guide for missing parameters.
---------
Co-authored-by: Andrey Bozhko <[email protected]>
Co-authored-by: Eric Pugh <[email protected]>
Co-authored-by: Christine Poerschke <[email protected]>
---
solr/CHANGES.txt | 4 +
.../handler/component/SpellCheckComponent.java | 2 +-
.../solr/spelling/AbstractLuceneSpellChecker.java | 4 +-
.../solr/spelling/WordBreakSolrSpellChecker.java | 29 +++-
.../solr/collection1/conf/solrconfig-minhash.xml | 2 +-
.../collection1/conf/solrconfig-plugcollector.xml | 2 +-
.../conf/solrconfig-spellcheckcomponent.xml | 5 +-
.../solr/collection1/conf/solrconfig.xml | 2 +-
.../conf/solrconfig.xml | 2 +-
.../query-guide/pages/function-queries.adoc | 2 +-
.../modules/query-guide/pages/spell-checking.adoc | 168 +++++++++++++++------
.../solr/collection1/conf/solrconfig.xml | 2 +-
12 files changed, 161 insertions(+), 63 deletions(-)
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 20401f1f477..0efa84908ce 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -82,6 +82,10 @@ Other Changes
* SOLR-17190: Replace org.apache.solr.util.LongSet with hppc LongHashSet
(Michael Gibney)
+* SOLR-12089: Update FileBasedSpellChecker and IndexBasedSpellChecker to
accept accuracy parameter
+ as float; deprecate `breakSugestionTieBreaker` parameter in favor of
`breakSuggestionTieBreaker`
+ in WordBreakSolrSpellChecker (Andrey Bozhko via Eric Pugh)
+
* SOLR-17201: Http2SolrClient and friends no longer marked as
@lucene.experimental.
Krb5HttpClientBuilder and PreemptiveBasicAuthClientBuilderFactory no longer
deprecated (janhoy)
diff --git
a/solr/core/src/java/org/apache/solr/handler/component/SpellCheckComponent.java
b/solr/core/src/java/org/apache/solr/handler/component/SpellCheckComponent.java
index 9a45aefea02..a9603de49b8 100644
---
a/solr/core/src/java/org/apache/solr/handler/component/SpellCheckComponent.java
+++
b/solr/core/src/java/org/apache/solr/handler/component/SpellCheckComponent.java
@@ -760,7 +760,7 @@ public class SpellCheckComponent extends SearchComponent
implements SolrCoreAwar
private boolean addSpellChecker(SolrCore core, boolean hasDefault,
NamedList<?> spellchecker) {
String className = (String) spellchecker.get("classname");
if (className == null) className = (String) spellchecker.get("class");
- // TODO: this is a little bit sneaky: warn if class isnt supplied
+ // TODO: this is a little bit sneaky: warn if class isn't supplied
// so that it's mandatory in a future release?
if (className == null) className = IndexBasedSpellChecker.class.getName();
SolrResourceLoader loader = core.getResourceLoader();
diff --git
a/solr/core/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java
b/solr/core/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java
index df413b01e06..a54ea8ab9e1 100644
---
a/solr/core/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java
+++
b/solr/core/src/java/org/apache/solr/spelling/AbstractLuceneSpellChecker.java
@@ -81,7 +81,6 @@ public abstract class AbstractLuceneSpellChecker extends
SolrSpellChecker {
public String init(NamedList<?> config, SolrCore core) {
super.init(config, core);
indexDir = (String) config.get(INDEX_DIR);
- String accuracy = (String) config.get(ACCURACY);
// If indexDir is relative then create index inside core.getDataDir()
if (indexDir != null) {
if (!new File(indexDir).isAbsolute()) {
@@ -120,9 +119,10 @@ public abstract class AbstractLuceneSpellChecker extends
SolrSpellChecker {
} catch (IOException e) {
throw new RuntimeException(e);
}
+ Object accuracy = config.get(ACCURACY);
if (accuracy != null) {
try {
- this.accuracy = Float.parseFloat(accuracy);
+ this.accuracy = Float.parseFloat(accuracy.toString());
spellChecker.setAccuracy(this.accuracy);
} catch (NumberFormatException e) {
throw new RuntimeException("Unparseable accuracy given for dictionary:
" + name, e);
diff --git
a/solr/core/src/java/org/apache/solr/spelling/WordBreakSolrSpellChecker.java
b/solr/core/src/java/org/apache/solr/spelling/WordBreakSolrSpellChecker.java
index 90029320a37..db7bd7f5665 100644
--- a/solr/core/src/java/org/apache/solr/spelling/WordBreakSolrSpellChecker.java
+++ b/solr/core/src/java/org/apache/solr/spelling/WordBreakSolrSpellChecker.java
@@ -17,6 +17,7 @@
package org.apache.solr.spelling;
import java.io.IOException;
+import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
@@ -32,6 +33,8 @@ import
org.apache.lucene.search.spell.WordBreakSpellChecker.BreakSuggestionSortM
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.search.SolrIndexSearcher;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
* A spellchecker that breaks and combines words.
@@ -46,6 +49,9 @@ import org.apache.solr.search.SolrIndexSearcher;
* properly sets these flags.
*/
public class WordBreakSolrSpellChecker extends SolrSpellChecker {
+
+ private static final Logger log =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
/** Try to combine multiple words into one? [true|false] */
public static final String PARAM_COMBINE_WORDS = "combineWords";
@@ -61,8 +67,16 @@ public class WordBreakSolrSpellChecker extends
SolrSpellChecker {
/** See {@link WordBreakSpellChecker#setMinBreakWordLength} */
public static final String PARAM_MIN_BREAK_WORD_LENGTH = "minBreakLength";
+ /**
+ * See {@link BreakSuggestionTieBreaker} for options.
+ *
+ * @deprecated Only used for backwards compatibility. It will be removed in
10.x.
+ */
+ @Deprecated(since = "9.6")
+ private static final String PARAM_BREAK_SUGESTION_TIE_BREAKER =
"breakSugestionTieBreaker";
+
/** See {@link BreakSuggestionTieBreaker} for options. */
- public static final String PARAM_BREAK_SUGGESTION_TIE_BREAKER =
"breakSugestionTieBreaker";
+ public static final String PARAM_BREAK_SUGGESTION_TIE_BREAKER =
"breakSuggestionTieBreaker";
/** See {@link WordBreakSpellChecker#setMaxEvaluations} */
public static final String PARAM_MAX_EVALUATIONS = "maxEvaluations";
@@ -70,7 +84,7 @@ public class WordBreakSolrSpellChecker extends
SolrSpellChecker {
/** See {@link WordBreakSpellChecker#setMinSuggestionFrequency} */
public static final String PARAM_MIN_SUGGESTION_FREQUENCY =
"minSuggestionFreq";
- /** Specify a value on the "breakSugestionTieBreaker" parameter. The default
is MAX_FREQ. */
+ /** Specify a value on the "breakSuggestionTieBreaker" parameter. The
default is MAX_FREQ. */
public enum BreakSuggestionTieBreaker {
/** See {@link BreakSuggestionSortMethod#NUM_CHANGES_THEN_MAX_FREQUENCY} #
*/
MAX_FREQ,
@@ -92,6 +106,17 @@ public class WordBreakSolrSpellChecker extends
SolrSpellChecker {
breakWords = boolParam(config, PARAM_BREAK_WORDS);
wbsp = new WordBreakSpellChecker();
String bstb = strParam(config, PARAM_BREAK_SUGGESTION_TIE_BREAKER);
+ if (bstb == null) {
+ bstb = strParam(config, PARAM_BREAK_SUGESTION_TIE_BREAKER);
+ if (bstb != null && log.isWarnEnabled()) {
+ log.warn(
+ "Parameter '"
+ + PARAM_BREAK_SUGESTION_TIE_BREAKER
+ + "' is deprecated and will be removed in Solr 10.x. Please
use '"
+ + PARAM_BREAK_SUGGESTION_TIE_BREAKER
+ + "' instead."); // nowarn
+ }
+ }
if (bstb != null) {
bstb = bstb.toUpperCase(Locale.ROOT);
if (bstb.equals(BreakSuggestionTieBreaker.SUM_FREQ.name())) {
diff --git
a/solr/core/src/test-files/solr/collection1/conf/solrconfig-minhash.xml
b/solr/core/src/test-files/solr/collection1/conf/solrconfig-minhash.xml
index e23b30b62c1..9fa236dda0b 100644
--- a/solr/core/src/test-files/solr/collection1/conf/solrconfig-minhash.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/solrconfig-minhash.xml
@@ -271,7 +271,7 @@
<str name="name">freq</str>
<str name="field">lowerfilt</str>
<str name="spellcheckIndexDir">spellcheckerFreq</str>
- <!-- comparatorClass be one of:
+ <!-- comparatorClass can be one of:
1. score (default)
2. freq (Frequency first, then score)
3. A fully qualified class name
diff --git
a/solr/core/src/test-files/solr/collection1/conf/solrconfig-plugcollector.xml
b/solr/core/src/test-files/solr/collection1/conf/solrconfig-plugcollector.xml
index 34636cd6cfd..845998ec2f4 100644
---
a/solr/core/src/test-files/solr/collection1/conf/solrconfig-plugcollector.xml
+++
b/solr/core/src/test-files/solr/collection1/conf/solrconfig-plugcollector.xml
@@ -259,7 +259,7 @@
<str name="name">freq</str>
<str name="field">lowerfilt</str>
<str name="spellcheckIndexDir">spellcheckerFreq</str>
- <!-- comparatorClass be one of:
+ <!-- comparatorClass can be one of:
1. score (default)
2. freq (Frequency first, then score)
3. A fully qualified class name
diff --git
a/solr/core/src/test-files/solr/collection1/conf/solrconfig-spellcheckcomponent.xml
b/solr/core/src/test-files/solr/collection1/conf/solrconfig-spellcheckcomponent.xml
index 7760eb261e3..0253d91b804 100644
---
a/solr/core/src/test-files/solr/collection1/conf/solrconfig-spellcheckcomponent.xml
+++
b/solr/core/src/test-files/solr/collection1/conf/solrconfig-spellcheckcomponent.xml
@@ -83,7 +83,7 @@
<str name="field">lowerfilt</str>
<str name="combineWords">true</str>
<str name="breakWords">true</str>
- <str name="breakSugestionTieBreaker">MAX_FREQ</str>
+ <str name="breakSuggestionTieBreaker">MAX_FREQ</str>
<int name="maxChanges">10</int>
</lst>
<lst name="spellchecker">
@@ -122,13 +122,14 @@
<str name="sourceLocation">spellings.txt</str>
<str name="characterEncoding">UTF-8</str>
<str name="spellcheckIndexDir">spellchecker3</str>
+ <float name="accuracy">0.5</float>
</lst>
<!-- Comparator -->
<lst name="spellchecker">
<str name="name">freq</str>
<str name="field">lowerfilt</str>
<str name="spellcheckIndexDir">spellcheckerFreq</str>
- <!-- comparatorClass be one of:
+ <!-- comparatorClass can be one of:
1. score (default)
2. freq (Frequency first, then score)
3. A fully qualified class name
diff --git a/solr/core/src/test-files/solr/collection1/conf/solrconfig.xml
b/solr/core/src/test-files/solr/collection1/conf/solrconfig.xml
index 9b66df4cc0e..82dca6384d8 100644
--- a/solr/core/src/test-files/solr/collection1/conf/solrconfig.xml
+++ b/solr/core/src/test-files/solr/collection1/conf/solrconfig.xml
@@ -278,7 +278,7 @@
<str name="name">freq</str>
<str name="field">lowerfilt</str>
<str name="spellcheckIndexDir">spellcheckerFreq</str>
- <!-- comparatorClass be one of:
+ <!-- comparatorClass can be one of:
1. score (default)
2. freq (Frequency first, then score)
3. A fully qualified class name
diff --git
a/solr/server/solr/configsets/sample_techproducts_configs/conf/solrconfig.xml
b/solr/server/solr/configsets/sample_techproducts_configs/conf/solrconfig.xml
index 9b83b3e6db5..6b019488435 100644
---
a/solr/server/solr/configsets/sample_techproducts_configs/conf/solrconfig.xml
+++
b/solr/server/solr/configsets/sample_techproducts_configs/conf/solrconfig.xml
@@ -828,7 +828,7 @@
<!-- a spellchecker that use an alternate comparator
- comparatorClass be one of:
+ comparatorClass can be one of:
1. score (default)
2. freq (Frequency first, then score)
3. A fully qualified class name
diff --git
a/solr/solr-ref-guide/modules/query-guide/pages/function-queries.adoc
b/solr/solr-ref-guide/modules/query-guide/pages/function-queries.adoc
index 48f9345f1cd..7c6f1a9d0ea 100644
--- a/solr/solr-ref-guide/modules/query-guide/pages/function-queries.adoc
+++ b/solr/solr-ref-guide/modules/query-guide/pages/function-queries.adoc
@@ -473,7 +473,7 @@ Uses the Lucene spell checker `StringDistance` interface
and supports all of the
Possible values for distance measure are:
* jw: Jaro-Winkler
-* edit: Levenstein or Edit distance
+* edit: Levenshtein or Edit distance
* ngram: The NGramDistance, if specified, can optionally pass in the ngram
size too.
Default is 2.
* FQN: Fully Qualified class Name for an implementation of the StringDistance
interface.
diff --git a/solr/solr-ref-guide/modules/query-guide/pages/spell-checking.adoc
b/solr/solr-ref-guide/modules/query-guide/pages/spell-checking.adoc
index f6962ce8d4d..fea8e94c5bf 100644
--- a/solr/solr-ref-guide/modules/query-guide/pages/spell-checking.adoc
+++ b/solr/solr-ref-guide/modules/query-guide/pages/spell-checking.adoc
@@ -25,26 +25,34 @@ The basis for these suggestions can be terms in a field in
Solr, externally crea
=== Define Spell Check in solrconfig.xml
The first step is to specify the source of terms in `solrconfig.xml`.
-There are three approaches to spell checking in Solr, discussed below.
+There are a number of approaches to spell checking in Solr, discussed below.
==== IndexBasedSpellChecker
The `IndexBasedSpellChecker` uses a Solr index as the basis for a parallel
index used for spell checking.
It requires defining a field as the basis for the index terms; a common
practice is to copy terms from some fields (such as `title`, `body`, etc.) to
another field created for spell checking.
-Here is a simple example of configuring `solrconfig.xml` with the
`IndexBasedSpellChecker`:
+Here is an example of configuring `IndexBasedSpellChecker` in `solrconfig.xml`:
[source,xml]
----
<searchComponent name="spellcheck" class="solr.SpellCheckComponent">
<lst name="spellchecker">
<str name="classname">solr.IndexBasedSpellChecker</str>
- <str name="spellcheckIndexDir">./spellchecker</str>
+ <!-- required parameters -->
<str name="field">content</str>
- <str name="buildOnCommit">true</str>
- <!-- optional elements with defaults
+ <!-- optional parameters for IndexBasedSpellChecker -->
+ <str name="sourceLocation">./folder/with/index/files</str>
+ <!-- optional parameters for all spellcheckers -->
+ <str name="spellcheckIndexDir">./spellcheckerDir</str>
+ <str name="name">default</str>
+ <str name="fieldType">content_ft</str>
+ <str name="queryAnalyzerFieldType">text_general</str>
<str
name="distanceMeasure">org.apache.lucene.search.spell.LevenshteinDistance</str>
- <str name="accuracy">0.5</str>
- -->
+ <str name="comparatorClass">score</str>
+ <float name="accuracy">0.5</float>
+ <float name="thresholdTokenFrequency">0.0</float>
+ <str name="buildOnCommit">true</str>
+ <str name="buildOnOptimize">false</str>
</lst>
</searchComponent>
----
@@ -53,47 +61,83 @@ The first element defines the `searchComponent` to use the
`solr.SpellCheckCompo
The `classname` is the specific implementation of the SpellCheckComponent, in
this case `solr.IndexBasedSpellChecker`.
Defining the `classname` is optional; if not defined, it will default to
`IndexBasedSpellChecker`.
-The `spellcheckIndexDir` defines the location of the directory that holds the
spellcheck index, while the `field` defines the source field (defined in the
Schema) for spell check terms.
+The `spellcheckIndexDir` defines the location of the directory that holds the
spellcheck index, while the `field` defines the source field (defined in the
Schema) for spellcheck terms.
When choosing a field for the spellcheck index, it's best to avoid a heavily
processed field to get more accurate results.
If the field has many word variations from processing synonyms and/or
stemming, the dictionary will be created with those variations in addition to
more valid spelling data.
-Finally, _buildOnCommit_ defines whether to build the spell check index at
every commit (that is, every time new documents are added to the index).
-It is optional, and can be omitted if you would rather set it to `false`.
+By default, this spellchecker builds its dictionary from the Solr index.
+This can be changed by specifying `sourceLocation` - a folder with static
Lucene index files to use instead of the Solr index.
+
+The spellchecker can be assigned a descriptive label, `name`, which can be
helpful if the search component defines
+multiple spellcheckers. With that, a spellcheck query can identify a subset of
spellcheckers that should be consulted
+(see xref:spell-check-parameters[] for more details).
+
+The query analyzer for the `field` is used to tokenize the spellcheck query.
+If there's a need to override that behavior, configure a `fieldType` and the
spellchecker
+will use the query analyzer for that field type instead.
+
+`queryAnalyzerFieldType` is a field type from Solr's schema, and works
similarly to the `fieldType` parameter.
+The key difference is that Solr uses `field` or `fieldType` when it tokenizes
the spellcheck query
+supplied via `spellcheck.q`, and uses `queryAnalyzerFieldType` when the query
is instead provided via the `q` parameter.
+
+The field type specified by this parameter should do minimal transformations.
+It's usually a best practice to avoid types that aggressively stem or NGram,
for instance, since those types of analysis can throw off spell checking.
+
+Common configuration parameters like `distanceMeasure`, `comparatorClass`,
`accuracy`, and `thresholdTokenFrequency`
+provide control over the returned spellcheck suggestions.
+
+If the `distanceMeasure` is not specified, Solr will use the Levenshtein
metric which is the default metric for other spellchecker implementations as
well (except for `DirectSolrSpellChecker`).
+
+When `comparatorClass` is configured as "score", the suggestions with lower
distance (i.e., higher similarity) scores are considered more relevant.
+The alternative value is "freq" - this prioritizes suggestions with higher
document frequency.
+
+The `accuracy` setting defines the threshold for a valid suggestion, and the
`thresholdTokenFrequency` setting allows
+skipping suggestions which have low document frequency in the index.
+
+Finally, `buildOnCommit` and `buildOnOptimize` define whether to build the
spellcheck index at every commit (that is, every time new documents are added
to the index)
+or at every optimize request.
+Both are optional, and can be omitted if you would rather set their values to
`false`.
==== DirectSolrSpellChecker
The `DirectSolrSpellChecker` uses terms from the Solr index without building a
parallel index like the `IndexBasedSpellChecker`.
-This spell checker has the benefit of not having to be built regularly,
meaning that the terms are always up-to-date with terms in the index.
-Here is how this might be configured in `solrconfig.xml`
+This spellchecker has the benefit of not having to be built regularly, meaning
that the terms are always up-to-date with terms in the index.
+Here is how this might be configured in `solrconfig.xml`:
[source,xml]
----
<searchComponent name="spellcheck" class="solr.SpellCheckComponent">
<lst name="spellchecker">
- <str name="name">default</str>
- <str name="field">name</str>
<str name="classname">solr.DirectSolrSpellChecker</str>
- <str name="distanceMeasure">internal</str>
- <float name="accuracy">0.5</float>
+ <!-- required parameters -->
+ <str name="field">name</str>
+ <!-- optional parameters for DirectSolrSpellChecker -->
<int name="maxEdits">2</int>
<int name="minPrefix">1</int>
<int name="maxInspections">5</int>
<int name="minQueryLength">4</int>
<int name="maxQueryLength">40</int>
<float name="maxQueryFrequency">0.01</float>
- <float name="thresholdTokenFrequency">.01</float>
+ <!-- optional parameters for all spellcheckers -->
+ <str name="name">default</str>
+ <str name="fieldType">name</str>
+ <str name="queryAnalyzerFieldType">text_general</str>
+ <str name="distanceMeasure">internal</str>
+ <str name="comparatorClass">score</str>
+ <float name="accuracy">0.5</float>
+ <float name="thresholdTokenFrequency">0.0</float>
</lst>
</searchComponent>
----
-When choosing a `field` to query for this spell checker, you want one which
has relatively little analysis performed on it (particularly analysis such as
stemming).
+When choosing a `field` to query for this spellchecker, you want one which has
relatively little analysis performed on it (particularly analysis such as
stemming).
Note that you need to specify a field to use for the suggestions, so like the
`IndexBasedSpellChecker`, you may want to copy data from fields like `title`,
`body`, etc., to a field dedicated to providing spelling suggestions.
-Many of the parameters relate to how this spell checker should query the index
for term suggestions.
-The `distanceMeasure` defines the metric to use during the spell check query.
-The value "internal" uses the default Levenshtein metric, which is the same
metric used with the other spell checker implementations.
+Many of the parameters relate to how this spellchecker should query the index
for term suggestions.
+The `distanceMeasure` defines the metric to use during the spellcheck query -
the default value for this spellchecker is "internal",
+which corresponds to the Damerau-Levenshtein metric.
-Because this spell checker is querying the main index, you may want to limit
how often it queries the index to be sure to avoid any performance conflicts
with user queries.
+Because this spellchecker is querying the main index, you may want to limit
how often it queries the index to be sure to avoid any performance conflicts
with user queries.
The `accuracy` setting defines the threshold for a valid suggestion, while
`maxEdits` defines the number of changes to the term to allow.
Since most spelling mistakes are only 1 letter off, setting this to 1 will
reduce the number of possible suggestions (the default, however, is 2); the
value can only be 1 or 2.
`minPrefix` defines the minimum number of characters the terms should share.
@@ -101,10 +145,10 @@ Setting this to 1 means that the spelling suggestions
will all start with the sa
The `maxInspections` parameter defines the maximum number of possible matches
to review before returning results; the default is 5.
`minQueryLength` defines how many characters must be in the query before
suggestions are provided; the default is 4.
-`maxQueryLength` enables the spell checker to skip over very long query terms,
which can avoid expensive operations or exceptions.
+`maxQueryLength` enables the spellchecker to skip over very long query terms,
which can avoid expensive operations or exceptions.
There is no limit to term length by default.
-At first, spellchecker analyses incoming query words by looking up them in the
index.
+First, the spellchecker analyzes incoming query words by looking them up in the
index.
Only query words which are absent from the index, or too rare (equal to or
below `maxQueryFrequency`) are considered as misspelled and used for finding
suggestions.
Words which are more frequent than `maxQueryFrequency` bypass spellchecker
unchanged.
After suggestions for every misspelled word are found they are filtered for
enough frequency with `thresholdTokenFrequency` as boundary value.
@@ -126,19 +170,31 @@ In `solrconfig.xml`, you would define the searchComponent
as so:
<searchComponent name="spellcheck" class="solr.SpellCheckComponent">
<lst name="spellchecker">
<str name="classname">solr.FileBasedSpellChecker</str>
- <str name="name">file</str>
+ <!-- required parameters -->
<str name="sourceLocation">spellings.txt</str>
+ <!-- optional parameters for FileBasedSpellChecker -->
+ <str name="fieldType">text_general</str>
<str name="characterEncoding">UTF-8</str>
- <str name="spellcheckIndexDir">./spellcheckerFile</str>
- <!-- optional elements with defaults
+ <!-- optional parameters for all spellcheckers -->
+ <str name="spellcheckIndexDir">./spellcheckerDir</str>
+ <str name="name">file</str>
+ <str name="queryAnalyzerFieldType">text_general</str>
<str
name="distanceMeasure">org.apache.lucene.search.spell.LevenshteinDistance</str>
- <str name="accuracy">0.5</str>
- -->
+ <str name="comparatorClass">score</str>
+ <float name="accuracy">0.5</float>
+ <float name="thresholdTokenFrequency">0.0</float>
+ <bool name="buildOnCommit">false</bool>
+ <bool name="buildOnOptimize">false</bool>
</lst>
</searchComponent>
----
-The differences here are the use of the `sourceLocation` to define the
location of the file of terms and the use of `characterEncoding` to define the
encoding of the terms file.
+The configuration is very similar to the `IndexBasedSpellChecker`, and the
differences here are the use of the `sourceLocation` to define the location of
the file of terms, and the use of `characterEncoding` to define the encoding of
the terms file.
+
+If the `fieldType` parameter is specified and matches a type from the Solr
schema, Solr will build the spellcheck index
+by first tokenizing each line from the external file using the `fieldType`
index analyzer, and then adding each token to the index.
+
+If not, Solr will treat each line from the external file as an individual
token, and add them to the spellcheck index as is.
[TIP]
====
@@ -160,20 +216,45 @@ Here is how it might be configured in `solrconfig.xml`:
----
<searchComponent name="spellcheck" class="solr.SpellCheckComponent">
<lst name="spellchecker">
- <str name="name">wordbreak</str>
<str name="classname">solr.WordBreakSolrSpellChecker</str>
+ <!-- required parameters -->
<str name="field">lowerfilt</str>
+ <!-- optional parameters for WordBreakSpellChecker -->
<str name="combineWords">true</str>
<str name="breakWords">true</str>
- <int name="maxChanges">10</int>
+ <str name="breakSuggestionTieBreaker">max_freq</str>
+ <int name="maxChanges">1</int>
+ <int name="maxCombinedLength">20</int>
+ <int name="minBreakLength">1</int>
+ <int name="maxEvaluations">1000</int>
+ <int name="minSuggestionFreq">1</int>
+ <!-- optional parameters for all spellcheckers -->
+ <str name="name">wordbreak</str>
+ <str name="fieldType">lowerfilt_ft</str>
+ <str name="queryAnalyzerFieldType">text_general</str>
</lst>
</searchComponent>
----
-Some of the parameters will be familiar from the discussion of the other spell
checkers, such as `name`, `classname`, and `field`.
-New for this spell checker is `combineWords`, which defines whether words
should be combined in a dictionary search (default is true); `breakWords`,
which defines if words should be broken during a dictionary search (default is
true); and `maxChanges`, an integer which defines how many times the spell
checker should check collation possibilities against the index (default is 10).
+Some of the parameters should be familiar from the discussion of the other
spellcheckers, such as `name`, `classname`, and `field`.
+New for this spellchecker is `combineWords`, which defines whether words
should be combined in a dictionary search (default is true);
+and `breakWords`, which defines if words should be broken during a dictionary
search (default is true).
+
+`maxChanges` is an integer which defines how many times the spellchecker
should check collation possibilities against the index.
+
+`maxCombinedLength` allows skipping over the suggestions which are too long.
+Similarly, `minBreakLength` instructs the spellchecker to not break the word
into parts that are too short.
+
+`maxEvaluations` defines the maximum number of word combinations to evaluate -
a higher value might improve
+the result quality, while a lower value might improve performance.
+
+`minSuggestionFreq` sets the minimum frequency a term must have to be included
as part of a suggestion.
-The spellchecker can be configured with a traditional checker (i.e.,
`DirectSolrSpellChecker`).
+Finally, the `breakSuggestionTieBreaker` setting ("max_freq" or "sum_freq")
instructs Solr to
+sort the suggestions by the number of word breaks, and then by the maximum or
by the sum of all the component terms'
+frequencies, respectively.
+
+The spellchecker can be configured together with a traditional checker (i.e.,
`DirectSolrSpellChecker`).
The results are combined and collations can contain a mix of corrections from
both spellcheckers.
=== Add It to a Request Handler
@@ -278,19 +359,6 @@ If this parameter isn't set, the value defaults to `1`.
If the parameter is set but not assigned a number, the value defaults to `5`.
If the parameter is set to a positive integer, that number becomes the maximum
number of suggestions returned by the spellchecker.
-`spellcheck.queryAnalyzerFieldType`::
-+
-[%autowidth,frame=none]
-|===
-|Optional |Default: none
-|===
-+
-A field type from Solr's schema.
-The analyzer configured for the provided field type is used by the
QueryConverter to tokenize the value for `q` parameter.
-+
-The field type specified by this parameter should do minimal transformations.
-It's usually a best practice to avoid types that aggressively stem or NGram,
for instance, since those types of analysis can throw off spell checking.
-
`spellcheck.onlyMorePopular`::
+
[%autowidth,frame=none]
@@ -423,7 +491,7 @@ For example, even if your regular search results allow for
loose matching of one
|===
+
This parameter causes Solr to use the dictionary named in the parameter's
argument.
-This parameter can be used to invoke a specific spellchecker on a per request
basis.
+This parameter can be used to invoke a specific spellchecker on a per-request
basis.
`spellcheck.accuracy`::
+
@@ -530,4 +598,4 @@ For example:
http://localhost:8983/solr/techproducts/spell?spellcheck=true&spellcheck.build=true&spellcheck.q=toyata&shards.qt=/spell&shards=solr-shard1:8983/solr/techproducts,solr-shard2:8983/solr/techproducts
In case of a distributed request to the SpellCheckComponent, the shards are
requested for at least five suggestions even if the `spellcheck.count`
parameter value is less than five.
-Once the suggestions are collected, they are ranked by the configured distance
measure (Levenstein Distance by default) and then by aggregate frequency.
+Once the suggestions are collected, they are ranked by the configured distance
measure (Levenshtein distance by default) and then by aggregate frequency.
diff --git
a/solr/test-framework/src/test-files/solr/collection1/conf/solrconfig.xml
b/solr/test-framework/src/test-files/solr/collection1/conf/solrconfig.xml
index 9b66df4cc0e..82dca6384d8 100644
--- a/solr/test-framework/src/test-files/solr/collection1/conf/solrconfig.xml
+++ b/solr/test-framework/src/test-files/solr/collection1/conf/solrconfig.xml
@@ -278,7 +278,7 @@
<str name="name">freq</str>
<str name="field">lowerfilt</str>
<str name="spellcheckIndexDir">spellcheckerFreq</str>
- <!-- comparatorClass be one of:
+ <!-- comparatorClass can be one of:
1. score (default)
2. freq (Frequency first, then score)
3. A fully qualified class name