This is an automated email from the ASF dual-hosted git repository.
krisden pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/branch_9x by this push:
new 8078824 SOLR-16033: Enable spotless on analysis-extras module
8078824 is described below
commit 8078824a21a29fc01f627d3d408da1769c9b863b
Author: Kevin Risden <[email protected]>
AuthorDate: Sat Feb 19 10:57:49 2022 -0500
SOLR-16033: Enable spotless on analysis-extras module
---
gradle/validation/spotless.gradle | 1 -
.../org/apache/solr/schema/ICUCollationField.java | 183 +++++-----
...ExtractNamedEntitiesUpdateProcessorFactory.java | 391 +++++++++++++--------
.../analysis/TestFoldingMultitermExtrasQuery.java | 9 +-
.../apache/solr/schema/TestICUCollationField.java | 185 +++++-----
.../schema/TestICUCollationFieldDocValues.java | 182 +++++-----
.../solr/schema/TestICUCollationFieldOptions.java | 104 +++---
.../solr/schema/TestICUCollationFieldUDVAS.java | 23 +-
...ExtractNamedEntitiesUpdateProcessorFactory.java | 249 ++++++++-----
9 files changed, 763 insertions(+), 564 deletions(-)
diff --git a/gradle/validation/spotless.gradle
b/gradle/validation/spotless.gradle
index 29e8ef9..bc726e8 100644
--- a/gradle/validation/spotless.gradle
+++ b/gradle/validation/spotless.gradle
@@ -44,7 +44,6 @@ configure(project(":solr").subprojects) { prj ->
// Exclude certain files (generated ones, mostly).
switch (project.path) {
- case ":solr:modules:analysis-extras":
case ":solr:modules:clustering":
case ":solr:modules:extraction":
case ":solr:modules:gcs-repository":
diff --git
a/solr/modules/analysis-extras/src/java/org/apache/solr/schema/ICUCollationField.java
b/solr/modules/analysis-extras/src/java/org/apache/solr/schema/ICUCollationField.java
index cb84168..6a4a9a7 100644
---
a/solr/modules/analysis-extras/src/java/org/apache/solr/schema/ICUCollationField.java
+++
b/solr/modules/analysis-extras/src/java/org/apache/solr/schema/ICUCollationField.java
@@ -16,6 +16,11 @@
*/
package org.apache.solr.schema;
+import static org.apache.solr.core.XmlConfigFile.assertWarnOrFail;
+
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.RuleBasedCollator;
+import com.ibm.icu.util.ULocale;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
@@ -23,13 +28,11 @@ import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
-
import org.apache.commons.io.IOUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
-import org.apache.lucene.util.ResourceLoader;
import org.apache.lucene.analysis.icu.ICUCollationKeyAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.document.SortedSetDocValuesField;
import org.apache.lucene.index.IndexableField;
@@ -37,52 +40,52 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.ResourceLoader;
import org.apache.lucene.util.Version;
-import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.search.QParser;
import org.apache.solr.uninverting.UninvertingReader.Type;
-import com.ibm.icu.text.Collator;
-import com.ibm.icu.text.RuleBasedCollator;
-import com.ibm.icu.util.ULocale;
-
-import static org.apache.solr.core.XmlConfigFile.assertWarnOrFail;
-
/**
- * Field for collated sort keys.
- * These can be used for locale-sensitive sort and range queries.
- * <p>
- * This field can be created in two ways:
+ * Field for collated sort keys. These can be used for locale-sensitive sort
and range queries.
+ *
+ * <p>This field can be created in two ways:
+ *
* <ul>
- * <li>Based upon a system collator associated with a Locale.
- * <li>Based upon a tailored ruleset.
+ * <li>Based upon a system collator associated with a Locale.
+ * <li>Based upon a tailored ruleset.
* </ul>
- * <p>
- * Using a System collator:
+ *
+ * <p>Using a System collator:
+ *
* <ul>
- * <li>locale: RFC 3066 locale ID (mandatory)
- * <li>strength: 'primary','secondary','tertiary', 'quaternary', or
'identical' (optional)
- * <li>decomposition: 'no', or 'canonical' (optional)
+ * <li>locale: RFC 3066 locale ID (mandatory)
+ * <li>strength: 'primary','secondary','tertiary', 'quaternary', or
'identical' (optional)
+ * <li>decomposition: 'no', or 'canonical' (optional)
* </ul>
- * <p>
- * Using a Tailored ruleset:
+ *
+ * <p>Using a Tailored ruleset:
+ *
* <ul>
- * <li>custom: UTF-8 text file containing rules supported by
RuleBasedCollator (mandatory)
- * <li>strength: 'primary','secondary','tertiary', 'quaternary', or
'identical' (optional)
- * <li>decomposition: 'no' or 'canonical' (optional)
+ * <li>custom: UTF-8 text file containing rules supported by
RuleBasedCollator (mandatory)
+ * <li>strength: 'primary','secondary','tertiary', 'quaternary', or
'identical' (optional)
+ * <li>decomposition: 'no' or 'canonical' (optional)
* </ul>
- * <p>
- * Expert options:
+ *
+ * <p>Expert options:
+ *
* <ul>
- * <li>alternate: 'shifted' or 'non-ignorable'. Can be used to ignore
punctuation/whitespace.
- * <li>caseLevel: 'true' or 'false'. Useful with strength=primary to ignore
accents but not case.
- * <li>caseFirst: 'lower' or 'upper'. Useful to control which is sorted first
when case is not ignored.
- * <li>numeric: 'true' or 'false'. Digits are sorted according to numeric
value, e.g. foobar-9 sorts before foobar-10
- * <li>variableTop: single character or contraction. Controls what is
variable for 'alternate'
+ * <li>alternate: 'shifted' or 'non-ignorable'. Can be used to ignore
punctuation/whitespace.
+ * <li>caseLevel: 'true' or 'false'. Useful with strength=primary to ignore
accents but not case.
+ * <li>caseFirst: 'lower' or 'upper'. Useful to control which is sorted
first when case is not
+ * ignored.
+ * <li>numeric: 'true' or 'false'. Digits are sorted according to numeric
value, e.g. foobar-9
+ * sorts before foobar-10
+ * <li>variableTop: single character or contraction. Controls what is
variable for 'alternate'
* </ul>
- *
+ *
* @see Collator
* @see ULocale
* @see RuleBasedCollator
@@ -91,13 +94,20 @@ public class ICUCollationField extends FieldType {
private Analyzer analyzer;
private boolean failHardOnUdvas;
- // ICUCollation keys are not even necessarily valid UTF-8, so udvas is
pathological. See SOLR-15777
+ // ICUCollation keys are not even necessarily valid UTF-8, so udvas is
pathological. See
+ // SOLR-15777
static final Version UDVAS_FORBIDDEN_AS_OF = Version.LUCENE_9_0_0;
- static final String UDVAS_MESSAGE = "useDocValuesAsStored is forbidden for "
+ ICUCollationField.class + " as of "
- + IndexSchema.LUCENE_MATCH_VERSION_PARAM + " " +
UDVAS_FORBIDDEN_AS_OF;
+ static final String UDVAS_MESSAGE =
+ "useDocValuesAsStored is forbidden for "
+ + ICUCollationField.class
+ + " as of "
+ + IndexSchema.LUCENE_MATCH_VERSION_PARAM
+ + " "
+ + UDVAS_FORBIDDEN_AS_OF;
private static void warnOrFailUdvas(boolean failHardOnUdvas) {
- // NOTE: it may seem odd that we're checking these conditions ourselves
rather than relying on the internal
+ // NOTE: it may seem odd that we're checking these conditions ourselves
rather than relying on
+ // the internal
// checking of `assertWarnOrFail(...)`. But the main reason we're logging
this error via
// `XMLConfigFile.assertWarnOrFail(...)` is because this is at its root an
xml config file
// error, so we log in a way that's consistent with that.
@@ -114,7 +124,7 @@ public class ICUCollationField extends FieldType {
}
@Override
- protected void init(IndexSchema schema, Map<String,String> args) {
+ protected void init(IndexSchema schema, Map<String, String> args) {
failHardOnUdvas = schema.luceneVersion.onOrAfter(UDVAS_FORBIDDEN_AS_OF);
if (on(trueProperties, USE_DOCVALUES_AS_STORED)) {
// fail fast at fieldType init
@@ -125,16 +135,14 @@ public class ICUCollationField extends FieldType {
setup(schema.getResourceLoader(), args);
super.init(schema, args);
}
-
- /**
- * Setup the field according to the provided parameters
- */
- private void setup(ResourceLoader loader, Map<String,String> args) {
+
+ /** Setup the field according to the provided parameters */
+ private void setup(ResourceLoader loader, Map<String, String> args) {
String custom = args.remove("custom");
String localeID = args.remove("locale");
String strength = args.remove("strength");
String decomposition = args.remove("decomposition");
-
+
String alternate = args.remove("alternate");
String caseLevel = args.remove("caseLevel");
String caseFirst = args.remove("caseFirst");
@@ -143,38 +151,34 @@ public class ICUCollationField extends FieldType {
if (custom == null && localeID == null)
throw new SolrException(ErrorCode.SERVER_ERROR, "Either custom or locale
is required.");
-
+
if (custom != null && localeID != null)
- throw new SolrException(ErrorCode.SERVER_ERROR, "Cannot specify both
locale and custom. "
- + "To tailor rules for a built-in language, see the javadocs for
RuleBasedCollator. "
- + "Then save the entire customized ruleset to a file, and use with
the custom parameter");
-
+ throw new SolrException(
+ ErrorCode.SERVER_ERROR,
+ "Cannot specify both locale and custom. "
+ + "To tailor rules for a built-in language, see the javadocs for
RuleBasedCollator. "
+ + "Then save the entire customized ruleset to a file, and use
with the custom parameter");
+
final Collator collator;
-
- if (localeID != null) {
+
+ if (localeID != null) {
// create from a system collator, based on Locale.
collator = createFromLocale(localeID);
- } else {
+ } else {
// create from a custom ruleset
collator = createFromRules(custom, loader);
}
-
+
// set the strength flag, otherwise it will be the default.
if (strength != null) {
- if (strength.equalsIgnoreCase("primary"))
- collator.setStrength(Collator.PRIMARY);
- else if (strength.equalsIgnoreCase("secondary"))
- collator.setStrength(Collator.SECONDARY);
- else if (strength.equalsIgnoreCase("tertiary"))
- collator.setStrength(Collator.TERTIARY);
- else if (strength.equalsIgnoreCase("quaternary"))
- collator.setStrength(Collator.QUATERNARY);
- else if (strength.equalsIgnoreCase("identical"))
- collator.setStrength(Collator.IDENTICAL);
- else
- throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid strength: " +
strength);
+ if (strength.equalsIgnoreCase("primary"))
collator.setStrength(Collator.PRIMARY);
+ else if (strength.equalsIgnoreCase("secondary"))
collator.setStrength(Collator.SECONDARY);
+ else if (strength.equalsIgnoreCase("tertiary"))
collator.setStrength(Collator.TERTIARY);
+ else if (strength.equalsIgnoreCase("quaternary"))
collator.setStrength(Collator.QUATERNARY);
+ else if (strength.equalsIgnoreCase("identical"))
collator.setStrength(Collator.IDENTICAL);
+ else throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid strength:
" + strength);
}
-
+
// set the decomposition flag, otherwise it will be the default.
if (decomposition != null) {
if (decomposition.equalsIgnoreCase("no"))
@@ -184,7 +188,7 @@ public class ICUCollationField extends FieldType {
else
throw new SolrException(ErrorCode.SERVER_ERROR, "Invalid
decomposition: " + decomposition);
}
-
+
// expert options: concrete subclasses are always a RuleBasedCollator
RuleBasedCollator rbc = (RuleBasedCollator) collator;
if (alternate != null) {
@@ -217,25 +221,22 @@ public class ICUCollationField extends FieldType {
analyzer = new ICUCollationKeyAnalyzer(collator);
}
-
- /**
- * Create a locale from localeID.
- * Then return the appropriate collator for the locale.
- */
+
+ /** Create a locale from localeID. Then return the appropriate collator for
the locale. */
private Collator createFromLocale(String localeID) {
return Collator.getInstance(new ULocale(localeID));
}
-
+
/**
- * Read custom rules from a file, and create a RuleBasedCollator
- * The file cannot support comments, as # might be in the rules!
+ * Read custom rules from a file, and create a RuleBasedCollator The file
cannot support comments,
+ * as # might be in the rules!
*/
static Collator createFromRules(String fileName, ResourceLoader loader) {
InputStream input = null;
try {
- input = loader.openResource(fileName);
- String rules = IOUtils.toString(input, StandardCharsets.UTF_8);
- return new RuleBasedCollator(rules);
+ input = loader.openResource(fileName);
+ String rules = IOUtils.toString(input, StandardCharsets.UTF_8);
+ return new RuleBasedCollator(rules);
} catch (Exception e) {
// io error or invalid rules
throw new RuntimeException(e);
@@ -253,11 +254,11 @@ public class ICUCollationField extends FieldType {
public SortField getSortField(SchemaField field, boolean top) {
return getStringSort(field, top);
}
-
+
@Override
public Type getUninversionType(SchemaField sf) {
if (sf.multiValued()) {
- return Type.SORTED_SET_BINARY;
+ return Type.SORTED_SET_BINARY;
} else {
return Type.SORTED;
}
@@ -274,32 +275,36 @@ public class ICUCollationField extends FieldType {
}
/**
- * analyze the text with the analyzer, instead of the collator.
- * because icu collators are not thread safe, this keeps things
- * simple (we already have a threadlocal clone in the reused TS)
+ * analyze the text with the analyzer, instead of the collator. because icu
collators are not
+ * thread safe, this keeps things simple (we already have a threadlocal
clone in the reused TS)
*/
private BytesRef getCollationKey(String field, String text) {
try (TokenStream source = analyzer.tokenStream(field, text)) {
source.reset();
-
+
TermToBytesRefAttribute termAtt =
source.getAttribute(TermToBytesRefAttribute.class);
-
// we control the analyzer here: most errors are impossible
if (!source.incrementToken())
throw new IllegalArgumentException("analyzer returned no terms for
text: " + text);
BytesRef bytes = BytesRef.deepCopyOf(termAtt.getBytesRef());
assert !source.incrementToken();
-
+
source.end();
return bytes;
} catch (IOException e) {
throw new RuntimeException("Unable to analyze text: " + text, e);
}
}
-
+
@Override
- protected Query getSpecializedRangeQuery(QParser parser, SchemaField field,
String part1, String part2, boolean minInclusive, boolean maxInclusive) {
+ protected Query getSpecializedRangeQuery(
+ QParser parser,
+ SchemaField field,
+ String part1,
+ String part2,
+ boolean minInclusive,
+ boolean maxInclusive) {
String f = field.getName();
BytesRef low = part1 == null ? null : getCollationKey(f, part1);
BytesRef high = part2 == null ? null : getCollationKey(f, part2);
diff --git
a/solr/modules/analysis-extras/src/java/org/apache/solr/update/processor/OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java
b/solr/modules/analysis-extras/src/java/org/apache/solr/update/processor/OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java
index 911b9e6..f67c7d9 100644
---
a/solr/modules/analysis-extras/src/java/org/apache/solr/update/processor/OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java
+++
b/solr/modules/analysis-extras/src/java/org/apache/solr/update/processor/OpenNLPExtractNamedEntitiesUpdateProcessorFactory.java
@@ -17,6 +17,8 @@
package org.apache.solr.update.processor;
+import static org.apache.solr.common.SolrException.ErrorCode.SERVER_ERROR;
+
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
@@ -29,7 +31,6 @@ import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
-
import opennlp.tools.util.Span;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
@@ -55,14 +56,11 @@ import org.apache.solr.util.plugin.SolrCoreAware;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import static org.apache.solr.common.SolrException.ErrorCode.SERVER_ERROR;
-
/**
- * Extracts named entities using an OpenNLP NER <code>modelFile</code> from
the values found in
- * any matching <code>source</code> field into a configured <code>dest</code>
field, after
- * first tokenizing the source text using the index analyzer on the configured
- * <code>analyzerFieldType</code>, which must include
<code>solr.OpenNLPTokenizerFactory</code>
- * as the tokenizer. E.g.:
+ * Extracts named entities using an OpenNLP NER <code>modelFile</code> from
the values found in any
+ * matching <code>source</code> field into a configured <code>dest</code>
field, after first
+ * tokenizing the source text using the index analyzer on the configured
<code>analyzerFieldType
+ * </code>, which must include <code>solr.OpenNLPTokenizerFactory</code> as
the tokenizer. E.g.:
*
* <pre class="prettyprint">
* <fieldType name="opennlp-en-tokenization" class="solr.TextField">
@@ -74,55 +72,51 @@ import static
org.apache.solr.common.SolrException.ErrorCode.SERVER_ERROR;
* </fieldType>
* </pre>
*
- * <p>See the <a href="https://opennlp.apache.org/models.html">OpenNLP
website</a>
- * for information on downloading pre-trained models.</p>
- *
- * Note that in order to use model files larger than 1MB on SolrCloud,
- * <a
href="https://solr.apache.org/guide/setting-up-an-external-zookeeper-ensemble.html#increasing-the-file-size-limit"
+ * <p>See the <a href="https://opennlp.apache.org/models.html">OpenNLP
website</a> for information
+ * on downloading pre-trained models. Note that in order to use model files
larger than 1MB on
+ * SolrCloud, <a
+ *
href="https://solr.apache.org/guide/setting-up-an-external-zookeeper-ensemble.html#increasing-the-file-size-limit"
* >ZooKeeper server and client configuration is required</a>.
*
- * <p>
- * The <code>source</code> field(s) can be configured as either:
- * </p>
+ * <p>The <code>source</code> field(s) can be configured as either:
+ *
* <ul>
- * <li>One or more <code>&lt;str&gt;</code></li>
- * <li>An <code>&lt;arr&gt;</code> of <code>&lt;str&gt;</code></li>
- * <li>A <code>&lt;lst&gt;</code> containing
- * {@link FieldMutatingUpdateProcessor FieldMutatingUpdateProcessorFactory
style selector arguments}</li>
+ * <li>One or more <code>&lt;str&gt;</code>
+ * <li>An <code>&lt;arr&gt;</code> of <code>&lt;str&gt;</code>
+ * <li>A <code>&lt;lst&gt;</code> containing {@link
FieldMutatingUpdateProcessor
+ * FieldMutatingUpdateProcessorFactory style selector arguments}
* </ul>
*
- * <p>The <code>dest</code> field can be a single <code>&lt;str&gt;</code>
- * containing the literal name of a destination field, or it may be a
<code>&lt;lst&gt;</code> specifying a
- * regex <code>pattern</code> and a <code>replacement</code> string. If the
pattern + replacement option
- * is used the pattern will be matched against all fields matched by the
source selector, and the replacement
- * string (including any capture groups specified from the pattern) will be
evaluated using
- * {@link Matcher#replaceAll(String)} to generate the literal name of the
destination field. Additionally,
- * an occurrence of the string "{EntityType}" in the <code>dest</code> field
specification, or in the
- * <code>replacement</code> string, will be replaced with the entity type(s)
returned for each entity by
- * the OpenNLP NER model; as a result, if the model extracts more than one
entity type, then more than one
- * <code>dest</code> field will be populated.
- * </p>
+ * <p>The <code>dest</code> field can be a single <code>&lt;str&gt;</code>
containing the literal
+ * name of a destination field, or it may be a <code>&lt;lst&gt;</code>
specifying a regex <code>
+ * pattern</code> and a <code>replacement</code> string. If the pattern +
replacement option is used
+ * the pattern will be matched against all fields matched by the source
selector, and the
+ * replacement string (including any capture groups specified from the
pattern) will be evaluated
+ * using {@link Matcher#replaceAll(String)} to generate the literal name of
the destination field.
+ * Additionally, an occurrence of the string "{EntityType}" in the
<code>dest</code> field
+ * specification, or in the <code>replacement</code> string, will be replaced
with the entity
+ * type(s) returned for each entity by the OpenNLP NER model; as a result, if
the model extracts
+ * more than one entity type, then more than one <code>dest</code> field will
be populated.
+ *
+ * <p>If the resolved <code>dest</code> field already exists in the document,
then the named
+ * entities extracted from the <code>source</code> fields will be added to it.
+ *
+ * <p>In the example below:
*
- * <p>If the resolved <code>dest</code> field already exists in the document,
then the
- * named entities extracted from the <code>source</code> fields will be added
to it.
- * </p>
- * <p>
- * In the example below:
- * </p>
* <ul>
- * <li>Named entities will be extracted from the <code>text</code> field and
added
- * to the <code>names_ss</code> field</li>
- * <li>Named entities will be extracted from both the <code>title</code> and
- * <code>subtitle</code> fields and added into the
<code>titular_people</code> field</li>
- * <li>Named entities will be extracted from any field with a name ending in
<code>_txt</code>
- * -- except for <code>notes_txt</code> -- and added into the
<code>people_ss</code> field</li>
+ * <li>Named entities will be extracted from the <code>text</code> field and
added to the <code>
+ * names_ss</code> field
+ * <li>Named entities will be extracted from both the <code>title</code> and
<code>subtitle</code>
+ * fields and added into the <code>titular_people</code> field
+ * <li>Named entities will be extracted from any field with a name ending in
<code>_txt</code> --
+ * except for <code>notes_txt</code> -- and added into the
<code>people_ss</code> field
* <li>Named entities will be extracted from any field with a name beginning
with "desc" and
* ending in "s" (e.g. "descs" and "descriptions") and added to a field
prefixed with "key_",
* not ending in "s", and suffixed with "_people". (e.g.
"key_desc_people" or
- * "key_description_people")</li>
- * <li>Named entities will be extracted from the <code>summary</code> field
and added
- * to the <code>summary_person_ss</code> field, assuming that the
modelFile only extracts
- * entities of type "person".</li>
+ * "key_description_people")
+ * <li>Named entities will be extracted from the <code>summary</code> field
and added to the
+ * <code>summary_person_ss</code> field, assuming that the modelFile
only extracts entities of
+ * type "person".
* </ul>
*
* <pre class="prettyprint">
@@ -177,8 +171,8 @@ import static
org.apache.solr.common.SolrException.ErrorCode.SERVER_ERROR;
*
* @since 7.3.0
*/
-public class OpenNLPExtractNamedEntitiesUpdateProcessorFactory
- extends UpdateRequestProcessorFactory implements SolrCoreAware {
+public class OpenNLPExtractNamedEntitiesUpdateProcessorFactory extends
UpdateRequestProcessorFactory
+ implements SolrCoreAware {
private static final Logger log =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
@@ -199,8 +193,9 @@ public class
OpenNLPExtractNamedEntitiesUpdateProcessorFactory
private String analyzerFieldType = null;
/**
- * If pattern is null, this is a literal field name. If pattern is
non-null then this
- * is a replacement string that may contain meta-characters (ie: capture
group identifiers)
+ * If pattern is null, this is a literal field name. If pattern is
non-null then this is a
+ * replacement string that may contain meta-characters (ie: capture group
identifiers)
+ *
* @see #pattern
*/
private String dest = null;
@@ -210,7 +205,8 @@ public class
OpenNLPExtractNamedEntitiesUpdateProcessorFactory
protected final FieldNameSelector getSourceSelector() {
if (null != srcSelector) return srcSelector;
- throw new SolrException(SERVER_ERROR, "selector was never initialized,
inform(SolrCore) never called???");
+ throw new SolrException(
+ SERVER_ERROR, "selector was never initialized, inform(SolrCore) never
called???");
}
@Override
@@ -219,31 +215,41 @@ public class
OpenNLPExtractNamedEntitiesUpdateProcessorFactory
// high level (loose) check for which type of config we have.
//
// individual init methods do more strict syntax checking
- if (0 <= args.indexOf(SOURCE_PARAM, 0) && 0 <= args.indexOf(DEST_PARAM, 0)
) {
+ if (0 <= args.indexOf(SOURCE_PARAM, 0) && 0 <= args.indexOf(DEST_PARAM,
0)) {
initSourceSelectorSyntax(args);
} else if (0 <= args.indexOf(PATTERN_PARAM, 0) && 0 <=
args.indexOf(REPLACEMENT_PARAM, 0)) {
initSimpleRegexReplacement(args);
} else {
- throw new SolrException(SERVER_ERROR, "A combination of either '" +
SOURCE_PARAM + "' + '"+
- DEST_PARAM + "', or '" + REPLACEMENT_PARAM + "' + '" +
- PATTERN_PARAM + "' init params are mandatory");
+ throw new SolrException(
+ SERVER_ERROR,
+ "A combination of either '"
+ + SOURCE_PARAM
+ + "' + '"
+ + DEST_PARAM
+ + "', or '"
+ + REPLACEMENT_PARAM
+ + "' + '"
+ + PATTERN_PARAM
+ + "' init params are mandatory");
}
Object modelParam = args.remove(MODEL_PARAM);
if (null == modelParam) {
throw new SolrException(SERVER_ERROR, "Missing required init param '" +
MODEL_PARAM + "'");
}
- if ( ! (modelParam instanceof CharSequence)) {
+ if (!(modelParam instanceof CharSequence)) {
throw new SolrException(SERVER_ERROR, "Init param '" + MODEL_PARAM + "'
must be a <str>");
}
modelFile = modelParam.toString();
Object analyzerFieldTypeParam = args.remove(ANALYZER_FIELD_TYPE_PARAM);
if (null == analyzerFieldTypeParam) {
- throw new SolrException(SERVER_ERROR, "Missing required init param '" +
ANALYZER_FIELD_TYPE_PARAM + "'");
+ throw new SolrException(
+ SERVER_ERROR, "Missing required init param '" +
ANALYZER_FIELD_TYPE_PARAM + "'");
}
- if ( ! (analyzerFieldTypeParam instanceof CharSequence)) {
- throw new SolrException(SERVER_ERROR, "Init param '" +
ANALYZER_FIELD_TYPE_PARAM + "' must be a <str>");
+ if (!(analyzerFieldTypeParam instanceof CharSequence)) {
+ throw new SolrException(
+ SERVER_ERROR, "Init param '" + ANALYZER_FIELD_TYPE_PARAM + "' must
be a <str>");
}
analyzerFieldType = analyzerFieldTypeParam.toString();
@@ -255,22 +261,32 @@ public class
OpenNLPExtractNamedEntitiesUpdateProcessorFactory
}
/**
- * init helper method that should only be called when we know for certain
that both the
- * "source" and "dest" init params do <em>not</em> exist.
+ * init helper method that should only be called when we know for certain
that both the "source"
+ * and "dest" init params do <em>not</em> exist.
*/
private void initSimpleRegexReplacement(NamedList<?> args) {
- // The syntactic sugar for the case where there is only one regex pattern
for source and the same pattern
+ // The syntactic sugar for the case where there is only one regex pattern
for source and the
+ // same pattern
// is used for the destination pattern...
//
// pattern != null && replacement != null
//
// ...as top level elements, with no other config options specified
- // if we got here we know we had pattern and replacement, now check for
the other two so that we can give a better
+ // if we got here we know we had pattern and replacement, now check for
the other two so that
+ // we can give a better
// message than "unexpected"
- if (0 <= args.indexOf(SOURCE_PARAM, 0) || 0 <= args.indexOf(DEST_PARAM, 0)
) {
- throw new SolrException(SERVER_ERROR,"Short hand syntax must not be
mixed with full syntax. Found " +
- PATTERN_PARAM + " and " + REPLACEMENT_PARAM + " but also found " +
SOURCE_PARAM + " or " + DEST_PARAM);
+ if (0 <= args.indexOf(SOURCE_PARAM, 0) || 0 <= args.indexOf(DEST_PARAM,
0)) {
+ throw new SolrException(
+ SERVER_ERROR,
+ "Short hand syntax must not be mixed with full syntax. Found "
+ + PATTERN_PARAM
+ + " and "
+ + REPLACEMENT_PARAM
+ + " but also found "
+ + SOURCE_PARAM
+ + " or "
+ + DEST_PARAM);
}
assert args.indexOf(SOURCE_PARAM, 0) < 0;
@@ -279,39 +295,56 @@ public class
OpenNLPExtractNamedEntitiesUpdateProcessorFactory
Object replacement = args.remove(REPLACEMENT_PARAM);
if (null == patt || null == replacement) {
- throw new SolrException(SERVER_ERROR, "Init params '" + PATTERN_PARAM +
"' and '" +
- REPLACEMENT_PARAM + "' are both mandatory if '" + SOURCE_PARAM + "'
and '"+
- DEST_PARAM + "' are not both specified");
+ throw new SolrException(
+ SERVER_ERROR,
+ "Init params '"
+ + PATTERN_PARAM
+ + "' and '"
+ + REPLACEMENT_PARAM
+ + "' are both mandatory if '"
+ + SOURCE_PARAM
+ + "' and '"
+ + DEST_PARAM
+ + "' are not both specified");
}
if (0 != args.size()) {
- throw new SolrException(SERVER_ERROR, "Init params '" +
REPLACEMENT_PARAM + "' and '" +
- PATTERN_PARAM + "' must be children of '" + DEST_PARAM +
- "' to be combined with other options.");
+ throw new SolrException(
+ SERVER_ERROR,
+ "Init params '"
+ + REPLACEMENT_PARAM
+ + "' and '"
+ + PATTERN_PARAM
+ + "' must be children of '"
+ + DEST_PARAM
+ + "' to be combined with other options.");
}
if (!(replacement instanceof String)) {
- throw new SolrException(SERVER_ERROR, "Init param '" + REPLACEMENT_PARAM
+ "' must be a string (i.e. <str>)");
+ throw new SolrException(
+ SERVER_ERROR, "Init param '" + REPLACEMENT_PARAM + "' must be a
string (i.e. <str>)");
}
if (!(patt instanceof String)) {
- throw new SolrException(SERVER_ERROR, "Init param '" + PATTERN_PARAM +
"' must be a string (i.e. <str>)");
+ throw new SolrException(
+ SERVER_ERROR, "Init param '" + PATTERN_PARAM + "' must be a string
(i.e. <str>)");
}
dest = replacement.toString();
try {
this.pattern = Pattern.compile(patt.toString());
} catch (PatternSyntaxException pe) {
- throw new SolrException(SERVER_ERROR, "Init param " + PATTERN_PARAM +
- " is not a valid regex pattern: " + patt, pe);
-
+ throw new SolrException(
+ SERVER_ERROR,
+ "Init param " + PATTERN_PARAM + " is not a valid regex pattern: " +
patt,
+ pe);
}
srcInclusions = new SelectorParams();
srcInclusions.fieldRegex = Collections.singletonList(this.pattern);
}
/**
- * init helper method that should only be called when we know for certain
that both the
- * "source" and "dest" init params <em>do</em> exist.
+ * init helper method that should only be called when we know for certain
that both the "source"
+ * and "dest" init params <em>do</em> exist.
*/
private void initSourceSelectorSyntax(NamedList<?> args) {
// Full and complete syntax where source and dest are mandatory.
@@ -321,11 +354,20 @@ public class
OpenNLPExtractNamedEntitiesUpdateProcessorFactory
//
// source != null && dest != null
- // if we got here we know we had source and dest, now check for the other
two so that we can give a better
+ // if we got here we know we had source and dest, now check for the other
two so that we can
+ // give a better
// message than "unexpected"
- if (0 <= args.indexOf(PATTERN_PARAM, 0) || 0 <=
args.indexOf(REPLACEMENT_PARAM, 0) ) {
- throw new SolrException(SERVER_ERROR,"Short hand syntax must not be
mixed with full syntax. Found " +
- SOURCE_PARAM + " and " + DEST_PARAM + " but also found " +
PATTERN_PARAM + " or " + REPLACEMENT_PARAM);
+ if (0 <= args.indexOf(PATTERN_PARAM, 0) || 0 <=
args.indexOf(REPLACEMENT_PARAM, 0)) {
+ throw new SolrException(
+ SERVER_ERROR,
+ "Short hand syntax must not be mixed with full syntax. Found "
+ + SOURCE_PARAM
+ + " and "
+ + DEST_PARAM
+ + " but also found "
+ + PATTERN_PARAM
+ + " or "
+ + REPLACEMENT_PARAM);
}
Object d = args.remove(DEST_PARAM);
@@ -345,28 +387,36 @@ public class
OpenNLPExtractNamedEntitiesUpdateProcessorFactory
for (Object excObj : excList) {
if (null == excObj) {
- throw new SolrException(SERVER_ERROR, "Init param '" +
SOURCE_PARAM +
- "' child 'exclude' can not be null");
+ throw new SolrException(
+ SERVER_ERROR, "Init param '" + SOURCE_PARAM + "' child
'exclude' can not be null");
}
if (!(excObj instanceof NamedList)) {
- throw new SolrException(SERVER_ERROR, "Init param '" +
SOURCE_PARAM +
- "' child 'exclude' must be <lst/>");
+ throw new SolrException(
+ SERVER_ERROR, "Init param '" + SOURCE_PARAM + "' child
'exclude' must be <lst/>");
}
NamedList<?> exc = (NamedList<?>) excObj;
srcExclusions.add(parseSelectorParams(exc));
if (0 < exc.size()) {
- throw new SolrException(SERVER_ERROR, "Init param '" +
SOURCE_PARAM +
- "' has unexpected 'exclude' sub-param(s): '"
- + selectorConfig.getName(0) + "'");
+ throw new SolrException(
+ SERVER_ERROR,
+ "Init param '"
+ + SOURCE_PARAM
+ + "' has unexpected 'exclude' sub-param(s): '"
+ + selectorConfig.getName(0)
+ + "'");
}
// call once per instance
selectorConfig.remove("exclude");
}
if (0 < selectorConfig.size()) {
- throw new SolrException(SERVER_ERROR, "Init param '" + SOURCE_PARAM +
- "' contains unexpected child param(s): '" +
- selectorConfig.getName(0) + "'");
+ throw new SolrException(
+ SERVER_ERROR,
+ "Init param '"
+ + SOURCE_PARAM
+ + "' contains unexpected child param(s): '"
+ + selectorConfig.getName(0)
+ + "'");
}
// consume from the named list so it doesn't interfere with subsequent
processing
sources.remove(0);
@@ -377,10 +427,18 @@ public class
OpenNLPExtractNamedEntitiesUpdateProcessorFactory
srcInclusions.fieldName = new HashSet<>(args.removeConfigArgs("source"));
}
if (srcInclusions == null) {
- throw new SolrException(SERVER_ERROR,
+ throw new SolrException(
+ SERVER_ERROR,
"Init params do not specify any field from which to extract
entities, please supply either "
- + SOURCE_PARAM + " and " + DEST_PARAM + " or " + PATTERN_PARAM + "
and " + REPLACEMENT_PARAM + ". See javadocs" +
- "for OpenNLPExtractNamedEntitiesUpdateProcessor for further
details.");
+ + SOURCE_PARAM
+ + " and "
+ + DEST_PARAM
+ + " or "
+ + PATTERN_PARAM
+ + " and "
+ + REPLACEMENT_PARAM
+ + ". See javadocs"
+ + "for OpenNLPExtractNamedEntitiesUpdateProcessor for further
details.");
}
if (d instanceof NamedList) {
@@ -390,50 +448,86 @@ public class
OpenNLPExtractNamedEntitiesUpdateProcessorFactory
Object replacement = destList.remove(REPLACEMENT_PARAM);
if (null == patt || null == replacement) {
- throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "'
children '" +
- PATTERN_PARAM + "' and '" + REPLACEMENT_PARAM +
- "' are both mandatory and can not be null");
+ throw new SolrException(
+ SERVER_ERROR,
+ "Init param '"
+ + DEST_PARAM
+ + "' children '"
+ + PATTERN_PARAM
+ + "' and '"
+ + REPLACEMENT_PARAM
+ + "' are both mandatory and can not be null");
}
- if (! (patt instanceof String && replacement instanceof String)) {
- throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "'
children '" +
- PATTERN_PARAM + "' and '" + REPLACEMENT_PARAM +
- "' must both be strings (i.e. <str>)");
+ if (!(patt instanceof String && replacement instanceof String)) {
+ throw new SolrException(
+ SERVER_ERROR,
+ "Init param '"
+ + DEST_PARAM
+ + "' children '"
+ + PATTERN_PARAM
+ + "' and '"
+ + REPLACEMENT_PARAM
+ + "' must both be strings (i.e. <str>)");
}
if (0 != destList.size()) {
- throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "'
has unexpected children: '"
- + destList.getName(0) + "'");
+ throw new SolrException(
+ SERVER_ERROR,
+ "Init param '"
+ + DEST_PARAM
+ + "' has unexpected children: '"
+ + destList.getName(0)
+ + "'");
}
try {
this.pattern = Pattern.compile(patt.toString());
} catch (PatternSyntaxException pe) {
- throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "'
child '" + PATTERN_PARAM +
- " is not a valid regex pattern: " + patt, pe);
+ throw new SolrException(
+ SERVER_ERROR,
+ "Init param '"
+ + DEST_PARAM
+ + "' child '"
+ + PATTERN_PARAM
+ + " is not a valid regex pattern: "
+ + patt,
+ pe);
}
dest = replacement.toString();
} else if (d instanceof String) {
dest = d.toString();
} else {
- throw new SolrException(SERVER_ERROR, "Init param '" + DEST_PARAM + "'
must either be a string " +
- "(i.e. <str>) or a list (i.e. <lst>) containing '" +
- PATTERN_PARAM + "' and '" + REPLACEMENT_PARAM);
+ throw new SolrException(
+ SERVER_ERROR,
+ "Init param '"
+ + DEST_PARAM
+ + "' must either be a string "
+ + "(i.e. <str>) or a list (i.e. <lst>) containing '"
+ + PATTERN_PARAM
+ + "' and '"
+ + REPLACEMENT_PARAM);
}
-
}
@Override
public void inform(final SolrCore core) {
srcSelector =
- FieldMutatingUpdateProcessor.createFieldNameSelector
- (core.getResourceLoader(), core, srcInclusions,
FieldMutatingUpdateProcessor.SELECT_NO_FIELDS);
+ FieldMutatingUpdateProcessor.createFieldNameSelector(
+ core.getResourceLoader(),
+ core,
+ srcInclusions,
+ FieldMutatingUpdateProcessor.SELECT_NO_FIELDS);
for (SelectorParams exc : srcExclusions) {
- srcSelector = FieldMutatingUpdateProcessor.wrap
- (srcSelector,
- FieldMutatingUpdateProcessor.createFieldNameSelector
- (core.getResourceLoader(), core, exc,
FieldMutatingUpdateProcessor.SELECT_NO_FIELDS));
+ srcSelector =
+ FieldMutatingUpdateProcessor.wrap(
+ srcSelector,
+ FieldMutatingUpdateProcessor.createFieldNameSelector(
+ core.getResourceLoader(),
+ core,
+ exc,
+ FieldMutatingUpdateProcessor.SELECT_NO_FIELDS));
}
try {
OpenNLPOpsFactory.getNERTaggerModel(modelFile, core.getResourceLoader());
@@ -443,19 +537,24 @@ public class
OpenNLPExtractNamedEntitiesUpdateProcessorFactory
}
@Override
- public final UpdateRequestProcessor getInstance
- (SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor
next) {
+ public final UpdateRequestProcessor getInstance(
+ SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor
next) {
final FieldNameSelector srcSelector = getSourceSelector();
return new UpdateRequestProcessor(next) {
private final NLPNERTaggerOp nerTaggerOp;
private Analyzer analyzer = null;
+
{
try {
nerTaggerOp = OpenNLPOpsFactory.getNERTagger(modelFile);
FieldType fieldType =
req.getSchema().getFieldTypeByName(analyzerFieldType);
if (fieldType == null) {
- throw new SolrException
- (SERVER_ERROR, ANALYZER_FIELD_TYPE_PARAM + " '" +
analyzerFieldType + "' not found in the schema.");
+ throw new SolrException(
+ SERVER_ERROR,
+ ANALYZER_FIELD_TYPE_PARAM
+ + " '"
+ + analyzerFieldType
+ + "' not found in the schema.");
}
analyzer = fieldType.getIndexAnalyzer();
} catch (IOException e) {
@@ -470,11 +569,11 @@ public class
OpenNLPExtractNamedEntitiesUpdateProcessorFactory
// Destination may be regex replace string, or "{EntityType}" replaced
by
// each entity's type, both of which can cause multiple output fields.
- Map<String,SolrInputField> destMap = new HashMap<>();
+ Map<String, SolrInputField> destMap = new HashMap<>();
// preserve initial values
for (final String fname : doc.getFieldNames()) {
- if ( ! srcSelector.shouldMutate(fname)) continue;
+ if (!srcSelector.shouldMutate(fname)) continue;
Collection<Object> srcFieldValues = doc.getFieldValues(fname);
if (srcFieldValues == null || srcFieldValues.isEmpty()) continue;
@@ -486,14 +585,16 @@ public class
OpenNLPExtractNamedEntitiesUpdateProcessorFactory
if (matcher.find()) {
resolvedDest = matcher.replaceAll(dest);
} else {
- log.debug("srcSelector.shouldMutate('{}') returned true, " +
- "but replacement pattern did not match, field skipped.",
fname);
+ log.debug(
+ "srcSelector.shouldMutate('{}') returned true, "
+ + "but replacement pattern did not match, field
skipped.",
+ fname);
continue;
}
}
for (Object val : srcFieldValues) {
- for (Pair<String,String> entity : extractTypedNamedEntities(val)) {
+ for (Pair<String, String> entity : extractTypedNamedEntities(val))
{
SolrInputField destField = null;
String entityName = entity.first();
String entityType = entity.second();
@@ -516,15 +617,18 @@ public class
OpenNLPExtractNamedEntitiesUpdateProcessorFactory
}
}
- for (Map.Entry<String,SolrInputField> entry : destMap.entrySet()) {
+ for (Map.Entry<String, SolrInputField> entry : destMap.entrySet()) {
doc.put(entry.getKey(), entry.getValue());
}
super.processAdd(cmd);
}
- /** Using configured NER model, extracts (name, type) pairs from the
given source field value */
- private List<Pair<String,String>> extractTypedNamedEntities(Object
srcFieldValue) throws IOException {
- List<Pair<String,String>> entitiesWithType = new ArrayList<>();
+ /**
+ * Using configured NER model, extracts (name, type) pairs from the
given source field value
+ */
+ private List<Pair<String, String>> extractTypedNamedEntities(Object
srcFieldValue)
+ throws IOException {
+ List<Pair<String, String>> entitiesWithType = new ArrayList<>();
List<String> terms = new ArrayList<>();
List<Integer> startOffsets = new ArrayList<>();
List<Integer> endOffsets = new ArrayList<>();
@@ -540,24 +644,33 @@ public class
OpenNLPExtractNamedEntitiesUpdateProcessorFactory
startOffsets.add(offsetAtt.startOffset());
endOffsets.add(offsetAtt.endOffset());
boolean endOfSentence = 0 != (flagsAtt.getFlags() &
OpenNLPTokenizer.EOS_FLAG_BIT);
- if (endOfSentence) { // extract named entities one sentence at
a time
- extractEntitiesFromSentence(fullText, terms, startOffsets,
endOffsets, entitiesWithType);
+ if (endOfSentence) { // extract named entities one sentence at a
time
+ extractEntitiesFromSentence(
+ fullText, terms, startOffsets, endOffsets, entitiesWithType);
}
}
tokenStream.end();
tokenStream.close();
- if (!terms.isEmpty()) { // In case last token of last sentence isn't
properly flagged with EOS_FLAG_BIT
- extractEntitiesFromSentence(fullText, terms, startOffsets,
endOffsets, entitiesWithType);
+ if (!terms.isEmpty()) { // In case last token of last sentence isn't
properly flagged with
+ // EOS_FLAG_BIT
+ extractEntitiesFromSentence(
+ fullText, terms, startOffsets, endOffsets, entitiesWithType);
}
- nerTaggerOp.reset(); // Forget all adaptive data collected
during previous calls
+ nerTaggerOp.reset(); // Forget all adaptive data collected during
previous calls
}
return entitiesWithType;
}
- private void extractEntitiesFromSentence(String fullText, List<String>
terms, List<Integer> startOffsets,
- List<Integer> endOffsets,
List<Pair<String,String>> entitiesWithType) {
+ private void extractEntitiesFromSentence(
+ String fullText,
+ List<String> terms,
+ List<Integer> startOffsets,
+ List<Integer> endOffsets,
+ List<Pair<String, String>> entitiesWithType) {
for (Span span : nerTaggerOp.getNames(terms.toArray(new
String[terms.size()]))) {
- String text = fullText.substring(startOffsets.get(span.getStart()),
endOffsets.get(span.getEnd() - 1));
+ String text =
+ fullText.substring(
+ startOffsets.get(span.getStart()),
endOffsets.get(span.getEnd() - 1));
entitiesWithType.add(new Pair<>(text, span.getType()));
}
terms.clear();
diff --git
a/solr/modules/analysis-extras/src/test/org/apache/solr/analysis/TestFoldingMultitermExtrasQuery.java
b/solr/modules/analysis-extras/src/test/org/apache/solr/analysis/TestFoldingMultitermExtrasQuery.java
index b2cdbc2..f6138cd 100644
---
a/solr/modules/analysis-extras/src/test/org/apache/solr/analysis/TestFoldingMultitermExtrasQuery.java
+++
b/solr/modules/analysis-extras/src/test/org/apache/solr/analysis/TestFoldingMultitermExtrasQuery.java
@@ -17,13 +17,13 @@
package org.apache.solr.analysis;
import java.io.File;
-
import org.apache.commons.io.FileUtils;
import org.apache.solr.SolrTestCaseJ4;
import org.junit.BeforeClass;
import org.junit.Test;
-// See: https://issues.apache.org/jira/browse/SOLR-12028 Tests cannot remove
files on Windows machines occasionally
+// See: https://issues.apache.org/jira/browse/SOLR-12028 Tests cannot remove
files on Windows
+// machines occasionally
public class TestFoldingMultitermExtrasQuery extends SolrTestCaseJ4 {
public String getCoreName() {
@@ -34,7 +34,7 @@ public class TestFoldingMultitermExtrasQuery extends
SolrTestCaseJ4 {
public static void beforeTests() throws Exception {
File testHome = createTempDir().toFile();
FileUtils.copyDirectory(getFile("analysis-extras/solr"), testHome);
- initCore("solrconfig-icucollate.xml","schema-folding-extra.xml",
testHome.getAbsolutePath());
+ initCore("solrconfig-icucollate.xml", "schema-folding-extra.xml",
testHome.getAbsolutePath());
int idx = 1;
// ICUFoldingFilterFactory
@@ -72,6 +72,7 @@ public class TestFoldingMultitermExtrasQuery extends
SolrTestCaseJ4 {
assertQ(req("q", "content_icufolding:RE\u0301su*"),
"//result[@numFound='2']");
assertQ(req("q", "content_icufolding:El*"), "//result[@numFound='2']");
}
+
@Test
public void testICUNormalizer2() {
assertQ(req("q", "content_icunormalizer2:BadMagicicuFold*"),
"//result[@numFound='1']");
@@ -80,7 +81,7 @@ public class TestFoldingMultitermExtrasQuery extends
SolrTestCaseJ4 {
assertQ(req("q", "content_icunormalizer2:re\u0301Su*"),
"//result[@numFound='2']");
assertQ(req("q", "content_icunormalizer2:eL*"), "//result[@numFound='2']");
}
-
+
public void testICUTransform() {
assertQ(req("q", "content_icutransform:Росс*"), "//result[@numFound='1']");
}
diff --git
a/solr/modules/analysis-extras/src/test/org/apache/solr/schema/TestICUCollationField.java
b/solr/modules/analysis-extras/src/test/org/apache/solr/schema/TestICUCollationField.java
index 8fdafa9..8b5294d 100644
---
a/solr/modules/analysis-extras/src/test/org/apache/solr/schema/TestICUCollationField.java
+++
b/solr/modules/analysis-extras/src/test/org/apache/solr/schema/TestICUCollationField.java
@@ -16,31 +16,27 @@
*/
package org.apache.solr.schema;
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.RuleBasedCollator;
+import com.ibm.icu.util.ULocale;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
-
import org.apache.commons.io.FileUtils;
import org.apache.lucene.analysis.util.FilesystemResourceLoader;
import org.apache.lucene.util.ResourceLoader;
import org.apache.solr.SolrTestCaseJ4;
import org.junit.BeforeClass;
-
-import com.ibm.icu.text.Collator;
-import com.ibm.icu.text.RuleBasedCollator;
-import com.ibm.icu.util.ULocale;
import org.mockito.Mockito;
-/**
- * Tests {@link ICUCollationField} with TermQueries, RangeQueries, and sort
order.
- */
+/** Tests {@link ICUCollationField} with TermQueries, RangeQueries, and sort
order. */
public class TestICUCollationField extends SolrTestCaseJ4 {
-
+
@BeforeClass
public static void beforeClass() throws Exception {
String home = setupSolrHome();
- initCore("solrconfig.xml","schema.xml", home);
+ initCore("solrconfig.xml", "schema.xml", home);
// add some docs
assertU(adoc("id", "1", "text", "\u0633\u0627\u0628"));
assertU(adoc("id", "2", "text", "I WİLL USE TURKİSH CASING"));
@@ -56,33 +52,38 @@ public class TestICUCollationField extends SolrTestCaseJ4 {
assertU(adoc("id", "12", "text", "\u0698\u0698"));
assertU(commit());
}
-
+
/**
- * Ugly: but what to do? We want to test custom sort, which reads rules in
as a resource.
- * These are largish files, and jvm-specific (as our documentation says, you
should always
- * look out for jvm differences with collation).
- * So it's preferable to create this file on-the-fly.
+ * Ugly: but what to do? We want to test custom sort, which reads rules in
as a resource. These
+ * are largish files, and jvm-specific (as our documentation says, you
should always look out for
+ * jvm differences with collation). So it's preferable to create this file
on-the-fly.
*/
public static String setupSolrHome() throws Exception {
String tmpFile = createTempDir().toFile().getAbsolutePath();
// make data and conf dirs
- new File(tmpFile + "/collection1", "data").mkdirs();
+ new File(tmpFile + "/collection1", "data").mkdirs();
File confDir = new File(tmpFile + "/collection1", "conf");
confDir.mkdirs();
-
+
// copy over configuration files
- FileUtils.copyFile(getFile("analysis-extras/solr/collection1/conf/solrconfig-icucollate.xml"), new File(confDir, "solrconfig.xml"));
- FileUtils.copyFile(getFile("analysis-extras/solr/collection1/conf/schema-icucollate.xml"), new File(confDir, "schema.xml"));
-
+ FileUtils.copyFile(
+ getFile("analysis-extras/solr/collection1/conf/solrconfig-icucollate.xml"),
+ new File(confDir, "solrconfig.xml"));
+ FileUtils.copyFile(
+ getFile("analysis-extras/solr/collection1/conf/schema-icucollate.xml"),
+ new File(confDir, "schema.xml"));
+
// generate custom collation rules (DIN 5007-2), saving to customrules.dat
- RuleBasedCollator baseCollator = (RuleBasedCollator)
Collator.getInstance(new ULocale("de", "DE"));
+ RuleBasedCollator baseCollator =
+ (RuleBasedCollator) Collator.getInstance(new ULocale("de", "DE"));
String DIN5007_2_tailorings =
- "& ae , a\u0308 & AE , A\u0308"+
- "& oe , o\u0308 & OE , O\u0308"+
- "& ue , u\u0308 & UE , u\u0308";
+ "& ae , a\u0308 & AE , A\u0308"
+ + "& oe , o\u0308 & OE , O\u0308"
+ + "& ue , u\u0308 & UE , u\u0308";
- RuleBasedCollator tailoredCollator = new
RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
+ RuleBasedCollator tailoredCollator =
+ new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
String tailoredRules = tailoredCollator.getRules();
final String osFileName = "customrules.dat";
Files.writeString(confDir.toPath().resolve(osFileName), tailoredRules,
StandardCharsets.UTF_8);
@@ -93,9 +94,11 @@ public class TestICUCollationField extends SolrTestCaseJ4 {
if (random().nextBoolean()) {
loader = Mockito.mock(ResourceLoader.class);
Mockito.when(loader.openResource(Mockito.anyString()))
- .thenReturn(new
ByteArrayInputStream(tailoredRules.getBytes(StandardCharsets.UTF_8)));
+ .thenReturn(new
ByteArrayInputStream(tailoredRules.getBytes(StandardCharsets.UTF_8)));
} else {
- loader = new FilesystemResourceLoader(confDir.toPath(),
TestICUCollationField.class.getClassLoader());
+ loader =
+ new FilesystemResourceLoader(
+ confDir.toPath(), TestICUCollationField.class.getClassLoader());
}
final Collator readCollator =
ICUCollationField.createFromRules(osFileName, loader);
assertEquals(tailoredCollator, readCollator);
@@ -103,93 +106,89 @@ public class TestICUCollationField extends SolrTestCaseJ4
{
return tmpFile;
}
- /**
- * Test termquery with german DIN 5007-1 primary strength.
- * In this case, ö is equivalent to o (but not oe)
+ /**
+ * Test termquery with german DIN 5007-1 primary strength. In this case, ö
is equivalent to o (but
+ * not oe)
*/
public void testBasicTermQuery() {
- assertQ("Collated TQ: ",
- req("fl", "id", "q", "sort_de:tone", "sort", "id asc" ),
- "//*[@numFound='2']",
- "//result/doc[1]/str[@name='id'][.=4]",
- "//result/doc[2]/str[@name='id'][.=7]"
- );
+ assertQ(
+ "Collated TQ: ",
+ req("fl", "id", "q", "sort_de:tone", "sort", "id asc"),
+ "//*[@numFound='2']",
+ "//result/doc[1]/str[@name='id'][.=4]",
+ "//result/doc[2]/str[@name='id'][.=7]");
}
-
- /**
- * Test rangequery again with the DIN 5007-1 collator.
- * We do a range query of tone .. tp, in binary order this
- * would retrieve nothing due to case and accent differences.
+
+ /**
+ * Test rangequery again with the DIN 5007-1 collator. We do a range query
of tone .. tp, in
+ * binary order this would retrieve nothing due to case and accent
differences.
*/
public void testBasicRangeQuery() {
- assertQ("Collated RangeQ: ",
- req("fl", "id", "q", "sort_de:[tone TO tp]", "sort", "id asc" ),
- "//*[@numFound='2']",
- "//result/doc[1]/str[@name='id'][.=4]",
- "//result/doc[2]/str[@name='id'][.=7]"
- );
+ assertQ(
+ "Collated RangeQ: ",
+ req("fl", "id", "q", "sort_de:[tone TO tp]", "sort", "id asc"),
+ "//*[@numFound='2']",
+ "//result/doc[1]/str[@name='id'][.=4]",
+ "//result/doc[2]/str[@name='id'][.=7]");
}
-
- /**
- * Test sort with a danish collator. ö is ordered after z
- */
+
+ /** Test sort with a danish collator. ö is ordered after z */
public void testBasicSort() {
- assertQ("Collated Sort: ",
- req("fl", "id", "q", "sort_da:[tz TO töz]", "sort", "sort_da asc" ),
- "//*[@numFound='2']",
- "//result/doc[1]/str[@name='id'][.=11]",
- "//result/doc[2]/str[@name='id'][.=4]"
- );
+ assertQ(
+ "Collated Sort: ",
+ req("fl", "id", "q", "sort_da:[tz TO töz]", "sort", "sort_da asc"),
+ "//*[@numFound='2']",
+ "//result/doc[1]/str[@name='id'][.=11]",
+ "//result/doc[2]/str[@name='id'][.=4]");
}
-
- /**
- * Test sort with an arabic collator. U+0633 is ordered after U+0698.
- * With a binary collator, the range would also return nothing.
+
+ /**
+ * Test sort with an arabic collator. U+0633 is ordered after U+0698. With a
binary collator, the
+ * range would also return nothing.
*/
public void testArabicSort() {
- assertQ("Collated Sort: ",
- req("fl", "id", "q", "sort_ar:[\u0698 TO \u0633\u0633]", "sort",
"sort_ar asc" ),
- "//*[@numFound='2']",
- "//result/doc[1]/str[@name='id'][.=12]",
- "//result/doc[2]/str[@name='id'][.=1]"
- );
+ assertQ(
+ "Collated Sort: ",
+ req("fl", "id", "q", "sort_ar:[\u0698 TO \u0633\u0633]", "sort",
"sort_ar asc"),
+ "//*[@numFound='2']",
+ "//result/doc[1]/str[@name='id'][.=12]",
+ "//result/doc[2]/str[@name='id'][.=1]");
}
- /**
- * Test rangequery again with an Arabic collator.
- * Binary order would normally order U+0633 in this range.
+ /**
+ * Test rangequery again with an Arabic collator. Binary order would
normally order U+0633 in this
+ * range.
*/
public void testNegativeRangeQuery() {
- assertQ("Collated RangeQ: ",
- req("fl", "id", "q", "sort_ar:[\u062F TO \u0698]", "sort", "id asc" ),
- "//*[@numFound='0']"
- );
+ assertQ(
+ "Collated RangeQ: ",
+ req("fl", "id", "q", "sort_ar:[\u062F TO \u0698]", "sort", "id asc"),
+ "//*[@numFound='0']");
}
/**
- * Test canonical decomposition with turkish primary strength.
- * With this sort order, İ is the uppercase form of i, and I is the
uppercase form of ı.
- * We index a decomposed form of İ.
+ * Test canonical decomposition with turkish primary strength. With this
sort order, İ is the
+ * uppercase form of i, and I is the uppercase form of ı. We index a
decomposed form of İ.
*/
public void testCanonicalDecomposition() {
- assertQ("Collated TQ: ",
- req("fl", "id", "q", "sort_tr_canon:\"I Will Use Turkish Casıng\"",
"sort", "id asc" ),
- "//*[@numFound='3']",
- "//result/doc[1]/str[@name='id'][.=2]",
- "//result/doc[2]/str[@name='id'][.=3]",
- "//result/doc[3]/str[@name='id'][.=5]"
- );
+ assertQ(
+ "Collated TQ: ",
+ req("fl", "id", "q", "sort_tr_canon:\"I Will Use Turkish Casıng\"",
"sort", "id asc"),
+ "//*[@numFound='3']",
+ "//result/doc[1]/str[@name='id'][.=2]",
+ "//result/doc[2]/str[@name='id'][.=3]",
+ "//result/doc[3]/str[@name='id'][.=5]");
}
-
- /**
- * Test termquery with custom collator (DIN 5007-2).
- * In this case, ö is equivalent to oe (but not o)
+
+ /**
+ * Test termquery with custom collator (DIN 5007-2). In this case, ö is
equivalent to oe (but not
+ * o)
*/
public void testCustomCollation() {
- assertQ("Collated TQ: ",
+ assertQ(
+ "Collated TQ: ",
req("fl", "id", "q", "sort_custom:toene"),
- "//*[@numFound='2']",
- "//result/doc/str[@name='id'][.=4]",
- "//result/doc/str[@name='id'][.=10]"
- );
+ "//*[@numFound='2']",
+ "//result/doc/str[@name='id'][.=4]",
+ "//result/doc/str[@name='id'][.=10]");
}
}
diff --git
a/solr/modules/analysis-extras/src/test/org/apache/solr/schema/TestICUCollationFieldDocValues.java
b/solr/modules/analysis-extras/src/test/org/apache/solr/schema/TestICUCollationFieldDocValues.java
index 2ebc72b..322a915 100644
---
a/solr/modules/analysis-extras/src/test/org/apache/solr/schema/TestICUCollationFieldDocValues.java
+++
b/solr/modules/analysis-extras/src/test/org/apache/solr/schema/TestICUCollationFieldDocValues.java
@@ -16,27 +16,23 @@
*/
package org.apache.solr.schema;
+import com.ibm.icu.text.Collator;
+import com.ibm.icu.text.RuleBasedCollator;
+import com.ibm.icu.util.ULocale;
import java.io.File;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
-
import org.apache.commons.io.FileUtils;
import org.apache.solr.SolrTestCaseJ4;
import org.junit.BeforeClass;
-import com.ibm.icu.text.Collator;
-import com.ibm.icu.text.RuleBasedCollator;
-import com.ibm.icu.util.ULocale;
-
-/**
- * Tests {@link ICUCollationField} with docValues.
- */
+/** Tests {@link ICUCollationField} with docValues. */
public class TestICUCollationFieldDocValues extends SolrTestCaseJ4 {
-
+
@BeforeClass
public static void beforeClass() throws Exception {
String home = setupSolrHome();
- initCore("solrconfig.xml","schema.xml", home);
+ initCore("solrconfig.xml", "schema.xml", home);
// add some docs
assertU(adoc("id", "1", "text", "\u0633\u0627\u0628"));
assertU(adoc("id", "2", "text", "I WİLL USE TURKİSH CASING"));
@@ -52,127 +48,129 @@ public class TestICUCollationFieldDocValues extends
SolrTestCaseJ4 {
assertU(adoc("id", "12", "text", "\u0698\u0698"));
assertU(commit());
}
-
+
/**
- * Ugly: but what to do? We want to test custom sort, which reads rules in
as a resource.
- * These are largish files, and jvm-specific (as our documentation says, you
should always
- * look out for jvm differences with collation).
- * So it's preferable to create this file on-the-fly.
+ * Ugly: but what to do? We want to test custom sort, which reads rules in
as a resource. These
+ * are largish files, and jvm-specific (as our documentation says, you
should always look out for
+ * jvm differences with collation). So it's preferable to create this file
on-the-fly.
*/
public static String setupSolrHome() throws Exception {
File tmpFile = createTempDir().toFile();
-
+
// make data and conf dirs
new File(tmpFile + "/collection1", "data").mkdirs();
File confDir = new File(tmpFile + "/collection1", "conf");
confDir.mkdirs();
-
+
// copy over configuration files
- FileUtils.copyFile(getFile("analysis-extras/solr/collection1/conf/solrconfig-icucollate.xml"), new File(confDir, "solrconfig.xml"));
- FileUtils.copyFile(getFile("analysis-extras/solr/collection1/conf/schema-icucollate-dv.xml"), new File(confDir, "schema.xml"));
-
+ FileUtils.copyFile(
+ getFile("analysis-extras/solr/collection1/conf/solrconfig-icucollate.xml"),
+ new File(confDir, "solrconfig.xml"));
+ FileUtils.copyFile(
+ getFile("analysis-extras/solr/collection1/conf/schema-icucollate-dv.xml"),
+ new File(confDir, "schema.xml"));
+
// generate custom collation rules (DIN 5007-2), saving to customrules.dat
- RuleBasedCollator baseCollator = (RuleBasedCollator)
Collator.getInstance(new ULocale("de", "DE"));
+ RuleBasedCollator baseCollator =
+ (RuleBasedCollator) Collator.getInstance(new ULocale("de", "DE"));
String DIN5007_2_tailorings =
- "& ae , a\u0308 & AE , A\u0308"+
- "& oe , o\u0308 & OE , O\u0308"+
- "& ue , u\u0308 & UE , u\u0308";
+ "& ae , a\u0308 & AE , A\u0308"
+ + "& oe , o\u0308 & OE , O\u0308"
+ + "& ue , u\u0308 & UE , u\u0308";
- RuleBasedCollator tailoredCollator = new
RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
+ RuleBasedCollator tailoredCollator =
+ new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
String tailoredRules = tailoredCollator.getRules();
- Files.writeString(confDir.toPath().resolve("customrules.dat"),
tailoredRules, StandardCharsets.UTF_8);
+ Files.writeString(
+ confDir.toPath().resolve("customrules.dat"), tailoredRules,
StandardCharsets.UTF_8);
return tmpFile.getAbsolutePath();
}
- /**
- * Test termquery with german DIN 5007-1 primary strength.
- * In this case, ö is equivalent to o (but not oe)
+ /**
+ * Test termquery with german DIN 5007-1 primary strength. In this case, ö
is equivalent to o (but
+ * not oe)
*/
public void testBasicTermQuery() {
- assertQ("Collated TQ: ",
- req("fl", "id", "q", "sort_de:tone", "sort", "id asc" ),
- "//*[@numFound='2']",
- "//result/doc[1]/str[@name='id'][.=4]",
- "//result/doc[2]/str[@name='id'][.=7]"
- );
+ assertQ(
+ "Collated TQ: ",
+ req("fl", "id", "q", "sort_de:tone", "sort", "id asc"),
+ "//*[@numFound='2']",
+ "//result/doc[1]/str[@name='id'][.=4]",
+ "//result/doc[2]/str[@name='id'][.=7]");
}
-
- /**
- * Test rangequery again with the DIN 5007-1 collator.
- * We do a range query of tone .. tp, in binary order this
- * would retrieve nothing due to case and accent differences.
+
+ /**
+ * Test rangequery again with the DIN 5007-1 collator. We do a range query
of tone .. tp, in
+ * binary order this would retrieve nothing due to case and accent
differences.
*/
public void testBasicRangeQuery() {
- assertQ("Collated RangeQ: ",
- req("fl", "id", "q", "sort_de:[tone TO tp]", "sort", "id asc" ),
- "//*[@numFound='2']",
- "//result/doc[1]/str[@name='id'][.=4]",
- "//result/doc[2]/str[@name='id'][.=7]"
- );
+ assertQ(
+ "Collated RangeQ: ",
+ req("fl", "id", "q", "sort_de:[tone TO tp]", "sort", "id asc"),
+ "//*[@numFound='2']",
+ "//result/doc[1]/str[@name='id'][.=4]",
+ "//result/doc[2]/str[@name='id'][.=7]");
}
-
- /**
- * Test sort with a danish collator. ö is ordered after z
- */
+
+ /** Test sort with a danish collator. ö is ordered after z */
public void testBasicSort() {
- assertQ("Collated Sort: ",
- req("fl", "id", "q", "sort_da:[tz TO töz]", "sort", "sort_da asc" ),
- "//*[@numFound='2']",
- "//result/doc[1]/str[@name='id'][.=11]",
- "//result/doc[2]/str[@name='id'][.=4]"
- );
+ assertQ(
+ "Collated Sort: ",
+ req("fl", "id", "q", "sort_da:[tz TO töz]", "sort", "sort_da asc"),
+ "//*[@numFound='2']",
+ "//result/doc[1]/str[@name='id'][.=11]",
+ "//result/doc[2]/str[@name='id'][.=4]");
}
-
- /**
- * Test sort with an arabic collator. U+0633 is ordered after U+0698.
- * With a binary collator, the range would also return nothing.
+
+ /**
+ * Test sort with an arabic collator. U+0633 is ordered after U+0698. With a
binary collator, the
+ * range would also return nothing.
*/
public void testArabicSort() {
- assertQ("Collated Sort: ",
- req("fl", "id", "q", "sort_ar:[\u0698 TO \u0633\u0633]", "sort",
"sort_ar asc" ),
- "//*[@numFound='2']",
- "//result/doc[1]/str[@name='id'][.=12]",
- "//result/doc[2]/str[@name='id'][.=1]"
- );
+ assertQ(
+ "Collated Sort: ",
+ req("fl", "id", "q", "sort_ar:[\u0698 TO \u0633\u0633]", "sort",
"sort_ar asc"),
+ "//*[@numFound='2']",
+ "//result/doc[1]/str[@name='id'][.=12]",
+ "//result/doc[2]/str[@name='id'][.=1]");
}
- /**
- * Test rangequery again with an Arabic collator.
- * Binary order would normally order U+0633 in this range.
+ /**
+ * Test rangequery again with an Arabic collator. Binary order would
normally order U+0633 in this
+ * range.
*/
public void testNegativeRangeQuery() {
- assertQ("Collated RangeQ: ",
- req("fl", "id", "q", "sort_ar:[\u062F TO \u0698]", "sort", "id asc" ),
- "//*[@numFound='0']"
- );
+ assertQ(
+ "Collated RangeQ: ",
+ req("fl", "id", "q", "sort_ar:[\u062F TO \u0698]", "sort", "id asc"),
+ "//*[@numFound='0']");
}
/**
- * Test canonical decomposition with turkish primary strength.
- * With this sort order, İ is the uppercase form of i, and I is the
uppercase form of ı.
- * We index a decomposed form of İ.
+ * Test canonical decomposition with turkish primary strength. With this
sort order, İ is the
+ * uppercase form of i, and I is the uppercase form of ı. We index a
decomposed form of İ.
*/
public void testCanonicalDecomposition() {
- assertQ("Collated TQ: ",
- req("fl", "id", "q", "sort_tr_canon:\"I Will Use Turkish Casıng\"",
"sort", "id asc" ),
- "//*[@numFound='3']",
- "//result/doc[1]/str[@name='id'][.=2]",
- "//result/doc[2]/str[@name='id'][.=3]",
- "//result/doc[3]/str[@name='id'][.=5]"
- );
+ assertQ(
+ "Collated TQ: ",
+ req("fl", "id", "q", "sort_tr_canon:\"I Will Use Turkish Casıng\"",
"sort", "id asc"),
+ "//*[@numFound='3']",
+ "//result/doc[1]/str[@name='id'][.=2]",
+ "//result/doc[2]/str[@name='id'][.=3]",
+ "//result/doc[3]/str[@name='id'][.=5]");
}
-
- /**
- * Test termquery with custom collator (DIN 5007-2).
- * In this case, ö is equivalent to oe (but not o)
+
+ /**
+ * Test termquery with custom collator (DIN 5007-2). In this case, ö is
equivalent to oe (but not
+ * o)
*/
public void testCustomCollation() {
- assertQ("Collated TQ: ",
+ assertQ(
+ "Collated TQ: ",
req("fl", "id", "q", "sort_custom:toene"),
- "//*[@numFound='2']",
- "//result/doc/str[@name='id'][.=4]",
- "//result/doc/str[@name='id'][.=10]"
- );
+ "//*[@numFound='2']",
+ "//result/doc/str[@name='id'][.=4]",
+ "//result/doc/str[@name='id'][.=10]");
}
}
diff --git
a/solr/modules/analysis-extras/src/test/org/apache/solr/schema/TestICUCollationFieldOptions.java
b/solr/modules/analysis-extras/src/test/org/apache/solr/schema/TestICUCollationFieldOptions.java
index 0b198b7..63f8b00 100644
---
a/solr/modules/analysis-extras/src/test/org/apache/solr/schema/TestICUCollationFieldOptions.java
+++
b/solr/modules/analysis-extras/src/test/org/apache/solr/schema/TestICUCollationFieldOptions.java
@@ -16,21 +16,19 @@
*/
package org.apache.solr.schema;
+import java.io.File;
import org.apache.commons.io.FileUtils;
import org.apache.solr.SolrTestCaseJ4;
import org.junit.BeforeClass;
-import java.io.File;
-
-/**
- * Tests expert options of {@link ICUCollationField}.
- */
+/** Tests expert options of {@link ICUCollationField}. */
public class TestICUCollationFieldOptions extends SolrTestCaseJ4 {
@BeforeClass
public static void beforeClass() throws Exception {
File testHome = createTempDir().toFile();
FileUtils.copyDirectory(getFile("analysis-extras/solr"), testHome);
- initCore("solrconfig-icucollate.xml","schema-icucollateoptions.xml",
testHome.getAbsolutePath());
+ initCore(
+ "solrconfig-icucollate.xml", "schema-icucollateoptions.xml",
testHome.getAbsolutePath());
// add some docs
assertU(adoc("id", "1", "text", "foo-bar"));
assertU(adoc("id", "2", "text", "foo bar"));
@@ -43,77 +41,77 @@ public class TestICUCollationFieldOptions extends
SolrTestCaseJ4 {
assertU(adoc("id", "9", "text", "résumé"));
assertU(commit());
}
-
+
/*
* Setting alternate=shifted to shift whitespace, punctuation and symbols
- * to quaternary level
+ * to quaternary level
*/
- public void testIgnorePunctuation() {
- assertQ("Collated TQ: ",
- req("fl", "id", "q", "sort_ignore_punctuation:foobar", "sort", "id
asc" ),
- "//*[@numFound='3']",
- "//result/doc[1]/str[@name='id'][.=1]",
- "//result/doc[2]/str[@name='id'][.=2]",
- "//result/doc[3]/str[@name='id'][.=3]"
- );
+ public void testIgnorePunctuation() {
+ assertQ(
+ "Collated TQ: ",
+ req("fl", "id", "q", "sort_ignore_punctuation:foobar", "sort", "id
asc"),
+ "//*[@numFound='3']",
+ "//result/doc[1]/str[@name='id'][.=1]",
+ "//result/doc[2]/str[@name='id'][.=2]",
+ "//result/doc[3]/str[@name='id'][.=3]");
}
-
+
/*
- * Setting alternate=shifted and variableTop to shift whitespace, but not
- * punctuation or symbols, to quaternary level
+ * Setting alternate=shifted and variableTop to shift whitespace, but not
+ * punctuation or symbols, to quaternary level
*/
public void testIgnoreWhitespace() {
- assertQ("Collated TQ: ",
- req("fl", "id", "q", "sort_ignore_space:\"foo bar\"", "sort", "id asc"
),
- "//*[@numFound='2']",
- "//result/doc[1]/str[@name='id'][.=2]",
- "//result/doc[2]/str[@name='id'][.=3]"
- );
+ assertQ(
+ "Collated TQ: ",
+ req("fl", "id", "q", "sort_ignore_space:\"foo bar\"", "sort", "id
asc"),
+ "//*[@numFound='2']",
+ "//result/doc[1]/str[@name='id'][.=2]",
+ "//result/doc[2]/str[@name='id'][.=3]");
}
-
+
/*
* Setting numeric to encode digits with numeric value, so that
* foobar-9 sorts before foobar-10
*/
public void testNumerics() {
- assertQ("Collated sort: ",
- req("fl", "id", "q", "id:[4 TO 5]", "sort", "sort_numerics asc" ),
- "//*[@numFound='2']",
- "//result/doc[1]/str[@name='id'][.=5]",
- "//result/doc[2]/str[@name='id'][.=4]"
- );
+ assertQ(
+ "Collated sort: ",
+ req("fl", "id", "q", "id:[4 TO 5]", "sort", "sort_numerics asc"),
+ "//*[@numFound='2']",
+ "//result/doc[1]/str[@name='id'][.=5]",
+ "//result/doc[2]/str[@name='id'][.=4]");
}
-
+
/*
* Setting caseLevel=true to create an additional case level between
* secondary and tertiary
*/
public void testIgnoreAccentsButNotCase() {
- assertQ("Collated TQ: ",
- req("fl", "id", "q", "sort_ignore_accents:resume", "sort", "id asc" ),
- "//*[@numFound='2']",
- "//result/doc[1]/str[@name='id'][.=6]",
- "//result/doc[2]/str[@name='id'][.=9]"
- );
-
- assertQ("Collated TQ: ",
- req("fl", "id", "q", "sort_ignore_accents:Resume", "sort", "id asc" ),
- "//*[@numFound='2']",
- "//result/doc[1]/str[@name='id'][.=7]",
- "//result/doc[2]/str[@name='id'][.=8]"
- );
+ assertQ(
+ "Collated TQ: ",
+ req("fl", "id", "q", "sort_ignore_accents:resume", "sort", "id asc"),
+ "//*[@numFound='2']",
+ "//result/doc[1]/str[@name='id'][.=6]",
+ "//result/doc[2]/str[@name='id'][.=9]");
+
+ assertQ(
+ "Collated TQ: ",
+ req("fl", "id", "q", "sort_ignore_accents:Resume", "sort", "id asc"),
+ "//*[@numFound='2']",
+ "//result/doc[1]/str[@name='id'][.=7]",
+ "//result/doc[2]/str[@name='id'][.=8]");
}
-
+
/*
* Setting caseFirst=upper to cause uppercase strings to sort
* before lowercase ones.
*/
public void testUpperCaseFirst() {
- assertQ("Collated sort: ",
- req("fl", "id", "q", "id:6 OR id:8", "sort", "sort_uppercase_first
asc" ),
- "//*[@numFound='2']",
- "//result/doc[1]/str[@name='id'][.=8]",
- "//result/doc[2]/str[@name='id'][.=6]"
- );
+ assertQ(
+ "Collated sort: ",
+ req("fl", "id", "q", "id:6 OR id:8", "sort", "sort_uppercase_first
asc"),
+ "//*[@numFound='2']",
+ "//result/doc[1]/str[@name='id'][.=8]",
+ "//result/doc[2]/str[@name='id'][.=6]");
}
}
diff --git
a/solr/modules/analysis-extras/src/test/org/apache/solr/schema/TestICUCollationFieldUDVAS.java
b/solr/modules/analysis-extras/src/test/org/apache/solr/schema/TestICUCollationFieldUDVAS.java
index 83fc584..a57a45a 100644
---
a/solr/modules/analysis-extras/src/test/org/apache/solr/schema/TestICUCollationFieldUDVAS.java
+++
b/solr/modules/analysis-extras/src/test/org/apache/solr/schema/TestICUCollationFieldUDVAS.java
@@ -24,8 +24,8 @@ import org.apache.solr.util.LogListener;
import org.junit.BeforeClass;
/**
- * Tests warn/failure of {@link ICUCollationField} when schema explicitly sets
- * <code>useDocValuesAsStored="true"</code>
+ * Tests warn/failure of {@link ICUCollationField} when schema explicitly sets
<code>
+ * useDocValuesAsStored="true"</code>
*/
public class TestICUCollationFieldUDVAS extends SolrTestCaseJ4 {
@@ -40,7 +40,11 @@ public class TestICUCollationFieldUDVAS extends
SolrTestCaseJ4 {
home = TestICUCollationFieldDocValues.setupSolrHome();
}
- private enum Mode { OK, WARN, FAIL }
+ private enum Mode {
+ OK,
+ WARN,
+ FAIL
+ }
@SuppressWarnings("fallthrough")
public void testInitCore() throws Exception {
@@ -64,10 +68,13 @@ public class TestICUCollationFieldUDVAS extends
SolrTestCaseJ4 {
if (mode == Mode.OK) {
restoreLuceneMatchVersion = null;
} else {
- System.setProperty(random().nextBoolean() ? ICU_TYPE_UDVAS_PROPNAME :
ICU_FIELD_UDVAS_PROPNAME, "true");
- restoreLuceneMatchVersion =
System.setProperty(TEST_LUCENE_MATCH_VERSION_PROPNAME, useVersion.toString());
+ System.setProperty(
+ random().nextBoolean() ? ICU_TYPE_UDVAS_PROPNAME :
ICU_FIELD_UDVAS_PROPNAME, "true");
+ restoreLuceneMatchVersion =
+ System.setProperty(TEST_LUCENE_MATCH_VERSION_PROPNAME,
useVersion.toString());
}
- try (LogListener warnLog =
LogListener.warn(XmlConfigFile.class).substring(ICUCollationField.UDVAS_MESSAGE)){
+ try (LogListener warnLog =
+
LogListener.warn(XmlConfigFile.class).substring(ICUCollationField.UDVAS_MESSAGE))
{
initCore("solrconfig.xml", "schema.xml", home);
switch (mode) {
case FAIL:
@@ -85,7 +92,9 @@ public class TestICUCollationFieldUDVAS extends
SolrTestCaseJ4 {
}
} catch (SolrException ex) {
assertSame("unexpected hard failure for " + useVersion + ": " + ex,
mode, Mode.FAIL);
- assertTrue("unexpected failure message",
getRootCause(ex).getMessage().contains(ICUCollationField.UDVAS_MESSAGE));
+ assertTrue(
+ "unexpected failure message",
+
getRootCause(ex).getMessage().contains(ICUCollationField.UDVAS_MESSAGE));
} finally {
restoreSysProps(restoreLuceneMatchVersion);
}
diff --git
a/solr/modules/analysis-extras/src/test/org/apache/solr/update/processor/TestOpenNLPExtractNamedEntitiesUpdateProcessorFactory.java
b/solr/modules/analysis-extras/src/test/org/apache/solr/update/processor/TestOpenNLPExtractNamedEntitiesUpdateProcessorFactory.java
index 851fea0..78c73ab 100644
---
a/solr/modules/analysis-extras/src/test/org/apache/solr/update/processor/TestOpenNLPExtractNamedEntitiesUpdateProcessorFactory.java
+++
b/solr/modules/analysis-extras/src/test/org/apache/solr/update/processor/TestOpenNLPExtractNamedEntitiesUpdateProcessorFactory.java
@@ -19,7 +19,6 @@ package org.apache.solr.update.processor;
import java.io.File;
import java.util.Arrays;
-
import org.apache.commons.io.FileUtils;
import org.apache.solr.common.SolrInputDocument;
import org.junit.BeforeClass;
@@ -31,149 +30,213 @@ public class
TestOpenNLPExtractNamedEntitiesUpdateProcessorFactory extends Updat
public static void beforeClass() throws Exception {
File testHome = createTempDir().toFile();
FileUtils.copyDirectory(getFile("analysis-extras/solr"), testHome);
- initCore("solrconfig-opennlp-extract.xml", "schema-opennlp-extract.xml",
testHome.getAbsolutePath());
+ initCore(
+ "solrconfig-opennlp-extract.xml", "schema-opennlp-extract.xml",
testHome.getAbsolutePath());
}
@Test
public void testSimpleExtract() throws Exception {
- SolrInputDocument doc = processAdd("extract-single",
- doc(f("id", "1"),
- f("source1_s", "Take this to Mr. Flashman.")));
+ SolrInputDocument doc =
+ processAdd(
+ "extract-single", doc(f("id", "1"), f("source1_s", "Take this to
Mr. Flashman.")));
assertEquals("dest_s should have stringValue", "Flashman",
doc.getFieldValue("dest_s"));
}
@Test
public void testMultiExtract() throws Exception {
- SolrInputDocument doc = processAdd("extract-multi",
- doc(f("id", "1"),
- f("source1_s", "Hello Flashman."),
- f("source2_s", "Calling Flashman.")));
+ SolrInputDocument doc =
+ processAdd(
+ "extract-multi",
+ doc(
+ f("id", "1"),
+ f("source1_s", "Hello Flashman."),
+ f("source2_s", "Calling Flashman.")));
assertEquals(Arrays.asList("Flashman", "Flashman"),
doc.getFieldValues("dest_s"));
}
@Test
public void testArrayExtract() throws Exception {
- SolrInputDocument doc = processAdd("extract-array",
- doc(f("id", "1"),
- f("source1_s", "Currently we have Flashman. Not much else."),
- f("source2_s", "Flashman. Is. Not. There.")));
+ SolrInputDocument doc =
+ processAdd(
+ "extract-array",
+ doc(
+ f("id", "1"),
+ f("source1_s", "Currently we have Flashman. Not much else."),
+ f("source2_s", "Flashman. Is. Not. There.")));
assertEquals(Arrays.asList("Flashman", "Flashman"),
doc.getFieldValues("dest_s"));
}
@Test
public void testSelectorExtract() throws Exception {
- SolrInputDocument doc = processAdd("extract-selector",
- doc(f("id", "1"),
- f("source0_s", "Flashman. Or not."),
- f("source1_s", "Serendipitously, he was. I mean, Flashman. And
yet."),
- f("source2_s", "Correct, Flashman.")));
+ SolrInputDocument doc =
+ processAdd(
+ "extract-selector",
+ doc(
+ f("id", "1"),
+ f("source0_s", "Flashman. Or not."),
+ f("source1_s", "Serendipitously, he was. I mean, Flashman. And
yet."),
+ f("source2_s", "Correct, Flashman.")));
assertEquals(Arrays.asList("Flashman", "Flashman"),
doc.getFieldValues("dest_s"));
}
public void testMultipleExtracts() throws Exception {
// test example from the javadocs
- SolrInputDocument doc = processAdd("multiple-extract",
- doc(f("id", "1"),
- f("text", "From Flashman. To Panman."),
- f("title", "It's Captain Flashman.", "Privately, Flashman."),
- f("subtitle", "Ineluctably, Flashman."),
- f("corrolary_txt", "Forsooth thou bringeth Flashman."),
- f("notes_txt", "Yes Flashman."),
- f("summary", "Many aspire to be Flashman in London."),
- f("descs", "Courage, Flashman.", "Ain't he Flashman."),
- f("descriptions", "Flashman. Flashman. Flashman.")));
+ SolrInputDocument doc =
+ processAdd(
+ "multiple-extract",
+ doc(
+ f("id", "1"),
+ f("text", "From Flashman. To Panman."),
+ f("title", "It's Captain Flashman.", "Privately, Flashman."),
+ f("subtitle", "Ineluctably, Flashman."),
+ f("corrolary_txt", "Forsooth thou bringeth Flashman."),
+ f("notes_txt", "Yes Flashman."),
+ f("summary", "Many aspire to be Flashman in London."),
+ f("descs", "Courage, Flashman.", "Ain't he Flashman."),
+ f("descriptions", "Flashman. Flashman. Flashman.")));
assertEquals(Arrays.asList("Flashman", "Flashman"),
doc.getFieldValues("people_s"));
- assertEquals(Arrays.asList("Flashman", "Flashman", "Flashman"),
doc.getFieldValues("titular_people"));
+ assertEquals(
+ Arrays.asList("Flashman", "Flashman", "Flashman"),
doc.getFieldValues("titular_people"));
assertEquals(Arrays.asList("Flashman", "Flashman"),
doc.getFieldValues("key_desc_people"));
- assertEquals(Arrays.asList("Flashman", "Flashman", "Flashman"),
doc.getFieldValues("key_description_people"));
- assertEquals("Flashman", doc.getFieldValue("summary_person_s")); //
{EntityType} field name interpolation
- assertEquals("London", doc.getFieldValue("summary_location_s")); //
{EntityType} field name interpolation
+ assertEquals(
+ Arrays.asList("Flashman", "Flashman", "Flashman"),
+ doc.getFieldValues("key_description_people"));
+ assertEquals(
+ "Flashman", doc.getFieldValue("summary_person_s")); // {EntityType}
field name interpolation
+ assertEquals(
+ "London", doc.getFieldValue("summary_location_s")); // {EntityType}
field name interpolation
}
public void testEquivalentExtraction() throws Exception {
SolrInputDocument d;
// regardless of chain, all of these checks should be equivalent
- for (String chain : Arrays.asList("extract-single", "extract-single-regex",
- "extract-multi", "extract-multi-regex",
- "extract-array", "extract-array-regex",
- "extract-selector", "extract-selector-regex")) {
+ for (String chain :
+ Arrays.asList(
+ "extract-single",
+ "extract-single-regex",
+ "extract-multi",
+ "extract-multi-regex",
+ "extract-array",
+ "extract-array-regex",
+ "extract-selector",
+ "extract-selector-regex")) {
// simple extract
- d = processAdd(chain,
- doc(f("id", "1111"),
- f("source0_s", "Totally Flashman."), // not extracted
- f("source1_s", "One nation under Flashman.", "Good Flashman.")));
+ d =
+ processAdd(
+ chain,
+ doc(
+ f("id", "1111"),
+ f("source0_s", "Totally Flashman."), // not extracted
+ f("source1_s", "One nation under Flashman.", "Good
Flashman.")));
assertNotNull(chain, d);
assertEquals(chain, Arrays.asList("Flashman", "Flashman"),
d.getFieldValues("dest_s"));
// append to existing values
- d = processAdd(chain,
- doc(f("id", "1111"),
- field("dest_s", "orig1", "orig2"),
- f("source0_s", "Flashman. In totality."), // not extracted
- f("source1_s", "Two nations under Flashman.", "Meh Flashman.")));
+ d =
+ processAdd(
+ chain,
+ doc(
+ f("id", "1111"),
+ field("dest_s", "orig1", "orig2"),
+ f("source0_s", "Flashman. In totality."), // not extracted
+ f("source1_s", "Two nations under Flashman.", "Meh
Flashman.")));
assertNotNull(chain, d);
- assertEquals(chain, Arrays.asList("orig1", "orig2", "Flashman",
"Flashman"), d.getFieldValues("dest_s"));
+ assertEquals(
+ chain,
+ Arrays.asList("orig1", "orig2", "Flashman", "Flashman"),
+ d.getFieldValues("dest_s"));
}
// should be equivalent for any chain matching source1_s and source2_s
(but not source0_s)
- for (String chain : Arrays.asList("extract-multi", "extract-multi-regex",
- "extract-array", "extract-array-regex",
- "extract-selector", "extract-selector-regex")) {
+ for (String chain :
+ Arrays.asList(
+ "extract-multi",
+ "extract-multi-regex",
+ "extract-array",
+ "extract-array-regex",
+ "extract-selector",
+ "extract-selector-regex")) {
// simple extract
- d = processAdd(chain,
- doc(f("id", "1111"),
- f("source0_s", "Not Flashman."), // not extracted
- f("source1_s", "Could have had a Flashman.", "Bad Flashman."),
- f("source2_s", "Indubitably Flashman.")));
+ d =
+ processAdd(
+ chain,
+ doc(
+ f("id", "1111"),
+ f("source0_s", "Not Flashman."), // not extracted
+ f("source1_s", "Could have had a Flashman.", "Bad
Flashman."),
+ f("source2_s", "Indubitably Flashman.")));
assertNotNull(chain, d);
- assertEquals(chain, Arrays.asList("Flashman", "Flashman", "Flashman"),
d.getFieldValues("dest_s"));
+ assertEquals(
+ chain, Arrays.asList("Flashman", "Flashman", "Flashman"),
d.getFieldValues("dest_s"));
// append to existing values
- d = processAdd(chain,
- doc(f("id", "1111"),
- field("dest_s", "orig1", "orig2"),
- f("source0_s", "Never Flashman."), // not extracted
- f("source1_s", "Seeking Flashman.", "Evil incarnate Flashman."),
- f("source2_s", "Perfunctorily Flashman.")));
+ d =
+ processAdd(
+ chain,
+ doc(
+ f("id", "1111"),
+ field("dest_s", "orig1", "orig2"),
+ f("source0_s", "Never Flashman."), // not extracted
+ f("source1_s", "Seeking Flashman.", "Evil incarnate
Flashman."),
+ f("source2_s", "Perfunctorily Flashman.")));
assertNotNull(chain, d);
- assertEquals(chain, Arrays.asList("orig1", "orig2", "Flashman",
"Flashman", "Flashman"), d.getFieldValues("dest_s"));
+ assertEquals(
+ chain,
+ Arrays.asList("orig1", "orig2", "Flashman", "Flashman", "Flashman"),
+ d.getFieldValues("dest_s"));
}
// any chain that copies source1_s to dest_s should be equivalent for
these assertions
- for (String chain : Arrays.asList("extract-single", "extract-single-regex",
- "extract-multi", "extract-multi-regex",
- "extract-array", "extract-array-regex",
- "extract-selector", "extract-selector-regex")) {
+ for (String chain :
+ Arrays.asList(
+ "extract-single",
+ "extract-single-regex",
+ "extract-multi",
+ "extract-multi-regex",
+ "extract-array",
+ "extract-array-regex",
+ "extract-selector",
+ "extract-selector-regex")) {
// simple extract
- d = processAdd(chain,
- doc(f("id", "1111"),
- f("source1_s", "Always Flashman.", "Flashman. Noone else.")));
+ d =
+ processAdd(
+ chain,
+ doc(f("id", "1111"), f("source1_s", "Always Flashman.",
"Flashman. Noone else.")));
assertNotNull(chain, d);
assertEquals(chain, Arrays.asList("Flashman", "Flashman"),
d.getFieldValues("dest_s"));
// append to existing values
- d = processAdd(chain,
- doc(f("id", "1111"),
- field("dest_s", "orig1", "orig2"),
- f("source1_s", "Flashman. And, scene.", "Contemporary Flashman.
Yeesh.")));
+ d =
+ processAdd(
+ chain,
+ doc(
+ f("id", "1111"),
+ field("dest_s", "orig1", "orig2"),
+ f("source1_s", "Flashman. And, scene.", "Contemporary
Flashman. Yeesh.")));
assertNotNull(chain, d);
- assertEquals(chain, Arrays.asList("orig1", "orig2", "Flashman",
"Flashman"), d.getFieldValues("dest_s"));
+ assertEquals(
+ chain,
+ Arrays.asList("orig1", "orig2", "Flashman", "Flashman"),
+ d.getFieldValues("dest_s"));
}
}
public void testExtractFieldRegexReplaceAll() throws Exception {
- SolrInputDocument d = processAdd("extract-regex-replaceall",
- doc(f("id", "1111"),
- f("foo_x2_s", "Infrequently Flashman.", "In the words of
Flashman."),
- f("foo_x3_x7_s", "Flashman. Whoa.")));
+ SolrInputDocument d =
+ processAdd(
+ "extract-regex-replaceall",
+ doc(
+ f("id", "1111"),
+ f("foo_x2_s", "Infrequently Flashman.", "In the words of
Flashman."),
+ f("foo_x3_x7_s", "Flashman. Whoa.")));
assertNotNull(d);
assertEquals(Arrays.asList("Flashman", "Flashman"),
d.getFieldValues("foo_y2_s"));
@@ -181,15 +244,29 @@ public class
TestOpenNLPExtractNamedEntitiesUpdateProcessorFactory extends Updat
}
public void testExtractFieldRegexReplaceAllWithEntityType() throws Exception
{
- SolrInputDocument d =
processAdd("extract-regex-replaceall-with-entity-type",
- doc(f("id", "1111"),
- f("foo_x2_s", "Infrequently Flashman in London.", "In the words of
Flashman in London."),
- f("foo_x3_x7_s", "Flashman in London. Whoa.")));
+ SolrInputDocument d =
+ processAdd(
+ "extract-regex-replaceall-with-entity-type",
+ doc(
+ f("id", "1111"),
+ f(
+ "foo_x2_s",
+ "Infrequently Flashman in London.",
+ "In the words of Flashman in London."),
+ f("foo_x3_x7_s", "Flashman in London. Whoa.")));
assertNotNull(d);
- assertEquals(d.getFieldNames().toString(), Arrays.asList("Flashman",
"Flashman"), d.getFieldValues("foo_person_y2_s"));
- assertEquals(d.getFieldNames().toString(), Arrays.asList("London",
"London"), d.getFieldValues("foo_location_y2_s"));
- assertEquals(d.getFieldNames().toString(),"Flashman",
d.getFieldValue("foo_person_y3_person_y7_s"));
- assertEquals(d.getFieldNames().toString(),"London",
d.getFieldValue("foo_location_y3_location_y7_s"));
+ assertEquals(
+ d.getFieldNames().toString(),
+ Arrays.asList("Flashman", "Flashman"),
+ d.getFieldValues("foo_person_y2_s"));
+ assertEquals(
+ d.getFieldNames().toString(),
+ Arrays.asList("London", "London"),
+ d.getFieldValues("foo_location_y2_s"));
+ assertEquals(
+ d.getFieldNames().toString(), "Flashman",
d.getFieldValue("foo_person_y3_person_y7_s"));
+ assertEquals(
+ d.getFieldNames().toString(), "London",
d.getFieldValue("foo_location_y3_location_y7_s"));
}
}