This is an automated email from the ASF dual-hosted git repository.
krisden pushed a commit to branch branch_9_0
in repository https://gitbox.apache.org/repos/asf/solr.git
The following commit(s) were added to refs/heads/branch_9_0 by this push:
new 548bcf0 SOLR-16028: Enable spotless on langid module
548bcf0 is described below
commit 548bcf0a7590028ec36dbe245bcbcdfa48b8029d
Author: Kevin Risden <[email protected]>
AuthorDate: Sat Feb 19 10:42:45 2022 -0500
SOLR-16028: Enable spotless on langid module
---
gradle/validation/spotless.gradle | 1 -
.../solr/update/processor/DetectedLanguage.java | 10 +-
...angDetectLanguageIdentifierUpdateProcessor.java | 33 +--
...ctLanguageIdentifierUpdateProcessorFactory.java | 51 ++--
.../apache/solr/update/processor/LangIdParams.java | 60 +++--
.../LanguageIdentifierUpdateProcessor.java | 202 ++++++++++------
.../OpenNLPLangDetectUpdateProcessor.java | 37 +--
.../OpenNLPLangDetectUpdateProcessorFactory.java | 37 +--
.../update/processor/SolrInputDocumentReader.java | 64 +++--
.../TikaLanguageIdentifierUpdateProcessor.java | 33 +--
...kaLanguageIdentifierUpdateProcessorFactory.java | 40 ++--
...nguageIdentifierUpdateProcessorFactoryTest.java | 126 ++++++++--
...geIdentifierUpdateProcessorFactoryTestCase.java | 266 ++++++++++++++++-----
...penNLPLangDetectUpdateProcessorFactoryTest.java | 64 ++++-
.../processor/SolrInputDocumentReaderTest.java | 39 +--
...nguageIdentifierUpdateProcessorFactoryTest.java | 44 ++--
16 files changed, 730 insertions(+), 377 deletions(-)
diff --git a/gradle/validation/spotless.gradle b/gradle/validation/spotless.gradle
index 71b06cc..e01d4d2 100644
--- a/gradle/validation/spotless.gradle
+++ b/gradle/validation/spotless.gradle
@@ -47,7 +47,6 @@ configure(project(":solr").subprojects) { prj ->
case ":solr:modules:gcs-repository":
case ":solr:modules:hadoop-auth":
case ":solr:modules:hdfs":
- case ":solr:modules:langid":
case ":solr:modules:scripting":
case ":solr:core":
case ":solr:solrj":
diff --git a/solr/modules/langid/src/java/org/apache/solr/update/processor/DetectedLanguage.java b/solr/modules/langid/src/java/org/apache/solr/update/processor/DetectedLanguage.java
index e8e6fbe..07d4e75 100644
--- a/solr/modules/langid/src/java/org/apache/solr/update/processor/DetectedLanguage.java
+++ b/solr/modules/langid/src/java/org/apache/solr/update/processor/DetectedLanguage.java
@@ -16,20 +16,19 @@
*/
package org.apache.solr.update.processor;
-/**
- * Bean holding a language and a detection certainty
- */
+/** Bean holding a language and a detection certainty */
public class DetectedLanguage {
private final String langCode;
private final Double certainty;
-
+
DetectedLanguage(String lang, Double certainty) {
this.langCode = lang;
this.certainty = certainty;
}
-
+
/**
* Returns the detected language code
+ *
* @return language code as a string
*/
public String getLangCode() {
@@ -38,6 +37,7 @@ public class DetectedLanguage {
/**
* Returns the detected certainty for this language
+ *
* @return certainty as a value between 0.0 and 1.0 where 1.0 is 100% certain
*/
public Double getCertainty() {
diff --git a/solr/modules/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java b/solr/modules/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java
index 3206656..608627e 100644
--- a/solr/modules/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java
+++ b/solr/modules/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessor.java
@@ -16,42 +16,42 @@
*/
package org.apache.solr.update.processor;
+import com.cybozu.labs.langdetect.Detector;
+import com.cybozu.labs.langdetect.DetectorFactory;
+import com.cybozu.labs.langdetect.LangDetectException;
+import com.cybozu.labs.langdetect.Language;
import java.io.IOException;
import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
-
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
-
-import com.cybozu.labs.langdetect.Detector;
-import com.cybozu.labs.langdetect.DetectorFactory;
-import com.cybozu.labs.langdetect.LangDetectException;
-import com.cybozu.labs.langdetect.Language;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
- * Identifies the language of a set of input fields using
https://github.com/shuyo/language-detection
- * <p>
- * See <a
href="https://solr.apache.org/guide/language-detection.html">Detecting
Languages During
+ * Identifies the language of a set of input fields using
+ * https://github.com/shuyo/language-detection
+ *
+ * <p>See <a
href="https://solr.apache.org/guide/language-detection.html">Detecting
Languages During
* Indexing</a> in the Solr Ref Guide
+ *
* @since 3.5
*/
public class LangDetectLanguageIdentifierUpdateProcessor extends
LanguageIdentifierUpdateProcessor {
private static final Logger log =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
- public LangDetectLanguageIdentifierUpdateProcessor(SolrQueryRequest req,
- SolrQueryResponse rsp, UpdateRequestProcessor next) {
+ public LangDetectLanguageIdentifierUpdateProcessor(
+ SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor
next) {
super(req, rsp, next);
}
/**
- * Detects language(s) from a reader, typically based on some fields in
SolrInputDocument
- * Classes wishing to implement their own language detection module should
override this method.
+ * Detects language(s) from a reader, typically based on some fields in
SolrInputDocument Classes
+ * wishing to implement their own language detection module should override
this method.
*
* @param solrDocReader A reader serving the text from the document to detect
* @return List of detected language(s) according to RFC-3066
@@ -62,16 +62,17 @@ public class LangDetectLanguageIdentifierUpdateProcessor
extends LanguageIdentif
Detector detector = DetectorFactory.create();
detector.setMaxTextLength(maxTotalChars);
- // TODO Work around bug in LangDetect 1.1 which does not expect a -1
return value at end of stream,
+ // TODO Work around bug in LangDetect 1.1 which does not expect a -1
return value at end of
+ // stream,
// but instead only looks at ready()
if (solrDocReader instanceof SolrInputDocumentReader) {
- ((SolrInputDocumentReader)solrDocReader).setEodReturnValue(0);
+ ((SolrInputDocumentReader) solrDocReader).setEodReturnValue(0);
}
detector.append(solrDocReader);
ArrayList<Language> langlist = detector.getProbabilities();
ArrayList<DetectedLanguage> solrLangList = new ArrayList<>();
- for (Language l: langlist) {
+ for (Language l : langlist) {
solrLangList.add(new DetectedLanguage(l.lang, l.prob));
}
return solrLangList;
diff --git a/solr/modules/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessorFactory.java b/solr/modules/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessorFactory.java
index 7a0db3a..99e04f0 100644
--- a/solr/modules/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessorFactory.java
+++ b/solr/modules/langid/src/java/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessorFactory.java
@@ -16,6 +16,8 @@
*/
package org.apache.solr.update.processor;
+import com.cybozu.labs.langdetect.DetectorFactory;
+import com.cybozu.labs.langdetect.LangDetectException;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
@@ -23,7 +25,6 @@ import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
-
import org.apache.commons.io.IOUtils;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
@@ -33,16 +34,13 @@ import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.util.SolrPluginUtils;
import org.apache.solr.util.plugin.SolrCoreAware;
-import com.cybozu.labs.langdetect.DetectorFactory;
-import com.cybozu.labs.langdetect.LangDetectException;
-
/**
* Identifies the language of a set of input fields using
* http://code.google.com/p/language-detection
- * <p>
- * The UpdateProcessorChain config entry can take a number of parameters
- * which may also be passed as HTTP parameters on the update request
- * and override the defaults. Here is the simplest processor config possible:
+ *
+ * <p>The UpdateProcessorChain config entry can take a number of parameters
which may also be passed
+ * as HTTP parameters on the update request and override the defaults. Here is
the simplest
+ * processor config possible:
*
* <pre class="prettyprint" >
* <processor
class="org.apache.solr.update.processor.LangDetectLanguageIdentifierUpdateProcessorFactory">
@@ -50,32 +48,35 @@ import com.cybozu.labs.langdetect.LangDetectException;
* <str name="langid.langField">language_s</str>
* </processor>
* </pre>
- * See <a
href="https://solr.apache.org/guide/language-detection.html">https://solr.apache.org/guide/language-detection.html</a>
+ *
+ * See <a
+ *
href="https://solr.apache.org/guide/language-detection.html">https://solr.apache.org/guide/language-detection.html</a>
+ *
* @since 3.5
*/
-public class LangDetectLanguageIdentifierUpdateProcessorFactory extends
- UpdateRequestProcessorFactory implements SolrCoreAware, LangIdParams {
+public class LangDetectLanguageIdentifierUpdateProcessorFactory
+ extends UpdateRequestProcessorFactory implements SolrCoreAware,
LangIdParams {
protected SolrParams defaults;
protected SolrParams appends;
protected SolrParams invariants;
@Override
- public void inform(SolrCore core) {
- }
+ public void inform(SolrCore core) {}
/**
- * The UpdateRequestProcessor may be initialized in solrconfig.xml similarly
- * to a RequestHandler, with defaults, appends and invariants.
+ * The UpdateRequestProcessor may be initialized in solrconfig.xml similarly
to a RequestHandler,
+ * with defaults, appends and invariants.
+ *
* @param args a NamedList with the configuration parameters
*/
@Override
- public void init(NamedList<?> args )
- {
+ public void init(NamedList<?> args) {
try {
loadData();
} catch (Exception e) {
- throw new RuntimeException("Couldn't load profile data, will return
empty languages always!", e);
+ throw new RuntimeException(
+ "Couldn't load profile data, will return empty languages always!",
e);
}
if (args != null) {
Object o;
@@ -97,16 +98,15 @@ public class
LangDetectLanguageIdentifierUpdateProcessorFactory extends
}
@Override
- public UpdateRequestProcessor getInstance(SolrQueryRequest req,
- SolrQueryResponse rsp,
UpdateRequestProcessor next) {
+ public UpdateRequestProcessor getInstance(
+ SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor
next) {
// Process defaults, appends and invariants if we got a request
- if(req != null) {
+ if (req != null) {
SolrPluginUtils.setDefaults(req, defaults, appends, invariants);
}
return new LangDetectLanguageIdentifierUpdateProcessor(req, rsp, next);
}
-
// DetectorFactory is totally global, so we only want to do this once...
ever!!!
static boolean loaded;
@@ -125,8 +125,11 @@ public class
LangDetectLanguageIdentifierUpdateProcessorFactory extends
loaded = true;
List<String> profileData = new ArrayList<>();
for (String language : languages) {
- InputStream stream =
LangDetectLanguageIdentifierUpdateProcessor.class.getResourceAsStream("langdetect-profiles/"
+ language);
- BufferedReader reader = new BufferedReader(new InputStreamReader(stream,
StandardCharsets.UTF_8));
+ InputStream stream =
+
LangDetectLanguageIdentifierUpdateProcessor.class.getResourceAsStream(
+ "langdetect-profiles/" + language);
+ BufferedReader reader =
+ new BufferedReader(new InputStreamReader(stream,
StandardCharsets.UTF_8));
profileData.add(new String(IOUtils.toCharArray(reader)));
reader.close();
}
diff --git a/solr/modules/langid/src/java/org/apache/solr/update/processor/LangIdParams.java b/solr/modules/langid/src/java/org/apache/solr/update/processor/LangIdParams.java
index 4dc04ee..3eb55b0 100644
--- a/solr/modules/langid/src/java/org/apache/solr/update/processor/LangIdParams.java
+++ b/solr/modules/langid/src/java/org/apache/solr/update/processor/LangIdParams.java
@@ -19,31 +19,43 @@ package org.apache.solr.update.processor;
public interface LangIdParams {
String LANGUAGE_ID = "langid";
- String DOCID_PARAM = LANGUAGE_ID + ".idField";
+ String DOCID_PARAM = LANGUAGE_ID + ".idField";
+
+ String FIELDS_PARAM = LANGUAGE_ID + ".fl"; // Field list to detect from
+ String LANG_FIELD = LANGUAGE_ID + ".langField"; // Main language detected
+ String LANGS_FIELD = LANGUAGE_ID + ".langsField"; // All languages detected
(multiValued)
+ String FALLBACK = LANGUAGE_ID + ".fallback"; // Fallback lang code
+ String FALLBACK_FIELDS = LANGUAGE_ID + ".fallbackFields"; // Comma-sep list
of fallback fields
+ String OVERWRITE =
+ LANGUAGE_ID + ".overwrite"; // Overwrite if existing language value in
LANG_FIELD
+ String THRESHOLD = LANGUAGE_ID + ".threshold"; // Detection threshold
+ String ENFORCE_SCHEMA =
+ LANGUAGE_ID + ".enforceSchema"; // Enforces that output fields exist in
schema
- String FIELDS_PARAM = LANGUAGE_ID + ".fl"; // Field list to
detect from
- String LANG_FIELD = LANGUAGE_ID + ".langField"; // Main language
detected
- String LANGS_FIELD = LANGUAGE_ID + ".langsField"; // All languages
detected (multiValued)
- String FALLBACK = LANGUAGE_ID + ".fallback"; // Fallback lang
code
- String FALLBACK_FIELDS = LANGUAGE_ID + ".fallbackFields"; // Comma-sep list
of fallback fields
- String OVERWRITE = LANGUAGE_ID + ".overwrite"; // Overwrite if
existing language value in LANG_FIELD
- String THRESHOLD = LANGUAGE_ID + ".threshold"; // Detection
threshold
- String ENFORCE_SCHEMA = LANGUAGE_ID + ".enforceSchema"; // Enforces that
output fields exist in schema
@Deprecated(since = "9.0.0")
- String LANG_WHITELIST = LANGUAGE_ID + ".whitelist"; // Old property
name for allowed languages
- String LANG_ALLOWLIST = LANGUAGE_ID + ".allowlist"; // Allowed
languages
- String LCMAP = LANGUAGE_ID + ".lcmap"; // Maps detected
langcode to other value
- String MAP_ENABLE = LANGUAGE_ID + ".map"; // Turns on or
off the field mapping
- String MAP_FL = LANGUAGE_ID + ".map.fl"; // Field list for
mapping
- String MAP_OVERWRITE = LANGUAGE_ID + ".map.overwrite"; // Whether to
overwrite existing fields
- String MAP_KEEP_ORIG = LANGUAGE_ID + ".map.keepOrig"; // Keep original
field after mapping
- String MAP_INDIVIDUAL = LANGUAGE_ID + ".map.individual"; // Detect
language per individual field
- String MAP_INDIVIDUAL_FL = LANGUAGE_ID + ".map.individual.fl";// Field list
of fields to redetect language for
- String MAP_LCMAP = LANGUAGE_ID + ".map.lcmap"; // Enables
mapping multiple langs to same output field
- String MAP_PATTERN = LANGUAGE_ID + ".map.pattern"; // RegEx pattern
to match field name
- String MAP_REPLACE = LANGUAGE_ID + ".map.replace"; // Replace pattern
- String MAX_FIELD_VALUE_CHARS = LANGUAGE_ID + ".maxFieldValueChars"; //
Maximum number of characters to use per field for language detection
- String MAX_TOTAL_CHARS = LANGUAGE_ID + ".maxTotalChars"; // Maximum number
of characters to use per all concatenated fields for language detection
+ String LANG_WHITELIST = LANGUAGE_ID + ".whitelist"; // Old property name for
allowed languages
+
+ String LANG_ALLOWLIST = LANGUAGE_ID + ".allowlist"; // Allowed languages
+ String LCMAP = LANGUAGE_ID + ".lcmap"; // Maps detected langcode to other
value
+ String MAP_ENABLE = LANGUAGE_ID + ".map"; // Turns on or off the field
mapping
+ String MAP_FL = LANGUAGE_ID + ".map.fl"; // Field list for mapping
+ String MAP_OVERWRITE = LANGUAGE_ID + ".map.overwrite"; // Whether to
overwrite existing fields
+ String MAP_KEEP_ORIG = LANGUAGE_ID + ".map.keepOrig"; // Keep original field
after mapping
+ String MAP_INDIVIDUAL = LANGUAGE_ID + ".map.individual"; // Detect language
per individual field
+ String MAP_INDIVIDUAL_FL =
+ LANGUAGE_ID + ".map.individual.fl"; // Field list of fields to redetect
language for
+ String MAP_LCMAP =
+ LANGUAGE_ID + ".map.lcmap"; // Enables mapping multiple langs to same
output field
+ String MAP_PATTERN = LANGUAGE_ID + ".map.pattern"; // RegEx pattern to match
field name
+ String MAP_REPLACE = LANGUAGE_ID + ".map.replace"; // Replace pattern
+ String MAX_FIELD_VALUE_CHARS =
+ LANGUAGE_ID
+ + ".maxFieldValueChars"; // Maximum number of characters to use per
field for language
+ // detection
+ String MAX_TOTAL_CHARS =
+ LANGUAGE_ID
+ + ".maxTotalChars"; // Maximum number of characters to use per all
concatenated fields for
+ // language detection
String DOCID_FIELD_DEFAULT = "id";
String DOCID_LANGFIELD_DEFAULT = null;
@@ -53,7 +65,7 @@ public interface LangIdParams {
int MAX_FIELD_VALUE_CHARS_DEFAULT = 10000;
int MAX_TOTAL_CHARS_DEFAULT = 20000;
- // TODO: This default threshold accepts even "uncertain" detections.
+ // TODO: This default threshold accepts even "uncertain" detections.
// Increase &langid.threshold above 0.5 to return only certain detections
Double DOCID_THRESHOLD_DEFAULT = 0.5;
}
diff --git a/solr/modules/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java b/solr/modules/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
index ff630f6..ad744e8 100644
--- a/solr/modules/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
+++ b/solr/modules/langid/src/java/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessor.java
@@ -25,7 +25,6 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.regex.Pattern;
-
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.SolrInputDocument;
@@ -39,17 +38,16 @@ import org.apache.solr.update.AddUpdateCommand;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-
/**
- * <p>
- * Identifies the language of a set of input fields.
- * Also supports mapping of field names based on detected language.
- * </p>
- * See <a
href="https://solr.apache.org/guide/language-detection.html">Detecting
Languages During Indexing</a> in reference guide
+ * Identifies the language of a set of input fields. Also supports mapping of
field names based on
+ * detected language. See <a
href="https://solr.apache.org/guide/language-detection.html">Detecting
+ * Languages During Indexing</a> in reference guide
+ *
* @since 3.5
* @lucene.experimental
*/
-public abstract class LanguageIdentifierUpdateProcessor extends
UpdateRequestProcessor implements LangIdParams {
+public abstract class LanguageIdentifierUpdateProcessor extends
UpdateRequestProcessor
+ implements LangIdParams {
private static final Logger log =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
@@ -74,8 +72,8 @@ public abstract class LanguageIdentifierUpdateProcessor
extends UpdateRequestPro
protected HashSet<String> langAllowlist;
protected HashSet<String> mapIndividualFieldsSet;
protected HashSet<String> allMapFieldsSet;
- protected HashMap<String,String> lcMap;
- protected HashMap<String,String> mapLcMap;
+ protected HashMap<String, String> lcMap;
+ protected HashMap<String, String> mapLcMap;
protected IndexSchema schema;
protected int maxFieldValueChars;
protected int maxTotalChars;
@@ -84,8 +82,8 @@ public abstract class LanguageIdentifierUpdateProcessor
extends UpdateRequestPro
protected final Pattern tikaSimilarityPattern =
Pattern.compile(".*\\((.*?)\\)");
protected final Pattern langPattern = Pattern.compile("\\{lang\\}");
- public LanguageIdentifierUpdateProcessor(SolrQueryRequest req,
- SolrQueryResponse rsp,
UpdateRequestProcessor next) {
+ public LanguageIdentifierUpdateProcessor(
+ SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor
next) {
super(next);
schema = req.getSchema();
@@ -96,34 +94,40 @@ public abstract class LanguageIdentifierUpdateProcessor
extends UpdateRequestPro
if (params != null) {
// Document-centric langId params
setEnabled(params.getBool(LANGUAGE_ID, true));
- if(params.get(FIELDS_PARAM, "").length() > 0) {
+ if (params.get(FIELDS_PARAM, "").length() > 0) {
inputFields = params.get(FIELDS_PARAM, "").split(",");
}
langField = params.get(LANG_FIELD, DOCID_LANGFIELD_DEFAULT);
langsField = params.get(LANGS_FIELD, DOCID_LANGSFIELD_DEFAULT);
SchemaField uniqueKeyField = schema.getUniqueKeyField();
- docIdField = params.get(DOCID_PARAM, uniqueKeyField == null ?
DOCID_FIELD_DEFAULT : uniqueKeyField.getName());
+ docIdField =
+ params.get(
+ DOCID_PARAM, uniqueKeyField == null ? DOCID_FIELD_DEFAULT :
uniqueKeyField.getName());
fallbackValue = params.get(FALLBACK);
- if(params.get(FALLBACK_FIELDS, "").length() > 0) {
+ if (params.get(FALLBACK_FIELDS, "").length() > 0) {
fallbackFields = params.get(FALLBACK_FIELDS).split(",");
}
overwrite = params.getBool(OVERWRITE, false);
langAllowlist = new HashSet<>();
threshold = params.getDouble(THRESHOLD, DOCID_THRESHOLD_DEFAULT);
String legacyAllowList = params.get(LANG_WHITELIST, "");
- if(legacyAllowList.length() > 0) {
+ if (legacyAllowList.length() > 0) {
// nowarn compile time string concatenation
- log.warn(LANG_WHITELIST + " parameter is deprecated; use " +
LANG_ALLOWLIST + " instead."); // nowarn
+ log.warn(
+ LANG_WHITELIST
+ + " parameter is deprecated; use "
+ + LANG_ALLOWLIST
+ + " instead."); // nowarn
}
- if(params.get(LANG_ALLOWLIST, legacyAllowList).length() > 0) {
- for(String lang : params.get(LANG_ALLOWLIST, "").split(",")) {
+ if (params.get(LANG_ALLOWLIST, legacyAllowList).length() > 0) {
+ for (String lang : params.get(LANG_ALLOWLIST, "").split(",")) {
langAllowlist.add(lang);
}
}
// Mapping params (field centric)
enableMapping = params.getBool(MAP_ENABLE, false);
- if(params.get(MAP_FL, "").length() > 0) {
+ if (params.get(MAP_FL, "").length() > 0) {
mapFields = params.get(MAP_FL, "").split(",");
} else {
mapFields = inputFields;
@@ -134,7 +138,7 @@ public abstract class LanguageIdentifierUpdateProcessor
extends UpdateRequestPro
// Process individual fields
String[] mapIndividualFields = {};
- if(params.get(MAP_INDIVIDUAL_FL, "").length() > 0) {
+ if (params.get(MAP_INDIVIDUAL_FL, "").length() > 0) {
mapIndividualFields = params.get(MAP_INDIVIDUAL_FL, "").split(",");
} else {
mapIndividualFields = mapFields;
@@ -142,16 +146,16 @@ public abstract class LanguageIdentifierUpdateProcessor
extends UpdateRequestPro
mapIndividualFieldsSet = new
HashSet<>(Arrays.asList(mapIndividualFields));
// Compile a union of the lists of fields to map
allMapFieldsSet = new HashSet<>(Arrays.asList(mapFields));
- if(Arrays.equals(mapFields, mapIndividualFields)) {
+ if (Arrays.equals(mapFields, mapIndividualFields)) {
allMapFieldsSet.addAll(mapIndividualFieldsSet);
}
// Normalize detected langcode onto normalized langcode
lcMap = new HashMap<>();
- if(params.get(LCMAP) != null) {
- for(String mapping : params.get(LCMAP).split("[, ]")) {
+ if (params.get(LCMAP) != null) {
+ for (String mapping : params.get(LCMAP).split("[, ]")) {
String[] keyVal = mapping.split(":");
- if(keyVal.length == 2) {
+ if (keyVal.length == 2) {
lcMap.put(keyVal[0], keyVal[1]);
} else {
log.error("Unsupported format for langid.lcmap: {}. Skipping this
mapping.", mapping);
@@ -161,13 +165,14 @@ public abstract class LanguageIdentifierUpdateProcessor
extends UpdateRequestPro
// Language Code mapping
mapLcMap = new HashMap<>();
- if(params.get(MAP_LCMAP) != null) {
- for(String mapping : params.get(MAP_LCMAP).split("[, ]")) {
+ if (params.get(MAP_LCMAP) != null) {
+ for (String mapping : params.get(MAP_LCMAP).split("[, ]")) {
String[] keyVal = mapping.split(":");
- if(keyVal.length == 2) {
+ if (keyVal.length == 2) {
mapLcMap.put(keyVal[0], keyVal[1]);
} else {
- log.error("Unsupported format for langid.map.lcmap: {}. Skipping
this mapping.", mapping);
+ log.error(
+ "Unsupported format for langid.map.lcmap: {}. Skipping this
mapping.", mapping);
}
}
}
@@ -180,27 +185,36 @@ public abstract class LanguageIdentifierUpdateProcessor
extends UpdateRequestPro
if (maxFieldValueChars > maxTotalChars) {
if (maxTotalChars == MAX_TOTAL_CHARS_DEFAULT) {
// If the user specified only maxFieldValueChars, make maxTotalChars
the same as it
- log.warn("{} ({}) is less than {} ({}). Setting {} to {}."
- , MAX_FIELD_VALUE_CHARS, maxFieldValueChars, MAX_TOTAL_CHARS
- , maxTotalChars, MAX_TOTAL_CHARS, maxFieldValueChars);
+ log.warn(
+ "{} ({}) is less than {} ({}). Setting {} to {}.",
+ MAX_FIELD_VALUE_CHARS,
+ maxFieldValueChars,
+ MAX_TOTAL_CHARS,
+ maxTotalChars,
+ MAX_TOTAL_CHARS,
+ maxFieldValueChars);
maxTotalChars = maxFieldValueChars;
} else {
// If the user specified maxTotalChars, make maxFieldValueChars the
same as it
- log.warn("{} ({}) is less than {} ({}). Setting {} to {}."
- , MAX_FIELD_VALUE_CHARS, maxFieldValueChars, MAX_TOTAL_CHARS
- , maxTotalChars, MAX_FIELD_VALUE_CHARS, maxTotalChars );
+ log.warn(
+ "{} ({}) is less than {} ({}). Setting {} to {}.",
+ MAX_FIELD_VALUE_CHARS,
+ maxFieldValueChars,
+ MAX_TOTAL_CHARS,
+ maxTotalChars,
+ MAX_FIELD_VALUE_CHARS,
+ maxTotalChars);
maxFieldValueChars = maxTotalChars;
}
}
}
log.debug("LangId configured");
-
if (inputFields.length == 0) {
- throw new SolrException(ErrorCode.BAD_REQUEST,
- "Missing or faulty configuration of
LanguageIdentifierUpdateProcessor. Input fields must be specified as a comma
separated list");
+ throw new SolrException(
+ ErrorCode.BAD_REQUEST,
+ "Missing or faulty configuration of
LanguageIdentifierUpdateProcessor. Input fields must be specified as a comma
separated list");
}
-
}
@Override
@@ -215,6 +229,7 @@ public abstract class LanguageIdentifierUpdateProcessor
extends UpdateRequestPro
/**
* This is the main process method called from processAdd()
+ *
* @param doc the SolrInputDocument to modify
*/
protected void process(SolrInputDocument doc) {
@@ -222,20 +237,25 @@ public abstract class LanguageIdentifierUpdateProcessor
extends UpdateRequestPro
HashSet<String> docLangs = new HashSet<>();
String fallbackLang = getFallbackLang(doc, fallbackFields, fallbackValue);
- if(langField == null || !doc.containsKey(langField) ||
(doc.containsKey(langField) && overwrite)) {
+ if (langField == null
+ || !doc.containsKey(langField)
+ || (doc.containsKey(langField) && overwrite)) {
List<DetectedLanguage> languagelist = detectLanguage(doc);
docLang = resolveLanguage(languagelist, fallbackLang);
docLangs.add(docLang);
if (log.isDebugEnabled()) {
- log.debug("Detected main document language from fields {}: {}",
Arrays.toString(inputFields), docLang);
+ log.debug(
+ "Detected main document language from fields {}: {}",
+ Arrays.toString(inputFields),
+ docLang);
}
- if(doc.containsKey(langField) && overwrite) {
+ if (doc.containsKey(langField) && overwrite) {
if (log.isDebugEnabled()) {
log.debug("Overwritten old value {}", doc.getFieldValue(langField));
}
}
- if(langField != null && langField.length() != 0) {
+ if (langField != null && langField.length() != 0) {
doc.setField(langField, docLang);
}
} else {
@@ -245,15 +265,17 @@ public abstract class LanguageIdentifierUpdateProcessor
extends UpdateRequestPro
log.debug("Field {} already contained value {}, not overwriting.",
langField, docLang);
}
- if(enableMapping) {
+ if (enableMapping) {
for (String fieldName : allMapFieldsSet) {
- if(doc.containsKey(fieldName)) {
+ if (doc.containsKey(fieldName)) {
String fieldLang;
- if(mapIndividual && mapIndividualFieldsSet.contains(fieldName)) {
- List<DetectedLanguage> languagelist =
detectLanguage(solrDocReader(doc, new String[]{fieldName}));
+ if (mapIndividual && mapIndividualFieldsSet.contains(fieldName)) {
+ List<DetectedLanguage> languagelist =
+ detectLanguage(solrDocReader(doc, new String[] {fieldName}));
fieldLang = resolveLanguage(languagelist, docLang);
docLangs.add(fieldLang);
- log.debug("Mapping field {} using individually detected language
{}", fieldName, fieldLang);
+ log.debug(
+ "Mapping field {} using individually detected language {}",
fieldName, fieldLang);
} else {
fieldLang = docLang;
log.debug("Mapping field {} using document global language {}",
fieldName, fieldLang);
@@ -266,40 +288,46 @@ public abstract class LanguageIdentifierUpdateProcessor
extends UpdateRequestPro
}
SolrInputField inField = doc.getField(fieldName);
doc.setField(mappedOutputField, inField.getValue());
- if(!mapKeepOrig) {
+ if (!mapKeepOrig) {
log.debug("Removing old field {}", fieldName);
doc.removeField(fieldName);
}
} else {
- throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"Invalid output field mapping for "
- + fieldName + " field and language: " + fieldLang);
+ throw new SolrException(
+ SolrException.ErrorCode.BAD_REQUEST,
+ "Invalid output field mapping for "
+ + fieldName
+ + " field and language: "
+ + fieldLang);
}
}
}
}
// Set the languages field to an array of all detected languages
- if(langsField != null && langsField.length() != 0) {
+ if (langsField != null && langsField.length() != 0) {
doc.setField(langsField, docLangs.toArray());
}
}
/**
* Decides the fallback language, either from content of fallback field or
fallback value
+ *
* @param doc the Solr document
* @param fallbackFields an array of strings with field names containing
fallback language codes
* @param fallbackValue a language code to use in case no fallbackFields are
found
*/
- private String getFallbackLang(SolrInputDocument doc, String[]
fallbackFields, String fallbackValue) {
+ private String getFallbackLang(
+ SolrInputDocument doc, String[] fallbackFields, String fallbackValue) {
String lang = null;
- for(String field : fallbackFields) {
- if(doc.containsKey(field)) {
+ for (String field : fallbackFields) {
+ if (doc.containsKey(field)) {
lang = (String) doc.getFieldValue(field);
log.debug("Language fallback to field {}", field);
break;
}
}
- if(lang == null) {
+ if (lang == null) {
log.debug("Language fallback to value {}", fallbackValue);
lang = fallbackValue;
}
@@ -308,6 +336,7 @@ public abstract class LanguageIdentifierUpdateProcessor
extends UpdateRequestPro
/**
* Detects language(s) from all configured fields
+ *
* @param doc The solr document
* @return List of detected language(s) according to RFC-3066
*/
@@ -316,8 +345,9 @@ public abstract class LanguageIdentifierUpdateProcessor
extends UpdateRequestPro
}
/**
- * Detects language(s) from a reader, typically based on some fields in
SolrInputDocument
- * Classes wishing to implement their own language detection module should
override this method.
+ * Detects language(s) from a reader, typically based on some fields in
SolrInputDocument Classes
+ * wishing to implement their own language detection module should override
this method.
+ *
* @param solrDocReader A reader serving the text from the document to detect
* @return List of detected language(s) according to RFC-3066
*/
@@ -325,6 +355,7 @@ public abstract class LanguageIdentifierUpdateProcessor
extends UpdateRequestPro
/**
* Chooses a language based on the list of candidates detected
+ *
* @param language language code as a string
* @param fallbackLang the language code to use as a fallback
* @return a string of the chosen language
@@ -337,37 +368,42 @@ public abstract class LanguageIdentifierUpdateProcessor
extends UpdateRequestPro
/**
* Chooses a language based on the list of candidates detected
+ *
* @param languages a List of DetectedLanguages with certainty score
* @param fallbackLang the language code to use as a fallback
* @return a string of the chosen language
*/
protected String resolveLanguage(List<DetectedLanguage> languages, String
fallbackLang) {
String langStr;
- if(languages.size() == 0) {
+ if (languages.size() == 0) {
log.debug("No language detected, using fallback {}", fallbackLang);
langStr = fallbackLang;
} else {
DetectedLanguage lang = languages.get(0);
String normalizedLang = normalizeLangCode(lang.getLangCode());
- if(langAllowlist.isEmpty() || langAllowlist.contains(normalizedLang)) {
+ if (langAllowlist.isEmpty() || langAllowlist.contains(normalizedLang)) {
if (log.isDebugEnabled()) {
log.debug("Language detected {} with certainty {}", normalizedLang,
lang.getCertainty());
}
- if(lang.getCertainty() >= threshold) {
+ if (lang.getCertainty() >= threshold) {
langStr = normalizedLang;
} else {
- log.debug("Detected language below threshold {}, using fallback {}",
threshold, fallbackLang);
+ log.debug(
+ "Detected language below threshold {}, using fallback {}",
threshold, fallbackLang);
langStr = fallbackLang;
}
} else {
if (log.isDebugEnabled()) {
- log.debug("Detected a language not in allowlist ({}), using fallback
{}", lang.getLangCode(), fallbackLang);
+ log.debug(
+ "Detected a language not in allowlist ({}), using fallback {}",
+ lang.getLangCode(),
+ fallbackLang);
}
langStr = fallbackLang;
}
}
- if(langStr == null || langStr.length() == 0) {
+ if (langStr == null || langStr.length() == 0) {
log.warn("Language resolved to null or empty string. Fallback not
configured?");
langStr = "";
}
@@ -377,6 +413,7 @@ public abstract class LanguageIdentifierUpdateProcessor
extends UpdateRequestPro
/**
* Looks up language code in map (langid.lcmap) and returns mapped value
+ *
* @param langCode the language code string returned from detector
* @return the normalized/mapped language code
*/
@@ -390,10 +427,10 @@ public abstract class LanguageIdentifierUpdateProcessor
extends UpdateRequestPro
}
/**
- * Returns the name of the field to map the current contents into, so that
they are properly analyzed. For instance
- * if the currentField is "text" and the code is "en", the new field would
by default be "text_en".
- * This method also performs custom regex pattern replace if configured. If
enforceSchema=true
- * and the resulting field name doesn't exist, then null is returned.
+ * Returns the name of the field to map the current contents into, so that
they are properly
+ * analyzed. For instance if the currentField is "text" and the code is
"en", the new field would
+ * by default be "text_en". This method also performs custom regex pattern
replace if configured.
+ * If enforceSchema=true and the resulting field name doesn't exist, then
null is returned.
*
* @param currentField The current field name
* @param language the language code
@@ -401,18 +438,29 @@ public abstract class LanguageIdentifierUpdateProcessor
extends UpdateRequestPro
*/
protected String getMappedField(String currentField, String language) {
String lc = mapLcMap.containsKey(language) ? mapLcMap.get(language) :
language;
- String newFieldName =
langPattern.matcher(mapPattern.matcher(currentField).replaceFirst(mapReplaceStr)).replaceFirst(lc);
- if(enforceSchema && schema.getFieldOrNull(newFieldName) == null) {
- log.warn("Unsuccessful field name mapping from {} to {}, field does not
exist and enforceSchema=true; skipping mapping.", currentField, newFieldName);
+ String newFieldName =
+ langPattern
+
.matcher(mapPattern.matcher(currentField).replaceFirst(mapReplaceStr))
+ .replaceFirst(lc);
+ if (enforceSchema && schema.getFieldOrNull(newFieldName) == null) {
+ log.warn(
+ "Unsuccessful field name mapping from {} to {}, field does not exist
and enforceSchema=true; skipping mapping.",
+ currentField,
+ newFieldName);
return null;
} else {
- log.debug("Doing mapping from {} with language {} to field {}",
currentField, language, newFieldName);
+ log.debug(
+ "Doing mapping from {} with language {} to field {}",
+ currentField,
+ language,
+ newFieldName);
}
return newFieldName;
}
/**
* Tells if this processor is enabled or not
+ *
* @return true if enabled, else false
*/
public boolean isEnabled() {
@@ -424,8 +472,9 @@ public abstract class LanguageIdentifierUpdateProcessor
extends UpdateRequestPro
}
/**
- * Returns a reader that streams String content from fields.
- * This is more memory efficient than building a full string buffer
+ * Returns a reader that streams String content from fields. This is more
memory efficient than
+ * building a full string buffer
+ *
* @param doc the solr document
* @param fields the field names to read
* @return a reader over the fields
@@ -434,10 +483,7 @@ public abstract class LanguageIdentifierUpdateProcessor
extends UpdateRequestPro
return new SolrInputDocumentReader(doc, fields, maxTotalChars,
maxFieldValueChars, " ");
}
- /**
- * Concatenates content from input fields defined in langid.fl.
- * For test purposes only
- */
+ /** Concatenates content from input fields defined in langid.fl. For test
purposes only */
protected String concatFields(SolrInputDocument doc) {
return SolrInputDocumentReader.asString(solrDocReader(doc, inputFields));
}
diff --git
a/solr/modules/langid/src/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessor.java
b/solr/modules/langid/src/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessor.java
index ab17133..8f6b611 100644
---
a/solr/modules/langid/src/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessor.java
+++
b/solr/modules/langid/src/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessor.java
@@ -23,21 +23,20 @@ import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
-
+import opennlp.tools.langdetect.Language;
+import opennlp.tools.langdetect.LanguageDetectorME;
+import opennlp.tools.langdetect.LanguageDetectorModel;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import opennlp.tools.langdetect.Language;
-import opennlp.tools.langdetect.LanguageDetectorME;
-import opennlp.tools.langdetect.LanguageDetectorModel;
-
/**
- * Identifies the language of a set of input fields using <a
href="https://opennlp.apache.org/">Apache OpenNLP</a>.
- * <p>
- * See "Language Detector" section of
- * <a
href="https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html">https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html</a>
+ * Identifies the language of a set of input fields using <a
+ * href="https://opennlp.apache.org/">Apache OpenNLP</a>.
+ *
+ * <p>See "Language Detector" section of <a
+ *
href="https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html">https://opennlp.apache.org/docs/1.8.3/manual/opennlp.html</a>
*/
public class OpenNLPLangDetectUpdateProcessor extends
LanguageIdentifierUpdateProcessor {
@@ -45,10 +44,13 @@ public class OpenNLPLangDetectUpdateProcessor extends
LanguageIdentifierUpdatePr
private static final Logger log =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
/** Maps ISO 639-3 (3-letter language code) to ISO 639-1 (2-letter language
code) */
- private static final Map<String,String> ISO639_MAP = make_ISO639_map();
-
- public OpenNLPLangDetectUpdateProcessor(SolrQueryRequest req,
SolrQueryResponse rsp,
- UpdateRequestProcessor next, LanguageDetectorModel model) {
+ private static final Map<String, String> ISO639_MAP = make_ISO639_map();
+
+ public OpenNLPLangDetectUpdateProcessor(
+ SolrQueryRequest req,
+ SolrQueryResponse rsp,
+ UpdateRequestProcessor next,
+ LanguageDetectorModel model) {
super(req, rsp, next);
this.model = model;
}
@@ -60,8 +62,9 @@ public class OpenNLPLangDetectUpdateProcessor extends
LanguageIdentifierUpdatePr
if (content.length() != 0) {
LanguageDetectorME ldme = new LanguageDetectorME(model);
Language[] langs = ldme.predictLanguages(content);
- for(Language language: langs){
- languages.add(new DetectedLanguage(ISO639_MAP.get(language.getLang()),
language.getConfidence()));
+ for (Language language : langs) {
+ languages.add(
+ new DetectedLanguage(ISO639_MAP.get(language.getLang()),
language.getConfidence()));
}
} else {
log.debug("No input text to detect language from, returning empty list");
@@ -69,8 +72,8 @@ public class OpenNLPLangDetectUpdateProcessor extends
LanguageIdentifierUpdatePr
return languages;
}
- private static Map<String,String> make_ISO639_map() {
- Map<String,String> map = new HashMap<>();
+ private static Map<String, String> make_ISO639_map() {
+ Map<String, String> map = new HashMap<>();
for (String lang : Locale.getISOLanguages()) {
Locale locale = new Locale(lang);
map.put(locale.getISO3Language(), locale.getLanguage());
diff --git
a/solr/modules/langid/src/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessorFactory.java
b/solr/modules/langid/src/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessorFactory.java
index 14e9fa9..109fff0 100644
---
a/solr/modules/langid/src/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessorFactory.java
+++
b/solr/modules/langid/src/java/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessorFactory.java
@@ -18,7 +18,7 @@ package org.apache.solr.update.processor;
import java.io.IOException;
import java.io.InputStream;
-
+import opennlp.tools.langdetect.LanguageDetectorModel;
import org.apache.commons.io.IOUtils;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
@@ -29,14 +29,13 @@ import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.util.SolrPluginUtils;
import org.apache.solr.util.plugin.SolrCoreAware;
-import opennlp.tools.langdetect.LanguageDetectorModel;
-
/**
- * Identifies the language of a set of input fields using <a
href="https://opennlp.apache.org/">Apache OpenNLP</a>.
- * <p>
- * The UpdateProcessorChain config entry can take a number of parameters
- * which may also be passed as HTTP parameters on the update request
- * and override the defaults. Here is the simplest processor config possible:
+ * Identifies the language of a set of input fields using <a
+ * href="https://opennlp.apache.org/">Apache OpenNLP</a>.
+ *
+ * <p>The UpdateProcessorChain config entry can take a number of parameters
which may also be passed
+ * as HTTP parameters on the update request and override the defaults. Here is
the simplest
+ * processor config possible:
*
* <pre class="prettyprint" >
* <processor
class="org.apache.solr.update.processor.OpenNLPLangDetectUpdateProcessorFactory">
@@ -45,12 +44,14 @@ import opennlp.tools.langdetect.LanguageDetectorModel;
* <str name="langid.model">langdetect-183.bin</str>
* </processor>
* </pre>
- * See <a
href="https://solr.apache.org/guide/language-detection.html#configuring-opennlp-language-detection">https://solr.apache.org/guide/language-detection.html#configuring-opennlp-language-detection</a>
+ *
+ * See <a
+ *
href="https://solr.apache.org/guide/language-detection.html#configuring-opennlp-language-detection">https://solr.apache.org/guide/language-detection.html#configuring-opennlp-language-detection</a>
*
* @since 7.3.0
*/
public class OpenNLPLangDetectUpdateProcessorFactory extends
UpdateRequestProcessorFactory
- implements SolrCoreAware {
+ implements SolrCoreAware {
private static final String MODEL_PARAM = "langid.model";
private String modelFile;
@@ -61,8 +62,7 @@ public class OpenNLPLangDetectUpdateProcessorFactory extends
UpdateRequestProces
private SolrResourceLoader solrResourceLoader;
@Override
- public void init(NamedList<?> args )
- {
+ public void init(NamedList<?> args) {
if (args != null) {
Object o;
o = args.get("defaults");
@@ -91,7 +91,8 @@ public class OpenNLPLangDetectUpdateProcessorFactory extends
UpdateRequestProces
} else {
modelFile = defaults.get(MODEL_PARAM);
if (modelFile == null) {
- throw new RuntimeException("Couldn't load language model, will
return empty languages always!");
+ throw new RuntimeException(
+ "Couldn't load language model, will return empty languages
always!");
}
}
}
@@ -99,7 +100,8 @@ public class OpenNLPLangDetectUpdateProcessorFactory extends
UpdateRequestProces
}
@Override
- public UpdateRequestProcessor getInstance(SolrQueryRequest req,
SolrQueryResponse rsp, UpdateRequestProcessor next) {
+ public UpdateRequestProcessor getInstance(
+ SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor
next) {
// Process defaults, appends and invariants if we got a request
if (req != null) {
SolrPluginUtils.setDefaults(req, defaults, appends, invariants);
@@ -109,19 +111,18 @@ public class OpenNLPLangDetectUpdateProcessorFactory
extends UpdateRequestProces
private void loadModel() throws IOException {
InputStream is = null;
- try{
+ try {
if (modelFile != null) {
is = solrResourceLoader.openResource(modelFile);
model = new LanguageDetectorModel(is);
}
- }
- finally{
+ } finally {
IOUtils.closeQuietly(is);
}
}
@Override
- public void inform(SolrCore core){
+ public void inform(SolrCore core) {
solrResourceLoader = core.getResourceLoader();
try {
loadModel();
diff --git
a/solr/modules/langid/src/java/org/apache/solr/update/processor/SolrInputDocumentReader.java
b/solr/modules/langid/src/java/org/apache/solr/update/processor/SolrInputDocumentReader.java
index ed839de..84fb505 100644
---
a/solr/modules/langid/src/java/org/apache/solr/update/processor/SolrInputDocumentReader.java
+++
b/solr/modules/langid/src/java/org/apache/solr/update/processor/SolrInputDocumentReader.java
@@ -23,7 +23,6 @@ import java.util.Iterator;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
-
import org.apache.commons.io.IOUtils;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
@@ -33,7 +32,9 @@ import org.slf4j.LoggerFactory;
/**
* Reader on top of SolrInputDocument that can "stream" a document as a
character stream in a memory
- * efficient way, to avoid potentially large intermediate string buffers
containing whole document content.
+ * efficient way, to avoid potentially large intermediate string buffers
containing whole document
+ * content.
+ *
* @lucene.experimental
*/
public class SolrInputDocumentReader extends Reader {
@@ -51,23 +52,26 @@ public class SolrInputDocumentReader extends Reader {
private int currentFieldValueIdx = 0;
private int currentFieldValueOffset = 0;
private boolean eod = false;
- // Normally a Reader will return -1 at end of document, but to work around
LangDetect's bug, we allow another value
+ // Normally a Reader will return -1 at end of document, but to work around
LangDetect's bug, we
+ // allow another value
private int eodReturnValue = -1;
/**
- * Creates a character-stream reader that streams all String fields in the
document with space as separator
+ * Creates a character-stream reader that streams all String fields in the
document with space as
+ * separator
*
* @param doc Solr input document
* @param maxCharsPerFieldValue max chars to consume per field value
* @param maxTotalChars max chars to consume total
*/
- public SolrInputDocumentReader(SolrInputDocument doc, int maxTotalChars, int
maxCharsPerFieldValue) {
+ public SolrInputDocumentReader(
+ SolrInputDocument doc, int maxTotalChars, int maxCharsPerFieldValue) {
this(doc, getStringFields(doc), maxTotalChars, maxCharsPerFieldValue, " ");
}
-
+
/**
- * Creates a character-stream reader that reads the listed fields in order,
with
- * max lengths as specified.
+ * Creates a character-stream reader that reads the listed fields in order,
with max lengths as
+ * specified.
*
* @param doc Solr input document
* @param fields list of field names to include
@@ -75,12 +79,17 @@ public class SolrInputDocumentReader extends Reader {
* @param maxCharsPerFieldValue max chars to consume per field value
* @param maxTotalChars max chars to consume total
*/
- public SolrInputDocumentReader(SolrInputDocument doc, String[] fields, int
maxTotalChars,
- int maxCharsPerFieldValue, String
fieldValueSep) {
+ public SolrInputDocumentReader(
+ SolrInputDocument doc,
+ String[] fields,
+ int maxTotalChars,
+ int maxCharsPerFieldValue,
+ String fieldValueSep) {
this.doc = doc;
this.fields = fields;
this.fieldValueSep = fieldValueSep;
- if (fields == null || fields.length == 0) throw new
IllegalArgumentException("fields cannot be empty");
+ if (fields == null || fields.length == 0)
+ throw new IllegalArgumentException("fields cannot be empty");
this.maxTotalChars = maxTotalChars;
this.maxCharsPerFieldValue = maxCharsPerFieldValue;
}
@@ -116,13 +125,13 @@ public class SolrInputDocumentReader extends Reader {
}
private int nextDocChunk(StringBuilder sb, int maxChunkLength) {
- if (currentFieldIdx > fields.length-1) {
+ if (currentFieldIdx > fields.length - 1) {
return returnEod();
}
int startFieldValueIdx = currentFieldValueIdx;
int startFieldValueOffset = currentFieldValueOffset;
-
+
do {
SolrInputField f = doc.getField(fields[currentFieldIdx]);
if (f == null) {
@@ -139,7 +148,7 @@ public class SolrInputDocumentReader extends Reader {
startFieldValueIdx = 0;
if (sb.length() > 0) {
if (maxChunkLength - sb.length() < fieldValueSep.length()) {
- sb.append(fieldValueSep.substring(0,maxChunkLength - sb.length()));
+ sb.append(fieldValueSep.substring(0, maxChunkLength -
sb.length()));
} else {
sb.append(fieldValueSep);
}
@@ -162,7 +171,7 @@ public class SolrInputDocumentReader extends Reader {
} else {
incField(sb);
}
- } while (currentFieldIdx <= fields.length-1 && sb.length() <
maxChunkLength);
+ } while (currentFieldIdx <= fields.length - 1 && sb.length() <
maxChunkLength);
return sb.length() == 0 ? eodReturnValue : sb.length();
}
@@ -186,7 +195,9 @@ public class SolrInputDocumentReader extends Reader {
}
@Override
- public void close() throws IOException { /* ignored */ }
+ public void close() throws IOException {
+ /* ignored */
+ }
@Override
public boolean ready() throws IOException {
@@ -194,8 +205,9 @@ public class SolrInputDocumentReader extends Reader {
}
/**
- * Choose another return value than -1 for end of document reached.
- * <b>Warning: Only to work around buggy consumers such as LangDetect 1.1</b>
+ * Choose another return value than -1 for end of document reached.
<b>Warning: Only to work
+ * around buggy consumers such as LangDetect 1.1</b>
+ *
* @param eodReturnValue integer which defaults to -1
*/
public void setEodReturnValue(int eodReturnValue) {
@@ -203,22 +215,26 @@ public class SolrInputDocumentReader extends Reader {
}
/**
- * Gets the whole reader as a String
+ * Gets the whole reader as a String
+ *
* @return string of concatenated fields
*/
public static String asString(Reader reader) {
try {
return IOUtils.toString(reader);
} catch (IOException e) {
- throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Failed
reading doc content from reader", e);
+ throw new SolrException(
+ SolrException.ErrorCode.SERVER_ERROR, "Failed reading doc content
from reader", e);
}
}
-
+
protected static String[] getStringFields(SolrInputDocument doc) {
Iterable<SolrInputField> iterable = () -> doc.iterator();
- List<String> strFields = StreamSupport.stream(iterable.spliterator(),
false)
+ List<String> strFields =
+ StreamSupport.stream(iterable.spliterator(), false)
.filter(f -> f.getFirstValue() instanceof String)
- .map(SolrInputField::getName).collect(Collectors.toList());
- return strFields.toArray(new String[0]);
+ .map(SolrInputField::getName)
+ .collect(Collectors.toList());
+ return strFields.toArray(new String[0]);
}
}
diff --git
a/solr/modules/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java
b/solr/modules/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java
index 5537780..64c57e7 100644
---
a/solr/modules/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java
+++
b/solr/modules/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessor.java
@@ -20,7 +20,6 @@ import java.io.Reader;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.List;
-
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.tika.language.LanguageIdentifier;
@@ -28,19 +27,20 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
- * Identifies the language of a set of input fields using Tika's
- * LanguageIdentifier.
- * The tika-core-x.y.jar must be on the classpath
- * <p>
- * See <a
href="https://solr.apache.org/guide/language-detection.html#configuring-tika-language-detection">https://solr.apache.org/guide/language-detection.html#configuring-tika-language-detection</a>
+ * Identifies the language of a set of input fields using Tika's
LanguageIdentifier. The
+ * tika-core-x.y.jar must be on the classpath
+ *
+ * <p>See <a
+ *
href="https://solr.apache.org/guide/language-detection.html#configuring-tika-language-detection">https://solr.apache.org/guide/language-detection.html#configuring-tika-language-detection</a>
+ *
* @since 3.5
*/
public class TikaLanguageIdentifierUpdateProcessor extends
LanguageIdentifierUpdateProcessor {
private static final Logger log =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
- public TikaLanguageIdentifierUpdateProcessor(SolrQueryRequest req,
- SolrQueryResponse rsp, UpdateRequestProcessor next) {
+ public TikaLanguageIdentifierUpdateProcessor(
+ SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor
next) {
super(req, rsp, next);
}
@@ -51,16 +51,21 @@ public class TikaLanguageIdentifierUpdateProcessor extends
LanguageIdentifierUpd
if (content.length() != 0) {
LanguageIdentifier identifier = new LanguageIdentifier(content);
// FIXME: Hack - we get the distance from toString and calculate our own
certainty score
- Double distance =
Double.parseDouble(tikaSimilarityPattern.matcher(identifier.toString()).replaceFirst("$1"));
- // This formula gives: 0.02 => 0.8, 0.1 => 0.5 which is a better
sweetspot than isReasonablyCertain()
+ Double distance =
+ Double.parseDouble(
+
tikaSimilarityPattern.matcher(identifier.toString()).replaceFirst("$1"));
+ // This formula gives: 0.02 => 0.8, 0.1 => 0.5 which is a better
sweetspot than
+ // isReasonablyCertain()
Double certainty = 1 - (5 * distance);
- if (certainty < 0)
- certainty = 0d;
+ if (certainty < 0) certainty = 0d;
DetectedLanguage language = new
DetectedLanguage(identifier.getLanguage(), certainty);
languages.add(language);
if (log.isDebugEnabled()) {
- log.debug("Language detected as {} with a certainty of {} (Tika
distance={})"
- , language, language.getCertainty(), identifier);
+ log.debug(
+ "Language detected as {} with a certainty of {} (Tika
distance={})",
+ language,
+ language.getCertainty(),
+ identifier);
}
} else {
log.debug("No input text to detect language from, returning empty list");
diff --git
a/solr/modules/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessorFactory.java
b/solr/modules/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessorFactory.java
index 4c79dd5..a1ea4bf 100644
---
a/solr/modules/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessorFactory.java
+++
b/solr/modules/langid/src/java/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessorFactory.java
@@ -25,12 +25,12 @@ import org.apache.solr.util.SolrPluginUtils;
import org.apache.solr.util.plugin.SolrCoreAware;
/**
- * Identifies the language of a set of input fields using Tika's
- * LanguageIdentifier. The tika-core-x.y.jar must be on the classpath
- * <p>
- * The UpdateProcessorChain config entry can take a number of parameters
- * which may also be passed as HTTP parameters on the update request
- * and override the defaults. Here is the simplest processor config possible:
+ * Identifies the language of a set of input fields using Tika's
LanguageIdentifier. The
+ * tika-core-x.y.jar must be on the classpath
+ *
+ * <p>The UpdateProcessorChain config entry can take a number of parameters
which may also be passed
+ * as HTTP parameters on the update request and override the defaults. Here is
the simplest
+ * processor config possible:
*
* <pre class="prettyprint" >
* <processor
class="org.apache.solr.update.processor.TikaLanguageIdentifierUpdateProcessorFactory">
@@ -38,28 +38,30 @@ import org.apache.solr.util.plugin.SolrCoreAware;
* <str name="langid.langField">language_s</str>
* </processor>
* </pre>
- * See <a
href="https://solr.apache.org/guide/language-detection.html#configuring-tika-language-detection">https://solr.apache.org/guide/language-detection.html#configuring-tika-language-detection</a>
+ *
+ * See <a
+ *
href="https://solr.apache.org/guide/language-detection.html#configuring-tika-language-detection">https://solr.apache.org/guide/language-detection.html#configuring-tika-language-detection</a>
+ *
* @since 3.5
*/
-public class TikaLanguageIdentifierUpdateProcessorFactory extends
- UpdateRequestProcessorFactory implements SolrCoreAware, LangIdParams {
+public class TikaLanguageIdentifierUpdateProcessorFactory extends
UpdateRequestProcessorFactory
+ implements SolrCoreAware, LangIdParams {
protected SolrParams defaults;
protected SolrParams appends;
protected SolrParams invariants;
@Override
- public void inform(SolrCore core) {
- }
+ public void inform(SolrCore core) {}
/**
- * The UpdateRequestProcessor may be initialized in solrconfig.xml similarly
- * to a RequestHandler, with defaults, appends and invariants.
+ * The UpdateRequestProcessor may be initialized in solrconfig.xml similarly
to a RequestHandler,
+ * with defaults, appends and invariants.
+ *
* @param args a NamedList with the configuration parameters
*/
@Override
- public void init(NamedList<?> args )
- {
+ public void init(NamedList<?> args) {
if (args != null) {
Object o;
o = args.get("defaults");
@@ -80,14 +82,12 @@ public class TikaLanguageIdentifierUpdateProcessorFactory
extends
}
@Override
- public UpdateRequestProcessor getInstance(SolrQueryRequest req,
- SolrQueryResponse rsp,
UpdateRequestProcessor next) {
+ public UpdateRequestProcessor getInstance(
+ SolrQueryRequest req, SolrQueryResponse rsp, UpdateRequestProcessor
next) {
// Process defaults, appends and invariants if we got a request
- if(req != null) {
+ if (req != null) {
SolrPluginUtils.setDefaults(req, defaults, appends, invariants);
}
return new TikaLanguageIdentifierUpdateProcessor(req, rsp, next);
}
-
-
}
diff --git
a/solr/modules/langid/src/test/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessorFactoryTest.java
b/solr/modules/langid/src/test/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessorFactoryTest.java
index e7d3c15..722205c 100644
---
a/solr/modules/langid/src/test/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessorFactoryTest.java
+++
b/solr/modules/langid/src/test/org/apache/solr/update/processor/LangDetectLanguageIdentifierUpdateProcessorFactoryTest.java
@@ -20,12 +20,15 @@ import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.junit.Test;
-public class LangDetectLanguageIdentifierUpdateProcessorFactoryTest extends
LanguageIdentifierUpdateProcessorFactoryTestCase {
+public class LangDetectLanguageIdentifierUpdateProcessorFactoryTest
+ extends LanguageIdentifierUpdateProcessorFactoryTestCase {
@Override
- protected LanguageIdentifierUpdateProcessor
createLangIdProcessor(ModifiableSolrParams parameters) throws Exception {
- return new
LangDetectLanguageIdentifierUpdateProcessor(_parser.buildRequestFrom(h.getCore(),
parameters, null), resp, null);
+ protected LanguageIdentifierUpdateProcessor
createLangIdProcessor(ModifiableSolrParams parameters)
+ throws Exception {
+ return new LangDetectLanguageIdentifierUpdateProcessor(
+ _parser.buildRequestFrom(h.getCore(), parameters, null), resp, null);
}
-
+
// this one actually works better it seems with short docs
@Override
protected SolrInputDocument tooShortDoc() {
@@ -33,29 +36,114 @@ public class
LangDetectLanguageIdentifierUpdateProcessorFactoryTest extends Lang
doc.addField("text", "");
return doc;
}
-
+
/* we don't return 'un' for the super-short one (this detector things
hungarian?).
* replace this with japanese
*/
- @Test @Override
+ @Test
+ @Override
public void testLangIdGlobal() throws Exception {
ModifiableSolrParams parameters = new ModifiableSolrParams();
parameters.add("langid.fl", "name,subject");
parameters.add("langid.langField", "language_s");
parameters.add("langid.fallback", "un");
liProcessor = createLangIdProcessor(parameters);
-
- assertLang("no", "id", "1no", "name", "Lucene", "subject", "Lucene er et
fri/åpen kildekode programvarebibliotek for informasjonsgjenfinning,
opprinnelig utviklet i programmeringsspråket Java av Doug Cutting. Lucene
støttes av Apache Software Foundation og utgis under Apache-lisensen.");
- assertLang("en", "id", "2en", "name", "Lucene", "subject", "Apache Lucene
is a free/open source information retrieval software library, originally
created in Java by Doug Cutting. It is supported by the Apache Software
Foundation and is released under the Apache Software License.");
- assertLang("sv", "id", "3sv", "name", "Maven", "subject", "Apache Maven är
ett verktyg utvecklat av Apache Software Foundation och används inom
systemutveckling av datorprogram i programspråket Java. Maven används för att
automatiskt paketera (bygga) programfilerna till en distribuerbar enhet. Maven
används inom samma område som Apache Ant men dess byggfiler är deklarativa till
skillnad ifrån Ants skriptbaserade.");
- assertLang("es", "id", "4es", "name", "Lucene", "subject", "Lucene es un
API de código abierto para recuperación de información, originalmente
implementada en Java por Doug Cutting. Está apoyado por el Apache Software
Foundation y se distribuye bajo la Apache Software License. Lucene tiene
versiones para otros lenguajes incluyendo Delphi, Perl, C#, C++, Python, Ruby y
PHP.");
- assertLang("ja", "id", "5ja", "name", "Japanese", "subject",
"日本語(にほんご、にっぽんご)は主として、日本で使用されてきた言語である。日本国は法令上、公用語を明記していないが、事実上の公用語となっており、学校教育の「国語」で教えられる。");
- assertLang("th", "id", "6th", "name", "บทความคัดสรรเดือนนี้", "subject",
"อันเนอลีส มารี อันเนอ ฟรังค์ หรือมักรู้จักในภาษาไทยว่า แอนน์ แฟรงค์
เป็นเด็กหญิงชาวยิว เกิดที่เมืองแฟรงก์เฟิร์ต ประเทศเยอรมนี
เธอมีชื่อเสียงโด่งดังในฐานะผู้เขียนบันทึกประจำวันซึ่งต่อมาได้รับการตีพิมพ์เป็นหนังสือ
บรรยายเหตุการณ์ขณะหลบซ่อนตัวจากการล่าชาวยิวในประเทศเนเธอร์แลนด์
ระหว่างที่ถูกเยอรมนีเข้าค
รอบครองในช่วงสงครามโลกครั้งที่สอง");
- assertLang("ru", "id", "7ru", "name", "Lucene", "subject", "The Apache
Lucene — это свободная библиотека для высокоскоростного полнотекстового поиска,
написанная на Java. Может быть использована для поиска в интернете и других
областях компьютерной лингвистики (аналитическая философия).");
- assertLang("de", "id", "8de", "name", "Lucene", "subject", "Lucene ist ein
Freie-Software-Projekt der Apache Software Foundation, das eine Suchsoftware
erstellt. Durch die hohe Leistungsfähigkeit und Skalierbarkeit können die
Lucene-Werkzeuge für beliebige Projektgrößen und Anforderungen eingesetzt
werden. So setzt beispielsweise Wikipedia Lucene für die Volltextsuche ein.
Zudem verwenden die beiden Desktop-Suchprogramme Beagle und Strigi eine C#-
bzw. C++- Portierung von Lucene als [...]
- assertLang("fr", "id", "9fr", "name", "Lucene", "subject", "Lucene est un
moteur de recherche libre écrit en Java qui permet d'indexer et de rechercher
du texte. C'est un projet open source de la fondation Apache mis à disposition
sous licence Apache. Il est également disponible pour les langages Ruby, Perl,
C++, PHP.");
- assertLang("nl", "id", "10nl", "name", "Lucene", "subject", "Lucene is een
gratis open source, tekst gebaseerde information retrieval API van origine
geschreven in Java door Doug Cutting. Het wordt ondersteund door de Apache
Software Foundation en is vrijgegeven onder de Apache Software Licentie. Lucene
is ook beschikbaar in andere programeertalen zoals Perl, C#, C++, Python, Ruby
en PHP.");
- assertLang("it", "id", "11it", "name", "Lucene", "subject", "Lucene è una
API gratuita ed open source per il reperimento di informazioni inizialmente
implementata in Java da Doug Cutting. È supportata dall'Apache Software
Foundation ed è resa disponibile con l'Apache License. Lucene è stata
successivamente reimplementata in Perl, C#, C++, Python, Ruby e PHP.");
- assertLang("pt", "id", "12pt", "name", "Lucene", "subject", "Apache
Lucene, ou simplesmente Lucene, é um software de busca e uma API de indexação
de documentos, escrito na linguagem de programação Java. É um software de
código aberto da Apache Software Foundation licenciado através da licença
Apache.");
+
+ assertLang(
+ "no",
+ "id",
+ "1no",
+ "name",
+ "Lucene",
+ "subject",
+ "Lucene er et fri/åpen kildekode programvarebibliotek for
informasjonsgjenfinning, opprinnelig utviklet i programmeringsspråket Java av
Doug Cutting. Lucene støttes av Apache Software Foundation og utgis under
Apache-lisensen.");
+ assertLang(
+ "en",
+ "id",
+ "2en",
+ "name",
+ "Lucene",
+ "subject",
+ "Apache Lucene is a free/open source information retrieval software
library, originally created in Java by Doug Cutting. It is supported by the
Apache Software Foundation and is released under the Apache Software License.");
+ assertLang(
+ "sv",
+ "id",
+ "3sv",
+ "name",
+ "Maven",
+ "subject",
+ "Apache Maven är ett verktyg utvecklat av Apache Software Foundation
och används inom systemutveckling av datorprogram i programspråket Java. Maven
används för att automatiskt paketera (bygga) programfilerna till en
distribuerbar enhet. Maven används inom samma område som Apache Ant men dess
byggfiler är deklarativa till skillnad ifrån Ants skriptbaserade.");
+ assertLang(
+ "es",
+ "id",
+ "4es",
+ "name",
+ "Lucene",
+ "subject",
+ "Lucene es un API de código abierto para recuperación de información,
originalmente implementada en Java por Doug Cutting. Está apoyado por el Apache
Software Foundation y se distribuye bajo la Apache Software License. Lucene
tiene versiones para otros lenguajes incluyendo Delphi, Perl, C#, C++, Python,
Ruby y PHP.");
+ assertLang(
+ "ja",
+ "id",
+ "5ja",
+ "name",
+ "Japanese",
+ "subject",
+
"日本語(にほんご、にっぽんご)は主として、日本で使用されてきた言語である。日本国は法令上、公用語を明記していないが、事実上の公用語となっており、学校教育の「国語」で教えられる。");
+ assertLang(
+ "th",
+ "id",
+ "6th",
+ "name",
+ "บทความคัดสรรเดือนนี้",
+ "subject",
+ "อันเนอลีส มารี อันเนอ ฟรังค์ หรือมักรู้จักในภาษาไทยว่า แอนน์ แฟรงค์
เป็นเด็กหญิงชาวยิว เกิดที่เมืองแฟรงก์เฟิร์ต ประเทศเยอรมนี
เธอมีชื่อเสียงโด่งดังในฐานะผู้เขียนบันทึกประจำวันซึ่งต่อมาได้รับการตีพิมพ์เป็นหนังสือ
บรรยายเหตุการณ์ขณะหลบซ่อนตัวจากการล่าชาวยิวในประเทศเนเธอร์แลนด์
ระหว่างที่ถูกเยอรมนีเข้าครอบครองในช่วงสงครามโลกครั้งที่สอง");
+ assertLang(
+ "ru",
+ "id",
+ "7ru",
+ "name",
+ "Lucene",
+ "subject",
+ "The Apache Lucene — это свободная библиотека для высокоскоростного
полнотекстового поиска, написанная на Java. Может быть использована для поиска
в интернете и других областях компьютерной лингвистики (аналитическая
философия).");
+ assertLang(
+ "de",
+ "id",
+ "8de",
+ "name",
+ "Lucene",
+ "subject",
+ "Lucene ist ein Freie-Software-Projekt der Apache Software Foundation,
das eine Suchsoftware erstellt. Durch die hohe Leistungsfähigkeit und
Skalierbarkeit können die Lucene-Werkzeuge für beliebige Projektgrößen und
Anforderungen eingesetzt werden. So setzt beispielsweise Wikipedia Lucene für
die Volltextsuche ein. Zudem verwenden die beiden Desktop-Suchprogramme Beagle
und Strigi eine C#- bzw. C++- Portierung von Lucene als Indexer.");
+ assertLang(
+ "fr",
+ "id",
+ "9fr",
+ "name",
+ "Lucene",
+ "subject",
+ "Lucene est un moteur de recherche libre écrit en Java qui permet
d'indexer et de rechercher du texte. C'est un projet open source de la
fondation Apache mis à disposition sous licence Apache. Il est également
disponible pour les langages Ruby, Perl, C++, PHP.");
+ assertLang(
+ "nl",
+ "id",
+ "10nl",
+ "name",
+ "Lucene",
+ "subject",
+ "Lucene is een gratis open source, tekst gebaseerde information
retrieval API van origine geschreven in Java door Doug Cutting. Het wordt
ondersteund door de Apache Software Foundation en is vrijgegeven onder de
Apache Software Licentie. Lucene is ook beschikbaar in andere programeertalen
zoals Perl, C#, C++, Python, Ruby en PHP.");
+ assertLang(
+ "it",
+ "id",
+ "11it",
+ "name",
+ "Lucene",
+ "subject",
+ "Lucene è una API gratuita ed open source per il reperimento di
informazioni inizialmente implementata in Java da Doug Cutting. È supportata
dall'Apache Software Foundation ed è resa disponibile con l'Apache License.
Lucene è stata successivamente reimplementata in Perl, C#, C++, Python, Ruby e
PHP.");
+ assertLang(
+ "pt",
+ "id",
+ "12pt",
+ "name",
+ "Lucene",
+ "subject",
+ "Apache Lucene, ou simplesmente Lucene, é um software de busca e uma
API de indexação de documentos, escrito na linguagem de programação Java. É um
software de código aberto da Apache Software Foundation licenciado através da
licença Apache.");
}
}
diff --git
a/solr/modules/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java
b/solr/modules/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java
index 4b19900..95a3a3b 100644
---
a/solr/modules/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java
+++
b/solr/modules/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java
@@ -18,7 +18,6 @@ package org.apache.solr.update.processor;
import java.util.ArrayList;
import java.util.List;
-
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.ModifiableSolrParams;
@@ -37,7 +36,10 @@ public abstract class
LanguageIdentifierUpdateProcessorFactoryTestCase extends S
@BeforeClass
public static void beforeClass() throws Exception {
- initCore("solrconfig-languageidentifier.xml", "schema.xml",
getFile("langid/solr").getAbsolutePath());
+ initCore(
+ "solrconfig-languageidentifier.xml",
+ "schema.xml",
+ getFile("langid/solr").getAbsolutePath());
SolrCore core = h.getCore();
UpdateRequestProcessorChain chained =
core.getUpdateProcessingChain("lang_id_tika");
assertNotNull(chained);
@@ -62,30 +64,163 @@ public abstract class
LanguageIdentifierUpdateProcessorFactoryTestCase extends S
parameters.add("langid.langField", "language_s");
parameters.add("langid.fallback", "un");
liProcessor = createLangIdProcessor(parameters);
-
- assertLang("no", "id", "1no", "name", "Lucene", "subject", "Lucene er et
fri/åpen kildekode programvarebibliotek for informasjonsgjenfinning,
opprinnelig utviklet i programmeringsspråket Java av Doug Cutting. Lucene
støttes av Apache Software Foundation og utgis under Apache-lisensen.");
- assertLang("en", "id", "2en", "name", "Lucene", "subject", "Apache Lucene
is a free/open source information retrieval software library, originally
created in Java by Doug Cutting. It is supported by the Apache Software
Foundation and is released under the Apache Software License.");
- assertLang("sv", "id", "3sv", "name", "Maven", "subject", "Apache Maven är
ett verktyg utvecklat av Apache Software Foundation och används inom
systemutveckling av datorprogram i programspråket Java. Maven används för att
automatiskt paketera (bygga) programfilerna till en distribuerbar enhet. Maven
används inom samma område som Apache Ant men dess byggfiler är deklarativa till
skillnad ifrån Ants skriptbaserade.");
- assertLang("es", "id", "4es", "name", "Español", "subject", "El español,
como las otras lenguas romances, es una continuación moderna del latín hablado
(denominado latín vulgar), desde el siglo III, que tras el desmembramiento del
Imperio romano fue divergiendo de las otras variantes del latín que se hablaban
en las distintas provincias del antiguo Imperio, dando lugar mediante una lenta
evolución a las distintas lenguas romances. Debido a su propagación por
América, el español es, c [...]
+
+ assertLang(
+ "no",
+ "id",
+ "1no",
+ "name",
+ "Lucene",
+ "subject",
+ "Lucene er et fri/åpen kildekode programvarebibliotek for
informasjonsgjenfinning, opprinnelig utviklet i programmeringsspråket Java av
Doug Cutting. Lucene støttes av Apache Software Foundation og utgis under
Apache-lisensen.");
+ assertLang(
+ "en",
+ "id",
+ "2en",
+ "name",
+ "Lucene",
+ "subject",
+ "Apache Lucene is a free/open source information retrieval software
library, originally created in Java by Doug Cutting. It is supported by the
Apache Software Foundation and is released under the Apache Software License.");
+ assertLang(
+ "sv",
+ "id",
+ "3sv",
+ "name",
+ "Maven",
+ "subject",
+ "Apache Maven är ett verktyg utvecklat av Apache Software Foundation
och används inom systemutveckling av datorprogram i programspråket Java. Maven
används för att automatiskt paketera (bygga) programfilerna till en
distribuerbar enhet. Maven används inom samma område som Apache Ant men dess
byggfiler är deklarativa till skillnad ifrån Ants skriptbaserade.");
+ assertLang(
+ "es",
+ "id",
+ "4es",
+ "name",
+ "Español",
+ "subject",
+ "El español, como las otras lenguas romances, es una continuación
moderna del latín hablado (denominado latín vulgar), desde el siglo III, que
tras el desmembramiento del Imperio romano fue divergiendo de las otras
variantes del latín que se hablaban en las distintas provincias del antiguo
Imperio, dando lugar mediante una lenta evolución a las distintas lenguas
romances. Debido a su propagación por América, el español es, con diferencia,
la lengua romance que ha logrado mayor di [...]
assertLang("un", "id", "5un", "name", "a", "subject", "b");
- assertLang("th", "id", "6th", "name", "บทความคัดสรรเดือนนี้", "subject",
"อันเนอลีส มารี อันเนอ ฟรังค์ หรือมักรู้จักในภาษาไทยว่า แอนน์ แฟรงค์
เป็นเด็กหญิงชาวยิว เกิดที่เมืองแฟรงก์เฟิร์ต ประเทศเยอรมนี
เธอมีชื่อเสียงโด่งดังในฐานะผู้เขียนบันทึกประจำวันซึ่งต่อมาได้รับการตีพิมพ์เป็นหนังสือ
บรรยายเหตุการณ์ขณะหลบซ่อนตัวจากการล่าชาวยิวในประเทศเนเธอร์แลนด์
ระหว่างที่ถูกเยอรมนีเข้าค
รอบครองในช่วงสงครามโลกครั้งที่สอง");
- assertLang("ru", "id", "7ru", "name", "Lucene", "subject", "The Apache
Lucene — это свободная библиотека для высокоскоростного полнотекстового поиска,
написанная на Java. Может быть использована для поиска в интернете и других
областях компьютерной лингвистики (аналитическая философия).");
- assertLang("de", "id", "8de", "name", "Lucene", "subject", "Lucene ist ein
Freie-Software-Projekt der Apache Software Foundation, das eine Suchsoftware
erstellt. Durch die hohe Leistungsfähigkeit und Skalierbarkeit können die
Lucene-Werkzeuge für beliebige Projektgrößen und Anforderungen eingesetzt
werden. So setzt beispielsweise Wikipedia Lucene für die Volltextsuche ein.
Zudem verwenden die beiden Desktop-Suchprogramme Beagle und Strigi eine C#-
bzw. C++- Portierung von Lucene als [...]
- assertLang("fr", "id", "9fr", "name", "Lucene", "subject", "Lucene est un
moteur de recherche libre écrit en Java qui permet d'indexer et de rechercher
du texte. C'est un projet open source de la fondation Apache mis à disposition
sous licence Apache. Il est également disponible pour les langages Ruby, Perl,
C++, PHP.");
- assertLang("nl", "id", "10nl", "name", "Lucene", "subject", "Lucene is een
gratis open source, tekst gebaseerde information retrieval API van origine
geschreven in Java door Doug Cutting. Het wordt ondersteund door de Apache
Software Foundation en is vrijgegeven onder de Apache Software Licentie. Lucene
is ook beschikbaar in andere programeertalen zoals Perl, C#, C++, Python, Ruby
en PHP.");
- assertLang("it", "id", "11it", "name", "Lucene", "subject", "Lucene è una
API gratuita ed open source per il reperimento di informazioni inizialmente
implementata in Java da Doug Cutting. È supportata dall'Apache Software
Foundation ed è resa disponibile con l'Apache License. Lucene è stata
successivamente reimplementata in Perl, C#, C++, Python, Ruby e PHP.");
- assertLang("pt", "id", "12pt", "name", "Lucene", "subject", "Apache
Lucene, ou simplesmente Lucene, é um software de busca e uma API de indexação
de documentos, escrito na linguagem de programação Java. É um software de
código aberto da Apache Software Foundation licenciado através da licença
Apache.");
+ assertLang(
+ "th",
+ "id",
+ "6th",
+ "name",
+ "บทความคัดสรรเดือนนี้",
+ "subject",
+ "อันเนอลีส มารี อันเนอ ฟรังค์ หรือมักรู้จักในภาษาไทยว่า แอนน์ แฟรงค์
เป็นเด็กหญิงชาวยิว เกิดที่เมืองแฟรงก์เฟิร์ต ประเทศเยอรมนี
เธอมีชื่อเสียงโด่งดังในฐานะผู้เขียนบันทึกประจำวันซึ่งต่อมาได้รับการตีพิมพ์เป็นหนังสือ
บรรยายเหตุการณ์ขณะหลบซ่อนตัวจากการล่าชาวยิวในประเทศเนเธอร์แลนด์
ระหว่างที่ถูกเยอรมนีเข้าครอบครองในช่วงสงครามโลกครั้งที่สอง");
+ assertLang(
+ "ru",
+ "id",
+ "7ru",
+ "name",
+ "Lucene",
+ "subject",
+ "The Apache Lucene — это свободная библиотека для высокоскоростного
полнотекстового поиска, написанная на Java. Может быть использована для поиска
в интернете и других областях компьютерной лингвистики (аналитическая
философия).");
+ assertLang(
+ "de",
+ "id",
+ "8de",
+ "name",
+ "Lucene",
+ "subject",
+ "Lucene ist ein Freie-Software-Projekt der Apache Software Foundation,
das eine Suchsoftware erstellt. Durch die hohe Leistungsfähigkeit und
Skalierbarkeit können die Lucene-Werkzeuge für beliebige Projektgrößen und
Anforderungen eingesetzt werden. So setzt beispielsweise Wikipedia Lucene für
die Volltextsuche ein. Zudem verwenden die beiden Desktop-Suchprogramme Beagle
und Strigi eine C#- bzw. C++- Portierung von Lucene als Indexer.");
+ assertLang(
+ "fr",
+ "id",
+ "9fr",
+ "name",
+ "Lucene",
+ "subject",
+ "Lucene est un moteur de recherche libre écrit en Java qui permet
d'indexer et de rechercher du texte. C'est un projet open source de la
fondation Apache mis à disposition sous licence Apache. Il est également
disponible pour les langages Ruby, Perl, C++, PHP.");
+ assertLang(
+ "nl",
+ "id",
+ "10nl",
+ "name",
+ "Lucene",
+ "subject",
+ "Lucene is een gratis open source, tekst gebaseerde information
retrieval API van origine geschreven in Java door Doug Cutting. Het wordt
ondersteund door de Apache Software Foundation en is vrijgegeven onder de
Apache Software Licentie. Lucene is ook beschikbaar in andere programeertalen
zoals Perl, C#, C++, Python, Ruby en PHP.");
+ assertLang(
+ "it",
+ "id",
+ "11it",
+ "name",
+ "Lucene",
+ "subject",
+ "Lucene è una API gratuita ed open source per il reperimento di
informazioni inizialmente implementata in Java da Doug Cutting. È supportata
dall'Apache Software Foundation ed è resa disponibile con l'Apache License.
Lucene è stata successivamente reimplementata in Perl, C#, C++, Python, Ruby e
PHP.");
+ assertLang(
+ "pt",
+ "id",
+ "12pt",
+ "name",
+ "Lucene",
+ "subject",
+ "Apache Lucene, ou simplesmente Lucene, é um software de busca e uma
API de indexação de documentos, escrito na linguagem de programação Java. É um
software de código aberto da Apache Software Foundation licenciado através da
licença Apache.");
// New in Tika1.0
- assertLang("ca", "id", "13ca", "name", "Catalan", "subject", "El català
posseeix dos estàndards principals: el regulat per l'Institut d'Estudis
Catalans, o estàndard general, que pren com a base l'ortografia establerta per
Pompeu Fabra amb els trets gramaticals i ortogràfics característics del català
central; i el regulat per l'Acadèmia Valenciana de la Llengua, estàndard
d'àmbit restringit, centrat en l'estandardització del valencià i que pren com a
base les Normes de Castelló, és a [...]
- assertLang("be", "id", "14be", "name", "Belarusian", "subject", "Наступнай
буйной дзяржавай на беларускай зямлі было Вялікае княства Літоўскае, Рускае і
Жамойцкае (ВКЛ). Падчас стварэння і пачатковага развіцця гэтай дзяржавы
найбуйнейшым і асноўным яе цэнтрам быў Новагародак. Акрамя сучасных земляў
Беларусі, у склад гэтай дзяржавы ўваходзілі таксама землі сучаснай Літвы,
паўночная частка сучаснай Украіны і частка сучаснай Расіі.");
- assertLang("eo", "id", "15eo", "name", "Esperanto", "subject", "La
vortprovizo de Esperanto devenas plejparte el la okcidenteŭropaj lingvoj, dum
ĝia sintakso kaj morfologio montras ankaŭ slavlingvan influon. La morfemoj ne
ŝanĝiĝas kaj oni povas ilin preskaŭ senlime kombini, kreante diverssignifajn
vortojn, Esperanto do havas multajn kunaĵojn kun la analizaj lingvoj, al kiuj
apartenas ekzemple la ĉina; kontraŭe la interna strukturo de Esperanto
certagrade respegulas la aglutinajn lin [...]
- assertLang("gl", "id", "16gl", "name", "Galician", "subject", "A cifra de
falantes medrou axiña durante as décadas seguintes, nun principio no Imperio
ruso e na Europa oriental, logo na Europa occidental, América, China e no
Xapón. Nos primeiros anos do movemento, os esperantistas mantiñan contacto por
correspondencia, pero en 1905 o primeiro Congreso Universal de Esperanto
levouse a cabo na cidade francesa de Boulogne-sur-Mer. Dende entón, os
congresos mundiais organizáronse nos cin [...]
- assertLang("ro", "id", "17ro", "name", "Romanian", "subject", "La momentul
destrămării Uniunii Sovietice și a înlăturării regimului comunist instalat în
România (1989), țara a inițiat o serie de reforme economice și politice. După
un deceniu de probleme economice, România a introdus noi reforme economice de
ordin general (precum cota unică de impozitare, în 2005) și a aderat la Uniunea
Europeană la 1 ianuarie 2007.");
- assertLang("sk", "id", "18sk", "name", "Slovakian", "subject", "Boli
vytvorené dva národné parlamenty - Česká národná rada a Slovenská národná rada
a spoločný jednokomorový česko-slovenský parlament bol premenovaný z Národného
zhromaždenia na Federálne zhromaždenie s dvoma komorami - Snemovňou ľudu a
Snemovňu národov.");
- assertLang("sl", "id", "19sl", "name", "Slovenian", "subject", "Slovenska
Wikipedija je različica spletne enciklopedije Wikipedije v slovenskem jeziku.
Projekt slovenske Wikipedije se je začel 26. februarja 2002 z ustanovitvijo
njene spletne strani, njen pobudnik pa je bil uporabnik Jani Melik.");
- assertLang("uk", "id", "20uk", "name", "Ukrainian", "subject",
"Народно-господарський комплекс країни включає такі види промисловості як важке
машинобудування, чорна та кольорова металургія, суднобудування, виробництво
автобусів, легкових та вантажних автомобілів, тракторів та іншої
сільськогосподарської техніки, тепловозів, верстатів, турбін, авіаційних
двигунів та літаків, обладнання для електростанцій, нафто-газової та хімічної
промисловості тощо. Крім того, Україна є потужним вир [...]
+ assertLang(
+ "ca",
+ "id",
+ "13ca",
+ "name",
+ "Catalan",
+ "subject",
+ "El català posseeix dos estàndards principals: el regulat per
l'Institut d'Estudis Catalans, o estàndard general, que pren com a base
l'ortografia establerta per Pompeu Fabra amb els trets gramaticals i
ortogràfics característics del català central; i el regulat per l'Acadèmia
Valenciana de la Llengua, estàndard d'àmbit restringit, centrat en
l'estandardització del valencià i que pren com a base les Normes de Castelló,
és a dir, l'ortografia de Pompeu Fabra però més adaptada a la [...]
+ assertLang(
+ "be",
+ "id",
+ "14be",
+ "name",
+ "Belarusian",
+ "subject",
+ "Наступнай буйной дзяржавай на беларускай зямлі было Вялікае княства
Літоўскае, Рускае і Жамойцкае (ВКЛ). Падчас стварэння і пачатковага развіцця
гэтай дзяржавы найбуйнейшым і асноўным яе цэнтрам быў Новагародак. Акрамя
сучасных земляў Беларусі, у склад гэтай дзяржавы ўваходзілі таксама землі
сучаснай Літвы, паўночная частка сучаснай Украіны і частка сучаснай Расіі.");
+ assertLang(
+ "eo",
+ "id",
+ "15eo",
+ "name",
+ "Esperanto",
+ "subject",
+ "La vortprovizo de Esperanto devenas plejparte el la okcidenteŭropaj
lingvoj, dum ĝia sintakso kaj morfologio montras ankaŭ slavlingvan influon. La
morfemoj ne ŝanĝiĝas kaj oni povas ilin preskaŭ senlime kombini, kreante
diverssignifajn vortojn, Esperanto do havas multajn kunaĵojn kun la analizaj
lingvoj, al kiuj apartenas ekzemple la ĉina; kontraŭe la interna strukturo de
Esperanto certagrade respegulas la aglutinajn lingvojn, kiel la japanan,
svahilan aŭ turkan.");
+ assertLang(
+ "gl",
+ "id",
+ "16gl",
+ "name",
+ "Galician",
+ "subject",
+ "A cifra de falantes medrou axiña durante as décadas seguintes, nun
principio no Imperio ruso e na Europa oriental, logo na Europa occidental,
América, China e no Xapón. Nos primeiros anos do movemento, os esperantistas
mantiñan contacto por correspondencia, pero en 1905 o primeiro Congreso
Universal de Esperanto levouse a cabo na cidade francesa de Boulogne-sur-Mer.
Dende entón, os congresos mundiais organizáronse nos cinco continentes ano tras
ano agás durante as dúas Guerras M [...]
+ assertLang(
+ "ro",
+ "id",
+ "17ro",
+ "name",
+ "Romanian",
+ "subject",
+ "La momentul destrămării Uniunii Sovietice și a înlăturării regimului
comunist instalat în România (1989), țara a inițiat o serie de reforme
economice și politice. După un deceniu de probleme economice, România a
introdus noi reforme economice de ordin general (precum cota unică de
impozitare, în 2005) și a aderat la Uniunea Europeană la 1 ianuarie 2007.");
+ assertLang(
+ "sk",
+ "id",
+ "18sk",
+ "name",
+ "Slovakian",
+ "subject",
+ "Boli vytvorené dva národné parlamenty - Česká národná rada a
Slovenská národná rada a spoločný jednokomorový česko-slovenský parlament bol
premenovaný z Národného zhromaždenia na Federálne zhromaždenie s dvoma komorami
- Snemovňou ľudu a Snemovňu národov.");
+ assertLang(
+ "sl",
+ "id",
+ "19sl",
+ "name",
+ "Slovenian",
+ "subject",
+ "Slovenska Wikipedija je različica spletne enciklopedije Wikipedije v
slovenskem jeziku. Projekt slovenske Wikipedije se je začel 26. februarja 2002
z ustanovitvijo njene spletne strani, njen pobudnik pa je bil uporabnik Jani
Melik.");
+ assertLang(
+ "uk",
+ "id",
+ "20uk",
+ "name",
+ "Ukrainian",
+ "subject",
+ "Народно-господарський комплекс країни включає такі види промисловості
як важке машинобудування, чорна та кольорова металургія, суднобудування,
виробництво автобусів, легкових та вантажних автомобілів, тракторів та іншої
сільськогосподарської техніки, тепловозів, верстатів, турбін, авіаційних
двигунів та літаків, обладнання для електростанцій, нафто-газової та хімічної
промисловості тощо. Крім того, Україна є потужним виробником електроенергії.
Україна має розвинуте сільське госп [...]
}
-
+
@Test
public void testMapFieldName() throws Exception {
ModifiableSolrParams parameters = new ModifiableSolrParams();
@@ -93,7 +228,7 @@ public abstract class
LanguageIdentifierUpdateProcessorFactoryTestCase extends S
parameters.add("langid.map.lcmap", "jp:s zh:cjk ko:cjk");
parameters.set("langid.enforceSchema", "false");
liProcessor = createLangIdProcessor(parameters);
-
+
assertEquals("test_no", liProcessor.getMappedField("test", "no"));
assertEquals("test_en", liProcessor.getMappedField("test", "en"));
assertEquals("test_s", liProcessor.getMappedField("test", "jp"));
@@ -139,11 +274,11 @@ public abstract class
LanguageIdentifierUpdateProcessorFactoryTestCase extends S
parameters.add("langid.enforceSchema", "false");
parameters.add("langid.map", "true");
liProcessor = createLangIdProcessor(parameters);
-
+
doc = englishDoc();
assertEquals("en", process(doc).getFieldValue("language"));
assertEquals("en", process(doc).getFieldValue("languages"));
-
+
doc = englishDoc();
doc.setField("language", "no");
assertEquals("no", process(doc).getFieldValue("language"));
@@ -152,8 +287,8 @@ public abstract class
LanguageIdentifierUpdateProcessorFactoryTestCase extends S
}
/**
- * Test not only 1st value taken into account (empty string),
- * but all other values of 'text_multivalue' field ('en').
+ * Test not only 1st value taken into account (empty string), but all other
values of
+ * 'text_multivalue' field ('en').
*/
@Test
public void testPreExistingMultiValue() throws Exception {
@@ -165,11 +300,11 @@ public abstract class
LanguageIdentifierUpdateProcessorFactoryTestCase extends S
parameters.add("langid.enforceSchema", "false");
parameters.add("langid.map", "true");
liProcessor = createLangIdProcessor(parameters);
-
+
doc = englishDoc();
assertEquals("en", process(doc).getFieldValue("language"));
assertEquals("en", process(doc).getFieldValue("languages"));
-
+
doc = englishDoc();
doc.setField("language", "no");
assertEquals("no", process(doc).getFieldValue("language"));
@@ -178,8 +313,8 @@ public abstract class
LanguageIdentifierUpdateProcessorFactoryTestCase extends S
}
/**
- * Test not only 1st value taken into account (ru text),
- * but all values of 'text_multivalue' field ('ru' and 'en').
+ * Test not only 1st value taken into account (ru text), but all values of
'text_multivalue' field
+ * ('ru' and 'en').
*/
@Test
public void testPreExistingMultiValueMixedLang() throws Exception {
@@ -211,7 +346,7 @@ public abstract class
LanguageIdentifierUpdateProcessorFactoryTestCase extends S
parameters.add("langid.langField", "language");
parameters.add("langid.enforceSchema", "false");
liProcessor = createLangIdProcessor(parameters);
-
+
doc = tooShortDoc();
assertEquals("", process(doc).getFieldValue("language"));
}
@@ -239,7 +374,7 @@ public abstract class
LanguageIdentifierUpdateProcessorFactoryTestCase extends S
parameters.add("langid.fallback", "fbVal");
parameters.add("langid.enforceSchema", "false");
liProcessor = createLangIdProcessor(parameters);
-
+
// Verify fallback to field fb (noop field does not exist and is skipped)
doc = tooShortDoc();
doc.addField("fb", "fbField");
@@ -247,9 +382,9 @@ public abstract class
LanguageIdentifierUpdateProcessorFactoryTestCase extends S
// Verify fallback to fallback value since no fallback fields exist
doc = tooShortDoc();
- assertEquals("fbVal", process(doc).getFieldValue("language"));
+ assertEquals("fbVal", process(doc).getFieldValue("language"));
}
-
+
@Test
public void testResolveLanguage() throws Exception {
List<DetectedLanguage> langs;
@@ -265,14 +400,14 @@ public abstract class
LanguageIdentifierUpdateProcessorFactoryTestCase extends S
// One detected language
langs.add(new DetectedLanguage("one", 1.0));
- assertEquals("one", liProcessor.resolveLanguage(langs, "fallback"));
+ assertEquals("one", liProcessor.resolveLanguage(langs, "fallback"));
// One detected language under default threshold
langs = new ArrayList<>();
langs.add(new DetectedLanguage("under", 0.1));
- assertEquals("fallback", liProcessor.resolveLanguage(langs, "fallback"));
+ assertEquals("fallback", liProcessor.resolveLanguage(langs, "fallback"));
}
-
+
@Test
public void testKeepOrig() throws Exception {
ModifiableSolrParams parameters = new ModifiableSolrParams();
@@ -289,7 +424,7 @@ public abstract class
LanguageIdentifierUpdateProcessorFactoryTestCase extends S
assertEquals("en", mappedNoOrig.getFieldValue("language"));
assertTrue(mappedNoOrig.containsKey("text_en"));
assertFalse(mappedNoOrig.containsKey("text"));
-
+
// keepOrig true
parameters.set("langid.map.keepOrig", "true");
liProcessor = createLangIdProcessor(parameters);
@@ -298,7 +433,7 @@ public abstract class
LanguageIdentifierUpdateProcessorFactoryTestCase extends S
assertTrue(mappedKeepOrig.containsKey("text_en"));
assertTrue(mappedKeepOrig.containsKey("text"));
assertEquals(englishDoc().getFieldValue("text"),
mappedKeepOrig.getFieldValue("text_en"));
-
+
// keepOrig and map individual
parameters.set("langid.map.individual", "true");
parameters.set("langid.fl", "text,text2");
@@ -309,7 +444,8 @@ public abstract class
LanguageIdentifierUpdateProcessorFactoryTestCase extends S
assertTrue(mappedIndividual.containsKey("text"));
assertTrue(mappedIndividual.containsKey("text2_ru"));
assertTrue(mappedIndividual.containsKey("text2"));
- assertEquals(languagePerFieldDoc().getFieldValue("text"),
mappedIndividual.getFieldValue("text_en"));
+ assertEquals(
+ languagePerFieldDoc().getFieldValue("text"),
mappedIndividual.getFieldValue("text_en"));
}
@Test
@@ -327,32 +463,45 @@ public abstract class
LanguageIdentifierUpdateProcessorFactoryTestCase extends S
assertTrue(mappedIndividual.containsKey("text_en"));
assertTrue(mappedIndividual.containsKey("text2_ru"));
}
-
+
// Various utility methods
-
+
private SolrInputDocument englishDoc() {
SolrInputDocument doc = new SolrInputDocument();
- doc.addField("text", "Apache Lucene is a free/open source information
retrieval software library, originally created in Java by Doug Cutting. It is
supported by the Apache Software Foundation and is released under the Apache
Software License.");
- doc.addField("text_multivalue", new String[]{"", "Apache Lucene is a
free/open source information retrieval software library, originally created in
Java by Doug Cutting. It is supported by the Apache Software Foundation and is
released under the Apache Software License."});
+ doc.addField(
+ "text",
+ "Apache Lucene is a free/open source information retrieval software
library, originally created in Java by Doug Cutting. It is supported by the
Apache Software Foundation and is released under the Apache Software License.");
+ doc.addField(
+ "text_multivalue",
+ new String[] {
+ "",
+ "Apache Lucene is a free/open source information retrieval software
library, originally created in Java by Doug Cutting. It is supported by the
Apache Software Foundation and is released under the Apache Software License."
+ });
return doc;
}
private SolrInputDocument languagePerFieldDoc() {
SolrInputDocument doc = englishDoc();
- doc.addField("text2", "The Apache Lucene — это свободная библиотека для
высокоскоростного полнотекстового поиска, написанная на Java. Может быть
использована для поиска в интернете и других областях компьютерной лингвистики
(аналитическая философия).");
+ doc.addField(
+ "text2",
+ "The Apache Lucene — это свободная библиотека для высокоскоростного
полнотекстового поиска, написанная на Java. Может быть использована для поиска
в интернете и других областях компьютерной лингвистики (аналитическая
философия).");
return doc;
}
-
+
/**
* Construct document containing multi-value fields in different languages.
+ *
* @return solr input document
*/
private SolrInputDocument mixedEnglishRussianDoc() {
SolrInputDocument doc = new SolrInputDocument();
- doc.addField("text_multivalue", new String[]{"The Apache Lucene — это
свободная библиотека для высокоскоростного полнотекстового поиска, написанная
на Java. Может быть использована для поиска в интернете и других областях
компьютерной лингвистики (аналитическая философия).",
- "Apache Lucene is a free/open
source information retrieval software library, originally created in Java by
Doug Cutting. It is supported by the Apache Software Foundation and is released
under the Apache Software License.",
- "Solr (pronounced \"solar\") is an open source enterprise search
platform from the Apache Lucene project. Its major features include full-text
search, hit highlighting, faceted search, dynamic clustering, database
integration, and rich document (e.g., Word, PDF) handling."
- });
+ doc.addField(
+ "text_multivalue",
+ new String[] {
+ "The Apache Lucene — это свободная библиотека для высокоскоростного
полнотекстового поиска, написанная на Java. Может быть использована для поиска
в интернете и других областях компьютерной лингвистики (аналитическая
философия).",
+ "Apache Lucene is a free/open source information retrieval software
library, originally created in Java by Doug Cutting. It is supported by the
Apache Software Foundation and is released under the Apache Software License.",
+ "Solr (pronounced \"solar\") is an open source enterprise search
platform from the Apache Lucene project. Its major features include full-text
search, hit highlighting, faceted search, dynamic clustering, database
integration, and rich document (e.g., Word, PDF) handling."
+ });
return doc;
}
@@ -362,23 +511,24 @@ public abstract class
LanguageIdentifierUpdateProcessorFactoryTestCase extends S
return doc;
}
- protected abstract LanguageIdentifierUpdateProcessor
createLangIdProcessor(ModifiableSolrParams parameters) throws Exception;
+ protected abstract LanguageIdentifierUpdateProcessor createLangIdProcessor(
+ ModifiableSolrParams parameters) throws Exception;
protected void assertLang(String langCode, String... fieldsAndValues) throws
Exception {
- if(liProcessor == null)
+ if (liProcessor == null)
throw new Exception("Processor must be initialized before calling
assertLang()");
SolrInputDocument doc = sid(fieldsAndValues);
assertEquals(langCode, process(doc).getFieldValue(liProcessor.langField));
}
-
+
private SolrInputDocument sid(String... fieldsAndValues) {
SolrInputDocument doc = new SolrInputDocument();
- for (int i = 0; i < fieldsAndValues.length; i+=2) {
- doc.addField(fieldsAndValues[i], fieldsAndValues[i+1]);
+ for (int i = 0; i < fieldsAndValues.length; i += 2) {
+ doc.addField(fieldsAndValues[i], fieldsAndValues[i + 1]);
}
return doc;
}
-
+
/*
Utility test method to process a clone of a document
*/
diff --git
a/solr/modules/langid/src/test/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessorFactoryTest.java
b/solr/modules/langid/src/test/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessorFactoryTest.java
index 3016324..e09d8d8 100644
---
a/solr/modules/langid/src/test/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessorFactoryTest.java
+++
b/solr/modules/langid/src/test/org/apache/solr/update/processor/OpenNLPLangDetectUpdateProcessorFactoryTest.java
@@ -24,22 +24,26 @@ import org.apache.solr.request.SolrQueryRequest;
import org.junit.Test;
@ThreadLeakLingering(linger = 0)
-public class OpenNLPLangDetectUpdateProcessorFactoryTest extends
LanguageIdentifierUpdateProcessorFactoryTestCase {
+public class OpenNLPLangDetectUpdateProcessorFactoryTest
+ extends LanguageIdentifierUpdateProcessorFactoryTestCase {
private static final String TEST_MODEL =
"opennlp-langdetect.eng-swe-spa-rus-deu.bin";
-
+
@Override
- protected OpenNLPLangDetectUpdateProcessor
createLangIdProcessor(ModifiableSolrParams parameters) throws Exception {
- if (parameters.get("langid.model") == null) { // handle superclass tests
that don't provide the model filename
+ protected OpenNLPLangDetectUpdateProcessor
createLangIdProcessor(ModifiableSolrParams parameters)
+ throws Exception {
+ if (parameters.get("langid.model")
+ == null) { // handle superclass tests that don't provide the model
filename
parameters.set("langid.model", TEST_MODEL);
}
- if (parameters.get("langid.threshold") == null) { // handle superclass
tests that don't provide confidence threshold
+ if (parameters.get("langid.threshold")
+ == null) { // handle superclass tests that don't provide confidence
threshold
parameters.set("langid.threshold", "0.3");
}
SolrQueryRequest req = _parser.buildRequestFrom(h.getCore(), new
ModifiableSolrParams(), null);
OpenNLPLangDetectUpdateProcessorFactory factory = new
OpenNLPLangDetectUpdateProcessorFactory();
factory.init(parameters.toNamedList());
factory.inform(h.getCore());
- return (OpenNLPLangDetectUpdateProcessor)factory.getInstance(req, resp,
null);
+ return (OpenNLPLangDetectUpdateProcessor) factory.getInstance(req, resp,
null);
}
// this one actually works better it seems with short docs
@@ -50,7 +54,8 @@ public class OpenNLPLangDetectUpdateProcessorFactoryTest
extends LanguageIdentif
return doc;
}
- @Test @Override
+ @Test
+ @Override
public void testLangIdGlobal() throws Exception {
ModifiableSolrParams parameters = new ModifiableSolrParams();
parameters.add("langid.fl", "name,subject");
@@ -59,10 +64,45 @@ public class OpenNLPLangDetectUpdateProcessorFactoryTest
extends LanguageIdentif
parameters.add("langid.threshold", "0.3");
liProcessor = createLangIdProcessor(parameters);
- assertLang("en", "id", "1en", "name", "Lucene", "subject", "Apache Lucene
is a free/open source information retrieval software library, originally
created in Java by Doug Cutting. It is supported by the Apache Software
Foundation and is released under the Apache Software License.");
- assertLang("sv", "id", "2sv", "name", "Maven", "subject", "Apache Maven är
ett verktyg utvecklat av Apache Software Foundation och används inom
systemutveckling av datorprogram i programspråket Java. Maven används för att
automatiskt paketera (bygga) programfilerna till en distribuerbar enhet. Maven
används inom samma område som Apache Ant men dess byggfiler är deklarativa till
skillnad ifrån Ants skriptbaserade.");
- assertLang("es", "id", "3es", "name", "Lucene", "subject", "Lucene es un
API de código abierto para recuperación de información, originalmente
implementada en Java por Doug Cutting. Está apoyado por el Apache Software
Foundation y se distribuye bajo la Apache Software License. Lucene tiene
versiones para otros lenguajes incluyendo Delphi, Perl, C#, C++, Python, Ruby y
PHP.");
- assertLang("ru", "id", "4ru", "name", "Lucene", "subject", "The Apache
Lucene — это свободная библиотека для высокоскоростного полнотекстового поиска,
написанная на Java. Может быть использована для поиска в интернете и других
областях компьютерной лингвистики (аналитическая философия).");
- assertLang("de", "id", "5de", "name", "Lucene", "subject", "Lucene ist ein
Freie-Software-Projekt der Apache Software Foundation, das eine Suchsoftware
erstellt. Durch die hohe Leistungsfähigkeit und Skalierbarkeit können die
Lucene-Werkzeuge für beliebige Projektgrößen und Anforderungen eingesetzt
werden. So setzt beispielsweise Wikipedia Lucene für die Volltextsuche ein.
Zudem verwenden die beiden Desktop-Suchprogramme Beagle und Strigi eine C#-
bzw. C++- Portierung von Lucene als [...]
+ assertLang(
+ "en",
+ "id",
+ "1en",
+ "name",
+ "Lucene",
+ "subject",
+ "Apache Lucene is a free/open source information retrieval software
library, originally created in Java by Doug Cutting. It is supported by the
Apache Software Foundation and is released under the Apache Software License.");
+ assertLang(
+ "sv",
+ "id",
+ "2sv",
+ "name",
+ "Maven",
+ "subject",
+ "Apache Maven är ett verktyg utvecklat av Apache Software Foundation
och används inom systemutveckling av datorprogram i programspråket Java. Maven
används för att automatiskt paketera (bygga) programfilerna till en
distribuerbar enhet. Maven används inom samma område som Apache Ant men dess
byggfiler är deklarativa till skillnad ifrån Ants skriptbaserade.");
+ assertLang(
+ "es",
+ "id",
+ "3es",
+ "name",
+ "Lucene",
+ "subject",
+ "Lucene es un API de código abierto para recuperación de información,
originalmente implementada en Java por Doug Cutting. Está apoyado por el Apache
Software Foundation y se distribuye bajo la Apache Software License. Lucene
tiene versiones para otros lenguajes incluyendo Delphi, Perl, C#, C++, Python,
Ruby y PHP.");
+ assertLang(
+ "ru",
+ "id",
+ "4ru",
+ "name",
+ "Lucene",
+ "subject",
+ "The Apache Lucene — это свободная библиотека для высокоскоростного
полнотекстового поиска, написанная на Java. Может быть использована для поиска
в интернете и других областях компьютерной лингвистики (аналитическая
философия).");
+ assertLang(
+ "de",
+ "id",
+ "5de",
+ "name",
+ "Lucene",
+ "subject",
+ "Lucene ist ein Freie-Software-Projekt der Apache Software Foundation,
das eine Suchsoftware erstellt. Durch die hohe Leistungsfähigkeit und
Skalierbarkeit können die Lucene-Werkzeuge für beliebige Projektgrößen und
Anforderungen eingesetzt werden. So setzt beispielsweise Wikipedia Lucene für
die Volltextsuche ein. Zudem verwenden die beiden Desktop-Suchprogramme Beagle
und Strigi eine C#- bzw. C++- Portierung von Lucene als Indexer.");
}
}
diff --git
a/solr/modules/langid/src/test/org/apache/solr/update/processor/SolrInputDocumentReaderTest.java
b/solr/modules/langid/src/test/org/apache/solr/update/processor/SolrInputDocumentReaderTest.java
index 5e28a52..b6ceda7 100644
---
a/solr/modules/langid/src/test/org/apache/solr/update/processor/SolrInputDocumentReaderTest.java
+++
b/solr/modules/langid/src/test/org/apache/solr/update/processor/SolrInputDocumentReaderTest.java
@@ -16,16 +16,15 @@
*/
package org.apache.solr.update.processor;
-import java.util.Arrays;
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+import java.util.Arrays;
import org.apache.solr.common.SolrInputDocument;
import org.junit.Before;
import org.junit.Test;
-import static org.junit.Assert.assertArrayEquals;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
public class SolrInputDocumentReaderTest {
private SolrInputDocument doc;
private String[] allFields;
@@ -41,19 +40,14 @@ public class SolrInputDocumentReaderTest {
doc.addField("f4", "12345678901234567890");
allFields = new String[] {"f1", "f2", "f3", "f4"};
}
-
+
@Test
public void readChunked() throws Exception {
- SolrInputDocumentReader reader = new SolrInputDocumentReader(
- doc,
- allFields,
- 20,
- 18,
- " - ");
+ SolrInputDocumentReader reader = new SolrInputDocumentReader(doc,
allFields, 20, 18, " - ");
assertTrue(reader.ready());
char[] chars = new char[1000];
int len;
- assertEquals(9, len=reader.read(chars, 0, 9));
+ assertEquals(9, len = reader.read(chars, 0, 9));
assertArrEqu("a b c - m", chars, len);
len += reader.read(chars, 9, 2);
assertArrEqu("a b c - mul", chars, len);
@@ -66,13 +60,7 @@ public class SolrInputDocumentReaderTest {
@Test
public void maxFieldValueLength() throws Exception {
- SolrInputDocumentReader reader = new SolrInputDocumentReader(
- doc,
- allFields,
- 21,
- 2,
- " - "
- );
+ SolrInputDocumentReader reader = new SolrInputDocumentReader(doc,
allFields, 21, 2, " - ");
assertTrue(reader.ready());
char[] chars = new char[1000];
int len = reader.read(chars, 0, 22);
@@ -82,18 +70,14 @@ public class SolrInputDocumentReaderTest {
@Test
public void allStrFields() throws Exception {
- SolrInputDocumentReader reader = new SolrInputDocumentReader(
- doc,
- 20000,
- 10000
- );
+ SolrInputDocumentReader reader = new SolrInputDocumentReader(doc, 20000,
10000);
assertTrue(reader.ready());
char[] chars = new char[1000];
int len = reader.read(chars, 0, 1000);
assertEquals(45, len);
assertArrEqu("a b c multi valued field 12345678901234567890", chars, len);
}
-
+
@Test
public void testGetStringFields() throws Exception {
String[] expected = new String[] {"f1", "f2", "f4"};
@@ -104,5 +88,4 @@ public class SolrInputDocumentReaderTest {
String str = new String(Arrays.copyOf(chars, len));
assertEquals(expected, str);
}
-
-}
\ No newline at end of file
+}
diff --git
a/solr/modules/langid/src/test/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessorFactoryTest.java
b/solr/modules/langid/src/test/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessorFactoryTest.java
index 172b892..b110b39 100644
---
a/solr/modules/langid/src/test/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessorFactoryTest.java
+++
b/solr/modules/langid/src/test/org/apache/solr/update/processor/TikaLanguageIdentifierUpdateProcessorFactoryTest.java
@@ -20,25 +20,30 @@ import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.junit.Test;
-public class TikaLanguageIdentifierUpdateProcessorFactoryTest extends
LanguageIdentifierUpdateProcessorFactoryTestCase {
+public class TikaLanguageIdentifierUpdateProcessorFactoryTest
+ extends LanguageIdentifierUpdateProcessorFactoryTestCase {
@Override
- protected LanguageIdentifierUpdateProcessor
createLangIdProcessor(ModifiableSolrParams parameters) throws Exception {
- return new
TikaLanguageIdentifierUpdateProcessor(_parser.buildRequestFrom(h.getCore(),
parameters, null), resp, null);
+ protected LanguageIdentifierUpdateProcessor
createLangIdProcessor(ModifiableSolrParams parameters)
+ throws Exception {
+ return new TikaLanguageIdentifierUpdateProcessor(
+ _parser.buildRequestFrom(h.getCore(), parameters, null), resp, null);
}
-
@Test
public void testMaxFieldValueChars() throws Exception {
SolrInputDocument doc = new SolrInputDocument();
- String valueF1 = "Apache Lucene is a free/open source information
retrieval software library, originally created in Java by Doug Cutting. It is
supported by the Apache Software Foundation and is released under the Apache
Software License.";
- String valueF2 = "An open-source search server based on the Lucene Java
search library. News, documentation, resources, and download.";
+ String valueF1 =
+ "Apache Lucene is a free/open source information retrieval software
library, originally created in Java by Doug Cutting. It is supported by the
Apache Software Foundation and is released under the Apache Software License.";
+ String valueF2 =
+ "An open-source search server based on the Lucene Java search library.
News, documentation, resources, and download.";
doc.addField("foo_s", valueF1);
ModifiableSolrParams parameters = new ModifiableSolrParams();
parameters.add("langid.fl", "foo_s");
parameters.add("langid.langField", "language");
parameters.add("langid.enforceSchema", "false");
- TikaLanguageIdentifierUpdateProcessor p =
(TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+ TikaLanguageIdentifierUpdateProcessor p =
+ (TikaLanguageIdentifierUpdateProcessor)
createLangIdProcessor(parameters);
assertEquals(valueF1, p.concatFields(doc).trim());
parameters = new ModifiableSolrParams();
@@ -73,21 +78,23 @@ public class
TikaLanguageIdentifierUpdateProcessorFactoryTest extends LanguageId
parameters.add("langid.maxFieldValueChars", "100000");
p = (TikaLanguageIdentifierUpdateProcessor)
createLangIdProcessor(parameters);
assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim());
-
-}
+ }
@Test
public void testMaxTotalChars() throws Exception {
SolrInputDocument doc = new SolrInputDocument();
- String valueF1 = "Apache Lucene is a free/open source information
retrieval software library, originally created in Java by Doug Cutting. It is
supported by the Apache Software Foundation and is released under the Apache
Software License.";
- String valueF2 = "An open-source search server based on the Lucene Java
search library. News, documentation, resources, and download.";
+ String valueF1 =
+ "Apache Lucene is a free/open source information retrieval software
library, originally created in Java by Doug Cutting. It is supported by the
Apache Software Foundation and is released under the Apache Software License.";
+ String valueF2 =
+ "An open-source search server based on the Lucene Java search library.
News, documentation, resources, and download.";
doc.addField("foo_s", valueF1);
ModifiableSolrParams parameters = new ModifiableSolrParams();
parameters.add("langid.fl", "foo_s");
parameters.add("langid.langField", "language");
parameters.add("langid.enforceSchema", "false");
- TikaLanguageIdentifierUpdateProcessor p =
(TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+ TikaLanguageIdentifierUpdateProcessor p =
+ (TikaLanguageIdentifierUpdateProcessor)
createLangIdProcessor(parameters);
assertEquals(valueF1, p.concatFields(doc).trim());
parameters = new ModifiableSolrParams();
@@ -122,22 +129,23 @@ public class
TikaLanguageIdentifierUpdateProcessorFactoryTest extends LanguageId
parameters.add("langid.maxTotalChars", "100000");
p = (TikaLanguageIdentifierUpdateProcessor)
createLangIdProcessor(parameters);
assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim());
-
}
-
@Test
public void testMaxFieldValueCharsAndMaxTotalChars() throws Exception {
SolrInputDocument doc = new SolrInputDocument();
- String valueF1 = "Apache Lucene is a free/open source information
retrieval software library, originally created in Java by Doug Cutting. It is
supported by the Apache Software Foundation and is released under the Apache
Software License.";
- String valueF2 = "An open-source search server based on the Lucene Java
search library. News, documentation, resources, and download.";
+ String valueF1 =
+ "Apache Lucene is a free/open source information retrieval software
library, originally created in Java by Doug Cutting. It is supported by the
Apache Software Foundation and is released under the Apache Software License.";
+ String valueF2 =
+ "An open-source search server based on the Lucene Java search library.
News, documentation, resources, and download.";
doc.addField("foo_s", valueF1);
ModifiableSolrParams parameters = new ModifiableSolrParams();
parameters.add("langid.fl", "foo_s");
parameters.add("langid.langField", "language");
parameters.add("langid.enforceSchema", "false");
- TikaLanguageIdentifierUpdateProcessor p =
(TikaLanguageIdentifierUpdateProcessor) createLangIdProcessor(parameters);
+ TikaLanguageIdentifierUpdateProcessor p =
+ (TikaLanguageIdentifierUpdateProcessor)
createLangIdProcessor(parameters);
assertEquals(valueF1, p.concatFields(doc).trim());
parameters = new ModifiableSolrParams();
@@ -175,7 +183,5 @@ public class
TikaLanguageIdentifierUpdateProcessorFactoryTest extends LanguageId
parameters.add("langid.maxTotalChars", "100000");
p = (TikaLanguageIdentifierUpdateProcessor)
createLangIdProcessor(parameters);
assertEquals(valueF1 + " " + valueF2, p.concatFields(doc).trim());
-
}
-
}